基于三元组提取领域元数据...

v_0.0.2
xueqingkun 4 weeks ago
parent 307793842e
commit 87dd276078

@ -26,11 +26,6 @@ public class ErAttribute implements Serializable {
/**
*
*/
private String erName;
/**
*
*/
private String attrName;
/**

@ -2,6 +2,8 @@ package com.supervision.pdfqaserver.dto;
import com.supervision.pdfqaserver.domain.DomainMetadata;
import lombok.Data;
import java.util.ArrayList;
import java.util.List;
/**
@ -28,21 +30,21 @@ public class DomainMetadataDTO {
*/
private String sourceType;
private List<ERAttributeDTO> sourceAttributes;
private List<ERAttributeDTO> sourceAttributes = new ArrayList<>();
/**
*
*/
private String relation;
private List<ERAttributeDTO> relationAttributes;
private List<ERAttributeDTO> relationAttributes = new ArrayList<>();
/**
*
*/
private String targetType;
private List<ERAttributeDTO> targetAttributes;
private List<ERAttributeDTO> targetAttributes = new ArrayList<>();
/**
* 0=1=

@ -1,5 +1,6 @@
package com.supervision.pdfqaserver.dto;
import com.supervision.pdfqaserver.domain.ErAttribute;
import lombok.Data;
@Data
@ -31,4 +32,14 @@ public class ERAttributeDTO {
* 1 2
*/
private String erType;
/**
 * Builds an {@link ErAttribute} entity from this DTO.
 *
 * <p>Copies {@code id}, {@code domainMetadataId}, {@code attrName},
 * {@code attrValueType} and {@code erType}; nothing else is set on the entity.
 *
 * @return a newly created entity carrying this DTO's values
 */
public ErAttribute toErAttribute() {
    final ErAttribute entity = new ErAttribute();
    entity.setId(id);
    entity.setDomainMetadataId(domainMetadataId);
    entity.setAttrName(attrName);
    entity.setAttrValueType(attrValueType);
    entity.setErType(erType);
    return entity;
}
}

@ -1,5 +1,6 @@
package com.supervision.pdfqaserver.dto;
import com.supervision.pdfqaserver.domain.Intention;
import lombok.Data;
/**
@ -8,4 +9,34 @@ import lombok.Data;
@Data
public class IntentDTO {

    /** Identifier, copied from {@link Intention#getId()}. */
    private String id;

    /** Digest of the intent, copied from {@link Intention#getDigest()}. */
    private String digest;

    /** Description of the intent, copied from {@link Intention#getDesc()}. */
    private String desc;

    /** Id of the domain category, copied from {@link Intention#getDomainCategoryId()}. */
    private String domainCategoryId;

    /**
     * Generation type code, copied from {@link Intention#getGenerationType()}.
     * NOTE(review): the original comment lists the codes 0 and 1, but their labels are not
     * legible in this view - confirm the meaning against the Intention entity.
     */
    private String generationType;

    /**
     * Creates an empty DTO.
     *
     * <p>Declared explicitly because the {@link #IntentDTO(Intention)} constructor below would
     * otherwise leave the class without a no-arg constructor, which bean mappers and JSON
     * deserializers generally require.
     */
    public IntentDTO() {
    }

    /**
     * Creates a DTO that mirrors the given intention.
     *
     * @param intention entity to copy from; must not be {@code null}
     */
    public IntentDTO(Intention intention){
        this.id = intention.getId();
        this.digest = intention.getDigest();
        this.desc = intention.getDesc();
        this.domainCategoryId = intention.getDomainCategoryId();
        this.generationType = intention.getGenerationType();
    }
}

@ -3,6 +3,8 @@ package com.supervision.pdfqaserver.mapper;
import com.supervision.pdfqaserver.domain.DocumentTruncation;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import java.util.List;
/**
* @author Administrator
* @description document_truncation()Mapper
@ -11,6 +13,7 @@ import com.baomidou.mybatisplus.core.mapper.BaseMapper;
*/
public interface DocumentTruncationMapper extends BaseMapper<DocumentTruncation> {
/**
 * Lists the document truncations that belong to one PDF.
 *
 * <p>Backed by the {@code listByPdfId} statement of the mapper XML, which joins
 * {@code pdf_analysis_output} to {@code document_truncation} and filters on
 * {@code pdf_analysis_output.pdf_id}.
 *
 * @param pdfId id of the PDF
 * @return the matching truncation rows
 */
List<DocumentTruncation> listByPdfId(Integer pdfId);
}

@ -25,4 +25,7 @@ public interface DocumentTruncationService extends IService<DocumentTruncation>
List<DocumentTruncation> queryByDocumentIds(List<String> documentIds);
/**
 * Queries truncations of the given documents.
 * NOTE(review): judging by the name this should return the truncations that have not gone
 * through entity-relation extraction yet; the implementation shown in this change is a
 * placeholder that issues no query - confirm the intended contract.
 *
 * @param documentIds ids of the documents to look at
 */
List<DocumentTruncation> queryNotERETruncate(List<String> documentIds);
/**
 * Lists the document truncations that belong to one PDF.
 * The implementation delegates to {@code DocumentTruncationMapper#listByPdfId}.
 *
 * @param pdfId id of the PDF
 * @return the truncations found for that PDF
 */
List<DocumentTruncation> listByPdfId(Integer pdfId);
}

@ -25,6 +25,12 @@ public interface DomainMetadataService extends IService<DomainMetadata> {
void saveIfNotExists(DomainMetadata metadata, String domainCategoryId);
/**
 * Saves a batch of domain metadata records on behalf of one intention.
 *
 * <p>NOTE(review): only the tail of the implementation is visible in this change. That part
 * links the intention to the metadata id, then stores the relation attributes with erType "2"
 * and the source/target node attributes with erType "1" via
 * {@code ErAttributeService#saveIfAbsents}.
 *
 * @param metadatas metadata DTOs to persist
 * @param intentionId id of the intention the metadata is linked to
 * @param domainCategoryId id of the domain category; handed on to {@code saveIfNotExists}
 */
void batchSaveOrUpdateMetadata(List<DomainMetadataDTO> metadatas,String intentionId,String domainCategoryId);
/**
 * Saves the given domain metadata DTO.
 * NOTE(review): the implementation is not part of this view; what "complete" covers
 * (for example which related records are written) needs to be confirmed there.
 *
 * @param domainMetadataDTO metadata to save
 */
void completeSave(DomainMetadataDTO domainMetadataDTO);

@ -3,6 +3,8 @@ package com.supervision.pdfqaserver.service;
import com.supervision.pdfqaserver.domain.ErAttribute;
import com.baomidou.mybatisplus.extension.service.IService;
import java.util.List;
/**
* @author Administrator
* @description er_attribute()Service
@ -10,4 +12,9 @@ import com.baomidou.mybatisplus.extension.service.IService;
*/
public interface ErAttributeService extends IService<ErAttribute> {
/**
 * Saves the attribute under the given domain metadata record unless an equivalent
 * attribute is already stored for that record.
 *
 * @param erAttribute attribute to save; its domainMetadataId is set to {@code domainMetadataId} before saving
 * @param domainMetadataId id of the owning domain metadata record; must not be empty
 */
void saveIfAbsents(ErAttribute erAttribute, String domainMetadataId);
/**
 * Lists all attributes stored for one domain metadata record.
 *
 * @param domainMetadataId id of the domain metadata record
 * @return the attributes whose domainMetadataId equals the given id
 */
List<ErAttribute> listByDomainMetadataId(String domainMetadataId);
}

@ -31,4 +31,6 @@ public interface IntentionService extends IService<Intention> {
* @return
*/
Intention queryByDigestAndDomainCategoryId(String digest, String domainCategoryId);
/**
 * Lists the intentions that belong to one domain category.
 *
 * @param domainCategoryId id of the domain category
 * @return the intentions whose domainCategoryId equals the given id
 */
List<Intention> queryByDomainCategoryId(String domainCategoryId);
}

@ -30,6 +30,12 @@ public interface KnowledgeGraphService {
void generateGraphBaseTrain(Integer pdfId);
/**
 * Returns the triple conversion pipeline to use for a document.
 * Callers in this change pass {@code PdfInfo#getContentType()} and
 * {@code PdfInfo#getDomainCategoryId()}.
 * NOTE(review): how the two arguments influence the selection is not visible here - confirm
 * in the implementation.
 *
 * @param contentType content type of the document
 * @param industry industry (domain category) of the document
 * @return the pipeline used to slice documents and extract intents, metadata and entities
 */
TripleConversionPipeline getTripleConversionPipeline(String contentType,String industry);
/**
 * Generates the graph from entity-relation extraction results.
 * NOTE(review): described from the signature only; the implementation is not part of this
 * view - confirm what is written and where.
 *
 * @param eredtoList extraction results to turn into graph data
 */
void generateGraph(List<EREDTO> eredtoList);

@ -44,7 +44,7 @@ public interface TripleConversionPipeline {
/**
* truncate
* truncate, 使
* @param truncate
* @return DomainMetadataDTO
*/

@ -68,6 +68,11 @@ public class DocumentTruncationServiceImpl extends ServiceImpl<DocumentTruncatio
public List<DocumentTruncation> queryNotERETruncate(List<String> documentIds) {
    // TODO: not implemented yet - no query is issued for documentIds.
    // Hand back an empty (mutable) list rather than null so that callers can iterate or
    // stream the result without tripping over a NullPointerException.
    return new java.util.ArrayList<>();
}
/**
 * Lists the truncations of one PDF by delegating to the mapper's {@code listByPdfId} query.
 *
 * @param pdfId id of the PDF
 * @return whatever the mapper returns for that PDF
 */
@Override
public List<DocumentTruncation> listByPdfId(Integer pdfId) {
    final List<DocumentTruncation> truncations = super.baseMapper.listByPdfId(pdfId);
    return truncations;
}
}

@ -5,6 +5,7 @@ import cn.hutool.core.lang.Assert;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.supervision.pdfqaserver.domain.DomainMetadata;
import com.supervision.pdfqaserver.dto.DomainMetadataDTO;
import com.supervision.pdfqaserver.dto.ERAttributeDTO;
import com.supervision.pdfqaserver.service.DomainMetadataService;
import com.supervision.pdfqaserver.mapper.DomainMetadataMapper;
import com.supervision.pdfqaserver.service.ErAttributeService;
@ -31,6 +32,7 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
private final IntentionDomainMetadataService intentionDomainMetadataService;
private final ErAttributeService erAttributeService;
@Override
public void saveIfNotExists(DomainMetadata metadata) {
@ -71,6 +73,27 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
this.saveIfNotExists(domainMetadata, domainCategoryId);
metadata.setId(domainMetadata.getId());
}
// Link the intention to this domain metadata record.
intentionDomainMetadataService.batchSaveIfAbsent(intentionId, List.of(metadata.getId()));
// Persist the attributes of the relation; erType "2" tags an attribute as a relation attribute.
List<ERAttributeDTO> relationAttributes = metadata.getRelationAttributes();
if (CollUtil.isNotEmpty(relationAttributes)){
    for (ERAttributeDTO relationAttribute : relationAttributes) {
        relationAttribute.setDomainMetadataId(metadata.getId());
        relationAttribute.setErType("2");
        erAttributeService.saveIfAbsents(relationAttribute.toErAttribute(), metadata.getId());
    }
}
// Persist the attributes of the source and target nodes; erType "1" tags a node attribute.
// They are gathered into a fresh list on purpose: appending the target attributes to the list
// returned by getSourceAttributes() would silently grow the DTO's own source list on every
// call, and would fail outright when that list is null or unmodifiable.
List<ERAttributeDTO> nodeAttributes = new java.util.ArrayList<>();
if (CollUtil.isNotEmpty(metadata.getSourceAttributes())){
    nodeAttributes.addAll(metadata.getSourceAttributes());
}
if (CollUtil.isNotEmpty(metadata.getTargetAttributes())){
    nodeAttributes.addAll(metadata.getTargetAttributes());
}
for (ERAttributeDTO nodeAttribute : nodeAttributes) {
    nodeAttribute.setDomainMetadataId(metadata.getId());
    nodeAttribute.setErType("1");
    erAttributeService.saveIfAbsents(nodeAttribute.toErAttribute(), metadata.getId());
}
}
}

@ -1,20 +1,44 @@
package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.lang.Assert;
import cn.hutool.core.util.StrUtil;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.supervision.pdfqaserver.domain.ErAttribute;
import com.supervision.pdfqaserver.service.ErAttributeService;
import com.supervision.pdfqaserver.mapper.ErAttributeMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Service implementation for database operations on the {@code er_attribute} table.
 *
 * @author Administrator
 * @createDate 2025-05-14 15:23:54
 */
@Slf4j
@Service
public class ErAttributeServiceImpl extends ServiceImpl<ErAttributeMapper, ErAttribute>
        implements ErAttributeService{

    /**
     * Saves the attribute under the given domain metadata record unless an equivalent one is
     * already stored there.
     *
     * <p>Two attributes are equivalent when their attribute name, value type and erType all
     * match. erType is part of the comparison so that a node attribute is not discarded just
     * because a relation attribute with the same name and value type was saved first.
     *
     * @param erAttribute attribute to save; must not be {@code null}
     * @param domainMetadataId id of the owning domain metadata record; must not be empty
     * @throws IllegalArgumentException if one of the arguments violates the above
     */
    @Override
    public void saveIfAbsents(ErAttribute erAttribute, String domainMetadataId) {
        Assert.notNull(erAttribute, "属性不能为空");
        // The checked value is the domain metadata id, so the message names that id.
        Assert.notEmpty(domainMetadataId, "领域元数据id不能为空");

        List<ErAttribute> erAttributes = this.listByDomainMetadataId(domainMetadataId);
        boolean exists = erAttributes.stream().anyMatch(item -> isSameAttribute(item, erAttribute));
        if (exists){
            log.info("属性已存在,{},不进行保存...", erAttribute.getAttrName());
            return;
        }
        erAttribute.setDomainMetadataId(domainMetadataId);
        super.save(erAttribute);
    }

    /**
     * Lists all attributes stored for one domain metadata record.
     *
     * @param domainMetadataId id of the domain metadata record
     * @return the attributes whose domainMetadataId equals the given id
     */
    @Override
    public List<ErAttribute> listByDomainMetadataId(String domainMetadataId) {
        return super.lambdaQuery().eq(ErAttribute::getDomainMetadataId, domainMetadataId).list();
    }

    /** Tells whether two attributes share the same name, value type and erType. */
    private static boolean isSameAttribute(ErAttribute stored, ErAttribute candidate) {
        return StrUtil.equals(stored.getAttrName(), candidate.getAttrName())
                && StrUtil.equals(stored.getAttrValueType(), candidate.getAttrValueType())
                && StrUtil.equals(stored.getErType(), candidate.getErType());
    }
}

@ -60,6 +60,11 @@ public class IntentionServiceImpl extends ServiceImpl<IntentionMapper, Intention
return this.lambdaQuery().eq(Intention::getDigest, digest)
.eq(Intention::getDomainCategoryId, domainCategoryId).one();
}
/**
 * Lists the intentions whose domainCategoryId equals the given id.
 *
 * @param domainCategoryId id of the domain category
 * @return the matching intentions
 */
@Override
public List<Intention> queryByDomainCategoryId(String domainCategoryId) {
    final List<Intention> matches = super.lambdaQuery()
            .eq(Intention::getDomainCategoryId, domainCategoryId)
            .list();
    return matches;
}
}

@ -87,12 +87,18 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
if (null == pdfInfo.getTrainStatus()){
// todo:训练异常,需要记录异常状态
log.info("pdfId:{}没有找到对应的pdf训练状态,开始识别文档训练状态...", pdfId);
pdfInfoService.pdfToGraphStart(pdfId);
if (StrUtil.isEmpty(pdfInfo.getContentType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
log.info("pdfId:{}识别文档内容类型完成,内容类型:{}", pdfId, documentContentTypeEnum.getType());
if (StrUtil.isEmpty(documentContentTypeEnum.getType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
pdfInfoService.pdfTrainFail(pdfId);
return;
}
pdfInfo.setContentType(documentContentTypeEnum.getType());
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
}
@ -100,6 +106,11 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
log.info("pdfId:{}识别文档行业完成,行业:{}", pdfId, industry);
if (StrUtil.isEmpty(industry)){
log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
pdfInfoService.pdfTrainFail(pdfId);
return;
}
pdfInfo.setDomainCategoryId(industry);
pdfInfoService.updateCategory(pdfId, industry);
}
@ -107,43 +118,86 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
TripleConversionPipeline tripleConversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
log.warn("没有找到pdfId为{}的pdf分析结果,不再进行下一步操作...", pdfId);
return;
}
List<String> documentIds = pdfAnalysisOutputs.stream().map(p->String.valueOf(p.getId())).collect(Collectors.toList());
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
List<TruncateDTO> truncateDTOS = new ArrayList<>();
if (CollUtil.isNotEmpty(documentTruncations)){
log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
truncateDTOS = documentTruncations.stream().map(TruncateDTO::new).collect(Collectors.toList());
log.info("文档切分数据不为空,pdfId:{},清除切分数据...", pdfId);
documentTruncationService.deleteByDocumentIds(documentIds);
}
if (CollUtil.isEmpty(documentTruncations)){
log.info("开始切割文档切片,pdfId:{}", pdfId);
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
log.info("切割文档切片完成,切片个数:{}", truncateDTOS.size());
// 保存分片信息
documentTruncationService.batchSave(truncateDTOS);
}
for (TruncateDTO truncateDTO : truncateDTOS) {
try {
List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intents);
// 保存意图数据
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
for (Intention intention : intentions) {
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
.filter(d -> StrUtil.equals(d.getIntentDigest(), intention.getDigest())).toList();
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
}
}catch (Exception e){
log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
}
}
}
/**
 * Runs intent recognition and entity-relation extraction over the truncations of one PDF
 * and stores the extraction results.
 *
 * <p>Steps:
 * <ol>
 *   <li>require that the PDF exists and that its trainStatus equals 1;</li>
 *   <li>load the PDF's truncations, slicing the analysed documents first when none exist;</li>
 *   <li>load the intentions of the PDF's domain category and stop when there are none;</li>
 *   <li>per truncation: pick the matching intents, run the extraction and save the result.
 *       A failure on one truncation is logged and does not stop the others.</li>
 * </ol>
 *
 * @param pdfId id of the PDF; must not be {@code null}
 * @throws IllegalArgumentException if the PDF is unknown or its trainStatus is not 1
 */
@Override
public void generateGraphBaseTrain(Integer pdfId) {
    Assert.notNull(pdfId, "pdfId不能为空");
    PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
    Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
    Assert.isTrue((null !=pdfInfo.getTrainStatus() && pdfInfo.getTrainStatus() == 1),
            "pdfId:{}的pdf训练状态{} 不符合要求", pdfId, pdfInfo.getTrainStatus());

    List<TruncateDTO> truncateDTOS = documentTruncationService.listByPdfId(pdfId).stream().map(TruncateDTO::new).collect(Collectors.toList());
    TripleConversionPipeline conversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
    if (CollUtil.isEmpty(truncateDTOS)){
        log.info("没有找到pdfId为{}的文档切分数据,开始切分数据...", pdfId);
        List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
        // Without analysis output there is nothing to slice; stop here the same way the
        // training flow does instead of streaming a null/empty list.
        if (CollUtil.isEmpty(pdfAnalysisOutputs)){
            log.warn("没有找到pdfId为{}的pdf分析结果,不再进行下一步操作...", pdfId);
            return;
        }
        List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
        truncateDTOS = conversionPipeline.sliceDocuments(documentDTOList);
        documentTruncationService.batchSave(truncateDTOS);
        log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
    }
    log.info("开始命名实体识别,切分文档个数:{}", truncateDTOS.size());

    // Load the intentions of the PDF's domain category; they are the candidates offered to
    // the per-truncation intent recognition below.
    List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream().map(IntentDTO::new).distinct().toList();
    if (CollUtil.isEmpty(intentionDTOs)){
        log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId());
        return;
    }

    for (TruncateDTO truncateDTO : truncateDTOS) {
        try {
            List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
            if (CollUtil.isEmpty(intents)){
                log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
                continue;
            }
            EREDTO eredto = conversionPipeline.doEre(truncateDTO, intents);
            if (null == eredto){
                log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
                continue;
            }
            // Persist the entity-relation extraction result for this truncation.
            this.saveERE(eredto, truncateDTO.getId());
        }catch (Exception e){
            // Best effort: a failing truncation is logged and the remaining ones still run.
            log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
        }
    }
}

@ -20,4 +20,10 @@
layout_type,content,create_time,
update_time
</sql>
<!--
  Lists every document_truncation row that belongs to one PDF.
  pdf_analysis_output rows are filtered by pdf_id and joined to document_truncation through
  document_id; o.id is cast to text (o.id::text) before it is compared with d.document_id.
-->
<select id="listByPdfId" resultType="com.supervision.pdfqaserver.domain.DocumentTruncation">
select d.*
from pdf_analysis_output o
join document_truncation d on o.id::text = d.document_id
where o.pdf_id = #{pdfId}
</select>
</mapper>

@ -7,7 +7,6 @@
<resultMap id="BaseResultMap" type="com.supervision.pdfqaserver.domain.ErAttribute">
<id property="id" column="id" jdbcType="VARCHAR"/>
<result property="domainMetadataId" column="domain_metadata_id" jdbcType="VARCHAR"/>
<result property="erName" column="er_name" jdbcType="VARCHAR"/>
<result property="attrName" column="attr_name" jdbcType="VARCHAR"/>
<result property="attrValueType" column="attr_value_type" jdbcType="VARCHAR"/>
<result property="erType" column="er_type" jdbcType="VARCHAR"/>
@ -16,7 +15,7 @@
</resultMap>
<sql id="Base_Column_List">
id,domain_metadata_id,er_name,
id,domain_metadata_id,
attr_name,attr_value_type,er_type,
create_time,update_time
</sql>

Loading…
Cancel
Save