|
|
|
@ -2,14 +2,15 @@ package com.supervision.pdfqaserver.service.impl;
|
|
|
|
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
|
import cn.hutool.core.date.TimeInterval;
|
|
|
|
|
import cn.hutool.core.lang.Assert;
|
|
|
|
|
import cn.hutool.core.util.NumberUtil;
|
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
|
|
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
|
|
|
|
|
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
|
|
|
|
|
import com.supervision.pdfqaserver.domain.*;
|
|
|
|
|
import com.supervision.pdfqaserver.dto.*;
|
|
|
|
|
import com.supervision.pdfqaserver.service.*;
|
|
|
|
|
import com.supervision.pdfqaserver.thread.KnowledgeGraphGenerateTreadPool;
|
|
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
|
import org.springframework.aop.framework.AopContext;
|
|
|
|
@ -46,6 +47,10 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
|
|
|
|
|
private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator;
|
|
|
|
|
|
|
|
|
|
private final PdfInfoService pdfInfoService;
|
|
|
|
|
|
|
|
|
|
private final IntentionService intentionService;
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void generateGraph(String pdfId) {
|
|
|
|
|
|
|
|
|
@ -76,6 +81,79 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void metaDataTrain(Integer pdfId) {
|
|
|
|
|
Assert.notNull(pdfId, "pdfId不能为空");
|
|
|
|
|
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
|
|
|
|
|
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
|
|
|
|
|
if (null == pdfInfo.getTrainStatus()){
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf训练状态,开始识别文档训练状态...", pdfId);
|
|
|
|
|
pdfInfoService.pdfToGraphStart(pdfId);
|
|
|
|
|
if (StrUtil.isEmpty(pdfInfo.getContentType())){
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
|
|
|
|
|
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
|
|
|
|
|
log.info("pdfId:{}识别文档内容类型完成,内容类型:{}", pdfId, documentContentTypeEnum.getType());
|
|
|
|
|
pdfInfo.setContentType(documentContentTypeEnum.getType());
|
|
|
|
|
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
|
|
|
|
|
}
|
|
|
|
|
if (StrUtil.isEmpty(pdfInfo.getDomainCategoryId())){
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
|
|
|
|
|
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
|
|
|
|
|
log.info("pdfId:{}识别文档行业完成,行业:{}", pdfId, industry);
|
|
|
|
|
pdfInfo.setDomainCategoryId(industry);
|
|
|
|
|
pdfInfoService.updateCategory(pdfId, industry);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
TripleConversionPipeline tripleConversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
|
|
|
|
|
|
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
|
|
|
|
|
List<String> documentIds = pdfAnalysisOutputs.stream().map(p->String.valueOf(p.getId())).collect(Collectors.toList());
|
|
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
|
|
|
|
|
List<TruncateDTO> truncateDTOS = new ArrayList<>();
|
|
|
|
|
if (CollUtil.isNotEmpty(documentTruncations)){
|
|
|
|
|
log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
|
|
|
|
|
truncateDTOS = documentTruncations.stream().map(TruncateDTO::new).collect(Collectors.toList());
|
|
|
|
|
}
|
|
|
|
|
if (CollUtil.isEmpty(documentTruncations)){
|
|
|
|
|
log.info("开始切割文档切片,pdfId:{}", pdfId);
|
|
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
|
|
|
|
|
truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
|
|
|
|
|
log.info("切割文档切片完成,切片个数:{}", truncateDTOS.size());
|
|
|
|
|
// 保存分片信息
|
|
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
|
}
|
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
|
List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
|
|
|
|
|
List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intents);
|
|
|
|
|
// 保存意图数据
|
|
|
|
|
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
|
|
|
|
|
|
|
|
|
|
for (Intention intention : intentions) {
|
|
|
|
|
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
|
|
|
|
|
.filter(d -> StrUtil.equals(d.getIntentDigest(), intention.getDigest())).toList();
|
|
|
|
|
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void generateGraphBaseTrain(Integer pdfId) {
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public TripleConversionPipeline getTripleConversionPipeline(String contentType, String industry) {
|
|
|
|
|
// 内容类型决定了文本片段的切分方式,行业类别决定了文本片段的意图
|
|
|
|
|
// 内容类型和行业类型确定tripleConversionPipeline的具体实现方式,现在默认是pdf类型
|
|
|
|
|
return this.tripleConversionPipeline;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void generateGraph(List<EREDTO> eredtoList) {
|
|
|
|
|
log.info("开始合并实体关系抽取结果...");
|
|
|
|
@ -108,7 +186,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
for (EntityExtractionDTO entityDTO : entities) {
|
|
|
|
|
saveWordsIfNecessary(entityDTO.getEntity(), allWords);
|
|
|
|
|
if (CollUtil.isNotEmpty(entityDTO.getAttributes())){
|
|
|
|
|
for (ERAttributeDTO attribute : entityDTO.getAttributes()) {
|
|
|
|
|
for (TruncationERAttributeDTO attribute : entityDTO.getAttributes()) {
|
|
|
|
|
saveWordsIfNecessary(attribute.getAttribute(), allWords);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -119,7 +197,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
for (RelationExtractionDTO relationDTO : relations) {
|
|
|
|
|
saveWordsIfNecessary(relationDTO.getRelation(), allWords);
|
|
|
|
|
if (CollUtil.isNotEmpty(relationDTO.getAttributes())){
|
|
|
|
|
for (ERAttributeDTO attribute : relationDTO.getAttributes()) {
|
|
|
|
|
for (TruncationERAttributeDTO attribute : relationDTO.getAttributes()) {
|
|
|
|
|
saveWordsIfNecessary(attribute.getAttribute(), allWords);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -294,8 +372,8 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
for (TruncationEntityExtraction entityExtraction : truncationEntityExtractions) {
|
|
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
|
|
EntityExtractionDTO extractionDTO = new EntityExtractionDTO(entityExtraction);
|
|
|
|
|
List<ERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
|
|
.filter(t -> StrUtil.equals(entityExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList());
|
|
|
|
|
List<TruncationERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
|
|
.filter(t -> StrUtil.equals(entityExtraction.getId(), t.getTerId())).map(TruncationERAttributeDTO::new).collect(Collectors.toList());
|
|
|
|
|
extractionDTO.setAttributes(attributes);
|
|
|
|
|
eredto.getEntities().add(extractionDTO);
|
|
|
|
|
eres.add(eredto);
|
|
|
|
@ -303,8 +381,8 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
for (TruncationRelationExtraction relationExtraction : truncationRelationExtractions) {
|
|
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
|
|
RelationExtractionDTO extractionDTO = new RelationExtractionDTO(relationExtraction);
|
|
|
|
|
List<ERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
|
|
.filter(t -> StrUtil.equals(relationExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList());
|
|
|
|
|
List<TruncationERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
|
|
.filter(t -> StrUtil.equals(relationExtraction.getId(), t.getTerId())).map(TruncationERAttributeDTO::new).collect(Collectors.toList());
|
|
|
|
|
extractionDTO.setAttributes(attributes);
|
|
|
|
|
eredto.getRelations().add(extractionDTO);
|
|
|
|
|
eres.add(eredto);
|
|
|
|
|