|
|
|
@ -2,13 +2,12 @@ package com.supervision.pdfqaserver.service.impl;
|
|
|
|
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
|
import cn.hutool.core.date.TimeInterval;
|
|
|
|
|
import cn.hutool.core.util.NumberUtil;
|
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
|
|
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
|
|
|
|
|
import com.supervision.pdfqaserver.domain.ChineseEnglishWords;
|
|
|
|
|
import com.supervision.pdfqaserver.domain.DocumentTruncation;
|
|
|
|
|
import com.supervision.pdfqaserver.domain.DomainMetadata;
|
|
|
|
|
import com.supervision.pdfqaserver.domain.*;
|
|
|
|
|
import com.supervision.pdfqaserver.dto.*;
|
|
|
|
|
import com.supervision.pdfqaserver.domain.PdfAnalysisOutput;
|
|
|
|
|
import com.supervision.pdfqaserver.service.*;
|
|
|
|
|
import com.supervision.pdfqaserver.thread.KnowledgeGraphGenerateTreadPool;
|
|
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
|
@ -18,6 +17,7 @@ import org.springframework.stereotype.Service;
|
|
|
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
|
|
@Slf4j
|
|
|
|
|
@Service
|
|
|
|
@ -38,6 +38,10 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
|
|
|
|
|
private final TruncationEntityExtractionService truncationEntityExtractionService;
|
|
|
|
|
|
|
|
|
|
private final TruncationRelationExtractionService truncationRelationExtractionService;
|
|
|
|
|
|
|
|
|
|
private final TruncationErAttributeService truncationErAttributeService;
|
|
|
|
|
|
|
|
|
|
private final TruncationRelationExtractionService relationExtractionService;
|
|
|
|
|
|
|
|
|
|
private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator;
|
|
|
|
@ -65,24 +69,15 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
// 对切分后的文档进行命名实体识别
|
|
|
|
|
timer.start("doEre");
|
|
|
|
|
log.info("开始命名实体识别...");
|
|
|
|
|
List<EREDTO> eredtoList = new ArrayList<>();
|
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
|
EREDTO eredto = null;
|
|
|
|
|
try {
|
|
|
|
|
eredto = tripleConversionPipeline.doEre(truncateDTO);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
|
|
|
|
|
}
|
|
|
|
|
if (null == eredto){
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// 保存实体关系抽取结果
|
|
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
|
|
eredtoList.add(eredto);
|
|
|
|
|
}
|
|
|
|
|
List<EREDTO> eredtoList = truncateERE(truncateDTOS);
|
|
|
|
|
log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre"));
|
|
|
|
|
|
|
|
|
|
// 合并实体关系抽取结果
|
|
|
|
|
generateGraph(eredtoList);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void generateGraph(List<EREDTO> eredtoList) {
|
|
|
|
|
log.info("开始合并实体关系抽取结果...");
|
|
|
|
|
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
|
|
|
|
|
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
|
|
|
|
@ -137,23 +132,100 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// 构造一个字典
|
|
|
|
|
allWords = getChineseEnglishWords(eredto);
|
|
|
|
|
|
|
|
|
|
eredto.setEn(allWords);
|
|
|
|
|
try {
|
|
|
|
|
tripleToCypherExecutor.saveERE(eredto);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
log.info("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static List<ChineseEnglishWords> getChineseEnglishWords(EREDTO eredto) {
|
|
|
|
|
List<ChineseEnglishWords> allWords;
|
|
|
|
|
allWords = eredto.getEntities().stream().flatMap(entity -> {
|
|
|
|
|
List<ChineseEnglishWords> collect = entity.getAttributes().stream().map(e -> {
|
|
|
|
|
ChineseEnglishWords words = new ChineseEnglishWords();
|
|
|
|
|
words.setChineseWord(e.getAttribute());
|
|
|
|
|
words.setEnglishWord(e.getAttribute());
|
|
|
|
|
return words;
|
|
|
|
|
}).collect(Collectors.toList());
|
|
|
|
|
ChineseEnglishWords words = new ChineseEnglishWords();
|
|
|
|
|
words.setChineseWord(entity.getEntity());
|
|
|
|
|
words.setEnglishWord(entity.getEntity());
|
|
|
|
|
collect.add(words);
|
|
|
|
|
return collect.stream();
|
|
|
|
|
}).collect(Collectors.toList());
|
|
|
|
|
|
|
|
|
|
eredto.getRelations().stream().flatMap(relation -> {
|
|
|
|
|
List<ChineseEnglishWords> words = relation.getAttributes().stream().map(e -> {
|
|
|
|
|
ChineseEnglishWords word = new ChineseEnglishWords();
|
|
|
|
|
word.setChineseWord(e.getAttribute());
|
|
|
|
|
word.setEnglishWord(e.getAttribute());
|
|
|
|
|
return word;
|
|
|
|
|
}).collect(Collectors.toList());
|
|
|
|
|
ChineseEnglishWords words1 = new ChineseEnglishWords();
|
|
|
|
|
words1.setChineseWord(relation.getRelation());
|
|
|
|
|
words1.setEnglishWord(relation.getRelation());
|
|
|
|
|
words.add(words1);
|
|
|
|
|
ChineseEnglishWords words2 = new ChineseEnglishWords();
|
|
|
|
|
words2.setChineseWord(relation.getSourceType());
|
|
|
|
|
words2.setEnglishWord(relation.getSourceType());
|
|
|
|
|
words.add(words2);
|
|
|
|
|
ChineseEnglishWords words3 = new ChineseEnglishWords();
|
|
|
|
|
words3.setChineseWord(relation.getTargetType());
|
|
|
|
|
words3.setEnglishWord(relation.getTargetType());
|
|
|
|
|
words.add(words3);
|
|
|
|
|
return words.stream();
|
|
|
|
|
}).forEach(allWords::add);
|
|
|
|
|
|
|
|
|
|
return allWords;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public List<EREDTO> truncateERE(List<TruncateDTO> truncateDTOS) {
|
|
|
|
|
List<EREDTO> eredtoList = new ArrayList<>();
|
|
|
|
|
int truncateSize = truncateDTOS.size();
|
|
|
|
|
int index = 1;
|
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
|
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
|
|
|
|
|
index++;
|
|
|
|
|
EREDTO eredto = null;
|
|
|
|
|
try {
|
|
|
|
|
eredto = tripleConversionPipeline.doEre(truncateDTO);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
|
|
|
|
|
}
|
|
|
|
|
if (null == eredto){
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// 保存实体关系抽取结果
|
|
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
|
|
eredtoList.add(eredto);
|
|
|
|
|
}
|
|
|
|
|
return eredtoList;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
@Transactional(rollbackFor = Exception.class)
|
|
|
|
|
public void resetGraphData(String documentId) {
|
|
|
|
|
log.info("resetGraphData:重置知识图谱数据,documentId:{}", documentId);
|
|
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentId(documentId);
|
|
|
|
|
public void resetGraphData(String pdfId) {
|
|
|
|
|
log.info("resetGraphData:重置知识图谱数据,pdfId:{}", pdfId);
|
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
|
|
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
|
|
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
List<String> documentIds = pdfAnalysisOutputs.stream().map(p -> String.valueOf(p.getId())).toList();
|
|
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
|
|
|
|
|
if (CollUtil.isEmpty(documentTruncations)){
|
|
|
|
|
log.info("没有找到文档切分数据,documentId:{},不用重置数据...", documentId);
|
|
|
|
|
log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
// 删除切分数据
|
|
|
|
|
documentTruncationService.deleteByDocumentId(documentId);
|
|
|
|
|
documentTruncationService.deleteByDocumentIds(documentIds);
|
|
|
|
|
for (DocumentTruncation documentTruncation : documentTruncations) {
|
|
|
|
|
String truncationId = documentTruncation.getId();
|
|
|
|
|
// 删除实体数据
|
|
|
|
@ -161,7 +233,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
// 删除关系数据
|
|
|
|
|
relationExtractionService.deleteByTruncationId(truncationId);
|
|
|
|
|
}
|
|
|
|
|
log.info("重置知识图谱数据完成,documentId:{}", documentId);
|
|
|
|
|
log.info("重置知识图谱数据完成,pdfId:{}", pdfId);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -214,4 +286,46 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
relationExtractionService.saveERE(eredto.getRelations());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public List<EREDTO> listPdfEREDTO(String pdfId) {
|
|
|
|
|
|
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
|
|
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
|
|
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
|
|
|
|
|
return new ArrayList<>();
|
|
|
|
|
}
|
|
|
|
|
List<String> documentIds = pdfAnalysisOutputs.stream().map(p -> p.getId().toString()).toList();
|
|
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
|
|
|
|
|
List<String> truncationIds = documentTruncations.stream().map(DocumentTruncation::getId).toList();
|
|
|
|
|
List<TruncationEntityExtraction> truncationEntityExtractions = truncationEntityExtractionService.queryByTruncationIds(truncationIds);
|
|
|
|
|
|
|
|
|
|
List<TruncationRelationExtraction> truncationRelationExtractions = truncationRelationExtractionService.queryByTruncationIds(truncationIds);
|
|
|
|
|
|
|
|
|
|
List<String> teIds = truncationEntityExtractions.stream().map(TruncationEntityExtraction::getId).toList();
|
|
|
|
|
List<String> trIds = truncationRelationExtractions.stream().map(TruncationRelationExtraction::getId).collect(Collectors.toList());
|
|
|
|
|
trIds.addAll(teIds);
|
|
|
|
|
List<TruncationErAttribute> truncationErAttributes = truncationErAttributeService.queryByTerIds(trIds);
|
|
|
|
|
|
|
|
|
|
List<EREDTO> eres = new ArrayList<>();
|
|
|
|
|
for (TruncationEntityExtraction entityExtraction : truncationEntityExtractions) {
|
|
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
|
|
EntityExtractionDTO extractionDTO = new EntityExtractionDTO(entityExtraction);
|
|
|
|
|
List<ERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
|
|
.filter(t -> StrUtil.equals(entityExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList());
|
|
|
|
|
extractionDTO.setAttributes(attributes);
|
|
|
|
|
eredto.getEntities().add(extractionDTO);
|
|
|
|
|
eres.add(eredto);
|
|
|
|
|
}
|
|
|
|
|
for (TruncationRelationExtraction relationExtraction : truncationRelationExtractions) {
|
|
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
|
|
RelationExtractionDTO extractionDTO = new RelationExtractionDTO(relationExtraction);
|
|
|
|
|
List<ERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
|
|
.filter(t -> StrUtil.equals(relationExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList());
|
|
|
|
|
extractionDTO.setAttributes(attributes);
|
|
|
|
|
eredto.getRelations().add(extractionDTO);
|
|
|
|
|
eres.add(eredto);
|
|
|
|
|
}
|
|
|
|
|
return eres;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|