|
|
@ -1,6 +1,7 @@
|
|
|
|
package com.supervision.pdfqaserver.service.impl;
|
|
|
|
package com.supervision.pdfqaserver.service.impl;
|
|
|
|
|
|
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
|
|
|
|
import cn.hutool.core.date.TimeInterval;
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
|
|
|
|
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
|
|
|
|
import com.supervision.pdfqaserver.domain.ChineseEnglishWords;
|
|
|
|
import com.supervision.pdfqaserver.domain.ChineseEnglishWords;
|
|
|
@ -39,18 +40,24 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void generateGraph(String documentId) {
|
|
|
|
public void generateGraph(String documentId) {
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(documentId);
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(documentId));
|
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)) {
|
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)) {
|
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", documentId);
|
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", documentId);
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList();
|
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList();
|
|
|
|
// 对文档进行切分
|
|
|
|
// 对文档进行切分
|
|
|
|
|
|
|
|
TimeInterval timer = new TimeInterval();
|
|
|
|
|
|
|
|
timer.start("sliceDocuments");
|
|
|
|
|
|
|
|
log.info("开始切分文档,初始文档个数:{}",documentDTOList.size());
|
|
|
|
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
|
|
|
|
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
|
|
|
|
|
|
|
|
log.info("切分文档完成,切分后文档个数:{},耗时:{}秒",truncateDTOS.size(), timer.intervalSecond("sliceDocuments"));
|
|
|
|
// 保存分片信息
|
|
|
|
// 保存分片信息
|
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
|
|
|
|
|
|
|
|
// 对切分后的文档进行命名实体识别
|
|
|
|
// 对切分后的文档进行命名实体识别
|
|
|
|
|
|
|
|
timer.start("doEre");
|
|
|
|
|
|
|
|
log.info("开始命名实体识别...");
|
|
|
|
List<EREDTO> eredtoList = new ArrayList<>();
|
|
|
|
List<EREDTO> eredtoList = new ArrayList<>();
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO);
|
|
|
|
EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO);
|
|
|
@ -59,12 +66,17 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// 保存实体关系抽取结果
|
|
|
|
// 保存实体关系抽取结果
|
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
|
|
|
|
|
eredtoList.add(eredto);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre"));
|
|
|
|
|
|
|
|
|
|
|
|
// 合并实体关系抽取结果
|
|
|
|
// 合并实体关系抽取结果
|
|
|
|
|
|
|
|
log.info("开始合并实体关系抽取结果...");
|
|
|
|
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
|
|
|
|
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
|
|
|
|
|
|
|
|
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
|
|
|
|
|
|
|
|
|
|
|
|
// 保存领域元数据
|
|
|
|
// 保存领域元数据
|
|
|
|
|
|
|
|
log.info("开始保存领域元数据...");
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
|
List<RelationExtractionDTO> relations = eredto.getRelations();
|
|
|
|
List<RelationExtractionDTO> relations = eredto.getRelations();
|
|
|
|
if (CollUtil.isEmpty(relations)){
|
|
|
|
if (CollUtil.isEmpty(relations)){
|
|
|
@ -77,9 +89,12 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
domainMetadataService.saveIfNotExists(domainMetadata);
|
|
|
|
domainMetadataService.saveIfNotExists(domainMetadata);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
log.info("保存领域元数据完成");
|
|
|
|
|
|
|
|
|
|
|
|
// 保存字典
|
|
|
|
// 保存字典
|
|
|
|
|
|
|
|
log.info("开始保存字典...");
|
|
|
|
List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
|
|
|
|
List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
|
|
|
|
|
|
|
|
int wordsSize = allWords.size();
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
|
List<EntityExtractionDTO> entities = eredto.getEntities();
|
|
|
|
List<EntityExtractionDTO> entities = eredto.getEntities();
|
|
|
|
if (CollUtil.isNotEmpty(entities)){
|
|
|
|
if (CollUtil.isNotEmpty(entities)){
|
|
|
@ -94,7 +109,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize);
|
|
|
|
// 生成cypher语句
|
|
|
|
// 生成cypher语句
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
|
eredto.setEn(allWords);
|
|
|
|
eredto.setEn(allWords);
|
|
|
|