|
|
package com.supervision.pdfqaserver.service.impl;
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
import cn.hutool.core.date.TimeInterval;
|
|
|
import cn.hutool.core.lang.Assert;
|
|
|
import cn.hutool.core.util.NumberUtil;
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
|
|
|
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
|
|
|
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
|
|
|
import com.supervision.pdfqaserver.domain.*;
|
|
|
import com.supervision.pdfqaserver.dto.*;
|
|
|
import com.supervision.pdfqaserver.service.*;
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.springframework.aop.framework.AopContext;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
|
import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
@Slf4j
|
|
|
@Service
|
|
|
@RequiredArgsConstructor
|
|
|
public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
|
// --- Collaborators, constructor-injected via Lombok @RequiredArgsConstructor ---

// Pipeline that slices documents and performs intent / entity / relation extraction.
private final TripleConversionPipeline tripleConversionPipeline;

// Translates extracted triples into Cypher and writes them to the graph store.
private final TripleToCypherExecutor tripleToCypherExecutor;

// Chinese -> English dictionary persistence and lookup.
private final ChineseEnglishWordsService chineseEnglishWordsService;

// Persistence for document slices (truncations).
private final DocumentTruncationService documentTruncationService;

// Persistence for domain metadata derived from extracted relations.
private final DomainMetadataService domainMetadataService;

// Access to the per-PDF layout-analysis output rows.
private final PdfAnalysisOutputService pdfAnalysisOutputService;

// Persistence for entities extracted from a truncation.
private final TruncationEntityExtractionService truncationEntityExtractionService;

// Persistence for relations extracted from a truncation.
private final TruncationRelationExtractionService truncationRelationExtractionService;

// Persistence for entity/relation attribute rows.
private final TruncationErAttributeService truncationErAttributeService;

// NOTE(review): same type as truncationRelationExtractionService above — both
// fields receive the same bean, so this is a duplicate injection; consider
// consolidating to one field. TODO confirm before cleanup.
private final TruncationRelationExtractionService relationExtractionService;

// Generates an English identifier for a Chinese term.
private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator;

// PDF lifecycle/status transitions (train / to-graph start, complete, fail).
private final PdfInfoService pdfInfoService;

// Intent persistence and lookup per domain category.
private final IntentionService intentionService;

// Domain (industry) category lookup.
private final DomainCategoryService domainCategoryService;
|
|
|
|
|
|
@Override
|
|
|
public void generateGraph(String pdfId) {
|
|
|
|
|
|
((KnowledgeGraphService)AopContext.currentProxy()).resetGraphData(pdfId);
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)) {
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
|
|
|
return;
|
|
|
}
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList();
|
|
|
// 对文档进行切分
|
|
|
TimeInterval timer = new TimeInterval();
|
|
|
timer.start("sliceDocuments");
|
|
|
log.info("开始切分文档,初始文档个数:{}",documentDTOList.size());
|
|
|
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
|
|
|
log.info("切分文档完成,切分后文档个数:{},耗时:{}秒",truncateDTOS.size(), timer.intervalSecond("sliceDocuments"));
|
|
|
// 保存分片信息
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
|
|
// 对切分后的文档进行命名实体识别
|
|
|
timer.start("doEre");
|
|
|
log.info("开始命名实体识别...");
|
|
|
List<EREDTO> eredtoList = truncateERE(truncateDTOS);
|
|
|
log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre"));
|
|
|
|
|
|
generateGraph(eredtoList);
|
|
|
log.info("生成知识图谱完成,耗时:{}秒", timer.intervalSecond());
|
|
|
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
* 元数据训练
|
|
|
* @param pdfId pdfId
|
|
|
*/
|
|
|
@Override
|
|
|
public void metaDataTrain(Integer pdfId) {
|
|
|
TimeInterval timer = new TimeInterval();
|
|
|
try {
|
|
|
metaDataTrainExecutor(pdfId);
|
|
|
pdfInfoService.pdfTrainComplete(pdfId);
|
|
|
log.info("pdfId:{}元数据训练完成,耗时:{}秒", pdfId, timer.intervalSecond());
|
|
|
}catch (Exception e){
|
|
|
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
|
|
|
if ( null == pdfInfo.getTrainStatus() || pdfInfo.getTrainStatus() == 0) {
|
|
|
log.error("pdfId:{}元数据训练失败...", pdfId, e);
|
|
|
pdfInfoService.pdfTrainFail(pdfId);
|
|
|
}
|
|
|
log.error("pdfId:{}元数据训练失败,耗时:{}秒", pdfId, timer.intervalSecond(),e);
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
public void generateGraphBaseTrain(Integer pdfId) {
|
|
|
Assert.notNull(pdfId, "pdfId不能为空");
|
|
|
TimeInterval timer = new TimeInterval();
|
|
|
try {
|
|
|
log.info("开始生成知识图谱, pdfId:{}", pdfId);
|
|
|
((KnowledgeGraphService)AopContext.currentProxy()).resetGraphData(pdfId.toString());
|
|
|
pdfInfoService.pdfToGraphStart(pdfId);
|
|
|
generateGraphBaseTrainExecutor(pdfId);
|
|
|
pdfInfoService.pdfToGraphComplete(pdfId);
|
|
|
log.info("pdfId:{}知识图谱生成完成,总耗时:{}秒", pdfId,timer.intervalSecond());
|
|
|
}catch (Exception e){
|
|
|
log.error("pdfId:{}知识图谱生成失败...", pdfId, e);
|
|
|
pdfInfoService.pdfToGraphFail(pdfId);
|
|
|
log.info("pdfId:{}知识图谱生成失败,总耗时:{}秒", pdfId,timer.intervalSecond());
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
/**
 * Metadata-training workflow for a single PDF:
 * 1. If the PDF is untrained (status null) or status == 2, (re)derive its
 *    content type and industry/domain category, persisting each.
 * 2. Re-slice the PDF's analysis output (existing slices are deleted first).
 * 3. For each text-layout slice, recognize intents and extract domain metadata
 *    in chunks of 10 intents, persisting intents and their metadata.
 * Per-slice failures are logged and skipped; the loop continues.
 *
 * @param pdfId id of the PDF to train; must not be null and must have analysis output
 */
private void metaDataTrainExecutor(Integer pdfId) {
    Assert.notNull(pdfId, "pdfId不能为空");
    PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
    Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
    // Re-derive content type / industry only when status is null or 2
    // (NOTE(review): 2 presumably means "failed" — confirm against the
    // PdfInfo train-status codes; generateGraphBaseTrainExecutor uses no such gate).
    if (null == pdfInfo.getTrainStatus() || pdfInfo.getTrainStatus() == 2){
        log.info("pdfId:{}没有找到对应的pdf训练状态,开始识别文档训练状态...", pdfId);
        pdfInfoService.pdfTrainStart(pdfId);
        if (StrUtil.isEmpty(pdfInfo.getContentType())){
            log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
            DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
            log.info("pdfId:{}识别文档内容类型完成,内容类型:{}", pdfId, documentContentTypeEnum.getType());
            if (StrUtil.isEmpty(documentContentTypeEnum.getType())){
                log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
                pdfInfoService.pdfTrainFail(pdfId);
                return;
            }
            pdfInfo.setContentType(documentContentTypeEnum.getType());
            pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
        }
        if (StrUtil.isEmpty(pdfInfo.getDomainCategoryId())){
            log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
            String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
            log.info("pdfId:{}识别文档行业完成,行业:{}", pdfId, industry);
            if (StrUtil.isEmpty(industry)){
                log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
                pdfInfoService.pdfTrainFail(pdfId);
                return;
            }
            DomainCategory domainCategory = domainCategoryService.queryByIndustryName(industry);
            if (null == domainCategory){
                log.info("pdfId:{}没有找到:{}对应的行业分类,停止后续任务...", pdfId, industry);
                pdfInfoService.pdfTrainFail(pdfId);
                return;
            }
            pdfInfo.setDomainCategoryId(domainCategory.getId());
            pdfInfoService.updateCategory(pdfId, domainCategory.getId());
        }
    }
    // Resolve the concrete pipeline for this content type / industry (currently
    // always the default PDF pipeline); deliberately shadows the injected field.
    TripleConversionPipeline tripleConversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());

    List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
    if (CollUtil.isEmpty(pdfAnalysisOutputs)){
        log.warn("没有找到pdfId为{}的pdf分析结果,不再进行下一步操作...", pdfId);
        return;
    }
    // Discard any existing slices so the document is re-sliced from scratch.
    List<String> documentIds = pdfAnalysisOutputs.stream().map(p->String.valueOf(p.getId())).collect(Collectors.toList());
    List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
    if (CollUtil.isNotEmpty(documentTruncations)){
        log.info("文档切分数据不为空,pdfId:{},清除切分数据...", pdfId);
        documentTruncationService.deleteByDocumentIds(documentIds);
    }
    log.info("开始切割文档切片,pdfId:{}", pdfId);
    List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
    List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
    log.info("切割文档切片完成,切片个数:{}", truncateDTOS.size());
    // Persist the slices.
    documentTruncationService.batchSave(truncateDTOS);

    // Only text-layout slices take part in intent/metadata extraction.
    truncateDTOS = truncateDTOS.stream()
            .filter(t->StrUtil.equals(t.getLayoutType(), String.valueOf(LayoutTypeEnum.TEXT.getCode()))).collect(Collectors.toList());
    log.info("只识别文本类型数据,个数:{}", truncateDTOS.size());
    int truncateSize = truncateDTOS.size();
    int index = 1;
    // Counts slices that completed intent+metadata extraction, not individual
    // intents — NOTE(review): the summary log below labels this 意图数量 (intent
    // count); confirm which figure was intended.
    int intentSize = 0;
    TimeInterval interval = new TimeInterval();
    for (TruncateDTO truncateDTO : truncateDTOS) {
        try {
            log.info("正在意图、元数据抽取,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
            log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
            interval.start("makeOutTruncationIntent");
            List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
            log.info("意图识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutTruncationIntent"));
            if (CollUtil.isEmpty(intents)){
                log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
                continue;
            }
            log.info("开始意图元数据识别,切分文档id:{}", truncateDTO.getId());
            interval.start("makeOutDomainMetadata");
            List<List<String>> intentSplit = CollUtil.split(intents, 10);
            log.info("切分意图列表,切分前数据总数:{},切分出:{}组数据", intents.size(), intentSplit.size());
            for (List<String> intentList : intentSplit) {
                // Metadata is extracted 10 intents at a time.
                List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intentList);
                // NOTE(review): the "makeOutDomainMetadata" timer is started once per
                // slice, so this logs cumulative ms across chunks — confirm whether
                // per-chunk timing was intended.
                log.info("意图元数据识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutDomainMetadata"));
                // Persist the intents (save-if-absent, so repeats are idempotent).
                // NOTE(review): the FULL intents list is passed on every chunk
                // iteration instead of intentList — verify this is intended.
                List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
                for (Intention intention : intentions) {
                    // Attach each metadata row to its owning intent by digest.
                    List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
                            .filter(d -> StrUtil.equals(d.getIntentDigest(), intention.getDigest())).toList();
                    domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
                }
            }
            intentSize ++;
            index ++;
        }catch (Exception e){
            index ++;
            log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
        }
    }
    log.info("意图、元数据抽取完成,耗时:{}秒,一共处理片段数:{}个,抽取出意图数量:{}个", interval.intervalSecond(),truncateSize,intentSize);
}
|
|
|
|
|
|
private void generateGraphBaseTrainExecutor(Integer pdfId){
|
|
|
Assert.notNull(pdfId, "pdfId不能为空");
|
|
|
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
|
|
|
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
|
|
|
if (StrUtil.isEmpty(pdfInfo.getContentType())){
|
|
|
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
|
|
|
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
|
|
|
if (null == documentContentTypeEnum){
|
|
|
log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
|
|
|
return;
|
|
|
}
|
|
|
pdfInfo.setContentType(documentContentTypeEnum.getType());
|
|
|
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
|
|
|
}
|
|
|
if (null == pdfInfo.getDomainCategoryId()){
|
|
|
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
|
|
|
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
|
|
|
if (StrUtil.isEmpty(industry)){
|
|
|
log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
|
|
|
return;
|
|
|
}
|
|
|
DomainCategory domainCategory = domainCategoryService.queryByIndustryName(industry);
|
|
|
if (null == domainCategory){
|
|
|
log.info("pdfId:{}没有找到:{}对应的行业分类,停止后续任务...", pdfId, industry);
|
|
|
return;
|
|
|
}
|
|
|
pdfInfo.setDomainCategoryId(domainCategory.getId());
|
|
|
pdfInfoService.updateCategory(pdfId, domainCategory.getId());
|
|
|
}
|
|
|
|
|
|
List<TruncateDTO> truncateDTOS = documentTruncationService.listByPdfId(pdfId).stream().map(TruncateDTO::new).collect(Collectors.toList());
|
|
|
TripleConversionPipeline conversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
|
|
|
if (CollUtil.isEmpty(truncateDTOS)){
|
|
|
log.info("没有找到pdfId为{}的文档切分数据,开始切分数据...", pdfId);
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
|
|
|
truncateDTOS = conversionPipeline.sliceDocuments(documentDTOList);
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
|
|
|
}
|
|
|
// 查询当前行业分类下的意图
|
|
|
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream()
|
|
|
.filter(intention -> StrUtil.equals("0",intention.getGenerationType())) // 过滤出手动确认的数据
|
|
|
.map(IntentDTO::new).distinct().toList();
|
|
|
if (CollUtil.isEmpty(intentionDTOs)){
|
|
|
log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId());
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
TimeInterval timer = new TimeInterval();
|
|
|
int index = 1;
|
|
|
int truncateSize = truncateDTOS.size();
|
|
|
log.info("开始实体关系抽取,耗时:{}秒,一共处理片段数:{}个", timer.intervalSecond(), truncateDTOS.size());
|
|
|
List<EREDTO> eredtos = new ArrayList<>();
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
index ++;
|
|
|
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
|
|
|
try {
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(), String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
|
|
|
log.info("切分文档id:{},表格类型数据,不进行意图识别...", truncateDTO.getId());
|
|
|
EREDTO eredto = conversionPipeline.doEre(truncateDTO, new ArrayList<>());
|
|
|
if (null == eredto){
|
|
|
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
|
|
|
continue;
|
|
|
}
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
eredtos.add(eredto);
|
|
|
}
|
|
|
|
|
|
timer.start("makeOutTruncationIntent");
|
|
|
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
|
|
|
List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
|
|
|
log.info("意图识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(), timer.intervalMs("makeOutTruncationIntent"));
|
|
|
if (CollUtil.isEmpty(intents)){
|
|
|
log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
|
|
|
continue;
|
|
|
}
|
|
|
log.info("开始命名实体识别,切分文档id:{}", truncateDTO.getId());
|
|
|
timer.start("doEre");
|
|
|
EREDTO eredto = conversionPipeline.doEre(truncateDTO, intents);
|
|
|
log.info("命名实体识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(), timer.intervalMs("doEre"));
|
|
|
if (null == eredto){
|
|
|
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
|
|
|
continue;
|
|
|
}
|
|
|
// 保存实体关系抽取结果
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
eredtos.add(eredto);
|
|
|
}catch (Exception e){
|
|
|
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
|
|
|
}
|
|
|
}
|
|
|
log.info("实体关系抽取完成,耗时:{}秒", timer.intervalSecond());
|
|
|
|
|
|
log.info("开始生成知识图谱...");
|
|
|
timer.start("generateGraph");
|
|
|
generateGraphSimple(eredtos);
|
|
|
log.info("生成知识图谱完成,耗时:{}秒", timer.intervalSecond("generateGraph"));
|
|
|
log.info("刷新图谱schema向量...");
|
|
|
tripleToCypherExecutor.refreshSchemaSegmentVector();
|
|
|
log.info("刷新图谱schema向量完成");
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
public TripleConversionPipeline getTripleConversionPipeline(String contentType, String industry) {
|
|
|
// 内容类型决定了文本片段的切分方式,行业类别决定了文本片段的意图
|
|
|
// 内容类型和行业类型确定tripleConversionPipeline的具体实现方式,现在默认是pdf类型
|
|
|
return this.tripleConversionPipeline;
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
public void generateGraph(List<EREDTO> eredtoList) {
|
|
|
log.info("开始合并实体关系抽取结果...");
|
|
|
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
|
|
|
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
|
|
|
|
|
|
// 保存领域元数据
|
|
|
log.info("开始保存领域元数据...");
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
List<RelationExtractionDTO> relations = eredto.getRelations();
|
|
|
if (CollUtil.isEmpty(relations)){
|
|
|
continue;
|
|
|
}
|
|
|
for (RelationExtractionDTO relation : relations) {
|
|
|
DomainMetadata domainMetadata = relation.toDomainMetadata();
|
|
|
domainMetadata.setGenerationType(DomainMetaGenerationEnum.SYSTEM_AUTO_GENERATION.getCode());
|
|
|
domainMetadataService.saveIfNotExists(domainMetadata);
|
|
|
}
|
|
|
}
|
|
|
log.info("保存领域元数据完成....");
|
|
|
|
|
|
// 保存字典
|
|
|
log.info("开始保存字典...");
|
|
|
List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
|
|
|
int wordsSize = allWords.size();
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
List<EntityExtractionDTO> entities = eredto.getEntities();
|
|
|
if (CollUtil.isNotEmpty(entities)){
|
|
|
for (EntityExtractionDTO entityDTO : entities) {
|
|
|
saveWordsIfNecessary(entityDTO.getEntity(), allWords);
|
|
|
if (CollUtil.isNotEmpty(entityDTO.getAttributes())){
|
|
|
for (TruncationERAttributeDTO attribute : entityDTO.getAttributes()) {
|
|
|
saveWordsIfNecessary(attribute.getAttribute(), allWords);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
List<RelationExtractionDTO> relations = eredto.getRelations();
|
|
|
if (CollUtil.isNotEmpty(relations)){
|
|
|
for (RelationExtractionDTO relationDTO : relations) {
|
|
|
saveWordsIfNecessary(relationDTO.getRelation(), allWords);
|
|
|
if (CollUtil.isNotEmpty(relationDTO.getAttributes())){
|
|
|
for (TruncationERAttributeDTO attribute : relationDTO.getAttributes()) {
|
|
|
saveWordsIfNecessary(attribute.getAttribute(), allWords);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize);
|
|
|
// 生成cypher语句
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){
|
|
|
continue;
|
|
|
}
|
|
|
// 构造一个字典
|
|
|
allWords = getChineseEnglishWords(eredto);
|
|
|
|
|
|
eredto.setEn(allWords);
|
|
|
try {
|
|
|
tripleToCypherExecutor.saveERE(eredto);
|
|
|
} catch (Exception e) {
|
|
|
log.info("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
public void generateGraphSimple(List<EREDTO> eredtoList) {
|
|
|
log.info("开始合并实体关系抽取结果...");
|
|
|
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
|
|
|
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
|
|
|
|
|
|
for (EREDTO eredto : mergedList) {
|
|
|
if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){
|
|
|
continue;
|
|
|
}
|
|
|
eredto.setEn();
|
|
|
try {
|
|
|
tripleToCypherExecutor.saveERE(eredto);
|
|
|
} catch (Exception e) {
|
|
|
log.info("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
private static List<ChineseEnglishWords> getChineseEnglishWords(EREDTO eredto) {
|
|
|
List<ChineseEnglishWords> allWords;
|
|
|
allWords = eredto.getEntities().stream().flatMap(entity -> {
|
|
|
List<ChineseEnglishWords> collect = entity.getAttributes().stream().map(e -> {
|
|
|
ChineseEnglishWords words = new ChineseEnglishWords();
|
|
|
words.setChineseWord(e.getAttribute());
|
|
|
words.setEnglishWord(e.getAttribute());
|
|
|
return words;
|
|
|
}).collect(Collectors.toList());
|
|
|
ChineseEnglishWords words = new ChineseEnglishWords();
|
|
|
words.setChineseWord(entity.getEntity());
|
|
|
words.setEnglishWord(entity.getEntity());
|
|
|
collect.add(words);
|
|
|
return collect.stream();
|
|
|
}).collect(Collectors.toList());
|
|
|
|
|
|
eredto.getRelations().stream().flatMap(relation -> {
|
|
|
List<ChineseEnglishWords> words = relation.getAttributes().stream().map(e -> {
|
|
|
ChineseEnglishWords word = new ChineseEnglishWords();
|
|
|
word.setChineseWord(e.getAttribute());
|
|
|
word.setEnglishWord(e.getAttribute());
|
|
|
return word;
|
|
|
}).collect(Collectors.toList());
|
|
|
ChineseEnglishWords words1 = new ChineseEnglishWords();
|
|
|
words1.setChineseWord(relation.getRelation());
|
|
|
words1.setEnglishWord(relation.getRelation());
|
|
|
words.add(words1);
|
|
|
ChineseEnglishWords words2 = new ChineseEnglishWords();
|
|
|
words2.setChineseWord(relation.getSourceType());
|
|
|
words2.setEnglishWord(relation.getSourceType());
|
|
|
words.add(words2);
|
|
|
ChineseEnglishWords words3 = new ChineseEnglishWords();
|
|
|
words3.setChineseWord(relation.getTargetType());
|
|
|
words3.setEnglishWord(relation.getTargetType());
|
|
|
words.add(words3);
|
|
|
return words.stream();
|
|
|
}).forEach(allWords::add);
|
|
|
|
|
|
return allWords;
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
public List<EREDTO> truncateERE(List<TruncateDTO> truncateDTOS) {
|
|
|
List<EREDTO> eredtoList = new ArrayList<>();
|
|
|
int truncateSize = truncateDTOS.size();
|
|
|
int index = 1;
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
|
|
|
index++;
|
|
|
EREDTO eredto = null;
|
|
|
try {
|
|
|
eredto = tripleConversionPipeline.doEre(truncateDTO);
|
|
|
} catch (Exception e) {
|
|
|
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
|
|
|
}
|
|
|
if (null == eredto){
|
|
|
continue;
|
|
|
}
|
|
|
// 保存实体关系抽取结果
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
eredtoList.add(eredto);
|
|
|
}
|
|
|
return eredtoList;
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
@Transactional(rollbackFor = Exception.class)
|
|
|
public void resetGraphData(String pdfId) {
|
|
|
log.info("resetGraphData:重置知识图谱数据,pdfId:{}", pdfId);
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
|
|
|
return;
|
|
|
}
|
|
|
List<String> documentIds = pdfAnalysisOutputs.stream().map(p -> String.valueOf(p.getId())).toList();
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
|
|
|
if (CollUtil.isEmpty(documentTruncations)){
|
|
|
log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
|
|
|
return;
|
|
|
}
|
|
|
// 删除切分数据
|
|
|
//documentTruncationService.deleteByDocumentIds(documentIds);
|
|
|
for (DocumentTruncation documentTruncation : documentTruncations) {
|
|
|
String truncationId = documentTruncation.getId();
|
|
|
// 删除实体数据
|
|
|
truncationEntityExtractionService.deleteByTruncationId(truncationId);
|
|
|
// 删除关系数据
|
|
|
relationExtractionService.deleteByTruncationId(truncationId);
|
|
|
}
|
|
|
log.info("重置知识图谱数据完成,pdfId:{}", pdfId);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
private void saveWordsIfNecessary(String word, List<ChineseEnglishWords> allWords) {
|
|
|
boolean exists = chineseEnglishWordsService.wordsExists(word, allWords);
|
|
|
if (exists){
|
|
|
return;
|
|
|
}
|
|
|
String generate = chinesEsToEnglishGenerator.generate(word);
|
|
|
if (StrUtil.isEmpty(generate)){
|
|
|
log.warn("生成英文名称失败,entity:{}", word);
|
|
|
return;
|
|
|
}
|
|
|
ChineseEnglishWords words = new ChineseEnglishWords();
|
|
|
words.setChineseWord(word);
|
|
|
words.setEnglishWord(generate);
|
|
|
chineseEnglishWordsService.saveIfNotExists(words);
|
|
|
allWords.add(words);// 更新缓存
|
|
|
}
|
|
|
|
|
|
/**
 * Queries the knowledge graph. Not implemented yet — intentionally a no-op;
 * both parameters are currently unused.
 */
@Override
public void queryGraph(String databaseId, String query) {

}
|
|
|
|
|
|
/**
 * Persists one extraction result: entity rows first, then relation rows.
 *
 * @param eredto       extraction result to persist
 * @param truncationId id of the owning slice — unused here; presumably each
 *                     entity/relation DTO already carries its truncation id.
 *                     TODO confirm and remove the parameter if so.
 */
@Override
public void saveERE(EREDTO eredto, String truncationId) {

    // Save extracted entities.
    truncationEntityExtractionService.saveERE(eredto.getEntities());

    // Save extracted relations.
    relationExtractionService.saveERE(eredto.getRelations());
}
|
|
|
|
|
|
@Override
|
|
|
public List<EREDTO> listPdfEREDTO(String pdfId) {
|
|
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
|
|
|
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
|
|
|
return new ArrayList<>();
|
|
|
}
|
|
|
List<String> documentIds = pdfAnalysisOutputs.stream().map(p -> p.getId().toString()).toList();
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
|
|
|
List<String> truncationIds = documentTruncations.stream().map(DocumentTruncation::getId).toList();
|
|
|
List<TruncationEntityExtraction> truncationEntityExtractions = truncationEntityExtractionService.queryByTruncationIds(truncationIds);
|
|
|
|
|
|
List<TruncationRelationExtraction> truncationRelationExtractions = truncationRelationExtractionService.queryByTruncationIds(truncationIds);
|
|
|
|
|
|
List<String> teIds = truncationEntityExtractions.stream().map(TruncationEntityExtraction::getId).toList();
|
|
|
List<String> trIds = truncationRelationExtractions.stream().map(TruncationRelationExtraction::getId).collect(Collectors.toList());
|
|
|
trIds.addAll(teIds);
|
|
|
List<TruncationErAttribute> truncationErAttributes = truncationErAttributeService.queryByTerIds(trIds);
|
|
|
|
|
|
List<EREDTO> eres = new ArrayList<>();
|
|
|
for (TruncationEntityExtraction entityExtraction : truncationEntityExtractions) {
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
EntityExtractionDTO extractionDTO = new EntityExtractionDTO(entityExtraction);
|
|
|
List<TruncationERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
.filter(t -> StrUtil.equals(entityExtraction.getId(), t.getTerId())).map(TruncationERAttributeDTO::new).collect(Collectors.toList());
|
|
|
extractionDTO.setAttributes(attributes);
|
|
|
eredto.getEntities().add(extractionDTO);
|
|
|
eres.add(eredto);
|
|
|
}
|
|
|
for (TruncationRelationExtraction relationExtraction : truncationRelationExtractions) {
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
RelationExtractionDTO extractionDTO = new RelationExtractionDTO(relationExtraction);
|
|
|
List<TruncationERAttributeDTO> attributes = truncationErAttributes.stream()
|
|
|
.filter(t -> StrUtil.equals(relationExtraction.getId(), t.getTerId())).map(TruncationERAttributeDTO::new).collect(Collectors.toList());
|
|
|
extractionDTO.setAttributes(attributes);
|
|
|
eredto.getRelations().add(extractionDTO);
|
|
|
eres.add(eredto);
|
|
|
}
|
|
|
return eres;
|
|
|
}
|
|
|
|
|
|
}
|