You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

598 lines
30 KiB
Java

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.core.lang.Assert;
import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
import com.supervision.pdfqaserver.domain.*;
import com.supervision.pdfqaserver.dto.*;
import com.supervision.pdfqaserver.service.*;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.aop.framework.AopContext;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Default {@link KnowledgeGraphService} implementation: turns PDF analysis output
 * into a knowledge graph by slicing documents, extracting intents / domain
 * metadata, running entity-relation extraction (ERE) and executing the resulting
 * cypher statements against the graph store.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
// Pipeline performing slicing, intent recognition, metadata and ERE steps.
private final TripleConversionPipeline tripleConversionPipeline;
// Translates ERE results into cypher and writes them to the graph store.
private final TripleToCypherExecutor tripleToCypherExecutor;
// Persistence / lookup for the Chinese->English word dictionary.
private final ChineseEnglishWordsService chineseEnglishWordsService;
// Persistence for document slices ("truncations").
private final DocumentTruncationService documentTruncationService;
// Persistence for domain metadata derived from extracted relations.
private final DomainMetadataService domainMetadataService;
// Access to the raw per-document PDF analysis output rows.
private final PdfAnalysisOutputService pdfAnalysisOutputService;
// Persistence for extracted entities.
private final TruncationEntityExtractionService truncationEntityExtractionService;
// Persistence for extracted relations.
private final TruncationRelationExtractionService truncationRelationExtractionService;
// Persistence for entity/relation attribute rows.
private final TruncationErAttributeService truncationErAttributeService;
// NOTE(review): second injection of the same type as
// truncationRelationExtractionService above — consider consolidating.
private final TruncationRelationExtractionService relationExtractionService;
// Generates English translations for Chinese words.
private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator;
// PDF lifecycle / training-status updates.
private final PdfInfoService pdfInfoService;
// Intent persistence and lookup.
private final IntentionService intentionService;
// Industry -> domain category lookup.
private final DomainCategoryService domainCategoryService;
/**
 * Untrained, end-to-end graph generation for one PDF: resets existing graph
 * data, slices the analysis output into truncations, runs entity/relation
 * extraction over every slice and finally writes the knowledge graph.
 *
 * @param pdfId id of the PDF whose analysis output becomes a graph
 */
@Override
public void generateGraph(String pdfId) {
    // Call through the AOP proxy so @Transactional on resetGraphData applies.
    ((KnowledgeGraphService) AopContext.currentProxy()).resetGraphData(pdfId);
    List<PdfAnalysisOutput> outputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
    if (CollUtil.isEmpty(outputs)) {
        log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
        return;
    }
    List<DocumentDTO> documents = outputs.stream().map(DocumentDTO::new).toList();
    TimeInterval watch = new TimeInterval();
    // Slice the documents into truncations.
    watch.start("sliceDocuments");
    log.info("开始切分文档,初始文档个数:{}", documents.size());
    List<TruncateDTO> slices = tripleConversionPipeline.sliceDocuments(documents);
    log.info("切分文档完成,切分后文档个数:{},耗时:{}秒", slices.size(), watch.intervalSecond("sliceDocuments"));
    // Persist the slice records before extraction starts.
    documentTruncationService.batchSave(slices);
    // Named entity / relation extraction over every slice.
    watch.start("doEre");
    log.info("开始命名实体识别...");
    List<EREDTO> ereResults = truncateERE(slices);
    log.info("命名实体识别完成,耗时:{}秒", watch.intervalSecond("doEre"));
    generateGraph(ereResults);
    log.info("生成知识图谱完成,耗时:{}秒", watch.intervalSecond());
}
/**
 * Runs metadata training for a PDF and records the outcome on the PDF's
 * training status. On success the PDF is marked trained; on failure the
 * status is flipped to "failed" only when training had not already completed.
 *
 * @param pdfId id of the PDF to train
 */
@Override
public void metaDataTrain(Integer pdfId) {
    TimeInterval timer = new TimeInterval();
    try {
        metaDataTrainExecutor(pdfId);
        pdfInfoService.pdfTrainComplete(pdfId);
        log.info("pdfId:{}元数据训练完成,耗时:{}秒", pdfId, timer.intervalSecond());
    } catch (Exception e) {
        PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
        // Only mark as failed while the status is unset or still "in progress" (0).
        if (null == pdfInfo.getTrainStatus() || pdfInfo.getTrainStatus() == 0) {
            pdfInfoService.pdfTrainFail(pdfId);
        }
        // Fixed: the same exception was previously logged at ERROR twice;
        // one consolidated log with the elapsed time is enough.
        log.error("pdfId:{}元数据训练失败,耗时:{}秒", pdfId, timer.intervalSecond(), e);
    }
}
/**
 * Train-based graph generation entry point: resets previous graph data, marks
 * the PDF as "generating", delegates to the executor and records the final
 * completion or failure state.
 *
 * @param pdfId id of the PDF; must not be null
 */
@Override
public void generateGraphBaseTrain(Integer pdfId) {
    Assert.notNull(pdfId, "pdfId不能为空");
    TimeInterval watch = new TimeInterval();
    try {
        log.info("开始生成知识图谱, pdfId:{}", pdfId);
        // Invoke through the proxy so the @Transactional reset takes effect.
        ((KnowledgeGraphService) AopContext.currentProxy()).resetGraphData(pdfId.toString());
        pdfInfoService.pdfToGraphStart(pdfId);
        generateGraphBaseTrainExecutor(pdfId);
        pdfInfoService.pdfToGraphComplete(pdfId);
        log.info("pdfId:{}知识图谱生成完成,总耗时:{}秒", pdfId, watch.intervalSecond());
    } catch (Exception e) {
        log.error("pdfId:{}知识图谱生成失败...", pdfId, e);
        pdfInfoService.pdfToGraphFail(pdfId);
        log.info("pdfId:{}知识图谱生成失败,总耗时:{}秒", pdfId, watch.intervalSecond());
    }
}
/**
 * Metadata-training executor for one PDF: resolves the document's content type
 * and industry category when missing, (re)slices the analysis output, then
 * walks every text slice extracting intents and the domain metadata attached
 * to them.
 *
 * @param pdfId id of the PDF to train; must not be null
 */
private void metaDataTrainExecutor(Integer pdfId) {
Assert.notNull(pdfId, "pdfId不能为空");
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
// Re-resolve type/industry only when status is unset or 2 — presumably
// "failed/retryable"; TODO confirm the status-code meanings.
if (null == pdfInfo.getTrainStatus() || pdfInfo.getTrainStatus() == 2){
log.info("pdfId:{}没有找到对应的pdf训练状态,开始识别文档训练状态...", pdfId);
pdfInfoService.pdfTrainStart(pdfId);
// Detect and persist the document content type when not stored yet.
if (StrUtil.isEmpty(pdfInfo.getContentType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
log.info("pdfId:{}识别文档内容类型完成,内容类型:{}", pdfId, documentContentTypeEnum.getType());
if (StrUtil.isEmpty(documentContentTypeEnum.getType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
pdfInfoService.pdfTrainFail(pdfId);
return;
}
pdfInfo.setContentType(documentContentTypeEnum.getType());
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
}
// Detect the industry and map it to a known domain category when missing.
if (StrUtil.isEmpty(pdfInfo.getDomainCategoryId())){
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
log.info("pdfId:{}识别文档行业完成,行业:{}", pdfId, industry);
if (StrUtil.isEmpty(industry)){
log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
pdfInfoService.pdfTrainFail(pdfId);
return;
}
DomainCategory domainCategory = domainCategoryService.queryByIndustryName(industry);
if (null == domainCategory){
log.info("pdfId:{}没有找到:{}对应的行业分类,停止后续任务...", pdfId, industry);
pdfInfoService.pdfTrainFail(pdfId);
return;
}
pdfInfo.setDomainCategoryId(domainCategory.getId());
pdfInfoService.updateCategory(pdfId, domainCategory.getId());
}
}
// NOTE(review): this local shadows the injected field of the same name;
// getTripleConversionPipeline() currently always returns that field anyway.
TripleConversionPipeline tripleConversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
log.warn("没有找到pdfId为{}的pdf分析结果,不再进行下一步操作...", pdfId);
return;
}
// Remove any slices from a previous run before re-slicing.
List<String> documentIds = pdfAnalysisOutputs.stream().map(p->String.valueOf(p.getId())).collect(Collectors.toList());
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
if (CollUtil.isNotEmpty(documentTruncations)){
log.info("文档切分数据不为空,pdfId:{},清除切分数据...", pdfId);
documentTruncationService.deleteByDocumentIds(documentIds);
}
log.info("开始切割文档切片,pdfId:{}", pdfId);
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
log.info("切割文档切片完成,切片个数:{}", truncateDTOS.size());
// Persist the slice records.
documentTruncationService.batchSave(truncateDTOS);
// Only TEXT-layout slices go through intent/metadata extraction.
truncateDTOS = truncateDTOS.stream()
.filter(t->StrUtil.equals(t.getLayoutType(), String.valueOf(LayoutTypeEnum.TEXT.getCode()))).collect(Collectors.toList());
log.info("只识别文本类型数据,个数:{}", truncateDTOS.size());
int truncateSize = truncateDTOS.size();
int index = 1;
int intentSize = 0;
TimeInterval interval = new TimeInterval();
for (TruncateDTO truncateDTO : truncateDTOS) {
try {
log.info("正在意图、元数据抽取,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
interval.start("makeOutTruncationIntent");
List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
log.info("意图识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutTruncationIntent"));
if (CollUtil.isEmpty(intents)){
log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
continue;
}
log.info("开始意图元数据识别,切分文档id:{}", truncateDTO.getId());
// NOTE(review): started once per slice; the per-group log below therefore
// reports the cumulative time since this point, not per-group time.
interval.start("makeOutDomainMetadata");
List<List<String>> intentSplit = CollUtil.split(intents, 10);
log.info("切分意图列表,切分前数据总数:{},切分出:{}组数据", intents.size(), intentSplit.size());
for (List<String> intentList : intentSplit) {
// Run metadata recognition once per group of 10 intents.
List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intentList);
log.info("意图元数据识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutDomainMetadata"));
// Persist the intents, then attach each group's metadata to its intent.
// NOTE(review): passes the FULL intents list on every group iteration —
// presumably intentList was intended; batchSaveIfAbsent keeps this
// idempotent, but the repeated saves look redundant. TODO confirm.
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
for (Intention intention : intentions) {
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
.filter(d -> StrUtil.equals(d.getIntentDigest(), intention.getDigest())).toList();
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
}
}
// NOTE(review): counts slices that produced intents, while the summary log
// below labels it as the number of extracted intents — confirm intent.
intentSize ++;
index ++;
}catch (Exception e){
index ++;
log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
}
}
log.info("意图、元数据抽取完成,耗时:{}秒,一共处理片段数:{}个,抽取出意图数量:{}个", interval.intervalSecond(),truncateSize,intentSize);
}
/**
 * Train-based graph generation executor: ensures content type and domain
 * category are resolved, slices the document when needed, restricts intent
 * recognition to manually confirmed intents of the PDF's domain category,
 * runs ERE per slice and finally writes the graph and refreshes the schema
 * vectors.
 *
 * @param pdfId id of the PDF; must not be null
 */
private void generateGraphBaseTrainExecutor(Integer pdfId){
Assert.notNull(pdfId, "pdfId不能为空");
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
// Resolve and persist the content type when missing.
if (StrUtil.isEmpty(pdfInfo.getContentType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
if (null == documentContentTypeEnum){
log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
return;
}
pdfInfo.setContentType(documentContentTypeEnum.getType());
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
}
// Resolve and persist the domain category when missing.
// NOTE(review): null-check here vs StrUtil.isEmpty in metaDataTrainExecutor —
// confirm which is intended for an empty-string category id.
if (null == pdfInfo.getDomainCategoryId()){
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
if (StrUtil.isEmpty(industry)){
log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
return;
}
DomainCategory domainCategory = domainCategoryService.queryByIndustryName(industry);
if (null == domainCategory){
log.info("pdfId:{}没有找到:{}对应的行业分类,停止后续任务...", pdfId, industry);
return;
}
pdfInfo.setDomainCategoryId(domainCategory.getId());
pdfInfoService.updateCategory(pdfId, domainCategory.getId());
}
List<TruncateDTO> truncateDTOS = documentTruncationService.listByPdfId(pdfId).stream().map(TruncateDTO::new).collect(Collectors.toList());
TripleConversionPipeline conversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
// Slice on demand when no stored truncations exist for this PDF.
if (CollUtil.isEmpty(truncateDTOS)){
log.info("没有找到pdfId为{}的文档切分数据,开始切分数据...", pdfId);
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
truncateDTOS = conversionPipeline.sliceDocuments(documentDTOList);
documentTruncationService.batchSave(truncateDTOS);
log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
}
// Intents of this domain category, keeping only manually confirmed ones ("0").
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream()
.filter(intention -> StrUtil.equals("0",intention.getGenerationType())) // keep manually confirmed intents only
.map(IntentDTO::new).distinct().toList();
if (CollUtil.isEmpty(intentionDTOs)){
log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId());
return;
}
TimeInterval timer = new TimeInterval();
int index = 1;
int truncateSize = truncateDTOS.size();
log.info("开始实体关系抽取,耗时:{}秒,一共处理片段数:{}个", timer.intervalSecond(), truncateDTOS.size());
List<EREDTO> eredtos = new ArrayList<>();
for (TruncateDTO truncateDTO : truncateDTOS) {
// NOTE(review): incremented BEFORE the progress log, so the first slice
// reports 2/total and the last exceeds 100% — likely meant to increment
// at the end of the loop.
index ++;
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
try {
// Table slices are ERE'd without intents.
// NOTE(review): there is no `continue` after this branch, so a TABLE
// slice also falls through to the intent path below and may be ERE'd
// and saved a second time — confirm whether that is intended.
if (StrUtil.equals(truncateDTO.getLayoutType(), String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
log.info("切分文档id:{},表格类型数据,不进行意图识别...", truncateDTO.getId());
EREDTO eredto = conversionPipeline.doEre(truncateDTO, new ArrayList<>());
if (null == eredto){
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
continue;
}
this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
}
timer.start("makeOutTruncationIntent");
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
log.info("意图识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(), timer.intervalMs("makeOutTruncationIntent"));
if (CollUtil.isEmpty(intents)){
log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
continue;
}
log.info("开始命名实体识别,切分文档id:{}", truncateDTO.getId());
timer.start("doEre");
EREDTO eredto = conversionPipeline.doEre(truncateDTO, intents);
log.info("命名实体识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(), timer.intervalMs("doEre"));
if (null == eredto){
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
continue;
}
// Persist the extraction result before collecting it for graph generation.
this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
}catch (Exception e){
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
}
}
log.info("实体关系抽取完成,耗时:{}秒", timer.intervalSecond());
log.info("开始生成知识图谱...");
timer.start("generateGraph");
generateGraphSimple(eredtos);
log.info("生成知识图谱完成,耗时:{}秒", timer.intervalSecond("generateGraph"));
log.info("刷新图谱schema向量...");
tripleToCypherExecutor.refreshSchemaSegmentVector();
log.info("刷新图谱schema向量完成");
}
/**
 * Resolves the {@link TripleConversionPipeline} implementation to use.
 * Content type is meant to decide slicing and industry to decide intent
 * handling, but only the default (PDF) pipeline exists today, so both
 * parameters are currently ignored.
 *
 * @param contentType document content type (unused for now)
 * @param industry    industry / domain category id (unused for now)
 * @return the single injected default pipeline
 */
@Override
public TripleConversionPipeline getTripleConversionPipeline(String contentType, String industry) {
// Content type drives slicing, industry drives intents; the concrete
// implementation choice defaults to the PDF pipeline for now.
return this.tripleConversionPipeline;
}
/**
 * Merges raw ERE results, persists derived domain metadata and the
 * Chinese/English dictionary, then generates and executes cypher per merged
 * result.
 *
 * @param eredtoList raw entity/relation extraction results, one per slice
 */
@Override
public void generateGraph(List<EREDTO> eredtoList) {
    log.info("开始合并实体关系抽取结果...");
    List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
    log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
    // Persist domain metadata derived from every extracted relation.
    log.info("开始保存领域元数据...");
    for (EREDTO eredto : mergedList) {
        List<RelationExtractionDTO> relations = eredto.getRelations();
        if (CollUtil.isEmpty(relations)){
            continue;
        }
        for (RelationExtractionDTO relation : relations) {
            DomainMetadata domainMetadata = relation.toDomainMetadata();
            domainMetadata.setGenerationType(DomainMetaGenerationEnum.SYSTEM_AUTO_GENERATION.getCode());
            domainMetadataService.saveIfNotExists(domainMetadata);
        }
    }
    log.info("保存领域元数据完成....");
    // Ensure every entity/relation/attribute name has a dictionary entry.
    log.info("开始保存字典...");
    List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
    int wordsSize = allWords.size();
    for (EREDTO eredto : mergedList) {
        List<EntityExtractionDTO> entities = eredto.getEntities();
        if (CollUtil.isNotEmpty(entities)){
            for (EntityExtractionDTO entityDTO : entities) {
                saveWordsIfNecessary(entityDTO.getEntity(), allWords);
                if (CollUtil.isNotEmpty(entityDTO.getAttributes())){
                    for (TruncationERAttributeDTO attribute : entityDTO.getAttributes()) {
                        saveWordsIfNecessary(attribute.getAttribute(), allWords);
                    }
                }
            }
        }
        List<RelationExtractionDTO> relations = eredto.getRelations();
        if (CollUtil.isNotEmpty(relations)){
            for (RelationExtractionDTO relationDTO : relations) {
                saveWordsIfNecessary(relationDTO.getRelation(), allWords);
                if (CollUtil.isNotEmpty(relationDTO.getAttributes())){
                    for (TruncationERAttributeDTO attribute : relationDTO.getAttributes()) {
                        saveWordsIfNecessary(attribute.getAttribute(), allWords);
                    }
                }
            }
        }
    }
    log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize);
    // Generate and execute the cypher statements per merged result.
    for (EREDTO eredto : mergedList) {
        if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){
            continue;
        }
        // Build an identity dictionary scoped to this result (replaces the
        // shared dictionary reference from the save phase above).
        allWords = getChineseEnglishWords(eredto);
        eredto.setEn(allWords);
        try {
            tripleToCypherExecutor.saveERE(eredto);
        } catch (Exception e) {
            // Fixed: failures were previously logged at INFO, hiding them
            // from error monitoring.
            log.error("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e);
        }
    }
}
/**
 * Simplified graph generation: merges the ERE results and executes cypher for
 * each non-empty merged result, without the metadata/dictionary persistence
 * done by {@link #generateGraph(List)}.
 *
 * @param eredtoList raw entity/relation extraction results
 */
@Override
public void generateGraphSimple(List<EREDTO> eredtoList) {
    log.info("开始合并实体关系抽取结果...");
    List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
    log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
    for (EREDTO eredto : mergedList) {
        // Skip results with neither entities nor relations.
        if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){
            continue;
        }
        eredto.setEn();
        try {
            tripleToCypherExecutor.saveERE(eredto);
        } catch (Exception e) {
            // Fixed: failures were previously logged at INFO, hiding them
            // from error monitoring.
            log.error("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e);
        }
    }
}
/**
 * Builds a naive identity dictionary for one ERE result: every entity name,
 * relation name, relation source/target type and attribute name becomes a
 * word pair whose English side mirrors the Chinese side.
 *
 * Fixed: the original dereferenced getEntities()/getRelations()/getAttributes()
 * unconditionally and would NPE when only one of the lists was present
 * (callers only guarantee that not BOTH are empty).
 *
 * @param eredto merged extraction result; entity/relation/attribute lists may be null
 * @return identity word pairs, entities first then relations
 */
private static List<ChineseEnglishWords> getChineseEnglishWords(EREDTO eredto) {
    List<ChineseEnglishWords> allWords = new ArrayList<>();
    List<EntityExtractionDTO> entities = eredto.getEntities();
    if (entities != null) {
        for (EntityExtractionDTO entity : entities) {
            // Attribute names first, then the entity name (matches original order).
            if (entity.getAttributes() != null) {
                for (TruncationERAttributeDTO attribute : entity.getAttributes()) {
                    allWords.add(newWordPair(attribute.getAttribute()));
                }
            }
            allWords.add(newWordPair(entity.getEntity()));
        }
    }
    List<RelationExtractionDTO> relations = eredto.getRelations();
    if (relations != null) {
        for (RelationExtractionDTO relation : relations) {
            if (relation.getAttributes() != null) {
                for (TruncationERAttributeDTO attribute : relation.getAttributes()) {
                    allWords.add(newWordPair(attribute.getAttribute()));
                }
            }
            allWords.add(newWordPair(relation.getRelation()));
            allWords.add(newWordPair(relation.getSourceType()));
            allWords.add(newWordPair(relation.getTargetType()));
        }
    }
    return allWords;
}

/** Creates a word pair whose English side simply mirrors the Chinese word. */
private static ChineseEnglishWords newWordPair(String word) {
    ChineseEnglishWords words = new ChineseEnglishWords();
    words.setChineseWord(word);
    words.setEnglishWord(word);
    return words;
}
/**
 * Runs entity/relation extraction over each document slice, persisting every
 * non-null result and returning the collected DTOs.
 *
 * @param truncateDTOS document slices to process
 * @return extraction results for the slices that produced one
 */
@Override
public List<EREDTO> truncateERE(List<TruncateDTO> truncateDTOS) {
    List<EREDTO> results = new ArrayList<>();
    int total = truncateDTOS.size();
    for (int position = 0; position < total; position++) {
        TruncateDTO slice = truncateDTOS.get(position);
        log.info("开始命名实体识别,切分文档id:{},识别进度:{}", slice.getId(), NumberUtil.formatPercent(((position + 1) * 1.0) / total, 2));
        EREDTO result;
        try {
            result = tripleConversionPipeline.doEre(slice);
        } catch (Exception e) {
            log.error("命名实体识别失败,切分文档id:{}", slice.getId(), e);
            result = null;
        }
        if (null == result) {
            continue;
        }
        // Persist the extraction result before collecting it.
        this.saveERE(result, slice.getId());
        results.add(result);
    }
    return results;
}
/**
 * Clears previously extracted entity and relation rows for every slice of the
 * given PDF so graph generation can start from a clean state. The truncation
 * rows themselves are not deleted here.
 *
 * @param pdfId id of the PDF whose extraction data is reset
 */
@Override
@Transactional(rollbackFor = Exception.class)
public void resetGraphData(String pdfId) {
    log.info("resetGraphData:重置知识图谱数据,pdfId:{}", pdfId);
    List<PdfAnalysisOutput> outputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
    if (CollUtil.isEmpty(outputs)) {
        log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
        return;
    }
    List<String> documentIds = outputs.stream().map(output -> String.valueOf(output.getId())).toList();
    List<DocumentTruncation> truncations = documentTruncationService.queryByDocumentIds(documentIds);
    if (CollUtil.isEmpty(truncations)) {
        log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
        return;
    }
    for (DocumentTruncation truncation : truncations) {
        String truncationId = truncation.getId();
        // Drop extracted entities, then relations, for this slice.
        truncationEntityExtractionService.deleteByTruncationId(truncationId);
        relationExtractionService.deleteByTruncationId(truncationId);
    }
    log.info("重置知识图谱数据完成,pdfId:{}", pdfId);
}
/**
 * Ensures a Chinese word has an English translation persisted. Words already
 * present in {@code allWords} are skipped; otherwise a translation is
 * generated, stored, and appended to the in-memory cache.
 *
 * @param word     Chinese word to translate
 * @param allWords in-memory cache of known word pairs; mutated on success
 */
private void saveWordsIfNecessary(String word, List<ChineseEnglishWords> allWords) {
    if (chineseEnglishWordsService.wordsExists(word, allWords)) {
        return;
    }
    String english = chinesEsToEnglishGenerator.generate(word);
    if (StrUtil.isEmpty(english)) {
        log.warn("生成英文名称失败entity:{}", word);
        return;
    }
    ChineseEnglishWords pair = new ChineseEnglishWords();
    pair.setChineseWord(word);
    pair.setEnglishWord(english);
    chineseEnglishWordsService.saveIfNotExists(pair);
    // Keep the cache in sync so later lookups see this word.
    allWords.add(pair);
}
/**
 * Queries the knowledge graph. Not implemented yet — intentionally a no-op.
 *
 * @param databaseId target graph database id
 * @param query      query text
 */
@Override
public void queryGraph(String databaseId, String query) {
}
/**
 * Persists an extraction result: entity rows first, then relation rows.
 *
 * @param eredto       extraction result to store
 * @param truncationId id of the source slice; NOTE(review): currently unused —
 *                     the DTOs are saved as-is; confirm whether the id should
 *                     be stamped onto them here.
 */
@Override
public void saveERE(EREDTO eredto, String truncationId) {
// Save entity rows.
truncationEntityExtractionService.saveERE(eredto.getEntities());
// Save relation rows. NOTE(review): relationExtractionService is a second
// injected TruncationRelationExtractionService (same type as
// truncationRelationExtractionService) — consider consolidating.
relationExtractionService.saveERE(eredto.getRelations());
}
/**
 * Loads every persisted extraction result for a PDF and rebuilds EREDTO views
 * from them: one DTO per stored entity and one per stored relation, each
 * carrying its attribute rows.
 *
 * @param pdfId id of the PDF
 * @return reconstructed extraction DTOs; empty when the PDF has no analysis output
 */
@Override
public List<EREDTO> listPdfEREDTO(String pdfId) {
    List<PdfAnalysisOutput> outputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
    if (CollUtil.isEmpty(outputs)) {
        log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
        return new ArrayList<>();
    }
    List<String> documentIds = outputs.stream().map(output -> output.getId().toString()).toList();
    List<DocumentTruncation> truncations = documentTruncationService.queryByDocumentIds(documentIds);
    List<String> truncationIds = truncations.stream().map(DocumentTruncation::getId).toList();
    List<TruncationEntityExtraction> entityRows = truncationEntityExtractionService.queryByTruncationIds(truncationIds);
    List<TruncationRelationExtraction> relationRows = truncationRelationExtractionService.queryByTruncationIds(truncationIds);
    // Fetch attribute rows for both relation and entity ids in a single query.
    List<String> terIds = new ArrayList<>();
    relationRows.stream().map(TruncationRelationExtraction::getId).forEach(terIds::add);
    entityRows.stream().map(TruncationEntityExtraction::getId).forEach(terIds::add);
    List<TruncationErAttribute> attributeRows = truncationErAttributeService.queryByTerIds(terIds);
    List<EREDTO> results = new ArrayList<>();
    // One EREDTO per stored entity, with its matching attributes attached.
    for (TruncationEntityExtraction entityRow : entityRows) {
        EntityExtractionDTO dto = new EntityExtractionDTO(entityRow);
        dto.setAttributes(attributeRows.stream()
                .filter(attr -> StrUtil.equals(entityRow.getId(), attr.getTerId()))
                .map(TruncationERAttributeDTO::new)
                .collect(Collectors.toList()));
        EREDTO ere = new EREDTO();
        ere.getEntities().add(dto);
        results.add(ere);
    }
    // One EREDTO per stored relation, with its matching attributes attached.
    for (TruncationRelationExtraction relationRow : relationRows) {
        RelationExtractionDTO dto = new RelationExtractionDTO(relationRow);
        dto.setAttributes(attributeRows.stream()
                .filter(attr -> StrUtil.equals(relationRow.getId(), attr.getTerId()))
                .map(TruncationERAttributeDTO::new)
                .collect(Collectors.toList()));
        EREDTO ere = new EREDTO();
        ere.getRelations().add(dto);
        results.add(ere);
    }
    return results;
}
}