// NOTE: repository web-viewer residue (topic-selector hint, "315 lines / 14 KiB / Java" file stats, blame timestamps) was stripped from this header.
package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.domain.*;
import com.supervision.pdfqaserver.dto.*;
import com.supervision.pdfqaserver.service.*;
import com.supervision.pdfqaserver.thread.KnowledgeGraphGenerateTreadPool;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.aop.framework.AopContext;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@Slf4j
@Service
@RequiredArgsConstructor
public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
private final TripleConversionPipeline tripleConversionPipeline;
private final TripleToCypherExecutor tripleToCypherExecutor;
private final ChineseEnglishWordsService chineseEnglishWordsService;
private final DocumentTruncationService documentTruncationService;
private final DomainMetadataService domainMetadataService;
private final PdfAnalysisOutputService pdfAnalysisOutputService;
private final TruncationEntityExtractionService truncationEntityExtractionService;
private final TruncationRelationExtractionService truncationRelationExtractionService;
private final TruncationErAttributeService truncationErAttributeService;
5 months ago
private final TruncationRelationExtractionService relationExtractionService;
5 months ago
private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator;
5 months ago
@Override
public void generateGraph(String pdfId) {
((KnowledgeGraphService)AopContext.currentProxy()).resetGraphData(pdfId);
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
5 months ago
if (CollUtil.isEmpty(pdfAnalysisOutputs)) {
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
5 months ago
return;
}
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList();
// 对文档进行切分
TimeInterval timer = new TimeInterval();
timer.start("sliceDocuments");
log.info("开始切分文档,初始文档个数:{}",documentDTOList.size());
5 months ago
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
log.info("切分文档完成,切分后文档个数:{},耗时:{}秒",truncateDTOS.size(), timer.intervalSecond("sliceDocuments"));
5 months ago
// 保存分片信息
documentTruncationService.batchSave(truncateDTOS);
// 对切分后的文档进行命名实体识别
timer.start("doEre");
log.info("开始命名实体识别...");
List<EREDTO> eredtoList = truncateERE(truncateDTOS);
log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre"));
5 months ago
generateGraph(eredtoList);
}
@Override
public void generateGraph(List<EREDTO> eredtoList) {
log.info("开始合并实体关系抽取结果...");
5 months ago
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
5 months ago
5 months ago
// 保存领域元数据
log.info("开始保存领域元数据...");
5 months ago
for (EREDTO eredto : mergedList) {
5 months ago
List<RelationExtractionDTO> relations = eredto.getRelations();
if (CollUtil.isEmpty(relations)){
continue;
}
for (RelationExtractionDTO relation : relations) {
DomainMetadata domainMetadata = relation.toDomainMetadata();
domainMetadata.setDomainType("1");
domainMetadata.setGenerationType(DomainMetaGenerationEnum.SYSTEM_AUTO_GENERATION.getCode());
domainMetadataService.saveIfNotExists(domainMetadata);
}
}
log.info("保存领域元数据完成....");
5 months ago
5 months ago
// 保存字典
log.info("开始保存字典...");
5 months ago
List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
int wordsSize = allWords.size();
5 months ago
for (EREDTO eredto : mergedList) {
List<EntityExtractionDTO> entities = eredto.getEntities();
if (CollUtil.isNotEmpty(entities)){
for (EntityExtractionDTO entityDTO : entities) {
saveWordsIfNecessary(entityDTO.getEntity(), allWords);
if (CollUtil.isNotEmpty(entityDTO.getAttributes())){
for (ERAttributeDTO attribute : entityDTO.getAttributes()) {
saveWordsIfNecessary(attribute.getAttribute(), allWords);
}
}
5 months ago
}
}
List<RelationExtractionDTO> relations = eredto.getRelations();
if (CollUtil.isNotEmpty(relations)){
for (RelationExtractionDTO relationDTO : relations) {
saveWordsIfNecessary(relationDTO.getRelation(), allWords);
if (CollUtil.isNotEmpty(relationDTO.getAttributes())){
for (ERAttributeDTO attribute : relationDTO.getAttributes()) {
saveWordsIfNecessary(attribute.getAttribute(), allWords);
}
}
5 months ago
}
}
}
log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize);
5 months ago
// 生成cypher语句
for (EREDTO eredto : mergedList) {
if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){
continue;
}
// 构造一个字典
allWords = getChineseEnglishWords(eredto);
5 months ago
eredto.setEn(allWords);
try {
tripleToCypherExecutor.saveERE(eredto);
} catch (Exception e) {
log.info("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e);
}
5 months ago
}
}
private static List<ChineseEnglishWords> getChineseEnglishWords(EREDTO eredto) {
List<ChineseEnglishWords> allWords;
allWords = eredto.getEntities().stream().flatMap(entity -> {
List<ChineseEnglishWords> collect = entity.getAttributes().stream().map(e -> {
ChineseEnglishWords words = new ChineseEnglishWords();
words.setChineseWord(e.getAttribute());
words.setEnglishWord(e.getAttribute());
return words;
}).collect(Collectors.toList());
ChineseEnglishWords words = new ChineseEnglishWords();
words.setChineseWord(entity.getEntity());
words.setEnglishWord(entity.getEntity());
collect.add(words);
return collect.stream();
}).collect(Collectors.toList());
eredto.getRelations().stream().flatMap(relation -> {
List<ChineseEnglishWords> words = relation.getAttributes().stream().map(e -> {
ChineseEnglishWords word = new ChineseEnglishWords();
word.setChineseWord(e.getAttribute());
word.setEnglishWord(e.getAttribute());
return word;
}).collect(Collectors.toList());
ChineseEnglishWords words1 = new ChineseEnglishWords();
words1.setChineseWord(relation.getRelation());
words1.setEnglishWord(relation.getRelation());
words.add(words1);
ChineseEnglishWords words2 = new ChineseEnglishWords();
words2.setChineseWord(relation.getSourceType());
words2.setEnglishWord(relation.getSourceType());
words.add(words2);
ChineseEnglishWords words3 = new ChineseEnglishWords();
words3.setChineseWord(relation.getTargetType());
words3.setEnglishWord(relation.getTargetType());
words.add(words3);
return words.stream();
}).forEach(allWords::add);
return allWords;
}
5 months ago
@Override
public List<EREDTO> truncateERE(List<TruncateDTO> truncateDTOS) {
List<EREDTO> eredtoList = new ArrayList<>();
int truncateSize = truncateDTOS.size();
int index = 1;
for (TruncateDTO truncateDTO : truncateDTOS) {
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
index++;
EREDTO eredto = null;
try {
eredto = tripleConversionPipeline.doEre(truncateDTO);
} catch (Exception e) {
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
}
if (null == eredto){
continue;
}
// 保存实体关系抽取结果
this.saveERE(eredto, truncateDTO.getId());
eredtoList.add(eredto);
}
return eredtoList;
5 months ago
}
@Override
@Transactional(rollbackFor = Exception.class)
public void resetGraphData(String pdfId) {
log.info("resetGraphData:重置知识图谱数据,pdfId:{}", pdfId);
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
return;
}
List<String> documentIds = pdfAnalysisOutputs.stream().map(p -> String.valueOf(p.getId())).toList();
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
if (CollUtil.isEmpty(documentTruncations)){
log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
return;
}
// 删除切分数据
documentTruncationService.deleteByDocumentIds(documentIds);
for (DocumentTruncation documentTruncation : documentTruncations) {
String truncationId = documentTruncation.getId();
// 删除实体数据
truncationEntityExtractionService.deleteByTruncationId(truncationId);
// 删除关系数据
relationExtractionService.deleteByTruncationId(truncationId);
}
log.info("重置知识图谱数据完成,pdfId:{}", pdfId);
}
5 months ago
5 months ago
private void saveWordsIfNecessary(String word, List<ChineseEnglishWords> allWords) {
boolean exists = chineseEnglishWordsService.wordsExists(word, allWords);
if (exists){
return;
}
String generate = chinesEsToEnglishGenerator.generate(word);
if (StrUtil.isEmpty(generate)){
log.warn("生成英文名称失败entity:{}", word);
5 months ago
return;
}
ChineseEnglishWords words = new ChineseEnglishWords();
words.setChineseWord(word);
words.setEnglishWord(generate);
chineseEnglishWordsService.saveIfNotExists(words);
allWords.add(words);// 更新缓存
5 months ago
}
/**
 * Not implemented: the body is empty and both arguments are ignored.
 *
 * @param databaseId unused
 * @param query      unused
 */
@Override
public void queryGraph(String databaseId, String query) {
    // Intentionally left empty.
}
@Override
public void saveERE(EREDTO eredto, String truncationId) {
5 months ago
// 保存实体信息
truncationEntityExtractionService.saveERE(eredto.getEntities());
// 保存关系
relationExtractionService.saveERE(eredto.getRelations());
5 months ago
}
@Override
public List<EREDTO> listPdfEREDTO(String pdfId) {
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId));
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
log.info("没有找到pdfId为{}的pdf分析结果", pdfId);
return new ArrayList<>();
}
List<String> documentIds = pdfAnalysisOutputs.stream().map(p -> p.getId().toString()).toList();
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
List<String> truncationIds = documentTruncations.stream().map(DocumentTruncation::getId).toList();
List<TruncationEntityExtraction> truncationEntityExtractions = truncationEntityExtractionService.queryByTruncationIds(truncationIds);
List<TruncationRelationExtraction> truncationRelationExtractions = truncationRelationExtractionService.queryByTruncationIds(truncationIds);
List<String> teIds = truncationEntityExtractions.stream().map(TruncationEntityExtraction::getId).toList();
List<String> trIds = truncationRelationExtractions.stream().map(TruncationRelationExtraction::getId).collect(Collectors.toList());
trIds.addAll(teIds);
List<TruncationErAttribute> truncationErAttributes = truncationErAttributeService.queryByTerIds(trIds);
List<EREDTO> eres = new ArrayList<>();
for (TruncationEntityExtraction entityExtraction : truncationEntityExtractions) {
EREDTO eredto = new EREDTO();
EntityExtractionDTO extractionDTO = new EntityExtractionDTO(entityExtraction);
List<ERAttributeDTO> attributes = truncationErAttributes.stream()
.filter(t -> StrUtil.equals(entityExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList());
extractionDTO.setAttributes(attributes);
eredto.getEntities().add(extractionDTO);
eres.add(eredto);
}
for (TruncationRelationExtraction relationExtraction : truncationRelationExtractions) {
EREDTO eredto = new EREDTO();
RelationExtractionDTO extractionDTO = new RelationExtractionDTO(relationExtraction);
List<ERAttributeDTO> attributes = truncationErAttributes.stream()
.filter(t -> StrUtil.equals(relationExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList());
extractionDTO.setAttributes(attributes);
eredto.getRelations().add(extractionDTO);
eres.add(eredto);
}
return eres;
}
5 months ago
}