diff --git a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java index 2ac1f02..91938bd 100644 --- a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java +++ b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java @@ -19,6 +19,8 @@ public class PromptCache { public static final String CLASSIFY_TABLE = "CLASSIFY_TABLE"; + public static final String EXTRACT_TABLE_TITLE = "EXTRACT_TABLE_TITLE"; + public static final Map promptMap = new HashMap<>(); static { @@ -32,6 +34,7 @@ public class PromptCache { promptMap.put(TEXT_TO_CYPHER, TEXT_TO_CYPHER_PROMPT); promptMap.put(GENERATE_ANSWER, GENERATE_ANSWER_PROMPT); promptMap.put(CLASSIFY_TABLE, CLASSIFY_TABLE_PROMPT); + promptMap.put(EXTRACT_TABLE_TITLE, EXTRACT_TABLE_TITLE_PROMPT); } @@ -152,7 +155,8 @@ public class PromptCache { 1. 完全保留原始表头字段名称,不做任何中英文转换或修改 2. 将每行数据转换为一个独立对象 3. 所有数值保留原始格式(包括逗号分隔符和小数点) - 4. 表格第一列作为主键字段 + 4. 去除数值中的转移符号 + 5. 表格第一列作为主键字段 **输出格式:** { @@ -361,4 +365,18 @@ public class PromptCache { 请处理以下表格: {} """; + + private static final String EXTRACT_TABLE_TITLE_PROMPT = """ + 你是一个表格处理专家,直接给出结果,不用解释。 + **任务** + - 从文本中挑选出表格的标题。 + **说明** + - 文本内容是表格上部的一段文字。 + + **输出要求**: + - 直接给出结果,不要解释 + + **需要处理的文本** + {} + """; } diff --git a/src/main/java/com/supervision/pdfqaserver/config/OllamaChatModelAspect.java b/src/main/java/com/supervision/pdfqaserver/config/OllamaChatModelAspect.java index 9ebebdd..c2a7678 100644 --- a/src/main/java/com/supervision/pdfqaserver/config/OllamaChatModelAspect.java +++ b/src/main/java/com/supervision/pdfqaserver/config/OllamaChatModelAspect.java @@ -30,11 +30,11 @@ public class OllamaChatModelAspect { // 获取原始参数 Object[] args = joinPoint.getArgs(); // 如果是String类型的call方法,修改其参数 - if (StrUtil.equals(signature, callStringMessage) && args.length > 0 && args[0] instanceof String originalPrompt) { - args[0] = originalPrompt + "/no_think"; + if (StrUtil.equals(signature, callStringMessage) && args.length > 0) { + args[0] = args[0] + "\n /no_think"; } // 执行原方法 - Object result = joinPoint.proceed(); + Object result = joinPoint.proceed(args); if (StrUtil.equals(model,"qwen3:30b-a3b") ) { if(StrUtil.equals(signature, callStringMessage)){ result = ((String) result).replaceAll("(?is)]*>(.*?)", "").trim(); diff --git a/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java index 267e676..e90f037 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java @@ -41,6 +41,15 @@ public class ERAttributeDTO { public ERAttributeDTO() { } + public ERAttributeDTO(TruncationErAttribute truncationErAttribute) { + this.id = truncationErAttribute.getId(); + this.terId = truncationErAttribute.getTerId(); + this.associationType = truncationErAttribute.getAssociationType(); + this.attribute = truncationErAttribute.getAttribute(); + this.value = truncationErAttribute.getValue(); + this.dataType = truncationErAttribute.getDataType(); + } + public ERAttributeDTO(String attribute, String value, String dataType) { this.attribute = attribute; this.value = value; diff --git a/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java index 349b7c2..3707b85 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java @@ -2,6 +2,7 @@ package com.supervision.pdfqaserver.dto; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.lang.UUID; +import cn.hutool.core.util.RandomUtil; import cn.hutool.core.util.StrUtil; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; @@ -36,6 +37,11 @@ public class EREDTO { JSONObject nodeJson = (JSONObject) node; String name = nodeJson.getString("name"); String type = nodeJson.getString("type"); + if (StrUtil.hasBlank(name,type)){ + continue; + } + name = StrUtil.trim(name); + type = StrUtil.trim(type); JSONObject attributes = nodeJson.getJSONObject("attributes"); List erAttributeDTOS = new ArrayList<>(); if (CollUtil.isNotEmpty(attributes)){ @@ -60,22 +66,33 @@ public class EREDTO { List erAttributeDTOS = new ArrayList<>(); if (CollUtil.isNotEmpty(attributes)){ for (String key : attributes.keySet()) { + if (StrUtil.isBlank(key)){ + continue; + } Object value = attributes.get(key); + if (value instanceof String){ + if (StrUtil.isBlank((String) value)){ + continue; + } + value = StrUtil.trim((String) value); + } String valueString = attributes.getString(key); ERAttributeDTO erAttributeDTO = new ERAttributeDTO(key, valueString, value instanceof Number?"1":"0"); erAttributeDTOS.add(erAttributeDTO); } } - if (StrUtil.isEmpty(source) || StrUtil.isEmpty(target)){ + if (StrUtil.isBlank(source) || StrUtil.isBlank(target)){ log.warn("truncationId:{} relation:{} 关系中source or target is empty",truncationId,relationJson); continue; } - Optional sourceTypeOpt = entities.stream().filter(e -> StrUtil.equals(e.getName(), source)).findFirst(); + final String sourceTrim = StrUtil.trim(source); + Optional sourceTypeOpt = entities.stream().filter(e -> StrUtil.equals(e.getName(), sourceTrim)).findFirst(); if (sourceTypeOpt.isEmpty()){ log.warn("truncationId:{} relation:{} 关系中source在实体中不存在",truncationId,relationJson); continue; } - Optional targetTypeOpt = entities.stream().filter(e -> StrUtil.equals(e.getName(), target)).findFirst(); + final String targetTrim = StrUtil.trim(target); + Optional targetTypeOpt = entities.stream().filter(e -> StrUtil.equals(e.getName(), targetTrim)).findFirst(); if (targetTypeOpt.isEmpty()){ log.warn("truncationId:{} relation:{} 关系中target在实体中不存在",truncationId,relationJson); continue; @@ -108,12 +125,22 @@ public class EREDTO { EntityExtractionDTO entityExtractionDTO = new EntityExtractionDTO(); entityExtractionDTO.setEntity("行"); // 避免表格行名重复 - entityExtractionDTO.setName("行-" + UUID.randomUUID()); + entityExtractionDTO.setName("行-" + RandomUtil.randomString(UUID.randomUUID().toString(), 10)); entityExtractionDTO.setTruncationId(truncationId); List erAttributeDTOS = new ArrayList<>(); for (Map.Entry tableEntry : tableJson.entrySet()) { String key = tableEntry.getKey(); + if (StrUtil.isBlank(key)){ + continue; + } + key = StrUtil.trim(key); Object value = tableEntry.getValue(); + if (value instanceof String){ + if (StrUtil.isBlank(value.toString())){ + continue; + } + value = StrUtil.trim((String) value); + } ERAttributeDTO erAttributeDTO = new ERAttributeDTO(key, value.toString(), value instanceof Number ? "1" : "0"); erAttributeDTOS.add(erAttributeDTO); } diff --git a/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java index e202b2c..ca97a98 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java @@ -1,5 +1,6 @@ package com.supervision.pdfqaserver.dto; +import com.supervision.pdfqaserver.domain.TruncationEntityExtraction; import lombok.Data; import java.util.ArrayList; @@ -35,6 +36,13 @@ public class EntityExtractionDTO { public EntityExtractionDTO() { } + public EntityExtractionDTO(TruncationEntityExtraction entityExtraction) { + this.id = entityExtraction.getId(); + this.truncationId = entityExtraction.getTruncationId(); + this.entity = entityExtraction.getEntity(); + this.name = entityExtraction.getName(); + } + public EntityExtractionDTO(String truncationId, String entity, String name, List attributes) { this.truncationId = truncationId; this.entity = entity; diff --git a/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java index f01010d..212f51e 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java @@ -51,7 +51,17 @@ public class RelationExtractionDTO { public RelationExtractionDTO() { } - public RelationExtractionDTO(String truncationId,String source, String sourceType,String relation, String target,String targetType, List attributes) { + public RelationExtractionDTO(TruncationRelationExtraction relationExtraction) { + this.id = relationExtraction.getId(); + this.truncationId = relationExtraction.getTruncationId(); + this.source = relationExtraction.getSource(); + this.sourceType = relationExtraction.getSourceType(); + this.relation = relationExtraction.getRelation(); + this.target = relationExtraction.getTarget(); + this.targetType = relationExtraction.getTargetType(); + } + + public RelationExtractionDTO(String truncationId, String source, String sourceType, String relation, String target, String targetType, List attributes) { this.truncationId = truncationId; this.source = source; this.relation = relation; diff --git a/src/main/java/com/supervision/pdfqaserver/service/DocumentTruncationService.java b/src/main/java/com/supervision/pdfqaserver/service/DocumentTruncationService.java index 6a871a6..64f9de8 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/DocumentTruncationService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/DocumentTruncationService.java @@ -18,5 +18,11 @@ public interface DocumentTruncationService extends IService void deleteByDocumentId(String documentId); + void deleteByDocumentIds(List documentIds); + List queryByDocumentId(String documentId); + + List queryByDocumentIds(List documentIds); + + List queryNotERETruncate(List documentIds); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/KnowledgeGraphService.java b/src/main/java/com/supervision/pdfqaserver/service/KnowledgeGraphService.java index 1a246d8..159f0e5 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/KnowledgeGraphService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/KnowledgeGraphService.java @@ -1,6 +1,8 @@ package com.supervision.pdfqaserver.service; import com.supervision.pdfqaserver.dto.EREDTO; +import com.supervision.pdfqaserver.dto.TruncateDTO; +import java.util.List; /** * 知识图谱服务接口 @@ -14,12 +16,16 @@ public interface KnowledgeGraphService { */ void generateGraph(String documentId); + void generateGraph(List eredtoList); + + List truncateERE(List truncateDTOS); + /** * 重置图数据 - * @param documentId 文档ID + * @param pdfId pdfId */ - void resetGraphData(String documentId); + void resetGraphData(String pdfId); /** * 提交生成图任务 @@ -31,4 +37,6 @@ public interface KnowledgeGraphService { void saveERE(EREDTO eredto, String truncationId); + + List listPdfEREDTO(String pdfId); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java b/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java index dd01403..7efe9f6 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java @@ -15,4 +15,6 @@ public interface TruncationEntityExtractionService extends IService entities); void deleteByTruncationId(String truncationId); + + List queryByTruncationIds(List truncationIds); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/TruncationErAttributeService.java b/src/main/java/com/supervision/pdfqaserver/service/TruncationErAttributeService.java index 5638327..0765f01 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/TruncationErAttributeService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/TruncationErAttributeService.java @@ -14,4 +14,6 @@ public interface TruncationErAttributeService extends IService terIds); + + List queryByTerIds(List terIds); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java b/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java index b8ca062..d953431 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java @@ -16,4 +16,5 @@ public interface TruncationRelationExtractionService extends IService queryByTruncationIds(List documentIds); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java index 66250b8..b867383 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java @@ -19,7 +19,8 @@ public class ChinesEsToEnglishGeneratorImpl implements ChinesEsToEnglishGenerato public String generate(String chinese) { log.info("generate:开始翻译: {}",chinese); String prompt = PromptCache.promptMap.get(CHINESE_TO_ENGLISH); - String response = ollamaChatModel.call(StrUtil.format(prompt, chinese)); + String format = StrUtil.format(prompt, chinese); + String response = ollamaChatModel.call(format); log.info("generate:chinese:{}翻译结果: {}",chinese,response); return response; } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java index 05e89e4..4d8db68 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java @@ -10,6 +10,7 @@ import com.supervision.pdfqaserver.mapper.DocumentTruncationMapper; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; +import java.util.ArrayList; import java.util.List; /** @@ -42,10 +43,31 @@ public class DocumentTruncationServiceImpl extends ServiceImpl documentIds) { + if (CollUtil.isEmpty(documentIds)){ + return; + } + this.lambdaUpdate().in(DocumentTruncation::getDocumentId, documentIds).remove(); + } + @Override public List queryByDocumentId(String documentId) { return this.lambdaQuery().eq(DocumentTruncation::getDocumentId, documentId).list(); } + + @Override + public List queryByDocumentIds(List documentIds) { + if (CollUtil.isEmpty(documentIds)){ + return new ArrayList<>(); + } + return this.lambdaQuery().in(DocumentTruncation::getDocumentId, documentIds).list(); + } + + @Override + public List queryNotERETruncate(List documentIds) { + return null; + } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java index f2e7dbb..79c1ccf 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java @@ -2,13 +2,12 @@ package com.supervision.pdfqaserver.service.impl; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.date.TimeInterval; +import cn.hutool.core.util.NumberUtil; import cn.hutool.core.util.StrUtil; +import cn.hutool.json.JSONUtil; import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum; -import com.supervision.pdfqaserver.domain.ChineseEnglishWords; -import com.supervision.pdfqaserver.domain.DocumentTruncation; -import com.supervision.pdfqaserver.domain.DomainMetadata; +import com.supervision.pdfqaserver.domain.*; import com.supervision.pdfqaserver.dto.*; -import com.supervision.pdfqaserver.domain.PdfAnalysisOutput; import com.supervision.pdfqaserver.service.*; import com.supervision.pdfqaserver.thread.KnowledgeGraphGenerateTreadPool; import lombok.RequiredArgsConstructor; @@ -18,6 +17,7 @@ import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; @Slf4j @Service @@ -38,6 +38,10 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { private final TruncationEntityExtractionService truncationEntityExtractionService; + private final TruncationRelationExtractionService truncationRelationExtractionService; + + private final TruncationErAttributeService truncationErAttributeService; + private final TruncationRelationExtractionService relationExtractionService; private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator; @@ -65,24 +69,15 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { // 对切分后的文档进行命名实体识别 timer.start("doEre"); log.info("开始命名实体识别..."); - List eredtoList = new ArrayList<>(); - for (TruncateDTO truncateDTO : truncateDTOS) { - EREDTO eredto = null; - try { - eredto = tripleConversionPipeline.doEre(truncateDTO); - } catch (Exception e) { - log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e); - } - if (null == eredto){ - continue; - } - // 保存实体关系抽取结果 - this.saveERE(eredto, truncateDTO.getId()); - eredtoList.add(eredto); - } + List eredtoList = truncateERE(truncateDTOS); log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre")); - // 合并实体关系抽取结果 + generateGraph(eredtoList); + + } + + @Override + public void generateGraph(List eredtoList) { log.info("开始合并实体关系抽取结果..."); List mergedList = tripleConversionPipeline.mergeEreResults(eredtoList); log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size()); @@ -137,23 +132,100 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { if (CollUtil.isEmpty(eredto.getEntities()) && CollUtil.isEmpty(eredto.getRelations())){ continue; } + // 构造一个字典 + allWords = getChineseEnglishWords(eredto); + eredto.setEn(allWords); - tripleToCypherExecutor.saveERE(eredto); + try { + tripleToCypherExecutor.saveERE(eredto); + } catch (Exception e) { + log.info("生成cypher语句失败,切分文档id:{}", JSONUtil.toJsonStr(eredto), e); + } } + } + + private static List getChineseEnglishWords(EREDTO eredto) { + List allWords; + allWords = eredto.getEntities().stream().flatMap(entity -> { + List collect = entity.getAttributes().stream().map(e -> { + ChineseEnglishWords words = new ChineseEnglishWords(); + words.setChineseWord(e.getAttribute()); + words.setEnglishWord(e.getAttribute()); + return words; + }).collect(Collectors.toList()); + ChineseEnglishWords words = new ChineseEnglishWords(); + words.setChineseWord(entity.getEntity()); + words.setEnglishWord(entity.getEntity()); + collect.add(words); + return collect.stream(); + }).collect(Collectors.toList()); + + eredto.getRelations().stream().flatMap(relation -> { + List words = relation.getAttributes().stream().map(e -> { + ChineseEnglishWords word = new ChineseEnglishWords(); + word.setChineseWord(e.getAttribute()); + word.setEnglishWord(e.getAttribute()); + return word; + }).collect(Collectors.toList()); + ChineseEnglishWords words1 = new ChineseEnglishWords(); + words1.setChineseWord(relation.getRelation()); + words1.setEnglishWord(relation.getRelation()); + words.add(words1); + ChineseEnglishWords words2 = new ChineseEnglishWords(); + words2.setChineseWord(relation.getSourceType()); + words2.setEnglishWord(relation.getSourceType()); + words.add(words2); + ChineseEnglishWords words3 = new ChineseEnglishWords(); + words3.setChineseWord(relation.getTargetType()); + words3.setEnglishWord(relation.getTargetType()); + words.add(words3); + return words.stream(); + }).forEach(allWords::add); + + return allWords; + } + @Override + public List truncateERE(List truncateDTOS) { + List eredtoList = new ArrayList<>(); + int truncateSize = truncateDTOS.size(); + int index = 1; + for (TruncateDTO truncateDTO : truncateDTOS) { + log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2)); + index++; + EREDTO eredto = null; + try { + eredto = tripleConversionPipeline.doEre(truncateDTO); + } catch (Exception e) { + log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e); + } + if (null == eredto){ + continue; + } + // 保存实体关系抽取结果 + this.saveERE(eredto, truncateDTO.getId()); + eredtoList.add(eredto); + } + return eredtoList; } @Override @Transactional(rollbackFor = Exception.class) - public void resetGraphData(String documentId) { - log.info("resetGraphData:重置知识图谱数据,documentId:{}", documentId); - List documentTruncations = documentTruncationService.queryByDocumentId(documentId); + public void resetGraphData(String pdfId) { + log.info("resetGraphData:重置知识图谱数据,pdfId:{}", pdfId); + List pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId)); + if (CollUtil.isEmpty(pdfAnalysisOutputs)){ + log.info("没有找到pdfId为{}的pdf分析结果", pdfId); + return; + } + List documentIds = pdfAnalysisOutputs.stream().map(p -> String.valueOf(p.getId())).toList(); + List documentTruncations = documentTruncationService.queryByDocumentIds(documentIds); if (CollUtil.isEmpty(documentTruncations)){ - log.info("没有找到文档切分数据,documentId:{},不用重置数据...", documentId); + log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId); return; } // 删除切分数据 - documentTruncationService.deleteByDocumentId(documentId); + documentTruncationService.deleteByDocumentIds(documentIds); for (DocumentTruncation documentTruncation : documentTruncations) { String truncationId = documentTruncation.getId(); // 删除实体数据 @@ -161,7 +233,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { // 删除关系数据 relationExtractionService.deleteByTruncationId(truncationId); } - log.info("重置知识图谱数据完成,documentId:{}", documentId); + log.info("重置知识图谱数据完成,pdfId:{}", pdfId); } @@ -214,4 +286,46 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { relationExtractionService.saveERE(eredto.getRelations()); } + @Override + public List listPdfEREDTO(String pdfId) { + + List pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(pdfId)); + if (CollUtil.isEmpty(pdfAnalysisOutputs)){ + log.info("没有找到pdfId为{}的pdf分析结果", pdfId); + return new ArrayList<>(); + } + List documentIds = pdfAnalysisOutputs.stream().map(p -> p.getId().toString()).toList(); + List documentTruncations = documentTruncationService.queryByDocumentIds(documentIds); + List truncationIds = documentTruncations.stream().map(DocumentTruncation::getId).toList(); + List truncationEntityExtractions = truncationEntityExtractionService.queryByTruncationIds(truncationIds); + + List truncationRelationExtractions = truncationRelationExtractionService.queryByTruncationIds(truncationIds); + + List teIds = truncationEntityExtractions.stream().map(TruncationEntityExtraction::getId).toList(); + List trIds = truncationRelationExtractions.stream().map(TruncationRelationExtraction::getId).collect(Collectors.toList()); + trIds.addAll(teIds); + List truncationErAttributes = truncationErAttributeService.queryByTerIds(trIds); + + List eres = new ArrayList<>(); + for (TruncationEntityExtraction entityExtraction : truncationEntityExtractions) { + EREDTO eredto = new EREDTO(); + EntityExtractionDTO extractionDTO = new EntityExtractionDTO(entityExtraction); + List attributes = truncationErAttributes.stream() + .filter(t -> StrUtil.equals(entityExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList()); + extractionDTO.setAttributes(attributes); + eredto.getEntities().add(extractionDTO); + eres.add(eredto); + } + for (TruncationRelationExtraction relationExtraction : truncationRelationExtractions) { + EREDTO eredto = new EREDTO(); + RelationExtractionDTO extractionDTO = new RelationExtractionDTO(relationExtraction); + List attributes = truncationErAttributes.stream() + .filter(t -> StrUtil.equals(relationExtraction.getId(), t.getTerId())).map(ERAttributeDTO::new).collect(Collectors.toList()); + extractionDTO.setAttributes(attributes); + eredto.getRelations().add(extractionDTO); + eres.add(eredto); + } + return eres; + } + } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java index 263e0cc..a15ff2b 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java @@ -3,6 +3,7 @@ package com.supervision.pdfqaserver.service.impl; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.lang.Assert; import cn.hutool.core.util.BooleanUtil; +import cn.hutool.core.util.RandomUtil; import cn.hutool.core.util.StrUtil; import com.supervision.pdfqaserver.cache.PromptCache; import com.supervision.pdfqaserver.constant.LayoutTypeEnum; @@ -62,9 +63,40 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { truncateDTOS.add(truncateDTO); } } else if (LayoutTypeEnum.TABLE.getCode() == layoutType) { - // 如果是表格类型的布局,直接添加到列表中 - TruncateDTO truncateDTO = new TruncateDTO(documentDTO); - truncateDTOS.add(truncateDTO); + // 如果是表格类型的布局,进行切分 + + // 提前抽取表名 + TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle()); + if (null != tableTitleDTO && StrUtil.isNotEmpty(tableTitleDTO.getTitle())){ + documentDTO.setTitle(tableTitleDTO.getTitle()); + }else { + // 生成一个默认的表 + documentDTO.setTitle("tableName-"+ RandomUtil.randomString(10)); + } + List tableRows = StrUtil.split(documentDTO.getContent(), "\n").stream().filter(StrUtil::isNotEmpty).collect(Collectors.toList()); + if (tableRows.size()<5){ + TruncateDTO truncateDTO = new TruncateDTO(documentDTO); + truncateDTOS.add(truncateDTO); + continue; + } + String tableTitle = tableRows.get(0); + // 标题分割符 + String tableTitleSplit = tableRows.get(1); + List noTitleRows = tableRows.subList(2,tableRows.size()-1); + List> rows = CollUtil.split(noTitleRows, 4); + for (List row : rows) { + StringBuilder sb = new StringBuilder(); + sb.append(tableTitle).append("\n"); + sb.append(tableTitleSplit).append("\n"); + for (String s : row) { + sb.append(s).append("\n"); + } + TruncateDTO truncateDTO = new TruncateDTO(documentDTO); + truncateDTO.setContent(sb.toString()); + truncateDTOS.add(truncateDTO); + } + + } else { log.info("sliceDocuments:错误的布局类型: {}", layoutType); } @@ -89,9 +121,10 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { if (classify){ return doTextEre(truncateDTO); } + return doTableEre(truncateDTO); } - log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType()); + log.warn("doEre:错误的布局类型: {}", truncateDTO.getLayoutType()); return null; } @@ -118,7 +151,14 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { @Override public TableTitleDTO extractTableTitle(String content) { - return null; + TableTitleDTO tableTitleDTO = new TableTitleDTO(); + if (StrUtil.isEmpty(content)){ + log.warn("extractTableTitle:内容为空"); + return tableTitleDTO; + } + String table = PromptCache.promptMap.get(PromptCache.EXTRACT_TABLE_TITLE); + tableTitleDTO.setTitle(table); + return tableTitleDTO; } private EREDTO doTextEre(TruncateDTO truncateDTO) { @@ -140,6 +180,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { // 手动设置表格标题 EntityExtractionDTO titleEntity = new EntityExtractionDTO(); titleEntity.setEntity("表"); + titleEntity.setTruncationId(truncateDTO.getId()); titleEntity.setName(truncateDTO.getTitle()); // 添加关系 List relations = new ArrayList<>(); diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java index 0cb970f..d897037 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java @@ -62,6 +62,7 @@ public class TripleToCypherExecutorImpl implements TripleToCypherExecutor { Map attributes = entity.getAttributes().stream().collect(Collectors.toMap( ERAttributeDTO::getAttributeEn, ERAttributeDTO::getValue )); + attributes.put("truncationId", entity.getTruncationId()); attributes.put("name", entity.getName()); log.info("保存节点{},属性:{}", entity.getEntityEn(),JSONUtil.toJsonStr(entity.getAttributes())); List nodeIds = neo4jRepository.saveOrUpdateEntityNode(entity.getEntityEn(), "name", attributes); @@ -86,6 +87,8 @@ public class TripleToCypherExecutorImpl implements TripleToCypherExecutor { Map attributes = relation.getAttributes().stream().collect(Collectors.toMap( ERAttributeDTO::getAttributeEn, ERAttributeDTO::getValue )); + attributes.put("sourceType", relation.getSourceType()); + attributes.put("truncationId", relation.getTruncationId()); for (Long sourceNodeId : sourceNodeIds) { for (Long targetNodeId : targetNodeIds) { if (sourceNodeId.equals(targetNodeId)) { diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java index 8295f13..7671764 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java @@ -13,7 +13,7 @@ import com.supervision.pdfqaserver.service.TruncationErAttributeService; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; - +import java.util.ArrayList; import java.util.List; /** @@ -65,6 +65,14 @@ public class TruncationEntityExtractionServiceImpl extends ServiceImpl queryByTruncationIds(List truncationIds) { + if (CollUtil.isEmpty(truncationIds)){ + return new ArrayList<>(); + } + return this.lambdaQuery().in(TruncationEntityExtraction::getTruncationId, truncationIds).list(); + } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationErAttributeServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationErAttributeServiceImpl.java index d5f2735..a3bf6ed 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationErAttributeServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationErAttributeServiceImpl.java @@ -7,6 +7,7 @@ import com.supervision.pdfqaserver.domain.TruncationErAttribute; import com.supervision.pdfqaserver.service.TruncationErAttributeService; import com.supervision.pdfqaserver.mapper.TruncationErAttributeMapper; import org.springframework.stereotype.Service; +import java.util.ArrayList; import java.util.List; /** @@ -34,6 +35,14 @@ public class TruncationErAttributeServiceImpl extends ServiceImpl queryByTerIds(List terIds) { + if (CollUtil.isEmpty(terIds)){ + return new ArrayList<>(); + } + return this.lambdaQuery().in(TruncationErAttribute::getTerId, terIds).list(); + } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java index b9e9168..1421a61 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java @@ -13,6 +13,7 @@ import com.supervision.pdfqaserver.mapper.TruncationRelationExtractionMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; +import java.util.ArrayList; import java.util.List; /** @@ -60,6 +61,14 @@ public class TruncationRelationExtractionServiceImpl extends ServiceImpl queryByTruncationIds(List documentIds) { + if (CollUtil.isEmpty(documentIds)){ + return new ArrayList<>(); + } + return this.lambdaQuery().in(TruncationRelationExtraction::getTruncationId, documentIds).list(); + } } diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml index 28b28c3..290c5e1 100644 --- a/src/main/resources/logback.xml +++ b/src/main/resources/logback.xml @@ -44,7 +44,7 @@ 100MB - 2 + 30 500MB @@ -72,7 +72,7 @@ 100MB - 2 + 30 1GB @@ -99,7 +99,7 @@ 100MB - 2 + 30 500MB @@ -126,7 +126,7 @@ 100MB - 2 + 30 500MB diff --git a/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java b/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java index 7766570..60c5253 100644 --- a/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java +++ b/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java @@ -1,5 +1,6 @@ package com.supervision.pdfqaserver; +import com.supervision.pdfqaserver.dto.EREDTO; import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator; import com.supervision.pdfqaserver.service.KnowledgeGraphService; import com.supervision.pdfqaserver.service.TripleConversionPipeline; @@ -22,7 +23,15 @@ class PdfQaServerApplicationTests { private KnowledgeGraphService knowledgeGraphService; @Test void generateGraphTest() { - knowledgeGraphService.generateGraph("1"); + knowledgeGraphService.generateGraph("40"); + log.info("finish..."); + } + + @Test + void testGenerateGraph2() { + List eredtos = knowledgeGraphService.listPdfEREDTO("17"); + + knowledgeGraphService.generateGraph(eredtos); log.info("finish..."); }