From 7f5c52546a6d6505b35573c011a7ad4872bf314c Mon Sep 17 00:00:00 2001 From: xueqingkun Date: Mon, 28 Apr 2025 16:55:08 +0800 Subject: [PATCH] =?UTF-8?q?generateGraph=20=E5=8A=9F=E8=83=BD=E5=88=9D?= =?UTF-8?q?=E5=A7=8B=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pdfqaserver/cache/PromptCache.java | 37 ++++++++++++++----- .../pdfqaserver/domain/PdfAnalysisOutput.java | 2 +- .../pdfqaserver/dto/DocumentDTO.java | 6 +-- .../supervision/pdfqaserver/dto/EREDTO.java | 8 ++-- .../service/PdfAnalysisOutputService.java | 2 +- .../impl/ChinesEsToEnglishGeneratorImpl.java | 6 ++- .../impl/DocumentTruncationServiceImpl.java | 6 ++- .../impl/KnowledgeGraphServiceImpl.java | 19 +++++++++- .../impl/PdfAnalysisOutputServiceImpl.java | 4 +- .../impl/TripleConversionPipelineImpl.java | 32 ++++++++++++---- src/main/resources/application.yml | 2 +- .../mapper/PdfAnalysisOutputMapper.xml | 4 +- .../PdfQaServerApplicationTests.java | 10 ++++- 13 files changed, 101 insertions(+), 37 deletions(-) diff --git a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java index a8e2c8c..af8111a 100644 --- a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java +++ b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java @@ -29,7 +29,7 @@ public class PromptCache { private static final String DOERE_TEXT_PROMPT = """ - 你是一个高级信息抽取引擎,请从给定文本中提取以下结构化信息并以JSON格式输出: + 你是一个高级信息抽取引擎,请从给定文本中提取以下结构化信息并以JSON数据输出,不要进行解释: 1. **节点提取**: - 识别所有实体作为节点 @@ -45,8 +45,8 @@ public class PromptCache { - 生成由 (头节点类型, 关系类型, 尾节点类型) 组成的元组 **输出要求**: + - 输出纯JSON格式,不要使用```json ```等任何Markdown标记包装 - 使用如下JSON Schema: - { "nodes": [ { @@ -138,7 +138,7 @@ public class PromptCache { """; private static final String DOERE_TABLE_PROMPT = """ - 你是一个表格数据处理专家,请严格按以下要求从给出的表格中提取数据: + 你是一个表格数据处理专家,请严格按以下要求从给出的表格中提取数据,直接给出结果,不进行解释: **处理规则:** 1. 完全保留原始表头字段名称,不做任何中英文转换或修改 @@ -147,7 +147,6 @@ public class PromptCache { 4. 表格第一列作为主键字段 **输出格式:** - ```json { "table_data": [ { @@ -158,7 +157,7 @@ public class PromptCache { // 后续行... ] } - ``` + **示例表格:** | 账龄 | 期末余额 | 年初余额 | @@ -189,9 +188,31 @@ public class PromptCache { private static final String CHINESE_TO_ENGLISH_PROMPT = """ - 你是一个表格数据处理专家,请严格按以下要求从给出的表格中提取数据: + 你是一个Neo4j图数据库命名规范转换专家,请将以下中文短语转换为符合Neo4j命名规范的英文名称。要求: - + 1. **命名规范**: + - 使用`UpperCamelCase`命名实体(如`ProductCategory`) + - 使用`SCREAMING_SNAKE_CASE`命名关系(如`IS_RELATED_TO`) + - 保留数字原样(如`2023`→`2023`) + - 禁止特殊字符(如空格、括号、引号等) + - 优先选择技术领域通用术语 + + 2. **转换规则**: + - 直译或意译均可,但需确保语义清晰 + - 若中文含多义词,选择最贴近技术场景的译法 + - 对品牌/专有名词保留原始英文(如"腾讯"→`Tencent`) + + 3. **输入输出示例**: + - 输入: "用户订单" → 输出: `UserOrder`(实体) + - 输入: "属于2023年" → 输出: `BELONGS_TO_2023`(关系) + - 输入: "5G网络设备" → 输出: `5GNetworkDevice`(实体) + - 输入: "评分大于90" → 输出: `SCORE_ABOVE_90`(关系) + + 4. **待转换文本**: + {} + + 5. **输出要求**: + 只需返回转换后的英文名称,无需解释。 """; @@ -203,12 +224,10 @@ public class PromptCache { 4. 返回完整的Cypher语句,不要解释。 ### 输入三元组示例 - ```json [ {"source": "人物","sourceType": "Person", "relation": "创始人", "relationType": "FOUNDED","target": "公司","targetType": "Company"}, {"source": "公司","sourceType": "Company ", "relation": "位于", "relationType": "LOCATED_IN","target": "城市","targetType": "City "} ] - ``` ### 输出示例 diff --git a/src/main/java/com/supervision/pdfqaserver/domain/PdfAnalysisOutput.java b/src/main/java/com/supervision/pdfqaserver/domain/PdfAnalysisOutput.java index 4f60a67..7bde48a 100644 --- a/src/main/java/com/supervision/pdfqaserver/domain/PdfAnalysisOutput.java +++ b/src/main/java/com/supervision/pdfqaserver/domain/PdfAnalysisOutput.java @@ -47,7 +47,7 @@ public class PdfAnalysisOutput implements Serializable { /** * 内容在pdf页面中的顺序,越小表示顺序越靠前 */ - private Integer order; + private Integer displayOrder; /** * diff --git a/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java index f05621d..85bee8b 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java @@ -29,7 +29,7 @@ public class DocumentDTO { /** * 内容在pdf页面中的顺序,越小表示顺序越靠前 */ - private Integer layoutOrder; + private Integer displayOrder; private String title; @@ -49,13 +49,13 @@ public class DocumentDTO { } public DocumentDTO(PdfAnalysisOutput pdfAnalysisOutput) { - this.id = pdfAnalysisOutput.getPdfId().toString(); + this.id = pdfAnalysisOutput.getId().toString(); this.sectionId = pdfAnalysisOutput.getId(); this.layoutType = pdfAnalysisOutput.getLayoutType(); this.pageNo = pdfAnalysisOutput.getPageNo(); this.title = pdfAnalysisOutput.getTableTitle(); this.content = pdfAnalysisOutput.getContent(); - this.layoutOrder = pdfAnalysisOutput.getOrder(); + this.displayOrder = pdfAnalysisOutput.getDisplayOrder(); } diff --git a/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java index cf04e13..4475cf3 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java @@ -37,17 +37,17 @@ public class EREDTO { String name = nodeJson.getString("name"); String type = nodeJson.getString("type"); JSONObject attributes = nodeJson.getJSONObject("attributes"); + List erAttributeDTOS = new ArrayList<>(); if (CollUtil.isNotEmpty(attributes)){ - List erAttributeDTOS = new ArrayList<>(); for (String key : attributes.keySet()) { Object value = attributes.get(key); String valueString = attributes.getString(key); ERAttributeDTO erAttributeDTO = new ERAttributeDTO(key, valueString, value instanceof Number?"1":"0"); erAttributeDTOS.add(erAttributeDTO); } - EntityExtractionDTO entityExtraction = new EntityExtractionDTO(truncationId,name,type, erAttributeDTOS); - entities.add(entityExtraction); } + EntityExtractionDTO entityExtraction = new EntityExtractionDTO(truncationId,name,type, erAttributeDTOS); + entities.add(entityExtraction); } } if (CollUtil.isNotEmpty(relations)){ @@ -106,7 +106,7 @@ public class EREDTO { continue; } EntityExtractionDTO entityExtractionDTO = new EntityExtractionDTO(); - entityExtractionDTO.setEntity("row"); + entityExtractionDTO.setEntity("行"); entityExtractionDTO.setName("row"); entityExtractionDTO.setTruncationId(truncationId); List erAttributeDTOS = new ArrayList<>(); diff --git a/src/main/java/com/supervision/pdfqaserver/service/PdfAnalysisOutputService.java b/src/main/java/com/supervision/pdfqaserver/service/PdfAnalysisOutputService.java index f534c5f..76feb6a 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/PdfAnalysisOutputService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/PdfAnalysisOutputService.java @@ -12,5 +12,5 @@ import java.util.List; */ public interface PdfAnalysisOutputService extends IService { - List queryByPdfId(String pdfId); + List queryByPdfId(Integer pdfId); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java index adefdcf..66250b8 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java @@ -1,5 +1,6 @@ package com.supervision.pdfqaserver.service.impl; +import cn.hutool.core.util.StrUtil; import com.supervision.pdfqaserver.cache.PromptCache; import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator; import lombok.RequiredArgsConstructor; @@ -18,7 +19,8 @@ public class ChinesEsToEnglishGeneratorImpl implements ChinesEsToEnglishGenerato public String generate(String chinese) { log.info("generate:开始翻译: {}",chinese); String prompt = PromptCache.promptMap.get(CHINESE_TO_ENGLISH); - ollamaChatModel.call("请将以下中文翻译成英文: " + chinese); - return null; + String response = ollamaChatModel.call(StrUtil.format(prompt, chinese)); + log.info("generate:chinese:{}翻译结果: {}",chinese,response); + return response; } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java index 390b2f4..aa35308 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/DocumentTruncationServiceImpl.java @@ -24,7 +24,11 @@ public class DocumentTruncationServiceImpl extends ServiceImpl pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(documentId); + List pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(documentId)); if (CollUtil.isEmpty(pdfAnalysisOutputs)) { log.info("没有找到pdfId为{}的pdf分析结果", documentId); return; } List documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList(); // 对文档进行切分 + TimeInterval timer = new TimeInterval(); + timer.start("sliceDocuments"); + log.info("开始切分文档,初始文档个数:{}",documentDTOList.size()); List truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList); + log.info("切分文档完成,切分后文档个数:{},耗时:{}秒",truncateDTOS.size(), timer.intervalSecond("sliceDocuments")); // 保存分片信息 documentTruncationService.batchSave(truncateDTOS); // 对切分后的文档进行命名实体识别 + timer.start("doEre"); + log.info("开始命名实体识别..."); List eredtoList = new ArrayList<>(); for (TruncateDTO truncateDTO : truncateDTOS) { EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO); @@ -59,12 +66,17 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { } // 保存实体关系抽取结果 this.saveERE(eredto, truncateDTO.getId()); + eredtoList.add(eredto); } + log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre")); // 合并实体关系抽取结果 + log.info("开始合并实体关系抽取结果..."); List mergedList = tripleConversionPipeline.mergeEreResults(eredtoList); + log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size()); // 保存领域元数据 + log.info("开始保存领域元数据..."); for (EREDTO eredto : mergedList) { List relations = eredto.getRelations(); if (CollUtil.isEmpty(relations)){ @@ -77,9 +89,12 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { domainMetadataService.saveIfNotExists(domainMetadata); } } + log.info("保存领域元数据完成"); // 保存字典 + log.info("开始保存字典..."); List allWords = chineseEnglishWordsService.queryAll(); + int wordsSize = allWords.size(); for (EREDTO eredto : mergedList) { List entities = eredto.getEntities(); if (CollUtil.isNotEmpty(entities)){ @@ -94,7 +109,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { } } } - + log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize); // 生成cypher语句 for (EREDTO eredto : mergedList) { eredto.setEn(allWords); diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/PdfAnalysisOutputServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/PdfAnalysisOutputServiceImpl.java index c08f7e6..4471600 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/PdfAnalysisOutputServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/PdfAnalysisOutputServiceImpl.java @@ -19,8 +19,8 @@ public class PdfAnalysisOutputServiceImpl extends ServiceImpl queryByPdfId(String pdfId) { - Assert.notEmpty(pdfId, "pdfId不能为空"); + public List queryByPdfId(Integer pdfId) { + Assert.notNull(pdfId, "pdfId不能为空"); return super.lambdaQuery().eq(PdfAnalysisOutput::getPdfId, pdfId).list(); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java index 5c4ef14..d18ebc6 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java @@ -30,7 +30,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { // 先对pageNo进行排序再对layoutOrder进行排序 (o1, o2) -> { if (o1.getPageNo().equals(o2.getPageNo())) { - return Integer.compare(o1.getLayoutOrder(), o2.getLayoutOrder()); + return Integer.compare(o1.getDisplayOrder(), o2.getDisplayOrder()); } return Integer.compare(o1.getPageNo(), o2.getPageNo()); } @@ -72,12 +72,12 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { @Override public EREDTO doEre(TruncateDTO truncateDTO) { - if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){ + if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){ return doTextEre(truncateDTO); } - if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){ + if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){ return doTableEre(truncateDTO); } log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType()); @@ -85,21 +85,37 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { } private EREDTO doTextEre(TruncateDTO truncateDTO) { + log.info("doTextEre:开始进行文本实体关系抽取,内容:{}", truncateDTO.getContent()); String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT); - String formatted = String.format(prompt, truncateDTO.getContent()); + String formatted = StrUtil.format(prompt, truncateDTO.getContent()); String response = ollamaChatModel.call(formatted); // todo:暂时不去处理异常返回 - + log.info("doTextEre响应结果:{}", response); return EREDTO.fromTextJson(response, truncateDTO.getId()); } private EREDTO doTableEre(TruncateDTO truncateDTO) { + log.info("doTableEre:开始进行表格实体关系抽取,内容:{}", truncateDTO.getContent()); String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE); - String formatted = String.format(prompt, truncateDTO.getContent()); + String formatted = StrUtil.format(prompt, truncateDTO.getContent()); String response = ollamaChatModel.call(formatted); + log.info("doTableEre响应结果:{}", response); // todo:暂时不去处理异常返回 - - return EREDTO.fromTableJson(response, truncateDTO.getId()); + EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId()); + EntityExtractionDTO titleEntity = new EntityExtractionDTO(); + titleEntity.setEntity("表"); + titleEntity.setName(truncateDTO.getTitle()); + // + // 添加关系 + ArrayList relations = new ArrayList<>(); + for (EntityExtractionDTO entity : eredto.getEntities()) { + RelationExtractionDTO relationExtractionDTO = new RelationExtractionDTO(truncateDTO.getId(), + titleEntity.getEntity(), titleEntity.getName(), "包含", entity.getEntity(), entity.getName(), entity.getAttributes()); + relations.add(relationExtractionDTO); + } + eredto.getEntities().add(titleEntity); + eredto.setRelations(relations); + return eredto; } /** diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 17e179f..03e8cf4 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -17,7 +17,7 @@ spring: chat: model: qwen2.5:32b options: - max_tokens: 512 + max_tokens: 51200 top_p: 0.9 top_k: 40 temperature: 0.7 diff --git a/src/main/resources/mapper/PdfAnalysisOutputMapper.xml b/src/main/resources/mapper/PdfAnalysisOutputMapper.xml index 87a9397..5dcb5f6 100644 --- a/src/main/resources/mapper/PdfAnalysisOutputMapper.xml +++ b/src/main/resources/mapper/PdfAnalysisOutputMapper.xml @@ -11,13 +11,13 @@ - + id,layout_type,content, page_no,pdf_id,table_title, - order,create_time + display_order,create_time diff --git a/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java b/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java index 9d1600a..2abdc70 100644 --- a/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java +++ b/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java @@ -1,13 +1,21 @@ package com.supervision.pdfqaserver; +import com.supervision.pdfqaserver.service.KnowledgeGraphService; +import lombok.extern.slf4j.Slf4j; import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; +@Slf4j @SpringBootTest class PdfQaServerApplicationTests { + @Autowired + private KnowledgeGraphService knowledgeGraphService; @Test - void contextLoads() { + void generateGraphTest() { + knowledgeGraphService.generateGraph("1"); + log.info("finish..."); } }