From 830acca35d5481b2ff652176b988799435228d11 Mon Sep 17 00:00:00 2001 From: xueqingkun Date: Mon, 28 Apr 2025 14:55:40 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 5 + .../pdfqaserver/cache/PromptCache.java | 53 ++++++ .../constant/DomainMetaGenerationEnum.java | 23 +++ .../pdfqaserver/constant/LayoutTypeEnum.java | 26 +++ .../domain/DocumentTruncation.java | 3 +- .../pdfqaserver/dto/DocumentDTO.java | 13 +- .../pdfqaserver/dto/ERAttributeDTO.java | 11 ++ .../supervision/pdfqaserver/dto/EREDTO.java | 22 ++- .../pdfqaserver/dto/EntityExtractionDTO.java | 5 + .../dto/RelationExtractionDTO.java | 27 +++ .../pdfqaserver/dto/TruncateDTO.java | 13 +- .../service/ChinesEsToEnglishGenerator.java | 6 + .../service/ChineseEnglishWordsService.java | 8 + .../service/DomainMetadataService.java | 2 + .../TruncationEntityExtractionService.java | 4 + .../TruncationRelationExtractionService.java | 3 + .../impl/ChinesEsToEnglishGeneratorImpl.java | 24 +++ .../impl/ChineseEnglishWordsServiceImpl.java | 22 +++ .../impl/DomainMetadataServiceImpl.java | 11 ++ .../impl/KnowledgeGraphServiceImpl.java | 73 ++++++++- .../impl/TripleConversionPipelineImpl.java | 154 ++++++++++++++++-- .../impl/TripleToCypherExecutorImpl.java | 7 +- ...TruncationEntityExtractionServiceImpl.java | 34 ++++ ...uncationRelationExtractionServiceImpl.java | 31 ++++ .../mapper/DocumentTruncationMapper.xml | 2 +- 25 files changed, 546 insertions(+), 36 deletions(-) create mode 100644 src/main/java/com/supervision/pdfqaserver/constant/DomainMetaGenerationEnum.java create mode 100644 src/main/java/com/supervision/pdfqaserver/constant/LayoutTypeEnum.java create mode 100644 src/main/java/com/supervision/pdfqaserver/service/ChinesEsToEnglishGenerator.java create mode 100644 src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java diff --git a/pom.xml 
b/pom.xml index 34ad1f6..8e4f1ec 100644 --- a/pom.xml +++ b/pom.xml @@ -80,6 +80,11 @@ jackson-databind 2.15.3 + + edu.stanford.nlp + stanford-corenlp + 4.5.4 + diff --git a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java index 25e2613..a8e2c8c 100644 --- a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java +++ b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java @@ -10,6 +10,10 @@ public class PromptCache { public static final String DOERE_TEXT = "DOERE_TEXT"; public static final String DOERE_TABLE = "DOERE_TABLE"; + + public static final String CHINESE_TO_ENGLISH = "CHINESE_TO_ENGLISH"; + + public static final String ERE_TO_INSERT_CYPHER = "ERE_TO_INSERT_CYPHER"; public static final Map promptMap = new HashMap<>(); static { @@ -18,6 +22,8 @@ public class PromptCache { private static void init(){ promptMap.put(DOERE_TEXT, DOERE_TEXT_PROMPT); promptMap.put(DOERE_TABLE, DOERE_TABLE_PROMPT); + promptMap.put(CHINESE_TO_ENGLISH, CHINESE_TO_ENGLISH_PROMPT); + promptMap.put(ERE_TO_INSERT_CYPHER, ERE_TO_INSERT_CYPHER_PROMPT); } @@ -182,4 +188,51 @@ public class PromptCache { """; + private static final String CHINESE_TO_ENGLISH_PROMPT = """ + 你是一个表格数据处理专家,请严格按以下要求从给出的表格中提取数据: + + + """; + + + private static final String ERE_TO_INSERT_CYPHER_PROMPT = """ + 请将以下三元组数据转换为Neo4j的Cypher语句,要求: + 1. **节点**用`(n:Label {name: "Value"})`表示,其中`Label`是实体类型(如`Person`、`Company`); + 2. **关系**用`[r:RELATION_TYPE]`表示,保持与三元组中关系一致; + 3. 如果节点或关系已存在,使用`MERGE`避免重复创建; + 4. 
返回完整的Cypher语句,不要解释。 + + ### 输入三元组示例 + ```json + [ + {"source": "人物","sourceType": "Person", "relation": "创始人", "relationType": "FOUNDED","target": "公司","targetType": "Company"}, + {"source": "公司","sourceType": "Company ", "relation": "位于", "relationType": "LOCATED_IN","target": "城市","targetType": "City "} + ] + ``` + + ### 输出示例 + + MERGE (p:Person {name: "人物"}) + MERGE (c:Company {name: "公司"}) + MERGE (city:City {name: "城市"}) + MERGE (p)-[r1:FOUNDED]->(c) + MERGE (c)-[r2:LOCATED_IN]->(city) + + ### 规则补充 + 1. 实体类型映射: + - "人物" → `Person` + - "公司" → `Company` + - "城市" → `City` + 2. 关系类型映射: + - "创始人" → `FOUNDED` + - "位于" → `LOCATED_IN` + 3. 属性统一用`name`字段存储实体名称。 + + ### 禁止行为 + 1. 不要为关系添加属性(除非明确提供); + 2. 不要使用中文标签(如`人物`→`Person`); + 3. 不要省略MERGE的安全约束。 + ### 请转换以下三元组: + {} + """; } diff --git a/src/main/java/com/supervision/pdfqaserver/constant/DomainMetaGenerationEnum.java b/src/main/java/com/supervision/pdfqaserver/constant/DomainMetaGenerationEnum.java new file mode 100644 index 0000000..f9c5fd6 --- /dev/null +++ b/src/main/java/com/supervision/pdfqaserver/constant/DomainMetaGenerationEnum.java @@ -0,0 +1,23 @@ +package com.supervision.pdfqaserver.constant; + +import lombok.Getter; + +/** + * 域元数据生成枚举类 + */ +@Getter +public enum DomainMetaGenerationEnum { + // 0=手动录入,1=系统自动 + DOM_MANUAL_ENTRY("0", "手动录入"), + + SYSTEM_AUTO_GENERATION("1", "系统自动"); + + + private final String code; + private final String name; + + DomainMetaGenerationEnum(String code, String name) { + this.code = code; + this.name = name; + } +} diff --git a/src/main/java/com/supervision/pdfqaserver/constant/LayoutTypeEnum.java b/src/main/java/com/supervision/pdfqaserver/constant/LayoutTypeEnum.java new file mode 100644 index 0000000..95edb30 --- /dev/null +++ b/src/main/java/com/supervision/pdfqaserver/constant/LayoutTypeEnum.java @@ -0,0 +1,26 @@ +package com.supervision.pdfqaserver.constant; + +import lombok.Getter; + +@Getter +public enum LayoutTypeEnum { + /** + * 文本 + */ + TEXT(0, "文本"), + + /** + * 
表格 + */ + TABLE(1, "表格"); + + + private final int code; + private final String name; + + LayoutTypeEnum(int code, String name) { + this.code = code; + this.name = name; + } + +} diff --git a/src/main/java/com/supervision/pdfqaserver/domain/DocumentTruncation.java b/src/main/java/com/supervision/pdfqaserver/domain/DocumentTruncation.java index 4145bc2..7ada473 100644 --- a/src/main/java/com/supervision/pdfqaserver/domain/DocumentTruncation.java +++ b/src/main/java/com/supervision/pdfqaserver/domain/DocumentTruncation.java @@ -1,7 +1,6 @@ package com.supervision.pdfqaserver.domain; import com.baomidou.mybatisplus.annotation.*; - import java.io.Serializable; import java.time.LocalDateTime; import lombok.Data; @@ -27,7 +26,7 @@ public class DocumentTruncation implements Serializable { /** * 段落id pdf_analysis_output表的id */ - private String sectionId; + private Integer sectionId; /** * 布局类型 0-文本 1-表格 diff --git a/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java index c3af439..f05621d 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/DocumentDTO.java @@ -14,8 +14,9 @@ public class DocumentDTO { */ private String id; - private Integer documentId; - + /** + * 段落id + */ private Integer sectionId; private Integer pageNo; @@ -23,7 +24,7 @@ public class DocumentDTO { /** * 内容类型 0:文本 1:表格 */ - private String layoutType; + private Integer layoutType; /** * 内容在pdf页面中的顺序,越小表示顺序越靠前 @@ -48,11 +49,9 @@ public class DocumentDTO { } public DocumentDTO(PdfAnalysisOutput pdfAnalysisOutput) { + this.id = pdfAnalysisOutput.getPdfId().toString(); this.sectionId = pdfAnalysisOutput.getId(); - this.documentId = pdfAnalysisOutput.getPdfId(); - if (null != pdfAnalysisOutput.getLayoutType()) { - this.layoutType = pdfAnalysisOutput.getLayoutType().toString(); - } + this.layoutType = pdfAnalysisOutput.getLayoutType(); this.pageNo = 
pdfAnalysisOutput.getPageNo(); this.title = pdfAnalysisOutput.getTableTitle(); this.content = pdfAnalysisOutput.getContent(); diff --git a/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java index 5064e7c..8bdd12d 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/ERAttributeDTO.java @@ -1,5 +1,6 @@ package com.supervision.pdfqaserver.dto; +import com.supervision.pdfqaserver.domain.TruncationErAttribute; import lombok.Data; /** @@ -43,4 +44,14 @@ public class ERAttributeDTO { this.value = value; this.dataType = dataType; } + + public TruncationErAttribute toTruncationErAttribute() { + TruncationErAttribute truncationErAttribute = new TruncationErAttribute(); + truncationErAttribute.setTerId(this.terId); + truncationErAttribute.setType(this.type); + truncationErAttribute.setAttribute(this.attribute); + truncationErAttribute.setValue(this.value); + truncationErAttribute.setDataType(this.dataType); + return truncationErAttribute; + } } diff --git a/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java index 6eda549..cf04e13 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/EREDTO.java @@ -4,6 +4,7 @@ import cn.hutool.core.collection.CollUtil; import cn.hutool.core.util.StrUtil; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; +import com.supervision.pdfqaserver.domain.ChineseEnglishWords; import lombok.Data; import lombok.extern.slf4j.Slf4j; @@ -16,9 +17,9 @@ import java.util.*; @Data public class EREDTO { - private List entities; + private List entities = new ArrayList<>(); - private List relations; + private List relations = new ArrayList<>(); public EREDTO() { } @@ -121,4 +122,21 @@ public class EREDTO { eredto.setEntities(entities); return 
eredto; } + + public void setEn(List wordsList) { + if (CollUtil.isEmpty(wordsList)){ + return; + } + for (EntityExtractionDTO entity : entities) { + String entityName = entity.getEntity(); + Optional first = wordsList.stream().filter(w -> StrUtil.equals(w.getChineseWord(), entityName)).findFirst(); + first.ifPresent(chineseEnglishWords -> entity.setEntityEn(chineseEnglishWords.getEnglishWord())); + } + for (RelationExtractionDTO relation : relations) { + String relationName = relation.getRelation(); + Optional first = wordsList.stream().filter(w -> StrUtil.equals(w.getChineseWord(), relationName)).findFirst(); + first.ifPresent(chineseEnglishWords -> relation.setRelationEn(chineseEnglishWords.getEnglishWord())); + } + + } } diff --git a/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java index 30cb118..e976070 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/EntityExtractionDTO.java @@ -18,6 +18,11 @@ public class EntityExtractionDTO { */ private String entity; + /** + * 实体英文名 + */ + private String entityEn; + /** * 实体名 */ diff --git a/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java index 1b35063..f01010d 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/RelationExtractionDTO.java @@ -1,5 +1,7 @@ package com.supervision.pdfqaserver.dto; +import com.supervision.pdfqaserver.domain.DomainMetadata; +import com.supervision.pdfqaserver.domain.TruncationRelationExtraction; import lombok.Data; import java.util.List; @@ -23,11 +25,15 @@ public class RelationExtractionDTO { */ private String sourceType; + private String sourceTypeEn; + /** *关系 */ private String relation; + private String relationEn; + /** * 尾节点数据 
*/ @@ -38,6 +44,8 @@ public class RelationExtractionDTO { */ private String targetType; + private String targetTypeEn; + private List attributes; public RelationExtractionDTO() { @@ -52,4 +60,23 @@ public class RelationExtractionDTO { this.sourceType = sourceType; this.targetType = targetType; } + + public TruncationRelationExtraction toTruncationRelationExtraction() { + TruncationRelationExtraction truncationRelationExtraction = new TruncationRelationExtraction(); + truncationRelationExtraction.setTruncationId(this.truncationId); + truncationRelationExtraction.setSource(this.source); + truncationRelationExtraction.setSourceType(this.sourceType); + truncationRelationExtraction.setRelation(this.relation); + truncationRelationExtraction.setTarget(this.target); + truncationRelationExtraction.setTargetType(this.targetType); + return truncationRelationExtraction; + } + + public DomainMetadata toDomainMetadata() { + DomainMetadata domainMetadata = new DomainMetadata(); + domainMetadata.setSourceType(this.sourceType); + domainMetadata.setRelation(this.relation); + domainMetadata.setTargetType(this.targetType); + return domainMetadata; + } } diff --git a/src/main/java/com/supervision/pdfqaserver/dto/TruncateDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/TruncateDTO.java index db4a5d2..1b0ed70 100644 --- a/src/main/java/com/supervision/pdfqaserver/dto/TruncateDTO.java +++ b/src/main/java/com/supervision/pdfqaserver/dto/TruncateDTO.java @@ -27,7 +27,7 @@ public class TruncateDTO { /** * 段落id pdf_analysis_output表的id */ - private String sectionId; + private Integer sectionId; /** * 表格标题 @@ -40,6 +40,17 @@ public class TruncateDTO { private String content; + public TruncateDTO() { + } + + public TruncateDTO(DocumentDTO documentDTO) { + this.documentId = Integer.parseInt(documentDTO.getId()); + this.sectionId = documentDTO.getSectionId(); + this.layoutType = documentDTO.getLayoutType().toString(); + this.title = documentDTO.getTitle(); + this.content = 
documentDTO.getContent(); + } + public DocumentTruncation toDocumentTruncation() { DocumentTruncation truncation = new DocumentTruncation(); truncation.setDocumentId(this.documentId); diff --git a/src/main/java/com/supervision/pdfqaserver/service/ChinesEsToEnglishGenerator.java b/src/main/java/com/supervision/pdfqaserver/service/ChinesEsToEnglishGenerator.java new file mode 100644 index 0000000..613c206 --- /dev/null +++ b/src/main/java/com/supervision/pdfqaserver/service/ChinesEsToEnglishGenerator.java @@ -0,0 +1,6 @@ +package com.supervision.pdfqaserver.service; + +public interface ChinesEsToEnglishGenerator { + + String generate(String chinese); +} diff --git a/src/main/java/com/supervision/pdfqaserver/service/ChineseEnglishWordsService.java b/src/main/java/com/supervision/pdfqaserver/service/ChineseEnglishWordsService.java index 3e25515..9c72c1b 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/ChineseEnglishWordsService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/ChineseEnglishWordsService.java @@ -2,6 +2,7 @@ package com.supervision.pdfqaserver.service; import com.supervision.pdfqaserver.domain.ChineseEnglishWords; import com.baomidou.mybatisplus.extension.service.IService; +import java.util.List; /** * @author Administrator @@ -10,4 +11,11 @@ import com.baomidou.mybatisplus.extension.service.IService; */ public interface ChineseEnglishWordsService extends IService { + + List queryAll(); + + + boolean wordsExists(String word, List wordsList); + + void saveIfNotExists(ChineseEnglishWords words); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/DomainMetadataService.java b/src/main/java/com/supervision/pdfqaserver/service/DomainMetadataService.java index c1f5e1b..59dc4cb 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/DomainMetadataService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/DomainMetadataService.java @@ -10,4 +10,6 @@ import 
com.baomidou.mybatisplus.extension.service.IService; */ public interface DomainMetadataService extends IService { + + void saveIfNotExists(DomainMetadata metadata); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java b/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java index 99f4ecc..ab06446 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/TruncationEntityExtractionService.java @@ -2,6 +2,9 @@ package com.supervision.pdfqaserver.service; import com.supervision.pdfqaserver.domain.TruncationEntityExtraction; import com.baomidou.mybatisplus.extension.service.IService; +import com.supervision.pdfqaserver.dto.EntityExtractionDTO; + +import java.util.List; /** * @author Administrator @@ -10,4 +13,5 @@ import com.baomidou.mybatisplus.extension.service.IService; */ public interface TruncationEntityExtractionService extends IService { + void saveERE(List entities); } diff --git a/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java b/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java index 666e7b7..0ff80e0 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java +++ b/src/main/java/com/supervision/pdfqaserver/service/TruncationRelationExtractionService.java @@ -2,6 +2,8 @@ package com.supervision.pdfqaserver.service; import com.supervision.pdfqaserver.domain.TruncationRelationExtraction; import com.baomidou.mybatisplus.extension.service.IService; +import com.supervision.pdfqaserver.dto.RelationExtractionDTO; +import java.util.List; /** * @author Administrator @@ -10,4 +12,5 @@ import com.baomidou.mybatisplus.extension.service.IService; */ public interface TruncationRelationExtractionService extends IService { + void saveERE(List relations); } diff --git 
a/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java new file mode 100644 index 0000000..adefdcf --- /dev/null +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/ChinesEsToEnglishGeneratorImpl.java @@ -0,0 +1,24 @@ +package com.supervision.pdfqaserver.service.impl; + +import com.supervision.pdfqaserver.cache.PromptCache; +import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.ai.ollama.OllamaChatModel; +import org.springframework.stereotype.Service; +import static com.supervision.pdfqaserver.cache.PromptCache.CHINESE_TO_ENGLISH; + +@Slf4j +@Service +@RequiredArgsConstructor +public class ChinesEsToEnglishGeneratorImpl implements ChinesEsToEnglishGenerator { + + private final OllamaChatModel ollamaChatModel; + @Override + public String generate(String chinese) { + log.info("generate:开始翻译: {}",chinese); + String prompt = PromptCache.promptMap.get(CHINESE_TO_ENGLISH); + ollamaChatModel.call("请将以下中文翻译成英文: " + chinese); + return null; + } +} diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/ChineseEnglishWordsServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/ChineseEnglishWordsServiceImpl.java index 57106ef..70ca89d 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/ChineseEnglishWordsServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/ChineseEnglishWordsServiceImpl.java @@ -1,10 +1,12 @@ package com.supervision.pdfqaserver.service.impl; +import cn.hutool.core.util.StrUtil; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; import com.supervision.pdfqaserver.domain.ChineseEnglishWords; import com.supervision.pdfqaserver.service.ChineseEnglishWordsService; import 
com.supervision.pdfqaserver.mapper.ChineseEnglishWordsMapper; import org.springframework.stereotype.Service; +import java.util.List; /** * @author Administrator @@ -15,6 +17,26 @@ import org.springframework.stereotype.Service; public class ChineseEnglishWordsServiceImpl extends ServiceImpl implements ChineseEnglishWordsService{ + @Override + public List queryAll() { + return this.list(); + } + + @Override + public boolean wordsExists(String word, List wordsList) { + if (StrUtil.isEmpty(word)){ + return true; + } + return wordsList.stream().anyMatch(w->StrUtil.equals(w.getChineseWord(),word)); + } + + @Override + public void saveIfNotExists(ChineseEnglishWords words) { + boolean exists = this.lambdaQuery().eq(ChineseEnglishWords::getChineseWord, words.getChineseWord()).exists(); + if (!exists){ + this.save(words); + } + } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/DomainMetadataServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/DomainMetadataServiceImpl.java index d246477..2f4219b 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/DomainMetadataServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/DomainMetadataServiceImpl.java @@ -15,6 +15,17 @@ import org.springframework.stereotype.Service; public class DomainMetadataServiceImpl extends ServiceImpl implements DomainMetadataService{ + @Override + public void saveIfNotExists(DomainMetadata metadata) { + + boolean exists = this.lambdaQuery() + .eq(DomainMetadata::getSourceType, metadata.getSourceType()) + .eq(DomainMetadata::getTargetType, metadata.getTargetType()) + .eq(DomainMetadata::getRelation, metadata.getRelation()).exists(); + if (!exists) { + this.save(metadata); + } + } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java index 06364c7..fac2c4a 100644 --- 
a/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java @@ -1,15 +1,16 @@ package com.supervision.pdfqaserver.service.impl; import cn.hutool.core.collection.CollUtil; -import com.supervision.pdfqaserver.dto.EREDTO; +import cn.hutool.core.util.StrUtil; +import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum; +import com.supervision.pdfqaserver.domain.ChineseEnglishWords; +import com.supervision.pdfqaserver.domain.DomainMetadata; +import com.supervision.pdfqaserver.dto.*; import com.supervision.pdfqaserver.domain.PdfAnalysisOutput; -import com.supervision.pdfqaserver.dto.DocumentDTO; -import com.supervision.pdfqaserver.dto.TruncateDTO; import com.supervision.pdfqaserver.service.*; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; - import java.util.ArrayList; import java.util.List; @@ -30,13 +31,11 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { private final PdfAnalysisOutputService pdfAnalysisOutputService; - private final PdfInfoService pdfInfoService; - private final TruncationEntityExtractionService truncationEntityExtractionService; private final TruncationRelationExtractionService relationExtractionService; - private final TruncationErAttributeService truncationErAttributeService; + private final ChinesEsToEnglishGenerator chinesEsToEnglishGenerator; @Override public void generateGraph(String documentId) { @@ -55,6 +54,9 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { List eredtoList = new ArrayList<>(); for (TruncateDTO truncateDTO : truncateDTOS) { EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO); + if (null == eredto){ + continue; + } // 保存实体关系抽取结果 this.saveERE(eredto, truncateDTO.getId()); } @@ -62,13 +64,63 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService { 
// 合并实体关系抽取结果 List mergedList = tripleConversionPipeline.mergeEreResults(eredtoList); + // 保存领域元数据 for (EREDTO eredto : mergedList) { - String insertCypher = tripleToCypherExecutor.generateInsertCypher(eredto); + List relations = eredto.getRelations(); + if (CollUtil.isEmpty(relations)){ + continue; + } + for (RelationExtractionDTO relation : relations) { + DomainMetadata domainMetadata = relation.toDomainMetadata(); + domainMetadata.setDomainType("1"); + domainMetadata.setGenerationType(DomainMetaGenerationEnum.SYSTEM_AUTO_GENERATION.getCode()); + domainMetadataService.saveIfNotExists(domainMetadata); + } + } + // 保存字典 + List allWords = chineseEnglishWordsService.queryAll(); + for (EREDTO eredto : mergedList) { + List entities = eredto.getEntities(); + if (CollUtil.isNotEmpty(entities)){ + for (EntityExtractionDTO entityDTO : entities) { + saveWordsIfNecessary(entityDTO.getEntity(), allWords); + } + } + List relations = eredto.getRelations(); + if (CollUtil.isNotEmpty(relations)){ + for (RelationExtractionDTO relationDTO : relations) { + saveWordsIfNecessary(relationDTO.getRelation(), allWords); + } + } + } + + // 生成cypher语句 + for (EREDTO eredto : mergedList) { + eredto.setEn(allWords); + String insertCypher = tripleToCypherExecutor.generateInsertCypher(eredto); + log.info("insertCypher:{}", insertCypher); tripleToCypherExecutor.executeCypher(insertCypher); } + } + + private void saveWordsIfNecessary(String word, List allWords) { + boolean exists = chineseEnglishWordsService.wordsExists(word, allWords); + if (exists){ + return; + } + String generate = chinesEsToEnglishGenerator.generate(word); + if (StrUtil.isEmpty(generate)){ + log.info("生成英文名称失败,entity:{}", word); + return; + } + ChineseEnglishWords words = new ChineseEnglishWords(); + words.setChineseWord(word); + words.setEnglishWord(generate); + chineseEnglishWordsService.saveIfNotExists(words); + allWords.add(words);// 更新缓存 } @Override @@ -79,6 +131,11 @@ public class KnowledgeGraphServiceImpl implements 
KnowledgeGraphService { @Override public void saveERE(EREDTO eredto, String truncationId) { + // 保存实体信息 + truncationEntityExtractionService.saveERE(eredto.getEntities()); + + // 保存关系 + relationExtractionService.saveERE(eredto.getRelations()); } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java index 57899ca..5c4ef14 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java @@ -3,15 +3,19 @@ package com.supervision.pdfqaserver.service.impl; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.util.StrUtil; import com.supervision.pdfqaserver.cache.PromptCache; +import com.supervision.pdfqaserver.constant.LayoutTypeEnum; import com.supervision.pdfqaserver.dto.*; import com.supervision.pdfqaserver.service.TripleConversionPipeline; +import edu.stanford.nlp.pipeline.CoreDocument; +import edu.stanford.nlp.pipeline.CoreSentence; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.ai.ollama.OllamaChatModel; import org.springframework.stereotype.Service; -import java.util.ArrayList; -import java.util.List; +import java.util.*; + @Slf4j @Service @RequiredArgsConstructor @@ -31,7 +35,38 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { return Integer.compare(o1.getPageNo(), o2.getPageNo()); } ).toList(); - return null; + + Properties props = new Properties(); + props.setProperty("annotators", "tokenize, ssplit"); + // 创建管道 + StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + List truncateDTOS = new ArrayList<>(); + for (DocumentDTO documentDTO : documentDTOList) { + String content = documentDTO.getContent(); + if (StrUtil.isEmpty(content)){ + 
continue; + } + Integer layoutType = documentDTO.getLayoutType(); + if (LayoutTypeEnum.TEXT.getCode() == layoutType){ + // 如果是文本类型的布局,进行合并 + CoreDocument document = new CoreDocument(content); + // 分析文本 + pipeline.annotate(document); + // 获取句子 + for (CoreSentence sentence : document.sentences()) { + TruncateDTO truncateDTO = new TruncateDTO(documentDTO); + truncateDTO.setContent(sentence.text()); + truncateDTOS.add(truncateDTO); + } + } else if (LayoutTypeEnum.TABLE.getCode() == layoutType) { + // 如果是表格类型的布局,直接添加到列表中 + TruncateDTO truncateDTO = new TruncateDTO(documentDTO); + truncateDTOS.add(truncateDTO); + } else { + log.info("sliceDocuments:错误的布局类型: {}", layoutType); + } + } + return truncateDTOS; } @Override @@ -39,13 +74,11 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){ - EREDTO eredto = doTextEre(truncateDTO); - return eredto; + return doTextEre(truncateDTO); } if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){ - EREDTO eredto = doTableEre(truncateDTO); - return eredto; + return doTableEre(truncateDTO); } log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType()); return null; @@ -80,26 +113,119 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline { if (CollUtil.isEmpty(eredtoList)){ return merged; } + Map entityMap = new HashMap<>(); + Map relationMap = new HashMap<>(); for (EREDTO eredto : eredtoList) { List entities = eredto.getEntities(); if (CollUtil.isNotEmpty(entities)){ for (EntityExtractionDTO entity : entities) { - String e = entity.getEntity(); - String name = entity.getName(); - // entity.getEntity() 和 entity.getName() 完全相等看作是同一个数据 + String key = generateEntityMapKey(entity); + mergeAttribute(entityMap,entity, key); } } List relations = eredto.getRelations(); if (CollUtil.isNotEmpty(relations)){ for (RelationExtractionDTO relation : relations) { - String source = relation.getSource(); - String target = relation.getTarget(); - 
String re = relation.getRelation(); // source和target,re完全相等看作是同一个数据 + String relationMapKey = generateRelationMapKey(relation); + mergeAttribute(relationMap,relation, relationMapKey); } } } + // 利用合并后的map生成新的EREDTO + // 优先先把有关系的节点与关系组合在一次 + Set relationEntityKey = new HashSet<>(); + for (Map.Entry relationEntry : relationMap.entrySet()) { + RelationExtractionDTO value = relationEntry.getValue(); + EntityExtractionDTO sourceEntity = entityMap.get(StrUtil.join("_", value.getSourceType(), value.getSource())); + if (null == sourceEntity){ + log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到头节点映射关系", value.getSourceType(), value.getSource()); + continue; + } + EntityExtractionDTO targetEntity = entityMap.get(StrUtil.join("_", value.getTargetType(), value.getTarget())); + if (null == targetEntity){ + log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到尾节点映射关系", value.getTargetType(), value.getTarget()); + continue; + } + EREDTO eredto = new EREDTO(); + eredto.setEntities(List.of(sourceEntity,targetEntity)); + eredto.setRelations(List.of(value)); + merged.add(eredto); + relationEntityKey.addAll(List.of(generateEntityMapKey(sourceEntity),generateEntityMapKey(targetEntity))); + } + // 将没有关系的节点单独放在一起 + List leavedEntities = new ArrayList<>(); + for (Map.Entry entry : entityMap.entrySet()) { + if (!relationEntityKey.contains(entry.getKey())){ + leavedEntities.add(entry.getValue()); + } + } + EREDTO eredto = new EREDTO(); + eredto.setEntities(leavedEntities); + merged.add(eredto); + return merged; + } - return null; + private void mergeAttribute(Map entityMap,RelationExtractionDTO relation, String key) { + + RelationExtractionDTO cachedEntity = entityMap.get(key); + if (null == cachedEntity){ + entityMap.put(key, relation); + }else { + if (CollUtil.isEmpty(relation.getAttributes())){ + return; + } + // 合并属性 + List attributes = relation.getAttributes(); + if (null == attributes){ + attributes = new ArrayList<>(); + } + for (ERAttributeDTO attribute : 
relation.getAttributes()) { + String attributeKey = attribute.getAttribute(); + String attributeValue = attribute.getValue(); + if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)){ + continue; + } + // 如果属性已经存在,则不添加 + if (attributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) { + attributes.add(attribute); + } + } + } + } + private void mergeAttribute(Map entityMap,EntityExtractionDTO entity, String key) { + + EntityExtractionDTO cachedEntity = entityMap.get(key); + if (null == cachedEntity){ + entityMap.put(key, entity); + }else { + if (CollUtil.isEmpty(entity.getAttributes())){ + return; + } + // 合并属性 + List attributes = entity.getAttributes(); + if (null == attributes){ + attributes = new ArrayList<>(); + } + for (ERAttributeDTO attribute : entity.getAttributes()) { + String attributeKey = attribute.getAttribute(); + String attributeValue = attribute.getValue(); + if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)){ + continue; + } + // 如果属性已经存在,则不添加 + if (attributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) { + attributes.add(attribute); + } + } + } + } + + private String generateEntityMapKey(EntityExtractionDTO entityExtractionDTO) { + return entityExtractionDTO.getEntity() + "_" + entityExtractionDTO.getName(); + } + + private String generateRelationMapKey(RelationExtractionDTO relationExtractionDTO) { + return relationExtractionDTO.getSource() + "_" + relationExtractionDTO.getTarget() + "_" + relationExtractionDTO.getRelation(); } } diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java index 1d82afa..025fee0 100644 --- a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java +++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleToCypherExecutorImpl.java @@ -1,11 +1,13 @@ package 
com.supervision.pdfqaserver.service.impl;
+import com.supervision.pdfqaserver.cache.PromptCache;
 import com.supervision.pdfqaserver.dto.EREDTO;
 import com.supervision.pdfqaserver.service.TripleToCypherExecutor;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.ai.ollama.OllamaChatModel;
 import org.springframework.stereotype.Service;
+import static com.supervision.pdfqaserver.cache.PromptCache.ERE_TO_INSERT_CYPHER;
 @Slf4j
 @Service
@@ -15,7 +17,10 @@ public class TripleToCypherExecutorImpl implements TripleToCypherExecutor {
     private final OllamaChatModel ollamaChatModel;
     @Override
     public String generateInsertCypher(EREDTO eredto) { // Asks the chat model for an insert Cypher statement.
-        return null;
+        // NOTE(review): eredto is never read below, so the prompt carries none of the extracted entities/relations - confirm whether the template should be filled from it.
+        String prompt = PromptCache.promptMap.get(ERE_TO_INSERT_CYPHER); // prompt text cached under ERE_TO_INSERT_CYPHER
+        String call = ollamaChatModel.call(prompt); // send the prompt to the injected Ollama chat model
+        return call; // whatever call(prompt) produced is returned unchanged
     }
     @Override
diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java
index aafd16f..f80dc1f 100644
--- a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java
+++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationEntityExtractionServiceImpl.java
@@ -1,20 +1,54 @@
 package com.supervision.pdfqaserver.service.impl;
+import cn.hutool.core.collection.CollUtil;
 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
 import com.supervision.pdfqaserver.domain.TruncationEntityExtraction;
+import com.supervision.pdfqaserver.domain.TruncationErAttribute;
+import com.supervision.pdfqaserver.dto.ERAttributeDTO;
+import com.supervision.pdfqaserver.dto.EntityExtractionDTO;
 import com.supervision.pdfqaserver.service.TruncationEntityExtractionService;
 import com.supervision.pdfqaserver.mapper.TruncationEntityExtractionMapper;
+import com.supervision.pdfqaserver.service.TruncationErAttributeService;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.stereotype.Service;
+import java.util.List;
+
 /**
  * @author Administrator
  * @description Service implementation of the database operations for table
  *              truncation_entity_extraction (truncation entity extraction).
  *              saveERE stores one row per extracted entity and one attribute row per
  *              attribute of that entity.
  * @createDate 2025-04-27 11:45:24
  */
+@Slf4j
 @Service
+@RequiredArgsConstructor
 public class TruncationEntityExtractionServiceImpl extends ServiceImpl
     implements TruncationEntityExtractionService{
+    private final TruncationErAttributeService truncationErAttributeService; // used to save the attribute rows
+    @Override
+    public void saveERE(List entities) { // Saves each entity, then its attributes; no-op for an empty list.
+        if (CollUtil.isEmpty(entities)){
+            return;
+        }
+        for (EntityExtractionDTO entity : entities) {
+            TruncationEntityExtraction tee = new TruncationEntityExtraction(); // row built from the DTO's truncationId, entity and name
+            tee.setTruncationId(entity.getTruncationId());
+            tee.setEntity(entity.getEntity());
+            tee.setName(entity.getName());
+            this.save(tee); // saved first because tee.getId() is read below; presumably save() fills the id - verify
+            List attributes = entity.getAttributes();
+            if (CollUtil.isEmpty(attributes)){
+                continue; // entity without attributes: nothing more to store
+            }
+            for (ERAttributeDTO attribute : attributes) {
+                attribute.setTerId(tee.getId()); // NOTE(review): writes the id onto the caller's DTO; the relation service sets it on the converted entity instead - confirm toTruncationErAttribute() copies terId
+                TruncationErAttribute era = attribute.toTruncationErAttribute();
+                truncationErAttributeService.save(era); // NOTE(review): rows are saved one by one and no transaction annotation is visible here - confirm partial writes are acceptable
+            }
+        }
+    }
 }
diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java
index 5ab6692..87b3ec3 100644
--- a/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java
+++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TruncationRelationExtractionServiceImpl.java
@@ -1,20 +1,51 @@
 package com.supervision.pdfqaserver.service.impl;
+import cn.hutool.core.collection.CollUtil;
 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
+import com.supervision.pdfqaserver.domain.TruncationErAttribute;
 import com.supervision.pdfqaserver.domain.TruncationRelationExtraction;
+import com.supervision.pdfqaserver.dto.ERAttributeDTO;
+import
com.supervision.pdfqaserver.dto.RelationExtractionDTO;
+import com.supervision.pdfqaserver.service.TruncationErAttributeService;
 import com.supervision.pdfqaserver.service.TruncationRelationExtractionService;
 import com.supervision.pdfqaserver.mapper.TruncationRelationExtractionMapper;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.stereotype.Service;
+import java.util.List;
+
 /**
  * @author Administrator
  * @description Service implementation of the database operations for table
  *              truncation_relation_extraction (truncation relation extraction).
  *              saveERE stores one row per extracted relation and one attribute row per
  *              attribute of that relation.
  * @createDate 2025-04-27 11:45:24
  */
+@Slf4j
 @Service
+@RequiredArgsConstructor
 public class TruncationRelationExtractionServiceImpl extends ServiceImpl
     implements TruncationRelationExtractionService{
+    private final TruncationErAttributeService truncationErAttributeService; // used to save the attribute rows
+    @Override
+    public void saveERE(List relations) { // Saves each relation, then its attributes; no-op for an empty list.
+        if (CollUtil.isEmpty(relations)){
+            return;
+        }
+
+        for (RelationExtractionDTO relation : relations) {
+            TruncationRelationExtraction re = relation.toTruncationRelationExtraction(); // DTO converts itself into the row object
+            this.save(re); // saved first because re.getId() is read below; presumably save() fills the id - verify
+            if (CollUtil.isEmpty(relation.getAttributes())){
+                continue; // relation without attributes: nothing more to store
+            }
+            for (ERAttributeDTO attribute : relation.getAttributes()) {
+                TruncationErAttribute era = attribute.toTruncationErAttribute();
+                era.setTerId(re.getId()); // link the attribute row to the relation row just saved
+                truncationErAttributeService.save(era); // NOTE(review): rows are saved one by one and no transaction annotation is visible here - confirm partial writes are acceptable
+            }
+        }
+    }
 }
diff --git a/src/main/resources/mapper/DocumentTruncationMapper.xml b/src/main/resources/mapper/DocumentTruncationMapper.xml
index 2440ce9..d32cbb8 100644
--- a/src/main/resources/mapper/DocumentTruncationMapper.xml
+++ b/src/main/resources/mapper/DocumentTruncationMapper.xml
@@ -7,7 +7,7 @@
-
+