generateGraph 功能初始化

master
xueqingkun 7 hours ago
parent 830acca35d
commit 7f5c52546a

@ -29,7 +29,7 @@ public class PromptCache {
private static final String DOERE_TEXT_PROMPT = """ private static final String DOERE_TEXT_PROMPT = """
JSON JSON
1. **** 1. ****
- -
@ -45,8 +45,8 @@ public class PromptCache {
- (, , ) - (, , )
**** ****
- JSON使```json ```Markdown
- 使JSON Schema - 使JSON Schema
{ {
"nodes": [ "nodes": [
{ {
@ -138,7 +138,7 @@ public class PromptCache {
"""; """;
private static final String DOERE_TABLE_PROMPT = """ private static final String DOERE_TABLE_PROMPT = """
**** ****
1. 1.
@ -147,7 +147,6 @@ public class PromptCache {
4. 4.
**** ****
```json
{ {
"table_data": [ "table_data": [
{ {
@ -158,7 +157,7 @@ public class PromptCache {
// 后续行... // 后续行...
] ]
} }
```
**** ****
| | | | | | | |
@ -189,9 +188,31 @@ public class PromptCache {
private static final String CHINESE_TO_ENGLISH_PROMPT = """ private static final String CHINESE_TO_ENGLISH_PROMPT = """
Neo4jNeo4j
1. ****
- 使`UpperCamelCase``ProductCategory`
- 使`SCREAMING_SNAKE_CASE``IS_RELATED_TO`
- `2023``2023`
-
-
2. ****
-
-
- /"腾讯"`Tencent`
3. ****
- : "用户订单" : `UserOrder`
- : "属于2023年" : `BELONGS_TO_2023`
- : "5G网络设备" : `5GNetworkDevice`
- : "评分大于90" : `SCORE_ABOVE_90`
4. ****
{}
5. ****
"""; """;
@ -203,12 +224,10 @@ public class PromptCache {
4. Cypher 4. Cypher
### ###
```json
[ [
{"source": "人物","sourceType": "Person", "relation": "创始人", "relationType": "FOUNDED","target": "公司","targetType": "Company"}, {"source": "人物","sourceType": "Person", "relation": "创始人", "relationType": "FOUNDED","target": "公司","targetType": "Company"},
{"source": "公司","sourceType": "Company ", "relation": "位于", "relationType": "LOCATED_IN","target": "城市","targetType": "City "} {"source": "公司","sourceType": "Company ", "relation": "位于", "relationType": "LOCATED_IN","target": "城市","targetType": "City "}
] ]
```
### ###

@ -47,7 +47,7 @@ public class PdfAnalysisOutput implements Serializable {
/** /**
* pdf * pdf
*/ */
private Integer order; private Integer displayOrder;
/** /**
* *

@ -29,7 +29,7 @@ public class DocumentDTO {
/** /**
* pdf * pdf
*/ */
private Integer layoutOrder; private Integer displayOrder;
private String title; private String title;
@ -49,13 +49,13 @@ public class DocumentDTO {
} }
public DocumentDTO(PdfAnalysisOutput pdfAnalysisOutput) { public DocumentDTO(PdfAnalysisOutput pdfAnalysisOutput) {
this.id = pdfAnalysisOutput.getPdfId().toString(); this.id = pdfAnalysisOutput.getId().toString();
this.sectionId = pdfAnalysisOutput.getId(); this.sectionId = pdfAnalysisOutput.getId();
this.layoutType = pdfAnalysisOutput.getLayoutType(); this.layoutType = pdfAnalysisOutput.getLayoutType();
this.pageNo = pdfAnalysisOutput.getPageNo(); this.pageNo = pdfAnalysisOutput.getPageNo();
this.title = pdfAnalysisOutput.getTableTitle(); this.title = pdfAnalysisOutput.getTableTitle();
this.content = pdfAnalysisOutput.getContent(); this.content = pdfAnalysisOutput.getContent();
this.layoutOrder = pdfAnalysisOutput.getOrder(); this.displayOrder = pdfAnalysisOutput.getDisplayOrder();
} }

@ -37,17 +37,17 @@ public class EREDTO {
String name = nodeJson.getString("name"); String name = nodeJson.getString("name");
String type = nodeJson.getString("type"); String type = nodeJson.getString("type");
JSONObject attributes = nodeJson.getJSONObject("attributes"); JSONObject attributes = nodeJson.getJSONObject("attributes");
List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>();
if (CollUtil.isNotEmpty(attributes)){ if (CollUtil.isNotEmpty(attributes)){
List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>();
for (String key : attributes.keySet()) { for (String key : attributes.keySet()) {
Object value = attributes.get(key); Object value = attributes.get(key);
String valueString = attributes.getString(key); String valueString = attributes.getString(key);
ERAttributeDTO erAttributeDTO = new ERAttributeDTO(key, valueString, value instanceof Number?"1":"0"); ERAttributeDTO erAttributeDTO = new ERAttributeDTO(key, valueString, value instanceof Number?"1":"0");
erAttributeDTOS.add(erAttributeDTO); erAttributeDTOS.add(erAttributeDTO);
} }
EntityExtractionDTO entityExtraction = new EntityExtractionDTO(truncationId,name,type, erAttributeDTOS);
entities.add(entityExtraction);
} }
EntityExtractionDTO entityExtraction = new EntityExtractionDTO(truncationId,name,type, erAttributeDTOS);
entities.add(entityExtraction);
} }
} }
if (CollUtil.isNotEmpty(relations)){ if (CollUtil.isNotEmpty(relations)){
@ -106,7 +106,7 @@ public class EREDTO {
continue; continue;
} }
EntityExtractionDTO entityExtractionDTO = new EntityExtractionDTO(); EntityExtractionDTO entityExtractionDTO = new EntityExtractionDTO();
entityExtractionDTO.setEntity("row"); entityExtractionDTO.setEntity("");
entityExtractionDTO.setName("row"); entityExtractionDTO.setName("row");
entityExtractionDTO.setTruncationId(truncationId); entityExtractionDTO.setTruncationId(truncationId);
List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>(); List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>();

@ -12,5 +12,5 @@ import java.util.List;
*/ */
public interface PdfAnalysisOutputService extends IService<PdfAnalysisOutput> { public interface PdfAnalysisOutputService extends IService<PdfAnalysisOutput> {
List<PdfAnalysisOutput> queryByPdfId(String pdfId); List<PdfAnalysisOutput> queryByPdfId(Integer pdfId);
} }

@ -1,5 +1,6 @@
package com.supervision.pdfqaserver.service.impl; package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.cache.PromptCache; import com.supervision.pdfqaserver.cache.PromptCache;
import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator; import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -18,7 +19,8 @@ public class ChinesEsToEnglishGeneratorImpl implements ChinesEsToEnglishGenerato
public String generate(String chinese) { public String generate(String chinese) {
log.info("generate:开始翻译: {}",chinese); log.info("generate:开始翻译: {}",chinese);
String prompt = PromptCache.promptMap.get(CHINESE_TO_ENGLISH); String prompt = PromptCache.promptMap.get(CHINESE_TO_ENGLISH);
ollamaChatModel.call("请将以下中文翻译成英文: " + chinese); String response = ollamaChatModel.call(StrUtil.format(prompt, chinese));
return null; log.info("generate:chinese:{}翻译结果: {}",chinese,response);
return response;
} }
} }

@ -24,7 +24,11 @@ public class DocumentTruncationServiceImpl extends ServiceImpl<DocumentTruncatio
if (CollUtil.isEmpty(truncateDTOS)){ if (CollUtil.isEmpty(truncateDTOS)){
return; return;
} }
truncateDTOS.stream().map(TruncateDTO::toDocumentTruncation).forEach(this::save); for (TruncateDTO truncateDTO : truncateDTOS) {
DocumentTruncation documentTruncation = truncateDTO.toDocumentTruncation();
this.save(documentTruncation);
truncateDTO.setId(documentTruncation.getId());
}
} }
} }

@ -1,6 +1,7 @@
package com.supervision.pdfqaserver.service.impl; package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum; import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.domain.ChineseEnglishWords; import com.supervision.pdfqaserver.domain.ChineseEnglishWords;
@ -39,18 +40,24 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
@Override @Override
public void generateGraph(String documentId) { public void generateGraph(String documentId) {
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(documentId); List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(documentId));
if (CollUtil.isEmpty(pdfAnalysisOutputs)) { if (CollUtil.isEmpty(pdfAnalysisOutputs)) {
log.info("没有找到pdfId为{}的pdf分析结果", documentId); log.info("没有找到pdfId为{}的pdf分析结果", documentId);
return; return;
} }
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList(); List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList();
// 对文档进行切分 // 对文档进行切分
TimeInterval timer = new TimeInterval();
timer.start("sliceDocuments");
log.info("开始切分文档,初始文档个数:{}",documentDTOList.size());
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList); List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
log.info("切分文档完成,切分后文档个数:{},耗时:{}秒",truncateDTOS.size(), timer.intervalSecond("sliceDocuments"));
// 保存分片信息 // 保存分片信息
documentTruncationService.batchSave(truncateDTOS); documentTruncationService.batchSave(truncateDTOS);
// 对切分后的文档进行命名实体识别 // 对切分后的文档进行命名实体识别
timer.start("doEre");
log.info("开始命名实体识别...");
List<EREDTO> eredtoList = new ArrayList<>(); List<EREDTO> eredtoList = new ArrayList<>();
for (TruncateDTO truncateDTO : truncateDTOS) { for (TruncateDTO truncateDTO : truncateDTOS) {
EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO); EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO);
@ -59,12 +66,17 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
} }
// 保存实体关系抽取结果 // 保存实体关系抽取结果
this.saveERE(eredto, truncateDTO.getId()); this.saveERE(eredto, truncateDTO.getId());
eredtoList.add(eredto);
} }
log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre"));
// 合并实体关系抽取结果 // 合并实体关系抽取结果
log.info("开始合并实体关系抽取结果...");
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList); List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
// 保存领域元数据 // 保存领域元数据
log.info("开始保存领域元数据...");
for (EREDTO eredto : mergedList) { for (EREDTO eredto : mergedList) {
List<RelationExtractionDTO> relations = eredto.getRelations(); List<RelationExtractionDTO> relations = eredto.getRelations();
if (CollUtil.isEmpty(relations)){ if (CollUtil.isEmpty(relations)){
@ -77,9 +89,12 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
domainMetadataService.saveIfNotExists(domainMetadata); domainMetadataService.saveIfNotExists(domainMetadata);
} }
} }
log.info("保存领域元数据完成");
// 保存字典 // 保存字典
log.info("开始保存字典...");
List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll(); List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
int wordsSize = allWords.size();
for (EREDTO eredto : mergedList) { for (EREDTO eredto : mergedList) {
List<EntityExtractionDTO> entities = eredto.getEntities(); List<EntityExtractionDTO> entities = eredto.getEntities();
if (CollUtil.isNotEmpty(entities)){ if (CollUtil.isNotEmpty(entities)){
@ -94,7 +109,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
} }
} }
} }
log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize);
// 生成cypher语句 // 生成cypher语句
for (EREDTO eredto : mergedList) { for (EREDTO eredto : mergedList) {
eredto.setEn(allWords); eredto.setEn(allWords);

@ -19,8 +19,8 @@ public class PdfAnalysisOutputServiceImpl extends ServiceImpl<PdfAnalysisOutputM
implements PdfAnalysisOutputService{ implements PdfAnalysisOutputService{
@Override @Override
public List<PdfAnalysisOutput> queryByPdfId(String pdfId) { public List<PdfAnalysisOutput> queryByPdfId(Integer pdfId) {
Assert.notEmpty(pdfId, "pdfId不能为空"); Assert.notNull(pdfId, "pdfId不能为空");
return super.lambdaQuery().eq(PdfAnalysisOutput::getPdfId, pdfId).list(); return super.lambdaQuery().eq(PdfAnalysisOutput::getPdfId, pdfId).list();
} }

@ -30,7 +30,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
// 先对pageNo进行排序再对layoutOrder进行排序 // 先对pageNo进行排序再对layoutOrder进行排序
(o1, o2) -> { (o1, o2) -> {
if (o1.getPageNo().equals(o2.getPageNo())) { if (o1.getPageNo().equals(o2.getPageNo())) {
return Integer.compare(o1.getLayoutOrder(), o2.getLayoutOrder()); return Integer.compare(o1.getDisplayOrder(), o2.getDisplayOrder());
} }
return Integer.compare(o1.getPageNo(), o2.getPageNo()); return Integer.compare(o1.getPageNo(), o2.getPageNo());
} }
@ -72,12 +72,12 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
@Override @Override
public EREDTO doEre(TruncateDTO truncateDTO) { public EREDTO doEre(TruncateDTO truncateDTO) {
if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){ if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){
return doTextEre(truncateDTO); return doTextEre(truncateDTO);
} }
if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){ if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
return doTableEre(truncateDTO); return doTableEre(truncateDTO);
} }
log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType()); log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
@ -85,21 +85,37 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
} }
private EREDTO doTextEre(TruncateDTO truncateDTO) { private EREDTO doTextEre(TruncateDTO truncateDTO) {
log.info("doTextEre:开始进行文本实体关系抽取,内容:{}", truncateDTO.getContent());
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT); String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);
String formatted = String.format(prompt, truncateDTO.getContent()); String formatted = StrUtil.format(prompt, truncateDTO.getContent());
String response = ollamaChatModel.call(formatted); String response = ollamaChatModel.call(formatted);
// todo:暂时不去处理异常返回 // todo:暂时不去处理异常返回
log.info("doTextEre响应结果:{}", response);
return EREDTO.fromTextJson(response, truncateDTO.getId()); return EREDTO.fromTextJson(response, truncateDTO.getId());
} }
private EREDTO doTableEre(TruncateDTO truncateDTO) { private EREDTO doTableEre(TruncateDTO truncateDTO) {
log.info("doTableEre:开始进行表格实体关系抽取,内容:{}", truncateDTO.getContent());
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE); String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);
String formatted = String.format(prompt, truncateDTO.getContent()); String formatted = StrUtil.format(prompt, truncateDTO.getContent());
String response = ollamaChatModel.call(formatted); String response = ollamaChatModel.call(formatted);
log.info("doTableEre响应结果:{}", response);
// todo:暂时不去处理异常返回 // todo:暂时不去处理异常返回
EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId());
return EREDTO.fromTableJson(response, truncateDTO.getId()); EntityExtractionDTO titleEntity = new EntityExtractionDTO();
titleEntity.setEntity("表");
titleEntity.setName(truncateDTO.getTitle());
//
// 添加关系
ArrayList<RelationExtractionDTO> relations = new ArrayList<>();
for (EntityExtractionDTO entity : eredto.getEntities()) {
RelationExtractionDTO relationExtractionDTO = new RelationExtractionDTO(truncateDTO.getId(),
titleEntity.getEntity(), titleEntity.getName(), "包含", entity.getEntity(), entity.getName(), entity.getAttributes());
relations.add(relationExtractionDTO);
}
eredto.getEntities().add(titleEntity);
eredto.setRelations(relations);
return eredto;
} }
/** /**

@ -17,7 +17,7 @@ spring:
chat: chat:
model: qwen2.5:32b model: qwen2.5:32b
options: options:
max_tokens: 512 max_tokens: 51200
top_p: 0.9 top_p: 0.9
top_k: 40 top_k: 40
temperature: 0.7 temperature: 0.7

@ -11,13 +11,13 @@
<result property="pageNo" column="page_no" jdbcType="INTEGER"/> <result property="pageNo" column="page_no" jdbcType="INTEGER"/>
<result property="pdfId" column="pdf_id" jdbcType="INTEGER"/> <result property="pdfId" column="pdf_id" jdbcType="INTEGER"/>
<result property="tableTitle" column="table_title" jdbcType="VARCHAR"/> <result property="tableTitle" column="table_title" jdbcType="VARCHAR"/>
<result property="order" column="order" jdbcType="INTEGER"/> <result property="displayOrder" column="display_order" jdbcType="INTEGER"/>
<result property="createTime" column="create_time" jdbcType="TIMESTAMP"/> <result property="createTime" column="create_time" jdbcType="TIMESTAMP"/>
</resultMap> </resultMap>
<sql id="Base_Column_List"> <sql id="Base_Column_List">
id,layout_type,content, id,layout_type,content,
page_no,pdf_id,table_title, page_no,pdf_id,table_title,
order,create_time display_order,create_time
</sql> </sql>
</mapper> </mapper>

@ -1,13 +1,21 @@
package com.supervision.pdfqaserver; package com.supervision.pdfqaserver;
import com.supervision.pdfqaserver.service.KnowledgeGraphService;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.SpringBootTest;
@Slf4j
@SpringBootTest @SpringBootTest
class PdfQaServerApplicationTests { class PdfQaServerApplicationTests {
@Autowired
private KnowledgeGraphService knowledgeGraphService;
@Test @Test
void contextLoads() { void generateGraphTest() {
knowledgeGraphService.generateGraph("1");
log.info("finish...");
} }
} }

Loading…
Cancel
Save