generateGraph 功能初始化

master
xueqingkun 7 hours ago
parent 830acca35d
commit 7f5c52546a

@ -29,7 +29,7 @@ public class PromptCache {
private static final String DOERE_TEXT_PROMPT = """
JSON
JSON
1. ****
-
@ -45,8 +45,8 @@ public class PromptCache {
- (, , )
****
- JSON使```json ```Markdown
- 使JSON Schema
{
"nodes": [
{
@ -138,7 +138,7 @@ public class PromptCache {
""";
private static final String DOERE_TABLE_PROMPT = """
****
1.
@ -147,7 +147,6 @@ public class PromptCache {
4.
****
```json
{
"table_data": [
{
@ -158,7 +157,7 @@ public class PromptCache {
// 后续行...
]
}
```
****
| | | |
@ -189,9 +188,31 @@ public class PromptCache {
/**
 * Prompt template that asks the model to turn Chinese names into English identifiers for Neo4j.
 * The visible text asks for UpperCamelCase (e.g. ProductCategory) and SCREAMING_SNAKE_CASE
 * (e.g. IS_RELATED_TO) output and lists four worked examples.
 *
 * The bare {} under item 4 is the only placeholder in the visible text.
 * NOTE(review): presumably this is the template registered under the CHINESE_TO_ENGLISH key and
 * filled via StrUtil.format(prompt, chinese) in ChinesEsToEnglishGeneratorImpl - confirm.
 * NOTE(review): most of the Chinese wording appears to have been lost when this page was
 * extracted; restore the literal from the repository rather than from this view.
 */
private static final String CHINESE_TO_ENGLISH_PROMPT = """
Neo4jNeo4j
1. ****
- 使`UpperCamelCase``ProductCategory`
- 使`SCREAMING_SNAKE_CASE``IS_RELATED_TO`
- `2023``2023`
-
-
2. ****
-
-
- /"腾讯"`Tencent`
3. ****
- : "用户订单" : `UserOrder`
- : "属于2023年" : `BELONGS_TO_2023`
- : "5G网络设备" : `5GNetworkDevice`
- : "评分大于90" : `SCORE_ABOVE_90`
4. ****
{}
5. ****
""";
@ -203,12 +224,10 @@ public class PromptCache {
4. Cypher
###
```json
[
{"source": "人物","sourceType": "Person", "relation": "创始人", "relationType": "FOUNDED","target": "公司","targetType": "Company"},
{"source": "公司","sourceType": "Company ", "relation": "位于", "relationType": "LOCATED_IN","target": "城市","targetType": "City "}
]
```
###

@ -47,7 +47,7 @@ public class PdfAnalysisOutput implements Serializable {
/**
* pdf
*/
private Integer order;
private Integer displayOrder;
/**
*

@ -29,7 +29,7 @@ public class DocumentDTO {
/**
* pdf
*/
private Integer layoutOrder;
private Integer displayOrder;
private String title;
@ -49,13 +49,13 @@ public class DocumentDTO {
}
/**
 * Builds a DocumentDTO by copying fields from one PdfAnalysisOutput record.
 *
 * NOTE(review): this span is diff residue with the +/- markers stripped, so some statements
 * appear as old/new pairs (the two this.id assignments, and layoutOrder vs displayOrder).
 * Presumably the second line of each pair is the post-commit version - confirm in the repo.
 *
 * @param pdfAnalysisOutput the analysis record to copy from; it is dereferenced without a
 *                          null check, and getPdfId()/getId() have toString() called on them
 */
public DocumentDTO(PdfAnalysisOutput pdfAnalysisOutput) {
// Presumably the removed line: id derived from the owning pdf's id.
this.id = pdfAnalysisOutput.getPdfId().toString();
// Presumably the added line: id derived from the record's own id. Read literally, this
// assignment overwrites the one above.
this.id = pdfAnalysisOutput.getId().toString();
// sectionId carries the same record id in its original (non-String) form.
this.sectionId = pdfAnalysisOutput.getId();
this.layoutType = pdfAnalysisOutput.getLayoutType();
this.pageNo = pdfAnalysisOutput.getPageNo();
// The DTO title is taken from the record's table title.
this.title = pdfAnalysisOutput.getTableTitle();
this.content = pdfAnalysisOutput.getContent();
// Presumably the removed line: the field and getter were renamed from order/layoutOrder.
this.layoutOrder = pdfAnalysisOutput.getOrder();
// Presumably the added line: ordering value now lives in displayOrder on both classes.
this.displayOrder = pdfAnalysisOutput.getDisplayOrder();
}

@ -37,17 +37,17 @@ public class EREDTO {
String name = nodeJson.getString("name");
String type = nodeJson.getString("type");
JSONObject attributes = nodeJson.getJSONObject("attributes");
List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>();
if (CollUtil.isNotEmpty(attributes)){
List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>();
for (String key : attributes.keySet()) {
Object value = attributes.get(key);
String valueString = attributes.getString(key);
ERAttributeDTO erAttributeDTO = new ERAttributeDTO(key, valueString, value instanceof Number?"1":"0");
erAttributeDTOS.add(erAttributeDTO);
}
EntityExtractionDTO entityExtraction = new EntityExtractionDTO(truncationId,name,type, erAttributeDTOS);
entities.add(entityExtraction);
}
EntityExtractionDTO entityExtraction = new EntityExtractionDTO(truncationId,name,type, erAttributeDTOS);
entities.add(entityExtraction);
}
}
if (CollUtil.isNotEmpty(relations)){
@ -106,7 +106,7 @@ public class EREDTO {
continue;
}
EntityExtractionDTO entityExtractionDTO = new EntityExtractionDTO();
entityExtractionDTO.setEntity("row");
entityExtractionDTO.setEntity("");
entityExtractionDTO.setName("row");
entityExtractionDTO.setTruncationId(truncationId);
List<ERAttributeDTO> erAttributeDTOS = new ArrayList<>();

@ -12,5 +12,5 @@ import java.util.List;
*/
public interface PdfAnalysisOutputService extends IService<PdfAnalysisOutput> {
/**
 * Queries analysis outputs by pdf id, taking the id as a String.
 * NOTE(review): diff residue - presumably the removed signature, superseded by the
 * Integer variant below; confirm only one of the two exists in the repository.
 */
List<PdfAnalysisOutput> queryByPdfId(String pdfId);
/**
 * Queries analysis outputs by pdf id, taking the id as an Integer.
 *
 * @param pdfId id of the pdf whose analysis outputs are wanted
 * @return the matching PdfAnalysisOutput records
 */
List<PdfAnalysisOutput> queryByPdfId(Integer pdfId);
}

@ -1,5 +1,6 @@
package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.cache.PromptCache;
import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator;
import lombok.RequiredArgsConstructor;
@ -18,7 +19,8 @@ public class ChinesEsToEnglishGeneratorImpl implements ChinesEsToEnglishGenerato
/**
 * Sends a Chinese term to the chat model for translation and returns the model's reply.
 *
 * NOTE(review): this span is diff residue with the +/- markers stripped. The
 * ollamaChatModel.call("请将以下中文翻译成英文: " ...) and "return null;" lines are presumably the
 * removed implementation; read literally, everything after "return null;" is unreachable.
 *
 * @param chinese the Chinese text to translate; substituted into the prompt template
 * @return the raw response string from the chat model (no post-processing is done here)
 */
public String generate(String chinese) {
log.info("generate:开始翻译: {}",chinese);
// Look up the prompt template stored under the CHINESE_TO_ENGLISH key.
String prompt = PromptCache.promptMap.get(CHINESE_TO_ENGLISH);
// Presumably removed: ad-hoc prompt whose response was discarded.
ollamaChatModel.call("请将以下中文翻译成英文: " + chinese);
return null;
// Presumably added: fill the template with the input and keep the model's response.
String response = ollamaChatModel.call(StrUtil.format(prompt, chinese));
log.info("generate:chinese:{}翻译结果: {}",chinese,response);
return response;
}
}

@ -24,7 +24,11 @@ public class DocumentTruncationServiceImpl extends ServiceImpl<DocumentTruncatio
if (CollUtil.isEmpty(truncateDTOS)){
return;
}
truncateDTOS.stream().map(TruncateDTO::toDocumentTruncation).forEach(this::save);
for (TruncateDTO truncateDTO : truncateDTOS) {
DocumentTruncation documentTruncation = truncateDTO.toDocumentTruncation();
this.save(documentTruncation);
truncateDTO.setId(documentTruncation.getId());
}
}
}

@ -1,6 +1,7 @@
package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.domain.ChineseEnglishWords;
@ -39,18 +40,24 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
@Override
public void generateGraph(String documentId) {
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(documentId);
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(Integer.valueOf(documentId));
if (CollUtil.isEmpty(pdfAnalysisOutputs)) {
log.info("没有找到pdfId为{}的pdf分析结果", documentId);
return;
}
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).toList();
// 对文档进行切分
TimeInterval timer = new TimeInterval();
timer.start("sliceDocuments");
log.info("开始切分文档,初始文档个数:{}",documentDTOList.size());
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
log.info("切分文档完成,切分后文档个数:{},耗时:{}秒",truncateDTOS.size(), timer.intervalSecond("sliceDocuments"));
// 保存分片信息
documentTruncationService.batchSave(truncateDTOS);
// 对切分后的文档进行命名实体识别
timer.start("doEre");
log.info("开始命名实体识别...");
List<EREDTO> eredtoList = new ArrayList<>();
for (TruncateDTO truncateDTO : truncateDTOS) {
EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO);
@ -59,12 +66,17 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
}
// 保存实体关系抽取结果
this.saveERE(eredto, truncateDTO.getId());
eredtoList.add(eredto);
}
log.info("命名实体识别完成,耗时:{}秒", timer.intervalSecond("doEre"));
// 合并实体关系抽取结果
log.info("开始合并实体关系抽取结果...");
List<EREDTO> mergedList = tripleConversionPipeline.mergeEreResults(eredtoList);
log.info("合并实体关系抽取结果完成,合并后个数:{}", mergedList.size());
// 保存领域元数据
log.info("开始保存领域元数据...");
for (EREDTO eredto : mergedList) {
List<RelationExtractionDTO> relations = eredto.getRelations();
if (CollUtil.isEmpty(relations)){
@ -77,9 +89,12 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
domainMetadataService.saveIfNotExists(domainMetadata);
}
}
log.info("保存领域元数据完成");
// 保存字典
log.info("开始保存字典...");
List<ChineseEnglishWords> allWords = chineseEnglishWordsService.queryAll();
int wordsSize = allWords.size();
for (EREDTO eredto : mergedList) {
List<EntityExtractionDTO> entities = eredto.getEntities();
if (CollUtil.isNotEmpty(entities)){
@ -94,7 +109,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
}
}
}
log.info("保存字典完成,新增字典个数:{}", allWords.size() - wordsSize);
// 生成cypher语句
for (EREDTO eredto : mergedList) {
eredto.setEn(allWords);

@ -19,8 +19,8 @@ public class PdfAnalysisOutputServiceImpl extends ServiceImpl<PdfAnalysisOutputM
implements PdfAnalysisOutputService{
/**
 * Returns every PdfAnalysisOutput record whose pdfId equals the given value.
 *
 * NOTE(review): diff residue with the +/- markers stripped - the String signature and
 * Assert.notEmpty line are presumably the removed version, the Integer signature and
 * Assert.notNull line the added one.
 *
 * @param pdfId pdf id to filter on; the assertion rejects a missing value with the
 *              message "pdfId不能为空"
 * @return the list produced by the lambda query on PdfAnalysisOutput::getPdfId
 */
@Override
public List<PdfAnalysisOutput> queryByPdfId(String pdfId) {
Assert.notEmpty(pdfId, "pdfId不能为空");
public List<PdfAnalysisOutput> queryByPdfId(Integer pdfId) {
Assert.notNull(pdfId, "pdfId不能为空");
// Equality filter on the pdfId property, then fetch all matching rows.
return super.lambdaQuery().eq(PdfAnalysisOutput::getPdfId, pdfId).list();
}

@ -30,7 +30,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
// Sort by pageNo first, then by displayOrder within the same page
(o1, o2) -> {
if (o1.getPageNo().equals(o2.getPageNo())) {
return Integer.compare(o1.getLayoutOrder(), o2.getLayoutOrder());
return Integer.compare(o1.getDisplayOrder(), o2.getDisplayOrder());
}
return Integer.compare(o1.getPageNo(), o2.getPageNo());
}
@ -72,12 +72,12 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
@Override
public EREDTO doEre(TruncateDTO truncateDTO) {
if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){
if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){
return doTextEre(truncateDTO);
}
if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){
if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
return doTableEre(truncateDTO);
}
log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
@ -85,21 +85,37 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
}
/**
 * Runs entity/relation extraction on a text fragment: fills the DOERE_TEXT prompt with the
 * fragment's content, calls the chat model, and parses the reply with EREDTO.fromTextJson.
 *
 * NOTE(review): diff residue with the +/- markers stripped - the String.format line is
 * presumably the removed version and the StrUtil.format line its replacement; read
 * literally they declare the same local twice.
 *
 * @param truncateDTO the fragment to analyse; its content goes into the prompt and its id
 *                    is passed to the parser
 * @return the EREDTO produced by EREDTO.fromTextJson from the model response
 */
private EREDTO doTextEre(TruncateDTO truncateDTO) {
log.info("doTextEre:开始进行文本实体关系抽取,内容:{}", truncateDTO.getContent());
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);
String formatted = String.format(prompt, truncateDTO.getContent());
String formatted = StrUtil.format(prompt, truncateDTO.getContent());
String response = ollamaChatModel.call(formatted);
// TODO: abnormal model responses are not handled for now
log.info("doTextEre响应结果:{}", response);
return EREDTO.fromTextJson(response, truncateDTO.getId());
}
/**
 * Runs entity/relation extraction on a table fragment: fills the DOERE_TABLE prompt with the
 * fragment's content, calls the chat model, parses the reply with EREDTO.fromTableJson, then
 * adds a title entity of type "表" and one "包含" relation per parsed entity.
 *
 * NOTE(review): diff residue with the +/- markers stripped - the String.format line and the
 * direct "return EREDTO.fromTableJson(...)" line are presumably the removed version; read
 * literally, the code after that return is unreachable.
 *
 * @param truncateDTO the table fragment; supplies the content for the prompt, the id passed
 *                    to the parser and to each relation, and the title used as the name of
 *                    the title entity
 * @return the parsed EREDTO with the title entity appended and its relations replaced by
 *         the newly built list
 */
private EREDTO doTableEre(TruncateDTO truncateDTO) {
log.info("doTableEre:开始进行表格实体关系抽取,内容:{}", truncateDTO.getContent());
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);
String formatted = String.format(prompt, truncateDTO.getContent());
String formatted = StrUtil.format(prompt, truncateDTO.getContent());
String response = ollamaChatModel.call(formatted);
log.info("doTableEre响应结果:{}", response);
// TODO: abnormal model responses are not handled for now
return EREDTO.fromTableJson(response, truncateDTO.getId());
EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId());
// Synthetic entity standing for the table itself: type "表", named after the fragment title.
// NOTE(review): no truncationId is set on it here, and getTitle() is used without a null
// check - confirm both are acceptable downstream.
EntityExtractionDTO titleEntity = new EntityExtractionDTO();
titleEntity.setEntity("表");
titleEntity.setName(truncateDTO.getTitle());
// Link the title entity to every entity parsed from the table
// Add relations
ArrayList<RelationExtractionDTO> relations = new ArrayList<>();
// NOTE(review): eredto and eredto.getEntities() are dereferenced without a null check.
for (EntityExtractionDTO entity : eredto.getEntities()) {
// One "包含" relation per parsed entity; the title entity's type/name come first and the
// parsed entity's type/name/attributes after - presumably source then target, confirm
// against the RelationExtractionDTO constructor.
RelationExtractionDTO relationExtractionDTO = new RelationExtractionDTO(truncateDTO.getId(),
titleEntity.getEntity(), titleEntity.getName(), "包含", entity.getEntity(), entity.getName(), entity.getAttributes());
relations.add(relationExtractionDTO);
}
// The title entity is appended only after the loop, so it gets no relation to itself.
eredto.getEntities().add(titleEntity);
// Replaces whatever relations the parser had set on the result.
eredto.setRelations(relations);
return eredto;
}
/**

@ -17,7 +17,7 @@ spring:
chat:
model: qwen2.5:32b
options:
max_tokens: 512
max_tokens: 51200
top_p: 0.9
top_k: 40
temperature: 0.7

@ -11,13 +11,13 @@
<result property="pageNo" column="page_no" jdbcType="INTEGER"/>
<result property="pdfId" column="pdf_id" jdbcType="INTEGER"/>
<result property="tableTitle" column="table_title" jdbcType="VARCHAR"/>
<result property="order" column="order" jdbcType="INTEGER"/>
<result property="displayOrder" column="display_order" jdbcType="INTEGER"/>
<result property="createTime" column="create_time" jdbcType="TIMESTAMP"/>
</resultMap>
<!--
  Shared column-list fragment for this mapper.
  NOTE(review): diff residue with the +/- markers stripped. "order,create_time" is presumably
  the removed line and "display_order,create_time" its replacement, matching the renamed
  displayOrder result mapping. Read literally, the two lines would list create_time twice
  with no comma between them.
-->
<sql id="Base_Column_List">
id,layout_type,content,
page_no,pdf_id,table_title,
order,create_time
display_order,create_time
</sql>
</mapper>

@ -1,13 +1,21 @@
package com.supervision.pdfqaserver;
import com.supervision.pdfqaserver.service.KnowledgeGraphService;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
/**
 * Spring Boot test that drives KnowledgeGraphService.generateGraph end to end for the
 * document id "1" and logs when it finishes. It makes no assertions, so it only fails if
 * the call throws.
 *
 * NOTE(review): diff residue with the +/- markers stripped - "void contextLoads() {" is
 * presumably the removed method header and "void generateGraphTest() {" its replacement.
 * NOTE(review): presumably needs the real database and chat model to be reachable, and a
 * pdf with id 1 to exist - confirm before running in CI.
 */
@Slf4j
@SpringBootTest
class PdfQaServerApplicationTests {
// Service under test, injected from the Spring context.
@Autowired
private KnowledgeGraphService knowledgeGraphService;
@Test
void contextLoads() {
void generateGraphTest() {
// The id is passed as a String; generateGraph converts it with Integer.valueOf.
knowledgeGraphService.generateGraph("1");
log.info("finish...");
}
}

Loading…
Cancel
Save