diff --git a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java
index 6712cc9..510aed8 100644
--- a/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java
+++ b/src/main/java/com/supervision/pdfqaserver/cache/PromptCache.java
@@ -16,6 +16,9 @@ public class PromptCache {
     public static final String CHINESE_TO_ENGLISH = "CHINESE_TO_ENGLISH";
 
     public static final String ERE_TO_INSERT_CYPHER = "ERE_TO_INSERT_CYPHER";
+
+    public static final String CLASSIFY_TABLE = "CLASSIFY_TABLE";
+
     public static final Map promptMap = new HashMap<>();
 
     static {
@@ -28,6 +31,7 @@ public class PromptCache {
         promptMap.put(ERE_TO_INSERT_CYPHER, ERE_TO_INSERT_CYPHER_PROMPT);
         promptMap.put(TEXT_TO_CYPHER, TEXT_TO_CYPHER_PROMPT);
         promptMap.put(GENERATE_ANSWER, GENERATE_ANSWER_PROMPT);
+        promptMap.put(CLASSIFY_TABLE, CLASSIFY_TABLE_PROMPT);
     }
 
 
@@ -327,4 +331,46 @@ public class PromptCache {
             ### 请转换以下三元组:
             {}
             """;
+
+    private static final String CLASSIFY_TABLE_PROMPT = """
+            你是一个表格数据处理专家,直接给出结果,不要解释。
+            **请根据表格行的标题类型,区分表格行标题是否是描述性标题:**
+
+            1. **描述性标题型定义**
+            - 行标题为**描述性文本**(如审计事项说明、应对措施等)
+            - 内容以**段落式文字**为主,而非结构化数据
+            - 列数较少(通常2列),且列标题为**概括性说明**(如"关键审计事项"、"审计应对")
+            - 示例:
+            ```
+            | 关键审计事项 | 在审计中如何应对该事项 |
+            |---------------------------|-----------------------------|
+            | 无形资产减值准备... | 我们对管理层...进行测试... |
+            ```
+
+            2. **分类标签型定义**
+            - 行标题为**分类标签**(如会计科目、项目名称)
+            - 内容以**结构化数据**为主(如数字、日期、代码)
+            - 列数较多(通常≥6列),且列标题为**具体分类**(如"2023年12月31日"、"附注")
+            - 示例:
+            ```
+            | 项目 | 附注 | 2023年12月31日 | 2023年1月1日 |
+            |--------------|------|---------------------|-------------------|
+            | 货币资金 | 六.1 | 4,879,272,436.13 | 20,493,232,077.05 |
+            ```
+
+            **输出要求**
+            - 如果是描述性标题输出**是**,否则输出**否**
+            - 不需要解释说明
+
+            **示例表格**
+            | 三产收人确认 关键审计事项 | 在审计中如何应对该事项 |
+            | --- | --- |
+            | 如财务报表附注六注释49营业收入、营 业成本所示,2023年营业收入376\\.42亿 元与上年相比减少5\\.57%,由于收入是 龙源电力的关键绩效指标之一,我们将 其收入的确认作为关键审计事项。 | 我们对收入确认执行的审计程序包括: 1、评价管理层与销售和收款相关的内部控制的设 计和运行有效性;|
+
+            **期望输出**
+            是
+
+            请处理以下表格:
+            {}
+            """;
 }
diff --git a/src/main/java/com/supervision/pdfqaserver/dto/TableTitleDTO.java b/src/main/java/com/supervision/pdfqaserver/dto/TableTitleDTO.java
new file mode 100644
index 0000000..214418d
--- /dev/null
+++ b/src/main/java/com/supervision/pdfqaserver/dto/TableTitleDTO.java
@@ -0,0 +1,18 @@
+package com.supervision.pdfqaserver.dto;
+
+import lombok.Data;
+
+@Data
+public class TableTitleDTO {
+
+    private String title;
+
+    // Prepared by (the party that compiled the table)
+    private String createdBy;
+
+    // Time of preparation
+    private String createdByTime;
+
+    // Monetary unit of the amounts
+    private String amountUnit;
+}
diff --git a/src/main/java/com/supervision/pdfqaserver/service/TripleConversionPipeline.java b/src/main/java/com/supervision/pdfqaserver/service/TripleConversionPipeline.java
index 094f16f..60a5510 100644
--- a/src/main/java/com/supervision/pdfqaserver/service/TripleConversionPipeline.java
+++ b/src/main/java/com/supervision/pdfqaserver/service/TripleConversionPipeline.java
@@ -2,6 +2,7 @@ package com.supervision.pdfqaserver.service;
 
 import com.supervision.pdfqaserver.dto.EREDTO;
 import com.supervision.pdfqaserver.dto.DocumentDTO;
+import com.supervision.pdfqaserver.dto.TableTitleDTO;
 import com.supervision.pdfqaserver.dto.TruncateDTO;
 
 import java.util.List;
@@ -26,6 +27,21 @@ public interface TripleConversionPipeline {
      */
     EREDTO doEre(TruncateDTO truncateDTO);
 
+    /**
+     * Classifies whether a table is a descriptive table.
+     * @param content document content
+     * @return true - descriptive table; false - non-descriptive table
+     */
+    Boolean classify(String content);
+
+
+    /**
+     * Extracts the table title.
+     * @param content document content
+     * @return the extracted table title information
+     */
+    TableTitleDTO extractTableTitle(String content);
+
     /**
      * 合并实体关系抽取结果
      * @param eredtoList 实体关系抽取结果列表
diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java
index 0e1bf36..f2e7dbb 100644
--- a/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java
+++ b/src/main/java/com/supervision/pdfqaserver/service/impl/KnowledgeGraphServiceImpl.java
@@ -67,7 +67,12 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
         log.info("开始命名实体识别...");
         List eredtoList = new ArrayList<>();
         for (TruncateDTO truncateDTO : truncateDTOS) {
-            EREDTO eredto = tripleConversionPipeline.doEre(truncateDTO);
+            EREDTO eredto = null;
+            try {
+                eredto = tripleConversionPipeline.doEre(truncateDTO);
+            } catch (Exception e) {
+                log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
+            }
             if (null == eredto){
                 continue;
             }
diff --git a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java
index b2a79fd..263e0cc 100644
--- a/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java
+++ b/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImpl.java
@@ -1,6 +1,8 @@
 package com.supervision.pdfqaserver.service.impl;
 
 import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.lang.Assert;
+import cn.hutool.core.util.BooleanUtil;
 import cn.hutool.core.util.StrUtil;
 import com.supervision.pdfqaserver.cache.PromptCache;
 import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
@@ -74,22 +76,57 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
     public EREDTO doEre(TruncateDTO truncateDTO) {
 
         if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){
+            return doTextEre(truncateDTO);
+        }
 
-            try {
+        if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
+            // First determine whether the table is a descriptive (prose-style) table
+            Boolean classify = this.classify(truncateDTO.getContent());
+            if (null == classify){
+                log.info("doEre:表格分类结果为空,切分文档id:{}", truncateDTO.getId());
+                return null;
+            }
+            if (classify){
                 return doTextEre(truncateDTO);
-            } catch (Exception e) {
-                log.error("doEre:文本实体关系抽取失败,内容:{}", truncateDTO.getContent(), e);
             }
+            return doTableEre(truncateDTO);
         }
+        log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
+        return null;
+    }
 
-        if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
-            try {
-                return doTableEre(truncateDTO);
-            } catch (Exception e) {
-                log.error("doEre:表格实体关系抽取失败,内容:{}", truncateDTO.getContent(), e);
+    /**
+     * Asks the chat model whether a table is a "descriptive title" table.
+     *
+     * @param content table text; must not be empty (checked by {@code Assert.notEmpty})
+     * @return {@code true} for a descriptive table, {@code false} otherwise, or
+     *         {@code null} when the reply cannot be read as a yes/no answer
+     */
+    @Override
+    public Boolean classify(String content) {
+        Assert.notEmpty(content, "内容不能为空");
+        // Keep the request small: at most the first five lines of the table are sent.
+        String[] lines = content.split("\n");
+        if (lines.length > 5){
+            StringBuilder sb = new StringBuilder();
+            for (int i = 0; i < 5; i++) {
+                sb.append(lines[i]).append("\n");
             }
+            content = sb.toString();
         }
-        log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
+        log.info("classify:开始进行实体关系分类,内容:{}", content);
+        String prompt = PromptCache.promptMap.get(PromptCache.CLASSIFY_TABLE);
+
+        String format = StrUtil.format(prompt, content);
+        String response = ollamaChatModel.call(format);
+        log.info("classify响应结果:{}", response);
+        // The prompt spells the answers as **是** / **否**, so the model may echo the
+        // markdown emphasis; strip '*' so the bare keyword can be recognised.
+        return BooleanUtil.toBooleanObject(StrUtil.removeAll(response, '*'));
+    }
+
+    @Override
+    public TableTitleDTO extractTableTitle(String content) {
         return null;
     }
 
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 7c51fbe..4f2f7e8 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -25,4 +25,10 @@ neo4j:
   driver:
     uri: bolt://192.168.10.137:17687
     user: neo4j
-    password: 12345678
\ No newline at end of file
+    password: 12345678
+
+graph:
+  generate:
+    thread-pool:
+      core: 2
+      max: 4
\ No newline at end of file
diff --git a/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java b/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java
index 77d6810..7766570 100644
--- a/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java
+++ b/src/test/java/com/supervision/pdfqaserver/PdfQaServerApplicationTests.java
@@ -2,6 +2,7 @@ package com.supervision.pdfqaserver;
 
 import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator;
 import com.supervision.pdfqaserver.service.KnowledgeGraphService;
+import com.supervision.pdfqaserver.service.TripleConversionPipeline;
 import lombok.extern.slf4j.Slf4j;
 import org.junit.jupiter.api.Test;
 import org.neo4j.driver.*;
@@ -89,5 +90,19 @@ class PdfQaServerApplicationTests {
         System.out.println("翻译结果: " + english);
     }
 
+    @Autowired
+    TripleConversionPipeline tripleConversionPipeline;
+    @Test
+    void testChinesEsToEnglishGenerator2() {
+        String s = """
+                | | 项 日 | | | 附注 | 2023年12月31日 | 2023年1月1日 |
+                |------------------------------------------|---------|--|------|----|--------------------|--------------------|
+                | 流动资产: | | | | | | |
+                | 货币资金 | | | 六、1 | | 4,879,272,436.13 | 20,493,232,077.05 |
+                """;
+        Boolean classify = tripleConversionPipeline.classify(s);
+        System.out.println(classify);
+    }
+
 
 }