基于三元组提取领域元数据,出版完成

v_0.0.2
xueqingkun 3 weeks ago
parent 25b4437b9f
commit 8b4ec18483

@ -438,17 +438,17 @@ public class PromptCache {
##
1. ** `ContentType` **
- `ContentType`
- `0`
- `1`
- `2`
1. ** `ContentType` **
- `ContentType`
- `0`
- `1`
- `2`
2. ****
- `ContentType` `{"ContentType": }`
- `{}`
2. ****
- `ContentType` `{"ContentType": }`
- `{}`
3. ****
3. ****
```json
{
"0": "研报类型(行业分析、财务数据)",
@ -487,14 +487,14 @@ public class PromptCache {
---
****
****
- `ContentType`
-
-
##
1. JSON
1. JSON使```json ```Markdown
2.
- `{"ContentType": 0/1/2}`
@ -550,7 +550,7 @@ public class PromptCache {
1.
2.
3.
4. JSON使```json ```Markdown
##
{text}
@ -644,24 +644,18 @@ public class PromptCache {
##
-
{text}
-
{IntentTypeList}
##
1.
2.
- source
- relation
- target
- intent
2.
3. /
- type
- attributes
4. 使
4. JSON使```json ```Markdown
5. 使
```json
[
{
"source": {
@ -677,10 +671,11 @@ public class PromptCache {
"attributes": ["属性3"]
},
"intent": "匹配的意图标签"
}
},
{.....}
]
5.
5. ./no_think
""";
private static final String EXTRACT_ERE_BASE_INTENT_PROMPT = """
@ -690,22 +685,24 @@ public class PromptCache {
##
- {text}
-
{text}
-
{domainMetadata}
##
{
"nodes": [
{
"name":"龙源(酒泉)风力发电有限公司",
"type": "公司",
"attributes": {
"名称": "龙源(酒泉)风力发电有限公司",
"地址": "雨花台区"
}
},
{
"name":"2024年电子银行承兑汇票",
"type": "电子银行承兑汇票",
"attributes": {
"金额": "100.00万元",
@ -713,24 +710,29 @@ public class PromptCache {
}
},
{
"name": "杭州六小龙",
"type": "公司",
"attributes": {
"名称": "杭州六小龙",
"地址": "杭州高新区"
}
}
],
"relations": [
{
{
"source": "龙源(酒泉)风力发电有限公司",
"target": "2024年电子银行承兑汇票",
"type": "持有",
"attributes": {
"持有方式": "纸质"
}
},
{
"source": "龙源(酒泉)风力发电有限公司",
"target": "杭州六小龙",
"type": "收购",
"attributes": {
"收购类型": "全资收购"
"收购时间""2025年5月28号"
"收购类型": "全资收购",
"收购时间": "2025年5月28号"
}
}
],
@ -750,9 +752,11 @@ public class PromptCache {
##
- nodesrelations"名称"
- relationssourcetargetnodesname
- `domainMetadata`
-
-
- JSON使```json ```Markdown
- JSON使```json ```Markdown./no_think
""";
}

@ -21,6 +21,7 @@ public class DomainMetadata implements Serializable {
/**
*
*/
@Deprecated
private String domainType;
/**

@ -26,7 +26,7 @@ public class Intention implements Serializable {
/**
*
*/
private String desc;
private String description;
/**
* id

@ -46,7 +46,7 @@ public class IntentDTO {
/**
 * Builds a DTO by copying the fields of an {@link Intention} entity.
 *
 * @param intention source entity; its getters are called unconditionally,
 *                  so a {@code null} argument results in a NullPointerException
 */
public IntentDTO(Intention intention){
    this.id = intention.getId();
    this.digest = intention.getDigest();
    // The DTO keeps the short field name `desc`, but its value is taken from the
    // entity's `description` property (assigned exactly once).
    this.desc = intention.getDescription();
    this.domainCategoryId = intention.getDomainCategoryId();
    this.generationType = intention.getGenerationType();
}

@ -3,6 +3,7 @@ package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.lang.Assert;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.domain.DomainMetadata;
import com.supervision.pdfqaserver.domain.ErAttribute;
import com.supervision.pdfqaserver.domain.IntentionDomainMetadata;
@ -53,6 +54,7 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
if (null != data){
metadata.setId(data.getId());
}else {
metadata.setDomainCategoryId(domainCategoryId);
super.save(metadata);
}
}
@ -72,7 +74,7 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
metadata.setId(data.getId());
}else {
DomainMetadata domainMetadata = metadata.toDomainMetadata();
domainMetadata.setGenerationType("1");// 1:系统录入
domainMetadata.setGenerationType(DomainMetaGenerationEnum.SYSTEM_AUTO_GENERATION.getCode());// 1:系统录入
this.saveIfNotExists(domainMetadata, domainCategoryId);
metadata.setId(domainMetadata.getId());
}

@ -109,7 +109,6 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
@Override
public void generateGraphBaseTrain(Integer pdfId) {
Assert.notNull(pdfId, "pdfId不能为空");
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
TimeInterval timer = new TimeInterval();
try {
log.info("开始生成知识图谱, pdfId:{}", pdfId);
@ -216,6 +215,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
}
}catch (Exception e){
intentSize ++;
log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
}
@ -265,7 +265,9 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
}
// 查询当前行业分类下的意图
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream().map(IntentDTO::new).distinct().toList();
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream()
.filter(intention -> StrUtil.equals("0",intention.getGenerationType())) // 过滤出手动确认的数据
.map(IntentDTO::new).distinct().toList();
if (CollUtil.isEmpty(intentionDTOs)){
log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId());
return;
@ -275,9 +277,24 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
int index = 1;
int truncateSize = truncateDTOS.size();
log.info("开始实体关系抽取,耗时:{}秒,一共处理片段数:{}个", timer.intervalSecond(), truncateDTOS.size());
List<EREDTO> eredtos = new ArrayList<>();
for (TruncateDTO truncateDTO : truncateDTOS) {
index ++;
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
try {
if (StrUtil.equals(truncateDTO.getLayoutType(), String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
log.info("切分文档id:{},表格类型数据,不进行意图识别...", truncateDTO.getId());
/*EREDTO eredto = conversionPipeline.doEre(truncateDTO, new ArrayList<>());
if (null == eredto){
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
continue;
}
this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
*/
continue;
}
timer.start("makeOutTruncationIntent");
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
@ -296,10 +313,17 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
}
// 保存实体关系抽取结果
this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
}catch (Exception e){
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
}
}
log.info("实体关系抽取完成,耗时:{}秒", timer.intervalSecond());
log.info("开始生成知识图谱...");
timer.start("generateGraph");
generateGraph(eredtos);
log.info("生成知识图谱完成,耗时:{}秒", timer.intervalSecond("generateGraph"));
}
@Override

@ -93,6 +93,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("makeOutTruncationIntent:响应结果:{}", call);
JSONObject json = JSONUtil.parseObj(call);
JSONArray jsonArray = json.getJSONArray("IntentTypeList");
if (null == jsonArray){
return new ArrayList<>();
}
return intents.stream().filter(intent->
jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest())))
.collect(Collectors.toList());
@ -104,8 +107,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
Assert.notEmpty(intents, "意图不能为空");
String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA);
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(intents));
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentTypeList", JSONUtil.toJsonStr(intents));
String format = StrUtil.format(promptTemplate, params);
log.info("makeOutDomainMetadata:format{}", format);
String call = aiCallService.call(format);
log.info("makeOutDomainMetadata:响应结果:{}", call);
return parseDomainMetadata(call);
@ -139,6 +143,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
JSONObject source = jsonObject.getJSONObject("source");
JSONObject relation = jsonObject.getJSONObject("relation");
JSONObject target = jsonObject.getJSONObject("target");
domainMetadataDTO.setIntentDigest(jsonObject.getStr("intent"));
if (null != source){
String type = source.getStr("type");
JSONArray attributes = source.getJSONArray("attributes");
@ -189,6 +194,14 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
return null;
}
List<DomainMetadataDTO> domainMetadataDTOS = domainMetadataService.listByIntentionIds(intentIds);
log.info("doEre:领域元数据列表个数:{}", domainMetadataDTOS.size());
domainMetadataDTOS = domainMetadataDTOS.stream()
.filter(domainMetadataDTO -> StrUtil.equals(domainMetadataDTO.getGenerationType(), "0"))// 过滤出手动确认的数据
.collect(Collectors.toList());
log.info("doEre:领域元数据列表已经手动确认过的个数:{}", domainMetadataDTOS.size());
if (CollUtil.isEmpty(domainMetadataDTOS)){
return null;
}
return doTextEreWithMetadata(truncateDTO, domainMetadataDTOS);
}
@ -220,9 +233,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
@Override
public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) {
int maxTextLength = 1000;
int minTextLength = 800;
int INITIAL_BUFFER_SIZE = 1500;
int maxTextLength = 600;
int minTextLength = 500;
int INITIAL_BUFFER_SIZE = 100;
// 对pdfAnalysisOutputs进行排序
List<DocumentDTO> documentDTOList = documents.stream().sorted(
// 先对pageNo进行排序再对layoutOrder进行排序
@ -240,7 +253,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
List<TruncateDTO> truncateDTOS = new ArrayList<>();
StringBuilder truncateTextBuild = new StringBuilder(1500);
DocumentDTO documentDTOLast = null;
for (DocumentDTO documentDTO : documentDTOList) {
documentDTOLast = documentDTO;
String content = documentDTO.getContent();
if (StrUtil.isEmpty(content)){
continue;
@ -274,13 +289,12 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
}
}
}
// 处理剩余内容
} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
// 如果是表格类型的布局,进行切分
// 出现表格后如果truncateTextBuild不为空单独作为一个片段
if (!truncateTextBuild.isEmpty()) {
truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString()));
}
} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
// 如果是表格类型的布局,进行切分
// 提前抽取表名
TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle());
if (null != tableTitleDTO && StrUtil.isNotEmpty(tableTitleDTO.getTitle())){
@ -317,6 +331,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("sliceDocuments:错误的布局类型: {}", layoutType);
}
}
if (!truncateTextBuild.isEmpty() && null != documentDTOLast) {
truncateDTOS.add(new TruncateDTO(documentDTOLast, truncateTextBuild.toString()));
}
return truncateDTOS;
}
@ -378,7 +395,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
Map<String, String> params = Map.of("text", truncateDTO.getContent(), "domainMetadata", domainMetadata);
String format = StrUtil.format(prompt, params);
String call = aiCallService.call(format);
return null;
return EREDTO.fromTextJson(call, truncateDTO.getId());
}

@ -7,7 +7,7 @@
<resultMap id="BaseResultMap" type="com.supervision.pdfqaserver.domain.Intention">
<id property="id" column="id" jdbcType="VARCHAR"/>
<result property="digest" column="digest" jdbcType="VARCHAR"/>
<result property="desc" column="desc" jdbcType="VARCHAR"/>
<result property="description" column="description" jdbcType="VARCHAR"/>
<result property="domainCategoryId" column="domain_category_id" jdbcType="VARCHAR"/>
<result property="generationType" column="generation_type" jdbcType="VARCHAR"/>
<result property="createTime" column="create_time" jdbcType="TIMESTAMP"/>
@ -15,7 +15,7 @@
</resultMap>
<!-- Reusable column list for the Intention mapping; every column appears exactly once
     and the text column is `description`, matching the BaseResultMap entry. -->
<sql id="Base_Column_List">
id,digest,description,generation_type,
domain_category_id,create_time,update_time
</sql>
</mapper>

@ -150,5 +150,17 @@ class PdfQaServerApplicationTests {
System.out.println(strings);
}
/**
 * Smoke test that invokes {@code metaDataTrain} on the knowledge graph service.
 * It contains no assertions, so it fails only if the call throws.
 */
@Test
public void metaDataTrainTest() {
    // NOTE(review): 13 is presumably the id of a PDF already present in the
    // environment this test runs against — confirm before relying on it.
    final int sampleId = 13;
    knowledgeGraphService.metaDataTrain(sampleId);
}
/**
 * Smoke test that invokes {@code generateGraphBaseTrain} on the knowledge graph service.
 * It contains no assertions, so it fails only if the call throws.
 */
@Test
void generateGraphBaseTrainTest() {
    // NOTE(review): assumes a PDF with id 13 exists in the target environment — confirm.
    final int pdfId = 13;
    knowledgeGraphService.generateGraphBaseTrain(pdfId);
}
}

Loading…
Cancel
Save