基于三元组提取领域元数据，初版完成

v_0.0.2
xueqingkun 3 weeks ago
parent 25b4437b9f
commit 8b4ec18483

@ -438,17 +438,17 @@ public class PromptCache {
## ##
1. ** `ContentType` ** 1. ** `ContentType` **
- `ContentType` - `ContentType`
- `0` - `0`
- `1` - `1`
- `2` - `2`
2. **** 2. ****
- `ContentType` `{"ContentType": }` - `ContentType` `{"ContentType": }`
- `{}` - `{}`
3. **** 3. ****
```json ```json
{ {
"0": "研报类型(行业分析、财务数据)", "0": "研报类型(行业分析、财务数据)",
@ -487,14 +487,14 @@ public class PromptCache {
--- ---
**** ****
- `ContentType` - `ContentType`
- -
- -
## ##
1. JSON 1. JSON使```json ```Markdown
2. 2.
- `{"ContentType": 0/1/2}` - `{"ContentType": 0/1/2}`
@ -550,7 +550,7 @@ public class PromptCache {
1. 1.
2. 2.
3. 3.
4. JSON使```json ```Markdown
## ##
{text} {text}
@ -644,24 +644,18 @@ public class PromptCache {
## ##
- -
{text} {text}
- -
{IntentTypeList} {IntentTypeList}
## ##
1. 1.
2. 2.
- source
- relation
- target
- intent
3. / 3. /
- type - type
- attributes - attributes
4. 使 4. JSON使```json ```Markdown
5. 使
```json
[ [
{ {
"source": { "source": {
@ -677,10 +671,11 @@ public class PromptCache {
"attributes": ["属性3"] "attributes": ["属性3"]
}, },
"intent": "匹配的意图标签" "intent": "匹配的意图标签"
} },
{.....}
] ]
5. 5. ./no_think
"""; """;
private static final String EXTRACT_ERE_BASE_INTENT_PROMPT = """ private static final String EXTRACT_ERE_BASE_INTENT_PROMPT = """
@ -690,22 +685,24 @@ public class PromptCache {
## ##
- {text} -
{text}
- -
{domainMetadata} {domainMetadata}
## ##
{ {
"nodes": [ "nodes": [
{ {
"name":"龙源(酒泉)风力发电有限公司",
"type": "公司", "type": "公司",
"attributes": { "attributes": {
"名称": "龙源(酒泉)风力发电有限公司",
"地址": "雨花台区" "地址": "雨花台区"
} }
}, },
{ {
"name":"2024年电子银行承兑汇票",
"type": "电子银行承兑汇票", "type": "电子银行承兑汇票",
"attributes": { "attributes": {
"金额": "100.00万元", "金额": "100.00万元",
@ -713,24 +710,29 @@ public class PromptCache {
} }
}, },
{ {
"name": "杭州六小龙",
"type": "公司", "type": "公司",
"attributes": { "attributes": {
"名称": "杭州六小龙",
"地址": "杭州高新区" "地址": "杭州高新区"
} }
} }
], ],
"relations": [ "relations": [
{ {
"source": "龙源(酒泉)风力发电有限公司",
"target": "2024年电子银行承兑汇票",
"type": "持有", "type": "持有",
"attributes": { "attributes": {
"持有方式": "纸质"
} }
}, },
{ {
"source": "龙源(酒泉)风力发电有限公司",
"target": "杭州六小龙",
"type": "收购", "type": "收购",
"attributes": { "attributes": {
"收购类型": "全资收购" "收购类型": "全资收购",
"收购时间""2025年5月28号" "收购时间": "2025年5月28号"
} }
} }
], ],
@ -750,9 +752,11 @@ public class PromptCache {
## ##
- nodesrelations"名称"
- relationssourcetargetnodesname
- `domainMetadata` - `domainMetadata`
- -
- -
- JSON使```json ```Markdown - JSON使```json ```Markdown./no_think
"""; """;
} }

@ -21,6 +21,7 @@ public class DomainMetadata implements Serializable {
/** /**
* *
*/ */
@Deprecated
private String domainType; private String domainType;
/** /**

@ -26,7 +26,7 @@ public class Intention implements Serializable {
/** /**
* *
*/ */
private String desc; private String description;
/** /**
* id * id

@ -46,7 +46,7 @@ public class IntentDTO {
public IntentDTO(Intention intention){ public IntentDTO(Intention intention){
this.id = intention.getId(); this.id = intention.getId();
this.digest = intention.getDigest(); this.digest = intention.getDigest();
this.desc = intention.getDesc(); this.desc = intention.getDescription();
this.domainCategoryId = intention.getDomainCategoryId(); this.domainCategoryId = intention.getDomainCategoryId();
this.generationType = intention.getGenerationType(); this.generationType = intention.getGenerationType();
} }

@ -3,6 +3,7 @@ package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.lang.Assert; import cn.hutool.core.lang.Assert;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.domain.DomainMetadata; import com.supervision.pdfqaserver.domain.DomainMetadata;
import com.supervision.pdfqaserver.domain.ErAttribute; import com.supervision.pdfqaserver.domain.ErAttribute;
import com.supervision.pdfqaserver.domain.IntentionDomainMetadata; import com.supervision.pdfqaserver.domain.IntentionDomainMetadata;
@ -53,6 +54,7 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
if (null != data){ if (null != data){
metadata.setId(data.getId()); metadata.setId(data.getId());
}else { }else {
metadata.setDomainCategoryId(domainCategoryId);
super.save(metadata); super.save(metadata);
} }
} }
@ -72,7 +74,7 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
metadata.setId(data.getId()); metadata.setId(data.getId());
}else { }else {
DomainMetadata domainMetadata = metadata.toDomainMetadata(); DomainMetadata domainMetadata = metadata.toDomainMetadata();
domainMetadata.setGenerationType("1");// 1:系统录入 domainMetadata.setGenerationType(DomainMetaGenerationEnum.SYSTEM_AUTO_GENERATION.getCode());// 1:系统录入
this.saveIfNotExists(domainMetadata, domainCategoryId); this.saveIfNotExists(domainMetadata, domainCategoryId);
metadata.setId(domainMetadata.getId()); metadata.setId(domainMetadata.getId());
} }

@ -109,7 +109,6 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
@Override @Override
public void generateGraphBaseTrain(Integer pdfId) { public void generateGraphBaseTrain(Integer pdfId) {
Assert.notNull(pdfId, "pdfId不能为空"); Assert.notNull(pdfId, "pdfId不能为空");
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
TimeInterval timer = new TimeInterval(); TimeInterval timer = new TimeInterval();
try { try {
log.info("开始生成知识图谱, pdfId:{}", pdfId); log.info("开始生成知识图谱, pdfId:{}", pdfId);
@ -216,6 +215,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId()); domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
} }
}catch (Exception e){ }catch (Exception e){
intentSize ++;
log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e); log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
} }
@ -265,7 +265,9 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
log.info("切分数据完成,切分个数:{}", truncateDTOS.size()); log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
} }
// 查询当前行业分类下的意图 // 查询当前行业分类下的意图
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream().map(IntentDTO::new).distinct().toList(); List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream()
.filter(intention -> StrUtil.equals("0",intention.getGenerationType())) // 过滤出手动确认的数据
.map(IntentDTO::new).distinct().toList();
if (CollUtil.isEmpty(intentionDTOs)){ if (CollUtil.isEmpty(intentionDTOs)){
log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId()); log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId());
return; return;
@ -275,9 +277,24 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
int index = 1; int index = 1;
int truncateSize = truncateDTOS.size(); int truncateSize = truncateDTOS.size();
log.info("开始实体关系抽取,耗时:{}秒,一共处理片段数:{}个", timer.intervalSecond(), truncateDTOS.size()); log.info("开始实体关系抽取,耗时:{}秒,一共处理片段数:{}个", timer.intervalSecond(), truncateDTOS.size());
List<EREDTO> eredtos = new ArrayList<>();
for (TruncateDTO truncateDTO : truncateDTOS) { for (TruncateDTO truncateDTO : truncateDTOS) {
index ++;
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2)); log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
try { try {
if (StrUtil.equals(truncateDTO.getLayoutType(), String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
log.info("切分文档id:{},表格类型数据,不进行意图识别...", truncateDTO.getId());
/*EREDTO eredto = conversionPipeline.doEre(truncateDTO, new ArrayList<>());
if (null == eredto){
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
continue;
}
this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
*/
continue;
}
timer.start("makeOutTruncationIntent"); timer.start("makeOutTruncationIntent");
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId()); log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs); List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
@ -296,10 +313,17 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
} }
// 保存实体关系抽取结果 // 保存实体关系抽取结果
this.saveERE(eredto, truncateDTO.getId()); this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
}catch (Exception e){ }catch (Exception e){
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e); log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
} }
} }
log.info("实体关系抽取完成,耗时:{}秒", timer.intervalSecond());
log.info("开始生成知识图谱...");
timer.start("generateGraph");
generateGraph(eredtos);
log.info("生成知识图谱完成,耗时:{}秒", timer.intervalSecond("generateGraph"));
} }
@Override @Override

@ -93,6 +93,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("makeOutTruncationIntent:响应结果:{}", call); log.info("makeOutTruncationIntent:响应结果:{}", call);
JSONObject json = JSONUtil.parseObj(call); JSONObject json = JSONUtil.parseObj(call);
JSONArray jsonArray = json.getJSONArray("IntentTypeList"); JSONArray jsonArray = json.getJSONArray("IntentTypeList");
if (null == jsonArray){
return new ArrayList<>();
}
return intents.stream().filter(intent-> return intents.stream().filter(intent->
jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest()))) jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest())))
.collect(Collectors.toList()); .collect(Collectors.toList());
@ -104,8 +107,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
Assert.notEmpty(intents, "意图不能为空"); Assert.notEmpty(intents, "意图不能为空");
String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA); String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA);
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(intents)); Map<String, String> params = Map.of("text", truncate.getContent(), "IntentTypeList", JSONUtil.toJsonStr(intents));
String format = StrUtil.format(promptTemplate, params); String format = StrUtil.format(promptTemplate, params);
log.info("makeOutDomainMetadata:format{}", format);
String call = aiCallService.call(format); String call = aiCallService.call(format);
log.info("makeOutDomainMetadata:响应结果:{}", call); log.info("makeOutDomainMetadata:响应结果:{}", call);
return parseDomainMetadata(call); return parseDomainMetadata(call);
@ -139,6 +143,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
JSONObject source = jsonObject.getJSONObject("source"); JSONObject source = jsonObject.getJSONObject("source");
JSONObject relation = jsonObject.getJSONObject("relation"); JSONObject relation = jsonObject.getJSONObject("relation");
JSONObject target = jsonObject.getJSONObject("target"); JSONObject target = jsonObject.getJSONObject("target");
domainMetadataDTO.setIntentDigest(jsonObject.getStr("intent"));
if (null != source){ if (null != source){
String type = source.getStr("type"); String type = source.getStr("type");
JSONArray attributes = source.getJSONArray("attributes"); JSONArray attributes = source.getJSONArray("attributes");
@ -189,6 +194,14 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
return null; return null;
} }
List<DomainMetadataDTO> domainMetadataDTOS = domainMetadataService.listByIntentionIds(intentIds); List<DomainMetadataDTO> domainMetadataDTOS = domainMetadataService.listByIntentionIds(intentIds);
log.info("doEre:领域元数据列表个数:{}", domainMetadataDTOS.size());
domainMetadataDTOS = domainMetadataDTOS.stream()
.filter(domainMetadataDTO -> StrUtil.equals(domainMetadataDTO.getGenerationType(), "0"))// 过滤出手动确认的数据
.collect(Collectors.toList());
log.info("doEre:领域元数据列表已经手动确认过的个数:{}", domainMetadataDTOS.size());
if (CollUtil.isEmpty(domainMetadataDTOS)){
return null;
}
return doTextEreWithMetadata(truncateDTO, domainMetadataDTOS); return doTextEreWithMetadata(truncateDTO, domainMetadataDTOS);
} }
@ -220,9 +233,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
@Override @Override
public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) { public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) {
int maxTextLength = 1000; int maxTextLength = 600;
int minTextLength = 800; int minTextLength = 500;
int INITIAL_BUFFER_SIZE = 1500; int INITIAL_BUFFER_SIZE = 100;
// 对pdfAnalysisOutputs进行排序 // 对pdfAnalysisOutputs进行排序
List<DocumentDTO> documentDTOList = documents.stream().sorted( List<DocumentDTO> documentDTOList = documents.stream().sorted(
// 先对pageNo进行排序再对layoutOrder进行排序 // 先对pageNo进行排序再对layoutOrder进行排序
@ -240,7 +253,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
StanfordCoreNLP pipeline = new StanfordCoreNLP(props); StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
List<TruncateDTO> truncateDTOS = new ArrayList<>(); List<TruncateDTO> truncateDTOS = new ArrayList<>();
StringBuilder truncateTextBuild = new StringBuilder(1500); StringBuilder truncateTextBuild = new StringBuilder(1500);
DocumentDTO documentDTOLast = null;
for (DocumentDTO documentDTO : documentDTOList) { for (DocumentDTO documentDTO : documentDTOList) {
documentDTOLast = documentDTO;
String content = documentDTO.getContent(); String content = documentDTO.getContent();
if (StrUtil.isEmpty(content)){ if (StrUtil.isEmpty(content)){
continue; continue;
@ -274,13 +289,12 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
} }
} }
} }
// 处理剩余内容 } else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
// 如果是表格类型的布局,进行切分
// 出现表格后如果truncateTextBuild不为空单独作为一个片段
if (!truncateTextBuild.isEmpty()) { if (!truncateTextBuild.isEmpty()) {
truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString())); truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString()));
} }
} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
// 如果是表格类型的布局,进行切分
// 提前抽取表名 // 提前抽取表名
TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle()); TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle());
if (null != tableTitleDTO && StrUtil.isNotEmpty(tableTitleDTO.getTitle())){ if (null != tableTitleDTO && StrUtil.isNotEmpty(tableTitleDTO.getTitle())){
@ -317,6 +331,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("sliceDocuments:错误的布局类型: {}", layoutType); log.info("sliceDocuments:错误的布局类型: {}", layoutType);
} }
} }
if (!truncateTextBuild.isEmpty() && null != documentDTOLast) {
truncateDTOS.add(new TruncateDTO(documentDTOLast, truncateTextBuild.toString()));
}
return truncateDTOS; return truncateDTOS;
} }
@ -378,7 +395,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
Map<String, String> params = Map.of("text", truncateDTO.getContent(), "domainMetadata", domainMetadata); Map<String, String> params = Map.of("text", truncateDTO.getContent(), "domainMetadata", domainMetadata);
String format = StrUtil.format(prompt, params); String format = StrUtil.format(prompt, params);
String call = aiCallService.call(format); String call = aiCallService.call(format);
return null; return EREDTO.fromTextJson(call, truncateDTO.getId());
} }

@ -7,7 +7,7 @@
<resultMap id="BaseResultMap" type="com.supervision.pdfqaserver.domain.Intention"> <resultMap id="BaseResultMap" type="com.supervision.pdfqaserver.domain.Intention">
<id property="id" column="id" jdbcType="VARCHAR"/> <id property="id" column="id" jdbcType="VARCHAR"/>
<result property="digest" column="digest" jdbcType="VARCHAR"/> <result property="digest" column="digest" jdbcType="VARCHAR"/>
<result property="desc" column="desc" jdbcType="VARCHAR"/> <result property="description" column="description" jdbcType="VARCHAR"/>
<result property="domainCategoryId" column="domain_category_id" jdbcType="VARCHAR"/> <result property="domainCategoryId" column="domain_category_id" jdbcType="VARCHAR"/>
<result property="generationType" column="generation_type" jdbcType="VARCHAR"/> <result property="generationType" column="generation_type" jdbcType="VARCHAR"/>
<result property="createTime" column="create_time" jdbcType="TIMESTAMP"/> <result property="createTime" column="create_time" jdbcType="TIMESTAMP"/>
@ -15,7 +15,7 @@
</resultMap> </resultMap>
<sql id="Base_Column_List"> <sql id="Base_Column_List">
id,digest,desc,generation_type, id,digest,description,generation_type,
domain_category_id,create_time,update_time domain_category_id,create_time,update_time
</sql> </sql>
</mapper> </mapper>

@ -150,5 +150,17 @@ class PdfQaServerApplicationTests {
System.out.println(strings); System.out.println(strings);
} }
/**
 * Smoke test that invokes {@code knowledgeGraphService.metaDataTrain} with a fixed id.
 * It makes no assertions, so it only fails if the call throws.
 */
@Test
public void metaDataTrainTest() {
    // NOTE(review): presumably the id of an existing PDF record in the test environment — confirm.
    final int sampleId = 13;
    knowledgeGraphService.metaDataTrain(sampleId);
}
/**
 * Smoke test that invokes {@code knowledgeGraphService.generateGraphBaseTrain} with a fixed PDF id.
 * It makes no assertions, so it only fails if the call throws.
 */
@Test
void generateGraphBaseTrainTest() {
    // NOTE(review): assumes a PDF record with this id exists in the test environment — confirm.
    final int pdfId = 13;
    knowledgeGraphService.generateGraphBaseTrain(pdfId);
}
} }

Loading…
Cancel
Save