基于三元组提取领域元数据...

v_0.0.2
xueqingkun 3 weeks ago
parent 3f82f979c6
commit 25b4437b9f

@ -21,6 +21,38 @@ public class PromptCache {
public static final String EXTRACT_TABLE_TITLE = "EXTRACT_TABLE_TITLE";
/**
 * Key of the prompt that classifies the content type of a PDF.
 */
public static final String CLASSIFY_CONTENT_TYPE = "CLASSIFY_CONTENT_TYPE";
/**
 * Key of the prompt that classifies the industry a text belongs to.
 */
public static final String CLASSIFY_INDUSTRY = "CLASSIFY_INDUSTRY";
/**
 * Key of the prompt that matches a text against a supplied list of intent types.
 */
public static final String CLASSIFY_INTENT = "CLASSIFY_INTENT";
/**
 * Key of the intent-recognition prompt whose template takes only the text
 * (no candidate intent list); presumably the variant used while training — confirm.
 */
public static final String CLASSIFY_INTENT_TRAIN = "CLASSIFY_INTENT_TRAIN";
/**
 * Key of the prompt that extracts source / relation / target metadata for recognised intents.
 */
public static final String EXTRACT_INTENT_METADATA = "EXTRACT_INTENT_METADATA";
/**
 * Key of the prompt that extracts nodes, relations and typed triplets from a text
 * using supplied domain metadata.
 */
public static final String EXTRACT_ERE_BASE_INTENT = "EXTRACT_ERE_BASE_INTENT";
public static final Map<String, String> promptMap = new HashMap<>();
static {
@ -35,6 +67,12 @@ public class PromptCache {
promptMap.put(GENERATE_ANSWER, GENERATE_ANSWER_PROMPT);
promptMap.put(CLASSIFY_TABLE, CLASSIFY_TABLE_PROMPT);
promptMap.put(EXTRACT_TABLE_TITLE, EXTRACT_TABLE_TITLE_PROMPT);
promptMap.put(CLASSIFY_CONTENT_TYPE, CLASSIFY_CONTENT_TYPE_PROMPT);
promptMap.put(CLASSIFY_INDUSTRY, CLASSIFY_INDUSTRY_PROMPT);
promptMap.put(CLASSIFY_INTENT, CLASSIFY_INTENT_PROMPT);
promptMap.put(CLASSIFY_INTENT_TRAIN, CLASSIFY_INTENT_TRAIN_PROMPT);
promptMap.put(EXTRACT_INTENT_METADATA, EXTRACT_INTENT_METADATA_PROMPT);
promptMap.put(EXTRACT_ERE_BASE_INTENT, EXTRACT_ERE_BASE_INTENT_PROMPT);
}
@ -381,4 +419,340 @@ public class PromptCache {
****
{}
""";
/**
 * Prompt template for classifying a PDF excerpt as content type 0, 1 or 2.
 * The template carries two named placeholders, ContentType and text (each wrapped in
 * curly braces). It also contains literal JSON braces and empty brace pairs that are part
 * of the expected-output examples and must not be treated as placeholders.
 * The expected answer shown in the examples is a JSON object with a ContentType key,
 * or an empty JSON object.
 */
private static final String CLASSIFY_CONTENT_TYPE_PROMPT = """
# PDF
##
`ContentType`PDFJSON
##
{ContentType}
##
PDF:
{text}
##
1. ** `ContentType` **
- `ContentType`
- `0`
- `1`
- `2`
2. ****
- `ContentType` `{"ContentType": }`
- `{}`
3. ****
```json
{
"0": "研报类型(行业分析、财务数据)",
"1": "对话类型(会议记录、问答交流)",
"2": "记录类型(操作日志、事务记录)"
}
```
##
```json
// 示例1指定类型0文本符合研报特征
{
"text": "2023年新能源汽车渗透率达35%乘联会预计2024年突破50%"
}
{"ContentType": 0}
// 示例2指定类型1文本不符合对话特征
{
"text": "系统启动执行数据同步"
}
{}
// 示例3指定类型2文本符合记录特征
{
"text": "2023-10-01 14:00 用户登录异常14:05 触发安全警报"
}
{"ContentType": 2}
```
---
****
- `ContentType`
-
-
##
1. JSON
2.
- `{"ContentType": 0/1/2}`
- `{}`
./no_think
""";
/**
 * Prompt template for choosing the industry category of a text.
 * Named placeholders (each wrapped in curly braces): text and industryCategory.
 * The output sample at the end of the template is an object with a single
 * industryCategory key.
 */
private static final String CLASSIFY_INDUSTRY_PROMPT = """
###
###
```
{text}
```
###
{industryCategory}
###
* ****
* ****
*
* JSON使```json ```Markdown
###
```
{
industryCategory:
}
```
""";
/**
 * Prompt template for matching a text against a given list of intent types.
 * Named placeholders (each wrapped in curly braces): IntentType and text.
 * The examples show the expected answer as a JSON object with an IntentTypeList array,
 * or an empty JSON object when nothing matches.
 * NOTE(review): the json code fence opened before the examples is not closed before the
 * end of the template — confirm whether that is intentional.
 */
private static final String CLASSIFY_INTENT_PROMPT = """
#
##
##
{IntentType}
##
1.
2.
3.
##
{text}
##
```json
// 示例1匹配单个意图
{
"text": "本公司注册地址为上海市浦东新区张江高科技园区"
}
{
"IntentTypeList": ["公司地址"]
}
// 示例2匹配多个意图
{
"text": "2023年度财务报告显示公司总部位于北京全年营收..."
}
{
"IntentTypeList": ["公司地址", "公司年度报告"]
}
// 示例3无匹配意图
{
"text": "今天的天气很适合户外活动"
}
{}
""";
/**
 * Prompt template for open-ended intent recognition on a text: unlike the
 * list-constrained intent prompt, it takes no candidate intent list.
 * Its only named placeholder (wrapped in curly braces) is text.
 * The examples show the expected answer as a JSON object with an IntentTypeList array,
 * or an empty JSON object when no intent can be recognised.
 */
private static final String CLASSIFY_INTENT_TRAIN_PROMPT = """
#
##
PDF
##
{text}
##
```json
// 示例1
{
"text": "..."
}
{
"IntentTypeList": ["...", "..."]
}
// 示例2文本意图无法识别
{
"text": "人生短短几个球"
}
{}
```
##
1. JSON
2. JSON使```json ```Markdown
3.
```json
{"IntentTypeList": ["...", "..."]}
```
-
```json
{}
```
3.使......
./no_think
""";
/**
 * Prompt template for extracting domain metadata (source / relation / target, each with a
 * type and an attribute-name list, plus the matched intent) from a text.
 * Named placeholders (each wrapped in curly braces): text and IntentTypeList.
 * NOTE(review): verify that callers supply a value under the key IntentTypeList; any other
 * key leaves that placeholder unreplaced.
 * NOTE(review): the json code fence opened before the output sample is not closed —
 * confirm whether that is intentional.
 */
private static final String EXTRACT_INTENT_METADATA_PROMPT = """
#
##
JSON
##
-
{text}
-
{IntentTypeList}
##
1.
2.
- source
- relation
- target
- intent
3. /
- type
- attributes
4. 使
```json
[
{
"source": {
"type": "实体类型1",
"attributes": ["属性1", "属性2"]
},
"relation": {
"type": "关系类型",
"attributes": []
},
"target": {
"type": "实体类型2",
"attributes": ["属性3"]
},
"intent": "匹配的意图标签"
}
]
5.
""";
/**
 * Prompt template for entity / relation extraction driven by domain metadata.
 * Named placeholders (each wrapped in curly braces): text and domainMetadata.
 * The embedded sample shows the expected answer: a JSON object with a nodes array,
 * a relations array and a typed_triplets array of [source, relation, target] type names.
 * The sample is kept syntactically valid JSON, because the model is asked to imitate it
 * and the answer is parsed as JSON afterwards.
 */
private static final String EXTRACT_ERE_BASE_INTENT_PROMPT = """
#
##
##
- {text}
-
{domainMetadata}
##
{
"nodes": [
{
"type": "公司",
"attributes": {
"名称": "龙源(酒泉)风力发电有限公司",
"地址": "雨花台区"
}
},
{
"type": "电子银行承兑汇票",
"attributes": {
"金额": "100.00万元",
"打印时间": "2024年10月20号"
}
},
{
"type": "公司",
"attributes": {
"名称": "杭州六小龙",
"地址": "杭州高新区"
}
}
],
"relations": [
{
"type": "持有",
"attributes": {
}
},
{
"type": "收购",
"attributes": {
"收购类型": "全资收购",
"收购时间": "2025年5月28号"
}
}
],
"typed_triplets": [
[
"公司",
"持有",
"电子银行承兑汇票"
],
[
"公司",
"收购",
"公司"
]
]
}
##
- `domainMetadata`
-
-
- JSON使```json ```Markdown
""";
}

@ -32,4 +32,21 @@ public enum DocumentContentTypeEnum {
private final String type;
private final String desc;
/**
 * Renders every constant of this enum as {@code type:desc} followed by one space and
 * concatenates the pieces in declaration order (the result therefore ends with a space).
 *
 * @return the concatenated description of all constants
 */
public static String formatToString() {
    DocumentContentTypeEnum[] all = values();
    StringBuilder rendered = new StringBuilder();
    for (int i = 0; i < all.length; i++) {
        DocumentContentTypeEnum item = all[i];
        rendered.append(item.getType());
        rendered.append(":");
        rendered.append(item.getDesc());
        rendered.append(" ");
    }
    return rendered.toString();
}
/**
 * Finds the first constant whose type code equals the given value.
 *
 * @param type type code to look up
 * @return the matching constant, or {@code null} when none matches
 */
public static DocumentContentTypeEnum getByType(String type) {
    DocumentContentTypeEnum[] candidates = values();
    DocumentContentTypeEnum match = null;
    for (int i = 0; i < candidates.length && match == null; i++) {
        if (candidates[i].getType().equals(type)) {
            match = candidates[i];
        }
    }
    return match;
}
}

@ -1,6 +1,8 @@
package com.supervision.pdfqaserver.dto;
import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.domain.DomainMetadata;
import com.supervision.pdfqaserver.domain.ErAttribute;
import lombok.Data;
import java.util.ArrayList;
@ -51,6 +53,10 @@ public class DomainMetadataDTO {
*/
private String generationType;
/**
 * Creates an empty DTO; every field keeps its declared initial value.
 */
public DomainMetadataDTO() {
}
public DomainMetadata toDomainMetadata() {
DomainMetadata domainMetadata = new DomainMetadata();
domainMetadata.setId(this.id);
@ -62,4 +68,31 @@ public class DomainMetadataDTO {
return domainMetadata;
}
/**
 * Builds a DTO from a persisted domain-metadata row and distributes the supplied
 * attributes over the source / target / relation attribute lists.
 * Only attributes whose domainMetadataId equals this row's id are considered.
 * Attributes with erType "1" are treated as node attributes, all others as relation attributes.
 * NOTE(review): the attribute's attrName is compared with the source/target/relation
 * type names — confirm this is the intended field to compare.
 *
 * @param domainMetadata persisted row supplying the scalar fields
 * @param erAttributes   candidate attributes, possibly belonging to other rows
 */
public DomainMetadataDTO(DomainMetadata domainMetadata,List<ErAttribute> erAttributes) {
    this.id = domainMetadata.getId();
    this.domainCategoryId = domainMetadata.getDomainCategoryId();
    this.sourceType = domainMetadata.getSourceType();
    this.relation = domainMetadata.getRelation();
    this.targetType = domainMetadata.getTargetType();
    this.generationType = domainMetadata.getGenerationType();
    for (ErAttribute candidate : erAttributes) {
        // Skip attributes that belong to a different metadata row.
        if (!StrUtil.equals(candidate.getDomainMetadataId(), this.id)) {
            continue;
        }
        boolean nodeAttribute = StrUtil.equals(candidate.getErType(), "1");
        if (!nodeAttribute) {
            if (StrUtil.equals(candidate.getAttrName(), this.relation)) {
                this.relationAttributes.add(new ERAttributeDTO(candidate));
            }
            continue;
        }
        // Node data: the two checks are independent, so one attribute may land in both lists.
        if (StrUtil.equals(candidate.getAttrName(), this.sourceType)) {
            this.sourceAttributes.add(new ERAttributeDTO(candidate));
        }
        if (StrUtil.equals(candidate.getAttrName(), this.targetType)) {
            this.targetAttributes.add(new ERAttributeDTO(candidate));
        }
    }
}
}

@ -33,6 +33,31 @@ public class ERAttributeDTO {
*/
private String erType;
/**
 * Creates an empty attribute DTO; every field keeps its declared initial value.
 */
public ERAttributeDTO() {
}
/**
 * Creates an attribute DTO with every field supplied explicitly.
 *
 * @param id               attribute id
 * @param domainMetadataId id of the owning domain-metadata row
 * @param erName           name of the entity or relation the attribute belongs to
 * @param attrName         attribute name
 * @param attrValueType    value type of the attribute
 * @param erType           entity/relation discriminator
 */
public ERAttributeDTO(String id, String domainMetadataId, String erName, String attrName, String attrValueType, String erType) {
    // identity
    this.id = id;
    this.domainMetadataId = domainMetadataId;
    // owner
    this.erType = erType;
    this.erName = erName;
    // attribute description
    this.attrValueType = attrValueType;
    this.attrName = attrName;
}
/**
 * Creates an attribute DTO that carries only an attribute name; all other fields
 * keep their declared initial values.
 *
 * @param attrName attribute name
 */
public ERAttributeDTO(String attrName) {
this.attrName = attrName;
}
/**
 * Creates an attribute DTO from a persisted attribute, copying id, domainMetadataId,
 * attrName, attrValueType and erType.
 * NOTE(review): erName is not copied here — confirm whether that is intentional.
 *
 * @param erAttribute persisted attribute to copy from
 */
public ERAttributeDTO(ErAttribute erAttribute) {
    this.erType = erAttribute.getErType();
    this.attrValueType = erAttribute.getAttrValueType();
    this.attrName = erAttribute.getAttrName();
    this.domainMetadataId = erAttribute.getDomainMetadataId();
    this.id = erAttribute.getId();
}
public ErAttribute toErAttribute() {
ErAttribute erAttribute = new ErAttribute();
erAttribute.setId(this.id);

@ -32,6 +32,17 @@ public class IntentDTO {
*/
private String generationType;
/**
 * Creates an empty intent DTO; every field keeps its declared initial value.
 */
public IntentDTO() {
}
/**
 * Creates an intent DTO with every field supplied explicitly.
 *
 * @param id               intent id
 * @param digest           intent digest
 * @param desc             intent description
 * @param domainCategoryId id of the domain category the intent belongs to
 * @param generationType   generation-type code of the intent
 */
public IntentDTO(String id, String digest, String desc, String domainCategoryId, String generationType) {
    this.generationType = generationType;
    this.domainCategoryId = domainCategoryId;
    this.desc = desc;
    this.digest = digest;
    this.id = id;
}
public IntentDTO(Intention intention){
this.id = intention.getId();
this.digest = intention.getDigest();

@ -2,6 +2,7 @@ package com.supervision.pdfqaserver.service;
import com.supervision.pdfqaserver.domain.DomainCategory;
import com.baomidou.mybatisplus.extension.service.IService;
import java.util.List;
/**
* @author Administrator
@ -10,4 +11,7 @@ import com.baomidou.mybatisplus.extension.service.IService;
*/
public interface DomainCategoryService extends IService<DomainCategory> {
/**
 * Looks up a domain category by its industry name.
 *
 * @param industryName industry name to match
 * @return the matching category; presumably {@code null} when none exists — confirm against the implementation
 */
DomainCategory queryByIndustryName(String industryName);
/**
 * Lists the industry names of all domain categories.
 *
 * @return industry names of every stored category
 */
List<String> listAllIndustryNames();
}

@ -44,4 +44,12 @@ public interface DomainMetadataService extends IService<DomainMetadata> {
* @return
*/
DomainMetadata getByPrimaryKey(String sourceType, String targetType, String relation,String domainCategoryId);
/**
 * Lists the domain metadata associated with the given intentions.
 *
 * @param intentionIds intention ids to resolve
 * @return the associated domain metadata as DTOs
 */
List<DomainMetadataDTO> listByIntentionIds(List<String> intentionIds);
}

@ -17,4 +17,5 @@ public interface ErAttributeService extends IService<ErAttribute> {
/**
 * Lists the attributes that belong to one domain-metadata row.
 *
 * @param domainMetadataId id of the domain-metadata row
 * @return attributes whose domainMetadataId equals the given id
 */
List<ErAttribute> listByDomainMetadataId(String domainMetadataId);
/**
 * Lists the attributes that belong to any of the given domain-metadata rows.
 *
 * @param domainMetadataIds ids of the domain-metadata rows
 * @return attributes whose domainMetadataId is one of the given ids
 */
List<ErAttribute> listByDomainMetadataIds(List<String> domainMetadataIds);
}

@ -15,5 +15,7 @@ public interface IntentionDomainMetadataService extends IService<IntentionDomain
/**
 * Lists the intention-to-metadata links of one intention.
 *
 * @param intentionId intention id
 * @return links whose intentionId equals the given id
 */
List<IntentionDomainMetadata> listByIntentionId(String intentionId);
/**
 * Lists the intention-to-metadata links of several intentions.
 *
 * @param intentionIds intention ids
 * @return links whose intentionId is one of the given ids
 */
List<IntentionDomainMetadata> listByIntentionIds(List<String> intentionIds);
/**
 * Stores links between one intention and the given metadata ids.
 * NOTE(review): judging by the name, links that already exist are presumably skipped —
 * confirm against the implementation.
 *
 * @param intentionId intention id
 * @param metadataIds ids of the domain-metadata rows to link
 */
void batchSaveIfAbsent(String intentionId, List<String> metadataIds);
}

@ -13,4 +13,12 @@ import java.util.List;
public interface PdfAnalysisOutputService extends IService<PdfAnalysisOutput> {
/**
 * Lists the analysis outputs of one PDF.
 *
 * @param pdfId id of the PDF
 * @return analysis outputs whose pdfId equals the given id
 */
List<PdfAnalysisOutput> queryByPdfId(Integer pdfId);
/**
 * Returns the first {@code limit} characters of the text of one PDF.
 *
 * @param pdfId id of the PDF
 * @param limit maximum number of characters to return
 * @return the leading text, at most {@code limit} characters long
 */
String queryByPdfIdAndLimit(Integer pdfId, Integer limit);
}

@ -5,6 +5,8 @@ import com.supervision.pdfqaserver.domain.DomainCategory;
import com.supervision.pdfqaserver.service.DomainCategoryService;
import com.supervision.pdfqaserver.mapper.DomainCategoryMapper;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.stream.Collectors;
/**
* @author Administrator
@ -15,6 +17,17 @@ import org.springframework.stereotype.Service;
public class DomainCategoryServiceImpl extends ServiceImpl<DomainCategoryMapper, DomainCategory>
implements DomainCategoryService{
/**
 * Fetches the single category whose industry name equals the given value.
 *
 * @param industryName industry name to match
 * @return the result of the single-row query
 */
@Override
public DomainCategory queryByIndustryName(String industryName) {
    DomainCategory matched = this.lambdaQuery()
            .eq(DomainCategory::getIndustryName, industryName)
            .one();
    return matched;
}
/**
 * Selects only the industry-name column of every category and returns the names.
 *
 * @return industry names of all categories
 */
@Override
public List<String> listAllIndustryNames() {
    List<DomainCategory> rows = super.lambdaQuery()
            .select(DomainCategory::getIndustryName)
            .list();
    return rows.stream()
            .map(DomainCategory::getIndustryName)
            .collect(Collectors.toList());
}
}

@ -4,6 +4,8 @@ import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.lang.Assert;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.supervision.pdfqaserver.domain.DomainMetadata;
import com.supervision.pdfqaserver.domain.ErAttribute;
import com.supervision.pdfqaserver.domain.IntentionDomainMetadata;
import com.supervision.pdfqaserver.dto.DomainMetadataDTO;
import com.supervision.pdfqaserver.dto.ERAttributeDTO;
import com.supervision.pdfqaserver.service.DomainMetadataService;
@ -14,7 +16,7 @@ import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.ArrayList;
import java.util.List;
/**
@ -70,6 +72,7 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
metadata.setId(data.getId());
}else {
DomainMetadata domainMetadata = metadata.toDomainMetadata();
domainMetadata.setGenerationType("1");// 1:系统录入
this.saveIfNotExists(domainMetadata, domainCategoryId);
metadata.setId(domainMetadata.getId());
}
@ -119,6 +122,23 @@ public class DomainMetadataServiceImpl extends ServiceImpl<DomainMetadataMapper,
.eq(DomainMetadata::getDomainCategoryId, domainCategoryId)
.one();
}
/**
 * Lists the domain metadata linked to the given intentions, each with its attributes.
 * <p>
 * The metadata rows are loaded with one batch query instead of one query per link.
 * A metadata row shared by several intentions is returned once, and links that point
 * to a missing row are ignored instead of failing with a NullPointerException.
 *
 * @param intentionIds intention ids to resolve
 * @return the linked domain metadata as DTOs; an empty list when nothing is linked
 */
@Override
public List<DomainMetadataDTO> listByIntentionIds(List<String> intentionIds) {
    List<DomainMetadataDTO> domainMetadataDTOS = new ArrayList<>();
    List<IntentionDomainMetadata> links = intentionDomainMetadataService.listByIntentionIds(intentionIds);
    if (CollUtil.isEmpty(links)) {
        return domainMetadataDTOS;
    }
    List<String> domainMetadataIds = links.stream()
            .map(IntentionDomainMetadata::getDomainMetadataId)
            .filter(id -> id != null)
            .distinct()
            .toList();
    if (domainMetadataIds.isEmpty()) {
        // An empty id list would produce an invalid "IN ()" clause.
        return domainMetadataDTOS;
    }
    List<ErAttribute> erAttributes = erAttributeService.listByDomainMetadataIds(domainMetadataIds);
    for (DomainMetadata domainMetadata : this.listByIds(domainMetadataIds)) {
        domainMetadataDTOS.add(new DomainMetadataDTO(domainMetadata, erAttributes));
    }
    return domainMetadataDTOS;
}
}

@ -1,5 +1,6 @@
package com.supervision.pdfqaserver.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.lang.Assert;
import cn.hutool.core.util.StrUtil;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
@ -8,7 +9,6 @@ import com.supervision.pdfqaserver.service.ErAttributeService;
import com.supervision.pdfqaserver.mapper.ErAttributeMapper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.List;
/**
@ -39,6 +39,14 @@ public class ErAttributeServiceImpl extends ServiceImpl<ErAttributeMapper, ErAtt
public List<ErAttribute> listByDomainMetadataId(String domainMetadataId) {
return super.lambdaQuery().eq(ErAttribute::getDomainMetadataId, domainMetadataId).list();
}
/**
 * Lists the attributes of several domain-metadata rows.
 * No query is issued for a null or empty id list; an immutable empty list is returned instead.
 *
 * @param domainMetadataIds ids of the domain-metadata rows
 * @return attributes whose domainMetadataId is one of the given ids
 */
@Override
public List<ErAttribute> listByDomainMetadataIds(List<String> domainMetadataIds) {
    boolean nothingToQuery = CollUtil.isEmpty(domainMetadataIds);
    return nothingToQuery
            ? List.of()
            : this.lambdaQuery().in(ErAttribute::getDomainMetadataId, domainMetadataIds).list();
}
}

@ -29,6 +29,14 @@ public class IntentionDomainMetadataServiceImpl extends ServiceImpl<IntentionDom
return this.lambdaQuery().eq(IntentionDomainMetadata::getIntentionId, intentionId).list();
}
/**
 * Lists the intention-to-metadata links of several intentions.
 * No query is issued for a null or empty id list; a new empty list is returned instead.
 *
 * @param intentionIds intention ids
 * @return links whose intentionId is one of the given ids
 */
@Override
public List<IntentionDomainMetadata> listByIntentionIds(List<String> intentionIds) {
    if (CollUtil.isNotEmpty(intentionIds)) {
        return this.lambdaQuery()
                .in(IntentionDomainMetadata::getIntentionId, intentionIds)
                .list();
    }
    return new ArrayList<>();
}
@Override
@Transactional(rollbackFor = Exception.class)
public void batchSaveIfAbsent(String intentionId, List<String> metadataIds) {

@ -41,7 +41,7 @@ public class IntentionServiceImpl extends ServiceImpl<IntentionMapper, Intention
Intention intention = new Intention();
intention.setDigest(intent);
intention.setDomainCategoryId(domainCategoryId);
intention.setGenerationType("1");
intention.setGenerationType("1");// 1:系统录入
this.save(intention);
result.add(intention);

@ -8,6 +8,7 @@ import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
import com.supervision.pdfqaserver.constant.DomainMetaGenerationEnum;
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
import com.supervision.pdfqaserver.domain.*;
import com.supervision.pdfqaserver.dto.*;
import com.supervision.pdfqaserver.service.*;
@ -51,6 +52,8 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
private final IntentionService intentionService;
private final DomainCategoryService domainCategoryService;
@Override
public void generateGraph(String pdfId) {
@ -81,15 +84,55 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
}
/**
 * Runs metadata training for one PDF and records the outcome on the PDF record.
 * <p>
 * On success the PDF is marked as training-complete. On failure the exception is always
 * logged with its stack trace; the PDF is marked as failed only when its record exists and
 * its train status is still unset or 0.
 * NOTE(review): metaDataTrainExecutor has early-return paths that call pdfTrainFail and
 * then return normally; confirm that the following pdfTrainComplete does not overwrite
 * that failure status.
 *
 * @param pdfId pdfId
 */
@Override
public void metaDataTrain(Integer pdfId) {
    TimeInterval timer = new TimeInterval();
    try {
        metaDataTrainExecutor(pdfId);
        pdfInfoService.pdfTrainComplete(pdfId);
        log.info("pdfId:{}元数据训练完成,耗时:{}秒", pdfId, timer.intervalSecond());
    } catch (Exception e) {
        // Log unconditionally so the cause is never lost, whatever the stored status is.
        log.error("pdfId:{}元数据训练失败...", pdfId, e);
        // The record may be missing (unknown or null pdfId); guard against it.
        PdfInfo pdfInfo = null == pdfId ? null : pdfInfoService.getByPdfId(pdfId);
        if (null != pdfInfo && (null == pdfInfo.getTrainStatus() || pdfInfo.getTrainStatus() == 0)) {
            pdfInfoService.pdfTrainFail(pdfId);
        }
        log.info("pdfId:{}元数据训练失败,耗时:{}秒", pdfId, timer.intervalSecond());
    }
}
/**
 * Regenerates the knowledge graph of one PDF from previously trained metadata.
 * <p>
 * Existing graph data is reset through the AOP proxy, the PDF is marked as started, the
 * graph is generated, and the PDF is finally marked as complete or failed.
 * The PDF record is no longer fetched here because the result was never used.
 *
 * @param pdfId id of the PDF; must not be null
 */
@Override
public void generateGraphBaseTrain(Integer pdfId) {
    Assert.notNull(pdfId, "pdfId不能为空");
    TimeInterval timer = new TimeInterval();
    try {
        log.info("开始生成知识图谱, pdfId:{}", pdfId);
        // Go through the proxy so that proxy-based advice on resetGraphData is applied.
        ((KnowledgeGraphService) AopContext.currentProxy()).resetGraphData(pdfId.toString());
        pdfInfoService.pdfToGraphStart(pdfId);
        generateGraphBaseTrainExecutor(pdfId);
        pdfInfoService.pdfToGraphComplete(pdfId);
        log.info("pdfId:{}知识图谱生成完成,总耗时:{}秒", pdfId,timer.intervalSecond());
    } catch (Exception e) {
        log.error("pdfId:{}知识图谱生成失败...", pdfId, e);
        pdfInfoService.pdfToGraphFail(pdfId);
        log.info("pdfId:{}知识图谱生成失败,总耗时:{}秒", pdfId,timer.intervalSecond());
    }
}
private void metaDataTrainExecutor(Integer pdfId) {
Assert.notNull(pdfId, "pdfId不能为空");
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
if (null == pdfInfo.getTrainStatus()){
// todo:训练异常,需要记录异常状态
if (null == pdfInfo.getTrainStatus() || pdfInfo.getTrainStatus() == 2){
log.info("pdfId:{}没有找到对应的pdf训练状态,开始识别文档训练状态...", pdfId);
pdfInfoService.pdfToGraphStart(pdfId);
pdfInfoService.pdfTrainStart(pdfId);
if (StrUtil.isEmpty(pdfInfo.getContentType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
@ -111,8 +154,14 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
pdfInfoService.pdfTrainFail(pdfId);
return;
}
pdfInfo.setDomainCategoryId(industry);
pdfInfoService.updateCategory(pdfId, industry);
DomainCategory domainCategory = domainCategoryService.queryByIndustryName(industry);
if (null == domainCategory){
log.info("pdfId:{}没有找到:{}对应的行业分类,停止后续任务...", pdfId, industry);
pdfInfoService.pdfTrainFail(pdfId);
return;
}
pdfInfo.setDomainCategoryId(domainCategory.getId());
pdfInfoService.updateCategory(pdfId, domainCategory.getId());
}
}
TripleConversionPipeline tripleConversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
@ -135,11 +184,31 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
// 保存分片信息
documentTruncationService.batchSave(truncateDTOS);
// 只识别文本类型数据
truncateDTOS = truncateDTOS.stream()
.filter(t->StrUtil.equals(t.getLayoutType(), String.valueOf(LayoutTypeEnum.TEXT.getCode()))).collect(Collectors.toList());
log.info("只识别文本类型数据,个数:{}", truncateDTOS.size());
int truncateSize = truncateDTOS.size();
int index = 1;
int intentSize = 0;
TimeInterval interval = new TimeInterval();
for (TruncateDTO truncateDTO : truncateDTOS) {
try {
log.info("正在意图、元数据抽取,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
interval.start("makeOutTruncationIntent");
List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
log.info("意图识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutTruncationIntent"));
if (CollUtil.isEmpty(intents)){
log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
continue;
}
log.info("开始意图元数据识别,切分文档id:{}", truncateDTO.getId());
interval.start("makeOutDomainMetadata");
List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intents);
log.info("意图元数据识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutDomainMetadata"));
// 保存意图数据
intentSize ++;
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
for (Intention intention : intentions) {
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
@ -151,16 +220,39 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
}
}
log.info("意图、元数据抽取完成,耗时:{}秒,一共处理片段数:{}个,抽取出意图数量:{}个", interval.intervalSecond(),truncateSize,intentSize);
}
@Override
public void generateGraphBaseTrain(Integer pdfId) {
private void generateGraphBaseTrainExecutor(Integer pdfId){
Assert.notNull(pdfId, "pdfId不能为空");
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
Assert.isTrue((null !=pdfInfo.getTrainStatus() && pdfInfo.getTrainStatus() == 1),
"pdfId:{}的pdf训练状态{} 不符合要求", pdfId, pdfInfo.getTrainStatus());
if (StrUtil.isEmpty(pdfInfo.getContentType())){
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
if (null == documentContentTypeEnum){
log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
return;
}
pdfInfo.setContentType(documentContentTypeEnum.getType());
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
}
if (null == pdfInfo.getDomainCategoryId()){
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
if (StrUtil.isEmpty(industry)){
log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
return;
}
DomainCategory domainCategory = domainCategoryService.queryByIndustryName(industry);
if (null == domainCategory){
log.info("pdfId:{}没有找到:{}对应的行业分类,停止后续任务...", pdfId, industry);
return;
}
pdfInfo.setDomainCategoryId(domainCategory.getId());
pdfInfoService.updateCategory(pdfId, domainCategory.getId());
}
List<TruncateDTO> truncateDTOS = documentTruncationService.listByPdfId(pdfId).stream().map(TruncateDTO::new).collect(Collectors.toList());
TripleConversionPipeline conversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
@ -172,7 +264,6 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
documentTruncationService.batchSave(truncateDTOS);
log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
}
log.info("开始命名实体识别,切分文档个数:{}", truncateDTOS.size());
// 查询当前行业分类下的意图
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream().map(IntentDTO::new).distinct().toList();
if (CollUtil.isEmpty(intentionDTOs)){
@ -180,14 +271,25 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
return;
}
TimeInterval timer = new TimeInterval();
int index = 1;
int truncateSize = truncateDTOS.size();
log.info("开始实体关系抽取,耗时:{}秒,一共处理片段数:{}个", timer.intervalSecond(), truncateDTOS.size());
for (TruncateDTO truncateDTO : truncateDTOS) {
log.info("开始命名实体识别,切分文档id:{},识别进度:{}", truncateDTO.getId(), NumberUtil.formatPercent((index*1.0)/truncateSize, 2));
try {
timer.start("makeOutTruncationIntent");
log.info("开始意图识别,切分文档id:{}", truncateDTO.getId());
List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
log.info("意图识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(), timer.intervalMs("makeOutTruncationIntent"));
if (CollUtil.isEmpty(intents)){
log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
continue;
}
log.info("开始命名实体识别,切分文档id:{}", truncateDTO.getId());
timer.start("doEre");
EREDTO eredto = conversionPipeline.doEre(truncateDTO, intents);
log.info("命名实体识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(), timer.intervalMs("doEre"));
if (null == eredto){
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
continue;
@ -198,7 +300,6 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
}
}
}
@Override
@ -357,7 +458,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
return;
}
// 删除切分数据
documentTruncationService.deleteByDocumentIds(documentIds);
//documentTruncationService.deleteByDocumentIds(documentIds);
for (DocumentTruncation documentTruncation : documentTruncations) {
String truncationId = documentTruncation.getId();
// 删除实体数据

@ -14,6 +14,7 @@ public class OllamaCallServiceImpl implements AiCallService {
private final OllamaChatModel ollamaChatModel;
/**
 * Sends the prompt to the Ollama chat model and returns its answer.
 *
 * @param prompt prompt text
 * @return the model's response text
 */
@Override
public String call(String prompt) {
    return ollamaChatModel.call(prompt);
}
}

@ -6,8 +6,9 @@ import com.supervision.pdfqaserver.domain.PdfAnalysisOutput;
import com.supervision.pdfqaserver.service.PdfAnalysisOutputService;
import com.supervision.pdfqaserver.mapper.PdfAnalysisOutputMapper;
import org.springframework.stereotype.Service;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
/**
* @author Administrator
@ -24,6 +25,20 @@ public class PdfAnalysisOutputServiceImpl extends ServiceImpl<PdfAnalysisOutputM
return super.lambdaQuery().eq(PdfAnalysisOutput::getPdfId, pdfId).list();
}
/**
 * Returns the first {@code limit} characters of a PDF's analysed text.
 * <p>
 * The outputs of the PDF are ordered by page number and then display order, their
 * contents are concatenated, and the result is cut to at most {@code limit} characters.
 * Outputs without content are skipped; otherwise they would be joined as the literal
 * text "null".
 *
 * @param pdfId id of the PDF
 * @param limit maximum number of characters to return
 * @return the leading text, or {@code null} when {@code limit} is null or not positive
 */
@Override
public String queryByPdfIdAndLimit(Integer pdfId, Integer limit) {
    if (null == limit || limit <= 0) {
        return null;
    }
    List<PdfAnalysisOutput> pdfAnalysisOutputs = this.queryByPdfId(pdfId);
    String fullText = pdfAnalysisOutputs.stream()
            .sorted(Comparator.comparingInt(PdfAnalysisOutput::getPageNo)
                    .thenComparingInt(PdfAnalysisOutput::getDisplayOrder))
            .map(PdfAnalysisOutput::getContent)
            .filter(content -> content != null)
            .collect(Collectors.joining());
    // Keep only the first `limit` characters.
    return fullText.substring(0, Math.min(limit, fullText.length()));
}
}

@ -5,56 +5,207 @@ import cn.hutool.core.lang.Assert;
import cn.hutool.core.util.BooleanUtil;
import cn.hutool.core.util.RandomUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONArray;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import com.supervision.pdfqaserver.cache.PromptCache;
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
import com.supervision.pdfqaserver.dto.*;
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
import com.supervision.pdfqaserver.service.*;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
import static com.supervision.pdfqaserver.cache.PromptCache.*;
@Slf4j
@Service
@RequiredArgsConstructor
public class TripleConversionPipelineImpl implements TripleConversionPipeline {
private final OllamaChatModel ollamaChatModel;
private final AiCallService aiCallService;
private final PdfAnalysisOutputService pdfAnalysisOutputService;
private final DomainCategoryService domainCategoryService;
private final DomainMetadataService domainMetadataService;
/**
 * Asks the model to classify the content type of a PDF from the first 300 characters
 * of its text.
 * The stale {@code return null;} that made the rest of the method unreachable is removed.
 *
 * @param pdfId id of the PDF; must not be null
 * @return the enum constant matching the ContentType value of the model's JSON answer,
 *         or {@code null} when the answer carries no matching value
 */
@Override
public DocumentContentTypeEnum makeOutPdfContentType(Integer pdfId) {
    Assert.notNull(pdfId, "pdfId不能为空");
    String promptTemplate = PromptCache.promptMap.get(CLASSIFY_CONTENT_TYPE);
    // Only the first 300 characters are sent to the model.
    String text = pdfAnalysisOutputService.queryByPdfIdAndLimit(pdfId,300);
    Assert.notEmpty(text, "text不能为空");
    Map<String, String> param = Map.of("text", text, "ContentType", DocumentContentTypeEnum.formatToString());
    String format = StrUtil.format(promptTemplate, param);
    log.debug("makeOutPdfContentType:prompt内容:{}", format);
    String call = aiCallService.call(format);
    log.info("makeOutPdfContentType:响应结果:{}", call);
    JSONObject jsonObject = JSONUtil.parseObj(call);
    return DocumentContentTypeEnum.getByType(jsonObject.getStr("ContentType"));
}
/**
 * Asks the model to pick the industry of a PDF from the stored industry names, using the
 * first 300 characters of the PDF's text.
 * The stale {@code return null;} that made the rest of the method unreachable is removed.
 * The PDF id and the text are now validated as in makeOutPdfContentType, so a missing text
 * fails with a descriptive message rather than a NullPointerException from {@code Map.of}.
 *
 * @param pdfId id of the PDF; must not be null
 * @return the industryCategory value of the model's JSON answer, possibly {@code null}
 */
@Override
public String makeOutPdfIndustry(Integer pdfId) {
    Assert.notNull(pdfId, "pdfId不能为空");
    List<String> allIndustryNames = domainCategoryService.listAllIndustryNames();
    Assert.notEmpty(allIndustryNames, "行业名称不能为空");
    String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INDUSTRY);
    String text = pdfAnalysisOutputService.queryByPdfIdAndLimit(pdfId, 300);
    Assert.notEmpty(text, "text不能为空");
    String format = StrUtil.format(promptTemplate, Map.of("text", text, "industryCategory", String.join(",", allIndustryNames)));
    String call = aiCallService.call(format);
    log.info("makeOutPdfIndustry:响应结果:{}", call);
    JSONObject json = JSONUtil.parseObj(call);
    return json.getStr("industryCategory");
}
/**
 * Asks the model to recognise the intents of one text fragment without a candidate list.
 * The stale {@code return null;} that made the rest of the method unreachable is removed.
 * The prompt tells the model to answer with an empty JSON object when no intent is
 * recognised; that answer now yields an empty list, not a NullPointerException.
 *
 * @param truncate fragment whose content is analysed; its content must not be empty
 * @return the recognised intent labels; empty when the answer has no IntentTypeList
 */
@Override
public List<String> makeOutTruncationIntent(TruncateDTO truncate) {
    Assert.notEmpty(truncate.getContent(), "内容不能为空");
    String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INTENT_TRAIN);
    Map<String, String> params = Map.of("text", truncate.getContent());
    String format = StrUtil.format(promptTemplate, params);
    String call = aiCallService.call(format);
    log.info("makeOutTruncationIntent:响应结果:{}", call);
    JSONObject json = JSONUtil.parseObj(call);
    JSONArray jsonArray = json.getJSONArray("IntentTypeList");
    if (null == jsonArray) {
        // "{}" is the documented "no intent" answer.
        return List.of();
    }
    return jsonArray.stream().map(Object::toString).toList();
}
/**
 * Asks the model which of the given intents apply to one text fragment.
 * The stale {@code return null;} that made the rest of the method unreachable is removed.
 * The prompt's "no match" example is an empty JSON object; that answer now yields an
 * empty list, not a NullPointerException.
 *
 * @param truncate fragment whose content is analysed; its content must not be empty
 * @param intents  candidate intents; must not be empty
 * @return the candidates whose digest appears in the answer's IntentTypeList, in the
 *         order of {@code intents}; empty when the answer has no IntentTypeList
 */
@Override
public List<IntentDTO> makeOutTruncationIntent(TruncateDTO truncate, List<IntentDTO> intents) {
    Assert.notEmpty(truncate.getContent(), "内容不能为空");
    Assert.notEmpty(intents, "意图不能为空");
    String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INTENT);
    List<String> digestList = intents.stream().map(IntentDTO::getDigest).toList();
    Map<String, String> params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(digestList));
    String format = StrUtil.format(promptTemplate, params);
    String call = aiCallService.call(format);
    log.info("makeOutTruncationIntent:响应结果:{}", call);
    JSONObject json = JSONUtil.parseObj(call);
    JSONArray jsonArray = json.getJSONArray("IntentTypeList");
    if (null == jsonArray) {
        // "{}" is the documented "no matching intent" answer.
        return new ArrayList<>();
    }
    return intents.stream().filter(intent->
            jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest())))
            .collect(Collectors.toList());
}
/**
 * Asks the model to extract domain metadata (source / relation / target) for the given
 * intents from one text fragment.
 * The stale {@code return null;} that made the rest of the method unreachable is removed.
 * The intents are now passed under the key IntentTypeList, which is the placeholder name
 * used by the EXTRACT_INTENT_METADATA template. The previous key IntentType matched no
 * placeholder, so the intents never reached the prompt.
 *
 * @param truncate fragment whose content is analysed; its content must not be empty
 * @param intents  intent labels recognised for the fragment; must not be empty
 * @return the metadata parsed from the model's JSON-array answer
 */
@Override
public List<DomainMetadataDTO> makeOutDomainMetadata(TruncateDTO truncate,List<String> intents) {
    Assert.notEmpty(truncate.getContent(), "内容不能为空");
    Assert.notEmpty(intents, "意图不能为空");
    String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA);
    Map<String, String> params = Map.of("text", truncate.getContent(), "IntentTypeList", JSONUtil.toJsonStr(intents));
    String format = StrUtil.format(promptTemplate, params);
    String call = aiCallService.call(format);
    log.info("makeOutDomainMetadata:响应结果:{}", call);
    return parseDomainMetadata(call);
}
/**
 * Parses the model's metadata answer into DTOs. The expected input is a JSON array like:
 * [
 *   {
 *     "source":   { "type": "entity type 1", "attributes": ["attribute 1", "attribute 2"] },
 *     "relation": { "type": "relation type", "attributes": [] },
 *     "target":   { "type": "entity type 2", "attributes": ["attribute 3"] },
 *     "intent": "matched intent label"
 *   }
 * ]
 * For each of source / relation / target that is present, the type is set when it is not
 * empty and the attribute list is set when it is not empty. The "intent" member is not read.
 *
 * @param jsonStr JSON array text returned by the model
 * @return one DTO per array element, in array order
 */
private List<DomainMetadataDTO> parseDomainMetadata(String jsonStr) {
    JSONArray items = JSONUtil.parseArray(jsonStr);
    List<DomainMetadataDTO> parsed = new ArrayList<>();
    for (int idx = 0; idx < items.size(); idx++) {
        JSONObject item = items.getJSONObject(idx);
        DomainMetadataDTO dto = new DomainMetadataDTO();
        JSONObject sourceNode = item.getJSONObject("source");
        JSONObject relationNode = item.getJSONObject("relation");
        JSONObject targetNode = item.getJSONObject("target");

        if (sourceNode != null) {
            String sourceType = sourceNode.getStr("type");
            List<ERAttributeDTO> sourceAttrs = readAttributeNames(sourceNode);
            if (StrUtil.isNotEmpty(sourceType)) {
                dto.setSourceType(sourceType);
            }
            if (!sourceAttrs.isEmpty()) {
                dto.setSourceAttributes(sourceAttrs);
            }
        }

        if (relationNode != null) {
            String relationType = relationNode.getStr("type");
            List<ERAttributeDTO> relationAttrs = readAttributeNames(relationNode);
            if (StrUtil.isNotEmpty(relationType)) {
                dto.setRelation(relationType);
            }
            if (!relationAttrs.isEmpty()) {
                dto.setRelationAttributes(relationAttrs);
            }
        }

        if (targetNode != null) {
            String targetType = targetNode.getStr("type");
            List<ERAttributeDTO> targetAttrs = readAttributeNames(targetNode);
            if (StrUtil.isNotEmpty(targetType)) {
                dto.setTargetType(targetType);
            }
            if (!targetAttrs.isEmpty()) {
                dto.setTargetAttributes(targetAttrs);
            }
        }

        parsed.add(dto);
    }
    return parsed;
}

/**
 * Converts the "attributes" array of a source / relation / target node into attribute DTOs
 * that carry only the attribute name.
 *
 * @param node JSON node that may contain an "attributes" array
 * @return a mutable list of attribute DTOs; empty when the array is missing or empty
 */
private List<ERAttributeDTO> readAttributeNames(JSONObject node) {
    JSONArray names = node.getJSONArray("attributes");
    List<ERAttributeDTO> result = new ArrayList<>();
    if (CollUtil.isEmpty(names)) {
        return result;
    }
    for (Object name : names) {
        result.add(new ERAttributeDTO(name.toString()));
    }
    return result;
}
/**
 * Extracts entities and relations from one fragment, dispatching on its layout type.
 * Text fragments use plain text extraction when no intents are given; otherwise the
 * domain metadata linked to the intents is loaded and metadata-guided extraction is used.
 * Table fragments are first classified: descriptive tables go through text extraction,
 * others through table extraction. The intents are not used for tables.
 *
 * @param truncateDTO fragment to analyse
 * @param intents     intents recognised for the fragment; may be empty
 * @return the extraction result, or {@code null} when the table classification is empty,
 *         no intent id is available, or the layout type is unknown
 */
@Override
public EREDTO doEre(TruncateDTO truncateDTO, List<IntentDTO> intents) {
    String layoutType = truncateDTO.getLayoutType();
    boolean textLayout = StrUtil.equals(layoutType, String.valueOf(LayoutTypeEnum.TEXT.getCode()));
    boolean tableLayout = StrUtil.equals(layoutType, String.valueOf(LayoutTypeEnum.TABLE.getCode()));

    if (textLayout) {
        if (CollUtil.isEmpty(intents)) {
            return doTextEre(truncateDTO);
        }
        // Collect the distinct intent ids, keeping first-seen order.
        List<String> intentIds = new ArrayList<>();
        for (IntentDTO intent : intents) {
            String intentId = intent.getId();
            if (!intentIds.contains(intentId)) {
                intentIds.add(intentId);
            }
        }
        if (CollUtil.isEmpty(intentIds)) {
            return null;
        }
        // Load the domain metadata linked to those intents.
        List<DomainMetadataDTO> linkedMetadata = domainMetadataService.listByIntentionIds(intentIds);
        return doTextEreWithMetadata(truncateDTO, linkedMetadata);
    }

    if (!tableLayout) {
        log.warn("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
        return null;
    }

    // Decide first whether the table is a descriptive one.
    Boolean descriptive = this.classify(truncateDTO.getContent());
    if (null == descriptive) {
        log.info("doEre:表格分类结果为空,切分文档id:{}", truncateDTO.getId());
        return null;
    }
    return descriptive ? doTextEre(truncateDTO) : doTableEre(truncateDTO);
}
@ -172,25 +323,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
/**
 * Runs entity-relationship extraction on a fragment without any intent guidance.
 *
 * <p>Delegates to {@link #doEre(TruncateDTO, List)} with an empty intent list, so the
 * layout-type dispatch lives in a single place.
 *
 * @param truncateDTO the fragment to process
 * @return whatever the intent-aware overload returns for an empty intent list
 */
@Override
public EREDTO doEre(TruncateDTO truncateDTO) {
    return this.doEre(truncateDTO, new ArrayList<>());
}
@Override
@ -209,7 +342,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
String prompt = PromptCache.promptMap.get(PromptCache.CLASSIFY_TABLE);
String format = StrUtil.format(prompt, content);
String response = ollamaChatModel.call(format);
String response = aiCallService.call(format);
log.info("classify响应结果:{}", response);
return BooleanUtil.toBooleanObject(response);
}
@ -223,16 +356,86 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
}
String table = PromptCache.promptMap.get(PromptCache.EXTRACT_TABLE_TITLE);
String format = StrUtil.format(table, content);
String response = ollamaChatModel.call(format);
String response = aiCallService.call(format);
tableTitleDTO.setTitle(response);
return tableTitleDTO;
}
/**
 * Sends the fragment text together with the serialised domain metadata to the AI service,
 * using the {@code EXTRACT_ERE_BASE_INTENT} prompt template.
 *
 * @param truncateDTO        fragment whose content is sent; the content is asserted to be non-empty
 * @param domainMetadataDTOS domain metadata used to fill the prompt; asserted to be non-empty
 * @return currently always {@code null}: the service is called but its response is not
 *         converted into an {@link EREDTO}
 */
private EREDTO doTextEreWithMetadata(TruncateDTO truncateDTO, List<DomainMetadataDTO> domainMetadataDTOS) {
    String content = truncateDTO.getContent();
    Assert.notEmpty(content, "内容不能为空");
    Assert.notEmpty(domainMetadataDTOS, "意图不能为空");

    String template = promptMap.get(EXTRACT_ERE_BASE_INTENT);
    String metadataJson = metadataToJsonStr(domainMetadataDTOS);
    // Named placeholders of the template are filled from this map.
    Map<String, String> placeholders = Map.of("text", content, "domainMetadata", metadataJson);

    // TODO(review): the response is discarded; parsing it into an EREDTO is still missing.
    aiCallService.call(StrUtil.format(template, placeholders));
    return null;
}
/**
 * Serialises domain metadata into a JSON array string.
 *
 * <p>Each element has the keys {@code source}, {@code relation} and {@code target}; every
 * one of those is an object holding a {@code type} and, when the attribute list is not
 * null, an {@code attributes} array of attribute names.
 *
 * @param domainMetadataDTOS the metadata entries to serialise
 * @return the JSON array rendered as a string
 */
private String metadataToJsonStr(List<DomainMetadataDTO> domainMetadataDTOS) {
    JSONArray result = new JSONArray();
    for (DomainMetadataDTO metadataDTO : domainMetadataDTOS) {
        JSONObject metadataJson = new JSONObject();
        metadataJson.set("source", toTypedNode(metadataDTO.getSourceType(), metadataDTO.getSourceAttributes()));
        metadataJson.set("relation", toTypedNode(metadataDTO.getRelation(), metadataDTO.getRelationAttributes()));
        metadataJson.set("target", toTypedNode(metadataDTO.getTargetType(), metadataDTO.getTargetAttributes()));
        result.add(metadataJson);
    }
    return result.toString();
}

/**
 * Builds one {@code {"type": ..., "attributes": [...]}} node of the metadata JSON.
 *
 * @param type       value stored under {@code type}
 * @param attributes attributes whose names go under {@code attributes}; when null the key is omitted
 * @return the assembled JSON object
 */
private JSONObject toTypedNode(String type, Iterable<ERAttributeDTO> attributes) {
    JSONObject node = new JSONObject();
    node.set("type", type);
    if (attributes != null) {
        JSONArray attributeNames = new JSONArray();
        for (ERAttributeDTO attribute : attributes) {
            attributeNames.add(attribute.getAttrName());
        }
        node.set("attributes", attributeNames);
    }
    return node;
}
/**
 * Runs entity-relationship extraction on the fragment's text content.
 *
 * <p>The {@code DOERE_TEXT} prompt is filled with the content and sent to the AI service,
 * and the response is converted with {@code EREDTO.fromTextJson}.
 *
 * @param truncateDTO the fragment whose content is analysed; its id is passed on to the result
 * @return the extraction result built from the service response
 */
private EREDTO doTextEre(TruncateDTO truncateDTO) {
    log.info("doTextEre:开始进行文本实体关系抽取,内容:{}", truncateDTO.getContent());
    String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);
    String formatted = StrUtil.format(prompt, truncateDTO.getContent());
    // A single call through aiCallService; the stale ollamaChatModel call declared
    // `response` a second time and is dropped.
    String response = aiCallService.call(formatted);
    log.info("doTextEre响应结果:{}", response);
    return EREDTO.fromTextJson(response, truncateDTO.getId());
}
@ -241,7 +444,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("doTableEre:开始进行表格实体关系抽取,内容:{}", truncateDTO.getContent());
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);
String formatted = StrUtil.format(prompt, truncateDTO.getContent());
String response = ollamaChatModel.call(formatted);
String response = aiCallService.call(formatted);
log.info("doTableEre响应结果:{}", response);
EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId());
// 手动设置表格标题

@ -1,6 +1,9 @@
package com.supervision.pdfqaserver;
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
import com.supervision.pdfqaserver.dto.EREDTO;
import com.supervision.pdfqaserver.dto.IntentDTO;
import com.supervision.pdfqaserver.dto.TruncateDTO;
import com.supervision.pdfqaserver.service.ChinesEsToEnglishGenerator;
import com.supervision.pdfqaserver.service.KnowledgeGraphService;
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
@ -113,5 +116,39 @@ class PdfQaServerApplicationTests {
System.out.println(classify);
}
/**
 * Smoke test: asks the pipeline for the content type of the argument 5 and prints the
 * result. No assertions are made.
 */
@Test
void makeOutPdfContentTypeTest() {
    // NOTE(review): 5 is presumably a PDF id present in the test database; confirm.
    DocumentContentTypeEnum contentType = tripleConversionPipeline.makeOutPdfContentType(5);
    System.out.println(contentType);
}
/**
 * Smoke test: asks the pipeline for the industry of the argument 5 and prints the result.
 * No assertions are made.
 */
@Test
void makeOutPdfIndustryTest() {
    // NOTE(review): 5 is presumably a PDF id present in the test database; confirm.
    String detectedIndustry = tripleConversionPipeline.makeOutPdfIndustry(5);
    System.out.println(detectedIndustry);
}
/**
 * Smoke test: runs intent recognition on a short fragment and prints the returned list.
 * No assertions are made.
 */
@Test
void makeOutTruncationIntentTest() {
    String content = "# 2、同时按照境外会计准则与按照中国会计准则披露的财务报告中净利润和净资产差异情况 \n" +
            "\n" +
            "□适用 回不适用 ";
    TruncateDTO fragment = new TruncateDTO();
    fragment.setContent(content);

    List<String> recognisedIntents = tripleConversionPipeline.makeOutTruncationIntent(fragment);
    System.out.println(recognisedIntents);
}
/**
 * Smoke test: runs intent recognition on a short fragment against one candidate intent
 * (only its digest is set) and prints the returned intents. No assertions are made.
 */
@Test
void makeOutTruncationIntentTest2() {
    String content = "# 2、同时按照境外会计准则与按照中国会计准则披露的财务报告中净利润和净资产差异情况 \n" +
            "\n" +
            "□适用 回不适用 ";
    TruncateDTO fragment = new TruncateDTO();
    fragment.setContent(content);

    IntentDTO candidate = new IntentDTO();
    candidate.setDigest("财务报告差异分析");

    List<IntentDTO> matchedIntents = tripleConversionPipeline.makeOutTruncationIntent(fragment, List.of(candidate));
    System.out.println(matchedIntents);
}
}

Loading…
Cancel
Save