|
|
|
@ -5,56 +5,207 @@ import cn.hutool.core.lang.Assert;
|
|
|
|
|
import cn.hutool.core.util.BooleanUtil;
|
|
|
|
|
import cn.hutool.core.util.RandomUtil;
|
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
|
import cn.hutool.json.JSONArray;
|
|
|
|
|
import cn.hutool.json.JSONObject;
|
|
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
|
|
import com.supervision.pdfqaserver.cache.PromptCache;
|
|
|
|
|
import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum;
|
|
|
|
|
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
|
|
|
|
|
import com.supervision.pdfqaserver.dto.*;
|
|
|
|
|
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
|
|
|
|
|
import com.supervision.pdfqaserver.service.*;
|
|
|
|
|
import edu.stanford.nlp.pipeline.CoreDocument;
|
|
|
|
|
import edu.stanford.nlp.pipeline.CoreSentence;
|
|
|
|
|
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
|
|
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
|
import org.springframework.ai.ollama.OllamaChatModel;
|
|
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
import static com.supervision.pdfqaserver.cache.PromptCache.*;
|
|
|
|
|
|
|
|
|
|
@Slf4j
|
|
|
|
|
@Service
|
|
|
|
|
@RequiredArgsConstructor
|
|
|
|
|
public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
|
|
|
|
|
private final OllamaChatModel ollamaChatModel;
|
|
|
|
|
private final AiCallService aiCallService;
|
|
|
|
|
|
|
|
|
|
private final PdfAnalysisOutputService pdfAnalysisOutputService;
|
|
|
|
|
|
|
|
|
|
private final DomainCategoryService domainCategoryService;
|
|
|
|
|
|
|
|
|
|
private final DomainMetadataService domainMetadataService;
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public DocumentContentTypeEnum makeOutPdfContentType(Integer pdfId) {
|
|
|
|
|
return null;
|
|
|
|
|
Assert.notNull(pdfId, "pdfId不能为空");
|
|
|
|
|
String promptTemplate = PromptCache.promptMap.get(CLASSIFY_CONTENT_TYPE);
|
|
|
|
|
|
|
|
|
|
// 截取前300个字符
|
|
|
|
|
String text = pdfAnalysisOutputService.queryByPdfIdAndLimit(pdfId,300);
|
|
|
|
|
Assert.notEmpty(text, "text不能为空");
|
|
|
|
|
|
|
|
|
|
Map<String, String> param = Map.of("text", text, "ContentType", DocumentContentTypeEnum.formatToString());
|
|
|
|
|
String format = StrUtil.format(promptTemplate, param);
|
|
|
|
|
log.debug("makeOutPdfContentType:prompt内容:{}", format);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
log.info("makeOutPdfContentType:响应结果:{}", call);
|
|
|
|
|
JSONObject jsonObject = JSONUtil.parseObj(call);
|
|
|
|
|
return DocumentContentTypeEnum.getByType(jsonObject.getStr("ContentType"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String makeOutPdfIndustry(Integer pdfId) {
|
|
|
|
|
return null;
|
|
|
|
|
List<String> allIndustryNames = domainCategoryService.listAllIndustryNames();
|
|
|
|
|
Assert.notEmpty(allIndustryNames, "行业名称不能为空");
|
|
|
|
|
String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INDUSTRY);
|
|
|
|
|
String text = pdfAnalysisOutputService.queryByPdfIdAndLimit(pdfId, 300);
|
|
|
|
|
String format = StrUtil.format(promptTemplate, Map.of("text", text, "industryCategory", String.join(",", allIndustryNames)));
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
log.info("makeOutPdfIndustry:响应结果:{}", call);
|
|
|
|
|
JSONObject json = JSONUtil.parseObj(call);
|
|
|
|
|
return json.getStr("industryCategory");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public List<String> makeOutTruncationIntent(TruncateDTO truncate) {
|
|
|
|
|
return null;
|
|
|
|
|
Assert.notEmpty(truncate.getContent(), "内容不能为空");
|
|
|
|
|
String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INTENT_TRAIN);
|
|
|
|
|
Map<String, String> params = Map.of("text", truncate.getContent());
|
|
|
|
|
String format = StrUtil.format(promptTemplate, params);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
log.info("makeOutTruncationIntent:响应结果:{}", call);
|
|
|
|
|
JSONObject json = JSONUtil.parseObj(call);
|
|
|
|
|
JSONArray jsonArray = json.getJSONArray("IntentTypeList");
|
|
|
|
|
return jsonArray.stream().map(Object::toString).toList();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public List<IntentDTO> makeOutTruncationIntent(TruncateDTO truncate, List<IntentDTO> intents) {
|
|
|
|
|
return null;
|
|
|
|
|
Assert.notEmpty(truncate.getContent(), "内容不能为空");
|
|
|
|
|
Assert.notEmpty(intents, "意图不能为空");
|
|
|
|
|
|
|
|
|
|
String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INTENT);
|
|
|
|
|
List<String> digestList = intents.stream().map(IntentDTO::getDigest).toList();
|
|
|
|
|
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(digestList));
|
|
|
|
|
String format = StrUtil.format(promptTemplate, params);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
log.info("makeOutTruncationIntent:响应结果:{}", call);
|
|
|
|
|
JSONObject json = JSONUtil.parseObj(call);
|
|
|
|
|
JSONArray jsonArray = json.getJSONArray("IntentTypeList");
|
|
|
|
|
return intents.stream().filter(intent->
|
|
|
|
|
jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest())))
|
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public List<DomainMetadataDTO> makeOutDomainMetadata(TruncateDTO truncate,List<String> intents) {
|
|
|
|
|
return null;
|
|
|
|
|
Assert.notEmpty(truncate.getContent(), "内容不能为空");
|
|
|
|
|
Assert.notEmpty(intents, "意图不能为空");
|
|
|
|
|
|
|
|
|
|
String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA);
|
|
|
|
|
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(intents));
|
|
|
|
|
String format = StrUtil.format(promptTemplate, params);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
log.info("makeOutDomainMetadata:响应结果:{}", call);
|
|
|
|
|
return parseDomainMetadata(call);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* [
|
|
|
|
|
* {
|
|
|
|
|
* "source": {
|
|
|
|
|
* "type": "实体类型1",
|
|
|
|
|
* "attributes": ["属性1", "属性2"]
|
|
|
|
|
* },
|
|
|
|
|
* "relation": {
|
|
|
|
|
* "type": "关系类型",
|
|
|
|
|
* "attributes": []
|
|
|
|
|
* },
|
|
|
|
|
* "target": {
|
|
|
|
|
* "type": "实体类型2",
|
|
|
|
|
* "attributes": ["属性3"]
|
|
|
|
|
* },
|
|
|
|
|
* "intent": "匹配的意图标签"
|
|
|
|
|
* }
|
|
|
|
|
* ]
|
|
|
|
|
*/
|
|
|
|
|
private List<DomainMetadataDTO> parseDomainMetadata(String jsonStr) {
|
|
|
|
|
JSONArray jsonArray = JSONUtil.parseArray(jsonStr);
|
|
|
|
|
List<DomainMetadataDTO> domainMetadataDTOS = new ArrayList<>();
|
|
|
|
|
for (int i = 0; i < jsonArray.size(); i++) {
|
|
|
|
|
JSONObject jsonObject = jsonArray.getJSONObject(i);
|
|
|
|
|
DomainMetadataDTO domainMetadataDTO = new DomainMetadataDTO();
|
|
|
|
|
JSONObject source = jsonObject.getJSONObject("source");
|
|
|
|
|
JSONObject relation = jsonObject.getJSONObject("relation");
|
|
|
|
|
JSONObject target = jsonObject.getJSONObject("target");
|
|
|
|
|
if (null != source){
|
|
|
|
|
String type = source.getStr("type");
|
|
|
|
|
JSONArray attributes = source.getJSONArray("attributes");
|
|
|
|
|
if (StrUtil.isNotEmpty(type)){
|
|
|
|
|
domainMetadataDTO.setSourceType(type);
|
|
|
|
|
}
|
|
|
|
|
if (CollUtil.isNotEmpty(attributes)){
|
|
|
|
|
List<ERAttributeDTO> erAttributeDTOS = attributes.stream().map(at -> new ERAttributeDTO(at.toString())).collect(Collectors.toList());
|
|
|
|
|
domainMetadataDTO.setSourceAttributes(erAttributeDTOS);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (null != relation){
|
|
|
|
|
String type = relation.getStr("type");
|
|
|
|
|
JSONArray attributes = relation.getJSONArray("attributes");
|
|
|
|
|
if (StrUtil.isNotEmpty(type)){
|
|
|
|
|
domainMetadataDTO.setRelation(type);
|
|
|
|
|
}
|
|
|
|
|
if (CollUtil.isNotEmpty(attributes)){
|
|
|
|
|
List<ERAttributeDTO> erAttributeDTOS = attributes.stream().map(at -> new ERAttributeDTO(at.toString())).collect(Collectors.toList());
|
|
|
|
|
domainMetadataDTO.setRelationAttributes(erAttributeDTOS);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (null != target){
|
|
|
|
|
String type = target.getStr("type");
|
|
|
|
|
JSONArray attributes = target.getJSONArray("attributes");
|
|
|
|
|
if (StrUtil.isNotEmpty(type)){
|
|
|
|
|
domainMetadataDTO.setTargetType(type);
|
|
|
|
|
}
|
|
|
|
|
if (CollUtil.isNotEmpty(attributes)){
|
|
|
|
|
List<ERAttributeDTO> erAttributeDTOS = attributes.stream().map(at -> new ERAttributeDTO(at.toString())).collect(Collectors.toList());
|
|
|
|
|
domainMetadataDTO.setTargetAttributes(erAttributeDTOS);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
domainMetadataDTOS.add(domainMetadataDTO);
|
|
|
|
|
}
|
|
|
|
|
return domainMetadataDTOS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public EREDTO doEre(TruncateDTO truncateDTO, List<IntentDTO> intents) {
|
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){
|
|
|
|
|
if (CollUtil.isEmpty(intents)){
|
|
|
|
|
return doTextEre(truncateDTO);
|
|
|
|
|
}
|
|
|
|
|
// 查询意图对应的领域元数据
|
|
|
|
|
List<String> intentIds = intents.stream().map(IntentDTO::getId).distinct().collect(Collectors.toList());
|
|
|
|
|
if (CollUtil.isEmpty(intentIds)) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
List<DomainMetadataDTO> domainMetadataDTOS = domainMetadataService.listByIntentionIds(intentIds);
|
|
|
|
|
return doTextEreWithMetadata(truncateDTO, domainMetadataDTOS);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
|
|
|
|
|
// 先分析表格是否是描述类型
|
|
|
|
|
Boolean classify = this.classify(truncateDTO.getContent());
|
|
|
|
|
if (null == classify){
|
|
|
|
|
log.info("doEre:表格分类结果为空,切分文档id:{}", truncateDTO.getId());
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
if (classify){
|
|
|
|
|
return doTextEre(truncateDTO);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return doTableEre(truncateDTO);
|
|
|
|
|
}
|
|
|
|
|
log.warn("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -172,25 +323,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
@Override
|
|
|
|
|
public EREDTO doEre(TruncateDTO truncateDTO) {
|
|
|
|
|
|
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){
|
|
|
|
|
return doTextEre(truncateDTO);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
|
|
|
|
|
// 先分析表格是否是描述类型
|
|
|
|
|
Boolean classify = this.classify(truncateDTO.getContent());
|
|
|
|
|
if (null == classify){
|
|
|
|
|
log.info("doEre:表格分类结果为空,切分文档id:{}", truncateDTO.getId());
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
if (classify){
|
|
|
|
|
return doTextEre(truncateDTO);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return doTableEre(truncateDTO);
|
|
|
|
|
}
|
|
|
|
|
log.warn("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
|
|
|
|
|
return null;
|
|
|
|
|
return this.doEre(truncateDTO, new ArrayList<>());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@ -209,7 +342,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
String prompt = PromptCache.promptMap.get(PromptCache.CLASSIFY_TABLE);
|
|
|
|
|
|
|
|
|
|
String format = StrUtil.format(prompt, content);
|
|
|
|
|
String response = ollamaChatModel.call(format);
|
|
|
|
|
String response = aiCallService.call(format);
|
|
|
|
|
log.info("classify响应结果:{}", response);
|
|
|
|
|
return BooleanUtil.toBooleanObject(response);
|
|
|
|
|
}
|
|
|
|
@ -223,16 +356,86 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
}
|
|
|
|
|
String table = PromptCache.promptMap.get(PromptCache.EXTRACT_TABLE_TITLE);
|
|
|
|
|
String format = StrUtil.format(table, content);
|
|
|
|
|
String response = ollamaChatModel.call(format);
|
|
|
|
|
String response = aiCallService.call(format);
|
|
|
|
|
tableTitleDTO.setTitle(response);
|
|
|
|
|
return tableTitleDTO;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 文本实体关系抽取
|
|
|
|
|
* @param truncateDTO 切分文档
|
|
|
|
|
* @param domainMetadataDTOS 领域元数据
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
private EREDTO doTextEreWithMetadata(TruncateDTO truncateDTO, List<DomainMetadataDTO> domainMetadataDTOS) {
|
|
|
|
|
|
|
|
|
|
Assert.notEmpty(truncateDTO.getContent(), "内容不能为空");
|
|
|
|
|
Assert.notEmpty(domainMetadataDTOS, "意图不能为空");
|
|
|
|
|
|
|
|
|
|
String prompt = promptMap.get(EXTRACT_ERE_BASE_INTENT);
|
|
|
|
|
String domainMetadata = metadataToJsonStr(domainMetadataDTOS);
|
|
|
|
|
|
|
|
|
|
Map<String, String> params = Map.of("text", truncateDTO.getContent(), "domainMetadata", domainMetadata);
|
|
|
|
|
String format = StrUtil.format(prompt, params);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 将领域元数据转换为json字符串
|
|
|
|
|
* @param domainMetadataDTOS domainMetadataDTOS
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
private String metadataToJsonStr(List<DomainMetadataDTO> domainMetadataDTOS){
|
|
|
|
|
JSONArray jsa = new JSONArray();
|
|
|
|
|
for (DomainMetadataDTO metadataDTO : domainMetadataDTOS) {
|
|
|
|
|
JSONObject metadataJson = new JSONObject();
|
|
|
|
|
JSONObject source = new JSONObject();
|
|
|
|
|
source.set("type", metadataDTO.getSourceType());
|
|
|
|
|
if (metadataDTO.getSourceAttributes() != null) {
|
|
|
|
|
JSONArray sourceAttributes = new JSONArray();
|
|
|
|
|
for (ERAttributeDTO attribute : metadataDTO.getSourceAttributes()) {
|
|
|
|
|
sourceAttributes.add(attribute.getAttrName());
|
|
|
|
|
}
|
|
|
|
|
source.set("attributes", sourceAttributes);
|
|
|
|
|
}
|
|
|
|
|
metadataJson.set("source", source);
|
|
|
|
|
|
|
|
|
|
JSONObject relation = new JSONObject();
|
|
|
|
|
relation.set("type", metadataDTO.getRelation());
|
|
|
|
|
if (metadataDTO.getRelationAttributes() != null) {
|
|
|
|
|
JSONArray relationAttributes = new JSONArray();
|
|
|
|
|
for (ERAttributeDTO attribute : metadataDTO.getRelationAttributes()) {
|
|
|
|
|
relationAttributes.add(attribute.getAttrName());
|
|
|
|
|
}
|
|
|
|
|
relation.set("attributes", relationAttributes);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
metadataJson.set("relation", relation);
|
|
|
|
|
JSONObject target = new JSONObject();
|
|
|
|
|
target.set("type", metadataDTO.getTargetType());
|
|
|
|
|
if (metadataDTO.getTargetAttributes() != null) {
|
|
|
|
|
JSONArray targetAttributes = new JSONArray();
|
|
|
|
|
for (ERAttributeDTO attribute : metadataDTO.getTargetAttributes()) {
|
|
|
|
|
targetAttributes.add(attribute.getAttrName());
|
|
|
|
|
}
|
|
|
|
|
target.set("attributes", targetAttributes);
|
|
|
|
|
}
|
|
|
|
|
metadataJson.set("target", target);
|
|
|
|
|
jsa.add(metadataJson);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return jsa.toString();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private EREDTO doTextEre(TruncateDTO truncateDTO) {
|
|
|
|
|
log.info("doTextEre:开始进行文本实体关系抽取,内容:{}", truncateDTO.getContent());
|
|
|
|
|
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);
|
|
|
|
|
String formatted = StrUtil.format(prompt, truncateDTO.getContent());
|
|
|
|
|
String response = ollamaChatModel.call(formatted);
|
|
|
|
|
String response = aiCallService.call(formatted);
|
|
|
|
|
log.info("doTextEre响应结果:{}", response);
|
|
|
|
|
return EREDTO.fromTextJson(response, truncateDTO.getId());
|
|
|
|
|
}
|
|
|
|
@ -241,7 +444,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
log.info("doTableEre:开始进行表格实体关系抽取,内容:{}", truncateDTO.getContent());
|
|
|
|
|
String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);
|
|
|
|
|
String formatted = StrUtil.format(prompt, truncateDTO.getContent());
|
|
|
|
|
String response = ollamaChatModel.call(formatted);
|
|
|
|
|
String response = aiCallService.call(formatted);
|
|
|
|
|
log.info("doTableEre响应结果:{}", response);
|
|
|
|
|
EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId());
|
|
|
|
|
// 手动设置表格标题
|
|
|
|
|