package com.supervision.pdfqaserver.service.impl; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.lang.Assert; import cn.hutool.core.util.BooleanUtil; import cn.hutool.core.util.RandomUtil; import cn.hutool.core.util.StrUtil; import cn.hutool.json.JSONArray; import cn.hutool.json.JSONObject; import cn.hutool.json.JSONUtil; import com.supervision.pdfqaserver.cache.PromptCache; import com.supervision.pdfqaserver.constant.DocumentContentTypeEnum; import com.supervision.pdfqaserver.constant.LayoutTypeEnum; import com.supervision.pdfqaserver.dto.*; import com.supervision.pdfqaserver.service.*; import edu.stanford.nlp.pipeline.CoreDocument; import edu.stanford.nlp.pipeline.CoreSentence; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; import java.util.*; import java.util.stream.Collectors; import static com.supervision.pdfqaserver.cache.PromptCache.*; @Slf4j @Service @RequiredArgsConstructor public class TripleConversionPipelineImpl implements TripleConversionPipeline { private final AiCallService aiCallService; private final PdfAnalysisOutputService pdfAnalysisOutputService; private final DomainCategoryService domainCategoryService; private final DomainMetadataService domainMetadataService; @Override public DocumentContentTypeEnum makeOutPdfContentType(Integer pdfId) { Assert.notNull(pdfId, "pdfId不能为空"); String promptTemplate = PromptCache.promptMap.get(CLASSIFY_CONTENT_TYPE); // 截取前300个字符 String text = pdfAnalysisOutputService.queryByPdfIdAndLimit(pdfId,300); Assert.notEmpty(text, "text不能为空"); Map param = Map.of("text", text, "ContentType", DocumentContentTypeEnum.formatToString()); String format = StrUtil.format(promptTemplate, param); log.debug("makeOutPdfContentType:prompt内容:{}", format); String call = aiCallService.call(format); log.info("makeOutPdfContentType:响应结果:{}", call); JSONObject jsonObject = JSONUtil.parseObj(call); return DocumentContentTypeEnum.getByType(jsonObject.getStr("ContentType")); } @Override public String makeOutPdfIndustry(Integer pdfId) { List allIndustryNames = domainCategoryService.listAllIndustryNames(); Assert.notEmpty(allIndustryNames, "行业名称不能为空"); String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INDUSTRY); String text = pdfAnalysisOutputService.queryByPdfIdAndLimit(pdfId, 300); String format = StrUtil.format(promptTemplate, Map.of("text", text, "industryCategory", String.join(",", allIndustryNames))); String call = aiCallService.call(format); log.info("makeOutPdfIndustry:响应结果:{}", call); JSONObject json = JSONUtil.parseObj(call); return json.getStr("industryCategory"); } @Override public List makeOutTruncationIntent(TruncateDTO truncate) { Assert.notEmpty(truncate.getContent(), "内容不能为空"); String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INTENT_TRAIN); Map params = Map.of("text", truncate.getContent()); String format = StrUtil.format(promptTemplate, params); String call = aiCallService.call(format); log.info("makeOutTruncationIntent:响应结果:{}", call); JSONObject json = JSONUtil.parseObj(call); JSONArray jsonArray = json.getJSONArray("IntentTypeList"); return jsonArray.stream().map(Object::toString).toList(); } @Override public List makeOutTruncationIntent(TruncateDTO truncate, List intents) { Assert.notEmpty(truncate.getContent(), "内容不能为空"); Assert.notEmpty(intents, "意图不能为空"); String promptTemplate = PromptCache.promptMap.get(CLASSIFY_INTENT); List digestList = intents.stream().map(IntentDTO::getDigest).toList(); Map params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(digestList)); String format = StrUtil.format(promptTemplate, params); String call = aiCallService.call(format); log.info("makeOutTruncationIntent:响应结果:{}", call); JSONObject json = JSONUtil.parseObj(call); JSONArray jsonArray = json.getJSONArray("IntentTypeList"); return intents.stream().filter(intent-> jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest()))) .collect(Collectors.toList()); } @Override public List makeOutDomainMetadata(TruncateDTO truncate,List intents) { Assert.notEmpty(truncate.getContent(), "内容不能为空"); Assert.notEmpty(intents, "意图不能为空"); String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA); Map params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(intents)); String format = StrUtil.format(promptTemplate, params); String call = aiCallService.call(format); log.info("makeOutDomainMetadata:响应结果:{}", call); return parseDomainMetadata(call); } /** * [ * { * "source": { * "type": "实体类型1", * "attributes": ["属性1", "属性2"] * }, * "relation": { * "type": "关系类型", * "attributes": [] * }, * "target": { * "type": "实体类型2", * "attributes": ["属性3"] * }, * "intent": "匹配的意图标签" * } * ] */ private List parseDomainMetadata(String jsonStr) { JSONArray jsonArray = JSONUtil.parseArray(jsonStr); List domainMetadataDTOS = new ArrayList<>(); for (int i = 0; i < jsonArray.size(); i++) { JSONObject jsonObject = jsonArray.getJSONObject(i); DomainMetadataDTO domainMetadataDTO = new DomainMetadataDTO(); JSONObject source = jsonObject.getJSONObject("source"); JSONObject relation = jsonObject.getJSONObject("relation"); JSONObject target = jsonObject.getJSONObject("target"); if (null != source){ String type = source.getStr("type"); JSONArray attributes = source.getJSONArray("attributes"); if (StrUtil.isNotEmpty(type)){ domainMetadataDTO.setSourceType(type); } if (CollUtil.isNotEmpty(attributes)){ List erAttributeDTOS = attributes.stream().map(at -> new ERAttributeDTO(at.toString())).collect(Collectors.toList()); domainMetadataDTO.setSourceAttributes(erAttributeDTOS); } } if (null != relation){ String type = relation.getStr("type"); JSONArray attributes = relation.getJSONArray("attributes"); if (StrUtil.isNotEmpty(type)){ domainMetadataDTO.setRelation(type); } if (CollUtil.isNotEmpty(attributes)){ List erAttributeDTOS = attributes.stream().map(at -> new ERAttributeDTO(at.toString())).collect(Collectors.toList()); domainMetadataDTO.setRelationAttributes(erAttributeDTOS); } } if (null != target){ String type = target.getStr("type"); JSONArray attributes = target.getJSONArray("attributes"); if (StrUtil.isNotEmpty(type)){ domainMetadataDTO.setTargetType(type); } if (CollUtil.isNotEmpty(attributes)){ List erAttributeDTOS = attributes.stream().map(at -> new ERAttributeDTO(at.toString())).collect(Collectors.toList()); domainMetadataDTO.setTargetAttributes(erAttributeDTOS); } } domainMetadataDTOS.add(domainMetadataDTO); } return domainMetadataDTOS; } @Override public EREDTO doEre(TruncateDTO truncateDTO, List intents) { if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TEXT.getCode()))){ if (CollUtil.isEmpty(intents)){ return doTextEre(truncateDTO); } // 查询意图对应的领域元数据 List intentIds = intents.stream().map(IntentDTO::getId).distinct().collect(Collectors.toList()); if (CollUtil.isEmpty(intentIds)) { return null; } List domainMetadataDTOS = domainMetadataService.listByIntentionIds(intentIds); return doTextEreWithMetadata(truncateDTO, domainMetadataDTOS); } if (StrUtil.equals(truncateDTO.getLayoutType(),String.valueOf(LayoutTypeEnum.TABLE.getCode()))){ // 先分析表格是否是描述类型 Boolean classify = this.classify(truncateDTO.getContent()); if (null == classify){ log.info("doEre:表格分类结果为空,切分文档id:{}", truncateDTO.getId()); return null; } if (classify){ return doTextEre(truncateDTO); } return doTableEre(truncateDTO); } log.warn("doEre:错误的布局类型: {}", truncateDTO.getLayoutType()); return null; } /** * 切分文档 * 切分规则: * 文本类型: 以单句为最小单元,最大字数现在这1000字以内。单句超过1000字取完成的单句。 * 表格类型: 以4行数据为最小单元。 * @param documents 文档列表 * @return */ @Override public List sliceDocuments(List documents) { int maxTextLength = 1000; int minTextLength = 800; int INITIAL_BUFFER_SIZE = 1500; // 对pdfAnalysisOutputs进行排序 List documentDTOList = documents.stream().sorted( // 先对pageNo进行排序再对layoutOrder进行排序 (o1, o2) -> { if (o1.getPageNo().equals(o2.getPageNo())) { return Integer.compare(o1.getDisplayOrder(), o2.getDisplayOrder()); } return Integer.compare(o1.getPageNo(), o2.getPageNo()); } ).toList(); Properties props = new Properties(); props.setProperty("annotators", "tokenize, ssplit"); // 创建管道 StanfordCoreNLP pipeline = new StanfordCoreNLP(props); List truncateDTOS = new ArrayList<>(); StringBuilder truncateTextBuild = new StringBuilder(1500); for (DocumentDTO documentDTO : documentDTOList) { String content = documentDTO.getContent(); if (StrUtil.isEmpty(content)){ continue; } Integer layoutType = documentDTO.getLayoutType(); if (LayoutTypeEnum.TEXT.getCode() == layoutType){ // 如果是文本类型的布局,进行合并 CoreDocument document = new CoreDocument(content); // 分析文本 pipeline.annotate(document); // 获取句子 for (CoreSentence sentence : document.sentences()) { if (StrUtil.isEmpty(sentence.text())) { continue; } if (sentence.text().length() >= maxTextLength) { if (truncateTextBuild.length() >= minTextLength) { // 提交缓存内容 truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString())); truncateTextBuild = new StringBuilder(INITIAL_BUFFER_SIZE); } // 提交超长句子 truncateDTOS.add(new TruncateDTO(documentDTO, sentence.text())); } else { if (truncateTextBuild.length() + sentence.text().length() >= minTextLength) { truncateTextBuild.append(sentence.text()); truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString())); truncateTextBuild = new StringBuilder(INITIAL_BUFFER_SIZE); } else { truncateTextBuild.append(sentence.text()); } } } // 处理剩余内容 if (!truncateTextBuild.isEmpty()) { truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString())); } } else if (LayoutTypeEnum.TABLE.getCode() == layoutType) { // 如果是表格类型的布局,进行切分 // 提前抽取表名 TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle()); if (null != tableTitleDTO && StrUtil.isNotEmpty(tableTitleDTO.getTitle())){ documentDTO.setTitle(tableTitleDTO.getTitle()); }else { // 生成一个默认的表 documentDTO.setTitle("tableName-"+ RandomUtil.randomString(10)); } List tableRows = StrUtil.split(documentDTO.getContent(), "\n").stream().filter(StrUtil::isNotEmpty).collect(Collectors.toList()); if (tableRows.size()<5){ TruncateDTO truncateDTO = new TruncateDTO(documentDTO); truncateDTOS.add(truncateDTO); continue; } String tableTitle = tableRows.get(0); // 标题分割符 String tableTitleSplit = tableRows.get(1); List noTitleRows = tableRows.subList(2,tableRows.size()-1); List> rows = CollUtil.split(noTitleRows, 4); for (List row : rows) { StringBuilder sb = new StringBuilder(); sb.append(tableTitle).append("\n"); sb.append(tableTitleSplit).append("\n"); for (String s : row) { sb.append(s).append("\n"); } TruncateDTO truncateDTO = new TruncateDTO(documentDTO); truncateDTO.setContent(sb.toString()); truncateDTOS.add(truncateDTO); } } else { log.info("sliceDocuments:错误的布局类型: {}", layoutType); } } return truncateDTOS; } @Override public EREDTO doEre(TruncateDTO truncateDTO) { return this.doEre(truncateDTO, new ArrayList<>()); } @Override public Boolean classify(String content) { Assert.notEmpty(content, "内容不能为空"); // 对表格内容进行精简,只获取与前四行相关的内容 String[] lines = content.split("\n"); if (lines.length > 5){ StringBuilder sb = new StringBuilder(); for (int i = 0; i < 5; i++) { sb.append(lines[i]).append("\n"); } content = sb.toString(); } log.info("classify:开始进行实体关系分类,内容:{}", content); String prompt = PromptCache.promptMap.get(PromptCache.CLASSIFY_TABLE); String format = StrUtil.format(prompt, content); String response = aiCallService.call(format); log.info("classify响应结果:{}", response); return BooleanUtil.toBooleanObject(response); } @Override public TableTitleDTO extractTableTitle(String content) { TableTitleDTO tableTitleDTO = new TableTitleDTO(); if (StrUtil.isEmpty(content)){ log.warn("extractTableTitle:内容为空"); return tableTitleDTO; } String table = PromptCache.promptMap.get(PromptCache.EXTRACT_TABLE_TITLE); String format = StrUtil.format(table, content); String response = aiCallService.call(format); tableTitleDTO.setTitle(response); return tableTitleDTO; } /** * 文本实体关系抽取 * @param truncateDTO 切分文档 * @param domainMetadataDTOS 领域元数据 * @return */ private EREDTO doTextEreWithMetadata(TruncateDTO truncateDTO, List domainMetadataDTOS) { Assert.notEmpty(truncateDTO.getContent(), "内容不能为空"); Assert.notEmpty(domainMetadataDTOS, "意图不能为空"); String prompt = promptMap.get(EXTRACT_ERE_BASE_INTENT); String domainMetadata = metadataToJsonStr(domainMetadataDTOS); Map params = Map.of("text", truncateDTO.getContent(), "domainMetadata", domainMetadata); String format = StrUtil.format(prompt, params); String call = aiCallService.call(format); return null; } /** * 将领域元数据转换为json字符串 * @param domainMetadataDTOS domainMetadataDTOS * @return */ private String metadataToJsonStr(List domainMetadataDTOS){ JSONArray jsa = new JSONArray(); for (DomainMetadataDTO metadataDTO : domainMetadataDTOS) { JSONObject metadataJson = new JSONObject(); JSONObject source = new JSONObject(); source.set("type", metadataDTO.getSourceType()); if (metadataDTO.getSourceAttributes() != null) { JSONArray sourceAttributes = new JSONArray(); for (ERAttributeDTO attribute : metadataDTO.getSourceAttributes()) { sourceAttributes.add(attribute.getAttrName()); } source.set("attributes", sourceAttributes); } metadataJson.set("source", source); JSONObject relation = new JSONObject(); relation.set("type", metadataDTO.getRelation()); if (metadataDTO.getRelationAttributes() != null) { JSONArray relationAttributes = new JSONArray(); for (ERAttributeDTO attribute : metadataDTO.getRelationAttributes()) { relationAttributes.add(attribute.getAttrName()); } relation.set("attributes", relationAttributes); } metadataJson.set("relation", relation); JSONObject target = new JSONObject(); target.set("type", metadataDTO.getTargetType()); if (metadataDTO.getTargetAttributes() != null) { JSONArray targetAttributes = new JSONArray(); for (ERAttributeDTO attribute : metadataDTO.getTargetAttributes()) { targetAttributes.add(attribute.getAttrName()); } target.set("attributes", targetAttributes); } metadataJson.set("target", target); jsa.add(metadataJson); } return jsa.toString(); } private EREDTO doTextEre(TruncateDTO truncateDTO) { log.info("doTextEre:开始进行文本实体关系抽取,内容:{}", truncateDTO.getContent()); String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT); String formatted = StrUtil.format(prompt, truncateDTO.getContent()); String response = aiCallService.call(formatted); log.info("doTextEre响应结果:{}", response); return EREDTO.fromTextJson(response, truncateDTO.getId()); } private EREDTO doTableEre(TruncateDTO truncateDTO) { log.info("doTableEre:开始进行表格实体关系抽取,内容:{}", truncateDTO.getContent()); String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE); String formatted = StrUtil.format(prompt, truncateDTO.getContent()); String response = aiCallService.call(formatted); log.info("doTableEre响应结果:{}", response); EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId()); // 手动设置表格标题 EntityExtractionDTO titleEntity = new EntityExtractionDTO(); titleEntity.setEntity("表"); titleEntity.setTruncationId(truncateDTO.getId()); titleEntity.setName(truncateDTO.getTitle()); // 添加关系 List relations = new ArrayList<>(); for (EntityExtractionDTO entity : eredto.getEntities()) { RelationExtractionDTO relationExtractionDTO = new RelationExtractionDTO(truncateDTO.getId(), titleEntity.getName(), titleEntity.getEntity(), "包含", entity.getName(), entity.getEntity(), entity.getAttributes()); relations.add(relationExtractionDTO); } eredto.getEntities().add(titleEntity); eredto.setRelations(relations); return eredto; } /** * 合并实体关系抽取结果 主要是对实体和关系中的属性进行合并 * 表不参与合并 * @param eredtoList 实体关系抽取结果列表 * @return */ @Override public List mergeEreResults(List eredtoList) { List merged = new ArrayList<>(); if (CollUtil.isEmpty(eredtoList)){ return merged; } // 将表单独拿出来 merged = eredtoList.stream().filter(ere-> ere.getEntities().stream().anyMatch(e->StrUtil.equals(e.getEntity(),"表"))).collect(Collectors.toList()); // 把剩下的数据进行合并计算 eredtoList = eredtoList.stream().filter(ere-> ere.getEntities().stream().noneMatch(e->StrUtil.equals(e.getEntity(),"表"))).collect(Collectors.toList()); Map entityMap = new HashMap<>(); Map relationMap = new HashMap<>(); for (EREDTO eredto : eredtoList) { List entities = eredto.getEntities(); if (CollUtil.isNotEmpty(entities)){ for (EntityExtractionDTO entity : entities) { String key = generateEntityMapKey(entity); mergeAttribute(entityMap,entity, key); } } List relations = eredto.getRelations(); if (CollUtil.isNotEmpty(relations)){ for (RelationExtractionDTO relation : relations) { // source和target,re完全相等看作是同一个数据 String relationMapKey = generateRelationMapKey(relation); mergeAttribute(relationMap,relation, relationMapKey); } } } // 利用合并后的map生成新的EREDTO // 优先先把有关系的节点与关系组合在一次 Set relationEntityKey = new HashSet<>(); for (Map.Entry relationEntry : relationMap.entrySet()) { RelationExtractionDTO value = relationEntry.getValue(); EntityExtractionDTO sourceEntity = entityMap.get(StrUtil.join("_",value.getSourceType(), value.getSource())); if (null == sourceEntity){ log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到头节点映射关系", value.getSourceType(), value.getSource()); continue; } EntityExtractionDTO targetEntity = entityMap.get(StrUtil.join("_", value.getTargetType(),value.getTarget())); if (null == targetEntity){ log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到尾节点映射关系", value.getTargetType(), value.getTarget()); continue; } EREDTO eredto = new EREDTO(); eredto.setEntities(List.of(sourceEntity,targetEntity)); eredto.setRelations(List.of(value)); merged.add(eredto); relationEntityKey.addAll(List.of(generateEntityMapKey(sourceEntity),generateEntityMapKey(targetEntity))); } // 将没有关系的节点单独放在一起 List leavedEntities = new ArrayList<>(); for (Map.Entry entry : entityMap.entrySet()) { if (!relationEntityKey.contains(entry.getKey())){ leavedEntities.add(entry.getValue()); } } if (CollUtil.isNotEmpty(leavedEntities)){ EREDTO eredto = new EREDTO(); eredto.setEntities(leavedEntities); merged.add(eredto); } return merged; } private void mergeAttribute(Map entityMap,RelationExtractionDTO relation, String key) { RelationExtractionDTO cachedRelation = entityMap.get(key); if (null == cachedRelation){ entityMap.put(key, relation); }else { if (CollUtil.isEmpty(relation.getAttributes())){ return; } // 合并属性 List cachedAttributes = cachedRelation.getAttributes(); if (null == cachedAttributes){ cachedAttributes = new ArrayList<>(); } for (TruncationERAttributeDTO attribute : relation.getAttributes()) { String attributeKey = attribute.getAttribute(); String attributeValue = attribute.getValue(); if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)){ continue; } // 如果属性已经存在,则不添加 if (cachedAttributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) { cachedAttributes.add(attribute); } } } } private void mergeAttribute(Map entityMap,EntityExtractionDTO entity, String key) { EntityExtractionDTO cachedEntity = entityMap.get(key); if (null == cachedEntity){ entityMap.put(key, entity); }else { if (CollUtil.isEmpty(entity.getAttributes())){ return; } // 合并属性 List cachedAttributes = cachedEntity.getAttributes(); if (null == cachedAttributes){ cachedAttributes = new ArrayList<>(); cachedEntity.setAttributes(cachedAttributes); } for (TruncationERAttributeDTO attribute : entity.getAttributes()) { String attributeKey = attribute.getAttribute(); String attributeValue = attribute.getValue(); if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)){ continue; } // 如果属性已经存在,则不添加 if (cachedAttributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) { cachedAttributes.add(attribute); } } } } private String generateEntityMapKey(EntityExtractionDTO entityExtractionDTO) { return entityExtractionDTO.getEntity() + "_" + entityExtractionDTO.getName(); } private String generateRelationMapKey(RelationExtractionDTO relationExtractionDTO) { return relationExtractionDTO.getSource()+ "_" + relationExtractionDTO.getRelation() + "_" + relationExtractionDTO.getTarget(); } }