package com.supervision.pdfqaserver.service.impl;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.cache.PromptCache;
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
import com.supervision.pdfqaserver.dto.*;
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.stereotype.Service;

import java.util.*;

/**
 * Pipeline that slices analysed PDF layouts into chunks, asks the chat model to
 * extract entities and relations from each chunk, and merges the extraction results.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class TripleConversionPipelineImpl implements TripleConversionPipeline {

    private final OllamaChatModel ollamaChatModel;

    /**
     * Orders the layouts by page number and then by layout order, and slices them:
     * text layouts are split into one chunk per sentence, table layouts are kept whole.
     * Layouts with empty content are skipped; layouts of any other type are logged and skipped.
     *
     * @param documents layouts to slice; {@code null} or empty yields an empty list
     * @return the chunks in page/layout order
     */
    @Override
    public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) {
        List<TruncateDTO> truncateDTOS = new ArrayList<>();
        if (CollUtil.isEmpty(documents)) {
            return truncateDTOS;
        }

        // Sort by pageNo first, then by layoutOrder within a page.
        List<DocumentDTO> orderedDocuments = documents.stream()
                .sorted(Comparator.comparingInt(DocumentDTO::getPageNo)
                        .thenComparingInt(DocumentDTO::getLayoutOrder))
                .toList();

        // Sentence splitting only needs the tokenizer and the sentence splitter.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        for (DocumentDTO documentDTO : orderedDocuments) {
            String content = documentDTO.getContent();
            if (StrUtil.isEmpty(content)) {
                continue;
            }
            Integer layoutType = documentDTO.getLayoutType();
            // Objects.equals keeps a null layoutType from throwing and avoids
            // comparing boxed values by reference.
            if (Objects.equals(layoutType, LayoutTypeEnum.TEXT.getCode())) {
                // Text layout: emit one chunk per sentence.
                CoreDocument document = new CoreDocument(content);
                pipeline.annotate(document);
                for (CoreSentence sentence : document.sentences()) {
                    TruncateDTO truncateDTO = new TruncateDTO(documentDTO);
                    truncateDTO.setContent(sentence.text());
                    truncateDTOS.add(truncateDTO);
                }
            } else if (Objects.equals(layoutType, LayoutTypeEnum.TABLE.getCode())) {
                // Table layout: keep the whole table as a single chunk.
                truncateDTOS.add(new TruncateDTO(documentDTO));
            } else {
                log.info("sliceDocuments:错误的布局类型: {}", layoutType);
            }
        }
        return truncateDTOS;
    }

    /**
     * Runs entity/relation extraction on one chunk, choosing the text or table prompt
     * from the chunk's layout type ("0" is routed to the text prompt, "1" to the table prompt).
     *
     * @param truncateDTO the chunk to process
     * @return the extraction result, or {@code null} when the layout type is not recognised
     */
    @Override
    public EREDTO doEre(TruncateDTO truncateDTO) {
        if (StrUtil.equals(truncateDTO.getLayoutType(), "0")) {
            return doTextEre(truncateDTO);
        }
        if (StrUtil.equals(truncateDTO.getLayoutType(), "1")) {
            return doTableEre(truncateDTO);
        }
        log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
        return null;
    }

    /**
     * Formats the text-extraction prompt with the chunk content, calls the chat model
     * and parses the response.
     */
    private EREDTO doTextEre(TruncateDTO truncateDTO) {
        String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);
        String formatted = String.format(prompt, truncateDTO.getContent());
        String response = ollamaChatModel.call(formatted);
        // TODO: abnormal model responses are not handled yet.
        return EREDTO.fromTextJson(response, truncateDTO.getId());
    }

    /**
     * Formats the table-extraction prompt with the chunk content, calls the chat model
     * and parses the response.
     */
    private EREDTO doTableEre(TruncateDTO truncateDTO) {
        String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);
        String formatted = String.format(prompt, truncateDTO.getContent());
        String response = ollamaChatModel.call(formatted);
        // TODO: abnormal model responses are not handled yet.
        return EREDTO.fromTableJson(response, truncateDTO.getId());
    }

    /**
     * Merges extraction results: entities sharing the same type and name, and relations
     * sharing the same source, target and relation, are collapsed into one object whose
     * attribute list is the union (by attribute name) of all occurrences.
     *
     * <p>The result holds one {@link EREDTO} per merged relation, containing the relation
     * and its two endpoint entities, followed by a final {@link EREDTO} holding every entity
     * that takes part in no emitted relation (its entity list may be empty). Relations whose
     * source or target entity cannot be found are logged and dropped.
     *
     * @param eredtoList extraction results to merge
     * @return the merged results; empty when {@code eredtoList} is {@code null} or empty
     */
    @Override
    public List<EREDTO> mergeEreResults(List<EREDTO> eredtoList) {
        List<EREDTO> merged = new ArrayList<>();
        if (CollUtil.isEmpty(eredtoList)) {
            return merged;
        }

        // Linked maps keep first-seen order so the merged output is deterministic.
        Map<String, EntityExtractionDTO> entityMap = new LinkedHashMap<>();
        Map<String, RelationExtractionDTO> relationMap = new LinkedHashMap<>();
        for (EREDTO eredto : eredtoList) {
            List<EntityExtractionDTO> entities = eredto.getEntities();
            if (CollUtil.isNotEmpty(entities)) {
                for (EntityExtractionDTO entity : entities) {
                    mergeAttribute(entityMap, entity, generateEntityMapKey(entity));
                }
            }
            List<RelationExtractionDTO> relations = eredto.getRelations();
            if (CollUtil.isNotEmpty(relations)) {
                for (RelationExtractionDTO relation : relations) {
                    // Relations with identical source, target and relation are the same record.
                    mergeAttribute(relationMap, relation, generateRelationMapKey(relation));
                }
            }
        }

        // First pair every relation with its two endpoint entities.
        Set<String> relationEntityKeys = new HashSet<>();
        for (RelationExtractionDTO relation : relationMap.values()) {
            String sourceKey = entityKey(relation.getSourceType(), relation.getSource());
            EntityExtractionDTO sourceEntity = entityMap.get(sourceKey);
            if (null == sourceEntity) {
                log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到头节点映射关系", relation.getSourceType(), relation.getSource());
                continue;
            }
            String targetKey = entityKey(relation.getTargetType(), relation.getTarget());
            EntityExtractionDTO targetEntity = entityMap.get(targetKey);
            if (null == targetEntity) {
                log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到尾节点映射关系", relation.getTargetType(), relation.getTarget());
                continue;
            }
            EREDTO eredto = new EREDTO();
            eredto.setEntities(List.of(sourceEntity, targetEntity));
            eredto.setRelations(List.of(relation));
            merged.add(eredto);
            relationEntityKeys.add(sourceKey);
            relationEntityKeys.add(targetKey);
        }

        // Then collect the entities that are not part of any emitted relation.
        List<EntityExtractionDTO> leavedEntities = new ArrayList<>();
        for (Map.Entry<String, EntityExtractionDTO> entry : entityMap.entrySet()) {
            if (!relationEntityKeys.contains(entry.getKey())) {
                leavedEntities.add(entry.getValue());
            }
        }
        EREDTO eredto = new EREDTO();
        eredto.setEntities(leavedEntities);
        merged.add(eredto);
        return merged;
    }

    /**
     * Registers {@code relation} under {@code key}, or, when a relation is already cached
     * under that key, merges the new relation's attributes into the cached one.
     */
    private void mergeAttribute(Map<String, RelationExtractionDTO> relationMap, RelationExtractionDTO relation, String key) {
        RelationExtractionDTO cachedRelation = relationMap.get(key);
        if (null == cachedRelation) {
            relationMap.put(key, relation);
            return;
        }
        if (CollUtil.isEmpty(relation.getAttributes())) {
            return;
        }
        // Merge into the CACHED relation; reading the target list from the incoming
        // relation would make the merge a no-op.
        List<ERAttributeDTO> cachedAttributes = cachedRelation.getAttributes();
        if (null == cachedAttributes) {
            // NOTE(review): relies on the DTO exposing setAttributes (not visible in this file).
            cachedAttributes = new ArrayList<>();
            cachedRelation.setAttributes(cachedAttributes);
        }
        appendMissingAttributes(cachedAttributes, relation.getAttributes());
    }

    /**
     * Registers {@code entity} under {@code key}, or, when an entity is already cached
     * under that key, merges the new entity's attributes into the cached one.
     */
    private void mergeAttribute(Map<String, EntityExtractionDTO> entityMap, EntityExtractionDTO entity, String key) {
        EntityExtractionDTO cachedEntity = entityMap.get(key);
        if (null == cachedEntity) {
            entityMap.put(key, entity);
            return;
        }
        if (CollUtil.isEmpty(entity.getAttributes())) {
            return;
        }
        // Merge into the CACHED entity; reading the target list from the incoming
        // entity would make the merge a no-op.
        List<ERAttributeDTO> cachedAttributes = cachedEntity.getAttributes();
        if (null == cachedAttributes) {
            // NOTE(review): relies on the DTO exposing setAttributes (not visible in this file).
            cachedAttributes = new ArrayList<>();
            cachedEntity.setAttributes(cachedAttributes);
        }
        appendMissingAttributes(cachedAttributes, entity.getAttributes());
    }

    /**
     * Appends to {@code target} every attribute of {@code incoming} that has a non-empty
     * name and value and whose name is not yet present in {@code target}.
     * Assumes {@code target} is a mutable list.
     */
    private static void appendMissingAttributes(List<ERAttributeDTO> target, List<ERAttributeDTO> incoming) {
        for (ERAttributeDTO attribute : incoming) {
            String attributeKey = attribute.getAttribute();
            String attributeValue = attribute.getValue();
            if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)) {
                continue;
            }
            // An attribute name that already exists keeps its first-seen value.
            boolean alreadyPresent = target.stream()
                    .anyMatch(existing -> StrUtil.equals(existing.getAttribute(), attributeKey));
            if (!alreadyPresent) {
                target.add(attribute);
            }
        }
    }

    /** Builds the entity-map key for an entity: {@code <type>_<name>}. */
    private String generateEntityMapKey(EntityExtractionDTO entityExtractionDTO) {
        return entityKey(entityExtractionDTO.getEntity(), entityExtractionDTO.getName());
    }

    /**
     * Single definition of the entity key format, shared by registration and lookup
     * so the two can never drift apart.
     */
    private static String entityKey(String type, String name) {
        return type + "_" + name;
    }

    /** Builds the relation-map key: {@code <source>_<target>_<relation>}. */
    private String generateRelationMapKey(RelationExtractionDTO relationExtractionDTO) {
        return relationExtractionDTO.getSource() + "_" + relationExtractionDTO.getTarget() + "_" + relationExtractionDTO.getRelation();
    }
}