pdf-qa-server/src/main/java/com/supervision/pdfqaserver/service/impl/TripleConversionPipelineImp...

package com.supervision.pdfqaserver.service.impl;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.supervision.pdfqaserver.cache.PromptCache;
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
import com.supervision.pdfqaserver.dto.*;
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.stereotype.Service;

import java.util.*;

@Slf4j
@Service
@RequiredArgsConstructor
public class TripleConversionPipelineImpl implements TripleConversionPipeline {

    private final OllamaChatModel ollamaChatModel;

    /**
     * Splits parsed layout blocks into the fragments used for entity/relation extraction.
     *
     * <p>Blocks are processed in page order, then layout order. A text block is split into
     * sentences with a Stanford CoreNLP {@code tokenize, ssplit} pipeline and yields one fragment
     * per sentence; a table block yields a single fragment. Blocks with empty content are skipped,
     * and blocks with any other (or missing) layout type are logged and skipped.
     *
     * @param documents layout blocks to slice; may be {@code null} or empty
     * @return the fragments in processing order; never {@code null}
     */
    @Override
    public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) {
        List<TruncateDTO> truncateDTOS = new ArrayList<>();
        if (CollUtil.isEmpty(documents)) {
            // Nothing to slice, and no reason to build the NLP pipeline.
            return truncateDTOS;
        }

        // Sort by pageNo first, then by layoutOrder within the same page.
        List<DocumentDTO> documentDTOList = documents.stream()
                .sorted(Comparator.comparingInt(DocumentDTO::getPageNo)
                        .thenComparingInt(DocumentDTO::getLayoutOrder))
                .toList();

        // Sentence splitting only needs the tokenizer and the sentence splitter.
        // NOTE(review): the pipeline is rebuilt on every call; consider caching it if this
        // method turns out to be hot.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        for (DocumentDTO documentDTO : documentDTOList) {
            String content = documentDTO.getContent();
            if (StrUtil.isEmpty(content)) {
                continue;
            }
            Integer layoutType = documentDTO.getLayoutType();
            // Objects.equals compares by value and tolerates a null layout type, unlike '==' on
            // a boxed Integer.
            if (Objects.equals(LayoutTypeEnum.TEXT.getCode(), layoutType)) {
                // Text layout: emit one fragment per detected sentence.
                CoreDocument document = new CoreDocument(content);
                pipeline.annotate(document);
                for (CoreSentence sentence : document.sentences()) {
                    TruncateDTO truncateDTO = new TruncateDTO(documentDTO);
                    truncateDTO.setContent(sentence.text());
                    truncateDTOS.add(truncateDTO);
                }
            } else if (Objects.equals(LayoutTypeEnum.TABLE.getCode(), layoutType)) {
                // Table layout: keep the block as a single fragment.
                truncateDTOS.add(new TruncateDTO(documentDTO));
            } else {
                log.info("sliceDocuments:错误的布局类型: {}", layoutType);
            }
        }
        return truncateDTOS;
    }

    /**
     * Runs entity/relation extraction on one fragment, choosing the extractor by layout type.
     *
     * <p>Layout type {@code "0"} is handled by {@code doTextEre} and {@code "1"} by
     * {@code doTableEre}. Any other value is logged and produces no result.
     *
     * @param truncateDTO the fragment to extract from
     * @return the extraction result, or {@code null} when the layout type is not supported
     */
    @Override
    public EREDTO doEre(TruncateDTO truncateDTO) {
        final String layoutType = truncateDTO.getLayoutType();
        final boolean textLayout = StrUtil.equals(layoutType, "0");
        if (!textLayout) {
            final boolean tableLayout = StrUtil.equals(layoutType, "1");
            if (!tableLayout) {
                // Unsupported layout: report it and let the caller deal with the missing result.
                log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
                return null;
            }
            return doTableEre(truncateDTO);
        }
        return doTextEre(truncateDTO);
    }

    /**
     * Extracts entities and relations from a text fragment.
     *
     * <p>Fills the {@code DOERE_TEXT} prompt template with the fragment content, sends it to the
     * chat model and parses the reply with {@code EREDTO.fromTextJson}.
     *
     * @param truncateDTO the text fragment
     * @return the parsed extraction result, tagged with the fragment id
     */
    private EREDTO doTextEre(TruncateDTO truncateDTO) {
        String template = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);
        // TODO: abnormal model responses are not handled yet.
        String reply = ollamaChatModel.call(String.format(template, truncateDTO.getContent()));
        return EREDTO.fromTextJson(reply, truncateDTO.getId());
    }

    /**
     * Extracts entities and relations from a table fragment.
     *
     * <p>Fills the {@code DOERE_TABLE} prompt template with the fragment content, sends it to the
     * chat model and parses the reply with {@code EREDTO.fromTableJson}.
     *
     * @param truncateDTO the table fragment
     * @return the parsed extraction result, tagged with the fragment id
     */
    private EREDTO doTableEre(TruncateDTO truncateDTO) {
        String template = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);
        // TODO: abnormal model responses are not handled yet.
        String reply = ollamaChatModel.call(String.format(template, truncateDTO.getContent()));
        return EREDTO.fromTableJson(reply, truncateDTO.getId());
    }

    /**
     * Merges per-fragment extraction results into de-duplicated groups.
     *
     * <p>Entities are de-duplicated by the key from {@code generateEntityMapKey} and relations by
     * the key from {@code generateRelationMapKey}; duplicates are folded together through
     * {@code mergeAttribute}. The result contains one {@link EREDTO} per relation whose source and
     * target entities were both found (holding those two entities and the relation), followed by
     * one final {@link EREDTO} holding every entity that takes part in no such relation. That
     * final element is always appended, even when it holds no entities.
     *
     * <p>{@code null} results (for example from {@code doEre} on an unsupported layout type) and
     * {@code null} entities or relations are skipped.
     *
     * @param eredtoList extraction results to merge; may be {@code null} or empty
     * @return the merged groups; an empty list when {@code eredtoList} is {@code null} or empty
     */
    @Override
    public List<EREDTO> mergeEreResults(List<EREDTO> eredtoList) {
        List<EREDTO> merged = new ArrayList<>();
        if (CollUtil.isEmpty(eredtoList)) {
            return merged;
        }
        // Insertion-ordered maps keep the output order stable across runs.
        Map<String, EntityExtractionDTO> entityMap = new LinkedHashMap<>();
        Map<String, RelationExtractionDTO> relationMap = new LinkedHashMap<>();
        for (EREDTO eredto : eredtoList) {
            if (null == eredto) {
                // doEre returns null for unsupported layout types; tolerate that here.
                continue;
            }
            List<EntityExtractionDTO> entities = eredto.getEntities();
            if (CollUtil.isNotEmpty(entities)) {
                for (EntityExtractionDTO entity : entities) {
                    if (null == entity) {
                        continue;
                    }
                    mergeAttribute(entityMap, entity, generateEntityMapKey(entity));
                }
            }
            List<RelationExtractionDTO> relations = eredto.getRelations();
            if (CollUtil.isNotEmpty(relations)) {
                for (RelationExtractionDTO relation : relations) {
                    if (null == relation) {
                        continue;
                    }
                    // Relations with identical source, target and relation name are the same record.
                    mergeAttribute(relationMap, relation, generateRelationMapKey(relation));
                }
            }
        }

        // Build the merged groups: relations together with their two endpoints come first.
        Set<String> relationEntityKey = new HashSet<>();
        for (RelationExtractionDTO value : relationMap.values()) {
            // The lookup key must have the same "<type>_<name>" shape as generateEntityMapKey.
            EntityExtractionDTO sourceEntity = entityMap.get(StrUtil.join("_", value.getSourceType(), value.getSource()));
            if (null == sourceEntity) {
                log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到头节点映射关系", value.getSourceType(), value.getSource());
                continue;
            }
            EntityExtractionDTO targetEntity = entityMap.get(StrUtil.join("_", value.getTargetType(), value.getTarget()));
            if (null == targetEntity) {
                log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到尾节点映射关系", value.getTargetType(), value.getTarget());
                continue;
            }
            EREDTO eredto = new EREDTO();
            eredto.setEntities(List.of(sourceEntity, targetEntity));
            eredto.setRelations(List.of(value));
            merged.add(eredto);
            relationEntityKey.add(generateEntityMapKey(sourceEntity));
            relationEntityKey.add(generateEntityMapKey(targetEntity));
        }

        // Entities that are not an endpoint of any emitted relation are grouped together.
        List<EntityExtractionDTO> leavedEntities = new ArrayList<>();
        for (Map.Entry<String, EntityExtractionDTO> entry : entityMap.entrySet()) {
            if (!relationEntityKey.contains(entry.getKey())) {
                leavedEntities.add(entry.getValue());
            }
        }
        EREDTO eredto = new EREDTO();
        eredto.setEntities(leavedEntities);
        merged.add(eredto);
        return merged;
    }

    /**
     * Registers a relation under {@code key}, or merges its attributes into the relation that is
     * already registered there.
     *
     * <p>When the key is new, the relation is stored as-is. Otherwise every incoming attribute
     * with a non-empty name and value is added to the stored relation unless the stored relation
     * already has an attribute with that name; existing attributes always win.
     *
     * @param entityMap relations collected so far, keyed by relation key; updated in place
     * @param relation  the relation to register or merge
     * @param key       the key of {@code relation} in {@code entityMap}
     */
    private void mergeAttribute(Map<String, RelationExtractionDTO> entityMap,RelationExtractionDTO relation, String key) {
        RelationExtractionDTO cachedEntity = entityMap.get(key);
        if (null == cachedEntity) {
            entityMap.put(key, relation);
            return;
        }
        List<ERAttributeDTO> incoming = relation.getAttributes();
        if (CollUtil.isEmpty(incoming)) {
            return;
        }
        // Merge into the CACHED relation; merging into the incoming one would be a no-op.
        // Work on a copy so an unmodifiable attribute list on the DTO cannot make add() fail.
        List<ERAttributeDTO> existing = cachedEntity.getAttributes();
        List<ERAttributeDTO> mergedAttributes = null == existing ? new ArrayList<>() : new ArrayList<>(existing);
        Set<String> knownNames = new HashSet<>();
        for (ERAttributeDTO known : mergedAttributes) {
            if (null != known) {
                knownNames.add(known.getAttribute());
            }
        }
        boolean changed = false;
        for (ERAttributeDTO attribute : incoming) {
            if (null == attribute) {
                continue;
            }
            String attributeKey = attribute.getAttribute();
            String attributeValue = attribute.getValue();
            if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)) {
                continue;
            }
            // Skip attributes whose name is already present on the cached relation.
            if (knownNames.add(attributeKey)) {
                mergedAttributes.add(attribute);
                changed = true;
            }
        }
        if (changed) {
            // NOTE(review): relies on a setAttributes setter on RelationExtractionDTO, which is
            // not visible in this file — confirm.
            cachedEntity.setAttributes(mergedAttributes);
        }
    }
    /**
     * Registers an entity under {@code key}, or merges its attributes into the entity that is
     * already registered there.
     *
     * <p>When the key is new, the entity is stored as-is. Otherwise every incoming attribute with
     * a non-empty name and value is added to the stored entity unless the stored entity already
     * has an attribute with that name; existing attributes always win.
     *
     * @param entityMap entities collected so far, keyed by entity key; updated in place
     * @param entity    the entity to register or merge
     * @param key       the key of {@code entity} in {@code entityMap}
     */
    private void mergeAttribute(Map<String, EntityExtractionDTO> entityMap,EntityExtractionDTO entity, String key) {
        EntityExtractionDTO cachedEntity = entityMap.get(key);
        if (null == cachedEntity) {
            entityMap.put(key, entity);
            return;
        }
        List<ERAttributeDTO> incoming = entity.getAttributes();
        if (CollUtil.isEmpty(incoming)) {
            return;
        }
        // Merge into the CACHED entity; merging into the incoming one would be a no-op.
        // Work on a copy so an unmodifiable attribute list on the DTO cannot make add() fail.
        List<ERAttributeDTO> existing = cachedEntity.getAttributes();
        List<ERAttributeDTO> mergedAttributes = null == existing ? new ArrayList<>() : new ArrayList<>(existing);
        Set<String> knownNames = new HashSet<>();
        for (ERAttributeDTO known : mergedAttributes) {
            if (null != known) {
                knownNames.add(known.getAttribute());
            }
        }
        boolean changed = false;
        for (ERAttributeDTO attribute : incoming) {
            if (null == attribute) {
                continue;
            }
            String attributeKey = attribute.getAttribute();
            String attributeValue = attribute.getValue();
            if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)) {
                continue;
            }
            // Skip attributes whose name is already present on the cached entity.
            if (knownNames.add(attributeKey)) {
                mergedAttributes.add(attribute);
                changed = true;
            }
        }
        if (changed) {
            // NOTE(review): relies on a setAttributes setter on EntityExtractionDTO, which is
            // not visible in this file — confirm.
            cachedEntity.setAttributes(mergedAttributes);
        }
    }

    /**
     * Builds the map key of an entity as {@code <entity>_<name>}.
     *
     * @param entityExtractionDTO the entity to build the key for
     * @return the entity value and the name joined by an underscore
     */
    private String generateEntityMapKey(EntityExtractionDTO entityExtractionDTO) {
        // String.valueOf renders null as "null", exactly like string concatenation does.
        return String.join("_",
                String.valueOf(entityExtractionDTO.getEntity()),
                String.valueOf(entityExtractionDTO.getName()));
    }

    /**
     * Builds the map key of a relation as {@code <source>_<target>_<relation>}.
     *
     * @param relationExtractionDTO the relation to build the key for
     * @return the source, target and relation values joined by underscores
     */
    private String generateRelationMapKey(RelationExtractionDTO relationExtractionDTO) {
        // String.valueOf renders null as "null", exactly like string concatenation does.
        return String.join("_",
                String.valueOf(relationExtractionDTO.getSource()),
                String.valueOf(relationExtractionDTO.getTarget()),
                String.valueOf(relationExtractionDTO.getRelation()));
    }
}
初始化代码 2 months ago			`package com.supervision.pdfqaserver.service.impl;`

			`import cn.hutool.core.collection.CollUtil;`
			`import cn.hutool.core.util.StrUtil;`
			`import com.supervision.pdfqaserver.cache.PromptCache;`
初始化代码 2 months ago			`import com.supervision.pdfqaserver.constant.LayoutTypeEnum;`
初始化代码 2 months ago			`import com.supervision.pdfqaserver.dto.*;`
			`import com.supervision.pdfqaserver.service.TripleConversionPipeline;`
初始化代码 2 months ago			`import edu.stanford.nlp.pipeline.CoreDocument;`
			`import edu.stanford.nlp.pipeline.CoreSentence;`
			`import edu.stanford.nlp.pipeline.StanfordCoreNLP;`
初始化代码 2 months ago			`import lombok.RequiredArgsConstructor;`
			`import lombok.extern.slf4j.Slf4j;`
			`import org.springframework.ai.ollama.OllamaChatModel;`
			`import org.springframework.stereotype.Service;`

初始化代码 2 months ago			`import java.util.*;`

初始化代码 2 months ago			`@Slf4j`
			`@Service`
			`@RequiredArgsConstructor`
			`public class TripleConversionPipelineImpl implements TripleConversionPipeline {`

			`private final OllamaChatModel ollamaChatModel;`

			`@Override`
			`public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) {`
			`// 对pdfAnalysisOutputs进行排序`
			`List<DocumentDTO> documentDTOList = documents.stream().sorted(`
			`// 先对pageNo进行排序再对layoutOrder进行排序`
			`(o1, o2) -> {`
			`if (o1.getPageNo().equals(o2.getPageNo())) {`
			`return Integer.compare(o1.getLayoutOrder(), o2.getLayoutOrder());`
			`}`
			`return Integer.compare(o1.getPageNo(), o2.getPageNo());`
			`}`
			`).toList();`
初始化代码 2 months ago
			`Properties props = new Properties();`
			`props.setProperty("annotators", "tokenize, ssplit");`
			`// 创建管道`
			`StanfordCoreNLP pipeline = new StanfordCoreNLP(props);`
			`List<TruncateDTO> truncateDTOS = new ArrayList<>();`
			`for (DocumentDTO documentDTO : documentDTOList) {`
			`String content = documentDTO.getContent();`
			`if (StrUtil.isEmpty(content)){`
			`continue;`
			`}`
			`Integer layoutType = documentDTO.getLayoutType();`
			`if (LayoutTypeEnum.TEXT.getCode() == layoutType){`
			`// 如果是文本类型的布局，进行合并`
			`CoreDocument document = new CoreDocument(content);`
			`// 分析文本`
			`pipeline.annotate(document);`
			`// 获取句子`
			`for (CoreSentence sentence : document.sentences()) {`
			`TruncateDTO truncateDTO = new TruncateDTO(documentDTO);`
			`truncateDTO.setContent(sentence.text());`
			`truncateDTOS.add(truncateDTO);`
			`}`
			`} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {`
			`// 如果是表格类型的布局，直接添加到列表中`
			`TruncateDTO truncateDTO = new TruncateDTO(documentDTO);`
			`truncateDTOS.add(truncateDTO);`
			`} else {`
			`log.info("sliceDocuments:错误的布局类型: {}", layoutType);`
			`}`
			`}`
			`return truncateDTOS;`
初始化代码 2 months ago			`}`

			`@Override`
			`public EREDTO doEre(TruncateDTO truncateDTO) {`

			`if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){`

初始化代码 2 months ago			`return doTextEre(truncateDTO);`
初始化代码 2 months ago			`}`

			`if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){`
初始化代码 2 months ago			`return doTableEre(truncateDTO);`
初始化代码 2 months ago			`}`
			`log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());`
			`return null;`
			`}`

			`private EREDTO doTextEre(TruncateDTO truncateDTO) {`
			`String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TEXT);`
			`String formatted = String.format(prompt, truncateDTO.getContent());`
			`String response = ollamaChatModel.call(formatted);`
			`// todo:暂时不去处理异常返回`

			`return EREDTO.fromTextJson(response, truncateDTO.getId());`
			`}`

			`private EREDTO doTableEre(TruncateDTO truncateDTO) {`
			`String prompt = PromptCache.promptMap.get(PromptCache.DOERE_TABLE);`
			`String formatted = String.format(prompt, truncateDTO.getContent());`
			`String response = ollamaChatModel.call(formatted);`
			`// todo:暂时不去处理异常返回`

			`return EREDTO.fromTableJson(response, truncateDTO.getId());`
			`}`

			`/**`
			`* 合并实体关系抽取结果主要是对实体和关系中的属性进行合并`
			`* @param eredtoList 实体关系抽取结果列表`
			`* @return`
			`*/`
			`@Override`
			`public List<EREDTO> mergeEreResults(List<EREDTO> eredtoList) {`
			`List<EREDTO> merged = new ArrayList<>();`
			`if (CollUtil.isEmpty(eredtoList)){`
			`return merged;`
			`}`
初始化代码 2 months ago			`Map<String, EntityExtractionDTO> entityMap = new HashMap<>();`
			`Map<String, RelationExtractionDTO> relationMap = new HashMap<>();`
初始化代码 2 months ago			`for (EREDTO eredto : eredtoList) {`
			`List<EntityExtractionDTO> entities = eredto.getEntities();`
			`if (CollUtil.isNotEmpty(entities)){`
			`for (EntityExtractionDTO entity : entities) {`
初始化代码 2 months ago			`String key = generateEntityMapKey(entity);`
			`mergeAttribute(entityMap,entity, key);`
初始化代码 2 months ago			`}`
			`}`
			`List<RelationExtractionDTO> relations = eredto.getRelations();`
			`if (CollUtil.isNotEmpty(relations)){`
			`for (RelationExtractionDTO relation : relations) {`
			`// source和target,re完全相等看作是同一个数据`
初始化代码 2 months ago			`String relationMapKey = generateRelationMapKey(relation);`
			`mergeAttribute(relationMap,relation, relationMapKey);`
初始化代码 2 months ago			`}`
			`}`
			`}`
初始化代码 2 months ago			`// 利用合并后的map生成新的EREDTO`
			`// 优先先把有关系的节点与关系组合在一次`
			`Set<String> relationEntityKey = new HashSet<>();`
			`for (Map.Entry<String, RelationExtractionDTO> relationEntry : relationMap.entrySet()) {`
			`RelationExtractionDTO value = relationEntry.getValue();`
			`EntityExtractionDTO sourceEntity = entityMap.get(StrUtil.join("_", value.getSourceType(), value.getSource()));`
			`if (null == sourceEntity){`
			`log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到头节点映射关系", value.getSourceType(), value.getSource());`
			`continue;`
			`}`
			`EntityExtractionDTO targetEntity = entityMap.get(StrUtil.join("_", value.getTargetType(), value.getTarget()));`
			`if (null == targetEntity){`
			`log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到尾节点映射关系", value.getTargetType(), value.getTarget());`
			`continue;`
			`}`
			`EREDTO eredto = new EREDTO();`
			`eredto.setEntities(List.of(sourceEntity,targetEntity));`
			`eredto.setRelations(List.of(value));`
			`merged.add(eredto);`
			`relationEntityKey.addAll(List.of(generateEntityMapKey(sourceEntity),generateEntityMapKey(targetEntity)));`
			`}`
			`// 将没有关系的节点单独放在一起`
			`List<EntityExtractionDTO> leavedEntities = new ArrayList<>();`
			`for (Map.Entry<String, EntityExtractionDTO> entry : entityMap.entrySet()) {`
			`if (!relationEntityKey.contains(entry.getKey())){`
			`leavedEntities.add(entry.getValue());`
			`}`
			`}`
			`EREDTO eredto = new EREDTO();`
			`eredto.setEntities(leavedEntities);`
			`merged.add(eredto);`
			`return merged;`
			`}`
初始化代码 2 months ago
初始化代码 2 months ago			`private void mergeAttribute(Map<String, RelationExtractionDTO> entityMap,RelationExtractionDTO relation, String key) {`

			`RelationExtractionDTO cachedEntity = entityMap.get(key);`
			`if (null == cachedEntity){`
			`entityMap.put(key, relation);`
			`}else {`
			`if (CollUtil.isEmpty(relation.getAttributes())){`
			`return;`
			`}`
			`// 合并属性`
			`List<ERAttributeDTO> attributes = relation.getAttributes();`
			`if (null == attributes){`
			`attributes = new ArrayList<>();`
			`}`
			`for (ERAttributeDTO attribute : relation.getAttributes()) {`
			`String attributeKey = attribute.getAttribute();`
			`String attributeValue = attribute.getValue();`
			`if (StrUtil.isEmpty(attributeKey) \|\| StrUtil.isEmpty(attributeValue)){`
			`continue;`
			`}`
			`// 如果属性已经存在，则不添加`
			`if (attributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) {`
			`attributes.add(attribute);`
			`}`
			`}`
			`}`
			`}`
			`private void mergeAttribute(Map<String, EntityExtractionDTO> entityMap,EntityExtractionDTO entity, String key) {`

			`EntityExtractionDTO cachedEntity = entityMap.get(key);`
			`if (null == cachedEntity){`
			`entityMap.put(key, entity);`
			`}else {`
			`if (CollUtil.isEmpty(entity.getAttributes())){`
			`return;`
			`}`
			`// 合并属性`
			`List<ERAttributeDTO> attributes = entity.getAttributes();`
			`if (null == attributes){`
			`attributes = new ArrayList<>();`
			`}`
			`for (ERAttributeDTO attribute : entity.getAttributes()) {`
			`String attributeKey = attribute.getAttribute();`
			`String attributeValue = attribute.getValue();`
			`if (StrUtil.isEmpty(attributeKey) \|\| StrUtil.isEmpty(attributeValue)){`
			`continue;`
			`}`
			`// 如果属性已经存在，则不添加`
			`if (attributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) {`
			`attributes.add(attribute);`
			`}`
			`}`
			`}`
			`}`

			`private String generateEntityMapKey(EntityExtractionDTO entityExtractionDTO) {`
			`return entityExtractionDTO.getEntity() + "_" + entityExtractionDTO.getName();`
			`}`

			`private String generateRelationMapKey(RelationExtractionDTO relationExtractionDTO) {`
			`return relationExtractionDTO.getSource() + "_" + relationExtractionDTO.getTarget() + "_" + relationExtractionDTO.getRelation();`
初始化代码 2 months ago			`}`
			`}`