|
|
@ -3,15 +3,19 @@ package com.supervision.pdfqaserver.service.impl;
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
import com.supervision.pdfqaserver.cache.PromptCache;
|
|
|
|
import com.supervision.pdfqaserver.cache.PromptCache;
|
|
|
|
|
|
|
|
import com.supervision.pdfqaserver.constant.LayoutTypeEnum;
|
|
|
|
import com.supervision.pdfqaserver.dto.*;
|
|
|
|
import com.supervision.pdfqaserver.dto.*;
|
|
|
|
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
|
|
|
|
import com.supervision.pdfqaserver.service.TripleConversionPipeline;
|
|
|
|
|
|
|
|
import edu.stanford.nlp.pipeline.CoreDocument;
|
|
|
|
|
|
|
|
import edu.stanford.nlp.pipeline.CoreSentence;
|
|
|
|
|
|
|
|
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
|
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
import org.springframework.ai.ollama.OllamaChatModel;
|
|
|
|
import org.springframework.ai.ollama.OllamaChatModel;
|
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
import org.springframework.stereotype.Service;
|
|
|
|
|
|
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.*;
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
@Slf4j
|
|
|
|
@Slf4j
|
|
|
|
@Service
|
|
|
|
@Service
|
|
|
|
@RequiredArgsConstructor
|
|
|
|
@RequiredArgsConstructor
|
|
|
@ -31,7 +35,38 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
return Integer.compare(o1.getPageNo(), o2.getPageNo());
|
|
|
|
return Integer.compare(o1.getPageNo(), o2.getPageNo());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
).toList();
|
|
|
|
).toList();
|
|
|
|
return null;
|
|
|
|
|
|
|
|
|
|
|
|
Properties props = new Properties();
|
|
|
|
|
|
|
|
props.setProperty("annotators", "tokenize, ssplit");
|
|
|
|
|
|
|
|
// 创建管道
|
|
|
|
|
|
|
|
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
|
|
|
|
|
|
|
|
List<TruncateDTO> truncateDTOS = new ArrayList<>();
|
|
|
|
|
|
|
|
for (DocumentDTO documentDTO : documentDTOList) {
|
|
|
|
|
|
|
|
String content = documentDTO.getContent();
|
|
|
|
|
|
|
|
if (StrUtil.isEmpty(content)){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
Integer layoutType = documentDTO.getLayoutType();
|
|
|
|
|
|
|
|
if (LayoutTypeEnum.TEXT.getCode() == layoutType){
|
|
|
|
|
|
|
|
// 如果是文本类型的布局,进行合并
|
|
|
|
|
|
|
|
CoreDocument document = new CoreDocument(content);
|
|
|
|
|
|
|
|
// 分析文本
|
|
|
|
|
|
|
|
pipeline.annotate(document);
|
|
|
|
|
|
|
|
// 获取句子
|
|
|
|
|
|
|
|
for (CoreSentence sentence : document.sentences()) {
|
|
|
|
|
|
|
|
TruncateDTO truncateDTO = new TruncateDTO(documentDTO);
|
|
|
|
|
|
|
|
truncateDTO.setContent(sentence.text());
|
|
|
|
|
|
|
|
truncateDTOS.add(truncateDTO);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
|
|
|
|
|
|
|
|
// 如果是表格类型的布局,直接添加到列表中
|
|
|
|
|
|
|
|
TruncateDTO truncateDTO = new TruncateDTO(documentDTO);
|
|
|
|
|
|
|
|
truncateDTOS.add(truncateDTO);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
log.info("sliceDocuments:错误的布局类型: {}", layoutType);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return truncateDTOS;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
@ -39,13 +74,11 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
|
|
|
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),"0")){
|
|
|
|
|
|
|
|
|
|
|
|
EREDTO eredto = doTextEre(truncateDTO);
|
|
|
|
return doTextEre(truncateDTO);
|
|
|
|
return eredto;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){
|
|
|
|
if (StrUtil.equals(truncateDTO.getLayoutType(),"1")){
|
|
|
|
EREDTO eredto = doTableEre(truncateDTO);
|
|
|
|
return doTableEre(truncateDTO);
|
|
|
|
return eredto;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
|
|
|
|
log.info("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
|
|
|
|
return null;
|
|
|
|
return null;
|
|
|
@ -80,26 +113,119 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
if (CollUtil.isEmpty(eredtoList)){
|
|
|
|
if (CollUtil.isEmpty(eredtoList)){
|
|
|
|
return merged;
|
|
|
|
return merged;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Map<String, EntityExtractionDTO> entityMap = new HashMap<>();
|
|
|
|
|
|
|
|
Map<String, RelationExtractionDTO> relationMap = new HashMap<>();
|
|
|
|
for (EREDTO eredto : eredtoList) {
|
|
|
|
for (EREDTO eredto : eredtoList) {
|
|
|
|
List<EntityExtractionDTO> entities = eredto.getEntities();
|
|
|
|
List<EntityExtractionDTO> entities = eredto.getEntities();
|
|
|
|
if (CollUtil.isNotEmpty(entities)){
|
|
|
|
if (CollUtil.isNotEmpty(entities)){
|
|
|
|
for (EntityExtractionDTO entity : entities) {
|
|
|
|
for (EntityExtractionDTO entity : entities) {
|
|
|
|
String e = entity.getEntity();
|
|
|
|
String key = generateEntityMapKey(entity);
|
|
|
|
String name = entity.getName();
|
|
|
|
mergeAttribute(entityMap,entity, key);
|
|
|
|
// entity.getEntity() 和 entity.getName() 完全相等看作是同一个数据
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
List<RelationExtractionDTO> relations = eredto.getRelations();
|
|
|
|
List<RelationExtractionDTO> relations = eredto.getRelations();
|
|
|
|
if (CollUtil.isNotEmpty(relations)){
|
|
|
|
if (CollUtil.isNotEmpty(relations)){
|
|
|
|
for (RelationExtractionDTO relation : relations) {
|
|
|
|
for (RelationExtractionDTO relation : relations) {
|
|
|
|
String source = relation.getSource();
|
|
|
|
|
|
|
|
String target = relation.getTarget();
|
|
|
|
|
|
|
|
String re = relation.getRelation();
|
|
|
|
|
|
|
|
// source和target,re完全相等看作是同一个数据
|
|
|
|
// source和target,re完全相等看作是同一个数据
|
|
|
|
|
|
|
|
String relationMapKey = generateRelationMapKey(relation);
|
|
|
|
|
|
|
|
mergeAttribute(relationMap,relation, relationMapKey);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// 利用合并后的map生成新的EREDTO
|
|
|
|
|
|
|
|
// 优先先把有关系的节点与关系组合在一次
|
|
|
|
|
|
|
|
Set<String> relationEntityKey = new HashSet<>();
|
|
|
|
|
|
|
|
for (Map.Entry<String, RelationExtractionDTO> relationEntry : relationMap.entrySet()) {
|
|
|
|
|
|
|
|
RelationExtractionDTO value = relationEntry.getValue();
|
|
|
|
|
|
|
|
EntityExtractionDTO sourceEntity = entityMap.get(StrUtil.join("_", value.getSourceType(), value.getSource()));
|
|
|
|
|
|
|
|
if (null == sourceEntity){
|
|
|
|
|
|
|
|
log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到头节点映射关系", value.getSourceType(), value.getSource());
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
EntityExtractionDTO targetEntity = entityMap.get(StrUtil.join("_", value.getTargetType(), value.getTarget()));
|
|
|
|
|
|
|
|
if (null == targetEntity){
|
|
|
|
|
|
|
|
log.warn("mergeEreResults:根据entity:{},name:{}未在entityMap中找到尾节点映射关系", value.getTargetType(), value.getTarget());
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
|
|
|
|
|
eredto.setEntities(List.of(sourceEntity,targetEntity));
|
|
|
|
|
|
|
|
eredto.setRelations(List.of(value));
|
|
|
|
|
|
|
|
merged.add(eredto);
|
|
|
|
|
|
|
|
relationEntityKey.addAll(List.of(generateEntityMapKey(sourceEntity),generateEntityMapKey(targetEntity)));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// 将没有关系的节点单独放在一起
|
|
|
|
|
|
|
|
List<EntityExtractionDTO> leavedEntities = new ArrayList<>();
|
|
|
|
|
|
|
|
for (Map.Entry<String, EntityExtractionDTO> entry : entityMap.entrySet()) {
|
|
|
|
|
|
|
|
if (!relationEntityKey.contains(entry.getKey())){
|
|
|
|
|
|
|
|
leavedEntities.add(entry.getValue());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
EREDTO eredto = new EREDTO();
|
|
|
|
|
|
|
|
eredto.setEntities(leavedEntities);
|
|
|
|
|
|
|
|
merged.add(eredto);
|
|
|
|
|
|
|
|
return merged;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private void mergeAttribute(Map<String, RelationExtractionDTO> entityMap,RelationExtractionDTO relation, String key) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
RelationExtractionDTO cachedEntity = entityMap.get(key);
|
|
|
|
|
|
|
|
if (null == cachedEntity){
|
|
|
|
|
|
|
|
entityMap.put(key, relation);
|
|
|
|
|
|
|
|
}else {
|
|
|
|
|
|
|
|
if (CollUtil.isEmpty(relation.getAttributes())){
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// 合并属性
|
|
|
|
|
|
|
|
List<ERAttributeDTO> attributes = relation.getAttributes();
|
|
|
|
|
|
|
|
if (null == attributes){
|
|
|
|
|
|
|
|
attributes = new ArrayList<>();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
for (ERAttributeDTO attribute : relation.getAttributes()) {
|
|
|
|
|
|
|
|
String attributeKey = attribute.getAttribute();
|
|
|
|
|
|
|
|
String attributeValue = attribute.getValue();
|
|
|
|
|
|
|
|
if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// 如果属性已经存在,则不添加
|
|
|
|
|
|
|
|
if (attributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) {
|
|
|
|
|
|
|
|
attributes.add(attribute);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private void mergeAttribute(Map<String, EntityExtractionDTO> entityMap,EntityExtractionDTO entity, String key) {
|
|
|
|
|
|
|
|
|
|
|
|
return null;
|
|
|
|
EntityExtractionDTO cachedEntity = entityMap.get(key);
|
|
|
|
|
|
|
|
if (null == cachedEntity){
|
|
|
|
|
|
|
|
entityMap.put(key, entity);
|
|
|
|
|
|
|
|
}else {
|
|
|
|
|
|
|
|
if (CollUtil.isEmpty(entity.getAttributes())){
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// 合并属性
|
|
|
|
|
|
|
|
List<ERAttributeDTO> attributes = entity.getAttributes();
|
|
|
|
|
|
|
|
if (null == attributes){
|
|
|
|
|
|
|
|
attributes = new ArrayList<>();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
for (ERAttributeDTO attribute : entity.getAttributes()) {
|
|
|
|
|
|
|
|
String attributeKey = attribute.getAttribute();
|
|
|
|
|
|
|
|
String attributeValue = attribute.getValue();
|
|
|
|
|
|
|
|
if (StrUtil.isEmpty(attributeKey) || StrUtil.isEmpty(attributeValue)){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// 如果属性已经存在,则不添加
|
|
|
|
|
|
|
|
if (attributes.stream().noneMatch(a -> StrUtil.equals(a.getAttribute(), attributeKey))) {
|
|
|
|
|
|
|
|
attributes.add(attribute);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private String generateEntityMapKey(EntityExtractionDTO entityExtractionDTO) {
|
|
|
|
|
|
|
|
return entityExtractionDTO.getEntity() + "_" + entityExtractionDTO.getName();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private String generateRelationMapKey(RelationExtractionDTO relationExtractionDTO) {
|
|
|
|
|
|
|
|
return relationExtractionDTO.getSource() + "_" + relationExtractionDTO.getTarget() + "_" + relationExtractionDTO.getRelation();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|