修改表提取方式为程序快速提取

v_0.0.2
xueqingkun 3 weeks ago
parent 8b4ec18483
commit 1fb2c75dc3

@ -90,6 +90,16 @@
<artifactId>neo4j-java-driver</artifactId>
<version>5.15.0</version>
</dependency>
<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark</artifactId>
<version>0.21.0</version>
</dependency>
<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark-ext-gfm-tables</artifactId>
<version>0.21.0</version>
</dependency>
</dependencies>
<build>
<plugins>

@ -2,6 +2,7 @@ package com.supervision.pdfqaserver.dto;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.lang.UUID;
import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.RandomUtil;
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson.JSONArray;
@ -109,6 +110,46 @@ public class EREDTO {
return eredto;
}
/**
 * Builds an {@link EREDTO} from an already-parsed table: every non-empty row becomes one
 * "行" (row) entity whose attributes are the (header, cell) pairs of that row.
 *
 * <p>Blank headers and blank cells are skipped, and header/cell text is trimmed. A row that
 * is shorter than the header list simply contributes no attribute for the missing columns.
 *
 * @param heads        column headers of the table, in column order
 * @param rows         table body rows; each inner list holds the cell texts in column order
 * @param truncationId id of the truncation (document slice) the table came from; stored on every entity
 * @return an {@link EREDTO} carrying one entity per non-empty row; a fresh {@link EREDTO}
 *         with no row entities added when {@code heads} or {@code rows} is empty
 */
public static EREDTO fromHeadAndRows(List<String> heads,List<List<String>> rows,String truncationId) {
    EREDTO eredto = new EREDTO();
    if (CollUtil.isEmpty(heads) || CollUtil.isEmpty(rows)){
        return eredto;
    }
    List<EntityExtractionDTO> entities = new ArrayList<>();
    for (List<String> row : rows) {
        if (CollUtil.isEmpty(row)){
            continue;
        }
        EntityExtractionDTO entityExtractionDTO = new EntityExtractionDTO();
        entityExtractionDTO.setEntity("行");
        // Give every row a random suffix so that row names do not collide within a table.
        entityExtractionDTO.setName("行-" + RandomUtil.randomString(UUID.randomUUID().toString(), 10));
        entityExtractionDTO.setTruncationId(truncationId);
        List<TruncationERAttributeDTO> truncationErAttributeDTOS = new ArrayList<>();
        for (int i = 0; i < heads.size(); i++) {
            String key = heads.get(i);
            if (StrUtil.isBlank(key)){
                continue;
            }
            key = StrUtil.trim(key);
            // Rows may be shorter than the header list; treat missing cells as empty.
            String value = i < row.size() ? row.get(i) : "";
            if (StrUtil.isBlank(value)){
                continue;
            }
            value = StrUtil.trim(value);
            // Third argument is "1" for numeric cell values and "0" otherwise.
            TruncationERAttributeDTO truncationErAttributeDTO = new TruncationERAttributeDTO(key, value, NumberUtil.isNumber(value) ? "1" : "0");
            truncationErAttributeDTOS.add(truncationErAttributeDTO);
        }
        entityExtractionDTO.setAttributes(truncationErAttributeDTOS);
        entities.add(entityExtractionDTO);
    }
    // Attach the collected row entities to the result; without this the rows built above
    // were discarded and the method always returned an empty DTO.
    eredto.setEntities(entities);
    return eredto;
}
public static EREDTO fromTableJson(String json,String truncationId) {
EREDTO eredto = new EREDTO();

@ -0,0 +1,98 @@
package com.supervision.pdfqaserver.service;
import org.commonmark.ext.gfm.tables.*;
import org.commonmark.node.*;
import java.util.ArrayList;
import java.util.List;
/**
 * commonmark visitor that collects the header row and body rows of GFM tables
 * (nodes produced by the {@code commonmark-ext-gfm-tables} extension).
 *
 * <p>Usage: parse the markdown with a parser that has the tables extension enabled, call
 * {@code document.accept(visitor)}, then read {@link #getTableHeaders()} and
 * {@link #getTableRows()}.
 *
 * <p>If the visited document contains several tables, the headers are those of the last
 * table visited while body rows of all tables are accumulated into one list.
 */
public class TableVisitor extends AbstractVisitor {
    /** True while the children of a {@link TableHead} are being visited. */
    private boolean inHeader = false;
    /** True while the children of a {@link TableBody} are being visited. */
    private boolean inBody = false;
    /** Cells of the row currently being visited; {@code null} before the first row. */
    private List<String> currentRow = null;
    /** Header cells of the most recently visited header row. */
    private List<String> headers = new ArrayList<>();
    /** Body rows collected so far, each as a list of cell texts. */
    private final List<List<String>> rows = new ArrayList<>();

    /** Routes {@link TableBlock} nodes to the table handler; other custom blocks get default traversal. */
    @Override
    public void visit(CustomBlock customBlock) {
        if (customBlock instanceof TableBlock) {
            handleTableBlock((TableBlock) customBlock);
        } else {
            super.visit(customBlock);
        }
    }

    /** Dispatches the table-related custom nodes (head, body, row, cell) to their handlers. */
    @Override
    public void visit(CustomNode customNode) {
        if (customNode instanceof TableHead) {
            handleTableHead((TableHead) customNode);
        } else if (customNode instanceof TableBody) {
            handleTableBody((TableBody) customNode);
        } else if (customNode instanceof TableRow) {
            handleTableRow((TableRow) customNode);
        } else if (customNode instanceof TableCell) {
            handleTableCell((TableCell) customNode);
        } else {
            super.visit(customNode);
        }
    }

    /** Resets the head/body flags and descends into the table. */
    private void handleTableBlock(TableBlock tableBlock) {
        // Reset section state before walking a new table.
        inHeader = false;
        inBody = false;
        visitChildren(tableBlock);
    }

    /** Visits the header section with {@code inHeader} set. */
    private void handleTableHead(TableHead tableHead) {
        inHeader = true;
        visitChildren(tableHead);
        inHeader = false;
    }

    /** Visits the body section with {@code inBody} set. */
    private void handleTableBody(TableBody tableBody) {
        inBody = true;
        visitChildren(tableBody);
        inBody = false;
    }

    /** Collects the cells of one row and stores it as the header row or as a body row. */
    private void handleTableRow(TableRow tableRow) {
        currentRow = new ArrayList<>();
        visitChildren(tableRow);
        if (inHeader) {
            this.headers = currentRow;
        } else if (inBody) {
            this.rows.add(currentRow);
        }
    }

    /** Appends the cell's plain text to the current row, then continues the traversal. */
    private void handleTableCell(TableCell tableCell) {
        if (currentRow != null) {
            currentRow.add(getTextContent(tableCell));
        }
        visitChildren(tableCell);
    }

    /**
     * Returns the trimmed plain text of {@code node}, including text nested inside inline
     * formatting such as emphasis, strong emphasis and links, and the literal of inline code.
     */
    private String getTextContent(Node node) {
        StringBuilder sb = new StringBuilder();
        appendText(node, sb);
        return sb.toString().trim();
    }

    /**
     * Depth-first collection of text literals below {@code node}. Reading only the direct
     * {@link Text} children would drop the content of cells such as {@code **total**},
     * whose text sits one level deeper inside the formatting node.
     */
    private void appendText(Node node, StringBuilder sb) {
        for (Node child = node.getFirstChild(); child != null; child = child.getNext()) {
            if (child instanceof Text) {
                sb.append(((Text) child).getLiteral());
            } else if (child instanceof Code) {
                sb.append(((Code) child).getLiteral());
            } else {
                appendText(child, sb);
            }
        }
    }

    /** @return header cells of the last visited table header; empty if no header was seen */
    public List<String> getTableHeaders() {
        return headers;
    }

    /** @return all collected body rows, in document order */
    public List<List<String>> getTableRows() {
        return rows;
    }
}

@ -208,6 +208,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
log.info("意图元数据识别完成,切分文档id:{},耗时:{}毫秒", truncateDTO.getId(),interval.intervalMs("makeOutDomainMetadata"));
// 保存意图数据
intentSize ++;
index ++;
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
for (Intention intention : intentions) {
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
@ -215,7 +216,7 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
}
}catch (Exception e){
intentSize ++;
index ++;
log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
}
@ -284,15 +285,13 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
try {
if (StrUtil.equals(truncateDTO.getLayoutType(), String.valueOf(LayoutTypeEnum.TABLE.getCode()))){
log.info("切分文档id:{},表格类型数据,不进行意图识别...", truncateDTO.getId());
/*EREDTO eredto = conversionPipeline.doEre(truncateDTO, new ArrayList<>());
EREDTO eredto = conversionPipeline.doEre(truncateDTO, new ArrayList<>());
if (null == eredto){
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
continue;
}
this.saveERE(eredto, truncateDTO.getId());
eredtos.add(eredto);
*/
continue;
}
timer.start("makeOutTruncationIntent");

@ -18,6 +18,10 @@ import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.commonmark.Extension;
import org.commonmark.ext.gfm.tables.TablesExtension;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
@ -216,7 +220,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
return doTextEre(truncateDTO);
}
return doTableEre(truncateDTO);
return doTableEreFast(truncateDTO);
}
log.warn("doEre:错误的布局类型: {}", truncateDTO.getLayoutType());
return null;
@ -225,7 +229,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
/**
*
*
* : 10001000
* : maxTextLengthmaxTextLength
* : 4
* @param documents
* @return
@ -252,10 +256,8 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
// 创建管道
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
List<TruncateDTO> truncateDTOS = new ArrayList<>();
StringBuilder truncateTextBuild = new StringBuilder(1500);
DocumentDTO documentDTOLast = null;
StringBuilder truncateTextBuild = new StringBuilder(minTextLength + maxTextLength);
for (DocumentDTO documentDTO : documentDTOList) {
documentDTOLast = documentDTO;
String content = documentDTO.getContent();
if (StrUtil.isEmpty(content)) {
continue;
@ -293,7 +295,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
// 如果是表格类型的布局,进行切分
// 出现表格后如果truncateTextBuild不为空单独作为一个片段
if (!truncateTextBuild.isEmpty()) {
truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString()));
TruncateDTO truncateDTO = new TruncateDTO(documentDTO, truncateTextBuild.toString());
truncateDTO.setLayoutType(String.valueOf(LayoutTypeEnum.TEXT.getCode()));//强制设置为文本类型
truncateDTOS.add(truncateDTO);
}
// 提前抽取表名
TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle());
@ -331,8 +335,10 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("sliceDocuments:错误的布局类型: {}", layoutType);
}
}
if (!truncateTextBuild.isEmpty() && null != documentDTOLast) {
truncateDTOS.add(new TruncateDTO(documentDTOLast, truncateTextBuild.toString()));
if (!truncateTextBuild.isEmpty() && null != CollUtil.getLast(documentDTOList)) {
TruncateDTO truncateDTO = new TruncateDTO(CollUtil.getLast(documentDTOList), truncateTextBuild.toString());
truncateDTO.setLayoutType(String.valueOf(LayoutTypeEnum.TEXT.getCode()));//强制设置为文本类型
truncateDTOS.add(truncateDTO);
}
return truncateDTOS;
}
@ -465,6 +471,33 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
log.info("doTableEre响应结果:{}", response);
EREDTO eredto = EREDTO.fromTableJson(response, truncateDTO.getId());
// 手动设置表格标题
manualSetTableTitle(truncateDTO, eredto);
return eredto;
}
/**
 * Extracts entities from a table-type truncation by parsing its markdown content directly
 * with commonmark and the GFM tables extension.
 *
 * @param truncateDTO the table truncation whose content is a markdown table
 * @return the extracted {@link EREDTO} with the table title applied, or {@code null}
 *         when the truncation has no content
 */
private EREDTO doTableEreFast(TruncateDTO truncateDTO){
    String markdown = truncateDTO.getContent();
    log.info("doTableEreFast:开始进行表格实体关系抽取,内容:{}", markdown);
    if (StrUtil.isEmpty(markdown)){
        return null;
    }

    // Parse the markdown with table support enabled and walk the tree to collect cells.
    List<Extension> tableExtensions = Collections.singletonList(TablesExtension.create());
    Node root = Parser.builder()
            .extensions(tableExtensions)
            .build()
            .parse(markdown);
    TableVisitor tableVisitor = new TableVisitor();
    root.accept(tableVisitor);

    EREDTO result = EREDTO.fromHeadAndRows(
            tableVisitor.getTableHeaders(),
            tableVisitor.getTableRows(),
            truncateDTO.getId());
    // Set the table title manually.
    manualSetTableTitle(truncateDTO, result);
    return result;
}
private void manualSetTableTitle(TruncateDTO truncateDTO, EREDTO eredto) {
EntityExtractionDTO titleEntity = new EntityExtractionDTO();
titleEntity.setEntity("表");
titleEntity.setTruncationId(truncateDTO.getId());
@ -478,7 +511,6 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
}
eredto.getEntities().add(titleEntity);
eredto.setRelations(relations);
return eredto;
}
/**

@ -17,6 +17,7 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.neo4j.driver.Values.parameters;
import org.commonmark.node.*;
@Slf4j
@SpringBootTest
@ -153,13 +154,13 @@ class PdfQaServerApplicationTests {
// Invokes knowledgeGraphService.metaDataTrain with argument 13 and then 15.
// NOTE(review): the two consecutive calls look like the old and new lines of a diff
// rendered together — confirm which id is intended.
@Test
public void metaDataTrainTest() {
knowledgeGraphService.metaDataTrain(13);
knowledgeGraphService.metaDataTrain(15);
}
// Invokes knowledgeGraphService.generateGraphBaseTrain with argument 13 and then 14.
// NOTE(review): the two consecutive calls look like the old and new lines of a diff
// rendered together — confirm which id is intended.
@Test
void generateGraphBaseTrainTest() {
knowledgeGraphService.generateGraphBaseTrain(13);
knowledgeGraphService.generateGraphBaseTrain(14);
}

Loading…
Cancel
Save