|
|
|
@ -87,12 +87,18 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
|
|
|
|
|
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
|
|
|
|
|
if (null == pdfInfo.getTrainStatus()){
|
|
|
|
|
// todo:训练异常,需要记录异常状态
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf训练状态,开始识别文档训练状态...", pdfId);
|
|
|
|
|
pdfInfoService.pdfToGraphStart(pdfId);
|
|
|
|
|
if (StrUtil.isEmpty(pdfInfo.getContentType())){
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf内容类型,开始识别文档内容类型...", pdfId);
|
|
|
|
|
DocumentContentTypeEnum documentContentTypeEnum = tripleConversionPipeline.makeOutPdfContentType(pdfId);
|
|
|
|
|
log.info("pdfId:{}识别文档内容类型完成,内容类型:{}", pdfId, documentContentTypeEnum.getType());
|
|
|
|
|
if (StrUtil.isEmpty(documentContentTypeEnum.getType())){
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf内容类型,停止后续任务...", pdfId);
|
|
|
|
|
pdfInfoService.pdfTrainFail(pdfId);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
pdfInfo.setContentType(documentContentTypeEnum.getType());
|
|
|
|
|
pdfInfoService.updateContentType(pdfId, documentContentTypeEnum.getType());
|
|
|
|
|
}
|
|
|
|
@ -100,6 +106,11 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf行业,开始识别文档行业...", pdfId);
|
|
|
|
|
String industry = tripleConversionPipeline.makeOutPdfIndustry(pdfId);
|
|
|
|
|
log.info("pdfId:{}识别文档行业完成,行业:{}", pdfId, industry);
|
|
|
|
|
if (StrUtil.isEmpty(industry)){
|
|
|
|
|
log.info("pdfId:{}没有找到对应的pdf行业,停止后续任务...", pdfId);
|
|
|
|
|
pdfInfoService.pdfTrainFail(pdfId);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
pdfInfo.setDomainCategoryId(industry);
|
|
|
|
|
pdfInfoService.updateCategory(pdfId, industry);
|
|
|
|
|
}
|
|
|
|
@ -107,43 +118,86 @@ public class KnowledgeGraphServiceImpl implements KnowledgeGraphService {
|
|
|
|
|
TripleConversionPipeline tripleConversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
|
|
|
|
|
|
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
|
|
|
|
|
if (CollUtil.isEmpty(pdfAnalysisOutputs)){
|
|
|
|
|
log.warn("没有找到pdfId为{}的pdf分析结果,不再进行下一步操作...", pdfId);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
List<String> documentIds = pdfAnalysisOutputs.stream().map(p->String.valueOf(p.getId())).collect(Collectors.toList());
|
|
|
|
|
List<DocumentTruncation> documentTruncations = documentTruncationService.queryByDocumentIds(documentIds);
|
|
|
|
|
List<TruncateDTO> truncateDTOS = new ArrayList<>();
|
|
|
|
|
if (CollUtil.isNotEmpty(documentTruncations)){
|
|
|
|
|
log.info("没有找到文档切分数据,pdfId:{},不用重置数据...", pdfId);
|
|
|
|
|
truncateDTOS = documentTruncations.stream().map(TruncateDTO::new).collect(Collectors.toList());
|
|
|
|
|
}
|
|
|
|
|
if (CollUtil.isEmpty(documentTruncations)){
|
|
|
|
|
log.info("开始切割文档切片,pdfId:{}", pdfId);
|
|
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
|
|
|
|
|
truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
|
|
|
|
|
log.info("切割文档切片完成,切片个数:{}", truncateDTOS.size());
|
|
|
|
|
// 保存分片信息
|
|
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
|
log.info("文档切分数据不为空,pdfId:{},清除切分数据...", pdfId);
|
|
|
|
|
documentTruncationService.deleteByDocumentIds(documentIds);
|
|
|
|
|
}
|
|
|
|
|
log.info("开始切割文档切片,pdfId:{}", pdfId);
|
|
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
|
|
|
|
|
List<TruncateDTO> truncateDTOS = tripleConversionPipeline.sliceDocuments(documentDTOList);
|
|
|
|
|
log.info("切割文档切片完成,切片个数:{}", truncateDTOS.size());
|
|
|
|
|
// 保存分片信息
|
|
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
|
|
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
|
List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
|
|
|
|
|
List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intents);
|
|
|
|
|
// 保存意图数据
|
|
|
|
|
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
|
|
|
|
|
|
|
|
|
|
for (Intention intention : intentions) {
|
|
|
|
|
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
|
|
|
|
|
.filter(d -> StrUtil.equals(d.getIntentDigest(), intention.getDigest())).toList();
|
|
|
|
|
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
|
|
|
|
|
try {
|
|
|
|
|
List<String> intents = tripleConversionPipeline.makeOutTruncationIntent(truncateDTO);
|
|
|
|
|
List<DomainMetadataDTO> domainMetadataDTOS = tripleConversionPipeline.makeOutDomainMetadata(truncateDTO, intents);
|
|
|
|
|
// 保存意图数据
|
|
|
|
|
List<Intention> intentions = intentionService.batchSaveIfAbsent(intents, pdfInfo.getDomainCategoryId(), pdfId.toString());
|
|
|
|
|
for (Intention intention : intentions) {
|
|
|
|
|
List<DomainMetadataDTO> metadataDTOS = domainMetadataDTOS.stream()
|
|
|
|
|
.filter(d -> StrUtil.equals(d.getIntentDigest(), intention.getDigest())).toList();
|
|
|
|
|
domainMetadataService.batchSaveOrUpdateMetadata(metadataDTOS,intention.getId(), pdfInfo.getDomainCategoryId());
|
|
|
|
|
}
|
|
|
|
|
}catch (Exception e){
|
|
|
|
|
log.error("切分文档id:{},意图识别失败", truncateDTO.getId(), e);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void generateGraphBaseTrain(Integer pdfId) {
|
|
|
|
|
Assert.notNull(pdfId, "pdfId不能为空");
|
|
|
|
|
PdfInfo pdfInfo = pdfInfoService.getByPdfId(pdfId);
|
|
|
|
|
Assert.notNull(pdfInfo, "pdfId:{}没有找到对应的pdf信息", pdfId);
|
|
|
|
|
Assert.isTrue((null !=pdfInfo.getTrainStatus() && pdfInfo.getTrainStatus() == 1),
|
|
|
|
|
"pdfId:{}的pdf训练状态:{} 不符合要求", pdfId, pdfInfo.getTrainStatus());
|
|
|
|
|
|
|
|
|
|
List<TruncateDTO> truncateDTOS = documentTruncationService.listByPdfId(pdfId).stream().map(TruncateDTO::new).collect(Collectors.toList());
|
|
|
|
|
TripleConversionPipeline conversionPipeline = this.getTripleConversionPipeline(pdfInfo.getContentType(), pdfInfo.getDomainCategoryId());
|
|
|
|
|
if (CollUtil.isEmpty(truncateDTOS)){
|
|
|
|
|
log.info("没有找到pdfId为{}的文档切分数据,开始切分数据...", pdfId);
|
|
|
|
|
List<PdfAnalysisOutput> pdfAnalysisOutputs = pdfAnalysisOutputService.queryByPdfId(pdfId);
|
|
|
|
|
List<DocumentDTO> documentDTOList = pdfAnalysisOutputs.stream().map(DocumentDTO::new).collect(Collectors.toList());
|
|
|
|
|
truncateDTOS = conversionPipeline.sliceDocuments(documentDTOList);
|
|
|
|
|
documentTruncationService.batchSave(truncateDTOS);
|
|
|
|
|
log.info("切分数据完成,切分个数:{}", truncateDTOS.size());
|
|
|
|
|
}
|
|
|
|
|
log.info("开始命名实体识别,切分文档个数:{}", truncateDTOS.size());
|
|
|
|
|
// 查询当前行业分类下的意图
|
|
|
|
|
List<IntentDTO> intentionDTOs = intentionService.queryByDomainCategoryId(pdfInfo.getDomainCategoryId()).stream().map(IntentDTO::new).distinct().toList();
|
|
|
|
|
if (CollUtil.isEmpty(intentionDTOs)){
|
|
|
|
|
log.info("没有找到行业分类id为{}的意图数据,不再进行下一步操作...", pdfInfo.getDomainCategoryId());
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (TruncateDTO truncateDTO : truncateDTOS) {
|
|
|
|
|
try {
|
|
|
|
|
List<IntentDTO> intents = conversionPipeline.makeOutTruncationIntent(truncateDTO,intentionDTOs);
|
|
|
|
|
if (CollUtil.isEmpty(intents)){
|
|
|
|
|
log.info("切分文档id:{},未正确识别出意图...", truncateDTO.getId());
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
EREDTO eredto = conversionPipeline.doEre(truncateDTO, intents);
|
|
|
|
|
if (null == eredto){
|
|
|
|
|
log.info("切分文档id:{},命名实体识别结果为空...", truncateDTO.getId());
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// 保存实体关系抽取结果
|
|
|
|
|
this.saveERE(eredto, truncateDTO.getId());
|
|
|
|
|
}catch (Exception e){
|
|
|
|
|
log.error("命名实体识别失败,切分文档id:{}", truncateDTO.getId(), e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|