|
|
|
@ -93,6 +93,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
log.info("makeOutTruncationIntent:响应结果:{}", call);
|
|
|
|
|
JSONObject json = JSONUtil.parseObj(call);
|
|
|
|
|
JSONArray jsonArray = json.getJSONArray("IntentTypeList");
|
|
|
|
|
if (null == jsonArray){
|
|
|
|
|
return new ArrayList<>();
|
|
|
|
|
}
|
|
|
|
|
return intents.stream().filter(intent->
|
|
|
|
|
jsonArray.stream().anyMatch(o->StrUtil.equals(o.toString(), intent.getDigest())))
|
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
@ -104,8 +107,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
Assert.notEmpty(intents, "意图不能为空");
|
|
|
|
|
|
|
|
|
|
String promptTemplate = promptMap.get(EXTRACT_INTENT_METADATA);
|
|
|
|
|
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentType", JSONUtil.toJsonStr(intents));
|
|
|
|
|
Map<String, String> params = Map.of("text", truncate.getContent(), "IntentTypeList", JSONUtil.toJsonStr(intents));
|
|
|
|
|
String format = StrUtil.format(promptTemplate, params);
|
|
|
|
|
log.info("makeOutDomainMetadata:format:{}", format);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
log.info("makeOutDomainMetadata:响应结果:{}", call);
|
|
|
|
|
return parseDomainMetadata(call);
|
|
|
|
@ -139,6 +143,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
JSONObject source = jsonObject.getJSONObject("source");
|
|
|
|
|
JSONObject relation = jsonObject.getJSONObject("relation");
|
|
|
|
|
JSONObject target = jsonObject.getJSONObject("target");
|
|
|
|
|
domainMetadataDTO.setIntentDigest(jsonObject.getStr("intent"));
|
|
|
|
|
if (null != source){
|
|
|
|
|
String type = source.getStr("type");
|
|
|
|
|
JSONArray attributes = source.getJSONArray("attributes");
|
|
|
|
@ -189,6 +194,14 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
List<DomainMetadataDTO> domainMetadataDTOS = domainMetadataService.listByIntentionIds(intentIds);
|
|
|
|
|
log.info("doEre:领域元数据列表个数:{}", domainMetadataDTOS.size());
|
|
|
|
|
domainMetadataDTOS = domainMetadataDTOS.stream()
|
|
|
|
|
.filter(domainMetadataDTO -> StrUtil.equals(domainMetadataDTO.getGenerationType(), "0"))// 过滤出手动确认的数据
|
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
|
log.info("doEre:领域元数据列表已经手动确认过的个数:{}", domainMetadataDTOS.size());
|
|
|
|
|
if (CollUtil.isEmpty(domainMetadataDTOS)){
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
return doTextEreWithMetadata(truncateDTO, domainMetadataDTOS);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -220,9 +233,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
/**
 * Slices the given documents into a list of {@code TruncateDTO} text fragments.
 *
 * <p>Visible behaviour: the documents are sorted, each one is visited in order,
 * documents with empty content are skipped, a non-empty text buffer is emitted as
 * its own fragment when a table layout is encountered, and whatever is left in the
 * buffer after the loop is emitted against the last document visited.
 *
 * <p>NOTE(review): this listing is a rendered diff, not a compilable file. Hunk
 * header lines mark places where code is elided, and some statements appear twice
 * (the three size locals, and the table {@code else if} branch) — presumably the
 * removed and the added version of the same line. Confirm against the real file
 * before relying on any comment below.
 *
 * @param documents the documents to slice; they are sorted before being processed
 * @return the fragments collected in {@code truncateDTOS}
 */
@Override
|
|
|
|
|
public List<TruncateDTO> sliceDocuments(List<DocumentDTO> documents) {
|
|
|
|
|
|
|
|
|
|
// NOTE(review): first trio of size locals (1000 / 800 / 1500) — presumably the pre-change values; confirm.
int maxTextLength = 1000;
|
|
|
|
|
int minTextLength = 800;
|
|
|
|
|
int INITIAL_BUFFER_SIZE = 1500;
|
|
|
|
|
// NOTE(review): second trio of the same locals (600 / 500 / 100) — presumably the post-change values; confirm.
int maxTextLength = 600;
|
|
|
|
|
int minTextLength = 500;
|
|
|
|
|
int INITIAL_BUFFER_SIZE = 100;
|
|
|
|
|
// Sort the pdfAnalysisOutputs
|
|
|
|
|
List<DocumentDTO> documentDTOList = documents.stream().sorted(
|
|
|
|
|
// Sort by pageNo first, then by layoutOrder
|
|
|
|
@ -240,7 +253,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
|
|
|
|
|
List<TruncateDTO> truncateDTOS = new ArrayList<>();
|
|
|
|
|
// Text buffer whose contents become a fragment whenever it is flushed below.
// NOTE(review): capacity is the literal 1500 here, not INITIAL_BUFFER_SIZE — confirm which is intended.
StringBuilder truncateTextBuild = new StringBuilder(1500);
|
|
|
|
|
// Remembers the most recently visited document so the final flush can be attached to it.
DocumentDTO documentDTOLast = null;
|
|
|
|
|
for (DocumentDTO documentDTO : documentDTOList) {
|
|
|
|
|
documentDTOLast = documentDTO;
|
|
|
|
|
String content = documentDTO.getContent();
|
|
|
|
|
// Skip documents that carry no content.
if (StrUtil.isEmpty(content)){
|
|
|
|
|
continue;
|
|
|
|
@ -274,13 +289,12 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// Handle the remaining content
|
|
|
|
|
} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
|
|
|
|
|
// Table-type layout: split it
|
|
|
|
|
// When a table appears, a non-empty truncateTextBuild is emitted as a fragment of its own
|
|
|
|
|
if (!truncateTextBuild.isEmpty()) {
|
|
|
|
|
truncateDTOS.add(new TruncateDTO(documentDTO, truncateTextBuild.toString()));
|
|
|
|
|
}
|
|
|
|
|
} else if (LayoutTypeEnum.TABLE.getCode() == layoutType) {
|
|
|
|
|
// Table-type layout: split it
|
|
|
|
|
|
|
|
|
|
// Extract the table title up front
|
|
|
|
|
TableTitleDTO tableTitleDTO = this.extractTableTitle(documentDTO.getTitle());
|
|
|
|
|
if (null != tableTitleDTO && StrUtil.isNotEmpty(tableTitleDTO.getTitle())){
|
|
|
|
@ -317,6 +331,9 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
log.info("sliceDocuments:错误的布局类型: {}", layoutType);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// Flush whatever is still in the buffer, attributing it to the last document visited.
if (!truncateTextBuild.isEmpty() && null != documentDTOLast) {
|
|
|
|
|
truncateDTOS.add(new TruncateDTO(documentDTOLast, truncateTextBuild.toString()));
|
|
|
|
|
}
|
|
|
|
|
return truncateDTOS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -378,7 +395,7 @@ public class TripleConversionPipelineImpl implements TripleConversionPipeline {
|
|
|
|
|
Map<String, String> params = Map.of("text", truncateDTO.getContent(), "domainMetadata", domainMetadata);
|
|
|
|
|
String format = StrUtil.format(prompt, params);
|
|
|
|
|
String call = aiCallService.call(format);
|
|
|
|
|
return null;
|
|
|
|
|
return EREDTO.fromTextJson(call, truncateDTO.getId());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|