diff --git a/src/main/java/com/supervision/police/dto/EvidenceDirectoryDTO.java b/src/main/java/com/supervision/police/dto/EvidenceDirectoryDTO.java index e5fb4d2..0339684 100644 --- a/src/main/java/com/supervision/police/dto/EvidenceDirectoryDTO.java +++ b/src/main/java/com/supervision/police/dto/EvidenceDirectoryDTO.java @@ -106,7 +106,9 @@ public class EvidenceDirectoryDTO { public List listAllFileId(){ List fileIdList = new ArrayList<>(); - fileIdList.add(this.id); + if (CollUtil.isNotEmpty(this.fileIdList)){ + fileIdList.addAll(this.fileIdList); + } if (CollUtil.isNotEmpty(this.child)){ for (EvidenceDirectoryDTO child : this.getChild()) { fileIdList.addAll(child.listAllFileId()); diff --git a/src/main/java/com/supervision/police/service/CaseEvidenceService.java b/src/main/java/com/supervision/police/service/CaseEvidenceService.java index 32be10e..9c59c00 100644 --- a/src/main/java/com/supervision/police/service/CaseEvidenceService.java +++ b/src/main/java/com/supervision/police/service/CaseEvidenceService.java @@ -121,7 +121,7 @@ public interface CaseEvidenceService extends IService { String ocrAndExtract(String caseId, List evidenceFileDTOS); - String updateCaseEvidence(List caseEvidenceDetailDTOList); + String updateCaseEvidence4NewTransaction(List caseEvidenceDetailDTOList); /** * 证据识别并提取列表 @@ -141,4 +141,10 @@ public interface CaseEvidenceService extends IService { EvidenceDirectory createDirectory(EvidenceDirectoryReqVO evidenceDirectory); + /** + * 预处理ocr任务 note:这个方法用来提前处理 word 、 pfd 类型的数据。是一个新的事务不会与前面的事务保持原子性 + * @param evidenceFileList 文件信息列表 + */ + void preSyncSubmitOCR(List evidenceFileList); + } diff --git a/src/main/java/com/supervision/police/service/FileOcrProcessService.java b/src/main/java/com/supervision/police/service/FileOcrProcessService.java index 0aa5e7c..b8e1782 100644 --- a/src/main/java/com/supervision/police/service/FileOcrProcessService.java +++ b/src/main/java/com/supervision/police/service/FileOcrProcessService.java @@ -6,6 +6,7 @@ import com.supervision.police.dto.OCRResDTO; import com.supervision.police.dto.RecordFileDTO; import java.util.List; +import java.util.function.Consumer; /** * @author Administrator @@ -18,6 +19,8 @@ public interface FileOcrProcessService extends IService { List syncSubmitOCR(List fileIdList); + List syncSubmitOCR(List fileIdList, Consumer> consumer); + void doOCRTask(List fileOcrProcesses); @@ -49,4 +52,9 @@ public interface FileOcrProcessService extends IService { List queryFileListWithIdSortNoTransaction(List fileIdList); + + void doWordCRTask(List fileOcrProcesses); + + void doPdfCRTask(List fileOcrProcesses); + } diff --git a/src/main/java/com/supervision/police/service/impl/CaseEvidenceServiceImpl.java b/src/main/java/com/supervision/police/service/impl/CaseEvidenceServiceImpl.java index fa46fdd..5d1b009 100644 --- a/src/main/java/com/supervision/police/service/impl/CaseEvidenceServiceImpl.java +++ b/src/main/java/com/supervision/police/service/impl/CaseEvidenceServiceImpl.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; @Slf4j @Service @@ -433,21 +434,20 @@ public class CaseEvidenceServiceImpl extends ServiceImpl evidenceFileDTOS) { List oldEvidences = this.queryEvidenceList(caseId); - List evidenceDirectoryDTOS = listFileTree(caseId); + List oldEvidenceDirectoryDTOS = listFileTree(caseId); - List newEvidences = toCaseCaseEvidenceDetailDTO(evidenceFileDTOS, evidenceDirectoryDTOS); + List newEvidences = toCaseCaseEvidenceDetailDTO(evidenceFileDTOS, oldEvidenceDirectoryDTOS); newEvidences.forEach(caseEvidenceDetailDTO -> caseEvidenceDetailDTO.setCaseId(caseId)); - List operationalEvidence = findChangedEvidence(oldEvidences, newEvidences); + List operationalEvidenceList = findChangedEvidence(oldEvidences, newEvidences); - String batchId = ((CaseEvidenceService)AopContext.currentProxy()).updateCaseEvidence(operationalEvidence); + String batchId = ((CaseEvidenceService)AopContext.currentProxy()).updateCaseEvidence4NewTransaction(operationalEvidenceList); - for (CaseEvidenceDetailDTO caseEvidenceDetailDTO : operationalEvidence) { - if (StrUtil.equalsAny(caseEvidenceDetailDTO.getUpdateStatus(),"-1","0")){ - // 只需要识别即可 - fileOcrProcessService.syncSubmitOCR(List.of(caseEvidenceDetailDTO.getId())); - // todo:是否需要提取标题 - } + List evidenceFileDTOList = operationalEvidenceList.stream() + .flatMap(evidenceDetailDTO -> evidenceDetailDTO.getFileList().stream().filter(file -> StrUtil.equals(file.getUpdateStatus(),"1"))).toList(); + ((CaseEvidenceService)AopContext.currentProxy()).preSyncSubmitOCR(evidenceFileDTOList); + + for (CaseEvidenceDetailDTO caseEvidenceDetailDTO : operationalEvidenceList) { if (StrUtil.equals(caseEvidenceDetailDTO.getUpdateStatus(),"1")){ // 需要分析(ocr识别+标题提取) xxlJobService.executeTaskByJobHandler("evidenceAnalysis", caseEvidenceDetailDTO.getId()); @@ -458,9 +458,14 @@ public class CaseEvidenceServiceImpl extends ServiceImpl caseEvidenceDetailDTOList) { + public String updateCaseEvidence4NewTransaction(List caseEvidenceDetailDTOList) { String batchNo = DateTime.now().toString("yyyyMMddHHmmss"); for (CaseEvidenceDetailDTO evidence : caseEvidenceDetailDTOList) { @@ -471,7 +476,7 @@ public class CaseEvidenceServiceImpl extends ServiceImpl evidenceFileList) { + + for (EvidenceFileDTO evidenceFileDTO : evidenceFileList) { + if (StrUtil.equalsAny(evidenceFileDTO.getFileType(),"doc","docx")){ + fileOcrProcessService.syncSubmitOCR(List.of(evidenceFileDTO.getFileId()), fileOcrProcessService::doWordCRTask); + }else if (StrUtil.equalsAny(evidenceFileDTO.getFileType(),"pdf")){ + fileOcrProcessService.syncSubmitOCR(List.of(evidenceFileDTO.getFileId()), fileOcrProcessService::doPdfCRTask); + }else { + fileOcrProcessService.syncSubmitOCR(List.of(evidenceFileDTO.getFileId()), fileOcrProcessService::doOCRTask); + } + } + } + private CaseEvidenceDetailDTO findEvidenceDetail(String evidenceId, List caseEvidenceDetailDTOS) { for (CaseEvidenceDetailDTO caseEvidenceDetailDTO : caseEvidenceDetailDTOS) { @@ -677,8 +708,7 @@ public class CaseEvidenceServiceImpl extends ServiceImpl toCaseCaseEvidenceDetailDTO(List newDirectoryDTOS, - List evidenceDirectoryDTOS) { + private List toCaseCaseEvidenceDetailDTO(List newDirectoryDTOS,List oldEvidenceDirectoryDTOS) { if (CollUtil.isEmpty(newDirectoryDTOS)){ return new ArrayList<>(); } @@ -692,7 +722,7 @@ public class CaseEvidenceServiceImpl extends ServiceImpl fileCache = Stream.of(newEvidenceFileList, oldEvidenceList) + .flatMap(Collection::stream) + .flatMap(evidenceDetailDTO -> evidenceDetailDTO.getFileList().stream()) + .collect(Collectors.toMap(EvidenceFileDTO::getFileId, Function.identity())); + for (CaseEvidenceDetailDTO oldEvidence : oldEvidenceList) { boolean isFind = false; for (CaseEvidenceDetailDTO newEvidence : newEvidenceFileList) { @@ -780,7 +829,7 @@ public class CaseEvidenceServiceImpl extends ServiceImpl oldFileIds = oldEvidence.getFileList().stream().map(EvidenceFileDTO::getFileId).toList(); List newFileIds = newEvidence.getFileList().stream().map(EvidenceFileDTO::getFileId).toList(); - TupleIdRecord tupleIdRecord = compareFileList(oldFileIds, newFileIds); + TupleIdRecord tupleIdRecord = compareFileList(newFileIds,oldFileIds); List updateFileList = new ArrayList<>(); // 新增的文件顺序排在原有文件的后面 @@ -789,6 +838,10 @@ public class CaseEvidenceServiceImpl extends ServiceImpl syncSubmitOCR(List fileIdList, Consumer> consumer) { + return submitOCR(fileIdList, consumer); + } + private List submitOCR(List fileIdList, Consumer> consumer){ if (CollUtil.isEmpty(fileIdList)){ @@ -91,7 +100,8 @@ public class FileOcrProcessServiceImpl extends ServiceImpl resultList = super.lambdaQuery().in(FileOcrProcess::getFileId, fileIdList).list(); + return sortByIdOrder(fileIdList,resultList, FileOcrProcess::getFileId); } @Override @@ -186,7 +196,7 @@ public class FileOcrProcessServiceImpl extends ServiceImpl queryFileListWithIdSort(List fileIdList) { List recordFileDTOS = this.queryFileList(fileIdList); - return sortByIdOrder(fileIdList, recordFileDTOS); + return sortByIdOrder(fileIdList, recordFileDTOS, RecordFileDTO::getFileId); } @Override @@ -215,6 +225,44 @@ public class FileOcrProcessServiceImpl extends ServiceImpl fileOcrProcesses) { + + doMcr(fileOcrProcesses, (fileId)-> WordReadUtil.readWordInMinio(minioService, fileId)); + } + + @Override + public void doPdfCRTask(List fileOcrProcesses) { + + doMcr(fileOcrProcesses, (fileId)-> PDFReadUtil.readPdfInMinio(minioService, fileId)); + } + + private void doMcr(List fileOcrProcesses, Function function) { + + log.info("doMcr:开始识别文件...{}",JSONUtil.toJsonStr(fileOcrProcesses)); + + if (CollUtil.isEmpty(fileOcrProcesses)){ + log.info("doMcr:当前暂无识别的任务,结束..."); + return; + } + + for (FileOcrProcess ocrProcess : fileOcrProcesses) { + log.info("ocr:开始识别文件:{}", JSONUtil.toJsonStr(ocrProcess)); + this.updateOCrStatus(List.of(ocrProcess.getFileId()),OcrProcessStatus.PROCESSING.getCode()); + try { + String ocrText = function.apply(ocrProcess.getFileId()); + Assert.notNull(ocrText, "识别结果为空"); + this.lambdaUpdate().eq(FileOcrProcess::getFileId, ocrProcess.getFileId()) + .set(FileOcrProcess::getStatus, OcrProcessStatus.PROCESSING.getCode()) + .set(FileOcrProcess::getOcrText, ocrText).update(); + } catch (Exception e) { + log.error("doMcr识别失败",e); + this.updateOCrStatus(List.of(ocrProcess.getFileId()),OcrProcessStatus.FAIL.getCode()); + } + + } + } + private List buildOCRReqDTO(List fileIdList){ List minioFiles = minioService.listMinioFile(fileIdList); @@ -225,15 +273,15 @@ public class FileOcrProcessServiceImpl extends ServiceImpl sortByIdOrder(List fileIdList,List recordFileDTOList){ + private List sortByIdOrder(List idList,List targetList, Function function){ - if (CollUtil.size(fileIdList) < 2 || CollUtil.size(recordFileDTOList) < 2) { - return recordFileDTOList; + if (CollUtil.size(idList) < 2 || CollUtil.size(targetList) < 2) { + return targetList; } - Map recordFileDTOMap = recordFileDTOList.stream() - .collect(Collectors.toMap(RecordFileDTO::getFileId,recordFileDTO -> recordFileDTO, (k1, k2) -> k1)); - return fileIdList.stream().map(recordFileDTOMap::get).filter(Objects::nonNull).collect(Collectors.toList()); + Map targetMap = targetList.stream() + .collect(Collectors.toMap(function,target -> target, (k1, k2) -> k1)); + return idList.stream().map(targetMap::get).filter(Objects::nonNull).collect(Collectors.toList()); } } diff --git a/src/main/java/com/supervision/police/service/impl/LLMExtractServiceImpl.java b/src/main/java/com/supervision/police/service/impl/LLMExtractServiceImpl.java index 73fec74..b9d1b34 100644 --- a/src/main/java/com/supervision/police/service/impl/LLMExtractServiceImpl.java +++ b/src/main/java/com/supervision/police/service/impl/LLMExtractServiceImpl.java @@ -73,7 +73,7 @@ public class LLMExtractServiceImpl implements LLMExtractService { //判断content是否为JSON格式,如果是则尝试转换为JSON格式并获取title属性的值 try { JSONObject jsonObject = new JSONObject(content); - String title = jsonObject.getString("title"); + String title = jsonObject.optString("title"); ocrExtractDto.setTitle(title); result.add(ocrExtractDto); } catch (Exception e) { @@ -155,7 +155,7 @@ public class LLMExtractServiceImpl implements LLMExtractService { log.info("属性提取结果: {}", jsonObject); List extractAttributesList = LLMExtractDto.getExtractAttributes(); extractAttributesMap.forEach((key, value) -> { - String attrValue = jsonObject.getString(key); + String attrValue = jsonObject.optString(key); log.info("属性提取结果: 【{}】。Key:【{}】", attrValue, key); NotePromptExtractAttributesDto extractAttributesDto = new NotePromptExtractAttributesDto(); extractAttributesDto.setAttrName(key); diff --git a/src/main/java/com/supervision/utils/PDFReadUtil.java b/src/main/java/com/supervision/utils/PDFReadUtil.java index e619777..282bb22 100644 --- a/src/main/java/com/supervision/utils/PDFReadUtil.java +++ b/src/main/java/com/supervision/utils/PDFReadUtil.java @@ -1,5 +1,6 @@ package com.supervision.utils; +import com.supervision.minio.service.MinioService; import lombok.extern.slf4j.Slf4j; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; @@ -20,4 +21,14 @@ public class PDFReadUtil { } return text; } + + public static String readPdfInMinio(MinioService minioService, String fileId) { + + try (InputStream inputStream = minioService.getObjectInputStream(fileId)){ + return pdf2text(inputStream); + } catch (Exception e) { + log.error("从minio中获取文件失败", e); + return null; + } + } } diff --git a/src/main/resources/mapper/EvidenceCategoryMapper.xml b/src/main/resources/mapper/EvidenceCategoryMapper.xml index 8451d6d..2b3443d 100644 --- a/src/main/resources/mapper/EvidenceCategoryMapper.xml +++ b/src/main/resources/mapper/EvidenceCategoryMapper.xml @@ -32,7 +32,7 @@ c.category_name as categoryName, c.icon_url as iconURL, c.parent_id as parent_id, - np.name as name + np.name as promptName from evidence_category c left join note_prompt np on c.prompt_id = np.id