pdf文本提取

topo_dev
DESKTOP-DDTUS3E\yaxin 6 months ago
parent b52e04c83c
commit edf9f23e87

@ -168,6 +168,12 @@
<version>1.70</version>
</dependency>
<!-- Apache PDFBox: supplies PDDocument and PDFTextStripper, which PDFReadUtil uses to extract text from PDF files -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.28</version>
</dependency>
<dependency>
<groupId>com.xuxueli</groupId>
<artifactId>xxl-job-core</artifactId>

@ -0,0 +1,23 @@
package com.supervision.utils;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.IOException;
import java.io.InputStream;
/**
 * Static helper for extracting plain text from PDF documents using Apache PDFBox.
 */
@Slf4j
public class PDFReadUtil {

    /**
     * Extracts the text of a PDF supplied as a stream.
     *
     * <p>This is a best-effort operation: parse failures are logged and an empty
     * string is returned instead of propagating the exception. The supplied stream
     * is closed before this method returns, whether or not parsing succeeded, so the
     * caller must not reuse it afterwards.
     *
     * @param inputStream stream positioned at the start of the PDF content; may be {@code null}
     * @return the extracted text, or an empty string when {@code inputStream} is
     *         {@code null} or the document cannot be read
     */
    public static String pdf2text(InputStream inputStream) {
        String text = "";
        if (inputStream == null) {
            // Treat a missing stream like any other unreadable input rather than
            // letting an unchecked exception escape from the parser.
            log.warn("PDF输入流为空，无法解析");
            return text;
        }
        // PDDocument.load does not take ownership of the stream, so it is declared as a
        // resource here. Resources close in reverse order: the document, then the stream.
        try (InputStream in = inputStream;
             PDDocument document = PDDocument.load(in)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Assigned to the outer variable (not returned directly) so that text which was
            // already extracted is still returned if closing a resource fails afterwards.
            text = stripper.getText(document);
        } catch (IOException e) {
            log.error("解析PDF文件失败", e);
        }
        return text;
    }
}

@ -58,7 +58,7 @@ public class WordReadUtil {
public static String readWord(InputStream inputStream) {
StringBuilder stringBuilder = new StringBuilder();
try (inputStream) {
try (inputStream) {
// 创建 XWPFDocument 对象
XWPFDocument document = new XWPFDocument(inputStream);
// 获取所有段落

@ -1,8 +1,10 @@
package com.supervision.demo;
import com.deepoove.poi.XWPFTemplate;
import com.supervision.minio.service.MinioService;
import com.supervision.police.dto.caseScore.CaseScoreDetailDTO;
import com.supervision.police.service.ModelService;
import com.supervision.utils.PDFReadUtil;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
@ -16,7 +18,11 @@ import java.util.HashMap;
public class WordRenderTest {
@Autowired
private ModelService modelService;
private MinioService minioService;
@Autowired
private ModelService modelService;
public static void main(String[] args) throws FileNotFoundException {
HashMap<String, Object> data = new HashMap<>();
data.put("name", "张三");
@ -54,4 +60,10 @@ public class WordRenderTest {
throw new RuntimeException(e);
}
}
@Test
public void pdf2text() {
    // Fetch the stored object for this hard-coded key through MinioService,
    // run it through the PDF text extractor, and log whatever text comes back.
    final String objectKey = "1848552470327439362";
    log.info("content:{}", PDFReadUtil.pdf2text(minioService.getObjectInputStream(objectKey)));
}
}

Loading…
Cancel
Save