You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

102 lines
3.7 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from dotenv import load_dotenv
import os
from loguru import logger
# Choose the dotenv file from the `env` environment variable: 'dev' (also the
# default when the variable is unset) selects '.env.dev', anything else '.env'.
env = os.getenv('env', 'dev')
dotenv_path = '.env' if env != 'dev' else '.env.dev'
logger.info(f'Configure using this dotenv path: {dotenv_path}')
# NOTE(review): this runs before the helper imports that follow, presumably so
# those modules see the loaded values at import time -- confirm before moving.
load_dotenv(dotenv_path=dotenv_path, override=True)
import time
import traceback
import cv2
from helper.image_helper import pdf2image, image_orient_cls, page_detection_visual
from helper.page_detection.main import layout_analysis
from helper.content_recognition.main import rec
from helper.db_helper import insert_pdf2md_table
import datetime
from tqdm import tqdm
from helper.constants import PDFAnalysisStatus
def _pdf2markdown_pipeline(pdf_path, visual):
    """Run the PDF -> images -> layout analysis -> content recognition stages.

    Args:
        pdf_path: Path of the PDF; passed unchanged to ``pdf2image``.
        visual: When truthy, write one layout-visualisation JPEG per analysed
            page into ``./visual_images`` (the directory is created if it is
            missing and any ``.jpg`` files already in it are deleted first).

    Returns:
        The value returned by ``rec`` for the layout analysis results.
    """
    start_time = time.time()

    # 1. PDF -> images
    convert_started = time.time()
    images = pdf2image(pdf_path)
    convert_finished = time.time()

    # 2. Image orientation classification
    orient_started = time.time()
    angles = image_orient_cls(images)
    orient_finished = time.time()

    # Rotate pages in place according to the reported angle. Only 90, 180 and
    # 270 trigger a rotation; any other value leaves the page untouched. The
    # rotation itself is not part of the orientation timing reported below.
    for index, angle in enumerate(angles):
        if angle == 90:
            rotate_code = cv2.ROTATE_90_COUNTERCLOCKWISE
        elif angle == 180:
            rotate_code = cv2.ROTATE_180
        elif angle == 270:
            rotate_code = cv2.ROTATE_90_CLOCKWISE
        else:
            continue
        images[index] = cv2.rotate(images[index], rotate_code)

    # 3. Layout analysis
    layout_started = time.time()
    layout_detection_results = layout_analysis(images)
    layout_finished = time.time()

    # 3.1 Layout analysis visualisation
    if visual:
        visual_dir = './visual_images'
        # Without this, os.listdir() raises FileNotFoundError on a fresh
        # checkout where the output directory has never been created.
        os.makedirs(visual_dir, exist_ok=True)
        # Drop images left over from a previous run so the directory only
        # holds pages of the current PDF.
        for stale_name in os.listdir(visual_dir):
            if stale_name.endswith('.jpg'):
                os.remove(f'{visual_dir}/{stale_name}')
        for page_index in tqdm(range(len(layout_detection_results)), '版面分析可视化结果'):
            vis_img = page_detection_visual(layout_detection_results[page_index])
            # Output files are numbered from 1.
            cv2.imwrite(f'{visual_dir}/{page_index + 1}.jpg', vis_img)

    # 4. Content recognition
    rec_started = time.time()
    layout_recognition_results = rec(layout_detection_results)
    rec_finished = time.time()

    end_time = time.time()
    logger.info(
        f'{pdf_path} analysis completed in {round(end_time - start_time, 3)} seconds, '
        f'including {round(convert_finished - convert_started, 3)} for pdf to image, '
        f'{round(orient_finished - orient_started, 3)} second for image orient classification, '
        f'{round(layout_finished - layout_started, 3)} seconds for page detection, '
        f'and {round(rec_finished - rec_started, 3)} seconds for layout recognition, '
        f'page number: {len(images)}'
    )
    return layout_recognition_results
def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True):
    """Convert one PDF and optionally record the outcome in the database.

    Args:
        pdf_path: Path of the PDF; the text after the last '/' is used as the
            PDF name.
        visual: Forwarded to ``_pdf2markdown_pipeline``.
        insert_db: When truthy, ``insert_pdf2md_table`` is called with the
            outcome (results on success, ``None`` on failure).

    Returns:
        A ``(process_status, pdf_id)`` tuple. ``process_status`` is the value
        of ``PDFAnalysisStatus.SUCCESS`` or ``PDFAnalysisStatus.FAIL``;
        ``pdf_id`` is what ``insert_pdf2md_table`` returned on a successful,
        recorded run and ``None`` otherwise.
    """
    def current_timestamp():
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

    pdf_name = pdf_path.rpartition('/')[2]
    started_at = current_timestamp()

    try:
        results = _pdf2markdown_pipeline(pdf_path, visual)
    except Exception:
        # Any pipeline error is logged and reported as a FAIL status rather
        # than propagated to the caller.
        logger.error(f'analysis pdf error! \n{traceback.format_exc()}')
        failure_status = PDFAnalysisStatus.FAIL.value
        finished_at = current_timestamp()
        if insert_db:
            insert_pdf2md_table(pdf_path, pdf_name, failure_status, started_at, finished_at, None)
        return failure_status, None

    success_status = PDFAnalysisStatus.SUCCESS.value
    finished_at = current_timestamp()
    if not insert_db:
        return success_status, None

    pdf_id = insert_pdf2md_table(pdf_path, pdf_name, success_status, started_at, finished_at, results)
    return success_status, pdf_id
if __name__ == '__main__':
    # Manual run against a sample annual report: visualisation on, database
    # insertion off.
    # Other sample reports that were used with the same call:
    #   /mnt/pdf2markdown/龙源电力2022年年度报告.PDF
    #   /mnt/pdf2markdown/龙源电力2023年年度报告.PDF
    insert_db = False
    pdf2markdown_pipeline(
        '/mnt/pdf2markdown/龙源电力2021年年度报告.PDF',
        visual=True,
        insert_db=insert_db,
    )