You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

95 lines
3.3 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from dotenv import load_dotenv
import os
# Pick the runtime environment from the `env` variable, defaulting to 'dev'.
env = os.environ.get('env', 'dev')
# Load '.env.dev' for the dev environment and '.env' otherwise. With
# override=True, values from the file replace variables already set in the
# process environment. This runs before the helper imports below —
# presumably so those modules see the loaded settings at import time
# (TODO confirm).
load_dotenv(dotenv_path='.env.dev' if env == 'dev' else '.env', override=True)
import time
import traceback
import cv2
from helper.image_helper import pdf2image, image_orient_cls, page_detection_visual
from helper.page_detection.main import layout_analysis
from helper.content_recognition.main import rec
from helper.db_helper import insert_pdf2md_table
import tempfile
from loguru import logger
import datetime
import shutil
def _pdf2markdown_pipeline(pdf_path, tmp_dir):
    """Run every analysis stage for one PDF and return the recognition results.

    Stages, in order:
      1. ``pdf2image`` is called with the PDF and ``tmp_dir`` (presumably it
         writes one image per page into ``tmp_dir`` — confirm against helper).
      2. ``image_orient_cls`` classifies the images; images with class id 1
         or 3 are rotated in place.
      3. ``layout_analysis`` runs over the files of ``tmp_dir`` ordered by the
         integer in front of the first '.' of each filename.
      3.1 When the ``VISUAL`` environment variable is a non-zero integer,
         a visualisation of each layout result is written to
         ``./visual_images``.
      4. ``rec`` performs content recognition on the layout results.

    Args:
        pdf_path: Path of the PDF to analyse.
        tmp_dir: Working directory shared by the stages.

    Returns:
        The value returned by ``rec``.

    Raises:
        ValueError: If a filename in ``tmp_dir`` does not start with an
            integer, or ``VISUAL`` is set to a non-integer value.
        Exception: Anything raised by the helper stages propagates unchanged.
    """
    start_time = time.time()

    # 1. pdf -> images
    t1 = time.time()
    pdf2image(pdf_path, tmp_dir)
    t2 = time.time()

    # 2. Image orientation classification
    t3 = time.time()
    orient_cls_results = image_orient_cls(tmp_dir)
    t4 = time.time()
    for r in orient_cls_results:
        clsid = r[0]['class_ids'][0]
        filename = r[0]['filename']
        # NOTE(review): ids 1 and 3 receive the same 90-degree clockwise
        # rotation and id 2 is left untouched. If the classifier's ids stand
        # for distinct angles these need distinct rotations — confirm against
        # image_orient_cls before changing.
        if clsid in (1, 3):
            img = cv2.imread(filename)
            img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
            cv2.imwrite(filename, img)

    # os.listdir yields bare names, so the numeric prefix can be taken
    # directly; sorting numerically keeps page 10 after page 9.
    page_names = sorted(os.listdir(tmp_dir), key=lambda name: int(name.split('.')[0]))
    filepaths = [f'{tmp_dir}/{name}' for name in page_names]

    # 3. Layout analysis
    t5 = time.time()
    layout_detection_results = layout_analysis(filepaths)
    t6 = time.time()

    # 3.1 Optional visualisation. The flag is treated as off when VISUAL is
    # unset or empty, so a missing debug setting cannot abort the run after
    # the expensive layout stage has already finished.
    if int(os.environ.get('VISUAL') or 0):
        visual_dir = './visual_images'
        # Create the output directory on first use instead of failing.
        os.makedirs(visual_dir, exist_ok=True)
        # Drop images left over from a previous run.
        for f in os.listdir(visual_dir):
            if f.endswith('.jpg'):
                os.remove(f'{visual_dir}/{f}')
        for page_no, detection in enumerate(layout_detection_results, start=1):
            vis_img = page_detection_visual(detection)
            cv2.imwrite(f'{visual_dir}/{page_no}.jpg', vis_img)

    # 4. Content recognition
    t7 = time.time()
    layout_recognition_results = rec(layout_detection_results, tmp_dir)
    t8 = time.time()

    end_time = time.time()
    logger.info(
        f'{pdf_path} analysis completed in {round(end_time - start_time, 3)} seconds, '
        f'including {round(t2 - t1, 3)} for pdf to image, '
        f'{round(t4 - t3, 3)} second for image orient classification, '
        f'{round(t6 - t5, 3)} seconds for page detection, '
        f'and {round(t8 - t7, 3)} seconds for layout recognition, '
        f'page number: {len(filepaths)}'
    )
    return layout_recognition_results
def pdf2markdown_pipeline(pdf_path: str):
    """Analyse one PDF and record the outcome through ``insert_pdf2md_table``.

    A fresh temporary directory is created for the run and removed again
    before returning, whether the analysis succeeded or raised.

    Args:
        pdf_path: Path of the PDF to analyse. The text after the last '/'
            is passed on as the PDF name.

    Returns:
        A ``(process_status, pdf_id)`` tuple. On success the status is 2 and
        ``pdf_id`` is the value returned by ``insert_pdf2md_table``; when the
        analysis raises, the status is 3 and ``pdf_id`` is None.
    """
    def _now():
        # Current local time as text, in the format handed to the insert call.
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

    pdf_name = pdf_path.split('/')[-1]
    started_at = _now()
    work_dir = tempfile.mkdtemp()
    try:
        results = _pdf2markdown_pipeline(pdf_path, work_dir)
    except Exception:
        logger.error(f'analysis pdf error! \n{traceback.format_exc()}')
        process_status, pdf_id = 3, None
        # The failure is still recorded, but the insert's return value is unused.
        insert_pdf2md_table(pdf_path, pdf_name, process_status, started_at, _now(), None)
    else:
        process_status = 2
        pdf_id = insert_pdf2md_table(pdf_path, pdf_name, process_status, started_at, _now(), results)
    finally:
        # Always discard the working directory, even when the run failed.
        shutil.rmtree(work_dir)
    return process_status, pdf_id
if __name__ == '__main__':
    # Manual run against a single hard-coded PDF when executed as a script.
    sample_pdf = '/mnt/pdf2markdown/龙源电力2023年年度审计报告.PDF'
    pdf2markdown_pipeline(sample_pdf)