You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

95 lines
3.2 KiB
Python

1 month ago
from dotenv import load_dotenv
import os
env = os.environ.get('env', 'dev')
load_dotenv(dotenv_path='.env.dev' if env == 'dev' else '.env', override=True)
import time
import traceback
import cv2
from helper.image_helper import pdf2image, image_orient_cls, page_detection_visual
from helper.page_detection.main import layout_analysis
from helper.content_recognition.main import rec
from helper.db_helper import insert_pdf2md_table
import tempfile
from loguru import logger
import datetime
import shutil
def _pdf2markdown_pipeline(pdf_path, tmp_dir):
    """Run every stage of the PDF-to-markdown pipeline for one PDF.

    Stages (each one is timed for the summary log line):
      1. ``pdf2image`` converts ``pdf_path`` into images inside ``tmp_dir``.
      2. ``image_orient_cls`` classifies the orientation of those images;
         images reported as class 1 or 3 are rotated 90 degrees clockwise
         and overwritten in place.
      3. ``layout_analysis`` runs on the page images, ordered by the integer
         value of each file name's stem.
      4. ``rec`` runs content recognition on the layout results.

    When the ``VISUAL`` environment variable is a non-zero integer, one
    visualization image per layout result is written to ``./visual_images``
    (existing ``.jpg`` files there are removed first).

    Args:
        pdf_path: Path of the PDF to analyse.
        tmp_dir: Working directory that receives the page images. Every
            entry in it must be named ``<integer>.<ext>``, otherwise the
            page sort raises ``ValueError``.

    Returns:
        Whatever ``rec`` returns for the analysed pages.

    Raises:
        Any exception raised by a stage propagates to the caller.
    """
    start_time = time.time()

    # 1. pdf -> images
    t1 = time.time()
    pdf2image(pdf_path, tmp_dir)
    t2 = time.time()

    # 2. Image orientation classification
    t3 = time.time()
    orient_cls_results = image_orient_cls(tmp_dir)
    t4 = time.time()
    for r in orient_cls_results:
        clsid = r[0]['class_ids'][0]
        filename = r[0]['filename']
        # NOTE(review): classes 1 and 3 receive the same clockwise rotation;
        # presumably they are the two sideways orientations -- confirm that
        # class 3 should not be rotated counter-clockwise instead.
        if clsid in (1, 3):
            img = cv2.imread(filename)
            img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
            cv2.imwrite(filename, img)

    # Order pages numerically ("10.jpg" after "9.jpg"), then build full paths.
    filepaths = os.listdir(tmp_dir)
    filepaths.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))
    filepaths = [f'{tmp_dir}/{_}' for _ in filepaths]

    # 3. Layout analysis
    t5 = time.time()
    layout_detection_results = layout_analysis(filepaths)
    t6 = time.time()

    # 3.1 Optional visualization of the layout results.
    # An unset or empty VISUAL is treated as "disabled" instead of raising.
    if int(os.environ.get('VISUAL') or '0'):
        visual_dir = './visual_images'
        # The output directory may not exist yet on a fresh checkout.
        os.makedirs(visual_dir, exist_ok=True)
        # Drop images left over from a previous run so page numbers do not mix.
        for f in os.listdir(visual_dir):
            if f.endswith('.jpg'):
                os.remove(f'{visual_dir}/{f}')
        for page_no, detection in enumerate(layout_detection_results, start=1):
            vis_img = page_detection_visual(detection)
            cv2.imwrite(f'{visual_dir}/{page_no}.jpg', vis_img)

    # 4. Content recognition
    t7 = time.time()
    layout_recognition_results = rec(layout_detection_results, tmp_dir)
    t8 = time.time()

    end_time = time.time()
    logger.info(f'{pdf_path} analysis completed in {round(end_time - start_time, 3)} seconds, including {round(t2 - t1, 3)} for pdf to image, {round(t4 - t3, 3)} second for image orient classification, {round(t6 - t5, 3)} seconds for page detection, and {round(t8 - t7, 3)} seconds for layout recognition, page number: {len(filepaths)}')
    return layout_recognition_results
def pdf2markdown_pipeline(pdf_path: str):
    """Analyse one PDF in a throwaway directory and record the outcome.

    A temporary directory is created for the run and always removed
    afterwards. The outcome is stored through ``insert_pdf2md_table``
    together with the start and end timestamps of the run.

    Args:
        pdf_path: Path of the PDF to analyse; the text after the last ``/``
            is stored as the PDF name.

    Returns:
        ``(process_status, pdf_id)``: on success the status is 2 and
        ``pdf_id`` is the value returned by ``insert_pdf2md_table``; when
        the analysis raises, the status is 3 and ``pdf_id`` is ``None``.
    """
    succeeded_status = 2
    failed_status = 3

    def _timestamp():
        # Wall-clock time with microseconds, formatted for the record.
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

    pdf_name = pdf_path.rsplit('/', 1)[-1]
    started_at = _timestamp()
    work_dir = tempfile.mkdtemp()
    try:
        try:
            results = _pdf2markdown_pipeline(pdf_path, work_dir)
        except Exception:
            # Failure path: log the traceback and store a record without results.
            logger.error(f'analysis pdf error! \n{traceback.format_exc()}')
            finished_at = _timestamp()
            insert_pdf2md_table(pdf_path, pdf_name, failed_status, started_at, finished_at, None)
            return failed_status, None

        # Success path: store the results and hand back the value from the insert.
        finished_at = _timestamp()
        pdf_id = insert_pdf2md_table(pdf_path, pdf_name, succeeded_status, started_at, finished_at, results)
        return succeeded_status, pdf_id
    finally:
        # Remove the working directory whatever happened above.
        shutil.rmtree(work_dir)
# Script entry point: run the pipeline once on the bundled demo PDF.
if __name__ == '__main__':
    demo_pdf_path = '/mnt/pdf2markdown/demo.pdf'
    pdf2markdown_pipeline(demo_pdf_path)