"""Drive the PDF -> Markdown analysis pipeline and record its outcome.

The pipeline renders a PDF to page images, fixes sideways pages, runs layout
detection and content recognition, and stores the result through
``insert_pdf2md_table``.
"""
import datetime
import os
import shutil
import tempfile
import time
import traceback

from dotenv import load_dotenv

# The .env file is loaded BEFORE the third-party / project imports below,
# presumably so that those modules see the loaded variables at import time.
# Keep this ordering.
env = os.environ.get('env', 'dev')
load_dotenv(dotenv_path='.env.dev' if env == 'dev' else '.env', override=True)

import cv2  # noqa: E402
from loguru import logger  # noqa: E402

from helper.content_recognition.main import rec  # noqa: E402
from helper.db_helper import insert_pdf2md_table  # noqa: E402
from helper.image_helper import image_orient_cls, page_detection_visual, pdf2image  # noqa: E402
from helper.page_detection.main import layout_analysis  # noqa: E402

# Orientation class ids that trigger a rotation of the page image.
_SIDEWAYS_CLASS_IDS = (1, 3)
# Directory receiving the optional layout-detection debug renderings.
_VISUAL_DIR = './visual_images'
# Timestamp format handed to insert_pdf2md_table.
_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
# Status values written by pdf2markdown_pipeline on success / on failure.
_STATUS_DONE = 2
_STATUS_FAILED = 3


def _rotate_sideways_pages(orient_cls_results):
    """Rotate, in place on disk, every page classified as sideways.

    Each result is indexed as ``r[0]['class_ids'][0]`` / ``r[0]['filename']``.

    NOTE(review): classes 1 and 3 both get the same 90-degree clockwise
    rotation. If they denote opposite orientations, one of them presumably
    needs the counter-clockwise rotation -- confirm against the classifier's
    label map.
    """
    for result in orient_cls_results:
        prediction = result[0]
        if prediction['class_ids'][0] not in _SIDEWAYS_CLASS_IDS:
            continue
        image_path = prediction['filename']
        rotated = cv2.rotate(cv2.imread(image_path), cv2.ROTATE_90_CLOCKWISE)
        cv2.imwrite(image_path, rotated)


def _list_page_images(tmp_dir):
    """Return the files of *tmp_dir* as paths, ordered by numeric file stem.

    Sorting is numeric (``10`` after ``9``), so every file name must start
    with an integer followed by a dot; anything else raises ``ValueError``.
    """
    names = sorted(os.listdir(tmp_dir), key=lambda name: int(name.split('.')[0]))
    return [f'{tmp_dir}/{name}' for name in names]


def _visual_enabled():
    """Tell whether the ``VISUAL`` environment flag is set to a non-zero int.

    An unset or empty variable counts as disabled, so a missing debug flag
    cannot fail the whole analysis.
    """
    return bool(int(os.environ.get('VISUAL') or 0))


def _dump_layout_visualizations(layout_detection_results):
    """Write one ``<page>.jpg`` debug rendering per layout result.

    Stale ``.jpg`` files from an earlier run are removed first; pages are
    numbered from 1.
    """
    # The directory may not exist on a first run; listing it would raise.
    os.makedirs(_VISUAL_DIR, exist_ok=True)
    for stale in os.listdir(_VISUAL_DIR):
        if stale.endswith('.jpg'):
            os.remove(f'{_VISUAL_DIR}/{stale}')
    for page_no, page_result in enumerate(layout_detection_results, start=1):
        cv2.imwrite(f'{_VISUAL_DIR}/{page_no}.jpg', page_detection_visual(page_result))


def _pdf2markdown_pipeline(pdf_path, tmp_dir):
    """Run every analysis stage for one PDF and return the recognition result.

    Args:
        pdf_path: Path of the PDF to analyse.
        tmp_dir: Existing working directory that receives the page images.

    Returns:
        Whatever ``rec`` returns for the detected layouts.

    Raises:
        Any exception raised by a stage propagates to the caller.
    """
    started = time.time()

    # 1. pdf -> images
    pdf_start = time.time()
    pdf2image(pdf_path, tmp_dir)
    pdf_end = time.time()

    # 2. image orientation classification (the rotation itself is not timed)
    orient_start = time.time()
    orient_cls_results = image_orient_cls(tmp_dir)
    orient_end = time.time()
    _rotate_sideways_pages(orient_cls_results)

    filepaths = _list_page_images(tmp_dir)

    # 3. layout analysis
    layout_start = time.time()
    layout_detection_results = layout_analysis(filepaths)
    layout_end = time.time()

    # 3.1 optional debug visualisation
    if _visual_enabled():
        _dump_layout_visualizations(layout_detection_results)

    # 4. content recognition
    rec_start = time.time()
    layout_recognition_results = rec(layout_detection_results, tmp_dir)
    rec_end = time.time()

    finished = time.time()
    logger.info(
        f'{pdf_path} analysis completed in {round(finished - started, 3)} seconds, '
        f'including {round(pdf_end - pdf_start, 3)} for pdf to image, '
        f'{round(orient_end - orient_start, 3)} second for image orient classification, '
        f'{round(layout_end - layout_start, 3)} seconds for page detection, '
        f'and {round(rec_end - rec_start, 3)} seconds for layout recognition, '
        f'page number: {len(filepaths)}'
    )
    return layout_recognition_results


def _timestamp():
    """Return the current local time formatted for the database record."""
    return datetime.datetime.now().strftime(_TIMESTAMP_FORMAT)


def pdf2markdown_pipeline(pdf_path: str):
    """Analyse one PDF and record the outcome through ``insert_pdf2md_table``.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        ``(process_status, pdf_id)``: ``(2, <value returned by
        insert_pdf2md_table>)`` on success, ``(3, None)`` when the analysis
        raised (the traceback is logged and a failure row is still written).

    Raises:
        Exceptions raised by ``insert_pdf2md_table`` itself are not caught.
    """
    pdf_name = os.path.basename(pdf_path)
    start_time = _timestamp()
    tmp_dir = tempfile.mkdtemp()
    try:
        results = _pdf2markdown_pipeline(pdf_path, tmp_dir)
    except Exception:
        logger.error(f'analysis pdf error! \n{traceback.format_exc()}')
        process_status = _STATUS_FAILED
        insert_pdf2md_table(pdf_path, pdf_name, process_status, start_time, _timestamp(), None)
        pdf_id = None
    else:
        process_status = _STATUS_DONE
        pdf_id = insert_pdf2md_table(
            pdf_path, pdf_name, process_status, start_time, _timestamp(), results
        )
    finally:
        # Best-effort cleanup: a failure to delete scratch files must not
        # replace the already-recorded outcome with an exception.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    return process_status, pdf_id


if __name__ == '__main__':
    pdf2markdown_pipeline('/mnt/pdf2markdown/demo.pdf')