"""Command-line driver for the PDF -> markdown analysis pipeline.

The pipeline renders a PDF to page images, corrects page orientation, runs
layout detection and then content recognition, and optionally records the
outcome through ``insert_pdf2md_table``.
"""
from dotenv import load_dotenv
import os
from loguru import logger

# Pick the dotenv file from the `env` environment variable (defaults to 'dev').
env = os.environ.get('env', 'dev')
dotenv_path = '.env.dev' if env == 'dev' else '.env'
logger.info(f'Configure using this dotenv path: {dotenv_path}')
load_dotenv(dotenv_path=dotenv_path, override=True)

# NOTE(review): the imports below intentionally stay after load_dotenv();
# presumably the helper modules read configuration from the environment at
# import time -- confirm before moving them to the top of the file.
import time
import traceback
import cv2
from helper.image_helper import pdf2image, image_orient_cls, page_detection_visual
from helper.page_detection.main import layout_analysis
from helper.content_recognition.main import rec
from helper.db_helper import insert_pdf2md_table
import datetime
from tqdm import tqdm
from helper.constants import PDFAnalysisStatus

_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
_VISUAL_DIR = './visual_images'


def _timestamp():
    """Return the current local time formatted with microsecond precision."""
    return datetime.datetime.now().strftime(_TIMESTAMP_FORMAT)


def _rotate_by_detected_angle(images, angles):
    """Rotate each page image in place according to its classified angle.

    Angles of 90, 180 and 270 trigger a rotation; 0 and any other value
    leave the image untouched.
    """
    for i, angle in enumerate(angles):
        if angle == 90:
            images[i] = cv2.rotate(images[i], cv2.ROTATE_90_COUNTERCLOCKWISE)
        elif angle == 180:
            images[i] = cv2.rotate(images[i], cv2.ROTATE_180)
        elif angle == 270:
            images[i] = cv2.rotate(images[i], cv2.ROTATE_90_CLOCKWISE)


def _write_layout_visuals(layout_detection_results, visual_dir=_VISUAL_DIR):
    """Render every layout detection result to ``<visual_dir>/<page>.jpg``.

    Existing ``.jpg`` files in the directory are removed first so that
    images from a previous run do not linger.
    """
    # Create the directory on first use; os.listdir() would otherwise raise
    # FileNotFoundError and the whole analysis would be reported as failed.
    os.makedirs(visual_dir, exist_ok=True)
    for f in os.listdir(visual_dir):
        if f.endswith('.jpg'):
            os.remove(f'{visual_dir}/{f}')

    progress = tqdm(layout_detection_results, '版面分析可视化结果')
    for page_no, detection in enumerate(progress, start=1):
        vis_img = page_detection_visual(detection)
        cv2.imwrite(f'{visual_dir}/{page_no}.jpg', vis_img)


def _pdf2markdown_pipeline(pdf_path, visual):
    """Run the full analysis on one PDF and return the recognition results.

    Args:
        pdf_path: Path of the PDF file to analyse.
        visual: When truthy, write layout detection visualizations to
            ``./visual_images``.

    Returns:
        Whatever ``rec`` returns for the layout detection results.
    """
    start_time = time.time()

    # 1. pdf -> images
    t1 = time.time()
    images = pdf2image(pdf_path)
    t2 = time.time()

    # 2. Image orientation classification
    t3 = time.time()
    angles = image_orient_cls(images)
    t4 = time.time()
    _rotate_by_detected_angle(images, angles)

    # 3. Layout analysis
    t5 = time.time()
    layout_detection_results = layout_analysis(images)
    t6 = time.time()

    # 3.1 Layout analysis visualization (not included in the step timings)
    if visual:
        _write_layout_visuals(layout_detection_results)

    # 4. Content recognition
    t7 = time.time()
    layout_recognition_results = rec(layout_detection_results)
    t8 = time.time()

    end_time = time.time()
    logger.info(f'{pdf_path} analysis completed in {round(end_time - start_time, 3)} seconds, including {round(t2 - t1, 3)} for pdf to image, {round(t4 - t3, 3)} second for image orient classification, {round(t6 - t5, 3)} seconds for page detection, and {round(t8 - t7, 3)} seconds for layout recognition, page number: {len(images)}')
    return layout_recognition_results


def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True):
    """Analyse a PDF, log any failure, and optionally record the outcome.

    Args:
        pdf_path: Path of the PDF file to analyse.
        visual: Forwarded to the pipeline; enables layout visualizations.
        insert_db: When truthy, call ``insert_pdf2md_table`` with the status,
            start/end timestamps and the results (``None`` on failure).

    Returns:
        A ``(process_status, pdf_id)`` tuple. ``process_status`` is the
        ``PDFAnalysisStatus`` SUCCESS or FAIL value. ``pdf_id`` is the value
        returned by ``insert_pdf2md_table`` when the analysis succeeded and
        ``insert_db`` is truthy, otherwise ``None``.
    """
    pdf_name = os.path.basename(pdf_path)
    start_time = _timestamp()

    results = None
    try:
        results = _pdf2markdown_pipeline(pdf_path, visual)
    except Exception:
        # Top-level boundary: report the failure instead of crashing the caller.
        logger.error(f'analysis pdf error! \n{traceback.format_exc()}')
        succeeded = False
        process_status = PDFAnalysisStatus.FAIL.value
    else:
        succeeded = True
        process_status = PDFAnalysisStatus.SUCCESS.value
    end_time = _timestamp()

    pdf_id = None
    if insert_db:
        inserted_id = insert_pdf2md_table(pdf_path, pdf_name, process_status, start_time, end_time, results)
        # The id is only reported to the caller for successful analyses.
        if succeeded:
            pdf_id = inserted_id
    return process_status, pdf_id


if __name__ == '__main__':
    insert_db = False
    pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2021年年度报告.PDF', visual=True, insert_db=insert_db)
    # Alternate sample inputs:
    # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2022年年度报告.PDF', visual=True, insert_db=insert_db)
    # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2023年年度报告.PDF', visual=True, insert_db=insert_db)