from dotenv import load_dotenv
import os
# Pick the dotenv file from the `env` environment variable (default 'dev') and
# load it with override=True so values from the file replace any already set.
# NOTE(review): this runs before the helper imports below, presumably because
# those modules read their configuration at import time — confirm.
env = os.environ.get('env', 'dev')
load_dotenv(dotenv_path='.env.dev' if env == 'dev' else '.env', override=True)
import time
import traceback
import cv2
from helper . image_helper import pdf2image , image_orient_cls , page_detection_visual
from helper . page_detection . main import layout_analysis
from helper . content_recognition . main import rec
from helper . db_helper import insert_pdf2md_table
import tempfile
from loguru import logger
import datetime
import shutil
def _pdf2markdown_pipeline(pdf_path, tmp_dir):
    """Run the pdf -> markdown stages on one PDF and return the recognition results.

    Stages, in order:

    1. ``pdf2image`` converts the PDF into page images under ``tmp_dir``.
    2. ``image_orient_cls`` classifies the images; pages whose class id is
       1 or 3 are rotated 90 degrees clockwise and written back in place.
    3. ``layout_analysis`` runs on the page images, ordered by the integer
       in their file name.
    4. ``rec`` runs content recognition on the detected layouts.

    When the ``VISUAL`` environment variable holds a non-zero integer, one
    debug image per layout result is written to ``./visual_images``.
    Per-stage wall-clock timings are logged once at the end.

    Args:
        pdf_path: Path of the PDF to analyse.
        tmp_dir: Existing scratch directory for the page images. Every entry
            in it is assumed to be named ``<integer>.<ext>`` after step 1 —
            TODO confirm against ``pdf2image``.

    Returns:
        The value returned by ``rec`` for the detected layouts.

    Raises:
        ValueError: If a file in ``tmp_dir`` does not start with an integer,
            or ``VISUAL`` is set to something that is not an integer.
        Exception: Anything raised by the helper stages propagates unchanged.
    """
    start_time = time.time()

    # 1. pdf -> images
    t1 = time.time()
    pdf2image(pdf_path, tmp_dir)
    t2 = time.time()

    # 2. Image orientation classification
    t3 = time.time()
    orient_cls_results = image_orient_cls(tmp_dir)
    t4 = time.time()
    for r in orient_cls_results:
        clsid = r[0]['class_ids'][0]
        filename = r[0]['filename']
        # NOTE(review): classes 1 and 3 both get the same 90-degree clockwise
        # turn and class 2 is left untouched — confirm this matches the
        # classifier's label meaning.
        if clsid in (1, 3):
            img = cv2.imread(filename)
            img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
            cv2.imwrite(filename, img)

    # Order pages numerically ("10.jpg" after "9.jpg"), then build full paths.
    page_names = sorted(os.listdir(tmp_dir), key=lambda name: int(name.split('.')[0]))
    filepaths = [f'{tmp_dir}/{name}' for name in page_names]

    # 3. Layout analysis
    t5 = time.time()
    layout_detection_results = layout_analysis(filepaths)
    t6 = time.time()

    # 3.1 Optional debug visualisation; an unset VISUAL means "disabled".
    if int(os.environ.get('VISUAL', '0')):
        visual_dir = './visual_images'
        # Create the directory on first use so listing it cannot fail.
        os.makedirs(visual_dir, exist_ok=True)
        # Drop images left over from a previous run.
        for f in os.listdir(visual_dir):
            if f.endswith('.jpg'):
                os.remove(f'{visual_dir}/{f}')
        for page_no, layout in enumerate(layout_detection_results, start=1):
            vis_img = page_detection_visual(layout)
            cv2.imwrite(f'{visual_dir}/{page_no}.jpg', vis_img)

    # 4. Content recognition
    t7 = time.time()
    layout_recognition_results = rec(layout_detection_results, tmp_dir)
    t8 = time.time()

    end_time = time.time()
    logger.info(
        f' {pdf_path} analysis completed in {round(end_time - start_time, 3)} seconds, '
        f'including {round(t2 - t1, 3)} for pdf to image, '
        f'{round(t4 - t3, 3)} second for image orient classification, '
        f'{round(t6 - t5, 3)} seconds for page detection, '
        f'and {round(t8 - t7, 3)} seconds for layout recognition, '
        f'page number: {len(filepaths)} '
    )
    return layout_recognition_results
def pdf2markdown_pipeline(pdf_path: str):
    """Analyse one PDF, record the outcome via ``insert_pdf2md_table``, and clean up.

    A fresh temporary directory is created for the run and always removed
    afterwards, whether the analysis succeeds or fails.

    Args:
        pdf_path: Path of the PDF to process; the part after the last ``/``
            is stored as the PDF name.

    Returns:
        A ``(process_status, pdf_id)`` tuple. On success ``process_status`` is
        2 and ``pdf_id`` is the value returned by ``insert_pdf2md_table``. If
        the analysis raises, the traceback is logged, a row with status 3 and
        no results is inserted, and ``pdf_id`` is ``None``.

    Raises:
        Exception: Errors from ``insert_pdf2md_table`` or from removing the
            temporary directory are not caught here.
    """
    # Timestamp layout used for both the start and end time passed to the insert call.
    time_format = '%Y-%m-%d %H:%M:%S.%f'

    pdf_name = pdf_path.split('/')[-1]
    start_time = datetime.datetime.now().strftime(time_format)
    process_status = 0
    tmp_dir = tempfile.mkdtemp()
    try:
        results = _pdf2markdown_pipeline(pdf_path, tmp_dir)
    except Exception:
        # Top-level boundary: log the failure and still record the attempt.
        logger.error(f' analysis pdf error! \n {traceback.format_exc()} ')
        process_status = 3
        end_time = datetime.datetime.now().strftime(time_format)
        insert_pdf2md_table(pdf_path, pdf_name, process_status, start_time, end_time, None)
        pdf_id = None
    else:
        process_status = 2
        end_time = datetime.datetime.now().strftime(time_format)
        pdf_id = insert_pdf2md_table(pdf_path, pdf_name, process_status, start_time, end_time, results)
    finally:
        shutil.rmtree(tmp_dir)
    return process_status, pdf_id
if __name__ == '__main__':
    # Manual run against a hard-coded sample PDF when executed as a script.
    pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力: 2023年年度审计报告.PDF')