from dotenv import load_dotenv
import os
from loguru import logger
# Pick the runtime environment and load its dotenv file before the helper
# modules below are imported. override=True lets values from the file replace
# variables that are already set in the process environment.
env = os.environ.get('env', 'dev')
logger.info(f' Configure using this environment: {env} ')
# 'dev' reads .env.dev; any other value falls back to .env.
load_dotenv(dotenv_path='.env.dev' if env == 'dev' else '.env', override=True)
import time
import traceback
import cv2
from helper . image_helper import pdf2image , image_orient_cls , page_detection_visual
from helper . page_detection . main import layout_analysis
from helper . content_recognition . main import rec
from helper . db_helper import insert_pdf2md_table
import datetime
from tqdm import tqdm
from helper . constants import PDFAnalysisStatus
def _pdf2markdown_pipeline(pdf_path, visual):
    """Run every analysis stage on one PDF and return the recognition output.

    Stages, in order: convert the PDF to page images, classify the
    orientation of each page and rotate the pages reported as 90/180/270,
    run layout analysis, optionally write one layout visualisation per page,
    then run content recognition on the layout results.

    Args:
        pdf_path: Path of the PDF; passed unchanged to ``pdf2image``.
        visual: When truthy, clear the ``.jpg`` files in ``./visual_images``
            and write one numbered JPEG per layout result there.

    Returns:
        The value returned by ``rec`` for the layout analysis results.
    """
    pipeline_start = time.time()

    # 1. PDF -> page images
    stage_start = time.time()
    images = pdf2image(pdf_path)
    render_secs = time.time() - stage_start

    # 2. Page orientation classification
    stage_start = time.time()
    angles = image_orient_cls(images)
    orient_secs = time.time() - stage_start

    # Rotate pages whose reported angle is 90, 180 or 270; any other value
    # (including 0) leaves the page untouched. The list is updated in place.
    for idx, angle in enumerate(angles):
        if angle == 90:
            rotate_code = cv2.ROTATE_90_COUNTERCLOCKWISE
        elif angle == 180:
            rotate_code = cv2.ROTATE_180
        elif angle == 270:
            rotate_code = cv2.ROTATE_90_CLOCKWISE
        else:
            continue
        images[idx] = cv2.rotate(images[idx], rotate_code)

    # 3. Layout analysis
    stage_start = time.time()
    layout_detection_results = layout_analysis(images)
    layout_secs = time.time() - stage_start

    # 3.1 Layout analysis visualisation
    if visual:
        visual_dir = './visual_images'
        # Create the directory on first use so listdir/imwrite cannot fail
        # just because no earlier run has produced it.
        os.makedirs(visual_dir, exist_ok=True)
        # Drop JPEGs left by a previous run so stale pages do not linger.
        for stale_name in os.listdir(visual_dir):
            if stale_name.endswith('.jpg'):
                os.remove(os.path.join(visual_dir, stale_name))
        # Output files are numbered from 1.
        pages = tqdm(layout_detection_results, ' 版面分析可视化结果 ')
        for page_no, page_layout in enumerate(pages, start=1):
            vis_img = page_detection_visual(page_layout)
            cv2.imwrite(os.path.join(visual_dir, f'{page_no}.jpg'), vis_img)

    # 4. Content recognition
    stage_start = time.time()
    layout_recognition_results = rec(layout_detection_results)
    recognition_secs = time.time() - stage_start

    total_secs = time.time() - pipeline_start
    logger.info(
        f' {pdf_path} analysis completed in {round(total_secs, 3)} seconds,'
        f' including {round(render_secs, 3)} for pdf to image,'
        f' {round(orient_secs, 3)} second for image orient classification,'
        f' {round(layout_secs, 3)} seconds for page detection,'
        f' and {round(recognition_secs, 3)} seconds for layout recognition,'
        f' page number: {len(images)} '
    )
    return layout_recognition_results
def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True):
    """Analyse one PDF, optionally record the run, and report its status.

    Any exception raised by the analysis is logged with its traceback and
    turned into a failure status instead of propagating to the caller.

    Args:
        pdf_path: Path of the PDF to analyse.
        visual: Forwarded to ``_pdf2markdown_pipeline``.
        insert_db: When truthy, record the run through
            ``insert_pdf2md_table`` (with ``None`` as the results on failure).

    Returns:
        A ``(process_status, pdf_id)`` tuple. ``process_status`` is the value
        of ``PDFAnalysisStatus.SUCCESS`` or ``PDFAnalysisStatus.FAIL``.
        ``pdf_id`` is whatever ``insert_pdf2md_table`` returned for a
        successful run that was recorded, and ``None`` in every other case.
    """
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    pdf_name = os.path.basename(pdf_path)
    start_time = datetime.datetime.now().strftime(time_format)

    try:
        results = _pdf2markdown_pipeline(pdf_path, visual)
    except Exception:
        # Top-level boundary: keep the traceback in the log and report the
        # failure through the returned status rather than re-raising.
        logger.error(f' analysis pdf error! \n {traceback.format_exc()} ')
        succeeded = False
        results = None
        process_status = PDFAnalysisStatus.FAIL.value
    else:
        succeeded = True
        process_status = PDFAnalysisStatus.SUCCESS.value
    end_time = datetime.datetime.now().strftime(time_format)

    # Bound up front so the return below is valid even when nothing is
    # inserted (insert_db is false) or the analysis failed.
    pdf_id = None
    if insert_db:
        inserted_id = insert_pdf2md_table(
            pdf_path, pdf_name, process_status, start_time, end_time, results
        )
        # A failed run is still recorded, but its id is not handed back.
        if succeeded:
            pdf_id = inserted_id
    return process_status, pdf_id
if __name__ == '__main__':
    # Manual run on one sample report, with layout visualisation enabled and
    # insert_db=True so the outcome is passed to insert_pdf2md_table.
    pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力: 2021年年度报告.PDF', visual=True, insert_db=True)
    # Other sample inputs; swap in as needed.
    # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力: 2022年年度报告.PDF', visual=True, insert_db=True)
    # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力: 2023年年度报告.PDF', visual=True, insert_db=True)