|
|
from dotenv import load_dotenv
|
|
|
import os
|
|
|
from loguru import logger
|
|
|
|
|
|
# Pick the dotenv file from the ``env`` environment variable (defaults to
# 'dev') and load it, overriding variables that are already set.
# NOTE(review): this runs before the imports that follow it in the file,
# presumably so those modules see the loaded values — confirm before moving it.
env = os.environ.get('env', 'dev')
if env == 'dev':
    dotenv_path = '.env.dev'
else:
    dotenv_path = '.env'
logger.info(f'Configure using this dotenv path: {dotenv_path}')
load_dotenv(dotenv_path=dotenv_path, override=True)
|
|
|
|
|
|
import time
|
|
|
import traceback
|
|
|
import cv2
|
|
|
from helper.image_helper import pdf2image, image_orient_cls, page_detection_visual
|
|
|
from helper.page_detection.main import layout_analysis
|
|
|
from helper.content_recognition.main import rec
|
|
|
from helper.db_helper import insert_pdf2md_table
|
|
|
import datetime
|
|
|
from tqdm import tqdm
|
|
|
from helper.constants import PDFAnalysisStatus
|
|
|
|
|
|
|
|
|
def _pdf2markdown_pipeline(pdf_path, visual):
    """Run every analysis stage on one PDF and return the recognition results.

    The stages are: convert the PDF to images, classify the orientation of
    each image and rotate those reported as 90/180/270, run layout analysis,
    optionally write layout visualisations to ``./visual_images``, and run
    content recognition on the layout results. Stage timings are logged.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``pdf2image``.
        visual: When truthy, the ``.jpg`` files in ``./visual_images`` are
            replaced by one visualisation per layout result, named
            ``1.jpg``, ``2.jpg``, ... in result order.

    Returns:
        The value returned by ``rec`` for the layout analysis results.

    Raises:
        Nothing is caught here; any exception raised by a stage propagates
        to the caller.
    """
    start_time = time.time()

    # 1. PDF -> images
    pdf2image_start = time.time()
    images = pdf2image(pdf_path)
    pdf2image_end = time.time()

    # 2. Image orientation classification
    orient_start = time.time()
    angles = image_orient_cls(images)
    orient_end = time.time()

    # Rotate images according to the reported angle. Angles other than
    # 90/180/270 (including 0) leave the image untouched. The rotation is not
    # part of the orientation timing above.
    # NOTE(review): the angle -> rotation direction mapping depends on what
    # image_orient_cls reports — confirm against that helper.
    for index, angle in enumerate(angles):
        if angle == 90:
            images[index] = cv2.rotate(images[index], cv2.ROTATE_90_COUNTERCLOCKWISE)
        elif angle == 180:
            images[index] = cv2.rotate(images[index], cv2.ROTATE_180)
        elif angle == 270:
            images[index] = cv2.rotate(images[index], cv2.ROTATE_90_CLOCKWISE)

    # 3. Layout analysis
    layout_start = time.time()
    layout_detection_results = layout_analysis(images)
    layout_end = time.time()

    # 3.1 Layout analysis visualisation
    if visual:
        visual_dir = './visual_images'
        # Create the directory on first use; os.listdir raises
        # FileNotFoundError when it is missing.
        os.makedirs(visual_dir, exist_ok=True)
        # Drop visualisations left over from a previous run.
        for file_name in os.listdir(visual_dir):
            if file_name.endswith('.jpg'):
                os.remove(f'{visual_dir}/{file_name}')
        progress = tqdm(layout_detection_results, '版面分析可视化结果')
        for number, layout_result in enumerate(progress, start=1):
            vis_img = page_detection_visual(layout_result)
            cv2.imwrite(f'{visual_dir}/{number}.jpg', vis_img)

    # 4. Content recognition
    rec_start = time.time()
    layout_recognition_results = rec(layout_detection_results)
    rec_end = time.time()

    end_time = time.time()

    logger.info(
        f'{pdf_path} analysis completed in {round(end_time - start_time, 3)} seconds, '
        f'including {round(pdf2image_end - pdf2image_start, 3)} for pdf to image, '
        f'{round(orient_end - orient_start, 3)} second for image orient classification, '
        f'{round(layout_end - layout_start, 3)} seconds for page detection, '
        f'and {round(rec_end - rec_start, 3)} seconds for layout recognition, '
        f'page number: {len(images)}'
    )

    return layout_recognition_results
|
|
|
|
|
|
|
|
|
def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True):
    """Analyse one PDF, optionally record the outcome, and report its status.

    Wraps ``_pdf2markdown_pipeline``: any ``Exception`` it raises is logged
    with its traceback and turned into a failure status instead of
    propagating. When ``insert_db`` is truthy the outcome is written with
    ``insert_pdf2md_table`` (with ``None`` as the results on failure).

    Args:
        pdf_path: Path of the PDF; the text after the last ``/`` is used as
            the PDF name.
        visual: Forwarded to ``_pdf2markdown_pipeline``.
        insert_db: Whether to call ``insert_pdf2md_table`` with the outcome.

    Returns:
        A ``(process_status, pdf_id)`` tuple. ``process_status`` is
        ``PDFAnalysisStatus.SUCCESS.value`` or ``PDFAnalysisStatus.FAIL.value``.
        ``pdf_id`` is the value returned by ``insert_pdf2md_table`` when the
        analysis succeeded and ``insert_db`` is truthy, otherwise ``None``.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
    file_name = pdf_path.rpartition('/')[2]
    started_at = datetime.datetime.now().strftime(timestamp_format)

    try:
        results = _pdf2markdown_pipeline(pdf_path, visual)
    except Exception:
        logger.error(f'analysis pdf error! \n{traceback.format_exc()}')
        failure_status = PDFAnalysisStatus.FAIL.value
        finished_at = datetime.datetime.now().strftime(timestamp_format)
        if insert_db:
            # The return value of the failure insert is not used: callers
            # get no pdf_id for a failed analysis.
            insert_pdf2md_table(pdf_path, file_name, failure_status, started_at, finished_at, None)
        return failure_status, None

    success_status = PDFAnalysisStatus.SUCCESS.value
    finished_at = datetime.datetime.now().strftime(timestamp_format)
    if not insert_db:
        return success_status, None
    stored_id = insert_pdf2md_table(pdf_path, file_name, success_status, started_at, finished_at, results)
    return success_status, stored_id
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual run: analyse one sample report with layout visualisation turned
    # on and without writing anything to the database.
    # Other sample reports that can be swapped in:
    #   /mnt/pdf2markdown/龙源电力:2022年年度报告.PDF
    #   /mnt/pdf2markdown/龙源电力:2023年年度报告.PDF
    sample_pdf = '/mnt/pdf2markdown/龙源电力:2021年年度报告.PDF'
    pdf2markdown_pipeline(sample_pdf, visual=True, insert_db=False)
|