|
|
|
@ -1,7 +1,8 @@
|
|
|
|
|
from typing import List
|
|
|
|
|
import cv2
|
|
|
|
|
from .utils import scanning_document_classify, text_rec, table_rec, scanning_document_rec, markdown_rec, assign_tables_to_titles, remove_watermark
|
|
|
|
|
from .utils import scanning_document_classify, table_rec, scanning_document_rec, markdown_rec, assign_tables_to_titles, remove_watermark
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
from ..image_helper import text_rec
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LayoutRecognitionResult(object):
|
|
|
|
@ -60,18 +61,18 @@ def rec(page_detection_results, tmp_dir) -> List[List[LayoutRecognitionResult]]:
|
|
|
|
|
# 扫描件
|
|
|
|
|
is_scanning_document = True
|
|
|
|
|
content, layout_img = scanning_document_rec(layout_img)
|
|
|
|
|
source_page_no_watermark_img = remove_watermark(cv2.imread(f'{tmp_dir}/{page_idx + 1}.jpg'))
|
|
|
|
|
source_page_unwatermarked_img = remove_watermark(cv2.imread(f'{tmp_dir}/{page_idx + 1}.jpg'))
|
|
|
|
|
elif layout.clsid == 4:
|
|
|
|
|
# table
|
|
|
|
|
if scanning_document_classify(layout_img):
|
|
|
|
|
is_scanning_document = True
|
|
|
|
|
content, layout_img = scanning_document_rec(layout_img)
|
|
|
|
|
source_page_no_watermark_img = remove_watermark(cv2.imread(f'{tmp_dir}/{page_idx + 1}.jpg'))
|
|
|
|
|
source_page_unwatermarked_img = remove_watermark(cv2.imread(f'{tmp_dir}/{page_idx + 1}.jpg'))
|
|
|
|
|
else:
|
|
|
|
|
content = table_rec(layout_img)
|
|
|
|
|
elif layout.clsid == 5:
|
|
|
|
|
# table caption
|
|
|
|
|
ocr_results = text_rec(layout_img)
|
|
|
|
|
_, ocr_results, _ = text_rec(layout_img)
|
|
|
|
|
content = ''
|
|
|
|
|
for o in ocr_results:
|
|
|
|
|
content += f'{o}\n'
|
|
|
|
@ -81,25 +82,26 @@ def rec(page_detection_results, tmp_dir) -> List[List[LayoutRecognitionResult]]:
|
|
|
|
|
if not content:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
content = content.replace('\\', '')
|
|
|
|
|
result = LayoutRecognitionResult(layout.clsid, content, layout.pos)
|
|
|
|
|
outputs.append(result)
|
|
|
|
|
|
|
|
|
|
if is_scanning_document and len(outputs) == 1:
|
|
|
|
|
# 扫描件额外提取标题
|
|
|
|
|
h, w = source_page_no_watermark_img.shape[:2]
|
|
|
|
|
h, w = source_page_unwatermarked_img.shape[:2]
|
|
|
|
|
if h > w:
|
|
|
|
|
title_img = source_page_no_watermark_img[:360, :w, ...]
|
|
|
|
|
title_img = source_page_unwatermarked_img[:360, :w, ...]
|
|
|
|
|
|
|
|
|
|
# cv2.imwrite(f'/mnt/pdf2markdown/temp/{page_idx + 1}.jpg', title_img)
|
|
|
|
|
# vis = cv2.rectangle(source_page_no_watermark_img.copy(), (0, 0), (w, 360), (255, 255, 0), 3)
|
|
|
|
|
# vis = cv2.rectangle(source_page_unwatermarked_img.copy(), (0, 0), (w, 360), (255, 255, 0), 3)
|
|
|
|
|
# cv2.imwrite(f'/mnt/pdf2markdown/temp/{page_idx + 1}-vis.jpg', vis)
|
|
|
|
|
else:
|
|
|
|
|
title_img = source_page_no_watermark_img[:410, :w, ...]
|
|
|
|
|
title_img = source_page_unwatermarked_img[:410, :w, ...]
|
|
|
|
|
|
|
|
|
|
# cv2.imwrite(f'/mnt/pdf2markdown/temp/{page_idx + 1}.jpg', title_img)
|
|
|
|
|
# vis = cv2.rectangle(source_page_no_watermark_img.copy(), (0, 310), (w, 410), (255, 255, 0), 3)
|
|
|
|
|
# vis = cv2.rectangle(source_page_unwatermarked_img.copy(), (0, 310), (w, 410), (255, 255, 0), 3)
|
|
|
|
|
# cv2.imwrite(f'/mnt/pdf2markdown/temp/{page_idx + 1}-vis.jpg', vis)
|
|
|
|
|
title = text_rec(title_img)
|
|
|
|
|
_, title, _ = text_rec(title_img)
|
|
|
|
|
outputs[0].table_title = '\n'.join(title)
|
|
|
|
|
else:
|
|
|
|
|
# 自动给表格分配距离它最近的标题
|
|
|
|
|