from typing import List
from .utils import table_rec, scanning_document_rec, markdown_rec, assign_tables_to_titles
from tqdm import tqdm
from ..image_helper import text_rec
from ..page_detection.utils import PageDetectionResult
from ..constants import PageDetectionEnum as E


class LayoutRecognitionResult:
    """Recognised content of one detected layout region.

    Attributes are stored exactly as passed in:
        clsid: class id of the region (compared against ``PageDetectionEnum``
            values by ``rec``; ``rec`` later rewrites table ids to ``1``).
        content: recognised content of the region.
        box: position of the region (``rec`` passes ``layout.pos``).
        table_title: optional title associated with the region; defaults to
            ``None``.
    """

    def __init__(self, clsid, content, box, table_title=None):
        self.clsid = clsid
        self.content = content
        self.box = box
        self.table_title = table_title

    def __repr__(self):
        return f"[{self.clsid}] {self.content}"


def _caption_text(region_img):
    """OCR a table-caption crop and return its lines separated by newlines.

    Every OCR item is rendered on its own line and all trailing newline
    characters are removed from the final string.
    """
    _, ocr_lines, _ = text_rec(region_img)
    return ''.join(f'{line}\n' for line in ocr_lines).rstrip('\n')


def _recognize_region(clsid, region_img):
    """Run the recogniser that matches ``clsid`` on ``region_img``.

    Returns the recogniser's output, or ``None`` when ``clsid`` is not one of
    the handled classes (text, table, scanned document, table caption).
    """
    if clsid == E.TEXT.value:
        return markdown_rec(region_img)
    if clsid == E.TABLE.value:
        return table_rec(region_img)
    if clsid == E.SCANNED_DOCUMENT.value:
        return scanning_document_rec(region_img)
    if clsid == E.TABLE_CAPTION.value:
        return _caption_text(region_img)
    return None


def rec(page_detection_results: List[PageDetectionResult]) -> List[List[LayoutRecognitionResult]]:
    """Recognise the content of every detected layout region, page by page.

    For each page, every box is cropped out of the page image, dispatched to
    the recogniser for its class, and wrapped in a ``LayoutRecognitionResult``.
    Regions whose content is empty (or whose class is not handled) are
    skipped. Afterwards tables are paired with titles via
    ``assign_tables_to_titles``, caption entries are dropped, and table class
    ids are rewritten to the database enum value.

    Args:
        page_detection_results: detection results, one per page; each is
            expected to expose ``boxes`` and ``image``.

    Returns:
        One list of recognition results per input page, in input order. Pages
        without boxes yield an empty list.
    """
    # Enum value used for tables in the database (1 = table).
    db_table_clsid = 1
    caption_clsid = E.TABLE_CAPTION.value
    table_clsid = E.TABLE.value

    recognized_pages = []
    for page in tqdm(page_detection_results, '文本识别'):
        if not page.boxes:
            recognized_pages.append([])
            continue

        page_img = page.image
        regions = []
        for layout in page.boxes:
            left, top, right, bottom = layout.pos
            left, top, right, bottom = map(int, (left, top, right, bottom))
            # Rows are indexed first, then columns
            # (presumably a numpy-style image array -- TODO confirm).
            region_img = page_img[top: bottom, left: right]

            content = _recognize_region(layout.clsid, region_img)
            if not content:
                continue
            # Strip every backslash from the recognised content.
            content = content.replace('\\', '')
            regions.append(LayoutRecognitionResult(layout.clsid, content, layout.pos))

        # NOTE: a disabled experiment used to sit here that, for a page made of
        # a single scanned-document region, OCR'd a strip from the top of the
        # crop and stored it as ``table_title``. It is not active.

        # Automatically give each table the title closest to it.
        assign_tables_to_titles(regions)

        # Caption entries are no longer needed once titles are assigned.
        regions = [item for item in regions if item.clsid != caption_clsid]

        # Convert table class ids to the database enum (1 = table).
        for item in regions:
            if item.clsid == table_clsid:
                item.clsid = db_table_clsid

        recognized_pages.append(regions)

    return recognized_pages