from typing import List from .pdf_detection import Pipeline from utils import non_max_suppression, merge_text_and_title_boxes, LayoutBox, PageDetectionResult from tqdm import tqdm from ..constants import PageDetectionEnum as E from ..image_helper import remove_watermark """ 0 - Text 1 - Title 2 - Figure 3 - Figure caption 4 - Table 5 - Table caption 6 - Header 7 - Footer 8 - Reference 9 - Equation 使用训练后的权重时,id需要+1,即TEXT从1开始 """ pipeline = Pipeline('./models/PaddleDetection/inference_model/picodet_lcnet_x1_0_fgd_layout_cdla_infer_v2') effective_labels = [E.TEXT.value, E.TITLE.value, E.TABLE.value, E.TABLE_CAPTION.value, E.SCANNED_DOCUMENT.value] # nms优先级,索引越低优先级越低 label_scores = [E.TITLE.value, E.TABLE_CAPTION.value, E.TEXT.value, E.TABLE.value, E.SCANNED_DOCUMENT.value] expand_pixel = 10 def layout_analysis(images) -> List[PageDetectionResult]: layout_analysis_results = [] for image in tqdm(images, '版面分析'): page_detecion_outputs = pipeline(image) layout_boxes = [] is_scanned_document = False for o in page_detecion_outputs: clsid, box, confidence = o if clsid in effective_labels: layout_boxes.append(LayoutBox(clsid, box, confidence)) if clsid == E.SCANNED_DOCUMENT.value: is_scanned_document = True image = remove_watermark(image) if is_scanned_document: # 扫描件需要去水印后重新进行版面分析来识别出标题,因为训练的图片是去水印之后的 _page_detecion_outputs = pipeline(image) for o in _page_detecion_outputs: clsid, box, confidence = o if clsid == E.TABLE_CAPTION.value: layout_boxes.append(LayoutBox(clsid, box, confidence)) page_detecion_outputs = PageDetectionResult(layout_boxes, image) scores = [] poses = [] for box in page_detecion_outputs.boxes: # 相同的label重叠时,保留面积更大的 area = (box.pos[3] - box.pos[1]) * (box.pos[2] - box.pos[0]) area_score = area / 5000000 scores.append(label_scores.index(box.clsid) + area_score) poses.append(box.pos) indices = non_max_suppression(poses, scores, 0.2) _boxes = [] for i in indices: _boxes.append(page_detecion_outputs.boxes[i]) page_detecion_outputs.boxes = _boxes if not is_scanned_document: for i in range(len(page_detecion_outputs.boxes) - 1, -1, -1): # 移除Table box和Figure box中的Table caption box和Text box (有些扫描件会被识别为Figure) box = page_detecion_outputs.boxes[i] if box.clsid in (E.TEXT.value, E.TABLE_CAPTION.value): for _box in page_detecion_outputs.boxes: if _box.clsid != E.FIGURE.value and _box.clsid != E.TABLE.value: continue if box.pos[0] > _box.pos[0] and box.pos[1] > _box.pos[1] and box.pos[2] < _box.pos[2] and box.pos[3] < _box.pos[3]: page_detecion_outputs.boxes.remove(box) # 将text和title合并起来,便于转成markdown格式 merged_labels = [E.TEXT.value, E.TITLE.value] other_labels = list(set(effective_labels) - set(merged_labels)) page_detecion_outputs.boxes = merge_text_and_title_boxes(page_detecion_outputs.boxes, merged_labels, other_labels, E.TEXT.value) # 对box进行排序 page_detecion_outputs.boxes.sort(key=lambda x: (x.pos[1], x.pos[0])) # box外扩,便于后续的ocr h, w = image.shape[:2] for layout in page_detecion_outputs.boxes: if layout.clsid != E.TEXT.value: continue layout.pos[0] -= expand_pixel layout.pos[1] -= expand_pixel layout.pos[2] += expand_pixel layout.pos[3] += expand_pixel layout.pos[0] = max(0, layout.pos[0]) layout.pos[1] = max(0, layout.pos[1]) layout.pos[2] = min(w, layout.pos[2]) layout.pos[3] = min(h, layout.pos[3]) layout_analysis_results.append(page_detecion_outputs) return layout_analysis_results