diff --git a/helper/content_recognition/main.py b/helper/content_recognition/main.py index abf08cd..73835cd 100644 --- a/helper/content_recognition/main.py +++ b/helper/content_recognition/main.py @@ -87,8 +87,10 @@ def rec(page_detection_results: List[PageDetectionResult]) -> List[List[LayoutRe outputs = [_ for _ in outputs if _.clsid != E.TABLE_CAPTION.value] # 将表格转为数据库中的枚举 1-表格 for o in outputs: - if o.clsid == E.TABLE.value: + if o.clsid == E.TABLE.value or o.clsid == E.SCANNED_DOCUMENT.value: o.clsid = 1 + else: + o.clsid = 0 page_recognition_results.append(outputs) return page_recognition_results diff --git a/helper/image_helper.py b/helper/image_helper.py index 3965546..087d8b8 100644 --- a/helper/image_helper.py +++ b/helper/image_helper.py @@ -2,12 +2,12 @@ from typing import List from typing_extensions import deprecated import numpy as np from pdf2image import convert_from_path -import os import cv2 from .page_detection.utils import PageDetectionResult from paddleocr import PaddleOCR from .constants import PageDetectionEnum as E from paddlex import create_model +from tqdm import tqdm ocr = PaddleOCR(use_angle_cls=False, lang='ch', use_gpu=True, show_log=False) @@ -82,7 +82,7 @@ def image_orient_cls(images): images = [images] angles = [] - for img in images: + for img in tqdm(images, '文本方向分类'): h, w = img.shape[:2] det_res = ocr.ocr(img, det=True, rec=False, cls=False)[0] boxes = [] diff --git a/helper/page_detection/main.py b/helper/page_detection/main.py index b7f3b3e..4b86086 100644 --- a/helper/page_detection/main.py +++ b/helper/page_detection/main.py @@ -89,6 +89,8 @@ def layout_analysis(images) -> List[PageDetectionResult]: # box外扩,便于后续的ocr h, w = image.shape[:2] for layout in page_detecion_outputs.boxes: + if layout.clsid != E.TEXT.value: + continue layout.pos[0] -= expand_pixel layout.pos[1] -= expand_pixel layout.pos[2] += expand_pixel diff --git a/helper/page_detection/pdf_detection.py b/helper/page_detection/pdf_detection.py index bfd41c9..2173be0 100644 --- a/helper/page_detection/pdf_detection.py +++ b/helper/page_detection/pdf_detection.py @@ -14,9 +14,6 @@ import os import yaml -import glob -import json -from pathlib import Path import cv2 import numpy as np @@ -34,7 +31,7 @@ from preprocess import preprocess, Resize, NormalizeImage, Permute, PadStride, L from .picodet_postprocess import PicoDetPostProcess from clrnet_postprocess import CLRNetPostProcess from visualize import visualize_box_mask, imshow_lanes -from utils import argsparser, Timer, multiclass_nms, coco_clsid2catid +from utils import Timer # Global dictionary SUPPORT_MODELS = { @@ -45,7 +42,6 @@ SUPPORT_MODELS = { } - class Detector(object): """ Args: diff --git a/helper/page_detection/utils.py b/helper/page_detection/utils.py index c5b3aaf..a57b511 100644 --- a/helper/page_detection/utils.py +++ b/helper/page_detection/utils.py @@ -33,199 +33,6 @@ class PageDetectionResult(object): self.image = image -def argsparser(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--image_dir", - type=str, - default=None, - help="Dir of image file, `image_file` has a higher priority.") - parser.add_argument( - "--batch_size", type=int, default=1, help="batch_size for inference.") - parser.add_argument( - "--video_file", - type=str, - default=None, - help="Path of video file, `video_file` or `camera_id` has a highest priority." - ) - parser.add_argument( - "--camera_id", - type=int, - default=-1, - help="device id of camera to predict.") - parser.add_argument( - "--threshold", type=float, default=0.5, help="Threshold of score.") - parser.add_argument( - "--output_dir", - type=str, - default="output", - help="Directory of output visualization files.") - parser.add_argument( - "--run_mode", - type=str, - default='paddle', - help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)") - parser.add_argument( - "--device", - type=str, - default='cpu', - help="Choose the device you want to run, it can be: CPU/GPU/XPU/NPU, default is CPU." - ) - parser.add_argument( - "--use_gpu", - type=ast.literal_eval, - default=False, - help="Deprecated, please use `--device`.") - parser.add_argument( - "--run_benchmark", - type=ast.literal_eval, - default=False, - help="Whether to predict a image_file repeatedly for benchmark") - parser.add_argument( - "--enable_mkldnn", - type=ast.literal_eval, - default=False, - help="Whether use mkldnn with CPU.") - parser.add_argument( - "--enable_mkldnn_bfloat16", - type=ast.literal_eval, - default=False, - help="Whether use mkldnn bfloat16 inference with CPU.") - parser.add_argument( - "--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument( - "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument( - "--trt_max_shape", - type=int, - default=1280, - help="max_shape for TensorRT.") - parser.add_argument( - "--trt_opt_shape", - type=int, - default=640, - help="opt_shape for TensorRT.") - parser.add_argument( - "--trt_calib_mode", - type=bool, - default=False, - help="If the model is produced by TRT offline quantitative " - "calibration, trt_calib_mode need to set True.") - parser.add_argument( - '--save_images', - type=ast.literal_eval, - default=True, - help='Save visualization image results.') - parser.add_argument( - '--save_mot_txts', - action='store_true', - help='Save tracking results (txt).') - parser.add_argument( - '--save_mot_txt_per_img', - action='store_true', - help='Save tracking results (txt) for each image.') - parser.add_argument( - '--scaled', - type=bool, - default=False, - help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " - "True in general detector.") - parser.add_argument( - "--tracker_config", type=str, default=None, help=("tracker donfig")) - parser.add_argument( - "--reid_model_dir", - type=str, - default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py.")) - parser.add_argument( - "--reid_batch_size", - type=int, - default=50, - help="max batch_size for reid model inference.") - parser.add_argument( - '--use_dark', - type=ast.literal_eval, - default=True, - help='whether to use darkpose to get better keypoint position predict ') - parser.add_argument( - "--action_file", - type=str, - default=None, - help="Path of input file for action recognition.") - parser.add_argument( - "--window_size", - type=int, - default=50, - help="Temporal size of skeleton feature for action recognition.") - parser.add_argument( - "--random_pad", - type=ast.literal_eval, - default=False, - help="Whether do random padding for action recognition.") - parser.add_argument( - "--save_results", - action='store_true', - default=False, - help="Whether save detection result to file using coco format") - parser.add_argument( - '--use_coco_category', - action='store_true', - default=False, - help='Whether to use the coco format dictionary `clsid2catid`') - parser.add_argument( - "--slice_infer", - action='store_true', - help="Whether to slice the image and merge the inference results for small object detection." - ) - parser.add_argument( - '--slice_size', - nargs='+', - type=int, - default=[640, 640], - help="Height of the sliced image.") - parser.add_argument( - "--overlap_ratio", - nargs='+', - type=float, - default=[0.25, 0.25], - help="Overlap height ratio of the sliced image.") - parser.add_argument( - "--combine_method", - type=str, - default='nms', - help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." - ) - parser.add_argument( - "--match_threshold", - type=float, - default=0.6, - help="Combine method matching threshold.") - parser.add_argument( - "--match_metric", - type=str, - default='ios', - help="Combine method matching metric, choose in ['iou', 'ios'].") - parser.add_argument( - "--collect_trt_shape_info", - action='store_true', - default=False, - help="Whether to collect dynamic shape before using tensorrt.") - parser.add_argument( - "--tuned_trt_shape_file", - type=str, - default="shape_range_info.pbtxt", - help="Path of a dynamic shape file for tensorrt.") - parser.add_argument("--use_fd_format", action="store_true") - parser.add_argument( - "--task_type", - type=str, - default='Detection', - help="How to save the coco result, it only work with save_results==True. Optional inputs are Rotate or Detection, default is Detection." - ) - return parser - - class Times(object): def __init__(self): self.time = 0. @@ -325,212 +132,6 @@ class Timer(Times): return dic - -def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): - final_boxes = [] - for c in range(num_classes): - idxs = bboxs[:, 0] == c - if np.count_nonzero(idxs) == 0: continue - r = nms(bboxs[idxs, 1:], match_threshold, match_metric) - final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) - return final_boxes - - -def nms(dets, match_threshold=0.6, match_metric='iou'): - """ Apply NMS to avoid detecting too many overlapping bounding boxes. - Args: - dets: shape [N, 5], [score, x1, y1, x2, y2] - match_metric: 'iou' or 'ios' - match_threshold: overlap thresh for match metric. - """ - if dets.shape[0] == 0: - return dets[[], :] - scores = dets[:, 0] - x1 = dets[:, 1] - y1 = dets[:, 2] - x2 = dets[:, 3] - y2 = dets[:, 4] - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - if match_metric == 'iou': - union = iarea + areas[j] - inter - match_value = inter / union - elif match_metric == 'ios': - smaller = min(iarea, areas[j]) - match_value = inter / smaller - else: - raise ValueError() - if match_value >= match_threshold: - suppressed[j] = 1 - keep = np.where(suppressed == 0)[0] - dets = dets[keep, :] - return dets - - -coco_clsid2catid = { - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5, - 5: 6, - 6: 7, - 7: 8, - 8: 9, - 9: 10, - 10: 11, - 11: 13, - 12: 14, - 13: 15, - 14: 16, - 15: 17, - 16: 18, - 17: 19, - 18: 20, - 19: 21, - 20: 22, - 21: 23, - 22: 24, - 23: 25, - 24: 27, - 25: 28, - 26: 31, - 27: 32, - 28: 33, - 29: 34, - 30: 35, - 31: 36, - 32: 37, - 33: 38, - 34: 39, - 35: 40, - 36: 41, - 37: 42, - 38: 43, - 39: 44, - 40: 46, - 41: 47, - 42: 48, - 43: 49, - 44: 50, - 45: 51, - 46: 52, - 47: 53, - 48: 54, - 49: 55, - 50: 56, - 51: 57, - 52: 58, - 53: 59, - 54: 60, - 55: 61, - 56: 62, - 57: 63, - 58: 64, - 59: 65, - 60: 67, - 61: 70, - 62: 72, - 63: 73, - 64: 74, - 65: 75, - 66: 76, - 67: 77, - 68: 78, - 69: 79, - 70: 80, - 71: 81, - 72: 82, - 73: 84, - 74: 85, - 75: 86, - 76: 87, - 77: 88, - 78: 89, - 79: 90 -} - - -def gaussian_radius(bbox_size, min_overlap): - height, width = bbox_size - - a1 = 1 - b1 = (height + width) - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = np.sqrt(b1**2 - 4 * a1 * c1) - radius1 = (b1 + sq1) / (2 * a1) - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = np.sqrt(b2**2 - 4 * a2 * c2) - radius2 = (b2 + sq2) / 2 - - a3 = 4 * min_overlap - b3 = -2 * min_overlap * (height + width) - c3 = (min_overlap - 1) * width * height - sq3 = np.sqrt(b3**2 - 4 * a3 * c3) - radius3 = (b3 + sq3) / 2 - return min(radius1, radius2, radius3) - - -def gaussian2D(shape, sigma_x=1, sigma_y=1): - m, n = [(ss - 1.) / 2. for ss in shape] - y, x = np.ogrid[-m:m + 1, -n:n + 1] - - h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * - sigma_y))) - h[h < np.finfo(h.dtype).eps * h.max()] = 0 - return h - - -def draw_umich_gaussian(heatmap, center, radius, k=1): - """ - draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 - """ - diameter = 2 * radius + 1 - gaussian = gaussian2D( - (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - return heatmap - - def iou(box1, box2): """计算两个框的 IoU(交并比)""" x1 = max(box1[0], box2[0]) diff --git a/pipeline.py b/pipeline.py index 3960190..04a3c75 100644 --- a/pipeline.py +++ b/pipeline.py @@ -3,8 +3,9 @@ import os from loguru import logger env = os.environ.get('env', 'dev') -logger.info(f'Configure using this environment: {env}') -load_dotenv(dotenv_path='.env.dev' if env == 'dev' else '.env', override=True) +dotenv_path = '.env.dev' if env == 'dev' else '.env' +logger.info(f'Configure using this dotenv path: {dotenv_path}') +load_dotenv(dotenv_path=dotenv_path, override=True) import time import traceback @@ -42,7 +43,7 @@ def _pdf2markdown_pipeline(pdf_path, visual): img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE) images[i] = img - # images = images[90: 123] + # images = images[:2] # 3. 版面分析 t5 = time.time() @@ -76,6 +77,7 @@ def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True): pdf_name = pdf_path.split('/')[-1] start_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') process_status = 0 + pdf_id = None try: results = _pdf2markdown_pipeline(pdf_path, visual) except Exception: @@ -84,7 +86,6 @@ def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True): end_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') if insert_db: insert_pdf2md_table(pdf_path, pdf_name, process_status, start_time, end_time, None) - pdf_id = None else: process_status = PDFAnalysisStatus.SUCCESS.value end_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') @@ -94,6 +95,7 @@ def pdf2markdown_pipeline(pdf_path: str, visual=False, insert_db=True): if __name__ == '__main__': - pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2021年年度报告.PDF', visual=True, insert_db=True) - # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2022年年度报告.PDF', visual=True, insert_db=True) - # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2023年年度报告.PDF', visual=True, insert_db=True) + insert_db = False + pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2021年年度报告.PDF', visual=True, insert_db=insert_db) + # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2022年年度报告.PDF', visual=True, insert_db=insert_db) + # pdf2markdown_pipeline('/mnt/pdf2markdown/龙源电力:2023年年度报告.PDF', visual=True, insert_db=insert_db)