"""Image helpers: PDF rasterization, text-orientation classification, OCR and
page-detection visualization."""

from typing import List
from typing_extensions import deprecated
import numpy as np
from pdf2image import convert_from_path
import cv2
from .page_detection.utils import PageDetectionResult
from paddleocr import PaddleOCR
from .constants import PageDetectionEnum as E
from paddlex import create_model
from tqdm import tqdm

# Shared model instances, created once at import time.
ocr = PaddleOCR(use_angle_cls=False, lang='ch', use_gpu=True, show_log=False)
text_orient_model = create_model(model_name="PP-LCNet_x1_0_doc_ori")


def pdf2image(pdf_path: str) -> List[np.ndarray]:
    """Render every page of a PDF as a BGR ``numpy`` image.

    Args:
        pdf_path: Path of the PDF file to rasterize.

    Returns:
        One array per page, converted from RGB to BGR channel order.
    """
    pages = convert_from_path(pdf_path)
    return [
        cv2.cvtColor(np.array(page.convert('RGB')), cv2.COLOR_RGB2BGR)
        for page in pages
    ]


def boxes_to_heatmap(boxes, image_shape):
    """Build a coverage map where each pixel counts the boxes covering it.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes; values are
            truncated to ``int``.
        image_shape: Shape of the map to create, ``(height, width)``.

    Returns:
        A ``float32`` array of shape ``image_shape``.
    """
    heatmap = np.zeros(image_shape, dtype=np.float32)
    height, width = heatmap.shape[:2]
    for xmin, ymin, xmax, ymax in boxes:
        xmin, ymin, xmax, ymax = map(int, (xmin, ymin, xmax, ymax))
        # Clamp to the image: a negative slice bound would be read by NumPy as
        # "from the end", dropping or misplacing boxes that poke outside.
        xmin = min(max(xmin, 0), width)
        xmax = min(max(xmax, 0), width)
        ymin = min(max(ymin, 0), height)
        ymax = min(max(ymax, 0), height)
        heatmap[ymin:ymax, xmin:xmax] += 1
    return heatmap


def compute_integral_image(heatmap):
    """Return the integral image (cumulative sum over rows, then columns)."""
    return heatmap.cumsum(axis=0).cumsum(axis=1)


def get_sum_in_rect(integral, x1, y1, x2, y2):
    """Sum the source values inside a rectangle using an integral image.

    Args:
        integral: Integral image as produced by ``compute_integral_image``.
        x1, y1: Top-left corner (inclusive).
        x2, y2: Bottom-right corner (inclusive).

    Returns:
        The sum of the underlying values over rows ``y1..y2`` and columns
        ``x1..x2``.
    """
    total = integral[y2, x2]
    if x1 > 0:
        total -= integral[y2, x1 - 1]
    if y1 > 0:
        total -= integral[y1 - 1, x2]
    if x1 > 0 and y1 > 0:
        total += integral[y1 - 1, x1 - 1]
    return total


def find_max_area_square(boxes, image_width, image_height, window_size=224):
    """Find the window position that covers the most box area.

    Every possible top-left position of a ``window_size`` x ``window_size``
    window is scored by the sum of the box-coverage heatmap inside it. When
    the image is smaller than the window along an axis, the window is shrunk
    to the image size on that axis instead of failing.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes.
        image_width: Image width in pixels.
        image_height: Image height in pixels.
        window_size: Side length of the search window in pixels.

    Returns:
        ``(best_window, best_score)`` where ``best_window`` is
        ``[x1, y1, x2, y2]`` with ``x2``/``y2`` equal to the top-left corner
        plus the window extent, and ``best_score`` is the heatmap sum inside
        that window. Ties resolve to the first position in row-major order.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')

    heatmap = boxes_to_heatmap(boxes, (image_height, image_width))
    h, w = heatmap.shape
    if h == 0 or w == 0:
        # Nothing to search in an empty image.
        return [0, 0, 0, 0], 0.0

    # Shrink the window on axes where the image is too small; otherwise there
    # would be no candidate position and argmax would raise on an empty array.
    win_h = min(window_size, h)
    win_w = min(window_size, w)

    # Pad a zero row/column in front so that padded[i, j] is the sum of
    # heatmap[:i, :j]; this removes the special cases at the top/left border.
    padded = np.pad(compute_integral_image(heatmap), ((1, 0), (1, 0)))

    # total[y, x] is the heatmap sum over rows y..y+win_h-1, cols x..x+win_w-1.
    total = (
        padded[win_h:, win_w:]
        - padded[win_h:, :-win_w]
        - padded[:-win_h, win_w:]
        + padded[:-win_h, :-win_w]
    )

    max_idx = np.unravel_index(np.argmax(total), total.shape)
    top_left_y, top_left_x = max_idx
    best_window = [
        int(top_left_x),
        int(top_left_y),
        int(top_left_x + win_w),
        int(top_left_y + win_h),
    ]
    best_score = total[max_idx]
    return best_window, best_score


def image_orient_cls(images):
    """Classify the text orientation of one or more images.

    For each image, text boxes are detected, the window with the densest text
    coverage is located, and a crop of that window (grown by 16 pixels on each
    side, limited to the image) is passed to the orientation model.

    Args:
        images: A single image array or an iterable of image arrays.

    Returns:
        A list with one integer angle label per image.
    """
    if isinstance(images, np.ndarray):
        images = [images]

    angles = []
    for img in tqdm(images, '文本方向分类'):
        h, w = img.shape[:2]
        ocr_result = ocr.ocr(img, det=True, rec=False, cls=False)
        det_res = ocr_result[0] if ocr_result else None
        if det_res is None:
            # No text detected on this page: continue with no boxes.
            det_res = []

        # Each detection is a quadrilateral; its first and third points are
        # used as the box corners.
        boxes = [
            (int(quad[0][0]), int(quad[0][1]), int(quad[2][0]), int(quad[2][1]))
            for quad in det_res
        ]

        square_box, _ = find_max_area_square(boxes, w, h, 224)
        x1, y1, x2, y2 = square_box
        # Grow the crop by a 16-pixel margin, clamped to the image.
        x1 = max(0, x1 - 16)
        y1 = max(0, y1 - 16)
        x2 = min(w, x2 + 16)
        y2 = min(h, y2 + 16)
        _img = img[y1:y2, x1:x2]

        output = text_orient_model.predict(_img, batch_size=1)
        angles.append(int(next(output)['label_names'][0]))
    return angles


@deprecated('Low precision classification of scanned documents')
def scanning_document_classify(image):
    """Guess whether a BGR image is a scanned document from its red pixels.

    Args:
        image: Image in BGR channel order.

    Returns:
        ``True`` when the number of pixels inside the red HSV ranges is
        strictly between 1 and 1000.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Red wraps around the hue axis, so it needs a low-hue and a high-hue range.
    lower_red1 = np.array([0, 70, 50])
    upper_red1 = np.array([10, 255, 255])
    lower_red2 = np.array([170, 70, 50])
    upper_red2 = np.array([180, 255, 255])

    mask1 = cv2.inRange(hsv, lower_red1, upper_red1)
    mask2 = cv2.inRange(hsv, lower_red2, upper_red2)
    mask = cv2.bitwise_or(mask1, mask2)

    non_zero_pixels = cv2.countNonZero(mask)
    return 1 < non_zero_pixels < 1000


def remove_watermark(image):
    """Suppress red stamps by keeping only a thresholded red channel.

    Args:
        image: Three-channel image in BGR channel order.

    Returns:
        A three-channel image built from the red channel, in which every
        value above 210 has been set to 255.
    """
    _, _, r_channel = cv2.split(image)
    r_channel[r_channel > 210] = 255
    return cv2.cvtColor(r_channel, cv2.COLOR_GRAY2BGR)


def overlay_rectangle(image, top_left, bottom_right, text, color, alpha=0.5,
                      font_scale=1.0, thickness=1):
    """Draw a translucent filled rectangle with a labelled tag above it.

    Args:
        image: Input image (BGR ``numpy`` array); it is not modified.
        top_left: ``(x, y)`` of the rectangle's top-left corner.
        bottom_right: ``(x, y)`` of the rectangle's bottom-right corner.
        text: Label drawn just above the rectangle's top-left corner.
        color: Fill color in BGR order, used for the box and label background.
        alpha: Opacity of the fill, clipped to ``[0.0, 1.0]``.
        font_scale: Font scale passed to OpenCV.
        thickness: Stroke thickness of the label text.

    Returns:
        A copy of ``image`` with the overlay blended in.
    """
    alpha = np.clip(alpha, 0, 1)
    output = image.copy()
    overlay = image.copy()

    # 1. Fill the target region on the overlay layer.
    cv2.rectangle(overlay, top_left, bottom_right, color, thickness=-1)

    # 2. Place the label so its bottom-left corner sits on the box corner.
    x, y = top_left
    (text_width, text_height), baseline = cv2.getTextSize(
        text, cv2.FONT_HERSHEY_TRIPLEX, font_scale, thickness)
    text_origin = (x, y)
    # Keep the label background inside the image at the top/left edges.
    text_topleft = (max(0, x), max(0, y - text_height - baseline))
    text_bottomright = (x + text_width, y)

    # The label background goes on the overlay so it shares the transparency;
    # the glyphs go on the output layer.
    cv2.rectangle(overlay, text_topleft, text_bottomright, color, thickness=-1)
    cv2.putText(output, text, text_origin, cv2.FONT_HERSHEY_TRIPLEX,
                font_scale, (0, 0, 0), thickness)

    # 3. Blend the overlay into the output.
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
    return output


def page_detection_visual(page_detection_result: PageDetectionResult):
    """Render detected page regions as labelled translucent boxes.

    Args:
        page_detection_result: Detection result providing ``image`` and
            ``boxes``; each box provides ``pos``, ``clsid`` and ``confidence``.

    Returns:
        An image with one overlay per box of a known class. Boxes whose class
        id is not listed below are skipped.
    """
    # Checked in order; each class has its own BGR overlay color.
    styles = (
        (E.TEXT, (177, 216, 178)),
        (E.TITLE, (181, 136, 145)),
        (E.TABLE, (255, 97, 176)),
        (E.TABLE_CAPTION, (66, 73, 255)),
        (E.SCANNED_DOCUMENT, (255, 239, 145)),
    )

    img = page_detection_result.image
    for box in page_detection_result.boxes:
        pos = box.pos
        style = next(
            (item for item in styles if box.clsid == item[0].value), None)
        if style is None:
            continue
        member, color = style
        text = f'{member.label} {box.confidence:.2f}'
        img = overlay_rectangle(
            img,
            (int(pos[0]), int(pos[1])),
            (int(pos[2]), int(pos[3])),
            text,
            color,
            font_scale=0.6,
        )
    return img


def text_rec(image):
    """Run OCR on an image and flatten the recognized lines.

    Args:
        image: Image passed to the OCR engine.

    Returns:
        ``(boxes, texts, confidences)``: three parallel lists with the box,
        the recognized string and the confidence of every non-empty line.
    """
    result = ocr.ocr(image, cls=False)
    boxes = []
    texts = []
    confidences = []
    # Empty results and empty pages/lines are skipped.
    for page in result or []:
        if not page:
            continue
        for line in page:
            if not line:
                continue
            boxes.append(line[0])
            texts.append(line[1][0])
            confidences.append(line[1][1])
    return boxes, texts, confidences