# pdf2markdown/helper/image_helper.py

from typing import List
from typing_extensions import deprecated
import numpy as np
from pdf2image import convert_from_path
import cv2
from .page_detection.utils import PageDetectionResult
from paddleocr import PaddleOCR
from .constants import PageDetectionEnum as E
from paddlex import create_model
from tqdm import tqdm


# Shared PaddleOCR engine, created once at import time and reused by the OCR helpers in this
# module. Configured with lang='ch', the angle classifier disabled, GPU enabled and logging off.
ocr = PaddleOCR(use_angle_cls=False, lang='ch', use_gpu=True, show_log=False)
# PaddleX model "PP-LCNet_x1_0_doc_ori", also created once at import time; image_orient_cls
# parses the first label name of its prediction as an integer angle.
text_orient_model = create_model(model_name="PP-LCNet_x1_0_doc_ori")


def pdf2image(pdf_path: str) -> List[np.ndarray]:
    """Render every page of a PDF as a BGR pixel array.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``convert_from_path``.

    Returns:
        One array per rendered page, in the order ``convert_from_path`` yields
        them. Each page is converted to RGB and then reordered to BGR so it can
        be handed to OpenCV routines.
    """
    bgr_pages = []
    for page in convert_from_path(pdf_path):
        rgb_pixels = np.array(page.convert('RGB'))
        bgr_pages.append(cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR))
    return bgr_pages


def boxes_to_heatmap(boxes, image_shape):
    """Build a coverage heatmap in which each pixel counts the boxes covering it.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes. Coordinates are
            truncated to ``int``; ``xmax``/``ymax`` are exclusive.
        image_shape: Shape of the heatmap to allocate, ``(height, width)``.

    Returns:
        A ``float32`` array of shape ``image_shape``. Boxes that extend past
        the image are clipped to it; boxes that are empty or lie entirely
        outside contribute nothing.
    """
    heatmap = np.zeros(image_shape, dtype=np.float32)
    height, width = heatmap.shape[:2]
    for xmin, ymin, xmax, ymax in boxes:
        xmin, ymin, xmax, ymax = map(int, (xmin, ymin, xmax, ymax))
        # Clamp to the image. A negative slice bound would otherwise be
        # interpreted as an offset from the far edge and mark the wrong pixels.
        xmin, xmax = max(xmin, 0), min(xmax, width)
        ymin, ymax = max(ymin, 0), min(ymax, height)
        if xmin < xmax and ymin < ymax:
            heatmap[ymin:ymax, xmin:xmax] += 1
    return heatmap


def compute_integral_image(heatmap):
    """Return the integral image (summed-area table) of ``heatmap``.

    Element ``[y, x]`` of the result is the sum of ``heatmap[:y + 1, :x + 1]``.
    The input array is left untouched.
    """
    summed_down_rows = heatmap.cumsum(axis=0)
    summed_both_axes = summed_down_rows.cumsum(axis=1)
    return summed_both_axes


def get_sum_in_rect(integral, x1, y1, x2, y2):
    """Sum the source values inside a rectangle using an integral image.

    Args:
        integral: Integral image indexed as ``integral[y, x]``.
        x1, y1: Top-left corner of the rectangle (inclusive).
        x2, y2: Bottom-right corner of the rectangle (inclusive).

    Returns:
        The sum over rows ``y1..y2`` and columns ``x1..x2``.
    """
    has_left_strip = x1 > 0
    has_top_strip = y1 > 0

    # Start from the sum of everything up to the bottom-right corner, then
    # remove the strips that lie left of and above the rectangle.
    area = integral[y2, x2]
    if has_left_strip:
        area -= integral[y2, x1 - 1]
    if has_top_strip:
        area -= integral[y1 - 1, x2]
        if has_left_strip:
            # The top-left corner block was removed twice; add it back once.
            area += integral[y1 - 1, x1 - 1]
    return area


def find_max_area_square(boxes, image_width, image_height, window_size=224):
    """Find the window position that captures the most box coverage.

    A coverage heatmap is built from ``boxes`` and every possible window
    position is scored by the sum of the heatmap values it contains (pixels
    covered by several boxes count several times).

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes.
        image_width: Width of the image in pixels.
        image_height: Height of the image in pixels.
        window_size: Side length of the window. If the image is smaller than
            the window along an axis, the window is shrunk to the image size
            on that axis instead of failing.

    Returns:
        ``(best_window, best_score)`` where ``best_window`` is
        ``[x1, y1, x2, y2]`` with exclusive ``x2``/``y2``, and ``best_score``
        is the coverage sum inside it. On ties the first position in row-major
        order wins. An image with a zero dimension yields
        ``([0, 0, 0, 0], 0.0)``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    window_size = int(window_size)
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    heatmap = boxes_to_heatmap(boxes, (image_height, image_width))
    h, w = heatmap.shape
    if h == 0 or w == 0:
        return [0, 0, 0, 0], 0.0

    # An oversized window would leave no candidate positions and make argmax
    # fail on an empty array, so clamp it to the image.
    win_h = min(window_size, h)
    win_w = min(window_size, w)

    # Prepend a zero row and column so that padded[y, x] is the sum of
    # heatmap[:y, :x]; the border cases then need no special handling.
    integral = compute_integral_image(heatmap)
    padded = np.pad(integral, ((1, 0), (1, 0)))

    # total[y, x] is the coverage sum of the window whose top-left is (x, y).
    total = (
        padded[win_h:, win_w:]
        - padded[win_h:, :-win_w]
        - padded[:-win_h, win_w:]
        + padded[:-win_h, :-win_w]
    )

    max_idx = np.unravel_index(np.argmax(total), total.shape)
    top_left_y, top_left_x = max_idx
    best_window = [int(top_left_x), int(top_left_y),
                   int(top_left_x + win_w), int(top_left_y + win_h)]
    best_score = total[max_idx]
    return best_window, best_score


def image_orient_cls(images):
    """Classify the text orientation of one or more page images.

    For each image, text regions are detected with the shared OCR engine, the
    square region with the densest text coverage is located, and that crop
    (plus a small margin) is passed to the orientation model.

    Args:
        images: A single image array, or an iterable of image arrays indexed
            as ``img[y, x]``.

    Returns:
        A list with one integer per image: the first label name predicted by
        the orientation model parsed as ``int`` (presumably the rotation in
        degrees -- confirm against the model's label set).
    """
    if isinstance(images, np.ndarray):
        images = [images]

    window = 224  # side of the square crop searched for
    margin = 16   # extra context kept around the crop on every side

    angles = []
    for img in tqdm(images, '文本方向分类'):
        h, w = img.shape[:2]

        det_res = ocr.ocr(img, det=True, rec=False, cls=False)[0]
        if det_res is None:
            # No text detected on this page: continue with no boxes rather
            # than failing when iterating over None.
            det_res = []

        boxes = []
        for quad in det_res:
            # Axis-aligned bounding box over all corners, so skewed quads are
            # fully covered regardless of corner order.
            xs = [int(pt[0]) for pt in quad]
            ys = [int(pt[1]) for pt in quad]
            boxes.append((min(xs), min(ys), max(xs), max(ys)))

        # Shrink the window for pages smaller than it, so that at least one
        # window position exists.
        square_box, _ = find_max_area_square(boxes, w, h, min(window, w, h))
        x1, y1, x2, y2 = square_box
        x1 = max(0, x1 - margin)
        y1 = max(0, y1 - margin)
        x2 = min(w, x2 + margin)
        y2 = min(h, y2 + margin)
        _img = img[y1:y2, x1:x2]

        output = text_orient_model.predict(_img, batch_size=1)
        angle = int(next(output)['label_names'][0])
        angles.append(angle)

    return angles


@deprecated('Low precision classification of scanned documents')
def scanning_document_classify(image):
    """Guess whether a BGR image is a scanned document from its red pixel count.

    The image is converted to HSV and pixels falling in either of the two red
    hue bands (low hues 0-10 and high hues 170-180) are counted.

    Returns:
        ``True`` when the number of red pixels is strictly between 1 and 1000,
        ``False`` otherwise.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Red sits at both ends of the hue axis, so two (lower, upper) bands are needed.
    red_bands = (
        (np.array([0, 70, 50]), np.array([10, 255, 255])),
        (np.array([170, 70, 50]), np.array([180, 255, 255])),
    )
    low_hue_mask, high_hue_mask = [cv2.inRange(hsv, lower, upper) for lower, upper in red_bands]

    red_mask = cv2.bitwise_or(low_hue_mask, high_hue_mask)
    red_pixel_count = cv2.countNonZero(red_mask)
    return red_pixel_count > 1 and red_pixel_count < 1000


def remove_watermark(image):
    """Suppress red stamp marks by keeping only a brightened red channel.

    The third channel of ``image`` (red for BGR input) is extracted, every
    value above 210 is pushed to 255, and the result is expanded back to a
    three-channel image. The input image is not modified.

    Returns:
        A three-channel image in which all channels equal the processed red channel.
    """
    channels = cv2.split(image)
    _, _, red = channels
    is_bright = red > 210
    red[is_bright] = 255
    return cv2.cvtColor(red, cv2.COLOR_GRAY2BGR)


def overlay_rectangle(image, top_left, bottom_right, text, color, alpha=0.5, font_scale=1.0, thickness=1):
    """Draw a translucent filled rectangle with an optional caption label.

    Args:
        image: Input image (numpy array, BGR).
        top_left: Top-left corner of the rectangle as ``(x, y)``.
        bottom_right: Bottom-right corner of the rectangle as ``(x, y)``.
        text: Caption drawn in black on a label filled with ``color``. Pass an
            empty string or ``None`` to draw only the filled rectangle.
        color: Fill colour in BGR order.
        alpha: Opacity of the fill, from 0.0 (fully transparent) to 1.0 (fully
            opaque); values outside that range are clipped.
        font_scale: Scale factor of the caption font.
        thickness: Stroke thickness of the caption font.

    Returns:
        A copy of ``image`` with the overlay applied; ``image`` itself is not
        modified.
    """
    alpha = float(np.clip(alpha, 0, 1))
    output = image.copy()
    overlay = image.copy()

    # Fill the target region on the overlay layer.
    cv2.rectangle(overlay, top_left, bottom_right, color, thickness=-1)

    if text:
        font = cv2.FONT_HERSHEY_TRIPLEX
        (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
        label_height = text_height + baseline

        x, y = top_left
        label_left = max(0, x)
        # Prefer placing the label just above the box; when there is no room
        # above, move it inside the top of the box so it stays on the image.
        label_top = y - label_height
        if label_top < 0:
            label_top = max(0, y)
        label_bottom = label_top + label_height

        # Label background goes on the overlay so it shares the box opacity.
        cv2.rectangle(overlay, (label_left, label_top),
                      (label_left + text_width, label_bottom), color, thickness=-1)

        # putText takes the bottom-left of the text baseline; keep `baseline`
        # pixels below it so descenders stay inside the label background.
        # The text is drawn before blending, so it is mixed with the label
        # colour at the same alpha.
        cv2.putText(output, text, (label_left, label_bottom - baseline),
                    font, font_scale, (0, 0, 0), thickness)

    # Blend the overlay layer onto the output.
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)

    return output


def page_detection_visual(page_detection_result: PageDetectionResult):
    """Render the detected layout boxes of a page onto its image.

    Each box whose class id matches one of the known page-detection classes is
    drawn as a translucent rectangle captioned with the class label and the
    confidence (two decimals). Boxes of any other class are skipped.

    Returns:
        The annotated image. If no box is drawn, this is the result's own
        image object.
    """
    # Ordered (class, fill colour) pairs; checked top to bottom for each box.
    class_styles = (
        (E.TEXT, (177, 216, 178)),
        (E.TITLE, (181, 136, 145)),
        (E.TABLE, (255, 97, 176)),
        (E.TABLE_CAPTION, (66, 73, 255)),
        (E.SCANNED_DOCUMENT, (255, 239, 145)),
    )

    canvas = page_detection_result.image
    for box in page_detection_result.boxes:
        pos, clsid, confidence = box.pos, box.clsid, box.confidence

        matched = next(
            ((kind, fill) for kind, fill in class_styles if clsid == kind.value),
            None,
        )
        if matched is None:
            continue

        kind, fill = matched
        caption = f'{kind.label} {confidence:.2f}'
        corner_a = (int(pos[0]), int(pos[1]))
        corner_b = (int(pos[2]), int(pos[3]))
        canvas = overlay_rectangle(canvas, corner_a, corner_b, caption, fill, font_scale=0.6)

    return canvas


def text_rec(image):
    """Run text detection and recognition on an image with the shared OCR engine.

    Returns:
        Three parallel lists ``(boxes, texts, confidences)``: for every
        recognised line, its box (first element of the OCR entry), its text
        and its confidence (first and second element of the entry's second
        item). Empty pages and empty entries in the OCR output are skipped.
    """
    boxes, texts, confidences = [], [], []
    for page in ocr.ocr(image, cls=False):
        if not page:
            continue
        for entry in page:
            if not entry:
                continue
            recognized = entry[1]
            boxes.append(entry[0])
            texts.append(recognized[0])
            confidences.append(recognized[1])
    return boxes, texts, confidences