# pdf2markdown/helper/content_recognition/utils.py

import os
import tempfile
import cv2
import numpy as np
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from .rapid_table_pipeline.main import table2md_pipeline
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.data.read_api import read_local_images
from markdownify import markdownify as md
import re
from ..image_helper import text_rec


def scanning_document_classify(image):
    """Decide whether an image should be treated as a scanned document.

    The decision is based only on how many pixels fall inside the red hue
    ranges of the HSV colour space.

    Args:
        image: Image passed to ``cv2.cvtColor`` with ``COLOR_BGR2HSV``
            (so it is expected to be in BGR channel order).

    Returns:
        True when the number of red pixels is strictly greater than 1 and
        strictly less than 1000, otherwise False.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Two (lower, upper) HSV bands are used for red: one at the low end of
    # the hue range and one at the high end.
    red_bands = (
        ([0, 70, 50], [10, 255, 255]),
        ([170, 70, 50], [180, 255, 255]),
    )
    low_hue_mask, high_hue_mask = (
        cv2.inRange(hsv_image, np.array(lower), np.array(upper))
        for lower, upper in red_bands
    )

    # A pixel counts as red if it matches either band.
    red_mask = cv2.bitwise_or(low_hue_mask, high_hue_mask)
    red_pixel_count = cv2.countNonZero(red_mask)

    return red_pixel_count > 1 and red_pixel_count < 1000


def remove_watermark(image):
    """Remove red stamps by keeping only a thresholded third channel.

    Args:
        image: Three-channel image; the third channel returned by
            ``cv2.split`` is used (the red channel, assuming BGR order).

    Returns:
        A three-channel image built from that single channel with
        ``COLOR_GRAY2BGR``, in which every value above 210 has been
        raised to 255.
    """
    _blue, _green, red = cv2.split(image)

    # Push every bright value in the kept channel to pure white.
    bright_pixels = red > 210
    red[bright_pixels] = 255

    return cv2.cvtColor(red, cv2.COLOR_GRAY2BGR)


def html2md(html_content):
    """Convert HTML to Markdown and undo selected backslash escapes.

    Args:
        html_content: HTML text handed to ``markdownify``.

    Returns:
        The Markdown text, with the backslash removed in front of every
        ``#``, ``*``, ``_`` and backtick character.
    """
    escaped_markdown = md(html_content)
    # Drop the escaping backslash and keep the character that followed it.
    return re.sub(r'\\([#*_`])', r'\1', escaped_markdown)


def markdown_rec(image):
    """Recognise the content of an image and return it as Markdown.

    The image is written to a temporary ``.jpg`` file, analysed through
    ``read_local_images`` / ``doc_analyze`` in OCR mode, and the result is
    post-processed with ``html2md``. The temporary file is always removed.

    Args:
        image: Image accepted by ``cv2.imwrite``.

    Returns:
        The recognised content after conversion by ``html2md``.

    Raises:
        OSError: If the temporary image could not be written.
    """
    # TODO: a folder could be passed in instead of a single file.
    # mkstemp creates the file atomically; the deprecated mktemp only
    # returned a name, which is race-prone and could leave nothing to delete.
    fd, image_path = tempfile.mkstemp(suffix='.jpg')
    os.close(fd)

    try:
        if not cv2.imwrite(image_path, image):
            raise OSError(f'Failed to write temporary image: {image_path}')

        ds = read_local_images(image_path)[0]
        x = ds.apply(doc_analyze, ocr=True)
        x = x.pipe_ocr_mode(None)
        html = x.get_markdown(None)
    finally:
        # The file is guaranteed to exist here, so cleanup cannot mask the
        # original exception with FileNotFoundError.
        os.remove(image_path)
    return html2md(html)


def table_rec(image):
    """Recognise a table image by combining OCR output with the table pipeline.

    Args:
        image: Image passed to both ``text_rec`` and ``table2md_pipeline``.

    Returns:
        Whatever ``table2md_pipeline`` returns for the image and its OCR
        result (presumably Markdown, judging by the pipeline's name).
    """
    boxes, texts, confidences = text_rec(image)
    # One (box, text, confidence) triple per recognised text region.
    ocr_result = [*zip(boxes, texts, confidences)]
    return table2md_pipeline(image, ocr_result)


# Shared TableConverter instance, constructed once when this module is
# imported, using the model dictionary returned by create_model_dict().
# scanning_document_rec calls it with the path of a temporary image file.
table_converter = TableConverter(artifact_dict=create_model_dict())


def scanning_document_rec(image):
    """Recognise the text of a scanned-document image.

    The image is first cleaned with ``remove_watermark``, written to a
    temporary ``.jpg`` file, and passed to the module-level
    ``table_converter``; the text is extracted with ``text_from_rendered``.
    The temporary file is always removed.

    Args:
        image: Image accepted by ``remove_watermark``.

    Returns:
        A ``(text, unwatermarked_image)`` tuple: the recognised text and the
        cleaned image that was actually recognised.

    Raises:
        OSError: If the temporary image could not be written.
    """
    # Clean the image before touching the disk, so a failure here leaves no
    # temporary file behind and is reported as-is.
    unwatermarked_image = remove_watermark(image)

    # mkstemp creates the file atomically; the deprecated mktemp only
    # returned a name, which is race-prone and could leave nothing to delete.
    fd, tmp_image_path = tempfile.mkstemp(suffix='.jpg')
    os.close(fd)

    try:
        if not cv2.imwrite(tmp_image_path, unwatermarked_image):
            raise OSError(f'Failed to write temporary image: {tmp_image_path}')

        rendered = table_converter(tmp_image_path)
        text, _, _ = text_from_rendered(rendered)
    finally:
        # The file is guaranteed to exist here, so cleanup cannot mask the
        # original exception with FileNotFoundError.
        os.remove(tmp_image_path)
    return text, unwatermarked_image


def compute_box_distance(box1, box2):
    """Return the edge-to-edge distance between two axis-aligned boxes.

    Args:
        box1: ``(x1, y1, x2, y2)`` coordinates of the first box.
        box2: ``(x1, y1, x2, y2)`` coordinates of the second box.

    Returns:
        * A negative number when the boxes overlap in both directions: the
          negated smaller overlap extent (the minimum penetration depth).
        * The non-negative gap between facing edges when the boxes overlap
          in projection on one axis; ``0`` when they touch exactly.
        * ``None`` when the boxes share no projection on either axis
          (e.g. they are diagonal to each other), so no edges face each
          other.
    """
    x11, y11, x12, y12 = box1
    x21, y21, x22, y22 = box2

    # Overlap extent along each axis (0 when there is none).
    x_overlap = max(0, min(x12, x22) - max(x11, x21))
    y_overlap = max(0, min(y12, y22) - max(y11, y21))

    # Overlapping on both axes: report the negated minimum penetration.
    if x_overlap > 0 and y_overlap > 0:
        return -min(x_overlap, y_overlap)

    signed_gaps = []

    # Projections overlap on x: the boxes are stacked vertically.
    if x12 > x21 and x11 < x22:
        signed_gaps.append(y21 - y12)  # bottom of box1 to top of box2
        signed_gaps.append(y11 - y22)  # top of box1 to bottom of box2

    # Projections overlap on y: the boxes sit side by side.
    if y12 > y21 and y11 < y22:
        signed_gaps.append(x11 - x22)  # left of box1 to right of box2
        signed_gaps.append(x21 - x12)  # right of box1 to left of box2

    # A gap of exactly 0 (boxes touching) is a valid distance; only negative
    # values, which belong to the wrong side, are discarded.
    gaps = [gap for gap in signed_gaps if gap >= 0]
    return min(gaps) if gaps else None


def assign_tables_to_titles(layout_results, max_distance=200):
    """Bind each table to at most one nearby title and record it on the table.

    Elements with ``clsid == 4`` are treated as tables and elements with
    ``clsid == 5`` as titles. Every title proposes itself to its nearest
    table within ``max_distance``; when several titles want the same table,
    the closest one wins (the earlier one on a tie). The result is written
    to ``table.table_title`` for every table, in place.

    Args:
        layout_results: Iterable of layout objects exposing ``clsid`` and
            ``box``; titles must also expose ``content``.
        max_distance: Largest distance, as computed by
            ``compute_box_distance``, at which a title may be bound.

    Returns:
        None. Each table gets ``table_title`` set to the bound title's
        ``content``, or to ``None`` when no title was bound.
    """
    tables = [item for item in layout_results if item.clsid == 4]
    titles = [item for item in layout_results if item.clsid == 5]

    # Both maps are keyed by id() of the layout object.
    table_to_title = {}
    title_to_table = {}

    # Repeat until a full pass over the titles changes no binding.
    changed = True
    while changed:
        changed = False
        for title in titles:
            # Find the closest table that is within range of this title.
            best_table = None
            min_dist = float('inf')
            for table in tables:
                dist = compute_box_distance(title.box, table.box)
                if dist is None or dist > max_distance:
                    continue
                if dist < min_dist:
                    min_dist = dist
                    best_table = table

            if best_table is None:
                continue

            title_id = id(title)
            table_id = id(best_table)

            if title_to_table.get(title_id) is best_table:
                continue  # already bound to its best table

            # Compare against None explicitly: a layout object that happens
            # to be falsy must still count as an existing binding.
            prev_title = table_to_title.get(table_id)
            if prev_title is not None:
                prev_dist = compute_box_distance(prev_title.box, best_table.box)
                if prev_dist is not None and prev_dist <= min_dist:
                    continue  # the current owner is at least as close

                # Unbind the previous, farther title.
                title_to_table.pop(id(prev_title), None)

            title_to_table[title_id] = best_table
            table_to_title[table_id] = title
            changed = True

    # Write the final bindings back onto the table objects.
    for table in tables:
        bound_title = table_to_title.get(id(table))
        table.table_title = bound_title.content if bound_title is not None else None


if __name__ == '__main__':
    # Manual smoke test. The recognisers in this module operate on decoded
    # images, not on file paths, so the sample is loaded with cv2.imread
    # first. markdown_rec(sample_image) or table_rec(sample_image) can be
    # tried the same way with another file from the same folder.
    sample_path = '/mnt/research/PaddleOCR/pdf2md_pipeline/s4_content_recognition/all_layouts/103.jpg'
    sample_image = cv2.imread(sample_path)
    if sample_image is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError(f'Cannot read image: {sample_path}')

    content, _ = scanning_document_rec(sample_image)
    print(content)