You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

254 lines
7.9 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from typing import List
from typing_extensions import deprecated
import numpy as np
from pdf2image import convert_from_path
import cv2
from .page_detection.utils import PageDetectionResult
from paddleocr import PaddleOCR
from .constants import PageDetectionEnum as E
from paddlex import create_model
from tqdm import tqdm
# Shared PaddleOCR engine, constructed once when this module is imported.
# Used by image_orient_cls (det=True, rec=False) and by text_rec.
ocr = PaddleOCR(use_angle_cls=False, lang='ch', use_gpu=True, show_log=False)
# Orientation classifier used by image_orient_cls, which parses the first
# predicted label name as an integer angle.
text_orient_model = create_model(model_name="PP-LCNet_x1_0_doc_ori")
def pdf2image(pdf_path: str) -> List[np.ndarray]:
    """Render a PDF into a list of BGR image arrays.

    Args:
        pdf_path: Path of the PDF file handed to ``convert_from_path``.

    Returns:
        One array per page returned by ``convert_from_path``, converted from
        RGB to OpenCV's BGR channel order.
    """
    bgr_pages = []
    for page in convert_from_path(pdf_path):
        rgb_pixels = np.array(page.convert('RGB'))
        bgr_pages.append(cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR))
    return bgr_pages
def boxes_to_heatmap(boxes, image_shape):
    """Build a coverage heatmap: each pixel counts how many boxes cover it.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes. Coordinates are
            truncated to int; ``xmax``/``ymax`` are exclusive.
        image_shape: Shape of the heatmap to allocate, e.g. ``(height, width)``.

    Returns:
        A float32 array of shape ``image_shape`` holding per-pixel box counts.
    """
    heatmap = np.zeros(image_shape, dtype=np.float32)
    for box in boxes:
        # Clamp to 0: a negative coordinate would otherwise be interpreted by
        # NumPy slicing as an offset from the far edge, so a box overhanging
        # the top/left border would be dropped or painted in the wrong place.
        # Coordinates past the bottom/right border are already clipped by
        # slicing itself.
        xmin, ymin, xmax, ymax = (max(0, int(value)) for value in box)
        heatmap[ymin:ymax, xmin:xmax] += 1
    return heatmap
def compute_integral_image(heatmap):
    """Return the integral image (2-D inclusive prefix sums) of ``heatmap``.

    Entry ``[y, x]`` of the result is the sum of ``heatmap[:y + 1, :x + 1]``.
    """
    column_sums = heatmap.cumsum(axis=0)
    return column_sums.cumsum(axis=1)
def get_sum_in_rect(integral, x1, y1, x2, y2):
    """Sum the source values inside a rectangle using an integral image.

    Args:
        integral: Integral image indexed as ``[y, x]``.
        x1, y1: Top-left corner of the rectangle (inclusive).
        x2, y2: Bottom-right corner of the rectangle (inclusive).

    Returns:
        The sum over rows ``y1..y2`` and columns ``x1..x2``.
    """
    has_left_strip = x1 > 0
    has_top_strip = y1 > 0

    # Start from the prefix sum at the bottom-right corner, then remove the
    # strips to the left of and above the rectangle.
    rect_sum = integral[y2, x2]
    if has_left_strip:
        rect_sum -= integral[y2, x1 - 1]
    if has_top_strip:
        rect_sum -= integral[y1 - 1, x2]
    # The top-left corner region was removed twice; add it back once.
    if has_left_strip and has_top_strip:
        rect_sum += integral[y1 - 1, x1 - 1]
    return rect_sum
def find_max_area_square(boxes, image_width, image_height, window_size=224):
    """Find the window position that covers the largest total box area.

    A coverage heatmap is built from ``boxes`` and every possible window
    position is scored by the sum of the heatmap inside it.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes.
        image_width: Width of the image in pixels.
        image_height: Height of the image in pixels.
        window_size: Side length of the search window. If the image is
            smaller than this in a dimension, the window is shrunk to the
            image size in that dimension instead of failing.

    Returns:
        ``(best_window, best_score)`` where ``best_window`` is
        ``[x1, y1, x2, y2]`` (``x2``/``y2`` exclusive) and ``best_score`` is
        the heatmap sum inside it. Ties resolve to the first position in
        row-major order.

    Raises:
        ValueError: If ``window_size`` is not positive or the image is empty.
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive")

    heatmap = boxes_to_heatmap(boxes, (image_height, image_width))
    h, w = heatmap.shape
    if h == 0 or w == 0:
        raise ValueError("image must have non-zero width and height")

    # Shrink the window on small images; otherwise there would be no valid
    # window position and argmax would fail on an empty array.
    win_h = min(window_size, h)
    win_w = min(window_size, w)

    integral = compute_integral_image(heatmap)
    # A leading zero row/column removes the x1 == 0 / y1 == 0 special cases:
    # padded[y + 1, x + 1] == integral[y, x].
    padded = np.pad(integral, ((1, 0), (1, 0)))

    # Window sums for all top-left positions at once, using plain slices.
    total = (
        padded[win_h:, win_w:]
        - padded[win_h:, :-win_w]
        - padded[:-win_h, win_w:]
        + padded[:-win_h, :-win_w]
    )

    max_idx = np.unravel_index(np.argmax(total), total.shape)
    top_left_y, top_left_x = max_idx
    best_window = [
        int(top_left_x),
        int(top_left_y),
        int(top_left_x + win_w),
        int(top_left_y + win_h),
    ]
    best_score = total[max_idx]
    return best_window, best_score
def image_orient_cls(images):
    """Classify the text orientation of one or more images.

    For each image, text boxes are detected, the 224x224 window with the most
    text coverage is located, that window is padded by 16 pixels per side
    (clipped to the image), and the crop is passed to the orientation model.

    Args:
        images: A single image array or an iterable of image arrays.

    Returns:
        A list with one integer per image, parsed from the first label name
        predicted by the orientation model.
    """
    if isinstance(images, np.ndarray):
        images = [images]

    window_size = 224
    margin = 16
    angles = []
    for img in tqdm(images, '文本方向分类'):
        h, w = img.shape[:2]
        # The per-image detection result can be None/empty when no text is
        # found (text_rec guards against the same case); fall back to no
        # boxes so the classifier still runs on a default crop.
        det_res = ocr.ocr(img, det=True, rec=False, cls=False)[0] or []
        # Each detection is a quadrilateral; use its first and third points
        # as the corners of an axis-aligned box.
        boxes = [
            (int(quad[0][0]), int(quad[0][1]), int(quad[2][0]), int(quad[2][1]))
            for quad in det_res
        ]
        square_box, _ = find_max_area_square(boxes, w, h, window_size)
        x1, y1, x2, y2 = square_box
        x1 = max(0, x1 - margin)
        y1 = max(0, y1 - margin)
        x2 = min(w, x2 + margin)
        y2 = min(h, y2 + margin)
        crop = img[y1:y2, x1:x2]
        output = text_orient_model.predict(crop, batch_size=1)
        angles.append(int(next(output)['label_names'][0]))
    return angles
@deprecated('Low precision classification of scanned documents')
def scanning_document_classify(image):
    """Heuristically decide whether ``image`` is a scanned document.

    The image (BGR) is converted to HSV and the pixels falling into either of
    two red hue bands are counted.

    Returns:
        True when the red pixel count is strictly between 1 and 1000.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Red wraps around the hue axis, so it needs a low band and a high band.
    red_hue_bands = (
        (np.array([0, 70, 50]), np.array([10, 255, 255])),
        (np.array([170, 70, 50]), np.array([180, 255, 255])),
    )
    low_band_mask, high_band_mask = (
        cv2.inRange(hsv_image, lower, upper) for lower, upper in red_hue_bands
    )
    red_mask = cv2.bitwise_or(low_band_mask, high_band_mask)

    red_pixel_count = cv2.countNonZero(red_mask)
    return red_pixel_count > 1 and red_pixel_count < 1000
def remove_watermark(image):
    """Suppress red stamps by keeping only a whitened red channel.

    The third channel of the split image (red for BGR input) is taken, every
    value above 210 is pushed to 255, and the result is expanded back to a
    three-channel image.

    Returns:
        A three-channel image built from the thresholded red channel.
    """
    _blue, _green, red = cv2.split(image)
    near_white = red > 210
    red[near_white] = 255
    return cv2.cvtColor(red, cv2.COLOR_GRAY2BGR)
def overlay_rectangle(image, top_left, bottom_right, text, color, alpha=0.5, font_scale=1.0, thickness=1):
    """Draw a translucent filled rectangle, optionally with a label tab.

    Args:
        image: Input image (numpy array, BGR).
        top_left: Top-left corner ``(x, y)`` of the rectangle.
        bottom_right: Bottom-right corner ``(x, y)`` of the rectangle.
        text: Label drawn on a filled background attached to the top-left
            corner. Pass ``None`` to draw the translucent fill only.
        color: Fill color in BGR order, also used for the label background.
        alpha: Opacity of the fill, clipped to ``[0.0, 1.0]``
            (0 = fully transparent, 1 = fully opaque).
        font_scale: Font scale passed to OpenCV for the label.
        thickness: Stroke thickness of the label text.

    Returns:
        A copy of ``image`` with the overlay applied; the input is untouched.
    """
    alpha = float(np.clip(alpha, 0, 1))
    output = image.copy()
    overlay = image.copy()

    # Solid fill on the overlay layer; it becomes translucent when blended.
    cv2.rectangle(overlay, top_left, bottom_right, color, thickness=-1)

    if text is not None:
        font = cv2.FONT_HERSHEY_TRIPLEX
        x, y = top_left
        (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
        label_height = text_height + baseline

        if y >= label_height:
            # Enough room: the label sits directly above the rectangle.
            label_bottom = y
        else:
            # No room above (box touches the top of the image): tuck the
            # label inside the rectangle so it is not clipped away.
            label_bottom = max(0, y) + label_height
        label_top = label_bottom - label_height

        # Label background goes on the overlay so it shares the fill opacity.
        cv2.rectangle(overlay, (max(0, x), label_top), (x + text_width, label_bottom), color, thickness=-1)
        # Text is drawn on the base layer before blending, so it is mixed
        # with the label background by the blend below.
        cv2.putText(output, text, (x, label_bottom), font, font_scale, (0, 0, 0), thickness)

    # Blend the overlay layer onto the base layer in place.
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
    return output
def page_detection_visual(page_detection_result: PageDetectionResult):
    """Render the detected layout boxes of a page onto its image.

    Each box whose class id matches one of the known layout classes is drawn
    as a translucent rectangle labelled ``"<class label> <confidence>"``;
    boxes of any other class are skipped.

    Returns:
        The annotated image, or the result's own image when nothing is drawn.
    """
    # Layout class -> BGR color, checked in this order.
    palette = (
        (E.TEXT, (177, 216, 178)),
        (E.TITLE, (181, 136, 145)),
        (E.TABLE, (255, 97, 176)),
        (E.TABLE_CAPTION, (66, 73, 255)),
        (E.SCANNED_DOCUMENT, (255, 239, 145)),
    )

    canvas = page_detection_result.image
    for box in page_detection_result.boxes:
        pos = box.pos
        clsid = box.clsid
        confidence = box.confidence

        matched = next(
            ((member.label, bgr) for member, bgr in palette if clsid == member.value),
            None,
        )
        if matched is None:
            continue

        label, bgr = matched
        caption = f'{label} {confidence:.2f}'
        corner_a = (int(pos[0]), int(pos[1]))
        corner_b = (int(pos[2]), int(pos[3]))
        canvas = overlay_rectangle(canvas, corner_a, corner_b, caption, bgr, font_scale=0.6)
    return canvas
def text_rec(image):
    """Run OCR on ``image`` and return the recognised lines as parallel lists.

    Args:
        image: Image passed to the shared OCR engine.

    Returns:
        ``(boxes, texts, confidences)``: three lists of equal length holding,
        for every recognised line, its box, its text and its confidence.
        Empty per-image results and empty lines are skipped.
    """
    result = ocr.ocr(image, cls=False)
    boxes = []
    texts = []
    confidences = []
    for page in result:
        if not page:
            continue
        for line in page:
            if not line:
                continue
            # Each line is (box, (text, confidence, ...)).
            boxes.append(line[0])
            texts.append(line[1][0])
            confidences.append(line[1][1])
    return boxes, texts, confidences