from typing import List

import cv2
import numpy as np
from paddleocr import PaddleOCR
from paddlex import create_model
from pdf2image import convert_from_path
from tqdm import tqdm
from typing_extensions import deprecated

from .constants import PageDetectionEnum as E
from .page_detection.utils import PageDetectionResult

# Shared model instances: the PaddleOCR detector/recognizer and the PaddleX
# document orientation classifier (PP-LCNet_x1_0_doc_ori).
ocr = PaddleOCR(use_angle_cls=False, lang='ch', use_gpu=True, show_log=False)
text_orient_model = create_model(model_name="PP-LCNet_x1_0_doc_ori")


def pdf2image(pdf_path: str) -> List[np.ndarray]:
    """Render each page of a PDF as a BGR numpy array."""
    images = convert_from_path(pdf_path)
    images = [cv2.cvtColor(np.array(page.convert('RGB')), cv2.COLOR_RGB2BGR) for page in images]
    return images
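

# Illustrative sketch (not part of the pipeline): typical use of pdf2image.
# The file name below is hypothetical.
def _pdf2image_example():
    pages = pdf2image('sample.pdf')  # hypothetical path
    for i, page in enumerate(pages):
        print(i, page.shape)  # (height, width, 3), BGR channel order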


def boxes_to_heatmap(boxes, image_shape):
    """Build a heatmap where each pixel counts how many boxes cover it."""
    heatmap = np.zeros(image_shape, dtype=np.float32)
    for xmin, ymin, xmax, ymax in boxes:
        xmin, ymin, xmax, ymax = map(int, (xmin, ymin, xmax, ymax))
        heatmap[ymin:ymax, xmin:xmax] += 1
    return heatmap


def compute_integral_image(heatmap):
    """Compute the 2-D integral image (summed-area table) of the heatmap."""
    return heatmap.cumsum(axis=0).cumsum(axis=1)


def get_sum_in_rect(integral, x1, y1, x2, y2):
    """Sum of the inclusive rectangle (x1, y1)-(x2, y2) using the integral image."""
    total = integral[y2, x2]
    if x1 > 0:
        total -= integral[y2, x1 - 1]
    if y1 > 0:
        total -= integral[y1 - 1, x2]
    if x1 > 0 and y1 > 0:
        total += integral[y1 - 1, x1 - 1]
    return total
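

# Illustrative sketch (not used by the pipeline): get_sum_in_rect should agree with
# a direct np.sum over the same inclusive window, which is the point of the
# integral-image trick used below.
def _integral_image_example():
    heatmap = np.arange(16, dtype=np.float32).reshape(4, 4)
    integral = compute_integral_image(heatmap)
    # 2x2 window with corners (x1, y1) = (1, 1) and (x2, y2) = (2, 2), inclusive
    assert get_sum_in_rect(integral, 1, 1, 2, 2) == heatmap[1:3, 1:3].sum()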


def find_max_area_square(boxes, image_width, image_height, window_size=224):
    """Find the window_size x window_size square covering the largest total box area."""
    heatmap = boxes_to_heatmap(boxes, (image_height, image_width))
    integral = compute_integral_image(heatmap)

    h, w = heatmap.shape
    H = h - window_size + 1
    W = w - window_size + 1

    # Grid of candidate top-left corners
    ys, xs = np.meshgrid(np.arange(H), np.arange(W), indexing='ij')
    x1 = xs
    y1 = ys
    x2 = x1 + window_size - 1
    y2 = y1 + window_size - 1

    # Vectorised sum of every window via the integral image
    total = (
        integral[y2, x2]
        - np.where(x1 > 0, integral[y2, x1 - 1], 0)
        - np.where(y1 > 0, integral[y1 - 1, x2], 0)
        + np.where((x1 > 0) & (y1 > 0), integral[y1 - 1, x1 - 1], 0)
    )

    max_idx = np.unravel_index(np.argmax(total), total.shape)
    top_left_y, top_left_x = max_idx
    best_window = [int(top_left_x), int(top_left_y),
                   int(top_left_x + window_size), int(top_left_y + window_size)]
    best_score = total[max_idx]
    return best_window, best_score
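

# Illustrative sketch: with all detection boxes clustered in the top-left corner of a
# synthetic 1000x1000 page, the densest 224x224 window is found at the origin.
def _find_max_area_square_example():
    boxes = [(10, 10, 200, 60), (10, 80, 200, 130), (10, 150, 200, 200)]
    window, score = find_max_area_square(boxes, image_width=1000, image_height=1000, window_size=224)
    print(window, score)  # [0, 0, 224, 224] and the total covered box area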


def image_orient_cls(images):
    """Classify the text orientation angle (degrees) of one image or a list of images."""
    if isinstance(images, np.ndarray):
        images = [images]

    angles = []
    for img in tqdm(images, 'Text orientation classification'):
        h, w = img.shape[:2]
        # Detect text boxes, then crop the densest 224x224 text region (with a
        # 16-pixel margin) as input to the orientation classifier.
        det_res = ocr.ocr(img, det=True, rec=False, cls=False)[0]
        boxes = []
        for r in det_res:
            x1, y1, x2, y2 = int(r[0][0]), int(r[0][1]), int(r[2][0]), int(r[2][1])
            boxes.append((x1, y1, x2, y2))
        square_box, _ = find_max_area_square(boxes, w, h, 224)
        x1, y1, x2, y2 = square_box
        x1 = max(0, x1 - 16)
        y1 = max(0, y1 - 16)
        x2 = min(w, x2 + 16)
        y2 = min(h, y2 + 16)
        _img = img[y1:y2, x1:x2]

        output = text_orient_model.predict(_img, batch_size=1)
        angle = int(next(output)['label_names'][0])
        angles.append(angle)

    return angles
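

# Illustrative sketch: one plausible way to consume the predicted angles. The exact
# meaning of the label (how far the page is rotated vs. how far to rotate it back)
# depends on the orientation model; the mapping below assumes the label is the
# page's current rotation and undoes it.
def _image_orient_cls_example(images):
    rotations = {90: cv2.ROTATE_90_COUNTERCLOCKWISE, 180: cv2.ROTATE_180, 270: cv2.ROTATE_90_CLOCKWISE}
    corrected = []
    for img, angle in zip(images, image_orient_cls(images)):
        corrected.append(cv2.rotate(img, rotations[angle]) if angle in rotations else img)
    return corrected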


@deprecated('Low precision classification of scanned documents')
def scanning_document_classify(image):
    # Determine whether the image is a scanned document (based on the amount of red present)

    # Convert the image from BGR to HSV colour space
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Define the HSV ranges for red (red wraps around both ends of the hue axis)
    lower_red1 = np.array([0, 70, 50])
    upper_red1 = np.array([10, 255, 255])
    lower_red2 = np.array([170, 70, 50])
    upper_red2 = np.array([180, 255, 255])

    # Two masks: one for the low-hue reds, one for the high-hue reds
    mask1 = cv2.inRange(hsv, lower_red1, upper_red1)
    mask2 = cv2.inRange(hsv, lower_red2, upper_red2)

    # Merge the two masks
    mask = cv2.bitwise_or(mask1, mask2)

    # Count the non-zero (red) pixels
    non_zero_pixels = cv2.countNonZero(mask)
    return 1 < non_zero_pixels < 1000


def remove_watermark(image):
    # Remove red seals: treat the red channel as a grayscale image (red strokes are
    # bright there) and push near-white values to pure white.
    _, _, r_channel = cv2.split(image)
    r_channel[r_channel > 210] = 255
    r_channel = cv2.cvtColor(r_channel, cv2.COLOR_GRAY2BGR)
    return r_channel


# NOTE: this definition is shadowed by the text-labelled overlay_rectangle defined
# below; it is kept for reference but is not the version that gets called.
def overlay_rectangle(image, top_left, bottom_right, color, alpha=0.5):
    """
    Overlay a colour on a rectangular region of the image with a given opacity.

    Parameters:
    - image: input image (numpy array, BGR format)
    - top_left: top-left corner (x, y)
    - bottom_right: bottom-right corner (x, y)
    - color: fill colour, BGR format
    - alpha: opacity, from 0.0 (fully transparent) to 1.0 (fully opaque)

    Returns:
    - a copy of the image with the rectangle overlay applied
    """
    # Clamp alpha to a valid range
    alpha = np.clip(alpha, 0, 1)

    # Copy the image for the output
    output = image.copy()

    # Draw the filled rectangle on a separate overlay layer
    overlay = image.copy()
    cv2.rectangle(overlay, top_left, bottom_right, color, thickness=-1)

    # Blend the overlay into the original image
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)

    return output


def overlay_rectangle(image, top_left, bottom_right, text, color, alpha=0.5, font_scale=1.0, thickness=1):
    """
    Draw a rectangle with a semi-transparent fill plus a text label on a matching
    background placed just outside the box.

    Returns:
    - a copy of the image with the visualisation applied
    """
    alpha = np.clip(alpha, 0, 1)
    output = image.copy()
    overlay = image.copy()

    # 1. Semi-transparent fill over the target region
    cv2.rectangle(overlay, top_left, bottom_right, color, thickness=-1)

    x, y = top_left
    # Measure the text so the label background can be sized to fit
    (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_TRIPLEX, font_scale, thickness)
    text_origin = (x, y)  # bottom-left corner of the text
    text_topleft = (x, y - text_height - baseline)
    text_bottomright = (x + text_width, y)

    # 2. Keep the label background inside the image
    text_topleft = (max(0, text_topleft[0]), max(0, text_topleft[1]))

    # Draw the label background on the overlay (blended with the same alpha)
    cv2.rectangle(overlay, text_topleft, text_bottomright, color, thickness=-1)

    # Write the label text onto the output image
    cv2.putText(output, text, text_origin, cv2.FONT_HERSHEY_TRIPLEX, font_scale, (0, 0, 0), thickness)

    # 3. Blend the overlay into the output
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)

    return output
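

# Illustrative sketch: draw one labelled, semi-transparent box on a blank canvas.
# The output file name is hypothetical.
def _overlay_rectangle_example():
    canvas = np.full((400, 600, 3), 255, dtype=np.uint8)
    vis = overlay_rectangle(canvas, (50, 80), (300, 200), 'table 0.93', (255, 97, 176), alpha=0.4, font_scale=0.6)
    cv2.imwrite('overlay_demo.jpg', vis)  # hypothetical output path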


def page_detection_visual(page_detection_result: PageDetectionResult):
    """Draw the detected layout boxes, with class label and confidence, onto the page image."""
    img = page_detection_result.image
    for box in page_detection_result.boxes:
        pos = box.pos
        clsid = box.clsid
        confidence = box.confidence
        if clsid == E.TEXT.value:
            text = E.TEXT.label
            color = (177, 216, 178)
        elif clsid == E.TITLE.value:
            text = E.TITLE.label
            color = (181, 136, 145)
        elif clsid == E.TABLE.value:
            text = E.TABLE.label
            color = (255, 97, 176)
        elif clsid == E.TABLE_CAPTION.value:
            text = E.TABLE_CAPTION.label
            color = (66, 73, 255)
        elif clsid == E.SCANNED_DOCUMENT.value:
            text = E.SCANNED_DOCUMENT.label
            color = (255, 239, 145)
        else:
            continue
        text = f'{text} {confidence:.2f}'
        img = overlay_rectangle(img, (int(pos[0]), int(pos[1])), (int(pos[2]), int(pos[3])), text, color, font_scale=0.6)

    return img


def text_rec(image):
    """Run OCR on an image and return parallel lists of boxes, texts, and confidences."""
    result = ocr.ocr(image, cls=False)
    boxes = []
    texts = []
    confidences = []
    for res in result:
        if not res:
            continue
        for line in res:
            if not line:
                continue
            box = line[0]
            text = line[1][0]
            confidence = line[1][1]
            boxes.append(box)
            texts.append(text)
            confidences.append(confidence)
    return boxes, texts, confidences
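

# Illustrative sketch: running text_rec on the first rendered page of a PDF. The file
# name is hypothetical; results follow PaddleOCR's [box, (text, confidence)] layout.
def _text_rec_example():
    page = pdf2image('sample.pdf')[0]  # hypothetical path
    boxes, texts, confidences = text_rec(page)
    for box, text, confidence in zip(boxes, texts, confidences):
        print(f'{confidence:.2f}', text, box)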