You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

254 lines
7.9 KiB
Python

1 month ago
from typing import List
from typing_extensions import deprecated
import numpy as np
1 month ago
from pdf2image import convert_from_path
import cv2
from .page_detection.utils import PageDetectionResult
from paddleocr import PaddleOCR
from .constants import PageDetectionEnum as E
from paddlex import create_model
from tqdm import tqdm
1 month ago
# Module-level PaddleOCR engine, created once at import time and shared by
# image_orient_cls (detection only) and text_rec (detection + recognition).
# The angle classifier is switched off (use_angle_cls=False) and lang is 'ch'.
ocr = PaddleOCR(use_angle_cls=False, lang='ch', use_gpu=True, show_log=False)
# PaddleX model "PP-LCNet_x1_0_doc_ori", also created at import time;
# image_orient_cls reads an integer angle from the first entry of each
# prediction's 'label_names'.
text_orient_model = create_model(model_name="PP-LCNet_x1_0_doc_ori")
1 month ago
def pdf2image(pdf_path: str) -> List[np.ndarray]:
    """Render every page of a PDF as an OpenCV-style BGR array.

    Args:
        pdf_path: Path of the PDF file passed to ``convert_from_path``.

    Returns:
        One array per rendered page, in the order ``convert_from_path``
        returns them. Each page is converted to RGB and then reordered
        to BGR.
    """
    bgr_pages = []
    for page in convert_from_path(pdf_path):
        rgb_pixels = np.array(page.convert('RGB'))
        bgr_pages.append(cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR))
    return bgr_pages
def boxes_to_heatmap(boxes, image_shape):
    """Build a coverage heatmap: each pixel counts how many boxes cover it.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes. Coordinates
            are truncated to ``int``; the max edges are exclusive.
        image_shape: Shape of the heatmap to allocate; the first two
            entries are treated as ``(height, width)``.

    Returns:
        A ``float32`` array of shape ``image_shape``. Parts of a box that
        fall outside the image are ignored.
    """
    heatmap = np.zeros(image_shape, dtype=np.float32)
    height, width = heatmap.shape[:2]
    for xmin, ymin, xmax, ymax in boxes:
        xmin, ymin, xmax, ymax = map(int, (xmin, ymin, xmax, ymax))
        # Clamp to the image. Without this a negative coordinate is read by
        # NumPy as "count from the end", so a box hanging off the top/left
        # edge would mark the wrong pixels (or none at all).
        xmin = max(0, min(xmin, width))
        xmax = max(0, min(xmax, width))
        ymin = max(0, min(ymin, height))
        ymax = max(0, min(ymax, height))
        heatmap[ymin:ymax, xmin:xmax] += 1
    return heatmap
def compute_integral_image(heatmap):
    """Return the integral image (2-D running sum) of ``heatmap``.

    Entry ``[y, x]`` of the result is the sum of ``heatmap[:y + 1, :x + 1]``.
    """
    summed_down_rows = heatmap.cumsum(axis=0)
    summed_both_axes = summed_down_rows.cumsum(axis=1)
    return summed_both_axes
def get_sum_in_rect(integral, x1, y1, x2, y2):
    """Sum the source values inside a rectangle using an integral image.

    Args:
        integral: Integral image indexed as ``integral[y, x]``.
        x1, y1: Top-left corner of the rectangle (inclusive).
        x2, y2: Bottom-right corner of the rectangle (inclusive).

    Returns:
        The sum over rows ``y1..y2`` and columns ``x1..x2``.
    """
    has_left_strip = x1 > 0
    has_top_strip = y1 > 0

    rect_sum = integral[y2, x2]
    # Remove everything left of the rectangle ...
    if has_left_strip:
        rect_sum -= integral[y2, x1 - 1]
    # ... and everything above it ...
    if has_top_strip:
        rect_sum -= integral[y1 - 1, x2]
    # ... then add back the top-left corner that was removed twice.
    if has_left_strip and has_top_strip:
        rect_sum += integral[y1 - 1, x1 - 1]
    return rect_sum
def find_max_area_square(boxes, image_width, image_height, window_size=224):
    """Find the window position that contains the most box coverage.

    A coverage heatmap is built from ``boxes``; every possible window
    position is scored by the sum of the heatmap inside it, and the first
    position (row-major order) with the highest score wins.

    Args:
        boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` boxes.
        image_width: Width of the image the boxes refer to.
        image_height: Height of the image the boxes refer to.
        window_size: Side length of the square window. When the image is
            smaller than this along an axis, the window is shrunk to the
            image size along that axis instead of failing.

    Returns:
        ``(best_window, best_score)`` where ``best_window`` is
        ``[x1, y1, x2, y2]`` (``x2``/``y2`` exclusive) and ``best_score``
        is the heatmap sum inside that window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1 or the image has
            zero width or height.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    heatmap = boxes_to_heatmap(boxes, (image_height, image_width))
    h, w = heatmap.shape
    if h == 0 or w == 0:
        raise ValueError("image must have non-zero width and height")

    # An image smaller than the window would leave no valid position
    # (np.argmax on an empty array raises), so clamp the window to the image.
    win_h = int(min(window_size, h))
    win_w = int(min(window_size, w))

    # Prepend a zero row/column to the integral image so that windows
    # touching the top/left border need no special-casing.
    integral = compute_integral_image(heatmap)
    padded = np.zeros((h + 1, w + 1), dtype=integral.dtype)
    padded[1:, 1:] = integral

    # total[y, x] is the heatmap sum of the window whose top-left is (x, y).
    total = (
        padded[win_h:, win_w:]
        - padded[win_h:, :-win_w]
        - padded[:-win_h, win_w:]
        + padded[:-win_h, :-win_w]
    )

    top_left_y, top_left_x = np.unravel_index(np.argmax(total), total.shape)
    best_window = [int(top_left_x), int(top_left_y),
                   int(top_left_x + win_w), int(top_left_y + win_h)]
    best_score = total[top_left_y, top_left_x]
    return best_window, best_score
def image_orient_cls(images):
    """Classify the text orientation angle of one or more images.

    For each image, text boxes are detected with the module-level ``ocr``
    engine, the 224x224 window with the most box coverage is located, the
    window is grown by 16 pixels on every side (clipped to the image), and
    that crop is passed to ``text_orient_model``.

    Args:
        images: A single image array, or an iterable of image arrays.

    Returns:
        A list with one integer angle per input image, parsed from the
        first ``label_names`` entry of the model's prediction.
    """
    window_size = 224
    margin = 16

    if isinstance(images, np.ndarray):
        images = [images]

    angles = []
    for img in tqdm(images, '文本方向分类'):
        h, w = img.shape[:2]

        # The per-image detection entry can be empty/None when no text is
        # found (text_rec in this module guards against the same thing);
        # treat that as "no boxes" instead of failing on iteration.
        det_pages = ocr.ocr(img, det=True, rec=False, cls=False)
        det_res = (det_pages[0] if det_pages else None) or []

        # Each detection is a quadrilateral; corner 0 and corner 2 are used
        # as the top-left / bottom-right of its box.
        boxes = [
            (int(quad[0][0]), int(quad[0][1]), int(quad[2][0]), int(quad[2][1]))
            for quad in det_res
        ]

        square_box, _ = find_max_area_square(boxes, w, h, window_size)
        x1, y1, x2, y2 = square_box
        x1 = max(0, x1 - margin)
        y1 = max(0, y1 - margin)
        x2 = min(w, x2 + margin)
        y2 = min(h, y2 + margin)
        crop = img[y1:y2, x1:x2]

        output = text_orient_model.predict(crop, batch_size=1)
        angles.append(int(next(output)['label_names'][0]))
    return angles
@deprecated('Low precision classification of scanned documents')
def scanning_document_classify(image):
    """Heuristically decide whether an image is a scanned document.

    The image is converted from BGR to HSV and the pixels falling into
    either of two red hue bands are counted.

    Args:
        image: BGR image array.

    Returns:
        ``True`` when the number of red pixels is strictly between 1 and
        1000, otherwise ``False``.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Red sits at both ends of the hue axis, so two bands are needed.
    red_bands = (
        ([0, 70, 50], [10, 255, 255]),
        ([170, 70, 50], [180, 255, 255]),
    )
    low_hue_mask, high_hue_mask = (
        cv2.inRange(hsv, np.array(lower), np.array(upper))
        for lower, upper in red_bands
    )

    red_mask = cv2.bitwise_or(low_hue_mask, high_hue_mask)
    red_pixel_count = cv2.countNonZero(red_mask)
    return 1 < red_pixel_count < 1000
def remove_watermark(image):
    """Suppress red seal marks by working on the image's third channel only.

    The image is split into its three channels and only the last one is
    kept (the red channel for a BGR image). Values above 210 in that
    channel are pushed to 255, and the single channel is expanded back to
    a three-channel image.

    Args:
        image: Three-channel image array.

    Returns:
        A three-channel image built from the thresholded third channel.
    """
    _first, _second, third_channel = cv2.split(image)
    bright = third_channel > 210
    third_channel[bright] = 255
    return cv2.cvtColor(third_channel, cv2.COLOR_GRAY2BGR)
def overlay_rectangle(image, top_left, bottom_right, text, color, alpha=0.5, font_scale=1.0, thickness=1):
    """Return a copy of ``image`` with a translucent rectangle and optional label.

    The rectangle is filled with ``color`` and blended over the image with
    weight ``alpha``. When ``text`` is non-empty, a label is placed directly
    above the rectangle's top-left corner: a ``color`` background sized to
    the text, with the text drawn in black. The text is drawn before the
    blend, so it is blended with the label background as well.

    Args:
        image: Input image array; it is not modified.
        top_left: ``(x, y)`` of the rectangle's top-left corner.
        bottom_right: ``(x, y)`` of the rectangle's bottom-right corner.
        text: Label to draw. Pass ``''`` or ``None`` to draw only the
            filled rectangle.
        color: Fill colour, in the channel order of ``image``.
        alpha: Blend weight of the fill, clipped to ``[0, 1]``
            (0 = invisible, 1 = opaque).
        font_scale: Font scale passed to OpenCV for the label.
        thickness: Stroke thickness passed to OpenCV for the label text.

    Returns:
        A new image with the overlay applied.
    """
    alpha = float(np.clip(alpha, 0, 1))
    font = cv2.FONT_HERSHEY_TRIPLEX

    output = image.copy()
    overlay = image.copy()

    # Solid fill on the overlay layer; it becomes translucent in the blend.
    cv2.rectangle(overlay, top_left, bottom_right, color, thickness=-1)

    if text:
        x, y = top_left
        (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)

        # The label background sits just above the rectangle; keep its
        # top-left corner inside the image.
        label_top_left = (max(0, x), max(0, y - text_height - baseline))
        label_bottom_right = (x + text_width, y)
        cv2.rectangle(overlay, label_top_left, label_bottom_right, color, thickness=-1)

        # (x, y) is the bottom-left origin of the text.
        cv2.putText(output, text, (x, y), font, font_scale, (0, 0, 0), thickness)

    # Blend the overlay layer into the output in place.
    cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
    return output
1 month ago
def page_detection_visual(page_detection_result: PageDetectionResult):
    """Draw the detected page regions onto the result's image.

    Every box whose class id matches one of the known classes is drawn
    with that class's colour and a ``"<label> <confidence>"`` caption
    (confidence with two decimals). Boxes of any other class are skipped.

    Args:
        page_detection_result: Detection result providing ``image`` and
            ``boxes``; each box provides ``pos``, ``clsid`` and
            ``confidence``.

    Returns:
        The image with all matching boxes drawn. When nothing is drawn,
        this is the result's own ``image`` object.
    """
    # Ordered (class, colour) pairs; the first class whose value equals the
    # box's class id wins.
    class_styles = (
        (E.TEXT, (177, 216, 178)),
        (E.TITLE, (181, 136, 145)),
        (E.TABLE, (255, 97, 176)),
        (E.TABLE_CAPTION, (66, 73, 255)),
        (E.SCANNED_DOCUMENT, (255, 239, 145)),
    )

    canvas = page_detection_result.image
    for box in page_detection_result.boxes:
        pos = box.pos
        clsid = box.clsid
        confidence = box.confidence

        style = next(
            ((member, colour) for member, colour in class_styles if clsid == member.value),
            None,
        )
        if style is None:
            continue

        member, colour = style
        caption = f'{member.label} {confidence:.2f}'
        corner_a = (int(pos[0]), int(pos[1]))
        corner_b = (int(pos[2]), int(pos[3]))
        canvas = overlay_rectangle(canvas, corner_a, corner_b, caption, colour, font_scale=0.6)
    return canvas
def text_rec(image):
    """Run text detection and recognition on an image.

    Uses the module-level ``ocr`` engine with the angle classifier
    disabled and flattens its nested result into three parallel lists.

    Args:
        image: Image passed straight to ``ocr.ocr``.

    Returns:
        ``(boxes, texts, confidences)``: three lists of equal length where
        entry ``i`` of each describes the same recognised line. All three
        are empty when nothing was recognised.
    """
    result = ocr.ocr(image, cls=False)

    boxes = []
    texts = []
    confidences = []
    # Empty or missing entries at any level of the result are skipped.
    for page in result or []:
        if not page:
            continue
        for line in page:
            if not line:
                continue
            recognition = line[1]
            boxes.append(line[0])
            texts.append(recognition[0])
            confidences.append(recognition[1])
    return boxes, texts, confidences