You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

55 lines
1.7 KiB
Python

import fitz as pymupdf
from surya.common.util import rescale_bbox
def get_pdf_lines(pdf_path, img_sizes):
    """Collect the text-line bounding boxes of a PDF, rescaled per page.

    Page ``i`` of the document is paired with ``img_sizes[i]``; only as many
    pages as there are entries in ``img_sizes`` are read.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pymupdf.open``.
        img_sizes: One target size per page, forwarded to ``rescale_bbox``
            alongside the page's ``(width, height)``. Presumably also
            ``(width, height)`` -- confirm against ``rescale_bbox``.

    Returns:
        A list with one entry per processed page; each entry is the list of
        that page's line boxes as returned by ``rescale_bbox``.
    """
    # Plain dict extraction, minus ligature preservation and image blocks.
    text_flags = (
        pymupdf.TEXTFLAGS_DICT
        & ~pymupdf.TEXT_PRESERVE_LIGATURES
        & ~pymupdf.TEXT_PRESERVE_IMAGES
    )

    page_lines = []
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was left open for the garbage collector.
    with pymupdf.open(pdf_path) as doc:
        for idx, img_size in enumerate(img_sizes):
            page = doc[idx]
            blocks = page.get_text("dict", sort=True, flags=text_flags)["blocks"]

            # Copy each bbox into a fresh list so later rescaling never
            # touches PyMuPDF's own data.
            line_boxes = [
                list(line["bbox"]) for block in blocks for line in block["lines"]
            ]

            page_box = page.bound()
            page_size = (page_box[2] - page_box[0], page_box[3] - page_box[1])
            page_lines.append(
                [rescale_bbox(bbox, page_size, img_size) for bbox in line_boxes]
            )

    return page_lines
def merge_boxes(box1, box2):
    """Return the smallest box that encloses both ``box1`` and ``box2``.

    Each box is indexed as ``[0..3]``; the first two entries are combined
    with ``min`` and the last two with ``max``. This matches an
    ``(x0, y0, x1, y1)`` corner layout -- presumably the PyMuPDF bbox order.

    Returns:
        A 4-tuple, regardless of the input container type.
    """
    x_min = min(box1[0], box2[0])
    y_min = min(box1[1], box2[1])
    x_max = max(box1[2], box2[2])
    y_max = max(box1[3], box2[3])
    return (x_min, y_min, x_max, y_max)
def join_lines(bboxes, max_gap=5):
    """Merge boxes that sit directly against a wider box.

    A later box is folded into an earlier one when the earlier box spans it
    on the first axis (``box1[0] <= box2[0]`` and ``box1[2] >= box2[2]``) and
    the earlier box's ``[1]`` edge lies within ``max_gap`` of the later box's
    ``[3]`` edge. Identical boxes are never merged with each other.

    Args:
        bboxes: Sequence of ``(index, box)`` pairs, each box indexable as
            ``[0..3]`` (assumed ``(x0, y0, x1, y1)`` -- confirm with caller).
            Only the position of each pair in the sequence is used; the
            stored index is ignored.
        max_gap: Largest allowed distance between the two compared edges.

    Returns:
        A list of boxes in input order. Boxes absorbed into another are
        omitted; merged boxes are the tuples produced by ``merge_boxes``,
        untouched boxes are returned as given.
    """
    boxes = [box for _, box in bboxes]

    # Map each absorbing position to the positions it swallows. Positions
    # come from enumerate/range so they always index ``boxes`` correctly; the
    # old ``j = i + z + 1`` added the pair's absolute index as if it were a
    # slice offset, which pointed at the wrong box or past the end.
    to_merge = {}
    for i, box1 in enumerate(boxes):
        for j in range(i + 1, len(boxes)):
            box2 = boxes[j]
            if box1 == box2:
                continue

            spans = box1[0] <= box2[0] and box1[2] >= box2[2]
            if spans and abs(box1[1] - box2[3]) <= max_gap:
                to_merge.setdefault(i, []).append(j)

    absorbed = set()
    merged = []
    for i, box in enumerate(boxes):
        # A box already folded into an earlier one is not emitted again.
        if i in absorbed:
            continue

        for j in to_merge.get(i, ()):
            box = merge_boxes(box, boxes[j])
            absorbed.add(j)

        merged.append(box)
    return merged