You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
55 lines
1.7 KiB
Python
55 lines
1.7 KiB
Python
import fitz as pymupdf
|
|
from surya.common.util import rescale_bbox
|
|
|
|
|
|
def get_pdf_lines(pdf_path, img_sizes):
|
|
doc = pymupdf.open(pdf_path)
|
|
page_lines = []
|
|
for idx, img_size in enumerate(img_sizes):
|
|
page = doc[idx]
|
|
blocks = page.get_text("dict", sort=True, flags=pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]
|
|
|
|
line_boxes = []
|
|
for block_idx, block in enumerate(blocks):
|
|
for l in block["lines"]:
|
|
line_boxes.append(list(l["bbox"]))
|
|
|
|
page_box = page.bound()
|
|
pwidth, pheight = page_box[2] - page_box[0], page_box[3] - page_box[1]
|
|
line_boxes = [rescale_bbox(bbox, (pwidth, pheight), img_size) for bbox in line_boxes]
|
|
page_lines.append(line_boxes)
|
|
|
|
return page_lines
|
|
|
|
def merge_boxes(box1, box2):
|
|
return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box1[2], box2[2]), max(box1[3], box2[3]))
|
|
|
|
|
|
def join_lines(bboxes, max_gap=5):
|
|
to_merge = {}
|
|
for i, box1 in bboxes:
|
|
for z, box2 in bboxes[i + 1:]:
|
|
j = i + z + 1
|
|
if box1 == box2:
|
|
continue
|
|
|
|
if box1[0] <= box2[0] and box1[2] >= box2[2]:
|
|
if abs(box1[1] - box2[3]) <= max_gap:
|
|
if i not in to_merge:
|
|
to_merge[i] = []
|
|
to_merge[i].append(j)
|
|
|
|
merged_boxes = set()
|
|
merged = []
|
|
for i, box in bboxes:
|
|
if i in merged_boxes:
|
|
continue
|
|
|
|
if i in to_merge:
|
|
for j in to_merge[i]:
|
|
box = merge_boxes(box, bboxes[j][1])
|
|
merged_boxes.add(j)
|
|
|
|
merged.append(box)
|
|
return merged
|