You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
419 lines
18 KiB
Python
419 lines
18 KiB
Python
4 weeks ago
|
import fitz
|
||
|
from magic_pdf.config.constants import CROSS_PAGE
|
||
|
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
|
||
|
ContentType)
|
||
|
from magic_pdf.data.dataset import Dataset
|
||
|
from magic_pdf.model.magic_model import MagicModel
|
||
|
|
||
|
|
||
|
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bounding box recorded for page ``i`` onto ``page``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page collections of ``(x0, y0, x1, y1)`` boxes.
        page: Page-like object exposing ``draw_rect``.
        rgb_config: Colour channels on a 0-255 scale; each one is divided by
            255 before being handed to ``draw_rect``.
        fill_config: When truthy the rectangles are filled (opacity 0.3) and
            get no outline colour; otherwise only the outline is drawn.
    """
    normalized_rgb = [float(channel) / 255 for channel in rgb_config]

    # Only the colour-related arguments differ between the two modes.
    if fill_config:
        rect_style = {'color': None, 'fill': normalized_rgb, 'fill_opacity': 0.3}
    else:
        rect_style = {'color': normalized_rgb, 'fill': None, 'fill_opacity': 1}

    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **rect_style,
        )
|
||
|
|
||
|
|
||
|
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
    """Label the boxes of page ``i`` with their 1-based position in the list.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page collections of ``(x0, y0, x1, y1)`` boxes.
        page: Page-like object exposing ``draw_rect`` and ``insert_text``.
        rgb_config: Colour channels on a 0-255 scale; each one is divided by
            255 before use. The same colour is used for box and label.
        fill_config: When truthy the rectangles are filled (opacity 0.3) and
            get no outline colour; otherwise only the outline is drawn.
        draw_bbox: When falsy no rectangle is drawn and only the number
            labels are written.
    """
    normalized_rgb = [float(channel) / 255 for channel in rgb_config]

    if fill_config:
        rect_style = {'color': None, 'fill': normalized_rgb, 'fill_opacity': 0.3}
    else:
        rect_style = {'color': normalized_rgb, 'fill': None, 'fill_opacity': 1}

    for number, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        # Built unconditionally; the draw_bbox toggle only governs drawing.
        rect = fitz.Rect(x0, y0, x1, y1)
        if draw_bbox:
            page.draw_rect(rect, width=0.5, overlay=True, **rect_style)
        # The label is anchored 2 units past x1 and 10 units past y0.
        label_anchor = (x1 + 2, y0 + 10)
        page.insert_text(label_anchor, str(number), fontsize=10, color=normalized_rgb)
|
||
|
|
||
|
|
||
|
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Render a copy of the PDF with layout blocks highlighted and numbered.

    For every entry of ``pdf_info`` the boxes of its ``'discarded_blocks'``
    and ``'para_blocks'`` are grouped by block type. Each group is drawn with
    its own colour through ``draw_bbox_without_number`` (``fill_config=True``),
    and the blocks are then numbered in list order through
    ``draw_bbox_with_number`` (labels only, ``draw_bbox=False``).

    Args:
        pdf_info: Iterable of per-page dicts providing ``'discarded_blocks'``
            and ``'para_blocks'``; entry ``n`` is drawn onto PDF page ``n``.
        pdf_bytes: Raw PDF content, opened with ``fitz.open('pdf', pdf_bytes)``.
        out_path: Directory part of the output path.
        filename: File name part of the output path.

    The result is saved to ``f'{out_path}/{filename}'``.
    """
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []
    lists_list = []
    indexs_list = []
    layout_bbox_list = []

    # Numbering order of the parts of a table block.
    table_type_order = {
        'table_caption': 1,
        'table_body': 2,
        'table_footnote': 3
    }
    plain_block_types = [
        BlockType.Text,
        BlockType.Title,
        BlockType.InterlineEquation,
        BlockType.List,
        BlockType.Index,
    ]

    for page in pdf_info:
        tables_body, tables_caption, tables_footnote = [], [], []
        imgs_body, imgs_caption, imgs_footnote = [], [], []
        titles = []
        texts = []
        interequations = []
        lists = []
        indices = []
        page_block_list = []

        dropped_bbox_list.append(
            [dropped_bbox['bbox'] for dropped_bbox in page['discarded_blocks']]
        )

        for block in page['para_blocks']:
            block_type = block['type']
            bbox = block['bbox']

            # Colour groups: table and image blocks contribute their nested
            # parts, every other known type contributes its own box.
            if block_type == BlockType.Table:
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    if nested_block['type'] == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_block['type'] == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_block['type'] == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    if nested_block['type'] == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_block['type'] == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
                    elif nested_block['type'] == BlockType.ImageFootnote:
                        imgs_footnote.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(bbox)
            elif block_type == BlockType.Text:
                texts.append(bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(bbox)
            elif block_type == BlockType.List:
                lists.append(bbox)
            elif block_type == BlockType.Index:
                indices.append(bbox)

            # Numbering sequence: plain blocks as they come, image parts in
            # stored order, table parts as caption -> body -> footnote.
            if block_type in plain_block_types:
                page_block_list.append(bbox)
            elif block_type in [BlockType.Image]:
                for sub_block in block['blocks']:
                    page_block_list.append(sub_block['bbox'])
            elif block_type in [BlockType.Table]:
                sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
                for sub_block in sorted_blocks:
                    page_block_list.append(sub_block['bbox'])

        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
        lists_list.append(lists)
        indexs_list.append(indices)
        layout_bbox_list.append(page_block_list)

    # (boxes, RGB colour) pairs in drawing order; later layers overlay earlier ones.
    colored_layers = (
        (dropped_bbox_list, [158, 158, 158]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (imgs_footnote_list, [255, 178, 102]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [0, 255, 0]),
        (lists_list, [40, 169, 92]),
        (indexs_list, [40, 169, 92]),
    )

    # The context manager closes the document even if drawing or saving fails.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        for i, page in enumerate(pdf_docs):
            for layer_bboxes, layer_rgb in colored_layers:
                draw_bbox_without_number(i, layer_bboxes, page, layer_rgb, True)

            draw_bbox_with_number(
                i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
            )

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}')
|
||
|
|
||
|
|
||
|
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Render a copy of the PDF with span boxes outlined, coloured by type.

    Spans are read from each page's ``'preproc_blocks'`` and grouped by their
    ``'type'`` (text, inline equation, interline equation, image, table).
    Spans of blocks in ``'discarded_blocks'`` whose type is
    ``BlockType.Discarded`` form a separate group. Text and inline-equation
    spans flagged with ``CROSS_PAGE`` are drawn on the following page instead
    of the page they were found on.

    Args:
        pdf_info: Iterable of per-page dicts providing ``'discarded_blocks'``
            and ``'preproc_blocks'``; entry ``n`` is drawn onto PDF page ``n``.
        pdf_bytes: Raw PDF content, opened with ``fitz.open('pdf', pdf_bytes)``.
        out_path: Directory part of the output path.
        filename: File name part of the output path.

    The result is saved to ``f'{out_path}/{filename}'``.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    # Spans flagged as cross-page wait here until the next page is processed.
    next_page_text_list = []
    next_page_inline_equation_list = []

    line_block_types = [
        BlockType.Text,
        BlockType.Title,
        BlockType.InterlineEquation,
        BlockType.List,
        BlockType.Index,
    ]

    def get_span_info(span, page_spans):
        """File the span's bbox under its type for the current page."""
        if span['type'] == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span['bbox'])
            else:
                page_spans['text'].append(span['bbox'])
        elif span['type'] == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span['bbox'])
            else:
                page_spans['inline_equation'].append(span['bbox'])
        elif span['type'] == ContentType.InterlineEquation:
            page_spans['interline_equation'].append(span['bbox'])
        elif span['type'] == ContentType.Image:
            page_spans['image'].append(span['bbox'])
        elif span['type'] == ContentType.Table:
            page_spans['table'].append(span['bbox'])

    for page in pdf_info:
        # Start the page with the cross-page spans carried over from the
        # previous page, then empty the carry-over lists.
        page_spans = {
            'text': list(next_page_text_list),
            'inline_equation': list(next_page_inline_equation_list),
            'interline_equation': [],
            'image': [],
            'table': [],
        }
        next_page_text_list.clear()
        next_page_inline_equation_list.clear()

        # Build the dropped list.
        page_dropped_list = []
        for block in page['discarded_blocks']:
            if block['type'] == BlockType.Discarded:
                for line in block['lines']:
                    for span in line['spans']:
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)

        # Build the remaining lists. Spans are taken from 'preproc_blocks'
        # (the blocks before paragraph merging) rather than 'para_blocks'.
        for block in page['preproc_blocks']:
            if block['type'] in line_block_types:
                for line in block['lines']:
                    for span in line['spans']:
                        get_span_info(span, page_spans)
            elif block['type'] in [BlockType.Image, BlockType.Table]:
                for sub_block in block['blocks']:
                    for line in sub_block['lines']:
                        for span in line['spans']:
                            get_span_info(span, page_spans)

        text_list.append(page_spans['text'])
        inline_equation_list.append(page_spans['inline_equation'])
        interline_equation_list.append(page_spans['interline_equation'])
        image_list.append(page_spans['image'])
        table_list.append(page_spans['table'])

    # (boxes, RGB colour) pairs in drawing order.
    outlined_layers = (
        (text_list, [255, 0, 0]),
        (inline_equation_list, [0, 255, 0]),
        (interline_equation_list, [0, 0, 255]),
        (image_list, [255, 204, 0]),
        (table_list, [204, 0, 255]),
        (dropped_list, [158, 158, 158]),
    )

    # The context manager closes the document even if drawing or saving fails.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        for i, page in enumerate(pdf_docs):
            for layer_bboxes, layer_rgb in outlined_layers:
                draw_bbox_without_number(i, layer_bboxes, page, layer_rgb, False)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}')
|
||
|
|
||
|
|
||
|
def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
    """Draw the model's layout detections onto the dataset pages and dump them.

    A ``MagicModel`` is built from ``model_list`` and ``dataset``; for each
    page its ``'layout_dets'`` are grouped by ``'category_id'``. Every group
    is then drawn, filled and numbered, through ``draw_bbox_with_number`` on
    the page returned by ``dataset.get_page``.

    Args:
        model_list: Per-page model output; only its length is used directly,
            the content is consumed by ``MagicModel``.
        dataset: Dataset providing ``get_page``, ``__len__`` and
            ``dump_to_file``.
        out_path: Directory part of the output path.
        filename: File name part of the output path.

    The result is written with ``dataset.dump_to_file`` to
    ``f'{out_path}/{filename}'``.
    """
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []

    magic_model = MagicModel(model_list, dataset)
    page_count = len(model_list)
    for page_no in range(page_count):
        dropped_on_page = []
        table_bodies, table_captions, table_footnotes = [], [], []
        image_bodies, image_captions, image_footnotes = [], [], []
        title_boxes = []
        text_boxes = []
        equation_boxes = []

        detections = magic_model.get_model_list(page_no)['layout_dets']
        for detection in detections:
            det_bbox = detection['bbox']
            if detection['category_id'] == CategoryId.Text:
                text_boxes.append(det_bbox)
            elif detection['category_id'] == CategoryId.Title:
                title_boxes.append(det_bbox)
            elif detection['category_id'] == CategoryId.TableBody:
                table_bodies.append(det_bbox)
            elif detection['category_id'] == CategoryId.TableCaption:
                table_captions.append(det_bbox)
            elif detection['category_id'] == CategoryId.TableFootnote:
                table_footnotes.append(det_bbox)
            elif detection['category_id'] == CategoryId.ImageBody:
                image_bodies.append(det_bbox)
            elif detection['category_id'] == CategoryId.ImageCaption:
                image_captions.append(det_bbox)
            elif detection['category_id'] == CategoryId.InterlineEquation_YOLO:
                equation_boxes.append(det_bbox)
            elif detection['category_id'] == CategoryId.Abandon:
                dropped_on_page.append(det_bbox)
            elif detection['category_id'] == CategoryId.ImageFootnote:
                image_footnotes.append(det_bbox)

        tables_body_list.append(table_bodies)
        tables_caption_list.append(table_captions)
        tables_footnote_list.append(table_footnotes)
        imgs_body_list.append(image_bodies)
        imgs_caption_list.append(image_captions)
        titles_list.append(title_boxes)
        texts_list.append(text_boxes)
        interequations_list.append(equation_boxes)
        dropped_bbox_list.append(dropped_on_page)
        imgs_footnote_list.append(image_footnotes)

    # (boxes, RGB colour) pairs in drawing order.
    numbered_layers = (
        (dropped_bbox_list, [158, 158, 158]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (imgs_footnote_list, [255, 178, 102]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [0, 255, 0]),
    )

    for i in range(len(dataset)):
        page = dataset.get_page(i)
        for layer_bboxes, layer_rgb in numbered_layers:
            draw_bbox_with_number(i, layer_bboxes, page, layer_rgb, True)

    # Save the PDF
    dataset.dump_to_file(f'{out_path}/{filename}')
|
||
|
|
||
|
|
||
|
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Render a copy of the PDF with line boxes numbered by their sort index.

    For each page the lines of its ``'preproc_blocks'`` are collected together
    with their ``'index'``, sorted by that index, and drawn as outlined,
    numbered boxes through ``draw_bbox_with_number``.

    Which lines represent a block:
        * text blocks: their ``'lines'``;
        * title / interline-equation blocks: their ``'virtual_lines'`` when
          that key exists (nothing is collected if those are empty or the
          first one has no index), otherwise their ``'lines'``;
        * image / table bodies: their ``'virtual_lines'`` when non-empty and
          indexed, otherwise their ``'lines'``;
        * image / table captions and footnotes: their ``'lines'``.

    Args:
        pdf_info: Iterable of per-page dicts providing ``'preproc_blocks'``;
            entry ``n`` is drawn onto PDF page ``n``.
        pdf_bytes: Raw PDF content, opened with ``fitz.open('pdf', pdf_bytes)``.
        out_path: Directory part of the output path.
        filename: File name part of the output path.

    The result is saved to ``f'{out_path}/{filename}'``.
    """

    def indexed_lines(lines):
        """Return ``(index, bbox)`` pairs for the given lines."""
        return [(line['index'], line['bbox']) for line in lines]

    def has_indexed_virtual_lines(virtual_lines):
        """Tell whether virtual lines exist and the first one carries an index."""
        return len(virtual_lines) > 0 and virtual_lines[0].get('index', None) is not None

    layout_bbox_list = []

    for page in pdf_info:
        page_line_list = []
        for block in page['preproc_blocks']:
            if block['type'] in [BlockType.Text]:
                page_line_list.extend(indexed_lines(block['lines']))
            elif block['type'] in [BlockType.Title, BlockType.InterlineEquation]:
                # The fallback to 'lines' applies only when the block has no
                # 'virtual_lines' key at all.
                if 'virtual_lines' in block:
                    if has_indexed_virtual_lines(block['virtual_lines']):
                        page_line_list.extend(indexed_lines(block['virtual_lines']))
                else:
                    page_line_list.extend(indexed_lines(block['lines']))
            elif block['type'] in [BlockType.Image, BlockType.Table]:
                for sub_block in block['blocks']:
                    if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
                        if has_indexed_virtual_lines(sub_block['virtual_lines']):
                            page_line_list.extend(indexed_lines(sub_block['virtual_lines']))
                        else:
                            page_line_list.extend(indexed_lines(sub_block['lines']))
                    elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
                        page_line_list.extend(indexed_lines(sub_block['lines']))

        # Sort on the index alone so lines with equal indices keep their
        # collection order, and store a real list: a generator could only be
        # iterated once.
        sorted_lines = sorted(page_line_list, key=lambda entry: entry[0])
        layout_bbox_list.append([bbox for _, bbox in sorted_lines])

    # The context manager closes the document even if drawing or saving fails.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)

        pdf_docs.save(f'{out_path}/{filename}')
|
||
|
|
||
|
|
||
|
def draw_char_bbox(pdf_bytes, out_path, filename):
    """Render a copy of the PDF with a thin red outline around every character.

    Characters are taken from ``page.get_text('rawdict', ...)`` by walking
    blocks -> lines -> spans -> chars and outlining each char's ``'bbox'``.

    Args:
        pdf_bytes: Raw PDF content, opened with ``fitz.open('pdf', pdf_bytes)``.
        out_path: Directory part of the output path.
        filename: File name part of the output path.

    The result is saved to ``f'{out_path}/{filename}'``.
    """
    text_flags = (
        fitz.TEXT_PRESERVE_LIGATURES
        | fitz.TEXT_PRESERVE_WHITESPACE
        | fitz.TEXT_MEDIABOX_CLIP
    )

    # The context manager closes the document even if drawing or saving fails.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        for page in pdf_docs:
            for block in page.get_text('rawdict', flags=text_flags)['blocks']:
                for line in block['lines']:
                    for span in line['spans']:
                        for char in span['chars']:
                            page.draw_rect(
                                char['bbox'],
                                color=[1, 0, 0],
                                fill=None,
                                fill_opacity=1,
                                width=0.3,
                                overlay=True,
                            )

        pdf_docs.save(f'{out_path}/{filename}')
|