You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
42 lines
1.6 KiB
Python
42 lines
1.6 KiB
Python
import pytest
|
|
|
|
from marker.schema import BlockTypes
|
|
from marker.schema.text.line import Line
|
|
|
|
|
|
def _ocr_pipeline_test(pdf_document):
    """Run the shared assertions for the forced-OCR pipeline tests.

    Starting from the first structure entry of the first page, this walks
    down to the first nested line and span, checking block types, the text
    extraction method and the span text. It then checks the number of line
    blocks on the page and that the largest line ``y_end`` is at most 2%
    above the largest ``y_end`` of the layout-sourced text blocks.
    """
    page = pdf_document.pages[0]
    header_id = page.structure[0]
    assert header_id == '/page/0/SectionHeader/0'

    header = page.get_block(header_id)
    assert header.text_extraction_method == 'surya'
    assert header.block_type == BlockTypes.SectionHeader

    header_line: Line = page.get_block(header.structure[0])
    assert header_line.block_type == BlockTypes.Line

    header_span = page.get_block(header_line.structure[0])
    assert header_span.block_type == BlockTypes.Span
    assert header_span.text.strip() == 'Subspace Adversarial Training'

    # Ensure we match all text lines up properly.
    # Makes sure the OCR bbox is being scaled to the same scale as the
    # layout boxes.
    line_blocks = page.contained_blocks(pdf_document, (BlockTypes.Line,))
    body_blocks = page.contained_blocks(
        pdf_document,
        (BlockTypes.Text, BlockTypes.TextInlineMath),
    )
    assert len(line_blocks) == 84

    # Ensure the bbox sizes match up: compare the largest y_end across lines
    # with the largest y_end across text blocks whose source is "layout".
    furthest_line_end = max(entry.polygon.y_end for entry in line_blocks)
    furthest_layout_end = max(
        entry.polygon.y_end
        for entry in body_blocks
        if entry.source == "layout"
    )
    assert furthest_line_end <= furthest_layout_end * 1.02
|
|
|
|
|
|
@pytest.mark.config(
    {
        "force_ocr": True,
        "page_range": [0],
    }
)
def test_ocr_pipeline(pdf_document):
    """Run the shared OCR pipeline assertions with ``force_ocr`` enabled on page 0."""
    _ocr_pipeline_test(pdf_document)
|
|
|
|
@pytest.mark.config(
    {
        "force_ocr": True,
        "page_range": [0],
        "use_llm": True,
    }
)
def test_ocr_with_inline_pipeline(pdf_document):
    """Run the shared OCR pipeline assertions with ``force_ocr`` and ``use_llm`` enabled on page 0."""
    _ocr_pipeline_test(pdf_document)
|
|
|