pdf2markdown/marker/tests/builders/test_ocr_pipeline.py

import pytest

from marker.schema import BlockTypes
from marker.schema.text.line import Line


def _ocr_pipeline_test(pdf_document):
    first_page = pdf_document.pages[0]
    assert first_page.structure[0] == '/page/0/SectionHeader/0'

    first_block = first_page.get_block(first_page.structure[0])
    assert first_block.text_extraction_method == 'surya'
    assert first_block.block_type == BlockTypes.SectionHeader

    first_text_block: Line = first_page.get_block(first_block.structure[0])
    assert first_text_block.block_type == BlockTypes.Line

    first_span = first_page.get_block(first_text_block.structure[0])
    assert first_span.block_type == BlockTypes.Span
    assert first_span.text.strip() == 'Subspace Adversarial Training'

    # Ensure we match all text lines up properly
    # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
    text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
    assert len(text_lines) == 84

    # Ensure the bbox sizes match up
    max_line_position = max([line.polygon.y_end for line in text_lines])
    max_block_position = max([block.polygon.y_end for block in text_blocks if block.source == "layout"])
    assert max_line_position <= (max_block_position * 1.02)


@pytest.mark.config({"force_ocr": True, "page_range": [0]})
def test_ocr_pipeline(pdf_document):
    _ocr_pipeline_test(pdf_document)

@pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
def test_ocr_with_inline_pipeline(pdf_document):
    _ocr_pipeline_test(pdf_document)