You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

55 lines
2.4 KiB
Python

import pytest
from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.processors.table import TableProcessor
from marker.schema import BlockTypes
@pytest.mark.filename("water_damage.pdf")
def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec_model):
    """Check page 0's leading table before and after running ``TableProcessor``.

    Before processing, the first structure entry of page 0 must be a
    ``Table`` block whose first child is a ``Line`` that in turn starts with
    a span. After the processor has run over the document, the first table
    on the page must contain the text "варіант" and the table's first child
    must be a ``TableCell``.
    """
    page = pdf_document.pages[0]
    table_id = page.structure[0]
    assert table_id == '/page/0/Table/0'

    initial_table = page.get_block(table_id)
    assert initial_table.block_type == BlockTypes.Table
    assert initial_table.structure[0] == "/page/0/Line/1"

    first_child = page.get_block(initial_table.structure[0])
    assert first_child.block_type == BlockTypes.Line
    assert first_child.structure[0] == "/page/0/Span/3"

    # The initial pass performs no OCR; that only happens in the TableProcessor.
    run_table_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
    run_table_processor(pdf_document)

    # Look the page up again rather than reusing the pre-processing reference.
    page = pdf_document.pages[0]
    tables = page.contained_blocks(pdf_document, (BlockTypes.Table,))
    assert "варіант" in tables[0].raw_text(pdf_document)

    # The table block fetched earlier is reused to resolve its (new) first child.
    first_child = page.get_block(initial_table.structure[0])
    assert first_child.block_type == BlockTypes.TableCell
@pytest.mark.filename("hindi_judgement.pdf")
@pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
def test_garbled_builder(config, doc_provider, detection_model, inline_detection_model, ocr_error_model):
    """Expect OCR error detection to flag at least one "bad" label.

    Builds a document from ``doc_provider``, runs
    ``LineBuilder.ocr_error_detection`` over its pages and the provider's
    page lines, and requires exactly two labels with at least one of them
    equal to "bad".
    """
    # Construct the line builder first, then the document, as the detection
    # call needs both.
    detector = LineBuilder(detection_model, inline_detection_model, ocr_error_model, config)
    built_doc = DocumentBuilder(config).build_document(doc_provider)

    detection = detector.ocr_error_detection(built_doc.pages, doc_provider.page_lines)

    assert len(detection.labels) == 2
    assert any(label == "bad" for label in detection.labels)
@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
def test_nongarbled_builder(config, doc_provider, detection_model, inline_detection_model, ocr_error_model):
    """Expect OCR error detection to label every result "good".

    Builds a document from ``doc_provider``, runs
    ``LineBuilder.ocr_error_detection`` over its pages and the provider's
    page lines, and requires exactly two labels, all equal to "good".
    """
    # Construct the line builder first, then the document, as the detection
    # call needs both.
    detector = LineBuilder(detection_model, inline_detection_model, ocr_error_model, config)
    built_doc = DocumentBuilder(config).build_document(doc_provider)

    detection = detector.ocr_error_detection(built_doc.pages, doc_provider.page_lines)

    assert len(detection.labels) == 2
    assert all(label == "good" for label in detection.labels)