You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

50 lines
1.3 KiB
Python

1 month ago
import multiprocessing as mp
import pytest
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.blocks import SectionHeader
from marker.schema.document import Document
from marker.schema.registry import register_block_class
from marker.schema.text import Line
from tests.utils import setup_pdf_provider
class NewSectionHeader(SectionHeader):
    """Empty ``SectionHeader`` subclass used as an override target in tests.

    It adds no behavior of its own; the tests only check that blocks are
    instantiated as this class when it is supplied through ``override_map``.
    """
class NewLine(Line):
    """Empty ``Line`` subclass used as an override target in tests.

    It adds no behavior of its own; the tests only check that lines are
    instantiated as this class when it is supplied through ``override_map``.
    """
@pytest.mark.config(
    {
        "page_range": [0],
        "override_map": {BlockTypes.SectionHeader: NewSectionHeader},
    }
)
def test_overriding(pdf_document: Document):
    """Check that the first structural block of page 0 is a ``NewSectionHeader``.

    The ``config`` marker sets ``page_range`` to ``[0]`` and maps
    ``BlockTypes.SectionHeader`` to ``NewSectionHeader`` in ``override_map``.
    """
    first_page = pdf_document.pages[0]
    first_block_id = first_page.structure[0]
    block = first_page.get_block(first_block_id)

    # Exact class comparison: the block must be built from the override class.
    assert block.__class__ == NewSectionHeader
def get_lines(pdf: str, config=None):
    """Register block-class overrides, then return the lines of page 0.

    Args:
        pdf: PDF identifier forwarded unchanged to ``setup_pdf_provider``.
        config: Optional provider configuration. When it contains an
            ``"override_map"`` entry, every ``block_type -> block_cls`` pair
            in it is passed to ``register_block_class`` before the provider
            is built. ``None``, or a config without that key, registers
            nothing.

    Returns:
        The result of ``provider.get_page_lines(0)``.
    """
    # `config` defaults to None, so it must not be subscripted directly; a
    # missing or empty override map simply means there is nothing to register.
    override_map = (config or {}).get("override_map") or {}

    # NOTE(review): this function is the pool worker in `test_overriding_mp`;
    # presumably the registration is done here so that it takes effect inside
    # each worker process -- confirm against the registry implementation.
    for block_type, block_cls in override_map.items():
        register_block_class(block_type, block_cls)

    # Forward the caller's original `config` (including None) untouched.
    provider: PdfProvider = setup_pdf_provider(pdf, config)
    return provider.get_page_lines(0)
def test_overriding_mp():
    """Check that a ``Line`` override is applied when run through a pool.

    ``get_lines`` is executed for two PDFs in a two-process pool, and the
    first entry returned for each PDF must hold a ``line`` whose class is
    ``NewLine``.
    """
    config = {
        "page_range": [0],
        "override_map": {BlockTypes.Line: NewLine},
    }
    pdf_names = ["adversarial.pdf", "adversarial_rot.pdf"]

    # One (pdf, config) argument tuple per worker invocation.
    job_args = [(pdf_name, config) for pdf_name in pdf_names]

    with mp.Pool(processes=2) as workers:
        per_pdf_lines = workers.starmap(get_lines, job_args)

        # Collect the classes eagerly, then compare them all to the override.
        line_classes = [lines[0].line.__class__ for lines in per_pdf_lines]
        assert all(line_cls == NewLine for line_cls in line_classes)