import re
import pytest
from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownOutput
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.util import classes_to_strings
@pytest.mark.filename("arxiv_test.pdf")
@pytest.mark.output_format("markdown")
def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
    """Check that internal PDF links are preserved in the document and in markdown.

    The test verifies, in order:

    1. A span containing "II." on ``pdf_document.pages[1]`` carries the
       internal URL ``#page-1-0``.
    2. The first section header on that page reads
       ``II. THEORETICAL FRAMEWORK`` and the page's first ref is ``page-1-0``.
    3. After running a ``PdfConverter`` configured with only the
       ``ReferenceProcessor``, the markdown contains the ``[II.](#page-1-0)``
       link and the section header text.
    4. Every ``](#page-P-I)`` link target found in the markdown has a matching
       anchor in the same markdown.

    Args:
        pdf_document: Document fixture built from the PDF named by the
            ``filename`` marker.
        config: Configuration passed through to ``PdfConverter``.
        renderer: Renderer class; converted to its string form for the converter.
        model_dict: Passed to ``PdfConverter`` as ``artifact_dict``.
        temp_doc: Object whose ``name`` attribute is the path handed to the
            converter.

    Raises:
        ValueError: If no span containing "II." is found on the inspected page.
    """
    # NOTE: this is ``pages[1]`` (the second entry of ``pages``); all refs
    # expected below are of the form "page-1-*".
    page = pdf_document.pages[1]

    processors = ["marker.processors.reference.ReferenceProcessor"]
    pdf_converter = PdfConverter(
        artifact_dict=model_dict,
        processor_list=processors,
        renderer=classes_to_strings([renderer])[0],
        config=config,
    )

    # Locate the span that links to section II and check its target.
    for span in page.contained_blocks(pdf_document, (BlockTypes.Span,)):
        if "II." in span.text:
            assert span.url == "#page-1-0"
            break
    else:
        raise ValueError("Could not find II. in the first page")

    section_header_block = page.contained_blocks(
        pdf_document, (BlockTypes.SectionHeader,)
    )[0]
    assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'

    assert page.refs[0].ref == "page-1-0"

    markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
    markdown = markdown_output.markdown

    assert '[II.](#page-1-0)' in markdown
    assert 'II. THEORETICAL FRAMEWORK' in markdown

    # Each match is a (page, index) tuple of digit strings. Deduplicate, and
    # sort so that a failure always reports the same first missing anchor.
    link_targets = sorted(set(re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)))
    for page_idx, ref_idx in link_targets:
        # Previously the expected string was empty, which made this check
        # vacuous ('' is contained in every string).
        # NOTE(review): anchor markup assumed to be `<span id="...">` —
        # confirm against the markdown renderer's output.
        anchor = f'<span id="page-{page_idx}-{ref_idx}">'
        assert anchor in markdown, f"Reference {anchor} not found in markdown"