You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
71 lines
2.5 KiB
Python
71 lines
2.5 KiB
Python
import pytest
|
|
from marker.converters.pdf import PdfConverter
|
|
from marker.renderers.markdown import MarkdownOutput
|
|
|
|
|
|
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
    """Convert ``temp_doc`` and check the text of the resulting Markdown.

    The marks above supply the output format and a ``page_range`` of
    ``[0, 1, 2, 3, 7]``; how they are consumed is decided by fixtures
    defined outside this file.
    """
    result: MarkdownOutput = pdf_converter(temp_doc.name)
    text = result.markdown

    # Sanity checks: something was produced and the title heading is present.
    assert 0 < len(text)
    assert "# Subspace Adversarial Training" in text

    # Sentences that continue from one page to the next must come out joined.
    assert "AT solutions. However, these methods highly rely on specifically" in text  # pgs: 1-2
    assert "(with adversarial perturbations), which harms natural accuracy, " in text  # pgs: 3-4

    # Sentences that continue from one column to the next must come out joined.
    assert "remain similar across a wide range of choices." in text  # pg: 2
    assert "a new scheme for designing more robust and efficient" in text  # pg: 8
|
|
|
|
@pytest.mark.filename("manual.epub")
@pytest.mark.config({"page_range": [0]})
def test_epub_converter(pdf_converter: PdfConverter, temp_doc):
    """Convert ``temp_doc`` and check that a known phrase is in the Markdown.

    The ``filename`` mark names ``manual.epub`` -- presumably the input the
    ``temp_doc`` fixture provides; confirm against the fixture definition.
    """
    text = pdf_converter(temp_doc.name).markdown

    # A phrase expected in the converted output.
    assert "Simple Sabotage Field Manual" in text
|
|
|
|
@pytest.mark.filename("single_sheet.xlsx")
@pytest.mark.config({"page_range": [0]})
def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc):
    """Convert ``temp_doc`` and check that a known value is in the Markdown.

    The ``filename`` mark names ``single_sheet.xlsx`` -- presumably the input
    the ``temp_doc`` fixture provides; confirm against the fixture definition.
    """
    rendered: MarkdownOutput = pdf_converter(temp_doc.name)
    sheet_text = rendered.markdown

    # A value expected in the converted output.
    assert "four" in sheet_text
|
|
|
|
|
|
@pytest.mark.filename("china.html")
@pytest.mark.config({"page_range": [10]})
def test_html_converter(pdf_converter: PdfConverter, temp_doc):
    """Convert ``temp_doc`` and check that a known phrase is in the Markdown.

    The ``filename`` mark names ``china.html`` and the ``config`` mark passes
    a ``page_range`` of ``[10]``; both are interpreted by fixtures defined
    outside this file.
    """
    expected_phrase = "Republic of China"
    converted: MarkdownOutput = pdf_converter(temp_doc.name)

    # The phrase must appear somewhere in the converted output.
    assert expected_phrase in converted.markdown
|
|
|
|
|
|
@pytest.mark.filename("gatsby.docx")
@pytest.mark.config({"page_range": [0]})
def test_docx_converter(pdf_converter: PdfConverter, temp_doc):
    """Convert ``temp_doc`` and check that a known phrase is in the Markdown.

    The ``filename`` mark names ``gatsby.docx`` -- presumably the input the
    ``temp_doc`` fixture provides; confirm against the fixture definition.
    """
    rendered: MarkdownOutput = pdf_converter(temp_doc.name)
    doc_text = rendered.markdown

    # A phrase expected in the converted output.
    assert "The Decline of the American Dream in the 1920s" in doc_text
|
|
|
|
|
|
@pytest.mark.filename("lambda.pptx")
@pytest.mark.config({"page_range": [0]})
def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
    """Convert ``temp_doc`` and check that a known name is in the Markdown.

    The ``filename`` mark names ``lambda.pptx`` -- presumably the input the
    ``temp_doc`` fixture provides; confirm against the fixture definition.
    """
    slide_text = pdf_converter(temp_doc.name).markdown

    # A non-ASCII name expected in the converted output.
    assert "Adam Doupé" in slide_text
|