You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
20 lines
478 B
Python
20 lines
478 B
Python
1 month ago
|
from marker.providers.pdf import PdfProvider
|
||
|
import tempfile
|
||
|
|
||
|
import datasets
|
||
|
|
||
|
|
||
|
def setup_pdf_provider(
    filename='adversarial.pdf',
    config=None,
) -> PdfProvider:
    """Build a ``PdfProvider`` for one PDF taken from the ``datalab-to/pdfs`` dataset.

    The ``train`` split is loaded, the row whose ``filename`` column equals
    *filename* is located, and that row's ``pdf`` value is written to a
    temporary ``.pdf`` file whose path is handed to ``PdfProvider``.

    Args:
        filename: Value to look up in the dataset's ``filename`` column.
        config: Passed through unchanged as the second ``PdfProvider`` argument.

    Returns:
        The provider constructed from the temporary file's path.

    Raises:
        ValueError: If no row in the dataset has the requested filename.
    """
    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    try:
        idx = dataset['filename'].index(filename)
    except ValueError:
        # Same exception type as before, but say which file was missing.
        raise ValueError(
            f"{filename!r} not found in the 'filename' column of datalab-to/pdfs"
        ) from None

    temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
    # Read only the matching row rather than pulling the whole 'pdf' column.
    temp_pdf.write(dataset[idx]['pdf'])
    temp_pdf.flush()

    provider = PdfProvider(temp_pdf.name, config)
    # NamedTemporaryFile removes the file once its handle is closed or garbage
    # collected. Keep the handle referenced from the provider so the path stays
    # valid for as long as the provider is alive.
    # NOTE(review): assumes PdfProvider instances accept extra attributes
    # (no __slots__) -- confirm against marker.providers.pdf.
    provider.temp_pdf = temp_pdf
    return provider
|