You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

40 lines
1.1 KiB
Python

import time
import torch
import click
import pypdfium2 as pdfium
from tqdm import tqdm
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.argument("pdf_path", type=str)
@click.option(
    "--iterations",
    type=click.IntRange(min=1),
    default=10,
    show_default=True,
    help="Number of timed conversion runs to average over.",
)
def main(pdf_path, iterations):
    """Benchmark repeated PDF-to-markdown conversion of a single file.

    Counts the pages of ``pdf_path``, builds the model dictionary once, then
    converts the same PDF ``iterations`` times with a fresh ``PdfConverter``
    per run. Prints the mean wall time of one conversion and the peak CUDA
    memory allocated during the runs.

    Args:
        pdf_path: Path to the PDF file to convert.
        iterations: Number of timed conversions; must be at least 1 so the
            mean is well defined. Defaults to 10.
    """
    print(f"Converting {pdf_path} to markdown...")

    # Open the document only to read its page count; always release the
    # handle, even if reading the length fails.
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page_count = len(pdf)
    finally:
        pdf.close()

    # Built once and shared by every converter so that model construction is
    # not part of the timed region.
    model_dict = create_model_dict()

    # The CUDA memory-stat calls require a usable CUDA device; skip them on
    # CPU-only hosts instead of crashing before the benchmark starts.
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        torch.cuda.reset_peak_memory_stats()

    times = []
    for _ in tqdm(range(iterations), desc="Benchmarking"):
        # Converter construction is deliberately outside the timed region.
        block_converter = PdfConverter(
            artifact_dict=model_dict,
            config={"disable_tqdm": True},
        )
        # perf_counter is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        block_converter(pdf_path)
        times.append(time.perf_counter() - start)

    # Bytes -> GiB (1024 ** 3); the printed label says "GB".
    if has_cuda:
        max_gpu_vram = torch.cuda.max_memory_allocated() / 1024 ** 3
    else:
        max_gpu_vram = 0.0

    # The reported duration is the mean of the per-run times, not the total.
    print(f"Converted {page_count} pages in {sum(times)/len(times):.2f} seconds.")
    print(f"Max GPU VRAM: {max_gpu_vram:.2f} GB")
# Invoke the click command only when this file is executed as a script, so
# importing the module does not start a benchmark run.
if __name__ == "__main__":
    main()