You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

40 lines
1.1 KiB
Python

1 month ago
import time
import torch
import click
import pypdfium2 as pdfium
from tqdm import tqdm
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.argument("pdf_path", type=str)
@click.option(
    "--iterations",
    type=click.IntRange(min=1),
    default=10,
    show_default=True,
    help="Number of timed conversion runs to average over.",
)
def main(pdf_path, iterations=10):
    """Benchmark repeated PDF-to-markdown conversion of a single file.

    Counts the pages of the PDF, builds the model dictionary once, then
    runs the converter ``iterations`` times and prints the mean wall-clock
    time per run. When CUDA is available, the peak GPU memory allocated
    during the benchmark is printed as well.

    Args:
        pdf_path: Path of the PDF file to convert.
        iterations: Number of timed conversion runs (at least 1).
    """
    print(f"Converting {pdf_path} to markdown...")

    # Open the document only to count its pages; always release the handle,
    # even if reading the page count fails.
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page_count = len(pdf)
    finally:
        pdf.close()

    # Models are created once, outside the timed region, and shared by
    # every converter instance below.
    model_dict = create_model_dict()

    # Peak-memory statistics only exist on CUDA devices; skip them on
    # CPU-only machines instead of failing before the benchmark starts.
    track_vram = torch.cuda.is_available()
    if track_vram:
        torch.cuda.reset_peak_memory_stats()

    times = []
    for _ in tqdm(range(iterations), desc="Benchmarking"):
        # Converter construction is deliberately excluded from the timing.
        block_converter = PdfConverter(
            artifact_dict=model_dict,
            config={"disable_tqdm": True},
        )
        # perf_counter is monotonic, so the measured interval is not
        # affected by system clock adjustments.
        start = time.perf_counter()
        block_converter(pdf_path)
        times.append(time.perf_counter() - start)

    print(f"Converted {page_count} pages in {sum(times) / len(times):.2f} seconds.")
    if track_vram:
        # Convert bytes to GiB for display.
        max_gpu_vram = torch.cuda.max_memory_allocated() / 1024 ** 3
        print(f"Max GPU VRAM: {max_gpu_vram:.2f} GB")
    else:
        print("Max GPU VRAM: n/a (CUDA not available)")
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()