# pdf2markdown/third_party/marker/benchmarks/throughput/main.py

import time
import torch

import click
import pypdfium2 as pdfium
from tqdm import tqdm

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict


@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.argument("pdf_path", type=str)
def main(pdf_path):
    """Convert one PDF ten times and report mean run time and peak GPU memory.

    Args:
        pdf_path: Path of the PDF to convert, supplied on the command line.

    Prints the document's page count together with the mean wall-clock
    duration of a conversion run, then the peak CUDA memory allocated
    (reported as 0.00 GB when no CUDA device is available).
    """
    print(f"Converting {pdf_path} to markdown...")

    # The document is opened only to read its page count; always release it,
    # even if reading the length fails.
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page_count = len(pdf)
    finally:
        pdf.close()

    model_dict = create_model_dict()

    # The CUDA memory statistics calls need a usable CUDA device; skip them on
    # CPU-only hosts so the timing part of the benchmark still runs there.
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        # Peak tracking starts here, i.e. after the models have been created.
        torch.cuda.reset_peak_memory_stats()

    times = []
    for _ in tqdm(range(10), desc="Benchmarking"):
        # A fresh converter is built per run, outside the timed region; the
        # model dict is shared across runs.
        block_converter = PdfConverter(
            artifact_dict=model_dict,
            config={"disable_tqdm": True},
        )
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted mid-run.
        start = time.perf_counter()
        block_converter(pdf_path)
        times.append(time.perf_counter() - start)

    # Bytes -> GiB.
    max_gpu_vram = (
        torch.cuda.max_memory_allocated() / 1024 ** 3 if has_cuda else 0.0
    )

    # The reported duration is the mean over all timed runs.
    print(f"Converted {page_count} pages in {sum(times)/len(times):.2f} seconds.")
    print(f"Max GPU VRAM: {max_gpu_vram:.2f} GB")

# Run the click command only when this file is executed as a script, not when
# it is imported; click reads `pdf_path` from the command-line arguments.
if __name__ == "__main__":
    main()
commit 1 month ago			`import time`
			`import torch`

			`import click`
			`import pypdfium2 as pdfium`
			`from tqdm import tqdm`

			`from marker.converters.pdf import PdfConverter`
			`from marker.models import create_model_dict`


			`@click.command(help="Benchmark PDF to MD conversion throughput.")`
			`@click.argument("pdf_path", type=str)`
			`def main(pdf_path):`
			`print(f"Converting {pdf_path} to markdown...")`
			`pdf = pdfium.PdfDocument(pdf_path)`
			`page_count = len(pdf)`
			`pdf.close()`
			`model_dict = create_model_dict()`
			`torch.cuda.reset_peak_memory_stats()`

			`times = []`
			`for i in tqdm(range(10), desc="Benchmarking"):`
			`block_converter = PdfConverter(`
			`artifact_dict=model_dict,`
			`config={"disable_tqdm": True}`
			`)`
			`start = time.time()`
			`block_converter(pdf_path)`
			`total = time.time() - start`
			`times.append(total)`

			`max_gpu_vram = torch.cuda.max_memory_allocated() / 1024 ** 3`

			`print(f"Converted {page_count} pages in {sum(times)/len(times):.2f} seconds.")`
			`print(f"Max GPU VRAM: {max_gpu_vram:.2f} GB")`


			`if __name__ == "__main__":`
			`main()`