You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

42 lines
1.2 KiB
Python

import os
import tempfile
import time
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
class MarkerMethod(BaseMethod):
    """Benchmark method that renders a sample's PDF to markdown via ``PdfConverter``.

    Calling an instance writes the sample's PDF bytes to a temporary file,
    runs the converter on that file, and reports the produced markdown
    together with the time the conversion call took.
    """

    # Passed to PdfConverter as ``artifact_dict``.
    model_dict: dict = None
    # Forwarded into the converter config as both "use_llm" and "redo_inline_math".
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert ``sample["pdf"]`` and time the conversion.

        Args:
            sample: Mapping with a ``"pdf"`` entry holding the PDF content
                as bytes (it is written to a file opened in ``"wb"`` mode).

        Returns:
            A dict with ``"markdown"`` (the ``markdown`` attribute of the
            converter's result) and ``"time"`` (seconds spent inside the
            converter call only; config and converter setup are excluded).
        """
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        parser = ConfigParser({
            "page_range": "0",
            "disable_tqdm": True,
            "use_llm": self.use_llm,
            "redo_inline_math": self.use_llm,
            "llm_service": "marker.services.vertex.GoogleVertexService",
            "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
        })

        block_converter = PdfConverter(
            artifact_dict=self.model_dict,
            config=parser.generate_config_dict(),
            llm_service=parser.get_llm_service(),
        )

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # The converter only receives the path, so the buffered bytes
            # must reach the file before it is read by name; without this
            # flush a small PDF can still be sitting in the write buffer.
            f.flush()

            # perf_counter is monotonic, so the measured interval cannot be
            # skewed by system clock adjustments the way time.time() can.
            start = time.perf_counter()
            rendered = block_converter(f.name)
            total = time.perf_counter() - start

        return {
            "markdown": rendered.markdown,
            "time": total,
        }