You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

42 lines
1.2 KiB
Python

1 month ago
import os
import tempfile
import time
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
class MarkerMethod(BaseMethod):
    """Benchmark method that converts a sample PDF to markdown with marker.

    Calling an instance builds a ``PdfConverter`` from a fixed configuration,
    runs it on the sample's PDF bytes, and reports the resulting markdown
    together with how long the conversion call took.
    """

    # Passed to PdfConverter as ``artifact_dict``.
    model_dict: dict = None
    # Forwarded to the config as both "use_llm" and "redo_inline_math".
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert ``sample["pdf"]`` and time the conversion.

        Args:
            sample: Mapping with a ``"pdf"`` entry holding the PDF bytes.

        Returns:
            A dict with ``"markdown"`` (the rendered markdown) and ``"time"``
            (seconds spent inside the converter call only; configuration,
            converter construction and writing the temp file are excluded).
        """
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        parser = ConfigParser({
            "page_range": "0",
            "disable_tqdm": True,
            "use_llm": self.use_llm,
            "redo_inline_math": self.use_llm,
            "llm_service": "marker.services.vertex.GoogleVertexService",
            "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
        })

        block_converter = PdfConverter(
            artifact_dict=self.model_dict,
            config=parser.generate_config_dict(),
            llm_service=parser.get_llm_service(),
        )

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # The converter reopens the file by name, so the buffered bytes
            # must reach the file first; otherwise it may read an empty or
            # truncated PDF.
            f.flush()

            # perf_counter is monotonic, so the measured interval cannot be
            # skewed by system clock adjustments during the conversion.
            start = time.perf_counter()
            rendered = block_converter(f.name)
            total = time.perf_counter() - start

        return {
            "markdown": rendered.markdown,
            "time": total,
        }