You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

92 lines
2.6 KiB
Python

1 month ago
import base64
import json
import tempfile
import time
from io import BytesIO
import torch
from PIL import Image
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
def convert_single_page(filename: str, model, processor, device) -> str:
    """Run olmOCR on the first page of a PDF and return the extracted text.

    Renders page 1 of ``filename`` to a base64 PNG, builds the olmOCR
    finetuning prompt from the PDF's anchor text, and samples a completion
    from ``model``.

    Args:
        filename: Path to a PDF file; only page 1 is processed.
        model: Loaded olmOCR causal LM exposing ``.generate``.
        processor: Matching HF processor (chat template + image preprocessing).
        device: Torch device to move the prepared inputs onto.

    Returns:
        The ``natural_text`` field of the model's JSON output; if the output
        is not valid JSON, a best-effort substring after "natural_text";
        otherwise "".
    """
    # Local imports keep olmocr an optional dependency of this module.
    from olmocr.data.renderpdf import render_pdf_to_base64png
    from olmocr.prompts import build_finetuning_prompt
    from olmocr.prompts.anchor import get_anchor_text

    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)

    # Build the prompt using document metadata (anchor text extracted from the PDF).
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Build the full chat-style prompt (text + page image).
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template and run the processor to get model tensors.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    # Inference only: no_grad avoids building an autograd graph, which would
    # otherwise waste significant memory over an up-to-8192-token generation.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=8192,
            num_return_sequences=1,
            do_sample=True,
        )

    # Decode only the newly generated tokens (drop the echoed prompt).
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(
        new_tokens, skip_special_tokens=True
    )[0]

    # The model is trained to emit JSON with a "natural_text" field; fall back
    # to a crude string split when the output is not well-formed JSON.
    try:
        text_output = json.loads(text_output)
        text = text_output["natural_text"]
    except Exception:
        try:
            text = text_output.split("natural_text")[1].strip()
        except Exception:
            text = ""

    return text
class OlmOCRMethod(BaseMethod):
    """Benchmark method that OCRs single-page PDFs with olmOCR.

    ``olmocr_model`` must be a dict holding a loaded "model" and its
    matching "processor" before ``__call__`` is invoked.
    """

    # Injected by the benchmark harness: {"model": ..., "processor": ...}
    olmocr_model: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """OCR the single-page PDF in ``sample["pdf"]`` and time the model.

        Returns:
            A dict with "markdown" (extracted text) and "time" (wall-clock
            seconds spent in conversion, excluding temp-file setup).
        """
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # Flush before handing the path to the renderer: it reopens the
            # file by name, and without a flush the buffered bytes may not
            # be on disk yet, yielding a truncated or empty PDF.
            f.flush()

            start = time.time()
            result = convert_single_page(
                f.name,
                self.olmocr_model["model"],
                self.olmocr_model["processor"],
                self.olmocr_model["model"].device,
            )
            total = time.time() - start

        return {
            "markdown": result,
            "time": total,
        }