import io
import json
import os.path
import re
import time
from pathlib import Path
from typing import List

import click
import datasets
from bs4 import BeautifulSoup
from rapidfuzz.distance import Levenshtein
from tabulate import tabulate

from surya.settings import settings
from surya.texify import TexifyPredictor, TexifyResult


def normalize_text(text):
    # Strip any HTML/MathML markup, then collapse all whitespace to single spaces.
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
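
# Illustrative example (input value is hypothetical): markup is stripped and
# whitespace collapsed, so normalize_text("<p>x^{2} +\n  1</p>") -> "x^{2} + 1".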


def score_text(predictions, references):
    # Mean normalized Levenshtein distance between each prediction/reference
    # pair, computed after markup and whitespace normalization.
    lev_dist = []
    for p, r in zip(predictions, references):
        p = normalize_text(p)
        r = normalize_text(r)
        lev_dist.append(Levenshtein.normalized_distance(p, r))
    return sum(lev_dist) / len(lev_dist)
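
# rapidfuzz normalizes the distance by the longer string, so scores fall in
# [0, 1] and lower is better. For example,
#   Levenshtein.normalized_distance("x^2", "x^3") == 1/3
# (one substitution over a maximum length of 3).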


def inference_texify(source_data, predictor):
    # Batch-predict markup for every image, keeping the reference equation
    # alongside each prediction so the two can be scored later.
    texify_predictions: List[TexifyResult] = predictor([sd["image"] for sd in source_data])
    out_data = [
        {"text": texify_predictions[i].text, "equation": source_data[i]["equation"]}
        for i in range(len(texify_predictions))
    ]
    return out_data


def image_to_bmp(image):
    # Serialize a PIL image into an in-memory BMP buffer, rewound so callers
    # can read it from the start.
    img_out = io.BytesIO()
    image.save(img_out, format="BMP")
    img_out.seek(0)
    return img_out
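
# Hypothetical usage of image_to_bmp (it is not called in this benchmark):
#   buf = image_to_bmp(some_pil_image)
#   bmp_bytes = buf.read()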


@click.command(help="Benchmark the performance of texify.")
@click.option("--ds_name", type=str, help="Dataset (name or path) with source images/equations.", default=settings.TEXIFY_BENCHMARK_DATASET)
@click.option("--results_dir", type=str, help="Directory to save benchmark results to.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
@click.option("--max_rows", type=int, help="Maximum number of images to benchmark.", default=None)
def main(ds_name: str, results_dir: str, max_rows: int | None):
    predictor = TexifyPredictor()
    ds = datasets.load_dataset(ds_name, split="train")
    if max_rows:
        # Keep only the first max_rows examples.
        ds = ds.filter(lambda x, idx: idx < max_rows, with_indices=True)

    # Time end-to-end inference over the whole dataset.
    start = time.time()
    predictions = inference_texify(ds, predictor)
    time_taken = time.time() - start

    # Score predictions against the reference equations.
    text = [p["text"] for p in predictions]
    references = [p["equation"] for p in predictions]
    scores = score_text(text, references)

    write_data = {
        "scores": scores,
        "text": [{"prediction": p, "reference": r} for p, r in zip(text, references)],
    }

    # Print a summary table; lower edit distance is better.
    score_table = [["texify", write_data["scores"], time_taken]]
    score_headers = ["edit distance", "time taken (s)"]
    table = tabulate(score_table, headers=["Method", *score_headers])
    print()
    print(table)

    # Save per-example predictions and the aggregate score as JSON.
    result_path = Path(results_dir) / "texify_bench"
    result_path.mkdir(parents=True, exist_ok=True)
    with open(result_path / "results.json", "w", encoding="utf-8") as f:
        json.dump(write_data, f, indent=4)


if __name__ == "__main__":
    main()
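
# Example invocation (assuming this script is saved as benchmark/texify.py;
# paths and option values here are illustrative):
#   python benchmark/texify.py --max_rows 16 --results_dir results/benchmark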