from pathlib import Path
from typing import Dict, List

import tabulate

from benchmarks.overall.schema import FullResult


def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
    # Render the rows as a GitHub-flavored markdown table, write it under a title heading, and echo it to stdout.
    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
    with open(out_path / filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n")
        f.write(table)
    print(title)
    print(table)
def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
    # Table 1: average score per document type, with one column per (method, score type) pair.
    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
    headers = ["Document Type"]
    for method in methods:
        for score_type in score_types:
            headers.append(f"{method} {score_type}")

    document_rows = [[k] for k in document_types]
    for i, doc_type in enumerate(document_types):
        for method in methods:
            for score_type in score_types:
                doc_scores = result["averages_by_type"][method][score_type][doc_type]
                avg_score = sum(doc_scores) / max(1, len(doc_scores))
                document_rows[i].append(avg_score)

    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
headers = ["Block Type"]
block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
block_score_types = list(result["averages_by_block_type"][default_method].keys())
for method in methods:
for score_type in block_score_types:
headers.append(f"{method} {score_type}")
block_rows = [[k] for k in block_types]
for i, block_type in enumerate(block_types):
for method in methods:
for score_type in block_score_types:
avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
block_rows[i].append(avg_score)
write_table("Block types", block_rows, headers, out_path, "block_types.md")
headers = ["Method", "Avg Time"] + score_types
inference_rows = [[k] for k in methods]
all_raw_scores = [result["scores"][i] for i in result["scores"]]
for i, method in enumerate(methods):
avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
inference_rows[i].append(avg_time)
for score_type in score_types:
scores_lst = []
for ar in all_raw_scores:
try:
# Sometimes a few llm scores are missing
scores_lst.append(ar[method][score_type]["score"])
except KeyError:
continue
avg_score = sum(scores_lst) / max(1, len(scores_lst))
inference_rows[i].append(avg_score)
write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")