from pathlib import Path
from typing import Dict, List

import tabulate

from benchmarks.overall.schema import FullResult


def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
    # Render the rows as a GitHub-flavored markdown table, write it under a title heading, and echo it to stdout.
    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
    with open(out_path / filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n")
        f.write(table)
    print(title)
    print(table)
def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
    # Table 1: average score per document type, with one column per (method, score type) pair.
    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
    headers = ["Document Type"]
    for method in methods:
        for score_type in score_types:
            headers.append(f"{method} {score_type}")

    document_rows = [[k] for k in document_types]
    for i, doc_type in enumerate(document_types):
        for method in methods:
            for score_type in score_types:
                doc_scores = result["averages_by_type"][method][score_type][doc_type]
                avg_score = sum(doc_scores) / max(1, len(doc_scores))
                document_rows[i].append(avg_score)

    write_table("Document Types", document_rows, headers, out_path, "document_types.md")
headers = ["Block Type"]
block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
block_score_types = list(result["averages_by_block_type"][default_method].keys())
for method in methods:
for score_type in block_score_types:
headers.append(f"{method} {score_type}")
block_rows = [[k] for k in block_types]
for i, block_type in enumerate(block_types):
for method in methods:
for score_type in block_score_types:
avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
block_rows[i].append(avg_score)
write_table("Block types", block_rows, headers, out_path, "block_types.md")
headers = ["Method", "Avg Time"] + score_types
inference_rows = [[k] for k in methods]
all_raw_scores = [result["scores"][i] for i in result["scores"]]
for i, method in enumerate(methods):
avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
inference_rows[i].append(avg_time)
for score_type in score_types:
scores_lst = []
for ar in all_raw_scores:
try:
# Sometimes a few llm scores are missing
scores_lst.append(ar[method][score_type]["score"])
except KeyError:
continue
avg_score = sum(scores_lst) / max(1, len(scores_lst))
inference_rows[i].append(avg_score)
write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")