from pathlib import Path
from typing import Dict, List

import tabulate

from benchmarks.overall.schema import FullResult


def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
    """Render rows as a GitHub-flavored markdown table, write it to out_path / filename, and echo it to stdout."""
    table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
    with open(out_path / filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n")
        f.write(table)
    print(title)
    print(table)


def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
    """Summarize benchmark results into three markdown tables: scores by document type, by block type, and overall per-method averages."""
    document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
    headers = ["Document Type"]
    for method in methods:
        for score_type in score_types:
            headers.append(f"{method} {score_type}")

    document_rows = [[k] for k in document_types]
    for i, doc_type in enumerate(document_types):
        for method in methods:
            for score_type in score_types:
                avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
                document_rows[i].append(avg_score)

    write_table("Document Types", document_rows, headers, out_path, "document_types.md")

headers = ["Block Type"]
|
|
block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
|
|
block_score_types = list(result["averages_by_block_type"][default_method].keys())
|
|
for method in methods:
|
|
for score_type in block_score_types:
|
|
headers.append(f"{method} {score_type}")
|
|
|
|
block_rows = [[k] for k in block_types]
|
|
for i, block_type in enumerate(block_types):
|
|
for method in methods:
|
|
for score_type in block_score_types:
|
|
avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
|
|
block_rows[i].append(avg_score)
|
|
|
|
write_table("Block types", block_rows, headers, out_path, "block_types.md")
|
|
|
|
headers = ["Method", "Avg Time"] + score_types
|
|
inference_rows = [[k] for k in methods]
|
|
all_raw_scores = [result["scores"][i] for i in result["scores"]]
|
|
for i, method in enumerate(methods):
|
|
avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
|
|
inference_rows[i].append(avg_time)
|
|
for score_type in score_types:
|
|
scores_lst = []
|
|
for ar in all_raw_scores:
|
|
try:
|
|
# Sometimes a few llm scores are missing
|
|
scores_lst.append(ar[method][score_type]["score"])
|
|
except KeyError:
|
|
continue
|
|
avg_score = sum(scores_lst) / max(1, len(scores_lst))
|
|
inference_rows[i].append(avg_score)
|
|
|
|
write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
|
|
|
|
print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.") |