pdf2markdown/third_party/marker/benchmarks/overall/download/mathpix.py

import json
import time

import requests

from benchmarks.overall.download.base import Downloader


class MathpixDownloader(Downloader):
    """Benchmark downloader that converts PDFs through the Mathpix API."""

    service = "mathpix"

    def get_html(self, pdf_bytes):
        """Convert a PDF with Mathpix and report the Markdown plus elapsed time.

        Despite the method name, the returned payload carries Markdown under
        the ``"md"`` key.

        Args:
            pdf_bytes: PDF payload forwarded unchanged to ``mathpix_request``.

        Returns:
            A dict with ``"md"`` (the converted Markdown as ``str``, or ``""``
            when the conversion did not complete) and ``"time"`` (wall-clock
            seconds spanning upload, polling, and result download).
        """
        # NOTE(review): app_id / api_key are presumably set by the Downloader
        # base class -- confirm against its constructor.
        auth_headers = {"app_id": self.app_id, "app_key": self.api_key}

        began = time.time()
        job_id = mathpix_request(pdf_bytes, auth_headers)
        outcome = mathpix_status(job_id, auth_headers)
        # Only fetch results for a finished job; anything else yields empty output.
        markdown = (
            ""
            if outcome in ("processing", "error")
            else mathpix_results(job_id, auth_headers)
        )
        finished = time.time()

        # The results endpoint hands back raw bytes; normalise to text.
        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {"md": markdown, "time": finished - began}

def mathpix_request(buffer, headers):
    """Upload a PDF to the Mathpix ``v3/pdf`` endpoint and return its job id.

    Args:
        buffer: PDF content, sent as the multipart ``file`` field.
        headers: HTTP headers sent with the request (the caller supplies the
            Mathpix credentials here).

    Returns:
        The ``pdf_id`` value from the JSON response.

    Raises:
        KeyError: If the JSON response has no ``pdf_id`` entry.
    """
    # Request both Markdown and HTML conversions for the uploaded document.
    conversion_options = {"conversion_formats": {"md": True, "html": True}}
    upload = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        data={"options_json": json.dumps(conversion_options)},
        files={"file": buffer},
    )
    return upload.json()["pdf_id"]

def mathpix_status(pdf_id, headers, max_iters=120, poll_interval=1):
    """Poll the Mathpix converter endpoint until a job finishes or polling gives up.

    The endpoint is queried at most ``max_iters`` times, sleeping
    ``poll_interval`` seconds before each query. Polling stops early as soon
    as both the Markdown and HTML conversions report ``"completed"`` or either
    one reports ``"error"``.

    Args:
        pdf_id: Job identifier, as returned by ``mathpix_request``.
        headers: HTTP headers sent with every status request.
        max_iters: Maximum number of status requests before giving up.
        poll_interval: Seconds to sleep before each status request.

    Returns:
        ``"completed"`` when both the ``md`` and ``html`` conversions
        completed; ``"error"`` otherwise, including when the polling budget
        runs out while the job is still processing.
    """
    md_status = "processing"
    html_status = "processing"

    # A bounded loop guarantees termination: every poll consumes one unit of
    # the budget, including polls whose response lacks "conversion_status",
    # so a stuck job cannot keep this function spinning forever.
    for _ in range(max_iters):
        time.sleep(poll_interval)
        response = requests.get(
            f"https://api.mathpix.com/v3/converter/{pdf_id}",
            headers=headers,
        )
        status_resp = response.json()
        if "conversion_status" not in status_resp:
            # No conversion status reported in this response; poll again.
            continue

        conversion = status_resp["conversion_status"]
        md_status = conversion["md"]["status"]
        html_status = conversion["html"]["status"]

        if md_status == "completed" and html_status == "completed":
            return "completed"
        if md_status == "error" or html_status == "error":
            break

    # Either a conversion failed or the polling budget was exhausted.
    return "error"

def mathpix_results(pdf_id, headers, ext="md"):
    """Download the converted output of a Mathpix job in the given format.

    Args:
        pdf_id: Job identifier, as returned by ``mathpix_request``.
        headers: HTTP headers sent with the request.
        ext: Extension appended to the converter URL to select the output
            format (defaults to ``"md"``).

    Returns:
        The raw response body as ``bytes``.
    """
    result_url = f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}"
    return requests.get(result_url, headers=headers).content