You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

81 lines
2.2 KiB
Python

1 month ago
import json
import time
import requests
from benchmarks.overall.download.base import Downloader
class MathpixDownloader(Downloader):
    """Benchmark downloader that converts a PDF through the Mathpix API."""

    service = "mathpix"

    def get_html(self, pdf_bytes):
        """Convert ``pdf_bytes`` with Mathpix and report how long it took.

        Uploads the PDF, waits for the conversion to finish, then fetches
        the result using ``mathpix_results`` with its default extension
        (``"md"``), so the payload is markdown despite the method's name.

        Returns:
            A dict with ``"md"`` (the converted text, or ``""`` when the
            conversion did not complete) and ``"time"`` (elapsed seconds
            from upload through result download).
        """
        # NOTE(review): app_id / api_key are presumably set by the Downloader
        # base class; that is not visible from this file -- confirm.
        auth_headers = {"app_id": self.app_id, "app_key": self.api_key}

        started = time.time()
        pdf_id = mathpix_request(pdf_bytes, auth_headers)
        outcome = mathpix_status(pdf_id, auth_headers)
        # Only download results when the conversion actually finished.
        markdown = (
            ""
            if outcome in ("processing", "error")
            else mathpix_results(pdf_id, auth_headers)
        )
        elapsed = time.time() - started

        # The results endpoint yields raw bytes; normalise to text.
        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {"md": markdown, "time": elapsed}
def mathpix_request(buffer, headers):
    """Upload a PDF to the Mathpix ``v3/pdf`` endpoint and return its id.

    Args:
        buffer: The PDF content, sent as the multipart ``file`` field.
        headers: HTTP headers passed through to the request (the caller in
            this file supplies ``app_id`` / ``app_key``).

    Returns:
        The ``pdf_id`` value from the JSON response.

    Raises:
        KeyError: If the JSON response has no ``pdf_id`` key.
    """
    # Ask for both markdown and HTML conversions of the uploaded document.
    conversion_options = {"conversion_formats": {"md": True, "html": True}}
    upload = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        data={"options_json": json.dumps(conversion_options)},
        files={"file": buffer},
    )
    return upload.json()["pdf_id"]
def mathpix_status(pdf_id, headers, max_iters=120, poll_interval=1):
    """Poll Mathpix until the md and html conversions finish or fail.

    Args:
        pdf_id: Identifier returned by the upload request.
        headers: HTTP headers passed through to each status request.
        max_iters: Maximum number of status requests to make before giving
            up. Defaults to 120.
        poll_interval: Seconds to sleep before each status request.
            Defaults to 1.

    Returns:
        ``"completed"`` only when both the ``md`` and ``html`` conversions
        report ``"completed"``; otherwise ``"error"``. This includes the
        case where the polling budget runs out while still processing.
    """
    status_url = f"https://api.mathpix.com/v3/converter/{pdf_id}"
    md_status = "processing"
    html_status = "processing"

    # A bounded for-loop guarantees termination: every attempt counts toward
    # max_iters, including responses that lack "conversion_status".
    for _ in range(max_iters):
        time.sleep(poll_interval)
        status_resp = requests.get(status_url, headers=headers).json()

        # No conversion info in the response: treat as "not ready", retry.
        if "conversion_status" not in status_resp:
            continue

        conversion = status_resp["conversion_status"]
        md_status = conversion["md"]["status"]
        html_status = conversion["html"]["status"]

        if md_status == "completed" and html_status == "completed":
            break
        if md_status == "error" or html_status == "error":
            break

    both_done = md_status == "completed" and html_status == "completed"
    return "completed" if both_done else "error"
def mathpix_results(pdf_id, headers, ext="md"):
    """Download a converted document from the Mathpix converter endpoint.

    Args:
        pdf_id: Identifier returned by the upload request.
        headers: HTTP headers passed through to the request.
        ext: File extension appended to the converter URL to select the
            output format. Defaults to ``"md"``.

    Returns:
        The raw response body as bytes.
    """
    result_url = f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}"
    return requests.get(result_url, headers=headers).content