You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.2 KiB
Python
81 lines
2.2 KiB
Python
import json
|
|
import time
|
|
|
|
import requests
|
|
|
|
from benchmarks.overall.download.base import Downloader
|
|
|
|
|
|
class MathpixDownloader(Downloader):
    """Downloader that converts a PDF to Markdown through the Mathpix API."""

    # Service identifier for this downloader.
    service = "mathpix"

    def get_html(self, pdf_bytes):
        """Convert ``pdf_bytes`` with Mathpix and time the round trip.

        Despite the method name, the converted document is returned as
        Markdown under the ``"md"`` key.

        Args:
            pdf_bytes: PDF payload handed unchanged to ``mathpix_request``.

        Returns:
            A dict with ``"md"`` (the Markdown text, or ``""`` when the
            conversion did not complete) and ``"time"`` (elapsed wall-clock
            seconds covering upload, polling and download).
        """
        # ``app_id`` / ``api_key`` are read from the instance; presumably they
        # are populated by the ``Downloader`` base class -- confirm there.
        auth_headers = {"app_id": self.app_id, "app_key": self.api_key}

        started = time.time()
        pdf_id = mathpix_request(pdf_bytes, auth_headers)
        outcome = mathpix_status(pdf_id, auth_headers)
        if outcome not in ("processing", "error"):
            md = mathpix_results(pdf_id, auth_headers)
        else:
            # No usable conversion: report an empty document.
            md = ""
        finished = time.time()

        # The results helper returns the raw response body, so decode it.
        if isinstance(md, bytes):
            md = md.decode("utf-8")

        return {"md": md, "time": finished - started}
|
|
|
|
def mathpix_request(buffer, headers):
    """Upload a PDF to the Mathpix ``v3/pdf`` endpoint and return its id.

    Args:
        buffer: PDF payload, sent as the ``file`` part of the upload.
        headers: HTTP headers sent with the request (the caller in this file
            passes ``app_id`` / ``app_key`` entries).

    Returns:
        The ``pdf_id`` value taken from the JSON response body.

    Raises:
        KeyError: If the JSON response does not contain ``pdf_id``.
    """
    # Request both Markdown and HTML conversions of the uploaded document.
    conversion_options = {"conversion_formats": {"md": True, "html": True}}
    form_fields = {"options_json": json.dumps(conversion_options)}
    upload = {"file": buffer}

    reply = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        data=form_fields,
        files=upload,
    )
    return reply.json()["pdf_id"]
|
|
|
|
def mathpix_status(pdf_id, headers, *, max_iters=120, poll_interval=1):
    """Poll Mathpix until the md and html conversions of ``pdf_id`` settle.

    The converter endpoint is queried up to ``max_iters`` times, sleeping
    ``poll_interval`` seconds before each request. Polling stops early as
    soon as both conversions report ``"completed"`` or either one reports
    ``"error"``.

    Args:
        pdf_id: Identifier returned by the upload request.
        headers: HTTP headers sent with every status request.
        max_iters: Maximum number of status requests to make.
        poll_interval: Seconds to sleep before each status request.

    Returns:
        ``"completed"`` only when both the md and html conversions finished;
        ``"error"`` otherwise, including when the polling budget runs out
        while a conversion is still in progress.
    """
    md_status = "processing"
    html_status = "processing"

    # A bounded ``for`` loop guarantees termination. The previous
    # ``while i < max_iters`` never incremented ``i``, so a conversion stuck
    # in "processing" (or a response without "conversion_status") made this
    # function poll forever.
    for _ in range(max_iters):
        time.sleep(poll_interval)
        response = requests.get(
            f"https://api.mathpix.com/v3/converter/{pdf_id}",
            headers=headers,
        )
        status_resp = response.json()

        # Responses without a "conversion_status" entry carry no status yet;
        # the attempt still counts against the polling budget.
        if "conversion_status" not in status_resp:
            continue

        conversion = status_resp["conversion_status"]
        md_status = conversion["md"]["status"]
        html_status = conversion["html"]["status"]

        if md_status == "completed" and html_status == "completed":
            break
        if md_status == "error" or html_status == "error":
            break

    if md_status == "completed" and html_status == "completed":
        return "completed"
    return "error"
|
|
|
|
def mathpix_results(pdf_id, headers, ext="md"):
    """Download a converted document from the Mathpix converter endpoint.

    Args:
        pdf_id: Identifier of the uploaded PDF.
        headers: HTTP headers sent with the request.
        ext: Format extension appended to the converter URL (default ``"md"``).

    Returns:
        The raw body of the HTTP response (``response.content``).
    """
    result_url = f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}"
    reply = requests.get(result_url, headers=headers)
    return reply.content
|