You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
73 lines
1.8 KiB
Python
73 lines
1.8 KiB
Python
import io
|
|
import time
|
|
import requests
|
|
|
|
from benchmarks.overall.download.base import Downloader
|
|
|
|
|
|
class MistralDownloader(Downloader):
    """Benchmark downloader that converts a PDF to markdown via Mistral OCR."""

    service = "mistral"

    def get_html(self, pdf_bytes):
        """Send one PDF through ``upload_and_process_file`` and time the call.

        Despite the method name, the payload returned is markdown.

        Args:
            pdf_bytes: Raw bytes of the PDF document to process.

        Returns:
            A dict with two keys: ``"md"``, the markdown text (decoded as
            UTF-8 when the helper hands back bytes), and ``"time"``, the
            elapsed seconds measured around the helper call.
        """
        # The upload needs a file name; the current timestamp is used for it.
        upload_name = f"{time.time()}.pdf"

        # NOTE(review): self.api_key is not set in this class -- presumably
        # provided by the Downloader base class; confirm.
        started = time.time()
        markdown = upload_and_process_file(
            self.api_key, upload_name, io.BytesIO(pdf_bytes)
        )
        elapsed = time.time() - started

        text = markdown.decode("utf-8") if isinstance(markdown, bytes) else markdown
        return {"md": text, "time": elapsed}
|
|
|
|
|
|
def upload_and_process_file(api_key: str, fname: str, buff, *, timeout: float = 300.0):
    """Upload a PDF to Mistral, run OCR on it, and return the markdown.

    Performs three sequential HTTP calls against ``api.mistral.ai``:

    1. ``POST /v1/files`` uploads the document with purpose ``"ocr"``.
    2. ``GET /v1/files/{id}/url`` obtains a signed URL for the uploaded file.
    3. ``POST /v1/ocr`` runs the ``mistral-ocr-latest`` model on that URL.

    Args:
        api_key: Mistral API key, sent as a bearer token on every request.
        fname: File name to attach to the multipart upload.
        buff: Binary file-like object containing the PDF data.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` would block indefinitely on a
            stalled connection.

    Returns:
        The markdown of every page in the OCR response, in response order,
        joined by blank lines. A single-page document yields exactly that
        page's markdown; a response with no pages yields an empty string.

    Raises:
        requests.HTTPError: If any of the three calls returns an error status.
        requests.Timeout: If a call does not complete within ``timeout``.
        KeyError: If a response body lacks an expected field.
    """
    auth_headers = {"Authorization": f"Bearer {api_key}"}

    # Step 1: multipart upload; "purpose" is sent as a plain form field.
    upload_response = requests.post(
        'https://api.mistral.ai/v1/files',
        headers=auth_headers,
        files={
            'file': (fname, buff, 'application/pdf'),
            'purpose': (None, 'ocr'),
        },
        timeout=timeout,
    )
    upload_response.raise_for_status()
    file_id = upload_response.json()['id']

    # Step 2: exchange the file id for a signed URL the OCR endpoint can read.
    url_response = requests.get(
        f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
        headers={**auth_headers, "Accept": "application/json"},
        timeout=timeout,
    )
    url_response.raise_for_status()
    signed_url = url_response.json()['url']

    # Step 3: run OCR on the uploaded document.
    ocr_response = requests.post(
        'https://api.mistral.ai/v1/ocr',
        headers={**auth_headers, "Content-Type": "application/json"},
        json={
            "model": "mistral-ocr-latest",
            "document": {
                "type": "document_url",
                "document_url": signed_url,
            },
            "include_image_base64": True,
        },
        timeout=timeout,
    )
    ocr_response.raise_for_status()

    # Collect every page, not just the first, so multi-page PDFs are not
    # silently truncated.
    pages = ocr_response.json()["pages"]
    return "\n\n".join(page["markdown"] for page in pages)