# pdf2markdown/marker/benchmarks/overall/download/mistral.py

import io
import time
import requests

from benchmarks.overall.download.base import Downloader


class MistralDownloader(Downloader):
    """Downloader variant that sends PDFs through the Mistral OCR service."""

    service = "mistral"

    def get_html(self, pdf_bytes):
        """Run OCR on a PDF and report the result together with the elapsed time.

        Despite the method name, the payload returned here is markdown.

        Args:
            pdf_bytes: Raw bytes of the PDF to process.

        Returns:
            A dict with two keys: ``"md"`` (the markdown as ``str``) and
            ``"time"`` (wall-clock seconds spent in the upload/OCR call).
        """
        # The upload needs a file name; the current timestamp is used for it.
        upload_name = f"{time.time()}.pdf"

        started_at = time.time()
        markdown = upload_and_process_file(
            self.api_key,
            upload_name,
            io.BytesIO(pdf_bytes),
        )
        elapsed = time.time() - started_at

        # Normalise to text in case the helper hands back raw bytes.
        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {"md": markdown, "time": elapsed}


def upload_and_process_file(api_key: str, fname: str, buff, *, timeout: float = 300.0):
    """Upload a PDF to Mistral, run OCR on it, and return the markdown.

    The work is three HTTP calls against ``api.mistral.ai``: upload the file
    with purpose ``ocr``, request a signed URL for the uploaded file, then
    submit that URL to the OCR endpoint.

    Args:
        api_key: Token sent as ``Bearer`` in the ``Authorization`` header.
        fname: File name reported in the multipart upload.
        buff: Binary file-like object holding the PDF content.
        timeout: Timeout in seconds applied to each individual request.
            Pass ``None`` to wait without limit.

    Returns:
        The markdown of every page in the OCR response, in response order,
        joined by a blank line. For a single-page response this is exactly
        that page's markdown; for a response with no pages it is ``""``.

    Raises:
        requests.HTTPError: If any of the three calls returns an error status.
        requests.Timeout: If a call exceeds ``timeout``.
        KeyError: If a response lacks an expected field.
    """
    auth_headers = {"Authorization": f"Bearer {api_key}"}

    # A single session lets the three calls to the same host share one
    # connection, and the ``with`` block releases it when done.
    with requests.Session() as session:
        upload_response = session.post(
            'https://api.mistral.ai/v1/files',
            headers=auth_headers,
            files={
                'file': (fname, buff, 'application/pdf'),
                'purpose': (None, 'ocr'),
            },
            timeout=timeout,
        )
        upload_response.raise_for_status()
        file_id = upload_response.json()['id']

        url_response = session.get(
            f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
            headers={**auth_headers, "Accept": "application/json"},
            timeout=timeout,
        )
        url_response.raise_for_status()
        signed_url = url_response.json()['url']

        ocr_response = session.post(
            'https://api.mistral.ai/v1/ocr',
            headers={**auth_headers, "Content-Type": "application/json"},
            json={
                "model": "mistral-ocr-latest",
                "document": {
                    "type": "document_url",
                    "document_url": signed_url,
                },
                "include_image_base64": True,
            },
            timeout=timeout,
        )
        ocr_response.raise_for_status()
        result = ocr_response.json()

    # Keep every page rather than only the first, so multi-page PDFs are not
    # silently truncated.
    return "\n\n".join(page["markdown"] for page in result["pages"])