# pdf2markdown/marker/benchmarks/overall/download/llamaparse.py

import io
import time

import requests

from benchmarks.overall.download.base import Downloader


class LlamaParseDownloader(Downloader):
    """Downloader that converts a PDF to markdown through the LlamaParse API."""

    service = "llamaparse"

    def get_html(self, pdf_bytes):
        """Upload one PDF to LlamaParse and return its markdown plus timing.

        Args:
            pdf_bytes: Raw bytes of the PDF document to convert.

        Returns:
            A dict with two keys: ``"md"`` holds the markdown returned by
            ``upload_and_parse_file`` (decoded as UTF-8 when it arrives as
            ``bytes``) and ``"time"`` holds the wall-clock seconds spent
            uploading and waiting for the result.
        """
        # The current timestamp doubles as a throwaway name for the upload.
        upload_name = f"{time.time()}.pdf"

        # Reads self.api_key, which is not set in this class -- presumably
        # provided by the Downloader base class.
        started_at = time.time()
        markdown = upload_and_parse_file(
            self.api_key, upload_name, io.BytesIO(pdf_bytes)
        )
        elapsed = time.time() - started_at

        # Decoding happens after the clock is stopped, so it is not timed.
        markdown = markdown.decode("utf-8") if isinstance(markdown, bytes) else markdown

        return {"md": markdown, "time": elapsed}


def upload_and_parse_file(
    api_key: str,
    fname: str,
    buff,
    max_retries: int = 180,
    delay: int = 1,
    request_timeout: float = 60,
):
    """Upload a PDF to LlamaParse, wait for the job, and return its markdown.

    The file is posted to the parsing ``upload`` endpoint, the resulting job
    is polled until its status is ``SUCCESS``, and the markdown result is
    then fetched and returned.

    Args:
        api_key: Bearer token sent in the ``Authorization`` header.
        fname: File name reported to the service for the uploaded document.
        buff: Binary file-like object holding the PDF contents.
        max_retries: Maximum number of status polls before giving up.
        delay: Seconds to sleep between consecutive status polls.
        request_timeout: Timeout in seconds applied to every HTTP request so
            a stalled connection cannot block the caller indefinitely.

    Returns:
        The value of the ``markdown`` field of the job's result payload.

    Raises:
        requests.HTTPError: If any request returns an error status code.
        requests.Timeout: If a single request exceeds ``request_timeout``.
        TimeoutError: If the job has not reached ``SUCCESS`` after
            ``max_retries`` polls.
    """
    base_url = 'https://api.cloud.llamaindex.ai/api/v1/parsing'
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json"
    }

    # One session reuses the underlying connection across the upload and
    # every poll, and the context manager guarantees it is closed.
    with requests.Session() as session:
        session.headers.update(headers)

        # Upload file
        files = {
            'file': (fname, buff, 'application/pdf')
        }
        response = session.post(
            f'{base_url}/upload',
            files=files,
            timeout=request_timeout,
        )
        response.raise_for_status()
        job_id = response.json()['id']

        # Poll for completion
        for attempt in range(max_retries):
            status_response = session.get(
                f'{base_url}/job/{job_id}',
                timeout=request_timeout,
            )
            status_response.raise_for_status()
            if status_response.json()['status'] == 'SUCCESS':
                # Get results
                result_response = session.get(
                    f'{base_url}/job/{job_id}/result/markdown',
                    timeout=request_timeout,
                )
                result_response.raise_for_status()
                return result_response.json()['markdown']

            # No point in sleeping after the final poll: we are about to
            # give up anyway.
            if attempt < max_retries - 1:
                time.sleep(delay)

    raise TimeoutError("Job did not complete within the maximum retry attempts")