You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

63 lines
1.8 KiB
Python

import io
import time
import requests
from benchmarks.overall.download.base import Downloader
class LlamaParseDownloader(Downloader):
    """Downloader that converts PDFs to markdown via the LlamaParse service."""

    service = "llamaparse"

    def get_html(self, pdf_bytes):
        """Send one PDF to LlamaParse and time the round trip.

        Despite the method name, the result carries markdown, not HTML.

        Args:
            pdf_bytes: Raw bytes of the PDF to parse.

        Returns:
            A dict with ``"md"`` (the markdown text, decoded as UTF-8 when the
            helper returned bytes) and ``"time"`` (wall-clock seconds spent on
            the upload-and-parse call).
        """
        # The current timestamp doubles as a throwaway file name for the upload.
        upload_name = f"{time.time()}.pdf"

        started = time.time()
        pdf_stream = io.BytesIO(pdf_bytes)
        # NOTE(review): self.api_key is not set in this class — presumably
        # provided by the Downloader base class; confirm.
        markdown = upload_and_parse_file(self.api_key, upload_name, pdf_stream)
        elapsed = time.time() - started

        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {"md": markdown, "time": elapsed}
def upload_and_parse_file(
    api_key: str,
    fname: str,
    buff,
    max_retries: int = 180,
    delay: int = 1,
    *,
    request_timeout: float = 60.0,
):
    """Upload a PDF to LlamaParse and poll until its markdown result is ready.

    Args:
        api_key: Token sent as ``Bearer`` in the ``Authorization`` header.
        fname: File name reported to the service in the multipart upload.
        buff: File-like object holding the PDF content.
        max_retries: Maximum number of status polls before giving up.
        delay: Seconds to sleep after each poll that did not see ``SUCCESS``.
        request_timeout: Seconds any single HTTP request may wait on the
            server. Without it a stalled connection would block forever and
            the ``max_retries`` bound would never be reached.

    Returns:
        The value of the ``markdown`` field of the result response.

    Raises:
        requests.HTTPError: If any request returns an error status code.
        requests.Timeout: If a single request exceeds ``request_timeout``.
        TimeoutError: If the job has not reported ``SUCCESS`` after
            ``max_retries`` polls.
    """
    base_url = "https://api.cloud.llamaindex.ai/api/v1/parsing"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
    }

    # Upload file
    files = {
        "file": (fname, buff, "application/pdf"),
    }
    response = requests.post(
        f"{base_url}/upload",
        headers=headers,
        files=files,
        timeout=request_timeout,
    )
    response.raise_for_status()
    job_id = response.json()["id"]

    # Poll for completion
    # NOTE(review): only 'SUCCESS' ends the loop; any other status is treated
    # as still pending, so a job that fails server-side is reported only by
    # the TimeoutError below — confirm whether failing fast is wanted.
    for _ in range(max_retries):
        status_response = requests.get(
            f"{base_url}/job/{job_id}",
            headers=headers,
            timeout=request_timeout,
        )
        status_response.raise_for_status()

        if status_response.json()["status"] == "SUCCESS":
            # Get results
            result_response = requests.get(
                f"{base_url}/job/{job_id}/result/markdown",
                headers=headers,
                timeout=request_timeout,
            )
            result_response.raise_for_status()
            return result_response.json()["markdown"]

        time.sleep(delay)

    raise TimeoutError("Job did not complete within the maximum retry attempts")