You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
63 lines
1.8 KiB
Python
63 lines
1.8 KiB
Python
import io
|
|
import time
|
|
|
|
import requests
|
|
|
|
from benchmarks.overall.download.base import Downloader
|
|
|
|
|
|
class LlamaParseDownloader(Downloader):
    """Downloader variant that obtains markdown for a PDF via LlamaParse."""

    # Identifier for this backend.
    # NOTE(review): presumably consumed by the Downloader base class - confirm.
    service = "llamaparse"

    def get_html(self, pdf_bytes):
        """Send a PDF to LlamaParse and return its markdown plus elapsed time.

        Despite the method name, the payload produced here is markdown.

        Args:
            pdf_bytes: Raw bytes of the PDF document to parse.

        Returns:
            A dict with two keys: ``"md"`` holding the markdown text and
            ``"time"`` holding the wall-clock seconds spent on the upload
            and on polling for the result.
        """
        # A timestamp-derived file name keeps successive uploads distinct.
        upload_name = f"{time.time()}.pdf"

        began = time.time()
        markdown = upload_and_parse_file(
            self.api_key,  # NOTE(review): not set in this class - expected from the base class.
            upload_name,
            io.BytesIO(pdf_bytes),
        )
        elapsed = time.time() - began

        # Normalise to text in case the parser hands back raw bytes.
        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {"md": markdown, "time": elapsed}
|
|
|
|
|
|
def upload_and_parse_file(
    api_key: str,
    fname: str,
    buff,
    max_retries: int = 180,
    delay: int = 1,
    *,
    request_timeout: float = 60,
):
    """Upload a PDF to the LlamaParse REST API and return the parsed markdown.

    The file is uploaded as a parsing job, the job status is polled until it
    reports ``SUCCESS``, and the markdown result is then fetched.

    Args:
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        fname: File name reported to the API for the uploaded document.
        buff: File-like object (or bytes) holding the PDF content; passed
            straight to ``requests`` as the multipart ``file`` field.
        max_retries: Maximum number of status polls before giving up.
        delay: Seconds to sleep between status polls.
        request_timeout: Seconds to wait for the server on each individual
            HTTP request. Without it a stalled connection would block
            forever and the polling limit above would never be reached.

    Returns:
        The value of the ``markdown`` field of the job's result payload.

    Raises:
        requests.HTTPError: If any request returns an error status code.
        requests.Timeout: If a request gets no response within
            ``request_timeout`` seconds.
        KeyError: If a response body lacks the expected ``id``, ``status``
            or ``markdown`` field.
        TimeoutError: If the job has not reported ``SUCCESS`` after
            ``max_retries`` polls.
    """
    base_url = 'https://api.cloud.llamaindex.ai/api/v1/parsing'
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json"
    }

    # Upload file
    files = {
        'file': (fname, buff, 'application/pdf')
    }
    response = requests.post(
        f'{base_url}/upload',
        headers=headers,
        files=files,
        timeout=request_timeout,
    )
    response.raise_for_status()
    job_id = response.json()['id']

    # Poll for completion
    # NOTE(review): every status other than 'SUCCESS' is treated as "still
    # running", so a job that failed server-side is only surfaced by the
    # TimeoutError below once all polls are used up.
    for _ in range(max_retries):
        status_response = requests.get(
            f'{base_url}/job/{job_id}',
            headers=headers,
            timeout=request_timeout,
        )
        status_response.raise_for_status()
        if status_response.json()['status'] == 'SUCCESS':
            # Get results
            result_response = requests.get(
                f'{base_url}/job/{job_id}/result/markdown',
                headers=headers,
                timeout=request_timeout,
            )
            result_response.raise_for_status()
            return result_response.json()['markdown']

        time.sleep(delay)

    raise TimeoutError("Job did not complete within the maximum retry attempts")