You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
143 lines
5.1 KiB
Python
143 lines
5.1 KiB
Python
import json
|
|
import os
|
|
import tempfile
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from magic_pdf.config.exceptions import EmptyData, InvalidParams
|
|
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
|
|
MultiBucketS3DataReader)
|
|
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
|
|
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
|
|
|
|
def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Build one PymuDocDataset per entry of a jsonl manifest.

    Every non-blank line of the manifest is parsed as a JSON object. The pdf
    location is taken from its ``file_location`` key, falling back to the
    ``path`` key when ``file_location`` is missing or empty. Any location
    starting with ``s3://`` is read through ``s3_client``; everything else is
    read from the local filesystem.

    Args:
        s3_path_or_local (str): local file or s3 path of the jsonl manifest
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that
            supports multiple buckets. Defaults to None.

    Raises:
        InvalidParams: if the manifest or any referenced pdf is an s3 path but
            s3_client is not provided.
        EmptyData: if a line of the manifest provides no pdf file location.

    Returns:
        list[PymuDocDataset]: one dataset per manifest line, in file order
    """

    def _load(location: str):
        # The manifest and the pdfs it points at follow the same rule:
        # an 's3://' prefix selects the s3 client, anything else is local.
        if not location.startswith('s3://'):
            return FileBasedDataReader('').read(location)
        if s3_client is None:
            raise InvalidParams('s3_client is required when s3_path is provided')
        return s3_client.read(location)

    manifest_text = _load(s3_path_or_local).decode()

    # Parse the whole manifest first so malformed JSON fails before any pdf
    # is fetched.
    records = []
    for raw_line in manifest_text.split('\n'):
        if raw_line.strip():
            records.append(json.loads(raw_line))

    pdf_blobs = []
    for record in records:
        location = record.get('file_location', '') or record.get('path', '')
        if len(location) == 0:
            raise EmptyData('pdf file location is empty')
        pdf_blobs.append(_load(location))

    # Datasets are only constructed once every pdf has been read.
    return list(map(PymuDocDataset, pdf_blobs))
|
|
|
|
|
|
def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Read pdf from path or directory.

    Args:
        path (str): pdf file path or directory that contains pdf files. A
            directory is walked recursively and only files whose name ends
            with ``.pdf`` are loaded. A non-directory path is read as-is,
            without any extension check.

    Returns:
        list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
    """
    reader = FileBasedDataReader()

    if not os.path.isdir(path):
        return [PymuDocDataset(reader.read(path))]

    ret = []
    for root, _, files in os.walk(path):
        for file in files:
            # Require the dot: splitting on '.' and comparing the last piece
            # would also accept an extension-less file literally named 'pdf'.
            if file.endswith('.pdf'):
                ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
    return ret
|
|
|
|
def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.

    Each input file is converted to pdf inside a temporary directory, the
    resulting pdf is read back, and the temporary directory is removed before
    returning, including when a conversion or read fails.

    Args:
        path (str): ms-office file or directory that contains ms-office files.
            A directory is walked recursively and filtered by suffix; a
            non-directory path is converted as-is without a suffix check.

    Returns:
        list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset

    Raises:
        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
        FileNotFoundError: File not Found
        Exception: any other error raised during conversion or reading is
            propagated unchanged
    """
    suffixes = ['.ppt', '.pptx', '.doc', '.docx']

    fns = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
                if Path(file).suffix in suffixes:
                    fns.append(os.path.join(root, file))
    else:
        fns.append(path)

    reader = FileBasedDataReader()
    ret = []
    temp_dir = tempfile.mkdtemp()
    try:
        for fn in fns:
            # Errors are deliberately not caught here: callers receive the
            # original exception, and the finally clause still cleans up.
            convert_file_to_pdf(fn, temp_dir)
            pdf_fn = f"{temp_dir}/{Path(fn).stem}.pdf"
            # Read immediately after converting, so a later file with the same
            # stem can safely reuse the same output name.
            ret.append(PymuDocDataset(reader.read(pdf_fn)))
    finally:
        # Previously skipped on failure, leaking the temporary directory.
        shutil.rmtree(temp_dir)
    return ret
|
|
|
|
def read_local_images(path: str, suffixes: list[str] | None = None) -> list[ImageDataset]:
    """Read images from path or directory.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str] | None): the suffixes of the image files used to
            filter the files when ``path`` is a directory.
            Example: ['.jpg', '.png']. Defaults to None, which means
            ['.png', '.jpg', '.jpeg']. Not consulted when ``path`` is a
            single file.

    Returns:
        list[ImageDataset]: each image file will be converted to an ImageDataset
    """
    reader = FileBasedDataReader()

    if not os.path.isdir(path):
        return [ImageDataset(reader.read(path))]

    # A None sentinel replaces the former mutable list default; the effective
    # default suffixes are unchanged.
    if suffixes is None:
        s_suffixes = {'.png', '.jpg', '.jpeg'}
    else:
        s_suffixes = set(suffixes)

    imgs_bits = []
    for root, _, files in os.walk(path):
        for file in files:
            if Path(file).suffix in s_suffixes:
                imgs_bits.append(reader.read(os.path.join(root, file)))
    return [ImageDataset(bits) for bits in imgs_bits]
|