You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
167 lines
4.8 KiB
Python
167 lines
4.8 KiB
Python
import concurrent.futures
|
|
|
|
import fitz
|
|
|
|
from magic_pdf.data.dataset import PymuDocDataset
|
|
from magic_pdf.data.utils import fitz_doc_to_image # PyMuPDF
|
|
|
|
|
|
def partition_array_greedy(arr, k):
    """Split weighted items into at most k index groups of similar total weight.

    Items are visited from heaviest to lightest and each one is placed in the
    group whose accumulated weight is currently the smallest.

    Parameters:
    -----------
    arr : list
        Sequence of items whose element at position 1 is the numeric weight
        used for balancing (presumably ``(pdf_path, num_pages)`` tuples;
        verify against the caller).
    k : int
        Number of partitions to create. Values larger than ``len(arr)`` are
        reduced to ``len(arr)``.

    Returns:
    --------
    partitions : list of lists
        Each inner list holds indices into ``arr`` (not the items
        themselves). An empty ``arr`` yields an empty list.

    Raises:
    -------
    ValueError
        If ``k`` is not a positive integer.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    n = len(arr)
    k = min(k, n)  # never create more groups than there are items

    # Trivial layouts need no balancing. An empty ``arr`` reaches the second
    # branch (k == n == 0) and produces an empty list.
    if k == 1:
        return [list(range(n))]
    if k == n:
        return [[i] for i in range(n)]

    # Heaviest first. ``sorted`` stays stable with reverse=True, so items of
    # equal weight keep their input order.
    order = sorted(range(n), key=lambda i: arr[i][1], reverse=True)

    partitions = [[] for _ in range(k)]
    loads = [0] * k

    for idx in order:
        # ``min`` returns the first minimum, so ties go to the
        # lowest-numbered group.
        target = min(range(k), key=loads.__getitem__)
        partitions[target].append(idx)  # store the original index
        loads[target] += arr[idx][1]

    return partitions
|
|
|
|
|
|
def process_pdf_batch(pdf_jobs, idx):
    """Render every page of each PDF in a batch, one document at a time.

    Parameters:
    -----------
    pdf_jobs : list of tuples
        List of two-element tuples whose first element is the PDF path.
        The second element is ignored; every page of the file is processed.
    idx : any
        Identifier of this batch. It is returned unchanged so the caller can
        match the result to the batch it submitted.

    Returns:
    --------
    result : tuple
        ``(idx, images)`` where ``images[i]`` is the list of
        ``fitz_doc_to_image`` results for the pages of ``pdf_jobs[i]``,
        in page order.
    """
    images = []

    for pdf_path, _ in pdf_jobs:
        doc = fitz.open(pdf_path)
        try:
            # NOTE(review): assumes fitz_doc_to_image returns data that stays
            # valid after the document is closed; confirm.
            images.append(
                [fitz_doc_to_image(doc[page_num]) for page_num in range(len(doc))]
            )
        finally:
            # Release the file handle even if rendering a page fails.
            doc.close()

    return (idx, images)
|
|
|
|
|
|
def batch_build_dataset(pdf_paths, k, lang=None):
    """Build one PymuDocDataset per PDF file, preserving input order.

    Each file is read fully into memory and wrapped in a dataset. The files
    are handled one after another in the calling process.

    Parameters:
    -----------
    pdf_paths : list
        List of paths to PDF files
    k : int
        Not used by this implementation; kept in the signature so existing
        callers keep working.
    lang : optional
        Passed through to ``PymuDocDataset`` as its ``lang`` keyword.

    Returns:
    --------
    results : list
        One ``PymuDocDataset`` per entry of ``pdf_paths``, in the same order
    """
    results = []
    for pdf_path in pdf_paths:
        with open(pdf_path, 'rb') as f:
            pdf_bytes = f.read()
        results.append(PymuDocDataset(pdf_bytes, lang=lang))
    return results
|
|
|
|
|
|
#
|
|
# # Get page counts for each PDF
|
|
# pdf_info = []
|
|
# total_pages = 0
|
|
#
|
|
# for pdf_path in pdf_paths:
|
|
# try:
|
|
# doc = fitz.open(pdf_path)
|
|
# num_pages = len(doc)
|
|
# pdf_info.append((pdf_path, num_pages))
|
|
# total_pages += num_pages
|
|
# doc.close()
|
|
# except Exception as e:
|
|
# print(f'Error opening {pdf_path}: {e}')
|
|
#
|
|
# # Partition the jobs based on page count; each job is a (pdf_path, num_pages) pair
|
|
# partitions = partition_array_greedy(pdf_info, k)
|
|
#
|
|
# # Process each partition in parallel
|
|
# all_images_h = {}
|
|
#
|
|
# with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
|
|
# # Submit one task per partition
|
|
# futures = []
|
|
# for sn, partition in enumerate(partitions):
|
|
# # Get the jobs for this partition
|
|
# partition_jobs = [pdf_info[idx] for idx in partition]
|
|
#
|
|
# # Submit the task
|
|
# future = executor.submit(
|
|
# process_pdf_batch,
|
|
# partition_jobs,
|
|
# sn
|
|
# )
|
|
# futures.append(future)
|
|
# # Process results as they complete
|
|
# for i, future in enumerate(concurrent.futures.as_completed(futures)):
|
|
# try:
|
|
# idx, images = future.result()
|
|
# all_images_h[idx] = images
|
|
# except Exception as e:
|
|
# print(f'Error processing partition: {e}')
|
|
# results = [None] * len(pdf_paths)
|
|
# for i in range(len(partitions)):
|
|
# partition = partitions[i]
|
|
# for j in range(len(partition)):
|
|
# with open(pdf_info[partition[j]][0], 'rb') as f:
|
|
# pdf_bytes = f.read()
|
|
# dataset = PymuDocDataset(pdf_bytes, lang=lang)
|
|
# dataset.set_images(all_images_h[i][j])
|
|
# results[partition[j]] = dataset
|
|
# return results |