# pdf2markdown/third_party/MinerU/magic_pdf/data/batch_build_dataset.py

import concurrent.futures

import fitz

from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF


def partition_array_greedy(arr, k):
    """Split the indices of ``arr`` into ``k`` groups of similar total weight.

    Heaviest-first greedy: entries are visited from the largest weight to
    the smallest, and each one joins whichever group currently has the
    smallest running total (the lowest-numbered group wins ties).

    Parameters:
    -----------
    arr : list
        Entries whose weight is read from position 1 (``arr[i][1]``)
    k : int
        Requested number of groups; capped at ``len(arr)``

    Returns:
    --------
    partitions : list of lists
        Groups of indices into ``arr`` (positions, not the entries)

    Raises:
    -------
    ValueError
        If ``k`` is zero or negative
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    n = len(arr)
    # Never create more groups than there are entries.
    group_count = n if k > n else k

    # Trivial layouts: everything in one group, or one entry per group.
    # Both keep the original index order and skip the weight lookup.
    if group_count == 1:
        return [[*range(n)]]
    if group_count == n:
        return [[position] for position in range(n)]

    def weight_of(position):
        return arr[position][1]

    # Stable sort, so equally heavy entries keep their original order.
    heaviest_first = list(range(n))
    heaviest_first.sort(key=weight_of, reverse=True)

    groups = [[] for _ in range(group_count)]
    totals = [0] * group_count

    for position in heaviest_first:
        # min() returns the first minimal element, i.e. the lowest-numbered
        # group among those with the smallest total.
        lightest = min(range(group_count), key=totals.__getitem__)
        groups[lightest].append(position)
        totals[lightest] += weight_of(position)

    return groups


def process_pdf_batch(pdf_jobs, idx):
    """Render every page of each PDF in a batch with ``fitz_doc_to_image``.

    Parameters:
    -----------
    pdf_jobs : list of tuples
        (pdf_path, value) pairs. Only ``pdf_path`` is used; the second
        element is ignored and all pages of the document are rendered.
    idx : any
        Caller-supplied tag for this batch, returned unchanged so the
        caller can match the result to the batch it submitted.

    Returns:
    --------
    result : tuple
        ``(idx, images)`` where ``images[i]`` is the list of per-page
        results of ``fitz_doc_to_image`` for ``pdf_jobs[i]``, in page order.
    """
    images = []

    for pdf_path, _ in pdf_jobs:
        doc = fitz.open(pdf_path)
        try:
            # NOTE(review): assumes the values returned by fitz_doc_to_image
            # remain usable after the document is closed - confirm.
            page_images = [
                fitz_doc_to_image(doc[page_num])
                for page_num in range(len(doc))
            ]
        finally:
            # Release the document handle even if rendering a page fails;
            # the previous version never closed it.
            doc.close()
        images.append(page_images)
    return (idx, images)


def batch_build_dataset(pdf_paths, k, lang=None):
    """Build one ``PymuDocDataset`` per PDF path, sequentially.

    Parameters:
    -----------
    pdf_paths : list
        List of paths to PDF files
    k : int
        Accepted for interface compatibility; not used by this function
    lang : any, optional
        Forwarded as the ``lang`` keyword to every ``PymuDocDataset``

    Returns:
    --------
    results : list
        One ``PymuDocDataset`` per entry of ``pdf_paths``, in the same order
    """

    def _read_bytes(path):
        # Read the whole file and close it before the dataset is built.
        with open(path, 'rb') as handle:
            return handle.read()

    return [
        PymuDocDataset(_read_bytes(path), lang=lang)
        for path in pdf_paths
    ]


    #
    # # Get page counts for each PDF
    # pdf_info = []
    # total_pages = 0
    #
    # for pdf_path in pdf_paths:
    #     try:
    #         doc = fitz.open(pdf_path)
    #         num_pages = len(doc)
    #         pdf_info.append((pdf_path, num_pages))
    #         total_pages += num_pages
    #         doc.close()
    #     except Exception as e:
    #         print(f'Error opening {pdf_path}: {e}')
    #
    # # Partition the jobs based on page count. Each job has 1 page
    # partitions = partition_array_greedy(pdf_info, k)
    #
    # # Process each partition in parallel
    # all_images_h = {}
    #
    # with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
    #     # Submit one task per partition
    #     futures = []
    #     for sn, partition in enumerate(partitions):
    #         # Get the jobs for this partition
    #         partition_jobs = [pdf_info[idx] for idx in partition]
    #
    #         # Submit the task
    #         future = executor.submit(
    #             process_pdf_batch,
    #             partition_jobs,
    #             sn
    #         )
    #         futures.append(future)
    #     # Process results as they complete
    #     for i, future in enumerate(concurrent.futures.as_completed(futures)):
    #         try:
    #             idx, images = future.result()
    #             all_images_h[idx] = images
    #         except Exception as e:
    #             print(f'Error processing partition: {e}')
    # results = [None] * len(pdf_paths)
    # for i in range(len(partitions)):
    #     partition = partitions[i]
    #     for j in range(len(partition)):
    #         with open(pdf_info[partition[j]][0], 'rb') as f:
    #             pdf_bytes = f.read()
    #         dataset = PymuDocDataset(pdf_bytes, lang=lang)
    #         dataset.set_images(all_images_h[i][j])
    #         results[partition[j]] = dataset
    # return results