You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
167 lines
5.2 KiB
Python
167 lines
5.2 KiB
Python
|
|
import multiprocessing as mp
|
|
import threading
|
|
from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
|
|
as_completed)
|
|
|
|
import fitz
|
|
import numpy as np
|
|
from loguru import logger
|
|
|
|
|
|
|
|
def fitz_doc_to_image(doc, dpi=200) -> dict:
    """Render a page to a pixmap and expose its pixels as a numpy array.

    Args:
        doc: object providing ``get_pixmap(matrix=..., alpha=...)``; callers
            in this file pass a single page taken from a fitz document.
        dpi (int, optional): target resolution; the render matrix scales
            both axes by ``dpi / 72``. Defaults to 200.

    Returns:
        dict: ``{'img': uint8 array reshaped to (height, width, 3),
        'width': pixmap width, 'height': pixmap height}``
    """
    zoom = dpi / 72
    pixmap = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # A scaled render with either side above 4500 px is discarded and the
    # page is rendered again with the identity matrix (no scaling).
    if max(pixmap.width, pixmap.height) > 4500:
        pixmap = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    height, width = pixmap.height, pixmap.width

    # Wrap the raw sample buffer directly; 3 channels because alpha is off.
    pixels = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(height, width, 3)

    return {'img': pixels, 'width': width, 'height': height}
|
|
|
|
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
    """Render a page range of an in-memory PDF to image dicts.

    The returned list always has one entry per page of the document so that
    list indices equal page indices; pages outside the requested range get
    an empty placeholder instead of a rendered image.

    Args:
        pdf_bytes (bytes): raw content of the PDF file.
        dpi (int, optional): resolution forwarded to ``fitz_doc_to_image``.
            Defaults to 200.
        start_page_id (int, optional): first page index to render
            (inclusive). Defaults to 0.
        end_page_id (int | None, optional): last page index to render
            (inclusive). ``None`` or a negative value means "up to the last
            page"; a value past the last page is clamped with a warning.

    Returns:
        list: one dict per page, either the result of ``fitz_doc_to_image``
        or ``{'img': [], 'width': 0, 'height': 0}`` for skipped pages.
    """
    images = []
    with fitz.open('pdf', pdf_bytes) as doc:
        last_page_id = doc.page_count - 1

        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = last_page_id

        for index in range(doc.page_count):
            if start_page_id <= index <= end_page_id:
                # Reuse the shared renderer instead of duplicating its
                # scaling / 4500 px fallback logic here.
                img_dict = fitz_doc_to_image(doc[index], dpi=dpi)
            else:
                img_dict = {'img': [], 'width': 0, 'height': 0}

            images.append(img_dict)
    return images
|
|
|
|
|
|
def convert_page(bytes_page):
    """Render the first page of a PDF given as bytes.

    Used as the worker function of ``parallel_process_pdf_safe``: it takes
    plain bytes, opens them as a PDF itself, and renders page 0.

    Args:
        bytes_page (bytes): content of a PDF; only its first page is used.

    Returns:
        dict: the result of ``fitz_doc_to_image`` for the first page.
    """
    # Open in a context manager so the document is closed even if rendering
    # raises; the original left the document open.
    with fitz.open('pdf', bytes_page) as pdfs:
        return fitz_doc_to_image(pdfs[0])
|
|
|
|
def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Convert pages in a process pool and return the results in input order.

    Args:
        pages: iterable of items handed one by one to ``convert_page``
            (the script at the bottom of this file passes single-page PDFs
            serialized to bytes).
        num_workers (int | None): size of the process pool; ``None`` means
            ``mp.cpu_count()``.
        **kwargs: accepted but not used by this function.

    Returns:
        list: ``convert_page`` results, ordered like ``pages``.
    """
    worker_count = mp.cpu_count() if num_workers is None else num_workers

    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        # Materialize inside the ``with`` so every task has finished before
        # the pool shuts down.
        return list(pool.map(convert_page, pages))
|
|
|
|
|
|
def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order. The entry of a page whose
        conversion raised is ``None``.
    """
    # NOTE(review): the same document is used from several worker threads;
    # confirm against the PyMuPDF documentation that this is supported.
    # The context manager closes the document even if submission fails.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

        # Pre-sized so results can be stored by page index, whatever the
        # completion order; failed pages simply keep their ``None``.
        results = [None] * num_pages

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Map each future back to the page it renders.
            futures = {
                executor.submit(fitz_doc_to_image, doc[page_num], **kwargs): page_num
                for page_num in range(num_pages)
            }

            # Collect results as they complete.
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    logger.error('Error processing page {}: {}', page_num, e)

    # The original built this list but never returned it.
    return results
|
|
|
|
if __name__ == '__main__':
    # Manual benchmark driver: split the sample PDF into single-page PDFs
    # serialized to bytes and convert them in a process pool.
    with fitz.open('/tmp/[MS-DOC].pdf') as pdf:
        pdf_page = []
        for i in range(pdf.page_count):
            # One fresh, empty document per page; closed once serialized.
            with fitz.open() as single_page:
                single_page.insert_pdf(pdf, from_page=i, to_page=i)
                pdf_page.append(single_page.tobytes())

    results = parallel_process_pdf_safe(pdf_page, num_workers=16)

    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)

    # benchmark results of multi-threaded processing (fitz page to image)
    # total page nums: 578
    # thread nums,    time cost
    # 1               7.351 sec
    # 2               6.334 sec
    # 4               5.968 sec
    # 8               6.728 sec
    # 16              8.085 sec

    # benchmark results of multi-processor processing (fitz page to image)
    # total page nums: 578
    # processor nums, time cost
    # 1               17.170 sec
    # 2               10.170 sec
    # 4               7.841 sec
    # 8               7.900 sec
    # 16              7.984 sec
|