162 lines
5.0 KiB
Python

import os
import shutil
import tempfile
from pathlib import Path
import click
import fitz
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']
@click.command()
@click.version_option(__version__,
'--version',
'-v',
help='display the version and exit')
@click.option(
'-p',
'--path',
'path',
type=click.Path(exists=True),
required=True,
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
'-o',
'--output-dir',
'output_dir',
type=click.Path(),
required=True,
help='output local directory',
)
@click.option(
'-m',
'--method',
'method',
type=parse_pdf_methods,
help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
default='auto',
)
@click.option(
'-l',
'--lang',
'lang',
type=str,
help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
""",
default=None,
)
@click.option(
'-d',
'--debug',
'debug_able',
type=bool,
help='Enables detailed debugging information during the execution of the CLI commands.',
default=False,
)
@click.option(
'-s',
'--start',
'start_page_id',
type=int,
help='The starting page for PDF parsing, beginning from 0.',
default=0,
)
@click.option(
'-e',
'--end',
'end_page_id',
type=int,
help='The ending page for PDF parsing, beginning from 0.',
default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f'Unknown file suffix: {path.suffix}')
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def parse_doc(doc_path: Path, dataset: Dataset | None = None):
try:
file_name = str(Path(doc_path).stem)
if dataset is None:
pdf_data_or_dataset = read_fn(doc_path)
else:
pdf_data_or_dataset = dataset
do_parse(
output_dir,
file_name,
pdf_data_or_dataset,
[],
method,
debug_able,
start_page_id=start_page_id,
end_page_id=end_page_id,
lang=lang
)
except Exception as e:
logger.exception(e)
if os.path.isdir(path):
doc_paths = []
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
if doc_path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(doc_path), temp_dir)
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
elif doc_path.suffix in image_suffixes:
with open(str(doc_path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
doc_path = Path(fn)
doc_paths.append(doc_path)
datasets = batch_build_dataset(doc_paths, 4, lang)
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
else:
parse_doc(Path(path))
shutil.rmtree(temp_dir)
if __name__ == '__main__':
cli()