162 lines
5.0 KiB
Python
162 lines
5.0 KiB
Python
import os
|
|
import shutil
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import click
|
|
import fitz
|
|
from loguru import logger
|
|
|
|
import magic_pdf.model as model_config
|
|
from magic_pdf.data.batch_build_dataset import batch_build_dataset
|
|
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
|
from magic_pdf.data.dataset import Dataset
|
|
from magic_pdf.libs.version import __version__
|
|
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
|
|
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
|
|
|
|
pdf_suffixes = ['.pdf']
|
|
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
|
|
image_suffixes = ['.png', '.jpeg', '.jpg']
|
|
|
|
|
|
@click.command()
|
|
@click.version_option(__version__,
|
|
'--version',
|
|
'-v',
|
|
help='display the version and exit')
|
|
@click.option(
|
|
'-p',
|
|
'--path',
|
|
'path',
|
|
type=click.Path(exists=True),
|
|
required=True,
|
|
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
|
|
)
|
|
@click.option(
|
|
'-o',
|
|
'--output-dir',
|
|
'output_dir',
|
|
type=click.Path(),
|
|
required=True,
|
|
help='output local directory',
|
|
)
|
|
@click.option(
|
|
'-m',
|
|
'--method',
|
|
'method',
|
|
type=parse_pdf_methods,
|
|
help="""the method for parsing pdf.
|
|
ocr: using ocr technique to extract information from pdf.
|
|
txt: suitable for the text-based pdf only and outperform ocr.
|
|
auto: automatically choose the best method for parsing pdf from ocr and txt.
|
|
without method specified, auto will be used by default.""",
|
|
default='auto',
|
|
)
|
|
@click.option(
|
|
'-l',
|
|
'--lang',
|
|
'lang',
|
|
type=str,
|
|
help="""
|
|
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
|
|
You should input "Abbreviation" with language form url:
|
|
https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
|
|
""",
|
|
default=None,
|
|
)
|
|
@click.option(
|
|
'-d',
|
|
'--debug',
|
|
'debug_able',
|
|
type=bool,
|
|
help='Enables detailed debugging information during the execution of the CLI commands.',
|
|
default=False,
|
|
)
|
|
@click.option(
|
|
'-s',
|
|
'--start',
|
|
'start_page_id',
|
|
type=int,
|
|
help='The starting page for PDF parsing, beginning from 0.',
|
|
default=0,
|
|
)
|
|
@click.option(
|
|
'-e',
|
|
'--end',
|
|
'end_page_id',
|
|
type=int,
|
|
help='The ending page for PDF parsing, beginning from 0.',
|
|
default=None,
|
|
)
|
|
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
temp_dir = tempfile.mkdtemp()
|
|
def read_fn(path: Path):
|
|
if path.suffix in ms_office_suffixes:
|
|
convert_file_to_pdf(str(path), temp_dir)
|
|
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
|
|
elif path.suffix in image_suffixes:
|
|
with open(str(path), 'rb') as f:
|
|
bits = f.read()
|
|
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
|
|
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
|
|
with open(fn, 'wb') as f:
|
|
f.write(pdf_bytes)
|
|
elif path.suffix in pdf_suffixes:
|
|
fn = str(path)
|
|
else:
|
|
raise Exception(f'Unknown file suffix: {path.suffix}')
|
|
|
|
disk_rw = FileBasedDataReader(os.path.dirname(fn))
|
|
return disk_rw.read(os.path.basename(fn))
|
|
|
|
def parse_doc(doc_path: Path, dataset: Dataset | None = None):
|
|
try:
|
|
file_name = str(Path(doc_path).stem)
|
|
if dataset is None:
|
|
pdf_data_or_dataset = read_fn(doc_path)
|
|
else:
|
|
pdf_data_or_dataset = dataset
|
|
do_parse(
|
|
output_dir,
|
|
file_name,
|
|
pdf_data_or_dataset,
|
|
[],
|
|
method,
|
|
debug_able,
|
|
start_page_id=start_page_id,
|
|
end_page_id=end_page_id,
|
|
lang=lang
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.exception(e)
|
|
|
|
if os.path.isdir(path):
|
|
doc_paths = []
|
|
for doc_path in Path(path).glob('*'):
|
|
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
|
|
if doc_path.suffix in ms_office_suffixes:
|
|
convert_file_to_pdf(str(doc_path), temp_dir)
|
|
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
|
|
elif doc_path.suffix in image_suffixes:
|
|
with open(str(doc_path), 'rb') as f:
|
|
bits = f.read()
|
|
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
|
|
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
|
|
with open(fn, 'wb') as f:
|
|
f.write(pdf_bytes)
|
|
doc_path = Path(fn)
|
|
doc_paths.append(doc_path)
|
|
datasets = batch_build_dataset(doc_paths, 4, lang)
|
|
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
|
|
else:
|
|
parse_doc(Path(path))
|
|
|
|
shutil.rmtree(temp_dir)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
cli()
|