You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

162 lines
5.0 KiB
Python

import os
import shutil
import tempfile
from pathlib import Path
import click
import fitz
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']
@click.command()
@click.version_option(__version__,
'--version',
'-v',
help='display the version and exit')
@click.option(
'-p',
'--path',
'path',
type=click.Path(exists=True),
required=True,
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
'-o',
'--output-dir',
'output_dir',
type=click.Path(),
required=True,
help='output local directory',
)
@click.option(
'-m',
'--method',
'method',
type=parse_pdf_methods,
help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
default='auto',
)
@click.option(
'-l',
'--lang',
'lang',
type=str,
help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
""",
default=None,
)
@click.option(
'-d',
'--debug',
'debug_able',
type=bool,
help='Enables detailed debugging information during the execution of the CLI commands.',
default=False,
)
@click.option(
'-s',
'--start',
'start_page_id',
type=int,
help='The starting page for PDF parsing, beginning from 0.',
default=0,
)
@click.option(
'-e',
'--end',
'end_page_id',
type=int,
help='The ending page for PDF parsing, beginning from 0.',
default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f'Unknown file suffix: {path.suffix}')
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def parse_doc(doc_path: Path, dataset: Dataset | None = None):
try:
file_name = str(Path(doc_path).stem)
if dataset is None:
pdf_data_or_dataset = read_fn(doc_path)
else:
pdf_data_or_dataset = dataset
do_parse(
output_dir,
file_name,
pdf_data_or_dataset,
[],
method,
debug_able,
start_page_id=start_page_id,
end_page_id=end_page_id,
lang=lang
)
except Exception as e:
logger.exception(e)
if os.path.isdir(path):
doc_paths = []
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
if doc_path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(doc_path), temp_dir)
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
elif doc_path.suffix in image_suffixes:
with open(str(doc_path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
doc_path = Path(fn)
doc_paths.append(doc_path)
datasets = batch_build_dataset(doc_paths, 4, lang)
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
else:
parse_doc(Path(path))
shutil.rmtree(temp_dir)
if __name__ == '__main__':
cli()