# pdf2markdown/third_party/MinerU/magic_pdf/tools/cli.py

import os
import shutil
import tempfile
from pathlib import Path

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf

# File extensions (including the leading dot) that the CLI accepts, grouped by
# how the input has to be handled before parsing.
# PDFs are read as-is.
pdf_suffixes = ['.pdf']
# MS Office documents are first converted to PDF via convert_file_to_pdf.
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
# Images are first wrapped into a PDF via fitz (PyMuPDF).
image_suffixes = ['.png', '.jpeg', '.jpg']


def _as_pdf_path(src: Path, temp_dir: str) -> Path:
    """Return the path of a PDF equivalent of *src*, converting when needed.

    PDFs are returned unchanged. MS Office documents and images are converted
    into ``<temp_dir>/<stem>.pdf`` and that path is returned. The suffix is
    compared case-insensitively so that e.g. ``.PDF`` or ``.JPG`` is accepted.

    Args:
        src: Path of the input document.
        temp_dir: Directory that receives converted PDFs.

    Returns:
        Path of a PDF file holding the content of *src*.

    Raises:
        Exception: If the suffix of *src* is not one of the supported ones.
    """
    suffix = src.suffix.lower()
    if suffix in pdf_suffixes:
        return src

    # NOTE: the converted file is keyed on the stem only, so two inputs that
    # share a stem (e.g. a.docx and a.png) overwrite each other's output.
    converted = Path(os.path.join(temp_dir, f'{src.stem}.pdf'))
    if suffix in ms_office_suffixes:
        convert_file_to_pdf(str(src), temp_dir)
    elif suffix in image_suffixes:
        # Close the fitz document deterministically instead of leaving it
        # open until garbage collection.
        with fitz.open(stream=src.read_bytes()) as image_doc:
            pdf_bytes = image_doc.convert_to_pdf()
        converted.write_bytes(pdf_bytes)
    else:
        raise Exception(f'Unknown file suffix: {src.suffix}')
    return converted


@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    # Entry point of the command line tool: parse a single document, or every
    # supported document directly inside a directory, into `output_dir`.
    #
    # Documented with comments rather than a docstring on purpose: click would
    # surface a docstring as the command's --help text.
    #
    #   path          existing file or directory to parse
    #   output_dir    directory that receives the results (created if missing)
    #   method        parse method handed to do_parse / batch_do_parse
    #   lang          optional language hint handed to the parsing functions
    #   debug_able    debug switch handed to the parsing functions
    #   start_page_id first page to parse (single-file mode only)
    #   end_page_id   last page to parse (single-file mode only)
    os.makedirs(output_dir, exist_ok=True)

    # The context manager removes the scratch directory even when conversion
    # or parsing raises, so failed runs no longer leak temporary files.
    with tempfile.TemporaryDirectory() as temp_dir:
        if os.path.isdir(path):
            supported_suffixes = pdf_suffixes + image_suffixes + ms_office_suffixes
            doc_paths = [
                _as_pdf_path(candidate, temp_dir)
                for candidate in Path(path).glob('*')
                if candidate.suffix.lower() in supported_suffixes
            ]
            if not doc_paths:
                logger.warning(f'No supported documents found in {path}')
                return

            # NOTE(review): start_page_id / end_page_id are not forwarded in
            # directory mode; whole documents are parsed.
            datasets = batch_build_dataset(doc_paths, 4, lang)
            batch_do_parse(
                output_dir,
                [str(doc_path.stem) for doc_path in doc_paths],
                datasets,
                method,
                debug_able,
                lang=lang,
            )
        else:
            doc_path = Path(path)
            # Failures of a single document are logged rather than raised.
            try:
                pdf_path = str(_as_pdf_path(doc_path, temp_dir))
                reader = FileBasedDataReader(os.path.dirname(pdf_path))
                pdf_bytes = reader.read(os.path.basename(pdf_path))
                do_parse(
                    output_dir,
                    str(doc_path.stem),
                    pdf_bytes,
                    [],
                    method,
                    debug_able,
                    start_page_id=start_page_id,
                    end_page_id=end_page_id,
                    lang=lang,
                )
            except Exception as e:
                logger.exception(e)


# Run the click command when this module is executed as a script.
if __name__ == '__main__':
    cli()
文本方向分类优化&更新版面分析模型 1 month ago			`import os`
			`import shutil`
			`import tempfile`
			`from pathlib import Path`

			`import click`
			`import fitz`
			`from loguru import logger`

			`import magic_pdf.model as model_config`
			`from magic_pdf.data.batch_build_dataset import batch_build_dataset`
			`from magic_pdf.data.data_reader_writer import FileBasedDataReader`
			`from magic_pdf.data.dataset import Dataset`
			`from magic_pdf.libs.version import __version__`
			`from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods`
			`from magic_pdf.utils.office_to_pdf import convert_file_to_pdf`

			`pdf_suffixes = ['.pdf']`
			`ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']`
			`image_suffixes = ['.png', '.jpeg', '.jpg']`


			`@click.command()`
			`@click.version_option(__version__,`
			`'--version',`
			`'-v',`
			`help='display the version and exit')`
			`@click.option(`
			`'-p',`
			`'--path',`
			`'path',`
			`type=click.Path(exists=True),`
			`required=True,`
			`help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',`
			`)`
			`@click.option(`
			`'-o',`
			`'--output-dir',`
			`'output_dir',`
			`type=click.Path(),`
			`required=True,`
			`help='output local directory',`
			`)`
			`@click.option(`
			`'-m',`
			`'--method',`
			`'method',`
			`type=parse_pdf_methods,`
			`help="""the method for parsing pdf.`
			`ocr: using ocr technique to extract information from pdf.`
			`txt: suitable for the text-based pdf only and outperform ocr.`
			`auto: automatically choose the best method for parsing pdf from ocr and txt.`
			`without method specified, auto will be used by default.""",`
			`default='auto',`
			`)`
			`@click.option(`
			`'-l',`
			`'--lang',`
			`'lang',`
			`type=str,`
			`help="""`
			`Input the languages in the pdf (if known) to improve OCR accuracy. Optional.`
			`You should input "Abbreviation" with language form url:`
			`https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations`
			`""",`
			`default=None,`
			`)`
			`@click.option(`
			`'-d',`
			`'--debug',`
			`'debug_able',`
			`type=bool,`
			`help='Enables detailed debugging information during the execution of the CLI commands.',`
			`default=False,`
			`)`
			`@click.option(`
			`'-s',`
			`'--start',`
			`'start_page_id',`
			`type=int,`
			`help='The starting page for PDF parsing, beginning from 0.',`
			`default=0,`
			`)`
			`@click.option(`
			`'-e',`
			`'--end',`
			`'end_page_id',`
			`type=int,`
			`help='The ending page for PDF parsing, beginning from 0.',`
			`default=None,`
			`)`
			`def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):`
			`os.makedirs(output_dir, exist_ok=True)`
			`temp_dir = tempfile.mkdtemp()`
			`def read_fn(path: Path):`
			`if path.suffix in ms_office_suffixes:`
			`convert_file_to_pdf(str(path), temp_dir)`
			`fn = os.path.join(temp_dir, f'{path.stem}.pdf')`
			`elif path.suffix in image_suffixes:`
			`with open(str(path), 'rb') as f:`
			`bits = f.read()`
			`pdf_bytes = fitz.open(stream=bits).convert_to_pdf()`
			`fn = os.path.join(temp_dir, f'{path.stem}.pdf')`
			`with open(fn, 'wb') as f:`
			`f.write(pdf_bytes)`
			`elif path.suffix in pdf_suffixes:`
			`fn = str(path)`
			`else:`
			`raise Exception(f'Unknown file suffix: {path.suffix}')`

			`disk_rw = FileBasedDataReader(os.path.dirname(fn))`
			`return disk_rw.read(os.path.basename(fn))`

			`def parse_doc(doc_path: Path, dataset: Dataset \| None = None):`
			`try:`
			`file_name = str(Path(doc_path).stem)`
			`if dataset is None:`
			`pdf_data_or_dataset = read_fn(doc_path)`
			`else:`
			`pdf_data_or_dataset = dataset`
			`do_parse(`
			`output_dir,`
			`file_name,`
			`pdf_data_or_dataset,`
			`[],`
			`method,`
			`debug_able,`
			`start_page_id=start_page_id,`
			`end_page_id=end_page_id,`
			`lang=lang`
			`)`

			`except Exception as e:`
			`logger.exception(e)`

			`if os.path.isdir(path):`
			`doc_paths = []`
			`for doc_path in Path(path).glob('*'):`
			`if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:`
			`if doc_path.suffix in ms_office_suffixes:`
			`convert_file_to_pdf(str(doc_path), temp_dir)`
			`doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))`
			`elif doc_path.suffix in image_suffixes:`
			`with open(str(doc_path), 'rb') as f:`
			`bits = f.read()`
			`pdf_bytes = fitz.open(stream=bits).convert_to_pdf()`
			`fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')`
			`with open(fn, 'wb') as f:`
			`f.write(pdf_bytes)`
			`doc_path = Path(fn)`
			`doc_paths.append(doc_path)`
			`datasets = batch_build_dataset(doc_paths, 4, lang)`
			`batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)`
			`else:`
			`parse_doc(Path(path))`

			`shutil.rmtree(temp_dir)`


			`if __name__ == '__main__':`
			`cli()`