pdf2markdown/third_party/MinerU/tests/test_cli/test_cli_sdk.py

425 lines
18 KiB
Python

"""test cli and sdk."""
import logging
import os
import pytest
from conf import conf
from lib import common
import time
import magic_pdf.model as model_config
from magic_pdf.data.read_api import read_local_images
from magic_pdf.data.read_api import read_local_office
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
magic_pdf_config = "/home/quyuan/magic-pdf.json"
class TestCli:
"""test cli."""
@pytest.fixture(autouse=True)
def setup(self):
"""
init
"""
common.clear_gpu_memory()
common.update_config_file(magic_pdf_config, "device-mode", "cuda")
# 这里可以添加任何前置操作
yield
@pytest.mark.P0
def test_pdf_local_sdk(self):
"""pdf sdk auto test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_path)
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
common.delete_file(dir_path)
### draw model result on each page
infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
### draw spans result on each page
pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
### dump markdown
md_content = pipe_result.get_markdown(image_dir)
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get middle json
middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_s3_sdk(self):
"""pdf s3 sdk test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
pass
@pytest.mark.P0
def test_pdf_local_ppt(self):
"""pdf sdk auto test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'ppt')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pptx'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'ppt', f'{demo_name}.pptx')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".pptx")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
ds = read_local_office(pdf_path)[0]
common.delete_file(dir_path)
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_local_image(self):
"""pdf sdk auto test."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'images')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.jpg'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'images', f'{demo_name}.jpg')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".jpg")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
common.delete_file(dir_path)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
ds = read_local_images(pdf_path)[0]
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir)
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_local_image_dir(self):
"""local image dir."""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'images')
dir_path = os.path.join(pdf_dev_path, 'mineru')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
common.delete_file(dir_path)
dss = read_local_images(pdf_path, suffixes=['.png', '.jpg'])
count = 0
for ds in dss:
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{count}.md", image_dir)
count += 1
common.sdk_count_folders_and_check_contents(dir_path)
def test_local_doc_parse(self):
"""
doc 解析
"""
demo_names = list()
pdf_path = os.path.join(pdf_dev_path, 'doc')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.docx'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
pdf_path = os.path.join(pdf_dev_path, 'doc', f'{demo_name}.docx')
local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
image_dir = str(os.path.basename(local_image_dir))
name_without_suff = os.path.basename(pdf_path).split(".docx")[0]
dir_path = os.path.join(pdf_dev_path, 'mineru')
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
ds = read_local_office(pdf_path)[0]
common.delete_file(dir_path)
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_cli_auto(self):
"""magic_pdf cli test auto."""
time.sleep(2)
demo_names = []
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
res_path = os.path.join(pdf_dev_path, 'mineru')
common.delete_file(res_path)
cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
pdf_path, f'{demo_name}.pdf'), res_path, 'auto')
logging.info(cmd)
os.system(cmd)
common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'auto'))
@pytest.mark.P0
def test_pdf_cli_txt(self):
"""magic_pdf cli test txt."""
time.sleep(2)
demo_names = []
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
res_path = os.path.join(pdf_dev_path, 'mineru')
common.delete_file(res_path)
cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
pdf_path, f'{demo_name}.pdf'), res_path, 'txt')
logging.info(cmd)
os.system(cmd)
common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'txt'))
@pytest.mark.P0
def test_pdf_cli_ocr(self):
"""magic_pdf cli test ocr."""
time.sleep(2)
demo_names = []
pdf_path = os.path.join(pdf_dev_path, 'pdf')
for pdf_file in os.listdir(pdf_path):
if pdf_file.endswith('.pdf'):
demo_names.append(pdf_file.split('.')[0])
for demo_name in demo_names:
res_path = os.path.join(pdf_dev_path, 'mineru')
common.delete_file(res_path)
cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
pdf_path, f'{demo_name}.pdf'), res_path, 'ocr')
logging.info(cmd)
os.system(cmd)
common.cli_count_folders_and_check_contents(
os.path.join(res_path, demo_name, 'ocr'))
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_txt(self):
"""magic_pdf_dev cli local txt."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
logging.info(cmd)
os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_ocr(self):
"""magic_pdf_dev cli local ocr."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
logging.info(cmd)
os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_auto(self):
"""magic_pdf_dev cli local auto."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
logging.info(cmd)
os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_txt(self):
"""magic_pdf_dev cli s3 txt."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, "txt")
logging.info(cmd)
os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_ocr(self):
"""magic_pdf_dev cli s3 ocr."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'ocr')
logging.info(cmd)
os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_auto(self):
"""magic_pdf_dev cli s3 auto."""
time.sleep(2)
jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, 'auto')
logging.info(cmd)
os.system(cmd)
@pytest.mark.P1
def test_pdf_dev_cli_pdf_json_auto(self):
"""magic_pdf_dev cli pdf+json auto."""
time.sleep(2)
json_path = os.path.join(pdf_dev_path, 'test_model.json')
pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
logging.info(cmd)
os.system(cmd)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_pdf_json_ocr(self):
"""magic_pdf_dev cli pdf+json ocr."""
time.sleep(2)
json_path = os.path.join(pdf_dev_path, 'test_model.json')
pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, 'auto')
logging.info(cmd)
os.system(cmd)
@pytest.mark.P1
def test_local_magic_pdf_open_rapidai_table(self):
"""magic pdf cli open rapid ai table."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "rapid_table",
"enable": True,
"sub_model": "slanet_plus",
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
assert res is True
@pytest.mark.P1
def test_local_magic_pdf_doclayout_yolo(self):
"""magic pdf cli open doclyaout yolo."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "doclayout_yolo"
}
common.update_config_file(magic_pdf_config, "layout-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
@pytest.mark.skip(reason="layoutlmv3废弃")
@pytest.mark.P1
def test_local_magic_pdf_layoutlmv3_yolo(self):
"""magic pdf cli open layoutlmv3."""
time.sleep(2)
value = {
"model": "layoutlmv3"
}
common.update_config_file(magic_pdf_config, "layout-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
#res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
@pytest.mark.P1
def test_magic_pdf_cpu(self):
"""magic pdf cli cpu mode."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "rapid_table",
"enable": True,
"sub_model": "slanet_plus",
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
common.update_config_file(magic_pdf_config, "device-mode", "cpu")
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
@pytest.mark.P1
def test_local_magic_pdf_close_html_table(self):
"""magic pdf cli close table."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "rapid_table",
"enable": False,
"sub_model": "slanet_plus",
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
res = common.check_close_tables(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
assert res is True
if __name__ == '__main__':
pytest.main()