pdf2markdown/third_party/MinerU/magic_pdf/spark/spark_api.py

50 lines
1.1 KiB
Python

from loguru import logger
from magic_pdf.config.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return the data source recorded in *jso*.

    The ``'data_source'`` key is preferred. When it is missing or holds
    ``None``, the value of ``'file_source'`` is returned instead (which is
    itself ``None`` when that key is absent too).
    """
    primary = jso.get('data_source')
    if primary is not None:
        return primary
    # Only a missing/None primary triggers the fallback; other falsy values
    # such as '' or 0 are returned as-is above.
    return jso.get('file_source')
def get_data_type(jso: dict):
    """Return the data type recorded in *jso*.

    Looks up ``'data_type'`` first; if that key is missing or its value is
    ``None``, the ``'file_type'`` entry is used instead. ``None`` is returned
    when neither key supplies a value.
    """
    declared = jso.get('data_type')
    # Explicit None check: falsy-but-present values ('' / 0) must win over
    # the fallback key.
    return jso.get('file_type') if declared is None else declared
def get_bookid(jso: dict):
    """Return the book identifier recorded in *jso*.

    The ``'bookid'`` key takes precedence. When it is missing or ``None``,
    the value stored under ``'original_file_id'`` is returned (``None`` if
    that key is absent as well).
    """
    identifier = jso.get('bookid')
    if identifier is not None:
        return identifier
    # Fallback key, consulted only when 'bookid' is missing or None.
    return jso.get('original_file_id')
def exception_handler(jso: dict, e):
    """Log *e* and flag *jso* as a record to be dropped.

    The exception is reported through ``logger.exception``. Three keys are
    then written into *jso* in place:

    * ``'_need_drop'``   -- ``True``
    * ``'_drop_reason'`` -- ``DropReason.Exception``
    * ``'_exception'``   -- the text ``'ERROR: '`` followed by ``str(e)``

    Returns the same (mutated) *jso* object.
    """
    logger.exception(e)
    # Single update call; keys are inserted in the same order as before.
    jso.update({
        '_need_drop': True,
        '_drop_reason': DropReason.Exception,
        '_exception': f'ERROR: {e}',
    })
    return jso
def get_bookname(jso: dict):
    """Build the book name for *jso* as ``'<data source>/<file id>'``.

    The data source comes from ``get_data_source(jso)`` and the file id from
    the ``'file_id'`` key. Both parts are interpolated as-is, so a missing
    value appears as the literal text ``None`` in the result.
    """
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    return f'{data_source}/{file_id}'
def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the JSON record and return it as a new dict.

    The result has two entries: ``'_pdf_type'`` (copied from the same key of
    *jso*) and ``'model_list'`` (the object stored under
    ``'doc_layout_result'``, passed through without copying).

    Raises:
        KeyError: if ``'_pdf_type'`` or ``'doc_layout_result'`` is missing.
    """
    # Plain indexing on purpose: both keys are required, and a missing one
    # should surface as KeyError rather than a silent None.
    pdf_type = jso['_pdf_type']
    layout_result = jso['doc_layout_result']
    return {'_pdf_type': pdf_type, 'model_list': layout_result}