from loguru import logger

from magic_pdf.config.drop_reason import DropReason


def _get_with_fallback(jso: dict, key: str, fallback_key: str):
    """Return ``jso.get(key)``, or ``jso.get(fallback_key)`` when that is None.

    Only ``None`` (including a missing key) triggers the fallback; other
    falsy values such as ``''`` or ``0`` are returned as-is.
    """
    value = jso.get(key)
    return jso.get(fallback_key) if value is None else value


def get_data_source(jso: dict):
    """Return the record's ``'data_source'``, falling back to ``'file_source'``.

    Returns None when neither key holds a non-None value.
    """
    return _get_with_fallback(jso, 'data_source', 'file_source')


def get_data_type(jso: dict):
    """Return the record's ``'data_type'``, falling back to ``'file_type'``.

    Returns None when neither key holds a non-None value.
    """
    return _get_with_fallback(jso, 'data_type', 'file_type')


def get_bookid(jso: dict):
    """Return the record's ``'bookid'``, falling back to ``'original_file_id'``.

    Returns None when neither key holds a non-None value.
    """
    return _get_with_fallback(jso, 'bookid', 'original_file_id')


def exception_handler(jso: dict, e):
    """Log ``e`` and flag ``jso`` as a record to be dropped.

    The record is modified in place: ``'_need_drop'`` is set to True,
    ``'_drop_reason'`` to ``DropReason.Exception`` and ``'_exception'`` to
    an ``'ERROR: ...'`` string built from ``e``.

    Args:
        jso: The record to annotate.
        e: The exception (or other object) describing the failure.

    Returns:
        The same ``jso`` object that was passed in.
    """
    logger.exception(e)
    jso.update({
        '_need_drop': True,
        '_drop_reason': DropReason.Exception,
        '_exception': f'ERROR: {e}',
    })
    return jso


def get_bookname(jso: dict):
    """Return ``'<data_source>/<file_id>'`` for the record.

    The data source is resolved with ``get_data_source``; the file id is
    read from the ``'file_id'`` key. A missing part is rendered as the
    string ``'None'`` by the f-string formatting.
    """
    return f'{get_data_source(jso)}/{jso.get("file_id")}'


def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the json record and return it as a dict.

    Args:
        jso: A record that must contain the keys ``'_pdf_type'`` and
            ``'doc_layout_result'``.

    Returns:
        A new dict with ``'_pdf_type'`` copied through and the value of
        ``'doc_layout_result'`` stored under ``'model_list'``.

    Raises:
        KeyError: If either required key is absent from ``jso``.
    """
    pdf_type = jso['_pdf_type']
    layout_result = jso['doc_layout_result']
    return {'_pdf_type': pdf_type, 'model_list': layout_result}