"""Run OCR analysis on a single image and print the result as Markdown.

Usage:
    python <this_script> [IMAGE_PATH]

When IMAGE_PATH is omitted, the module-level ``input_file`` default is used.
"""
import re
import sys

from magic_pdf.data.read_api import read_local_images
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from markdownify import markdownify as md

# Default image processed when no path is given on the command line.
input_file = "/mnt/research/PaddleOCR/pdf2md_pipeline/s4_content_recognition/all_layouts/207.jpg"


def unescape_markdown(text: str) -> str:
    """Drop the backslash placed in front of ``#``, ``*``, ``_`` and backtick.

    Args:
        text: Markdown text that may contain backslash-escaped characters.

    Returns:
        ``text`` with every ``\\#``, ``\\*``, ``\\_`` and escaped backtick
        replaced by the bare character; everything else is left untouched.
    """
    return re.sub(r'\\([#*_`])', r'\1', text)


def image_to_markdown(image_path: str) -> str:
    """Run the OCR pipeline on ``image_path`` and return cleaned Markdown.

    Args:
        image_path: Path handed to ``read_local_images``.

    Returns:
        The pipeline output after it has been passed through ``markdownify``
        and had its backslash escapes removed.

    Raises:
        IndexError: If ``read_local_images`` returns an empty sequence.
    """
    # Create the dataset instance; only the first dataset returned is used.
    dataset = read_local_images(image_path)[0]

    inference = dataset.apply(doc_analyze, ocr=True)
    # NOTE(review): None is passed as the only argument to both calls below --
    # presumably the image writer / image directory; confirm against the
    # magic_pdf API.
    pipeline_result = inference.pipe_ocr_mode(None)
    raw_output = pipeline_result.get_markdown(None)

    # The pipeline output is run through markdownify (presumably because it
    # can still contain HTML fragments -- TODO confirm), then the escape
    # backslashes in front of #, *, _ and backtick are stripped.
    return unescape_markdown(md(raw_output))


def main(argv=None) -> None:
    """Print the Markdown for the image named on the command line.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. The first element, if present, overrides
            ``input_file``.
    """
    args = sys.argv[1:] if argv is None else argv
    image_path = args[0] if args else input_file
    print(image_to_markdown(image_path))


if __name__ == "__main__":
    main()