You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
19 lines
520 B
Python
19 lines
520 B
Python
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
|
from magic_pdf.data.read_api import read_local_images
|
|
from markdownify import markdownify as md
|
|
import re
|
|
|
|
|
|
# Pipeline: run OCR analysis on one layout image, convert the result to
# Markdown, and print it.

# Image handed to the reader below.
input_file = "/mnt/research/PaddleOCR/pdf2md_pipeline/s4_content_recognition/all_layouts/207.jpg"

# read_local_images returns an indexable collection; only the first entry
# is processed here.
ds = read_local_images(input_file)[0]

# Apply doc_analyze with OCR enabled, then switch the result to OCR mode.
# NOTE(review): None is presumably the image writer argument, i.e. no
# extracted images are persisted -- confirm against the magic_pdf API.
x = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(None)

# NOTE(review): None is presumably the image directory/prefix used when
# rendering image links -- confirm against the magic_pdf API.
html = x.get_markdown(None)

# Pass the rendered text through markdownify, then strip the backslash that
# precedes any '#', '*', '_' or '`' so those characters appear unescaped.
content = re.sub(r'\\([#*_`])', r'\1', md(html))

print(content)
|