You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
4.0 KiB
Python
100 lines
4.0 KiB
Python
1 month ago
|
import io
|
||
|
import random
|
||
|
import re
|
||
|
from typing import Tuple
|
||
|
|
||
|
import markdown2
|
||
|
from PIL import Image
|
||
|
from playwright.sync_api import sync_playwright
|
||
|
|
||
|
from benchmarks.overall.methods.schema import BenchmarkResult
|
||
|
from marker.renderers.markdown import MarkdownRenderer
|
||
|
|
||
|
|
||
|
class BaseMethod:
|
||
|
def __init__(self, **kwargs):
|
||
|
for kwarg in kwargs:
|
||
|
if hasattr(self, kwarg):
|
||
|
setattr(self, kwarg, kwargs[kwarg])
|
||
|
|
||
|
@staticmethod
|
||
|
def convert_to_md(html: str):
|
||
|
md = MarkdownRenderer()
|
||
|
markdown = md.md_cls.convert(html)
|
||
|
return markdown
|
||
|
|
||
|
def __call__(self, sample) -> BenchmarkResult:
|
||
|
raise NotImplementedError()
|
||
|
|
||
|
def render(self, markdown: str):
|
||
|
return self.html_to_image(self.convert_to_html(markdown))
|
||
|
|
||
|
@staticmethod
|
||
|
def convert_to_html(md: str):
|
||
|
block_placeholders = []
|
||
|
inline_placeholders = []
|
||
|
|
||
|
# Add placeholders for the math
|
||
|
def block_sub(match):
|
||
|
content = match.group(1)
|
||
|
placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
|
||
|
block_placeholders.append((placeholder, f"$${content}$$"))
|
||
|
return placeholder
|
||
|
|
||
|
def inline_sub(match):
|
||
|
content = match.group(1)
|
||
|
placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
|
||
|
inline_placeholders.append((placeholder, f"${content}$"))
|
||
|
return placeholder
|
||
|
|
||
|
md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
|
||
|
md = re.sub(r'\$(.*?)\$', inline_sub, md)
|
||
|
|
||
|
html = markdown2.markdown(md, extras=['tables'])
|
||
|
|
||
|
# Replace placeholders
|
||
|
for placeholder, math_str in block_placeholders:
|
||
|
html = html.replace(placeholder, math_str)
|
||
|
for placeholder, math_str in inline_placeholders:
|
||
|
html = html.replace(placeholder, math_str)
|
||
|
|
||
|
return html
|
||
|
|
||
|
def html_to_image(self, html: str) -> Image.Image:
|
||
|
with sync_playwright() as p:
|
||
|
browser = p.chromium.launch()
|
||
|
page = browser.new_page()
|
||
|
html_str = f"""
|
||
|
<!DOCTYPE html>
|
||
|
<html>
|
||
|
<head>
|
||
|
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
|
||
|
<!-- The loading of KaTeX is deferred to speed up page rendering -->
|
||
|
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
|
||
|
<!-- To automatically render math in text elements, include the auto-render extension: -->
|
||
|
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.21/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
|
||
|
</head>
|
||
|
<body>
|
||
|
{html}
|
||
|
<script>
|
||
|
document.addEventListener("DOMContentLoaded", function() {{
|
||
|
renderMathInElement(document.body, {{
|
||
|
delimiters: [
|
||
|
{{left: '$$', right: '$$', display: true}},
|
||
|
{{left: '$', right: '$', display: false}}
|
||
|
],
|
||
|
throwOnError : false
|
||
|
}});
|
||
|
}});
|
||
|
</script>
|
||
|
</body>
|
||
|
</html>
|
||
|
""".strip()
|
||
|
page.set_viewport_size({"width": 1200, "height": 800})
|
||
|
page.set_content(html_str)
|
||
|
page.wait_for_load_state("domcontentloaded")
|
||
|
page.wait_for_timeout(500) # Wait for KaTeX to render
|
||
|
screenshot_bytes = page.screenshot(full_page=True)
|
||
|
browser.close()
|
||
|
|
||
|
return Image.open(io.BytesIO(screenshot_bytes))
|