import json from PIL import Image from google import genai from google.genai import types from io import BytesIO from pydantic import BaseModel from marker.settings import settings prompt = """ You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation. Guidelines: - Keep the HTML simple and concise. - Only include the tag and contents. - Only use
, , and , , or
tags. Only use the colspan and rowspan attributes if necessary. Do not use
tags. - Make sure the table is as faithful to the image as possible with the given tags. **Instructions** 1. Analyze the image, and determine the table structure. 2. Convert the table image to HTML, following the guidelines above. 3. Output only the HTML for the table, starting with the tag and ending with the
tag. """.strip() class TableSchema(BaseModel): table_html: str def gemini_table_rec(image: Image.Image): client = genai.Client( api_key=settings.GOOGLE_API_KEY, http_options={"timeout": 60000} ) image_bytes = BytesIO() image.save(image_bytes, format="PNG") responses = client.models.generate_content( model="gemini-2.0-flash", contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element config={ "temperature": 0, "response_schema": TableSchema, "response_mime_type": "application/json", }, ) output = responses.candidates[0].content.parts[0].text return json.loads(output)["table_html"]