Multimodal-OCR2

Build error

App Files Files Community

prithivMLmods committed on Sep 25

Commit

63cec06

verified ·

1 Parent(s): 43c3626

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -322

app.py CHANGED Viewed

@@ -23,6 +23,8 @@ from transformers import (
 )
 from transformers.image_utils import load_image
 import re
 import ast
 import html
@@ -34,7 +36,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# --- Model Loading ---
 # Load Nanonets-OCR-s
 MODEL_ID_M = "nanonets/Nanonets-OCR-s"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -86,7 +87,7 @@ model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# --- Preprocessing and Helper Functions ---
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
@@ -120,7 +121,6 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Use 10 frames for video processing
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -133,11 +133,76 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-# A placeholder function in case docling_core is not installed
-def format_smoldocling_output(buffer_text, images):
-    cleaned_output = buffer_text.replace("<end_of_utterance>", "").strip()
-    # Check if docling_core is available and was imported
-    if 'DocTagsDocument' in globals() and 'DoclingDocument' in globals():
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
@@ -145,44 +210,43 @@ def format_smoldocling_output(buffer_text, images):
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
-            return buffer_text, markdown_output
-    # Fallback if library is not available or tags are not present
-    return buffer_text, cleaned_output
-# --- Core Generation Logic ---
-def get_model_and_processor(model_name):
-    """Helper to select model and processor."""
     if model_name == "Nanonets-OCR-s":
-        return processor_m, model_m
     elif model_name == "MonkeyOCR-Recognition":
-        return processor_g, model_g
     elif model_name == "SmolDocling-256M-preview":
-        return processor_x, model_x
     elif model_name == "Typhoon-OCR-7B":
-        return processor_l, model_l
     elif model_name == "Thyme-RL":
-        return processor_n, model_n
     else:
-        return None, None
-@spaces.GPU
-def generate_response(model_name: str, text: str, media_input, media_type: str,
-                      max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
-    """Unified generation function for both image and video."""
-    processor, model = get_model_and_processor(model_name)
-    if not processor or not model:
         yield "Invalid model selected.", "Invalid model selected."
         return
-    if media_input is None:
-        yield f"Please upload a {media_type}.", f"Please upload a {media_type}."
         return
-    if media_type == "video":
-        frames = downsample_video(media_input)
-        images = [frame for frame, _ in frames]
-    else: # image
-        images = [media_input]
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
@@ -191,7 +255,12 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
             text = normalize_values(text, target_max=500)
     messages = [
-        {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}]}
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
@@ -211,24 +280,23 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
     buffer = ""
     for new_text in streamer:
-        buffer += new_text.replace("<|end_of_text|>", "")
         yield buffer, buffer
     if model_name == "SmolDocling-256M-preview":
-        raw_output, formatted_output = format_smoldocling_output(buffer, images)
-        yield raw_output, formatted_output
-    else:
-        # For other models, the formatted output is just the cleaned buffer
-        yield buffer, buffer.strip()
-def generate_image_wrapper(text, img, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
-    yield from generate_response(model, text, img, "image", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
-def generate_video_wrapper(text, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
-    yield from generate_response(model, text, vid, "video", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
-# --- Examples ---
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
@@ -238,7 +306,7 @@ image_examples = [
     ["Convert chart to OTSL.", "images/4.png"],
     ["Convert code to text", "images/5.jpg"],
     ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to latex.", "images/7.jpg"],
 ]
 video_examples = [
@@ -246,292 +314,84 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
-# --- Custom CSS for the new UI ---
 css = """
-/* Left sidebar styles */
-.sidebar {
-    background-color: #f8f9fa;
-    border-right: 1px solid #e9ecef;
-    padding: 20px;
-    height: 100vh;
-}
-/* Main content area */
-.content-area {
-    padding: 20px;
-}
-/* Document grid */
-.doc-grid {
-    display: grid;
-    grid-template-columns: repeat(5, 1fr);
-    gap: 10px;
-    margin: 20px 0;
-}
-.doc-item {
-    border: 1px solid #dee2e6;
-    border-radius: 8px;
-    padding: 10px;
-    text-align: center;
-    height: 120px;
-    background-color: #f8f9fa;
-    cursor: pointer;
-    transition: all 0.2s ease;
-}
-.doc-item:hover {
-    border-color: #007bff;
-    background-color: #e9f0ff;
-}
-/* Upload and controls area */
-.upload-controls {
-    display: flex;
-    align-items: center;
-    gap: 10px;
-    margin: 20px 0;
-    padding: 15px;
-    border: 1px solid #e9ecef;
-    border-radius: 8px;
-}
-.file-upload {
-    flex: 1;
-}
-.model-dropdown {
-    width: 200px;
-}
 .submit-btn {
-    background-color: #007bff;
-    color: white;
-    border: none;
-    border-radius: 4px;
-    padding: 10px 20px;
-    font-size: 1.2rem;
-    cursor: pointer;
-    transition: background-color 0.2s;
 }
 .submit-btn:hover {
-    background-color: #0069d9;
-}
-/* Output area */
-.output-area {
-    margin-top: 20px;
 }
-/* Add conversation button */
-.add-conv-btn {
-    background-color: #28a745;
-    color: white;
-    border: none;
-    padding: 8px 15px;
-    border-radius: 4px;
-    cursor: pointer;
-}
-.add-conv-btn:hover {
-    background-color: #218838;
-}
-/* Examples section */
-.examples-section {
-    margin-top: 20px;
-}
-/* Header styles */
-.header {
-    margin-bottom: 15px;
-}
-/* Media upload icon styling */
-.upload-icon {
-    font-size: 1.5rem;
-    color: #6c757d;
-    margin-right: 10px;
-}
-/* Document icon styling */
-.doc-icon {
-    font-size: 2rem;
-    color: #6c757d;
-    margin-bottom: 5px;
-}
-/* Query input */
-.query-input {
-    margin: 15px 0;
-}
-/* Model dropdown styling */
-.model-dropdown .select {
-    padding: 8px 12px;
-    border: 1px solid #ced4da;
-    border-radius: 4px;
-}
-/* Output styling */
-.output-text {
-    border: 1px solid #ced4da;
-    border-radius: 4px;
-    padding: 10px;
-    min-height: 150px;
-}
-/* Add some space between elements */
-.gr-box {
-    margin-bottom: 15px;
 }
 """
-# --- Gradio Interface ---
-with gr.Blocks(css=css) as demo:
-    # Initialize state variables that hold data
-    image_upload_state = gr.State(None)
-    video_upload_state = gr.State(None)
-    media_type_state = gr.State("image")
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
-        # Left sidebar - OCR section
-        with gr.Column(scale=1, min_width=250, elem_classes="sidebar"):
-            gr.Markdown("## OCR")
-            add_conv_btn = gr.Button("+ Add Conv", elem_classes="add-conv-btn")
-            # Document grid
-            gr.Markdown("### Documents")
-            with gr.Group(elem_classes="doc-grid"):
-                for i in range(5):
-                    with gr.Column():
-                        gr.Markdown(f'<div class="doc-item"><div class="doc-icon">📄</div>Doc {i+1}</div>')
-        # Main content area
-        with gr.Column(scale=3, elem_classes="content-area"):
-            # Document processing section
-            with gr.Group():
-                gr.Markdown("## Multimodal OCR2")
-                # Document grid (5 document thumbnails as shown in the sketch)
-                with gr.Row(elem_classes="doc-grid"):
-                    for i in range(5):
-                        with gr.Column():
-                            doc_item = gr.Image(
-                                value=None,
-                                label=f"Document {i+1}",
-                                height=120,
-                                show_label=False,
-                                container=False,
-                                elem_classes="doc-item"
-                            )
-                # Define input components before they are referenced by gr.Examples
-                with gr.Group(elem_classes="upload-controls"):
-                    with gr.Column(elem_classes="file-upload"):
-                        file_upload = gr.File(
-                            label="Upload files (image/video)",
-                            file_types=["image", "video"],
-                            elem_classes="file-upload"
-                        )
-                    model_dropdown = gr.Dropdown(
-                        choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
-                        value="Nanonets-OCR-s",
-                        label="Select Model",
-                        elem_classes="model-dropdown"
                     )
-                    submit_btn = gr.Button("→", size="lg", elem_classes="submit-btn")
-                query_input = gr.Textbox(
-                    label="Enter your query",
-                    placeholder="Describe the image, extract text, convert to markdown...",
-                    elem_classes="query-input"
-                )
-                # Examples section
-                gr.Markdown("### Examples")
-                with gr.Row():
-                    with gr.Column():
-                        gr.Examples(
-                            examples=image_examples,
-                            inputs=[query_input, file_upload], # Corrected inputs
-                            label="Image Examples"
-                        )
-                    with gr.Column():
-                        gr.Examples(
-                            examples=video_examples,
-                            inputs=[query_input, file_upload], # Corrected inputs
-                            label="Video Examples"
-                        )
-                # Advanced options (hidden by default)
-                with gr.Accordion("Advanced Options", open=False):
-                    max_new_tokens = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                    top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                    top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                    repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-                # Output area
-                with gr.Group(elem_classes="output-area"):
-                    gr.Markdown("### Output")
-                    raw_output = gr.Textbox(
-                        label="Result",
-                        interactive=False,
-                        lines=10,
-                        elem_classes="output-text"
-                    )
-    # --- Event Handlers ---
-    def handle_file_upload(file):
-        if file is None:
-            return "image", None, None
-        file_path = file.name
-        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
-            return "image", Image.open(file_path), None
-        elif file_path.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
-            return "video", None, file_path
-        return "image", None, None
-    file_upload.change(
-        fn=handle_file_upload,
-        inputs=[file_upload],
-        outputs=[media_type_state, image_upload_state, video_upload_state] # Corrected outputs
     )
-    def generate_wrapper(text, img, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty, m_type):
-        media_input = None
-        if m_type == "image" and img is not None:
-            media_input = img
-        elif m_type == "video" and vid is not None:
-            media_input = vid
-        else:
-            yield "Please upload a valid file.", "Please upload a valid file."
-            return
-        yield from generate_response(model, text, media_input, m_type, max_tokens, temp, top_p_val, top_k_val, rep_penalty)
-    submit_btn.click(
-        fn=generate_wrapper,
-        inputs=[
-            query_input,
-            image_upload_state, # Corrected input state
-            video_upload_state, # Corrected input state
-            model_dropdown,
-            max_new_tokens,
-            temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
-            media_type_state # Corrected input state
-        ],
-        outputs=[raw_output, raw_output]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, show_error=True)

 )
 from transformers.image_utils import load_image
+from docling_core.types.doc import DoclingDocument, DocTagsDocument
 import re
 import ast
 import html
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR-s
 MODEL_ID_M = "nanonets/Nanonets-OCR-s"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
     torch_dtype=torch.float16
 ).to(device).eval()
+# Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
     vidcap.release()
     return frames
+@spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """Generate responses for image input using the selected model."""
+    if model_name == "Nanonets-OCR-s":
+        processor = processor_m
+        model = model_m
+    elif model_name == "MonkeyOCR-Recognition":
+        processor = processor_g
+        model = model_g
+    elif model_name == "SmolDocling-256M-preview":
+        processor = processor_x
+        model = model_x
+    elif model_name == "Typhoon-OCR-7B":
+        processor = processor_l
+        model = model_l
+    elif model_name == "Thyme-RL":
+        processor = processor_n
+        model = model_n
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
+        return
+    if image is None:
+        yield "Please upload an image.", "Please upload an image."
+        return
+    images = [image]
+    if model_name == "SmolDocling-256M-preview":
+        if "OTSL" in text or "code" in text:
+            images = [add_random_padding(img) for img in images]
+        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+            text = normalize_values(text, target_max=500)
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in images] + [
+                {"type": "text", "text": text}
+            ]
+        }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text.replace("<|im_end|>", "")
+        yield buffer, buffer
+    if model_name == "SmolDocling-256M-preview":
+        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
+            yield buffer, markdown_output
+        else:
+            yield buffer, cleaned_output
+@spaces.GPU
+def generate_video(model_name: str, text: str, video_path: str,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """Generate responses for video input using the selected model."""
     if model_name == "Nanonets-OCR-s":
+        processor = processor_m
+        model = model_m
     elif model_name == "MonkeyOCR-Recognition":
+        processor = processor_g
+        model = model_g
     elif model_name == "SmolDocling-256M-preview":
+        processor = processor_x
+        model = model_x
     elif model_name == "Typhoon-OCR-7B":
+        processor = processor_l
+        model = model_l
     elif model_name == "Thyme-RL":
+        processor = processor_n
+        model = model_n
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
+    if video_path is None:
+        yield "Please upload a video.", "Please upload a video."
         return
+    frames = downsample_video(video_path)
+    images = [frame for frame, _ in frames]
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             text = normalize_values(text, target_max=500)
     messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in images] + [
+                {"type": "text", "text": text}
+            ]
+        }
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
     buffer = ""
     for new_text in streamer:
+        buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
     if model_name == "SmolDocling-256M-preview":
+        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
+        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+            if "<chart>" in cleaned_output:
+                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+            markdown_output = doc.export_to_markdown()
+            yield buffer, markdown_output
+        else:
+            yield buffer, cleaned_output
+# Define examples for image and video inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["Convert chart to OTSL.", "images/4.png"],
     ["Convert code to text", "images/5.jpg"],
     ["Convert this table to OTSL.", "images/6.jpg"],
+    ["Convert formula to late.", "images/7.jpg"],
 ]
 video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
+#css
 css = """
 .submit-btn {
+    background-color: #2980b9 !important;
+    color: white !important;
 }
 .submit-btn:hover {
+    background-color: #3498db !important;
 }
+.canvas-output {
+    border: 2px solid #4682B4;
+    border-radius: 10px;
+    padding: 20px;
 }
 """
+# Create the Gradio Interface
+with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Image Inference"):
+                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    image_upload = gr.Image(type="pil", label="Image", height=290)
+                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=image_examples,
+                        inputs=[image_query, image_upload]
+                    )
+                with gr.TabItem("Video Inference"):
+                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    video_upload = gr.Video(label="Video", height=290)
+                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=video_examples,
+                        inputs=[video_query, video_upload]
                     )
+            with gr.Accordion("Advanced options", open=False):
+                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        with gr.Column():
+            with gr.Column(elem_classes="canvas-output"):
+                gr.Markdown("## Output")
+                raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
+                with gr.Accordion("(Result.md)", open=False):
+                    formatted_output = gr.Markdown(label="(Result.md)")
+            model_choice = gr.Radio(
+                choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+                label="Select Model",
+                value="Nanonets-OCR-s"
+            )
+            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
+            gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
+            gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
+            gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
+            gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
+            gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
+            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+    image_submit.click(
+        fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[raw_output, formatted_output]
     )
+    video_submit.click(
+        fn=generate_video,
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[raw_output,
+                 formatted_output]
     )
 if __name__ == "__main__":
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)