"""Gradio demo that asks every prompt about every uploaded image with Moondream 3."""

import re
from threading import Thread

# NOTE(review): `spaces` is kept ahead of `torch`, as in the original import
# order; ZeroGPU Spaces appear to require this -- confirm before reordering.
import spaces
import torch
import gradio as gr
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Loaded once at import time so every request reuses the same weights.
moondream = AutoModelForCausalLM.from_pretrained(
    "moondream/moondream3-preview",
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
moondream.compile()


def _split_prompts(prompt_text):
    """Return the stripped, non-empty prompts from a '?'-separated string."""
    if not prompt_text:
        return []
    stripped = (piece.strip() for piece in prompt_text.split("?"))
    # A trailing '?' produces an empty last piece; never send that to the model.
    return [piece for piece in stripped if piece]


def _collect_images(image_tuples):
    """Return RGB versions of the gallery images, skipping empty entries."""
    # The gallery hands over (image, caption) pairs, or None when nothing
    # has been uploaded yet.
    return [
        entry[0].convert("RGB")
        for entry in image_tuples or []
        if entry[0] is not None
    ]


@spaces.GPU  # Requests a GPU on ZeroGPU Spaces; documented as a no-op elsewhere.
def answer_questions(image_tuples, prompt_text):
    """Ask every prompt about every uploaded image.

    Args:
        image_tuples: Gallery value, a list of ``(PIL image, caption)`` pairs,
            or ``None`` when the gallery is empty.
        prompt_text: One or more prompts separated by question marks.

    Returns:
        A ``(markdown, table)`` pair. ``markdown`` lists each prompt followed
        by the answer for every image. ``table`` is a dict with ``"headers"``
        (the prompts) and ``"data"`` (one row per image, one column per
        prompt) for the Dataframe component.
    """
    prompts = _split_prompts(prompt_text)
    images = _collect_images(image_tuples)
    if not prompts or not images:
        return (
            "Please upload at least one image and enter at least one prompt.",
            {"headers": prompts, "data": []},
        )

    # rows[j][i] holds the answer for image j and prompt i.
    rows = []
    for image in images:
        # Encode once per image and reuse the encoding for all prompts.
        encoded = moondream.encode_image(image)
        rows.append(
            [
                moondream.query(image=encoded, question=prompt, reasoning=False)["answer"]
                for prompt in prompts
            ]
        )

    sections = []
    for i, prompt in enumerate(prompts):
        sections.append(f"### Q: {prompt}\n")
        for j, row in enumerate(rows, start=1):
            sections.append(f"**image{j} A:** \n {row[i]} \n")
    q_and_a = "".join(sections)

    result = {"headers": prompts, "data": rows}
    print(f"result\n{result}\n\nQ_and_A\n{q_and_a}\n\n")
    return q_and_a, result


with gr.Blocks() as demo:
    gr.Markdown("# moondream3 unofficial batch processing demo")
    gr.Markdown(
        "1. Select images\n"
        "2. Enter one or more prompts separated by question marks. "
        "Ex: Describe this image? What is in this image?\n\n"
    )
    gr.Markdown("**Each image is encoded once and every prompt is asked about every image**")
    gr.Markdown(
        "A vision language model. "
        "[moondream3-preview](https://huggingface.co/moondream/moondream3-preview)"
    )
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
    with gr.Row():
        prompt = gr.Textbox(
            label="Input Prompts",
            placeholder=(
                "Enter prompts separated by question marks; every prompt is asked "
                "about every image. Ex: Describe this image? What is in this image?"
            ),
            lines=8,
        )
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        output = gr.Markdown(label="Questions and Answers", line_breaks=True)
    with gr.Row():
        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

demo.queue().launch()