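"""Unofficial moondream batch-processing demo for Hugging Face Spaces.

Upload a gallery of images and enter one or more prompts separated by question
marks; every prompt is asked of every image, and the answers are rendered both
as Markdown and as a dataframe.
"""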
import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize

moondream = AutoModelForCausalLM.from_pretrained(
    "moondream/moondream3-preview",
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
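# Optional warm-up: compiling the model is recommended for faster repeated inference.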
moondream.compile()
tokenizer = AutoTokenizer.from_pretrained("moondream/moondream3-preview")


"""
#model_id = "vikhyatk/moondream2"
#revision = "2025-01-09"

#def load_moondream():
#    Load Moondream model and tokenizer.
#    model = AutoModelForCausalLM.from_pretrained(
#        "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
#    )
#    tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
#    return model, tokenizer

#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
#moondream = AutoModelForCausalLM.from_pretrained(
#    model_id, trust_remote_code=True, revision=revision,
#    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
#)

#moondream.eval()

model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map="cuda", # "cuda" on Nvidia GPUs
)
"""

@spaces.GPU(duration=150)
def answer_questions(image_tuples, prompt_text):
    # Gallery items arrive as (PIL image, caption) tuples; keep only the images.
    images = [img[0] for img in image_tuples if img[0] is not None]

    # Encode the first image once so the encoding can be reused across queries.
    encoded = moondream.encode_image(images[0])

    # Reuse the encoding for multiple queries
    questions = [
        "How many people are in this image?",
        "What time of day was this taken?",
        "What's the weather like?"
    ]

    for q in questions:
        result1 = moondream.query(image=encoded, question=q, reasoning=False)
        print(f"Q: {q}")
        print(f"A: {result1['answer']}\n")

    # Also works with other skills
    caption = moondream.caption(encoded, length="normal")
    objects = moondream.detect(encoded, "poop")
    points = moondream.point(encoded, "grass")
    print(f"caption: {caption}, objects: {objects}, points: {points}")

    # Segment an object
    result2 = moondream.segment(encoded, "cat")
    svg_path = result2["path"]
    bbox = result2["bbox"]

    print(f"SVG Path: {svg_path[:100]}...")
    print(f"Bounding box: {bbox}")

    # With spatial hint (point) to guide segmentation
    result3 = moondream.segment(encoded, "cat", spatial_refs=[[0.5, 0.3]])
    print(result3)
    # With spatial hint (bounding box)
    result3 = moondream.segment(encoded, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
    print(result3)
    
    result = ""
    Q_and_A = ""
    prompts = [p.strip() for p in prompt_text.split('?') if p.strip()]
    image_embeds = images  # reuse the PIL images collected above
    answers = []

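    # Ask each prompt of every image in one batched call per prompt
    # (batch_answer follows moondream2's batch API).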
    for prompt in prompts:
        answers.append(moondream.batch_answer(
            images=[img.convert("RGB") for img in image_embeds],
            prompts=[prompt] * len(image_embeds),
            tokenizer=tokenizer
        ))

    for i, prompt in enumerate(prompts):
        Q_and_A += f"### Q: {prompt}\n"
        for j, image_tuple in enumerate(image_tuples):
            image_name = f"image{j+1}"
            answer_text = answers[i][j]
            Q_and_A += f"**{image_name} A:** \n {answer_text} \n"

    # The dataframe expects one row per image and one column per prompt, so
    # transpose answers (indexed as [prompt][image]) before handing it to Gradio.
    result = {'headers': prompts, 'data': [list(row) for row in zip(*answers)]}
    print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
    return Q_and_A, result

"""
Load Moondream model and tokenizer.
moondream = AutoModelForCausalLM.from_pretrained(
  "vikhyatk/moondream2",
  revision="2025-01-09",
  trust_remote_code=True,
  device_map={"": "cuda"},
)
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
"""


with gr.Blocks() as demo:
    gr.Markdown("# moondream2 unofficial batch processing demo")
    gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")
    gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
    gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompts", placeholder="Enter one or more prompts separated by question marks. Ex: Describe this image? What is in this image?", lines=8)
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        output = gr.Markdown(label="Questions and Answers", line_breaks=True)
    with gr.Row():
        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

demo.queue().launch()