import spaces
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
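# Load the Moondream 3 preview model onto the GPU in bfloat16; trust_remote_code
# pulls in the model's custom code, which provides the skill methods used below
# (query / caption / detect / point / segment).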
moondream = AutoModelForCausalLM.from_pretrained(
    "moondream/moondream3-preview",
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
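# compile() is exposed by the moondream3-preview remote code; it pre-compiles the
# model for faster repeated inference (optional, but recommended in its model card).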
moondream.compile()
tokenizer = AutoTokenizer.from_pretrained("moondream/moondream3-preview")
"""
#model_id = "vikhyatk/moondream2"
#revision = "2025-01-09"
#def load_moondream():
# Load Moondream model and tokenizer.
# model = AutoModelForCausalLM.from_pretrained(
# "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
# )
# tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
# return model, tokenizer
#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
#moondream = AutoModelForCausalLM.from_pretrained(
# model_id, trust_remote_code=True, revision=revision,
# torch_dtype=torch.bfloat16, device_map={"": "cuda"},
#)
#moondream.eval()
model = AutoModelForCausalLM.from_pretrained(
"vikhyatk/moondream2",
trust_remote_code=True,
dtype=torch.bfloat16,
device_map="cuda", # "cuda" on Nvidia GPUs
)
"""
@spaces.GPU(duration=150)
def answer_questions(image_tuples, prompt_text):
    # --- moondream3 skill demo: runs against the first uploaded image ---
    # Gallery items arrive as (image, caption) tuples; keep only the PIL images
    images = [img[0] for img in image_tuples if img[0] is not None]
    # Encode the image once so the encoding can be reused across queries
    encoded = moondream.encode_image(images[0])
    # Reuse the encoding for multiple queries
    questions = [
        "How many people are in this image?",
        "What time of day was this taken?",
        "What's the weather like?",
    ]
    for q in questions:
        result1 = moondream.query(image=encoded, question=q, reasoning=False)
        print(f"Q: {q}")
        print(f"A: {result1['answer']}\n")
    # The cached encoding also works with the other skills
    caption = moondream.caption(encoded, length="normal")
    objects = moondream.detect(encoded, "poop")
    points = moondream.point(encoded, "grass")
    print(f"caption: {caption}, objects: {objects}, points: {points}")
    # Segment an object
    result2 = moondream.segment(encoded, "cat")
    svg_path = result2["path"]
    bbox = result2["bbox"]
    print(f"SVG Path: {svg_path[:100]}...")
    print(f"Bounding box: {bbox}")
    # With a spatial hint (point) to guide the segmentation
    result3 = moondream.segment(encoded, "cat", spatial_refs=[[0.5, 0.3]])
    print(result3)
    # With a spatial hint (bounding box)
    result4 = moondream.segment(encoded, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
    print(result4)
result = ""
Q_and_A = ""
prompts = [p.strip() for p in prompt_text.split('?')]
image_embeds = [img[0] for img in image_tuples if img[0] is not None]
answers = []
for prompt in prompts:
answers.append(moondream.batch_answer(
images=[img.convert("RGB") for img in image_embeds],
prompts=[prompt] * len(image_embeds),
tokenizer=tokenizer
))
for i, prompt in enumerate(prompts):
Q_and_A += f"### Q: {prompt}\n"
for j, image_tuple in enumerate(image_tuples):
image_name = f"image{j+1}"
answer_text = answers[i][j]
Q_and_A += f"**{image_name} A:** \n {answer_text} \n"
result = {'headers': prompts, 'data': answers}
print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
return Q_and_A, result
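# Hypothetical local smoke test, bypassing Gradio (the image path is an assumption,
# not part of the demo):
# from PIL import Image
# img = Image.open("example.jpg")
# md, table = answer_questions([(img, None)], "Describe this image? What is in this image?")
# print(md)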
"""
Load Moondream model and tokenizer.
moondream = AutoModelForCausalLM.from_pretrained(
"vikhyatk/moondream2",
revision="2025-01-09",
trust_remote_code=True,
device_map={"": "cuda"},
)
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
"""
with gr.Blocks() as demo:
    gr.Markdown("# moondream2 unofficial batch processing demo")
    gr.Markdown("1. Select images\n2. Enter one or more prompts separated by question marks. Ex: Describe this image? What is in this image?\n\n")
    gr.Markdown("**Each prompt is asked of every image, so the images are processed as one batch per prompt.**")
    gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompts", placeholder="Enter one or more prompts separated by question marks; each prompt is asked of every image. Ex: Describe this image? What is in this image?", lines=8)
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        output = gr.Markdown(label="Questions and Answers", line_breaks=True)
    with gr.Row():
        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

demo.queue().launch()