Csplk committed on
Commit 8a039e2 · verified
1 Parent(s): ed3a44a

Update app.py

Files changed (1)
  1. app.py +4 -43

app.py CHANGED
```diff
@@ -8,7 +8,6 @@ from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
 
 from transformers import AutoModelForCausalLM
-
 moondream = AutoModelForCausalLM.from_pretrained(
     "moondream/moondream3-preview",
     trust_remote_code=True,
```
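For readers following the diff, a self-contained sketch of the loading pattern it converges on. The bf16 dtype is an assumption on my part; `trust_remote_code`, `device_map`, and `compile()` come from the context lines themselves:

```python
# Sketch: load the Moondream 3 preview checkpoint onto a single CUDA GPU.
import torch
from transformers import AutoModelForCausalLM

moondream = AutoModelForCausalLM.from_pretrained(
    "moondream/moondream3-preview",
    trust_remote_code=True,      # model code ships inside the repo
    torch_dtype=torch.bfloat16,  # assumption: dtype is not specified in the diff
    device_map={"": "cuda"},     # place the whole model on one GPU
)
moondream.compile()              # warm-up compile, as in the diff
```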
```diff
@@ -16,49 +15,12 @@ moondream = AutoModelForCausalLM.from_pretrained(
     device_map={"": "cuda"},
 )
 moondream.compile()
-tokenizer = AutoTokenizer.from_pretrained("moondream/moondream3-preview")
-
-
-"""
-#model_id = "vikhyatk/moondream2"
-#revision = "2025-01-09"
-
-#def load_moondream():
-#    Load Moondream model and tokenizer.
-#    model = AutoModelForCausalLM.from_pretrained(
-#        "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
-#    )
-#    tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
-#    return model, tokenizer
 
-#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
-#moondream = AutoModelForCausalLM.from_pretrained(
-#    model_id, trust_remote_code=True, revision=revision,
-#    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
-#)
-
-#moondream.eval()
-
-model = AutoModelForCausalLM.from_pretrained(
-    "vikhyatk/moondream2",
-    trust_remote_code=True,
-    dtype=torch.bfloat16,
-    device_map="cuda",  # "cuda" on Nvidia GPUs
-)
-"""
-
-@spaces.GPU(durtion="150")
 def answer_questions(image_tuples, prompt_text):
     # Encode image once
     image = [img[0] for img in image_tuples if img[0] is not none]
     encoded = moondream.encode_image(image)
-
-    # Reuse the encoding for multiple queries
-    questions = [
-        "How many people are in this image?",
-        "What time of day was this taken?",
-        "What's the weather like?"
-    ]
+    questions = prompt_text
 
     for q in questions:
         result1 = moondream.query(image=encoded, question=q, reasoning=False)
```
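The function body leans on Moondream's encode-once, ask-many pattern. Here is a hedged sketch of that pattern as it is usually written, fixing two things the context lines leave broken: the guard should compare against `None` (lowercase `none` is a NameError), and a raw prompt string must be split into questions, since iterating a string yields characters. It assumes `encode_image` takes one PIL image and `query` returns a dict with an `"answer"` key, as in Moondream's published examples:

```python
def answer_questions(image_tuples, prompt_text):
    # Keep only gallery entries that actually contain an image.
    images = [img[0] for img in image_tuples if img[0] is not None]

    # "a? b? c?" -> ["a?", "b?", "c?"]; a bare string would be
    # iterated character by character.
    questions = [q.strip() + "?" for q in prompt_text.split("?") if q.strip()]

    answers = []
    for img in images:
        encoded = moondream.encode_image(img)  # encode each image once
        answers.append([
            moondream.query(image=encoded, question=q, reasoning=False)["answer"]
            for q in questions                 # reuse the encoding per question
        ])
    return answers
```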
```diff
@@ -85,7 +47,7 @@ def answer_questions(image_tuples, prompt_text):
     # With spatial hint (bounding box)
     result3 = model.segment(image, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
     print(result3)
-
+
     result = ""
     Q_and_A = ""
     prompts = [p.strip() for p in prompt_text.split('?')]
```
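The `spatial_refs` values in the unchanged `segment` call are normalized coordinates. A small sketch of deriving such a box from pixel coordinates, assuming the `[x_min, y_min, x_max, y_max]`-as-fractions convention the 0-1 values suggest:

```python
def to_spatial_ref(box_px, width, height):
    # Convert a pixel-space (left, top, right, bottom) box into the
    # normalized [x_min, y_min, x_max, y_max] form used above.
    left, top, right, bottom = box_px
    return [left / width, top / height, right / width, bottom / height]

# Hypothetical 640x480 frame: this reproduces the diff's example box.
ref = to_spatial_ref((128, 48, 512, 432), 640, 480)  # [0.2, 0.1, 0.8, 0.9]
# result3 = moondream.segment(image, "cat", spatial_refs=[ref])
```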
```diff
@@ -93,10 +55,9 @@ def answer_questions(image_tuples, prompt_text):
     answers = []
 
     for prompt in prompts:
-        answers.append(moondream.batch_answer(
+        answers.append(moondream.query(
             images=[img.convert("RGB") for img in image_embeds],
             prompts=[prompt] * len(image_embeds),
-            tokenizer=tokenizer
         ))
 
     for i, prompt in enumerate(prompts):
```
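The one-word swap above keeps `batch_answer`'s keyword arguments (`images=`, `prompts=`), but the moondream3 `query` seen earlier in the diff takes a single `image` and `question` and no tokenizer. A loop-based equivalent of the removed batch call, under that assumed signature:

```python
def batch_query(model, images, prompt):
    # Sketch: per-image stand-in for moondream2's batch_answer.
    # Assumes query(image=..., question=...) returns {"answer": ...}.
    answers = []
    for img in images:
        out = model.query(image=img.convert("RGB"), question=prompt)
        answers.append(out["answer"])
    return answers

# answers.append(batch_query(moondream, image_embeds, prompt))
```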
```diff
@@ -138,5 +99,5 @@ with gr.Blocks() as demo:
     with gr.Row():
         output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
     submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])
-
+
 demo.queue().launch()
```
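For completeness, a sketch of the Blocks layout the unchanged tail implies. The identifiers (`img`, `prompt`, `submit`, `output`, `output2`) come from the diff; the component types for `img`, `prompt`, `submit`, and `output` are assumptions, since the diff never shows them:

```python
import gradio as gr

with gr.Blocks() as demo:
    img = gr.Gallery(label="Images", type="pil")  # assumed: yields (image, caption) tuples
    prompt = gr.Textbox(label="Questions (separated by '?')")  # assumed
    submit = gr.Button("Submit")                  # assumed
    output = gr.Markdown()                        # assumed text output
    with gr.Row():
        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

demo.queue().launch()
```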