Spaces:

MCP-1st-Birthday
/

VisionPro

Running

App Files Files Community

subhash4face committed 12 days ago

Commit

31ba90a

verified ·

1 Parent(s): 4df429e

fixed version

Browse files

Files changed (1) hide show

app.py +248 -88

app.py CHANGED Viewed

@@ -28,8 +28,15 @@ GOOGLE_GEMINI_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
 if OPENAI_API_KEY and OPENAI_AVAILABLE:
     openai.api_key = OPENAI_API_KEY
 # -----------------------------
-# Minimal MCP Server shim (unchanged)
 # -----------------------------
 class ToolResult(BaseModel):
     content: str
@@ -66,10 +73,11 @@ class MCPServer:
 server = MCPServer("accessibility_voice_mcp")
 # -----------------------------
-# Utilities: STT, TTS, Image describe (kept minimal for portability)
 # -----------------------------
 def transcribe_with_openai(audio_file_path: str) -> str:
     if not OPENAI_AVAILABLE:
         return "OpenAI library not available"
     try:
@@ -81,7 +89,9 @@ def transcribe_with_openai(audio_file_path: str) -> str:
     except Exception as e:
         return f"OpenAI transcription error: {e}"
 def transcribe_fallback(audio_file_path: str) -> str:
     try:
         import whisper
         model = whisper.load_model("small")
@@ -90,59 +100,152 @@ def transcribe_fallback(audio_file_path: str) -> str:
     except Exception as e:
         return f"Local transcription fallback failed: {e}"
 def tts_elevenlabs(text: str) -> bytes:
     if not ELEVENLABS_API_KEY:
         raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
     import requests
-    ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL")
-    ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
     url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
-    headers = {"xi-api-key": ELEVENLABS_API_KEY, "Content-Type": "application/json"}
-    payload = {"text": text, "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}}
     resp = requests.post(url, headers=headers, json=payload, stream=True)
     if resp.status_code != 200:
         raise RuntimeError(f"ElevenLabs TTS failed: {resp.status_code} {resp.text}")
     return resp.content
 def describe_image_hf(image_path: str) -> str:
     try:
         import requests
-        HF_INFERENCE_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
-        HF_INFERENCE_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
-        if not HF_INFERENCE_TOKEN:
             return "HUGGINGFACE_API_TOKEN not set"
         with open(image_path, "rb") as f:
             image_bytes = f.read()
-        headers = {"Authorization": f"Bearer {HF_INFERENCE_TOKEN}"}
         resp = requests.post(HF_INFERENCE_URL, headers=headers, data=image_bytes)
         if resp.status_code != 200:
             return f"HF Inference error: {resp.status_code} {resp.text}"
-        j = resp.json()
-        if isinstance(j, list) and j and 'generated_text' in j[0]:
-            return j[0]['generated_text']
-        if isinstance(j, dict) and 'generated_text' in j:
-            return j['generated_text']
-        return str(j)
     except Exception as e:
         return f"HF describe error: {e}"
 # -----------------------------
-# MCP Tools (unchanged interface)
 # -----------------------------
 @server.tool(name="speak_text", description="Convert text to speech using ElevenLabs")
 def speak_text_tool(text: str) -> ToolResult:
     try:
         audio_bytes = tts_elevenlabs(text)
         encoded = base64.b64encode(audio_bytes).decode("utf-8")
-        return ToolResult(content=encoded, meta={"format": "base64-audio", "backend":"elevenlabs"})
     except Exception as e:
-        return ToolResult(content=f"TTS Error: {e}", meta={"backend":"elevenlabs"})
 @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
 def describe_image_tool(image_path: str) -> ToolResult:
-    # Try HF as conservative default (keeps the demo working without OpenAI)
     desc = describe_image_hf(image_path)
-    return ToolResult(content=desc, meta={"backend":"huggingface"})
 @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
 def transcribe_audio_tool(audio_path: str) -> ToolResult:
@@ -157,29 +260,75 @@ def transcribe_audio_tool(audio_path: str) -> ToolResult:
         return ToolResult(content=text, meta={"backend":"local_whisper","duration":duration})
 # -----------------------------
-# UI: improved UX and always-visible tools
 # -----------------------------
 def decode_base64_audio(b64: str) -> bytes:
     return base64.b64decode(b64)
-def format_tool_log(tool_name, reason, meta, output, style="B"):
     backend = meta.get("backend") if meta else "unknown"
     duration = meta.get("duration") if meta else None
-    lines = [
-        f"🔧 Tool: {tool_name}",
-        f"🎯 Reason: {reason}",
-        f"⚙️ Backend: {backend}",
-    ]
-    if duration is not None:
-        try:
-            lines.append(f"⏱ Duration: {float(duration):.2f}s")
-        except:
-            lines.append(f"⏱ Duration: {duration}")
-    lines.append("📝 Output: " + (str(output)[:1000] if output else ""))
-    return "\n".join(lines)
 def messages_to_tuples(messages):
     tuples = []
     if not messages:
         return tuples
@@ -191,6 +340,7 @@ def messages_to_tuples(messages):
         elif isinstance(m, (list, tuple)) and len(m) == 2:
             tuples.append((m[0], m[1]))
         else:
             tuples.append(("", str(m)))
     return tuples
@@ -203,106 +353,113 @@ def tuples_to_messages(tuples):
             messages.append({"role":"assistant","content":assistant_text})
     return messages
-custom_css = """
-.gradio-container { background: #f7fafc; font-family: Inter, Roboto, Arial; }
-.tool-panel { background: linear-gradient(180deg,#ffffff,#f8fafc); padding:12px; border-radius:10px; box-shadow: 0 6px 18px rgba(15,23,42,0.06); }
-.tool-badge { display:inline-block; padding:6px 10px; border-radius:999px; font-weight:600; margin-right:8px; background:#eff6ff; color:#0369a1; }
-.tool-name { font-weight:700; margin-bottom:6px; display:block; }
-.log-good { background:#ecfdf5; padding:8px; border-radius:8px; }
-.log-warn { background:#fff7ed; padding:8px; border-radius:8px; }
-.chat-wrap { border-radius:12px; padding:8px; background:#ffffff; box-shadow: 0 4px 10px rgba(2,6,23,0.04); }
-"""
-with gr.Blocks(css=custom_css, title="Accessibility Voice Agent (Improved UX)") as demo:
     gr.Markdown("# Accessibility Voice Agent — MCP Tools")
     with gr.Row():
         with gr.Column(scale=3):
             chatbox = gr.Chatbot(label="Assistant", elem_id="chatbox", type="messages")
-            user_input = gr.Textbox(placeholder="Type a message...", show_label=False)
             with gr.Row():
-                # Use Microphone component for broader compatibility and clearer UX
-                mic = gr.Microphone(source="microphone", type="filepath", label="Record voice (press to record)")
                 send_btn = gr.Button("Send")
-            # Always-visible tools area (no accordion)
-            with gr.Box(elem_classes="tool-panel", visible=True):
-                gr.Markdown("### Tools (always visible)")
-                with gr.Row():
-                    with gr.Column(scale=6):
-                        gr.Markdown("<span class='tool-badge'>TTS</span><span class='tool-name'>Speak (ElevenLabs)</span>", elem_id="tts_label")
-                        tts_text = gr.Textbox(label="Text to speak", placeholder="Enter a sentence to synthesize", lines=2)
-                        tts_btn = gr.Button("Speak (TTS)")
-                    with gr.Column(scale=6):
-                        gr.Markdown("<span class='tool-badge'>IMG</span><span class='tool-name'>Describe Image</span>", elem_id="img_label")
-                        img_upload = gr.File(label="Upload image (for description)")
-                        img_btn = gr.Button("Describe Image")
         with gr.Column(scale=2):
             gr.Markdown("### Tool Call Log & Explanations")
-            tools_log = gr.Textbox(value="Ready.", lines=6, interactive=False, label="Tools Summary")
-            tools_panel = gr.HTML("<div id='tools_panel' style='max-height:420px;overflow:auto;'></div>")
             gr.Markdown("---")
-            gr.Markdown("**Each tool run shows its name, backend and short output.**")
     # Callbacks
-    def on_send_text(text, chat_history, mic_file):
         tools_entries = []
         tuples = messages_to_tuples(chat_history)
-        user_text = ""
-        # If mic recorded, prefer that
         if mic_file:
             tr = transcribe_audio_tool(mic_file)
             user_text = tr.content
-            tools_entries.append(format_tool_log("transcribe_audio", "User recorded audio", tr.meta or {}, tr.content))
         else:
             user_text = text or ""
-        # Append to chat and generate a simple assistant reply
-        tuples.append((user_text, "..." ))
         if user_text and user_text.strip().lower().startswith("describe image:"):
             _, _, fname = user_text.partition(":")
             fname = fname.strip()
             if fname:
                 res = describe_image_tool(fname)
                 assistant = res.content
-                tools_entries.append(format_tool_log("describe_image", "User requested image description", res.meta or {}, res.content))
             else:
                 assistant = "Please upload an image using the Describe Image tool or provide a path like: describe image: /path/to/image.jpg"
         else:
             assistant = "I heard: " + (user_text or "(empty)")
         tuples[-1] = (tuples[-1][0], assistant)
         new_messages = tuples_to_messages(tuples)
-        panel_html = ""
-        for e in tools_entries:
-            panel_html += f"<div class='log-good' style='margin-bottom:8px;'><pre>{e}</pre></div>"
-        return new_messages, gr.update(value="\\n\\n".join(tools_entries) or "Ready."), gr.update(value=panel_html)
-    send_btn.click(on_send_text, inputs=[user_input, chatbox, mic], outputs=[chatbox, tools_log, tools_panel])
-    def on_tts(text):
         if not text:
-            return None, gr.update(value="<div class='log-warn'><pre>No text provided</pre></div>")
         res = speak_text_tool(text)
         if res.meta and res.meta.get("format") == "base64-audio":
             audio_bytes = decode_base64_audio(res.content)
-            panel_html = f"<div class='log-good'><pre>{format_tool_log('speak_text','TTS requested', res.meta or {}, '<audio bytes>')}</pre></div>"
             return (audio_bytes, 16000), gr.update(value=panel_html)
         else:
-            panel_html = f"<div class='log-warn'><pre>{format_tool_log('speak_text','TTS requested', res.meta or {}, res.content)}</pre></div>"
             return None, gr.update(value=panel_html)
-    tts_btn.click(on_tts, inputs=[tts_text], outputs=[gr.Audio(label="TTS Output"), tools_panel])
-    def on_describe_image(file_obj):
         if not file_obj:
-            return [], gr.update(value="<div class='log-warn'><pre>No file uploaded</pre></div>")
         path = getattr(file_obj, 'name', None)
         if isinstance(file_obj, dict) and 'tmp_path' in file_obj:
             path = file_obj['tmp_path']
         if not path:
             try:
                 contents = file_obj.read()
                 tmp_path = "/tmp/gr_uploaded_image.jpg"
@@ -310,16 +467,19 @@ with gr.Blocks(css=custom_css, title="Accessibility Voice Agent (Improved UX)")
                     f.write(contents)
                 path = tmp_path
             except Exception as e:
-                return [], gr.update(value=f"<div class='log-warn'><pre>Failed to read uploaded file: {e}</pre></div>")
         res = describe_image_tool(path)
-        panel_html = f"<div class='log-good'><pre>{format_tool_log('describe_image','Uploaded image for description', res.meta or {}, res.content)}</pre></div>"
         messages = [{"role":"user","content":"<image uploaded>"}, {"role":"assistant","content":res.content}]
         return messages, gr.update(value=panel_html)
-    img_btn.click(on_describe_image, inputs=[img_upload], outputs=[chatbox, tools_panel])
-    # Session-only API keys area (kept but collapsed)
     with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
         openai_key = gr.Textbox(label="OpenAI API Key", type="password")
         eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")

 if OPENAI_API_KEY and OPENAI_AVAILABLE:
     openai.api_key = OPENAI_API_KEY
+# ElevenLabs defaults
+ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL")  # placeholder
+ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
+# Hugging Face Inference API endpoint (for image captioning fallback)
+HF_INFERENCE_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
 # -----------------------------
+# Minimal MCP Server shim
 # -----------------------------
 class ToolResult(BaseModel):
     content: str
 server = MCPServer("accessibility_voice_mcp")
 # -----------------------------
+# Utilities: STT, TTS, Image describe
 # -----------------------------
 def transcribe_with_openai(audio_file_path: str) -> str:
+    """Transcribe audio using OpenAI Whisper (if available)."""
     if not OPENAI_AVAILABLE:
         return "OpenAI library not available"
     try:
     except Exception as e:
         return f"OpenAI transcription error: {e}"
 def transcribe_fallback(audio_file_path: str) -> str:
+    """Fallback: invoke whisper from local package (if installed)."""
     try:
         import whisper
         model = whisper.load_model("small")
     except Exception as e:
         return f"Local transcription fallback failed: {e}"
 def tts_elevenlabs(text: str) -> bytes:
+    """Call ElevenLabs API to synthesize speech. Returns raw audio bytes."""
     if not ELEVENLABS_API_KEY:
         raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
     import requests
     url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
+    headers = {
+        "xi-api-key": ELEVENLABS_API_KEY,
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "text": text,
+        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}
+    }
     resp = requests.post(url, headers=headers, json=payload, stream=True)
     if resp.status_code != 200:
         raise RuntimeError(f"ElevenLabs TTS failed: {resp.status_code} {resp.text}")
     return resp.content
 def describe_image_hf(image_path: str) -> str:
+    """Describe an image using Hugging Face Inference API (BLIP model hosted)."""
     try:
         import requests
+        if not HUGGINGFACE_API_TOKEN:
             return "HUGGINGFACE_API_TOKEN not set"
         with open(image_path, "rb") as f:
             image_bytes = f.read()
+        headers = {
+            "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"
+        }
+        # The HF Inference API accepts files as binary
         resp = requests.post(HF_INFERENCE_URL, headers=headers, data=image_bytes)
         if resp.status_code != 200:
             return f"HF Inference error: {resp.status_code} {resp.text}"
+        # Model returns JSON with 'generated_text' or a simple string depending on model
+        try:
+            j = resp.json()
+            # Some endpoints return [{'generated_text': '...'}]
+            if isinstance(j, list) and j and 'generated_text' in j[0]:
+                return j[0]['generated_text']
+            if isinstance(j, dict) and 'generated_text' in j:
+                return j['generated_text']
+            # Otherwise return text
+            return str(j)
+        except Exception:
+            return resp.text
     except Exception as e:
         return f"HF describe error: {e}"
def describe_image_openai(image_path: str) -> str:
    """Describe an image with an OpenAI vision-capable chat model.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The model's description. On failure, a message starting with
        "OpenAI image describe error" (callers test for that prefix), or
        "OpenAI not available for image captioning" when the SDK is missing.
        Never raises.
    """
    if not OPENAI_AVAILABLE:
        return "OpenAI not available for image captioning"
    try:
        import mimetypes

        with open(image_path, "rb") as f:
            b64_image = base64.b64encode(f.read()).decode("utf-8")

        # Label the data URL with the real image type instead of always
        # claiming JPEG; fall back to JPEG when the extension is unknown.
        guessed_type = mimetypes.guess_type(image_path)[0]
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/jpeg"

        # Image input needs the client-style SDK. The previous fallback for
        # legacy SDKs pasted the whole base64 payload into the prompt text,
        # which cannot be interpreted as an image and wastes the token budget,
        # so report the limitation instead.
        if not hasattr(openai, "OpenAI"):
            return "OpenAI image describe error: installed openai SDK does not support image input"

        prompt = (
            "You are an accessibility assistant that describes images for visually impaired users. "
            "Provide a clear, helpful, vivid, human-friendly description of the image.\n"
        )
        client = openai.OpenAI()
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You describe images for visually impaired users."},
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        # The API expects an object with a "url" key here,
                        # not a bare string.
                        "image_url": {"url": f"data:{mime_type};base64,{b64_image}"},
                    },
                ]},
            ],
            max_tokens=300,
        )
        # content may be None (e.g. a refusal); treat that as an empty answer
        return (response.choices[0].message.content or "").strip()
    except Exception as e:
        return f"OpenAI image describe error: {e}"
 # -----------------------------
+# MCP Tools
 # -----------------------------
 @server.tool(name="speak_text", description="Convert text to speech using ElevenLabs")
 def speak_text_tool(text: str) -> ToolResult:
     try:
         audio_bytes = tts_elevenlabs(text)
         encoded = base64.b64encode(audio_bytes).decode("utf-8")
+        return ToolResult(content=encoded, meta={"format": "base64-audio"})
     except Exception as e:
+        return ToolResult(content=f"TTS Error: {e}")
 @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
 def describe_image_tool(image_path: str) -> ToolResult:
+    # Priority: OpenAI -> Gemini -> Hugging Face Inference -> error
+    if OPENAI_AVAILABLE:
+        desc = describe_image_openai(image_path)
+        if desc and not desc.startswith("OpenAI image describe error"):
+            return ToolResult(content=desc, meta={"backend":"openai"})
+    # Gemini (if configured)
+    if GOOGLE_GEMINI_API_KEY:
+        try:
+            import google.generativeai as genai
+            genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
+            model = genai.GenerativeModel("gemini-1.5-flash")
+            with open(image_path, "rb") as f:
+                image_bytes = f.read()
+            response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
+            return ToolResult(content=response.text, meta={"backend":"gemini"})
+        except Exception:
+            pass
+    # Hugging Face Inference
     desc = describe_image_hf(image_path)
+    if desc:
+        return ToolResult(content=desc, meta={"backend":"huggingface"})
+    return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY, GOOGLE_GEMINI_API_KEY, or HUGGINGFACE_API_TOKEN.")
 @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
 def transcribe_audio_tool(audio_path: str) -> ToolResult:
         return ToolResult(content=text, meta={"backend":"local_whisper","duration":duration})
 # -----------------------------
+# Gradio UI (client)
 # -----------------------------
 def decode_base64_audio(b64: str) -> bytes:
+    """Decode a base64 string (as produced by speak_text_tool) back to raw bytes."""
     return base64.b64decode(b64)
+# Colour hues for the UI theme.
+# NOTE(review): nothing in the visible part of this diff reads app_theme
+# (gr.Blocks is created with only a css argument) — confirm it is used.
+app_theme = {
+    "primary_hue": "blue",
+    "secondary_hue": "slate",
+}
+# Helper to format tool-call explanations
def _format_duration(duration):
    """Render a duration as seconds with two decimals, or verbatim if non-numeric."""
    try:
        return f"{float(duration):.2f}s"
    except (TypeError, ValueError):
        return str(duration)


def format_tool_log(tool_name, reason, meta, output, style="A"):
    """Format one tool call for the tools log.

    Args:
        tool_name: Name of the tool that ran.
        reason: Why the tool was invoked (shown by styles B, C and D).
        meta: Optional mapping; "backend" and "duration" are read from it.
            A missing or empty backend is shown as "unknown".
        output: Tool output; converted with ``str``.
        style: "A" one-line summary (output cut to 200 characters),
            "B" detailed multi-line text, "C" compact one-line header plus
            the output on a second line. Any other value (the UI uses "D")
            yields both the simple and the detailed form.

    Returns:
        A string for styles "A", "B" and "C"; otherwise a dict with the keys
        "simple" and "detailed".
    """
    meta = meta or {}
    backend = meta.get("backend") or "unknown"
    duration = meta.get("duration")
    text = str(output)
    simple = f"[{tool_name}] {backend} -> {text[:200]}"

    if style == "A":
        return simple

    if style == "C":
        parts = [
            f"🔧 {tool_name}",
            f"• Reason: {reason}",
            f"• Backend: {backend}",
        ]
        if duration is not None:
            parts.append(f"• {_format_duration(duration)}")
        return " ".join(parts) + "\n" + f"→ {text}"

    # Style B, which is also the "detailed" half of the combined form.
    lines = [
        f"🔧 Tool: {tool_name}",
        f"🎯 Why: {reason}",
        f"⚙️ Backend: {backend}",
    ]
    if duration is not None:
        lines.append(f"⏱ Duration: {_format_duration(duration)}")
    lines.append(f"📝 Output: {text}")
    detailed = "\n".join(lines)

    if style == "B":
        return detailed
    return {"simple": simple, "detailed": detailed}
+# Conversion helpers for chat history between 'messages' (gradio new) and tuple list used in logic
 def messages_to_tuples(messages):
+    # messages is a list of dicts {"role": "user"/"assistant", "content": "..."}
     tuples = []
     if not messages:
         return tuples
         elif isinstance(m, (list, tuple)) and len(m) == 2:
             tuples.append((m[0], m[1]))
         else:
+            # fallback: treat as assistant reply
             tuples.append(("", str(m)))
     return tuples
             messages.append({"role":"assistant","content":assistant_text})
     return messages
+with gr.Blocks(css=".gradio-container {background:#f7fafc}") as demo:
     gr.Markdown("# Accessibility Voice Agent — MCP Tools")
     with gr.Row():
         with gr.Column(scale=3):
+            # Set type='messages' to avoid the deprecation warning, and convert inside handlers.
             chatbox = gr.Chatbot(label="Assistant", elem_id="chatbox", type="messages")
+            user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
             with gr.Row():
+                # Some gradio versions don't accept 'source' kw; remove it to be broadly compatible.
+                mic = gr.Audio(type="filepath", label="Record voice (press to record)")
                 send_btn = gr.Button("Send")
+            with gr.Accordion("Advanced / Tools", open=False):
+                tts_text = gr.Textbox(label="Text to speak (ElevenLabs)")
+                tts_btn = gr.Button("Speak (TTS)")
+                img_upload = gr.File(label="Upload image (for description)")
+                img_btn = gr.Button("Describe image")
         with gr.Column(scale=2):
             gr.Markdown("### Tool Call Log & Explanations")
+            log_style = gr.Radio(choices=["A","B","C","D"], value="B", label="Log style (A:Simple B:Detailed C:Visual D:Both)")
+            tools_log = gr.Textbox(value="Ready.", lines=20, interactive=False, label="Tools Log")
+            tools_panel = gr.HTML("<div id='tools_panel' style='max-height:400px;overflow:auto;background:#ffffff;padding:8px;border-radius:8px;'></div>")
             gr.Markdown("---")
+            gr.Markdown("**Tool explanations appear here each time a tool runs.**")
     # Callbacks
+    def on_send_text(text, chat_history, mic_file, style):
         tools_entries = []
+        # convert incoming chat_history (messages) into tuples for internal logic
         tuples = messages_to_tuples(chat_history)
         if mic_file:
+            # transcribe audio
             tr = transcribe_audio_tool(mic_file)
             user_text = tr.content
+            log = format_tool_log("transcribe_audio", "User provided microphone audio", tr.meta or {}, tr.content, style)
+            tools_entries.append(log)
         else:
             user_text = text or ""
+        # Append user message to tuples and placeholder assistant
+        tuples.append((user_text, "..."))
+        # demo assistant behavior
         if user_text and user_text.strip().lower().startswith("describe image:"):
+            # expects: "describe image: filename"
             _, _, fname = user_text.partition(":")
             fname = fname.strip()
             if fname:
+                # We assume the image was uploaded earlier and path provided
                 res = describe_image_tool(fname)
                 assistant = res.content
+                log = format_tool_log("describe_image", "User requested image description", res.meta or {}, res.content, style)
+                tools_entries.append(log)
             else:
                 assistant = "Please upload an image using the Describe Image tool or provide a path like: describe image: /path/to/image.jpg"
         else:
             assistant = "I heard: " + (user_text or "(empty)")
+        # replace placeholder assistant
         tuples[-1] = (tuples[-1][0], assistant)
+        # update tools panel content
+        panel_html = ''
+        if isinstance(log, dict):
+            # D style returns dict
+            panel_html += f"<pre>{log['detailed']}</pre>"
+            panel_html += f"<hr><pre>{log['simple']}</pre>"
+        else:
+            for e in tools_entries:
+                panel_html += f"<pre style='background:#f1f5f9;border-radius:6px;padding:8px;margin-bottom:8px;'>{e}</pre>"
+        # convert back to messages for gr.Chatbot
         new_messages = tuples_to_messages(tuples)
+        return new_messages, gr.update(value="\n".join(tools_entries) or "Ready."), gr.update(value=panel_html)
+    send_btn.click(on_send_text, inputs=[user_input, chatbox, mic, log_style], outputs=[chatbox, tools_log, tools_panel])
+    def on_tts(text, style):
         if not text:
+            return None, gr.update(value="No text provided")
         res = speak_text_tool(text)
         if res.meta and res.meta.get("format") == "base64-audio":
             audio_bytes = decode_base64_audio(res.content)
+            log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, "<audio bytes>", style)
+            panel_html = f"<pre style='background:#eef2ff;padding:8px;border-radius:6px;'>{log}</pre>"
             return (audio_bytes, 16000), gr.update(value=panel_html)
         else:
+            log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, res.content, style)
+            panel_html = f"<pre style='background:#fee2e2;padding:8px;border-radius:6px;'>{log}</pre>"
             return None, gr.update(value=panel_html)
+    tts_btn.click(on_tts, inputs=[tts_text, log_style], outputs=[gr.Audio(label="TTS Output"), tools_panel])
+    def on_describe_image(file_obj, style):
         if not file_obj:
+            return [], gr.update(value="No file uploaded")
+        # file_obj may be an UploadFile-like object; get path or save to tmp file
         path = getattr(file_obj, 'name', None)
+        # If it's a temporary file dict (from gr.File), it might be a dict with 'name' and 'tmp_path'
         if isinstance(file_obj, dict) and 'tmp_path' in file_obj:
             path = file_obj['tmp_path']
         if not path:
+            # try to save bytes
             try:
                 contents = file_obj.read()
                 tmp_path = "/tmp/gr_uploaded_image.jpg"
                     f.write(contents)
                 path = tmp_path
             except Exception as e:
+                return [], gr.update(value=f"Failed to read uploaded file: {e}")
         res = describe_image_tool(path)
+        log = format_tool_log("describe_image", "User uploaded an image for description", res.meta or {}, res.content, style)
+        panel_html = f"<pre style='background:#ecfdf5;padding:8px;border-radius:6px;'>{log}</pre>"
+        # Return as messages for chatbox
         messages = [{"role":"user","content":"<image uploaded>"}, {"role":"assistant","content":res.content}]
         return messages, gr.update(value=panel_html)
+    img_btn.click(on_describe_image, inputs=[img_upload, log_style], outputs=[chatbox, tools_panel])
+    # API Keys accordion (session-only)
     with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
         openai_key = gr.Textbox(label="OpenAI API Key", type="password")
         eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")