Spaces:

MCP-1st-Birthday
/

VisionPro

Running

File size: 20,150 Bytes

036c510
2f91957
 
 
 
 
ede6a4f
2f91957
 
2904653
2f91957
 
420030c
2f91957
 
 
 
 
 
420030c
 
 
2f91957
 
420030c
ede6a4f
2f91957
 
 
 
31ba90a
 
 
 
 
 
 
420030c
31ba90a
420030c
2f91957
 
 
 
 
 
 
 
 
 
 
 
420030c
 
 
 
2f91957
 
 
420030c
 
 
 
 
 
 
 
 
 
 
 
 
2f91957
 
420030c
31ba90a
420030c
 
2f91957
31ba90a
2f91957
420030c
ede6a4f
 
420030c
 
 
 
ede6a4f
 
420030c
31ba90a
2f91957
31ba90a
2f91957
 
 
 
 
 
420030c
 
31ba90a
2f91957
31ba90a
2f91957
420030c
ede6a4f
2f91957
31ba90a
 
 
 
 
 
 
 
420030c
 
 
 
 
31ba90a
ede6a4f
31ba90a
2f91957
ede6a4f
31ba90a
ede6a4f
2f91957
420030c
31ba90a
 
 
 
ede6a4f
 
 
31ba90a
 
 
 
 
 
 
 
 
 
 
 
2f91957
ede6a4f
2f91957
31ba90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420030c
31ba90a
420030c
 
2f91957
 
420030c
 
31ba90a
2f91957
31ba90a
 
420030c
 
2f91957
31ba90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ede6a4f
31ba90a
 
 
 
2f91957
420030c
 
ede6a4f
2f91957
420030c
ede6a4f
 
420030c
 
ede6a4f
 
420030c
 
31ba90a
420030c
2f91957
 
 
2904653
31ba90a
 
 
 
 
 
 
ede6a4f
 
dbbbc60
31ba90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
036c510
31ba90a
036c510
 
 
 
 
 
 
 
 
 
 
31ba90a
036c510
 
 
 
 
 
 
 
 
 
 
 
31ba90a
2f91957
31ba90a
2f91957
ede6a4f
31ba90a
036c510
31ba90a
125ea2c
2f91957
31ba90a
 
2f91957
 
31ba90a
 
 
 
 
 
2f91957
ede6a4f
 
31ba90a
 
 
ede6a4f
31ba90a
2f91957
420030c
31ba90a
ede6a4f
31ba90a
036c510
2f91957
31ba90a
2f91957
420030c
31ba90a
 
420030c
036c510
 
31ba90a
 
 
 
ede6a4f
31ba90a
420030c
 
 
31ba90a
ede6a4f
 
31ba90a
 
420030c
ede6a4f
420030c
ede6a4f
 
31ba90a
036c510
31ba90a
 
 
 
 
 
 
 
 
 
 
 
036c510
31ba90a
ede6a4f
31ba90a
2f91957
31ba90a
ede6a4f
31ba90a
2f91957
 
420030c
31ba90a
 
ede6a4f
 
31ba90a
 
ede6a4f
2f91957
31ba90a
2f91957
31ba90a
2f91957
31ba90a
 
036c510
31ba90a
036c510
 
 
31ba90a
036c510
 
 
 
 
 
 
31ba90a
036c510
ede6a4f
31ba90a
 
 
 
036c510
 
ede6a4f
31ba90a
ede6a4f
31ba90a
ede6a4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2904653
 
ede6a4f


import os
import io
import json
import asyncio
import base64
import time
from typing import Optional

import gradio as gr
from pydantic import BaseModel

# Optional: use openai if available for transcription and image captioning
try:
    import openai
    OPENAI_AVAILABLE = True
except Exception:
    OPENAI_AVAILABLE = False

# -----------------------------
# Configuration
# -----------------------------
# API credentials are read from the environment once, at import time.
# A variable that is not set yields None.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
GOOGLE_GEMINI_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")

# Hand the key to the openai module only when both the key and the library exist.
if OPENAI_API_KEY and OPENAI_AVAILABLE:
    openai.api_key = OPENAI_API_KEY

# ElevenLabs defaults
# The voice id can be overridden with the ELEVEN_VOICE_ID environment variable.
ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL")  # placeholder
# Base URL; the voice id is appended as a path segment by tts_elevenlabs().
ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"

# Hugging Face Inference API endpoint (for image captioning fallback)
# NOTE(review): the "huggingface.co/proxy/" prefix looks like an artifact of how
# this file was copied; the host is presumably meant to be
# "api-inference.huggingface.co" directly — confirm before relying on it.
HF_INFERENCE_URL = "https://huggingface.co/proxy/api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

# -----------------------------
# Minimal MCP Server shim
# -----------------------------
class ToolResult(BaseModel):
    """Return value shared by the tools in this file and by MCPServer.run_tool."""

    # Main payload as text: a transcript, a caption, base64 audio, or an error message.
    content: str
    # Optional extra details; the tools below use keys such as "backend",
    # "duration" and "format".
    meta: Optional[dict] = None

class MCPServer:
    """Minimal in-process tool registry standing in for an MCP server.

    Tools are plain callables registered under a name with the ``tool``
    decorator and invoked by name through ``run_tool``.
    """

    def __init__(self, name: str, version: str = "0.1.0"):
        """Create an empty registry labelled with ``name`` and ``version``."""
        self.name = name
        self.version = version
        # Maps tool name -> {"fn": callable, "description": str}.
        self.tools = {}

    def tool(self, name: str, description: str = ""):
        """Return a decorator that registers a callable under ``name``.

        The decorated function is returned unchanged, so it stays directly
        callable. Registering the same name twice replaces the earlier entry.
        """
        def decorator(fn):
            self.tools[name] = {
                "fn": fn,
                "description": description,
            }
            return fn
        return decorator

    async def run_tool(self, name: str, *args, **kwargs):
        """Invoke a registered tool and normalise its result to ``ToolResult``.

        Args:
            name: Name the tool was registered under.
            *args, **kwargs: Forwarded to the tool unchanged.

        Returns:
            The tool's own ``ToolResult``, or a new one wrapping ``str(result)``.

        Raises:
            ValueError: If no tool is registered under ``name``.
        """
        import inspect

        tool = self.tools.get(name)
        if tool is None:
            raise ValueError(f"Tool {name} not found")
        res = tool["fn"](*args, **kwargs)
        # Await whatever awaitable comes back instead of inspecting the function
        # object: this also covers functools.partial wrappers and callable
        # objects with an async __call__, which a coroutine-function check misses.
        if inspect.isawaitable(res):
            res = await res
        if isinstance(res, ToolResult):
            return res
        return ToolResult(content=str(res))

server = MCPServer("accessibility_voice_mcp")

# -----------------------------
# Utilities: STT, TTS, Image describe
# -----------------------------

def transcribe_with_openai(audio_file_path: str) -> str:
    """Transcribe audio using OpenAI Whisper (if available).

    Supports both SDK generations: the client-based API when the installed
    ``openai`` package exposes ``OpenAI``, and the module-level
    ``Audio.transcribe`` call otherwise.

    Args:
        audio_file_path: Path to the audio file to upload.

    Returns:
        The transcript text. On any failure this returns a string starting
        with "OpenAI transcription error" instead of raising.
    """
    if not OPENAI_AVAILABLE:
        return "OpenAI library not available"
    try:
        with open(audio_file_path, "rb") as f:
            if hasattr(openai, "OpenAI"):
                # Client-based SDK: the client reads OPENAI_API_KEY from the
                # environment when it is constructed.
                client = openai.OpenAI()
                transcript = client.audio.transcriptions.create(model="whisper-1", file=f)
            else:
                # Older module-level SDK.
                transcript = openai.Audio.transcribe("whisper-1", f)
        if isinstance(transcript, dict):
            return transcript.get("text", "")
        return getattr(transcript, "text", "")
    except Exception as e:
        return f"OpenAI transcription error: {e}"


def transcribe_fallback(audio_file_path: str) -> str:
    """Transcribe audio with a locally installed ``whisper`` package.

    The package is imported lazily so the app still starts without it.
    Returns the transcript text, or a message starting with
    "Local transcription fallback failed" if anything goes wrong
    (including the package being absent).
    """
    try:
        import whisper

        local_model = whisper.load_model("small")
        outcome = local_model.transcribe(audio_file_path)
        transcript = outcome.get("text", "")
    except Exception as err:
        transcript = f"Local transcription fallback failed: {err}"
    return transcript


def tts_elevenlabs(text: str) -> bytes:
    """Call ElevenLabs API to synthesize speech. Returns raw audio bytes.

    Args:
        text: Text to synthesize.

    Returns:
        The response body exactly as returned by the API.

    Raises:
        RuntimeError: If no API key is configured or the API answers with a
            non-200 status.
        requests.RequestException: On network failure or timeout.
    """
    # Prefer the live environment so a key saved while the app is running is
    # honoured; fall back to the value captured at import time.
    api_key = os.environ.get("ELEVENLABS_API_KEY") or ELEVENLABS_API_KEY
    if not api_key:
        raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
    import requests
    url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
    headers = {
        "xi-api-key": api_key,
        "Content-Type": "application/json",
    }
    payload = {
        "text": text,
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75},
    }
    # (connect, read) timeouts: without them a stalled connection would block
    # the UI callback indefinitely. The whole body is read below, so streaming
    # mode is not needed.
    resp = requests.post(url, headers=headers, json=payload, timeout=(10, 60))
    if resp.status_code != 200:
        raise RuntimeError(f"ElevenLabs TTS failed: {resp.status_code} {resp.text}")
    return resp.content


def describe_image_hf(image_path: str) -> str:
    """Describe an image using Hugging Face Inference API (BLIP model hosted).

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The caption text. This function never raises; failures come back as
        strings starting with "HUGGINGFACE_API_TOKEN not set",
        "HF Inference error" or "HF describe error".
    """
    try:
        import requests
        # Prefer the live environment so a token saved while the app is running
        # is honoured; fall back to the value captured at import time.
        token = os.environ.get("HUGGINGFACE_API_TOKEN") or HUGGINGFACE_API_TOKEN
        if not token:
            return "HUGGINGFACE_API_TOKEN not set"
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        headers = {
            "Authorization": f"Bearer {token}"
        }
        # The HF Inference API accepts files as binary.
        # (connect, read) timeouts keep a stalled request from hanging the UI.
        resp = requests.post(HF_INFERENCE_URL, headers=headers, data=image_bytes, timeout=(10, 60))
        if resp.status_code != 200:
            return f"HF Inference error: {resp.status_code} {resp.text}"
        # Model returns JSON with 'generated_text' or a simple string depending on model
        try:
            j = resp.json()
        except ValueError:
            # Body was not JSON: hand back the raw text.
            return resp.text
        # Some endpoints return [{'generated_text': '...'}]
        if isinstance(j, list) and j and isinstance(j[0], dict) and 'generated_text' in j[0]:
            return j[0]['generated_text']
        if isinstance(j, dict) and 'generated_text' in j:
            return j['generated_text']
        # Otherwise return the decoded payload as text
        return str(j)
    except Exception as e:
        return f"HF describe error: {e}"


def describe_image_openai(image_path: str) -> str:
    """Describe an image using an OpenAI vision-capable chat model.

    The image is sent inline as a base64 data URL. The same message payload
    is used for both SDK generations, so the model receives a real image
    part on either path.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The description text. This function never raises; failures come back
        as a string starting with "OpenAI image describe error" (callers match
        on that prefix).
    """
    if not OPENAI_AVAILABLE:
        return "OpenAI not available for image captioning"

    try:
        import mimetypes

        with open(image_path, "rb") as f:
            image_bytes = f.read()
        b64_image = base64.b64encode(image_bytes).decode("utf-8")

        # Label the data URL with the file's real type when the extension
        # reveals one; keep JPEG as the default.
        guessed_type = mimetypes.guess_type(str(image_path))[0]
        mime_type = guessed_type if guessed_type and guessed_type.startswith("image/") else "image/jpeg"

        prompt = (
            "You are an accessibility assistant that describes images for visually impaired users. "
            "Provide a clear, helpful, vivid, human-friendly description of the image.\n"
        )
        messages = [
            {"role": "system", "content": "You describe images for visually impaired users."},
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    # The documented shape of an image part is an object with a "url" key.
                    "image_url": {"url": f"data:{mime_type};base64,{b64_image}"},
                },
            ]},
        ]

        # Pick the call style by what the installed SDK exposes, rather than
        # trying one and retrying the other: a retry on a modern SDK would
        # replace the real error with an "API removed" error.
        if hasattr(openai, "OpenAI"):
            response = openai.OpenAI().chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                max_tokens=300,
            )
        else:
            # NOTE(review): assumes the module-level SDK forwards the
            # structured content list unchanged — confirm if old SDK versions
            # still need to be supported.
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=messages,
                max_tokens=300,
            )
        content = response.choices[0].message.content
        # content may be None (e.g. on a refusal); normalise to an empty string.
        return (content or "").strip()

    except Exception as e:
        return f"OpenAI image describe error: {e}"


# -----------------------------
# MCP Tools
# -----------------------------
@server.tool(name="speak_text", description="Convert text to speech using ElevenLabs")
def speak_text_tool(text: str) -> ToolResult:
    """Synthesize ``text`` and return the audio as base64 text.

    On success the result carries ``meta={"format": "base64-audio"}``.
    On any failure the content is "TTS Error: <reason>" and meta is unset,
    which is how callers tell the two cases apart.
    """
    try:
        payload = base64.b64encode(tts_elevenlabs(text)).decode("utf-8")
        result = ToolResult(content=payload, meta={"format": "base64-audio"})
    except Exception as err:
        result = ToolResult(content=f"TTS Error: {err}")
    return result


@server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
def describe_image_tool(image_path: str) -> ToolResult:
    """Describe an image, trying each configured backend in priority order.

    Priority: OpenAI -> Gemini -> Hugging Face Inference. The first backend
    that yields a real description wins. Failures are collected rather than
    discarded, so the final message can say why nothing worked.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        A ToolResult whose meta["backend"] names the backend that answered,
        or "none" (with meta["errors"]) when every backend failed.
    """
    failures = []

    if OPENAI_AVAILABLE:
        desc = describe_image_openai(image_path)
        if desc and not desc.startswith("OpenAI image describe error"):
            return ToolResult(content=desc, meta={"backend":"openai"})
        failures.append(desc or "OpenAI returned an empty description")

    # Gemini (if configured)
    if GOOGLE_GEMINI_API_KEY:
        try:
            import google.generativeai as genai
            genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
            model = genai.GenerativeModel("gemini-1.5-flash")
            with open(image_path, "rb") as f:
                image_bytes = f.read()
            response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
            return ToolResult(content=response.text, meta={"backend":"gemini"})
        except Exception as e:
            # Best-effort backend: remember the reason and move on.
            failures.append(f"Gemini image describe error: {e}")

    # Hugging Face Inference. describe_image_hf reports failures as strings,
    # so those must not be passed off as captions.
    hf_error_prefixes = ("HUGGINGFACE_API_TOKEN not set", "HF Inference error", "HF describe error")
    desc = describe_image_hf(image_path)
    if desc and not desc.startswith(hf_error_prefixes):
        return ToolResult(content=desc, meta={"backend":"huggingface"})
    if desc:
        failures.append(desc)

    message = "No image captioning backend available. Set OPENAI_API_KEY, GOOGLE_GEMINI_API_KEY, or HUGGINGFACE_API_TOKEN."
    if failures:
        message += " Details: " + " | ".join(failures)
    return ToolResult(content=message, meta={"backend": "none", "errors": failures})


@server.tool(name="transcribe_audio", description="Transcribe user audio to text")
def transcribe_audio_tool(audio_path: str) -> ToolResult:
    """Transcribe an audio file, preferring OpenAI and falling back to local Whisper.

    Local Whisper is used when the ``openai`` package is absent, and also when
    the OpenAI call itself fails (for example, no API key). If both fail, the
    OpenAI error is returned.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ToolResult with the transcript (or error text) as content and
        meta holding "backend" and "duration" (seconds of wall time for the
        whole attempt, including any fallback).
    """
    start = time.time()
    if OPENAI_AVAILABLE:
        backend = "openai"
        text = transcribe_with_openai(audio_path)
        if text.startswith("OpenAI transcription error"):
            local_text = transcribe_fallback(audio_path)
            # Only switch when the local attempt actually produced a transcript.
            if not local_text.startswith("Local transcription fallback failed"):
                backend, text = "local_whisper", local_text
    else:
        backend = "local_whisper"
        text = transcribe_fallback(audio_path)
    duration = time.time() - start
    return ToolResult(content=text, meta={"backend": backend, "duration": duration})

# -----------------------------
# Gradio UI (client)
# -----------------------------

def decode_base64_audio(b64: str) -> bytes:
    """Turn base64 text (as produced by the speak_text tool) back into raw bytes."""
    raw_audio = base64.b64decode(b64)
    return raw_audio

# Colour-hue settings for the UI.
# NOTE(review): nothing in this file as shown reads app_theme (gr.Blocks below
# is built without it) — confirm whether it should be wired in or removed.
app_theme = {
    "primary_hue": "blue",
    "secondary_hue": "slate",
}

# Helper to format tool-call explanations
def format_tool_log(tool_name, reason, meta, output, style="A"):
    """Render one tool invocation as a log entry in the requested style.

    Args:
        tool_name: Name of the tool that ran.
        reason: Short explanation of why it ran (shown in styles B and C).
        meta: Optional dict from the tool result; "backend" and "duration"
            are read from it. ``None`` or a dict without those keys is fine.
        output: Tool output; converted with ``str``. Style A truncates it to
            200 characters.
        style: "A" simple one-liner, "B" detailed multi-line, "C" compact
            visual. Any other value (the UI uses "D") returns both forms.

    Returns:
        A string for styles A, B and C; otherwise a dict with keys
        "simple" and "detailed".
    """
    meta = meta or {}
    # A meta dict without a backend (e.g. the TTS result) reads as "unknown"
    # rather than the literal text "None".
    backend = meta.get("backend") or "unknown"
    duration = meta.get("duration")

    def _duration_text():
        # Numeric durations get two decimals; anything else is shown verbatim.
        try:
            return f"{float(duration):.2f}s"
        except (TypeError, ValueError, OverflowError):
            return f"{duration}"

    simple = f"[{tool_name}] {backend} -> {str(output)[:200]}"

    # ---------------------------
    # Style A: Simple
    # ---------------------------
    if style == "A":
        return simple

    # ---------------------------
    # Style B: Detailed Human-Readable
    # ---------------------------
    if style == "B":
        lines = [
            f"🔧 Tool: {tool_name}",
            f"🎯 Why: {reason}",
            f"⚙️ Backend: {backend}",
        ]
        if duration is not None:
            lines.append(f"⏱ Duration: {_duration_text()}")
        lines.append(f"📝 Output: {str(output)}")
        return "\n".join(lines)

    # ---------------------------
    # Style C: Ultra-visual
    # ---------------------------
    if style == "C":
        parts = [
            f"🔧 {tool_name}",
            f"• Reason: {reason}",
            f"• Backend: {backend}",
        ]
        if duration is not None:
            parts.append(f"• {_duration_text()}")
        return " ".join(parts) + "\n" + f"→ {str(output)}"

    # ---------------------------
    # Style D: Both Simple + Detailed
    # ---------------------------
    return {
        "simple": simple,
        "detailed": format_tool_log(tool_name, reason, meta, output, style="B"),
    }

# Conversion helpers for chat history between 'messages' (gradio new) and tuple list used in logic
def messages_to_tuples(messages):
    """Convert Gradio "messages" chat history into (user, assistant) pairs.

    Each message dict becomes its own pair with the other side left empty:
    a "user" message gives ``(content, "")`` and any other role gives
    ``("", content)``. A missing role counts as "user" and missing content
    as "". Two-item lists or tuples are passed through as pairs, and anything
    else is stringified and treated as an assistant reply. A falsy input
    yields an empty list.
    """
    if not messages:
        return []

    pairs = []
    for item in messages:
        if isinstance(item, dict):
            text = item.get("content", "")
            if item.get("role", "user") == "user":
                pairs.append((text, ""))
            else:
                pairs.append(("", text))
            continue
        if isinstance(item, (list, tuple)) and len(item) == 2:
            pairs.append((item[0], item[1]))
            continue
        # Unknown shape: keep it visible as an assistant reply.
        pairs.append(("", str(item)))
    return pairs

def tuples_to_messages(tuples):
    """Convert (user, assistant) pairs back into Gradio "messages" dicts.

    For each pair the user text is emitted first, then the assistant text.
    Falsy entries (empty string, None) are skipped, so a pair may contribute
    zero, one or two messages.
    """
    messages = []
    for user_text, assistant_text in tuples:
        for speaker, text in (("user", user_text), ("assistant", assistant_text)):
            if text:
                messages.append({"role": speaker, "content": text})
    return messages

# Build the UI. Components created inside this context belong to `demo`;
# the event handlers further down refer to them by the names bound here.
with gr.Blocks(css=".gradio-container {background:#f7fafc}") as demo:
    gr.Markdown("# Accessibility Voice Agent — MCP Tools")

    with gr.Row():
        # Left column: conversation, text/voice input and the manual tool controls.
        with gr.Column(scale=3):
            # Set type='messages' to avoid the deprecation warning, and convert inside handlers.
            chatbox = gr.Chatbot(label="Assistant", elem_id="chatbox", type="messages")
            user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)

            with gr.Row():
                # Some gradio versions don't accept 'source' kw; remove it to be broadly compatible.
                # type="filepath" hands the handler a path to the audio file.
                mic = gr.Audio(type="filepath", label="Record voice (press to record)")
                send_btn = gr.Button("Send")

            # Direct access to the TTS and image-description tools.
            with gr.Accordion("Advanced / Tools", open=False):
                tts_text = gr.Textbox(label="Text to speak (ElevenLabs)")
                tts_btn = gr.Button("Speak (TTS)")

                img_upload = gr.File(label="Upload image (for description)")
                img_btn = gr.Button("Describe image")

        # Right column: log-style selector plus the text and HTML views of tool activity.
        with gr.Column(scale=2):
            gr.Markdown("### Tool Call Log & Explanations")
            # The selected value is passed to format_tool_log as its `style` argument.
            log_style = gr.Radio(choices=["A","B","C","D"], value="B", label="Log style (A:Simple B:Detailed C:Visual D:Both)")
            tools_log = gr.Textbox(value="Ready.", lines=20, interactive=False, label="Tools Log")
            tools_panel = gr.HTML("<div id='tools_panel' style='max-height:400px;overflow:auto;background:#ffffff;padding:8px;border-radius:8px;'></div>")
            gr.Markdown("---")
            gr.Markdown("**Tool explanations appear here each time a tool runs.**")

    # Callbacks
    def on_send_text(text, chat_history, mic_file, style):
        """Handle the Send button: build the next chat turn and the tool log.

        Args:
            text: Contents of the text box (may be empty or None).
            chat_history: Current Chatbot value in "messages" format.
            mic_file: Path of a recorded audio file, or a falsy value. When
                present it is transcribed and used instead of ``text``.
            style: Log style selector passed through to ``format_tool_log``.

        Returns:
            A 3-tuple: updated chat messages, an update for the plain-text
            tools log, and an update for the HTML tools panel.
        """
        import html

        tools_entries = []
        # convert incoming chat_history (messages) into tuples for internal logic
        tuples = messages_to_tuples(chat_history)
        if mic_file:
            # transcribe audio
            tr = transcribe_audio_tool(mic_file)
            user_text = tr.content
            tools_entries.append(
                format_tool_log("transcribe_audio", "User provided microphone audio", tr.meta or {}, tr.content, style)
            )
        else:
            user_text = text or ""

        # demo assistant behavior
        if user_text and user_text.strip().lower().startswith("describe image:"):
            # expects: "describe image: filename"
            _, _, fname = user_text.partition(":")
            fname = fname.strip()
            if fname:
                # We assume the image was uploaded earlier and path provided.
                # NOTE(review): the path comes straight from user input, so any
                # file readable by the server can be sent to the captioning
                # backend — consider restricting it to uploaded files.
                res = describe_image_tool(fname)
                assistant = res.content
                tools_entries.append(
                    format_tool_log("describe_image", "User requested image description", res.meta or {}, res.content, style)
                )
            else:
                assistant = "Please upload an image using the Describe Image tool or provide a path like: describe image: /path/to/image.jpg"
        else:
            assistant = "I heard: " + (user_text or "(empty)")

        tuples.append((user_text, assistant))

        # Build both log views from the collected entries. Iterating the list
        # (rather than reading the last `log` variable) keeps the no-tool path
        # from hitting an unbound name, and style "D" entries, which are dicts,
        # are expanded so the text join below never sees a non-string.
        log_lines = []
        panel_parts = []
        for entry in tools_entries:
            if isinstance(entry, dict):
                log_lines.extend([entry["detailed"], entry["simple"]])
                panel_parts.append(f"<pre>{html.escape(entry['detailed'])}</pre>")
                panel_parts.append(f"<hr><pre>{html.escape(entry['simple'])}</pre>")
            else:
                log_lines.append(entry)
                # Escape: entries contain user text and tool output.
                panel_parts.append(
                    f"<pre style='background:#f1f5f9;border-radius:6px;padding:8px;margin-bottom:8px;'>{html.escape(str(entry))}</pre>"
                )
        panel_html = "".join(panel_parts)

        # convert back to messages for gr.Chatbot
        new_messages = tuples_to_messages(tuples)
        return new_messages, gr.update(value="\n".join(log_lines) or "Ready."), gr.update(value=panel_html)

    send_btn.click(on_send_text, inputs=[user_input, chatbox, mic, log_style], outputs=[chatbox, tools_log, tools_panel])

    def on_tts(text, style):
        """Handle the Speak button: synthesize ``text`` and log the tool call.

        Args:
            text: Text to speak; a falsy value short-circuits with a notice.
            style: Log style selector passed through to ``format_tool_log``.

        Returns:
            A 2-tuple: the path of the generated audio file (or None on
            failure), and an update for the HTML tools panel.
        """
        import html
        import tempfile

        def _panel(log, background):
            # Style "D" yields a dict; show both forms instead of its repr.
            if isinstance(log, dict):
                log = f"{log['detailed']}\n\n{log['simple']}"
            return f"<pre style='background:{background};padding:8px;border-radius:6px;'>{html.escape(str(log))}</pre>"

        if not text:
            return None, gr.update(value="No text provided")
        res = speak_text_tool(text)
        if res.meta and res.meta.get("format") == "base64-audio":
            audio_bytes = decode_base64_audio(res.content)
            # The API returns an encoded audio file, not raw samples, so hand
            # Gradio a file path rather than a (data, rate) pair.
            # NOTE(review): ".mp3" assumes ElevenLabs' default output format —
            # confirm if an output_format is ever requested. The temp file is
            # intentionally left on disk for Gradio to serve.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
                tmp.write(audio_bytes)
            log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, "<audio bytes>", style)
            return tmp.name, gr.update(value=_panel(log, "#eef2ff"))
        log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, res.content, style)
        return None, gr.update(value=_panel(log, "#fee2e2"))

    tts_btn.click(on_tts, inputs=[tts_text, log_style], outputs=[gr.Audio(label="TTS Output"), tools_panel])

    def on_describe_image(file_obj, style, chat_history=None):
        """Handle the Describe image button.

        Args:
            file_obj: Value of the upload component: a path string, an object
                with a ``name`` path attribute, a dict with 'tmp_path', or a
                readable file-like object.
            style: Log style selector passed through to ``format_tool_log``.
            chat_history: Current Chatbot messages; the new exchange is
                appended to a copy so earlier turns are kept.

        Returns:
            A 2-tuple: the chat messages to display, and an update for the
            HTML tools panel.
        """
        import html
        import tempfile

        history = list(chat_history or [])
        if not file_obj:
            return history, gr.update(value="No file uploaded")

        # Resolve a filesystem path from whichever shape the component gave us.
        if isinstance(file_obj, (str, os.PathLike)):
            path = os.fspath(file_obj)
        elif isinstance(file_obj, dict):
            path = file_obj.get('tmp_path')
        else:
            path = getattr(file_obj, 'name', None)
        if not path:
            # No path available: try to save the bytes. A unique temp file
            # avoids concurrent requests overwriting each other's upload.
            try:
                contents = file_obj.read()
                with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
                    tmp.write(contents)
                path = tmp.name
            except Exception as e:
                return history, gr.update(value=f"Failed to read uploaded file: {e}")

        res = describe_image_tool(path)
        log = format_tool_log("describe_image", "User uploaded an image for description", res.meta or {}, res.content, style)
        # Style "D" yields a dict; show both forms instead of its repr.
        if isinstance(log, dict):
            log = f"{log['detailed']}\n\n{log['simple']}"
        # Escape: the log contains model output.
        panel_html = f"<pre style='background:#ecfdf5;padding:8px;border-radius:6px;'>{html.escape(str(log))}</pre>"

        # Append this exchange to the existing conversation.
        history.append({"role":"user","content":"<image uploaded>"})
        history.append({"role":"assistant","content":res.content})
        return history, gr.update(value=panel_html)

    img_btn.click(on_describe_image, inputs=[img_upload, log_style, chatbox], outputs=[chatbox, tools_panel])

    # API Keys accordion (session-only)
    with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
        openai_key = gr.Textbox(label="OpenAI API Key", type="password")
        eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
        hf_key = gr.Textbox(label="Hugging Face API Token", type="password")

        def set_keys(ok, ek, hk):
            """Apply API keys entered in the UI to the running process.

            Each non-empty value is written to ``os.environ`` and to the
            module-level constant the backends read, so it takes effect on the
            next tool call. Empty fields leave the current value untouched.

            NOTE(review): both stores are process-wide, so a key saved here is
            shared by every user of this server, not kept per browser session
            as the accordion label suggests.

            Args:
                ok: OpenAI API key.
                ek: ElevenLabs API key.
                hk: Hugging Face API token.

            Returns:
                A status message for the UI.
            """
            # The constants were captured at import time; updating only
            # os.environ would leave the backends using the old values.
            global OPENAI_API_KEY, ELEVENLABS_API_KEY, HUGGINGFACE_API_TOKEN
            if ok:
                os.environ["OPENAI_API_KEY"] = ok
                OPENAI_API_KEY = ok
                if OPENAI_AVAILABLE:
                    openai.api_key = ok
            if ek:
                os.environ["ELEVENLABS_API_KEY"] = ek
                ELEVENLABS_API_KEY = ek
            if hk:
                os.environ["HUGGINGFACE_API_TOKEN"] = hk
                HUGGINGFACE_API_TOKEN = hk
            return "API keys set for this session. Refresh the page to pick them up in all runtimes."

        set_btn = gr.Button("Save API Keys")
        set_output = gr.Textbox(label="Status")
        set_btn.click(set_keys, [openai_key, eleven_key, hf_key], [set_output])

# Script entry point: start the Gradio server on all interfaces.
# The port comes from the PORT environment variable, defaulting to 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))