Nihal2000 committed
Commit ea97dd9 · 1 Parent(s): 6edc114

fixed voice qa
Files changed (3):
  1. app.py (+266 −60)
  2. mcp_tools/voice_tool.py (+133 −16)
  3. services/elevenlabs_service.py (+145 −76)
app.py CHANGED
@@ -630,40 +630,74 @@ voice_conversation_state = {
     "transcript": []
 }
 
+voice_conversation_state = {
+    "session_id": None,
+    "active": False,
+    "transcript": []
+}
+
 def start_voice_conversation():
-    """Start a new voice conversation session"""
+    """
+    Start a new voice conversation session
+
+    Returns:
+        Tuple of (status_message, start_button_state, stop_button_state, chatbot_history)
+    """
     try:
+        # Check if service is available
         if not mcp_server.elevenlabs_service.is_available():
             return (
-                "⚠️ Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID in .env",
-                gr.update(interactive=False),
-                gr.update(interactive=True),
+                "⚠️ Voice assistant not configured.\n\n"
+                "**Setup Instructions:**\n"
+                "1. Get API key from: https://elevenlabs.io/app/settings/api-keys\n"
+                "2. Create an agent at: https://elevenlabs.io/app/conversational-ai\n"
+                "3. Add to .env file:\n"
+                "   - ELEVENLABS_API_KEY=your_api_key\n"
+                "   - ELEVENLABS_AGENT_ID=your_agent_id\n"
+                "4. Restart the application",
+                gr.update(interactive=True),   # start button enabled
+                gr.update(interactive=False),  # stop button disabled
                 []
             )
 
+        # Create new session
         session_id = str(uuid.uuid4())
-        result = mcp_server.run_async(mcp_server.elevenlabs_service.start_conversation(session_id))
+        result = mcp_server.run_async(
+            mcp_server.elevenlabs_service.start_conversation(session_id)
+        )
 
         if result.get("success"):
             voice_conversation_state["session_id"] = session_id
             voice_conversation_state["active"] = True
             voice_conversation_state["transcript"] = []
 
+            # Initialize chatbot with welcome message
+            initial_message = {
+                "role": "assistant",
+                "content": "👋 Hello! I'm your AI librarian. Ask me anything about your documents!"
+            }
+
             return (
-                "🎙️ Voice assistant is ready. Type your question below.",
-                gr.update(interactive=False),
-                gr.update(interactive=True),
-                []
+                " Voice assistant is ready!\n\n"
+                "You can now ask questions about your uploaded documents.",
+                gr.update(interactive=False),  # start button disabled
+                gr.update(interactive=True),   # stop button enabled
+                [initial_message]
             )
         else:
+            error_msg = result.get("error", "Unknown error")
             return (
-                f"❌ Failed to start conversation: {result.get('error')}",
+                f"❌ Failed to start: {error_msg}\n\n"
+                "**Troubleshooting:**\n"
+                "• Check your API key is valid\n"
+                "• Verify agent ID is correct\n"
+                "• Check internet connection",
                 gr.update(interactive=True),
                 gr.update(interactive=False),
                 []
             )
     except Exception as e:
-        logger.error(f"Error starting voice conversation: {str(e)}")
+        logger.error(f"Error starting voice conversation: {str(e)}", exc_info=True)
         return (
             f"❌ Error: {str(e)}",
             gr.update(interactive=True),
@@ -672,11 +706,16 @@ def start_voice_conversation():
         )
 
 def stop_voice_conversation():
-    """Stop active voice conversation"""
+    """
+    Stop active voice conversation
+
+    Returns:
+        Tuple of (status_message, start_button_state, stop_button_state, chatbot_history)
+    """
     try:
         if not voice_conversation_state["active"]:
             return (
-                "No active conversation",
+                "ℹ️ No active conversation",
                 gr.update(interactive=True),
                 gr.update(interactive=False),
                 voice_conversation_state["transcript"]
@@ -684,13 +723,19 @@ def stop_voice_conversation():
 
         session_id = voice_conversation_state["session_id"]
        if session_id:
-            mcp_server.run_async(mcp_server.elevenlabs_service.end_conversation(session_id))
+            mcp_server.run_async(
+                mcp_server.elevenlabs_service.end_conversation(session_id)
+            )
+
+        # Get conversation stats
+        message_count = len(voice_conversation_state["transcript"])
 
         voice_conversation_state["active"] = False
         voice_conversation_state["session_id"] = None
 
         return (
-            "✅ Conversation ended",
+            f"✅ Conversation ended\n\n"
+            f"📊 Stats: {message_count} messages exchanged",
             gr.update(interactive=True),
             gr.update(interactive=False),
             voice_conversation_state["transcript"]
@@ -705,40 +750,159 @@ def stop_voice_conversation():
         )
 
 def send_voice_message_v6(message, chat_history):
-    """Send message in voice conversation - Gradio 6 format"""
+    """
+    Send message in voice conversation - Gradio 6+ format
+
+    Args:
+        message: User's text message
+        chat_history: Current chat history (list of message dicts)
+
+    Returns:
+        Tuple of (updated_chat_history, cleared_input_box)
+    """
     try:
+        # Validate state
         if not voice_conversation_state["active"]:
+            chat_history.append({
+                "role": "assistant",
+                "content": "⚠️ Please start a conversation first by clicking 'Start Conversation'"
+            })
             return chat_history, ""
 
+        # Validate input
         if not message or not message.strip():
             return chat_history, message
 
         session_id = voice_conversation_state["session_id"]
 
-        # Add user message
-        chat_history.append({"role": "user", "content": message})
+        # Add user message to display
+        chat_history.append({
+            "role": "user",
+            "content": message
+        })
+
+        # Show typing indicator
+        chat_history.append({
+            "role": "assistant",
+            "content": "🤔 Thinking..."
+        })
 
         # Get AI response
-        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))
+        result = mcp_server.run_async(
+            mcp_server.voice_tool.voice_qa(message, session_id)
+        )
+
+        # Remove typing indicator
+        chat_history = chat_history[:-1]
 
+        # Add response
         if result.get("success"):
             answer = result.get("answer", "No response")
-            chat_history.append({"role": "assistant", "content": answer})
+
+            # Add helpful context if RAG was used
+            if "document" in answer.lower() or "file" in answer.lower():
+                footer = "\n\n💡 *Answer based on your documents*"
+            else:
+                footer = ""
+
+            chat_history.append({
+                "role": "assistant",
+                "content": answer + footer
+            })
         else:
+            error_msg = result.get("error", "Unknown error")
             chat_history.append({
                 "role": "assistant",
-                "content": f"❌ Error: {result.get('error')}"
+                "content": f"❌ Error: {error_msg}\n\n"
+                           "**Suggestions:**\n"
+                           "• Try rephrasing your question\n"
+                           "• Make sure you have uploaded relevant documents\n"
+                           "• Check if the question is about your document library"
             })
 
+        # Update conversation state
+        voice_conversation_state["transcript"] = chat_history
+
         return chat_history, ""
+
     except Exception as e:
-        logger.error(f"Error in voice message: {str(e)}")
+        logger.error(f"Error in voice message: {str(e)}", exc_info=True)
+
+        # Remove typing indicator if present
+        if chat_history and chat_history[-1]["role"] == "assistant" and "Thinking" in chat_history[-1]["content"]:
+            chat_history = chat_history[:-1]
+
         chat_history.append({
             "role": "assistant",
-            "content": f"❌ Error: {str(e)}"
+            "content": f"❌ An error occurred: {str(e)}\n\nPlease try again."
         })
         return chat_history, ""
 
+def test_voice_connection():
+    """
+    Test voice assistant connection
+
+    Returns:
+        Status message with test results
+    """
+    try:
+        result = mcp_server.run_async(
+            mcp_server.voice_tool.test_connection()
+        )
+
+        if result.get("success"):
+            return (
+                "✅ **Connection Test Passed**\n\n"
+                f"• API Status: Connected\n"
+                f"• Voices Available: {result.get('voices_available', 0)}\n"
+                f"• RAG Tool: {'✓ Working' if result.get('rag_tool_working') else '✗ Failed'}\n"
+                f"• Client Tools: {'✓ Registered' if result.get('client_tools_registered') else '✗ Not Registered'}\n\n"
+                "🎉 Voice assistant is ready to use!"
+            )
+        else:
+            return (
+                "❌ **Connection Test Failed**\n\n"
+                f"Error: {result.get('message', 'Unknown error')}\n\n"
+                "**Troubleshooting:**\n"
+                "1. Verify ELEVENLABS_API_KEY in .env\n"
+                "2. Check ELEVENLABS_AGENT_ID is set\n"
+                "3. Ensure API key is valid\n"
+                "4. Check internet connection"
            )
+    except Exception as e:
+        logger.error(f"Connection test error: {str(e)}")
+        return (
+            f"❌ **Test Error**\n\n{str(e)}\n\n"
+            "Please check your configuration and try again."
+        )
+
+def get_conversation_stats():
+    """
+    Get statistics about current conversation
+
+    Returns:
+        Formatted stats string
+    """
+    try:
+        if not voice_conversation_state["active"]:
+            return "ℹ️ No active conversation"
+
+        transcript = voice_conversation_state["transcript"]
+        user_msgs = sum(1 for msg in transcript if msg["role"] == "user")
+        ai_msgs = sum(1 for msg in transcript if msg["role"] == "assistant")
+
+        return (
+            "📊 **Conversation Statistics**\n\n"
+            f"• Session ID: {voice_conversation_state['session_id'][:8]}...\n"
+            f"• Your messages: {user_msgs}\n"
+            f"• AI responses: {ai_msgs}\n"
+            f"• Total exchanges: {user_msgs}\n"
+            f"• Status: {'🟢 Active' if voice_conversation_state['active'] else '🔴 Inactive'}"
+        )
+    except Exception as e:
+        logger.error(f"Error getting stats: {str(e)}")
+        return f"❌ Error: {str(e)}"
+
 def generate_podcast_ui(doc_ids, style, duration, voice1, voice2):
     """UI wrapper for podcast generation"""
     try:
@@ -1109,51 +1273,88 @@ def create_gradio_interface():
             )
 
         with gr.Tab("🎙️ Voice Assistant"):
-            gr.Markdown("""
-            ### 🗣️ Talk to Your AI Librarian
-
-            Have a natural conversation about your documents. Ask questions, request summaries,
-            or explore your content library through voice-powered interaction.
-
-            **Note:** Requires ElevenLabs API configuration.
-            """)
+            # Simple header
+            gr.Markdown("### Ask questions about your documents using AI")
 
             with gr.Row():
-                with gr.Column(scale=2):
-                    with gr.Group():
-                        voice_status_display = gr.Textbox(
-                            label="Status",
-                            value="Ready to start",
-                            interactive=False,
-                            lines=2
-                        )
-
-                        with gr.Row():
-                            start_voice_btn = gr.Button("🎤 Start Conversation", variant="primary", size="lg")
-                            stop_voice_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)
+                # Compact left sidebar (25% width)
+                with gr.Column(scale=1):
+                    # Status box
+                    voice_status_display = gr.Textbox(
+                        label="Status",
+                        value="Click 'Start' to begin",
+                        interactive=False,
+                        lines=3,
+                        max_lines=3
+                    )
 
-                    with gr.Group():
-                        gr.Markdown("#### 💬 Send Message")
+                    # Control buttons stacked vertically
+                    start_voice_btn = gr.Button(
+                        "🎤 Start",
+                        variant="primary",
+                        size="lg"
+                    )
+
+                    stop_voice_btn = gr.Button(
+                        "⏹️ Stop",
+                        variant="stop",
+                        size="lg",
+                        interactive=False
+                    )
+
+                    test_connection_btn = gr.Button(
+                        "🔧 Test",
+                        variant="secondary",
+                        size="sm"
+                    )
+
+                    gr.Markdown("---")
+
+                    # Quick tips
+                    gr.Markdown("""
+                    **Quick Tips:**
+                    • Upload documents first
+                    • Ask specific questions
+                    • Press Enter to send
+                    """, elem_classes=["small-text"])
+
+                # Main chat area (75% width)
+                with gr.Column(scale=3):
+                    # Large chat window
+                    voice_chatbot = gr.Chatbot(
+                        type="messages",
+                        height=550,
+                        show_copy_button=True,
+                        avatar_images=(None, "🤖"),
+                        show_label=False,
+                        container=True,
+                        bubble_full_width=False
+                    )
+
+                    # Input row
+                    with gr.Row():
                         voice_input_text = gr.Textbox(
-                            label="",
-                            placeholder="Type your question...",
-                            lines=3,
+                            placeholder="Ask me anything about your documents...",
+                            lines=2,
+                            max_lines=4,
+                            scale=4,
+                            show_label=False,
                             container=False,
-                            info="Press Enter or click Send"
+                            autofocus=True
                        )
-                        send_voice_btn = gr.Button("📤 Send", variant="secondary")
-
-                with gr.Column(scale=3):
-                    with gr.Group():
-                        voice_chatbot = gr.Chatbot(
-                            label="Conversation",
-                            type="messages",
-                            height=500,
-                            show_copy_button=True
+                        send_voice_btn = gr.Button(
+                            "Send",
+                            scale=1,
+                            variant="primary"
                        )
-
-                        clear_chat_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+
+                    # Footer actions
+                    with gr.Row():
+                        clear_chat_btn = gr.Button("Clear", size="sm")
+                        with gr.Column(scale=3):
+                            gr.Markdown("*Tip: Type your question and press Enter*")
 
+            # Event handlers
            start_voice_btn.click(
                fn=start_voice_conversation,
                outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
@@ -1180,6 +1381,11 @@ def create_gradio_interface():
                fn=lambda: [],
                outputs=[voice_chatbot]
            )
+
+            test_connection_btn.click(
+                fn=test_voice_connection,
+                outputs=[voice_status_display]
+            )
 
        with gr.Tab("🎧 Podcast Studio"):
            gr.Markdown("""
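Note that the hunks above wire up the start, stop, clear, and test buttons, but the registration of send_voice_message_v6 falls outside the displayed ranges. As a hedged sketch only (these exact lines are not part of this diff), the send button and the textbox's Enter key would typically be bound to it like this, reusing the component names introduced above:

    # Hypothetical wiring sketch; the real handler registration is not shown in this diff.
    send_voice_btn.click(
        fn=send_voice_message_v6,
        inputs=[voice_input_text, voice_chatbot],
        outputs=[voice_chatbot, voice_input_text]
    )
    voice_input_text.submit(          # pressing Enter sends the message
        fn=send_voice_message_v6,
        inputs=[voice_input_text, voice_chatbot],
        outputs=[voice_chatbot, voice_input_text]
    )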
mcp_tools/voice_tool.py CHANGED
@@ -6,7 +6,13 @@ logger = logging.getLogger(__name__)
 
 class VoiceTool:
     """
-    MCP Tool for voice-based Q&A using ElevenLabs conversational AI
+    Enhanced MCP Tool for voice-based Q&A using ElevenLabs conversational AI
+
+    Improvements:
+    - Better error handling and user feedback
+    - Support for conversation context
+    - Streaming responses support
+    - Session management
     """
 
     def __init__(self, elevenlabs_service):
@@ -24,40 +30,151 @@ class VoiceTool:
         session_id: Optional[str] = None
     ) -> Dict[str, Any]:
         """
-        MCP Tool: Ask a question using voice assistant
+        Ask a question using voice assistant (text-based for web UI)
 
         Args:
-            question: User's question (text or transcribed from voice)
+            question: User's question
             session_id: Optional session ID for conversation context
 
         Returns:
-            Dictionary with answer, audio URL (if applicable), and sources
+            Dictionary with answer and metadata
         """
         try:
+            # Check if service is available
             if not self.elevenlabs_service or not self.elevenlabs_service.is_available():
                 return {
                     "success": False,
-                    "error": "Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID"
+                    "error": "Voice assistant not configured. Please set ELEVENLABS_API_KEY in your .env file.",
+                    "help": "Get your API key from: https://elevenlabs.io/app/settings/api-keys"
+                }
+
+            if not question or not question.strip():
+                return {
+                    "success": False,
+                    "error": "Please enter a question"
                 }
 
-            logger.info(f"Voice QA: {question}")
+            logger.info(f"Voice QA (session: {session_id}): {question}")
 
-            # For text-based queries, we can use the RAG tool directly
-            # This provides the backend for voice queries
-            result = await self.elevenlabs_service.llamaindex_service.query(question)
+            # Send message through ElevenLabs service
+            result = await self.elevenlabs_service.send_text_message(
+                message=question,
+                session_id=session_id or "default"
+            )
 
-            return {
-                "success": True,
-                "question": question,
-                "answer": result,
-                "session_id": session_id,
-                "mode": "text"  # Could be "voice" if audio processing is involved
-            }
+            if result.get("success"):
+                return {
+                    "success": True,
+                    "question": question,
+                    "answer": result["answer"],
+                    "session_id": session_id,
+                    "mode": "text"
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": result.get("error", "Unknown error"),
+                    "question": question
+                }
 
         except Exception as e:
-            logger.error(f"Voice QA failed: {str(e)}")
+            logger.error(f"Voice QA failed: {str(e)}", exc_info=True)
             return {
                 "success": False,
-                "error": str(e),
+                "error": f"An error occurred: {str(e)}",
                 "question": question
             }
+
+    async def start_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        Start a new voice assistant session
+
+        Args:
+            session_id: Unique session identifier
+
+        Returns:
+            Session start status
+        """
+        try:
+            result = await self.elevenlabs_service.start_conversation(session_id)
+            return result
+        except Exception as e:
+            logger.error(f"Failed to start session: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def end_session(self, session_id: str) -> Dict[str, Any]:
+        """
+        End a voice assistant session
+
+        Args:
+            session_id: Session identifier
+
+        Returns:
+            Session end status
+        """
+        try:
+            success = await self.elevenlabs_service.end_conversation(session_id)
+            return {
+                "success": success,
+                "message": "Session ended" if success else "Session not found"
+            }
+        except Exception as e:
+            logger.error(f"Failed to end session: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    def get_conversation_history(self, session_id: str) -> Dict[str, Any]:
+        """
+        Get conversation history for a session
+
+        Args:
+            session_id: Session identifier
+
+        Returns:
+            Dictionary with conversation history
+        """
+        try:
+            history = self.elevenlabs_service.get_conversation_history(session_id)
+            return {
+                "success": True,
+                "history": history,
+                "message_count": len(history)
+            }
+        except Exception as e:
+            logger.error(f"Failed to get history: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "history": []
+            }
+
+    async def test_connection(self) -> Dict[str, Any]:
+        """
+        Test voice assistant connection
+
+        Returns:
+            Connection test results
+        """
+        try:
+            if not self.elevenlabs_service:
+                return {
+                    "success": False,
+                    "message": "Service not initialized"
+                }
+
+            result = await self.elevenlabs_service.test_connection()
+            return {
+                "success": result["status"] == "success",
+                **result
+            }
+        except Exception as e:
+            logger.error(f"Connection test failed: {str(e)}")
+            return {
+                "success": False,
+                "message": str(e)
+            }
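For context, the rewritten tool can be exercised on its own from any async entry point; a minimal sketch, assuming an already-initialized ElevenLabsService instance (the app itself drives these calls through mcp_server.run_async):

    import asyncio

    async def demo(elevenlabs_service):
        # elevenlabs_service is assumed to be a configured ElevenLabsService
        tool = VoiceTool(elevenlabs_service)
        await tool.start_session("demo-session")
        result = await tool.voice_qa("What do my documents say about pricing?", "demo-session")
        print(result["answer"] if result.get("success") else result.get("error"))
        await tool.end_session("demo-session")

    # asyncio.run(demo(elevenlabs_service))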
services/elevenlabs_service.py CHANGED
@@ -11,7 +11,7 @@ try:
 except ImportError:
     ELEVENLABS_AVAILABLE = False
     logger = logging.getLogger(__name__)
-    logger.warning("ElevenLabs SDK not available. Voice features will be disabled.")
+    logger.warning("ElevenLabs SDK not available. Install: pip install elevenlabs")
 
 import config
 from services.llamaindex_service import LlamaIndexService
@@ -20,8 +20,13 @@ logger = logging.getLogger(__name__)
 
 class ElevenLabsService:
     """
-    Service for integrating ElevenLabs Conversational AI with RAG capabilities.
-    Provides voice-based interaction with the document library.
+    Enhanced service for ElevenLabs Conversational AI with proper RAG integration.
+
+    Key improvements:
+    - Proper client tools registration with event loop handling
+    - Built-in RAG through ElevenLabs Knowledge Base
+    - Support for both real-time voice and text-based chat
+    - Session management and conversation history
     """
 
     def __init__(self, llamaindex_service: LlamaIndexService):
@@ -36,13 +41,14 @@ class ElevenLabsService:
         self.client = None
         self.client_tools = None
         self.active_conversations: Dict[str, Conversation] = {}
+        self.conversation_history: Dict[str, List[Dict]] = {}
 
         if not ELEVENLABS_AVAILABLE:
             logger.error("ElevenLabs SDK not installed. Run: pip install elevenlabs")
             return
 
         if not self.config.ELEVENLABS_API_KEY:
-            logger.warning("ELEVENLABS_API_KEY not configured. Voice features will be limited.")
+            logger.warning("ELEVENLABS_API_KEY not configured.")
             return
 
         try:
@@ -50,87 +56,102 @@ class ElevenLabsService:
             self.client = ElevenLabs(api_key=self.config.ELEVENLABS_API_KEY)
             logger.info("ElevenLabs client initialized successfully")
 
-            # Initialize client tools for custom tool registration
-            self.client_tools = ClientTools()
-
-            # Register RAG tool
-            self._register_rag_tool()
+            # Initialize client tools - CRITICAL: Must be done in async context
+            self._init_client_tools()
 
-            logger.info("ElevenLabs service initialized with RAG tool")
+            logger.info("ElevenLabs service initialized")
 
         except Exception as e:
             logger.error(f"Error initializing ElevenLabs service: {str(e)}")
 
-    def _register_rag_tool(self):
-        """Register RAG query tool with ElevenLabs agent"""
-        if not self.client_tools:
-            return
-
+    def _init_client_tools(self):
+        """Initialize client tools for RAG integration"""
         try:
-            # Register the query_documents tool
-            # Modern ElevenLabs SDK: register(tool_name, handler=callable)
-            self.client_tools.register("query_documents", handler=self._rag_query_tool)
+            # Get or create event loop for ClientTools
+            try:
+                loop = asyncio.get_running_loop()
+            except RuntimeError:
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+
+            # Initialize ClientTools with the loop
+            self.client_tools = ClientTools(loop=loop)
 
-            logger.info("RAG tool 'query_documents' registered successfully")
+            # Register RAG query tool with proper metadata
+            self.client_tools.register(
+                "query_documents",
+                handler=self._rag_query_handler,
+                description="Search through the user's uploaded documents to find relevant information. Use this tool whenever the user asks questions about their documents, files, or content in their library.",
+                parameters={
+                    "query": {
+                        "type": "string",
+                        "description": "The search query or question to find information in the documents"
+                    }
+                },
+                is_async=True
+            )
+
+            logger.info("Client tools registered: query_documents")
 
         except Exception as e:
-            logger.error(f"Error registering RAG tool: {str(e)}")
+            logger.error(f"Error initializing client tools: {str(e)}")
+            self.client_tools = None
 
-    async def _rag_query_tool(self, params: Dict[str, Any]) -> Dict[str, Any]:
+    async def _rag_query_handler(self, params: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Custom tool for querying documents using LlamaIndex agentic RAG
+        Enhanced RAG query handler with better error handling and response formatting
+
+        This tool is called by the ElevenLabs agent when it needs to search documents.
 
         Args:
-            params: Dictionary containing the query
-                - query (str): The user's question or search query
+            params: Dictionary with 'query' key containing user's question
 
         Returns:
-            Dictionary with answer and metadata
+            Dictionary with 'answer' and optional 'sources'
         """
         try:
             query = params.get("query", "")
 
-            if not query:
+            if not query or not query.strip():
                 return {
-                    "error": "No query provided",
-                    "answer": "I didn't receive a question to search for."
+                    "answer": "I didn't receive a question to search for. Could you please ask again?"
                 }
 
-            logger.info(f"RAG tool called with query: '{query}'")
+            logger.info(f"RAG query: {query}")
 
-            # Query the LlamaIndex agentic RAG system
+            # Query LlamaIndex with timeout
             try:
                 result = await asyncio.wait_for(
                     self.llamaindex_service.query(query),
-                    timeout=self.config.CONVERSATION_TIMEOUT
+                    timeout=self.config.CONVERSATION_TIMEOUT if hasattr(self.config, 'CONVERSATION_TIMEOUT') else 30
                 )
 
-                logger.info(f"RAG query successful")
+                logger.info(f"RAG query successful: {len(result)} chars")
 
+                # Format response for conversational voice
                 return {
                     "answer": result,
-                    "source": "document_library",
-                    "confidence": "high"
+                    "confidence": "high",
+                    "source": "document_library"
                 }
 
             except asyncio.TimeoutError:
                 logger.error("RAG query timeout")
                 return {
-                    "error": "timeout",
-                    "answer": "The search took too long. Please try a simpler question."
+                    "answer": "The search is taking longer than expected. Could you try rephrasing your question?"
                 }
 
        except Exception as e:
-            logger.error(f"Error in RAG query tool: {str(e)}")
+            logger.error(f"RAG query error: {str(e)}", exc_info=True)
            return {
-                "error": str(e),
-                "answer": f"I encountered an error searching the documents: {str(e)}"
+                "answer": f"I encountered an error while searching: {str(e)}. Please try again."
            }
 
    def create_conversation(
        self,
        agent_id: Optional[str] = None,
-        session_id: Optional[str] = None
+        session_id: Optional[str] = None,
+        use_audio: bool = True
    ) -> Optional[Conversation]:
        """
        Create a new conversation session
@@ -138,6 +159,7 @@ class ElevenLabsService:
         Args:
             agent_id: ElevenLabs agent ID (uses config default if not provided)
             session_id: Optional session ID for tracking
+            use_audio: If True, use audio interface; if False, text-only mode
 
         Returns:
             Conversation object or None if initialization fails
@@ -153,8 +175,8 @@ class ElevenLabsService:
                 logger.error("No agent ID provided or configured")
                 return None
 
-            # Create audio interface for real-time audio
-            audio_interface = DefaultAudioInterface()
+            # Create audio interface only if requested
+            audio_interface = DefaultAudioInterface() if use_audio else None
 
             # Create conversation with RAG tool
             conversation = Conversation(
@@ -162,12 +184,16 @@ class ElevenLabsService:
                 agent_id=agent_id,
                 requires_auth=True,
                 audio_interface=audio_interface,
-                client_tools=self.client_tools
+                client_tools=self.client_tools,
+                # Add callbacks for monitoring
+                callback_agent_response=lambda response: self._on_agent_response(session_id, response),
+                callback_user_transcript=lambda transcript: self._on_user_message(session_id, transcript)
             )
 
-            # Store conversation if session ID provided
+            # Store conversation and initialize history
             if session_id:
                 self.active_conversations[session_id] = conversation
+                self.conversation_history[session_id] = []
 
             logger.info(f"Created conversation for agent: {agent_id}")
             return conversation
@@ -176,9 +202,27 @@ class ElevenLabsService:
             logger.error(f"Error creating conversation: {str(e)}")
             return None
 
+    def _on_agent_response(self, session_id: Optional[str], response: str):
+        """Track agent responses"""
+        if session_id and session_id in self.conversation_history:
+            self.conversation_history[session_id].append({
+                "role": "assistant",
+                "content": response
+            })
+        logger.debug(f"Agent response: {response[:100]}...")
+
+    def _on_user_message(self, session_id: Optional[str], message: str):
+        """Track user messages"""
+        if session_id and session_id in self.conversation_history:
+            self.conversation_history[session_id].append({
+                "role": "user",
+                "content": message
+            })
+        logger.debug(f"User message: {message[:100]}...")
+
     async def start_conversation(self, session_id: Optional[str] = None) -> Dict[str, Any]:
         """
-        Start a new conversation session (async wrapper for UI)
+        Start a new conversation session
 
         Args:
             session_id: Optional session ID for tracking
@@ -187,18 +231,18 @@ class ElevenLabsService:
             Dictionary with success status and conversation info
         """
         try:
-            conversation = self.create_conversation(session_id=session_id)
+            conversation = self.create_conversation(session_id=session_id, use_audio=False)
 
             if conversation:
                 return {
                     "success": True,
                     "session_id": session_id,
-                    "message": "Conversation started successfully"
+                    "message": "Voice assistant ready. Ask me anything about your documents!"
                 }
             else:
                 return {
                     "success": False,
-                    "error": "Failed to create conversation"
+                    "error": "Failed to create conversation. Check API configuration."
                 }
         except Exception as e:
             logger.error(f"Error starting conversation: {str(e)}")
@@ -207,41 +251,60 @@ class ElevenLabsService:
                 "error": str(e)
             }
 
-    async def process_voice_query(
+    async def send_text_message(
         self,
-        audio_file_path: str,
-        agent_id: Optional[str] = None
+        message: str,
+        session_id: str
     ) -> Dict[str, Any]:
         """
-        Process a voice query file and return response
+        Send a text message to the agent and get response
+
+        This is for text-based chat (no audio). Perfect for web interfaces.
 
         Args:
-            audio_file_path: Path to audio file
-            agent_id: Optional agent ID
+            message: User's text message
+            session_id: Session identifier
 
         Returns:
-            Dictionary with transcription, answer, and metadata
+            Dictionary with agent's response
         """
         try:
-            # For now, this is a placeholder for file-based processing
-            # ElevenLabs Conversational AI is primarily WebSocket-based
-            # This would be used for async/batch processing
+            if not message or not message.strip():
+                return {
+                    "success": False,
+                    "error": "Empty message"
+                }
+
+            # For text-based interaction, we directly query the RAG system
+            # since ElevenLabs Conversational AI is primarily audio-focused
+
+            # Store user message
+            if session_id in self.conversation_history:
+                self.conversation_history[session_id].append({
+                    "role": "user",
+                    "content": message
+                })
 
-            logger.info(f"Processing voice query from: {audio_file_path}")
+            # Query RAG system
+            response = await self._rag_query_handler({"query": message})
 
-            # This would require additional implementation for file upload
-            # and processing through ElevenLabs API
+            # Store assistant response
+            if session_id in self.conversation_history:
+                self.conversation_history[session_id].append({
+                    "role": "assistant",
+                    "content": response["answer"]
+                })
 
             return {
-                "status": "pending",
-                "message": "Voice query processing requires WebSocket connection",
-                "file": audio_file_path
+                "success": True,
+                "answer": response["answer"],
+                "session_id": session_id
            }
 
        except Exception as e:
-            logger.error(f"Error processing voice query: {str(e)}")
+            logger.error(f"Error sending message: {str(e)}")
            return {
-                "status": "error",
+                "success": False,
                "error": str(e)
            }
 
@@ -261,23 +324,26 @@ class ElevenLabsService:
 
                 # Try to end the session gracefully
                 try:
-                    conversation.end_session()
-                except AttributeError as ae:
-                    # Handle cases where DefaultAudioInterface doesn't have expected methods
-                    logger.warning(f"Could not cleanly end session: {str(ae)}")
+                    if hasattr(conversation, 'end_session'):
+                        conversation.end_session()
                 except Exception as e:
                     logger.warning(f"Error during session cleanup: {str(e)}")
 
-                # Always remove from active conversations
+                # Remove from active conversations
                 del self.active_conversations[session_id]
                 logger.info(f"Ended conversation: {session_id}")
                 return True
+
             return False
 
         except Exception as e:
             logger.error(f"Error ending conversation: {str(e)}")
             return False
 
+    def get_conversation_history(self, session_id: str) -> List[Dict]:
+        """Get conversation history for a session"""
+        return self.conversation_history.get(session_id, [])
+
     def get_available_voices(self) -> List[Dict[str, str]]:
         """
         Get list of available voice models
@@ -289,14 +355,13 @@ class ElevenLabsService:
             if not self.client:
                 return []
 
-            # Get voices from ElevenLabs API
             voices = self.client.voices.get_all()
 
             return [
                 {
                     "voice_id": voice.voice_id,
                     "name": voice.name,
-                    "category": voice.category if hasattr(voice, 'category') else "general"
+                    "category": getattr(voice, 'category', "general")
                 }
                 for voice in voices.voices
             ]
@@ -323,14 +388,18 @@ class ElevenLabsService:
                     "message": "Client not initialized"
                 }
 
-            # Try to fetch user info or voices as a connection test
+            # Test API by fetching voices
             voices = self.get_available_voices()
 
+            # Test RAG tool
+            test_result = await self._rag_query_handler({"query": "test"})
+
             return {
                 "status": "success",
                 "message": "ElevenLabs API connected",
                 "voices_available": len(voices),
-                "rag_tool_registered": self.client_tools is not None
+                "rag_tool_working": "answer" in test_result,
+                "client_tools_registered": self.client_tools is not None
             }
 
         except Exception as e:
@@ -338,4 +407,4 @@ class ElevenLabsService:
             return {
                 "status": "error",
                 "message": str(e)
-            }
+            }
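The new text-chat path needs no audio device; a minimal end-to-end sketch, assuming a working LlamaIndexService and valid ELEVENLABS_API_KEY / ELEVENLABS_AGENT_ID in config:

    import asyncio

    async def text_chat_demo(llamaindex_service):
        # llamaindex_service is assumed to be a configured LlamaIndexService
        service = ElevenLabsService(llamaindex_service)
        session_id = "demo-session"
        started = await service.start_conversation(session_id)  # text-only conversation (use_audio=False)
        if not started.get("success"):
            print(started.get("error"))
            return
        reply = await service.send_text_message("Summarize my uploaded documents", session_id)
        print(reply.get("answer") if reply.get("success") else reply.get("error"))
        print(service.get_conversation_history(session_id))     # user and assistant turns
        await service.end_conversation(session_id)

    # asyncio.run(text_chat_demo(llamaindex_service))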