Commit ea1aabc · committed by ACloudCenter · 1 Parent(s): 36f9277

Simplify Q&A and improve UI layout

Files changed (2):
  1. .gitignore +1 -0
  2. app.py +40 -48
.gitignore CHANGED
@@ -1,3 +1,4 @@
 *.pyc
 __pycache__/
 .env
+gradiouitest.py
app.py CHANGED
@@ -28,7 +28,7 @@ model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(dev
 @spaces.GPU
 def transcribe_audio(audio_filepath):
     if audio_filepath is None:
-        return "Please upload an audio file", "", [], 0
+        return "Please upload an audio file", "", []
 
     # Load audio with torchaudio (handles all formats)
     audio, sample_rate = torchaudio.load(audio_filepath)
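The audio loading and resampling code referenced by the comment above sits between these two hunks and is unchanged, so it does not appear in this diff. Purely as an illustrative sketch (not the file's actual code), a typical torchaudio pattern for producing 16 kHz mono input for an ASR model looks like this; the function name and the 16 kHz target are assumptions:

```python
import torch
import torchaudio

def load_mono_16k(audio_filepath: str) -> torch.Tensor:
    # torchaudio.load returns (waveform, sample_rate) and handles common formats
    audio, sample_rate = torchaudio.load(audio_filepath)
    # Downmix multi-channel audio to mono
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate most ASR checkpoints expect
    if sample_rate != 16000:
        audio = torchaudio.functional.resample(audio, sample_rate, 16000)
    return audio
```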
@@ -56,47 +56,41 @@ def transcribe_audio(audio_filepath):
 
     # Convert output IDs to text
     transcript = model.tokenizer.ids_to_text(output_ids[0].cpu())
-    initial_message = [{"role": "assistant", "content": f"Transcript ready. Ask me questions about it."}]
-    return transcript, transcript, initial_message, 0
+    initial_message = [{"role": "assistant", "content": "Transcript ready. Ask me questions about it."}]
+    return transcript, transcript, initial_message
 
 
-# Enhanced Q&A function with conversation history
+# Simple Q&A function
 @spaces.GPU
-def answer_question_with_history(transcript, question, history, qa_count):
+def answer_question(transcript, question, history):
     if not transcript:
-        return history, qa_count, "Please transcribe audio first"
+        return history, "Please transcribe audio first"
 
-    if qa_count >= 5:
-        history.append({"role": "user", "content": question})
-        history.append({"role": "assistant", "content": "You've reached the maximum of 5 questions for this transcript. Please transcribe new audio to continue."})
-        return history, qa_count, ""
+    if not question:
+        return history, ""
 
-    # Build context from history for better continuity
-    context = ""
-    for msg in history[-4:]:  # Use last 2 exchanges for context
-        if msg.get("role") == "user":
-            context += f"Previous question: {msg['content']}\n"
-
-    with torch.inference_mode(), model.llm.disable_adapter():
-        prompt = f"{context}Current question: {question}\n\nTranscript:\n{transcript}"
+    with torch.inference_mode():
+        prompt = f"Based on this transcript, answer the following question:\n\nTranscript: {transcript}\n\nQuestion: {question}\n\nAnswer:"
         output_ids = model.generate(
             prompts=[[{"role": "user", "content": prompt}]],
-            max_new_tokens=400,
+            max_new_tokens=256,
         )
 
-    # Convert output IDs to text and extract answer
-    answer = model.tokenizer.ids_to_text(output_ids[0].cpu())
-    answer = answer.split("<|im_start|>assistant")[-1]
+    # Convert output IDs to text
+    full_response = model.tokenizer.ids_to_text(output_ids[0].cpu())
 
-    # Add follow-up prompt if under 5 questions
-    if qa_count < 4:
-        answer += f"\n\nQuestion {qa_count + 1}/5 - What else would you like to know?"
+    # Extract just the answer part
+    if "Answer:" in full_response:
+        answer = full_response.split("Answer:")[-1].strip()
     else:
-        answer += "\n\nThis is your final question for this transcript."
+        answer = full_response.strip()
+
+    # Clean up any remaining tags
+    answer = answer.replace("<|im_end|>", "").replace("<|im_start|>", "").strip()
 
     history.append({"role": "user", "content": question})
     history.append({"role": "assistant", "content": answer})
-    return history, qa_count + 1, ""
+    return history, ""
 
 # Build the Gradio interface
 with gr.Blocks(theme=theme) as demo:
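Because the hunk above interleaves removals and additions, here is the simplified handler read straight through, assembled from the context and `+` lines; `spaces`, `torch`, and `model` are defined earlier in app.py and are not part of this diff:

```python
@spaces.GPU
def answer_question(transcript, question, history):
    if not transcript:
        return history, "Please transcribe audio first"

    if not question:
        return history, ""

    with torch.inference_mode():
        prompt = f"Based on this transcript, answer the following question:\n\nTranscript: {transcript}\n\nQuestion: {question}\n\nAnswer:"
        output_ids = model.generate(
            prompts=[[{"role": "user", "content": prompt}]],
            max_new_tokens=256,
        )

    # Convert output IDs to text
    full_response = model.tokenizer.ids_to_text(output_ids[0].cpu())

    # Extract just the answer part
    if "Answer:" in full_response:
        answer = full_response.split("Answer:")[-1].strip()
    else:
        answer = full_response.strip()

    # Clean up any remaining tags
    answer = answer.replace("<|im_end|>", "").replace("<|im_start|>", "").strip()

    history.append({"role": "user", "content": question})
    history.append({"role": "assistant", "content": answer})
    return history, ""
```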
@@ -105,9 +99,7 @@ with gr.Blocks(theme=theme) as demo:
 
     # State variables
     transcript_state = gr.State()
-    qa_history = gr.State([])
-    qa_counter = gr.State(0)
-
+
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### Audio Input")
@@ -130,20 +122,20 @@ with gr.Blocks(theme=theme) as demo:
             gr.Markdown("### Interactive Q&A")
             chatbot = gr.Chatbot(
                 type="messages",
-                height=400,
-                label="Conversation History",
+                height=450,
+                label="",
                 bubble_full_width=False
             )
 
             with gr.Row():
                 question_input = gr.Textbox(
-                    label="Your Question",
-                    placeholder="e.g., What was the main topic? Why did they say that?",
-                    scale=4
+                    label="",
+                    placeholder="Ask a question about the transcript...",
+                    scale=5,
+                    container=False
                 )
-                ask_btn = gr.Button("Ask", variant="primary", scale=1)
-
-            clear_chat_btn = gr.Button("Clear Chat", variant="secondary")
+                ask_btn = gr.Button("Ask", variant="primary", scale=1, size="lg")
+                clear_chat_btn = gr.Button("Clear", variant="secondary", scale=1, size="lg")
 
             gr.Markdown("""
             ### Example Questions to Try:
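With type="messages", gr.Chatbot renders a list of role/content dicts, which is exactly the history format transcribe_audio and answer_question build. A small hypothetical example of that structure:

```python
# Hypothetical chat history in the "messages" format gr.Chatbot expects:
# each entry is a dict with a "role" ("user" or "assistant") and a "content" string.
example_history = [
    {"role": "assistant", "content": "Transcript ready. Ask me questions about it."},
    {"role": "user", "content": "What was the main topic?"},
    {"role": "assistant", "content": "(model answer appears here)"},
]
```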
@@ -158,25 +150,25 @@ with gr.Blocks(theme=theme) as demo:
     transcribe_btn.click(
         fn=transcribe_audio,
         inputs=[audio_input],
-        outputs=[transcript_output, transcript_state, chatbot, qa_counter]
+        outputs=[transcript_output, transcript_state, chatbot]
     )
 
     ask_btn.click(
-        fn=answer_question_with_history,
-        inputs=[transcript_state, question_input, chatbot, qa_counter],
-        outputs=[chatbot, qa_counter, question_input]
+        fn=answer_question,
+        inputs=[transcript_state, question_input, chatbot],
+        outputs=[chatbot, question_input]
     )
 
     question_input.submit(
-        fn=answer_question_with_history,
-        inputs=[transcript_state, question_input, chatbot, qa_counter],
-        outputs=[chatbot, qa_counter, question_input]
+        fn=answer_question,
+        inputs=[transcript_state, question_input, chatbot],
+        outputs=[chatbot, question_input]
     )
 
     clear_chat_btn.click(
-        fn=lambda t: ([{"role": "assistant", "content": "Chat cleared. Ask me questions about the transcript."}] if t else [], 1 if t else 0),
-        inputs=[transcript_state],
-        outputs=[chatbot, qa_counter]
+        fn=lambda: [],
+        inputs=[],
+        outputs=[chatbot]
     )
 
     demo.queue()
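With the qa_counter state gone, each handler's return values map one-to-one onto the components in its outputs list, and clearing the chat reduces to returning an empty message list. A minimal standalone sketch of that clear-button pattern (hypothetical component names, not app.py itself):

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    clear_btn = gr.Button("Clear")
    # Returning [] replaces the chatbot's message list, wiping the conversation.
    clear_btn.click(fn=lambda: [], inputs=[], outputs=[chatbot])

# demo.launch()  # uncomment to run this sketch on its own
```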
 