ACloudCenter committed
Commit 4c3f05f · 1 Parent(s): 5bc92c5

feat: add additional comments for function clarity. Fix pipeline error by using model.generate() directly
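For context on the fix: instead of routing transcription through a pipeline wrapper, the app now calls SALM's generate() directly. Below is a minimal sketch of that call path, assembled from the calls that appear in the diff; the wrapper name transcribe_waveform and its arguments are illustrative, and the generate() keyword arguments are taken as-is from app.py rather than verified against the NeMo documentation.

import torch
from nemo.collections.speechlm2 import SALM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Canary-Qwen once, in bfloat16 eval mode, the same way app.py does
model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(device)

def transcribe_waveform(audio, num_samples):
    # 'audio' is assumed to be a 1-D mono waveform sampled at 16 kHz with 'num_samples' samples
    with torch.inference_mode():
        output_ids = model.generate(
            # The audio locator tag marks where the audio clip is inserted into the prompt
            prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]],
            audios=torch.as_tensor(audio).unsqueeze(0).to(device),  # batch of one waveform
            audio_lens=torch.as_tensor([num_samples]).to(device),   # per-item length in samples
            max_new_tokens=256,
        )
    # Decode the generated token IDs back to text
    return model.tokenizer.ids_to_text(output_ids[0].cpu())

Calling generate() directly keeps the audio preprocessing (lhotse resampling and mono downmix) separate from decoding, which is what the diff below does inside transcribe_audio().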

Files changed (1):
  1. app.py (+26, -14)
app.py CHANGED
@@ -4,53 +4,64 @@ import spaces
 from lhotse import Recording
 from nemo.collections.speechlm2 import SALM
 
+# Set device to use cuda if available and sample rate to 16000 for Nvidia NeMo
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 SAMPLE_RATE = 16000
 
+# Load the model from Hugging Face Hub using Nvidia SALM
 model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(device)
 
+# Define the audio transcription function and use ZeroGPU
 @spaces.GPU
 def transcribe_audio(audio_filepath):
     if audio_filepath is None:
         return "Please upload an audio file", ""
-
+
+    # Load and preprocess audio from the user's file
     rec = Recording.from_file(audio_filepath, recording_id="temp")
+
+    # Resample and convert to mono if needed
     cut = rec.resample(SAMPLE_RATE).to_cut()
     if cut.num_channels > 1:
         cut = cut.to_mono(mono_downmix=True)
-
+
+    # Load audio data
     audio, audio_lens = cut.load_audio()
-
+
+    # Generate transcription
     with torch.inference_mode():
         output_ids = model.generate(
-            prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]],
+            prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]],  # The audio locator tag marks where the audio input is inserted into the prompt
             audios=torch.as_tensor(audio).unsqueeze(0).to(device),
-            audio_lens=torch.as_tensor([audio_lens]).to(device),
-            max_new_tokens=256,
+            audio_lens=torch.as_tensor([audio_lens]).to(device),  # torch.as_tensor is used to convert the audio length to a tensor for model input
+            max_new_tokens=256,  # Maximum number of tokens to generate
         )
-
+
+    # Convert output IDs to text then return the transcript
     transcript = model.tokenizer.ids_to_text(output_ids[0].cpu())
     return transcript, transcript
 
-@spaces.GPU
+# Define the question answering function for transcription queries
+@spaces.GPU
 def answer_question(transcript, question):
     if not transcript:
         return "Please transcribe audio first"
-
     with torch.inference_mode(), model.llm.disable_adapter():
         output_ids = model.generate(
-            prompts=[[{"role": "user", "content": f"{question}\n\n{transcript}"}]],
+            prompts=[[{"role": "user", "content": f"{question}\n\n{transcript}"}]],  # Combine the question and the transcript into a single text prompt
             max_new_tokens=512,
         )
-
+
+    # Convert output IDs to text then return the answer
     answer = model.tokenizer.ids_to_text(output_ids[0].cpu())
     answer = answer.split("<|im_start|>assistant")[-1]
     return answer.strip()
 
+# Build the Gradio interface
 with gr.Blocks(title="Canary-Qwen Transcriber & Q&A") as demo:
     gr.Markdown("# Canary-Qwen Transcriber with Q&A")
-    gr.Markdown("Upload audio to transcribe, then ask questions about it!")
-
+    gr.Markdown("Upload or record audio to transcribe, then ask questions about it.")
+
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
@@ -58,7 +69,8 @@ with gr.Blocks(title="Canary-Qwen Transcriber & Q&A") as demo:
 
         with gr.Column():
             transcript_output = gr.Textbox(label="Transcript", lines=8)
-
+
+    # Define a state variable to hold the transcript
     transcript_state = gr.State()
 
     with gr.Row():