ACloudCenter committed on
Commit
3316662
·
1 Parent(s): e96a4b0

fix: MP3 resampling error with torchaudio preprocessing

Browse files
Files changed (2) hide show
  1. app.py +16 -7
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import torch
3
  import spaces
 
4
  from lhotse import Recording
5
  from lhotse.dataset import DynamicCutSampler
6
  from nemo.collections.speechlm2 import SALM
@@ -19,12 +20,20 @@ def transcribe_audio(audio_filepath):
19
  return "Please upload an audio file", ""
20
 
21
  # Load and preprocess audio from the users file
22
- rec = Recording.from_file(audio_filepath, recording_id="temp")
23
-
24
- # Resample and convert to mono if needed
25
- cut = rec.resample(SAMPLE_RATE).to_cut()
26
- if cut.num_channels > 1:
27
- cut = cut.to_mono(mono_downmix=True)
 
 
 
 
 
 
 
 
28
 
29
  # Load audio data
30
  batch = DynamicCutSampler([cut], max_cuts=1)
@@ -67,7 +76,7 @@ with gr.Blocks(title="Canary-Qwen Transcriber & Q&A") as demo:
67
 
68
  with gr.Row():
69
  with gr.Column():
70
- audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
71
  transcribe_btn = gr.Button("Transcribe", variant="primary")
72
 
73
  with gr.Column():
 
1
  import gradio as gr
2
  import torch
3
  import spaces
4
+ import torchaudio
5
  from lhotse import Recording
6
  from lhotse.dataset import DynamicCutSampler
7
  from nemo.collections.speechlm2 import SALM
 
20
  return "Please upload an audio file", ""
21
 
22
  # Load and preprocess audio from the users file
23
+ audio, sample_rate = torchaudio.load(audio_filepath)
24
+
25
+ # Resample if needed
26
+ if sample_rate != SAMPLE_RATE:
27
+ resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
28
+ audio = resampler(audio)
29
+
30
+ # Convert to mono if needed
31
+ if audio.shape[0] > 1:
32
+ audio = audio.mean(dim=0, keepdim=True)
33
+
34
+ # Create Recording from preprocessed audio
35
+ rec = Recording.from_numpy(audio.numpy(), SAMPLE_RATE, recording_id="temp")
36
+ cut = rec.to_cut()
37
 
38
  # Load audio data
39
  batch = DynamicCutSampler([cut], max_cuts=1)
 
76
 
77
  with gr.Row():
78
  with gr.Column():
79
+ audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input (MP3, WAV, M4A, etc.)")
80
  transcribe_btn = gr.Button("Transcribe", variant="primary")
81
 
82
  with gr.Column():
requirements.txt CHANGED
@@ -5,5 +5,6 @@ lhotse
5
  peft
6
  --extra-index-url https://download.pytorch.org/whl/cu113
7
  torch
 
8
  sacrebleu
9
  seaborn
 
5
  peft
6
  --extra-index-url https://download.pytorch.org/whl/cu113
7
  torch
8
+ torchaudio
9
  sacrebleu
10
  seaborn