Spaces:

ACloudCenter
/

canary-qwen-transcriber-2.5b

Runtime error

ACloudCenter committed on Aug 26

Commit

3316662

1 Parent(s): e96a4b0

fix: MP3 resampling error with torchaudio preprocessing

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 import spaces
 from lhotse import Recording
 from lhotse.dataset import DynamicCutSampler
 from nemo.collections.speechlm2 import SALM
@@ -19,12 +20,20 @@ def transcribe_audio(audio_filepath):
         return "Please upload an audio file", ""
     # Load and preprocess audio from the users file
-    rec = Recording.from_file(audio_filepath, recording_id="temp")
-    # Resample and convert to mono if needed
-    cut = rec.resample(SAMPLE_RATE).to_cut()
-    if cut.num_channels > 1:
-        cut = cut.to_mono(mono_downmix=True)
     # Load audio data
     batch = DynamicCutSampler([cut], max_cuts=1)
@@ -67,7 +76,7 @@ with gr.Blocks(title="Canary-Qwen Transcriber & Q&A") as demo:
     with gr.Row():
         with gr.Column():
-            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
             transcribe_btn = gr.Button("Transcribe", variant="primary")
         with gr.Column():

 import gradio as gr
 import torch
 import spaces
+import torchaudio
 from lhotse import Recording
 from lhotse.dataset import DynamicCutSampler
 from nemo.collections.speechlm2 import SALM
         return "Please upload an audio file", ""
     # Load and preprocess audio from the users file
+    audio, sample_rate = torchaudio.load(audio_filepath)
+    # Resample if needed
+    if sample_rate != SAMPLE_RATE:
+        resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
+        audio = resampler(audio)
+    # Convert to mono if needed
+    if audio.shape[0] > 1:
+        audio = audio.mean(dim=0, keepdim=True)
+    # Create Recording from preprocessed audio
+    rec = Recording.from_numpy(audio.numpy(), SAMPLE_RATE, recording_id="temp")
+    cut = rec.to_cut()
     # Load audio data
     batch = DynamicCutSampler([cut], max_cuts=1)
     with gr.Row():
         with gr.Column():
+            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input (MP3, WAV, M4A, etc.)")
             transcribe_btn = gr.Button("Transcribe", variant="primary")
         with gr.Column():

requirements.txt CHANGED Viewed

@@ -5,5 +5,6 @@ lhotse
 peft
 --extra-index-url https://download.pytorch.org/whl/cu113
 torch
 sacrebleu
 seaborn

 peft
 --extra-index-url https://download.pytorch.org/whl/cu113
 torch
+torchaudio
 sacrebleu
 seaborn