Spaces:
Runtime error
Runtime error
Commit
·
3316662
1
Parent(s):
e96a4b0
fix: MP3 resampling error with torchaudio preprocessing
Browse files- app.py +16 -7
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import spaces
|
|
|
|
| 4 |
from lhotse import Recording
|
| 5 |
from lhotse.dataset import DynamicCutSampler
|
| 6 |
from nemo.collections.speechlm2 import SALM
|
|
@@ -19,12 +20,20 @@ def transcribe_audio(audio_filepath):
|
|
| 19 |
return "Please upload an audio file", ""
|
| 20 |
|
| 21 |
# Load and preprocess audio from the user's file
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# Resample
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Load audio data
|
| 30 |
batch = DynamicCutSampler([cut], max_cuts=1)
|
|
@@ -67,7 +76,7 @@ with gr.Blocks(title="Canary-Qwen Transcriber & Q&A") as demo:
|
|
| 67 |
|
| 68 |
with gr.Row():
|
| 69 |
with gr.Column():
|
| 70 |
-
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
|
| 71 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
| 72 |
|
| 73 |
with gr.Column():
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import spaces
|
| 4 |
+
import torchaudio
|
| 5 |
from lhotse import Recording
|
| 6 |
from lhotse.dataset import DynamicCutSampler
|
| 7 |
from nemo.collections.speechlm2 import SALM
|
|
|
|
| 20 |
return "Please upload an audio file", ""
|
| 21 |
|
| 22 |
# Load and preprocess audio from the user's file
|
| 23 |
+
audio, sample_rate = torchaudio.load(audio_filepath)
|
| 24 |
+
|
| 25 |
+
# Resample if needed
|
| 26 |
+
if sample_rate != SAMPLE_RATE:
|
| 27 |
+
resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
|
| 28 |
+
audio = resampler(audio)
|
| 29 |
+
|
| 30 |
+
# Convert to mono if needed
|
| 31 |
+
if audio.shape[0] > 1:
|
| 32 |
+
audio = audio.mean(dim=0, keepdim=True)
|
| 33 |
+
|
| 34 |
+
# Create Recording from preprocessed audio
|
| 35 |
+
rec = Recording.from_numpy(audio.numpy(), SAMPLE_RATE, recording_id="temp")
|
| 36 |
+
cut = rec.to_cut()
|
| 37 |
|
| 38 |
# Load audio data
|
| 39 |
batch = DynamicCutSampler([cut], max_cuts=1)
|
|
|
|
| 76 |
|
| 77 |
with gr.Row():
|
| 78 |
with gr.Column():
|
| 79 |
+
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input (MP3, WAV, M4A, etc.)")
|
| 80 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
| 81 |
|
| 82 |
with gr.Column():
|
requirements.txt
CHANGED
|
@@ -5,5 +5,6 @@ lhotse
|
|
| 5 |
peft
|
| 6 |
--extra-index-url https://download.pytorch.org/whl/cu113
|
| 7 |
torch
|
|
|
|
| 8 |
sacrebleu
|
| 9 |
seaborn
|
|
|
|
| 5 |
peft
|
| 6 |
--extra-index-url https://download.pytorch.org/whl/cu113
|
| 7 |
torch
|
| 8 |
+
torchaudio
|
| 9 |
sacrebleu
|
| 10 |
seaborn
|