Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -2,11 +2,33 @@ import gradio as gr
 import torch
 import numpy as np
 import librosa
-
 from torchmetrics.functional.audio.nisqa import non_intrusive_speech_quality_assessment as tm_nisqa
 
 SR = 16000
 
+def label_mos(x: float):
+    # ITU-T P.800 ACR-inspired buckets
+    if x < 1.5: return "Bad"
+    if x < 2.5: return "Poor"
+    if x < 3.5: return "Fair"
+    if x < 4.3: return "Good"
+    return "Excellent"
+
+def label_dim(x: float):
+    if x < 1.5: return "Severe"
+    if x < 2.5: return "High"
+    if x < 3.5: return "Moderate"
+    if x < 4.3: return "Low"
+    return "Negligible"
+
+def explain_dim(name: str):
+    return {
+        "Noisiness": "How noisy it sounds (higher = less noise).",
+        "Discontinuity": "Dropouts/glitches (higher = fewer glitches).",
+        "Coloration": "Tone/timbre coloration (higher = more natural).",
+        "Loudness": "Perceived loudness appropriateness (higher = more appropriate)."
+    }[name]
+
 def predict_nisqa(audio):
     if isinstance(audio, tuple):
         _sr, y = audio
@@ -15,31 +37,56 @@ def predict_nisqa(audio):
         y, _ = librosa.load(audio, sr=SR, mono=True)
 
     wav = torch.tensor(y, dtype=torch.float32)
+    mos, noisiness, discontinuity, coloration, loudness = tm_nisqa(wav, SR).detach().cpu().numpy().tolist()
 
-
-
+    metrics = [
+        ("MOS (overall)", mos, label_mos(mos), "Higher = better perceived quality."),
+        ("Noisiness", noisiness, label_dim(noisiness), explain_dim("Noisiness")),
+        ("Discontinuity", discontinuity, label_dim(discontinuity), explain_dim("Discontinuity")),
+        ("Coloration", coloration, label_dim(coloration), explain_dim("Coloration")),
+        ("Loudness", loudness, label_dim(loudness), explain_dim("Loudness")),
+    ]
 
     table = {
-        "Metric": …
-        "Score": …
-
+        "Metric": [m[0] for m in metrics],
+        "Score": [round(m[1], 3) for m in metrics],
+        "Label": [m[2] for m in metrics],
+        "Notes": [m[3] for m in metrics],
     }
-
+    bars = {m[0]: float(m[1]) for m in metrics}
+
+    return table, bars
 
 with gr.Blocks(title="NISQA Speech Quality (MOS) Demo") as demo:
     gr.Markdown(
         """
-        # 🎧 NISQA Speech Quality (MOS)
-        Upload or record speech and get **MOS + quality dimensions**.
-
+        # 🎧 NISQA Speech Quality (MOS)
+        Upload or record speech and get **MOS + quality dimensions**.
+        **Scale:** 1–5 where higher = better.
+        **Dimensions:** higher = fewer issues in that aspect.
         """
     )
-
-    audio = gr.Audio(sources=['upload', 'microphone'], type="filepath", label="Input audio (wav/mp3/m4a...)")
+    audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input audio")
     btn = gr.Button("Predict")
-    out = gr.Dataframe(headers=["Metric", "Score"], label="Results", interactive=False)
 
-
+    out_table = gr.Dataframe(headers=["Metric", "Score", "Label", "Notes"], interactive=False)
+
+    bars = gr.BarPlot(
+        x="Metric", y="Score",
+        y_lim=(0, 5),
+        tooltip=["Score"],
+        width=0.6,
+        interactive=False,
+        label="Scores (0–5, higher = better)"
+    )
+
+    def _bars_to_df(table_dict, bars_dict):
+        import pandas as pd
+        df = pd.DataFrame({"Metric": list(bars_dict.keys()), "Score": list(bars_dict.values())})
+        return table_dict, df
+
+    btn.click(fn=predict_nisqa, inputs=audio, outputs=[out_table, bars], postprocess=False)\
+        .then(fn=_bars_to_df, inputs=[out_table, bars], outputs=[out_table, bars])
 
 if __name__ == "__main__":
     demo.launch()
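Note: the tm_nisqa call above returns five scores in a fixed order, which is exactly what predict_nisqa unpacks. Below is a minimal standalone sketch for sanity-checking that call outside the Space. Assumptions not confirmed by the diff: a recent torchmetrics release with NISQA support (it may download the pretrained NISQA checkpoint on first use), and a random waveform standing in for real 16 kHz speech.

# Minimal standalone sketch of the metric call used in this commit.
# Assumptions: recent torchmetrics with NISQA support; random audio is a
# stand-in for real 16 kHz speech, so the printed scores are meaningless.
import torch
from torchmetrics.functional.audio.nisqa import (
    non_intrusive_speech_quality_assessment as tm_nisqa,
)

wav = torch.randn(3 * 16000)   # 3 s stand-in waveform at 16 kHz
scores = tm_nisqa(wav, 16000)  # tensor of 5 scores, roughly on a 1-5 scale
# Output order matches the unpacking in predict_nisqa above.
mos, noisiness, discontinuity, coloration, loudness = scores.tolist()
print(f"MOS={mos:.2f}  noisiness={noisiness:.2f}  discontinuity={discontinuity:.2f}")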