# NOTE: This listing appears to have been copied from a hosted "Spaces" file
# view (reported file size: 10,013 bytes). The status text, per-line commit
# hashes and line-number gutter that preceded the code were viewer chrome,
# not part of the program, and are summarized here as a comment.
import random
import gradio as gr
import numpy as np
import torch
from src.chatterbox.mtl_tts import SUPPORTED_LANGUAGES, ChatterboxMultilingualTTS
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"🚀 Running on device: {DEVICE}")

# --- Global Model Initialization ---
# Holds the loaded TTS model; stays None until the model has been loaded.
MODEL = None

# Per-language UI defaults, keyed by language code.
#   "da" offers several named reference voices ("audio_options") plus a
#   "default_audio" fallback; "en" has a single "audio" reference clip.
#   "text" is the sample sentence shown for that language.
LANGUAGE_CONFIG = {
    "da": {
        "audio_options": {
            "mic": "voices/mic.wav",
            "nic": "voices/nic.wav",
        },
        "default_audio": "voices/mic.wav",  # Default to mic
        "text": "København er Danmarks hovedstad og ligger på øerne Sjælland og Amager, hvor mange turister besøger de smukke kanaler og historiske bygninger.",
    },
    "en": {
        "audio": "voices/en_f1.flac",
        "text": "Last month, we reached a new milestone with two billion views on our YouTube channel.",
    },
}
# --- UI Helpers ---
def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
    """Return the default reference-audio path for a language, if one exists.

    Args:
        lang: Language code used as the key into ``LANGUAGE_CONFIG``.
        danish_voice: Name of the Danish voice to pick; only consulted when
            ``lang`` is ``"da"`` and that entry defines ``"audio_options"``.

    Returns:
        For Danish with voice options: the path of the requested voice, or the
        entry's ``"default_audio"`` when the voice name is unknown. Otherwise
        the entry's ``"audio"`` value, or ``None`` when nothing is configured.
    """
    lang_entry = LANGUAGE_CONFIG.get(lang, {})
    has_voice_table = lang == "da" and "audio_options" in lang_entry
    if not has_voice_table:
        return lang_entry.get("audio")
    fallback_path = lang_entry.get("default_audio")
    return lang_entry["audio_options"].get(danish_voice, fallback_path)
def default_text_for_ui(lang: str) -> str:
    """Return the sample text configured for ``lang``.

    Yields an empty string when the language has no entry in
    ``LANGUAGE_CONFIG`` or the entry has no ``"text"`` key.
    """
    lang_entry = LANGUAGE_CONFIG.get(lang, {})
    sample_text = lang_entry.get("text", "")
    return sample_text
def get_danish_voice_options() -> list[tuple[str, str]]:
    """Return the Danish voice choices as ``(label, value)`` pairs for the dropdown."""
    voice_ids = ("mic", "nic")
    # The label is simply the voice id with its first letter upper-cased.
    return [(voice_id.capitalize(), voice_id) for voice_id in voice_ids]
def get_supported_languages_display() -> str:
"""Generate a formatted display of all supported languages."""
language_items = []
for code, name in sorted(SUPPORTED_LANGUAGES.items()):
language_items.append(f"**{name}** (`{code}`)")
# Split into 2 lines
mid = len(language_items) // 2
line1 = " • ".join(language_items[:mid])
line2 = " • ".join(language_items[mid:])
return f"""
### Supported Languages)
{line1}
{line2}
"""
def get_or_load_model():
    """Return the shared TTS model, loading it on first use.

    The model is created with ``ChatterboxMultilingualTTS.from_pretrained``
    for ``DEVICE`` and cached in the module-level ``MODEL``. If the loaded
    object exposes ``to`` and reports a device different from ``DEVICE``, it
    is moved there.

    Returns:
        The cached model instance.

    Raises:
        Exception: Any error raised while loading is logged and re-raised.
    """
    global MODEL

    # Fast path: already loaded.
    if MODEL is not None:
        return MODEL

    print("Model not loaded, initializing...")
    try:
        MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
        needs_move = hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE
        if needs_move:
            MODEL.to(DEVICE)
        reported_device = getattr(MODEL, 'device', 'N/A')
        print(f"Model loaded successfully. Internal device: {reported_device}")
    except Exception as load_error:
        print(f"Error loading model: {load_error}")
        raise
    return MODEL
# Eagerly try to load the model when the module is imported. A failure is
# reported but deliberately not re-raised, so the rest of the module still
# executes; get_or_load_model() loads again later if MODEL is still None.
try:
    get_or_load_model()
except Exception as startup_error:
    startup_message = (
        f"CRITICAL: Failed to load model on startup. Application may not function. Error: {startup_error}"
    )
    print(startup_message)
def set_seed(seed: int):
    """Seed torch, Python's ``random`` and NumPy with the same value.

    The CUDA generators are seeded as well, but only when ``DEVICE`` is
    ``"cuda"``.

    Args:
        seed: The seed value handed to every random number generator.
    """
    torch.manual_seed(seed)
    if DEVICE == "cuda":
        # Current device first, then all CUDA devices.
        for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            seed_cuda(seed)
    random.seed(seed)
    np.random.seed(seed)
def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
"""
Decide which audio prompt to use:
- If user provided a path (upload/mic/url), use it.
- Else, fall back to language-specific default (if any).
- For Danish, use the selected voice option.
"""
if provided_path and str(provided_path).strip():
return provided_path
return default_audio_for_ui(language_id, danish_voice)
def generate_tts_audio(
    text_input: str,
    language_id: str,
    audio_prompt_path_input: str | None = None,
    danish_voice_input: str = "mic",
    temperature_input: float = 0.8,
    seed_num_input: int | None = 0,
    cfgw_input: float = 0.5,
) -> tuple[int, np.ndarray]:
    """
    Generate speech audio from text using the Chatterbox Multilingual model, with optional reference audio styling.

    When a reference audio file is provided it is passed to the model as the audio prompt.
    Otherwise the language-specific default prompt is used (for Danish, the selected voice
    option); if no default exists either, the model is called without an audio prompt.

    Args:
        text_input (str): The text to synthesize into speech. Only the first 300 characters are used.
        language_id (str): The language code for synthesis (eg. da, en). The codes offered by the UI come from the model's supported-language list.
        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
        danish_voice_input (str, optional): Danish default voice ("mic" or "nic"); only consulted when language_id is "da" and no reference audio is given. Defaults to "mic".
        temperature_input (float, optional): Controls randomness in generation (higher=more varied). Defaults to 0.8.
        seed_num_input (int, optional): Random seed for reproducible results. 0, None (an emptied number field) or any value that truncates to 0 means no seeding. Defaults to 0.
        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance. Defaults to 0.5, 0 for language transfer.

    Returns:
        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)

    Raises:
        RuntimeError: If the TTS model is not loaded.
    """
    current_model = get_or_load_model()
    exaggeration = 0.5  # fixed; not exposed as a parameter
    if current_model is None:
        raise RuntimeError("TTS model is not loaded.")

    # An emptied number field arrives as None and int(None) would raise, so
    # treat every falsy value as "no seed". Comparing after int() also keeps a
    # fractional value such as 0.4 from silently seeding with 0.
    seed = int(seed_num_input) if seed_num_input else 0
    if seed != 0:
        set_seed(seed)

    print(f"Generating audio for text: '{text_input[:50]}...'")

    # Handle optional audio prompt
    chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
    generate_kwargs = {
        "exaggeration": exaggeration,
        "temperature": temperature_input,
        "cfg_weight": cfgw_input,
    }
    if chosen_prompt:
        generate_kwargs["audio_prompt_path"] = chosen_prompt
        print(f"Using audio prompt: {chosen_prompt}")
    else:
        print("No audio prompt provided; using default voice.")

    wav = current_model.generate(
        text_input[:300],  # Truncate text to max chars
        language_id=language_id,
        **generate_kwargs
    )
    print("Audio generation complete.")
    # NOTE(review): assumes generate() returns a CPU tensor with a leading
    # dimension of size 1 — confirm against the model implementation.
    return (current_model.sr, wav.squeeze(0).numpy())
# Build the Gradio UI: inputs in the left column, generated audio on the right.
with gr.Blocks() as demo:
    gr.Markdown(
        "\n"
        "# Danish TTS Demo 🇩🇰\n"
        "Generate high-quality danish speech from text with reference audio styling.\n"
        "This is a preview of a model that was developed as part of the CoRal project, and is a finetuned version of the Chatterbox Multilingual.\n"
    )

    # Display supported languages
    gr.Markdown(get_supported_languages_display())

    with gr.Row():
        with gr.Column():
            initial_lang = "da"
            text = gr.Textbox(
                value=default_text_for_ui(initial_lang),
                label="Text to synthesize (max chars 300)",
                max_lines=5
            )
            language_id = gr.Dropdown(
                choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                value=initial_lang,
                label="Language",
                info="Select the language for text-to-speech synthesis"
            )
            # Only shown while Danish is the selected language.
            danish_voice = gr.Dropdown(
                choices=get_danish_voice_options(),
                value="mic",
                label="Danish Voice Selection",
                info="Choose between different Danish voice options",
                visible=(initial_lang == "da")
            )
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)",
                value=default_audio_for_ui(initial_lang)
            )
            gr.Markdown(
                "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
                elem_classes=["audio-note"]
            )
            cfg_weight = gr.Slider(
                0.2, 1, step=.05, label="CFG/Pace", value=0.5
            )
            with gr.Accordion("More options", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                temp = gr.Slider(0.05, 2.5, step=.05, label="Temperature", value=.8)
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio")

    def on_language_change(lang):
        """Reset reference audio, sample text and the Danish voice picker for ``lang``."""
        is_danish = (lang == "da")
        default_voice = "mic"  # the voice picker always resets to mic
        return (
            default_audio_for_ui(lang, default_voice),
            default_text_for_ui(lang),
            # One combined update for the dropdown: visibility and value are
            # set together instead of listing the component twice in outputs.
            gr.update(visible=is_danish, value=default_voice),
        )

    def on_danish_voice_change(lang, danish_voice_val):
        """Swap the reference audio to the chosen Danish voice (Danish only)."""
        if lang == "da":
            return default_audio_for_ui(lang, danish_voice_val)
        return gr.update()  # No change if not Danish

    # The handler only needs the selected language; the current reference
    # audio and text were passed before but never read.
    language_id.change(
        fn=on_language_change,
        inputs=[language_id],
        outputs=[ref_wav, text, danish_voice],
        show_progress=False
    )
    danish_voice.change(
        fn=on_danish_voice_change,
        inputs=[language_id, danish_voice],
        outputs=[ref_wav],
        show_progress=False
    )
    # Input order must match the positional parameters of generate_tts_audio.
    run_btn.click(
        fn=generate_tts_audio,
        inputs=[
            text,
            language_id,
            ref_wav,
            danish_voice,
            temp,
            seed_num,
            cfg_weight,
        ],
        outputs=[audio_output],
    )

demo.launch()  # mcp_server=True