import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import gradio as gr
import torch
import spaces
import torchaudio
import uuid
import time
from datetime import timedelta
from lhotse import Recording
from lhotse.dataset import DynamicCutSampler
from nemo.collections.speechlm2 import SALM
from pathlib import Path

# Synthwave-style color palette on the Ocean theme
theme = gr.themes.Ocean(
    primary_hue="indigo",
    secondary_hue="fuchsia",
    neutral_hue="slate",
).set(
    button_large_radius='*radius_sm'
)

# Use CUDA when available; NeMo expects 16 kHz audio
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAMPLE_RATE = 16000
MAX_AUDIO_MINUTES = 120
CHUNK_SECONDS = 40.0
BATCH_SIZE = 192

# Load the model from the Hugging Face Hub using Nvidia's SALM class
model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(device)
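

# Prepare a (possibly long) recording for batched inference: Lhotse loads the
# file, enforces the duration limit, resamples to 16 kHz, downmixes to mono if
# needed, slices the cut into CHUNK_SECONDS windows, and returns a
# DynamicCutSampler that yields batches of up to BATCH_SIZE cuts.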
def as_batches(audio_filepath, utt_id):
    rec = Recording.from_file(audio_filepath, recording_id=utt_id)
    if rec.duration / 60.0 > MAX_AUDIO_MINUTES:
        raise gr.Error(f"Audio file is too long. Maximum duration is {MAX_AUDIO_MINUTES} minutes.")
    cut = rec.resample(SAMPLE_RATE).to_cut()
    if cut.num_channels > 1:
        cut = cut.to_mono(mono_downmix=True)
    return DynamicCutSampler(cut.cut_into_windows(CHUNK_SECONDS), max_cuts=BATCH_SIZE)
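

# ASR mode: each batch of audio windows is transcribed with a prompt containing
# model.audio_locator_tag; the per-window transcripts are joined with spaces, and
# simple stats (word count, elapsed time, words/sec) are reported via the
# transcript box's label.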
# Define the audio transcription function and run it on ZeroGPU
@spaces.GPU
def transcribe_audio(audio_filepath):
    if audio_filepath is None:
        # Match the three outputs wired to this function: transcript text, state, and label update
        return "Please upload an audio file", "", gr.update(label="Waiting for transcription...")
    start_time = time.time()
    utt_id = uuid.uuid4()
    pred_text = []
    for batch in as_batches(audio_filepath, str(utt_id)):
        audio, audio_lens = batch.load_audio(collate=True)
        with torch.inference_mode():
            output_ids = model.generate(
                prompts=[[{"role": "user", "content": f"Transcribe the following using accurate punctuation and capitalization: {model.audio_locator_tag}"}]] * len(batch),
                audios=torch.as_tensor(audio).to(device, non_blocking=True),
                audio_lens=torch.as_tensor(audio_lens).to(device, non_blocking=True),
                max_new_tokens=256,
            )
        texts = [model.tokenizer.ids_to_text(oids) for oids in output_ids.cpu()]
        for t in texts:
            pred_text.append(t)
    transcript = ' '.join(pred_text)
    end_time = time.time()

    # Calculate statistics
    transcription_time = round(end_time - start_time, 2)
    word_count = len(transcript.split())
    words_per_second = round(word_count / transcription_time, 2) if transcription_time > 0 else 0

    # Get filename
    filename = Path(audio_filepath).name

    # Create label with stats
    label_text = f"File: {filename} | Words: {word_count} | Time: {transcription_time}s | WPS: {words_per_second}"
    return transcript, transcript, gr.update(label=label_text)
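

# LLM mode: model.llm.disable_adapter() temporarily disables the adapter on the
# underlying Qwen LLM so it answers from the transcript text alone (no audio is
# passed here). The reply is split out of the chat template, and any
# <think>...</think> block is returned separately for the "Model Thinking" box.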
def transcript_qa(transcript, question):
    if not transcript:
        return "Please transcribe audio first before asking questions.", ""
    if not question or question.strip() == "":
        return "", ""
    with torch.inference_mode(), model.llm.disable_adapter():
        output_ids = model.generate(
            prompts=[[{"role": "user", "content": f"{question}\n\nTranscript: {transcript}"}]],
            max_new_tokens=1024,
        )
    ans = model.tokenizer.ids_to_text(output_ids[0].cpu())
    ans = ans.split("<|im_start|>assistant")[-1]
    thinking = ""
    if "<think>" in ans:
        if "</think>" in ans:
            parts = ans.split("<think>")
            # Get text before <think> tag if any
            before_think = parts[0] if len(parts) > 1 else ""
            # Get content between <think> and </think>
            think_content = parts[1] if len(parts) > 1 else parts[0]
            # Split only on the first closing tag to avoid unpacking errors
            thinking, after_think = think_content.split("</think>", 1)
            thinking = thinking.strip()
            # Combine text before and after thinking
            ans = before_think + after_think
    ans = ans.strip()
    if not ans:
        ans = "I couldn't generate a response. Please try rephrasing your question."
    return ans, thinking
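

# Small helpers for the Transcribe button's event chain: the button is disabled
# while a transcription job is running and re-enabled once it finishes.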
def disable_transcribe():
    return gr.update(interactive=False)


def enable_transcribe():
    return gr.update(interactive=True)


# Load external CSS and HTML
def load_template(filename):
    template_path = Path(__file__).parent / "templates" / filename
    return template_path.read_text() if template_path.exists() else ""
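
# Note: load_template is an optional helper for pulling CSS/HTML snippets from a
# templates/ directory; it is not referenced elsewhere in this file.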


# Build the Gradio interface
with gr.Blocks(theme=theme) as demo:
    # Simple banner image - responsive and clean
    gr.HTML("""
    <div style="width: 100%; margin-bottom: 20px;">
        <img src="https://huggingface.co/spaces/ACloudCenter/canary-qwen-transcriber-2.5b/resolve/main/public/banner.png"
             style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
             alt="Canary-Qwen Transcriber Banner">
    </div>
    """)
| gr.Markdown("## Upload an Audio File, Choose an Example File, or Record Yourself Then Ask Questions About the Transcript.") | |
| gr.Markdown('''NVIDIA NeMo Canary-Qwen-2.5B is an English speech recognition model that achieves state-of-the art | |
| performance on multiple English speech benchmarks. With 2.5 billion parameters and running at 418 RTFx, | |
| Canary-Qwen-2.5B supports automatic speech-to-text recognition (ASR) in English with punctuation and capitalization | |
| (PnC). The model works in two modes: as a transcription tool (ASR mode) and as an LLM (LLM mode). In ASR mode, the | |
| model is only capable of transcribing the speech into text, but does not retain any LLM-specific skills such as reasoning. | |
| In LLM mode, the model retains all of the original LLM capabilities, which can be used to post-process the transcript, e.g. | |
| summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only | |
| its transcript. This model is ready for commercial use. All example audio was generated using Microsoft VibeVoice, found in my other space | |
| - [Conference Generator VibeVoice](https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice)''') | |
    with gr.Tabs():
        with gr.Tab("Transcribe"):
            # State variables
            transcript_state = gr.State("")

            # Example questions
            example_questions = [
                ["Can you summarize this meeting?"],
                ["Please provide bullet points of the key items."],
                ["What is the TL;DR of this meeting?"],
                ["What was the main take-away?"],
                ["What was the main topic?"],
            ]

            # Define file paths as variables
            ai_ted = "public/audio_files/ai_tedtalk.wav"
            financial = "public/audio_files/financial_meeting.wav"
            military = "public/audio_files/military_meeting.wav"
            oil = "public/audio_files/oil_meeting.wav"
            political = "public/audio_files/political_speech.wav"
            telehealth = "public/audio_files/telehealth_meeting.wav"
            game_dev = "public/audio_files/game_create_meeting.wav"
            product = "public/audio_files/product_meeting.wav"
            # Audio Input and Transcript
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Audio Input")
                    audio_input = gr.Audio(
                        sources=["microphone", "upload"],
                        type="filepath",
                        label="Record/Upload Audio (MP3, WAV, M4A, etc.)",
                        show_download_button=True
                    )
                    gr.Examples(
                        examples=[
                            [ai_ted],
                            [financial],
                            [military],
                            [oil],
                            [political],
                            [telehealth],
                            [game_dev],
                            [product]
                        ],
                        inputs=audio_input,
                        label="Example Audio Files",
                        example_labels=["AI TED Talk", "Financial Meeting", "Military Meeting", "Oil & Gas Meeting",
                                        "Political Speech", "Telehealth Meeting", "Game Dev Meeting", "Product Meeting"]
                    )
                    transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
                    clear_audio_btn = gr.Button("Clear Audio")

                with gr.Column(scale=1):
                    gr.Markdown("### Transcript")
                    transcript_output = gr.Textbox(
                        label="Waiting for transcription...",
                        lines=12,
                        placeholder="Transcript will appear here after clicking 'Transcribe Audio'...",
                        max_lines=12,
                        autoscroll=True
                    )
                    clear_transcript_btn = gr.Button("Clear Transcript")

            # Spacing
            gr.Markdown("---")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Interactive Q&A")
                    gr.Markdown("#### About Context-Aware Q&A")
| gr.Markdown("""The model retains the full transcript context, allowing you to ask follow-up questions | |
| naturally without re-stating information. It understands references like 'they', 'it', or 'that topic'.""") | |
| gr.Markdown("#### Example Questions") | |
| # Examples will be added after msg is defined | |
| example_container = gr.Column() | |
                with gr.Column(scale=3):
                    # Add thinking display above chat
                    with gr.Accordion("🧠 Model Thinking", open=False):
                        thinking_box = gr.Textbox(
                            label="",
                            placeholder="The model's reasoning will appear here when available...",
                            lines=6,
                            max_lines=10,
                            interactive=False
                        )

                    chatbot = gr.Chatbot(
                        label="Response",
                        type="messages",
                        height=400,
                        show_copy_button=True,
                        autoscroll=True
                    )

                    with gr.Row():
                        msg = gr.Textbox(
                            placeholder="Ask a question about the transcript...",
                            label="Your Question",
                            lines=1
                        )
                        submit_chat_btn = gr.Button("Send", variant="primary", scale=1)

                    clear_chat_btn = gr.Button("Clear Chat", size="sm")
            # Event handlers
            def submit_question(question, transcript):
                if not question or question.strip() == "":
                    yield "", [], ""
                    return
                answer, thinking = transcript_qa(transcript, question)
                # Just show the current Q&A, no history
                messages = [
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": answer}
                ]
                yield "", messages, thinking
            # Add examples inside the left column container
            with example_container:
                gr.Examples(
                    examples=example_questions,
                    inputs=msg,
                    outputs=[msg, chatbot, thinking_box],
                    fn=lambda q: submit_question(q, transcript_state.value),
                    cache_examples=False,
                    label=""
                )
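
            # Transcribe flow: disable the button, clear any previous chat and thinking
            # output, run transcription (filling the transcript box, the saved state,
            # and the stats label), then re-enable the button.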
            transcribe_btn.click(
                fn=disable_transcribe,
                outputs=[transcribe_btn]
            ).then(
                fn=lambda: ([], ""),
                outputs=[chatbot, thinking_box]
            ).then(
                fn=transcribe_audio,
                inputs=[audio_input],
                outputs=[transcript_output, transcript_state, transcript_output]  # Third output updates the label
            ).then(
                fn=enable_transcribe,
                outputs=[transcribe_btn]
            )

            clear_audio_btn.click(
                fn=lambda: None,
                outputs=[audio_input]
            )

            clear_transcript_btn.click(
                fn=lambda: ("", "", gr.update(label="Waiting for transcription...")),
                outputs=[transcript_output, transcript_state, transcript_output]
            )

            msg.submit(
                fn=submit_question,
                inputs=[msg, transcript_state],
                outputs=[msg, chatbot, thinking_box]
            )

            submit_chat_btn.click(
                fn=submit_question,
                inputs=[msg, transcript_state],
                outputs=[msg, chatbot, thinking_box]
            )

            clear_chat_btn.click(
                fn=lambda: ([], ""),
                outputs=[chatbot, thinking_box]
            )
| with gr.Tab("Architecture"): | |
| gr.Markdown("### Model Performance") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.Markdown(""" | |
| #### Industry-Leading Performance | |
| Canary ranks at the top of the HuggingFace Open ASR Leaderboard with an average word error rate (WER) of **6.67%**. It outperforms all other open-source models by a wide margin. | |
| #### Training Data | |
| Canary is trained on a combination of public and in-house data: | |
| - **85K hours** of transcribed speech for speech recognition | |
| - NVIDIA NeMo text translation models used to generate translations of the original transcripts in all supported languages | |
| Despite using an order of magnitude less data, Canary outperforms the similarly sized Whisper-large-v3 and SeamlessM4T-Medium-v1 models on both transcription and translation tasks. | |
| """) | |
| gr.Markdown("### Benchmark Results") | |
| gr.Markdown(""" | |
| #### Word Error Rate (WER) on MCV 16.1 Test Sets | |
| On the MCV 16.1 test sets for English, Spanish, French, and German, Canary achieved a WER of **5.77** (lower is better). | |
| """) | |
                with gr.Column(scale=3):
                    gr.HTML("""
                    <div style="text-align: center; padding: 20px;">
                        <img src="https://huggingface.co/spaces/ACloudCenter/canary-qwen-transcriber-2.5b/resolve/main/public/nvidia-speech.png"
                             style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
                             alt="NVIDIA Canary Architecture">
                        <p style="margin-top: 10px; color: #666; font-size: 14px;">NVIDIA ASR</p>
                    </div>
                    """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("""
                    | Model | Average WER |
                    |-------|-------------|
                    | **Canary** | **5.77** |
                    | SeamlessM4T-v2 | 6.41 |
                    | Whisper-large-v3 | 8.05 |
                    | SeamlessM4T-v1 | 9.48 |
                    """)
                with gr.Column(scale=3):
                    gr.Markdown("""
                    #### Translation BLEU Scores

                    **From English** (ES, FR, DE on FLEURS & MExpresso):
                    - Canary: **30.57** BLEU

                    **To English** (ES, FR, DE on FLEURS & CoVoST):
                    - Canary: **34.25** BLEU

                    *(Higher BLEU scores indicate better translation quality)*
                    """)
| gr.Markdown("---") | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown(""" | |
| ### Canary Architecture Details | |
| Canary is an encoder-decoder model built on NVIDIA innovations: | |
| - **Encoder**: Fast-Conformer - an efficient Conformer architecture optimized for ~3x savings on compute and ~4x savings on memory | |
| - **Processing**: Audio is processed as log-mel spectrogram features | |
| - **Decoder**: Transformer decoder generates output text tokens auto-regressively | |
| - **Control**: Special tokens control whether Canary performs transcription or translation | |
| - **Tokenizer**: Concatenated tokenizer offers explicit control of output token space | |
| #### Licensing | |
| - **Model weights**: CC BY-NC 4.0 license (research-friendly, non-commercial) | |
| - **Training code**: Apache 2.0 license (available from NeMo) | |
| For more information about accessing Canary locally and building on top of it, see the [NVIDIA/NeMo GitHub repository](https://github.com/NVIDIA/NeMo). | |
| """) | |
                with gr.Column(scale=3):
                    gr.HTML("""
                    <div style="text-align: center; padding: 20px;">
                        <img src="https://huggingface.co/spaces/ACloudCenter/canary-qwen-transcriber-2.5b/resolve/main/public/chart1.png"
                             style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
                             alt="ASR models RTFx vs accuracy benchmark chart">
                        <p style="margin-top: 10px; color: #666; font-size: 14px;">ASR Models RTFx vs Accuracy Benchmarks</p>
                    </div>
                    """)

demo.queue()
demo.launch()