import gradio as gr
from transformers import AutoTokenizer
import json
import traceback
from typing import Optional, List, Tuple

# Popular tokenizer models
TOKENIZER_OPTIONS = {
    # Qwen Series
    "Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
    "Qwen/Qwen3-1.7B": "Qwen 3 (1.7B)",
    "Qwen/Qwen3-4B": "Qwen 3 (4B)",
    "Qwen/Qwen3-8B": "Qwen 3 (8B)",
    "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
    "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
    "Qwen/Qwen2-7B": "Qwen 2 (7B)",
    "Qwen/Qwen2-72B": "Qwen 2 (72B)",
    "Qwen/Qwen-7B": "Qwen 1 (7B)",
    # Llama Series
    "meta-llama/Llama-3.2-1B": "Llama 3.2 (1B)",
    "meta-llama/Llama-3.2-3B": "Llama 3.2 (3B)",
    "meta-llama/Llama-3.1-8B": "Llama 3.1 (8B)",
    "meta-llama/Llama-3.1-70B": "Llama 3.1 (70B)",
    "meta-llama/Llama-2-7b-hf": "Llama 2 (7B)",
    "meta-llama/Llama-2-13b-hf": "Llama 2 (13B)",
    "meta-llama/Llama-2-70b-hf": "Llama 2 (70B)",
    # Other Popular Models
    "openai-community/gpt2": "GPT-2",
    "google/gemma-2b": "Gemma (2B)",
    "google/gemma-7b": "Gemma (7B)",
    "mistralai/Mistral-7B-v0.1": "Mistral (7B)",
    "mistralai/Mixtral-8x7B-v0.1": "Mixtral (8x7B)",
    "deepseek-ai/deepseek-coder-6.7b-base": "DeepSeek Coder (6.7B)",
    "microsoft/phi-2": "Phi-2",
    "microsoft/Phi-3-mini-4k-instruct": "Phi-3 Mini",
    "01-ai/Yi-6B": "Yi (6B)",
    "01-ai/Yi-34B": "Yi (34B)",
    "google-t5/t5-base": "T5 Base",
    "google-bert/bert-base-uncased": "BERT Base (uncased)",
    "google-bert/bert-base-cased": "BERT Base (cased)",
    "EleutherAI/gpt-neox-20b": "GPT-NeoX (20B)",
    "bigscience/bloom-560m": "BLOOM (560M)",
    "facebook/opt-350m": "OPT (350M)",
    "stabilityai/stablelm-base-alpha-7b": "StableLM (7B)",
}

# Cache for loaded tokenizers
tokenizer_cache = {}


def load_tokenizer(model_id: str):
    """Load a tokenizer with caching."""
    if model_id not in tokenizer_cache:
        try:
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True,
                use_fast=True  # Use the fast (Rust-based) tokenizer when available
            )
        except Exception as e:
            # Fall back to the slow tokenizer if the fast one is unavailable
            try:
                tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    use_fast=False
                )
            except Exception:
                raise e
    return tokenizer_cache[model_id]
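
# Illustrative sketch (not executed by the app): repeated calls for the same
# model ID return the same cached instance, so only the first call pays the
# download/load cost. "openai-community/gpt2" is just a small model chosen
# for the example; it assumes network access to the Hugging Face Hub.
#
#   tok_a = load_tokenizer("openai-community/gpt2")  # loads from Hub/disk
#   tok_b = load_tokenizer("openai-community/gpt2")  # served from the cache
#   assert tok_a is tok_b
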
def tokenize_text(
    text: str,
    model_id: str,
    add_special_tokens: bool = True,
    show_special_tokens: bool = True,
    custom_model_id: Optional[str] = None
) -> Tuple[str, str, str, str, str]:
    """
    Tokenize text using the selected tokenizer.

    Returns:
        Tuple of (tokens_json, token_ids, decoded_text, token_info_json, stats)
    """
    try:
        # Use custom model ID if provided
        actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id

        if not actual_model_id:
            return "", "", "", "", "Please select or enter a tokenizer model."
        if not text:
            return "", "", "", "", "Please enter some text to tokenize."

        # Load tokenizer
        tokenizer = load_tokenizer(actual_model_id)

        # Tokenize
        encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
        tokens = tokenizer.convert_ids_to_tokens(encoded)

        # Decode
        decoded = tokenizer.decode(encoded, skip_special_tokens=not show_special_tokens)

        # Create detailed token information
        special_ids = getattr(tokenizer, "all_special_ids", [])
        token_info = []
        for i, (token, token_id) in enumerate(zip(tokens, encoded)):
            # Try to get the actual string representation of the token
            try:
                token_str = tokenizer.convert_tokens_to_string([token])
            except Exception:
                token_str = token
            token_info.append({
                "index": i,
                "token": token,
                "token_id": token_id,
                "text": token_str,
                "is_special": token_id in special_ids
            })

        # Format outputs
        tokens_display = json.dumps(tokens, ensure_ascii=False, indent=2)
        token_ids_display = str(encoded)
        token_info_json = json.dumps(token_info, ensure_ascii=False, indent=2)

        # Statistics (the empty-input guard above keeps the ratios well defined)
        stats = f"""Statistics:
• Model: {actual_model_id}
• Number of tokens: {len(tokens)}
• Number of characters: {len(text)}
• Tokens per character: {len(tokens) / len(text):.2f}
• Characters per token: {len(text) / len(tokens):.2f}
• Vocabulary size: {tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else 'N/A'}
• Special tokens: {', '.join(tokenizer.all_special_tokens) if hasattr(tokenizer, 'all_special_tokens') else 'N/A'}"""

        return tokens_display, token_ids_display, decoded, token_info_json, stats

    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return error_msg, "", "", "", ""


def decode_tokens(
    token_ids_str: str,
    model_id: str,
    skip_special_tokens: bool = False,
    custom_model_id: Optional[str] = None
) -> Tuple[str, str, str]:
    """Decode token IDs back to text.

    Returns:
        Tuple of (decoded_text, tokens_json, stats)
    """
    try:
        # Use custom model ID if provided
        actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id

        if not actual_model_id:
            return "Please select or enter a tokenizer model.", "", ""

        # Parse token IDs
        token_ids_str = token_ids_str.strip()
        if not token_ids_str:
            return "", "", ""

        if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
            token_ids = json.loads(token_ids_str)
        else:
            # Try to parse as comma- or space-separated values
            token_ids = [int(x.strip()) for x in token_ids_str.replace(',', ' ').split()]

        if not token_ids:
            return "", "", ""

        # Load tokenizer and decode
        tokenizer = load_tokenizer(actual_model_id)
        decoded = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        # Also show tokens
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)

        # Statistics (empty inputs return early, so the ratio is well defined)
        stats = f"""Statistics:
• Model: {actual_model_id}
• Token count: {len(tokens)}
• Character count: {len(decoded)}
• Characters per token: {len(decoded) / len(tokens):.2f}
• Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}"""

        return decoded, tokens_json, stats

    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return error_msg, "", ""
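
# Illustrative round trip (a sketch, not executed by the app; the exact IDs
# depend on the tokenizer — the values below assume byte-level BPE as in GPT-2):
#
#   ids = tokenize_text("Hello world", "openai-community/gpt2",
#                       add_special_tokens=False)[1]   # e.g. "[15496, 995]"
#   decode_tokens(ids, "openai-community/gpt2")[0]     # "Hello world"
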
def compare_tokenizers(
    text: str,
    model_ids: List[str],
    add_special_tokens: bool = True
) -> str:
    """Compare tokenization across multiple models."""
    if not model_ids:
        return "Please select at least one model to compare."
    if not text:
        return "Please enter some text to compare."

    results = []
    for model_id in model_ids:
        try:
            tokenizer = load_tokenizer(model_id)
            encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
            tokens = tokenizer.convert_ids_to_tokens(encoded)
            results.append({
                "model": model_id,
                "token_count": len(tokens),
                "tokens": tokens[:50],      # Show first 50 tokens
                "token_ids": encoded[:50]   # Show first 50 IDs
            })
        except Exception as e:
            results.append({
                "model": model_id,
                "error": str(e)
            })

    # Sort by token count (models that errored sort last)
    results.sort(key=lambda x: x.get("token_count", float('inf')))

    # Format output
    output = "# Tokenizer Comparison\n\n"
    output += f"Input text length: {len(text)} characters\n\n"

    for result in results:
        if "error" in result:
            output += f"## {result['model']}\n"
            output += f"Error: {result['error']}\n\n"
        else:
            output += f"## {result['model']}\n"
            output += f"**Token count:** {result['token_count']} "
            output += f"(ratio: {result['token_count'] / len(text):.2f} tokens/char)\n\n"
            output += f"**First tokens:** {result['tokens']}\n\n"
            if len(result['tokens']) == 50:
                output += "*(showing first 50 tokens)*\n\n"

    return output


def analyze_vocabulary(model_id: str, custom_model_id: Optional[str] = None) -> str:
    """Analyze tokenizer vocabulary."""
    try:
        actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id

        if not actual_model_id:
            return "Please select or enter a tokenizer model."

        tokenizer = load_tokenizer(actual_model_id)

        # Get vocabulary information
        vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(tokenizer.get_vocab())

        # Get special tokens
        special_tokens = {}
        if hasattr(tokenizer, 'special_tokens_map'):
            special_tokens = tokenizer.special_tokens_map

        # Get some example tokens
        vocab = tokenizer.get_vocab()
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])[:100]  # First 100 tokens by ID

        output = f"""# Tokenizer Vocabulary Analysis

**Model:** {actual_model_id}
**Vocabulary Size:** {vocab_size:,}
**Tokenizer Type:** {tokenizer.__class__.__name__}

## Special Tokens
```json
{json.dumps(special_tokens, ensure_ascii=False, indent=2)}
```

## Token Settings
• Padding Token: {tokenizer.pad_token if tokenizer.pad_token else 'None'}
• BOS Token: {tokenizer.bos_token if tokenizer.bos_token else 'None'}
• EOS Token: {tokenizer.eos_token if tokenizer.eos_token else 'None'}
• UNK Token: {tokenizer.unk_token if tokenizer.unk_token else 'None'}
• SEP Token: {tokenizer.sep_token if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token else 'None'}
• CLS Token: {tokenizer.cls_token if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token else 'None'}
• Mask Token: {tokenizer.mask_token if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token else 'None'}

## First 100 Tokens in Vocabulary
Token → ID
"""
        for token, token_id in sorted_vocab:
            # Escape special characters for display
            display_token = repr(token) if not token.isprintable() else token
            output += f"{display_token} → {token_id}\n"

        return output

    except Exception as e:
        return f"Error: {str(e)}\n{traceback.format_exc()}"
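
# Minimal smoke test (a sketch; not wired into the UI — run it manually if
# desired). It assumes network access to the Hugging Face Hub, and
# "openai-community/gpt2" is an arbitrary small model chosen for the check;
# byte-level BPE tokenizers such as GPT-2's round-trip plain ASCII losslessly.
def _smoke_test(model_id: str = "openai-community/gpt2") -> bool:
    """Return True if encode/decode round-trips a sample string."""
    tokenizer = load_tokenizer(model_id)
    sample = "Tokenizers split text into subword units."
    ids = tokenizer.encode(sample, add_special_tokens=False)
    return tokenizer.decode(ids) == sample
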
# Create Gradio interface
with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🤗 Tokenizer Playground

    A comprehensive tool for NLP researchers to experiment with various Hugging Face tokenizers.
    Supports popular models including **Qwen**, **Llama**, **Mistral**, **GPT**, and many more.

    ### Features:
    - 🔤 **Tokenize & Detokenize** text with any Hugging Face tokenizer
    - 📊 **Compare** tokenization across multiple models
    - 📖 **Analyze** vocabulary and special tokens
    - 🎯 **Support** for custom model IDs from the Hugging Face Hub
    """)

    with gr.Tab("🔤 Tokenize"):
        with gr.Row():
            with gr.Column(scale=3):
                tokenize_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text to tokenize...",
                    lines=5,
                    max_lines=15,
                    autoscroll=False
                )
            with gr.Column(scale=1):
                tokenize_model = gr.Dropdown(
                    label="Select Tokenizer",
                    choices=list(TOKENIZER_OPTIONS.keys()),
                    value="Qwen/Qwen3-0.6B",
                    allow_custom_value=False
                )
                tokenize_custom_model = gr.Textbox(
                    label="Or Enter Custom Model ID",
                    placeholder="e.g., facebook/bart-base",
                    info="Override the selection above with any HF model"
                )
                add_special = gr.Checkbox(label="Add Special Tokens", value=True)
                show_special = gr.Checkbox(label="Show Special Tokens in Decoded", value=True)

        tokenize_btn = gr.Button("Tokenize", variant="primary")

        with gr.Row():
            with gr.Column():
                tokens_output = gr.Textbox(label="Tokens", lines=10, max_lines=20,
                                           autoscroll=False, show_copy_button=True)
            with gr.Column():
                token_ids_output = gr.Textbox(label="Token IDs", lines=10, max_lines=20,
                                              autoscroll=False, show_copy_button=True)

        with gr.Row():
            with gr.Column():
                decoded_output = gr.Textbox(label="Decoded Text (Verification)", lines=5, max_lines=15,
                                            autoscroll=False, show_copy_button=True)
            with gr.Column():
                token_info_output = gr.Textbox(label="Detailed Token Information", lines=10, max_lines=20,
                                               autoscroll=False, show_copy_button=True)

        stats_output = gr.Textbox(label="Statistics", lines=7, max_lines=15, autoscroll=False)

        tokenize_btn.click(
            fn=tokenize_text,
            inputs=[tokenize_input, tokenize_model, add_special, show_special, tokenize_custom_model],
            outputs=[tokens_output, token_ids_output, decoded_output, token_info_output, stats_output]
        )
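
    # The Detokenize tab below accepts token IDs in any of the formats parsed
    # by decode_tokens above (values here are illustrative only):
    #   [101, 2023, 2003]    JSON-style list
    #   101, 2023, 2003      comma separated
    #   101 2023 2003        space separated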
    with gr.Tab("🔄 Detokenize"):
        with gr.Row():
            with gr.Column(scale=3):
                decode_input = gr.Textbox(
                    label="Token IDs",
                    placeholder="Enter token IDs as a list [101, 2023, ...] or space/comma separated",
                    lines=5,
                    max_lines=15,
                    autoscroll=False
                )
            with gr.Column(scale=1):
                decode_model = gr.Dropdown(
                    label="Select Tokenizer",
                    choices=list(TOKENIZER_OPTIONS.keys()),
                    value="Qwen/Qwen3-0.6B"
                )
                decode_custom_model = gr.Textbox(
                    label="Or Enter Custom Model ID",
                    placeholder="e.g., facebook/bart-base"
                )
                skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)

        decode_btn = gr.Button("Decode", variant="primary")

        decode_output = gr.Textbox(
            label="Decoded Text",
            lines=10,
            max_lines=20,
            interactive=False,
            show_copy_button=True,
            placeholder="Decoded text will appear here...",
            autoscroll=False
        )
        decode_stats = gr.Textbox(
            label="Statistics",
            lines=5,
            interactive=False
        )

        with gr.Accordion("Show Tokens", open=False):
            decode_tokens_output = gr.Textbox(
                label="Tokens",
                lines=10,
                max_lines=20,
                interactive=False,
                show_copy_button=True,
                autoscroll=False
            )

        decode_btn.click(
            fn=decode_tokens,
            inputs=[decode_input, decode_model, skip_special, decode_custom_model],
            outputs=[decode_output, decode_tokens_output, decode_stats]
        )

    with gr.Tab("📊 Compare"):
        compare_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to compare tokenization across models...",
            lines=5,
            max_lines=15,
            autoscroll=False
        )
        compare_models = gr.CheckboxGroup(
            label="Select Models to Compare",
            choices=list(TOKENIZER_OPTIONS.keys()),
            value=["Qwen/Qwen3-0.6B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
        )
        compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
        compare_btn = gr.Button("Compare Tokenizers", variant="primary")
        compare_output = gr.Markdown()

        compare_btn.click(
            fn=compare_tokenizers,
            inputs=[compare_input, compare_models, compare_add_special],
            outputs=compare_output
        )

    with gr.Tab("📖 Vocabulary"):
        with gr.Row():
            vocab_model = gr.Dropdown(
                label="Select Tokenizer",
                choices=list(TOKENIZER_OPTIONS.keys()),
                value="Qwen/Qwen3-0.6B"
            )
            vocab_custom_model = gr.Textbox(
                label="Or Enter Custom Model ID",
                placeholder="e.g., facebook/bart-base"
            )
        vocab_btn = gr.Button("Analyze Vocabulary", variant="primary")
        vocab_output = gr.Markdown()

        vocab_btn.click(
            fn=analyze_vocabulary,
            inputs=[vocab_model, vocab_custom_model],
            outputs=vocab_output
        )
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## About This Tool

        This tokenizer playground provides researchers and developers with an easy way to
        experiment with various tokenizers from the Hugging Face Model Hub.

        ### Supported Models

        **Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes)

        **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)

        **Other Popular Models:** GPT-2, Gemma, Mistral, Mixtral, DeepSeek, Phi, Yi, T5, BERT,
        GPT-NeoX, BLOOM, OPT, StableLM

        ### Custom Models

        You can use any tokenizer from the Hugging Face Hub by entering its model ID in the
        "Custom Model ID" field. For example:
        - `facebook/bart-base`
        - `EleutherAI/gpt-j-6b`
        - `bigscience/bloom`

        ### Features Explanation

        - **Tokenize:** Convert text into tokens and token IDs
        - **Detokenize:** Convert token IDs back to text
        - **Compare:** See how different tokenizers handle the same text
        - **Vocabulary:** Explore tokenizer vocabulary and special tokens

        ### Tips

        1. Different tokenizers can produce very different token counts for the same text
        2. Special tokens (like `[CLS]`, `[SEP]`, `<s>`, `</s>`) are model-specific
        3. Subword tokenization (used by most modern models) allows handling of out-of-vocabulary words
        4. Token efficiency affects model performance and API costs

        ### Resources

        - [Hugging Face Tokenizers Documentation](https://huggingface.co/docs/transformers/main_classes/tokenizer)
        - [Understanding Tokenization](https://huggingface.co/docs/transformers/tokenizer_summary)
        - [Model Hub](https://huggingface.co/models)

        ---
        """)

# Launch the app
if __name__ == "__main__":
    app.launch()
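
# To expose the app on a temporary public URL instead of localhost only, one
# option (assuming default settings otherwise) is:
#
#   app.launch(share=True)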