import gradio as gr
from transformers import AutoTokenizer
import json
import traceback
from typing import List, Optional, Tuple
# Popular tokenizer models
TOKENIZER_OPTIONS = {
    # Qwen Series
    "Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
    "Qwen/Qwen3-1.7B": "Qwen 3 (1.7B)",
    "Qwen/Qwen3-4B": "Qwen 3 (4B)",
    "Qwen/Qwen3-8B": "Qwen 3 (8B)",
    "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
    "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
    "Qwen/Qwen2-7B": "Qwen 2 (7B)",
    "Qwen/Qwen2-72B": "Qwen 2 (72B)",
    "Qwen/Qwen-7B": "Qwen 1 (7B)",
    # Llama Series
    "meta-llama/Llama-3.2-1B": "Llama 3.2 (1B)",
    "meta-llama/Llama-3.2-3B": "Llama 3.2 (3B)",
    "meta-llama/Llama-3.1-8B": "Llama 3.1 (8B)",
    "meta-llama/Llama-3.1-70B": "Llama 3.1 (70B)",
    "meta-llama/Llama-2-7b-hf": "Llama 2 (7B)",
    "meta-llama/Llama-2-13b-hf": "Llama 2 (13B)",
    "meta-llama/Llama-2-70b-hf": "Llama 2 (70B)",
    # Other Popular Models
    "openai-community/gpt2": "GPT-2",
    "google/gemma-2b": "Gemma (2B)",
    "google/gemma-7b": "Gemma (7B)",
    "mistralai/Mistral-7B-v0.1": "Mistral (7B)",
    "mistralai/Mixtral-8x7B-v0.1": "Mixtral (8x7B)",
    "deepseek-ai/deepseek-coder-6.7b-base": "DeepSeek Coder (6.7B)",
    "microsoft/phi-2": "Phi-2",
    "microsoft/Phi-3-mini-4k-instruct": "Phi-3 Mini",
    "01-ai/Yi-6B": "Yi (6B)",
    "01-ai/Yi-34B": "Yi (34B)",
    "google-t5/t5-base": "T5 Base",
    "google-bert/bert-base-uncased": "BERT Base (uncased)",
    "google-bert/bert-base-cased": "BERT Base (cased)",
    "EleutherAI/gpt-neox-20b": "GPT-NeoX (20B)",
    "bigscience/bloom-560m": "BLOOM (560M)",
    "facebook/opt-350m": "OPT (350M)",
    "stabilityai/stablelm-base-alpha-7b": "StableLM (7B)",
}
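
# Note: the dropdowns below use only the dict keys (repo IDs) as choices, so
# the human-readable labels are currently unused. If desired, Gradio dropdowns
# also accept (label, value) tuples, e.g.:
#   choices=[(label, repo_id) for repo_id, label in TOKENIZER_OPTIONS.items()]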

# Cache for loaded tokenizers
tokenizer_cache = {}

def load_tokenizer(model_id: str):
    """Load a tokenizer, caching it for reuse across requests."""
    if model_id not in tokenizer_cache:
        try:
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True,
                use_fast=True,  # Use the fast (Rust) tokenizer when available
            )
        except Exception as e:
            # Fall back to the slow (Python) tokenizer if the fast one fails
            try:
                tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    use_fast=False,
                )
            except Exception:
                raise e
    return tokenizer_cache[model_id]
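
# Illustrative check (a sketch, not wired into the app): the second call below
# is served from the in-process cache instead of reloading from the Hub.
#   tok = load_tokenizer("openai-community/gpt2")
#   assert tok is load_tokenizer("openai-community/gpt2")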

def tokenize_text(
    text: str,
    model_id: str,
    add_special_tokens: bool = True,
    show_special_tokens: bool = True,
    custom_model_id: Optional[str] = None,
) -> Tuple[str, str, str, str, str]:
    """
    Tokenize text using the selected tokenizer.
    Returns:
        Tuple of (tokens_json, token_ids, decoded_text, token_info_json, stats)
    """
    try:
        # Use the custom model ID if one was provided
        actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
        if not actual_model_id:
            return "", "", "", "", "Please select or enter a tokenizer model."
        # Load tokenizer
        tokenizer = load_tokenizer(actual_model_id)
        # Tokenize
        encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        # Decode (round-trip verification)
        decoded = tokenizer.decode(encoded, skip_special_tokens=not show_special_tokens)
        # Create detailed per-token information
        special_ids = set(getattr(tokenizer, "all_special_ids", []))
        token_info = []
        for i, (token, token_id) in enumerate(zip(tokens, encoded)):
            # Try to get the human-readable string for this single token
            try:
                token_str = tokenizer.convert_tokens_to_string([token])
            except Exception:
                token_str = token
            token_info.append({
                "index": i,
                "token": token,
                "token_id": token_id,
                "text": token_str,
                "is_special": token_id in special_ids,
            })
        # Format outputs
        tokens_display = json.dumps(tokens, ensure_ascii=False, indent=2)
        token_ids_display = str(encoded)
        token_info_json = json.dumps(token_info, ensure_ascii=False, indent=2)
        # Statistics (guard against empty input to avoid division by zero)
        tokens_per_char = f"{len(tokens) / len(text):.2f}" if text else "N/A"
        chars_per_token = f"{len(text) / len(tokens):.2f}" if tokens else "N/A"
        stats = f"""Statistics:
• Model: {actual_model_id}
• Number of tokens: {len(tokens)}
• Number of characters: {len(text)}
• Tokens per character: {tokens_per_char}
• Characters per token: {chars_per_token}
• Vocabulary size: {getattr(tokenizer, 'vocab_size', 'N/A')}
• Special tokens: {', '.join(getattr(tokenizer, 'all_special_tokens', [])) or 'N/A'}"""
        return tokens_display, token_ids_display, decoded, token_info_json, stats
    except Exception as e:
        error_msg = f"Error: {e}\n{traceback.format_exc()}"
        return error_msg, "", "", "", ""
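
# Illustrative call (exact tokens/IDs depend on the model; the GPT-2 values
# here are from memory, so treat them as a rough guide):
#   tokenize_text("Hello world", "openai-community/gpt2")
#   -> tokens like ["Hello", "Ġworld"], IDs like [15496, 995], decoded text,
#      per-token JSON, and a stats summary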

def decode_tokens(
    token_ids_str: str,
    model_id: str,
    skip_special_tokens: bool = False,
    custom_model_id: Optional[str] = None,
) -> Tuple[str, str, str]:
    """Decode token IDs back to text.
    Returns:
        Tuple of (decoded_text, tokens_json, stats)
    """
    try:
        # Use the custom model ID if one was provided
        actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
        if not actual_model_id:
            return "Please select or enter a tokenizer model.", "", ""
        # Parse token IDs
        token_ids_str = token_ids_str.strip()
        if not token_ids_str:
            return "", "", ""
        if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
            token_ids = json.loads(token_ids_str)
        else:
            # Parse as comma- or space-separated values
            token_ids = [int(x) for x in token_ids_str.replace(',', ' ').split()]
        # Load tokenizer and decode
        tokenizer = load_tokenizer(actual_model_id)
        decoded = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
        # Also show the token strings
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)
        # Statistics (guard against an empty ID list)
        chars_per_token = f"{len(decoded) / len(tokens):.2f}" if tokens else "N/A"
        stats = f"""Statistics:
• Model: {actual_model_id}
• Token count: {len(tokens)}
• Character count: {len(decoded)}
• Characters per token: {chars_per_token}
• Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}"""
        return decoded, tokens_json, stats
    except Exception as e:
        error_msg = f"Error: {e}\n{traceback.format_exc()}"
        return error_msg, "", ""
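
# Illustrative round trip (IDs from the tokenize example above, same caveat):
#   decode_tokens("[15496, 995]", "openai-community/gpt2")
#   -> decoded "Hello world", the token strings as JSON, and a stats summary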

def compare_tokenizers(
    text: str,
    model_ids: List[str],
    add_special_tokens: bool = True,
) -> str:
    """Compare tokenization across multiple models."""
    if not model_ids:
        return "Please select at least one model to compare."
    results = []
    for model_id in model_ids:
        try:
            tokenizer = load_tokenizer(model_id)
            encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
            tokens = tokenizer.convert_ids_to_tokens(encoded)
            results.append({
                "model": model_id,
                "token_count": len(tokens),
                "tokens": tokens[:50],    # Show only the first 50 tokens
                "token_ids": encoded[:50],
            })
        except Exception as e:
            results.append({
                "model": model_id,
                "error": str(e),
            })
    # Sort by token count (models that errored sort last)
    results.sort(key=lambda x: x.get("token_count", float('inf')))
    # Format output
    output = "# Tokenizer Comparison\n\n"
    output += f"Input text length: {len(text)} characters\n\n"
    for result in results:
        output += f"## {result['model']}\n"
        if "error" in result:
            output += f"Error: {result['error']}\n\n"
        else:
            ratio = result['token_count'] / len(text) if text else 0.0
            output += f"**Token count:** {result['token_count']} "
            output += f"(ratio: {ratio:.2f} tokens/char)\n\n"
            output += f"**First tokens:** {result['tokens']}\n\n"
            if result['token_count'] > 50:
                output += "*(showing first 50 tokens)*\n\n"
    return output
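
# Illustrative call (token counts vary with the tokenizers and input):
#   compare_tokenizers("Hello world", ["openai-community/gpt2",
#                                      "google-bert/bert-base-uncased"])
#   -> a Markdown report with one "## <model>" section per tokenizer,
#      sorted by token count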

def analyze_vocabulary(model_id: str, custom_model_id: Optional[str] = None) -> str:
    """Analyze a tokenizer's vocabulary and special tokens."""
    try:
        actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
        if not actual_model_id:
            return "Please select or enter a tokenizer model."
        tokenizer = load_tokenizer(actual_model_id)
        # Get vocabulary size (fall back to counting the vocab mapping)
        vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(tokenizer.get_vocab())
        # Get special tokens
        special_tokens = getattr(tokenizer, 'special_tokens_map', {})
        # Get the first 100 tokens, ordered by ID
        vocab = tokenizer.get_vocab()
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])[:100]
        output = f"""# Tokenizer Vocabulary Analysis

**Model:** {actual_model_id}
**Vocabulary Size:** {vocab_size:,}
**Tokenizer Type:** {tokenizer.__class__.__name__}

## Special Tokens
```json
{json.dumps(special_tokens, ensure_ascii=False, indent=2)}
```

## Token Settings
• Padding Token: {tokenizer.pad_token if tokenizer.pad_token else 'None'}
• BOS Token: {tokenizer.bos_token if tokenizer.bos_token else 'None'}
• EOS Token: {tokenizer.eos_token if tokenizer.eos_token else 'None'}
• UNK Token: {tokenizer.unk_token if tokenizer.unk_token else 'None'}
• SEP Token: {tokenizer.sep_token if getattr(tokenizer, 'sep_token', None) else 'None'}
• CLS Token: {tokenizer.cls_token if getattr(tokenizer, 'cls_token', None) else 'None'}
• Mask Token: {tokenizer.mask_token if getattr(tokenizer, 'mask_token', None) else 'None'}

## First 100 Tokens in Vocabulary
Token → ID
"""
        for token, token_id in sorted_vocab:
            # Use repr() so unprintable tokens stay visible in the output
            display_token = repr(token) if not token.isprintable() else token
            output += f"{display_token} → {token_id}\n"
        return output
    except Exception as e:
        return f"Error: {e}\n{traceback.format_exc()}"

# Create the Gradio interface
with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🤗 Tokenizer Playground

    A comprehensive tool for NLP researchers to experiment with various Hugging Face tokenizers.
    Supports popular models including **Qwen**, **Llama**, **Mistral**, **GPT**, and many more.

    ### Features:
    - 🔤 **Tokenize & Detokenize** text with any Hugging Face tokenizer
    - 📊 **Compare** tokenization across multiple models
    - 📖 **Analyze** vocabulary and special tokens
    - 🎯 **Support** for custom model IDs from the Hugging Face Hub
    """)
| with gr.Tab("🔤 Tokenize"): | |
| with gr.Row(): | |
| with gr.Column(scale=3): | |
| tokenize_input = gr.Textbox( | |
| label="Input Text", | |
| placeholder="Enter text to tokenize...", | |
| lines=5 | |
| ) | |
| with gr.Column(scale=1): | |
| tokenize_model = gr.Dropdown( | |
| label="Select Tokenizer", | |
| choices=list(TOKENIZER_OPTIONS.keys()), | |
| value="Qwen/Qwen3-0.6B", | |
| allow_custom_value=False | |
| ) | |
| tokenize_custom_model = gr.Textbox( | |
| label="Or Enter Custom Model ID", | |
| placeholder="e.g., facebook/bart-base", | |
| info="Override selection above with any HF model" | |
| ) | |
| add_special = gr.Checkbox(label="Add Special Tokens", value=True) | |
| show_special = gr.Checkbox(label="Show Special Tokens in Decoded", value=True) | |
| tokenize_btn = gr.Button("Tokenize", variant="primary") | |
| with gr.Row(): | |
| with gr.Column(): | |
| tokens_output = gr.Textbox(label="Tokens", lines=10, max_lines=20) | |
| with gr.Column(): | |
| token_ids_output = gr.Textbox(label="Token IDs", lines=10, max_lines=20) | |
| with gr.Row(): | |
| with gr.Column(): | |
| decoded_output = gr.Textbox(label="Decoded Text (Verification)", lines=5) | |
| with gr.Column(): | |
| token_info_output = gr.Textbox(label="Detailed Token Information", lines=10, max_lines=20) | |
| stats_output = gr.Textbox(label="Statistics", lines=7) | |
| tokenize_btn.click( | |
| fn=tokenize_text, | |
| inputs=[tokenize_input, tokenize_model, add_special, show_special, tokenize_custom_model], | |
| outputs=[tokens_output, token_ids_output, decoded_output, token_info_output, stats_output] | |
| ) | |
| with gr.Tab("🔄 Detokenize"): | |
| with gr.Row(): | |
| with gr.Column(scale=3): | |
| decode_input = gr.Textbox( | |
| label="Token IDs", | |
| placeholder="Enter token IDs as a list [101, 2023, ...] or space/comma separated", | |
| lines=5 | |
| ) | |
| with gr.Column(scale=1): | |
| decode_model = gr.Dropdown( | |
| label="Select Tokenizer", | |
| choices=list(TOKENIZER_OPTIONS.keys()), | |
| value="Qwen/Qwen3-0.6B" | |
| ) | |
| decode_custom_model = gr.Textbox( | |
| label="Or Enter Custom Model ID", | |
| placeholder="e.g., facebook/bart-base" | |
| ) | |
| skip_special = gr.Checkbox(label="Skip Special Tokens", value=False) | |
| decode_btn = gr.Button("Decode", variant="primary") | |
| decode_output = gr.Textbox( | |
| label="Decoded Text", | |
| lines=10, | |
| interactive=False, | |
| show_copy_button=True, | |
| placeholder="Decoded text will appear here..." | |
| ) | |
| decode_stats = gr.Textbox( | |
| label="Statistics", | |
| lines=5, | |
| interactive=False | |
| ) | |
| with gr.Accordion("Show Tokens", open=False): | |
| decode_tokens_output = gr.Textbox( | |
| label="Tokens", | |
| lines=10, | |
| interactive=False, | |
| show_copy_button=True | |
| ) | |
| decode_btn.click( | |
| fn=decode_tokens, | |
| inputs=[decode_input, decode_model, skip_special, decode_custom_model], | |
| outputs=[decode_output, decode_tokens_output, decode_stats] | |
| ) | |
| with gr.Tab("📊 Compare"): | |
| compare_input = gr.Textbox( | |
| label="Input Text", | |
| placeholder="Enter text to compare tokenization across models...", | |
| lines=5 | |
| ) | |
| compare_models = gr.CheckboxGroup( | |
| label="Select Models to Compare", | |
| choices=list(TOKENIZER_OPTIONS.keys()), | |
| value=["Qwen/Qwen3-0.6B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"] | |
| ) | |
| compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True) | |
| compare_btn = gr.Button("Compare Tokenizers", variant="primary") | |
| compare_output = gr.Markdown() | |
| compare_btn.click( | |
| fn=compare_tokenizers, | |
| inputs=[compare_input, compare_models, compare_add_special], | |
| outputs=compare_output | |
| ) | |
| with gr.Tab("📖 Vocabulary"): | |
| with gr.Row(): | |
| vocab_model = gr.Dropdown( | |
| label="Select Tokenizer", | |
| choices=list(TOKENIZER_OPTIONS.keys()), | |
| value="Qwen/Qwen3-0.6B" | |
| ) | |
| vocab_custom_model = gr.Textbox( | |
| label="Or Enter Custom Model ID", | |
| placeholder="e.g., facebook/bart-base" | |
| ) | |
| vocab_btn = gr.Button("Analyze Vocabulary", variant="primary") | |
| vocab_output = gr.Markdown() | |
| vocab_btn.click( | |
| fn=analyze_vocabulary, | |
| inputs=[vocab_model, vocab_custom_model], | |
| outputs=vocab_output | |
| ) | |
| with gr.Tab("ℹ️ About"): | |
| gr.Markdown(""" | |
| ## About This Tool | |
| This tokenizer playground provides researchers and developers with an easy way to experiment | |
| with various tokenizers from the Hugging Face Model Hub. | |
| ### Supported Models | |
| **Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes) | |
| **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes) | |
| **Other Popular Models:** GPT-2, Gemma, Mistral, Mixtral, DeepSeek, Phi, Yi, T5, BERT, GPT-NeoX, BLOOM, OPT, StableLM | |
| ### Custom Models | |
| You can use any tokenizer from the Hugging Face Hub by entering its model ID in the "Custom Model ID" field. | |
| For example: | |
| - `facebook/bart-base` | |
| - `EleutherAI/gpt-j-6b` | |
| - `bigscience/bloom` | |
| ### Features Explanation | |
| - **Tokenize:** Convert text into tokens and token IDs | |
| - **Detokenize:** Convert token IDs back to text | |
| - **Compare:** See how different tokenizers handle the same text | |
| - **Vocabulary:** Explore tokenizer vocabulary and special tokens | |
| ### Tips | |
| 1. Different tokenizers can produce very different token counts for the same text | |
| 2. Special tokens (like [CLS], [SEP], <s>, </s>) are model-specific | |
| 3. Subword tokenization (used by most modern models) allows handling of out-of-vocabulary words | |
| 4. Token efficiency affects model performance and API costs | |
| ### Resources | |
| - [Hugging Face Tokenizers Documentation](https://huggingface.co/docs/transformers/main_classes/tokenizer) | |
| - [Understanding Tokenization](https://huggingface.co/docs/transformers/tokenizer_summary) | |
| - [Model Hub](https://huggingface.co/models) | |
| --- | |
| """) | |

# Launch the app
if __name__ == "__main__":
    app.launch()
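    # Note: when running outside Spaces, app.launch(share=True) would also
    # expose a temporary public URL (a standard Gradio option, not needed here).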