import gradio as gr
from transformers import AutoTokenizer
import json
import traceback
from typing import Optional, List, Tuple
# Popular tokenizer models
TOKENIZER_OPTIONS = {
# Qwen Series
"Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
"Qwen/Qwen3-1.8B": "Qwen 3 (1.8B)",
"Qwen/Qwen3-4B": "Qwen 3 (4B)",
"Qwen/Qwen3-7B": "Qwen 3 (7B)",
"Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
"Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
"Qwen/Qwen2-7B": "Qwen 2 (7B)",
"Qwen/Qwen2-72B": "Qwen 2 (72B)",
"Qwen/Qwen-7B": "Qwen 1 (7B)",
# Llama Series
"meta-llama/Llama-3.2-1B": "Llama 3.2 (1B)",
"meta-llama/Llama-3.2-3B": "Llama 3.2 (3B)",
"meta-llama/Llama-3.1-8B": "Llama 3.1 (8B)",
"meta-llama/Llama-3.1-70B": "Llama 3.1 (70B)",
"meta-llama/Llama-2-7b-hf": "Llama 2 (7B)",
"meta-llama/Llama-2-13b-hf": "Llama 2 (13B)",
"meta-llama/Llama-2-70b-hf": "Llama 2 (70B)",
# Other Popular Models
"openai-community/gpt2": "GPT-2",
"google/gemma-2b": "Gemma (2B)",
"google/gemma-7b": "Gemma (7B)",
"mistralai/Mistral-7B-v0.1": "Mistral (7B)",
"mistralai/Mixtral-8x7B-v0.1": "Mixtral (8x7B)",
"deepseek-ai/deepseek-coder-6.7b-base": "DeepSeek Coder (6.7B)",
"microsoft/phi-2": "Phi-2",
"microsoft/phi-3-mini-4k-instruct": "Phi-3 Mini",
"01-ai/Yi-6B": "Yi (6B)",
"01-ai/Yi-34B": "Yi (34B)",
"google-t5/t5-base": "T5 Base",
"google-bert/bert-base-uncased": "BERT Base (uncased)",
"google-bert/bert-base-cased": "BERT Base (cased)",
"EleutherAI/gpt-neox-20b": "GPT-NeoX (20B)",
"bigscience/bloom-560m": "BLOOM (560M)",
"facebook/opt-350m": "OPT (350M)",
"stabilityai/stablelm-base-alpha-7b": "StableLM (7B)",
}
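# Note: the friendly display names above are not currently wired into the UI;
# the dropdowns below list raw model IDs. As a sketch (assuming Gradio 4+,
# where Dropdown/CheckboxGroup choices accept (label, value) tuples), the
# names could be surfaced with:
#   choices=[(label, model_id) for model_id, label in TOKENIZER_OPTIONS.items()]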
# Cache for loaded tokenizers
tokenizer_cache = {}
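# The cache above grows without bound for the lifetime of the process. A
# bounded alternative (a sketch, assuming the loading kwargs stay fixed per
# model ID; not what this app uses) could lean on functools.lru_cache:
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=8)
#   def load_tokenizer_bounded(model_id: str):
#       return AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)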
def load_tokenizer(model_id: str):
"""Load a tokenizer with caching."""
if model_id not in tokenizer_cache:
try:
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
model_id,
trust_remote_code=True,
use_fast=True # Use fast tokenizer when available
)
except Exception as e:
# Fallback to slow tokenizer if fast is not available
try:
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
model_id,
trust_remote_code=True,
use_fast=False
)
            except Exception:
                # Re-raise the original (fast-tokenizer) error, which is usually more informative
raise e
return tokenizer_cache[model_id]
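# Example usage (hedged: requires network access to the Hugging Face Hub on
# the first call; later calls for the same model ID hit the in-memory cache):
#   tok = load_tokenizer("openai-community/gpt2")
#   tok.encode("hello world")  # -> [31373, 995] with the GPT-2 vocabulary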
def tokenize_text(
text: str,
model_id: str,
add_special_tokens: bool = True,
show_special_tokens: bool = True,
custom_model_id: Optional[str] = None
) -> Tuple[str, str, str, str, str]:
    """
    Tokenize text using the selected tokenizer.
    Returns:
        Tuple of (tokens_json, token_ids, decoded_text, token_info_json, stats)
    """
try:
# Use custom model ID if provided
actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
        if not actual_model_id:
            return "", "", "", "", "Please select or enter a tokenizer model."
# Load tokenizer
tokenizer = load_tokenizer(actual_model_id)
# Tokenize
encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
tokens = tokenizer.convert_ids_to_tokens(encoded)
# Decode
decoded = tokenizer.decode(encoded, skip_special_tokens=not show_special_tokens)
# Create detailed token information
token_info = []
for i, (token, token_id) in enumerate(zip(tokens, encoded)):
# Try to get the actual string representation of the token
try:
token_str = tokenizer.convert_tokens_to_string([token])
            except Exception:
token_str = token
token_info.append({
"index": i,
"token": token,
"token_id": token_id,
"text": token_str,
"is_special": token_id in (tokenizer.all_special_ids if hasattr(tokenizer, 'all_special_ids') else [])
})
# Format outputs
tokens_display = json.dumps(tokens, ensure_ascii=False, indent=2)
token_ids_display = str(encoded)
token_info_json = json.dumps(token_info, ensure_ascii=False, indent=2)
        # Statistics (guard against division by zero on empty input)
        tokens_per_char = f"{len(tokens)/len(text):.2f}" if text else "N/A"
        chars_per_token = f"{len(text)/len(tokens):.2f}" if tokens else "N/A"
        stats = f"""Statistics:
• Model: {actual_model_id}
• Number of tokens: {len(tokens)}
• Number of characters: {len(text)}
• Tokens per character: {tokens_per_char}
• Characters per token: {chars_per_token}
• Vocabulary size: {tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else 'N/A'}
• Special tokens: {', '.join(tokenizer.all_special_tokens) if hasattr(tokenizer, 'all_special_tokens') else 'N/A'}"""
return tokens_display, token_ids_display, decoded, token_info_json, stats
except Exception as e:
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
return error_msg, "", "", "", ""
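# Output illustration (token strings/IDs below assume the GPT-2 vocabulary):
# for the input "hello world", tokens are ["hello", "Ġworld"] (the "Ġ" marks a
# leading space in byte-level BPE), IDs are [31373, 995], and the decoded text
# round-trips to "hello world".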
def decode_tokens(
token_ids_str: str,
model_id: str,
skip_special_tokens: bool = False,
custom_model_id: Optional[str] = None
) -> Tuple[str, str, str]:
"""Decode token IDs back to text.
Returns:
Tuple of (decoded_text, tokens_json, stats)
"""
try:
# Use custom model ID if provided
actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
if not actual_model_id:
return "Please select or enter a tokenizer model.", "", ""
# Parse token IDs
token_ids_str = token_ids_str.strip()
if not token_ids_str:
return "", "", ""
if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
token_ids = json.loads(token_ids_str)
else:
# Try to parse as comma or space separated values
token_ids = [int(x.strip()) for x in token_ids_str.replace(',', ' ').split()]
# Load tokenizer and decode
tokenizer = load_tokenizer(actual_model_id)
decoded = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
# Also show tokens
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)
        # Statistics (guard against division by zero when the ID list is empty)
        chars_per_token = f"{len(decoded)/len(tokens):.2f}" if tokens else "N/A"
        stats = f"""Statistics:
• Model: {actual_model_id}
• Token count: {len(tokens)}
• Character count: {len(decoded)}
• Characters per token: {chars_per_token}
• Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}"""
return decoded, tokens_json, stats
except Exception as e:
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
return error_msg, "", ""
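# Accepted formats for the token-ID string (IDs below assume GPT-2; all three
# decode to "hello world"):
#   "[31373, 995]"   JSON-style list
#   "31373, 995"     comma-separated
#   "31373 995"      whitespace-separated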
def compare_tokenizers(
text: str,
model_ids: List[str],
add_special_tokens: bool = True
) -> str:
"""Compare tokenization across multiple models."""
    if not model_ids:
        return "Please select at least one model to compare."
    if not text:
        return "Please enter some text to compare."
results = []
for model_id in model_ids:
try:
tokenizer = load_tokenizer(model_id)
encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
tokens = tokenizer.convert_ids_to_tokens(encoded)
results.append({
"model": model_id,
"token_count": len(tokens),
"tokens": tokens[:50], # Show first 50 tokens
"token_ids": encoded[:50] # Show first 50 IDs
})
except Exception as e:
results.append({
"model": model_id,
"error": str(e)
})
# Sort by token count
results.sort(key=lambda x: x.get("token_count", float('inf')))
# Format output
output = "# Tokenizer Comparison\n\n"
output += f"Input text length: {len(text)} characters\n\n"
for result in results:
if "error" in result:
output += f"## {result['model']}\n"
output += f"Error: {result['error']}\n\n"
else:
output += f"## {result['model']}\n"
output += f"**Token count:** {result['token_count']} "
output += f"(ratio: {result['token_count']/len(text):.2f} tokens/char)\n\n"
output += f"**First tokens:** {result['tokens']}\n\n"
            if result['token_count'] > 50:
                output += "*(showing first 50 tokens)*\n\n"
return output
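# Reading the comparison: a lower token count for the same text generally
# means the tokenizer's vocabulary compresses that text's language or domain
# better, which in turn affects context-window usage and per-token API costs.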
def analyze_vocabulary(model_id: str, custom_model_id: Optional[str] = None) -> str:
"""Analyze tokenizer vocabulary."""
try:
actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
if not actual_model_id:
return "Please select or enter a tokenizer model."
tokenizer = load_tokenizer(actual_model_id)
# Get vocabulary information
vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(tokenizer.get_vocab())
# Get special tokens
special_tokens = {}
if hasattr(tokenizer, 'special_tokens_map'):
special_tokens = tokenizer.special_tokens_map
# Get some example tokens
vocab = tokenizer.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])[:100] # First 100 tokens
output = f"""# Tokenizer Vocabulary Analysis
**Model:** {actual_model_id}
**Vocabulary Size:** {vocab_size:,}
**Tokenizer Type:** {tokenizer.__class__.__name__}
## Special Tokens
```json
{json.dumps(special_tokens, ensure_ascii=False, indent=2)}
```
## Token Settings
• Padding Token: {tokenizer.pad_token if tokenizer.pad_token else 'None'}
• BOS Token: {tokenizer.bos_token if tokenizer.bos_token else 'None'}
• EOS Token: {tokenizer.eos_token if tokenizer.eos_token else 'None'}
• UNK Token: {tokenizer.unk_token if tokenizer.unk_token else 'None'}
• SEP Token: {tokenizer.sep_token if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token else 'None'}
• CLS Token: {tokenizer.cls_token if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token else 'None'}
• Mask Token: {tokenizer.mask_token if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token else 'None'}
## First 100 Tokens in Vocabulary
"""
        # Render the token list in a fenced block so Markdown preserves line breaks
        output += "```text\n"
        for token, token_id in sorted_vocab:
            # Escape non-printable tokens so they stay visible
            display_token = repr(token) if not token.isprintable() else token
            output += f"{display_token} → {token_id}\n"
        output += "```\n"
        return output
except Exception as e:
return f"Error: {str(e)}\n{traceback.format_exc()}"
# Create Gradio interface
with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app:
gr.Markdown("""
# 🤗 Tokenizer Playground
A comprehensive tool for NLP researchers to experiment with various Hugging Face tokenizers.
Supports popular models including **Qwen**, **Llama**, **Mistral**, **GPT**, and many more.
### Features:
- 🔤 **Tokenize & Detokenize** text with any Hugging Face tokenizer
- 📊 **Compare** tokenization across multiple models
- 📖 **Analyze** vocabulary and special tokens
- 🎯 **Support** for custom model IDs from Hugging Face Hub
""")
with gr.Tab("🔤 Tokenize"):
with gr.Row():
with gr.Column(scale=3):
tokenize_input = gr.Textbox(
label="Input Text",
placeholder="Enter text to tokenize...",
lines=5
)
with gr.Column(scale=1):
tokenize_model = gr.Dropdown(
label="Select Tokenizer",
choices=list(TOKENIZER_OPTIONS.keys()),
value="Qwen/Qwen3-0.6B",
allow_custom_value=False
)
tokenize_custom_model = gr.Textbox(
label="Or Enter Custom Model ID",
placeholder="e.g., facebook/bart-base",
info="Override selection above with any HF model"
)
add_special = gr.Checkbox(label="Add Special Tokens", value=True)
show_special = gr.Checkbox(label="Show Special Tokens in Decoded", value=True)
tokenize_btn = gr.Button("Tokenize", variant="primary")
with gr.Row():
with gr.Column():
tokens_output = gr.Textbox(label="Tokens", lines=10, max_lines=20)
with gr.Column():
token_ids_output = gr.Textbox(label="Token IDs", lines=10, max_lines=20)
with gr.Row():
with gr.Column():
decoded_output = gr.Textbox(label="Decoded Text (Verification)", lines=5)
with gr.Column():
token_info_output = gr.Textbox(label="Detailed Token Information", lines=10, max_lines=20)
stats_output = gr.Textbox(label="Statistics", lines=7)
tokenize_btn.click(
fn=tokenize_text,
inputs=[tokenize_input, tokenize_model, add_special, show_special, tokenize_custom_model],
outputs=[tokens_output, token_ids_output, decoded_output, token_info_output, stats_output]
)
with gr.Tab("🔄 Detokenize"):
with gr.Row():
with gr.Column(scale=3):
decode_input = gr.Textbox(
label="Token IDs",
placeholder="Enter token IDs as a list [101, 2023, ...] or space/comma separated",
lines=5
)
with gr.Column(scale=1):
decode_model = gr.Dropdown(
label="Select Tokenizer",
choices=list(TOKENIZER_OPTIONS.keys()),
value="Qwen/Qwen3-0.6B"
)
decode_custom_model = gr.Textbox(
label="Or Enter Custom Model ID",
placeholder="e.g., facebook/bart-base"
)
skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
decode_btn = gr.Button("Decode", variant="primary")
decode_output = gr.Textbox(
label="Decoded Text",
lines=10,
interactive=False,
show_copy_button=True,
placeholder="Decoded text will appear here..."
)
decode_stats = gr.Textbox(
label="Statistics",
lines=5,
interactive=False
)
with gr.Accordion("Show Tokens", open=False):
decode_tokens_output = gr.Textbox(
label="Tokens",
lines=10,
interactive=False,
show_copy_button=True
)
decode_btn.click(
fn=decode_tokens,
inputs=[decode_input, decode_model, skip_special, decode_custom_model],
outputs=[decode_output, decode_tokens_output, decode_stats]
)
with gr.Tab("📊 Compare"):
compare_input = gr.Textbox(
label="Input Text",
placeholder="Enter text to compare tokenization across models...",
lines=5
)
compare_models = gr.CheckboxGroup(
label="Select Models to Compare",
choices=list(TOKENIZER_OPTIONS.keys()),
value=["Qwen/Qwen3-0.6B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
)
compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
compare_btn = gr.Button("Compare Tokenizers", variant="primary")
compare_output = gr.Markdown()
compare_btn.click(
fn=compare_tokenizers,
inputs=[compare_input, compare_models, compare_add_special],
outputs=compare_output
)
with gr.Tab("📖 Vocabulary"):
with gr.Row():
vocab_model = gr.Dropdown(
label="Select Tokenizer",
choices=list(TOKENIZER_OPTIONS.keys()),
value="Qwen/Qwen3-0.6B"
)
vocab_custom_model = gr.Textbox(
label="Or Enter Custom Model ID",
placeholder="e.g., facebook/bart-base"
)
vocab_btn = gr.Button("Analyze Vocabulary", variant="primary")
vocab_output = gr.Markdown()
vocab_btn.click(
fn=analyze_vocabulary,
inputs=[vocab_model, vocab_custom_model],
outputs=vocab_output
)
with gr.Tab("ℹ️ About"):
gr.Markdown("""
## About This Tool
This tokenizer playground provides researchers and developers with an easy way to experiment
with various tokenizers from the Hugging Face Model Hub.
### Supported Models
**Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes)
**Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)
**Other Popular Models:** GPT-2, Gemma, Mistral, Mixtral, DeepSeek, Phi, Yi, T5, BERT, GPT-NeoX, BLOOM, OPT, StableLM
### Custom Models
You can use any tokenizer from the Hugging Face Hub by entering its model ID in the "Custom Model ID" field.
For example:
- `facebook/bart-base`
- `EleutherAI/gpt-j-6b`
- `bigscience/bloom`
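The same lookup can be reproduced programmatically (a minimal sketch; assumes the `transformers` library and network access to the Hub):
```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
print(tok.tokenize("Hello world"))
```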
### Features Explanation
- **Tokenize:** Convert text into tokens and token IDs
- **Detokenize:** Convert token IDs back to text
- **Compare:** See how different tokenizers handle the same text
- **Vocabulary:** Explore tokenizer vocabulary and special tokens
### Tips
1. Different tokenizers can produce very different token counts for the same text
2. Special tokens (like [CLS], [SEP], <s>, </s>) are model-specific
3. Subword tokenization (used by most modern models) allows handling of out-of-vocabulary words
4. Token efficiency affects model performance and API costs
### Resources
- [Hugging Face Tokenizers Documentation](https://huggingface.co/docs/transformers/main_classes/tokenizer)
- [Understanding Tokenization](https://huggingface.co/docs/transformers/tokenizer_summary)
- [Model Hub](https://huggingface.co/models)
---
""")
# Launch the app
if __name__ == "__main__":
app.launch()