import gradio as gr
from transformers import AutoTokenizer
import json
import traceback
from typing import Optional, List, Tuple
# Popular tokenizer models
TOKENIZER_OPTIONS = {
# Qwen Series
"Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
"Qwen/Qwen3-1.8B": "Qwen 3 (1.8B)",
"Qwen/Qwen3-4B": "Qwen 3 (4B)",
"Qwen/Qwen3-7B": "Qwen 3 (7B)",
"Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
"Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
"Qwen/Qwen2-7B": "Qwen 2 (7B)",
"Qwen/Qwen2-72B": "Qwen 2 (72B)",
"Qwen/Qwen-7B": "Qwen 1 (7B)",
# Llama Series
"meta-llama/Llama-3.2-1B": "Llama 3.2 (1B)",
"meta-llama/Llama-3.2-3B": "Llama 3.2 (3B)",
"meta-llama/Llama-3.1-8B": "Llama 3.1 (8B)",
"meta-llama/Llama-3.1-70B": "Llama 3.1 (70B)",
"meta-llama/Llama-2-7b-hf": "Llama 2 (7B)",
"meta-llama/Llama-2-13b-hf": "Llama 2 (13B)",
"meta-llama/Llama-2-70b-hf": "Llama 2 (70B)",
# Other Popular Models
"openai-community/gpt2": "GPT-2",
"google/gemma-2b": "Gemma (2B)",
"google/gemma-7b": "Gemma (7B)",
"mistralai/Mistral-7B-v0.1": "Mistral (7B)",
"mistralai/Mixtral-8x7B-v0.1": "Mixtral (8x7B)",
"deepseek-ai/deepseek-coder-6.7b-base": "DeepSeek Coder (6.7B)",
"microsoft/phi-2": "Phi-2",
"microsoft/phi-3-mini-4k-instruct": "Phi-3 Mini",
"01-ai/Yi-6B": "Yi (6B)",
"01-ai/Yi-34B": "Yi (34B)",
"google-t5/t5-base": "T5 Base",
"google-bert/bert-base-uncased": "BERT Base (uncased)",
"google-bert/bert-base-cased": "BERT Base (cased)",
"EleutherAI/gpt-neox-20b": "GPT-NeoX (20B)",
"bigscience/bloom-560m": "BLOOM (560M)",
"facebook/opt-350m": "OPT (350M)",
"stabilityai/stablelm-base-alpha-7b": "StableLM (7B)",
}
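# Note: the friendly display names above are not currently wired into the UI;
# the dropdowns below list raw model IDs. As a sketch (assuming Gradio 4+,
# where Dropdown/CheckboxGroup choices accept (label, value) tuples), the
# names could be surfaced with:
#   choices=[(label, model_id) for model_id, label in TOKENIZER_OPTIONS.items()]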
# Cache for loaded tokenizers
tokenizer_cache = {}
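# The cache above grows without bound for the lifetime of the process. A
# bounded alternative (a sketch, assuming the loading kwargs stay fixed per
# model ID; not what this app uses) could lean on functools.lru_cache:
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=8)
#   def load_tokenizer_bounded(model_id: str):
#       return AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)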
def load_tokenizer(model_id: str):
"""Load a tokenizer with caching."""
if model_id not in tokenizer_cache:
try:
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
model_id,
trust_remote_code=True,
use_fast=True # Use fast tokenizer when available
)
except Exception as e:
# Fallback to slow tokenizer if fast is not available
try:
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
model_id,
trust_remote_code=True,
use_fast=False
)
            except Exception:
                # Re-raise the original (fast-tokenizer) error, which is usually more informative
raise e
return tokenizer_cache[model_id]
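# Example usage (hedged: requires network access to the Hugging Face Hub on
# the first call; later calls for the same model ID hit the in-memory cache):
#   tok = load_tokenizer("openai-community/gpt2")
#   tok.encode("hello world")  # -> [31373, 995] with the GPT-2 vocabulary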
def tokenize_text(
text: str,
model_id: str,
add_special_tokens: bool = True,
show_special_tokens: bool = True,
custom_model_id: Optional[str] = None
) -> Tuple[str, str, str, str, str]:
    """
    Tokenize text using the selected tokenizer.
    Returns:
        Tuple of (tokens_json, token_ids, decoded_text, token_info_json, stats)
    """
try:
# Use custom model ID if provided
actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
        if not actual_model_id:
            return "", "", "", "", "Please select or enter a tokenizer model."
# Load tokenizer
tokenizer = load_tokenizer(actual_model_id)
# Tokenize
encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
tokens = tokenizer.convert_ids_to_tokens(encoded)
# Decode
decoded = tokenizer.decode(encoded, skip_special_tokens=not show_special_tokens)
# Create detailed token information
token_info = []
for i, (token, token_id) in enumerate(zip(tokens, encoded)):
# Try to get the actual string representation of the token
try:
token_str = tokenizer.convert_tokens_to_string([token])
            except Exception:
token_str = token
token_info.append({
"index": i,
"token": token,
"token_id": token_id,
"text": token_str,
"is_special": token_id in (tokenizer.all_special_ids if hasattr(tokenizer, 'all_special_ids') else [])
})
# Format outputs
tokens_display = json.dumps(tokens, ensure_ascii=False, indent=2)
token_ids_display = str(encoded)
token_info_json = json.dumps(token_info, ensure_ascii=False, indent=2)
        # Statistics (guard against division by zero on empty input)
        tokens_per_char = f"{len(tokens)/len(text):.2f}" if text else "N/A"
        chars_per_token = f"{len(text)/len(tokens):.2f}" if tokens else "N/A"
        stats = f"""Statistics:
• Model: {actual_model_id}
• Number of tokens: {len(tokens)}
• Number of characters: {len(text)}
• Tokens per character: {tokens_per_char}
• Characters per token: {chars_per_token}
• Vocabulary size: {tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else 'N/A'}
• Special tokens: {', '.join(tokenizer.all_special_tokens) if hasattr(tokenizer, 'all_special_tokens') else 'N/A'}"""
return tokens_display, token_ids_display, decoded, token_info_json, stats
except Exception as e:
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
return error_msg, "", "", "", ""
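# Output illustration (token strings/IDs below assume the GPT-2 vocabulary):
# for the input "hello world", tokens are ["hello", "Ġworld"] (the "Ġ" marks a
# leading space in byte-level BPE), IDs are [31373, 995], and the decoded text
# round-trips to "hello world".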
def decode_tokens(
token_ids_str: str,
model_id: str,
skip_special_tokens: bool = False,
custom_model_id: Optional[str] = None
) -> Tuple[str, str, str]:
"""Decode token IDs back to text.
Returns:
Tuple of (decoded_text, tokens_json, stats)
"""
try:
# Use custom model ID if provided
actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
if not actual_model_id:
return "Please select or enter a tokenizer model.", "", ""
# Parse token IDs
token_ids_str = token_ids_str.strip()
if not token_ids_str:
return "", "", ""
if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
token_ids = json.loads(token_ids_str)
else:
# Try to parse as comma or space separated values
token_ids = [int(x.strip()) for x in token_ids_str.replace(',', ' ').split()]
# Load tokenizer and decode
tokenizer = load_tokenizer(actual_model_id)
decoded = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
# Also show tokens
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)
        # Statistics (guard against division by zero when the ID list is empty)
        chars_per_token = f"{len(decoded)/len(tokens):.2f}" if tokens else "N/A"
        stats = f"""Statistics:
• Model: {actual_model_id}
• Token count: {len(tokens)}
• Character count: {len(decoded)}
• Characters per token: {chars_per_token}
• Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}"""
return decoded, tokens_json, stats
except Exception as e:
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
return error_msg, "", ""
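# Accepted formats for the token-ID string (IDs below assume GPT-2; all three
# decode to "hello world"):
#   "[31373, 995]"   JSON-style list
#   "31373, 995"     comma-separated
#   "31373 995"      whitespace-separated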
def compare_tokenizers(
text: str,
model_ids: List[str],
add_special_tokens: bool = True
) -> str:
"""Compare tokenization across multiple models."""
    if not model_ids:
        return "Please select at least one model to compare."
    if not text:
        return "Please enter some text to compare."
results = []
for model_id in model_ids:
try:
tokenizer = load_tokenizer(model_id)
encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
tokens = tokenizer.convert_ids_to_tokens(encoded)
results.append({
"model": model_id,
"token_count": len(tokens),
"tokens": tokens[:50], # Show first 50 tokens
"token_ids": encoded[:50] # Show first 50 IDs
})
except Exception as e:
results.append({
"model": model_id,
"error": str(e)
})
# Sort by token count
results.sort(key=lambda x: x.get("token_count", float('inf')))
# Format output
output = "# Tokenizer Comparison\n\n"
output += f"Input text length: {len(text)} characters\n\n"
for result in results:
if "error" in result:
output += f"## {result['model']}\n"
output += f"Error: {result['error']}\n\n"
else:
output += f"## {result['model']}\n"
output += f"**Token count:** {result['token_count']} "
output += f"(ratio: {result['token_count']/len(text):.2f} tokens/char)\n\n"
output += f"**First tokens:** {result['tokens']}\n\n"
            if result['token_count'] > 50:
                output += "*(showing first 50 tokens)*\n\n"
return output
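# Reading the comparison: a lower token count for the same text generally
# means the tokenizer's vocabulary compresses that text's language or domain
# better, which in turn affects context-window usage and per-token API costs.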
def analyze_vocabulary(model_id: str, custom_model_id: Optional[str] = None) -> str:
"""Analyze tokenizer vocabulary."""
try:
actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
if not actual_model_id:
return "Please select or enter a tokenizer model."
tokenizer = load_tokenizer(actual_model_id)
# Get vocabulary information
vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(tokenizer.get_vocab())
# Get special tokens
special_tokens = {}
if hasattr(tokenizer, 'special_tokens_map'):
special_tokens = tokenizer.special_tokens_map
# Get some example tokens
vocab = tokenizer.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])[:100] # First 100 tokens
output = f"""# Tokenizer Vocabulary Analysis
**Model:** {actual_model_id}
**Vocabulary Size:** {vocab_size:,}
**Tokenizer Type:** {tokenizer.__class__.__name__}
## Special Tokens
```json
{json.dumps(special_tokens, ensure_ascii=False, indent=2)}
```
## Token Settings
• Padding Token: {tokenizer.pad_token if tokenizer.pad_token else 'None'}
• BOS Token: {tokenizer.bos_token if tokenizer.bos_token else 'None'}
• EOS Token: {tokenizer.eos_token if tokenizer.eos_token else 'None'}
• UNK Token: {tokenizer.unk_token if tokenizer.unk_token else 'None'}
• SEP Token: {tokenizer.sep_token if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token else 'None'}
• CLS Token: {tokenizer.cls_token if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token else 'None'}
• Mask Token: {tokenizer.mask_token if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token else 'None'}
## First 100 Tokens in Vocabulary
"""
        # Render the token list in a fenced block so Markdown preserves line breaks
        output += "```text\n"
        for token, token_id in sorted_vocab:
            # Escape non-printable tokens so they stay visible
            display_token = repr(token) if not token.isprintable() else token
            output += f"{display_token} → {token_id}\n"
        output += "```\n"
        return output
except Exception as e:
return f"Error: {str(e)}\n{traceback.format_exc()}"
# Create Gradio interface
with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app:
gr.Markdown("""
# 🤗 Tokenizer Playground
A comprehensive tool for NLP researchers to experiment with various Hugging Face tokenizers.
Supports popular models including **Qwen**, **Llama**, **Mistral**, **GPT**, and many more.
### Features:
- 🔤 **Tokenize & Detokenize** text with any Hugging Face tokenizer
- 📊 **Compare** tokenization across multiple models
- 📖 **Analyze** vocabulary and special tokens
- 🎯 **Support** for custom model IDs from Hugging Face Hub
""")
with gr.Tab("🔤 Tokenize"):
with gr.Row():
with gr.Column(scale=3):
tokenize_input = gr.Textbox(
label="Input Text",
placeholder="Enter text to tokenize...",
lines=5
)
with gr.Column(scale=1):
tokenize_model = gr.Dropdown(
label="Select Tokenizer",
choices=list(TOKENIZER_OPTIONS.keys()),
value="Qwen/Qwen3-0.6B",
allow_custom_value=False
)
tokenize_custom_model = gr.Textbox(
label="Or Enter Custom Model ID",
placeholder="e.g., facebook/bart-base",
info="Override selection above with any HF model"
)
add_special = gr.Checkbox(label="Add Special Tokens", value=True)
show_special = gr.Checkbox(label="Show Special Tokens in Decoded", value=True)
tokenize_btn = gr.Button("Tokenize", variant="primary")
with gr.Row():
with gr.Column():
tokens_output = gr.Textbox(label="Tokens", lines=10, max_lines=20)
with gr.Column():
token_ids_output = gr.Textbox(label="Token IDs", lines=10, max_lines=20)
with gr.Row():
with gr.Column():
decoded_output = gr.Textbox(label="Decoded Text (Verification)", lines=5)
with gr.Column():
token_info_output = gr.Textbox(label="Detailed Token Information", lines=10, max_lines=20)
stats_output = gr.Textbox(label="Statistics", lines=7)
tokenize_btn.click(
fn=tokenize_text,
inputs=[tokenize_input, tokenize_model, add_special, show_special, tokenize_custom_model],
outputs=[tokens_output, token_ids_output, decoded_output, token_info_output, stats_output]
)
with gr.Tab("🔄 Detokenize"):
with gr.Row():
with gr.Column(scale=3):
decode_input = gr.Textbox(
label="Token IDs",
placeholder="Enter token IDs as a list [101, 2023, ...] or space/comma separated",
lines=5
)
with gr.Column(scale=1):
decode_model = gr.Dropdown(
label="Select Tokenizer",
choices=list(TOKENIZER_OPTIONS.keys()),
value="Qwen/Qwen3-0.6B"
)
decode_custom_model = gr.Textbox(
label="Or Enter Custom Model ID",
placeholder="e.g., facebook/bart-base"
)
skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
decode_btn = gr.Button("Decode", variant="primary")
decode_output = gr.Textbox(
label="Decoded Text",
lines=10,
interactive=False,
show_copy_button=True,
placeholder="Decoded text will appear here..."
)
decode_stats = gr.Textbox(
label="Statistics",
lines=5,
interactive=False
)
with gr.Accordion("Show Tokens", open=False):
decode_tokens_output = gr.Textbox(
label="Tokens",
lines=10,
interactive=False,
show_copy_button=True
)
decode_btn.click(
fn=decode_tokens,
inputs=[decode_input, decode_model, skip_special, decode_custom_model],
outputs=[decode_output, decode_tokens_output, decode_stats]
)
with gr.Tab("📊 Compare"):
compare_input = gr.Textbox(
label="Input Text",
placeholder="Enter text to compare tokenization across models...",
lines=5
)
compare_models = gr.CheckboxGroup(
label="Select Models to Compare",
choices=list(TOKENIZER_OPTIONS.keys()),
value=["Qwen/Qwen3-0.6B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
)
compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
compare_btn = gr.Button("Compare Tokenizers", variant="primary")
compare_output = gr.Markdown()
compare_btn.click(
fn=compare_tokenizers,
inputs=[compare_input, compare_models, compare_add_special],
outputs=compare_output
)
with gr.Tab("📖 Vocabulary"):
with gr.Row():
vocab_model = gr.Dropdown(
label="Select Tokenizer",
choices=list(TOKENIZER_OPTIONS.keys()),
value="Qwen/Qwen3-0.6B"
)
vocab_custom_model = gr.Textbox(
label="Or Enter Custom Model ID",
placeholder="e.g., facebook/bart-base"
)
vocab_btn = gr.Button("Analyze Vocabulary", variant="primary")
vocab_output = gr.Markdown()
vocab_btn.click(
fn=analyze_vocabulary,
inputs=[vocab_model, vocab_custom_model],
outputs=vocab_output
)
with gr.Tab("ℹ️ About"):
gr.Markdown("""
## About This Tool
This tokenizer playground provides researchers and developers with an easy way to experiment
with various tokenizers from the Hugging Face Model Hub.
### Supported Models
**Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes)
**Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)
**Other Popular Models:** GPT-2, Gemma, Mistral, Mixtral, DeepSeek, Phi, Yi, T5, BERT, GPT-NeoX, BLOOM, OPT, StableLM
### Custom Models
You can use any tokenizer from the Hugging Face Hub by entering its model ID in the "Custom Model ID" field.
For example:
- `facebook/bart-base`
- `EleutherAI/gpt-j-6b`
- `bigscience/bloom`
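The same lookup can be reproduced programmatically (a minimal sketch; assumes the `transformers` library and network access to the Hub):
```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
print(tok.tokenize("Hello world"))
```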
### Features Explanation
- **Tokenize:** Convert text into tokens and token IDs
- **Detokenize:** Convert token IDs back to text
- **Compare:** See how different tokenizers handle the same text
- **Vocabulary:** Explore tokenizer vocabulary and special tokens
### Tips
1. Different tokenizers can produce very different token counts for the same text
2. Special tokens (like [CLS], [SEP], <s>, </s>) are model-specific
3. Subword tokenization (used by most modern models) allows handling of out-of-vocabulary words
4. Token efficiency affects model performance and API costs
### Resources
- [Hugging Face Tokenizers Documentation](https://huggingface.co/docs/transformers/main_classes/tokenizer)
- [Understanding Tokenization](https://huggingface.co/docs/transformers/tokenizer_summary)
- [Model Hub](https://huggingface.co/models)
---
""")
# Launch the app
if __name__ == "__main__":
app.launch()