Spaces:

afeng
/

tokenizers

Running

App Files Community

afeng committed on Nov 11

Commit

405302e

1 Parent(s): af99c46

update output window

Browse files

Files changed (2) hide show

README.md +2 -1
app.py +53 -20

README.md CHANGED Viewed

@@ -9,6 +9,7 @@ app_file: app.py
 pinned: true
 license: mit
 models:
   - Qwen/Qwen2.5-7B
   - meta-llama/Llama-3.1-8B
   - openai-community/gpt2
@@ -56,7 +57,7 @@ An interactive web application for experimenting with various Hugging Face token
 ## Supported Models
 ### Pre-configured Models
-- **Qwen Series**: Qwen 2.5, Qwen 2, Qwen 1 (multiple sizes)
 - **Llama Series**: Llama 3.2, Llama 3.1, Llama 2 (multiple sizes)
 - **GPT Models**: GPT-2, GPT-NeoX
 - **Google Models**: Gemma, T5, BERT

 pinned: true
 license: mit
 models:
+  - Qwen/Qwen3-0.6B
   - Qwen/Qwen2.5-7B
   - meta-llama/Llama-3.1-8B
   - openai-community/gpt2
 ## Supported Models
 ### Pre-configured Models
+- **Qwen Series**: Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (multiple sizes)
 - **Llama Series**: Llama 3.2, Llama 3.1, Llama 2 (multiple sizes)
 - **GPT Models**: GPT-2, GPT-NeoX
 - **Google Models**: Gemma, T5, BERT

app.py CHANGED Viewed

@@ -7,6 +7,10 @@ from typing import Optional, Dict, List, Tuple
 # Popular tokenizer models
 TOKENIZER_OPTIONS = {
     # Qwen Series
     "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
     "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
     "Qwen/Qwen2-7B": "Qwen 2 (7B)",
@@ -139,17 +143,24 @@ def decode_tokens(
     model_id: str,
     skip_special_tokens: bool = False,
     custom_model_id: Optional[str] = None
-) -> str:
-    """Decode token IDs back to text."""
     try:
         # Use custom model ID if provided
         actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
         if not actual_model_id:
-            return "Please select or enter a tokenizer model."
         # Parse token IDs
         token_ids_str = token_ids_str.strip()
         if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
             token_ids = json.loads(token_ids_str)
         else:
@@ -162,19 +173,21 @@ def decode_tokens(
         # Also show tokens
         tokens = tokenizer.convert_ids_to_tokens(token_ids)
-        result = f"""Decoded Text:
-{decoded}
-Tokens:
-{json.dumps(tokens, ensure_ascii=False, indent=2)}
-Token Count: {len(tokens)}"""
-        return result
     except Exception as e:
-        return f"Error: {str(e)}\n{traceback.format_exc()}"
 def compare_tokenizers(
     text: str,
@@ -308,7 +321,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
                 tokenize_model = gr.Dropdown(
                     label="Select Tokenizer",
                     choices=list(TOKENIZER_OPTIONS.keys()),
-                    value="Qwen/Qwen2.5-7B",
                     allow_custom_value=False
                 )
                 tokenize_custom_model = gr.Textbox(
@@ -352,7 +365,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
                 decode_model = gr.Dropdown(
                     label="Select Tokenizer",
                     choices=list(TOKENIZER_OPTIONS.keys()),
-                    value="Qwen/Qwen2.5-7B"
                 )
                 decode_custom_model = gr.Textbox(
                     label="Or Enter Custom Model ID",
@@ -361,12 +374,32 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
                 skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
                 decode_btn = gr.Button("Decode", variant="primary")
-        decode_output = gr.Textbox(label="Decoded Result", lines=10)
         decode_btn.click(
             fn=decode_tokens,
             inputs=[decode_input, decode_model, skip_special, decode_custom_model],
-            outputs=decode_output
         )
     with gr.Tab("📊 Compare"):
@@ -379,7 +412,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
         compare_models = gr.CheckboxGroup(
             label="Select Models to Compare",
             choices=list(TOKENIZER_OPTIONS.keys()),
-            value=["Qwen/Qwen2.5-7B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
         )
         compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
@@ -398,7 +431,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
             vocab_model = gr.Dropdown(
                 label="Select Tokenizer",
                 choices=list(TOKENIZER_OPTIONS.keys()),
-                value="Qwen/Qwen2.5-7B"
             )
             vocab_custom_model = gr.Textbox(
                 label="Or Enter Custom Model ID",
@@ -423,7 +456,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
         ### Supported Models
-        **Qwen Series:** Qwen 2.5, Qwen 2, Qwen 1 (various sizes)
         **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)
@@ -459,7 +492,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
         ---
-        Made with ❤️ for the NLP research community
         """)
 # Launch the app

 # Popular tokenizer models
 TOKENIZER_OPTIONS = {
     # Qwen Series
+    "Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
+    "Qwen/Qwen3-1.8B": "Qwen 3 (1.8B)",
+    "Qwen/Qwen3-4B": "Qwen 3 (4B)",
+    "Qwen/Qwen3-7B": "Qwen 3 (7B)",
     "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
     "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
     "Qwen/Qwen2-7B": "Qwen 2 (7B)",
     model_id: str,
     skip_special_tokens: bool = False,
     custom_model_id: Optional[str] = None
+) -> Tuple[str, str, str]:
+    """Decode token IDs back to text.
+    Returns:
+        Tuple of (decoded_text, tokens_json, stats)
+    """
     try:
         # Use custom model ID if provided
         actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
         if not actual_model_id:
+            return "Please select or enter a tokenizer model.", "", ""
         # Parse token IDs
         token_ids_str = token_ids_str.strip()
+        if not token_ids_str:
+            return "", "", ""
         if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
             token_ids = json.loads(token_ids_str)
         else:
         # Also show tokens
         tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)
+        # Statistics
+        stats = f"""Statistics:
+• Model: {actual_model_id}
+• Token count: {len(tokens)}
+• Character count: {len(decoded)}
+• Characters per token: {len(decoded)/len(tokens):.2f}
+• Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}"""
+        return decoded, tokens_json, stats
     except Exception as e:
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        return error_msg, "", ""
 def compare_tokenizers(
     text: str,
                 tokenize_model = gr.Dropdown(
                     label="Select Tokenizer",
                     choices=list(TOKENIZER_OPTIONS.keys()),
+                    value="Qwen/Qwen3-0.6B",
                     allow_custom_value=False
                 )
                 tokenize_custom_model = gr.Textbox(
                 decode_model = gr.Dropdown(
                     label="Select Tokenizer",
                     choices=list(TOKENIZER_OPTIONS.keys()),
+                    value="Qwen/Qwen3-0.6B"
                 )
                 decode_custom_model = gr.Textbox(
                     label="Or Enter Custom Model ID",
                 skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
                 decode_btn = gr.Button("Decode", variant="primary")
+        decode_output = gr.Textbox(
+            label="Decoded Text",
+            lines=10,
+            interactive=False,
+            show_copy_button=True,
+            placeholder="Decoded text will appear here..."
+        )
+        decode_stats = gr.Textbox(
+            label="Statistics",
+            lines=5,
+            interactive=False
+        )
+        with gr.Accordion("Show Tokens", open=False):
+            decode_tokens_output = gr.Textbox(
+                label="Tokens",
+                lines=10,
+                interactive=False,
+                show_copy_button=True
+            )
         decode_btn.click(
             fn=decode_tokens,
             inputs=[decode_input, decode_model, skip_special, decode_custom_model],
+            outputs=[decode_output, decode_tokens_output, decode_stats]
         )
     with gr.Tab("📊 Compare"):
         compare_models = gr.CheckboxGroup(
             label="Select Models to Compare",
             choices=list(TOKENIZER_OPTIONS.keys()),
+            value=["Qwen/Qwen3-0.6B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
         )
         compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
             vocab_model = gr.Dropdown(
                 label="Select Tokenizer",
                 choices=list(TOKENIZER_OPTIONS.keys()),
+                value="Qwen/Qwen3-0.6B"
             )
             vocab_custom_model = gr.Textbox(
                 label="Or Enter Custom Model ID",
         ### Supported Models
+        **Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes)
         **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)
         ---
         """)
 # Launch the app