afeng committed on
Commit
405302e
·
1 Parent(s): af99c46

update output window

Browse files
Files changed (2) hide show
  1. README.md +2 -1
  2. app.py +53 -20
README.md CHANGED
@@ -9,6 +9,7 @@ app_file: app.py
9
  pinned: true
10
  license: mit
11
  models:
 
12
  - Qwen/Qwen2.5-7B
13
  - meta-llama/Llama-3.1-8B
14
  - openai-community/gpt2
@@ -56,7 +57,7 @@ An interactive web application for experimenting with various Hugging Face token
56
  ## Supported Models
57
 
58
  ### Pre-configured Models
59
- - **Qwen Series**: Qwen 2.5, Qwen 2, Qwen 1 (multiple sizes)
60
  - **Llama Series**: Llama 3.2, Llama 3.1, Llama 2 (multiple sizes)
61
  - **GPT Models**: GPT-2, GPT-NeoX
62
  - **Google Models**: Gemma, T5, BERT
 
9
  pinned: true
10
  license: mit
11
  models:
12
+ - Qwen/Qwen3-0.6B
13
  - Qwen/Qwen2.5-7B
14
  - meta-llama/Llama-3.1-8B
15
  - openai-community/gpt2
 
57
  ## Supported Models
58
 
59
  ### Pre-configured Models
60
+ - **Qwen Series**: Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (multiple sizes)
61
  - **Llama Series**: Llama 3.2, Llama 3.1, Llama 2 (multiple sizes)
62
  - **GPT Models**: GPT-2, GPT-NeoX
63
  - **Google Models**: Gemma, T5, BERT
app.py CHANGED
@@ -7,6 +7,10 @@ from typing import Optional, Dict, List, Tuple
7
  # Popular tokenizer models
8
  TOKENIZER_OPTIONS = {
9
  # Qwen Series
 
 
 
 
10
  "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
11
  "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
12
  "Qwen/Qwen2-7B": "Qwen 2 (7B)",
@@ -139,17 +143,24 @@ def decode_tokens(
139
  model_id: str,
140
  skip_special_tokens: bool = False,
141
  custom_model_id: Optional[str] = None
142
- ) -> str:
143
- """Decode token IDs back to text."""
 
 
 
 
144
  try:
145
  # Use custom model ID if provided
146
  actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
147
 
148
  if not actual_model_id:
149
- return "Please select or enter a tokenizer model."
150
 
151
  # Parse token IDs
152
  token_ids_str = token_ids_str.strip()
 
 
 
153
  if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
154
  token_ids = json.loads(token_ids_str)
155
  else:
@@ -162,19 +173,21 @@ def decode_tokens(
162
 
163
  # Also show tokens
164
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
 
165
 
166
- result = f"""Decoded Text:
167
- {decoded}
168
-
169
- Tokens:
170
- {json.dumps(tokens, ensure_ascii=False, indent=2)}
171
-
172
- Token Count: {len(tokens)}"""
173
 
174
- return result
175
 
176
  except Exception as e:
177
- return f"Error: {str(e)}\n{traceback.format_exc()}"
 
178
 
179
  def compare_tokenizers(
180
  text: str,
@@ -308,7 +321,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
308
  tokenize_model = gr.Dropdown(
309
  label="Select Tokenizer",
310
  choices=list(TOKENIZER_OPTIONS.keys()),
311
- value="Qwen/Qwen2.5-7B",
312
  allow_custom_value=False
313
  )
314
  tokenize_custom_model = gr.Textbox(
@@ -352,7 +365,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
352
  decode_model = gr.Dropdown(
353
  label="Select Tokenizer",
354
  choices=list(TOKENIZER_OPTIONS.keys()),
355
- value="Qwen/Qwen2.5-7B"
356
  )
357
  decode_custom_model = gr.Textbox(
358
  label="Or Enter Custom Model ID",
@@ -361,12 +374,32 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
361
  skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
362
  decode_btn = gr.Button("Decode", variant="primary")
363
 
364
- decode_output = gr.Textbox(label="Decoded Result", lines=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
  decode_btn.click(
367
  fn=decode_tokens,
368
  inputs=[decode_input, decode_model, skip_special, decode_custom_model],
369
- outputs=decode_output
370
  )
371
 
372
  with gr.Tab("📊 Compare"):
@@ -379,7 +412,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
379
  compare_models = gr.CheckboxGroup(
380
  label="Select Models to Compare",
381
  choices=list(TOKENIZER_OPTIONS.keys()),
382
- value=["Qwen/Qwen2.5-7B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
383
  )
384
 
385
  compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
@@ -398,7 +431,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
398
  vocab_model = gr.Dropdown(
399
  label="Select Tokenizer",
400
  choices=list(TOKENIZER_OPTIONS.keys()),
401
- value="Qwen/Qwen2.5-7B"
402
  )
403
  vocab_custom_model = gr.Textbox(
404
  label="Or Enter Custom Model ID",
@@ -423,7 +456,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
423
 
424
  ### Supported Models
425
 
426
- **Qwen Series:** Qwen 2.5, Qwen 2, Qwen 1 (various sizes)
427
 
428
  **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)
429
 
@@ -459,7 +492,7 @@ with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app
459
 
460
  ---
461
 
462
- Made with ❤️ for the NLP research community
463
  """)
464
 
465
  # Launch the app
 
7
  # Popular tokenizer models
8
  TOKENIZER_OPTIONS = {
9
  # Qwen Series
10
+ "Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
11
+ "Qwen/Qwen3-1.8B": "Qwen 3 (1.8B)",
12
+ "Qwen/Qwen3-4B": "Qwen 3 (4B)",
13
+ "Qwen/Qwen3-7B": "Qwen 3 (7B)",
14
  "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
15
  "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
16
  "Qwen/Qwen2-7B": "Qwen 2 (7B)",
 
143
  model_id: str,
144
  skip_special_tokens: bool = False,
145
  custom_model_id: Optional[str] = None
146
+ ) -> Tuple[str, str, str]:
147
+ """Decode token IDs back to text.
148
+
149
+ Returns:
150
+ Tuple of (decoded_text, tokens_json, stats)
151
+ """
152
  try:
153
  # Use custom model ID if provided
154
  actual_model_id = custom_model_id.strip() if custom_model_id and custom_model_id.strip() else model_id
155
 
156
  if not actual_model_id:
157
+ return "Please select or enter a tokenizer model.", "", ""
158
 
159
  # Parse token IDs
160
  token_ids_str = token_ids_str.strip()
161
+ if not token_ids_str:
162
+ return "", "", ""
163
+
164
  if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
165
  token_ids = json.loads(token_ids_str)
166
  else:
 
173
 
174
  # Also show tokens
175
  tokens = tokenizer.convert_ids_to_tokens(token_ids)
176
+ tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)
177
 
178
+ # Statistics
179
+ stats = f"""Statistics:
180
+ • Model: {actual_model_id}
181
+ • Token count: {len(tokens)}
182
+ • Character count: {len(decoded)}
183
+ • Characters per token: {len(decoded)/len(tokens):.2f}
184
+ • Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}"""
185
 
186
+ return decoded, tokens_json, stats
187
 
188
  except Exception as e:
189
+ error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
190
+ return error_msg, "", ""
191
 
192
  def compare_tokenizers(
193
  text: str,
 
321
  tokenize_model = gr.Dropdown(
322
  label="Select Tokenizer",
323
  choices=list(TOKENIZER_OPTIONS.keys()),
324
+ value="Qwen/Qwen3-0.6B",
325
  allow_custom_value=False
326
  )
327
  tokenize_custom_model = gr.Textbox(
 
365
  decode_model = gr.Dropdown(
366
  label="Select Tokenizer",
367
  choices=list(TOKENIZER_OPTIONS.keys()),
368
+ value="Qwen/Qwen3-0.6B"
369
  )
370
  decode_custom_model = gr.Textbox(
371
  label="Or Enter Custom Model ID",
 
374
  skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
375
  decode_btn = gr.Button("Decode", variant="primary")
376
 
377
+ decode_output = gr.Textbox(
378
+ label="Decoded Text",
379
+ lines=10,
380
+ interactive=False,
381
+ show_copy_button=True,
382
+ placeholder="Decoded text will appear here..."
383
+ )
384
+
385
+ decode_stats = gr.Textbox(
386
+ label="Statistics",
387
+ lines=5,
388
+ interactive=False
389
+ )
390
+
391
+ with gr.Accordion("Show Tokens", open=False):
392
+ decode_tokens_output = gr.Textbox(
393
+ label="Tokens",
394
+ lines=10,
395
+ interactive=False,
396
+ show_copy_button=True
397
+ )
398
 
399
  decode_btn.click(
400
  fn=decode_tokens,
401
  inputs=[decode_input, decode_model, skip_special, decode_custom_model],
402
+ outputs=[decode_output, decode_tokens_output, decode_stats]
403
  )
404
 
405
  with gr.Tab("📊 Compare"):
 
412
  compare_models = gr.CheckboxGroup(
413
  label="Select Models to Compare",
414
  choices=list(TOKENIZER_OPTIONS.keys()),
415
+ value=["Qwen/Qwen3-0.6B", "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
416
  )
417
 
418
  compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
 
431
  vocab_model = gr.Dropdown(
432
  label="Select Tokenizer",
433
  choices=list(TOKENIZER_OPTIONS.keys()),
434
+ value="Qwen/Qwen3-0.6B"
435
  )
436
  vocab_custom_model = gr.Textbox(
437
  label="Or Enter Custom Model ID",
 
456
 
457
  ### Supported Models
458
 
459
+ **Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes)
460
 
461
  **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)
462
 
 
492
 
493
  ---
494
 
495
+
496
  """)
497
 
498
  # Launch the app