#!/usr/bin/env python3
"""
Simple test script to verify tokenizer functionality.
This tests the core functions without launching the Gradio interface.
"""

import sys
import json
import traceback

# Test imports: fail fast with a readable message when a dependency is missing.
try:
    from transformers import AutoTokenizer
    print("✓ transformers imported successfully")
except ImportError as e:
    print(f"✗ Failed to import transformers: {e}")
    sys.exit(1)

try:
    import gradio as gr
    print("✓ gradio imported successfully")
except ImportError as e:
    print(f"✗ Failed to import gradio: {e}")
    sys.exit(1)

# GPT-2 is used throughout because it is small and commonly available.
MODEL_ID = "openai-community/gpt2"


def _report_failure(exc: Exception) -> None:
    """Print the failure line for a test, followed by the active traceback.

    Must be called from inside an ``except`` block so that
    ``traceback.print_exc()`` can see the exception being handled.
    """
    print(f"✗ Test failed: {exc}")
    traceback.print_exc()


# Test basic tokenization
def test_basic_tokenization() -> bool:
    """Test basic tokenization with a small model.

    Loads the tokenizer for ``MODEL_ID``, encodes a sample sentence, decodes
    it again, checks that the round trip reproduces the input, and converts
    the ids to token strings.

    Returns:
        True when every step succeeds, False when any step raises.
    """
    print("\n--- Testing Basic Tokenization ---")

    try:
        model_id = MODEL_ID
        text = "Hello, world! This is a test."

        print(f"Loading tokenizer: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("✓ Tokenizer loaded successfully")

        # Test encoding
        encoded = tokenizer.encode(text)
        print(f"✓ Text encoded: {encoded[:10]}...")  # Show first 10 tokens

        # Test decoding
        decoded = tokenizer.decode(encoded)
        print(f"✓ Text decoded: {decoded}")

        # Verify round-trip
        assert decoded == text, "Round-trip tokenization failed"
        print("✓ Round-trip tokenization successful")

        # Test token conversion
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        print(f"✓ Tokens: {tokens[:5]}...")  # Show first 5 tokens

        return True

    except Exception as exc:
        # Boundary of a single test: report and let the suite continue.
        _report_failure(exc)
        return False


def test_special_tokens() -> bool:
    """Test special token handling.

    Encodes the same text with and without ``add_special_tokens`` and prints
    both token counts, then decodes the special-token encoding with and
    without ``skip_special_tokens`` and prints both strings.

    Returns:
        True when every step succeeds, False when any step raises.
    """
    print("\n--- Testing Special Tokens ---")

    try:
        text = "Test text"

        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        # With special tokens
        encoded_with = tokenizer.encode(text, add_special_tokens=True)
        # Without special tokens
        encoded_without = tokenizer.encode(text, add_special_tokens=False)

        print(f"✓ With special tokens: {len(encoded_with)} tokens")
        print(f"✓ Without special tokens: {len(encoded_without)} tokens")

        # Decode with and without special tokens. Both calls decode the ids
        # produced with add_special_tokens=True; only the skip flag differs.
        decoded_with = tokenizer.decode(encoded_with, skip_special_tokens=False)
        decoded_without = tokenizer.decode(encoded_with, skip_special_tokens=True)

        print(f"✓ Decoded with special: {decoded_with}")
        print(f"✓ Decoded without special: {decoded_without}")

        return True

    except Exception as exc:
        # Boundary of a single test: report and let the suite continue.
        _report_failure(exc)
        return False


def test_app_functions() -> bool:
    """Test the main app functions.

    Imports ``tokenize_text``, ``decode_tokens`` and ``analyze_vocabulary``
    from the ``app`` module and calls each once against ``MODEL_ID``,
    asserting on the shape or content of what they return.

    Returns:
        True when every check passes, False when the import or any check fails.
    """
    print("\n--- Testing App Functions ---")

    try:
        # Imported here so a broken app module fails this test only.
        from app import tokenize_text, decode_tokens, analyze_vocabulary

        # Test tokenize_text
        print("Testing tokenize_text function...")
        result = tokenize_text(
            text="Hello world",
            model_id=MODEL_ID,
            add_special_tokens=True,
            show_special_tokens=True,
            custom_model_id=None,
        )
        assert len(result) == 5, "tokenize_text should return 5 values"
        print("✓ tokenize_text function works")

        # Test decode_tokens
        print("Testing decode_tokens function...")
        decode_result = decode_tokens(
            token_ids_str="[15496, 11, 995]",  # "Hello, world" in GPT-2
            model_id=MODEL_ID,
            skip_special_tokens=False,
            custom_model_id=None,
        )
        assert "Decoded Text:" in decode_result, "decode_tokens should return decoded text"
        print("✓ decode_tokens function works")

        # Test analyze_vocabulary
        print("Testing analyze_vocabulary function...")
        vocab_result = analyze_vocabulary(
            model_id=MODEL_ID,
            custom_model_id=None,
        )
        assert "Vocabulary Size:" in vocab_result, "analyze_vocabulary should return vocabulary info"
        print("✓ analyze_vocabulary function works")

        return True

    except Exception as exc:
        # Boundary of a single test: report and let the suite continue.
        _report_failure(exc)
        return False


def main() -> int:
    """Run all tests.

    Returns:
        0 when every test passed, 1 otherwise (used as the process exit code).
    """
    banner = "=" * 50

    print(banner)
    print("Tokenizer Playground Test Suite")
    print(banner)

    tests = (
        test_basic_tokenization,
        test_special_tokens,
        test_app_functions,
    )

    # Every test runs even after an earlier failure; each returns a bool.
    results = [test() for test in tests]

    print("\n" + banner)
    print("Test Summary")
    print(banner)

    passed = sum(results)
    total = len(results)

    print(f"Passed: {passed}/{total}")

    if passed == total:
        print("✅ All tests passed!")
        return 0

    print("❌ Some tests failed")
    return 1


if __name__ == "__main__":
    sys.exit(main())