# Hugging Face Space: side-by-side tokenizer visualization demo ("Tokens matter.")
import os

import gradio as gr

import utils

# Custom theme: large text, square corners, Source Sans 3 with system fallbacks.
theme = gr.themes.Base(
    text_size="lg",
    radius_size="none",
    font=[gr.themes.GoogleFont('Source Sans 3'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
)

# Load tokenizers only once: gr.NO_RELOAD keeps this (slow) work from being
# re-executed on every hot reload when running under `gradio app.py`.
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    all_tokenizers = utils.load_tokenizers()
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")
def read_svg_file(name: str) -> str:
    """Return the raw SVG markup for a model's icon.

    Args:
        name: Model identifier (e.g. a Hugging Face repo id). Unknown names
            fall back to the ChatGPT icon.

    Returns:
        The SVG file content, or "" if the icon file cannot be read.
    """
    icon_map = {
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
        "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
        "ZurichNLP/swissbert": "swissbert.svg",
        "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
        "google/gemma-3-27b-it": "gemma.svg",
        "gpt-4o": "chatgpt.svg",
    }
    icon_path = os.path.join("icons", icon_map.get(name, "chatgpt.svg"))
    try:
        # SVG is text; pin the encoding instead of relying on the locale default.
        with open(icon_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        # A missing or unreadable icon is non-fatal: log and render no icon.
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""
def get_model_icon(name: str) -> str:
    """Return an inline-HTML 24x24 icon for the given model, or "" if none."""
    # Models shown in the collapsed "More Models" section are rendered
    # without an icon.
    iconless = {
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    }
    if name in iconless:
        return ""

    svg = read_svg_file(name)
    if not svg:
        return ""

    # Pin the rendered size on the <svg> element itself while keeping the
    # icon's aspect ratio intact.
    svg = svg.replace(
        '<svg',
        '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"',
    )
    # The outer container clips any overflow and vertically centers the icon.
    return (
        '<div style="display: inline-block; vertical-align: middle; '
        f'margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">{svg}</div>'
    )
def process_text(text):
    """Tokenize *text* with every pre-loaded tokenizer.

    Returns one HTML visualization per tokenizer (in tokenizer order),
    followed by a gr.update that reveals the "More Models" accordion.
    """
    rendered = utils.visualize_tokens(text, all_tokenizers)
    return [*rendered.values(), gr.update(visible=True)]
# Models tucked away in the "More Models" accordion instead of the main grid.
COLLAPSED_MODELS = [
    "google/gemma-3-27b-it",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "CohereLabs/aya-expanse-8b",
]


def _render_panel(name: str) -> gr.HTML:
    """Render one tokenizer panel (icon + display name header) and return the
    empty gr.HTML slot that will receive the token visualization."""
    display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
    with gr.Group(elem_classes="tokenizer-panel"):
        gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
        return gr.HTML()


# Create the Gradio interface
with gr.Blocks(title="Tokens matter.", theme=theme, css="""
.tokenizer-panel > div { background: var(--input-background-fill); }
.no-padding { padding: 0 !important; }
.form { border: 0 !important; }
.html-container { line-height: 2em !important; }
.pending { opacity: 1; }
@media (prefers-color-scheme: dark) {
    .gradio-container.gradio-container-5-29-0 .contain .html-container span.model-name { color: white !important; }
    .html-container span { color: black !important; }
}
""") as demo:
    gr.Markdown("# Tokens matter.")

    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
                # NOTE: an earlier version passed every=True here; `every`
                # expects a polling interval (seconds) with a callable value,
                # so it was a no-op. Live updates come from input_text.change().
            )

        # Right column for outputs
        with gr.Column(scale=2):
            main_output_boxes = []
            more_output_boxes = []

            # 2x2 grid for the main tokenizers (two per column).
            with gr.Row():
                with gr.Column():
                    for name in all_tokenizer_names[:2]:
                        if name in COLLAPSED_MODELS:
                            continue
                        main_output_boxes.append(_render_panel(name))
                with gr.Column():
                    for name in all_tokenizer_names[2:4]:
                        if name in COLLAPSED_MODELS:
                            continue
                        main_output_boxes.append(_render_panel(name))

            # Remaining tokenizers live in a collapsed accordion. It starts
            # hidden and is revealed by the gr.update emitted by process_text.
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                for name in all_tokenizer_names:
                    if name in COLLAPSED_MODELS:
                        more_output_boxes.append(_render_panel(name))

    # Output order must match process_text's return order:
    # main boxes, then accordion boxes, then the accordion visibility update.
    all_outputs = main_output_boxes + more_output_boxes + [more_models]

    # Re-tokenize on every edit for real-time feedback.
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )

    # Example sentences: Swiss German, German, French, Italian, Romansh, English.
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text
    )

if __name__ == "__main__":
    demo.launch()