import random
import gradio as gr
import unicodedata
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Tokenizers available for comparison: the two Tabularis entries load local
# tokenizer JSON files expected to sit next to app.py, the rest are pulled
# from the Hugging Face Hub.
tokenizers = {
    "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
    "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
    "KoichiYasuoka/bert-base-german-upos": AutoTokenizer.from_pretrained("KoichiYasuoka/bert-base-german-upos"),
    "benjamin/gerpt2-large": AutoTokenizer.from_pretrained("benjamin/gerpt2-large"),
    "deepset/gbert-base": AutoTokenizer.from_pretrained("deepset/gbert-base"),
    "bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
    "MiriUll/gpt2-wechsel-german_easy": AutoTokenizer.from_pretrained("MiriUll/gpt2-wechsel-german_easy"),
    "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1"),
}
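# Another Hub tokenizer could be added to the comparison by extending the dict,
# e.g. (illustrative entry, assuming the model id is available on the Hub):
# tokenizers["dbmdz/bert-base-german-uncased"] = AutoTokenizer.from_pretrained(
#     "dbmdz/bert-base-german-uncased"
# )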
def decode_byte_token(token):
    """Turn a byte-level BPE token back into readable text.

    Byte-level tokenizers store each UTF-8 byte as one character, so umlauts
    appear as mojibake; re-encoding and decoding as UTF-8 recovers the text.
    Tokens that are not valid UTF-8 byte sequences are returned unchanged.
    """
    token_clean = token.replace("Ġ", "")
    try:
        byte_seq = bytes([ord(c) for c in token_clean])
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except Exception:
        return token_clean
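# Illustrative example: in a GPT-2-style byte-level vocabulary the word "über"
# is stored as "Ã¼ber", because the UTF-8 bytes of "ü" (0xC3 0xBC) are kept as
# individual characters; decode_byte_token("ĠÃ¼ber") turns them back into "über".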
def visualize_tokens(text, tokenizer_name, show_token_ids):
    tokenizer = tokenizers[tokenizer_name]
    encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
    token_ids = encoded["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    def random_pastel():
        # Light random background colour so neighbouring tokens stand apart.
        r = lambda: random.randint(100, 255)
        return f"rgb({r()},{r()},{r()})"

    def is_special_token(token):
        # Bracketed tokens such as [CLS] or <s>, plus the tokenizer's declared
        # special tokens, are rendered in gray instead of a pastel colour.
        return (
            (token.startswith('[') and token.endswith(']')) or
            (token.startswith('<') and token.endswith('>')) or
            token in tokenizer.all_special_tokens
        )

    html_tokens = []
    for token in tokens:
        prefix = ""
        token_body = token
        # The Tabularis tokenizers mark a leading space with "Ġ"; keep the
        # marker visible but decode the remainder of the token.
        if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
            prefix = "Ġ"
            token_body = token[1:]
        try:
            # Same byte-level decoding as decode_byte_token above.
            byte_seq = bytes([ord(c) for c in token_body])
            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
        except Exception:
            decoded = token_body
        label = f"{prefix}{decoded}"
        color = "lightgray" if is_special_token(token) else random_pastel()
        html_token = f"""
        <span title="{token}" style='
            display:inline-block;
            margin:4px;
            padding:8px 12px;
            background-color:{color};
            border-radius:8px;
            font-size:18px;
            font-family:monospace;
            font-weight:bold;
        '>{label}</span>
        """
        html_tokens.append(html_token)

    html_output = "".join(html_tokens)
    if show_token_ids:
        html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
    try:
        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
    except Exception:
        decoded_output = "[Could not decode using this tokenizer]"
    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
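# Quick sanity check outside the UI (illustrative call):
#   html, count, text = visualize_tokens("Guten Morgen", "deepset/gbert-base", False)
# count is a string of the form "🔢 Token Count: N"; text is the round-tripped input.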
# App
with gr.Blocks() as app:
    gr.Markdown("# 🚀 German Tokenizers")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
            tokenizer_choice = gr.Dropdown(list(tokenizers.keys()), label="Choose Tokenizer")
            show_ids = gr.Checkbox(label="Show Token IDs", value=False)
            tokenize_btn = gr.Button("Tokenize!")
        with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")
            decoded_output = gr.Textbox(label="Decoded Text", lines=3)

    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
        outputs=[html_output, token_count, decoded_output],
    )

app.launch()
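# To run locally: `python app.py`; Gradio serves at http://127.0.0.1:7860 by
# default, and the same launch() call works unchanged on Hugging Face Spaces.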