Update app.py
app.py
CHANGED
@@ -14,14 +14,13 @@ tokenizers = {
     "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
 }
 
-# Decode byte-level tokens back to UTF-8, normalized
 def decode_byte_token(token):
     token_clean = token.replace("Ġ", "")
     try:
         byte_seq = bytes([ord(c) for c in token_clean])
         return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
     except Exception:
-        return token_clean
+        return token_clean
 
 def visualize_tokens(text, tokenizer_name, show_token_ids):
     tokenizer = tokenizers[tokenizer_name]
@@ -35,16 +34,15 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
 
     def is_special_token(token):
         return (
-            token.startswith('[') and token.endswith(']')
-
-
+            token.startswith('[') and token.endswith(']') or
+            token.startswith('<') and token.endswith('>') or
+            token in tokenizer.all_special_tokens
         )
 
     html_tokens = []
     for token in tokens:
         prefix = ""
         token_body = token
-
         if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
             prefix = "Ġ"
             token_body = token[1:]
@@ -77,9 +75,14 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
     if show_token_ids:
         html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
 
-    return html_output, f"🔢 Token Count: {len(tokens)}"
+    try:
+        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
+    except Exception:
+        decoded_output = "[Could not decode using this tokenizer]"
+
+    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
 
-#
+# App
 with gr.Blocks() as app:
     gr.Markdown("# 🚀 German Tokenizers")
@@ -92,11 +95,12 @@ with gr.Blocks() as app:
         with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")
+           decoded_output = gr.Textbox(label="Decoded Text", lines=3)
 
    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
-       outputs=[html_output, token_count]
+       outputs=[html_output, token_count, decoded_output]
    )
 
app.launch()
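
For reference, the byte-level decode that this diff leaves in place can be exercised on its own. A minimal sketch, assuming a GPT-2-style byte-level vocab (leading spaces rendered as "Ġ", the raw UTF-8 bytes of "ü" surfacing as "Ã¼"); the example token string is illustrative, not taken from the app:

import unicodedata

def decode_byte_token(token):
    # "Ġ" marks a leading space in GPT-2-style byte-level vocabs
    token_clean = token.replace("Ġ", "")
    try:
        # each visible char stands in for one raw byte; chars above 255 raise here
        byte_seq = bytes([ord(c) for c in token_clean])
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except Exception:
        return token_clean  # fall back to the raw token string

print(decode_byte_token("ĠÃ¼ber"))  # -> "über" (0xC3 0xBC is the UTF-8 encoding of "ü")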
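
One subtlety in the new is_special_token: the condition is un-parenthesized, so it relies on Python's `and` binding tighter than `or`, i.e. `A and B or C and D or E` parses as `(A and B) or (C and D) or E`. A quick illustrative check (the inline list is a stand-in for tokenizer.all_special_tokens):

token = "<|endoftext|>"
special = (
    token.startswith('[') and token.endswith(']')
    or token.startswith('<') and token.endswith('>')
    or token in ["<s>", "</s>"]  # stand-in for tokenizer.all_special_tokens
)
print(special)  # True: caught by the angle-bracket check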