dnzblgn committed on
Commit
f9cbde1
·
verified ·
1 Parent(s): d58e0ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -14,14 +14,13 @@ tokenizers = {
14
  "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
15
  }
16
 
17
- # Decode byte-level tokens back to UTF-8, normalized
18
  def decode_byte_token(token):
19
  token_clean = token.replace("Ġ", "")
20
  try:
21
  byte_seq = bytes([ord(c) for c in token_clean])
22
  return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
23
  except Exception:
24
- return token_clean # fallback
25
 
26
  def visualize_tokens(text, tokenizer_name, show_token_ids):
27
  tokenizer = tokenizers[tokenizer_name]
@@ -35,16 +34,15 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
35
 
36
  def is_special_token(token):
37
  return (
38
- token.startswith('[') and token.endswith(']')
39
- or token.startswith('<') and token.endswith('>')
40
- or token in tokenizer.all_special_tokens
41
  )
42
 
43
  html_tokens = []
44
  for token in tokens:
45
  prefix = ""
46
  token_body = token
47
-
48
  if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
49
  prefix = "Ġ"
50
  token_body = token[1:]
@@ -77,9 +75,14 @@ def visualize_tokens(text, tokenizer_name, show_token_ids):
77
  if show_token_ids:
78
  html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
79
 
80
- return html_output, f"🔢 Token Count: {len(tokens)}"
 
 
 
 
 
81
 
82
- #app
83
  with gr.Blocks() as app:
84
  gr.Markdown("# 🚀 German Tokenizers")
85
 
@@ -92,11 +95,12 @@ with gr.Blocks() as app:
92
  with gr.Column():
93
  html_output = gr.HTML(label="Tokens Visualized")
94
  token_count = gr.Label(label="Token Count")
 
95
 
96
  tokenize_btn.click(
97
  visualize_tokens,
98
  inputs=[text_input, tokenizer_choice, show_ids],
99
- outputs=[html_output, token_count]
100
  )
101
 
102
  app.launch()
 
14
  "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
15
  }
16
 
 
17
  def decode_byte_token(token):
18
  token_clean = token.replace("Ġ", "")
19
  try:
20
  byte_seq = bytes([ord(c) for c in token_clean])
21
  return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
22
  except Exception:
23
+ return token_clean
24
 
25
  def visualize_tokens(text, tokenizer_name, show_token_ids):
26
  tokenizer = tokenizers[tokenizer_name]
 
34
 
35
  def is_special_token(token):
36
  return (
37
+ token.startswith('[') and token.endswith(']') or
38
+ token.startswith('<') and token.endswith('>') or
39
+ token in tokenizer.all_special_tokens
40
  )
41
 
42
  html_tokens = []
43
  for token in tokens:
44
  prefix = ""
45
  token_body = token
 
46
  if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
47
  prefix = "Ġ"
48
  token_body = token[1:]
 
75
  if show_token_ids:
76
  html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)
77
 
78
+ try:
79
+ decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
80
+ except Exception:
81
+ decoded_output = "[Could not decode using this tokenizer]"
82
+
83
+ return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output
84
 
85
+ # App
86
  with gr.Blocks() as app:
87
  gr.Markdown("# 🚀 German Tokenizers")
88
 
 
95
  with gr.Column():
96
  html_output = gr.HTML(label="Tokens Visualized")
97
  token_count = gr.Label(label="Token Count")
98
+ decoded_output = gr.Textbox(label="Decoded Text", lines=3)
99
 
100
  tokenize_btn.click(
101
  visualize_tokens,
102
  inputs=[text_input, tokenizer_choice, show_ids],
103
+ outputs=[html_output, token_count, decoded_output]
104
  )
105
 
106
  app.launch()