Naphula committed
Commit d45a30f · verified · 1 Parent(s): 22115d9

Upload tokenizer_ripper_v1.py

Files changed (1)
  1. tokenizer_ripper_v1.py +154 -0
tokenizer_ripper_v1.py ADDED
import os
import argparse
import json
from typing import Any

from gguf import GGUFReader, GGUFValueType


def _field_value(field: Any) -> Any:
    """
    Decode a gguf-py ReaderField into plain Python values.

    ReaderField.data holds the indices of the value parts inside
    ReaderField.parts; string parts are raw uint8 buffers, numeric parts are
    one-element arrays. Arrays come back as lists, scalars as a single value,
    and a missing field (None) is passed through as None.
    """
    if field is None:
        return None
    is_string = field.types[-1] == GGUFValueType.STRING
    values = [
        bytes(field.parts[idx]).decode("utf-8", errors="ignore") if is_string
        else field.parts[idx][0]
        for idx in field.data
    ]
    if field.types[0] == GGUFValueType.ARRAY:
        return values
    return values[0]


def extract_and_save_tokenizer_files(gguf_path: str, output_dir: str) -> None:
    """
    Extracts tokenizer metadata from a GGUF file and saves it as
    tokenizer.json, tokenizer_config.json, and special_tokens_map.json.
    """
    print(f"Loading GGUF file for tokenizer metadata: {gguf_path}")
    reader = GGUFReader(gguf_path, 'r')

    # --- Extract raw metadata from GGUF ---
    try:
        # Tokens and merges are stored as arrays of strings.
        vocab_list = _field_value(reader.get_field("tokenizer.ggml.tokens"))
        merges_list = _field_value(reader.get_field("tokenizer.ggml.merges"))

        bos_token_id = int(_field_value(reader.get_field("tokenizer.ggml.bos_token_id")))
        eos_token_id = int(_field_value(reader.get_field("tokenizer.ggml.eos_token_id")))

        # Unknown/padding token ids are optional in many GGUFs; fall back to -1
        # so the string lookups below use their default strings instead.
        unk_raw = _field_value(reader.get_field("tokenizer.ggml.unknown_token_id"))
        pad_raw = _field_value(reader.get_field("tokenizer.ggml.padding_token_id"))
        unk_token_id = int(unk_raw) if unk_raw is not None else -1
        padding_token_id = int(pad_raw) if pad_raw is not None else -1

        # The context-length key is prefixed with the model architecture
        # (e.g. "llama.context_length", "qwen2.context_length").
        arch = _field_value(reader.get_field("general.architecture")) or "llama"
        model_max_length = int(_field_value(reader.get_field(f"{arch}.context_length")))

        # Optional: chat template (get_field returns None when the key is absent)
        chat_template = _field_value(reader.get_field("tokenizer.chat_template"))

    except Exception as e:
        print(f"Fatal Error: Could not extract essential tokenizer metadata from GGUF. Error: {e}")
        return

    # --- 1. Create tokenizer.json ---
    try:
        # The vocab for tokenizer.json needs to be a dict of {token_string: id}
        vocab_dict = {token: i for i, token in enumerate(vocab_list)}

        tokenizer_json_data = {
            "version": "1.0",
            "truncation": None,
            "padding": None,
            "added_tokens": [],  # GGUF doesn't typically store this separately
            # Byte-level BPE models usually need little or no normalization;
            # NFC is kept here, adjust if the source model used something else.
            "normalizer": {
                "type": "Sequence",
                "normalizers": [
                    {"type": "NFC"},
                ]
            },
            "pre_tokenizer": {
                "type": "ByteLevel",  # Common for BPE models like GPT2/Llama
                "add_prefix_space": False,  # Based on tokenizer.ggml.add_space_prefix = 0
                "trim_offsets": True
            },
            "post_processor": {
                "type": "ByteLevel",
                "add_prefix_space": False,
                "trim_offsets": True
            },
            "decoder": {
                "type": "ByteLevel",
                "add_prefix_space": False,
                "trim_offsets": True
            },
            "model": {
                "type": "BPE",
                "vocab": vocab_dict,
                "merges": merges_list,  # GGUF stores merges as "left right" strings
                "dropout": None,
                "unk_token": vocab_list[unk_token_id] if 0 <= unk_token_id < len(vocab_list) else "<unk>"
            }
        }

        tokenizer_json_path = os.path.join(output_dir, "tokenizer.json")
        with open(tokenizer_json_path, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_json_data, f, indent=None, separators=(',', ':'))  # Compact format
        print(f"Created tokenizer.json at {tokenizer_json_path}")
    except Exception as e:
        print(f"Warning: Could not create tokenizer.json. Error: {e}")

    # --- 2. Create tokenizer_config.json ---
    try:
        # These flags are optional in GGUF; default to the usual causal-LM behaviour.
        add_bos = _field_value(reader.get_field("tokenizer.ggml.add_bos_token"))
        add_eos = _field_value(reader.get_field("tokenizer.ggml.add_eos_token"))

        tokenizer_config_data = {
            "model_max_length": model_max_length,
            "padding_side": "left",  # Common default for causal models
            "tokenizer_class": "LlamaTokenizer",  # Mistral uses LlamaTokenizer
            "clean_up_tokenization_spaces": False,
            "add_bos_token": bool(add_bos) if add_bos is not None else True,
            "add_eos_token": bool(add_eos) if add_eos is not None else False,
        }
        if chat_template:
            tokenizer_config_data["chat_template"] = chat_template

        tokenizer_config_path = os.path.join(output_dir, "tokenizer_config.json")
        with open(tokenizer_config_path, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config_data, f, indent=2)
        print(f"Created tokenizer_config.json at {tokenizer_config_path}")
    except Exception as e:
        print(f"Warning: Could not create tokenizer_config.json. Error: {e}")

    # --- 3. Create special_tokens_map.json ---
    try:
        special_tokens_map_data = {}

        def get_token_string(token_id, default_str):
            if 0 <= token_id < len(vocab_list):
                return vocab_list[token_id]
            return default_str

        special_tokens_map_data["bos_token"] = get_token_string(bos_token_id, "<|begin_of_text|>")
        special_tokens_map_data["eos_token"] = get_token_string(eos_token_id, "<|end_of_text|>")
        special_tokens_map_data["unk_token"] = get_token_string(unk_token_id, "<unk>")
        special_tokens_map_data["pad_token"] = get_token_string(padding_token_id, "<pad>")

        special_tokens_map_path = os.path.join(output_dir, "special_tokens_map.json")
        with open(special_tokens_map_path, 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map_data, f, indent=2)
        print(f"Created special_tokens_map.json at {special_tokens_map_path}")
    except Exception as e:
        print(f"Warning: Could not create special_tokens_map.json. Error: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="Extracts tokenizer metadata from a GGUF file and saves it as Hugging Face tokenizer files."
    )
    parser.add_argument("--gguf-file", required=True, help="Path to the original GGUF file to read metadata from.")
    parser.add_argument("--output-dir", required=True, help="Path to the directory where the tokenizer files will be saved.")
    args = parser.parse_args()

    if not os.path.isfile(args.gguf_file):
        print(f"Error: GGUF file not found at {args.gguf_file}")
        return
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
        print(f"Created output directory: {args.output_dir}")

    extract_and_save_tokenizer_files(args.gguf_file, args.output_dir)

    print("\nTokenizer file generation complete.")


if __name__ == "__main__":
    main()
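
A quick way to sanity-check the files this script emits is to load the generated tokenizer.json with the `tokenizers` library and round-trip a string. The sketch below is not part of the committed file; it assumes `tokenizers` is installed, and the file names (`model.gguf`, `./ripped_tokenizer`) are placeholders for whatever you pass as --gguf-file and --output-dir.

# After running, e.g.:
#   python tokenizer_ripper_v1.py --gguf-file model.gguf --output-dir ./ripped_tokenizer
from tokenizers import Tokenizer

tok = Tokenizer.from_file("./ripped_tokenizer/tokenizer.json")
enc = tok.encode("Hello, world!")
print(enc.ids)            # token ids produced by the ripped vocab/merges
print(enc.tokens)         # the corresponding token strings
print(tok.decode(enc.ids))  # should round-trip back to the input text

If the file fails to parse or the decode does not round-trip, the hard-coded normalizer/pre_tokenizer settings in the generated tokenizer.json likely need adjusting to match the source model, as the script's own comments note.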