to0ony committed
Commit 8fccddb · 1 Parent(s): ce5071c

added app.py, requirements.txt and mingpt

app.py ADDED
@@ -0,0 +1,72 @@
+import gc, json, torch, gradio as gr
+from huggingface_hub import hf_hub_download
+import tiktoken
+
+from mingpt.model import GPT
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+REPO_ID = "to0ony/final-thesis-plotgen"
+
+state = {"model": None, "enc": tiktoken.get_encoding("gpt2")}
+
+def load_model():
+    """Lazy-load the model from the Hugging Face repository"""
+    if state["model"] is not None:
+        return state["model"]
+
+    # download config.json and model.pt
+    cfg_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
+    mdl_path = hf_hub_download(repo_id=REPO_ID, filename="model.pt")
+
+    # load the config
+    with open(cfg_path, "r", encoding="utf-8") as f:
+        cfg = json.load(f)
+
+    gcfg = GPT.get_default_config()
+    gcfg.model_type = None  # clear the default 'gpt' so GPT() accepts the explicit sizes below (it asserts exactly one of model_type / sizes is given)
+    gcfg.vocab_size = cfg["vocab_size"]
+    gcfg.block_size = cfg["block_size"]
+    gcfg.n_layer = cfg["n_layer"]
+    gcfg.n_head = cfg["n_head"]
+    gcfg.n_embd = cfg["n_embd"]
+
+    model = GPT(gcfg)
+    sd = torch.load(mdl_path, map_location="cpu")
+    model.load_state_dict(sd, strict=True)
+    model.to(DEVICE)
+    model.eval()
+
+    state["model"] = model
+    return model
+
+@torch.inference_mode()
+def generate(prompt, max_new_tokens=200, temperature=0.9, top_k=50):
+    """Generate text from a prompt"""
+    model = load_model()
+    enc = state["enc"]
+
+    x = torch.tensor([enc.encode(prompt)], dtype=torch.long, device=DEVICE)
+
+    y = model.generate(
+        x,
+        max_new_tokens=int(max_new_tokens),
+        temperature=float(temperature),
+        do_sample=True,  # sample from the distribution; minGPT decodes greedily by default, which would make the temperature/top-k sliders no-ops
+        top_k=int(top_k) if top_k > 0 else None
+    )
+
+    return enc.decode(y[0].tolist())
+
+# Gradio UI
+with gr.Blocks(title="🎬 Final Thesis Plot Generator") as demo:
+    gr.Markdown("## 🎬 Film Plot Generator\nEnter a prompt and generate a film plot.")
+
+    prompt = gr.Textbox(label="Prompt", lines=5, placeholder="E.g. A young detective arrives in a coastal town...")
+    max_new_tokens = gr.Slider(32, 512, value=200, step=16, label="Max new tokens")
+    temperature = gr.Slider(0.1, 1.5, value=0.9, step=0.1, label="Temperature")
+    top_k = gr.Slider(0, 100, value=50, step=5, label="Top-K (0 = off)")
+    btn = gr.Button("Generate")
+    output = gr.Textbox(label="Output", lines=15)
+
+    btn.click(generate, [prompt, max_new_tokens, temperature, top_k], output)
+
+if __name__ == "__main__":
+    demo.launch()
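
For a quick check outside the Gradio UI, the same lazy-load-and-generate path can be driven from a short script; a minimal sketch, assuming the file above is importable as app and that the to0ony/final-thesis-plotgen repo with config.json and model.pt is reachable:

import app  # importing builds the Blocks UI but does not launch it

text = app.generate("A young detective arrives in a coastal town",
                    max_new_tokens=64, temperature=0.8, top_k=40)
print(text)
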
mingpt/__init__.py ADDED
File without changes
mingpt/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (136 Bytes).
mingpt/__pycache__/model.cpython-312.pyc ADDED
Binary file (19.7 kB).
mingpt/__pycache__/utils.cpython-312.pyc ADDED
Binary file (5.26 kB).
mingpt/bpe.py ADDED
@@ -0,0 +1,319 @@
+"""
+bpe is short for Byte Pair Encoder. It translates arbitrary utf-8 strings into
+sequences of integers, where each integer represents small chunks of commonly
+occurring characters. This implementation is based on openai's gpt2 encoder.py:
+https://github.com/openai/gpt-2/blob/master/src/encoder.py
+but was mildly modified because the original implementation is a bit confusing.
+I also tried to add as many comments as possible, my own understanding of what's
+going on.
+"""
+
+import os
+import json
+import regex as re
+import requests
+
+import torch
+
+# -----------------------------------------------------------------------------
+
+def bytes_to_unicode():
+    """
+    Every possible byte (really an integer 0..255) gets mapped by OpenAI to a unicode
+    character that represents it visually. Some bytes have their appearance preserved
+    because they don't cause any trouble. These are defined in list bs. For example:
+    chr(33) returns "!", so in the returned dictionary we simply have d[33] -> "!".
+    However, chr(0), for example, is '\x00', which looks ugly. So OpenAI maps these
+    bytes, into new characters in a range where chr() returns a single nice character.
+    So in the final dictionary we have d[0] -> 'Ā' instead, which is just chr(0 + 2**8).
+    In particular, the space character is 32, which we can see by ord(' '). Instead,
+    this function will shift space (32) by 256 to 288, so d[32] -> 'Ġ'.
+    So this is just a simple one-to-one mapping of bytes 0..255 into unicode characters
+    that "look nice", either in their original form, or a funny shifted character
+    like 'Ā', or 'Ġ', etc.
+    """
+    # the 188 integers that render fine in their original form and need no shifting
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:] # all integers b in bs will simply map to chr(b) in the output dict
+    # now get the representations of the other 68 integers that do need shifting
+    # each will get mapped chr(256 + n), where n will grow from 0...67 in the loop
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            # if this byte is "ugly" then map it to the next available "nice" character
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    d = dict(zip(bs, cs))
+    return d
+
+def get_pairs(word):
+    """
+    Return all bigrams as a set of tuples, of consecutive elements in the iterable word.
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+class Encoder:
+
+    def __init__(self, encoder, bpe_merges):
+        # byte encoder/decoder
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
+        # bpe token encoder/decoder
+        self.encoder = encoder
+        self.decoder = {v:k for k,v in self.encoder.items()}
+        # bpe merge list that defines the bpe "tree", of tuples (a,b) that are to merge to token ab
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        # the splitting pattern used for pre-tokenization
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions <-- original openai comment
+        """
+        ok so what is this regex looking for, exactly?
+        python re reference: https://docs.python.org/3/library/re.html
+        - the vertical bars | is OR, so re.findall will chunkate text as the pieces match, from left to right
+        - '\'s' would split up things like Andrej's -> (Andrej, 's)
+        - ' ?\p{L}': optional space followed by 1+ unicode code points in the category "letter"
+        - ' ?\p{N}': optional space followed by 1+ unicode code points in the category "number"
+        - ' ?[^\s\p{L}\p{N}]+': optional space, then 1+ things that are NOT a whitespace, letter or number
+        - '\s+(?!\S)': 1+ whitespace characters (e.g. space or tab or etc) UNLESS they are followed by non-whitespace
+                       so this will consume whitespace characters in a sequence but exclude the last whitespace in
+                       that sequence. that last whitespace has the opportunity to then match the optional ' ?' in
+                       earlier patterns.
+        - '\s+': 1+ whitespace characters, intended probably to catch a full trailing sequence of whitespaces at end of string
+        So TLDR:
+        - we are special casing a few common apostrophe constructs ('s, 't, 're, ...) and making those into separate tokens
+        - we then separate out strings into consecutive chunks of 1) letters, 2) numbers, 3) non-letter-numbers, 4) whitespaces
+        """
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        self.cache = {}
+
+    def bpe(self, token):
+        """
+        this function uses self.bpe_ranks to iteratively merge all the possible bpe tokens
+        up the tree. token is a string of one individual 'word' (after regex tokenization)
+        and after byte encoding, e.g. 'Ġthere'.
+        """
+        # token is a string of one individual 'word', after byte encoding, e.g. 'Ġthere'
+
+        # memoization, for efficiency
+        if token in self.cache:
+            return self.cache[token]
+
+        word = tuple(token) # individual characters that make up the token, in a tuple
+        pairs = get_pairs(word) # get all bigrams
+
+        if not pairs:
+            return token
+
+        while True:
+
+            # find the next lowest rank bigram that can be merged
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break # no more bigrams are eligible to be merged
+            first, second = bigram
+
+            # we will now replace all occurrences of (first, second) in the list of current
+            # words into one merged token first_second, in the output list new_words
+            new_word = []
+            i = 0
+            while i < len(word):
+
+                # find the next occurrence of first in the sequence of current words
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+
+                # if this occurrence is also followed by second, then merge them into one
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+
+            # all occurrences of (first, second) have been merged to first_second
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+
+        # concat all words into a string, and use ' ' as the separator. Note that
+        # by now all characters have been byte encoded, guaranteeing that ' ' is
+        # not used in the actual data and is a 'special' delimiter character
+        word = ' '.join(word)
+
+        # cache the result and return
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        """ string goes in, list of integers comes out """
+        bpe_idx = []
+        # pre-tokenize the input text into string tokens (words, roughly speaking)
+        tokens = re.findall(self.pat, text)
+        # process each token into BPE integers
+        for token in tokens:
+            # encode the token as a bytes (b'') object
+            token_bytes = token.encode('utf-8')
+            # translate all bytes to their unicode string representation and flatten
+            token_translated = ''.join(self.byte_encoder[b] for b in token_bytes)
+            # perform all the applicable bpe merges according to self.bpe_ranks
+            token_merged = self.bpe(token_translated).split(' ')
+            # translate all bpe tokens to integers
+            token_ix = [self.encoder[bpe_token] for bpe_token in token_merged]
+            # extend our running list of all output integers
+            bpe_idx.extend(token_ix)
+        return bpe_idx
+
+    def encode_and_show_work(self, text):
+        """ debugging function, same as encode but returns all intermediate work """
+        bpe_idx = []
+        parts = []
+        tokens = re.findall(self.pat, text)
+        for token in tokens:
+            token_bytes = token.encode('utf-8')
+            token_translated = ''.join(self.byte_encoder[b] for b in token_bytes)
+            token_merged = self.bpe(token_translated).split(' ')
+            token_ix = [self.encoder[bpe_token] for bpe_token in token_merged]
+            bpe_idx.extend(token_ix)
+            parts.append({
+                'token': token,
+                'token_bytes': token_bytes,
+                'token_translated': token_translated,
+                'token_merged': token_merged,
+                'token_ix': token_ix,
+            })
+        out = {
+            'bpe_idx': bpe_idx, # the actual output sequence
+            'tokens': tokens, # result of pre-tokenization
+            'parts': parts, # intermediates for each token part
+        }
+        return out
+
+    def decode(self, bpe_idx):
+        """ list of integers comes in, string comes out """
+        # inverse map the integers to get the tokens
+        tokens_merged = [self.decoder[token] for token in bpe_idx]
+        # inverse the byte encoder, e.g. recovering 'Ġ' -> ' ', and get the bytes
+        tokens_flat = ''.join(tokens_merged)
+        tokens_bytes = bytearray([self.byte_decoder[c] for c in tokens_flat])
+        # recover the full utf-8 string
+        text = tokens_bytes.decode('utf-8', errors='replace')
+        return text
+
+def get_file(local_file, remote_file):
+    """ downloads remote_file to local_file if necessary """
+    if not os.path.isfile(local_file):
+        print(f"downloading {remote_file} to {local_file}")
+        response = requests.get(remote_file)
+        open(local_file, "wb").write(response.content)
+
+def get_encoder():
+    """
+    Returns an instance of the GPT BPE Encoder/Decoder
+    and handles caching of "database" files.
+    """
+    home_dir = os.path.expanduser('~')
+    cache_dir = os.path.join(home_dir, '.cache', 'mingpt')
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # load encoder.json that has the raw mappings from token -> bpe index
+    encoder_local_file = os.path.join(cache_dir, 'encoder.json')
+    encoder_remote_file = 'https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json'
+    get_file(encoder_local_file, encoder_remote_file)
+    with open(encoder_local_file, 'r') as f:
+        encoder = json.load(f)
+    assert len(encoder) == 50257 # 256 individual byte tokens, 50,000 merged tokens, and 1 special <|endoftext|> token
+
+    # load vocab.bpe that contains the bpe merges, i.e. the bpe tree structure
+    # in the form tuples (a, b), that indicate that (a, b) is to be merged to one token ab
+    vocab_local_file = os.path.join(cache_dir, 'vocab.bpe')
+    vocab_remote_file = 'https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe'
+    get_file(vocab_local_file, vocab_remote_file)
+    with open(vocab_local_file, 'r', encoding="utf-8") as f:
+        bpe_data = f.read()
+    # light postprocessing: strip the version on first line and the last line is a blank
+    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
+    assert len(bpe_merges) == 50000 # 50,000 merged tokens
+
+    # construct the Encoder object and return
+    enc = Encoder(encoder, bpe_merges)
+    return enc
+
+# -----------------------------------------------------------------------------
+
+class BPETokenizer:
+    """ PyTorch-aware class that wraps the Encoder above """
+
+    def __init__(self):
+        self.encoder = get_encoder()
+
+    def __call__(self, text, return_tensors='pt'):
+        # PyTorch only; here because we want to match huggingface/transformers interface
+        assert return_tensors == 'pt'
+        # single string input for now, in the future potentially a list of strings
+        assert isinstance(text, str)
+        # encode and create a "batch dimension" of 1
+        idx = [self.encoder.encode(text)]
+        # wrap into PyTorch tensor
+        out = torch.tensor(idx, dtype=torch.long)
+        return out
+
+    def decode(self, idx):
+        # ensure a simple 1D tensor for now
+        assert idx.ndim == 1
+        # decode indices to text
+        text = self.encoder.decode(idx.tolist())
+        return text
+
+
+if __name__ == '__main__':
+
+    # here is an encoding example
+    text = "Hello!! I'm Andrej Karpathy. It's 2022. w00t :D 🤗"
+    e = get_encoder()
+    r = e.encode_and_show_work(text)
+
+    print("Original text is:")
+    print(text)
+    print("First the text gets pre-tokenized, broken up into chunks, the outcome is:")
+    print(r['tokens'])
+    # ['Hello', '!!', ' I', "'m", ' Andrej', ' Karpathy', '.', ' It', "'s", ' 2022', '.', ' w', '00', 't', ' :', 'D', ' 🤗']
+    print("Then we iterate over each chunk and process them in turn...")
+    for part in r['parts']:
+        print(part)
+    # {'token': 'Hello', 'token_bytes': b'Hello', 'token_translated': 'Hello', 'token_merged': ['Hello'], 'token_ix': [15496]}
+    # {'token': '!!', 'token_bytes': b'!!', 'token_translated': '!!', 'token_merged': ['!!'], 'token_ix': [3228]}
+    # {'token': ' I', 'token_bytes': b' I', 'token_translated': 'ĠI', 'token_merged': ['ĠI'], 'token_ix': [314]}
+    # {'token': "'m", 'token_bytes': b"'m", 'token_translated': "'m", 'token_merged': ["'m"], 'token_ix': [1101]}
+    # {'token': ' Andrej', 'token_bytes': b' Andrej', 'token_translated': 'ĠAndrej', 'token_merged': ['ĠAndre', 'j'], 'token_ix': [10948, 73]}
+    # {'token': ' Karpathy', 'token_bytes': b' Karpathy', 'token_translated': 'ĠKarpathy', 'token_merged': ['ĠK', 'arp', 'athy'], 'token_ix': [509, 5117, 10036]}
+    # {'token': '.', 'token_bytes': b'.', 'token_translated': '.', 'token_merged': ['.'], 'token_ix': [13]}
+    # {'token': ' It', 'token_bytes': b' It', 'token_translated': 'ĠIt', 'token_merged': ['ĠIt'], 'token_ix': [632]}
+    # {'token': "'s", 'token_bytes': b"'s", 'token_translated': "'s", 'token_merged': ["'s"], 'token_ix': [338]}
+    # {'token': ' 2022', 'token_bytes': b' 2022', 'token_translated': 'Ġ2022', 'token_merged': ['Ġ2022'], 'token_ix': [33160]}
+    # {'token': '.', 'token_bytes': b'.', 'token_translated': '.', 'token_merged': ['.'], 'token_ix': [13]}
+    # {'token': ' w', 'token_bytes': b' w', 'token_translated': 'Ġw', 'token_merged': ['Ġw'], 'token_ix': [266]}
+    # {'token': '00', 'token_bytes': b'00', 'token_translated': '00', 'token_merged': ['00'], 'token_ix': [405]}
+    # {'token': 't', 'token_bytes': b't', 'token_translated': 't', 'token_merged': ['t'], 'token_ix': [83]}
+    # {'token': ' :', 'token_bytes': b' :', 'token_translated': 'Ġ:', 'token_merged': ['Ġ:'], 'token_ix': [1058]}
+    # {'token': 'D', 'token_bytes': b'D', 'token_translated': 'D', 'token_merged': ['D'], 'token_ix': [35]}
+    # {'token': ' 🤗', 'token_bytes': b' \xf0\x9f\xa4\x97', 'token_translated': 'ĠðŁ¤Ĺ', 'token_merged': ['ĠðŁ', '¤', 'Ĺ'], 'token_ix': [12520, 97, 245]}
+    # (refer to the code inside Encoder.encode for what these intermediates are)
+    print("and the final outcome is concatenating and flattening all the token_ix:")
+    print(r['bpe_idx'])
+    # [15496, 3228, 314, 1101, 10948, 73, 509, 5117, 10036, 13, 632, 338, 33160, 13, 266, 405, 83, 1058, 35, 12520, 97, 245]
+    # this would then become the integer input sequence to the transformer
+    print("ready to feed into a Transformer!")
mingpt/model.py ADDED
@@ -0,0 +1,310 @@
+"""
+Full definition of a GPT Language Model, all of it in this single file.
+
+References:
+1) the official GPT-2 TensorFlow implementation released by OpenAI:
+https://github.com/openai/gpt-2/blob/master/src/model.py
+2) huggingface/transformers PyTorch implementation:
+https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
+"""
+
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from mingpt.utils import CfgNode as CN
+
+# -----------------------------------------------------------------------------
+
+class NewGELU(nn.Module):
+    """
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
+    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
+    """
+    def forward(self, x):
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
+
+class CausalSelfAttention(nn.Module):
+    """
+    A vanilla multi-head masked self-attention layer with a projection at the end.
+    It is possible to use torch.nn.MultiheadAttention here but I am including an
+    explicit implementation here to show that there is nothing too scary here.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        # causal mask to ensure that attention is only applied to the left in the input sequence
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                                     .view(1, 1, config.block_size, config.block_size))
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        att = self.attn_dropout(att)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+        # output projection
+        y = self.resid_dropout(self.c_proj(y))
+        return y
+
+class Block(nn.Module):
+    """ an unassuming Transformer block """
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = nn.ModuleDict(dict(
+            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
+            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
+            act     = NewGELU(),
+            dropout = nn.Dropout(config.resid_pdrop),
+        ))
+        m = self.mlp
+        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x)))) # MLP forward
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlpf(self.ln_2(x))
+        return x
+
+class GPT(nn.Module):
+    """ GPT Language Model """
+
+    @staticmethod
+    def get_default_config():
+        C = CN()
+        # either model_type or (n_layer, n_head, n_embd) must be given in the config
+        C.model_type = 'gpt'
+        C.n_layer = None
+        C.n_head = None
+        C.n_embd = None
+        # these options must be filled in externally
+        C.vocab_size = None
+        C.block_size = None
+        # dropout hyperparameters
+        C.embd_pdrop = 0.1
+        C.resid_pdrop = 0.1
+        C.attn_pdrop = 0.1
+        return C
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.block_size = config.block_size
+
+        type_given = config.model_type is not None
+        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
+        assert type_given ^ params_given # exactly one of these (XOR)
+        if type_given:
+            # translate from model_type to detailed configuration
+            config.merge_from_dict({
+                # names follow the huggingface naming conventions
+                # GPT-1
+                'openai-gpt':  dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
+                # GPT-2 configs
+                'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
+                'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+                'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+                'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+                # Gophers
+                'gopher-44m':  dict(n_layer=8, n_head=16, n_embd=512),
+                # (there are a number more...)
+                # I made these tiny models up
+                'gpt-mini':    dict(n_layer=6, n_head=6, n_embd=192),
+                'gpt-micro':   dict(n_layer=4, n_head=4, n_embd=128),
+                'gpt-nano':    dict(n_layer=3, n_head=3, n_embd=48),
+            }[config.model_type])
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            drop = nn.Dropout(config.embd_pdrop),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
+        self.apply(self._init_weights)
+        for pn, p in self.named_parameters():
+            if pn.endswith('c_proj.weight'):
+                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+
+        # report number of parameters (note we don't count the decoder parameters in lm_head)
+        n_params = sum(p.numel() for p in self.transformer.parameters())
+        print("number of parameters: %.2fM" % (n_params/1e6,))
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        elif isinstance(module, nn.LayerNorm):
+            torch.nn.init.zeros_(module.bias)
+            torch.nn.init.ones_(module.weight)
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """
+        Initialize a pretrained GPT model by copying over the weights
+        from a huggingface/transformers checkpoint.
+        """
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+
+        # create a from-scratch initialized minGPT model
+        config = cls.get_default_config()
+        config.model_type = model_type
+        config.vocab_size = 50257 # openai's model vocabulary
+        config.block_size = 1024  # openai's model block_size
+        model = GPT(config)
+        sd = model.state_dict()
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla nn.Linear.
+        # this means that we have to transpose these weights when we import them
+        assert len(keys) == len(sd)
+        for k in keys:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+    def configure_optimizers(self, train_config):
+        """
+        This long function is unfortunately doing something very simple and is being very defensive:
+        We are separating out all parameters of the model into two buckets: those that will experience
+        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
+        We are then returning the PyTorch optimizer object.
+        """
+
+        # separate out all parameters to those that will and won't experience regularizing weight decay
+        decay = set()
+        no_decay = set()
+        whitelist_weight_modules = (torch.nn.Linear, )
+        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+        for mn, m in self.named_modules():
+            for pn, p in m.named_parameters():
+                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
+                # random note: because named_modules and named_parameters are recursive
+                # we will see the same tensors p many many times. but doing it this way
+                # allows us to know which parent module any tensor p belongs to...
+                if pn.endswith('bias'):
+                    # all biases will not be decayed
+                    no_decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                    # weights of whitelist modules will be weight decayed
+                    decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                    # weights of blacklist modules will NOT be weight decayed
+                    no_decay.add(fpn)
+
+        # validate that we considered every parameter
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        inter_params = decay & no_decay
+        union_params = decay | no_decay
+        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
+                                                           % (str(param_dict.keys() - union_params), )
+
+        # create the pytorch optimizer object
+        optim_groups = [
+            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
+            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
+        ]
+        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
+        return optimizer
+
+    def forward(self, idx, targets=None):
+        device = idx.device
+        b, t = idx.size()
+        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
+        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
+
+        # forward the GPT model itself
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
+        x = self.transformer.drop(tok_emb + pos_emb)
+        for block in self.transformer.h:
+            x = block(x)
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x)
+
+        # if we are given some desired targets also calculate the loss
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+
+        return logits, loss
+
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
+        """
+        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+        the sequence max_new_tokens times, feeding the predictions back into the model each time.
+        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+        """
+        for _ in range(max_new_tokens):
+            # if the sequence context is growing too long we must crop it at block_size
+            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
+            # forward the model to get the logits for the index in the sequence
+            logits, _ = self(idx_cond)
+            # pluck the logits at the final step and scale by desired temperature
+            logits = logits[:, -1, :] / temperature
+            # optionally crop the logits to only the top k options
+            if top_k is not None:
+                v, _ = torch.topk(logits, top_k)
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # either sample from the distribution or take the most likely element
+            if do_sample:
+                idx_next = torch.multinomial(probs, num_samples=1)
+            else:
+                _, idx_next = torch.topk(probs, k=1, dim=-1)
+            # append sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+
+        return idx
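
For orientation, the model can be instantiated standalone; a minimal sketch with made-up sizes (the network is untrained, so this only exercises the forward and generate paths):

import torch
from mingpt.model import GPT

cfg = GPT.get_default_config()
cfg.model_type = 'gpt-nano'   # 3 layers, 3 heads, 48-dim embeddings
cfg.vocab_size = 50257        # GPT-2 vocabulary size
cfg.block_size = 128          # context length picked for this sketch
model = GPT(cfg).eval()

idx = torch.zeros((1, 1), dtype=torch.long)                        # conditioning sequence: a single token 0
out = model.generate(idx, max_new_tokens=10, do_sample=True, top_k=10)
print(out.shape)                                                   # torch.Size([1, 11])
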
mingpt/trainer.py ADDED
@@ -0,0 +1,109 @@
+"""
+Simple training loop; Boilerplate that could apply to any arbitrary neural network,
+so nothing in this file really has anything to do with GPT specifically.
+"""
+
+import time
+from collections import defaultdict
+
+import torch
+from torch.utils.data.dataloader import DataLoader
+from mingpt.utils import CfgNode as CN
+
+class Trainer:
+
+    @staticmethod
+    def get_default_config():
+        C = CN()
+        # device to train on
+        C.device = 'auto'
+        # dataloader parameters
+        C.num_workers = 4
+        # optimizer parameters
+        C.max_iters = None
+        C.batch_size = 64
+        C.learning_rate = 3e-4
+        C.betas = (0.9, 0.95)
+        C.weight_decay = 0.1 # only applied on matmul weights
+        C.grad_norm_clip = 1.0
+        return C
+
+    def __init__(self, config, model, train_dataset):
+        self.config = config
+        self.model = model
+        self.optimizer = None
+        self.train_dataset = train_dataset
+        self.callbacks = defaultdict(list)
+
+        # determine the device we'll train on
+        if config.device == 'auto':
+            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        else:
+            self.device = config.device
+        self.model = self.model.to(self.device)
+        print("running on device", self.device)
+
+        # variables that will be assigned to trainer class later for logging and etc
+        self.iter_num = 0
+        self.iter_time = 0.0
+        self.iter_dt = 0.0
+
+    def add_callback(self, onevent: str, callback):
+        self.callbacks[onevent].append(callback)
+
+    def set_callback(self, onevent: str, callback):
+        self.callbacks[onevent] = [callback]
+
+    def trigger_callbacks(self, onevent: str):
+        for callback in self.callbacks.get(onevent, []):
+            callback(self)
+
+    def run(self):
+        model, config = self.model, self.config
+
+        # setup the optimizer
+        self.optimizer = model.configure_optimizers(config)
+
+        # setup the dataloader
+        train_loader = DataLoader(
+            self.train_dataset,
+            sampler=torch.utils.data.RandomSampler(self.train_dataset, replacement=True, num_samples=int(1e10)),
+            shuffle=False,
+            pin_memory=True,
+            batch_size=config.batch_size,
+            num_workers=config.num_workers,
+        )
+
+        model.train()
+        self.iter_num = 0
+        self.iter_time = time.time()
+        data_iter = iter(train_loader)
+        while True:
+
+            # fetch the next batch (x, y) and re-init iterator if needed
+            try:
+                batch = next(data_iter)
+            except StopIteration:
+                data_iter = iter(train_loader)
+                batch = next(data_iter)
+            batch = [t.to(self.device) for t in batch]
+            x, y = batch
+
+            # forward the model
+            logits, self.loss = model(x, y)
+
+            # backprop and update the parameters
+            model.zero_grad(set_to_none=True)
+            self.loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
+            self.optimizer.step()
+
+            self.trigger_callbacks('on_batch_end')
+            self.iter_num += 1
+            tnow = time.time()
+            self.iter_dt = tnow - self.iter_time
+            self.iter_time = tnow
+
+            # termination conditions
+            if config.max_iters is not None and self.iter_num >= config.max_iters:
+                break
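
Driving the loop above only needs a map-style dataset of (x, y) index pairs; a minimal smoke-test sketch with a made-up random-token dataset (nothing useful is learned, it just exercises the loop and the callback hook):

import torch
from torch.utils.data import Dataset
from mingpt.model import GPT
from mingpt.trainer import Trainer

class RandomTokens(Dataset):
    # toy dataset invented for this sketch: next-token prediction on random sequences
    def __init__(self, vocab_size=64, block_size=32, n=1000):
        self.data = torch.randint(vocab_size, (n, block_size + 1))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        chunk = self.data[i]
        return chunk[:-1], chunk[1:]  # inputs and targets shifted by one position

model_cfg = GPT.get_default_config()
model_cfg.model_type = 'gpt-nano'
model_cfg.vocab_size = 64
model_cfg.block_size = 32
model = GPT(model_cfg)

train_cfg = Trainer.get_default_config()
train_cfg.max_iters = 10     # a handful of steps is enough for a smoke test
train_cfg.batch_size = 8
train_cfg.num_workers = 0
trainer = Trainer(train_cfg, model, RandomTokens())
trainer.add_callback('on_batch_end', lambda t: print(f"iter {t.iter_num}: loss {t.loss.item():.4f}"))
trainer.run()
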
mingpt/utils.py ADDED
@@ -0,0 +1,103 @@
+
+import os
+import sys
+import json
+import random
+from ast import literal_eval
+
+import numpy as np
+import torch
+
+# -----------------------------------------------------------------------------
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+def setup_logging(config):
+    """ monotonous bookkeeping """
+    work_dir = config.system.work_dir
+    # create the work directory if it doesn't already exist
+    os.makedirs(work_dir, exist_ok=True)
+    # log the args (if any)
+    with open(os.path.join(work_dir, 'args.txt'), 'w') as f:
+        f.write(' '.join(sys.argv))
+    # log the config itself
+    with open(os.path.join(work_dir, 'config.json'), 'w') as f:
+        f.write(json.dumps(config.to_dict(), indent=4))
+
+class CfgNode:
+    """ a lightweight configuration class inspired by yacs """
+    # TODO: convert to subclass from a dict like in yacs?
+    # TODO: implement freezing to prevent shooting of own foot
+    # TODO: additional existence/override checks when reading/writing params?
+
+    def __init__(self, **kwargs):
+        self.__dict__.update(kwargs)
+
+    def __str__(self):
+        return self._str_helper(0)
+
+    def _str_helper(self, indent):
+        """ need to have a helper to support nested indentation for pretty printing """
+        parts = []
+        for k, v in self.__dict__.items():
+            if isinstance(v, CfgNode):
+                parts.append("%s:\n" % k)
+                parts.append(v._str_helper(indent + 1))
+            else:
+                parts.append("%s: %s\n" % (k, v))
+        parts = [' ' * (indent * 4) + p for p in parts]
+        return "".join(parts)
+
+    def to_dict(self):
+        """ return a dict representation of the config """
+        return { k: v.to_dict() if isinstance(v, CfgNode) else v for k, v in self.__dict__.items() }
+
+    def merge_from_dict(self, d):
+        self.__dict__.update(d)
+
+    def merge_from_args(self, args):
+        """
+        update the configuration from a list of strings that is expected
+        to come from the command line, i.e. sys.argv[1:].
+
+        The arguments are expected to be in the form of `--arg=value`, and
+        the arg can use . to denote nested sub-attributes. Example:
+
+        --model.n_layer=10 --trainer.batch_size=32
+        """
+        for arg in args:
+
+            keyval = arg.split('=')
+            assert len(keyval) == 2, "expecting each override arg to be of form --arg=value, got %s" % arg
+            key, val = keyval # unpack
+
+            # first translate val into a python object
+            try:
+                val = literal_eval(val)
+                """
+                need some explanation here.
+                - if val is simply a string, literal_eval will throw a ValueError
+                - if val represents a thing (like a 3, 3.14, [1,2,3], False, None, etc.) it will get created
+                """
+            except ValueError:
+                pass
+
+            # find the appropriate object to insert the attribute into
+            assert key[:2] == '--'
+            key = key[2:] # strip the '--'
+            keys = key.split('.')
+            obj = self
+            for k in keys[:-1]:
+                obj = getattr(obj, k)
+            leaf_key = keys[-1]
+
+            # ensure that this attribute exists
+            assert hasattr(obj, leaf_key), f"{key} is not an attribute that exists in the config"
+
+            # overwrite the attribute
+            print("command line overwriting config attribute %s with %s" % (key, val))
+            setattr(obj, leaf_key, val)
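
CfgNode is the config object that GPT.get_default_config and Trainer.get_default_config return; a minimal sketch of the nested --arg=value override convention, with hypothetical field names:

from mingpt.utils import CfgNode as CN

cfg = CN(model=CN(n_layer=3, n_head=3), trainer=CN(batch_size=64, learning_rate=3e-4))
cfg.merge_from_args(['--model.n_layer=6', '--trainer.batch_size=32'])
print(cfg)            # nested pretty-print via _str_helper
print(cfg.to_dict())  # plain-dict view, as used by setup_logging when writing config.json
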
requirements.txt ADDED
@@ -0,0 +1,4 @@
+torch
+gradio>=4.0.0
+huggingface_hub
+tiktoken