Commit 4f7e31d (verified) · Parent: f111c19
NNEngine committed

Initial release: TinyWay 1.1.0 (83.17M params)
__init__.py ADDED
@@ -0,0 +1 @@
+from .modeling_tinyway import TinyWayConfig, TinyWayForCausalLM
config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "model_type": "tinyway",
+  "architectures": [
+    "TinyWayForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "modeling_tinyway.TinyWayConfig",
+    "AutoModelForCausalLM": "modeling_tinyway.TinyWayForCausalLM"
+  },
+  "vocab_size": 50257,
+  "n_positions": 256,
+  "n_embd": 512,
+  "n_layer": 10,
+  "n_head": 8,
+  "dropout": 0.1,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 50256
+}
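
Because config.json publishes an auto_map pointing at the classes in modeling_tinyway.py, the checkpoint can be loaded through the standard Auto classes with remote code enabled. A minimal sketch, assuming the repository id is "NNEngine/TinyWay" (a placeholder; substitute the actual repo path):

    from transformers import AutoConfig, AutoModelForCausalLM

    repo_id = "NNEngine/TinyWay"  # placeholder repo id

    # auto_map routes AutoConfig / AutoModelForCausalLM to modeling_tinyway.py
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    print(type(model).__name__)  # TinyWayForCausalLM
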
generation_config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "max_new_tokens": 256,
+  "do_sample": true,
+  "temperature": 0.8,
+  "top_p": 0.95,
+  "top_k": 50,
+  "repetition_penalty": 1.1,
+  "eos_token_id": 50256,
+  "pad_token_id": 50256
+}
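
These sampling defaults are read automatically by generate() when the model is loaded from the repo; passing parameters explicitly is only needed to override them. A minimal usage sketch (same placeholder repo id as above). One caveat: the model has only 256 position embeddings (n_positions=256), so prompt length plus generated tokens should stay within 256 even though max_new_tokens alone is set to 256 here.

    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "NNEngine/TinyWay"  # placeholder repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

    inputs = tokenizer("Once upon a time", return_tensors="pt")
    # do_sample/temperature/top_p/top_k/repetition_penalty come from generation_config.json;
    # max_new_tokens is lowered so prompt + continuation fits the 256-position table
    output = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

Since the custom forward implements no KV cache and prepare_inputs_for_generation re-feeds the full sequence, generation recomputes the whole prefix at every step, which is acceptable at this model size.
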
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a144801be18e22e1c1456b2b4c4ce9c065238bc97c4f307b63b5b7da9a36aa4
+size 333345108
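
What is committed here is a Git LFS pointer, not the weights themselves: oid is the SHA-256 of the real model.safetensors and size is its byte count. A minimal sketch for checking a downloaded copy against the pointer (the local path is a placeholder):

    import hashlib

    EXPECTED_OID = "1a144801be18e22e1c1456b2b4c4ce9c065238bc97c4f307b63b5b7da9a36aa4"
    EXPECTED_SIZE = 333345108

    def verify_lfs_object(path: str) -> bool:
        """Compare a local file against the LFS pointer's sha256 oid and size."""
        digest = hashlib.sha256()
        total = 0
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
                total += len(chunk)
        return digest.hexdigest() == EXPECTED_OID and total == EXPECTED_SIZE

    print(verify_lfs_object("model.safetensors"))  # placeholder path to the downloaded file
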
modeling_tinyway.py ADDED
@@ -0,0 +1,199 @@
+
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutput
+
+
+# =========================
+# Config
+# =========================
+
+class TinyWayConfig(PretrainedConfig):
+    model_type = "tinyway"
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=256,
+        n_embd=512,
+        n_layer=10,
+        n_head=8,
+        dropout=0.1,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.dropout = dropout
+
+        # 🔥 HuggingFace-required aliases
+        self.hidden_size = n_embd
+        self.num_hidden_layers = n_layer
+        self.num_attention_heads = n_head
+        self.max_position_embeddings = n_positions
+
+
+# =========================
+# Causal Self-Attention
+# =========================
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+
+        self.n_head = config.n_head
+        self.head_dim = config.n_embd // config.n_head
+
+        self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.proj = nn.Linear(config.n_embd, config.n_embd)
+
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.proj_dropout = nn.Dropout(config.dropout)
+
+        self.register_buffer(
+            "mask",
+            torch.tril(
+                torch.ones(
+                    config.n_positions,
+                    config.n_positions,
+                    dtype=torch.bool
+                )
+            )
+        )
+
+        self.last_attn = None
+
+    def forward(self, x):
+        B, T, C = x.shape
+
+        qkv = self.qkv(x)
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+
+        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+        att = att.masked_fill(
+            ~self.mask[:T, :T],
+            torch.finfo(att.dtype).min
+        )
+
+        att = F.softmax(att, dim=-1)
+        self.last_attn = att.detach()
+
+        att = self.attn_dropout(att)
+
+        out = att @ v
+        out = out.transpose(1, 2).contiguous().view(B, T, C)
+
+        out = self.proj(out)
+        out = self.proj_dropout(out)
+
+        return out
+
+
+# =========================
+# Transformer Block
+# =========================
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.ln1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+
+        self.ln2 = nn.LayerNorm(config.n_embd)
+
+        # 🔥 FFN EXACTLY MATCHES TRAINING
+        self.ffn = nn.Sequential(
+            nn.Linear(config.n_embd, 4 * config.n_embd),
+            nn.GELU(),
+            nn.Linear(4 * config.n_embd, config.n_embd),
+            nn.Dropout(config.dropout),
+        )
+
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ffn(self.ln2(x))
+        return x
+
+
+# =========================
+# TinyWay Language Model
+# =========================
+
+class TinyWayForCausalLM(PreTrainedModel):
+    config_class = TinyWayConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        self.pos_emb = nn.Embedding(config.n_positions, config.n_embd)
+
+        self.blocks = nn.ModuleList([
+            Block(config) for _ in range(config.n_layer)
+        ])
+
+        self.ln = nn.LayerNorm(config.n_embd)
+
+        self.head = nn.Linear(
+            config.n_embd,
+            config.vocab_size,
+            bias=False
+        )
+
+        # weight tying
+        self.head.weight = self.token_emb.weight
+
+        self.dropout = nn.Dropout(config.dropout)
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids,
+        labels=None,
+        attention_mask=None,  # intentionally unused (causal LM)
+        **kwargs  # 🔥 accept return_dict, use_cache, etc.
+    ):
+        B, T = input_ids.shape
+        pos = torch.arange(T, device=input_ids.device)
+
+        x = self.token_emb(input_ids) + self.pos_emb(pos)
+        x = self.dropout(x)
+
+        for block in self.blocks:
+            x = block(x)
+
+        x = self.ln(x)
+        logits = self.head(x)
+
+        loss = None
+        if labels is not None:
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)),
+                labels.view(-1)
+            )
+
+        return CausalLMOutput(
+            loss=loss,
+            logits=logits
+        )
+
+
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {"input_ids": input_ids}
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
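
The tokenizer is the stock GPT-2 BPE tokenizer with every special token mapped to <|endoftext|> (id 50256), matching bos/eos/pad_token_id in config.json. Note that model_max_length is 1024 here while the model only has 256 position embeddings, so inputs should be truncated to 256. A minimal sketch (placeholder repo id):

    from transformers import AutoTokenizer

    repo_id = "NNEngine/TinyWay"  # placeholder repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)  # 50256 50256 50256

    # truncate to the model's n_positions (256), not the tokenizer's model_max_length (1024)
    enc = tokenizer("a long prompt ...", truncation=True, max_length=256, return_tensors="pt")
    print(enc["input_ids"].shape)
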
vocab.json ADDED
The diff for this file is too large to render. See raw diff