Commit 4f7e31d (verified) · Parent: f111c19
NNEngine committed

Initial release: TinyWay 1.1.0 (83.17M params)
__init__.py ADDED
@@ -0,0 +1 @@
+from .modeling_tinyway import TinyWayConfig, TinyWayForCausalLM
config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "model_type": "tinyway",
+  "architectures": [
+    "TinyWayForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "modeling_tinyway.TinyWayConfig",
+    "AutoModelForCausalLM": "modeling_tinyway.TinyWayForCausalLM"
+  },
+  "vocab_size": 50257,
+  "n_positions": 256,
+  "n_embd": 512,
+  "n_layer": 10,
+  "n_head": 8,
+  "dropout": 0.1,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 50256
+}
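
Because config.json publishes an auto_map pointing at the classes in modeling_tinyway.py, the checkpoint can be loaded through the standard Auto classes with remote code enabled. A minimal sketch, assuming the repository id is "NNEngine/TinyWay" (a placeholder; substitute the actual repo path):

    from transformers import AutoConfig, AutoModelForCausalLM

    repo_id = "NNEngine/TinyWay"  # placeholder repo id

    # auto_map routes AutoConfig / AutoModelForCausalLM to modeling_tinyway.py
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    print(type(model).__name__)  # TinyWayForCausalLM
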
generation_config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "max_new_tokens": 256,
+  "do_sample": true,
+  "temperature": 0.8,
+  "top_p": 0.95,
+  "top_k": 50,
+  "repetition_penalty": 1.1,
+  "eos_token_id": 50256,
+  "pad_token_id": 50256
+}
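
These sampling defaults are read automatically by generate() when the model is loaded from the repo; passing parameters explicitly is only needed to override them. A minimal usage sketch (same placeholder repo id as above). One caveat: the model has only 256 position embeddings (n_positions=256), so prompt length plus generated tokens should stay within 256 even though max_new_tokens alone is set to 256 here.

    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "NNEngine/TinyWay"  # placeholder repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

    inputs = tokenizer("Once upon a time", return_tensors="pt")
    # do_sample/temperature/top_p/top_k/repetition_penalty come from generation_config.json;
    # max_new_tokens is lowered so prompt + continuation fits the 256-position table
    output = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(output[0], skip_special_tokens=True))

Since the custom forward implements no KV cache and prepare_inputs_for_generation re-feeds the full sequence, generation recomputes the whole prefix at every step, which is acceptable at this model size.
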
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a144801be18e22e1c1456b2b4c4ce9c065238bc97c4f307b63b5b7da9a36aa4
+size 333345108
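
What is committed here is a Git LFS pointer, not the weights themselves: oid is the SHA-256 of the real model.safetensors and size is its byte count. A minimal sketch for checking a downloaded copy against the pointer (the local path is a placeholder):

    import hashlib

    EXPECTED_OID = "1a144801be18e22e1c1456b2b4c4ce9c065238bc97c4f307b63b5b7da9a36aa4"
    EXPECTED_SIZE = 333345108

    def verify_lfs_object(path: str) -> bool:
        """Compare a local file against the LFS pointer's sha256 oid and size."""
        digest = hashlib.sha256()
        total = 0
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
                total += len(chunk)
        return digest.hexdigest() == EXPECTED_OID and total == EXPECTED_SIZE

    print(verify_lfs_object("model.safetensors"))  # placeholder path to the downloaded file
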
modeling_tinyway.py ADDED
@@ -0,0 +1,199 @@
+
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutput
+
+
+# =========================
+# Config
+# =========================
+
+class TinyWayConfig(PretrainedConfig):
+    model_type = "tinyway"
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=256,
+        n_embd=512,
+        n_layer=10,
+        n_head=8,
+        dropout=0.1,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.dropout = dropout
+
+        # 🔥 HuggingFace-required aliases
+        self.hidden_size = n_embd
+        self.num_hidden_layers = n_layer
+        self.num_attention_heads = n_head
+        self.max_position_embeddings = n_positions
+
+
+# =========================
+# Causal Self-Attention
+# =========================
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+
+        self.n_head = config.n_head
+        self.head_dim = config.n_embd // config.n_head
+
+        self.qkv = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.proj = nn.Linear(config.n_embd, config.n_embd)
+
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.proj_dropout = nn.Dropout(config.dropout)
+
+        self.register_buffer(
+            "mask",
+            torch.tril(
+                torch.ones(
+                    config.n_positions,
+                    config.n_positions,
+                    dtype=torch.bool
+                )
+            )
+        )
+
+        self.last_attn = None
+
+    def forward(self, x):
+        B, T, C = x.shape
+
+        qkv = self.qkv(x)
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+
+        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+        att = att.masked_fill(
+            ~self.mask[:T, :T],
+            torch.finfo(att.dtype).min
+        )
+
+        att = F.softmax(att, dim=-1)
+        self.last_attn = att.detach()
+
+        att = self.attn_dropout(att)
+
+        out = att @ v
+        out = out.transpose(1, 2).contiguous().view(B, T, C)
+
+        out = self.proj(out)
+        out = self.proj_dropout(out)
+
+        return out
+
+
+# =========================
+# Transformer Block
+# =========================
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.ln1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+
+        self.ln2 = nn.LayerNorm(config.n_embd)
+
+        # 🔥 FFN EXACTLY MATCHES TRAINING
+        self.ffn = nn.Sequential(
+            nn.Linear(config.n_embd, 4 * config.n_embd),
+            nn.GELU(),
+            nn.Linear(4 * config.n_embd, config.n_embd),
+            nn.Dropout(config.dropout),
+        )
+
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ffn(self.ln2(x))
+        return x
+
+
+# =========================
+# TinyWay Language Model
+# =========================
+
+class TinyWayForCausalLM(PreTrainedModel):
+    config_class = TinyWayConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.token_emb = nn.Embedding(config.vocab_size, config.n_embd)
+        self.pos_emb = nn.Embedding(config.n_positions, config.n_embd)
+
+        self.blocks = nn.ModuleList([
+            Block(config) for _ in range(config.n_layer)
+        ])
+
+        self.ln = nn.LayerNorm(config.n_embd)
+
+        self.head = nn.Linear(
+            config.n_embd,
+            config.vocab_size,
+            bias=False
+        )
+
+        # weight tying
+        self.head.weight = self.token_emb.weight
+
+        self.dropout = nn.Dropout(config.dropout)
+
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids,
+        labels=None,
+        attention_mask=None,  # intentionally unused (causal LM)
+        **kwargs  # 🔥 accept return_dict, use_cache, etc.
+    ):
+        B, T = input_ids.shape
+        pos = torch.arange(T, device=input_ids.device)
+
+        x = self.token_emb(input_ids) + self.pos_emb(pos)
+        x = self.dropout(x)
+
+        for block in self.blocks:
+            x = block(x)
+
+        x = self.ln(x)
+        logits = self.head(x)
+
+        loss = None
+        if labels is not None:
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)),
+                labels.view(-1)
+            )
+
+        return CausalLMOutput(
+            loss=loss,
+            logits=logits
+        )
+
+
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {"input_ids": input_ids}
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}
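
The tokenizer is the stock GPT-2 BPE tokenizer with every special token mapped to <|endoftext|> (id 50256), matching bos/eos/pad_token_id in config.json. Note that model_max_length is 1024 here while the model only has 256 position embeddings, so inputs should be truncated to 256. A minimal sketch (placeholder repo id):

    from transformers import AutoTokenizer

    repo_id = "NNEngine/TinyWay"  # placeholder repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)  # 50256 50256 50256

    # truncate to the model's n_positions (256), not the tokenizer's model_max_length (1024)
    enc = tokenizer("a long prompt ...", truncation=True, max_length=256, return_tensors="pt")
    print(enc["input_ids"].shape)
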
vocab.json ADDED
The diff for this file is too large to render. See raw diff