Custom Architecture:

import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import math

# 1. CONFIG
@dataclass
class Config:
    """Hyperparameters for the OtterLM decoder-only transformer."""

    vocab_size: int = 32000      # BPE vocabulary size
    block_size: int = 512        # maximum training sequence length
    n_layer: int = 12            # number of transformer blocks
    n_head: int = 12             # attention heads per block
    n_embd: int = 768            # embedding / hidden width
    rope_theta: float = 10000.0  # RoPE base frequency
    norm_eps: float = 1e-6       # RMSNorm epsilon

# 2. RMSNorm
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable per-feature gain.

    Unlike LayerNorm there is no mean subtraction and no bias: the input is
    divided by the RMS of its last dimension, then scaled.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps  # added inside the sqrt for numerical stability
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # rsqrt(mean(x^2) + eps) is the reciprocal RMS over the last dim.
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * inv_rms * self.scale

# 3. RoPE precompute
def precompute_rope_freqs(dim, max_len, theta=10000.0):
    """Precompute RoPE angle tables.

    Returns (cos, sin), each of shape (max_len, dim // 2), where entry
    [t, i] is cos/sin of t * theta**(-2i / dim).
    """
    half = dim // 2
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32)[:half] / dim))
    positions = torch.arange(max_len, dtype=torch.float32)
    # Outer product via broadcasting: (max_len, 1) * (1, half).
    angles = positions[:, None] * inv_freq[None, :]
    return angles.cos(), angles.sin()

# 4. RoPE apply
def apply_rotary_emb(q, k, cos, sin):
    """Apply rotary position embeddings to q and k (half-split rotation).

    q, k are (batch, heads, seq, head_dim); cos/sin are broadcastable tables
    with last dim head_dim // 2, truncated here to the current seq length.
    Each (first-half, second-half) pair of channels is rotated by the
    position-dependent angle.
    """
    half = q.shape[-1] // 2
    seq_len = q.shape[2]
    cos_t = cos[:, :, :seq_len, :]
    sin_t = sin[:, :, :seq_len, :]

    def rotate(t):
        # Treat (first, second) halves as the real/imag parts of a complex
        # number and multiply by e^{i*angle}.
        first, second = t[..., :half], t[..., half:]
        return torch.cat(
            (first * cos_t - second * sin_t, first * sin_t + second * cos_t),
            dim=-1,
        )

    return rotate(q), rotate(k)

# 5. BLOCK (SwiGLU 2.7x)
class OtterLMBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        
        # LLaMA-style SwiGLU: (8/3)*n_embd rounded to 256-multiple
        hidden_dim = int(8 * config.n_embd / 3)
        hidden_dim = ((hidden_dim + 255) // 256) * 256  # GPU-friendly alignment
        
        self.ln_1 = RMSNorm(config.n_embd, eps=config.norm_eps)
        self.ln_2 = RMSNorm(config.n_embd, eps=config.norm_eps)
        
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)  # Init handled by _init_weights
        
        self.mlp = nn.ModuleDict({
            'gate_proj': nn.Linear(config.n_embd, hidden_dim, bias=False),
            'up_proj': nn.Linear(config.n_embd, hidden_dim, bias=False),
            'down_proj': nn.Linear(hidden_dim, config.n_embd, bias=False),  # Init handled by _init_weights
        })

    def forward(self, x, cos, sin):
        x = x + self._attn_block(self.ln_1(x), cos, sin)
        x = x + self._mlp_block(self.ln_2(x))
        return x

    def _attn_block(self, x, cos, sin):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        
        q, k = apply_rotary_emb(q, k, cos, sin)
        
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)

    def _mlp_block(self, x):
        gate = F.silu(self.mlp.gate_proj(x))
        up = self.mlp.up_proj(x)
        return self.mlp.down_proj(gate * up)

# 6. MAIN MODEL (FIX INIT: depth-scaled for all output projections)
class OtterLM(nn.Module):
    """Decoder-only transformer LM (RoPE + RMSNorm + SwiGLU, tied embeddings).

    Token embeddings feed a stack of ``OtterLMBlock``s, followed by a final
    RMSNorm and a linear head whose weight is tied to the input embedding
    matrix.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'h': nn.ModuleList([OtterLMBlock(config) for _ in range(config.n_layer)]),
            'ln_f': RMSNorm(config.n_embd, eps=config.norm_eps),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.lm_head.weight = self.transformer.wte.weight  # Weight tying

        # Precompute RoPE tables once, with 2x block_size headroom, and store
        # them shaped (1, 1, max_len, head_dim // 2) so they broadcast over
        # (batch, heads) inside attention. Registered as (persistent) buffers
        # so they follow .to(device) with the model.
        dim = config.n_embd // config.n_head
        max_len = config.block_size * 2
        cos, sin = precompute_rope_freqs(dim, max_len, theta=config.rope_theta)
        self.register_buffer("cos", cos.unsqueeze(0).unsqueeze(0))
        self.register_buffer("sin", sin.unsqueeze(0).unsqueeze(0))

        # NOTE: runs AFTER weight tying, so the shared tensor is initialized
        # twice (once as lm_head, once as wte) — both paths use std=0.02,
        # since lm_head's out_features is vocab_size, not n_embd.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Depth-scaled init for output projections (stabilizes residual branches)."""
        std = 0.02
        if isinstance(module, nn.Linear):
            # Output projections (c_proj & down_proj) have out_features == n_embd.
            # Scale by 1/sqrt(2 * depth) per LLaMA/GPT-2 practice: each layer
            # contributes two residual branches (attention and MLP).
            if module.weight.size(0) == self.config.n_embd:  # out_features == n_embd
                std = 0.02 / math.sqrt(2 * self.config.n_layer)
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits (and optionally the training loss).

        Args:
            idx: (batch, seq) integer token ids; seq must not exceed block_size.
            targets: optional (batch, seq) integer labels. When provided, the
                mean cross-entropy over all positions is returned as the loss.

        Returns:
            (logits, loss): logits is (batch, seq, vocab_size); loss is None
            unless targets was given.
        """
        _, t = idx.size()
        assert t <= self.config.block_size, f"Sequence length {t} exceeds block_size {self.config.block_size}"
        
        tok_emb = self.transformer.wte(idx)
        # Slice the precomputed RoPE tables down to the current sequence length.
        cos = self.cos[:, :, :t, :]
        sin = self.sin[:, :, :t, :]
        
        x = tok_emb
        for block in self.transformer.h:
            x = block(x, cos, sin)
        
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

Inference results after training:

🔮 Generating with OtterLM-110M...


[1/4] Generating...
Prompt: The world is a
==================================================
 The world is a and a hapless entity whose goal is to be a branch of the United States . The planet is a giant planet , having a planet called the Milky Way that is located within the Milky Way at a time . There are about one million stars ( <unk> ) , making it the largest planet in the Milky Way 's system . These stars are found in the Milky Way . The Sun has an almost circular orbit around the Sun , making it one of the few stars with a central orbit . = = = <unk> = = = The Solar System orbits around Jupiter every 1 @,@ 000 to 1 @,@ 800 million years , the largest of which is thought to be the Kuiper belt . The Solar System orbits the planet 's orbits as a whole , but it is not entirely a terrestrial star . The Sun is only visible from a distance of 1 @,@ 000 – 2 @,@ 000 AU from Earth . In contrast , the Solar System orbits the Sun at roughly the same period as the Earth . Since the Sun orbits at its nearest point , it orbits it directly with its own large orbit . The orbits of the planets lie at a distance of roughly 0 @.@ 2 – 0 @.@ 2 K ( a distance of 2 – 1 @,@ 000 AU ) . The Sun '

--------------------------------------------------

[2/4] Generating...
Prompt: Science is the study of
==================================================
 Science is the study ofology at the University of Wisconsin , which is the study of the state of the country . The results of the study are based on a study taken by the National University of Wisconsin , which was published in 2000 . The study is based on a study of the <unk> 's study , which discussed the results of research , and the study found that the study had been more successful than the study , in comparison to the study 's report . The study concluded that the study " was the only study that was made into such a study " for almost 50 years . The study cited the study favorably , but the study concluded that it " may have been better in the more advanced study of the climate than the study of its findings " . In the context of the study , the study found that the study 's findings were " an evolutionary project to take on a new analysis " , though no definitive findings were known to exist for this particular study . = = Structure and analysis = = The study of Vistaraology is the work of the study of the environment in which the sociology of information is not widely distributed , and its implications are subject to various research . The study , which was the first to be performed in the study , provided much of its research on

--------------------------------------------------

[3/4] Generating...
Prompt: In the history of Europe,
==================================================
 In the history of Europe, .... The whole history of French empire in Europe is a part of the cultural history of France , and is to be a part of the historical history of Europe . In the 16th century , French writer and astronomer François @-@ François <unk> published the first book in the French language , in which the astronomer Jacques de la <unk> , who studied the geography of the region , wrote that " the French philosopher , the astronomer and a man in Germany could have used his name to refer to the French explorer , who was given a " master in the presence of a pirate " . This was part of the " <unk> theory " of the French astronomer Georges @-@ François <unk> @-@ <unk> , who proposed that the Dutch explorers , traders and traders had a more reliable basis for the French scientific establishment than was customary in Scandinavia . The French explorer Jean @-@ Baptiste <unk> , who would later become a writer for the French and German , was the first to find a name for the Dutch . The French astronomer Jean @-@ Baptiste <unk> and his assistants William de la <unk> created a series of observations which were published two years later . This was a detailed analysis of the Dutch theory of the sea in the 19th century that showed

--------------------------------------------------

[4/4] Generating...
Prompt: A computer is a machine that
==================================================
 A computer is a machine that had created at the time before or during his tenure , so he would be able to keep the machine from the hands of its owner after the death of his brother . Following the end of the war , Innis became the fourth @-@ largest member of the committee that was already working on the project . Innis 's success had earned him a reputation as a highly decorated artist , and his efforts to create his own work helped to create his own . In the end , Innis was given the task of designing a design on the project . He then spent the next fifty years studying mathematics and mathematics from the University of California at Berkeley , and then working as a technical director of the Physics Physics Laboratory at Los Alamos , Berkeley . His work led to his retirement in 1959 , and he became a director of the first computer physics laboratory . = = Early life = = Innis was born in Brooklyn , New York , the father of Samuel Innis and his wife , Maria , née <unk> . The family 's first name is " John " , from the Greek <unk> " G " , meaning " G " . He had two younger sisters , John and <unk> . His father , William , and a cousin , also named John , were born in Boston , Massachusetts

--------------------------------------------------

Description:

OtterLM-WKT103-110M
Small but scrappy language model trained from scratch on T4x2 GPU.
Overview
OtterLM is a 110M-parameter decoder-only transformer trained from scratch on WikiText-103. Built with modern architectural choices (RoPE, RMSNorm, SwiGLU) and a custom 32k BPE tokenizer — all trained on 2× NVIDIA T4 GPUs.
This language model is intended for education and research use only.
Downloads last month
32
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Dataset used to train RinKana/OtterLM-WKT103-110M