Custom Architecture:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import math
@dataclass
class Config:
    # Hyperparameters for OtterLM; defaults give the ~110M-parameter setup
    # described in the model card below.
    vocab_size: int = 32000      # custom 32k BPE tokenizer vocabulary
    block_size: int = 512        # maximum sequence length (context window)
    n_layer: int = 12            # number of transformer blocks
    n_head: int = 12             # attention heads per block
    n_embd: int = 768            # embedding / residual-stream width
    rope_theta: float = 10000.0  # RoPE base frequency
    norm_eps: float = 1e-6       # epsilon inside the RMSNorm sqrt
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescale by 1/RMS(x), no mean-centering, no bias."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps  # added inside the sqrt for numerical safety
        self.scale = nn.Parameter(torch.ones(dim))  # learned per-channel gain

    def forward(self, x):
        # Normalize over the last dimension, then apply the learned gain.
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.scale * x * inv_rms
def precompute_rope_freqs(dim, max_len, theta=10000.0):
    """Precompute RoPE rotation tables.

    Returns (cos, sin), each of shape (max_len, dim // 2), where entry
    [pos, i] is cos/sin(pos * theta**(-2i/dim)).
    """
    exponents = torch.arange(0, dim, 2)[: dim // 2].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(max_len, dtype=torch.float32)
    # Outer product via broadcasting: (max_len, 1) * (1, dim//2).
    angles = positions[:, None] * inv_freq[None, :]
    return angles.cos(), angles.sin()
def apply_rotary_emb(q, k, cos, sin):
    """Rotate query/key tensors with RoPE using the half-split layout.

    q, k: (B, n_head, T, head_dim); cos, sin: broadcastable (1, 1, >=T, head_dim//2).
    The first/second halves of the last dim act as the real/imaginary parts.
    """
    half = q.shape[-1] // 2
    seq_len = q.shape[2]
    # Trim the precomputed tables to the current sequence length.
    cos = cos[:, :, :seq_len, :]
    sin = sin[:, :, :seq_len, :]

    def rotate(t):
        # Complex multiply (a + ib) * (cos + i*sin), halves concatenated back.
        a, b = t[..., :half], t[..., half:]
        return torch.cat((a * cos - b * sin, a * sin + b * cos), dim=-1)

    return rotate(q), rotate(k)
class OtterLMBlock(nn.Module):
    """Pre-norm transformer block: RMSNorm -> causal self-attention, RMSNorm -> SwiGLU MLP,
    each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # SwiGLU hidden width: 2/3 of 4*n_embd, rounded up to a multiple of 256.
        ffn_dim = ((int(8 * config.n_embd / 3) + 255) // 256) * 256
        self.ln_1 = RMSNorm(config.n_embd, eps=config.norm_eps)
        self.ln_2 = RMSNorm(config.n_embd, eps=config.norm_eps)
        # Fused QKV projection; all linears are bias-free.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
        self.mlp = nn.ModuleDict({
            'gate_proj': nn.Linear(config.n_embd, ffn_dim, bias=False),
            'up_proj': nn.Linear(config.n_embd, ffn_dim, bias=False),
            'down_proj': nn.Linear(ffn_dim, config.n_embd, bias=False),
        })

    def forward(self, x, cos, sin):
        x = x + self._attn_block(self.ln_1(x), cos, sin)
        return x + self._mlp_block(self.ln_2(x))

    def _attn_block(self, x, cos, sin):
        # Causal multi-head self-attention with rotary position embeddings.
        batch, seq, dim = x.size()
        head_dim = dim // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q, k, v = (
            t.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
            for t in (q, k, v)
        )
        q, k = apply_rotary_emb(q, k, cos, sin)
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        out = out.transpose(1, 2).contiguous().view(batch, seq, dim)
        return self.c_proj(out)

    def _mlp_block(self, x):
        # SwiGLU: silu(gate(x)) elementwise-gates up(x), then project back down.
        gated = F.silu(self.mlp.gate_proj(x)) * self.mlp.up_proj(x)
        return self.mlp.down_proj(gated)
class OtterLM(nn.Module):
    """Decoder-only transformer LM: token embedding, n_layer pre-norm blocks,
    final RMSNorm, and a weight-tied linear head."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'h': nn.ModuleList([OtterLMBlock(config) for _ in range(config.n_layer)]),
            'ln_f': RMSNorm(config.n_embd, eps=config.norm_eps),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the output head shares the token-embedding matrix.
        self.lm_head.weight = self.transformer.wte.weight
        # RoPE tables cover twice the training context; leading singleton dims
        # broadcast over (batch, head).
        head_dim = config.n_embd // config.n_head
        cos, sin = precompute_rope_freqs(
            head_dim, config.block_size * 2, theta=config.rope_theta
        )
        self.register_buffer("cos", cos[None, None, :, :])
        self.register_buffer("sin", sin[None, None, :, :])
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Depth-scaled init for output projections (stabilizes residual branches)."""
        if isinstance(module, nn.Linear):
            # Linears that write into the residual stream (out_features == n_embd,
            # i.e. c_proj and down_proj) get std shrunk by sqrt(2 * n_layer).
            if module.weight.size(0) == self.config.n_embd:
                std = 0.02 / math.sqrt(2 * self.config.n_layer)
            else:
                std = 0.02
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token ids `idx` (B, T).

        Returns (logits, loss); loss is None unless `targets` (B, T) is given,
        in which case it is the flat cross-entropy over all positions.
        """
        _, t = idx.size()
        assert t <= self.config.block_size, f"Sequence length {t} exceeds block_size {self.config.block_size}"
        x = self.transformer.wte(idx)
        # Slice the precomputed RoPE tables to the current sequence length.
        cos = self.cos[:, :, :t, :]
        sin = self.sin[:, :, :t, :]
        for block in self.transformer.h:
            x = block(x, cos, sin)
        logits = self.lm_head(self.transformer.ln_f(x))
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
Results of inference after training:
🔮 Generating with OtterLM-110M...
[1/4] Generating...
Prompt: The world is a
==================================================
The world is a and a hapless entity whose goal is to be a branch of the United States . The planet is a giant planet , having a planet called the Milky Way that is located within the Milky Way at a time . There are about one million stars ( <unk> ) , making it the largest planet in the Milky Way 's system . These stars are found in the Milky Way . The Sun has an almost circular orbit around the Sun , making it one of the few stars with a central orbit . = = = <unk> = = = The Solar System orbits around Jupiter every 1 @,@ 000 to 1 @,@ 800 million years , the largest of which is thought to be the Kuiper belt . The Solar System orbits the planet 's orbits as a whole , but it is not entirely a terrestrial star . The Sun is only visible from a distance of 1 @,@ 000 – 2 @,@ 000 AU from Earth . In contrast , the Solar System orbits the Sun at roughly the same period as the Earth . Since the Sun orbits at its nearest point , it orbits it directly with its own large orbit . The orbits of the planets lie at a distance of roughly 0 @.@ 2 – 0 @.@ 2 K ( a distance of 2 – 1 @,@ 000 AU ) . The Sun '
--------------------------------------------------
[2/4] Generating...
Prompt: Science is the study of
==================================================
Science is the study ofology at the University of Wisconsin , which is the study of the state of the country . The results of the study are based on a study taken by the National University of Wisconsin , which was published in 2000 . The study is based on a study of the <unk> 's study , which discussed the results of research , and the study found that the study had been more successful than the study , in comparison to the study 's report . The study concluded that the study " was the only study that was made into such a study " for almost 50 years . The study cited the study favorably , but the study concluded that it " may have been better in the more advanced study of the climate than the study of its findings " . In the context of the study , the study found that the study 's findings were " an evolutionary project to take on a new analysis " , though no definitive findings were known to exist for this particular study . = = Structure and analysis = = The study of Vistaraology is the work of the study of the environment in which the sociology of information is not widely distributed , and its implications are subject to various research . The study , which was the first to be performed in the study , provided much of its research on
--------------------------------------------------
[3/4] Generating...
Prompt: In the history of Europe,
==================================================
In the history of Europe, .... The whole history of French empire in Europe is a part of the cultural history of France , and is to be a part of the historical history of Europe . In the 16th century , French writer and astronomer François @-@ François <unk> published the first book in the French language , in which the astronomer Jacques de la <unk> , who studied the geography of the region , wrote that " the French philosopher , the astronomer and a man in Germany could have used his name to refer to the French explorer , who was given a " master in the presence of a pirate " . This was part of the " <unk> theory " of the French astronomer Georges @-@ François <unk> @-@ <unk> , who proposed that the Dutch explorers , traders and traders had a more reliable basis for the French scientific establishment than was customary in Scandinavia . The French explorer Jean @-@ Baptiste <unk> , who would later become a writer for the French and German , was the first to find a name for the Dutch . The French astronomer Jean @-@ Baptiste <unk> and his assistants William de la <unk> created a series of observations which were published two years later . This was a detailed analysis of the Dutch theory of the sea in the 19th century that showed
--------------------------------------------------
[4/4] Generating...
Prompt: A computer is a machine that
==================================================
A computer is a machine that had created at the time before or during his tenure , so he would be able to keep the machine from the hands of its owner after the death of his brother . Following the end of the war , Innis became the fourth @-@ largest member of the committee that was already working on the project . Innis 's success had earned him a reputation as a highly decorated artist , and his efforts to create his own work helped to create his own . In the end , Innis was given the task of designing a design on the project . He then spent the next fifty years studying mathematics and mathematics from the University of California at Berkeley , and then working as a technical director of the Physics Physics Laboratory at Los Alamos , Berkeley . His work led to his retirement in 1959 , and he became a director of the first computer physics laboratory . = = Early life = = Innis was born in Brooklyn , New York , the father of Samuel Innis and his wife , Maria , née <unk> . The family 's first name is " John " , from the Greek <unk> " G " , meaning " G " . He had two younger sisters , John and <unk> . His father , William , and a cousin , also named John , were born in Boston , Massachusetts
--------------------------------------------------
Description:
OtterLM-WKT103-110M
Small but scrappy language model trained from scratch on T4x2 GPU.
Overview
OtterLM is a 110M-parameter decoder-only transformer trained from scratch on WikiText-103. Built with modern architectural choices (RoPE, RMSNorm, SwiGLU) and a custom 32k BPE tokenizer — all trained on 2× NVIDIA T4 GPUs.
This language model is intended for education and research purposes only.