Kiria-Nozan committed on
Commit
92779dc
·
verified ·
1 Parent(s): 2e96d8d

initial release

Browse files
Files changed (5) hide show
  1. models/__init__.py +4 -0
  2. models/autoregressive.py +358 -0
  3. models/dimamba.py +1136 -0
  4. models/dit.py +514 -0
  5. models/ema.py +97 -0
models/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from . import dit
2
+ from . import dimamba
3
+ from . import ema
4
+ from . import autoregressive
models/autoregressive.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import typing
3
+
4
+ import flash_attn
5
+ import flash_attn.layers.rotary
6
+ import huggingface_hub
7
+ import omegaconf
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import rearrange
12
+
13
+ # Flags required to enable jit fusion kernels
14
+ torch._C._jit_set_profiling_mode(False)
15
+ torch._C._jit_set_profiling_executor(False)
16
+ torch._C._jit_override_can_fuse_on_cpu(True)
17
+ torch._C._jit_override_can_fuse_on_gpu(True)
18
+
19
+
20
def bias_dropout_add_scale(
    x: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    scale: torch.Tensor,
    residual: typing.Optional[torch.Tensor],
    prob: float,
    training: bool,
) -> torch.Tensor:
    """Fused bias-add, dropout, scale, and residual-add.

    Computes ``residual + scale * dropout(x + bias)``; the bias and the
    residual terms are each skipped when ``None``. Written with explicit
    ``is None`` branches so TorchScript can refine the Optional types.
    """
    if bias is None:
        h = x
    else:
        h = x + bias
    out = scale * F.dropout(h, p=prob, training=training)
    if residual is not None:
        out = residual + out
    return out
38
+
39
+
40
def get_bias_dropout_add_scale(training):
    """Return a closure over ``training`` for :func:`bias_dropout_add_scale`.

    The returned callable has signature ``(x, bias, scale, residual, prob)``.
    """
    def _fused(x, bias, scale, residual, prob):
        return bias_dropout_add_scale(x, bias, scale, residual, prob, training)

    return _fused
47
+
48
+
49
@torch.jit.script
def bias_dropout_add_scale_fused_train(
    x: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    scale: torch.Tensor,
    residual: typing.Optional[torch.Tensor],
    prob: float,
) -> torch.Tensor:
    """TorchScript-fused variant with dropout active (training=True)."""
    return bias_dropout_add_scale(x, bias, scale, residual, prob, True)
60
+
61
+
62
@torch.jit.script
def bias_dropout_add_scale_fused_inference(
    x: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    scale: torch.Tensor,
    residual: typing.Optional[torch.Tensor],
    prob: float,
) -> torch.Tensor:
    """TorchScript-fused variant with dropout disabled (training=False)."""
    return bias_dropout_add_scale(x, bias, scale, residual, prob, False)
73
+
74
+
75
class Rotary(torch.nn.Module):
    """Rotary positional embedding (RoPE) table cache.

    Produces cos/sin tables shaped ``(1, seq_len, 3, 1, dim)`` so they can be
    broadcast against a packed qkv tensor. The tables for the v slot
    (index 2 of the qkv axis) are overwritten with cos=1 / sin=0 so the
    rotation leaves v unchanged.
    """

    def __init__(self, dim, base=10_000):
        super().__init__()
        inv_freq = 1.0 / (
            base ** (torch.arange(0, dim, 2).float() / dim)
        )
        self.register_buffer('inv_freq', inv_freq)
        # Cache is keyed only on sequence length.
        # NOTE(review): the cache is not refreshed on device/dtype changes;
        # presumably inputs always live on one device after the first call —
        # confirm against callers.
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x, seq_dim=1):
        # x is only used for its length along seq_dim and its device.
        seq_len = x.shape[seq_dim]
        if seq_len != self.seq_len_cached:
            self.seq_len_cached = seq_len
            t = torch.arange(
                x.shape[seq_dim], device=x.device
            ).type_as(self.inv_freq)
            freqs = torch.einsum(
                'i,j->ij', t, self.inv_freq.clone()
            )
            # Duplicate the frequency table so the last dim matches the
            # full head dim (rotate_half splits it back in two).
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            # dims are: batch, seq_len, qkv, head, dim
            self.cos_cached = emb.cos()[
                None, :, None, None, :
            ].repeat(1, 1, 3, 1, 1)
            self.sin_cached = emb.sin()[
                None, :, None, None, :
            ].repeat(1, 1, 3, 1, 1)
            # This makes the transformation on v an identity.
            self.cos_cached[:, :, 2, :, :].fill_(1.0)
            self.sin_cached[:, :, 2, :, :].fill_(0.0)

        return self.cos_cached, self.sin_cached
109
+
110
+
111
def rotate_half(x):
    """Rotate the two halves of the last dimension: (a, b) -> (-b, a)."""
    half = x.shape[-1] // 2
    a, b = x[..., :half], x[..., half:]
    return torch.cat((-b, a), dim=-1)
117
+
118
+
119
def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary embeddings to a packed ``(b, s, 3, h, d)`` qkv tensor.

    The cached tables from `Rotary` are ``(1, s, 3, 1, d)``; the flash-attn
    fused kernel expects per-position tables of shape ``(s, d/2)``, so we
    take the first half of the last dim (the second half is the duplicated
    copy built in ``Rotary.forward``). ``apply_rotary_emb_qkv_`` mutates
    qkv in place and leaves v untouched.
    """
    cos = cos[0, :, 0, 0, : cos.shape[-1] // 2]
    sin = sin[0, :, 0, 0, : sin.shape[-1] // 2]
    return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
        qkv, cos, sin
    )
125
+
126
+
127
+ #################################################################################
128
+ # Layers #
129
+ #################################################################################
130
class LayerNorm(nn.Module):
    """Bias-free LayerNorm whose statistics are always computed in fp32."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.weight = nn.Parameter(torch.ones([dim]))

    def forward(self, x):
        # Disable autocast so normalization runs in full precision.
        with torch.cuda.amp.autocast(enabled=False):
            normed = F.layer_norm(x.float(), [self.dim])
        return normed * self.weight[None, None, :]
140
+
141
+
142
def residual_linear(x, W, x_skip, residual_scale):
    """Return ``x_skip + residual_scale * (x @ W.T)`` via a single addmm."""
    dim_out, dim_in = W.shape
    fused = torch.addmm(
        x_skip.view(-1, dim_out),
        x.view(-1, dim_in),
        W.T,
        alpha=residual_scale,
    )
    return fused.view(*x.shape[:-1], dim_out)
151
+
152
+
153
+ #################################################################################
154
+ # Core Model #
155
+ #################################################################################
156
+
157
+
158
class DDiTBlock(nn.Module):
    """Transformer block: rotary flash-attention + GELU MLP, prenorm style.

    Residual adds go through the fused bias/dropout/scale helpers.
    ``cond_dim`` and the ``c`` forward argument are accepted for interface
    parity with conditioned DiT blocks but no conditioning is applied here.
    """

    def __init__(
        self,
        dim,
        n_heads,
        cond_dim,
        mlp_ratio=4,
        dropout=0.1,
        causal=False,
    ):
        super().__init__()
        self.n_heads = n_heads
        self.causal = causal

        self.norm1 = LayerNorm(dim)
        self.attn_qkv = nn.Linear(dim, 3 * dim, bias=False)
        self.attn_out = nn.Linear(dim, dim, bias=False)
        self.dropout1 = nn.Dropout(dropout)

        self.norm2 = LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_ratio * dim, bias=True),
            nn.GELU(approximate='tanh'),
            nn.Linear(mlp_ratio * dim, dim, bias=True),
        )
        self.dropout2 = nn.Dropout(dropout)
        self.dropout = dropout

    def _get_bias_dropout_scale(self):
        # Pick the TorchScript-fused helper matching the current mode.
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference

    def forward(self, x, rotary_cos_sin, c, seqlens=None):
        """Run attention + MLP on x (b, s, dim); `c` is unused.

        seqlens, when given, is interpreted as per-sequence lengths for
        flash-attn's varlen kernel.
        """
        batch_size, seq_len = x.shape[0], x.shape[1]

        bias_dropout_scale_fn = self._get_bias_dropout_scale()

        # attention operation
        x_skip = x
        x = self.norm1(x)

        qkv = self.attn_qkv(x)
        qkv = rearrange(
            qkv,
            'b s (three h d) -> b s three h d',
            three=3,
            h=self.n_heads,
        )
        # Rotary application is done outside autocast in full precision.
        with torch.cuda.amp.autocast(enabled=False):
            cos, sin = rotary_cos_sin
            qkv = apply_rotary_pos_emb(
                qkv, cos.to(qkv.dtype), sin.to(qkv.dtype)
            )
        # Flatten batch and sequence for the varlen attention kernel.
        qkv = rearrange(qkv, 'b s ... -> (b s) ...')
        if seqlens is None:
            # Uniform lengths: cu_seqlens = [0, s, 2s, ..., b*s], int32.
            cu_seqlens = torch.arange(
                0,
                (batch_size + 1) * seq_len,
                step=seq_len,
                dtype=torch.int32,
                device=qkv.device,
            )
        else:
            # NOTE(review): flash-attn varlen expects int32 cu_seqlens of
            # length batch+1 starting at 0; a bare cumsum has no leading 0
            # and stays int64 — confirm callers pre-shape `seqlens`.
            cu_seqlens = seqlens.cumsum(-1)
        x = flash_attn.flash_attn_interface.flash_attn_varlen_qkvpacked_func(
            qkv, cu_seqlens, seq_len, 0.0, causal=self.causal
        )

        x = rearrange(x, '(b s) h d -> b s (h d)', b=batch_size)

        # Unit scale: the fused helper only does dropout + residual here.
        scale = torch.ones(1, device=x.device, dtype=x.dtype)
        x = bias_dropout_scale_fn(
            self.attn_out(x), None, scale, x_skip, self.dropout
        )

        # mlp operation
        x = bias_dropout_scale_fn(
            self.mlp(self.norm2(x)), None, scale, x, self.dropout
        )
        return x
240
+
241
+
242
class EmbeddingLayer(nn.Module):
    """Token-embedding lookup table with Kaiming-uniform initialization."""

    def __init__(self, dim, vocab_dim):
        super().__init__()
        self.embedding = nn.Parameter(
            torch.empty((vocab_dim, dim))
        )
        torch.nn.init.kaiming_uniform_(
            self.embedding, a=math.sqrt(5)
        )

    def forward(self, x):
        # Advanced indexing: (..., ) int ids -> (..., dim) embeddings.
        return self.embedding[x]
254
+
255
+
256
class DDitFinalLayer(nn.Module):
    """Final projection: LayerNorm followed by a zero-initialized Linear.

    ``cond_dim`` and the ``c`` forward argument are accepted for interface
    parity with conditioned variants but are unused here.
    """

    def __init__(
        self, hidden_size, out_channels, cond_dim, causal=False
    ):
        super().__init__()
        self.causal = causal
        # This module is only used by the autoregressive model.
        assert causal, 'DDitFinalLayer only supports causal=True'

        self.norm_final = LayerNorm(hidden_size)
        self.linear = nn.Linear(hidden_size, out_channels)
        # Zero init so the layer initially outputs zeros (standard DiT trick).
        nn.init.zeros_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x, c):
        # `c` (conditioning) is intentionally ignored.
        return self.linear(self.norm_final(x))
271
+
272
+
273
class DDIT(nn.Module, huggingface_hub.PyTorchModelHubMixin):
    """Causal DiT-style transformer backbone (vocab embed -> blocks -> head).

    Args:
        config: omegaconf config (or plain dict, converted on entry) with a
            ``model`` section: hidden_size, n_heads, cond_dim, dropout,
            n_blocks, scale_by_sigma, causal.
        vocab_size: size of the token vocabulary.
    """

    def __init__(self, config, vocab_size: int):
        super().__init__()
        if isinstance(config, dict):
            config = omegaconf.OmegaConf.create(config)

        self.config = config
        self.vocab_size = vocab_size
        # Only the causal (autoregressive) configuration is supported here.
        self.causal = getattr(config.model, 'causal', False)
        assert self.causal, 'DDIT requires config.model.causal=True'

        self.vocab_embed = EmbeddingLayer(
            config.model.hidden_size, vocab_size
        )
        self.rotary_emb = Rotary(
            config.model.hidden_size // config.model.n_heads
        )

        self.blocks = nn.ModuleList([
            DDiTBlock(
                config.model.hidden_size,
                config.model.n_heads,
                config.model.cond_dim,
                dropout=config.model.dropout,
                causal=self.causal,
            )
            for _ in range(config.model.n_blocks)
        ])

        self.output_layer = DDitFinalLayer(
            config.model.hidden_size,
            vocab_size,
            config.model.cond_dim,
            causal=self.causal,
        )
        self.scale_by_sigma = config.model.scale_by_sigma

    def _get_bias_dropout_scale(self):
        # Pick the TorchScript-fused helper matching the current mode.
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference
320
+
321
+
322
class AR(DDIT):
    """Autoregressive language model head on top of the DDIT backbone."""

    def __init__(self, config, vocab_size, mask_index):
        super().__init__(config, vocab_size)
        self.mask_index = mask_index
        # Large negative logit used to zero out the mask token's probability.
        self.neg_infinity = -1000.0

    def forward(self, xt, sigma):
        """Forward pass of the denoising model.

        Args:
          xt: int torch.Tensor with shape
            (batch_size, diffusion_model_input_length), token ids.
          sigma: float torch.Tensor with shape
            (batch_size). Unused by the AR model; kept for interface
            parity with the diffusion models.

        Returns:
          log probability with shape
          (batch_size, diffusion_model_input_length, vocab_size)
        """
        x = self.vocab_embed(xt)

        rotary_cos_sin = self.rotary_emb(x)

        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            for block in self.blocks:
                x = block(x, rotary_cos_sin, None, seqlens=None)
            output = self.output_layer(x, None)

        # log prob at the mask index = - infinity
        output[:, :, self.mask_index] = self.neg_infinity

        # Normalize the logits such that output.exp() is
        # a probability distribution over vocab_size.
        return output.log_softmax(-1)
models/dimamba.py ADDED
@@ -0,0 +1,1136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from functools import partial
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import huggingface_hub
6
+ import numpy as np
7
+ import omegaconf
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from causal_conv1d import (
12
+ causal_conv1d_fn,
13
+ causal_conv1d_update,
14
+ )
15
+ from einops import rearrange, repeat
16
+ from mamba_ssm.ops.selective_scan_interface import (
17
+ mamba_inner_fn,
18
+ selective_scan_fn,
19
+ )
20
+ from torch import Tensor
21
+ from transformers import PretrainedConfig, PreTrainedModel
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithNoAttention,
24
+ MaskedLMOutput,
25
+ )
26
+
27
+ try:
28
+ from mamba_ssm.ops.triton.layernorm import (
29
+ RMSNorm,
30
+ layer_norm_fn,
31
+ rms_norm_fn,
32
+ )
33
+ except ImportError:
34
+ RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
35
+ from mamba_ssm.ops.triton.selective_state_update import (
36
+ selective_state_update,
37
+ )
38
+
39
+ from models.dit import (
40
+ TimestepEmbedder,
41
+ bias_dropout_add_scale_fused_inference,
42
+ bias_dropout_add_scale_fused_train,
43
+ modulate_fused,
44
+ )
45
+
46
+ # sys.path.append('mamba_wrappers/mamba2')
47
+ # from .mamba2.src.modules.ssd import SSD as Mamba
48
+
49
+
50
class Mamba(nn.Module):
    """Selective state-space (Mamba / S6) mixer layer.

    Adapted from the mamba_ssm reference implementation: input projection,
    depthwise causal conv, selective scan, output projection. Supports a
    fused fast path (``mamba_inner_fn``), a reference path, and single-token
    ``step`` decoding against cached conv/ssm states.
    """

    def __init__(
        self,
        d_model,
        d_state=16,
        d_conv=4,
        expand=2,
        dt_rank='auto',
        dt_min=0.001,
        dt_max=0.1,
        dt_init='random',
        dt_scale=1.0,
        dt_init_floor=1e-4,
        conv_bias=True,
        bias=False,
        use_fast_path=True,  # Fused kernel options
        layer_idx=None,  # required for inference-state caching
        device=None,
        dtype=None,
    ):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.d_model = d_model
        self.d_state = d_state
        self.d_conv = d_conv
        self.expand = expand
        self.d_inner = int(self.expand * self.d_model)
        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == 'auto' else dt_rank
        self.use_fast_path = use_fast_path
        self.layer_idx = layer_idx

        # Projects d_model -> 2*d_inner (x and z gate halves).
        self.in_proj = nn.Linear(
            self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs
        )

        # Depthwise causal convolution over the sequence dimension.
        self.conv1d = nn.Conv1d(
            in_channels=self.d_inner,
            out_channels=self.d_inner,
            bias=conv_bias,
            kernel_size=d_conv,
            groups=self.d_inner,
            padding=d_conv - 1,
            **factory_kwargs,
        )

        self.activation = 'silu'
        self.act = nn.SiLU()

        # Produces (dt, B, C) from x.
        self.x_proj = nn.Linear(
            self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
        )
        self.dt_proj = nn.Linear(
            self.dt_rank, self.d_inner, bias=True, **factory_kwargs
        )

        # Initialize special dt projection to preserve variance at initialization
        dt_init_std = self.dt_rank**-0.5 * dt_scale
        if dt_init == 'constant':
            nn.init.constant_(self.dt_proj.weight, dt_init_std)
        elif dt_init == 'random':
            nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
        else:
            raise NotImplementedError

        # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
        dt = torch.exp(
            torch.rand(self.d_inner, **factory_kwargs)
            * (math.log(dt_max) - math.log(dt_min))
            + math.log(dt_min)
        ).clamp(min=dt_init_floor)
        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
        inv_dt = dt + torch.log(-torch.expm1(-dt))
        with torch.no_grad():
            self.dt_proj.bias.copy_(inv_dt)
        # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
        self.dt_proj.bias._no_reinit = True

        # S4D real initialization
        A = repeat(
            torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
            'n -> d n',
            d=self.d_inner,
        ).contiguous()
        A_log = torch.log(A)  # Keep A_log in fp32
        self.A_log = nn.Parameter(A_log)
        self.A_log._no_weight_decay = True

        # D 'skip' parameter
        self.D = nn.Parameter(torch.ones(self.d_inner, device=device))  # Keep in fp32
        self.D._no_weight_decay = True

        self.out_proj = nn.Linear(
            self.d_inner, self.d_model, bias=bias, **factory_kwargs
        )

    def forward(self, hidden_states, inference_params=None):
        """
        hidden_states: (B, L, D)
        Returns: same shape as hidden_states
        """
        batch, seqlen, dim = hidden_states.shape

        conv_state, ssm_state = None, None
        if inference_params is not None:
            conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
            if inference_params.seqlen_offset > 0:
                # Mid-generation: decode one token against cached states.
                # The states are updated inplace
                out, _, _ = self.step(hidden_states, conv_state, ssm_state)
                return out

        # We do matmul and transpose BLH -> HBL at the same time
        xz = rearrange(
            self.in_proj.weight @ rearrange(hidden_states, 'b l d -> d (b l)'),
            'd (b l) -> b d l',
            l=seqlen,
        )
        if self.in_proj.bias is not None:
            xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), 'd -> d 1')

        A = -torch.exp(self.A_log.float())  # (d_inner, d_state)
        # In the backward pass we write dx and dz next to each other to avoid torch.cat

        if (
            self.use_fast_path
            and causal_conv1d_fn is not None
            and inference_params is None
        ):  # Doesn't support outputting the states
            # Fully fused conv + scan + out-projection kernel.
            out = mamba_inner_fn(
                xz,
                self.conv1d.weight,
                self.conv1d.bias,
                self.x_proj.weight,
                self.dt_proj.weight,
                self.out_proj.weight,
                self.out_proj.bias,
                A,
                None,  # input-dependent B
                None,  # input-dependent C
                self.D.float(),
                delta_bias=self.dt_proj.bias.float(),
                delta_softplus=True,
            )

        else:
            # Reference path: explicit conv, projections, then selective scan.
            x, z = xz.chunk(2, dim=1)
            # Compute short convolution
            if conv_state is not None:
                # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
                # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
                conv_state.copy_(
                    F.pad(x, (self.d_conv - x.shape[-1], 0))
                )  # Update state (B D W)
            if causal_conv1d_fn is None:
                x = self.act(self.conv1d(x)[..., :seqlen])
            else:
                assert self.activation in ['silu', 'swish']
                x = causal_conv1d_fn(
                    x=x,
                    weight=rearrange(self.conv1d.weight, 'd 1 w -> d w'),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                )

            # We're careful here about the layout, to avoid extra transposes.
            # We want dt to have d as the slowest moving dimension
            # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
            x_dbl = self.x_proj(rearrange(x, 'b d l -> (b l) d'))  # (bl d)
            dt, B, C = torch.split(
                x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1
            )
            dt = self.dt_proj.weight @ dt.t()
            dt = rearrange(dt, 'd (b l) -> b d l', l=seqlen)
            B = rearrange(B, '(b l) dstate -> b dstate l', l=seqlen).contiguous()
            C = rearrange(C, '(b l) dstate -> b dstate l', l=seqlen).contiguous()

            assert self.activation in ['silu', 'swish']

            y = selective_scan_fn(
                x,
                dt,
                A,
                B,
                C,
                self.D.float(),
                z=z,
                delta_bias=self.dt_proj.bias.float(),
                delta_softplus=True,
                return_last_state=ssm_state is not None,
            )

            if ssm_state is not None:
                y, last_state = y
                ssm_state.copy_(last_state)
            y = rearrange(y, 'b d l -> b l d')

            out = self.out_proj(y)

        return out

    def step(self, hidden_states, conv_state, ssm_state):
        """Decode a single token, updating conv_state/ssm_state in place.

        hidden_states: (B, 1, D). Returns (out (B, 1, D), conv_state, ssm_state).
        """
        dtype = hidden_states.dtype
        assert (
            hidden_states.shape[1] == 1
        ), 'Only support decoding with 1 token at a time for now'
        xz = self.in_proj(hidden_states.squeeze(1))  # (B 2D)
        x, z = xz.chunk(2, dim=-1)  # (B D)

        # Conv step
        if causal_conv1d_update is None:
            # Reference conv step: shift the window and dot with the kernel.
            conv_state.copy_(
                torch.roll(conv_state, shifts=-1, dims=-1)
            )  # Update state (B D W)
            conv_state[:, :, -1] = x
            x = torch.sum(
                conv_state * rearrange(self.conv1d.weight, 'd 1 w -> d w'), dim=-1
            )  # (B D)
            if self.conv1d.bias is not None:
                x = x + self.conv1d.bias
            x = self.act(x).to(dtype=dtype)
        else:
            x = causal_conv1d_update(
                x,
                conv_state,
                rearrange(self.conv1d.weight, 'd 1 w -> d w'),
                self.conv1d.bias,
                self.activation,
            )

        x_db = self.x_proj(x)  # (B dt_rank+2*d_state)
        dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
        # Don't add dt_bias here
        dt = F.linear(dt, self.dt_proj.weight)  # (B d_inner)
        A = -torch.exp(self.A_log.float())  # (d_inner, d_state)

        # SSM step
        if selective_state_update is None:
            # Discretize A and B
            dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
            dA = torch.exp(torch.einsum('bd,dn->bdn', dt, A))
            dB = torch.einsum('bd,bn->bdn', dt, B)
            ssm_state.copy_(ssm_state * dA + rearrange(x, 'b d -> b d 1') * dB)
            y = torch.einsum('bdn,bn->bd', ssm_state.to(dtype), C)
            y = y + self.D.to(dtype) * x
            y = y * self.act(z)  # (B D)
        else:
            y = selective_state_update(
                ssm_state,
                x,
                dt,
                A,
                B,
                C,
                self.D,
                z=z,
                dt_bias=self.dt_proj.bias,
                dt_softplus=True,
            )

        out = self.out_proj(y)
        return out.unsqueeze(1), conv_state, ssm_state

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Allocate fresh zeroed conv/ssm states for incremental decoding."""
        device = self.out_proj.weight.device
        conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
        conv_state = torch.zeros(
            batch_size,
            self.d_model * self.expand,
            self.d_conv,
            device=device,
            dtype=conv_dtype,
        )
        ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
        # ssm_dtype = torch.float32
        ssm_state = torch.zeros(
            batch_size,
            self.d_model * self.expand,
            self.d_state,
            device=device,
            dtype=ssm_dtype,
        )
        return conv_state, ssm_state

    def _get_states_from_cache(
        self, inference_params, batch_size, initialize_states=False
    ):
        """Fetch (or lazily create) this layer's cached states, keyed by layer_idx."""
        assert self.layer_idx is not None
        if self.layer_idx not in inference_params.key_value_memory_dict:
            batch_shape = (batch_size,)
            conv_state = torch.zeros(
                batch_size,
                self.d_model * self.expand,
                self.d_conv,
                device=self.conv1d.weight.device,
                dtype=self.conv1d.weight.dtype,
            )
            ssm_state = torch.zeros(
                batch_size,
                self.d_model * self.expand,
                self.d_state,
                device=self.dt_proj.weight.device,
                dtype=self.dt_proj.weight.dtype,
                # dtype=torch.float32,
            )
            inference_params.key_value_memory_dict[self.layer_idx] = (
                conv_state,
                ssm_state,
            )
        else:
            conv_state, ssm_state = inference_params.key_value_memory_dict[
                self.layer_idx
            ]
            # TODO: What if batch size changes between generation, and we reuse the same states?
            if initialize_states:
                conv_state.zero_()
                ssm_state.zero_()
        return conv_state, ssm_state
366
+
367
+
368
class Block(nn.Module):
    def __init__(
        self,
        dim,
        mixer_cls,
        norm_cls=nn.LayerNorm,
        fused_add_norm=False,
        residual_in_fp32=False,
        modulate=False,
        t_dim=0,
    ):
        """
        Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection.

        This Block has a slightly different structure compared to a regular
        prenorm Transformer block.
        The standard block is: LN -> MHA/MLP -> Add.
        [Ref: https://arxiv.org/abs/2002.04745]
        Here we have: Add -> LN -> Mixer, returning both
        the hidden_states (output of the mixer) and the residual.
        This is purely for performance reasons, as we can fuse add and LayerNorm.
        The residual needs to be provided (except for the very first block).

        When ``modulate`` is True, an adaLN modulation (shift/scale/gate from
        ``time_embeds`` of width ``t_dim``) is applied around the mixer.
        """
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.mixer = mixer_cls(dim)
        self.norm = norm_cls(dim)

        if self.fused_add_norm:
            assert RMSNorm is not None, 'RMSNorm import fails'
            assert isinstance(
                self.norm, (nn.LayerNorm, RMSNorm)
            ), 'Only LayerNorm and RMSNorm are supported for fused_add_norm'

        # Dropout prob used by the gated residual add in the modulate path.
        # NOTE(review): hard-coded 0.1 rather than configurable — confirm intended.
        self.dropout = 0.1

        self.modulate = modulate
        self.t_dim = t_dim
        if modulate:
            # Zero-init so modulation starts as an identity (DiT-style).
            self.adaLN_modulation = nn.Linear(t_dim,
                                              3 * dim,
                                              bias=True)
            self.adaLN_modulation.weight.data.zero_()
            self.adaLN_modulation.bias.data.zero_()

    def _get_bias_dropout_scale(self):
        # Pick the TorchScript-fused helper matching the current mode.
        return (
            bias_dropout_add_scale_fused_train
            if self.training
            else bias_dropout_add_scale_fused_inference
        )

    def forward(
        self,
        hidden_states: Tensor,
        residual: Optional[Tensor] = None,
        inference_params=None,
        time_embeds=None,
    ):
        r"""Pass the input through the encoder layer.

        Args:
            hidden_states: the sequence to the encoder layer (required).
            residual: hidden_states = Mixer(LN(residual))
        """
        if not self.fused_add_norm:
            residual = (
                (hidden_states + residual)
                if residual is not None
                else hidden_states
            )

            hidden_states = self.norm(
                residual.to(dtype=self.norm.weight.dtype))
            if self.residual_in_fp32:
                residual = residual.to(torch.float32)
        else:
            # Fused add + norm kernel returns both normed states and the
            # updated residual stream.
            fused_add_norm_fn = (
                rms_norm_fn
                if isinstance(self.norm, RMSNorm)
                else layer_norm_fn
            )

            hidden_states, residual = fused_add_norm_fn(
                hidden_states,
                self.norm.weight,
                self.norm.bias,
                residual=residual,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps)

        if self.modulate and time_embeds is not None:
            # (B, 3*dim) -> broadcast over sequence via [:, None].
            (shift_msa,
             scale_msa,
             gate_msa) = self.adaLN_modulation(
                time_embeds)[:, None].chunk(3, dim=-1)
            hidden_states = modulate_fused(hidden_states,
                                           shift_msa,
                                           scale_msa)

        mixer_out = self.mixer(hidden_states, inference_params=inference_params)

        hidden_states = mixer_out
        if self.modulate and time_embeds is not None:
            # Gated residual add with dropout; only happens on the modulate
            # path — otherwise the caller combines hidden_states + residual.
            bias_dropout_scale_fn = self._get_bias_dropout_scale()
            hidden_states = bias_dropout_scale_fn(
                hidden_states,
                None,
                gate_msa,
                residual,
                self.dropout)

        return hidden_states, residual

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        # Delegate state allocation to the wrapped mixer.
        return self.mixer.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs)
487
+
488
class BiMambaConfig(PretrainedConfig):
    """Config that extends the original MambaConfig with params relevant to bi-directionality."""

    model_type = 'bimamba'

    def __init__(
        self,
        # From original MambaConfig
        d_model: int = 2560,
        n_layer: int = 64,
        vocab_size: int = 50277,
        ssm_cfg: Optional[dict] = None,
        rms_norm: bool = True,
        residual_in_fp32: bool = True,
        fused_add_norm: bool = True,
        pad_vocab_size_multiple: int = 8,
        tie_word_embeddings: bool = True,
        # Not in original MambaConfig, but default arg in create_block in mamba_ssm repo; used in layer norm
        norm_epsilon: float = 1e-5,
        # Used in init_weights
        initializer_cfg: Optional[dict] = None,
        # Caduceus-specific params
        bidirectional: bool = True,
        bidirectional_strategy: Union[str, None] = 'add',
        bidirectional_weight_tie: bool = True,
        # Time-embedding conditioning: strategy name and embedding width.
        temb_strategy: Union[str, None] = None,
        d_temb: int = 0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.n_layer = n_layer
        self.vocab_size = vocab_size
        self.ssm_cfg = ssm_cfg
        self.rms_norm = rms_norm
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.tie_word_embeddings = tie_word_embeddings
        self.norm_epsilon = norm_epsilon
        self.initializer_cfg = initializer_cfg
        self.bidirectional = bidirectional
        self.bidirectional_strategy = bidirectional_strategy
        self.bidirectional_weight_tie = bidirectional_weight_tie

        self.temb_strategy = temb_strategy
        self.d_temb = d_temb
535
+
536
+
537
def create_block(
    d_model,
    ssm_cfg=None,
    norm_epsilon=1e-5,
    rms_norm=False,
    residual_in_fp32=False,
    fused_add_norm=False,
    layer_idx=None,
    bidirectional=True,
    bidirectional_strategy='add',
    bidirectional_weight_tie=True,
    device=None,
    dtype=None,
    modulate=False,
    d_temb=0,
):
    """Assemble one BiMamba ``Block`` (mixer class + norm class).

    Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
    """
    cfg = {} if ssm_cfg is None else ssm_cfg
    # Pre-bind everything the Block constructor won't pass itself.
    mixer_cls = partial(
        BiMambaWrapper,
        layer_idx=layer_idx,
        bidirectional=bidirectional,
        bidirectional_strategy=bidirectional_strategy,
        bidirectional_weight_tie=bidirectional_weight_tie,
        device=device,
        dtype=dtype,
        **cfg,
    )
    norm_cls = partial(
        RMSNorm if rms_norm else nn.LayerNorm,
        eps=norm_epsilon,
        device=device,
        dtype=dtype,
    )
    block = Block(
        d_model,
        mixer_cls,
        norm_cls=norm_cls,
        fused_add_norm=fused_add_norm,
        residual_in_fp32=residual_in_fp32,
        t_dim=d_temb,
        modulate=modulate,
    )
    block.layer_idx = layer_idx
    return block
588
+
589
+
590
class BiMambaWrapper(nn.Module):
    """Thin wrapper around Mamba to support bi-directionality."""

    def __init__(
        self,
        d_model: int,
        bidirectional: bool = True,
        bidirectional_strategy: Optional[str] = 'add',
        bidirectional_weight_tie: bool = True,
        **mamba_kwargs,
    ):
        super().__init__()
        if bidirectional and bidirectional_strategy is None:
            bidirectional_strategy = 'add'  # Default strategy: `add`
        if bidirectional and bidirectional_strategy not in ['add', 'ew_multiply']:
            raise NotImplementedError(
                f'`{bidirectional_strategy}` strategy for bi-directionality is not implemented!'
            )
        self.bidirectional = bidirectional
        self.bidirectional_strategy = bidirectional_strategy

        self.mamba_fwd = Mamba(d_model=d_model, **mamba_kwargs)
        self.mamba_rev = None
        if bidirectional:
            self.mamba_rev = Mamba(d_model=d_model, **mamba_kwargs)
            if bidirectional_weight_tie:
                # Tie in and out projections (where most of param count lies).
                self.mamba_rev.in_proj.weight = self.mamba_fwd.in_proj.weight
                self.mamba_rev.in_proj.bias = self.mamba_fwd.in_proj.bias
                self.mamba_rev.out_proj.weight = self.mamba_fwd.out_proj.weight
                self.mamba_rev.out_proj.bias = self.mamba_fwd.out_proj.bias

    def forward(self, hidden_states, inference_params=None):
        """Bidirectional-enabled forward pass.

        hidden_states: (B, L, D)
        Returns: same shape as hidden_states
        """
        fwd_out = self.mamba_fwd(hidden_states, inference_params=inference_params)
        if not self.bidirectional:
            return fwd_out

        # Run the reverse-direction Mamba on the time-flipped sequence and
        # flip its output back before combining with the forward pass.
        rev_in = torch.flip(hidden_states, dims=(1,))
        rev_out = self.mamba_rev(rev_in, inference_params=inference_params)
        rev_out = torch.flip(rev_out, dims=(1,))

        if self.bidirectional_strategy == 'add':
            return fwd_out + rev_out
        if self.bidirectional_strategy == 'ew_multiply':
            return fwd_out * rev_out
        raise NotImplementedError(
            f'`{self.bidirectional_strategy}` for bi-directionality not implemented!'
        )
660
+
661
+
662
class BiMambaEmbeddings(nn.Module):
    """Token-embedding lookup for the BiMamba backbone."""

    def __init__(
        self,
        config: BiMambaConfig,
        input_dim=None,
        device=None,
        dtype=None,
    ):
        super().__init__()
        # Default the embedding input size to the configured vocabulary size;
        # callers may widen it (e.g. for concatenated timestep features).
        vocab = config.vocab_size if input_dim is None else input_dim
        self.word_embeddings = nn.Embedding(
            vocab, config.d_model, device=device, dtype=dtype)

    def forward(self, input_ids):
        """input_ids: (batch, seqlen) -> embeddings (batch, seqlen, d_model)."""
        return self.word_embeddings(input_ids)
683
+
684
+
685
class BiMambaMixerModel(nn.Module):
    """Stack of bidirectional Mamba blocks with optional timestep conditioning.

    Timestep embeddings are injected according to ``config.temb_strategy``:
    'concat' (prepend along the feature dim, widening the model by d_temb),
    'add' (sum into token embeddings), or an 'adaln'-style strategy (per-block
    modulation; final-norm modulation handled here, fused path only).
    """

    def __init__(
        self,
        config: BiMambaConfig,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        factory_kwargs = {'device': device, 'dtype': dtype}
        self.temb_strategy = config.temb_strategy
        self.config = config
        input_dim = config.vocab_size
        d_model = config.d_model
        # 'concat' widens both the embedding input and the working width by d_temb.
        if self.temb_strategy and self.temb_strategy == 'concat':
            input_dim += config.d_temb
            d_model += config.d_temb
        # No conditioning: zero out d_temb so downstream sizes stay consistent.
        if self.temb_strategy is None:
            config.d_temb = 0

        self.fused_add_norm = config.fused_add_norm
        self.residual_in_fp32 = config.residual_in_fp32

        self.embeddings = BiMambaEmbeddings(
            config, input_dim=input_dim, **factory_kwargs)

        # Mamba changes the order of residual and layer norm:
        # Instead of LN -> Attn / MLP -> Add, we do:
        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
        # the main branch (output of MLP / Mixer). The model definition is unchanged.
        # This is for performance reason: we can fuse add + layer_norm.
        if config.fused_add_norm:
            if layer_norm_fn is None or rms_norm_fn is None:
                raise ImportError('Failed to import Triton LayerNorm / RMSNorm kernels')

        self.layers = nn.ModuleList(
            [
                create_block(
                    d_model,
                    ssm_cfg=config.ssm_cfg,
                    norm_epsilon=config.norm_epsilon,
                    rms_norm=config.rms_norm,
                    residual_in_fp32=config.residual_in_fp32,
                    fused_add_norm=config.fused_add_norm,
                    layer_idx=i,
                    bidirectional=config.bidirectional,
                    bidirectional_strategy=config.bidirectional_strategy,
                    bidirectional_weight_tie=config.bidirectional_weight_tie,
                    modulate=True if config.temb_strategy and 'adaln' in config.temb_strategy else False,
                    d_temb=config.d_temb,
                    **factory_kwargs,
                )
                for i in range(config.n_layer)
            ]
        )

        if self.temb_strategy and 'adaln' in self.temb_strategy:
            # Zero-initialized so the final modulation starts as an identity.
            self.adaLN_modulation_final = nn.Linear(
                config.d_temb, 2 * d_model, bias=True
            )
            self.adaLN_modulation_final.weight.data.zero_()
            self.adaLN_modulation_final.bias.data.zero_()

        norm_f = (nn.LayerNorm if not config.rms_norm else RMSNorm)(
            d_model, eps=config.norm_epsilon, **factory_kwargs
        )
        self.norm_f = norm_f

    def pre_apply_temb(self, input_embeds, time_embeds):
        """Prepend/add time embeddings to input embeddings at the start of the forward pass.

        Args:
            input_embeds: Input embeddings. (batch, seqlen, d_model)
            time_embeds: Timestep embeddings. (batch, d_temb)
        Returns:
            if self.temb_strategy == 'concat':
                input_embeds: (batch, seqlen, d_model + d_temb)
            if self.temb_strategy == 'add':
                input_embeds: (batch, seqlen, d_model)
        """
        if self.temb_strategy == 'concat':
            input_embeds = torch.cat([time_embeds.unsqueeze(1).tile(
                1, input_embeds.shape[1], 1), input_embeds], axis=-1)
        elif self.temb_strategy == 'add':
            # NOTE(review): in-place `+=` mutates the tensor passed in; fine for
            # freshly-computed embeddings, but mutates caller-owned
            # `inputs_embeds` if one was supplied — confirm intended.
            input_embeds += time_embeds.unsqueeze(1).tile(1, input_embeds.shape[1], 1)
        return input_embeds

    def forward(
        self,
        input_ids,
        inputs_embeds=None,
        output_hidden_states=False,
        time_embeds=None,
    ):
        """Mixer forward.

        Args:
            input_ids: (batch, seqlen) token ids; ignored when `inputs_embeds` is given.
            inputs_embeds: optional precomputed embeddings.
            output_hidden_states: collect per-layer activations.
            time_embeds: (batch, d_temb) conditioning applied per `temb_strategy`.
        Returns:
            (hidden_states, all_hidden_states) tuple.
        """
        all_hidden_states = []
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embeddings(input_ids)
        if (
            time_embeds is not None
            and self.temb_strategy in ['concat', 'add']
        ):
            hidden_states = self.pre_apply_temb(hidden_states, time_embeds)

        residual = None

        for ind, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states.append(hidden_states)
            # TODO: Add support for gradient checkpointing
            layer_out = layer(
                hidden_states, residual, inference_params=None, time_embeds=time_embeds
            )

            # NOTE(review): the second output is bound to `residuals` (plural)
            # while `residual` above stays None for every layer, so no residual
            # stream is threaded between blocks — looks like a typo
            # (`residuals` vs `residual`); confirm against the custom Block's
            # (hidden_states, residual) contract before changing, as trained
            # checkpoints match the current behavior.
            hidden_states, residuals = layer_out

        if not self.fused_add_norm:
            if self.temb_strategy and 'adaln' in self.temb_strategy:
                raise NotImplementedError('adaln only implemented for fused_add_norm')
            residual = (
                (hidden_states + residual) if residual is not None else hidden_states
            )
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            # adaln: compute final shift/scale from the timestep embedding.
            if time_embeds is not None and self.temb_strategy and 'adaln' in self.temb_strategy:
                shift, scale = self.adaLN_modulation_final(time_embeds)[:, None].chunk(
                    2, dim=2
                )

            fused_add_norm_fn = (
                rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            )

            # Set prenorm=False here since we don't need the residual
            hidden_states = fused_add_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
            )
            if time_embeds is not None and self.temb_strategy and 'adaln' in self.temb_strategy:
                hidden_states = modulate_fused(hidden_states, shift, scale)

        if output_hidden_states:
            all_hidden_states.append(hidden_states)

        return hidden_states, all_hidden_states
836
+
837
+
838
def cross_entropy(logits, y, ignore_index=-100):
    """Mean token-level cross entropy over flattened logits/targets.

    Positions whose target equals `ignore_index` are excluded.
    """
    vocab = logits.shape[-1]
    return F.cross_entropy(
        logits.view(-1, vocab), y.view(-1), ignore_index=ignore_index)
843
+
844
+
845
def weighted_cross_entropy(logits, y, loss_weights, ignore_index=-100):
    """Weighted cross entropy loss (discounts certain tokens).

    Args:
        logits: (..., vocab) unnormalized scores.
        y: (...) integer targets, flattened alongside logits.
        loss_weights: per-token weights, same number of elements as `y`.
        ignore_index: target value excluded from the loss.
    Returns:
        Scalar loss: sum of per-token CE weighted by normalized weights.
    """
    logits = logits.view(-1, logits.shape[-1])
    y = y.view(-1)
    ce = F.cross_entropy(logits, y, ignore_index=ignore_index, reduction='none')
    # Clone before zeroing ignored positions: `view`/`reshape` share storage
    # with the caller's tensor, so the previous in-place write mutated the
    # caller's `loss_weights` as a side effect.
    loss_weights = loss_weights.reshape(-1).clone()
    loss_weights[y == ignore_index] = 0.0
    # TODO: Follows GPN implementation, but should we remove weight normalization?
    return (ce * (loss_weights / loss_weights.sum())).sum()
854
+
855
+
856
class BiMambaPreTrainedModel(PreTrainedModel):
    """PreTrainedModel wrapper for BiMamba backbone."""

    config_class = BiMambaConfig
    base_model_prefix = 'bimamba'
    supports_gradient_checkpointing = False
    _no_split_modules = ['BiMambaWrapper']

    def _init_weights(
        self,
        module,
        initializer_range=0.02,  # Now only used for embedding layer.
        **kwargs,
    ):
        """Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py"""

        n_layer = self.config.n_layer
        # Per-run overrides from the config; fall back to defaults above.
        initialized_cfg = (
            self.config.initializer_cfg
            if self.config.initializer_cfg is not None
            else {}
        )
        rescale_prenorm_residual = initialized_cfg.get('rescale_prenorm_residual', True)
        initializer_range = initialized_cfg.get('initializer_range', initializer_range)
        n_residuals_per_layer = initialized_cfg.get('n_residuals_per_layer', 1)

        if isinstance(module, nn.Linear):
            if module.bias is not None:
                # Biases flagged `_no_reinit` (e.g. by Mamba internals) are preserved.
                if not getattr(module.bias, '_no_reinit', False):
                    nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=initializer_range)

        if rescale_prenorm_residual:
            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
            # > A modified initialization which accounts for the accumulation on the residual path with model depth.
            # > Scale the weights of residual layers at initialization by a factor of 1/√N where N is the # of
            # residual layers.
            # > -- GPT-2 :: https://openai.com/blog/better-language-models/
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            for name, p in module.named_parameters():
                if name in ['out_proj.weight', 'fc2.weight']:
                    # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                    # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
                    # We need to reinit p since this code could be called multiple times
                    # Having just p *= scale would repeatedly scale it down
                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                    with torch.no_grad():
                        p /= math.sqrt(n_residuals_per_layer * n_layer)
906
+
907
+
908
class BiMamba(BiMambaPreTrainedModel):
    """BiMamba model that can be instantiated using HF patterns."""

    def __init__(self, config: BiMambaConfig, device=None, dtype=None, **kwargs):
        super().__init__(config)

        # Pad the vocabulary up to the next configured multiple, if needed.
        remainder = config.vocab_size % config.pad_vocab_size_multiple
        if remainder != 0:
            config.vocab_size += config.pad_vocab_size_multiple - remainder

        self.config = config
        self.backbone = BiMambaMixerModel(
            config, device=device, dtype=dtype, **kwargs)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        time_embeds: Optional[bool] = None,
    ) -> Union[torch.Tensor, Tuple, BaseModelOutputWithNoAttention]:
        """HF-compatible forward method."""
        # Resolve None arguments against the config defaults.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        hidden_states, all_hidden_states = self.backbone(
            input_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            time_embeds=time_embeds,
        )

        if return_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states if output_hidden_states else None,
            )
        if output_hidden_states:
            return hidden_states, all_hidden_states
        return hidden_states
960
+
961
+
962
class BiMambaForMaskedLM(BiMambaPreTrainedModel):
    """HF-compatible BiMamba model for masked language modeling."""

    def __init__(self, config: BiMambaConfig, device=None, dtype=None, **kwargs):
        super().__init__(config, **kwargs)
        factory_kwargs = {'device': device, 'dtype': dtype}
        self.bimamba = BiMamba(config, **factory_kwargs, **kwargs)
        self.config = config
        self.temb_strategy = config.temb_strategy
        lm_head_in_dim = config.d_model
        # LM head may only take in concatenated timestep embeddings
        # if its weights are not tied to the vocab embedding
        if (
            not config.tie_word_embeddings
            and config.temb_strategy == 'concat'
        ):
            lm_head_in_dim += config.d_temb
        self.lm_head = nn.Linear(
            lm_head_in_dim,
            self.config.vocab_size,  # Use BiMamba config as it might have been updated
            bias=False,
            **factory_kwargs,
        )
        # Initialize weights and apply final processing
        self.post_init()
        if self.config.tie_word_embeddings:
            self.tie_weights()

    def init_weights(self):
        """
        If needed prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any
        initialization logic in `_init_weights`.
        """

        # Initialize weights
        self.apply(self._initialize_weights)

        # Tie weights should be skipped when not initializing all weights
        # since from_pretrained(...) calls tie weights anyways

    def post_init(self):
        """
        A method executed at the end of each Transformer model initialization, to execute code that needs the model's
        modules properly initialized (such as weight initialization).
        """
        self.init_weights()
        self._backward_compatibility_gradient_checkpointing()

    def get_input_embeddings(self):
        """Return the backbone's vocabulary embedding module."""
        return self.bimamba.backbone.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the backbone's vocabulary embedding module."""
        self.bimamba.backbone.embeddings.word_embeddings = value

    def get_output_embeddings(self):
        """Return the LM head (used by HF weight tying)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Overrides output embeddings."""
        self.lm_head = new_embeddings

    def tie_weights(self):
        """Tie weights."""
        super().tie_weights()

    def get_decoder(self):
        """Get decoder (backbone) for the model."""
        return self.bimamba

    def set_decoder(self, decoder):
        """Set decoder (backbone) for the model."""
        self.bimamba = decoder

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        loss_weights: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        time_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        """HF-compatible forward method.

        Computes logits over the vocabulary and, when `labels` is given, a
        (optionally `loss_weights`-weighted) cross-entropy loss that ignores
        `config.pad_token_id` targets.
        """

        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.bimamba(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            time_embeds=time_embeds,
        )
        hidden_states = outputs[0]
        # With tied embeddings + 'concat', the first d_temb feature channels
        # are the prepended timestep features (see pre_apply_temb); strip them
        # so the width matches the tied vocab-embedding head.
        if (
            self.config.tie_word_embeddings
            and time_embeds is not None
            and self.temb_strategy is not None
            and self.temb_strategy == 'concat'
        ):
            hidden_states = hidden_states[:, :, self.config.d_temb:]

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            if loss_weights is not None:
                loss = weighted_cross_entropy(
                    logits, labels, loss_weights, ignore_index=self.config.pad_token_id
                )
            else:
                loss = cross_entropy(
                    logits, labels, ignore_index=self.config.pad_token_id
                )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
1095
+
1096
class DiMamba(nn.Module, huggingface_hub.PyTorchModelHubMixin):
    """Diffusion wrapper: timestep embedder + BiMamba masked-LM backbone."""

    def __init__(self, config, vocab_size: int, pad_token_id: int):
        super().__init__()
        # Accept either an OmegaConf object or a plain dict (e.g. loaded from hub).
        if type(config) == dict:
            config = omegaconf.OmegaConf.create(config)

        self.temb_strategy = config.model.temb_strategy

        # 'add' injects the timestep embedding directly into the hidden stream,
        # so it must match hidden_size; other strategies use cond_dim.
        # No sigma_map is created when the strategy is 'none'.
        if self.temb_strategy == 'add':
            self.sigma_map = TimestepEmbedder(config.model.hidden_size)
        elif self.temb_strategy != 'none':
            self.sigma_map = TimestepEmbedder(config.model.cond_dim)

        mamba_config = BiMambaConfig(
            d_model=config.model.hidden_size,
            n_layer=config.model.n_blocks,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size,
            pad_vocab_size_multiple=1,
            tie_word_embeddings=config.model.tie_word_embeddings,
            temb_strategy=self.temb_strategy,
            d_temb=config.model.cond_dim,
            bidirectional=True)

        self.model = BiMambaForMaskedLM(config=mamba_config)

    def _get_bias_dropout_scale(self):
        # Fused train/inference dropout-add kernels (see dit.py).
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference

    def forward(self, indices, sigma):
        """indices: (B, L) token ids; sigma: (B,) noise levels -> logits."""
        c = None
        # Bug fix: __init__ skips creating `sigma_map` when the strategy is the
        # string 'none', but the old check only tested `is not None`, crashing
        # with AttributeError for temb_strategy == 'none'. Guard both cases.
        if self.temb_strategy is not None and self.temb_strategy != 'none':
            c = F.silu(self.sigma_map(sigma))

        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            x = self.model(indices, time_embeds=c).logits

        return x
models/dit.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import typing
3
+
4
+ import flash_attn
5
+ import flash_attn.layers.rotary
6
+ import huggingface_hub
7
+ import omegaconf
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import rearrange
12
+
13
+ # Flags required to enable jit fusion kernels
14
+ torch._C._jit_set_profiling_mode(False)
15
+ torch._C._jit_set_profiling_executor(False)
16
+ torch._C._jit_override_can_fuse_on_cpu(True)
17
+ torch._C._jit_override_can_fuse_on_gpu(True)
18
+
19
+
20
def bias_dropout_add_scale(
        x: torch.Tensor,
        bias: typing.Optional[torch.Tensor],
        scale: torch.Tensor,
        residual: typing.Optional[torch.Tensor],
        prob: float,
        training: bool) -> torch.Tensor:
    """Fused helper: (x [+ bias]) -> dropout -> * scale -> [+ residual].

    Kept TorchScript-compatible: the scripted fused variants below compile it.
    """
    if bias is None:
        dropped = F.dropout(x, p=prob, training=training)
    else:
        dropped = F.dropout(x + bias, p=prob, training=training)
    out = scale * dropped
    if residual is not None:
        out = residual + out
    return out
35
+
36
+
37
def get_bias_dropout_add_scale(training):
    """Return a closure over `training` for bias_dropout_add_scale."""
    def _bias_dropout_add(x, bias, scale, residual, prob):
        out = bias_dropout_add_scale(x, bias, scale, residual, prob, training)
        return out

    return _bias_dropout_add
43
+
44
+
45
# function overload
def modulate(x: torch.Tensor,
             shift: torch.Tensor,
             scale: torch.Tensor) -> torch.Tensor:
    """AdaLN modulation: x * (1 + scale) + shift (same-shape operands)."""
    gain = 1 + scale
    return x * gain + shift
50
+
51
+
52
@torch.jit.script
def bias_dropout_add_scale_fused_train(
    x: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    scale: torch.Tensor,
    residual: typing.Optional[torch.Tensor],
    prob: float) -> torch.Tensor:
    """TorchScript-compiled train-mode (dropout active) bias_dropout_add_scale."""
    return bias_dropout_add_scale(
        x, bias, scale, residual, prob, True)
61
+
62
+
63
@torch.jit.script
def bias_dropout_add_scale_fused_inference(
    x: torch.Tensor,
    bias: typing.Optional[torch.Tensor],
    scale: torch.Tensor,
    residual: typing.Optional[torch.Tensor],
    prob: float) -> torch.Tensor:
    """TorchScript-compiled inference-mode (dropout disabled) bias_dropout_add_scale."""
    return bias_dropout_add_scale(
        x, bias, scale, residual, prob, False)
72
+
73
+
74
@torch.jit.script
def modulate_fused(x: torch.Tensor,
                   shift: torch.Tensor,
                   scale: torch.Tensor) -> torch.Tensor:
    """TorchScript-compiled wrapper around `modulate` (x * (1 + scale) + shift).

    NOTE: scripted against the typed `modulate` defined above; a second,
    broadcasting `modulate` overload later in this file shadows the Python
    name but does not affect this compiled version.
    """
    return modulate(x, shift, scale)
79
+
80
+
81
class Rotary(torch.nn.Module):
    """Rotary positional-embedding cache.

    Builds cos/sin tables shaped (1, seq_len, 3, 1, dim) so they broadcast
    against packed qkv tensors; the slice for v (index 2 on the qkv axis) is
    forced to cos=1 / sin=0 so the rotation acts as an identity on values.
    Tables are cached and rebuilt only when the sequence length changes.
    """

    def __init__(self, dim, base=10_000):
        super().__init__()
        # Standard RoPE inverse frequencies over even channel indices.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        # Lazily-built caches, refreshed whenever seq_len changes.
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x, seq_dim=1):
        seq_len = x.shape[seq_dim]
        if seq_len != self.seq_len_cached:
            self.seq_len_cached = seq_len
            t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
            freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            # dims are: batch, seq_len, qkv, head, dim
            self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
            self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
            # This makes the transformation on v an identity.
            self.cos_cached[:,:,2,:,:].fill_(1.)
            self.sin_cached[:,:,2,:,:].fill_(0.)

        return self.cos_cached, self.sin_cached
105
+
106
+
107
def rotate_half(x):
    """Rotate the last dim by a quarter turn: (x1, x2) -> (-x2, x1)."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
110
+
111
+
112
def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary embeddings to packed qkv via the flash-attn kernel.

    Slices the (1, S, 3, 1, dim) caches from `Rotary` down to the (S, dim/2)
    layout the flash-attn rotary kernel expects.
    """
    cos_1d = cos[0, :, 0, 0, :cos.shape[-1] // 2]
    sin_1d = sin[0, :, 0, 0, :sin.shape[-1] // 2]
    return flash_attn.layers.rotary.apply_rotary_emb_qkv_(qkv, cos_1d, sin_1d)
116
+
117
+
118
# function overload
def modulate(x, shift, scale):
    """Broadcasting AdaLN modulation: per-batch (B, D) shift/scale applied
    across the sequence dim of x (B, S, D). Shadows the typed `modulate`
    above for plain Python callers."""
    scale_b = scale.unsqueeze(1)
    shift_b = shift.unsqueeze(1)
    return x * (1 + scale_b) + shift_b
121
+
122
+
123
+ #################################################################################
124
+ # Layers #
125
+ #################################################################################
126
class LayerNorm(nn.Module):
    """Bias-free LayerNorm whose normalization runs in fp32.

    The affine scale is applied after the (autocast-disabled) norm, so only
    the normalization itself is forced to full precision.
    """

    def __init__(self, dim):
        super().__init__()
        self.weight = nn.Parameter(torch.ones([dim]))
        self.dim = dim

    def forward(self, x):
        # Disable autocast so the norm is computed in fp32 regardless of AMP.
        with torch.cuda.amp.autocast(enabled=False):
            normed = F.layer_norm(x.float(), [self.dim])
        return normed * self.weight[None, None, :]
135
+
136
+
137
def residual_linear(x, W, x_skip, residual_scale):
    """x_skip + residual_scale * W @ x, as a single fused addmm."""
    dim_out, dim_in = W.shape[0], W.shape[1]
    flat = torch.addmm(
        x_skip.view(-1, dim_out),
        x.view(-1, dim_in),
        W.T,
        alpha=residual_scale)
    return flat.view(*x.shape[:-1], dim_out)
145
+
146
+
147
+ #################################################################################
148
+ # Embedding Layers for Timesteps and Class Labels #
149
+ #################################################################################
150
class TimestepEmbedder(nn.Module):
    """Embeds scalar timesteps into vector representations.

    A sinusoidal frequency embedding is projected through a two-layer MLP.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True))
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            - math.log(max_period)
            * torch.arange(start=0, end=half, dtype=torch.float32)
            / half).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Odd target dim: pad with a single zero column.
            zero_col = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, zero_col], dim=-1)
        return embedding

    def forward(self, t):
        freq_emb = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(freq_emb)
190
+
191
+
192
class LabelEmbedder(nn.Module):
    """Embeds class labels into vector representations.

    Also handles label dropout for classifier-free guidance.
    """

    def __init__(self, num_classes, cond_size):
        super().__init__()
        # One extra slot is reserved for the "dropped label" index used by
        # classifier-free guidance.
        self.embedding_table = nn.Embedding(num_classes + 1, cond_size)
        self.num_classes = num_classes

    # TODO think of initializing with 0.02 std deviation like in original DiT paper

    def forward(self, labels):
        return self.embedding_table(labels)
207
+
208
+
209
+ #################################################################################
210
+ # Core Model #
211
+ #################################################################################
212
+
213
+
214
class DDiTBlock(nn.Module):
    """DiT-style transformer block: adaLN-modulated self-attention + MLP.

    The conditioning vector `c` produces six modulation tensors
    (shift/scale/gate for both the attention and MLP sub-blocks) via a
    zero-initialized linear layer, so the block starts near identity.
    """

    def __init__(self, dim, n_heads, cond_dim, mlp_ratio=4, dropout=0.1):
        super().__init__()
        self.n_heads = n_heads

        self.norm1 = LayerNorm(dim)
        self.attn_qkv = nn.Linear(dim, 3 * dim, bias=False)
        self.attn_out = nn.Linear(dim, dim, bias=False)
        self.dropout1 = nn.Dropout(dropout)

        self.norm2 = LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_ratio * dim, bias=True),
            nn.GELU(approximate='tanh'),
            nn.Linear(mlp_ratio * dim, dim, bias=True))
        self.dropout2 = nn.Dropout(dropout)
        self.dropout = dropout

        # Zero-init so the adaLN modulation starts as identity (DiT scheme).
        self.adaLN_modulation = nn.Linear(cond_dim, 6 * dim, bias=True)
        self.adaLN_modulation.weight.data.zero_()
        self.adaLN_modulation.bias.data.zero_()

    def _get_bias_dropout_scale(self):
        # Pick the TorchScript-fused variant matching train/eval mode.
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference

    def forward(self, x, rotary_cos_sin, c, seqlens=None):
        """Run one block.

        Args:
            x: (batch, seq_len, dim) activations.
            rotary_cos_sin: (cos, sin) caches from `Rotary`.
            c: (batch, cond_dim) conditioning vector.
            seqlens: optional per-sequence lengths for varlen attention;
                when None, all sequences are treated as full length.
        Returns:
            (batch, seq_len, dim) tensor.
        """
        batch_size, seq_len = x.shape[0], x.shape[1]

        bias_dropout_scale_fn = self._get_bias_dropout_scale()

        (shift_msa, scale_msa, gate_msa, shift_mlp,
         scale_mlp, gate_mlp) = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)

        # attention operation
        x_skip = x
        x = modulate_fused(self.norm1(x), shift_msa, scale_msa)

        qkv = self.attn_qkv(x)  # dim -> 3 * dim
        qkv = rearrange(qkv,
                        'b s (three h d) -> b s three h d',
                        three=3,
                        h=self.n_heads)
        # Rotary embedding is applied in full precision.
        with torch.cuda.amp.autocast(enabled=False):
            cos, sin = rotary_cos_sin
            qkv = apply_rotary_pos_emb(
                qkv, cos.to(qkv.dtype), sin.to(qkv.dtype))
        qkv = rearrange(qkv, 'b s ... -> (b s) ...')
        if seqlens is None:
            # Full-length sequences: cu_seqlens = [0, S, 2S, ...].
            cu_seqlens = torch.arange(
                0, (batch_size + 1) * seq_len, step=seq_len,
                dtype=torch.int32, device=qkv.device)
        else:
            cu_seqlens = seqlens.cumsum(-1)
        x = flash_attn.flash_attn_interface.flash_attn_varlen_qkvpacked_func(
            qkv, cu_seqlens, seq_len, 0., causal=False)

        x = rearrange(x, '(b s) h d -> b s (h d)', b=batch_size)

        # Residual add with dropout, scaled by the attention gate.
        x = bias_dropout_scale_fn(self.attn_out(x),
                                  None,
                                  gate_msa,
                                  x_skip,
                                  self.dropout)

        # mlp operation
        x = bias_dropout_scale_fn(
            self.mlp(modulate_fused(
                self.norm2(x), shift_mlp, scale_mlp)),
            None, gate_mlp, x, self.dropout)
        return x
289
+
290
+
291
class DDiTBlock_non_pad(nn.Module):
    """DDiTBlock variant that drops padded tokens before varlen flash attention.

    `attnmask` (batch, seq_len) marks real tokens; padded positions are
    removed from the packed qkv stream, attention runs on the concatenated
    real tokens only, and outputs are scattered back into a zero-filled
    buffer at the padded positions.
    """

    def __init__(self, dim, n_heads, cond_dim, mlp_ratio=4, dropout=0.1):
        super().__init__()
        self.n_heads = n_heads

        self.norm1 = LayerNorm(dim)
        self.attn_qkv = nn.Linear(dim, 3 * dim, bias=False)
        self.attn_out = nn.Linear(dim, dim, bias=False)
        self.dropout1 = nn.Dropout(dropout)

        self.norm2 = LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_ratio * dim, bias=True),
            nn.GELU(approximate='tanh'),
            nn.Linear(mlp_ratio * dim, dim, bias=True))
        self.dropout2 = nn.Dropout(dropout)
        self.dropout = dropout

        # Zero-init so the adaLN modulation starts as identity (DiT scheme).
        self.adaLN_modulation = nn.Linear(cond_dim, 6 * dim, bias=True)
        self.adaLN_modulation.weight.data.zero_()
        self.adaLN_modulation.bias.data.zero_()

    def _get_bias_dropout_scale(self):
        # Pick the TorchScript-fused variant matching train/eval mode.
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference

    def forward(self, x, rotary_cos_sin, c, seqlens=None, attnmask=None):
        """Run one block, attending only over positions where `attnmask` is True.

        Args:
            x: (batch, seq_len, dim) activations.
            rotary_cos_sin: (cos, sin) caches from `Rotary`.
            c: (batch, cond_dim) conditioning vector.
            seqlens: ignored — recomputed from `attnmask` below.
            attnmask: (batch, seq_len) boolean mask of real (non-pad) tokens.
        Returns:
            (batch, seq_len, dim) tensor; padded positions carry zero
            attention output (plus the residual/MLP contributions).
        """
        batch_size, seq_len = x.shape[0], x.shape[1]

        bias_dropout_scale_fn = self._get_bias_dropout_scale()

        (shift_msa, scale_msa, gate_msa, shift_mlp,
         scale_mlp, gate_mlp) = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)

        # attention operation
        x_skip = x
        x = modulate_fused(self.norm1(x), shift_msa, scale_msa)

        qkv = self.attn_qkv(x)  # dim -> 3 * dim
        qkv = rearrange(qkv,
                        'b s (three h d) -> b s three h d',
                        three=3,
                        h=self.n_heads)
        # NOTE(review): autocast is enabled=True here but enabled=False in
        # DDiTBlock above — confirm the asymmetry is intentional.
        with torch.cuda.amp.autocast(enabled=True):
            cos, sin = rotary_cos_sin
            qkv = apply_rotary_pos_emb(qkv, cos.to(qkv.dtype), sin.to(qkv.dtype))
        qkv = rearrange(qkv, 'b s ... -> (b s) ...')

        # --------------------------------
        # Drop padded tokens: keep only positions where attnmask is True.
        mask_flat = attnmask.reshape(-1)
        qkv = qkv[mask_flat]
        # Per-sequence real-token counts, left-padded with a 0 so the cumsum
        # below yields flash-attn style cu_seqlens starting at 0.
        seqlens = attnmask.sum(dim=1)
        pad_seq_len = torch.zeros(len(seqlens)+1, dtype=torch.int32, device=qkv.device)
        pad_seq_len[1:] = seqlens
        seqlens = pad_seq_len
        # cu_seqlens = pad_seq_len.cumsum(-1)
        # x = flash_attn.flash_attn_interface.flash_attn_varlen_qkvpacked_func(
        #     qkv, cu_seqlens, seq_len, 0., causal=False)
        # --------------------------------

        # NOTE(review): `seqlens` is always reassigned above, so this
        # `seqlens is None` branch is unreachable dead code.
        if seqlens is None:
            cu_seqlens = torch.arange(
                0, (batch_size + 1) * seq_len, step=seq_len,
                dtype=torch.int32, device=qkv.device)
        else:
            cu_seqlens = seqlens.cumsum(-1).to(torch.int32)

        # Sanity checks (messages in Chinese): "cu_seqlens minimum must be 0"
        # and "total token count does not match cu_seqlens".
        assert cu_seqlens.min() == 0, "cu_seqlens 最小值必须等于 0"
        assert qkv.size(0) == cu_seqlens[-1], "token 总数和 cu_seqlens 不符"

        x = flash_attn.flash_attn_interface.flash_attn_varlen_qkvpacked_func(
            qkv, cu_seqlens, seq_len, 0., causal=False)

        # --------------------------------
        # Scatter attention outputs back to the padded (B*S, h, d) layout;
        # padded positions remain zero.
        out_flat = torch.zeros([batch_size*seq_len, x.shape[1], x.shape[2]]).to(x.device).to(x.dtype)
        out_flat[mask_flat] = x
        x = out_flat
        # --------------------------------

        x = rearrange(x, '(b s) h d -> b s (h d)', b=batch_size)

        # Residual add with dropout, scaled by the attention gate.
        x = bias_dropout_scale_fn(self.attn_out(x),
                                  None,
                                  gate_msa,
                                  x_skip,
                                  self.dropout)

        # mlp operation
        x = bias_dropout_scale_fn(
            self.mlp(modulate_fused(
                self.norm2(x), shift_mlp, scale_mlp)),
            None, gate_mlp, x, self.dropout)
        return x
386
+
387
+
388
class EmbeddingLayer(nn.Module):
    """Learnable token-embedding table implemented as a raw parameter lookup.

    Stores a (vocab_dim, dim) weight matrix and indexes it directly with
    integer token ids, so the output shape is ``x.shape + (dim,)``.
    """

    def __init__(self, dim, vocab_dim):
        super().__init__()
        weight = torch.empty(vocab_dim, dim)
        # Same default init scheme nn.Linear uses for its weight.
        torch.nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
        self.embedding = nn.Parameter(weight)

    def forward(self, x):
        # Plain advanced indexing: works for any integer-tensor shape.
        return self.embedding[x]
396
+
397
+
398
class DDitFinalLayer(nn.Module):
    """Final adaLN-modulated projection from hidden states to output logits.

    Both the output projection and the conditioning head are zero-initialized,
    so at the start of training the layer emits zeros regardless of input.
    """

    def __init__(self, hidden_size, out_channels, cond_dim):
        super().__init__()
        self.norm_final = LayerNorm(hidden_size)

        self.linear = nn.Linear(hidden_size, out_channels)
        nn.init.zeros_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

        self.adaLN_modulation = nn.Linear(cond_dim, 2 * hidden_size, bias=True)
        nn.init.zeros_(self.adaLN_modulation.weight)
        nn.init.zeros_(self.adaLN_modulation.bias)

    def forward(self, x, c):
        """Modulate normalized ``x`` by shift/scale derived from ``c``, then project."""
        # (batch, 2*hidden) -> (batch, 1, 2*hidden) so it broadcasts over seq.
        mod = self.adaLN_modulation(c).unsqueeze(1)
        shift, scale = mod.chunk(2, dim=2)
        h = modulate_fused(self.norm_final(x), shift, scale)
        return self.linear(h)
418
+
419
+
420
class DIT(nn.Module, huggingface_hub.PyTorchModelHubMixin):
    """Diffusion transformer over token sequences.

    Embeds integer token ``indices``, conditions a stack of ``DDiTBlock``s
    on a timestep embedding of ``sigma`` via adaLN, and projects back to
    vocabulary logits.
    """

    def __init__(self, config, vocab_size: int):
        super().__init__()
        # Accept a plain mapping (e.g. when reloaded via the hub mixin) as
        # well as an OmegaConf object. isinstance() also covers dict
        # subclasses, unlike the original `type(config) == dict` check.
        if isinstance(config, dict):
            config = omegaconf.OmegaConf.create(config)

        self.config = config
        self.vocab_size = vocab_size

        self.vocab_embed = EmbeddingLayer(config.model.hidden_size,
                                          vocab_size)
        self.sigma_map = TimestepEmbedder(config.model.cond_dim)
        # Rotary dimension is the per-head width.
        self.rotary_emb = Rotary(
            config.model.hidden_size // config.model.n_heads)

        self.blocks = nn.ModuleList([
            DDiTBlock(config.model.hidden_size,
                      config.model.n_heads,
                      config.model.cond_dim,
                      dropout=config.model.dropout)
            for _ in range(config.model.n_blocks)])

        self.output_layer = DDitFinalLayer(
            config.model.hidden_size,
            vocab_size,
            config.model.cond_dim)
        self.scale_by_sigma = config.model.scale_by_sigma

    def _get_bias_dropout_scale(self):
        # Dropout behaves differently in train/eval; select the matching
        # fused implementation.
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference

    def forward(self, indices, sigma):
        """Return (batch, seq_len, vocab_size) logits for ``indices`` at noise level ``sigma``."""
        x = self.vocab_embed(indices)
        c = F.silu(self.sigma_map(sigma))

        rotary_cos_sin = self.rotary_emb(x)

        # Transformer stack and head run under bfloat16 autocast.
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            for block in self.blocks:
                x = block(x, rotary_cos_sin, c, seqlens=None)
            x = self.output_layer(x, c)

        return x
467
+
468
class DIT_non_pad(nn.Module, huggingface_hub.PyTorchModelHubMixin):
    """Variant of ``DIT`` whose blocks skip padding tokens in attention.

    Identical architecture to ``DIT`` except the stack is built from
    ``DDiTBlock_non_pad`` and ``forward`` threads an ``attnmask`` through
    every block so attention runs only over real tokens.
    """

    def __init__(self, config, vocab_size: int):
        super().__init__()
        # Accept a plain mapping as well as an OmegaConf object; isinstance()
        # also covers dict subclasses, unlike `type(config) == dict`.
        if isinstance(config, dict):
            config = omegaconf.OmegaConf.create(config)

        self.config = config
        self.vocab_size = vocab_size

        self.vocab_embed = EmbeddingLayer(config.model.hidden_size,
                                          vocab_size)
        self.sigma_map = TimestepEmbedder(config.model.cond_dim)
        # Rotary dimension is the per-head width.
        self.rotary_emb = Rotary(
            config.model.hidden_size // config.model.n_heads)

        self.blocks = nn.ModuleList([
            DDiTBlock_non_pad(config.model.hidden_size,
                              config.model.n_heads,
                              config.model.cond_dim,
                              dropout=config.model.dropout)
            for _ in range(config.model.n_blocks)])

        self.output_layer = DDitFinalLayer(
            config.model.hidden_size,
            vocab_size,
            config.model.cond_dim)
        self.scale_by_sigma = config.model.scale_by_sigma

    def _get_bias_dropout_scale(self):
        # Dropout behaves differently in train/eval; select the matching
        # fused implementation.
        if self.training:
            return bias_dropout_add_scale_fused_train
        else:
            return bias_dropout_add_scale_fused_inference

    def forward(self, indices, sigma, attnmask):
        """Return (batch, seq_len, vocab_size) logits; ``attnmask`` marks real tokens.

        Args:
            indices: (batch, seq_len) integer token ids.
            sigma: noise-level tensor fed to the timestep embedder.
            attnmask: (batch, seq_len) mask, truthy at non-padding positions.
        """
        x = self.vocab_embed(indices)
        c = F.silu(self.sigma_map(sigma))

        rotary_cos_sin = self.rotary_emb(x)

        # Transformer stack and head run under bfloat16 autocast.
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            for block in self.blocks:
                x = block(x, rotary_cos_sin, c, seqlens=None,
                          attnmask=attnmask)
            x = self.output_layer(x, c)

        return x
models/ema.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
class ExponentialMovingAverage:
    """Maintains an exponential moving average (EMA) of a set of parameters.

    Shadow copies are kept only for parameters with ``requires_grad=True``;
    ``update`` must be called with the same parameter iterable (same order)
    that was used at construction time.
    """

    def __init__(self, parameters, decay, use_num_updates=True):
        """
        Args:
            parameters: Iterable of `torch.nn.Parameter`; usually the result
                of `model.parameters()`.
            decay: The exponential decay, in [0, 1].
            use_num_updates: Whether to warm up the decay based on the number
                of updates (decay grows toward its nominal value early on).

        Raises:
            ValueError: If ``decay`` is outside [0, 1].
        """
        if decay < 0.0 or decay > 1.0:
            raise ValueError('Decay must be between 0 and 1')
        self.decay = decay
        self.num_updates = 0 if use_num_updates else None
        # Detached clones: EMA weights never receive gradients.
        self.shadow_params = [p.clone().detach()
                              for p in parameters if p.requires_grad]
        self.collected_params = []

    def move_shadow_params_to_device(self, device):
        """Move all shadow parameters to ``device``."""
        self.shadow_params = [p.to(device) for p in self.shadow_params]

    def update(self, parameters):
        """Update the moving averages from the current parameter values.

        Call this every time the parameters change, e.g. right after
        `optimizer.step()`.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; usually the same
                set of parameters used to initialize this object.
        """
        decay = self.decay
        if self.num_updates is not None:
            self.num_updates += 1
            # Warm-up schedule: effective decay is (1+n)/(10+n), capped at
            # the nominal decay, so early averages track the model closely.
            decay = min(decay,
                        (1 + self.num_updates) / (10 + self.num_updates))
        one_minus_decay = 1.0 - decay
        with torch.no_grad():
            parameters = [p for p in parameters if p.requires_grad]
            for s_param, param in zip(self.shadow_params, parameters):
                # In-place form of: s = decay * s + (1 - decay) * p
                s_param.sub_(one_minus_decay * (s_param - param))

    def copy_to(self, parameters):
        """Copy the moving averages into the given parameters.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to
                be overwritten with the stored moving averages.
        """
        parameters = [p for p in parameters if p.requires_grad]
        # (The redundant per-parameter requires_grad re-check of the original
        # was removed: the list above is already filtered by that predicate.)
        for s_param, param in zip(self.shadow_params, parameters):
            param.data.copy_(s_param.data)

    def store(self, parameters):
        """Save the current parameter values for restoring later.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to
                be temporarily stored.
        """
        self.collected_params = [param.clone() for param in parameters]

    def restore(self, parameters):
        """Restore the parameters saved with the `store` method.

        Useful to validate the model with EMA parameters without affecting
        the original optimization: call `store`, then `copy_to`, run
        validation (or save the model), then call `restore`. A no-op if
        `store` was never called.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to
                be overwritten with the stored values.
        """
        for c_param, param in zip(self.collected_params, parameters):
            param.data.copy_(c_param.data)

    def state_dict(self):
        """Return a serializable dict of the EMA state."""
        return dict(decay=self.decay,
                    num_updates=self.num_updates,
                    shadow_params=self.shadow_params)

    def load_state_dict(self, state_dict):
        """Load EMA state previously produced by `state_dict`."""
        self.decay = state_dict['decay']
        self.num_updates = state_dict['num_updates']
        self.shadow_params = state_dict['shadow_params']