Update modeling_time_moe.py
modeling_time_moe.py  CHANGED  (+3 -6)
@@ -25,6 +25,7 @@ try:
 except:
     pass

+
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
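For reference, a minimal standalone sketch of what the two `_get_unpad_data` context lines above compute, run on a hypothetical padding mask (the mask values are illustrative, not taken from the repository):

```python
import torch

# Hypothetical padding mask for a batch of 2 sequences (1 = real token, 0 = padding).
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
# tensor([3, 2], dtype=torch.int32)  -> number of real tokens per sequence

indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
# tensor([0, 1, 2, 4, 5])  -> flat positions of the non-padded tokens
```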
@@ -66,7 +67,7 @@ def load_balancing_loss_func(
         The auxiliary loss.
     """
     if gate_logits is None or not isinstance(gate_logits, (tuple, list)) or gate_logits[0] is None:
-        return
+        return 0.0

     compute_device = gate_logits[0].device
     concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
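The switch from a bare `return` to `return 0.0` matters if the auxiliary loss is later scaled and added to the main loss, as is typical for MoE load-balancing losses. A small hypothetical sketch of that downstream arithmetic (the coefficient and the combination step are assumptions for illustration, not part of this diff):

```python
# Hypothetical downstream combination of main and auxiliary losses.
router_aux_loss_coef = 0.02   # illustrative value
main_loss = 1.25              # illustrative value

aux_loss = 0.0                # what the function now returns when gate_logits is unavailable
total_loss = main_loss + router_aux_loss_coef * aux_loss   # 1.25, no error

# With the old bare `return` (i.e. aux_loss = None), the same expression would raise:
#     TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'
```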
@@ -293,7 +294,7 @@ class TimeMoeSparseExpertsLayer(nn.Module):
         """ """
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
-        # router_logits
+        # router_logits -> (batch * sequence_length, n_experts)
         router_logits = self.gate(hidden_states)

         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
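The new comment documents the shape of `router_logits`. Below is a self-contained sketch of this routing step, assuming top-k expert selection as in common MoE layers; the sizes and the `torch.topk` renormalization are illustrative assumptions, not taken from this diff:

```python
import torch
import torch.nn.functional as F

# Hypothetical sizes, for illustration only.
batch_size, sequence_length, hidden_dim, n_experts, top_k = 2, 8, 16, 4, 2

gate = torch.nn.Linear(hidden_dim, n_experts, bias=False)
hidden_states = torch.randn(batch_size, sequence_length, hidden_dim)

hidden_states = hidden_states.view(-1, hidden_dim)       # (batch * sequence_length, hidden_dim)
router_logits = gate(hidden_states)                       # (batch * sequence_length, n_experts)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)

# A typical next step (not shown in this hunk): keep the top-k experts per token
# and renormalize their weights so they sum to 1.
routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
```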
@@ -764,8 +765,6 @@ class TimeMoeModel(TimeMoePreTrainedModel):

     def __init__(self, config: TimeMoeConfig):
         super().__init__(config)
-        # self.padding_idx = config.pad_token_id
-
         self.embed_layer = TimeMoeInputEmbedding(config)
         self.layers = nn.ModuleList(
             [TimeMoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
@@ -1096,12 +1095,10 @@ class TimeMoeForPrediction(TimeMoePreTrainedModel, TSGenerationMixin):
             shift_labels = labels

             # Calculate loss with mask
-            # losses = self.loss_function(shift_predictions.to(torch.float32), shift_labels.to(torch.float32))
             losses = self.loss_function(shift_predictions, shift_labels)

             if loss_masks is not None:
                 losses = losses * loss_masks
-
                 loss = losses.sum() / loss_masks.sum()
             else:
                 loss = torch.mean(losses)
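In the masked branch, the loss is averaged only over unmasked positions (`losses.sum() / loss_masks.sum()`) rather than over every element. A small numeric sketch with made-up values:

```python
import torch

# Per-position losses for one toy sequence, e.g. from a loss function with reduction='none'.
losses = torch.tensor([0.5, 1.0, 2.0, 4.0])
loss_masks = torch.tensor([1.0, 1.0, 1.0, 0.0])   # last position is padding

loss = (losses * loss_masks).sum() / loss_masks.sum()   # (0.5 + 1.0 + 2.0) / 3 ≈ 1.1667

# An unmasked mean would be skewed by the padded position:
unmasked = losses.mean()                                 # 1.875
```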