Add NVFP4 quantized checkpoint
- .gitattributes +1 -0
- README.md +23 -0
- added_tokens.json +0 -0
- chat_template.jinja +74 -0
- config.json +358 -0
- configuration_step_audio_2.py +128 -0
- generation_config.json +5 -0
- merges.txt +0 -0
- model-00001-of-00005.safetensors +3 -0
- model-00002-of-00005.safetensors +3 -0
- model-00003-of-00005.safetensors +3 -0
- model-00004-of-00005.safetensors +3 -0
- model-00005-of-00005.safetensors +3 -0
- model.safetensors.index.json +0 -0
- modeling_step_audio_2.py +425 -0
- recipe.yaml +6 -0
- special_tokens_map.json +49 -0
- tokenizer.json +3 -0
- tokenizer_config.json +0 -0
- vocab.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,23 @@
---
datasets:
- Rombo-Org/Optimized_Reasoning
base_model:
- stepfun-ai/Step-Audio-R1
---
# Step-Audio-R1-nvfp4

**Format:** NVFP4 — weights & activations quantized to FP4 with dual scaling.
**Base model:** `stepfun-ai/Step-Audio-R1`
**How it was made:** One-shot calibration with LLM Compressor (NVFP4 recipe), using long-sequence calibration data from Rombo-Org/Optimized_Reasoning.

> Notes: Keep `lm_head` in high precision; calibrate on long, domain-relevant sequences.

See the original model card for more information about this model.

# Running the model with vLLM in Docker
```sh
sudo docker run --runtime nvidia --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai:nightly --model Firworks/Step-Audio-R1-nvfp4 --dtype auto --max-model-len 32768
```
This was tested on a B200 cloud instance.

If there are other models you'd like to see quantized to NVFP4 for use on the DGX Spark or other Blackwell (or newer) cards, let me know. I'm trying to make more NVFP4 models available so that more people can try them out.
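Once the container is up, it serves the standard OpenAI-compatible API on port 8000. A minimal request sketch against that endpoint (assumes the default `/v1/chat/completions` route and no API key configured):

```python
import requests

# Query the vLLM OpenAI-compatible server started by the docker command above.
# The payload follows the standard OpenAI chat-completions schema.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Firworks/Step-Audio-R1-nvfp4",
        "messages": [{"role": "user", "content": "Briefly explain NVFP4 quantization."}],
        "max_tokens": 512,
    },
    timeout=300,
)
print(resp.json()["choices"][0]["message"]["content"])
```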
added_tokens.json
ADDED
The diff for this file is too large to render. See raw diff.
chat_template.jinja
ADDED
@@ -0,0 +1,74 @@
{%- if tools %}
{{- '<|BOT|>system
' }}
{%- if messages[0]['role'] == 'system' %}
{{- messages[0]['content'] + '<|EOT|>' }}
{%- else %}
{{- 'You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
{%- endif %}
{{- '<|BOT|>' }}
{{- "tool_json_schemas
" }}
{{- tools | tojson }}
{{- '<|EOT|>' }}
{%- else %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|BOT|>system
' + messages[0]['content'] + '<|EOT|>' }}
{%- else %}
{{- '<|BOT|>system
You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
{%- endif %}
{%- endif %}
{%- for message in messages %}
{%- if message["role"] == "user" %}
{{- '<|BOT|>human
' + message["content"] + '<|EOT|>' }}
{%- elif (message["role"] == "system" and not loop.first) or (message["role"] == "assistant" and not message["tool_calls"]) %}
{{- '<|BOT|>' + message["role"] + '
' + message["content"] + '<|EOT|>' }}
{%- elif message["role"] == "assistant" %}
{{- '<|BOT|>' + message["role"] + '
' }}
{%- if message["content"] %}
{{- message["content"] }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call["function"] is defined %}
{%- set tool_call = tool_call["function"] %}
{%- endif %}
{{- '<|CALL_START|>' + 'function
' + tool_call["name"] + '
' }}
{{- tool_call["arguments"] | tojson }}
{{- '<|CALL_END|>' }}
{%- endfor %}
{{- '<|EOT|>' }}
{%- elif message["role"] == "tool" %}
{{- '<|BOT|>' }}
{%- set ns = namespace(function_name="tool") %}
{%- if message["tool_call_id"] %}
{%- for prev_msg in messages %}
{%- if prev_msg["role"] == "assistant" and prev_msg["tool_calls"] %}
{%- for tool_call in prev_msg["tool_calls"] %}
{%- if tool_call["id"] == message["tool_call_id"] %}
{%- if tool_call["function"] is defined %}
{%- set ns.function_name = tool_call["function"]["name"] %}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- 'function_output
' + ns.function_name + '
' }}
{{- message["content"] }}
{{- '<|EOT|>' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|BOT|>assistant
<think>
' }}
{%- endif %}
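This is the template that `tokenizer.apply_chat_template` renders. A small sketch of exercising it (message contents are illustrative; `trust_remote_code` may not be strictly required for the tokenizer):

```python
from transformers import AutoTokenizer

# Render the chat template shipped with this checkpoint.
tok = AutoTokenizer.from_pretrained("Firworks/Step-Audio-R1-nvfp4", trust_remote_code=True)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 12 * 13?"},
]
# add_generation_prompt=True appends the '<|BOT|>assistant\n<think>\n' prefix
# defined at the end of the template above.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```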
config.json
ADDED
@@ -0,0 +1,358 @@
{
  "architectures": [
    "StepAudio2ForCausalLM"
  ],
  "audio_encoder_config": {
    "adapter_stride": 2,
    "kernel_size": 3,
    "llm_dim": 5120,
    "model_type": "step_audio_2_encoder",
    "n_audio_ctx": 1500,
    "n_audio_head": 20,
    "n_audio_layer": 32,
    "n_audio_state": 1280,
    "n_codebook_size": 4096,
    "n_mels": 128
  },
  "auto_map": {
    "AutoConfig": "configuration_step_audio_2.StepAudio2Config",
    "AutoModelForCausalLM": "modeling_step_audio_2.StepAudio2ForCausalLM"
  },
  "dtype": "bfloat16",
  "max_window_layers": null,
  "model_type": "step_audio_2",
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "format": "nvfp4-pack-quantized",
        "input_activations": {
          "actorder": null,
          "block_structure": null,
          "dynamic": "local",
          "group_size": 16,
          "num_bits": 4,
          "observer": "minmax",
          "observer_kwargs": {},
          "strategy": "tensor_group",
          "symmetric": true,
          "type": "float"
        },
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "actorder": null,
          "block_structure": null,
          "dynamic": false,
          "group_size": 16,
          "num_bits": 4,
          "observer": "minmax",
          "observer_kwargs": {},
          "strategy": "tensor_group",
          "symmetric": true,
          "type": "float"
        }
      }
    },
    "format": "nvfp4-pack-quantized",
    "global_compression_ratio": null,
    "ignore": [
      "encoder.blocks.0.attn.query", "encoder.blocks.0.attn.key", "encoder.blocks.0.attn.value", "encoder.blocks.0.attn.out", "encoder.blocks.0.mlp.0", "encoder.blocks.0.mlp.2",
      "encoder.blocks.1.attn.query", "encoder.blocks.1.attn.key", "encoder.blocks.1.attn.value", "encoder.blocks.1.attn.out", "encoder.blocks.1.mlp.0", "encoder.blocks.1.mlp.2",
      "encoder.blocks.2.attn.query", "encoder.blocks.2.attn.key", "encoder.blocks.2.attn.value", "encoder.blocks.2.attn.out", "encoder.blocks.2.mlp.0", "encoder.blocks.2.mlp.2",
      "encoder.blocks.3.attn.query", "encoder.blocks.3.attn.key", "encoder.blocks.3.attn.value", "encoder.blocks.3.attn.out", "encoder.blocks.3.mlp.0", "encoder.blocks.3.mlp.2",
      "encoder.blocks.4.attn.query", "encoder.blocks.4.attn.key", "encoder.blocks.4.attn.value", "encoder.blocks.4.attn.out", "encoder.blocks.4.mlp.0", "encoder.blocks.4.mlp.2",
      "encoder.blocks.5.attn.query", "encoder.blocks.5.attn.key", "encoder.blocks.5.attn.value", "encoder.blocks.5.attn.out", "encoder.blocks.5.mlp.0", "encoder.blocks.5.mlp.2",
      "encoder.blocks.6.attn.query", "encoder.blocks.6.attn.key", "encoder.blocks.6.attn.value", "encoder.blocks.6.attn.out", "encoder.blocks.6.mlp.0", "encoder.blocks.6.mlp.2",
      "encoder.blocks.7.attn.query", "encoder.blocks.7.attn.key", "encoder.blocks.7.attn.value", "encoder.blocks.7.attn.out", "encoder.blocks.7.mlp.0", "encoder.blocks.7.mlp.2",
      "encoder.blocks.8.attn.query", "encoder.blocks.8.attn.key", "encoder.blocks.8.attn.value", "encoder.blocks.8.attn.out", "encoder.blocks.8.mlp.0", "encoder.blocks.8.mlp.2",
      "encoder.blocks.9.attn.query", "encoder.blocks.9.attn.key", "encoder.blocks.9.attn.value", "encoder.blocks.9.attn.out", "encoder.blocks.9.mlp.0", "encoder.blocks.9.mlp.2",
      "encoder.blocks.10.attn.query", "encoder.blocks.10.attn.key", "encoder.blocks.10.attn.value", "encoder.blocks.10.attn.out", "encoder.blocks.10.mlp.0", "encoder.blocks.10.mlp.2",
      "encoder.blocks.11.attn.query", "encoder.blocks.11.attn.key", "encoder.blocks.11.attn.value", "encoder.blocks.11.attn.out", "encoder.blocks.11.mlp.0", "encoder.blocks.11.mlp.2",
      "encoder.blocks.12.attn.query", "encoder.blocks.12.attn.key", "encoder.blocks.12.attn.value", "encoder.blocks.12.attn.out", "encoder.blocks.12.mlp.0", "encoder.blocks.12.mlp.2",
      "encoder.blocks.13.attn.query", "encoder.blocks.13.attn.key", "encoder.blocks.13.attn.value", "encoder.blocks.13.attn.out", "encoder.blocks.13.mlp.0", "encoder.blocks.13.mlp.2",
      "encoder.blocks.14.attn.query", "encoder.blocks.14.attn.key", "encoder.blocks.14.attn.value", "encoder.blocks.14.attn.out", "encoder.blocks.14.mlp.0", "encoder.blocks.14.mlp.2",
      "encoder.blocks.15.attn.query", "encoder.blocks.15.attn.key", "encoder.blocks.15.attn.value", "encoder.blocks.15.attn.out", "encoder.blocks.15.mlp.0", "encoder.blocks.15.mlp.2",
      "encoder.blocks.16.attn.query", "encoder.blocks.16.attn.key", "encoder.blocks.16.attn.value", "encoder.blocks.16.attn.out", "encoder.blocks.16.mlp.0", "encoder.blocks.16.mlp.2",
      "encoder.blocks.17.attn.query", "encoder.blocks.17.attn.key", "encoder.blocks.17.attn.value", "encoder.blocks.17.attn.out", "encoder.blocks.17.mlp.0", "encoder.blocks.17.mlp.2",
      "encoder.blocks.18.attn.query", "encoder.blocks.18.attn.key", "encoder.blocks.18.attn.value", "encoder.blocks.18.attn.out", "encoder.blocks.18.mlp.0", "encoder.blocks.18.mlp.2",
      "encoder.blocks.19.attn.query", "encoder.blocks.19.attn.key", "encoder.blocks.19.attn.value", "encoder.blocks.19.attn.out", "encoder.blocks.19.mlp.0", "encoder.blocks.19.mlp.2",
      "encoder.blocks.20.attn.query", "encoder.blocks.20.attn.key", "encoder.blocks.20.attn.value", "encoder.blocks.20.attn.out", "encoder.blocks.20.mlp.0", "encoder.blocks.20.mlp.2",
      "encoder.blocks.21.attn.query", "encoder.blocks.21.attn.key", "encoder.blocks.21.attn.value", "encoder.blocks.21.attn.out", "encoder.blocks.21.mlp.0", "encoder.blocks.21.mlp.2",
      "encoder.blocks.22.attn.query", "encoder.blocks.22.attn.key", "encoder.blocks.22.attn.value", "encoder.blocks.22.attn.out", "encoder.blocks.22.mlp.0", "encoder.blocks.22.mlp.2",
      "encoder.blocks.23.attn.query", "encoder.blocks.23.attn.key", "encoder.blocks.23.attn.value", "encoder.blocks.23.attn.out", "encoder.blocks.23.mlp.0", "encoder.blocks.23.mlp.2",
      "encoder.blocks.24.attn.query", "encoder.blocks.24.attn.key", "encoder.blocks.24.attn.value", "encoder.blocks.24.attn.out", "encoder.blocks.24.mlp.0", "encoder.blocks.24.mlp.2",
      "encoder.blocks.25.attn.query", "encoder.blocks.25.attn.key", "encoder.blocks.25.attn.value", "encoder.blocks.25.attn.out", "encoder.blocks.25.mlp.0", "encoder.blocks.25.mlp.2",
      "encoder.blocks.26.attn.query", "encoder.blocks.26.attn.key", "encoder.blocks.26.attn.value", "encoder.blocks.26.attn.out", "encoder.blocks.26.mlp.0", "encoder.blocks.26.mlp.2",
      "encoder.blocks.27.attn.query", "encoder.blocks.27.attn.key", "encoder.blocks.27.attn.value", "encoder.blocks.27.attn.out", "encoder.blocks.27.mlp.0", "encoder.blocks.27.mlp.2",
      "encoder.blocks.28.attn.query", "encoder.blocks.28.attn.key", "encoder.blocks.28.attn.value", "encoder.blocks.28.attn.out", "encoder.blocks.28.mlp.0", "encoder.blocks.28.mlp.2",
      "encoder.blocks.29.attn.query", "encoder.blocks.29.attn.key", "encoder.blocks.29.attn.value", "encoder.blocks.29.attn.out", "encoder.blocks.29.mlp.0", "encoder.blocks.29.mlp.2",
      "encoder.blocks.30.attn.query", "encoder.blocks.30.attn.key", "encoder.blocks.30.attn.value", "encoder.blocks.30.attn.out", "encoder.blocks.30.mlp.0", "encoder.blocks.30.mlp.2",
      "encoder.blocks.31.attn.query", "encoder.blocks.31.attn.key", "encoder.blocks.31.attn.value", "encoder.blocks.31.attn.out", "encoder.blocks.31.mlp.0", "encoder.blocks.31.mlp.2",
      "adapter.linear1", "adapter.linear2",
      "lm_head"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed",
    "sparsity_config": {},
    "transform_config": {},
    "version": "0.12.2"
  },
  "sliding_window": 2048,
  "text_config": {
    "architectures": [
      "Qwen2ForCausalLM"
    ],
    "attention_dropout": 0.0,
    "dtype": "bfloat16",
    "hidden_act": "silu",
    "hidden_size": 5120,
    "initializer_range": 0.02,
    "intermediate_size": 27648,
    "layer_types": [
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention"
    ],
    "max_position_embeddings": 65536,
    "max_window_layers": 28,
    "model_type": "qwen2",
    "num_attention_heads": 40,
    "num_hidden_layers": 64,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 1000000.0,
    "sliding_window": null,
    "use_cache": true,
    "use_sliding_window": false,
    "vocab_size": 158720
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_sliding_window": false
}
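For context on the `quantization_config` above (`num_bits: 4`, `group_size: 16`, `strategy: tensor_group`): NVFP4 packs values into FP4 (E2M1) micro-blocks of 16 elements, each carrying an FP8 (E4M3) block scale, with a second per-tensor FP32 scale on top; this is the "dual scaling" mentioned in the README. Roughly,

$$\hat{w}_i \approx q^{\mathrm{FP4}}_i \cdot s^{\mathrm{FP8}}_{\lfloor i/16 \rfloor} \cdot s^{\mathrm{FP32}}_{\mathrm{tensor}}$$

and `"dynamic": "local"` on the input activations indicates that the per-block activation scales are computed at runtime while the per-tensor scale comes from calibration.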
configuration_step_audio_2.py
ADDED
@@ -0,0 +1,128 @@
from typing import Optional, Union

from transformers import Qwen2Config
from transformers.configuration_utils import PretrainedConfig


class StepAudio2EncoderConfig(PretrainedConfig):
    model_type = "step_audio_2_encoder"

    def __init__(
        self,
        n_mels=128,
        n_audio_ctx=1500,
        n_audio_state=512,
        n_audio_head=8,
        n_audio_layer=6,
        llm_dim=4096,
        kernel_size=3,
        adapter_stride=2,
        **kwargs,
    ):
        self.n_mels = n_mels
        self.n_audio_ctx = n_audio_ctx
        self.n_audio_state = n_audio_state
        self.n_audio_head = n_audio_head
        self.n_audio_layer = n_audio_layer
        self.llm_dim = llm_dim
        self.kernel_size = kernel_size
        self.adapter_stride = adapter_stride
        super().__init__(**kwargs)


class StepAudio2TextConfig(PretrainedConfig):
    model_type = "step_audio_2_text"

    def __init__(
        self,
        vocab_size=64012,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=48,
        num_attention_heads=32,
        num_attention_groups=4,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        rope_theta=1000000.0,
        rope_scaling=None,
        eos_token_id=None,
        **kwargs
    ):
        if eos_token_id is not None:
            if isinstance(eos_token_id, list):
                eos_token_id = list(set([151643, 151645, 151665] + eos_token_id))
            else:
                eos_token_id = [151643, 151645, 151665, eos_token_id]
        else:
            eos_token_id = [151643, 151645, 151665]

        super().__init__(
            eos_token_id=eos_token_id,
            **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_key_value_heads = num_key_value_heads
        assert self.num_attention_groups == self.num_key_value_heads, "num_attention_groups must be equal to num_key_value_heads"
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        self.text_config = Qwen2Config(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            architectures=["Qwen2ForCausalLM"],
            torch_dtype=getattr(self, "torch_dtype", "bfloat16"),
        )


class StepAudio2Config(PretrainedConfig):
    model_type = "step_audio_2"
    architectures = ["StepAudio2ForCausalLM"]

    def __init__(
        self,
        audio_encoder_config: Optional[Union[dict, StepAudio2EncoderConfig]] = None,
        text_config: Optional[Union[dict, StepAudio2TextConfig]] = None,
        use_sliding_window: bool = False,
        sliding_window: Optional[int] = 2048,
        max_window_layers: Optional[int] = None,
        **kwargs
    ):
        kwargs.setdefault("use_sliding_window", use_sliding_window)
        kwargs.setdefault("sliding_window", sliding_window)
        if max_window_layers is None:
            max_window_layers = kwargs.get("num_hidden_layers", None)
        kwargs.setdefault("max_window_layers", max_window_layers)
        super().__init__(**kwargs)

        if text_config is None:
            text_config = StepAudio2TextConfig().text_config
        elif isinstance(text_config, dict):
            text_config = StepAudio2TextConfig(**text_config).text_config

        self.text_config = text_config

        if audio_encoder_config is None:
            self.audio_encoder_config = StepAudio2EncoderConfig()
        elif isinstance(audio_encoder_config, dict):
            self.audio_encoder_config = StepAudio2EncoderConfig(**audio_encoder_config)
generation_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "_from_model_config": true,
  "do_sample": true,
  "transformers_version": "4.56.2"
}
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
model-00001-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a47a8da85e0da3bc31c230badb7663bae60309d731cb849ba15959dc422d68e
size 4952380248
model-00002-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95ef95ec980fec815184d6c9f2b2e95661be9e1e063a1a45466e908952803bbc
size 4937521480
model-00003-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:093b77b6b160d2476ca545eb757dbd0ee1d4b554fa152a833b4f60a9526bd26d
size 4937521480
model-00004-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33090fc271d96985256ae7998d71d93c46756ad16d63c037602529b155428689
size 4997834160
model-00005-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d812abb83c44af00a386341fc563d9c15cda01c08e1851ff1c904eb09d793a8
size 2291022848
model.safetensors.index.json
ADDED
The diff for this file is too large to render. See raw diff.
modeling_step_audio_2.py
ADDED
@@ -0,0 +1,425 @@
from typing import Iterable, Optional, Tuple

import librosa
import torch
import torch.nn.functional as F
import torchaudio
from torch import Tensor, nn
from transformers import PreTrainedModel, Qwen2Model
from transformers.generation.utils import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast

from .configuration_step_audio_2 import StepAudio2Config


def _mel_filters(n_mels: int) -> torch.Tensor:
    """Load the mel filterbank matrix for projecting STFT into a Mel spectrogram."""
    assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
    if n_mels == 128:
        return torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=128))
    else:
        return torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=80))


def load_audio(file_path, target_rate=16000, max_length=None):
    """
    Open an audio file and read as mono waveform, resampling as necessary.
    If max_length is provided, truncate the audio to that length.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != target_rate:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform)
    audio = waveform[0]  # get the first channel

    # Truncate audio if it exceeds max_length
    if max_length is not None and audio.shape[0] > max_length:
        audio = audio[:max_length]

    return audio


def log_mel_spectrogram(audio, n_mels=128, padding=479, device=None):
    """
    Compute the log-Mel spectrogram with specific padding for StepAudio
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)
    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
    window = torch.hann_window(400).to(audio.device)
    stft = torch.stft(audio, 400, 160, window=window, return_complex=True)
    magnitudes = stft[..., :-1].abs() ** 2
    filters = _mel_filters(n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec


def compute_token_num(max_feature_len):
    # First, audio goes through encoder:
    # 1. conv1: kernel=3, stride=1, padding=1 -> size unchanged
    # 2. conv2: kernel=3, stride=2, padding=1 -> size/2
    # 3. avg_pooler: kernel=2, stride=2 -> size/2
    max_feature_len = max_feature_len - 2  # remove padding
    encoder_output_dim = (max_feature_len + 1) // 2 // 2  # after conv2 and avg_pooler

    # Then through adaptor (parameters from config file):
    padding = 1
    kernel_size = 3  # from config: audio_encoder_config.kernel_size
    stride = 2  # from config: audio_encoder_config.adapter_stride
    adapter_output_dim = (encoder_output_dim + 2 * padding - kernel_size) // stride + 1
    return adapter_output_dim


def make_non_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Make mask tensor containing indices of non-padded part.

    The sequences in a batch may have different lengths. To enable
    batch computing, padding is needed to make all sequences the same
    size. To avoid the padding part passing values to context-dependent
    blocks such as attention or convolution, this padding part is
    masked.

    1 for non-padded part and 0 for padded part.

    Parameters
    ----------
    lengths (torch.Tensor): Batch of lengths (B,).

    Returns
    -------
    torch.Tensor: Mask tensor containing indices of padded part (B, max_T).

    Examples:
        >>> import torch
        >>> import s3tokenizer
        >>> lengths = torch.tensor([5, 3, 2])
        >>> masks = s3tokenizer.make_non_pad_mask(lengths)
        masks = [[1, 1, 1, 1, 1],
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
    """
    batch_size = lengths.size(0)
    max_len = max_len if max_len > 0 else lengths.max().item()
    seq_range = torch.arange(0,
                             max_len,
                             dtype=torch.int64,
                             device=lengths.device)
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_length_expand = lengths.unsqueeze(-1)
    mask = seq_range_expand >= seq_length_expand
    return ~mask


def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    """Convert bool-tensor to float-tensor for flash attention.

    Parameters
    ----------
    lengths (torch.Tensor): Batch of lengths (B, ?).

    Returns
    -------
    torch.Tensor: Mask tensor containing indices of padded part (B, ?).

    Examples:
        >>> import torch
        >>> import s3tokenizer
        >>> lengths = torch.tensor([5, 3, 2])
        >>> masks = s3tokenizer.make_non_pad_mask(lengths)
        masks = [[1, 1, 1, 1, 1],
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
        >>> new_masks = s3tokenizer.mask_to_bias(masks, torch.float32)
        new_masks = [[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00],
                     [-0.0000e+00, -0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10],
                     [-0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10]]
    """
    assert mask.dtype == torch.bool
    assert dtype in [torch.float32, torch.bfloat16, torch.float16]
    mask = mask.to(dtype)
    # attention mask bias
    # NOTE(Mddct): torch.finfo jit issues
    # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
    mask = (1.0 - mask) * -1.0e+10
    return mask


class LayerNorm(nn.LayerNorm):
    def forward(self, input: Tensor) -> Tensor:
        return super().forward(input).type(input.dtype)


class Linear(nn.Linear):
    def forward(self, input: Tensor) -> Tensor:
        return F.linear(
            input,
            self.weight.to(input.dtype),
            None if self.bias is None else self.bias.to(input.dtype),
        )


class Conv1d(nn.Conv1d):
    def _conv_forward(
        self, input: Tensor, weight: Tensor, bias: Optional[Tensor]
    ) -> Tensor:
        return super()._conv_forward(
            input, weight.to(input.dtype), None if bias is None else bias.to(input.dtype)
        )


class MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        mask: Optional[Tensor] = None,
    ):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        wv, qk = self.qkv_attention(q, k, v, mask)
        return self.out(wv), qk

    def qkv_attention(
        self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
    ):
        _, T, D = q.shape
        scale = (D // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k  # (B, n_head, T, T)
        if mask is not None:
            qk = qk + mask
        qk = qk.float()

        w = F.softmax(qk, dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()


class ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        n_mlp = n_state * 4
        self.mlp = nn.Sequential(
            Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
        )
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        mask: Optional[Tensor] = None,
    ):
        x = x + self.attn(self.attn_ln(x.contiguous()), mask=mask)[0]
        x = x + self.mlp(self.mlp_ln(x.contiguous()))
        return x


class AudioEncoder(nn.Module):
    def __init__(
        self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.positional_embedding = nn.Embedding(n_ctx, n_state)
        self.positional_embedding.requires_grad_(False)
        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.avg_pooler = nn.AvgPool1d(2, stride=2)
        self.after_norm = LayerNorm(n_state)
        self.gradient_checkpointing = False

    def forward(self, x: Tensor, x_len: Tensor) -> Tuple[Tensor, Tensor]:
        T = x.size(-1)
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)  # (B, T // 2, n_state)
        mask = make_non_pad_mask(x_len, T).unsqueeze(1)  # (B, 1, T)
        mask = mask_to_bias(mask[:, :, (T + 1) % 2::2], x.dtype)  # (B, 1, T // 2)
        x = (x + self.positional_embedding.weight[:x.shape[1], :]).to(x.dtype)
        for block in self.blocks:
            if self.gradient_checkpointing and self.training:
                x = torch.utils.checkpoint.checkpoint(block, x, mask.unsqueeze(1))
            else:
                x = block(x, mask.unsqueeze(1))
        x = x.permute(0, 2, 1)
        x = self.avg_pooler(x)
        x = x.permute(0, 2, 1)
        x_len = (x_len + 1) // 2 // 2
        x = self.after_norm(x.contiguous())
        return x, x_len


class Adaptor(nn.Module):
    def __init__(
        self,
        n_state: int = 1280,
        n_hidden: int = 3072,
        kernel_size: int = 7,
        stride: int = 4
    ):
        super().__init__()
        self.stride = stride
        if self.stride != -1:
            # print("self.stride: {}".format(self.stride))
            self.conv = Conv1d(n_state, n_state, kernel_size, stride, padding=1)
        self.linear1 = nn.Linear(n_state, 2048)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(2048, n_hidden)
        self.gradient_checkpointing = False

    def forward(self, x: Tensor) -> Tuple[Tensor]:
        T = x.size(-1)
        if self.stride != -1:
            if self.gradient_checkpointing and self.training:
                x = torch.utils.checkpoint.checkpoint(self.conv, x.permute(0, 2, 1))
                x = x.permute(0, 2, 1)
            else:
                x = x.permute(0, 2, 1)
                x = F.gelu(self.conv(x))
                x = x.permute(0, 2, 1)
        if self.gradient_checkpointing and self.training:
            x = torch.utils.checkpoint.checkpoint(self.linear1, x)
            x = torch.utils.checkpoint.checkpoint(self.relu, x)
            x = torch.utils.checkpoint.checkpoint(self.linear2, x)
        else:
            x = self.linear1(x)
            x = self.relu(x)
            x = self.linear2(x)
        return x


class StepAudio2ForCausalLM(PreTrainedModel, GenerationMixin):
    config_class = StepAudio2Config
    main_input_name = "input_ids"
    # Important: Add this attribute to make HF recognize it as a model with generation capability
    # _keys_to_ignore_on_load_missing = ["lm_head.weight"]
    supports_gradient_checkpointing = True  # Added: declares support for gradient checkpointing

    def __init__(self, config: StepAudio2Config):
        super().__init__(config)
        if isinstance(config.torch_dtype, str):
            dtype = getattr(torch, config.torch_dtype)
        else:
            dtype = config.torch_dtype
        self.model = Qwen2Model(config.text_config)
        self.bf16 = dtype == torch.bfloat16
        self.encoder = AudioEncoder(
            config.audio_encoder_config.n_mels, config.audio_encoder_config.n_audio_ctx, config.audio_encoder_config.n_audio_state,
            config.audio_encoder_config.n_audio_head, config.audio_encoder_config.n_audio_layer
        )
        self.adapter = Adaptor(
            config.audio_encoder_config.n_audio_state, config.audio_encoder_config.llm_dim,
            config.audio_encoder_config.kernel_size, config.audio_encoder_config.adapter_stride
        )
        if self.bf16:
            self.encoder = self.encoder.bfloat16()
            self.adapter = self.adapter.bfloat16()
        self.lm_head = torch.nn.Linear(
            config.text_config.hidden_size,
            config.text_config.vocab_size,
            bias=False,
            dtype=dtype
        )
        self.post_init()

    def forward(
        self,
        input_ids=None,
        wavs=None,
        wav_lens=None,
        attention_mask=None,
        **kwargs
    ):
        hidden_states = self.model.embed_tokens(input_ids)
        if wavs is not None:
            if self.bf16:
                wavs = wavs.bfloat16()
            out, feat_lens = self.encoder(wavs, wav_lens)
            out = self.adapter(out)
            feat_lens = (feat_lens - 1) // 2 + 1
            insert_location = torch.nonzero(input_ids == 151688)
            insert_location[:, 1] += 1
            for idx in range(len(insert_location)):
                i, s = insert_location[idx]
                hidden_states[i][s : s + feat_lens[idx]] = out[idx][:feat_lens[idx]]

        x = self.model(inputs_embeds=hidden_states, attention_mask=attention_mask)[0]
        logits = self.lm_head(x)
        return CausalLMOutputWithPast(
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None
        )

    def get_input_embeddings(self):
        """Return the model's input embeddings - required for GenerationMixin"""
        return self.model.embed_tokens

    def get_output_embeddings(self):
        """Return the model's output embeddings (LM head) - required for GenerationMixin"""
        return self.lm_head

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
        """Prepare inputs for generation - required for GenerationMixin"""
        # Keep the wavs and wav_lens from the initial call
        wavs = kwargs.get("wavs", None)
        wav_lens = kwargs.get("wav_lens", None)

        # For generation steps after the first, we don't need to process audio again
        # because the audio tokens have already been replaced in the input sequence
        if "past_key_values" in kwargs and kwargs["past_key_values"] is not None:
            # We're in a generation step, no need to process audio again
            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "past_key_values": kwargs.get("past_key_values")
            }

        # First generation step, include audio processing
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "wavs": wavs,
            "wav_lens": wav_lens
        }

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder the cache for beam search - required for GenerationMixin if using beam search"""
        # If you're not using past_key_values or beam search, this can be a simple pass-through
        # Otherwise implement according to your model's cache structure
        return past_key_values

    def _set_gradient_checkpointing(self, module, value=False):
        # For Qwen2Model
        if hasattr(self.model, 'gradient_checkpointing'):
            self.model.gradient_checkpointing = value

        # Add the missing _gradient_checkpointing_func method to Qwen2Model
        # This is what Qwen2Model tries to use when gradient_checkpointing=True
        if value and not hasattr(self.model, '_gradient_checkpointing_func'):
            def _gradient_checkpointing_func(module_to_run, *args, **kwargs):
                # This function wraps torch.utils.checkpoint.checkpoint
                # and is used by Qwen2Model to perform checkpointing
                return torch.utils.checkpoint.checkpoint(module_to_run, *args, **kwargs)

            self.model._gradient_checkpointing_func = _gradient_checkpointing_func

        # For custom encoder and adapter
        if hasattr(self.encoder, 'gradient_checkpointing'):
            self.encoder.gradient_checkpointing = value
        if hasattr(self.adapter, 'gradient_checkpointing'):
            self.adapter.gradient_checkpointing = value
recipe.yaml
ADDED
@@ -0,0 +1,6 @@
default_stage:
  default_modifiers:
    QuantizationModifier:
      targets: [Linear]
      ignore: [lm_head, 're:^encoder\.', 're:^adapter\.', 're:^model\.embed_tokens\.', 're:.*layernorm.*']
      scheme: NVFP4
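For reference, a recipe like this is normally applied through LLM Compressor's one-shot flow. The sketch below is not the author's script: the dataset handling, sequence length, and sample count are assumptions (the README only states long-sequence calibration on Rombo-Org/Optimized_Reasoning), and a multimodal model like this one may need custom calibration-data preprocessing.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Illustrative one-shot NVFP4 quantization sketch mirroring recipe.yaml above.
model_id = "stepfun-ai/Step-Audio-R1"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=["lm_head", "re:^encoder\\.", "re:^adapter\\.",
            "re:^model\\.embed_tokens\\.", "re:.*layernorm.*"],
)

oneshot(
    model=model,
    dataset="Rombo-Org/Optimized_Reasoning",  # calibration set named in the README
    recipe=recipe,
    max_seq_length=8192,           # assumed; the README recommends long sequences
    num_calibration_samples=512,   # assumed
)

model.save_pretrained("Step-Audio-R1-nvfp4", save_compressed=True)
tokenizer.save_pretrained("Step-Audio-R1-nvfp4")
```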
special_tokens_map.json
ADDED
@@ -0,0 +1,49 @@
{
  "additional_special_tokens": [
    "<|EOT|>", "<|BOT|>", "<|CALL_START|>", "<|CALL_END|>",
    "<|THINK_START|>", "<|THINK_END|>", "<|IMG_START|>", "<|IMG_END|>",
    "<|META_START|>", "<|META_END|>", "<im_patch>", "<im_start>", "<im_end>",
    "<dream>", "<dream_start>", "<dream_end>", "<|MASK_1e69f|>", "<|UNMASK_1e69f|>",
    "<video_start>", "<video_end>", "<patch_start>", "<patch_end>", "<patch_newline>",
    "<audio_start>", "<audio_end>", "<audio_patch>", "<audio_patch_pad>",
    "<|SC|>", "<tts_start>", "<tts_end>", "<tts_pad>"
  ],
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c23796e0498b651e92b0d514d43636d0dfd556534f8dde7b72ed0e2ff1d07744
size 12684616
tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.