Skip to content

Commit f660a93

Browse files
support Qwen3.5 quantization
1 parent 0357cb9 commit f660a93

7 files changed

Lines changed: 106 additions & 1 deletion

File tree

examples/llm_ptq/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
109109
| Gemma 3 | ✅<sup>2</sup> | - | ✅ | - | - |
110110
| QWen 2, 2.5 <sup>4</sup> | ✅ | ✅ | ✅ | ✅ | ✅ |
111111
| QWen3, 3.5 MOE, Next <sup>6</sup> | ✅ | - | - | - | ✅ |
112+
| QWen3.5 <sup>6</sup> | ✅ | - | ✅ | - | - |
112113
| QwQ | ✅ | - | - | - | ✅ |
113114
| DeepSeek V3, R1, V3.1, V3.2<sup>7</sup> | - | - | - | - | ✅ |
114115
| GLM-4.7<sup>8</sup> | ✅ | - | - | - | ✅ |

examples/llm_ptq/example_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,12 @@ def build_quant_cfg(
252252
quant_cfg["quant_cfg"].append({"quantizer_name": "*image*", "enable": False})
253253
quant_cfg["quant_cfg"].append({"quantizer_name": "*vision*", "enable": False})
254254

255+
if model_type == "qwen3_5":
256+
# GatedDeltaNet's in_proj_b and in_proj_a have very narrow output dimensions
257+
# (hidden_size -> num_v_heads, e.g. 1024 -> 16), quantizing them causes accuracy loss.
258+
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
259+
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
260+
255261
return quant_cfg
256262

257263

examples/vlm_ptq/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#getting-started) fo
3838
| VILA | ✅ | ✅ | ✅ | ✅ | - |
3939
| Phi-3-vision, Phi-4-multimodal | ✅ | ✅ | ✅ | ✅ | ✅ |
4040
| Qwen2, 2.5-VL | ✅ | ✅ | ✅ | ✅ | ✅ |
41+
| Qwen3.5 | ✅ | - | ✅ | - | - |
4142
| Gemma3 | ✅ | - | - | - | - |
4243

4344
> *<sup>1.</sup>Only TensorRT-LLM checkpoint export is supported. Not compatible with the TensorRT-LLM torch backend* \

modelopt/torch/export/model_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
"MPT": "mpt",
3030
"Bloom": "bloom",
3131
"ChatGLM": "chatglm",
32+
"Qwen3_5Moe": "qwen3_5moe",
33+
"Qwen3_5": "qwen3_5",
3234
"Qwen3Moe": "qwen3moe",
3335
"Qwen3Next": "qwen3next",
3436
"QWen": "qwen",

modelopt/torch/export/quant_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1221,7 +1221,7 @@ def _update_svdquant(modules, new_pre_quant_scale):
12211221
# Mathematical equivalence:
12221222
# Before: down_proj_out = {[act_fn(self.gate_proj(x)) * up_proj(x)] * scale} @ down_proj.W^T
12231223
# After: down_proj_out = {[act_fn(self.gate_proj(x)) * (up_proj(x) * scale)]} @ down_proj.W^T
1224-
(["LlamaMLP", "Qwen3MLP", "Qwen3MoeMLP"], ("up_proj", "down_proj")),
1224+
(["LlamaMLP", "Qwen3MLP", "Qwen3MoeMLP", "Qwen3_5MLP"], ("up_proj", "down_proj")),
12251225
]
12261226

12271227

tests/_test_utils/torch/transformers_models.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@
3535
T5ForConditionalGeneration,
3636
)
3737

38+
try:
39+
from transformers import Qwen3_5TextConfig
40+
except ImportError:
41+
Qwen3_5TextConfig = None
42+
3843
import modelopt.torch.opt as mto
3944

4045
SEED = 1234
@@ -117,6 +122,37 @@ def create_tiny_qwen3_moe_dir(
117122
get_tiny_qwen3_moe(**config_kwargs).save_pretrained(qwen3_moe_dir)
118123
return qwen3_moe_dir
119124

125+
##### Qwen3.5 (hybrid linear attention + full attention) #####
126+
def get_tiny_qwen3_5(**config_kwargs) -> PreTrainedModel:
127+
if Qwen3_5TextConfig is None:
128+
pytest.skip("Qwen3_5TextConfig not available (requires transformers >= 4.57)")
129+
130+
set_seed(SEED)
131+
132+
kwargs = {
133+
"dtype": torch.bfloat16,
134+
"hidden_size": 32,
135+
"intermediate_size": 32,
136+
"num_hidden_layers": 4,
137+
"num_attention_heads": 2,
138+
"num_key_value_heads": 1,
139+
"head_dim": 16,
140+
"linear_num_key_heads": 4,
141+
"linear_num_value_heads": 4,
142+
"linear_key_head_dim": 8,
143+
"linear_value_head_dim": 8,
144+
"linear_conv_kernel_dim": 4,
145+
"full_attention_interval": 4,
146+
"attn_output_gate": True,
147+
"max_position_embeddings": 32,
148+
"vocab_size": 32,
149+
"rms_norm_eps": 1e-6,
150+
}
151+
kwargs.update(**config_kwargs)
152+
tiny_qwen3_5 = AutoModelForCausalLM.from_config(Qwen3_5TextConfig(**kwargs))
153+
154+
return tiny_qwen3_5
155+
120156

121157
##### GPT-OSS #####
122158
def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:

tests/unit/torch/quantization/plugins/test_huggingface.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
create_tiny_llama_dir,
2525
get_tiny_gpt_oss,
2626
get_tiny_llama,
27+
get_tiny_qwen3_5,
2728
get_tiny_qwen3_moe,
2829
tf_modelopt_state_and_output_tester,
2930
)
@@ -243,3 +244,61 @@ def test_hf_decoder_discoverer_registration_path():
243244
assert LayerActivationCollector.get_decoder_layers(model) is get_homogeneous_hf_decoder_layers(
244245
model
245246
)
247+
248+
249+
@pytest.mark.parametrize(
250+
"quant_config",
251+
[mtq.FP8_DEFAULT_CFG, mtq.INT4_AWQ_CFG],
252+
ids=["fp8", "int4_awq"],
253+
)
254+
def test_qwen3_5_hybrid_attention_quantize(quant_config):
255+
"""Verify FP8 and AWQ quantization works for Qwen3.5 hybrid (GatedDeltaNet + Attention)."""
256+
import copy
257+
258+
model = get_tiny_qwen3_5()
259+
260+
quant_cfg = copy.deepcopy(quant_config)
261+
if quant_config is mtq.INT4_AWQ_CFG:
262+
for entry in quant_cfg["quant_cfg"]:
263+
if entry["quantizer_name"] == "*weight_quantizer":
264+
entry.setdefault("cfg", {})["block_sizes"] = {-1: 16}
265+
break
266+
267+
# Disable narrow GatedDeltaNet projections (same as example_utils does for qwen3_5)
268+
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
269+
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
270+
271+
def calib_fn(model):
272+
x = model.dummy_inputs["input_ids"]
273+
for _ in range(2):
274+
model(x)
275+
276+
mtq.quantize(model, quant_cfg, calib_fn)
277+
278+
# Verify the model still produces output
279+
with torch.no_grad():
280+
out = model(model.dummy_inputs["input_ids"])
281+
assert out.logits is not None
282+
283+
# Verify both GatedDeltaNet and Attention linear layers got quantized
284+
has_gdn_quantized = False
285+
has_attn_quantized = False
286+
for name, module in model.named_modules():
287+
if hasattr(module, "weight_quantizer") and hasattr(module, "weight"):
288+
if "linear_attn.in_proj_qkv" in name:
289+
has_gdn_quantized = True
290+
if "self_attn.q_proj" in name:
291+
has_attn_quantized = True
292+
assert has_gdn_quantized, "GatedDeltaNet linear layers should be quantized"
293+
assert has_attn_quantized, "Attention linear layers should be quantized"
294+
295+
# Verify narrow projections are NOT quantized
296+
for name, module in model.named_modules():
297+
if "in_proj_b" in name and hasattr(module, "weight_quantizer"):
298+
assert not module.weight_quantizer.is_enabled, (
299+
f"in_proj_b should have quantization disabled: {name}"
300+
)
301+
if "in_proj_a" in name and hasattr(module, "weight_quantizer"):
302+
assert not module.weight_quantizer.is_enabled, (
303+
f"in_proj_a should have quantization disabled: {name}"
304+
)

0 commit comments

Comments
 (0)