Skip to content

Commit 477ee77

Browse files
add docstrings for docstring coverage
1 parent d2bb7ad commit 477ee77

3 files changed

Lines changed: 20 additions & 0 deletions

File tree

examples/llm_ptq/example_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ def build_quant_cfg(
209209
model_type,
210210
moe_calib_experts_ratio: float | None = None,
211211
) -> dict[str, Any]:
212+
"""Build quantization config with model-specific overrides for AWQ, SmoothQuant, and VLM."""
212213
quant_cfg = copy.deepcopy(quant_cfg)
213214
if "awq" in str(quant_cfg.get("algorithm")):
214215
from modelopt.torch.quantization.config import find_quant_cfg_entry_by_path
@@ -257,6 +258,11 @@ def build_quant_cfg(
257258
# (hidden_size -> num_v_heads, e.g. 1024 -> 16), quantizing them causes accuracy loss.
258259
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
259260
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
261+
# TRT-LLM's Qwen3.5 linear-attention packing only supports weight/weight_scale_inv;
262+
# disable activation quantization so input_scale is not exported for these layers.
263+
quant_cfg["quant_cfg"].append(
264+
{"quantizer_name": "*linear_attn*input_quantizer", "enable": False}
265+
)
260266

261267
return quant_cfg
262268

tests/_test_utils/torch/transformers_models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def get_tiny_qwen3_moe(**config_kwargs) -> PreTrainedModel:
112112
def create_tiny_qwen3_moe_dir(
113113
tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs
114114
) -> Path:
115+
"""Save a tiny Qwen3 MoE model (and optional tokenizer) to a temp directory."""
115116
qwen3_moe_dir = Path(tmp_path) / "tiny_qwen3_moe"
116117
if with_tokenizer:
117118
tokenizer = AutoTokenizer.from_pretrained(
@@ -157,6 +158,7 @@ def get_tiny_qwen3_5(**config_kwargs) -> PreTrainedModel:
157158

158159
##### GPT-OSS #####
159160
def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
161+
"""Create a tiny GPT-OSS MoE model for testing."""
160162
set_seed(SEED)
161163

162164
kwargs = {

tests/unit/torch/quantization/plugins/test_huggingface.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ def test_is_homogeneous_hf_model_gpt_oss():
236236

237237

238238
def test_hf_decoder_discoverer_registration_path():
239+
"""Verify HF decoder layer discoverer is registered and returns correct layers."""
239240
model = get_tiny_llama()
240241
assert any(
241242
is_supported is is_homogeneous_hf_model and discoverer is get_homogeneous_hf_decoder_layers
@@ -267,6 +268,10 @@ def test_qwen3_5_hybrid_attention_quantize(quant_config):
267268
# Disable narrow GatedDeltaNet projections (same as example_utils does for qwen3_5)
268269
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
269270
quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
271+
# Disable activation quantization for linear attention (TRT-LLM packing limitation)
272+
quant_cfg["quant_cfg"].append(
273+
{"quantizer_name": "*linear_attn*input_quantizer", "enable": False}
274+
)
270275

271276
def calib_fn(model):
272277
"""Run calibration forward passes with dummy inputs."""
@@ -303,3 +308,10 @@ def calib_fn(model):
303308
assert not module.weight_quantizer.is_enabled, (
304309
f"in_proj_a should have quantization disabled: {name}"
305310
)
311+
312+
# Verify linear_attn input quantizers are disabled (no input_scale for TRT-LLM export)
313+
for name, module in model.named_modules():
314+
if "linear_attn" in name and hasattr(module, "input_quantizer"):
315+
assert not module.input_quantizer.is_enabled, (
316+
f"linear_attn input_quantizer should be disabled: {name}"
317+
)

0 commit comments

Comments (0)