diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f6c9735ab..ade3023c9 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12576,6 +12576,193 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: true
 
+# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
+# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
+# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
+# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build
+# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set
+# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
+# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
+# shipped without NIXL, so disagg workers crashed at NixlConnector init).
+# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
+# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
+# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
+# from m3_release before running. Fully disaggregated, rack-scale wide-EP
+# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors
+# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel
+# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers
+# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge.
+# 1P1D TP4->TEP8 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
+# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
+minimaxm3-fp8-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3  # same tag as model.container in the recipes
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb200
+  precision: fp8  # the checkpoint itself is MXFP8 (see model above)
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true  # every entry below has separate prefill and decode workers
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting
+      # per-step latency ~19% vs pure TP8.  Matches B200 TEP8 topology.
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]  # same points as the recipe's "1x2x4x8x16x32x64"
+        prefill:
+          num-worker: 1
+          tp: 4  # recipe: tensor-parallel-size 4 on one 4-GPU node
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8  # recipe: tensor-parallel-size 8 with enable-expert-parallel
+          ep: 8
+          dp-attn: false  # TEP8: tensor + expert parallel, no DP attention
+
+      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes.
+      - conc-list: [128, 256, 512]  # same points as the recipe's "128x256x512"
+        prefill:
+          num-worker: 1
+          tp: 8  # recipe runs DP8 x TP1; tp here looks like GPUs per worker (NOTE(review): confirm)
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8  # recipe decode: data-parallel-size 8, tensor-parallel-size 1
+          ep: 8
+          dp-attn: true
+
+      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [512, 1024, 2048]  # 512 is also the top point of the mid-curve entry
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16  # recipe decode: data-parallel-size 16, tensor-parallel-size 1
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled: 2P+1D, 8 nodes.
+      - conc-list: [2048, 4096]  # same points as the recipe's "2048x4096"
+        prefill:
+          num-worker: 2  # recipe: prefill_workers 2 across prefill_nodes 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput: 4P+1D, 12 nodes.
+      - conc-list: [4096, 8192]
+        prefill:
+          num-worker: 4  # recipe: prefill_workers 4 across prefill_nodes 8
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+    - isl: 8192  # 8k1k ladder; its recipes live under the 8k1k/ directory
+      osl: 1024
+      search-space:
+      # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      - conc-list: [1, 2, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false  # TEP8, as in the 1k1k low-latency entry
+
+      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
+      - conc-list: [32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [128, 256, 512]  # 128 is also the top point of the 8k1k mid-curve entry
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16  # DEP16 decode
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled 8k1k: 2P+1D, 8 nodes.
+      - conc-list: [512, 1024]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput 8k1k: 4P+1D, 12 nodes.
+      - conc-list: [1024, 2048]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 81727ef39..85b399e6c 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models resolved via hf: ids download from the Hub inside the
+  # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
+  # get 429-rate-limited when several workers pull a 444 GB snapshot at
+  # once; sbatch/srun inherit this env so the token reaches the workers.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
new file mode 100644
index 000000000..efc5d5740
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -0,0 +1,118 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of
+# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so
+# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = the 8 GPUs of the single prefill worker
+  decode_nodes: 2  # 2 nodes x 4 GPUs = the 8 GPUs of the single decode worker
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: TP1 per rank ...
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # ... x 8 DP ranks = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true  # EP8: 16 of the 128 routed experts per rank
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: TP1 per rank ...
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # ... x 8 DP ranks = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512  # equal to max-num-seqs
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512"  # same points as conc-list [128, 256, 512] in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
new file mode 100644
index 000000000..5ca08a06d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -0,0 +1,117 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide-decode curve).
+# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn
+# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128)
+# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER
+# attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = the 8 GPUs of the single prefill worker
+  decode_nodes: 4  # 4 nodes x 4 GPUs = the 16 GPUs of the single decode worker
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: TP1 per rank ...
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # ... x 8 DP ranks = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP16: TP1 per rank ...
+      pipeline-parallel-size: 1
+      data-parallel-size: 16  # ... x 16 DP ranks = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true  # EP16: 8 of the 128 experts per rank
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512  # equal to max-num-seqs
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"  # same points as conc-list [512, 1024, 2048] in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
new file mode 100644
index 000000000..f3e79340a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve).
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
+# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
+# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
+# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # one 4-GPU node holds the TP4 prefill worker
+  decode_nodes: 1  # one 4-GPU node holds the TP4 decode worker
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4; no data-parallel or expert-parallel keys are set
+      pipeline-parallel-size: 1
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4, matching prefill
+      pipeline-parallel-size: 1
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128  # below max-num-seqs (256); the benchmark tops out at 64
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:  # NOTE(review): the 1k1k search-space in nvidia-master.yaml does not reference this recipe; confirm
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
new file mode 100644
index 000000000..147803c78
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, decode
+# TEP4). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP4
+# (TP4+EP4, 1 node) = 2 nodes. Expert parallelism on decode splits 128
+# MoE experts across 4 ranks (32 each), reducing per-step MoE compute.
+# FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # one 4-GPU node holds the TP4 prefill worker
+  decode_nodes: 1  # one 4-GPU node holds the TEP4 decode worker
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"  # NIXL/UCX KV transfer over NVLink (cuda_ipc) instead of TCP
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # enables cross-node NVLink IPC
+    NCCL_CUMEM_ENABLE: "1"  # cuMem-allocates buffers so they are IPC-exportable
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4; expert parallel is enabled on decode only
+      pipeline-parallel-size: 1
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true  # TEP4: 128 experts over 4 ranks, 32 each
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128  # below max-num-seqs (256); the benchmark tops out at 32
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:  # NOTE(review): the 1k1k search-space in nvidia-master.yaml does not reference this recipe; confirm
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32"  # stops at 32, while the other low-latency recipes go to 64
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
new file mode 100644
index 000000000..199699212
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider
+# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8
+# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism
+# reduces per-step latency by spreading both attention and MoE across
+# 8 GPU over NVL72 NVLink.  FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # one 4-GPU node holds the TP4 prefill worker
+  decode_nodes: 2  # 2 nodes x 4 GPUs = the 8 GPUs of the single decode worker
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"  # NIXL/UCX KV transfer over NVLink (cuda_ipc) instead of TCP
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # enables cross-node NVLink IPC
+    NCCL_CUMEM_ENABLE: "1"  # cuMem-allocates buffers so they are IPC-exportable
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4; expert parallel is enabled on decode only
+      pipeline-parallel-size: 1
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8  # TP8 spans both decode nodes (2 x 4 GPUs)
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true  # TEP8: TP8 + EP8 (the file name says only "tp8")
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128  # below max-num-seqs (256); the benchmark tops out at 64
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32x64"  # same points as conc-list [1, 2, 4, 8, 16, 32, 64] in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
new file mode 100644
index 000000000..1d1591198
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
@@ -0,0 +1,104 @@
+name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (low-latency, more
+# decode workers). Prefill TP4 (1 node) -> NixlConnector -> 2x Decode
+# TP4 (2 nodes) = 3 nodes. Two decode workers halve the per-worker
+# batch, reducing ITL at low concurrency. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1  # one 4-GPU node holds the TP4 prefill worker
+  decode_nodes: 2  # 2 decode workers x 4 GPUs = 8 GPUs = 2 nodes
+  prefill_workers: 1
+  decode_workers: 2
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"  # NIXL/UCX KV transfer over NVLink (cuda_ipc) instead of TCP
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"  # enables cross-node NVLink IPC
+    NCCL_CUMEM_ENABLE: "1"  # cuMem-allocates buffers so they are IPC-exportable
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4  # pure TP4; no data-parallel or expert-parallel keys are set
+      pipeline-parallel-size: 1
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:  # applies to each of the 2 decode workers
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128  # below max-num-seqs (256); the benchmark tops out at 64
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:  # NOTE(review): the 1k1k search-space in nvidia-master.yaml does not reference this recipe; confirm
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
new file mode 100644
index 000000000..853095727
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -0,0 +1,117 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k"
+
+# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled).
+# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16
+# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt
+# ingest into a single wide decode at high concurrency. FLASHINFER
+# attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"  # same tag as `image` in nvidia-master.yaml
+  precision: "fp8"
+
+dynamo:
+  install: true  # layer the dynamo wheel below onto the vLLM image at job start
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720  # with interval_seconds 10 this is roughly a 2 h budget
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4  # 2 prefill workers x 8 GPUs = 16 GPUs = 4 nodes
+  decode_nodes: 4  # 4 nodes x 4 GPUs = the 16 GPUs of the single decode worker
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null  # left null; NixlConnector is passed explicitly in kv-transfer-config below
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # 3600 s = 1 h
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:  # same variables and values as prefill_environment
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:  # applies to each of the 2 prefill workers
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP8: TP1 per rank ...
+      pipeline-parallel-size: 1
+      data-parallel-size: 8  # ... x 8 DP ranks = gpus_per_prefill
+      data-parallel-rpc-port: 13346  # decode uses 13345
+      enable-expert-parallel: true
+      enforce-eager: true  # prefill only; decode sets max-cudagraph-capture-size instead
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1  # DEP16: TP1 per rank ...
+      pipeline-parallel-size: 1
+      data-parallel-size: 16  # ... x 16 DP ranks = gpus_per_decode
+      data-parallel-rpc-port: 13345  # prefill uses 13346
+      enable-expert-parallel: true
+      max-model-len: 2304  # isl 1024 + osl 1024 = 2048, plus 256 tokens of headroom
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512  # equal to max-num-seqs
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048x4096"  # same points as conc-list [2048, 4096] in nvidia-master.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
new file mode 100644
index 000000000..4a6aa5d0f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -0,0 +1,117 @@
+name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k"
+
+# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput).
+# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector ->
+# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max
+# prefill fan-in for the highest-concurrency points. FLASHINFER attention,
+# block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 4 prefill workers x 8 GPUs (8 nodes) + 1 decode worker x 16 GPUs (4 nodes)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 8
+  decode_nodes: 4
+  prefill_workers: 4
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4096x8192"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
new file mode 100644
index 000000000..f6f2c7874
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards
+# 16 experts/rank. FLASHINFER attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 1 prefill worker x 8 GPUs (2 nodes) + 1 decode worker x 8 GPUs (2 nodes)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32x64x128"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
new file mode 100644
index 000000000..0d7d44843
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k).
+# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16
+# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72
+# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has
+# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 1 prefill worker x 8 GPUs (2 nodes) + 1 decode worker x 16 GPUs (4 nodes)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
new file mode 100644
index 000000000..b0602354c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k).
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
+# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
+# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
+# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
+# Low-conc tuned: stream-interval 1, cudagraph cap 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 1 prefill worker x 4 GPUs (1 node) + 1 decode worker x 4 GPUs (1 node)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x2x4x8x16"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
new file mode 100644
index 000000000..453df782b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k, wider
+# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 (TP8+EP8,
+# 2 nodes) = 3 nodes. Wider decode TP + EP reduces per-step latency by
+# spreading attention and MoE across 8 GPUs over NVL72 NVLink. FLASHINFER,
+# block-size 128. Image minimax-m3-78ef73 has the hetero-TP NixlConnector fix.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 1 prefill worker x 4 GPUs (1 node) + 1 decode worker x 8 GPUs (2 nodes)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x2x4x8x16"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
new file mode 100644
index 000000000..6a0765c60
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -0,0 +1,110 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k).
+# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16
+# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute;
+# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 2 prefill workers x 8 GPUs (4 nodes) + 1 decode worker x 16 GPUs (4 nodes)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 4
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
new file mode 100644
index 000000000..9e4ff3c2b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -0,0 +1,110 @@
+name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k).
+# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16
+# (4 nodes) = 12 nodes within one NVL72 rack. Maximises prefill
+# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10  # 720 x 10 s = 2 h ceiling, assuming one probe per interval
+
+
+resources:  # 4 prefill workers x 8 GPUs (8 nodes) + 1 decode worker x 16 GPUs (4 nodes)
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 8
+  decode_nodes: 4
+  prefill_workers: 4
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024x2048"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 43f06f4f1..43f4e074e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3665,6 +3665,19 @@
     - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
 
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo"
+    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
+    - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
+    - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks"
+    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
+    - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
+    - "5 topologies, 1k1k + 8k1k: 1P1D TEP8 decode (3n, low-lat conc 1-64), 1P1D DEP8 (4n, conc 128-512), 1P1D DEP8->DEP16 (6n, conc 512-2048), 2P1D (8n, conc 2048-4096), 4P1D (12n, conc 4096-8192)"
+    - "TEP8 decode (enable-expert-parallel on TP8): 128 experts / 8 ranks = 16 experts/rank, ~19% lower ITL than pure TP8 at low conc; stream-interval 1 + max-cudagraph-capture-size 128 for interactivity"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
+
 - config-keys:
     - minimaxm3-fp8-b200-vllm
   description:
@@ -3850,6 +3863,16 @@
     - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689
 
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Fix NixlConnector handshake failure for hetero-TP disagg when num_kv_heads < decode TP (M3 TEP8: TP4 prefill → TP8 decode)"
+    - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)."
+    - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8"
+    - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)"
+    - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
+
 - config-keys:
     - minimaxm3-fp8-b200-vllm
   description:
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 36c8af203..9c3430289 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -60,8 +60,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5"
         export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8"
+    elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8"
+        export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
     else
-        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8"
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8"
         exit 1
     fi
 else
@@ -81,7 +84,7 @@ NGINX_IMAGE="nginx:1.27.4"
 # squash dir on a path that's also visible to compute nodes. Falls
 # back to the legacy sa-shared path so other configs are untouched.
 SQUASH_DIR="/mnt/lustre01/users-public/sa-shared"
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     echo "=== cluster diagnostic (minimax sweep) ==="
     echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)"
     echo "HOME=$HOME"
@@ -128,8 +131,32 @@ fi
 SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
-enroot import -o $SQUASH_FILE docker://$IMAGE
-enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
+# Concurrent matrix jobs (three gb200-nv runners) all import to the same
+# shared-FS squash path. An unsynchronized `enroot import -o` onto an
+# existing file APPENDS to it (mksquashfs default), corrupting the image
+# while other jobs' pyxis extractions are reading it — observed on the
+# minimaxm3 day-zero sweep (R1: an eval job appended to the live squash
+# mid-run). Serialize with a lock, skip when the existing file is valid,
+# and build to a temp path + atomic mv so readers never see a half-written
+# file. Mirrors the import_squash pattern in launch_gb300-nv.sh.
+import_squash() {
+    local squash="$1" image="$2"  # $1 = target .sqsh path, $2 = docker image reference
+    local lock="${squash}.lock"
+    (
+        exec 9>"$lock"  # fd 9 keeps the lock until this subshell exits
+        flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; }
+        if unsquashfs -l "$squash" > /dev/null 2>&1; then
+            echo "Squash file already exists and is valid, skipping import: $squash"
+        else
+            rm -f "$squash" "$squash".tmp.*
+            enroot import -o "${squash}.tmp.$$" "docker://$image" || { rm -f "${squash}.tmp.$$"; exit 1; }
+            mv -f "${squash}.tmp.$$" "$squash"  # only reached when the import succeeded
+        fi
+    ) || exit 1
+}
+
+import_squash "$SQUASH_FILE" "$IMAGE"
+import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
@@ -202,7 +229,7 @@ SRT_REPO_DIR="srt-slurm"
 # cross-mounted to compute nodes. Put the srt-slurm workspace and staged
 # InferenceX checkout on a writable shared-FS path that compute can see.
 # Per-run-unique paths avoid races between parallel sweep jobs.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SHARED_BASE=""
     for cand in \
         /mnt/lustre01/users-public/sa-shared/gha-runs \
@@ -269,6 +296,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then
         echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2
         exit 1
     fi
+elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
+    cd "$SRT_REPO_DIR" || exit 1
+    git checkout main || exit 1
+    mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1
 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
@@ -292,7 +325,7 @@ source $HOME/.local/bin/env
 # under a head-node-only path, .venv/bin/python3 becomes a broken
 # symlink on compute. Pin the venv to /usr/bin/python3 — a system
 # path that exists at the same location on both head and compute.
-if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then
+if [[ ( $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ) && -x /usr/bin/python3 ]]; then
     uv venv --seed --python /usr/bin/python3
 else
     uv venv --seed
@@ -312,7 +345,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
 # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path
 # above so srtctl's outputs/ directory (which lives under
 # SRTCTL_ROOT) is visible to compute nodes.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SRTCTL_ROOT="$SRT_REPO_DIR"
 fi
 echo "Creating srtslurm.yaml configuration..."
@@ -354,7 +387,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 # can't see. Stage the relevant subset to shared FS and repoint
 # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already
 # on shared FS) and .git (not needed in container) for speed.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}"
     mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1
     rsync -a --delete \
@@ -379,6 +412,7 @@ if [[ ! -f "$CONFIG_PATH" ]]; then
 fi
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH"
 
+
 if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
 else