diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f6c9735ab..ade3023c9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12576,6 +12576,193 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: true +# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). +# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint +# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX +# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build +# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set +# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND +# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image +# shipped without NIXL, so disagg workers crashed at NixlConnector init). +# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER +# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd +# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image +# from m3_release before running. Fully disaggregated, rack-scale wide-EP +# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors +# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel +# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers +# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge. +# 1P1D TP4->TEP8 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), +# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. +minimaxm3-fp8-gb200-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: gb200 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. 
+ # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting + # per-step latency ~19% vs pure TP8. Matches B200 TEP8 topology. + - conc-list: [1, 2, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + + # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled: 2P+1D, 8 nodes. + - conc-list: [2048, 4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput: 4P+1D, 12 nodes. + - conc-list: [4096, 8192] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. 
+ - conc-list: [1, 2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + + # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled 8k1k: 2P+1D, 8 nodes. + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput 8k1k: 4P+1D, 12 nodes. + - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 81727ef39..85b399e6c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -123,6 +123,11 @@ on: env: RANDOM_RANGE_RATIO: 0.8 + # Day-zero models resolved via hf: ids download from the Hub inside the + # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests + # get 429-rate-limited when several workers pull a 444 GB snapshot at + # once; sbatch/srun inherit this env so the token reaches the workers. + HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }} EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml new file mode 100644 index 000000000..efc5d5740 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -0,0 +1,118 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP). +# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> +# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel +# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of +# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so +# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml new file mode 100644 index 000000000..5ca08a06d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -0,0 +1,117 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for 
GB200 (wide-decode curve). +# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn +# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128) +# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER +# attention, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml new file mode 100644 index 000000000..f3e79340a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve). 
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure +# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where +# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, +# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml new file mode 100644 index 000000000..147803c78 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, decode +# TEP4). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP4 +# (TP4+EP4, 1 node) = 2 nodes. 
Expert parallelism on decode splits 128 +# MoE experts across 4 ranks (32 each), reducing per-step MoE compute. +# FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + 
block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml new file mode 100644 index 000000000..199699212 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider +# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 +# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism +# reduces per-step latency by spreading both attention and MoE across +# 8 GPU over NVL72 NVLink. FLASHINFER, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: 
"prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml new file mode 100644 index 000000000..1d1591198 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml @@ -0,0 +1,104 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k" + +# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (low-latency, more +# decode workers). Prefill TP4 (1 node) -> NixlConnector -> 2x Decode +# TP4 (2 nodes) = 3 nodes. Two decode workers halve the per-worker +# batch, reducing ITL at low concurrency. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + 
NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml new file mode 100644 index 000000000..853095727 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -0,0 +1,117 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k" + +# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled). +# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16 +# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt +# ingest into a single wide decode at high concurrency. FLASHINFER +# attention, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048x4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml new file mode 100644 index 000000000..4a6aa5d0f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -0,0 +1,117 @@ +name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k" + +# MiniMax-M3 disaggregated 4P+1D recipe for 
GB200 (max throughput). +# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector -> +# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max +# prefill fan-in for the highest-concurrency points. FLASHINFER attention, +# block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml new file mode 100644 index 000000000..f6f2c7874 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k). 
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> +# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel +# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards +# 16 experts/rank. FLASHINFER attention, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": 
"NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml new file mode 100644 index 000000000..0d7d44843 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k). +# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 +# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 +# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has +# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 
+ block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml new file mode 100644 index 000000000..b0602354c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k). +# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure +# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where +# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, +# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). +# Low-conc tuned: stream-interval 1, cudagraph cap 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + 
no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml new file mode 100644 index 000000000..453df782b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k, +# wider decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 +# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism +# reduces per-step latency by spreading both attention and MoE across +# 8 GPU over NVL72 NVLink. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: 
"cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml new file mode 100644 index 000000000..6a0765c60 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -0,0 +1,110 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k). +# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16 +# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute; +# rack-scale DEP16 decode across NVL72. 
FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + 
max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml new file mode 100644 index 000000000..9e4ff3c2b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -0,0 +1,110 @@ +name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k). +# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16 +# (4 nodes) = 12 nodes within one NVL72 rack. Maximises prefill +# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 
+ block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 43f06f4f1..43f4e074e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3665,6 +3665,19 @@ - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724 +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo" + - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)" + - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" + - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks" + - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" + - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" + - "5 topologies, 1k1k + 8k1k: 1P1D TEP8 decode (3n, low-lat conc 1-64), 1P1D DEP8 (4n, conc 128-512), 1P1D DEP8->DEP16 (6n, conc 512-2048), 2P1D (8n, conc 2048-4096), 4P1D (12n, conc 4096-8192)" + - "TEP8 decode (enable-expert-parallel on 
TP8): 128 experts / 8 ranks = 16 experts/rank, ~19% lower ITL than pure TP8 at low conc; stream-interval 1 + max-cudagraph-capture-size 128 for interactivity" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 + - config-keys: - minimaxm3-fp8-b200-vllm description: @@ -3850,6 +3863,16 @@ - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Fix NixlConnector handshake failure for hetero-TP disagg when num_kv_heads < decode TP (M3 TEP8: TP4 prefill → TP8 decode)" + - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)." + - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8" + - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)" + - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 + - config-keys: - minimaxm3-fp8-b200-vllm description: diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 36c8af203..9c3430289 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -60,8 +60,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" + elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8" + export 
SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8" exit 1 fi else @@ -81,7 +84,7 @@ NGINX_IMAGE="nginx:1.27.4" # squash dir on a path that's also visible to compute nodes. Falls # back to the legacy sa-shared path so other configs are untouched. SQUASH_DIR="/mnt/lustre01/users-public/sa-shared" -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then echo "=== cluster diagnostic (minimax sweep) ===" echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)" echo "HOME=$HOME" @@ -128,8 +131,32 @@ fi SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -enroot import -o $SQUASH_FILE docker://$IMAGE -enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Concurrent matrix jobs (three gb200-nv runners) all import to the same +# shared-FS squash path. An unsynchronized `enroot import -o` onto an +# existing file APPENDS to it (mksquashfs default), corrupting the image +# while other jobs' pyxis extractions are reading it — observed on the +# minimaxm3 day-zero sweep (R1: an eval job appended to the live squash +# mid-run). Serialize with a lock, skip when the existing file is valid, +# and build to a temp path + atomic mv so readers never see a half-written +# file. Mirrors the import_squash pattern in launch_gb300-nv.sh. 
+import_squash() { + local squash="$1" image="$2" + local lock="${squash}.lock" + ( + exec 9>"$lock" + flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; } + if unsquashfs -l "$squash" > /dev/null 2>&1; then + echo "Squash file already exists and is valid, skipping import: $squash" + else + rm -f "$squash" "$squash".tmp.* + enroot import -o "${squash}.tmp.$$" "docker://$image" + mv -f "${squash}.tmp.$$" "$squash" + fi + ) || exit 1 +} + +import_squash "$SQUASH_FILE" "$IMAGE" +import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -202,7 +229,7 @@ SRT_REPO_DIR="srt-slurm" # cross-mounted to compute nodes. Put the srt-slurm workspace and staged # InferenceX checkout on a writable shared-FS path that compute can see. # Per-run-unique paths avoid races between parallel sweep jobs. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SHARED_BASE="" for cand in \ /mnt/lustre01/users-public/sa-shared/gha-runs \ @@ -269,6 +296,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 exit 1 fi +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 + cd "$SRT_REPO_DIR" || exit 1 + git checkout main || exit 1 + mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -292,7 +325,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. 
Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. -if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then +if [[ ( $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ) && -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -312,7 +345,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -354,7 +387,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \ @@ -379,6 +412,7 @@ if [[ ! -f "$CONFIG_PATH" ]]; then fi sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" + if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else