Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4635,13 +4635,13 @@ minimaxm3-fp8-h100-vllm:
# ~52 GB expert shard, and KV-cache init fails at high conc (sweep
# 27441767143, conc 256/512: "No available memory for the cache
# blocks"). TEP8 covers the high-concurrency regime instead.
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256 }

dsr1-fp8-h100-dynamo-sglang:
image: lmsysorg/sglang:v0.5.8-cu130
Expand Down Expand Up @@ -4888,17 +4888,18 @@ minimaxm3-fp8-h200-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 128, conc-end: 256 }
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 512 }
- { tp: 4, conc-start: 1, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 32 }
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
- { tp: 4, conc-start: 1, conc-end: 32 }
- { tp: 4, ep: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

dsr1-fp4-gb200-dynamo-trt:
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3807,3 +3807,10 @@
description:
- "Extend the MiniMax-M3 MXFP8 MI300X and MI325X non-MTP sweeps down to concurrency 1 on the TP-only latency rows (was conc 4), to capture the single-request latency point; TEP/DEP rows keep their higher concurrency starts"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1760

- config-keys:
- minimaxm3-fp8-h100-vllm
- minimaxm3-fp8-h200-vllm
description:
- "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 1k1k TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows; the 8k1k TP8+EP8 upper bound was already 256). DEP rows unchanged"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761
Loading