Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2930,3 +2930,39 @@ minimaxm3-fp8-mi325x-vllm:
- { tp: 8, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-mi325x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style
# search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency
# end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP
# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0,
# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD
# MiniMax-M3 model, so the recipe applies that fix in-place at runtime
# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on
# MI355X/MI300X) before serving.
minimaxm3-fp8-mi325x-vllm-mtp:
image: vllm/vllm-openai-rocm:minimax-m3
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi325x
precision: fp8
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
# isl/osl 1024/1024. TP-only rows (tp 4, tp 8) start at conc 1; the EP rows
# cover conc 128-256 and the dp-attn row covers conc 256-512.
# NOTE(review): conc-start/conc-end are presumably the bounds of the
# concurrency sweep fed to the recipe as CONC - confirm against the runner.
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
# isl/osl 8192/1024. No tp 4 + ep 4 row here; the tp 4 row stops at conc 32
# and both tp 8 + ep 8 rows (with and without dp-attn) are pinned to conc 256.
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
214 changes: 214 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env bash

# MiniMax-M3 MXFP8 MI325X (gfx942) single-node vLLM recipe with EAGLE3
# speculative decoding — the spec-decoding=mtp variant of
# minimaxm3_fp8_mi325x.sh. Adds the Inferact/MiniMax-M3-EAGLE3 draft head via
# --speculative-config with 3 speculative tokens. Everything else mirrors the
# non-MTP MI325X recipe: mandatory --block-size 128, --language-model-only for
# the text-only benchmark, --attention-backend TRITON_ATTN, and
# --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager);
# VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path.
# The default BF16 KV cache is retained (unlike the MI355X recipe's FP8 KV
# cache): gfx942 has no calibrated q/prob scales for ROCm FP8 attention and
# vLLM's fallback scale of 1.0 corrupts accuracy.
#
# Unlike the CUDA recipes, the drafter needs no attention_backend override:
# the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
# FLASH_ATTN for the EAGLE3 MHA head on Blackwell is FlashInfer/CUDA-specific.
# Here the whole server runs on TRITON_ATTN (set globally below), which serves
# the MHA draft fine.
#
# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image
# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3
# engine init fails with "Model does not support EAGLE3 interface but
# aux_hidden_state_outputs was requested". This recipe applies that fix
# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as
# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we
# can validate EAGLE3 on real MI325X hardware ahead of an image rebuild. The
# same patch is validated green on MI355X. It is idempotent and fails the job
# loudly if the installed amd/model.py has drifted from the expected base.

# Pull in the shared benchmark helpers, resolved relative to this script.
# NOTE(review): check_env_vars, setup_eval_context and the other helper
# commands used by this recipe are presumably defined there - confirm.
source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  EP_SIZE \
  DP_ATTENTION \
  CONC \
  ISL \
  OSL \
  MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# MODEL is a bare HF id on the mi325x single-node runner (a fast cache hit when
# pre-staged). A value starting with "/" is treated as a local path and is not
# downloaded.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

# The EAGLE3 draft is not staged, so fetch it into the HF cache no matter how
# MODEL was supplied; previously this was skipped whenever MODEL was a local
# path, leaving the download to happen during server start-up.
hf download "$DRAFT_MODEL"

if [[ -n "${ROCR_VISIBLE_DEVICES:-}" ]]; then
  export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

SERVER_LOG=/workspace/server.log
# PORT is passed to the server, the readiness wait, the benchmark client and
# the eval run, but is not among the variables validated above. Keep the
# caller's value when one is set and fall back to a fixed port otherwise so
# those commands never receive an empty --port.
PORT="${PORT:-8888}"
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_USE_BREAKABLE_CUDAGRAPH=0

if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  setup_eval_context
fi

# Turn the TP / EP_SIZE / DP_ATTENTION knobs into vLLM parallelism flags.
case "${DP_ATTENTION}" in
  true)
    # DP-attention layout: tensor-parallel size 1, data-parallel size TP,
    # expert parallelism always on.
    PARALLEL_ARGS=(
      --tensor-parallel-size 1
      --data-parallel-size "$TP"
      --enable-expert-parallel
    )
    ;;
  *)
    # Plain tensor parallelism; expert parallelism only when EP_SIZE > 1.
    PARALLEL_ARGS=(--tensor-parallel-size "$TP")
    if [ "$EP_SIZE" -gt 1 ]; then
      PARALLEL_ARGS+=(--enable-expert-parallel)
    fi
    ;;
esac

# A single speculative-token count (3) is used for every config for now.
NUM_SPEC_TOKENS=3

# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546).
# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model +
# aux-hidden-state emission, and SupportsEagle3 to the two outer classes.
# Idempotent; hard-fails if the installed file has drifted from the expected
# base (so we never silently run unpatched and mislabel the result).
#
# How the inline Python below works (the heredoc delimiter is quoted, so the
# shell performs no expansion inside it):
#   1. Locate the installed vllm package with importlib.util.find_spec and
#      read models/minimax_m3/amd/model.py under its first search location.
#   2. If the file already contains the patched MiniMaxM3Model class line,
#      print a notice and exit 0 without touching it.
#   3. Otherwise apply six literal search/replace edits in order. Each anchor
#      must occur exactly once in the current text; any other count aborts
#      via sys.exit(<message>), i.e. a non-zero status, before anything is
#      written.
#   4. ast.parse() the edited text as a syntax check, then overwrite the file.
# A non-zero exit from python3 triggers the `||` branch, which reports the
# failure on stderr and exits the recipe with status 1.
# NOTE(review): anchors are matched as exact substrings, so the whitespace
# inside the quoted anchor strings must match the installed model.py
# byte-for-byte - verify against the image before changing any of them.
python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; }
import ast, importlib.util, pathlib, sys

spec = importlib.util.find_spec("vllm")
root = pathlib.Path(spec.submodule_search_locations[0])
target = root / "models" / "minimax_m3" / "amd" / "model.py"
src = target.read_text()

if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src:
print(f"[eagle3-patch] already applied: {target}")
sys.exit(0)

edits = [
(
"from vllm.model_executor.models.interfaces import (\n"
" MultiModalEmbeddings,\n"
" SupportsMultiModal,\n"
")",
"from vllm.model_executor.models.interfaces import (\n"
" EagleModelMixin,\n"
" MultiModalEmbeddings,\n"
" SupportsEagle3,\n"
" SupportsMultiModal,\n"
")",
),
(
"class MiniMaxM3Model(nn.Module):",
"class MiniMaxM3Model(nn.Module, EagleModelMixin):",
),
(
" inputs_embeds: torch.Tensor | None = None,\n"
" ) -> torch.Tensor:\n"
" if inputs_embeds is not None:",
" inputs_embeds: torch.Tensor | None = None,\n"
" ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n"
" if inputs_embeds is not None:",
),
(
" residual = None\n\n"
" for layer in self.layers[self.start_layer : self.end_layer]:\n"
" hidden_states, residual = layer(positions, hidden_states, residual)\n\n"
" hidden_states, _ = self.norm(hidden_states, residual)\n"
" return hidden_states",
" residual = None\n\n"
" # EAGLE3 is not yet compatible with pipeline parallel\n"
" aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n"
" for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n"
" hidden_states, residual = layer(positions, hidden_states, residual)\n"
" self._maybe_add_hidden_state(\n"
" aux_hidden_states, idx + 1, hidden_states, residual\n"
" )\n\n"
" hidden_states, _ = self.norm(hidden_states, residual)\n\n"
" if len(aux_hidden_states) > 0:\n"
" return hidden_states, aux_hidden_states\n"
" return hidden_states",
),
(
"class MiniMaxM3SparseForCausalLM(nn.Module):",
"class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):",
),
(
"class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):",
"class MiniMaxM3SparseForConditionalGeneration(\n"
" nn.Module, SupportsMultiModal, SupportsEagle3\n"
"):",
),
]

for old, new in edits:
count = src.count(old)
if count != 1:
sys.exit(
f"[eagle3-patch] anchor matched {count} times (expected 1); "
f"installed {target} has drifted from the expected base — aborting"
)
src = src.replace(old, new)

ast.parse(src)
target.write_text(src)
print(f"[eagle3-patch] applied EAGLE3 support to {target}")
PYEOF

start_gpu_monitor

# Assemble the server command line before xtrace is switched on, so the trace
# shows only the final expanded `vllm serve` invocation.
# spec_config is the JSON handed to --speculative-config: EAGLE3 method, the
# draft model id and the number of speculative tokens.
printf -v spec_config \
  '{"method": "eagle3", "model": "%s", "num_speculative_tokens": %s}' \
  "$DRAFT_MODEL" "$NUM_SPEC_TOKENS"

serve_args=(
  --port "$PORT"
  "${PARALLEL_ARGS[@]}"
  --block-size 128
  --no-enable-prefix-caching
  --language-model-only
  --max-model-len "$MAX_MODEL_LEN"
  --attention-backend TRITON_ATTN
  --speculative-config "$spec_config"
  --tool-call-parser minimax_m3
  --reasoning-parser minimax_m3
  --enable-auto-tool-choice
)

set -x
# Launch the server in the background; stdout and stderr both go to SERVER_LOG.
vllm serve "$MODEL" "${serve_args[@]}" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

# Raw random tokens hurt speculative-decoding acceptance, so prompts go through
# the chat template, as in the other MTP recipes. Ten prompts are issued per
# unit of concurrency.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code \
  --use-chat-template

# Optional accuracy pass against the same running server.
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3790,3 +3790,13 @@
- "Run the MiniMax-M3 MXFP8 MI355X EAGLE3 MTP recipe with CUDA graphs instead of --enforce-eager"
- "Drop --enforce-eager and set VLLM_USE_BREAKABLE_CUDAGRAPH=0, which avoids the M3-decode breakable-cudagraph path that previously forced eager execution (the non-MTP MI355X recipe already got this in #1754)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1755

- config-keys:
- minimaxm3-fp8-mi325x-vllm-mtp
description:
- "Initial submission: MiniMax-M3 MXFP8 MI325X (gfx942) vLLM benchmark with EAGLE3 speculative decoding (target: MiniMaxAI/MiniMax-M3-MXFP8, draft: Inferact/MiniMax-M3-EAGLE3, 3 speculative tokens) — spec-decoding=mtp variant of the MI325X day-zero recipe (#1748)"
- "Image: vllm/vllm-openai-rocm:minimax-m3 (same day-zero ROCm build as the non-MTP entry)"
- "Serve shape follows minimaxm3-fp8-mi325x-vllm: --block-size 128, --no-enable-prefix-caching, --language-model-only, --attention-backend TRITON_ATTN, minimax_m3 parsers, default BF16 KV cache (gfx942 lacks calibrated ROCm FP8 attention scales). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0); prompts via chat template for realistic acceptance"
- "H200-style search space (TP4/TP8 latency, TP4+EP4/TP8+EP8 TEP, TP8+EP8 dp-attn DEP) trimmed at the extreme-concurrency end with TP-only latency rows started at conc 1"
- "[AI generated draft test] The shipped ROCm image's AMD MiniMax-M3 model lacks SupportsEagle3, so the recipe patches it in-place at runtime (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on MI355X/MI300X) before serving; also adds SPEC_SUFFIX to launch_mi325x-amds.sh so spec-decoding=mtp routes to the _mtp script"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1759
6 changes: 5 additions & 1 deletion runners/launch_mi325x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ PARTITION="compute"
SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with
# the h200 launchers, which have carried SPEC_SUFFIX since #392). Any other
# or unset SPEC_DECODING value leaves the suffix empty, selecting the base
# script. Written as a plain conditional rather than `test && a || b` inside
# a command substitution: same result, no subshell, no and-or fall-through.
SPEC_SUFFIX=""
if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then
  SPEC_SUFFIX="_mtp"
fi

set -x

JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
Expand Down Expand Up @@ -42,6 +46,6 @@ srun --jobid="$JOB_ID" \
--container-remap-root \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL \
bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh
bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x${SPEC_SUFFIX}.sh

scancel $JOB_ID
Loading