From 48f40b4edeaeccb4dc31404dd9d46473c22116c8 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 22 Jun 2026 17:53:06 -0700 Subject: [PATCH 1/2] feat(test): TestBenchmark drives seiload (load suite) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the load suite: after provisioning the chain + RPC fleet, render the platform seiload profile (from the seiload-profiles ConfigMap) with the fleet's EVM endpoints, apply seiload's own Job manifest as a decoupled unit, wait for it to run the full load, and assert the chain stayed live under it. - seiload runs from its own manifest (embedded template, parameterized) — its Job spec is not constructed in Go. Profile is read from the platform CM, not vendored. - Pass/fail = Job completion + post-load chain liveness. A throughput/regression gate belongs in telemetry (a PromQL query over the run's metrics); the Job carries the metrics scrape label so that gate can be added later. - Per-run profile CM + seiload Job carry sei.io/harness-run for the GC sweep; t.Cleanup deletes them on normal exit. Co-Authored-By: Claude Opus 4.8 --- test/integration/benchmark_test.go | 44 +++--- test/integration/harness_test.go | 21 +++ test/integration/seiload_job.yaml.tmpl | 66 +++++++++ test/integration/seiload_test.go | 191 +++++++++++++++++++++++++ 4 files changed, 302 insertions(+), 20 deletions(-) create mode 100644 test/integration/seiload_job.yaml.tmpl create mode 100644 test/integration/seiload_test.go diff --git a/test/integration/benchmark_test.go b/test/integration/benchmark_test.go index 5bc2c20..282c25e 100644 --- a/test/integration/benchmark_test.go +++ b/test/integration/benchmark_test.go @@ -10,15 +10,20 @@ import ( "time" ) -// TestBenchmark provisions a validator chain + RPC fleet for the load suite. -// seiload drive + report upload are not yet wired (see TODO below). 
+// TestBenchmark provisions a validator chain + RPC fleet, drives seiload against +// the fleet for the configured duration, and asserts the chain stayed live under +// load. The load suite. // // Inputs (env, mirroring k8s_nightly.yml): // -// SEI_CHAIN_ID per-run chain id (e.g. bench-) [required] -// SEID_IMAGE seid image under test [required] -// SEI_RUN_ID unique run id (sei.io/harness-run) [default: SEI_CHAIN_ID] -// SEI_NAMESPACE shared nightly namespace [default: SDK default] +// SEI_CHAIN_ID per-run chain id (e.g. bench-) [required] +// SEID_IMAGE seid image under test [required] +// SEILOAD_IMAGE sei-load benchmark image [required] +// SEI_RUN_ID unique run id (sei.io/harness-run) [default: SEI_CHAIN_ID] +// SEI_NAMESPACE shared nightly namespace [default: SDK default] +// SEILOAD_PROFILE profile name in seiload-profiles [default: nightly_evm_transfer] +// DURATION_MINUTES seiload run length [default: 10] +// SEILOAD_COMMIT_ID sei-chain commit label for metrics [default: ""] // // Deadlines: the CronJob MUST run this with `-test.timeout 0` (or safely above // the scenario timeout). 
A -test.timeout breach panics and bypasses t.Cleanup, @@ -30,13 +35,17 @@ func TestBenchmark(t *testing.T) { chainID := mustEnv(t, "SEI_CHAIN_ID") s := spec{ - chainID: chainID, - runID: envOr("SEI_RUN_ID", chainID), - namespace: envOr("SEI_NAMESPACE", ""), - seidImage: mustEnv(t, "SEID_IMAGE"), - validators: 4, - rpcNodes: 2, // seiload fans across both via the EVM endpoint list - timeout: 90 * time.Minute, + chainID: chainID, + runID: envOr("SEI_RUN_ID", chainID), + namespace: envOr("SEI_NAMESPACE", ""), + seidImage: mustEnv(t, "SEID_IMAGE"), + validators: 4, + rpcNodes: 2, // seiload fans across both via the EVM endpoint list + timeout: 90 * time.Minute, + seiloadImage: mustEnv(t, "SEILOAD_IMAGE"), + seiloadProfile: envOr("SEILOAD_PROFILE", "nightly_evm_transfer"), + seiloadCommit: envOr("SEILOAD_COMMIT_ID", ""), + durationMin: envInt(t, "DURATION_MINUTES", 10), } ctx, cancel := context.WithTimeout(context.Background(), s.timeout) @@ -49,20 +58,15 @@ func TestBenchmark(t *testing.T) { defer stopSignals() c := openClient(ctx, t) + cs := clientset(t) ch, err := provision(ctx, t, c, s) cleanupChain(t, ch) if err != nil { t.Fatalf("provision: %v", err) } - t.Logf("provisioned %s: %d validators + %d RPC followers; EVM endpoints=%v", s.chainID, s.validators, len(ch.rpcNodes), ch.evmEndpoints()) - // TODO: drive seiload as a decoupled unit — apply its own manifest - // parameterized with ch.evmEndpoints(), stamped sei.io/harness-run; wait, - // read the report from S3, assert TPS/receipts. seiload's Job spec is not - // constructed here. 
- t.Skipf("provisioned %s (%d validators + %d followers); seiload drive + report not yet wired — tearing down", - s.chainID, s.validators, len(ch.rpcNodes)) + runSeiload(ctx, t, cs, ch, s) } diff --git a/test/integration/harness_test.go b/test/integration/harness_test.go index c4460c4..176a601 100644 --- a/test/integration/harness_test.go +++ b/test/integration/harness_test.go @@ -20,6 +20,7 @@ import ( "fmt" "net/http" "os" + "strconv" "testing" "time" @@ -46,6 +47,12 @@ type spec struct { validators int // genesis validator count (>= 1) rpcNodes int // standalone RPC followers; named -rpc-0..N-1 timeout time.Duration // overall scenario deadline (drives ctx, kept < CronJob activeDeadlineSeconds) + + // seiload inputs (load suite) + seiloadImage string // sei-load benchmark image + seiloadProfile string // profile name in the seiload-profiles ConfigMap + seiloadCommit string // sei-chain commit label for the run's metrics + durationMin int // seiload run length, minutes } // chain is the live provisioned topology a suite runs load against and asserts @@ -214,3 +221,17 @@ func mustEnv(t *testing.T, key string) string { } return v } + +// envInt reads an integer env var or a fallback; a non-integer value fails fast. 
+func envInt(t *testing.T, key string, fallback int) int { + t.Helper() + v := os.Getenv(key) + if v == "" { + return fallback + } + n, err := strconv.Atoi(v) + if err != nil { + t.Fatalf("integration suite: env %s=%q is not an integer: %v", key, v, err) + } + return n +} diff --git a/test/integration/seiload_job.yaml.tmpl b/test/integration/seiload_job.yaml.tmpl new file mode 100644 index 0000000..94a1ad3 --- /dev/null +++ b/test/integration/seiload_job.yaml.tmpl @@ -0,0 +1,66 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: seiload-{{.RunID}} + labels: + app.kubernetes.io/name: seiload + sei.io/harness-run: "{{.RunID}}" +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + # podMonitor selects this for Prometheus scrape (metrics continuity). + app.kubernetes.io/name: seiload + sei.io/harness-run: "{{.RunID}}" + spec: + restartPolicy: Never + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + seccompProfile: + type: RuntimeDefault + containers: + - name: seiload + image: {{.Image}} + args: + - --config + - /etc/seiload/profile.json + - --duration={{.DurationMinutes}}m + - --post-summary-flush-delay=45s + - --track-receipts=true + ports: + - name: metrics + containerPort: 9090 + protocol: TCP + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + env: + - name: SEILOAD_RUN_ID + value: "{{.RunID}}" + - name: SEILOAD_CHAIN_ID + value: "{{.ChainID}}" + - name: SEILOAD_COMMIT_ID + value: "{{.Commit}}" + - name: SEILOAD_WORKLOAD + value: nightly + volumeMounts: + - name: profile + mountPath: /etc/seiload + readOnly: true + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "4" + memory: "8Gi" + volumes: + - name: profile + configMap: + name: {{.ProfileCM}} diff --git a/test/integration/seiload_test.go b/test/integration/seiload_test.go new file mode 100644 index 0000000..f6d155a --- /dev/null +++ 
b/test/integration/seiload_test.go @@ -0,0 +1,191 @@ +//go:build integration + +package integration + +import ( + "bytes" + "context" + _ "embed" + "net/http" + "strconv" + "strings" + "testing" + "text/template" + "time" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/yaml" + + "github.com/sei-protocol/sei-k8s-controller/sdk/sei" +) + +//go:embed seiload_job.yaml.tmpl +var seiloadJobTmpl string + +// seiloadProfilesCM is the platform-owned ConfigMap holding the profile +// templates (placeholders __SEI_CHAIN_ID__ / __RPC_ENDPOINTS__). The harness +// reads it from the cluster rather than vendoring the profile, so the load +// shape stays owned by platform. +const seiloadProfilesCM = "seiload-profiles" + +// seiloadParams are the per-run values templated into the seiload Job manifest. +type seiloadParams struct { + RunID string + ChainID string + Commit string + Image string + DurationMinutes int + ProfileCM string +} + +// clientset builds a client-go clientset from the ambient config — the harness +// uses it for the Job/ConfigMap operations the SDK does not cover. +func clientset(t *testing.T) *kubernetes.Clientset { + t.Helper() + cfg, err := ctrl.GetConfig() + if err != nil { + t.Fatalf("load kubeconfig: %v", err) + } + cs, err := kubernetes.NewForConfig(cfg) + if err != nil { + t.Fatalf("build clientset: %v", err) + } + return cs +} + +// renderProfile reads the platform profile template from seiload-profiles and +// substitutes the per-run chain id + the fleet's EVM endpoints (JSON-quoted). 
+func renderProfile( + ctx context.Context, t *testing.T, cs *kubernetes.Clientset, + ns, profile, chainID string, endpoints []string, +) string { + t.Helper() + cm, err := cs.CoreV1().ConfigMaps(ns).Get(ctx, seiloadProfilesCM, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get %s/%s: %v", ns, seiloadProfilesCM, err) + } + tmpl, ok := cm.Data[profile+".json"] + if !ok { + t.Fatalf("profile %q.json absent from %s", profile, seiloadProfilesCM) + } + quoted := make([]string, len(endpoints)) + for i, e := range endpoints { + quoted[i] = strconv.Quote(e) + } + tmpl = strings.ReplaceAll(tmpl, "__SEI_CHAIN_ID__", chainID) + tmpl = strings.ReplaceAll(tmpl, "__RPC_ENDPOINTS__", strings.Join(quoted, ",")) + return tmpl +} + +// createProfileCM writes the rendered profile to a per-run ConfigMap stamped +// with the run label so the label-GC sweep (a pending platform deliverable) can reap it on an abnormal exit. +func createProfileCM(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, name, runID, profileJSON string) { + t.Helper() + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + Labels: map[string]string{runLabelKey: runID}, + }, + Data: map[string]string{"profile.json": profileJSON}, + } + if _, err := cs.CoreV1().ConfigMaps(ns).Create(ctx, cm, metav1.CreateOptions{}); err != nil { + t.Fatalf("create profile cm %q: %v", name, err) + } + t.Cleanup(func() { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + _ = cs.CoreV1().ConfigMaps(ns).Delete(ctx, name, metav1.DeleteOptions{}) + }) +} + +// renderJob templates the embedded seiload Job manifest with the per-run params. +// The manifest owns seiload's shape; only per-run values are injected. 
+func renderJob(t *testing.T, p seiloadParams) *batchv1.Job { + t.Helper() + var buf bytes.Buffer + if err := template.Must(template.New("job").Parse(seiloadJobTmpl)).Execute(&buf, p); err != nil { + t.Fatalf("render seiload job: %v", err) + } + var job batchv1.Job + if err := yaml.Unmarshal(buf.Bytes(), &job); err != nil { + t.Fatalf("unmarshal seiload job: %v", err) + } + return &job +} + +// runSeiload drives seiload against the fleet as a decoupled unit: render the +// platform profile, apply seiload's own Job manifest, wait for it to run the +// full load, then assert the chain stayed live under it. +// +// The pass/fail signal is Job completion (seiload ran the load to the end +// without erroring) plus post-load chain liveness. A throughput/regression gate +// belongs in telemetry — a PromQL query over the run's metrics — not in this +// harness; the Job carries the metrics scrape label so that gate can be added. +func runSeiload(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ch *chain, s spec) { + t.Helper() + ns := envOr("SEI_NAMESPACE", ch.network.Namespace()) + + profileCM := "seiload-profile-" + s.runID + profileJSON := renderProfile(ctx, t, cs, ns, s.seiloadProfile, s.chainID, ch.evmEndpoints()) + createProfileCM(ctx, t, cs, ns, profileCM, s.runID, profileJSON) + + job := renderJob(t, seiloadParams{ + RunID: s.runID, + ChainID: s.chainID, + Commit: s.seiloadCommit, + Image: s.seiloadImage, + DurationMinutes: s.durationMin, + ProfileCM: profileCM, + }) + job.Namespace = ns + if _, err := cs.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{}); err != nil { + t.Fatalf("create seiload job: %v", err) + } + t.Cleanup(func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + bg := metav1.DeletePropagationBackground + _ = cs.BatchV1().Jobs(ns).Delete(ctx, job.Name, metav1.DeleteOptions{PropagationPolicy: &bg}) + }) + + waitJob(ctx, t, cs, ns, job.Name) + + // Chain survived the load: node-0 
still caught up. + hc := &http.Client{Timeout: 10 * time.Second} + n0 := ch.rpcNodes[0] + if err := sei.WaitCaughtUp(ctx, hc, n0.TendermintRPC()); err != nil { + t.Errorf("post-load %s not caught up: %v", n0.Name(), err) + } +} + +// waitJob blocks until the seiload Job reaches a terminal condition. A Failed +// Job fails the suite; success returns. Bounded by ctx. +func waitJob(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, name string) { + t.Helper() + tick := time.NewTicker(10 * time.Second) + defer tick.Stop() + for { + job, err := cs.BatchV1().Jobs(ns).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get seiload job %q: %v", name, err) + } + for _, cond := range job.Status.Conditions { + if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue { + return + } + if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue { + t.Fatalf("seiload job %q failed: %s", name, cond.Message) + } + } + select { + case <-ctx.Done(): + t.Fatalf("seiload job %q did not finish before deadline: %v", name, ctx.Err()) + case <-tick.C: + } + } +} From 557897ee2319ffa2e3fdb9dea23ef54931f51b6d Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 22 Jun 2026 18:03:03 -0700 Subject: [PATCH 2/2] fix(test): address seiload-drive xreview (4 lenses) - systems: capture the failed seiload pod log into the fatal (the failure-time signal a Job condition message can't give); add a self-terminating Job activeDeadlineSeconds independent of the harness ctx. - sei-network: widen the post-load liveness check to every follower, not just node-0 (a half-dead fleet would otherwise ship green). - k8s/dissenter + comment-register: soften the runLabelKey comment to stop claiming a label-GC sweep that isn't shipped yet (pending platform deliverable; DeletionPolicy cascade + t.Cleanup cover normal exit); use the chain's resolved namespace directly for the seiload Job (no env re-resolve); fix the stale namespace field comment. 
- idiom: trim the runSeiload doc to present-state. chainId 713714 confirmed correct (seid GetEVMChainID falls through to DefaultChainID for bench-* chains). RBAC Role + podMonitor selector flip + gc label sweep are platform prereqs, tracked separately. Co-Authored-By: Claude Opus 4.8 --- test/integration/harness_test.go | 12 +++--- test/integration/seiload_job.yaml.tmpl | 1 + test/integration/seiload_test.go | 52 +++++++++++++++++++------- 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/test/integration/harness_test.go b/test/integration/harness_test.go index 176a601..0b5df0a 100644 --- a/test/integration/harness_test.go +++ b/test/integration/harness_test.go @@ -31,10 +31,12 @@ import ( _ "github.com/sei-protocol/sei-k8s-controller/sdk/sei/provider/k8s" ) -// runLabelKey marks a run's resources for the nightly label-GC sweep — the only -// reaper on abnormal exit (shared namespace), since t.Cleanup is skipped on -// SIGKILL or a -test.timeout breach. provision stamps it on the network + every -// node; a suite's directly-applied seiload Job and fault CRs must stamp it too. +// runLabelKey marks a run's resources for the abnormal-exit reaper (t.Cleanup is +// skipped on SIGKILL / a -test.timeout breach). provision stamps it on the +// network + every node; a suite's directly-applied seiload Job + fault CRs stamp +// it too. The matching nightly label-GC sweep is a pending platform deliverable; +// until it ships, normal-exit teardown (t.Cleanup) + the SeiNetwork +// DeletionPolicy cascade are the cleanup path. 
const runLabelKey = "sei.io/harness-run" // spec is the typed input shared by the suites — the local-Go-state replacement @@ -42,7 +44,7 @@ const runLabelKey = "sei.io/harness-run" type spec struct { chainID string // SeiNetwork name == genesis chain id; also the peer-selector value and per-run discriminator runID string // unique per run; the sei.io/harness-run label value - namespace string // shared nightly namespace (D2); "" => SDK client default (SA namespace) + namespace string // shared nightly namespace; "" => the SDK client's resolved default seidImage string // seid container image under test validators int // genesis validator count (>= 1) rpcNodes int // standalone RPC followers; named -rpc-0..N-1 diff --git a/test/integration/seiload_job.yaml.tmpl b/test/integration/seiload_job.yaml.tmpl index 94a1ad3..b1927bb 100644 --- a/test/integration/seiload_job.yaml.tmpl +++ b/test/integration/seiload_job.yaml.tmpl @@ -7,6 +7,7 @@ metadata: sei.io/harness-run: "{{.RunID}}" spec: backoffLimit: 0 + activeDeadlineSeconds: {{.DeadlineSeconds}} ttlSecondsAfterFinished: 86400 template: metadata: diff --git a/test/integration/seiload_test.go b/test/integration/seiload_test.go index f6d155a..b4270ce 100644 --- a/test/integration/seiload_test.go +++ b/test/integration/seiload_test.go @@ -6,6 +6,7 @@ import ( "bytes" "context" _ "embed" + "fmt" "net/http" "strconv" "strings" @@ -40,6 +41,7 @@ type seiloadParams struct { Image string DurationMinutes int ProfileCM string + DeadlineSeconds int } // clientset builds a client-go clientset from the ambient config — the harness @@ -118,17 +120,16 @@ func renderJob(t *testing.T, p seiloadParams) *batchv1.Job { return &job } -// runSeiload drives seiload against the fleet as a decoupled unit: render the -// platform profile, apply seiload's own Job manifest, wait for it to run the -// full load, then assert the chain stayed live under it. 
-// -// The pass/fail signal is Job completion (seiload ran the load to the end -// without erroring) plus post-load chain liveness. A throughput/regression gate -// belongs in telemetry — a PromQL query over the run's metrics — not in this -// harness; the Job carries the metrics scrape label so that gate can be added. +// runSeiload renders the platform profile, applies seiload's Job manifest, waits +// for the Job to complete, and asserts every follower is still caught up. +// Pass/fail is Job completion plus post-load liveness; throughput gating is a +// PromQL query over the run's metrics (the Job carries a metrics scrape label, +// pending a podMonitor that selects it). func runSeiload(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ch *chain, s spec) { t.Helper() - ns := envOr("SEI_NAMESPACE", ch.network.Namespace()) + // The seiload Job co-locates with the chain; the network's resolved + // namespace is authoritative (never re-resolve from env here). + ns := ch.network.Namespace() profileCM := "seiload-profile-" + s.runID profileJSON := renderProfile(ctx, t, cs, ns, s.seiloadProfile, s.chainID, ch.evmEndpoints()) @@ -141,6 +142,9 @@ func runSeiload(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ch Image: s.seiloadImage, DurationMinutes: s.durationMin, ProfileCM: profileCM, + // Self-terminating cap independent of the harness ctx: the load plus + // generous slack for image pull + the post-summary flush. + DeadlineSeconds: (s.durationMin + 15) * 60, }) job.Namespace = ns if _, err := cs.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{}); err != nil { @@ -155,11 +159,13 @@ func runSeiload(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ch waitJob(ctx, t, cs, ns, job.Name) - // Chain survived the load: node-0 still caught up. + // Chain survived the load: every follower still caught up (a follower can't + // catch up to a halted chain, so this transitively covers validator quorum). 
hc := &http.Client{Timeout: 10 * time.Second} - n0 := ch.rpcNodes[0] - if err := sei.WaitCaughtUp(ctx, hc, n0.TendermintRPC()); err != nil { - t.Errorf("post-load %s not caught up: %v", n0.Name(), err) + for _, n := range ch.rpcNodes { + if err := sei.WaitCaughtUp(ctx, hc, n.TendermintRPC()); err != nil { + t.Errorf("post-load %s not caught up: %v", n.Name(), err) + } } } @@ -179,7 +185,8 @@ func waitJob(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, na return } if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue { - t.Fatalf("seiload job %q failed: %s", name, cond.Message) + t.Fatalf("seiload job %q failed: %s\n--- seiload pod log (tail) ---\n%s", + name, cond.Message, podLogTail(ctx, cs, ns, name)) } } select { @@ -189,3 +196,20 @@ func waitJob(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, na } } } + +// podLogTail returns the tail of the seiload pod's log for a Job, best-effort — +// the failure-time signal a Job condition message alone cannot give. +func podLogTail(ctx context.Context, cs *kubernetes.Clientset, ns, jobName string) string { + pods, err := cs.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{ + LabelSelector: "batch.kubernetes.io/job-name=" + jobName, + }) + if err != nil || len(pods.Items) == 0 { + return fmt.Sprintf("(no pod for job %q: %v)", jobName, err) + } + lines := int64(50) + raw, err := cs.CoreV1().Pods(ns).GetLogs(pods.Items[0].Name, &corev1.PodLogOptions{TailLines: &lines}).DoRaw(ctx) + if err != nil { + return fmt.Sprintf("(read logs failed: %v)", err) + } + return string(raw) +}