diff --git a/test/integration/benchmark_test.go b/test/integration/benchmark_test.go index 5bc2c20..282c25e 100644 --- a/test/integration/benchmark_test.go +++ b/test/integration/benchmark_test.go @@ -10,15 +10,20 @@ import ( "time" ) -// TestBenchmark provisions a validator chain + RPC fleet for the load suite. -// seiload drive + report upload are not yet wired (see TODO below). +// TestBenchmark provisions a validator chain + RPC fleet, drives seiload against +// the fleet for the configured duration, and asserts the chain stayed live under +// load. This is the load suite. // // Inputs (env, mirroring k8s_nightly.yml): // -// SEI_CHAIN_ID per-run chain id (e.g. bench-) [required] -// SEID_IMAGE seid image under test [required] -// SEI_RUN_ID unique run id (sei.io/harness-run) [default: SEI_CHAIN_ID] -// SEI_NAMESPACE shared nightly namespace [default: SDK default] +// SEI_CHAIN_ID per-run chain id (e.g. bench-) [required] +// SEID_IMAGE seid image under test [required] +// SEILOAD_IMAGE sei-load benchmark image [required] +// SEI_RUN_ID unique run id (sei.io/harness-run) [default: SEI_CHAIN_ID] +// SEI_NAMESPACE shared nightly namespace [default: SDK default] +// SEILOAD_PROFILE profile name in seiload-profiles [default: nightly_evm_transfer] +// DURATION_MINUTES seiload run length [default: 10] +// SEILOAD_COMMIT_ID sei-chain commit label for metrics [default: ""] // // Deadlines: the CronJob MUST run this with `-test.timeout 0` (or safely above // the scenario timeout). 
A -test.timeout breach panics and bypasses t.Cleanup, @@ -30,13 +35,17 @@ func TestBenchmark(t *testing.T) { chainID := mustEnv(t, "SEI_CHAIN_ID") s := spec{ - chainID: chainID, - runID: envOr("SEI_RUN_ID", chainID), - namespace: envOr("SEI_NAMESPACE", ""), - seidImage: mustEnv(t, "SEID_IMAGE"), - validators: 4, - rpcNodes: 2, // seiload fans across both via the EVM endpoint list - timeout: 90 * time.Minute, + chainID: chainID, + runID: envOr("SEI_RUN_ID", chainID), + namespace: envOr("SEI_NAMESPACE", ""), + seidImage: mustEnv(t, "SEID_IMAGE"), + validators: 4, + rpcNodes: 2, // seiload fans across both via the EVM endpoint list + timeout: 90 * time.Minute, + seiloadImage: mustEnv(t, "SEILOAD_IMAGE"), + seiloadProfile: envOr("SEILOAD_PROFILE", "nightly_evm_transfer"), + seiloadCommit: envOr("SEILOAD_COMMIT_ID", ""), + durationMin: envInt(t, "DURATION_MINUTES", 10), } ctx, cancel := context.WithTimeout(context.Background(), s.timeout) @@ -49,20 +58,15 @@ func TestBenchmark(t *testing.T) { defer stopSignals() c := openClient(ctx, t) + cs := clientset(t) ch, err := provision(ctx, t, c, s) cleanupChain(t, ch) if err != nil { t.Fatalf("provision: %v", err) } - t.Logf("provisioned %s: %d validators + %d RPC followers; EVM endpoints=%v", s.chainID, s.validators, len(ch.rpcNodes), ch.evmEndpoints()) - // TODO: drive seiload as a decoupled unit — apply its own manifest - // parameterized with ch.evmEndpoints(), stamped sei.io/harness-run; wait, - // read the report from S3, assert TPS/receipts. seiload's Job spec is not - // constructed here. 
- t.Skipf("provisioned %s (%d validators + %d followers); seiload drive + report not yet wired — tearing down", - s.chainID, s.validators, len(ch.rpcNodes)) + runSeiload(ctx, t, cs, ch, s) } diff --git a/test/integration/harness_test.go b/test/integration/harness_test.go index c4460c4..0b5df0a 100644 --- a/test/integration/harness_test.go +++ b/test/integration/harness_test.go @@ -20,6 +20,7 @@ import ( "fmt" "net/http" "os" + "strconv" "testing" "time" @@ -30,10 +31,12 @@ import ( _ "github.com/sei-protocol/sei-k8s-controller/sdk/sei/provider/k8s" ) -// runLabelKey marks a run's resources for the nightly label-GC sweep — the only -// reaper on abnormal exit (shared namespace), since t.Cleanup is skipped on -// SIGKILL or a -test.timeout breach. provision stamps it on the network + every -// node; a suite's directly-applied seiload Job and fault CRs must stamp it too. +// runLabelKey marks a run's resources for the abnormal-exit reaper (t.Cleanup is +// skipped on SIGKILL / a -test.timeout breach). provision stamps it on the +// network + every node; a suite's directly-applied seiload Job + fault CRs stamp +// it too. The matching nightly label-GC sweep is a pending platform deliverable; +// until it ships, normal-exit teardown (t.Cleanup) + the SeiNetwork +// DeletionPolicy cascade are the cleanup path. 
const runLabelKey = "sei.io/harness-run" // spec is the typed input shared by the suites — the local-Go-state replacement @@ -41,11 +44,17 @@ const runLabelKey = "sei.io/harness-run" type spec struct { chainID string // SeiNetwork name == genesis chain id; also the peer-selector value and per-run discriminator runID string // unique per run; the sei.io/harness-run label value - namespace string // shared nightly namespace (D2); "" => SDK client default (SA namespace) + namespace string // shared nightly namespace; "" => the SDK client's resolved default seidImage string // seid container image under test validators int // genesis validator count (>= 1) rpcNodes int // standalone RPC followers; named -rpc-0..N-1 timeout time.Duration // overall scenario deadline (drives ctx, kept < CronJob activeDeadlineSeconds) + + // seiload inputs (load suite) + seiloadImage string // sei-load benchmark image + seiloadProfile string // profile name in the seiload-profiles ConfigMap + seiloadCommit string // sei-chain commit label for the run's metrics + durationMin int // seiload run length, minutes } // chain is the live provisioned topology a suite runs load against and asserts @@ -214,3 +223,17 @@ func mustEnv(t *testing.T, key string) string { } return v } + +// envInt reads an integer env var or a fallback; a non-integer value fails fast. 
+func envInt(t *testing.T, key string, fallback int) int {
+	t.Helper()
+	raw := os.Getenv(key)
+	// Getenv cannot tell "unset" from "set to empty"; both select the fallback.
+	if len(raw) == 0 {
+		return fallback
+	}
+	parsed, parseErr := strconv.Atoi(raw)
+	if parseErr != nil {
+		// Fatalf stops the test here, so a malformed value never reaches a suite.
+		t.Fatalf("integration suite: env %s=%q is not an integer: %v", key, raw, parseErr)
+	}
+	return parsed
+}
diff --git a/test/integration/seiload_job.yaml.tmpl b/test/integration/seiload_job.yaml.tmpl new file mode 100644 index 0000000..b1927bb --- /dev/null +++ b/test/integration/seiload_job.yaml.tmpl @@ -0,0 +1,67 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: seiload-{{.RunID}} + labels: + app.kubernetes.io/name: seiload + sei.io/harness-run: "{{.RunID}}" +spec: + backoffLimit: 0 + activeDeadlineSeconds: {{.DeadlineSeconds}} + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + # podMonitor selects this for Prometheus scrape (metrics continuity). + app.kubernetes.io/name: seiload + sei.io/harness-run: "{{.RunID}}" + spec: + restartPolicy: Never + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + seccompProfile: + type: RuntimeDefault + containers: + - name: seiload + image: {{.Image}} + args: + - --config + - /etc/seiload/profile.json + - --duration={{.DurationMinutes}}m + - --post-summary-flush-delay=45s + - --track-receipts=true + ports: + - name: metrics + containerPort: 9090 + protocol: TCP + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + readOnlyRootFilesystem: true + env: + - name: SEILOAD_RUN_ID + value: "{{.RunID}}" + - name: SEILOAD_CHAIN_ID + value: "{{.ChainID}}" + - name: SEILOAD_COMMIT_ID + value: "{{.Commit}}" + - name: SEILOAD_WORKLOAD + value: nightly + volumeMounts: + - name: profile + mountPath: /etc/seiload + readOnly: true + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "4" + memory: "8Gi" + volumes: + - name: profile + configMap: + name: {{.ProfileCM}} diff --git a/test/integration/seiload_test.go b/test/integration/seiload_test.go new file mode 100644 index
0000000..b4270ce
--- /dev/null
+++ b/test/integration/seiload_test.go
@@ -0,0 +1,215 @@
+//go:build integration
+
+package integration
+
+import (
+	"bytes"
+	"context"
+	_ "embed"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+	"testing"
+	"text/template"
+	"time"
+
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/yaml"
+
+	"github.com/sei-protocol/sei-k8s-controller/sdk/sei"
+)
+
+//go:embed seiload_job.yaml.tmpl
+var seiloadJobTmpl string
+
+// seiloadProfilesCM is the platform-owned ConfigMap holding the profile
+// templates (placeholders __SEI_CHAIN_ID__ / __RPC_ENDPOINTS__). The harness
+// reads it from the cluster rather than vendoring the profile, so the load
+// shape stays owned by platform.
+const seiloadProfilesCM = "seiload-profiles"
+
+// seiloadParams carries the per-run values injected into the embedded seiload
+// Job template; each field name matches a {{.Field}} action in that template.
+type seiloadParams struct {
+	RunID           string // Job name suffix, sei.io/harness-run label value, SEILOAD_RUN_ID
+	ChainID         string // SEILOAD_CHAIN_ID
+	Commit          string // SEILOAD_COMMIT_ID
+	Image           string // seiload container image
+	DurationMinutes int    // rendered as --duration=<n>m
+	ProfileCM       string // ConfigMap mounted at /etc/seiload
+	DeadlineSeconds int    // Job activeDeadlineSeconds
+}
+
+// clientset returns a client-go clientset built from the config that
+// ctrl.GetConfig resolves. It backs the Job/ConfigMap operations the SDK does
+// not cover, and fails the test if either the config or the clientset cannot
+// be obtained.
+func clientset(t *testing.T) *kubernetes.Clientset {
+	t.Helper()
+	restCfg, err := ctrl.GetConfig()
+	if err != nil {
+		t.Fatalf("load kubeconfig: %v", err)
+	}
+	client, err := kubernetes.NewForConfig(restCfg)
+	if err != nil {
+		t.Fatalf("build clientset: %v", err)
+	}
+	return client
+}
+
+// renderProfile reads the platform profile template from seiload-profiles and
+// substitutes the per-run chain id + the fleet's EVM endpoints (JSON-quoted).
+func renderProfile(
+	ctx context.Context, t *testing.T, cs *kubernetes.Clientset,
+	ns, profile, chainID string, endpoints []string,
+) string {
+	t.Helper()
+	profiles, err := cs.CoreV1().ConfigMaps(ns).Get(ctx, seiloadProfilesCM, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("get %s/%s: %v", ns, seiloadProfilesCM, err)
+	}
+	// Profiles are keyed "<profile>.json" inside the shared ConfigMap.
+	body, found := profiles.Data[profile+".json"]
+	if !found {
+		t.Fatalf("profile %q.json absent from %s", profile, seiloadProfilesCM)
+	}
+	// NOTE(review): strconv.Quote emits Go string syntax, which matches JSON for
+	// printable ASCII endpoints; chainID is substituted unquoted. Confirm both
+	// against the placeholder positions in the platform profile.
+	quotedEndpoints := make([]string, 0, len(endpoints))
+	for _, ep := range endpoints {
+		quotedEndpoints = append(quotedEndpoints, strconv.Quote(ep))
+	}
+	endpointList := strings.Join(quotedEndpoints, ",")
+	// Chain id first, then endpoints — the same order as two chained ReplaceAll calls.
+	withChainID := strings.ReplaceAll(body, "__SEI_CHAIN_ID__", chainID)
+	return strings.ReplaceAll(withChainID, "__RPC_ENDPOINTS__", endpointList)
+}
+
+// createProfileCM stores the rendered profile under the "profile.json" key of a
+// new per-run ConfigMap labelled with the run id, and registers a cleanup that
+// deletes it. A failed create fails the test; the delete is best-effort.
+func createProfileCM(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, name, runID, profileJSON string) {
+	t.Helper()
+	configMaps := cs.CoreV1().ConfigMaps(ns)
+	meta := metav1.ObjectMeta{
+		Name:      name,
+		Namespace: ns,
+		Labels:    map[string]string{runLabelKey: runID},
+	}
+	desired := &corev1.ConfigMap{
+		ObjectMeta: meta,
+		Data:       map[string]string{"profile.json": profileJSON},
+	}
+	if _, err := configMaps.Create(ctx, desired, metav1.CreateOptions{}); err != nil {
+		t.Fatalf("create profile cm %q: %v", name, err)
+	}
+	t.Cleanup(func() {
+		// Fresh context: the scenario ctx may already be done when cleanups run.
+		cleanupCtx, cancel := context.WithTimeout(context.Background(), time.Minute)
+		defer cancel()
+		// Best-effort teardown; the delete error is deliberately ignored.
+		_ = configMaps.Delete(cleanupCtx, name, metav1.DeleteOptions{})
+	})
+}
+
+// renderJob templates the embedded seiload Job manifest with the per-run params.
+// The manifest owns seiload's shape; only per-run values are injected.
+func renderJob(t *testing.T, p seiloadParams) *batchv1.Job {
+	t.Helper()
+	// Execute the embedded template into a buffer, then decode the YAML into a
+	// typed Job so the caller can set the namespace before creating it.
+	var buf bytes.Buffer
+	if err := template.Must(template.New("job").Parse(seiloadJobTmpl)).Execute(&buf, p); err != nil {
+		t.Fatalf("render seiload job: %v", err)
+	}
+	var job batchv1.Job
+	if err := yaml.Unmarshal(buf.Bytes(), &job); err != nil {
+		t.Fatalf("unmarshal seiload job: %v", err)
+	}
+	return &job
+}
+
+// runSeiload renders the platform profile, applies seiload's Job manifest, waits
+// for the Job to complete, and asserts every follower is still caught up.
+// Pass/fail is Job completion plus post-load liveness; throughput gating is a
+// PromQL query over the run's metrics (the Job carries a metrics scrape label,
+// pending a podMonitor that selects it).
+//
+// A non-positive s.durationMin fails the test before any cluster object is
+// created. The per-run ConfigMap and the Job are removed by registered cleanups.
+func runSeiload(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ch *chain, s spec) {
+	t.Helper()
+	// Validate the run length up front: a zero or negative value would render
+	// "--duration=0m" (or a negative duration) and a Job deadline that no
+	// longer brackets the load, surfacing only as a confusing Job failure.
+	if s.durationMin <= 0 {
+		t.Fatalf("integration suite: seiload duration must be > 0 minutes, got %d", s.durationMin)
+	}
+
+	// The seiload Job co-locates with the chain; the network's resolved
+	// namespace is authoritative (never re-resolve from env here).
+	ns := ch.network.Namespace()
+
+	profileCM := "seiload-profile-" + s.runID
+	profileJSON := renderProfile(ctx, t, cs, ns, s.seiloadProfile, s.chainID, ch.evmEndpoints())
+	createProfileCM(ctx, t, cs, ns, profileCM, s.runID, profileJSON)
+
+	job := renderJob(t, seiloadParams{
+		RunID:           s.runID,
+		ChainID:         s.chainID,
+		Commit:          s.seiloadCommit,
+		Image:           s.seiloadImage,
+		DurationMinutes: s.durationMin,
+		ProfileCM:       profileCM,
+		// Self-terminating cap independent of the harness ctx: the load plus
+		// generous slack for image pull + the post-summary flush.
+		DeadlineSeconds: (s.durationMin + 15) * 60,
+	})
+	job.Namespace = ns
+	if _, err := cs.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{}); err != nil {
+		t.Fatalf("create seiload job: %v", err)
+	}
+	t.Cleanup(func() {
+		// Fresh context: the scenario ctx may already be done when cleanups run.
+		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+		defer cancel()
+		bg := metav1.DeletePropagationBackground
+		// Best-effort teardown; the delete error is deliberately ignored.
+		_ = cs.BatchV1().Jobs(ns).Delete(ctx, job.Name, metav1.DeleteOptions{PropagationPolicy: &bg})
+	})
+
+	waitJob(ctx, t, cs, ns, job.Name)
+
+	// Chain survived the load: every follower still caught up (a follower can't
+	// catch up to a halted chain, so this transitively covers validator quorum).
+	hc := &http.Client{Timeout: 10 * time.Second}
+	for _, n := range ch.rpcNodes {
+		if err := sei.WaitCaughtUp(ctx, hc, n.TendermintRPC()); err != nil {
+			t.Errorf("post-load %s not caught up: %v", n.Name(), err)
+		}
+	}
+}
+
+// waitJob blocks until the seiload Job reaches a terminal condition. A Failed
+// Job fails the suite (with the pod log tail attached); success returns.
+// Bounded by ctx. The Job is polled immediately and then every 10 seconds; up
+// to two consecutive read errors are logged and retried, a third fails the
+// suite.
+func waitJob(ctx context.Context, t *testing.T, cs *kubernetes.Clientset, ns, name string) {
+	t.Helper()
+	// A multi-minute poll should not die on a single API-server blip, so only
+	// a run of consecutive read failures is fatal.
+	const maxGetFailures = 3
+	failures := 0
+	tick := time.NewTicker(10 * time.Second)
+	defer tick.Stop()
+	for {
+		job, err := cs.BatchV1().Jobs(ns).Get(ctx, name, metav1.GetOptions{})
+		switch {
+		case err != nil && ctx.Err() != nil:
+			// The read failed because the harness ctx is done; report that
+			// rather than a generic API error.
+			t.Fatalf("seiload job %q did not finish before deadline: %v", name, ctx.Err())
+		case err != nil:
+			failures++
+			if failures >= maxGetFailures {
+				t.Fatalf("get seiload job %q: %v", name, err)
+			}
+			t.Logf("get seiload job %q (failure %d/%d, retrying): %v", name, failures, maxGetFailures, err)
+		default:
+			failures = 0
+			for _, cond := range job.Status.Conditions {
+				if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue {
+					return
+				}
+				if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
+					t.Fatalf("seiload job %q failed: %s\n--- seiload pod log (tail) ---\n%s",
+						name, cond.Message, podLogTail(ctx, cs, ns, name))
+				}
+			}
+		}
+		select {
+		case <-ctx.Done():
+			t.Fatalf("seiload job %q did not finish before deadline: %v", name, ctx.Err())
+		case <-tick.C:
+		}
+	}
+}
+
+// podLogTail returns the tail of the seiload pod's log for a Job, best-effort —
+// the failure-time signal a Job condition message alone cannot give.
+func podLogTail(ctx context.Context, cs *kubernetes.Clientset, ns, jobName string) string {
+	// Find the Job's pods through the batch.kubernetes.io/job-name label.
+	pods, err := cs.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{
+		LabelSelector: "batch.kubernetes.io/job-name=" + jobName,
+	})
+	// Report a failed List and an empty result separately: sharing one branch
+	// printed a misleading "<nil>" error whenever the List succeeded but
+	// returned no pods.
+	if err != nil {
+		return fmt.Sprintf("(list pods for job %q failed: %v)", jobName, err)
+	}
+	if len(pods.Items) == 0 {
+		return fmt.Sprintf("(no pod for job %q)", jobName)
+	}
+	// Only the first listed pod is read, limited to its last 50 lines.
+	lines := int64(50)
+	raw, err := cs.CoreV1().Pods(ns).GetLogs(pods.Items[0].Name, &corev1.PodLogOptions{TailLines: &lines}).DoRaw(ctx)
+	if err != nil {
+		return fmt.Sprintf("(read logs failed: %v)", err)
+	}
+	return string(raw)
+}