diff --git a/.gitignore b/.gitignore index 72a79d0..ac2c77c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ # Build output — populated by `go build ./cmd/tracebloc` at the repo -# root. Release artifacts go through dist/ (see Phase 5 / release -# workflow); local hacking puts a binary here. +# root, or by `go build -o ./bin/tracebloc ./cmd/tracebloc` for the +# convention some folks prefer. Release artifacts go through dist/ +# (see Phase 5 / release workflow); local hacking puts a binary here. /tracebloc +/bin/ # Cross-compiled binaries staged by the release workflow. /dist/ diff --git a/cmd/tracebloc/main.go b/cmd/tracebloc/main.go index 8066305..978cde4 100644 --- a/cmd/tracebloc/main.go +++ b/cmd/tracebloc/main.go @@ -18,8 +18,11 @@ package main import ( + "context" "fmt" "os" + "os/signal" + "syscall" "github.com/tracebloc/cli/internal/cli" ) @@ -35,11 +38,32 @@ var ( ) func main() { + // Wire SIGINT / SIGTERM into the cobra root command's context. + // Long-running operations (e.g. push.Stage in `dataset push`) + // propagate ctx down through every k8s API call, so cancelling + // here triggers ctx.Done() → in-flight HTTP cancels → defers + // fire in normal stack unwind → orphan Pod gets cleaned up. + // + // Without this wire, Ctrl-C goes straight through Go's runtime + // signal handler and exits the process WITHOUT running defers + // — which silently broke the SIGINT-safe cleanup contract in + // push.Stage's docstring. Bugbot flagged this on PR-b. + // + // signal.NotifyContext (Go 1.16+) is the stdlib pattern for + // this. The stop func unregisters the handler so a *second* + // SIGINT does the normal hard-kill — important for the case + // where the cleanup itself hangs (e.g. API server unreachable + // even past the 30s cleanup deadline). The customer's second + // Ctrl-C should always work. + ctx, stop := signal.NotifyContext(context.Background(), + syscall.SIGINT, syscall.SIGTERM) + defer stop() + err := cli.NewRootCmd(cli.BuildInfo{ Version: version, GitSHA: gitSHA, BuildDate: buildDate, - }).Execute() + }).ExecuteContext(ctx) if err == nil { return } diff --git a/go.mod b/go.mod index c792eb3..56b4e7b 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,9 @@ go 1.26.0 require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 + github.com/schollz/progressbar/v3 v3.19.0 github.com/spf13/cobra v1.8.1 + golang.org/x/term v0.39.0 golang.org/x/text v0.33.0 gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.36.1 @@ -28,13 +30,17 @@ require ( github.com/go-openapi/swag v0.23.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/moby/spdystream v0.5.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/rivo/uniseg v0.4.7 // indirect github.com/spf13/pflag v1.0.9 // indirect github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect @@ -42,13 +48,13 @@ require ( golang.org/x/net v0.49.0 // indirect golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sys v0.40.0 // indirect - golang.org/x/term v0.39.0 // indirect golang.org/x/time v0.14.0 // indirect google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect k8s.io/klog/v2 v2.140.0 // indirect k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // indirect + k8s.io/streaming v0.36.1 // indirect k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect diff --git a/go.sum b/go.sum index 9a05a37..a49a7f6 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= +github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -27,6 +31,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -42,6 +48,12 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= +github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -53,11 +65,15 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8m github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEVZGK7IN2kJkjTuQ= github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= +github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc= +github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= @@ -114,6 +130,8 @@ k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a h1:xCeOEAOoGYl2jnJoHkC3hkbPJgdATINPMAxaynU2Ovg= k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= +k8s.io/streaming v0.36.1 h1:L+K68n4Gg940BGNNYtUBvL1WTLL0YnKT3s+P1MNAmR4= +k8s.io/streaming v0.36.1/go.mod h1:z6fV3D+NVkoeqRMtWwlUZK6U17SY/LqNzOxWL6GyR/s= k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go index d740e0f..6034e03 100644 --- a/internal/cli/dataset.go +++ b/internal/cli/dataset.go @@ -15,11 +15,11 @@ import ( ) // newDatasetCmd wires the `tracebloc dataset` subtree. The dominant -// verb is `push`, introduced in Phase 3 (tracebloc/client#151) and -// landing across two PRs: PR-a (this one) implements the -// no-op-safe pre-flight; PR-b adds the actual file streaming via -// an ephemeral Pod + tar-over-exec. Future verbs (`dataset list`, -// `dataset rm`) hang off this parent in v0.2. +// verb is `push`, completed in Phase 3 (tracebloc/client#151) across +// PR-a (pre-flight: spec synth, validation, layout walk, cluster +// discovery) and PR-b (this one: ephemeral stage Pod + tar-over- +// exec stream + progress bar + SIGINT-safe cleanup). Future verbs +// (`dataset list`, `dataset rm`) hang off this parent in v0.2. func newDatasetCmd() *cobra.Command { cmd := &cobra.Command{ Use: "dataset", @@ -27,9 +27,13 @@ func newDatasetCmd() *cobra.Command { Long: `Commands for staging and managing datasets on the cluster's shared PVC. -Today: ` + "`dataset push`" + ` stages a local directory and (in PR-b) -submits an ingestion run. ` + "`tracebloc cluster info`" + ` is the -pre-flight you'd typically run before the first push.`, +Today: ` + "`dataset push`" + ` stages a local directory to the cluster's +shared PVC. Submission to jobs-manager (so the ingestor Job actually +runs) lands in Phase 4 (` + "`tracebloc/client#152`" + `); for now the staged +files are picked up by the existing helm ` + "`tracebloc/ingestor`" + ` flow. + +` + "`tracebloc cluster info`" + ` is the pre-flight you'd typically run +before the first push.`, } cmd.AddCommand(newDatasetPushCmd()) return cmd @@ -37,19 +41,21 @@ pre-flight you'd typically run before the first push.`, // newDatasetPushCmd implements `tracebloc dataset push `. // -// PR-a scope (what this implements today): +// Phase 3 scope (now complete across PR-a + PR-b): // // - Synthesize the ingest spec from flags (`internal/push.SpecArgs.Build`) // - Validate it against the embedded ingest.v1 schema // - Walk the local directory + enforce v0.1 size caps // - Discover cluster, parent release, and shared PVC -// - Print a single-screen "ready to stage" summary -// - --dry-run stops here (and so does this PR — actual staging -// errors out with "coming in PR-b" until #151 PR-b merges) +// - Print a single-screen pre-flight summary +// - Either --dry-run stop, OR create an ephemeral stage Pod +// (alpine 3.20 pinned by digest, PSA-restricted security +// context), tar local files into it via an SPDY exec stream +// with a progress bar, then defer-delete the Pod // -// PR-b will add the ephemeral stage Pod, tar-over-exec stream, -// progress bar, and SIGINT-safe cleanup — slotting into the -// "TODO: PR-b" branch below without touching anything above it. +// Phase 4 (`tracebloc/client#152`) hooks the submit-to-jobs-manager +// step into the bottom of this command, replacing the "manually +// kick off helm ingestor" workaround in the success message. func newDatasetPushCmd() *cobra.Command { var ( // Kubeconfig flags — same conventions as `cluster info`. @@ -71,10 +77,18 @@ func newDatasetPushCmd() *cobra.Command { // Operations flags. dryRun bool - // Ingestor SA name override (only matters once PR-b mints - // a token to talk to the future stage-pod-creation hook). - // Plumbed today so PR-b doesn't have to touch flag wiring. + // Ingestor SA name override. Used as the ServiceAccountName + // of the ephemeral stage Pod, so the Pod inherits whatever + // imagePullSecrets + PSA exemptions the admin already + // configured for that SA. ingestorSAName string + + // Stage Pod image override. Defaults to the digest-pinned + // alpine that ships with the CLI; air-gapped customers + // override this to an image their registry mirror serves. + // Pin by digest in your override too — tag-only references + // drift silently and break "all my pushes worked yesterday." + stagePodImage string ) cmd := &cobra.Command{ @@ -99,12 +113,13 @@ datasets need the v0.2 cloud-source story (S3/GCS/HTTPS sources) — see tracebloc/client#147 non-goals. Exit codes: - 0 staged + (in PR-b) submitted successfully - 2 schema validation failed (synthesized spec rejected) + 0 files staged successfully (Phase 4 will add: submitted + completed) + 2 schema validation failed (synthesized spec rejected) or + v0.1-unsupported category passed 3 local-layout or kubeconfig error - 4 cluster reachable but parent release missing - 6 pre-flight succeeded but the actual stage step isn't - implemented yet (PR-b for #151 will deliver it)`, + 4 cluster reachable but parent release / shared PVC missing + 7 pre-flight succeeded but staging the files failed + (Pod creation, image pull, exec stream, or remote tar error)`, Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { return runDatasetPush(cmd.Context(), cmd.OutOrStdout(), cmd.ErrOrStderr(), @@ -116,6 +131,7 @@ Exit codes: Spec: push.SpecArgs{Table: table, Category: category, Intent: intent, LabelColumn: labelColumn}, DryRun: dryRun, IngestorSAName: ingestorSAName, + StagePodImage: stagePodImage, }) }, } @@ -146,6 +162,9 @@ Exit codes: cmd.Flags().StringVar(&ingestorSAName, "ingestor-sa", "", "override the ingestor ServiceAccount name (default: \"ingestor\"); "+ "set this if you customized ingestionAuthz.serviceAccountName in the parent client chart") + cmd.Flags().StringVar(&stagePodImage, "stage-pod-image", "", + "override the ephemeral stage Pod's image (default: digest-pinned alpine 3.20 baked into the CLI). "+ + "Pin by digest in your override too — tag-only refs drift silently.") return cmd } @@ -162,12 +181,13 @@ type runDatasetPushArgs struct { Spec push.SpecArgs DryRun bool IngestorSAName string + StagePodImage string } -// runDatasetPush is the PR-a slim implementation. It performs every -// pre-flight check and prints a summary; the actual file staging -// is gated behind a clear "not yet implemented" error so PR-a -// merging doesn't silently advertise a feature it can't deliver. +// runDatasetPush is the full Phase 3 implementation: pre-flight +// checks, then either --dry-run stop or stage Pod + tar stream + +// cleanup. Phase 4 (#152) will hook submit-to-jobs-manager after +// the staging step. // // Step order is "fail fast, fail local" — every step that doesn't // need the cluster runs before any that does, so a customer with @@ -177,7 +197,7 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush // 1. Validate the table name BEFORE anything else. It's both // the MySQL identifier and the /data/shared// PVC // subdirectory — an unsanitized traversal name (../../etc) - // would escape that subtree once PR-b's stage Pod writes to + // would escape that subtree once the stage Pod writes to // it. The embedded schema only checks minLength on `table`, // so this CLI-side guard is the real fix. SpecArgs.Build() // below calls StagedPrefix, which panics on an unsafe name — @@ -186,7 +206,23 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush return &exitError{code: 2, err: err} } - // 2. Synthesize the spec from flags + validate against schema. + // 2. v0.1 category gate. Runs BEFORE schema validation because + // schema-valid-but-unsupported categories (e.g. + // tabular_classification) would otherwise fail with the + // schema's "missing property 'schema'" message — confusing + // for the customer who has no way to set --schema in v0.1. + // Nonsense categories (typos) also hit this gate; the + // "only image_classification in v0.1" message is more + // actionable than the schema's 11-option enum list anyway. + // Bugbot's review-on-self caught the missing gate on PR-a. + if a.Spec.Category != "" && a.Spec.Category != "image_classification" { + return &exitError{code: 2, err: fmt.Errorf( + "category %q is not supported in v0.1 (only image_classification). "+ + "Other categories are one-PR additions in v0.2 — see "+ + "tracebloc/client#147 non-goals.", a.Spec.Category)} + } + + // 3. Synthesize the spec from flags + validate against schema. // Catches "bad category", "missing intent" etc. BEFORE we // touch the filesystem or the cluster. The error formatter // is the same one ingest validate uses, so a customer who @@ -223,7 +259,7 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush return &exitError{code: 2, err: errors.New("synthesized spec failed schema validation; check the flag values above")} } - // 3. Walk the local directory. Enforces layout + size caps; + // 4. Walk the local directory. Enforces layout + size caps; // customer sees a clear pointer to expected layout if they // pass the wrong directory. layout, err := push.Discover(a.LocalPath) @@ -231,7 +267,7 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush return &exitError{code: 3, err: err} } - // 4. Cluster discovery — same kubeconfig path as `cluster info`. + // 5. Cluster discovery — same kubeconfig path as `cluster info`. // Errors mirror that command's exit-code contract (3 for // kubeconfig, 4 for missing release) so behaviour is // consistent across pre-flight commands. @@ -255,36 +291,74 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush release.IngestorSAName = a.IngestorSAName } - // 5. PVC discovery. New in this PR — confirms the chart's - // shared-data PVC is Bound before we waste time provisioning - // a Pod that can't mount it. + // 6. PVC discovery — confirms the chart's shared-data PVC is + // Bound before we waste time provisioning a Pod that can't + // mount it. pvc, err := cluster.DiscoverSharedPVC(ctx, cs, resolved.Namespace) if err != nil { return &exitError{code: 4, err: err} } - // 6. Print the pre-flight summary. The output is the same in - // dry-run and (eventually) live mode — only the "what - // happens next" line differs. Customers iterating on a - // bad layout see this every attempt, so it's worth keeping - // skimmable: one fact per line, aligned by column. + // 7. Print the pre-flight summary. The output is the same in + // dry-run and live mode — only the "what happens next" line + // differs. Customers iterating on a bad layout see this + // every attempt, so it's worth keeping skimmable: one fact + // per line, aligned by column. printPushPreflight(out, layout, release, pvc, spec, a.DryRun) - // 7. Dry-run stop. Acknowledged success. + // 8. Dry-run stop. Acknowledged success. if a.DryRun { _, _ = fmt.Fprintln(out, "Dry-run complete — no cluster resources were created.") return nil } - // 8. The actual staging branch lands in PR-b. Failing here - // rather than silently returning success means a customer - // who pulled PR-a's binary and ran without --dry-run gets - // a clear "wait for PR-b" signal instead of "0 files - // transferred" confusion in Phase 4. - return &exitError{code: 6, err: errors.New( - "pre-flight succeeded but the actual file staging step isn't " + - "implemented yet — wait for tracebloc/client#151 PR-b. " + - "Re-run with --dry-run to validate without this error.")} + // 9. Stage the files: create ephemeral Pod → wait Ready → tar + // stream → cleanup. The deferred cleanup inside push.Stage + // runs on success and failure (including ctx cancellation + // from a SIGINT handler), so no orphan Pod is left behind. + // + // Exit code 7 ("staging failed") is distinct from the + // pre-flight codes so customers can branch on whether the + // failure was their environment vs the actual data transfer. + progress := push.NewProgress(out, layout.TotalBytes, + fmt.Sprintf("Staging %s", a.Spec.Table)) + // Defer Finish so a failure path that returns BEFORE + // StreamLayout (e.g. CreateStagePod fails on PSA rejection, + // WaitForStagePodReady times out) still clears the TTY + // progress UI. push.StreamLayout's own deferred Finish would + // otherwise be unreachable. Calling Finish twice on the same + // schollz bar is a no-op, so the double-call on the happy + // path is safe. Bugbot flagged on PR-b round 5. + defer progress.Finish() + stageErr := push.Stage(ctx, push.StageOptions{ + Client: cs, + Executor: &push.SPDYExecutor{ + Config: resolved.RestConfig, + Client: cs, + }, + Namespace: resolved.Namespace, + IngestorSAName: release.IngestorSAName, + PVCClaimName: pvc.ClaimName, + PVCMountPath: pvc.MountPath, + Layout: layout, + Table: a.Spec.Table, + StagePodImage: a.StagePodImage, + Progress: progress, + Out: out, + }) + if stageErr != nil { + return &exitError{code: 7, err: stageErr} + } + + // 10. Phase 4 (submit to jobs-manager + watch + summary) hooks + // in here — tracebloc/client#152. For now PR-b leaves the + // dataset staged on the PVC and exits 0, which the customer + // can then chase manually via `helm install ingestor ...` if + // they need the ingestion to actually run today. + _, _ = fmt.Fprintln(out) + _, _ = fmt.Fprintln(out, "Dataset staged. Submission to jobs-manager arrives in Phase 4 (#152);") + _, _ = fmt.Fprintln(out, "in the meantime, the existing helm ingestor flow can pick up the staged files.") + return nil } // printPushPreflight is the customer-facing summary. Mirrors @@ -333,7 +407,7 @@ func printPushPreflight( _, _ = fmt.Fprintln(out) if !dryRun { - _, _ = fmt.Fprintf(out, "Next: stage %d files (%s) → %s (coming in PR-b for #151)\n", + _, _ = fmt.Fprintf(out, "Next: stage %d files (%s) → %s\n", 1+len(layout.Images), push.HumanBytes(layout.TotalBytes), push.StagedPrefix(spec["table"].(string))) _, _ = fmt.Fprintln(out) diff --git a/internal/cli/dataset_test.go b/internal/cli/dataset_test.go index 5a4ef69..96d7ea0 100644 --- a/internal/cli/dataset_test.go +++ b/internal/cli/dataset_test.go @@ -63,39 +63,33 @@ func execDatasetPush(t *testing.T, args []string) (exitCode int, stdout, stderr return ExitCodeFromError(err), so.String(), se.String() } -// TestDatasetPush_BadCategory_ExitsTwo: the synthesized spec gets -// fed through the embedded schema before any cluster work; an -// invalid category surfaces with exit 2 (the CLI-wide "schema -// violation" code). -func TestDatasetPush_BadCategory_ExitsTwo(t *testing.T) { +// TestDatasetPush_UnsupportedCategory_ExitsTwo: v0.1 only supports +// image_classification end-to-end (epic #147 non-goals); the +// CLI-side gate runs before schema validation so a customer who +// passes --category=tabular_classification gets the actionable +// "v0.1 supports only image_classification" message instead of +// the schema's confusing "missing property 'schema'" (which the +// customer has no way to supply in v0.1). Bugbot review-on-self +// caught the missing gate on PR-a; landing it here. +func TestDatasetPush_UnsupportedCategory_ExitsTwo(t *testing.T) { root := imgcLayout(t) - code, _, stderr := execDatasetPush(t, []string{ - root, - "--table=t1", - "--category=definitely-not-a-category", - "--intent=train", - "--label-column=label", - }) - if code != 2 { - t.Fatalf("expected exit 2 for schema failure, got %d", code) - } - // The same FormatErrors output ingest validate uses, routed - // to stderr so downstream pipes of stdout aren't polluted. - // Checking for "category" + "image_classification" surfacing - // means we're getting the JSON-pointer-anchored diagnostic + - // the enum-list expansion, not a generic message. - // Check three things: (1) the JSON-pointer-style "category" - // anchor surfaces, (2) the enum-list expansion happened (so - // "image_classification" appears as a valid option), (3) our - // "synthesized spec" framing is on the right output stream. - // The wording "synthesized spec" is intentionally distinct - // from ingest validate's "schema validation failed" — it tells - // the customer the CLI synthesized the YAML; they didn't - // author it. - for _, want := range []string{"category", "image_classification", "synthesized spec failed schema validation"} { - if !strings.Contains(stderr, want) { - t.Errorf("expected stderr to mention %q, got:\n%s", want, stderr) - } + for _, badCategory := range []string{ + "tabular_classification", // schema-valid but v0.1-unsupported + "object_detection", // ditto + "definitely-not-a-category", // nonsense; gate catches this too + } { + t.Run(badCategory, func(t *testing.T) { + code, _, _ := execDatasetPush(t, []string{ + root, + "--table=t1", + "--category=" + badCategory, + "--intent=train", + "--label-column=label", + }) + if code != 2 { + t.Fatalf("expected exit 2 for unsupported category %q, got %d", badCategory, code) + } + }) } } diff --git a/internal/push/orphan.go b/internal/push/orphan.go new file mode 100644 index 0000000..65ab4c0 --- /dev/null +++ b/internal/push/orphan.go @@ -0,0 +1,196 @@ +package push + +import ( + "context" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// OrphanGracePeriod is how long a non-Running stage Pod has to live +// before we flag it as an orphan. Running Pods are NEVER flagged +// regardless of age (see FindOrphanStagePods's Phase=Running skip, +// Bugbot r7) — Pods past pod.go's StagePodActiveDeadline of 30 min +// are still legitimate "in-progress active push" candidates, and +// any per-age cap would produce false positives for near-cap pushes. +// +// 5 minutes targets the genuinely-stuck shapes: +// +// - Phase=Pending past 5 min: image pull failure that didn't +// resolve, scheduling stuck on missing nodes, PSA rejection +// that took kubelet a while to surface +// - Phase=Failed past 5 min: container crashed at startup or +// during stream, CLI didn't get a chance to clean up +// - Phase=Unknown past 5 min: node went away (network partition, +// kubelet crash) and the Pod metadata is stranded +// +// All three shapes are genuinely orphan — no still-running push +// will ever progress past Pending/Failed/Unknown into Ready, so +// 5 min is comfortable for "stuck enough to surface as a warning." +const OrphanGracePeriod = 5 * time.Minute + +// Orphan describes a stage Pod found by the orphan scan. Carries +// enough metadata for the warning surface to render an actionable +// "delete with: kubectl delete pod X -n Y" hint. +type Orphan struct { + // Name is the metadata.name. + Name string + + // Namespace is the resolved namespace (not the chart's + // release-name, the actual API namespace). + Namespace string + + // Table is the destination table the orphan was staging for — + // read from the StagePodTableLabel set in BuildStagePodSpec. + // May be empty for very old orphans (pre-label-convention), + // but every Pod the CLI ever created carries this label. + Table string + + // Age is how long the Pod has existed (now - creationTimestamp). + // Surfaces in the warning so customers can spot stale Pods at + // a glance ("3 hours old" → almost certainly safe to delete; + // "6 minutes old" → maybe wait, might be from a slow parallel + // push). + Age time.Duration +} + +// FindOrphanStagePods lists every stage Pod the CLI has ever +// created in the namespace and returns the ones past OrphanGrace +// Period. Pods younger than that are silently filtered — they +// might be the CURRENT invocation's own Pod (the orphan scan runs +// before CreateStagePod, but in a tight enough loop the previous +// invocation's just-created Pod could plausibly still be < grace). +// +// Returns nil + nil if there are no orphans. Doesn't return an +// error for the API call failing — orphan scanning is best-effort +// (an RBAC denial on `list pods` shouldn't block the push) — the +// caller logs the scan failure but proceeds. +// +// Hence the signature returns ([]Orphan, error): the slice is +// empty on success-with-no-orphans, the error is non-nil only on +// API failures the caller might want to surface for debugging. +func FindOrphanStagePods(ctx context.Context, cs kubernetes.Interface, namespace string) ([]Orphan, error) { + selector := fmt.Sprintf("%s=%s,%s=%s", + StagePodManagedByLabel, StagePodManagedByValue, + StagePodComponentLabel, StagePodComponentValue) + + pods, err := cs.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: selector, + }) + if err != nil { + // Don't wrap — the caller (cli.runDatasetPush) will either + // log this for diagnostic value and proceed, or ignore it + // entirely. Either way, the orphan scan isn't on the + // critical path. + return nil, fmt.Errorf("listing stage Pods in %s: %w", namespace, err) + } + + now := time.Now() + var orphans []Orphan + for i := range pods.Items { + p := &pods.Items[i] + + // Phase=Running means an active push (this workstation or + // another's) is still doing real work. activeDeadlineSeconds + // is its safety net at the cluster level. Flagging it as + // orphan would produce false positives for legitimate + // slow/near-cap pushes — pod.go budgets ~8.5 minutes for + // the 1 GiB-cap transfer alone, which exceeds the 5-min + // grace below. Bugbot flagged the false-positive risk on + // PR-b round 7. + // + // Pods in non-Running phases (Pending/Failed/Unknown) past + // the grace period are still flagged — those are the + // genuine orphan shapes (stuck on image pull, crashed + // during stream, network-partitioned). + if p.Status.Phase == corev1.PodRunning { + continue + } + + age := now.Sub(p.CreationTimestamp.Time) + if age < OrphanGracePeriod { + continue + } + orphans = append(orphans, Orphan{ + Name: p.Name, + Namespace: p.Namespace, + Table: p.Labels[StagePodTableLabel], + Age: age, + }) + } + return orphans, nil +} + +// FormatOrphansWarning renders the orphan list as a multi-line +// customer-facing warning. Returns empty string when orphans is +// empty — caller can blind-print without an "if len > 0" check. +// +// The "delete with" hint is the actionable part. v0.1 doesn't +// auto-delete because: +// +// 1. We can't tell from labels alone whether the Pod is stuck +// mid-transfer for a still-live parallel push from another +// workstation, vs truly orphaned from a crash. +// 2. Deleting someone else's in-progress push silently is bad +// UX. A warn-only approach respects the principle of least +// surprise. +// +// v0.2 can layer on `--cleanup-orphans` for ops folks who want it +// automated. +func FormatOrphansWarning(orphans []Orphan) string { + if len(orphans) == 0 { + return "" + } + var s string + s += fmt.Sprintf("WARNING: %d orphan stage Pod%s detected in this namespace — likely "+ + "leftover from a previously crashed `dataset push`:\n", + len(orphans), pluralS(len(orphans))) + names := make([]string, 0, len(orphans)) + for _, o := range orphans { + tableHint := "" + if o.Table != "" { + tableHint = fmt.Sprintf(" (table: %s)", o.Table) + } + s += fmt.Sprintf(" - %s, age %s%s\n", o.Name, humanDuration(o.Age), tableHint) + names = append(names, o.Name) + } + // Use SPECIFIC Pod names in the delete command, not the + // label selector. The selector would match every stage Pod + // in the namespace, including legitimate running ones from + // parallel pushes (this workstation or another's) — copy- + // pasting a label-based delete could silently kill someone + // else's in-progress push. Bugbot flagged the over-broad + // delete on PR-b round 7. + s += "Delete with: kubectl delete pod -n " + orphans[0].Namespace + " " + + strings.Join(names, " ") + "\n" + return s +} + +// pluralS returns "s" for n != 1, else "". Local helper to keep +// internal/push self-contained (the cli package has its own copy +// for the same reason — see cli/ingest.go's plural()). +func pluralS(n int) string { + if n == 1 { + return "" + } + return "s" +} + +// humanDuration is a coarse "X hours" / "X minutes" / "X seconds" +// formatter for orphan warnings. time.Duration.String() returns +// "2h13m45s" which is technically more precise but harder to read +// in a one-line warning. +func humanDuration(d time.Duration) string { + switch { + case d >= time.Hour: + return fmt.Sprintf("%dh", int(d.Hours())) + case d >= time.Minute: + return fmt.Sprintf("%dm", int(d.Minutes())) + default: + return fmt.Sprintf("%ds", int(d.Seconds())) + } +} diff --git a/internal/push/orphan_test.go b/internal/push/orphan_test.go new file mode 100644 index 0000000..271f086 --- /dev/null +++ b/internal/push/orphan_test.go @@ -0,0 +1,220 @@ +package push + +import ( + "context" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// stagePodWithAge constructs a stage-labeled Pod whose +// creationTimestamp is `age` ago. Used to seed the fake clientset +// for each orphan-test variant. Default phase is "" (unscheduled) — +// helpers below construct Running/etc explicitly. +func stagePodWithAge(name, table string, age time.Duration) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "tracebloc", + CreationTimestamp: metav1.NewTime(time.Now().Add(-age)), + Labels: map[string]string{ + StagePodManagedByLabel: StagePodManagedByValue, + StagePodComponentLabel: StagePodComponentValue, + StagePodTableLabel: table, + }, + }, + } +} + +// runningStagePodWithAge constructs a stage-labeled Pod in +// Phase=Running. Used to assert the Bugbot-r7 false-positive fix: +// a Running Pod is presumed to be an active push (this or +// another workstation's) and must NOT be flagged as an orphan +// regardless of age. +func runningStagePodWithAge(name, table string, age time.Duration) *corev1.Pod { + p := stagePodWithAge(name, table, age) + p.Status.Phase = corev1.PodRunning + return p +} + +// TestFindOrphanStagePods_NoOrphans: empty cluster returns empty, +// no error. +func TestFindOrphanStagePods_NoOrphans(t *testing.T) { + cs := fake.NewClientset() + got, err := FindOrphanStagePods(context.Background(), cs, "tracebloc") + if err != nil { + t.Fatalf("FindOrphanStagePods: %v", err) + } + if len(got) != 0 { + t.Errorf("len(orphans) = %d, want 0", len(got)) + } +} + +// TestFindOrphanStagePods_RecentPodFiltered: a Pod just created +// (well within OrphanGracePeriod) is silently filtered out — it +// might be the current invocation's own Pod from a parallel +// workstation push. +func TestFindOrphanStagePods_RecentPodFiltered(t *testing.T) { + cs := fake.NewClientset(stagePodWithAge("fresh", "t1", 30*time.Second)) + got, err := FindOrphanStagePods(context.Background(), cs, "tracebloc") + if err != nil { + t.Fatalf("FindOrphanStagePods: %v", err) + } + if len(got) != 0 { + t.Errorf("recent Pod surfaced as orphan; got %d, want 0", len(got)) + } +} + +// TestFindOrphanStagePods_StaleSurfaces: a Pod past the grace +// period surfaces with its metadata intact (name, table, age). +func TestFindOrphanStagePods_StaleSurfaces(t *testing.T) { + cs := fake.NewClientset(stagePodWithAge("stale-cats", "cats_dogs", 30*time.Minute)) + got, err := FindOrphanStagePods(context.Background(), cs, "tracebloc") + if err != nil { + t.Fatalf("FindOrphanStagePods: %v", err) + } + if len(got) != 1 { + t.Fatalf("len(orphans) = %d, want 1", len(got)) + } + o := got[0] + if o.Name != "stale-cats" { + t.Errorf("Name = %q, want stale-cats", o.Name) + } + if o.Table != "cats_dogs" { + t.Errorf("Table = %q, want cats_dogs", o.Table) + } + if o.Age < 29*time.Minute { + t.Errorf("Age = %v, want at least 29m", o.Age) + } +} + +// TestFindOrphanStagePods_IgnoresNonStagePods: the chart's own +// Pods (managed-by=Helm) and arbitrary user Pods must NOT show +// up — the label selector is the safety boundary. +func TestFindOrphanStagePods_IgnoresNonStagePods(t *testing.T) { + chartPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "jobs-manager-abc", + Namespace: "tracebloc", + CreationTimestamp: metav1.NewTime(time.Now().Add(-1 * time.Hour)), + Labels: map[string]string{ + "app.kubernetes.io/managed-by": "Helm", + "app.kubernetes.io/name": "client", + }, + }, + } + customerPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "their-app", + Namespace: "tracebloc", + CreationTimestamp: metav1.NewTime(time.Now().Add(-1 * time.Hour)), + }, + } + cs := fake.NewClientset(chartPod, customerPod) + got, err := FindOrphanStagePods(context.Background(), cs, "tracebloc") + if err != nil { + t.Fatalf("FindOrphanStagePods: %v", err) + } + if len(got) != 0 { + t.Errorf("non-stage Pods surfaced as orphans; got %d, want 0; names=%v", len(got), got) + } +} + +// TestFormatOrphansWarning_Empty returns the empty string so the +// caller can blind-print without conditionals. +func TestFormatOrphansWarning_Empty(t *testing.T) { + if got := FormatOrphansWarning(nil); got != "" { + t.Errorf("FormatOrphansWarning(nil) = %q, want empty", got) + } +} + +// TestFormatOrphansWarning_ActionableHint: the warning must contain +// (a) the count, (b) the per-pod name + age + table, (c) a +// kubectl delete command using SPECIFIC POD NAMES (not the label +// selector — Bugbot r7 flagged that the label selector would +// match every stage Pod including parallel-push's still-running +// ones). +func TestFormatOrphansWarning_ActionableHint(t *testing.T) { + orphans := []Orphan{ + {Name: "stage-a", Namespace: "tracebloc", Table: "cats_dogs", Age: 30 * time.Minute}, + {Name: "stage-b", Namespace: "tracebloc", Table: "xrays", Age: 2 * time.Hour}, + } + got := FormatOrphansWarning(orphans) + for _, want := range []string{ + "2 orphan stage Pods", + "stage-a", "cats_dogs", "30m", + "stage-b", "xrays", "2h", + // Targeted delete — specific names, not the label + // selector. The label-selector version would nuke + // running pods from parallel pushes. + "kubectl delete pod -n tracebloc stage-a stage-b", + } { + if !strings.Contains(got, want) { + t.Errorf("warning missing %q in:\n%s", want, got) + } + } + // Regression check: the label selector MUST NOT appear in + // the delete command. If a refactor reintroduces it, + // customers could accidentally nuke their parallel-push Pod. + if strings.Contains(got, "kubectl delete pod -n tracebloc -l ") { + t.Errorf("warning uses label-selector delete (would catch parallel pushes):\n%s", got) + } +} + +// TestFindOrphanStagePods_SkipsRunningPods is the Bugbot-r7 +// false-positive fix: a Pod in Phase=Running is presumed to be an +// active push (this workstation or another's still doing work), +// and must NOT be flagged as orphan regardless of age. Near-cap +// 1 GiB pushes legitimately take >5 min (pod.go budgets ~8.5 min +// for the stream alone), so without this guard the next concurrent +// push would warn about Pods that are still actively transferring. +func TestFindOrphanStagePods_SkipsRunningPods(t *testing.T) { + // 30-minute-old Running Pod — well past OrphanGracePeriod, + // but Running means it's an active push. + cs := fake.NewClientset(runningStagePodWithAge("active-slow-push", + "big_table", 30*time.Minute)) + got, err := FindOrphanStagePods(context.Background(), cs, "tracebloc") + if err != nil { + t.Fatalf("FindOrphanStagePods: %v", err) + } + if len(got) != 0 { + t.Errorf("Running Pod flagged as orphan; got %d, want 0", len(got)) + } +} + +// TestFindOrphanStagePods_FlagsNonRunningPastGrace: complement to +// the Running-skip — a Failed or Pending Pod past the grace period +// IS still an orphan (crashed-at-startup case, stuck-on-image-pull +// case). The Running-skip should narrow the false positives, not +// eliminate the warning entirely. +func TestFindOrphanStagePods_FlagsNonRunningPastGrace(t *testing.T) { + failedPod := stagePodWithAge("crashed", "t1", 30*time.Minute) + failedPod.Status.Phase = corev1.PodFailed + cs := fake.NewClientset(failedPod) + got, err := FindOrphanStagePods(context.Background(), cs, "tracebloc") + if err != nil { + t.Fatalf("FindOrphanStagePods: %v", err) + } + if len(got) != 1 { + t.Errorf("Failed Pod past grace not flagged; got %d, want 1", len(got)) + } +} + +// TestFormatOrphansWarning_NoTableLabel: very old orphans from +// before we added StagePodTableLabel might not have it. The +// formatter must not crash or render "(table: )". +func TestFormatOrphansWarning_NoTableLabel(t *testing.T) { + got := FormatOrphansWarning([]Orphan{ + {Name: "old-orphan", Namespace: "tracebloc", Age: 1 * time.Hour}, + }) + if strings.Contains(got, "(table: )") { + t.Errorf("warning rendered empty table parenthetical:\n%s", got) + } + if !strings.Contains(got, "old-orphan") { + t.Errorf("warning missing Pod name:\n%s", got) + } +} diff --git a/internal/push/pod.go b/internal/push/pod.go new file mode 100644 index 0000000..fb28e6d --- /dev/null +++ b/internal/push/pod.go @@ -0,0 +1,528 @@ +package push + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" +) + +// DefaultStagePodImage is the alpine image the ephemeral stage Pod +// runs. Pinned by digest at CLI build time so a customer pulling +// v0.1.x at any future date gets bit-for-bit identical behavior — +// no "alpine just shipped a glibc-on-musl regression and now my +// stage Pod crashloops" surprises. +// +// alpine:3.20 was the current 3.x stable when Phase 3 PR-b landed +// (May 2026). The image is tiny (~8 MiB), has tar/sh/busybox built +// in, and is one of the most-mirrored images on the planet — air- +// gapped customers can almost always pull it from their internal +// mirror without extra config. +// +// Override via `tracebloc dataset push --stage-pod-image=...` for: +// +// - Customers on registries that don't proxy docker.io +// - Customers running a curated base image with extra audit +// instrumentation +// - Custom forks that need a specific busybox build +// +// Bumping the pinned digest is a v0.2 task; track it alongside +// the kube-deps refresh. +const DefaultStagePodImage = "alpine:3.20@sha256:d9e853e87e55526f6b2917df91a2115c36dd7c696a35be12163d44e6e2a4b6bc" + +// StagePodLabel{Key,Value} mark every Pod the CLI creates so the +// orphan-scan logic (orphan.go) can find leftover Pods from a +// previously crashed `dataset push` invocation. Using the +// kubernetes.io/managed-by label means the chart's own resources +// (managed-by=Helm) don't get caught in our scan. +const ( + StagePodManagedByLabel = "app.kubernetes.io/managed-by" + StagePodManagedByValue = "tracebloc-cli" + + // StagePodComponentLabel narrows the orphan scan to just stage + // Pods (not, e.g., a future `tracebloc cluster doctor` Pod). + StagePodComponentLabel = "tracebloc.io/component" + StagePodComponentValue = "stage-pod" + + // StagePodTableLabel records which table the Pod was staging + // for. Useful in the orphan-warning output so the customer + // knows which `dataset push` invocation it came from. + StagePodTableLabel = "tracebloc.io/table" +) + +// StagePodActiveDeadline is the in-cluster self-kill timer. The CLI +// also deletes the Pod via defer + SIGINT handler, but those only +// fire if the CLI process is still alive — a hard kill (`kill -9`, +// OS crash, network partition between laptop and cluster) leaves +// the Pod stranded. activeDeadlineSeconds means the kubelet kills +// it even if the CLI never gets the chance. +// +// 30 minutes covers the full lifecycle, not just the stream: +// +// ~60s image pull on a fresh node + scheduler back-pressure +// ~60s WaitForStagePodReady ceiling (StagePodReadyTimeout) +// ~8.5m 1 GiB transfer at a conservative 2 MB/s +// + comfortable margin for variance +// +// activeDeadlineSeconds starts the clock at Pod CREATION (not at +// streaming start), so we have to budget for the pre-stream +// readiness portion too — the earlier 10-minute value cut it too +// fine for near-cap customers on slow uplinks (kubelet would +// terminate mid-transfer). Bugbot flagged the squeeze as High on +// PR-b round 6. +// +// 30 min is generous but cheap — an idle alpine Pod with `sleep` +// consumes ~5 MiB RAM and zero CPU on the cluster. v0.2 should +// make this configurable per push. +const StagePodActiveDeadline = 1800 + +// StagePodReadyTimeout is how long we wait for the Pod to become +// Running + Ready after CREATE. Most clusters spawn an alpine Pod +// in 5-15 seconds; 60 seconds covers image-pull from a slow mirror +// + scheduler back-pressure on busy clusters. Beyond that, something +// is wrong (no image-pull access, no schedulable node, PSP/PSA +// rejection) and the customer wants the diagnostic, not a longer +// wait. +const StagePodReadyTimeout = 60 * time.Second + +// PodSpecOptions controls the ephemeral stage Pod construction. +// Fields are intentionally narrow — every knob is one a customer +// could plausibly need to turn for an air-gapped or hardened- +// security setup. Adding fields should require a real use case. +type PodSpecOptions struct { + // Namespace is where the Pod gets created — always the + // discovered parent-release namespace. + Namespace string + + // PVCClaimName is the shared PVC to mount at /data/shared. + // Discovered by cluster.DiscoverSharedPVC (always "client-pvc" + // today, but routed through a field so a future per-customer + // override doesn't require touching this signature). + PVCClaimName string + + // PVCMountPath is where to mount the PVC inside the Pod — + // "/data/shared" by chart convention. + PVCMountPath string + + // Table is the destination table name. Used to compose the Pod + // name and the on-PVC subdirectory. MUST have already passed + // ValidateTableName; pod-name composition relies on the same + // character-class restrictions. + Table string + + // Image overrides DefaultStagePodImage. Empty = use default. + Image string + + // ServiceAccountName is the SA the Pod runs as. Phase 2's + // discovery surfaces this as `ingestor` (the chart default) or + // whatever the customer's `--ingestor-sa` flag overrides to. + // Using the chart's existing SA means the Pod inherits any + // imagePullSecrets and PSA exemptions the admin already + // configured for it. + ServiceAccountName string +} + +// BuildStagePodSpec produces the corev1.Pod for an ephemeral stage +// Pod, fully parametrized but with no cluster side-effects. +// Separated from CreateStagePod so unit tests can assert the spec +// shape without needing a fake clientset for every assertion. +// +// Returns an error only when crypto/rand fails — which is rare but +// possible on systems with exhausted entropy. Earlier versions +// swallowed that error and produced a Pod name ending with a bare +// trailing hyphen (DNS-1123 violation → opaque API server +// rejection). Bugbot flagged that as Low on PR-b; surfacing the +// error here turns "weird API error message" into "clear local +// diagnostic at the call site." +// +// Security context follows the Kubernetes Pod Security Standards +// "restricted" profile — the strictest preset, accepted on every +// PSA-enabled namespace including the chart's recommended config. +// This is intentional: the stage Pod runs in the customer's +// namespace and writes to their PVC, so being a model citizen for +// PSA defaults reduces "the Pod won't even start on my cluster" +// surface area. +func BuildStagePodSpec(opts PodSpecOptions) (*corev1.Pod, error) { + image := opts.Image + if image == "" { + image = DefaultStagePodImage + } + + suffix, err := randomSuffix(4) // 4 bytes → 8 hex chars + if err != nil { + return nil, fmt.Errorf("generating Pod-name random suffix: %w", err) + } + // Transform the table name into a DNS-1123 subdomain-safe + // segment for the Pod name. ValidateTableName accepts + // [A-Za-z0-9_]+ (MySQL identifier rules), but Kubernetes Pod + // names follow DNS-1123 — lowercase + alphanumeric + hyphen + // only, must start/end with alphanumeric. Without this + // transform, the dominant canonical example (cats_dogs_train, + // snake_case throughout the tracebloc docs) would fail Pod + // creation post-pre-flight, which is a worst-of-both-worlds + // UX (the pre-flight summary says "we're good!" then the + // create fails). Bugbot flagged the gap as High on PR-b. + // + // The original (un-transformed) table name is preserved + // verbatim in the tracebloc.io/table label below, so orphan + // warnings still surface the customer-facing identifier. + podName := fmt.Sprintf("tracebloc-stage-%s-%s", + dns1123SafeTableSegment(opts.Table), suffix) + + // Pod-level security context: runAsNonRoot is the only field + // PSA's restricted profile *requires* at the Pod level (the + // rest are container-level). Setting it here means every + // future container (if we ever add one) inherits the + // non-root constraint. + runAsNonRoot := true + runAsUser := int64(65532) // distroless's "nonroot" UID; works on any cluster + + // Container-level security context: every PSA-restricted + // requirement. Reads top-to-bottom: capabilities dropped, + // read-only root FS, no privilege escalation, no privileged, + // seccomp default. None of these prevent tar from working — + // tar reads stdin + writes to the PVC mount (which is RW), + // and doesn't need any caps to run as a non-root user. + allowPrivEsc := false + privileged := false + readOnlyRootFS := true + + activeDeadline := int64(StagePodActiveDeadline) + + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: opts.Namespace, + Labels: map[string]string{ + StagePodManagedByLabel: StagePodManagedByValue, + StagePodComponentLabel: StagePodComponentValue, + StagePodTableLabel: opts.Table, + }, + Annotations: map[string]string{ + // Annotations are searchable via `kubectl describe` + // but don't constrain scheduling — perfect for + // breadcrumbs that help post-mortem an orphan. + "tracebloc.io/created-at": time.Now().UTC().Format(time.RFC3339), + "tracebloc.io/created-by": "tracebloc-cli", + }, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: opts.ServiceAccountName, + RestartPolicy: corev1.RestartPolicyNever, + + // 10-min in-cluster self-kill — see comment on the const. + ActiveDeadlineSeconds: &activeDeadline, + + SecurityContext: &corev1.PodSecurityContext{ + RunAsNonRoot: &runAsNonRoot, + RunAsUser: &runAsUser, + FSGroup: &runAsUser, // PVC writes need this group + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + + Containers: []corev1.Container{{ + Name: "stage", + Image: image, + // `sleep` keeps the Pod alive long enough for the + // CLI to open an exec stream and tar files in. + // activeDeadlineSeconds caps the worst case; the + // CLI deletes the Pod the moment the stream + // finishes (or fails). + // + // Why not the busybox `sleep infinity` idiom: it + // causes some PSA configurations to flag the Pod + // because the container holds the SIGTERM until + // activeDeadlineSeconds fires (sleep traps signals + // imperfectly). A finite sleep matched to the + // deadline is gentler. + Command: []string{"/bin/sleep", fmt.Sprintf("%d", StagePodActiveDeadline)}, + + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &allowPrivEsc, + Privileged: &privileged, + ReadOnlyRootFilesystem: &readOnlyRootFS, + RunAsNonRoot: &runAsNonRoot, + RunAsUser: &runAsUser, + Capabilities: &corev1.Capabilities{ + // PSA restricted requires ALL capabilities + // dropped, then optionally add back + // NET_BIND_SERVICE. tar doesn't need it. + Drop: []corev1.Capability{"ALL"}, + }, + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + + VolumeMounts: []corev1.VolumeMount{{ + Name: "shared", + MountPath: opts.PVCMountPath, + }, { + // tar needs a writable working dir for its + // temporary state; with ReadOnlyRootFilesystem + // it can't use /tmp on the root FS. An + // emptyDir at /tmp is the standard pattern. + Name: "tmp", + MountPath: "/tmp", + }}, + + // Conservative resource requests: tar of typical + // image_classification data uses <100 MiB RAM and + // negligible CPU. Setting requests lets the + // scheduler place us; setting limits keeps a + // runaway tar (e.g. corrupted archive triggering + // a busy loop) from impacting the cluster. + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("256Mi"), + }, + }, + }}, + + Volumes: []corev1.Volume{{ + Name: "shared", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: opts.PVCClaimName, + }, + }, + }, { + Name: "tmp", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }}, + }, + }, nil +} + +// CreateStagePod creates the Pod in the cluster and returns the +// metadata.name the API server assigned. The returned name is what +// callers pass to WaitForStagePodReady + DeleteStagePod. +// +// Take-name-from-API-response is deliberate: even though +// BuildStagePodSpec pre-computes a name, the API server can in +// principle rewrite it (e.g. via a mutating admission webhook +// renaming with a prefix). Reading back from the response is the +// safe contract. +func CreateStagePod(ctx context.Context, cs kubernetes.Interface, opts PodSpecOptions) (string, error) { + pod, err := BuildStagePodSpec(opts) + if err != nil { + return "", err + } + created, err := cs.CoreV1().Pods(opts.Namespace).Create(ctx, pod, metav1.CreateOptions{}) + if err != nil { + // Most-common failure path: PSA rejects the Pod because + // the customer's namespace policy is stricter than even + // "restricted." We surface the raw API error so the customer + // sees the PSA violation list verbatim — that's the most + // actionable thing. + return "", fmt.Errorf("creating stage Pod in namespace %q: %w", opts.Namespace, err) + } + return created.Name, nil +} + +// WaitForStagePodReady polls until the Pod's status reports +// Ready=True (the canonical "containers started and not yet +// terminated" signal). Times out per StagePodReadyTimeout with a +// diagnostic that tries to help — image-pull failures and +// scheduler back-pressure are the dominant slow paths and have +// distinct symptom strings. +// +// Returns nil on Ready, the pod object on success so callers don't +// have to re-Get for it. +func WaitForStagePodReady(ctx context.Context, cs kubernetes.Interface, namespace, podName string) (*corev1.Pod, error) { + var lastObserved *corev1.Pod + + // Poll every 1s — image-pull failures surface in seconds, and + // the Ready transition itself is instant once kubelet has + // pulled+started. Tighter polling burns API server CPU for no + // benefit; looser polling delays the user. + err := wait.PollUntilContextTimeout(ctx, 1*time.Second, StagePodReadyTimeout, true, + func(ctx context.Context) (bool, error) { + p, err := cs.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + // Distinguish terminal vs transient. Terminal + // errors (Pod deleted out-of-band, RBAC revoked + // to read Pods) MUST short-circuit the poll — + // otherwise the customer waits the full 60s + // timeout for a condition that won't change. + // Bugbot flagged the prior "everything transient" + // version as Medium on PR-b. + if apierrors.IsNotFound(err) || apierrors.IsForbidden(err) { + return false, err + } + // Transient (network blip, brief API unavail). + // Keep polling; last-observed context survives. + return false, nil + } + lastObserved = p + // Positive terminal: Ready=True. + for _, c := range p.Status.Conditions { + if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { + return true, nil + } + } + // Negative terminal: Phase=Failed (container crashed + // at startup — PSA rejection, image crashloop, OOM) + // or Phase=Succeeded (sleep exited unexpectedly — the + // stage container shouldn't terminate by itself before + // we exec into it, so this is also a failure mode). + // Without these checks the poll waits the full 60s + // for a Pod that will never become Ready. Bugbot + // flagged on PR-b round 4 as Medium; this is the + // counterpart to the NotFound/Forbidden short-circuit + // landed in the previous commit. + if p.Status.Phase == corev1.PodFailed || p.Status.Phase == corev1.PodSucceeded { + return false, fmt.Errorf( + "stage Pod %s/%s terminated in phase %q before becoming Ready%s", + namespace, podName, p.Status.Phase, podReadyTimeoutHint(p)) + } + return false, nil + }) + + if err == nil { + return lastObserved, nil + } + + // Differentiate "actual timeout expired" from "poll terminated + // early" (NotFound, Forbidden, Failed phase, ctx-canceled). + // The earlier blanket "did not become Ready within 60s" + // wording was misleading — Bugbot flagged on PR-b round 7. + hint := podReadyTimeoutHint(lastObserved) + if errors.Is(err, context.DeadlineExceeded) { + return nil, fmt.Errorf( + "stage Pod %s/%s did not become Ready within %s%s", + namespace, podName, StagePodReadyTimeout, hint) + } + // Early-exit error (terminal API error, terminal phase, + // SIGINT). Surface it as-is so the customer sees the actual + // cause without the wrong "ran out the timer" framing. + return nil, fmt.Errorf( + "stage Pod %s/%s did not reach Ready state: %w%s", + namespace, podName, err, hint) +} + +// podReadyTimeoutHint extracts the most useful diagnostic from a +// last-observed Pod status. Designed to match against the two +// dominant slow-path scenarios: +// +// 1. Image pull is slow or failed (ImagePullBackOff, +// ErrImagePull) — surface the registry message. +// 2. Pod is unschedulable (PodScheduled=False with reason) — +// surface the scheduler's message. +func podReadyTimeoutHint(p *corev1.Pod) string { + if p == nil { + return " (no Pod status observed — check API server connectivity)" + } + for _, cs := range p.Status.ContainerStatuses { + if cs.State.Waiting != nil && cs.State.Waiting.Reason != "" { + return fmt.Sprintf(" (last container state: %s — %s)", + cs.State.Waiting.Reason, cs.State.Waiting.Message) + } + } + for _, c := range p.Status.Conditions { + if c.Type == corev1.PodScheduled && c.Status == corev1.ConditionFalse { + return fmt.Sprintf(" (scheduling: %s — %s)", c.Reason, c.Message) + } + } + return fmt.Sprintf(" (Pod phase: %s)", p.Status.Phase) +} + +// DeleteStagePod removes the Pod. Called via defer in the orchestrator +// so it runs on both success and failure (including SIGINT, which +// is wired up to cancel the parent context and let the defer fire +// in normal stack unwind). +// +// Uses background propagation + a tiny grace period because we +// don't care about giving sleep a graceful shutdown — we just want +// the Pod gone so the next push doesn't see an orphan. +func DeleteStagePod(ctx context.Context, cs kubernetes.Interface, namespace, podName string) error { + gracePeriod := int64(0) + propagation := metav1.DeletePropagationBackground + err := cs.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{ + GracePeriodSeconds: &gracePeriod, + PropagationPolicy: &propagation, + }) + if err != nil && !apierrors.IsNotFound(err) { + // Not-found is fine: the Pod might have already self-killed + // via activeDeadlineSeconds, or a parallel `kubectl delete` + // got there first. Either way, our goal (no orphan) is + // achieved. Returning the error here would mask the + // upstream cause of failure that triggered the defer. + return fmt.Errorf("deleting stage Pod %s/%s: %w", namespace, podName, err) + } + return nil +} + +// dns1123SafeTableSegment transforms a ValidateTableName-passed +// table name into a Kubernetes-Pod-name-compatible path segment. +// The Pod's full name is then `tracebloc-stage--<8hex>`, +// which must satisfy DNS-1123 subdomain rules: +// +// [a-z0-9]([-a-z0-9]*[a-z0-9])? +// +// (lowercase + digit + hyphen, start/end alphanumeric). +// +// Our input alphabet is [A-Za-z0-9_], so the transform is: +// +// 1. Lowercase (DNS-1123 forbids uppercase) +// 2. Replace '_' with '-' (DNS-1123 forbids underscore) +// 3. Strip leading/trailing hyphens (a name like "_leading" +// would otherwise become "-leading" → tracebloc-stage--leading +// which is OK in the middle but ugly) +// 4. Cap length at 30 chars — Pod names are bounded at 63 total, +// and "tracebloc-stage-" + 8-hex-suffix already consumes ~25 +// of those, leaving ~38 for the segment. 30 gives margin. +// 5. Fallback to "tbl" if the transform leaves an empty string +// (the pathological "_"-only-name case). +func dns1123SafeTableSegment(table string) string { + s := strings.ToLower(strings.ReplaceAll(table, "_", "-")) + s = strings.Trim(s, "-") + if len(s) > 30 { + s = s[:30] + // Truncation could leave a trailing hyphen — re-trim. + s = strings.TrimRight(s, "-") + } + if s == "" { + // Pathological all-underscore name. The label still + // carries the original, so customers can still trace + // orphan-Pod warnings back to their push. + s = "tbl" + } + return s +} + +// randomSuffix returns a hex string of length 2*n. Used to make +// stage Pod names unique across parallel `dataset push` invocations +// for the same table — without this, two CLI runs would race on the +// same Pod name and one would fail the Create with AlreadyExists. +// +// Cryptographic randomness is overkill for collision avoidance +// (8 hex chars = 32 bits of entropy is way more than needed) but +// crypto/rand is the simpler import compared to math/rand which +// needs explicit seeding. +func randomSuffix(n int) (string, error) { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + return "", err + } + return hex.EncodeToString(b), nil +} diff --git a/internal/push/pod_test.go b/internal/push/pod_test.go new file mode 100644 index 0000000..6e58715 --- /dev/null +++ b/internal/push/pod_test.go @@ -0,0 +1,560 @@ +package push + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) + +// stageOpts is the minimal PodSpecOptions for happy-path tests — +// individual cases mutate one field to exercise their specific +// branch. +func stageOpts() PodSpecOptions { + return PodSpecOptions{ + Namespace: "tracebloc", + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Table: "cats_dogs", + ServiceAccountName: "ingestor", + } +} + +// mustBuildStagePod wraps BuildStagePodSpec for tests that don't +// care about the (rare) crypto/rand failure path — failing the +// test if it ever fires is the right escape from "if err != nil" +// noise in every assertion. The dedicated TestBuildStagePodSpec_ +// PropagatesRandError below covers the error path directly. +func mustBuildStagePod(t *testing.T, opts PodSpecOptions) *corev1.Pod { + t.Helper() + p, err := BuildStagePodSpec(opts) + if err != nil { + t.Fatalf("BuildStagePodSpec: %v", err) + } + return p +} + +// TestDNS1123SafeTableSegment is the High-severity regression pin +// for the Bugbot finding on PR-b. Every realistic table name in +// tracebloc docs uses snake_case (cats_dogs_train, chest_xrays_train); +// without the transform these would produce Pod names K8s rejects +// post-pre-flight, which is the worst-of-both-worlds UX. The +// transform's job: take any ValidateTableName-passed name and +// produce a DNS-1123 subdomain-safe segment for the Pod name. +func TestDNS1123SafeTableSegment(t *testing.T) { + cases := []struct { + in, want string + }{ + // Canonical happy path: snake_case → kebab-case. + {"cats_dogs", "cats-dogs"}, + {"chest_xrays_train", "chest-xrays-train"}, + // Uppercase: must lowercase. + {"MyTable", "mytable"}, + {"ABC", "abc"}, + // Already lowercase no-underscore: identity. + {"single", "single"}, + // Mixed: lowercase + underscore → hyphen. + {"Table_123", "table-123"}, + // Edge: leading underscore must be stripped (would + // otherwise produce "-leading", and while the full Pod + // name `tracebloc-stage--leading-` is valid DNS-1123 + // the consecutive hyphens are ugly). + {"_leading_underscore", "leading-underscore"}, + // Edge: trailing underscore stripped. + {"trailing_", "trailing"}, + // Edge: digit-led is valid (DNS-1123 allows it). + {"9starts_with_digit", "9starts-with-digit"}, + // Truncation: input > 30 chars caps at 30, then trims + // any trailing hyphen the cut might have left. + {strings.Repeat("a", 50), strings.Repeat("a", 30)}, + // Pathological: all-underscores → fallback "tbl". + {"_", "tbl"}, + {"___", "tbl"}, + } + for _, c := range cases { + t.Run(c.in, func(t *testing.T) { + got := dns1123SafeTableSegment(c.in) + if got != c.want { + t.Errorf("dns1123SafeTableSegment(%q) = %q, want %q", c.in, got, c.want) + } + // Defensive: the result must satisfy DNS-1123 + // subdomain rules for a Pod-name segment. Cheap + // regex check covers the contract. + if !isDNS1123SafeSegment(got) { + t.Errorf("dns1123SafeTableSegment(%q) = %q, which violates DNS-1123", + c.in, got) + } + }) + } +} + +// isDNS1123SafeSegment is a test-only helper: returns true iff s +// matches the DNS-1123 subdomain segment regex (lowercase alnum + +// hyphen, must start AND end with alnum). Matches what the K8s +// validation library checks for Pod names. +func isDNS1123SafeSegment(s string) bool { + if s == "" { + return false + } + for i, r := range s { + switch { + case r >= 'a' && r <= 'z': + case r >= '0' && r <= '9': + case r == '-': + // Hyphen forbidden at boundaries. + if i == 0 || i == len(s)-1 { + return false + } + default: + return false + } + } + return true +} + +// TestBuildStagePodSpec_Defaults pins the spec fields PR-b's +// post-create logic depends on: the SA name, the PVC mount, the +// activeDeadline, and the labels orphan.go keys off. +func TestBuildStagePodSpec_Defaults(t *testing.T) { + p := mustBuildStagePod(t, stageOpts()) + + if p.Namespace != "tracebloc" { + t.Errorf("Namespace = %q, want tracebloc", p.Namespace) + } + // Pod name uses the DNS-1123-safe transform of the table name: + // "cats_dogs" → "cats-dogs". Bugbot flagged the raw-name version + // on PR-b as High severity (snake_case is the canonical naming + // style in tracebloc docs but K8s Pod names reject underscores). + if !strings.HasPrefix(p.Name, "tracebloc-stage-cats-dogs-") { + t.Errorf("Name = %q, want prefix tracebloc-stage-cats-dogs-", p.Name) + } + if got, wantLen := len(p.Name), len("tracebloc-stage-cats-dogs-")+8; got != wantLen { + t.Errorf("len(Name) = %d, want %d (8-hex-char random suffix)", got, wantLen) + } + if p.Spec.ServiceAccountName != "ingestor" { + t.Errorf("ServiceAccountName = %q, want ingestor", p.Spec.ServiceAccountName) + } + if p.Spec.RestartPolicy != corev1.RestartPolicyNever { + t.Errorf("RestartPolicy = %q, want Never", p.Spec.RestartPolicy) + } + if p.Spec.ActiveDeadlineSeconds == nil || *p.Spec.ActiveDeadlineSeconds != int64(StagePodActiveDeadline) { + t.Errorf("ActiveDeadlineSeconds = %v, want %d", p.Spec.ActiveDeadlineSeconds, StagePodActiveDeadline) + } +} + +// TestStagePodActiveDeadline_CoversFullLifecycle pins the floor on +// activeDeadlineSeconds. The earlier 600s value was too tight: the +// timer starts at Pod CREATION, but image pull (up to 60s) + +// readiness wait (up to 60s) + the worst-case stream (1 GiB at +// 2 MB/s = ~8.5 min) leaves no margin, so the kubelet could +// terminate near-cap pushes mid-transfer. Bugbot flagged as High +// on PR-b round 6. This test pins the floor at 1500s (25 min) so +// a future regression bumping it back near 600 gets caught. +func TestStagePodActiveDeadline_CoversFullLifecycle(t *testing.T) { + const minDeadline = 1500 + if StagePodActiveDeadline < minDeadline { + t.Errorf("StagePodActiveDeadline = %d, want at least %d "+ + "(must cover image-pull + readiness + worst-case 1 GiB stream "+ + "+ margin; activeDeadline starts at Pod creation not stream start)", + StagePodActiveDeadline, minDeadline) + } +} + +// TestBuildStagePodSpec_DefaultImage pins the digest-pinned image — +// if this ever drifts to a tag-only reference, air-gapped customers +// would silently get whatever alpine:3.20 resolved to that day. +func TestBuildStagePodSpec_DefaultImage(t *testing.T) { + p := mustBuildStagePod(t, stageOpts()) + got := p.Spec.Containers[0].Image + if got != DefaultStagePodImage { + t.Errorf("Image = %q, want %q (default)", got, DefaultStagePodImage) + } + if !strings.Contains(got, "@sha256:") { + t.Errorf("Image = %q, want digest-pinned (@sha256:...); air-gapped customers depend on this", got) + } +} + +// TestBuildStagePodSpec_OverrideImage pins the --stage-pod-image +// flag's contract end-to-end at the spec layer. +func TestBuildStagePodSpec_OverrideImage(t *testing.T) { + opts := stageOpts() + opts.Image = "internal-mirror.example.com/alpine:3.20@sha256:abc123" + p := mustBuildStagePod(t, opts) + if got := p.Spec.Containers[0].Image; got != opts.Image { + t.Errorf("Image = %q, want override %q", got, opts.Image) + } +} + +// TestBuildStagePodSpec_LabelsForOrphanScan pins the labels orphan.go +// will key off. If these ever drift, orphan-pod detection silently +// misses leftover Pods from crashed pushes. +func TestBuildStagePodSpec_LabelsForOrphanScan(t *testing.T) { + p := mustBuildStagePod(t, stageOpts()) + wantLabels := map[string]string{ + StagePodManagedByLabel: StagePodManagedByValue, + StagePodComponentLabel: StagePodComponentValue, + StagePodTableLabel: "cats_dogs", + } + for k, want := range wantLabels { + if got := p.Labels[k]; got != want { + t.Errorf("Label %s = %q, want %q", k, got, want) + } + } +} + +// TestBuildStagePodSpec_RestrictedPSA is the security-regression +// pin: every PSA-restricted requirement must hold, otherwise the +// stage Pod gets rejected on hardened namespaces (which is the +// majority of production tracebloc deployments). +func TestBuildStagePodSpec_RestrictedPSA(t *testing.T) { + p := mustBuildStagePod(t, stageOpts()) + + // Pod-level: runAsNonRoot, seccomp RuntimeDefault. + psc := p.Spec.SecurityContext + if psc == nil { + t.Fatal("Pod has no SecurityContext") + } + if psc.RunAsNonRoot == nil || !*psc.RunAsNonRoot { + t.Errorf("Pod.SecurityContext.RunAsNonRoot = %v, want true", psc.RunAsNonRoot) + } + if psc.SeccompProfile == nil || psc.SeccompProfile.Type != corev1.SeccompProfileTypeRuntimeDefault { + t.Errorf("Pod.SeccompProfile = %v, want RuntimeDefault", psc.SeccompProfile) + } + + // Container-level: every PSA restricted constraint. + if len(p.Spec.Containers) != 1 { + t.Fatalf("len(Containers) = %d, want 1", len(p.Spec.Containers)) + } + c := p.Spec.Containers[0] + sc := c.SecurityContext + if sc == nil { + t.Fatal("Container has no SecurityContext") + } + if sc.AllowPrivilegeEscalation == nil || *sc.AllowPrivilegeEscalation { + t.Errorf("AllowPrivilegeEscalation = %v, want false", sc.AllowPrivilegeEscalation) + } + if sc.Privileged == nil || *sc.Privileged { + t.Errorf("Privileged = %v, want false", sc.Privileged) + } + if sc.ReadOnlyRootFilesystem == nil || !*sc.ReadOnlyRootFilesystem { + t.Errorf("ReadOnlyRootFilesystem = %v, want true", sc.ReadOnlyRootFilesystem) + } + if sc.Capabilities == nil || len(sc.Capabilities.Drop) == 0 { + t.Fatalf("Capabilities.Drop = %v, want [ALL]", sc.Capabilities) + } + if sc.Capabilities.Drop[0] != "ALL" { + t.Errorf("Capabilities.Drop = %v, want [ALL]", sc.Capabilities.Drop) + } + if sc.SeccompProfile == nil || sc.SeccompProfile.Type != corev1.SeccompProfileTypeRuntimeDefault { + t.Errorf("Container SeccompProfile = %v, want RuntimeDefault", sc.SeccompProfile) + } +} + +// TestBuildStagePodSpec_PVCMount pins the volume + mountPath that +// the tar stream writes into. If the mount path here ever drifts +// from cluster.SharedPVCMountPath, the tar would write to the +// wrong location and Phase 4's ingestor Job would see "no files." +func TestBuildStagePodSpec_PVCMount(t *testing.T) { + p := mustBuildStagePod(t, stageOpts()) + + // Volume side: the PVC reference. + var foundVol bool + for _, v := range p.Spec.Volumes { + if v.Name == "shared" { + foundVol = true + if v.PersistentVolumeClaim == nil { + t.Errorf("shared volume has no PVC source") + } else if v.PersistentVolumeClaim.ClaimName != "client-pvc" { + t.Errorf("PVC ClaimName = %q, want client-pvc", v.PersistentVolumeClaim.ClaimName) + } + } + } + if !foundVol { + t.Error("Pod has no shared volume") + } + + // Mount side: where the container sees it. + var foundMount bool + for _, m := range p.Spec.Containers[0].VolumeMounts { + if m.Name == "shared" { + foundMount = true + if m.MountPath != "/data/shared" { + t.Errorf("MountPath = %q, want /data/shared", m.MountPath) + } + } + } + if !foundMount { + t.Error("stage container has no shared mount") + } +} + +// TestBuildStagePodSpec_TmpEmptyDir pins the writable /tmp emptyDir +// that tar needs (since the root FS is read-only). Without this, +// tar would fail to create its working state and the stream would +// die mid-transfer. +func TestBuildStagePodSpec_TmpEmptyDir(t *testing.T) { + p := mustBuildStagePod(t, stageOpts()) + var foundEmptyDir, foundMount bool + for _, v := range p.Spec.Volumes { + if v.Name == "tmp" { + foundEmptyDir = true + if v.EmptyDir == nil { + t.Error("tmp volume is not an EmptyDir") + } + } + } + for _, m := range p.Spec.Containers[0].VolumeMounts { + if m.Name == "tmp" && m.MountPath == "/tmp" { + foundMount = true + } + } + if !foundEmptyDir || !foundMount { + t.Errorf("tmp emptyDir mount missing (vol=%v, mount=%v)", foundEmptyDir, foundMount) + } +} + +// TestBuildStagePodSpec_RandomSuffixCollisionAvoidance: two specs +// built back-to-back must have distinct names so parallel pushes +// don't race on Create. +func TestBuildStagePodSpec_RandomSuffixCollisionAvoidance(t *testing.T) { + a := mustBuildStagePod(t, stageOpts()) + b := mustBuildStagePod(t, stageOpts()) + if a.Name == b.Name { + t.Errorf("back-to-back BuildStagePodSpec produced identical name %q; "+ + "random-suffix collision avoidance is broken", a.Name) + } +} + +// TestCreateStagePod_HappyPath: the fake clientset accepts a Create +// and returns the created object. We pin that CreateStagePod +// surfaces the assigned name back to the caller. +func TestCreateStagePod_HappyPath(t *testing.T) { + cs := fake.NewClientset() + name, err := CreateStagePod(context.Background(), cs, stageOpts()) + if err != nil { + t.Fatalf("CreateStagePod: %v", err) + } + if !strings.HasPrefix(name, "tracebloc-stage-cats-dogs-") { + t.Errorf("returned name = %q, want prefix tracebloc-stage-cats-dogs-", name) + } + // Cross-check: the Pod actually exists in the fake cluster. + if _, err := cs.CoreV1().Pods("tracebloc").Get(context.Background(), name, metav1.GetOptions{}); err != nil { + t.Errorf("Pod not found after CreateStagePod: %v", err) + } +} + +// TestCreateStagePod_APIErrorSurfaces: PSA rejections, RBAC denials, +// etc. surface verbatim so the customer sees the actionable cluster- +// side message. +func TestCreateStagePod_APIErrorSurfaces(t *testing.T) { + cs := fake.NewClientset() + cs.PrependReactor("create", "pods", + func(_ k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, apierrors.NewForbidden( + corev1.Resource("pods"), "", + errors.New("user cannot create pods")) + }) + + _, err := CreateStagePod(context.Background(), cs, stageOpts()) + if err == nil { + t.Fatal("CreateStagePod returned nil error on forbidden Create") + } + if !strings.Contains(err.Error(), "creating stage Pod") { + t.Errorf("error missing CLI framing: %v", err) + } +} + +// TestWaitForStagePodReady_HappyPath: a Pod that immediately reports +// Ready=True passes through. +func TestWaitForStagePodReady_HappyPath(t *testing.T) { + cs := fake.NewClientset(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "tracebloc-stage-cats_dogs-abc12345", + Namespace: "tracebloc", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{{ + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }}, + }, + }) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + p, err := WaitForStagePodReady(ctx, cs, "tracebloc", "tracebloc-stage-cats_dogs-abc12345") + if err != nil { + t.Fatalf("WaitForStagePodReady: %v", err) + } + if p == nil { + t.Fatal("returned nil Pod on success") + } +} + +// TestWaitForStagePodReady_TimeoutHint: when the Pod is stuck (e.g. +// ImagePullBackOff), the timeout error should surface the waiting- +// state reason from container statuses. This is the dominant slow +// path — air-gapped customers without the right pull secret. +func TestWaitForStagePodReady_TimeoutHint(t *testing.T) { + cs := fake.NewClientset(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "stuck-pod", + Namespace: "tracebloc", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "stage", + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ImagePullBackOff", + Message: "Back-off pulling image \"alpine:3.20@sha256:...\"", + }, + }, + }}, + }, + }) + + // Tight ctx timeout so the test doesn't actually wait 60s. + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + _, err := WaitForStagePodReady(ctx, cs, "tracebloc", "stuck-pod") + if err == nil { + t.Fatal("WaitForStagePodReady returned nil on stuck Pod") + } + if !strings.Contains(err.Error(), "ImagePullBackOff") { + t.Errorf("error missing ImagePullBackOff hint: %v", err) + } +} + +// TestWaitForStagePodReady_NotFoundIsTerminal: if the Pod gets +// deleted out-of-band mid-wait (admin cleanup, parallel test +// teardown, etc.), the poll must short-circuit instead of waiting +// the full timeout. Bugbot flagged the prior "everything transient" +// behavior as Medium on PR-b. +func TestWaitForStagePodReady_NotFoundIsTerminal(t *testing.T) { + cs := fake.NewClientset() // empty — Get returns NotFound + + // Generous overall ctx; the test should return WAY before + // StagePodReadyTimeout (60s) because NotFound terminates the + // poll immediately. If the fix regresses, this test takes the + // full 60s and gets caught by go test's -timeout. + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + start := time.Now() + _, err := WaitForStagePodReady(ctx, cs, "tracebloc", "ghost-pod") + elapsed := time.Since(start) + if err == nil { + t.Fatal("WaitForStagePodReady returned nil on NotFound") + } + if elapsed > 3*time.Second { + t.Errorf("WaitForStagePodReady waited %s for NotFound; expected immediate return", elapsed) + } +} + +// TestWaitForStagePodReady_ForbiddenIsTerminal: same as NotFound but +// for RBAC denial — the customer's kubeconfig might lose `get pods` +// permission mid-push (token rotation, RBAC change). The poll must +// surface that immediately, not spin until timeout. +func TestWaitForStagePodReady_ForbiddenIsTerminal(t *testing.T) { + cs := fake.NewClientset() + cs.PrependReactor("get", "pods", + func(_ k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, apierrors.NewForbidden( + corev1.Resource("pods"), "test-pod", + errors.New("user cannot get pods")) + }) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + start := time.Now() + _, err := WaitForStagePodReady(ctx, cs, "tracebloc", "test-pod") + elapsed := time.Since(start) + if err == nil { + t.Fatal("WaitForStagePodReady returned nil on Forbidden") + } + if elapsed > 3*time.Second { + t.Errorf("WaitForStagePodReady waited %s for Forbidden; expected immediate return", elapsed) + } +} + +// TestWaitForStagePodReady_FailedPhaseIsTerminal: a Pod that +// crashes at startup (PSA rejection, ImagePullBackOff that escalates +// to ErrImagePull, OOMKilled during container start) lands in +// Phase=Failed and will never become Ready. The poll must +// short-circuit instead of waiting the full 60s. Bugbot flagged +// the missing check as Medium on PR-b round 4. +func TestWaitForStagePodReady_FailedPhaseIsTerminal(t *testing.T) { + cs := fake.NewClientset(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "crashed-pod", + Namespace: "tracebloc", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "stage", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + Reason: "OOMKilled", + ExitCode: 137, + }, + }, + }}, + }, + }) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + start := time.Now() + _, err := WaitForStagePodReady(ctx, cs, "tracebloc", "crashed-pod") + elapsed := time.Since(start) + if err == nil { + t.Fatal("WaitForStagePodReady returned nil on Phase=Failed") + } + if elapsed > 3*time.Second { + t.Errorf("WaitForStagePodReady waited %s for Phase=Failed; expected immediate return", elapsed) + } + if !strings.Contains(err.Error(), "Failed") { + t.Errorf("error missing Phase=Failed signal: %v", err) + } +} + +// TestDeleteStagePod_NotFoundIsOK: the Pod might be gone already +// (activeDeadlineSeconds fired, or someone kubectl-deleted it). +// Not-found shouldn't error — our goal is "Pod doesn't exist," which +// is satisfied. +func TestDeleteStagePod_NotFoundIsOK(t *testing.T) { + cs := fake.NewClientset() // empty + if err := DeleteStagePod(context.Background(), cs, "tracebloc", "nope"); err != nil { + t.Errorf("DeleteStagePod on absent Pod = %v, want nil", err) + } +} + +// TestDeleteStagePod_HappyPath: delete a real Pod, confirm it's gone. +func TestDeleteStagePod_HappyPath(t *testing.T) { + cs := fake.NewClientset(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "to-delete", Namespace: "tracebloc"}, + }) + if err := DeleteStagePod(context.Background(), cs, "tracebloc", "to-delete"); err != nil { + t.Fatalf("DeleteStagePod: %v", err) + } + _, err := cs.CoreV1().Pods("tracebloc").Get(context.Background(), "to-delete", metav1.GetOptions{}) + if !apierrors.IsNotFound(err) { + t.Errorf("Pod still exists after delete (err=%v)", err) + } +} diff --git a/internal/push/progress.go b/internal/push/progress.go new file mode 100644 index 0000000..f5df7b3 --- /dev/null +++ b/internal/push/progress.go @@ -0,0 +1,132 @@ +package push + +import ( + "io" + "os" + + "github.com/schollz/progressbar/v3" + "golang.org/x/term" +) + +// Progress is the narrow interface push.Stream + the stage +// orchestrator use to surface per-byte transfer progress to the +// customer. The real implementation wraps schollz/progressbar/v3 +// (NewTTYProgress); tests pass a no-op (NoOpProgress) since CI +// runs aren't TTYs and decoupling the test layer from any +// progressbar internals keeps test output clean. +// +// The interface is deliberately byte-oriented (not file-count +// oriented). Tar header bytes flow through too — they're a few +// hundred bytes per file, which makes the progress bar slightly +// over-count the "useful" bytes, but they're real bytes on the +// wire that the customer pays for anyway. The alternative +// (counting only file-body bytes) requires the progress sink to +// know what a tar header looks like, which couples it to the +// streaming format. +type Progress interface { + // Add records that n bytes have been transferred. Safe to + // call from any goroutine. + Add(n int) + + // Finish marks the transfer done. Calling Finish before all + // bytes have been added is fine — the bar just won't fill to + // 100%, which is a useful visual signal that something cut + // the stream short. + Finish() +} + +// NewProgress returns a Progress sink appropriate for the given +// output writer: +// +// - If out is a TTY (interactive terminal), return a +// progressbar/v3 that renders an animated bar with bytes/sec +// - ETA + percentage. +// - Otherwise return a no-op (CI, redirected output, piped to +// a file). A spinning bar in a CI log is noise, not a +// feature. +// +// totalBytes is the expected transfer size (from LocalLayout's +// TotalBytes plus a small tar-overhead margin). The bar normalizes +// against this; if actual transferred bytes overshoot, the bar +// caps at 100% rather than wrapping. +// +// `description` shows up to the left of the bar; "Staging cats_dogs" +// is a typical value. +func NewProgress(out io.Writer, totalBytes int64, description string) Progress { + if !isTTY(out) { + return NoOpProgress{} + } + return &ttyProgress{ + bar: progressbar.NewOptions64(totalBytes, + progressbar.OptionSetWriter(out), + progressbar.OptionSetDescription(description), + progressbar.OptionShowBytes(true), + progressbar.OptionShowCount(), + progressbar.OptionThrottle(100), // ms — keep CPU low + progressbar.OptionSetRenderBlankState(true), + progressbar.OptionSetWidth(40), + progressbar.OptionClearOnFinish(), + ), + } +} + +// isTTY reports whether out points at a real terminal. We test for +// *os.File AND golang.org/x/term IsTerminal — a *bytes.Buffer in +// tests isn't *os.File so we return false fast. Wrappers like +// io.MultiWriter that aren't *os.File also return false, which is +// the conservative choice. +func isTTY(out io.Writer) bool { + f, ok := out.(*os.File) + if !ok { + return false + } + return term.IsTerminal(int(f.Fd())) +} + +// ttyProgress is the real-bar implementation. The progressbar +// library is itself concurrency-safe under Add(), so the wrapper +// can be a thin pass-through. +type ttyProgress struct { + bar *progressbar.ProgressBar +} + +func (p *ttyProgress) Add(n int) { + // Explicit-discard the error — progressbar's Add returns an + // error if the write to the underlying writer fails, which we + // can't do anything about mid-stream (the customer's terminal + // going away during a push doesn't affect the staging + // outcome). Same rationale as the Fprintf discards elsewhere. + _ = p.bar.Add(n) +} + +func (p *ttyProgress) Finish() { + _ = p.bar.Finish() +} + +// NoOpProgress satisfies Progress without doing anything. Used in +// tests and when output is non-TTY. Exported so tests in OTHER +// packages (internal/cli) can also use it. +type NoOpProgress struct{} + +func (NoOpProgress) Add(_ int) {} +func (NoOpProgress) Finish() {} + +// progressWriter is an io.Writer that funnels write counts into a +// Progress. Used to wrap the stdin pipe-writer so every byte +// piped to the exec stream counts toward the bar. +// +// Not exported — it's an implementation detail of Stream. Tests +// validate the contract end-to-end (transferred bytes match +// expected) rather than poking the wrapper directly. +type progressWriter struct { + w io.Writer + p Progress +} + +func (pw *progressWriter) Write(b []byte) (int, error) { + n, err := pw.w.Write(b) + if n > 0 { + pw.p.Add(n) + } + return n, err +} diff --git a/internal/push/spec.go b/internal/push/spec.go index ec727b0..8aabb0a 100644 --- a/internal/push/spec.go +++ b/internal/push/spec.go @@ -40,18 +40,33 @@ import ( // table-naming style anyway. var tableNamePattern = regexp.MustCompile(`^[A-Za-z0-9_]+$`) +// MaxTableNameLength caps `--table` at 63 chars. Two hard limits +// agree on this: +// +// 1. MySQL identifier limit: 64 chars (MySQL Reference 9.2 Schema +// Object Names). We use 63 to leave one char of headroom for +// any future "ingestion_run_id" suffix the chart might want to +// append. +// 2. Kubernetes label value limit: 63 chars (DNS-1123 label rules). +// The stage Pod's tracebloc.io/table label carries the +// untransformed table name verbatim — a longer value fails Pod +// creation post-pre-flight, which Bugbot flagged on PR-b +// round 5. +// +// 63 is a coincidence that lets both checks share the same constant. +const MaxTableNameLength = 63 + // ValidateTableName rejects table names that aren't safe as both a // MySQL identifier and a single PVC path segment. // // Why a CLI-side check rather than the schema: the embedded // ingest.v1.json only enforces `minLength: 1` on `table` — no -// `pattern`. Without this guard, --table=../../etc would flow into -// the /data/shared/
/ PVC path; PR-b's stage Pod would then -// write outside the intended subtree and could clobber another -// table's data. Tightening the upstream schema with a `pattern` -// is the proper long-term fix (it would protect the helm flow + -// jobs-manager too) but needs a change to tracebloc/data-ingestors' -// schema, which the schema-drift CI check pins — filed as +// `pattern`, no `maxLength`. Without this guard, --table=../../etc +// would flow into the /data/shared/
/ PVC path, and a 100-char +// name would make the Pod-label assignment fail post-pre-flight. +// Tightening the upstream schema is the proper long-term fix +// (it would protect the helm flow + jobs-manager too) but needs +// a change to tracebloc/data-ingestors' schema — filed as // tracebloc/data-ingestors#116. Once that lands and we re-sync, // this guard can collapse to a thin "schema says so" wrapper. // @@ -61,6 +76,14 @@ func ValidateTableName(table string) error { if table == "" { return fmt.Errorf("table name is required (set --table)") } + if len(table) > MaxTableNameLength { + return fmt.Errorf( + "table name is %d characters; the max is %d "+ + "(matches both the MySQL identifier limit and the "+ + "Kubernetes label-value limit, which the stage Pod's "+ + "tracebloc.io/table label is bound by). Use a shorter name.", + len(table), MaxTableNameLength) + } if !tableNamePattern.MatchString(table) { return fmt.Errorf( "table name %q is invalid: must match [A-Za-z0-9_]+ "+ diff --git a/internal/push/spec_test.go b/internal/push/spec_test.go index 8f19716..05c7961 100644 --- a/internal/push/spec_test.go +++ b/internal/push/spec_test.go @@ -1,6 +1,7 @@ package push import ( + "strings" "testing" "gopkg.in/yaml.v3" @@ -126,6 +127,24 @@ func TestValidateTableName_Accepts(t *testing.T) { } } +// TestValidateTableName_RejectsTooLong: K8s label values are +// capped at 63 chars, and the stage Pod carries the raw table +// name as the tracebloc.io/table label. Without this rejection, +// a 100-char table name passes pre-flight then fails Pod create +// with an opaque label-validation error. Bugbot flagged on PR-b +// round 5. +func TestValidateTableName_RejectsTooLong(t *testing.T) { + tooLong := strings.Repeat("a", MaxTableNameLength+1) + if err := ValidateTableName(tooLong); err == nil { + t.Fatalf("ValidateTableName(%d-char name) = nil, want length-cap error", len(tooLong)) + } + // Right at the boundary: 63 chars must be accepted. + atCap := strings.Repeat("a", MaxTableNameLength) + if err := ValidateTableName(atCap); err != nil { + t.Errorf("ValidateTableName(%d-char name) = %v, want nil (at exact cap)", len(atCap), err) + } +} + // TestValidateTableName_RejectsUnsafe is the security-regression // pin. The path-traversal cases (../../etc, ../foo) are the ones // Bugbot flagged on PR #8 — if this test ever goes green with diff --git a/internal/push/stage.go b/internal/push/stage.go new file mode 100644 index 0000000..f8e9472 --- /dev/null +++ b/internal/push/stage.go @@ -0,0 +1,153 @@ +package push + +import ( + "context" + "fmt" + "io" + "time" + + "k8s.io/client-go/kubernetes" +) + +// StageOptions packages every dependency Stage needs into a single +// struct so callers (cli.runDatasetPush) can build it incrementally +// without juggling 10 positional args. +// +// Required fields are pinned in the doc; the nil-able Progress + Out +// fields get default behaviors (NoOpProgress / io.Discard) so the +// happy path is short. +type StageOptions struct { + // Client is the discovered kubernetes.Interface. + Client kubernetes.Interface + + // Executor is how the tar stream actually reaches the Pod. + // Production: a *SPDYExecutor backed by the rest.Config. + // Tests: a fakeExecutor (see stream_test.go). + Executor Executor + + // Namespace + IngestorSAName come from Phase 2's parent-release + // discovery (with optional --ingestor-sa override). + Namespace string + IngestorSAName string + + // PVCClaimName + PVCMountPath come from Phase 3 PR-a's PVC + // discovery (always "client-pvc" + "/data/shared" today). + PVCClaimName string + PVCMountPath string + + // Layout is the validated local files-to-stage. + Layout *LocalLayout + + // Table is the destination table name. MUST have already + // passed ValidateTableName. + Table string + + // StagePodImage overrides the alpine default. Empty = default. + StagePodImage string + + // Progress is the customer-facing transfer bar. Nil = no-op + // (use this in tests / non-TTY output). + Progress Progress + + // Out is where Stage prints non-progress diagnostic output + // (orphan warnings, "Created stage Pod X" etc.). Nil = io.Discard. + // Progress bar output is separate — schollz writes to its own + // configured writer (typically the same one). + Out io.Writer +} + +// Stage is Phase 3 PR-b's top-level entrypoint. Does the full +// dance: orphan scan → create stage Pod → wait Ready → tar-over- +// exec stream → defer-delete (SIGINT-safe). +// +// On success, the layout's files are on the cluster's PVC at +// /data/shared/
/{labels.csv, images/...} and the stage Pod +// has been cleaned up. Caller proceeds to Phase 4's submit step. +// +// On error, the deferred cleanup still fires — no orphan Pod left +// behind even if the stream fails partway. Phase 4's idempotency +// key handles the "retry the whole push" scenario; for v0.1 we +// don't try to resume a partial transfer. +// +// SIGINT handling: the caller is expected to wire a signal handler +// that cancels the passed-in ctx (cli/main.go pattern). When ctx +// is cancelled, the in-flight Exec stream returns ctx.Err(), the +// deferred DeleteStagePod runs with a FRESH context (so it can +// reach the API even after our parent ctx is cancelled), and the +// error propagates with the SIGINT signal preserved. +func Stage(ctx context.Context, opts StageOptions) error { + if opts.Out == nil { + opts.Out = io.Discard + } + if opts.Progress == nil { + opts.Progress = NoOpProgress{} + } + + // 1. Orphan scan — best-effort. We log either the warning or + // the scan-failure and proceed regardless. Customers who + // care about orphans get to see them; customers whose RBAC + // forbids List Pods just don't get the warning (no impact + // on the push itself). + if orphans, err := FindOrphanStagePods(ctx, opts.Client, opts.Namespace); err != nil { + _, _ = fmt.Fprintf(opts.Out, "(orphan-scan skipped: %v)\n", err) + } else if warning := FormatOrphansWarning(orphans); warning != "" { + _, _ = fmt.Fprint(opts.Out, warning) + } + + // 2. Create the stage Pod. From here on, cleanup matters. + podName, err := CreateStagePod(ctx, opts.Client, PodSpecOptions{ + Namespace: opts.Namespace, + PVCClaimName: opts.PVCClaimName, + PVCMountPath: opts.PVCMountPath, + Table: opts.Table, + Image: opts.StagePodImage, + ServiceAccountName: opts.IngestorSAName, + }) + if err != nil { + return err + } + _, _ = fmt.Fprintf(opts.Out, "Created stage Pod %s/%s\n", opts.Namespace, podName) + + // 3. Defer cleanup. The deferred call uses a FRESH context with + // its own deadline — if the parent ctx is cancelled (SIGINT, + // timeout), we still want the Pod deleted. Without this, a + // Ctrl-C right after pod-create would leak an orphan. + // + // 30s deadline on the cleanup is a safety net for the case + // where the API server itself is unreachable — better to + // print "failed to clean up stage Pod" than hang forever. + defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if delErr := DeleteStagePod(cleanupCtx, opts.Client, opts.Namespace, podName); delErr != nil { + _, _ = fmt.Fprintf(opts.Out, + "WARNING: failed to delete stage Pod %s/%s (it will be activeDeadline-killed in <=%ds): %v\n", + opts.Namespace, podName, StagePodActiveDeadline, delErr) + } + }() + + // 4. Wait for the Pod to be Ready (containers up, ready to + // accept exec). Times out at StagePodReadyTimeout with + // diagnostic hints from container statuses (image pull, + // scheduling) when we have them. + _, _ = fmt.Fprintf(opts.Out, "Waiting for stage Pod to be Ready (timeout %s)...\n", StagePodReadyTimeout) + if _, err := WaitForStagePodReady(ctx, opts.Client, opts.Namespace, podName); err != nil { + return err + } + + // 5. Stream the tar. This is where actual bytes flow. The + // progress bar (if TTY) renders during this call. + _, _ = fmt.Fprintf(opts.Out, "Streaming %d files (%s) to %s...\n", + 1+len(opts.Layout.Images), HumanBytes(opts.Layout.TotalBytes), StagedPrefix(opts.Table)) + + if err := StreamLayout(ctx, opts.Executor, + opts.Namespace, podName, "stage", + opts.Layout, opts.Table, opts.Progress); err != nil { + return err + } + + // 6. Print "done" message. The deferred cleanup runs after this. + _, _ = fmt.Fprintf(opts.Out, "Staged %d files to %s\n", + 1+len(opts.Layout.Images), StagedPrefix(opts.Table)) + return nil +} diff --git a/internal/push/stage_test.go b/internal/push/stage_test.go new file mode 100644 index 0000000..b46e0cb --- /dev/null +++ b/internal/push/stage_test.go @@ -0,0 +1,288 @@ +package push + +import ( + "bytes" + "context" + "errors" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) + +// readyOnNextGet adds a reactor that marks any new stage Pod as +// Ready=True the moment after it's created. Without this, the fake +// clientset creates the Pod in default (empty) status and +// WaitForStagePodReady spins until its timeout. We can't easily +// patch the Pod from inside StageOptions construction, so the +// reactor is the standard pattern. +func readyOnNextGet(cs *fake.Clientset) { + cs.PrependReactor("get", "pods", + func(action k8stesting.Action) (bool, runtime.Object, error) { + ga, ok := action.(k8stesting.GetAction) + if !ok { + return false, nil, nil + } + // Let the existing tracker handle the get to find the + // real object, then patch it before returning. The + // safest way: do the get ourselves, then synthesize a + // Ready condition onto the result. + pod, err := cs.Tracker().Get(action.GetResource(), action.GetNamespace(), ga.GetName()) + if err != nil { + return false, nil, nil + } + p := pod.(*corev1.Pod) + p.Status.Phase = corev1.PodRunning + p.Status.Conditions = []corev1.PodCondition{{ + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }} + return true, p, nil + }) +} + +// TestStage_HappyPath: orphan scan (empty) → create → wait ready +// → stream → delete. The fake clientset + fake executor let us +// verify the full orchestration without a real cluster. +func TestStage_HappyPath(t *testing.T) { + root := imgcDir(t) + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + + cs := fake.NewClientset() + readyOnNextGet(cs) + fe := &fakeExecutor{} + var out bytes.Buffer + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + err = Stage(ctx, StageOptions{ + Client: cs, + Executor: fe, + Namespace: "tracebloc", + IngestorSAName: "ingestor", + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Layout: layout, + Table: "cats_dogs", + Out: &out, + }) + if err != nil { + t.Fatalf("Stage: %v\nout:\n%s", err, out.String()) + } + + // Diagnostic output: customer should see the lifecycle + // breadcrumbs. Pin a few key phrases so a regression that + // silences them is caught. + for _, want := range []string{"Created stage Pod", "Waiting for stage Pod", "Streaming", "Staged"} { + if !strings.Contains(out.String(), want) { + t.Errorf("output missing %q:\n%s", want, out.String()) + } + } + + // Executor must have been called with the right ns + a tar + // stream on stdin (assert by inspecting captured tar). + if fe.gotNS != "tracebloc" { + t.Errorf("executor ns = %q, want tracebloc", fe.gotNS) + } + if len(fe.gotStdin) == 0 { + t.Error("executor stdin was empty; expected tar archive bytes") + } +} + +// TestStage_CleanupRunsOnError: deferred DeleteStagePod must fire +// even when StreamLayout fails. Otherwise every failed push leaks +// an orphan Pod that can't be reused for retry (the random suffix +// avoids name collisions, but it's still cluster pollution). +func TestStage_CleanupRunsOnError(t *testing.T) { + root := imgcDir(t) + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + + cs := fake.NewClientset() + readyOnNextGet(cs) + fe := &fakeExecutor{ + errToReturn: errors.New("simulated stream failure"), + drainBeforeReturn: true, + } + var out bytes.Buffer + + err = Stage(context.Background(), StageOptions{ + Client: cs, + Executor: fe, + Namespace: "tracebloc", + IngestorSAName: "ingestor", + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Layout: layout, + Table: "t1", + Out: &out, + }) + if err == nil { + t.Fatal("Stage returned nil on stream failure") + } + + // Cleanup contract: NO stage Pods should remain after Stage + // returns. The list-by-label query is the same one orphan.go + // uses, so if any leaks it's user-visible in the next push. + pods, listErr := cs.CoreV1().Pods("tracebloc").List(context.Background(), + metav1.ListOptions{LabelSelector: StagePodManagedByLabel + "=" + StagePodManagedByValue}) + if listErr != nil { + t.Fatalf("post-Stage list: %v", listErr) + } + if len(pods.Items) != 0 { + var names []string + for _, p := range pods.Items { + names = append(names, p.Name) + } + t.Errorf("Stage leaked %d Pod(s) after stream failure: %v", len(pods.Items), names) + } +} + +// TestStage_OrphanWarningSurfaces: stale stage Pods from a +// previous crash get surfaced as a warning at the start of the +// next push. Validates the integration: orphan scan → format → +// print to Out. +func TestStage_OrphanWarningSurfaces(t *testing.T) { + root := imgcDir(t) + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + + cs := fake.NewClientset(stagePodWithAge("stale-from-crash", "cats_dogs", 30*time.Minute)) + readyOnNextGet(cs) + fe := &fakeExecutor{} + var out bytes.Buffer + + err = Stage(context.Background(), StageOptions{ + Client: cs, + Executor: fe, + Namespace: "tracebloc", + IngestorSAName: "ingestor", + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Layout: layout, + Table: "new_push", + Out: &out, + }) + if err != nil { + t.Fatalf("Stage: %v\nout:\n%s", err, out.String()) + } + + if !strings.Contains(out.String(), "stale-from-crash") { + t.Errorf("orphan warning didn't surface; output:\n%s", out.String()) + } + if !strings.Contains(out.String(), "kubectl delete pod") { + t.Errorf("orphan warning missing actionable hint; output:\n%s", out.String()) + } +} + +// TestStage_IngestorSANameFlowsToPod: the --ingestor-sa override +// MUST land on the stage Pod's ServiceAccountName — otherwise the +// flag is silently ignored and customers who renamed the chart's +// ingestor SA get pods running as the default SA (no PVC write +// access). Pin the integration here at the Stage layer since the +// flag wiring's effect is only observable end-to-end. Bugbot +// flagged the missing test coverage on PR-a. +func TestStage_IngestorSANameFlowsToPod(t *testing.T) { + root := imgcDir(t) + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + + cs := fake.NewClientset() + readyOnNextGet(cs) + fe := &fakeExecutor{} + var out bytes.Buffer + + const customSA = "my-renamed-ingestor" + _ = Stage(context.Background(), StageOptions{ + Client: cs, + Executor: fe, + Namespace: "tracebloc", + IngestorSAName: customSA, + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Layout: layout, + Table: "t1", + Out: &out, + }) + + // List the stage Pods created (we deleted them in Stage's + // defer, so we need to inspect the tracker BEFORE the test + // process finishes — but the cleanup happens within Stage). + // Workaround: assert via the fake clientset's action log — + // look at the Create action's object directly. + var seenSA string + for _, action := range cs.Actions() { + if action.GetVerb() == "create" && action.GetResource().Resource == "pods" { + pod := action.(k8stesting.CreateAction).GetObject().(*corev1.Pod) + seenSA = pod.Spec.ServiceAccountName + break + } + } + if seenSA != customSA { + t.Errorf("created Pod's ServiceAccountName = %q, want %q (the --ingestor-sa override)", + seenSA, customSA) + } +} + +// TestStage_CancelledContext_StillCleansUp: when the parent ctx +// is cancelled (SIGINT scenario), the deferred cleanup MUST still +// run — using its own context with a fresh deadline. Without +// this, every Ctrl-C leaves an orphan Pod. +func TestStage_CancelledContext_StillCleansUp(t *testing.T) { + root := imgcDir(t) + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + + cs := fake.NewClientset() + readyOnNextGet(cs) + fe := &fakeExecutor{ + // Simulate the SPDY stream returning ctx.Err() because the + // caller cancelled mid-stream. + errToReturn: context.Canceled, + } + var out bytes.Buffer + + // Use a context we cancel immediately. Stage's create still + // succeeds (fake clientset doesn't honor ctx on Create), but + // the executor sees Canceled. + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _ = Stage(ctx, StageOptions{ + Client: cs, + Executor: fe, + Namespace: "tracebloc", + IngestorSAName: "ingestor", + PVCClaimName: "client-pvc", + PVCMountPath: "/data/shared", + Layout: layout, + Table: "interrupted", + Out: &out, + }) + + // Whether Stage returns an error or not, the cluster MUST be + // clean. This is the SIGINT-safety contract. + pods, _ := cs.CoreV1().Pods("tracebloc").List(context.Background(), + metav1.ListOptions{LabelSelector: StagePodManagedByLabel + "=" + StagePodManagedByValue}) + if len(pods.Items) != 0 { + t.Errorf("Stage leaked %d Pod(s) on context-cancel; SIGINT cleanup contract broken", + len(pods.Items)) + } +} diff --git a/internal/push/stream.go b/internal/push/stream.go new file mode 100644 index 0000000..baba3cb --- /dev/null +++ b/internal/push/stream.go @@ -0,0 +1,462 @@ +package push + +import ( + "archive/tar" + "bytes" + "context" + "errors" + "fmt" + "io" + "os" + "path" + "path/filepath" + + corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" +) + +// Executor abstracts "exec a command in a Pod with stdin/stdout/ +// stderr." The interface exists for one reason: client-go's +// fake.Clientset (the cornerstone of every test in this package) +// doesn't support the exec subresource. Without an interface +// boundary here, Stream's lifecycle couldn't be unit-tested — +// every assertion would need a real cluster. +// +// Production uses SPDYExecutor (this file). Tests construct a +// FakeExecutor (stream_test.go) that captures stdin into a buffer +// + records the command + can return a synthetic error. +type Executor interface { + // Exec runs cmd in the named container of ns/pod, wiring + // stdin/stdout/stderr to the provided streams. Returns when + // the remote command terminates or the context is cancelled. + // + // stdin, stdout, stderr may be nil — Exec must tolerate that + // (a stage Pod with no stderr to capture is a legitimate + // case for the "just delete me" command after the tar is + // done). + Exec(ctx context.Context, ns, pod, container string, + cmd []string, stdin io.Reader, stdout, stderr io.Writer) error +} + +// SPDYExecutor is the production Executor. Wraps client-go's +// remotecommand.NewSPDYExecutor — the same machinery `kubectl exec` +// uses internally. +// +// Holds both Config (needed by NewSPDYExecutor for auth/transport) +// and Client (for building the exec URL via the REST client). Both +// come from cluster.Load + cluster.NewClientset. +type SPDYExecutor struct { + Config *rest.Config + Client kubernetes.Interface +} + +func (e *SPDYExecutor) Exec( + ctx context.Context, + ns, pod, container string, + cmd []string, + stdin io.Reader, stdout, stderr io.Writer, +) error { + // Build the exec URL the way kubectl does — POST against the + // /pods//exec subresource, parametrized via PodExecOptions + // encoded through the scheme's ParameterCodec. + req := e.Client.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(pod). + Namespace(ns). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: container, + Command: cmd, + // Toggle the streams that the caller actually wants. + // PodExecOptions ignores the unused ones, but being + // explicit here matches kubectl's behavior and means + // "stdin requested but caller passed nil" surfaces as + // a clearer error from the executor. + Stdin: stdin != nil, + Stdout: stdout != nil, + Stderr: stderr != nil, + }, scheme.ParameterCodec) + + exec, err := remotecommand.NewSPDYExecutor(e.Config, "POST", req.URL()) + if err != nil { + return fmt.Errorf("creating SPDY executor for %s/%s: %w", ns, pod, err) + } + + err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdin: stdin, + Stdout: stdout, + Stderr: stderr, + // Tty=false: we're not running an interactive shell, we're + // piping bytes. With Tty=true the tar stream would get + // line-buffered/CR-translated/etc, which corrupts the + // archive. + Tty: false, + }) + if err != nil { + return fmt.Errorf("exec stream against %s/%s: %w", ns, pod, err) + } + return nil +} + +// StreamLayout pipes a tar of the layout's files into the stage +// Pod's `tar -xf - -C /data/shared/
` so the files land at +// /data/shared/
/{labels.csv, images/...}. +// +// On success, the stage Pod's PVC mount now contains the files +// jobs-manager's ingestor will read in Phase 4. +// +// On failure, the partial-write state on the PVC is whatever tar +// managed before the error — possibly a few image files but +// likely not all. Phase 4's idempotency_key handles "retry the +// whole push"; for v0.1, customers re-run the command and the +// shell's `mkdir -p` is idempotent so re-creating the destination +// is fine. Truly-recovering a failed mid-stream push (resume from +// last-written file) is a v0.2 optimization. +// +// Why `tar` (rather than a more modern protocol like a custom REST +// upload endpoint on jobs-manager): tar-over-exec works against +// any cluster running the existing chart, with zero server-side +// changes. A custom upload endpoint would require a coordinated +// jobs-manager release before the CLI could ship. +func StreamLayout( + ctx context.Context, + exec Executor, + namespace, podName, containerName string, + layout *LocalLayout, + table string, + progress Progress, +) error { + if progress == nil { + progress = NoOpProgress{} + } + defer progress.Finish() + + // Generate the staging-dir suffix + compose the remote command + // BEFORE spawning the tar goroutine. Earlier versions did the + // suffix after, which meant a crypto/rand failure would return + // from this function without closing the pipe or waiting on + // the goroutine — the goroutine would block once the ~64 KB + // pipe buffer filled, leaking forever. Bugbot flagged on r9. + // + // Staging dir gets a fresh 8-hex-char suffix per invocation so + // two concurrent `dataset push` runs for the same table don't + // race on the same .staging path (r8). Without this, push A's + // `rm -rf .staging` could wipe push B's in-progress tar + // extraction (or vice versa), producing an interleaved + // corrupt dataset on the PVC. + stagingSuffix, err := randomSuffix(4) // 4 bytes → 8 hex chars + if err != nil { + return fmt.Errorf("generating staging-dir suffix: %w", err) + } + dest := StagedPrefix(table) + staging := dest + ".staging-" + stagingSuffix + + // Backup path for the swap. Same unique suffix as staging so + // two parallel pushes don't collide on each other's .old-, + // AND so the find pattern below catches both .staging-* and + // .old-* siblings as orphan candidates. + backup := dest + ".old-" + stagingSuffix + + parentDir := path.Dir(dest) + tableBase := path.Base(dest) + + // Remote command pipeline. Crosses three semantic guarantees: + // + // - HERMETIC (r5): old files don't survive into a re-push + // - TRANSACTIONAL (r7+r10): the previously-staged dataset + // stays intact if THIS push fails mid-transfer; backup- + // and-rollback restores on mv failure + // - PARALLEL-SAFE (r8): unique .staging-/.old- + // suffix per invocation so concurrent pushes don't + // interleave each other's tar extraction + // - EXIT-FAITHFUL (r11): set -e propagates any non-zero + // status; the prior `&&-chain ; find || true` shape + // could silently return success on actual failures + // + // Pipeline steps: + // 1. set -e — any unhandled non-zero status aborts the script + // 2. rm -rf $STAGING (clean prior failed attempt) + // 3. mkdir + tar (extract to staging) + // 4. If $DEST exists, mv $DEST → $DEST.old- (backup) + // 5. mv $STAGING → $DEST (commit) + // On failure: restore backup, exit 1 + // 6. rm -rf $BACKUP (success) + // 7. find ... orphan cleanup (best-effort, || true) + // + // The earlier `&&-chained ... ; find ... || true` shape had a + // hidden correctness bug (Bugbot r11): POSIX `;` runs the + // next command regardless of prior status, and `|| true` then + // forces exit 0. A failed tar mid-script would silently + // return success to the exec subprocess, and the CLI would + // report "Staged N files" on what was actually a failed push. + // + // set -e fixes that: any unguarded non-zero exits the script + // with that status. The find's `|| true` is still fine + // because under set -e a `cmd || fallback` is treated as + // "cmd may fail" and doesn't trigger the immediate exit. + // + // Failure modes (recap): + // - tar fails: set -e aborts; $DEST untouched (transactional from r7) + // - backup mv fails: same — aborts before any change to $DEST + // - main mv fails: explicit if/then rolls back the backup + // then `exit 1` (set -e doesn't preempt the explicit + // handler because of the `||`) + // - final rm fails: set -e exits, but the customer's new + // data is already in place — the leftover .old- + // gets picked up by the find sweep on next invocation + // - find fails: `|| true` suppresses (cleanup is best-effort) + remoteCmd := []string{ + "/bin/sh", "-c", + fmt.Sprintf( + "set -e\n"+ + "rm -rf %q\n"+ + "mkdir -p %q\n"+ + "/bin/tar -xf - -C %q\n"+ + "if [ -e %q ]; then mv %q %q; fi\n"+ + "if ! mv %q %q; then\n"+ + " if [ -e %q ]; then mv %q %q; fi\n"+ + " exit 1\n"+ + "fi\n"+ + "rm -rf %q\n"+ + "find %q -maxdepth 1 \\( -name %q -o -name %q \\) -mmin +60 -exec rm -rf {} + 2>/dev/null || true\n", + staging, // rm + staging, // mkdir + staging, // tar -C + dest, dest, backup, // if exists, backup + staging, dest, // main mv + backup, backup, dest, // rollback + backup, // success cleanup + parentDir, tableBase+".staging-*", tableBase+".old-*"), // find + } + + // Use a synchronous pipe so the tar Writer's writes block on + // the exec stream's reads — no in-memory buffering of the + // whole archive. For a 1 GiB dataset this is the difference + // between using 1 GiB of laptop RAM and using a few KB. + pr, pw := io.Pipe() + + // Wire the progress sink on the WRITE side so every byte the + // tar Writer emits (headers + bodies) counts toward the bar. + countedPW := &progressWriter{w: pw, p: progress} + + // Capture stderr from the in-cluster tar so failures surface + // with the actual remote message. + var stderrBuf bytes.Buffer + + // Kick off the tar build in a goroutine. The Pipe.Writer MUST + // be closed when we're done — without that, the Pipe.Reader + // side blocks forever waiting for more bytes. + // + // CloseWithError vs Close: CloseWithError preserves the tar + // error across the pipe so the exec side sees the cause. + tarErrCh := make(chan error, 1) + go func() { + err := writeLayoutTar(countedPW, layout) + if err != nil { + _ = pw.CloseWithError(err) + } else { + _ = pw.Close() + } + tarErrCh <- err + }() + + streamErr := exec.Exec(ctx, namespace, podName, containerName, + remoteCmd, pr, nil, &stderrBuf) + + // Close the reader side. If the executor returned without + // fully draining stdin (early exit, ctx cancellation, remote + // tar dying), the tar-write goroutine is blocked on its next + // pipe.Write — closing the reader unblocks it with + // io.ErrClosedPipe so it can finish and report on tarErrCh. + // Without this, every error path deadlocks. + _ = pr.Close() + + // Drain the tar goroutine — even on stream error, it must + // finish or we leak. Buffered channel size 1 means this + // receive won't block (the goroutine already sent or will + // send imminently after CloseWithError or the pr.Close above). + tarErr := <-tarErrCh + + // Order matters here. Bugbot r11 caught the previous logic: + // when the LOCAL tar build fails (e.g. r8's stream-time size + // cap recheck rejecting a file that grew on disk), the + // CloseWithError on the pipe causes exec to see "broken pipe" + // and return a generic stream error. If we report streamErr + // first, the customer sees "streaming failed" instead of the + // actually-actionable "dataset exceeded v0.1 cap" diagnostic + // from the tar side. + // + // Check tarErr first — when both are non-nil, the tar side + // is usually the upstream cause. streamErr-only (no tarErr) + // is the genuine network/RBAC/remote-tar-failed case where + // the exec wording is the right surface. + if tarErr != nil { + return fmt.Errorf("building tar archive: %w", tarErr) + } + if streamErr != nil { + hint := "" + if remote := stderrBuf.String(); remote != "" { + hint = fmt.Sprintf(" (remote tar stderr: %s)", remote) + } + return fmt.Errorf("streaming files to %s/%s: %w%s", namespace, podName, streamErr, hint) + } + return nil +} + +// writeLayoutTar packages the layout's files into the writer as a +// flat tar archive with this structure: +// +// labels.csv (file) +// images/ (file, per layout.Images) +// +// The destination tar is unpacked with `tar -xf - -C /data/shared/
`, +// so file paths are RELATIVE to that root — that's why we write +// "labels.csv" not "/data/shared/
/labels.csv". +// +// Mode is 0644 on all files (read-only for the ingestor SA). Tar's +// type flag is TypeReg (regular file). No symlinks, no hard links, +// no extended attrs — the layout we accept doesn't have any. +// +// Uses a named return + deferred Close so a tar trailer-write error +// (truncated archive — GNU tar refuses to extract these) propagates +// even on the happy path. errcheck would otherwise flag the bare +// `defer tw.Close()`, and rightly: silently dropping that error +// means a truncated stream looks identical to a successful one +// from this function's caller. +func writeLayoutTar(w io.Writer, layout *LocalLayout) (err error) { + tw := tar.NewWriter(w) + defer func() { + // If we already have an error, preserve it; the close error + // is less useful than whatever caused the early return. + // Otherwise surface the close error so a failed trailer + // write doesn't silently corrupt the archive. + if cerr := tw.Close(); cerr != nil && err == nil { + err = fmt.Errorf("closing tar writer: %w", cerr) + } + }() + + // Running total enforces the v0.1 MaxTotalBytes cap at STREAM + // time, not just at Discover time. A file that grew between + // pre-flight and now (TOCTOU) — or just a sum miscount — would + // otherwise sneak past the cap silently. Bugbot flagged on + // PR-b r8 as Medium. + var totalBytes int64 + + // labels.csv first (small, sanity-checks the stream quickly). + n, err := writeTarFile(tw, layout.LabelsCSV, "labels.csv") + if err != nil { + return fmt.Errorf("packaging labels.csv: %w", err) + } + totalBytes += n + if totalBytes > MaxTotalBytes { + return fmt.Errorf( + "dataset exceeded v0.1 total cap of %s during stream "+ + "(labels.csv alone is %s; pre-flight likely raced with "+ + "a file growing on disk)", + HumanBytes(MaxTotalBytes), HumanBytes(totalBytes)) + } + + // Then each image. We write them in the order Discover returned + // (filesystem-walk order, not sorted) — sorting would make the + // stream deterministic across runs but adds zero customer value. + for _, abs := range layout.Images { + // The destination filename inside images/ is the file's + // basename — strips the customer's local path so a push + // from /home/alice/datasets/cats_dogs/images/001.jpg + // becomes images/001.jpg in the tar (and on the PVC). + // + // Use path.Join (always forward-slash) NOT filepath.Join + // (OS-native separator) for the tar HEADER name. USTAR/POSIX + // tar requires forward slashes; a Windows-built CLI using + // filepath.Join would emit `images\001.jpg` headers that the + // Linux stage Pod's `tar -xf` either rejects or extracts as + // a flat-named file. Bugbot flagged on PR-b round 6. + dst := path.Join("images", filepath.Base(abs)) + n, err := writeTarFile(tw, abs, dst) + if err != nil { + return fmt.Errorf("packaging %s: %w", abs, err) + } + totalBytes += n + if totalBytes > MaxTotalBytes { + return fmt.Errorf( + "dataset exceeded v0.1 total cap of %s after streaming %s "+ + "(reached %s; file growth between pre-flight and stream)", + HumanBytes(MaxTotalBytes), dst, HumanBytes(totalBytes)) + } + } + + // tw.Close() in the defer above writes the tar footer + // (two zero blocks). Without that, GNU tar treats the archive + // as truncated and refuses to extract. + return nil +} + +// writeTarFile writes one file from `src` into tw under the +// archive-relative name `dst`. Streams the file body — no full- +// read into memory — so a single 500 MiB image doesn't balloon +// the CLI's memory. +// +// Returns the file's declared size (from os.Lstat) so the caller +// can maintain a running total against MaxTotalBytes — closes the +// stream-time half of the TOCTOU window Bugbot flagged on r8. +// +// Uses os.Lstat (not Stat) + an explicit symlink reject as a +// defense-in-depth second layer behind walk.go's rejectSymlink +// (Bugbot r4). Also re-checks the single-file size cap at stream +// time: a file that grew between Discover and now would otherwise +// upload past the advertised 500 MiB cap (Bugbot r8). +func writeTarFile(tw *tar.Writer, src, dst string) (int64, error) { + st, err := os.Lstat(src) + if err != nil { + return 0, err + } + if st.Mode()&os.ModeSymlink != 0 { + // Reaching here means Discover let a symlink through — + // shouldn't happen in production, but the explicit error + // keeps the security property locally enforceable. + return 0, fmt.Errorf("refusing to stream symbolic link %q (defense-in-depth; should have been rejected by Discover)", src) + } + if st.Size() > MaxSingleFileBytes { + // Stream-time recheck of the single-file cap. Discover + // caught this in pre-flight; if we see it again here, the + // file grew between then and now. + return 0, sizeError(dst, st.Size(), MaxSingleFileBytes) + } + hdr := &tar.Header{ + Name: dst, + Size: st.Size(), + Mode: 0o644, + Typeflag: tar.TypeReg, + // We don't carry ModTime forward — the customer's local + // mtime has no useful semantic on the PVC, and zeroing it + // keeps the tar bit-for-bit reproducible across runs + // (helps tests, makes future content-hash checks easier). + } + if err := tw.WriteHeader(hdr); err != nil { + return 0, err + } + f, err := os.Open(src) + if err != nil { + return 0, err + } + // Read-only file Close errors are not meaningful — there's + // nothing the caller can do, and the underlying file descriptor + // gets returned to the OS regardless. Explicit-discard pattern, + // same as the Fprintf writers elsewhere. + defer func() { _ = f.Close() }() + // io.CopyN caps the actual stream at the declared header size. + // Without the cap, a file that grew between Lstat above and + // the Copy here would overflow the header — the tar would be + // corrupted (header says N bytes, body has > N). Cap-and-trust + // is the safe choice: if the file shrank, the tar trailer + // surfaces the mismatch; if the file grew, we just stop at + // the declared size and let the new bytes get re-pushed next + // invocation. Closes the body-side half of the TOCTOU window. + if _, err := io.CopyN(tw, f, st.Size()); err != nil && !errors.Is(err, io.EOF) { + return 0, err + } + return st.Size(), nil +} diff --git a/internal/push/stream_test.go b/internal/push/stream_test.go new file mode 100644 index 0000000..62bde45 --- /dev/null +++ b/internal/push/stream_test.go @@ -0,0 +1,428 @@ +package push + +import ( + "archive/tar" + "bytes" + "context" + "errors" + "io" + "regexp" + "sort" + "strings" + "testing" +) + +// fakeExecutor is the stand-in for SPDYExecutor in tests. Captures +// every Exec call's parameters into fields the test can assert on, +// and reads the stdin stream into a buffer (so we can parse the tar +// archive back out and verify its contents). +type fakeExecutor struct { + // Captured call parameters. + gotNS, gotPod, gotContainer string + gotCmd []string + + // Captured stdin (the tar archive). + gotStdin []byte + + // What to write back to the caller as stderr (simulates a + // "tar: no space left on device" style remote-side failure). + stderrToReturn []byte + + // What to return as the exec error (simulates a 4xx/5xx from + // the apiserver, a network drop, etc.). + errToReturn error + + // drainBeforeReturn: if false, errToReturn fires BEFORE the + // stdin pipe is drained — simulates a fast failure where the + // remote tar dies on the first byte. Test infrastructure for + // broken-pipe coverage. + drainBeforeReturn bool +} + +func (f *fakeExecutor) Exec( + _ context.Context, + ns, pod, container string, + cmd []string, + stdin io.Reader, _ io.Writer, stderr io.Writer, +) error { + f.gotNS, f.gotPod, f.gotContainer = ns, pod, container + f.gotCmd = cmd + + if stdin != nil && (f.drainBeforeReturn || f.errToReturn == nil) { + b, _ := io.ReadAll(stdin) + f.gotStdin = b + } + if len(f.stderrToReturn) > 0 && stderr != nil { + _, _ = stderr.Write(f.stderrToReturn) + } + return f.errToReturn +} + +// TestStreamLayout_TarPathsAreForwardSlash pins the cross-platform +// tar header convention. On Windows, filepath.Join uses '\' as +// separator — using it for the tar HEADER name would produce +// archive entries the Linux stage Pod's `tar -xf` either rejects +// or extracts as flat-named files (collapsing the images/ subdir). +// The fix is path.Join (always forward-slash); this test asserts +// the portable property regardless of which OS the test runs on. +// Bugbot flagged the Windows bug on PR-b round 6. +func TestStreamLayout_TarPathsAreForwardSlash(t *testing.T) { + root := imgcDir(t, "a.jpg", "b.png") + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + fe := &fakeExecutor{} + if err := StreamLayout(context.Background(), fe, "tracebloc", "p", "stage", + layout, "t", NoOpProgress{}); err != nil { + t.Fatalf("StreamLayout: %v", err) + } + + tr := tar.NewReader(bytes.NewReader(fe.gotStdin)) + for { + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + t.Fatalf("reading tar: %v", err) + } + if strings.Contains(hdr.Name, `\`) { + t.Errorf("tar entry %q contains backslash; "+ + "Linux stage Pod's `tar -xf` requires forward-slash paths", hdr.Name) + } + } +} + +// TestStreamLayout_TarContents end-to-end: the bytes the executor +// sees on stdin MUST be a valid tar with the layout's files at the +// expected paths (labels.csv at the root, images/ under +// images/). If this drifts, the in-cluster `tar -xf - -C +// /data/shared/
/` lands files in the wrong place and +// jobs-manager's ingestor in Phase 4 silently sees zero rows. +func TestStreamLayout_TarContents(t *testing.T) { + root := imgcDir(t, "a.jpg", "b.png") + layout, err := Discover(root) + if err != nil { + t.Fatalf("Discover: %v", err) + } + + fe := &fakeExecutor{} + err = StreamLayout(context.Background(), fe, "tracebloc", "pod-x", "stage", + layout, "cats_dogs", NoOpProgress{}) + if err != nil { + t.Fatalf("StreamLayout: %v", err) + } + + // Parse the captured tar back out and collect (name, size). + type entry struct { + name string + size int64 + } + var got []entry + tr := tar.NewReader(bytes.NewReader(fe.gotStdin)) + for { + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + t.Fatalf("reading tar: %v", err) + } + got = append(got, entry{name: hdr.Name, size: hdr.Size}) + } + + // Sort for deterministic comparison — the producer walks + // images/ in OS order which isn't stable across filesystems. + sort.Slice(got, func(i, j int) bool { return got[i].name < got[j].name }) + + // Three entries: labels.csv + 2 images. + if len(got) != 3 { + t.Fatalf("tar entry count = %d, want 3 (labels.csv + 2 images); got=%v", len(got), got) + } + want := []entry{ + {name: "images/a.jpg", size: 100}, // imgcDir writes 100-byte images + {name: "images/b.png", size: 100}, + {name: "labels.csv", size: 39}, // "image_id,label\n001.jpg,cat\n002.jpg,dog\n" + } + for i := range want { + if got[i] != want[i] { + t.Errorf("entry[%d] = %v, want %v", i, got[i], want[i]) + } + } +} + +// TestStreamLayout_RemoteCommand pins the in-cluster command. If +// this ever drifts (e.g. someone changes the destination path +// helper), the tar gets extracted to the wrong PVC subdirectory +// and Phase 4 silently fails. The mkdir -p is what makes re-push +// overwrite semantics work. +func TestStreamLayout_RemoteCommand(t *testing.T) { + layout, err := Discover(imgcDir(t)) + if err != nil { + t.Fatalf("Discover: %v", err) + } + fe := &fakeExecutor{} + _ = StreamLayout(context.Background(), fe, "tracebloc", "pod-x", "stage", + layout, "my_table", NoOpProgress{}) + + if got, want := len(fe.gotCmd), 3; got != want { + t.Fatalf("len(cmd) = %d, want %d", got, want) + } + if fe.gotCmd[0] != "/bin/sh" || fe.gotCmd[1] != "-c" { + t.Errorf("cmd[:2] = %v, want [/bin/sh -c]", fe.gotCmd[:2]) + } + script := fe.gotCmd[2] + // The remote command must be: + // 1. HERMETIC (Bugbot r5): old files don't survive a re-push + // 2. TRANSACTIONAL (Bugbot r7): tar failure preserves prev + // dataset + // 3. RACE-SAFE under parallel-push (Bugbot r8): unique + // .staging- suffix per invocation so concurrent pushes + // don't interleave each other's tar extraction + // + // Pattern: extract to .staging-, then on tar SUCCESS + // rm the old dest + mv staging → dest. The order matters: + // + // - tar BEFORE any rm of $DEST (preserves on tar failure) + // - mv AFTER tar succeeds (atomic-ish swap) + // + // Extract the staging path with a regex so the random hex + // suffix doesn't pin us to a specific invocation's bytes. + stagingRE := regexp.MustCompile(`/data/shared/my_table\.staging-[0-9a-f]{8}`) + stagingPaths := stagingRE.FindAllString(script, -1) + if len(stagingPaths) == 0 { + t.Fatalf("remote script has no /data/shared/my_table.staging-<8hex> path (race-safety regression):\n%s", script) + } + // All staging mentions must refer to the SAME suffix in a single + // invocation. If we see two distinct suffixes that's a bug: + // rm/mkdir/tar/mv would target different dirs. + for i := 1; i < len(stagingPaths); i++ { + if stagingPaths[i] != stagingPaths[0] { + t.Errorf("remote script mixes staging suffixes %q and %q:\n%s", + stagingPaths[0], stagingPaths[i], script) + } + } + staging := stagingPaths[0] + + // Pin the four key operations. + for _, want := range []string{ + `mkdir -p "` + staging + `"`, + `tar -xf - -C "` + staging + `"`, + `mv "` + staging + `" "/data/shared/my_table"`, + } { + if !strings.Contains(script, want) { + t.Errorf("remote script missing %q: %s", want, script) + } + } + // Transactional property: tar must come BEFORE any mutation + // of the real destination. r5+r7 used `rm $DEST` for the + // pre-mv step; r10 swapped that to `mv $DEST $DEST.old-` + // (backup-and-swap, so a mv failure can be rolled back). + // The contract is the same: tar runs while $DEST is intact. + tarIdx := strings.Index(script, `tar -xf - -C "`+staging+`"`) + destBackupMvIdx := strings.Index(script, `mv "/data/shared/my_table" "`) + if tarIdx < 0 || destBackupMvIdx < 0 { + t.Fatalf("remote script missing tar or destination-backup mv: %s", script) + } + if tarIdx >= destBackupMvIdx { + t.Errorf("remote script touches $DEST BEFORE tar succeeds — partial-transfer could destroy previous data:\n%s", script) + } + // Single-segment guarantee: both rm targets must end with + // /my_table or /my_table.staging-* — never just /data/shared. + if strings.Contains(script, `rm -rf "/data/shared"`) || + strings.Contains(script, `rm -rf "/data/shared/"`) { + t.Errorf("remote script rm-rfs the parent /data/shared (would nuke sibling tables):\n%s", script) + } + + // Bugbot r9 + r10: orphan cleanup for previously-failed pushes + // must catch BOTH .staging-* and .old-* siblings. The .old-* + // dirs are created by the round-10 backup-and-rollback swap; + // without including them in the find pattern, failed-mid-swap + // pushes would leak .old-* dirs. + for _, wantName := range []string{`-name "my_table.staging-*"`, `-name "my_table.old-*"`} { + if !strings.Contains(script, wantName) { + t.Errorf("remote script missing %s in orphan cleanup:\n%s", wantName, script) + } + } + if !strings.Contains(script, `-mmin +60 -exec rm -rf {} +`) { + t.Errorf("remote script missing -mmin+60 orphan window:\n%s", script) + } + + // Bugbot r10: backup-and-swap pattern. The previous dataset + // must be backed up to .old- BEFORE the new dataset + // arrives, and restored if the main mv fails. Pin the key + // shape pieces — backup mv, primary mv, rollback mv, cleanup. + backupRE := regexp.MustCompile(`/data/shared/my_table\.old-[0-9a-f]{8}`) + backupPaths := backupRE.FindAllString(script, -1) + if len(backupPaths) == 0 { + t.Fatalf("remote script has no .old- backup path (r10 rollback regression):\n%s", script) + } + // All .old- mentions in a single invocation must agree — + // otherwise the rollback would target a different path than + // the backup created. + for i := 1; i < len(backupPaths); i++ { + if backupPaths[i] != backupPaths[0] { + t.Errorf("remote script mixes backup suffixes %q vs %q:\n%s", + backupPaths[0], backupPaths[i], script) + } + } + backup := backupPaths[0] + // Backup must use the SAME suffix as staging — different + // suffixes would defeat the "find -name ...staging-* ...old-*" + // orphan-cleanup symmetry, AND would risk collision with a + // concurrent push's .old-. + if strings.TrimPrefix(backup, "/data/shared/my_table.old-") != + strings.TrimPrefix(staging, "/data/shared/my_table.staging-") { + t.Errorf("backup and staging suffixes diverge: %q vs %q", backup, staging) + } + // Backup mv (DEST → .old) must appear BEFORE primary mv + // (.staging → DEST), or rollback wouldn't have anything to + // restore. + backupMvIdx := strings.Index(script, `mv "/data/shared/my_table" "`+backup+`"`) + primaryMvIdx := strings.Index(script, `mv "`+staging+`" "/data/shared/my_table"`) + if backupMvIdx < 0 || primaryMvIdx < 0 { + t.Fatalf("remote script missing backup or primary mv:\n%s", script) + } + if backupMvIdx >= primaryMvIdx { + t.Errorf("backup mv (DEST→.old) must come BEFORE primary mv (.staging→DEST); rollback contract broken:\n%s", script) + } +} + +// TestStreamLayout_StagingSuffixIsUniquePerInvocation: two back-to- +// back StreamLayout calls for the same table MUST produce different +// staging-dir suffixes. Without this, concurrent `dataset push` runs +// for the same table would race on the same .staging path and +// corrupt each other's tar extraction. Bugbot r8 fix. +func TestStreamLayout_StagingSuffixIsUniquePerInvocation(t *testing.T) { + layout, err := Discover(imgcDir(t)) + if err != nil { + t.Fatalf("Discover: %v", err) + } + stagingRE := regexp.MustCompile(`/data/shared/t\.staging-[0-9a-f]{8}`) + + collect := func() string { + fe := &fakeExecutor{} + if err := StreamLayout(context.Background(), fe, "tracebloc", "p", "stage", + layout, "t", NoOpProgress{}); err != nil { + t.Fatalf("StreamLayout: %v", err) + } + m := stagingRE.FindString(fe.gotCmd[2]) + if m == "" { + t.Fatalf("no staging path in: %s", fe.gotCmd[2]) + } + return m + } + a, b := collect(), collect() + if a == b { + t.Errorf("back-to-back StreamLayout produced identical staging path %q; race-safety regression", a) + } +} + +// TestStreamLayout_TargetingFromArgs pins that namespace + pod name +// flow through to the executor unchanged. Catches a refactor that +// hardcodes the namespace or builds the pod name in the wrong layer. +func TestStreamLayout_TargetingFromArgs(t *testing.T) { + layout, err := Discover(imgcDir(t)) + if err != nil { + t.Fatalf("Discover: %v", err) + } + fe := &fakeExecutor{} + _ = StreamLayout(context.Background(), fe, "custom-ns", "weird-pod-name", "container-x", + layout, "t", NoOpProgress{}) + if fe.gotNS != "custom-ns" { + t.Errorf("namespace = %q, want custom-ns", fe.gotNS) + } + if fe.gotPod != "weird-pod-name" { + t.Errorf("pod = %q, want weird-pod-name", fe.gotPod) + } + if fe.gotContainer != "container-x" { + t.Errorf("container = %q, want container-x", fe.gotContainer) + } +} + +// TestStreamLayout_ExecErrorWithRemoteStderr: when the in-cluster +// tar fails (e.g. read-only FS, no-space), the remote stderr must +// surface in the customer-facing error. This is the actionable +// signal — without it, customers see "exec stream failed" with no +// way to diagnose what went wrong server-side. +func TestStreamLayout_ExecErrorWithRemoteStderr(t *testing.T) { + layout, err := Discover(imgcDir(t)) + if err != nil { + t.Fatalf("Discover: %v", err) + } + fe := &fakeExecutor{ + stderrToReturn: []byte("tar: write error: No space left on device"), + errToReturn: errors.New("command terminated with exit code 1"), + drainBeforeReturn: true, + } + err = StreamLayout(context.Background(), fe, "tracebloc", "p", "stage", + layout, "t", NoOpProgress{}) + if err == nil { + t.Fatal("StreamLayout returned nil on exec error") + } + for _, want := range []string{"streaming files", "No space left on device", "exit code 1"} { + if !strings.Contains(err.Error(), want) { + t.Errorf("error missing %q: %v", want, err) + } + } +} + +// TestStreamLayout_ProgressBytes: the Progress sink receives byte +// counts during the stream. This is the contract the schollz bar +// keys off — if the wrapper ever stops calling Add(), the bar would +// freeze at 0%. +func TestStreamLayout_ProgressBytes(t *testing.T) { + layout, err := Discover(imgcDir(t)) + if err != nil { + t.Fatalf("Discover: %v", err) + } + fp := &fakeProgress{} + err = StreamLayout(context.Background(), &fakeExecutor{}, "tracebloc", "p", "stage", + layout, "t", fp) + if err != nil { + t.Fatalf("StreamLayout: %v", err) + } + // At minimum we expect the file bodies (200 + 39 = 239) plus + // tar headers (~512 per file = ~1536 bytes overhead minimum). + // Assert "some bytes flowed" rather than an exact count + // because the tar overhead is implementation-defined. + if fp.added < 239 { + t.Errorf("progress.Add total = %d, want at least 239 (file body bytes)", fp.added) + } + if !fp.finished { + t.Errorf("progress.Finish() never called") + } +} + +// TestWriteLayoutTar_LabelsCSVFirst: ordering matters for fail-fast +// diagnostic — labels.csv at the start means a corrupt tar is more +// likely to surface before we've shipped a hundred image bytes. +func TestWriteLayoutTar_LabelsCSVFirst(t *testing.T) { + layout, err := Discover(imgcDir(t)) + if err != nil { + t.Fatalf("Discover: %v", err) + } + var buf bytes.Buffer + if err := writeLayoutTar(&buf, layout); err != nil { + t.Fatalf("writeLayoutTar: %v", err) + } + tr := tar.NewReader(&buf) + hdr, err := tr.Next() + if err != nil { + t.Fatalf("reading first entry: %v", err) + } + if hdr.Name != "labels.csv" { + t.Errorf("first entry name = %q, want labels.csv (ordering pins this)", hdr.Name) + } +} + +// fakeProgress: a Progress that records total bytes added and +// whether Finish was called. +type fakeProgress struct { + added int + finished bool +} + +func (p *fakeProgress) Add(n int) { p.added += n } +func (p *fakeProgress) Finish() { p.finished = true } diff --git a/internal/push/walk.go b/internal/push/walk.go index dd95571..bb7b15f 100644 --- a/internal/push/walk.go +++ b/internal/push/walk.go @@ -106,9 +106,17 @@ func Discover(rootDir string) (*LocalLayout, error) { layout := &LocalLayout{Root: abs} - // labels.csv (required). + // labels.csv (required). Use Lstat — NOT Stat — so a symlink + // shows up as a symlink (mode includes ModeSymlink) rather + // than being silently followed. v0.1 rejects symlinks entirely + // (see rejectSymlink); without Lstat the size cap below would + // see the symlink's own ~100-byte size while writeTarFile + // (which uses os.Stat → follows symlinks) would happily stream + // the target's full contents — a size-cap bypass and an + // arbitrary-local-file disclosure to the cluster PVC. Bugbot + // flagged this as Medium-severity security on PR-b round 4. labelsPath := filepath.Join(abs, "labels.csv") - labelsStat, err := os.Stat(labelsPath) + labelsStat, err := os.Lstat(labelsPath) if err != nil { if errors.Is(err, os.ErrNotExist) { return nil, fmt.Errorf( @@ -120,6 +128,9 @@ func Discover(rootDir string) (*LocalLayout, error) { } return nil, fmt.Errorf("stat labels.csv: %w", err) } + if err := rejectSymlink(labelsStat, "labels.csv"); err != nil { + return nil, err + } if labelsStat.IsDir() { // A directory literally named "labels.csv" passes the // os.Stat above — without this check the pre-flight would @@ -138,9 +149,14 @@ func Discover(rootDir string) (*LocalLayout, error) { layout.TotalBytes += labelsStat.Size() // images/ subdir (required, must contain at least one - // image-extension file). + // image-extension file). Use os.Lstat — NOT Stat — so a + // symlinked-directory case (e.g. `ln -s /etc images`) gets + // caught here, not silently followed into the linked path. + // Without Lstat, the symlink-image fixes from the previous + // commit don't matter: the whole directory could be a link. + // Bugbot flagged the missing dir-level Lstat on PR-b round 5. imagesDir := filepath.Join(abs, "images") - imagesStat, err := os.Stat(imagesDir) + imagesStat, err := os.Lstat(imagesDir) if err != nil { if errors.Is(err, os.ErrNotExist) { return nil, fmt.Errorf( @@ -150,6 +166,9 @@ func Discover(rootDir string) (*LocalLayout, error) { } return nil, fmt.Errorf("stat images/: %w", err) } + if err := rejectSymlink(imagesStat, "images"); err != nil { + return nil, err + } if !imagesStat.IsDir() { return nil, fmt.Errorf("%q exists but is not a directory", imagesDir) } @@ -175,10 +194,19 @@ func Discover(rootDir string) (*LocalLayout, error) { if _, ok := imageExtensions[ext]; !ok { continue } + // entry.Info() returns Lstat-like metadata for the + // directory entry (the symlink's own mode if it's a + // symlink, not the target's). That's exactly what we + // want here — combined with rejectSymlink it closes the + // symlink-bypass-size-caps hole Bugbot flagged on PR-b + // round 4. info, err := entry.Info() if err != nil { return nil, fmt.Errorf("stat %q: %w", entry.Name(), err) } + if err := rejectSymlink(info, filepath.Join("images", entry.Name())); err != nil { + return nil, err + } if info.Size() > MaxSingleFileBytes { return nil, sizeError(filepath.Join("images", entry.Name()), info.Size(), MaxSingleFileBytes) @@ -207,6 +235,54 @@ func Discover(rootDir string) (*LocalLayout, error) { return layout, nil } +// rejectSymlink returns a non-nil error if info describes a symlink. +// v0.1 refuses symlinks under /{labels.csv, images, images/*} +// entirely because: +// +// - SECURITY: writeTarFile uses os.Open (which follows symlinks). +// Discover sized entries via DirEntry.Info() (which does not). +// A symlink whose target is a multi-GB local file would pass +// Discover's size cap (the symlink itself is ~100 bytes) yet +// stream the target's full contents to the cluster PVC — a +// size-cap bypass and arbitrary-local-file disclosure to the +// cluster admin. Bugbot caught this on PR-b round 4. +// - UX: legitimate image_classification datasets don't use +// symlinks. A clear "symlinks not supported" error is better +// than the alternative fixes (resolve + re-stat the target, +// blanket Stat() everywhere) — both of which would let the +// security hole creep back in on a future refactor. +// +// Customers with symlink-heavy layouts (rare; usually means their +// data lives on another filesystem) can `cp -L` to materialize +// the files before pushing. +// +// Known limitation: HARD LINKS are NOT rejected. The filesystem +// doesn't expose a Mode bit for hard links the way ModeSymlink +// distinguishes symlinks, and a high-Nlink check has too many +// false positives (legitimate hard-linked datasets where the +// dataset dir is the only entry point). The implicit security +// boundary is the CLI's process-level read permissions: a +// customer can only hard-link files they already have read +// access to, so the cluster admin reading a hard-linked +// /etc/shadow via the PVC isn't a privilege escalation — they'd +// need the CLI to be running as root for that to be exploitable, +// and the docs already recommend running as a low-privileged +// user. v0.2 may add openat(O_NOFOLLOW)-based sandboxing if a +// real customer needs harder isolation; tracked alongside the +// cloud-source story (#147 non-goals). +func rejectSymlink(info os.FileInfo, relPath string) error { + if info.Mode()&os.ModeSymlink == 0 { + return nil + } + return fmt.Errorf( + "%q is a symbolic link, which v0.1 does not allow in the dataset "+ + "layout (security: a symlink could escape the dataset tree or "+ + "bypass size caps). Materialize the link target (e.g. `cp -L`) "+ + "and re-run, or wait for v0.2's cloud-source story if the data "+ + "lives elsewhere.", + relPath) +} + // sizeError builds the over-the-single-file-cap error with the same // human-readable framing as the total-cap branch above. Centralized // so the message stays consistent if we tune the wording later. diff --git a/internal/push/walk_test.go b/internal/push/walk_test.go index 8095415..441400c 100644 --- a/internal/push/walk_test.go +++ b/internal/push/walk_test.go @@ -3,6 +3,7 @@ package push import ( "os" "path/filepath" + "runtime" "strings" "testing" ) @@ -191,6 +192,124 @@ func TestDiscover_LabelsCSVIsDirectory(t *testing.T) { } } +// TestDiscover_RejectsSymlinkedImage is the security regression pin +// for the Bugbot-Medium finding on PR-b round 4. A symlink in +// images/ whose target points outside the dataset tree (or at a +// big local file) used to pass Discover's size cap (because +// DirEntry.Info() returns the link's own ~100-byte size) yet +// stream the target's full contents into the cluster PVC via +// writeTarFile's os.Stat+os.Open path. That's both a size-cap +// bypass AND an arbitrary-local-file disclosure to the cluster +// admin. v0.1 rejects symlinks outright. +func TestDiscover_RejectsSymlinkedImage(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("symlinks require admin privileges on Windows") + } + root := t.TempDir() + if err := os.WriteFile(filepath.Join(root, "labels.csv"), + []byte("image_id,label\n001.jpg,cat\n"), 0o644); err != nil { + t.Fatalf("write labels.csv: %v", err) + } + imagesDir := filepath.Join(root, "images") + if err := os.MkdirAll(imagesDir, 0o755); err != nil { + t.Fatalf("mkdir images: %v", err) + } + // Create a real image file outside the dataset tree to be the + // symlink target. The symlink inside images/ points at it. + target := filepath.Join(t.TempDir(), "outside.jpg") + if err := os.WriteFile(target, make([]byte, 100), 0o644); err != nil { + t.Fatalf("write outside target: %v", err) + } + link := filepath.Join(imagesDir, "evil.jpg") + if err := os.Symlink(target, link); err != nil { + t.Fatalf("symlink: %v", err) + } + + _, err := Discover(root) + if err == nil { + t.Fatal("Discover accepted a symlinked image — security regression; v0.1 must reject") + } + for _, want := range []string{"symbolic link", "security", "cp -L"} { + if !strings.Contains(err.Error(), want) { + t.Errorf("error missing %q in: %v", want, err) + } + } +} + +// TestDiscover_RejectsSymlinkedImagesDir is the round-5 follow-up +// to the symlinked-FILE fix from round 4. The DIRECTORY itself +// could also be a symlink (e.g. `ln -s /etc images`), which the +// previous fix didn't catch because it only Lstat'd the entries +// INSIDE images/. Bugbot flagged it; this test pins the +// dir-level Lstat. +func TestDiscover_RejectsSymlinkedImagesDir(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("symlinks require admin privileges on Windows") + } + root := t.TempDir() + if err := os.WriteFile(filepath.Join(root, "labels.csv"), + []byte("image_id,label\n001.jpg,cat\n"), 0o644); err != nil { + t.Fatalf("write labels.csv: %v", err) + } + // Create a real images dir somewhere ELSE that the symlink + // will point at. The symlink in `root` is what should be + // rejected. + realDir := filepath.Join(t.TempDir(), "real-images") + if err := os.MkdirAll(realDir, 0o755); err != nil { + t.Fatalf("mkdir real-images: %v", err) + } + if err := os.WriteFile(filepath.Join(realDir, "a.jpg"), + make([]byte, 100), 0o644); err != nil { + t.Fatalf("write img: %v", err) + } + if err := os.Symlink(realDir, filepath.Join(root, "images")); err != nil { + t.Fatalf("symlink images: %v", err) + } + + _, err := Discover(root) + if err == nil { + t.Fatal("Discover accepted a symlinked images/ directory — security regression") + } + if !strings.Contains(err.Error(), "symbolic link") { + t.Errorf("error missing symbolic-link rejection: %v", err) + } +} + +// TestDiscover_RejectsSymlinkedLabelsCSV: same hole, different +// file. The labels.csv path can also be a symlink (e.g. pointing +// at a huge local file). Lstat + reject covers both cases +// symmetrically. +func TestDiscover_RejectsSymlinkedLabelsCSV(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("symlinks require admin privileges on Windows") + } + root := t.TempDir() + target := filepath.Join(t.TempDir(), "outside-labels.csv") + if err := os.WriteFile(target, []byte("image_id,label\n001.jpg,cat\n"), 0o644); err != nil { + t.Fatalf("write outside target: %v", err) + } + link := filepath.Join(root, "labels.csv") + if err := os.Symlink(target, link); err != nil { + t.Fatalf("symlink: %v", err) + } + imagesDir := filepath.Join(root, "images") + if err := os.MkdirAll(imagesDir, 0o755); err != nil { + t.Fatalf("mkdir images: %v", err) + } + if err := os.WriteFile(filepath.Join(imagesDir, "a.jpg"), + make([]byte, 100), 0o644); err != nil { + t.Fatalf("write img: %v", err) + } + + _, err := Discover(root) + if err == nil { + t.Fatal("Discover accepted a symlinked labels.csv — security regression") + } + if !strings.Contains(err.Error(), "symbolic link") { + t.Errorf("error missing symbolic-link rejection: %v", err) + } +} + func TestDiscover_NotADirectory(t *testing.T) { // Customer passes a path to a single file instead of a dir. // This is a common autocomplete-mistake (tab-completing