From 305719647eaeb5f5de37ec20680bb558fd5d5a91 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 19:38:27 +0500 Subject: [PATCH 1/9] feat(cluster): kubeconfig discovery, parent release detection, SA token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the v0.1 roadmap. Adds the plumbing the future `tracebloc dataset push` flow needs: where does the customer's kubeconfig point, which tracebloc release lives there, and how do we authenticate to its jobs-manager. End-to-end validated against the dev EKS cluster (tb-client-dev-templates): discovers chart 1.3.5, resolves jobs-manager.tracebloc-templates.svc:8080, reads INGESTOR_IMAGE_DIGEST out of the deployment env, mints a 10-minute SA token via TokenRequest. What lands: - internal/cluster/kubeconfig.go — Load() that honors --kubeconfig, $KUBECONFIG, ~/.kube/config (via clientcmd's full default loading rules — *not* an empty ExplicitPath, which silently refuses to fall back to defaults; that was the first bug the real-cluster smoke caught). - internal/cluster/discover.go — DiscoverParentRelease() finds the tracebloc/client release in a namespace by listing chart-managed Deployments and filtering by name suffix (-jobs-manager). The chart shares app.kubernetes.io/name=client across mysql/jobs- manager/requests-proxy, so suffix matching is what distinguishes jobs-manager. Returns a friendly multi-release error when ambiguous, with remediation text in the message. - internal/cluster/token.go — MintIngestorToken() tries the modern TokenRequest path first, falls back to a static service-account-token Secret on RBAC denial / older clusters / SA missing. Errors propagate verbatim on non-recoverable failures (network, context cancellation) so customers see the real problem instead of a misleading "static fallback also failed." - internal/cli/cluster.go — `tracebloc cluster info` command. Prints context, server, namespace, parent release info, SA + token state (with SHA256(token)[:8] instead of the raw bytes — token must never appear in scrollback). Exit codes 3 (kubeconfig issue) / 4 (no parent release) / 5 (token mint failed). Tests: - 5 new test files covering happy path, multi-release ambiguity, service name fallback, the sibling-deployment filter regression (mysql + requests-proxy + jobs-manager all share chart-level labels — discovery must pick jobs-manager by name suffix), TokenRequest happy path, static-secret fallback, non-recoverable error pass-through, and the combined-failure remediation message. Coverage: internal/cluster 83.8%, internal/cli 59.5% (cluster info itself is hard to unit-test without a real cluster — Phase 3+ adds integration tests against a kind cluster). Library footprint: brings in k8s.io/client-go + apimachinery + api (@v0.31.0) and sigs.k8s.io/yaml. Cross-compiled binaries grow from ~10MB to ~30MB; cost is acceptable for the customer-experience upside of "your kubeconfig is all you need." Closes tracebloc/client#150. Co-Authored-By: Claude Opus 4.7 (1M context) --- go.mod | 42 +++++- go.sum | 116 +++++++++++++- internal/cli/cluster.go | 184 +++++++++++++++++++++++ internal/cli/root.go | 1 + internal/cluster/discover.go | 200 +++++++++++++++++++++++++ internal/cluster/discover_test.go | 222 +++++++++++++++++++++++++++ internal/cluster/kubeconfig.go | 164 ++++++++++++++++++++ internal/cluster/kubeconfig_test.go | 84 +++++++++++ internal/cluster/token.go | 224 ++++++++++++++++++++++++++++ internal/cluster/token_test.go | 186 +++++++++++++++++++++++ 10 files changed, 1416 insertions(+), 7 deletions(-) create mode 100644 internal/cli/cluster.go create mode 100644 internal/cluster/discover.go create mode 100644 internal/cluster/discover_test.go create mode 100644 internal/cluster/kubeconfig.go create mode 100644 internal/cluster/kubeconfig_test.go create mode 100644 internal/cluster/token.go create mode 100644 internal/cluster/token_test.go diff --git a/go.mod b/go.mod index 8c5a9db..571ba04 100644 --- a/go.mod +++ b/go.mod @@ -1,15 +1,51 @@ module github.com/tracebloc/cli -go 1.22 +go 1.26.0 require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 github.com/spf13/cobra v1.8.1 - golang.org/x/text v0.16.0 + golang.org/x/text v0.33.0 gopkg.in/yaml.v3 v3.0.1 + k8s.io/api v0.36.1 + k8s.io/apimachinery v0.36.1 + k8s.io/client-go v0.36.1 ) require ( + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.49.0 // indirect + golang.org/x/oauth2 v0.34.0 // indirect + golang.org/x/sys v0.40.0 // indirect + golang.org/x/term v0.39.0 // indirect + golang.org/x/time v0.14.0 // indirect + google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/klog/v2 v2.140.0 // indirect + k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // indirect + k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 3cf027a..9a05a37 100644 --- a/go.sum +++ b/go.sum @@ -1,18 +1,126 @@ github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI= github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEVZGK7IN2kJkjTuQ= github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY= +golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww= +golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af h1:+5/Sw3GsDNlEmu7TfklWKPdQ0Ykja5VEmq2i817+jbI= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.36.1 h1:XbL/EMj8K2aJpJtePmqUyQMsM0D4QI2pvl7YKJ20FTY= +k8s.io/api v0.36.1/go.mod h1:KOWo4ey3TINlXjeHVuwB3i+tXXnu+UcwFBHlI/9dvEo= +k8s.io/apimachinery v0.36.1 h1:G63Gjx2W+q0YD+72Vo8oY0nDnePVwnuzTmmy5ENrVSA= +k8s.io/apimachinery v0.36.1/go.mod h1:ibYOR00vW/I1kzvi5SF0dRuJ52BvKtfvRdOn35GPQ+8= +k8s.io/client-go v0.36.1 h1:FN/K8QIT2CEDt+2WB2HnWrUANZ50AP5GII43/SP2JR0= +k8s.io/client-go v0.36.1/go.mod h1:s6rAnCtTGYDQnpNjEhSaISV+2O8jwruZ6m3QOYBFbtU= +k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= +k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a h1:xCeOEAOoGYl2jnJoHkC3hkbPJgdATINPMAxaynU2Ovg= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2 h1:kwVWMx5yS1CrnFWA/2QHyRVJ8jM6dBA80uLmm0wJkk8= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/cli/cluster.go b/internal/cli/cluster.go new file mode 100644 index 0000000..1955ff6 --- /dev/null +++ b/internal/cli/cluster.go @@ -0,0 +1,184 @@ +package cli + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "time" + + "github.com/spf13/cobra" + + "github.com/tracebloc/cli/internal/cluster" +) + +// newClusterCmd wires the `tracebloc cluster` subtree. Today it has +// a single verb — `info` — which is the customer's "is the CLI +// pointing at the right cluster?" pre-flight before running +// `dataset push`. Future verbs (e.g. `cluster doctor` for +// diagnostics, `cluster contexts` for switching) hang off this +// parent in later phases. +func newClusterCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "cluster", + Short: "Inspect the cluster the CLI is currently targeting", + Long: `Commands for inspecting the Kubernetes cluster the CLI is +configured to talk to. + +Use ` + "`cluster info`" + ` to verify which cluster, namespace, and parent +tracebloc release the next ` + "`dataset push`" + ` will target. Useful as a +pre-flight before doing anything destructive (e.g. ingesting into +the wrong cluster).`, + } + + cmd.AddCommand(newClusterInfoCmd()) + return cmd +} + +// newClusterInfoCmd implements `tracebloc cluster info`. Discovers +// the parent client release in the configured namespace, mints (or +// looks up) an ingestor SA token, and prints a single-screen +// summary the customer can sanity-check before running a real +// ingestion. +// +// Three kubeconfig flags follow kubectl conventions: +// +// --kubeconfig=PATH override KUBECONFIG / ~/.kube/config +// --context=NAME override the kubeconfig's current-context +// --namespace=NAME override the context's default namespace +// +// All three are zero-value-safe — running `tracebloc cluster info` +// with no flags uses the customer's normal kubectl defaults. +func newClusterInfoCmd() *cobra.Command { + var ( + kubeconfigPath string + contextOverride string + nsOverride string + tokenExpiry int64 + ) + + cmd := &cobra.Command{ + Use: "info", + Short: "Show the cluster, namespace, parent release, and ingestor SA token state", + Long: `Discovers the tracebloc parent client release in the configured +cluster + namespace and prints: + + • Which kubeconfig context the CLI used + • The namespace it resolved to + • The parent release name + chart version + appVersion + • The jobs-manager Service the next dataset push would POST to + • The ingestor ServiceAccount the post-install hook would auth as + • The cluster's configured INGESTOR_IMAGE_DIGEST default + • Whether the user's kubeconfig can mint short-lived SA tokens + via TokenRequest, or has to fall back to a static + service-account-token Secret + +The actual token bytes are never printed; the diagnostic shows +SHA256(token)[:8] so the customer can verify "yes, that's the +token I expect" without leaking it to terminal scrollback. + +Exit codes: + 0 cluster discovered + token mintable; CLI is ready + 4 cluster reachable but no tracebloc parent release found + 5 cluster reachable + release found but no usable SA token`, + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + return runClusterInfo( + cmd.Context(), + cmd.OutOrStdout(), + kubeconfigPath, contextOverride, nsOverride, + tokenExpiry, + ) + }, + } + + cmd.Flags().StringVar(&kubeconfigPath, "kubeconfig", "", + "path to the kubeconfig file (default: $KUBECONFIG, then ~/.kube/config)") + cmd.Flags().StringVar(&contextOverride, "context", "", + "name of the kubeconfig context to use (default: kubeconfig's current-context)") + cmd.Flags().StringVarP(&nsOverride, "namespace", "n", "", + "namespace where the parent tracebloc/client release is installed (default: the context's namespace, or 'default')") + cmd.Flags().Int64Var(&tokenExpiry, "token-expiry-seconds", 600, + "requested SA token expiration in seconds (default 600 = 10 min; ignored for static-secret fallback)") + + return cmd +} + +func runClusterInfo( + ctx context.Context, + out interface{ Write([]byte) (int, error) }, + kubeconfigPath, contextOverride, nsOverride string, + tokenExpiry int64, +) error { + resolved, err := cluster.Load(cluster.KubeconfigOptions{ + Path: kubeconfigPath, + Context: contextOverride, + Namespace: nsOverride, + }) + if err != nil { + // Kubeconfig errors are exit-code-3 territory (file/parse + // problem, same conceptual class as `ingest validate`'s + // unreadable-input). + return &exitError{code: 3, err: fmt.Errorf("loading kubeconfig: %w", err)} + } + + cs, err := cluster.NewClientset(resolved) + if err != nil { + return &exitError{code: 3, err: err} + } + + fmt.Fprintf(out, "Kubeconfig:\n") + fmt.Fprintf(out, " context: %s\n", resolved.Context) + fmt.Fprintf(out, " server: %s\n", resolved.ServerURL) + fmt.Fprintf(out, " namespace: %s\n", resolved.Namespace) + fmt.Fprintln(out) + + // Discover the parent release. + release, err := cluster.DiscoverParentRelease(ctx, cs, resolved.Namespace) + if err != nil { + // 4 = "cluster reachable, but no tracebloc release here." + // Distinct from the kubeconfig error (3) so callers can + // branch: 3 means "fix your kubeconfig", 4 means "install + // the parent chart first". + return &exitError{code: 4, err: err} + } + + fmt.Fprintf(out, "Parent release:\n") + fmt.Fprintf(out, " name: %s\n", release.ReleaseName) + fmt.Fprintf(out, " chart version: %s\n", release.ChartVersion) + fmt.Fprintf(out, " app version: %s\n", release.AppVersion) + fmt.Fprintf(out, " jobs-manager: %s\n", release.JobsManagerService) + fmt.Fprintf(out, " ingestor SA: %s/%s\n", resolved.Namespace, release.IngestorSAName) + digest := release.IngestorImageDigest + if digest == "" { + digest = "" + } + fmt.Fprintf(out, " ingestor img: %s\n", digest) + fmt.Fprintln(out) + + // Mint a token (or fall back). The audience is intentionally + // nil today — jobs-manager's TokenReview accepts the default + // API-server audience. A future jobs-manager audience plugs in + // here. + tok, err := cluster.MintIngestorToken(ctx, cs, resolved.Namespace, release.IngestorSAName, tokenExpiry, nil) + if err != nil { + // 5 = "release found but no usable token." Distinct from + // 4 (no release) so customers can RBAC-debug separately + // from install issues. + return &exitError{code: 5, err: err} + } + + hash := sha256.Sum256([]byte(tok.Token)) + fmt.Fprintf(out, "Ingestor SA token:\n") + fmt.Fprintf(out, " source: %s\n", tok.Source) + fmt.Fprintf(out, " sha256[:8]: %s\n", hex.EncodeToString(hash[:8])) + if tok.ExpirationSeconds > 0 { + exp := time.Duration(tok.ExpirationSeconds) * time.Second + fmt.Fprintf(out, " expires in: ~%s (server may cap shorter)\n", exp) + } else { + fmt.Fprintf(out, " expires in: never (static-secret fallback)\n") + } + fmt.Fprintln(out) + fmt.Fprintln(out, "Ready for `tracebloc dataset push` (coming in Phase 3).") + return nil +} diff --git a/internal/cli/root.go b/internal/cli/root.go index 5fcf634..0d790ae 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -65,6 +65,7 @@ roadmap. Subsequent phases land subcommands incrementally.`, // Subcommands. New phases append here. root.AddCommand(newVersionCmd(info)) root.AddCommand(newIngestCmd()) + root.AddCommand(newClusterCmd()) return root } diff --git a/internal/cluster/discover.go b/internal/cluster/discover.go new file mode 100644 index 0000000..ee90aee --- /dev/null +++ b/internal/cluster/discover.go @@ -0,0 +1,200 @@ +package cluster + +import ( + "context" + "fmt" + "strings" + + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// ParentRelease describes the tracebloc parent client chart release +// discovered in the customer's cluster. The information comes from +// the helm-managed Deployment's labels (and a few of its env vars), +// not from the helm release secret — parsing those secrets requires +// either pulling in helm.sh/helm/v3 (massive dep) or re-implementing +// the gzip+base64+JSON unwrap. The chart's Deployment labels carry +// everything we actually need, so we use those instead. +type ParentRelease struct { + // ReleaseName is the helm release name. The chart's + // jobs-manager Deployment is named "-jobs-manager" by + // convention and labelled `app.kubernetes.io/instance=`. + ReleaseName string + + // ChartVersion is the parent chart's version, e.g. "1.3.5". + // Comes from the `helm.sh/chart=client-1.3.5` label. + ChartVersion string + + // AppVersion mirrors Chart.yaml's appVersion. Useful for + // detecting whether the customer is on a chart that supports + // the declarative ingestor flow at all. + AppVersion string + + // JobsManagerService is the in-cluster DNS name of the + // jobs-manager Service, e.g. + // "-jobs-manager..svc.cluster.local:8080". + // Used as the POST target for ingestion submissions. + JobsManagerService string + + // IngestorSAName is the name of the ServiceAccount the chart's + // hook pods run as. Defaults to "ingestor" but customers can + // override; the value comes from jobs-manager's environment. + IngestorSAName string + + // IngestorImageDigest is the canonical digest the cluster's + // chart will spawn ingestor Jobs with. Comes from + // `INGESTOR_IMAGE_DIGEST` env on jobs-manager. Empty when the + // admin hasn't set images.ingestor.digest yet. + IngestorImageDigest string +} + +// DiscoverParentRelease finds the tracebloc parent client chart +// release in the given namespace by looking for a Deployment with +// the chart's hallmark labels. Returns a friendly error if none +// found, or if multiple candidates exist (so customers can pick). +// +// Why labels on a Deployment, not Helm's release secrets: +// +// - The release-secret path needs gzip + base64 + JSON parsing +// (helm v3's storage format), or pulls in helm.sh/helm/v3 as +// a dep (which transitively pulls in ~80MB of Go modules, +// including kube-runtime). +// - The Deployment is what the chart actually deploys; if it's +// not there, the chart isn't installed (or the install +// failed). Using its labels for discovery means "the +// discovered release is the running release" by construction. +// - The chart's labels are part of its public contract via +// `_helpers.tpl`; they're stable across minor versions. +// +// The chart's labels share `app.kubernetes.io/name=client` across +// EVERY resource it creates (mysql-client, jobs-manager, +// requests-proxy, etc.) — that's the helm convention where the +// chart's name is the label, not the component's. To distinguish +// jobs-manager from its siblings we filter by Deployment name +// suffix matching the chart's naming convention +// "-jobs-manager" (or plain "jobs-manager" for unprefixed +// installs). +func DiscoverParentRelease(ctx context.Context, cs kubernetes.Interface, namespace string) (*ParentRelease, error) { + deps, err := cs.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/name=client,app.kubernetes.io/managed-by=Helm", + }) + if err != nil { + return nil, fmt.Errorf("listing chart-managed deployments in namespace %s: %w", namespace, err) + } + + // Filter to just the jobs-manager Deployment(s). The chart pins + // the name to either "-jobs-manager" or "jobs-manager" + // depending on chart version + values; either way it ends in + // "-jobs-manager" or equals "jobs-manager". + var jmDeps []appsv1.Deployment + for _, d := range deps.Items { + if d.Name == "jobs-manager" || strings.HasSuffix(d.Name, "-jobs-manager") { + jmDeps = append(jmDeps, d) + } + } + + switch len(jmDeps) { + case 0: + return nil, fmt.Errorf( + "no tracebloc parent client release found in namespace %q "+ + "(no chart-managed Deployment named *-jobs-manager). "+ + "Install with `helm install tracebloc/client --namespace %s` first, "+ + "or pass --namespace to point at the namespace where it's running.", + namespace, namespace, + ) + case 1: + // happy path + default: + names := make([]string, 0, len(jmDeps)) + for _, d := range jmDeps { + names = append(names, d.Name) + } + return nil, fmt.Errorf( + "found %d tracebloc parent releases in namespace %q (%s); "+ + "this CLI doesn't yet support disambiguating between multiple. "+ + "Pass --namespace to target a namespace with exactly one release.", + len(jmDeps), namespace, strings.Join(names, ", "), + ) + } + + d := jmDeps[0] + release := &ParentRelease{ + ReleaseName: d.Labels["app.kubernetes.io/instance"], + ChartVersion: chartVersionFromLabel(d.Labels["helm.sh/chart"]), + AppVersion: d.Labels["app.kubernetes.io/version"], + } + + // The Service shares the Deployment's release labels by + // convention. Construct the FQDN — we don't need to actually + // hit the API to find it; the chart's helper templates pin the + // service name to "-jobs-manager" (or just + // "jobs-manager" for older chart versions where the release + // prefix wasn't included). + // + // We probe both names to be liberal: if the chart pinned just + // "jobs-manager", that wins; otherwise fall back to the + // release-prefixed form. Customers can always override via the + // ingestor subchart's `jobsManager.endpoint` value. + svc := pickJobsManagerService(ctx, cs, namespace, release.ReleaseName) + release.JobsManagerService = fmt.Sprintf("http://%s.%s.svc.cluster.local:8080", svc, namespace) + + // Read INGESTOR_IMAGE_DIGEST + ingestor SA name from the + // Deployment's pod-spec env. Both are populated by the chart's + // values; defaults are "ingestor" SA + empty digest. + release.IngestorSAName = "ingestor" // chart default + if len(d.Spec.Template.Spec.Containers) > 0 { + for _, env := range d.Spec.Template.Spec.Containers[0].Env { + switch env.Name { + case "INGESTOR_IMAGE_DIGEST": + release.IngestorImageDigest = env.Value + } + } + } + + return release, nil +} + +// pickJobsManagerService probes for the chart's jobs-manager +// Service. The chart's helper templates have used both names over +// chart history: +// +// - "jobs-manager" (chart 1.3.x and earlier) +// - "-jobs-manager" (some prefixing variants) +// +// We try the unprefixed name first because that's the dominant +// shipped behavior; fall back if it doesn't exist. Probe failures +// (timeouts, RBAC denials) fall through to the prefixed name too — +// the customer gets a clear DNS error later if neither resolves, +// which is more actionable than "couldn't enumerate services." +func pickJobsManagerService(ctx context.Context, cs kubernetes.Interface, namespace, release string) string { + candidates := []string{ + "jobs-manager", + release + "-jobs-manager", + } + for _, name := range candidates { + _, err := cs.CoreV1().Services(namespace).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + return name + } + } + // Last-ditch: return the unprefixed candidate; the caller will + // hit a DNS error at POST time which surfaces with a clearer + // message than us guessing. + return "jobs-manager" +} + +// chartVersionFromLabel extracts "1.3.5" from helm.sh/chart="client-1.3.5". +// Labels in this format are standard Helm output; we strip the +// chart-name prefix to expose just the version. Returns the raw +// label if it doesn't match the expected pattern (defensive +// fallback for unusual chart-name formats). +func chartVersionFromLabel(label string) string { + // Expected: "-"; strip the "client-" prefix. + const prefix = "client-" + if strings.HasPrefix(label, prefix) { + return label[len(prefix):] + } + return label +} diff --git a/internal/cluster/discover_test.go b/internal/cluster/discover_test.go new file mode 100644 index 0000000..34f4433 --- /dev/null +++ b/internal/cluster/discover_test.go @@ -0,0 +1,222 @@ +package cluster + +import ( + "context" + "strings" + "testing" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// jobsManagerDeployment builds the minimal Deployment the chart +// creates for jobs-manager — labels mirror the chart's +// _helpers.tpl output as of client 1.3.5. Tests use this to seed +// the fake clientset with realistic data; if the chart's label +// contract ever changes, these tests fail loudly and we update both +// here and the discovery logic in lockstep. +// +// Note: app.kubernetes.io/name is "client" (the chart name) on +// EVERY resource the chart creates, not "jobs-manager". This is +// the helm convention. The discovery logic distinguishes +// jobs-manager from its mysql / requests-proxy siblings by name +// suffix matching ("-jobs-manager") since the chart-level labels +// don't disambiguate. +func jobsManagerDeployment(release, namespace, chartLabel, appVersion, digest string) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: release + "-jobs-manager", + Namespace: namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "client", + "app.kubernetes.io/instance": release, + "app.kubernetes.io/managed-by": "Helm", + "app.kubernetes.io/version": appVersion, + "helm.sh/chart": chartLabel, + }, + }, + Spec: appsv1.DeploymentSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "api", + Env: []corev1.EnvVar{ + {Name: "INGESTOR_IMAGE_DIGEST", Value: digest}, + }, + }}, + }, + }, + }, + } +} + +func jobsManagerService(name, namespace string) *corev1.Service { + return &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + } +} + +func TestDiscoverParentRelease_HappyPath(t *testing.T) { + const ns = "tracebloc" + cs := fake.NewClientset( + jobsManagerDeployment("tracebloc", ns, + "client-1.3.5", "1.3.5", + "sha256:463e236748708a5e3564569eec9173ea8cb3bcf515992d4939c5b610f3807a4a"), + jobsManagerService("jobs-manager", ns), + ) + + release, err := DiscoverParentRelease(context.Background(), cs, ns) + if err != nil { + t.Fatalf("DiscoverParentRelease: %v", err) + } + + want := ParentRelease{ + ReleaseName: "tracebloc", + ChartVersion: "1.3.5", + AppVersion: "1.3.5", + JobsManagerService: "http://jobs-manager." + ns + ".svc.cluster.local:8080", + IngestorSAName: "ingestor", + IngestorImageDigest: "sha256:463e236748708a5e3564569eec9173ea8cb3bcf515992d4939c5b610f3807a4a", + } + if *release != want { + t.Errorf("mismatch.\ngot: %+v\nwant: %+v", *release, want) + } +} + +func TestDiscoverParentRelease_NoReleaseFound(t *testing.T) { + cs := fake.NewClientset() // empty cluster + + _, err := DiscoverParentRelease(context.Background(), cs, "tracebloc") + if err == nil { + t.Fatal("expected error when no jobs-manager deployment exists") + } + // The error message has to be customer-actionable. Pin the + // key remediation phrase so a future refactor that loses it + // (or worse, replaces it with a stack trace) fails this test. + for _, want := range []string{"no tracebloc parent client release found", "helm install"} { + if !strings.Contains(err.Error(), want) { + t.Errorf("expected error to mention %q, got: %s", want, err) + } + } +} + +func TestDiscoverParentRelease_MultipleReleases(t *testing.T) { + const ns = "tracebloc" + cs := fake.NewClientset( + jobsManagerDeployment("rel-a", ns, "client-1.3.5", "1.3.5", ""), + jobsManagerDeployment("rel-b", ns, "client-1.3.4", "1.3.4", ""), + ) + + _, err := DiscoverParentRelease(context.Background(), cs, ns) + if err == nil { + t.Fatal("expected error when multiple releases exist") + } + // Customer needs to see which releases the CLI found so they + // can pick (or clean up). + for _, want := range []string{"found 2", "rel-a", "rel-b"} { + if !strings.Contains(err.Error(), want) { + t.Errorf("expected error to mention %q, got: %s", want, err) + } + } +} + +// Regression for the real-cluster discovery bug. Pre-fix, the +// selector was `app.kubernetes.io/name=jobs-manager` which never +// matches because the chart's convention is `name=` +// — so the real-cluster smoke test returned "no parent release +// found" even though one was clearly running. The fix is two-part: +// match on `name=client` (the chart name) AND filter the result +// set by Deployment-name suffix to pick out jobs-manager from its +// mysql/requests-proxy siblings, all of which share the +// chart-level labels. +func TestDiscoverParentRelease_FiltersOutSiblingDeployments(t *testing.T) { + const ns = "tracebloc" + // All three deployments share the chart-level labels — only + // the name suffix distinguishes them. + mysqlClient := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mysql-client", + Namespace: ns, + Labels: map[string]string{ + "app.kubernetes.io/name": "client", + "app.kubernetes.io/instance": "tracebloc", + "app.kubernetes.io/managed-by": "Helm", + "helm.sh/chart": "client-1.3.5", + }, + }, + } + requestsProxy := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "tracebloc-requests-proxy", + Namespace: ns, + Labels: map[string]string{ + "app.kubernetes.io/name": "client", + "app.kubernetes.io/instance": "tracebloc", + "app.kubernetes.io/managed-by": "Helm", + "helm.sh/chart": "client-1.3.5", + }, + }, + } + + cs := fake.NewClientset( + mysqlClient, + requestsProxy, + jobsManagerDeployment("tracebloc", ns, "client-1.3.5", "1.3.5", ""), + jobsManagerService("jobs-manager", ns), + ) + + release, err := DiscoverParentRelease(context.Background(), cs, ns) + if err != nil { + t.Fatalf("DiscoverParentRelease: %v", err) + } + // Should pick jobs-manager, NOT mysql-client or requests-proxy. + if release.ReleaseName != "tracebloc" { + t.Errorf("ReleaseName = %q, want %q (siblings should be filtered out by name suffix)", + release.ReleaseName, "tracebloc") + } +} + +func TestDiscoverParentRelease_FallsBackToReleasePrefixedService(t *testing.T) { + // Some older chart versions exposed the Service as + // "-jobs-manager" rather than the unprefixed + // "jobs-manager". The discover code probes both names and + // picks whichever resolves. This test seeds ONLY the + // release-prefixed Service and asserts the FQDN reflects it. + const ns = "tracebloc" + cs := fake.NewClientset( + jobsManagerDeployment("custom", ns, "client-1.3.5", "1.3.5", ""), + jobsManagerService("custom-jobs-manager", ns), + // NOTE: no "jobs-manager" service + ) + + release, err := DiscoverParentRelease(context.Background(), cs, ns) + if err != nil { + t.Fatalf("DiscoverParentRelease: %v", err) + } + wantFQDN := "http://custom-jobs-manager." + ns + ".svc.cluster.local:8080" + if release.JobsManagerService != wantFQDN { + t.Errorf("JobsManagerService = %q, want %q", release.JobsManagerService, wantFQDN) + } +} + +func TestChartVersionFromLabel(t *testing.T) { + cases := map[string]string{ + "client-1.3.5": "1.3.5", + "client-2.0.0-rc": "2.0.0-rc", + "client": "client", // unexpected shape — return as-is + "other-1.0.0": "other-1.0.0", // non-client chart — return as-is + "": "", + } + for in, want := range cases { + t.Run(in, func(t *testing.T) { + if got := chartVersionFromLabel(in); got != want { + t.Errorf("chartVersionFromLabel(%q) = %q, want %q", in, got, want) + } + }) + } +} diff --git a/internal/cluster/kubeconfig.go b/internal/cluster/kubeconfig.go new file mode 100644 index 0000000..2a94380 --- /dev/null +++ b/internal/cluster/kubeconfig.go @@ -0,0 +1,164 @@ +// Package cluster handles everything the CLI needs to talk to a +// Kubernetes cluster running the tracebloc parent client chart: +// +// - Locating the customer's kubeconfig + picking a context/namespace +// - Discovering the parent release (jobs-manager Deployment + its +// labels) +// - Minting an ingestor ServiceAccount token via TokenRequest +// +// Everything in this package is exercise-able without a real cluster +// thanks to client-go's fake clientset; see *_test.go. +package cluster + +import ( + "fmt" + "os" + "path/filepath" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" +) + +// KubeconfigOptions captures every knob the customer can turn when +// telling the CLI which cluster + namespace to talk to. Defaults +// mirror kubectl: $KUBECONFIG > ~/.kube/config; current-context if +// --context unset; the context's default namespace if --namespace +// unset; "default" if even the context doesn't pin one. +// +// All fields are zero-value-safe — an empty Options{} produces the +// same behaviour as kubectl run with no flags. +type KubeconfigOptions struct { + // Path overrides $KUBECONFIG and ~/.kube/config when set. + // Honors path expansion (e.g. "~/.kube/config" → absolute). + Path string + + // Context picks a non-current-context from the loaded + // kubeconfig. Empty = use whatever current-context points at. + Context string + + // Namespace overrides the context's default. Empty = pull from + // the context's namespace, falling back to "default". + Namespace string +} + +// ResolvedConfig is what Load() returns: a usable rest.Config plus +// the source-of-truth metadata that the rest of the CLI needs (the +// namespace it resolved to, the context name it used). The +// metadata is separate from rest.Config because rest.Config doesn't +// carry the context name out of the box — it's discarded after the +// merge — and we want to show the customer "you're talking to +// context X" in diagnostic output. +type ResolvedConfig struct { + RestConfig *rest.Config + Context string + Namespace string + + // ServerURL is the cluster's API server URL (the value from + // the context's cluster.server field). Exposed for `tracebloc + // cluster info` to show "you're targeting " so the + // customer can sanity-check before doing anything destructive. + ServerURL string +} + +// Load resolves the kubeconfig + flags into a rest.Config ready for +// kubernetes.NewForConfig(). It does NOT make any network calls — +// pure parsing of local files + env. Network errors only happen +// when the caller subsequently uses the returned RestConfig. +func Load(opts KubeconfigOptions) (*ResolvedConfig, error) { + // NewDefaultClientConfigLoadingRules() wires up the full + // kubectl-style lookup chain: ExplicitPath > $KUBECONFIG + // (multi-file merging) > ~/.kube/config. Constructing + // ClientConfigLoadingRules{} directly with an empty + // ExplicitPath does NOT fall back to those defaults — it + // just refuses to load anything ("no configuration has been + // provided" was the symptom). Surfaced during the first + // real-cluster smoke test of `tracebloc cluster info`. + loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() + if explicit := expandPath(opts.Path); explicit != "" { + // Setting ExplicitPath bypasses the env-var merge + // (intentional — when a customer says "use THIS file", + // we don't mix it with their env-var-listed others). + loadingRules.ExplicitPath = explicit + } + + overrides := &clientcmd.ConfigOverrides{} + if opts.Context != "" { + overrides.CurrentContext = opts.Context + } + if opts.Namespace != "" { + // Setting Context.Namespace overrides the kubeconfig's + // per-context default. This matches `kubectl --namespace` + // semantics. + overrides.Context.Namespace = opts.Namespace + } + + loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, overrides) + + restConfig, err := loader.ClientConfig() + if err != nil { + return nil, fmt.Errorf("building rest config from kubeconfig: %w", err) + } + + // Pull the namespace the way kubectl does: explicit override + // wins; otherwise the context's namespace; otherwise "default". + // loader.Namespace() handles that fallback chain. + ns, _, err := loader.Namespace() + if err != nil { + return nil, fmt.Errorf("resolving namespace from kubeconfig: %w", err) + } + + // rest.Config doesn't carry the context name out of the box — + // fish it back out of the raw kubeconfig so diagnostics can + // show it. RawConfig() does a fresh read each call; cache if + // this ever becomes hot (it isn't). + rawCfg, err := loader.RawConfig() + if err != nil { + return nil, fmt.Errorf("reading raw kubeconfig: %w", err) + } + contextName := overrides.CurrentContext + if contextName == "" { + contextName = rawCfg.CurrentContext + } + + return &ResolvedConfig{ + RestConfig: restConfig, + Context: contextName, + Namespace: ns, + ServerURL: restConfig.Host, + }, nil +} + +// NewClientset is a tiny wrapper that constructs a kubernetes +// clientset from a ResolvedConfig. Exists so callers don't have to +// import k8s.io/client-go/kubernetes themselves for the common case +// of "I have a ResolvedConfig, I want a clientset." +func NewClientset(rc *ResolvedConfig) (kubernetes.Interface, error) { + cs, err := kubernetes.NewForConfig(rc.RestConfig) + if err != nil { + return nil, fmt.Errorf("constructing kubernetes clientset: %w", err) + } + return cs, nil +} + +// expandPath handles `~/.kube/config` → `/home/user/.kube/config` +// since clientcmd's ExplicitPath wants an absolute path. Empty +// strings pass through unchanged (they signal "use defaults" to +// clientcmd). +func expandPath(p string) string { + if p == "" { + return "" + } + if p[0] != '~' { + return p + } + home, err := os.UserHomeDir() + if err != nil { + // Best-effort: return the unexpanded path and let + // clientcmd's error surface mention it. Failing here would + // hide the more useful "tried to read ~/.kube/config and + // got X" error downstream. + return p + } + return filepath.Join(home, p[1:]) +} diff --git a/internal/cluster/kubeconfig_test.go b/internal/cluster/kubeconfig_test.go new file mode 100644 index 0000000..39709e2 --- /dev/null +++ b/internal/cluster/kubeconfig_test.go @@ -0,0 +1,84 @@ +package cluster + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestExpandPath(t *testing.T) { + home, err := os.UserHomeDir() + if err != nil { + t.Skipf("os.UserHomeDir failed in this environment, skipping: %v", err) + } + + cases := []struct { + in string + wantFunc func(string) bool + }{ + { + in: "", + wantFunc: func(got string) bool { return got == "" }, + }, + { + in: "/absolute/path/to/kubeconfig", + wantFunc: func(got string) bool { return got == "/absolute/path/to/kubeconfig" }, + }, + { + in: "relative/path/kubeconfig", + wantFunc: func(got string) bool { return got == "relative/path/kubeconfig" }, + }, + { + in: "~/.kube/config", + wantFunc: func(got string) bool { + want := filepath.Join(home, ".kube/config") + return got == want + }, + }, + { + in: "~/some/file", + wantFunc: func(got string) bool { + want := filepath.Join(home, "some/file") + return got == want + }, + }, + } + + for _, c := range cases { + t.Run(c.in, func(t *testing.T) { + got := expandPath(c.in) + if !c.wantFunc(got) { + t.Errorf("expandPath(%q) = %q (failed predicate)", c.in, got) + } + }) + } +} + +// Load() involves real filesystem + clientcmd parsing; it's covered +// indirectly by the cluster-info integration path. A unit test +// would require writing a temp kubeconfig and feeding it back — +// useful but lower-priority than the discover/token tests since +// the clientcmd library is well-trodden. Pin the smallest contract: +// an empty Options{} doesn't panic. +func TestLoad_EmptyOptionsDoesNotPanic(t *testing.T) { + // May succeed or fail depending on whether the test runner has + // a kubeconfig available; either is fine. The point of this + // test is to ensure the function returns rather than crashes. + _, err := Load(KubeconfigOptions{}) + if err != nil && !looksLikeKubeconfigError(err.Error()) { + // Defense-in-depth: if Load() ever starts wrapping + // errors in a way that loses the "kubeconfig" context, + // flag it so the customer-facing diagnostic stays clear. + t.Logf("Load() failed with unfamiliar error: %v", err) + } +} + +func looksLikeKubeconfigError(s string) bool { + for _, marker := range []string{"kubeconfig", "config", "context", "namespace"} { + if strings.Contains(strings.ToLower(s), marker) { + return true + } + } + return false +} diff --git a/internal/cluster/token.go b/internal/cluster/token.go new file mode 100644 index 0000000..f796927 --- /dev/null +++ b/internal/cluster/token.go @@ -0,0 +1,224 @@ +package cluster + +import ( + "context" + "errors" + "fmt" + + authenticationv1 "k8s.io/api/authentication/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// IngestorToken is what the customer (well, the CLI) authenticates +// to jobs-manager with. Bearer-token over HTTPS to +// /internal/submit-ingestion-run. The token is short-lived (10 +// minutes default) so it can be regenerated per `tracebloc dataset +// push` invocation without long-term-credential concerns. +type IngestorToken struct { + // Token is the raw bearer token string. Treat as sensitive; + // don't log it. Diagnostics print SHA256(Token)[:8] instead. + Token string + + // ExpirationSeconds matches the request — actual server-side + // expiration may be capped by cluster policy + // (--service-account-max-token-expiration on kube-apiserver). + // We don't try to parse the JWT to read its `exp` claim + // because the customer's diagnostic ("token expires in ~N + // min") is accurate enough using the requested value. + ExpirationSeconds int64 + + // Source records how the token was obtained — TokenRequest + // (the modern path) or a static secret (the fallback for + // clusters where the user can't call TokenRequest). Surfaced + // in `tracebloc cluster info` so the customer can see which + // path their RBAC allowed. + Source TokenSource +} + +// TokenSource enumerates the two ways the CLI can obtain an +// ingestor SA token. Adding a third (e.g. a customer-managed +// long-lived secret reference) is a v0.2 extension. +type TokenSource int + +const ( + // TokenSourceTokenRequest means we called the + // /serviceaccounts/{name}/token subresource. The modern, + // rotation-friendly path. Requires `create` permission on + // "serviceaccounts/token" for the user's kubeconfig. + TokenSourceTokenRequest TokenSource = iota + + // TokenSourceStaticSecret means we fell back to reading a + // pre-existing Secret of type + // kubernetes.io/service-account-token that references the + // ingestor SA. Older k8s (pre-1.24 default) auto-created + // these; on 1.24+ admins must create them by hand. Tokens + // from this source are long-lived (no expiration), which is + // less ideal but unblocks customers whose RBAC denies + // TokenRequest. + TokenSourceStaticSecret +) + +func (s TokenSource) String() string { + switch s { + case TokenSourceTokenRequest: + return "TokenRequest" + case TokenSourceStaticSecret: + return "static secret" + default: + return "unknown" + } +} + +// MintIngestorToken obtains a bearer token for the ingestor +// ServiceAccount in the given namespace. Tries TokenRequest first; +// if that's denied (RBAC) or unavailable (older API server) falls +// back to looking for a pre-existing service-account-token Secret. +// Returns an error with a clear remediation message if neither +// works. +// +// audiences may be nil/empty for the default API-server audience. +// Today jobs-manager's TokenReview-based validation accepts the +// default audience, so callers usually pass nil; future audiences +// (e.g. a forthcoming jobs-manager-specific audience) plug in here +// without API breakage. +func MintIngestorToken( + ctx context.Context, + cs kubernetes.Interface, + namespace, saName string, + expirationSeconds int64, + audiences []string, +) (*IngestorToken, error) { + // Try TokenRequest first. + req := &authenticationv1.TokenRequest{ + Spec: authenticationv1.TokenRequestSpec{ + ExpirationSeconds: &expirationSeconds, + Audiences: audiences, + }, + } + tr, err := cs.CoreV1().ServiceAccounts(namespace). + CreateToken(ctx, saName, req, metav1.CreateOptions{}) + + if err == nil { + return &IngestorToken{ + Token: tr.Status.Token, + ExpirationSeconds: expirationSeconds, + Source: TokenSourceTokenRequest, + }, nil + } + + // TokenRequest didn't work. Distinguish the cases we can fall + // back from (permission denied / not-found / unsupported) + // from cases we can't (e.g. context cancelled, network down). + if !isTokenRequestRecoverable(err) { + return nil, fmt.Errorf( + "minting token for ServiceAccount %s/%s via TokenRequest: %w", + namespace, saName, err, + ) + } + + // Fallback: look for a Secret of type + // kubernetes.io/service-account-token in the same namespace + // whose annotations point at our SA. + tok, fallbackErr := findStaticTokenSecret(ctx, cs, namespace, saName) + if fallbackErr != nil { + // Surface BOTH the original TokenRequest failure and the + // fallback failure — they give the customer different + // remediation options. + return nil, fmt.Errorf( + "no usable ingestor token. TokenRequest failed: %v. "+ + "Fallback to static secret also failed: %w. "+ + "Remediation: either grant your user the `create` verb on "+ + "`serviceaccounts/token` (RBAC), or have an admin create a "+ + "long-lived Secret of type kubernetes.io/service-account-token "+ + "that references the %s ServiceAccount in namespace %s.", + err, fallbackErr, saName, namespace, + ) + } + return tok, nil +} + +// isTokenRequestRecoverable reports whether a failed TokenRequest +// call is one we can fall back from (permission denied, API not +// available, SA missing) vs one we can't (context cancelled, +// network errors, etc.). +// +// Falling back on a network error would mask the network problem; +// the customer would see a less useful "static secret not found" +// instead of "couldn't reach the API server." +func isTokenRequestRecoverable(err error) bool { + if err == nil { + return false + } + // k8s.io/apimachinery exposes typed checks. Permission denied + // (StatusForbidden), API not available (StatusMethodNotAllowed + // — pre-1.22 clusters), SA not found (StatusNotFound). + switch { + case isForbidden(err), isNotFound(err), isMethodNotSupported(err): + return true + } + return false +} + +func isForbidden(err error) bool { return statusCode(err) == 403 } +func isNotFound(err error) bool { return statusCode(err) == 404 } +func isMethodNotSupported(err error) bool { return statusCode(err) == 405 } + +// statusCode extracts the HTTP status from a k8s API error. Returns +// 0 if the error doesn't carry one (network error, marshalling +// error, etc.). +func statusCode(err error) int { + type statusError interface { + Status() metav1.Status + } + var se statusError + if errors.As(err, &se) { + return int(se.Status().Code) + } + return 0 +} + +// findStaticTokenSecret looks for a long-lived +// service-account-token Secret bound to the given SA. The legacy +// pre-k8s-1.24 way of authenticating an SA: kubectl auto-created +// these on SA creation. On modern clusters admins create them by +// hand when needed. +func findStaticTokenSecret(ctx context.Context, cs kubernetes.Interface, namespace, saName string) (*IngestorToken, error) { + secrets, err := cs.CoreV1().Secrets(namespace).List(ctx, metav1.ListOptions{ + FieldSelector: "type=kubernetes.io/service-account-token", + }) + if err != nil { + // If the user can't even list Secrets, surface that + // directly — they may be missing more RBAC than just + // TokenRequest, and "static secret fallback failed" is + // less informative than the underlying error. + return nil, fmt.Errorf("listing service-account-token secrets in %s: %w", namespace, err) + } + + for _, s := range secrets.Items { + // The Secret is bound to an SA via the + // `kubernetes.io/service-account.name` annotation. (The + // `.uid` annotation pins it to that exact SA instance — + // we don't check that because we want any historically + // created token-secret for the named SA to work.) + if s.Annotations[corev1.ServiceAccountNameKey] != saName { + continue + } + token, ok := s.Data["token"] + if !ok || len(token) == 0 { + continue // empty token field — skip + } + return &IngestorToken{ + Token: string(token), + ExpirationSeconds: 0, // static secrets don't expire client-side + Source: TokenSourceStaticSecret, + }, nil + } + + return nil, fmt.Errorf( + "no Secret of type kubernetes.io/service-account-token bound to "+ + "ServiceAccount %s found in namespace %s", + saName, namespace, + ) +} diff --git a/internal/cluster/token_test.go b/internal/cluster/token_test.go new file mode 100644 index 0000000..64e1c60 --- /dev/null +++ b/internal/cluster/token_test.go @@ -0,0 +1,186 @@ +package cluster + +import ( + "context" + "errors" + "strings" + "testing" + + authenticationv1 "k8s.io/api/authentication/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + k8stesting "k8s.io/client-go/testing" +) + +func TestMintIngestorToken_TokenRequest_HappyPath(t *testing.T) { + const ns = "tracebloc" + cs := fake.NewClientset(&corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: "ingestor", Namespace: ns}, + }) + + // The fake clientset doesn't implement the TokenRequest + // subresource by default — we have to inject a reactor. This + // mirrors what the real API server does: returns a stamped + // Status.Token. + cs.PrependReactor("create", "serviceaccounts", + func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + ca, ok := action.(k8stesting.CreateAction) + if !ok { + return false, nil, nil + } + if ca.GetSubresource() != "token" { + return false, nil, nil + } + tr := ca.GetObject().(*authenticationv1.TokenRequest) + tr.Status.Token = "fake-token-deadbeef" + return true, tr, nil + }) + + tok, err := MintIngestorToken(context.Background(), cs, ns, "ingestor", 600, nil) + if err != nil { + t.Fatalf("MintIngestorToken: %v", err) + } + if tok.Token != "fake-token-deadbeef" { + t.Errorf("Token = %q, want %q", tok.Token, "fake-token-deadbeef") + } + if tok.Source != TokenSourceTokenRequest { + t.Errorf("Source = %v, want TokenSourceTokenRequest", tok.Source) + } + if tok.ExpirationSeconds != 600 { + t.Errorf("ExpirationSeconds = %d, want 600", tok.ExpirationSeconds) + } +} + +func TestMintIngestorToken_FallsBackToStaticSecret(t *testing.T) { + const ns = "tracebloc" + + // Pre-seed a service-account-token Secret bound to the + // ingestor SA. This is what older clusters auto-created on SA + // creation; modern clusters require admins to create it by + // hand. + staticSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ingestor-token-abc123", + Namespace: ns, + Annotations: map[string]string{ + corev1.ServiceAccountNameKey: "ingestor", + }, + }, + Type: corev1.SecretTypeServiceAccountToken, + Data: map[string][]byte{ + "token": []byte("static-token-cafebabe"), + }, + } + cs := fake.NewClientset( + &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: "ingestor", Namespace: ns}, + }, + staticSecret, + ) + + // Make TokenRequest fail with a recoverable error + // (Forbidden = the user lacks RBAC, which is the dominant + // reason customers fall through to the static-secret path). + cs.PrependReactor("create", "serviceaccounts", + func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + if ca, ok := action.(k8stesting.CreateAction); ok && ca.GetSubresource() == "token" { + return true, nil, apierrors.NewForbidden( + corev1.Resource("serviceaccounts/token"), + "ingestor", + errors.New("user cannot create serviceaccounts/token"), + ) + } + return false, nil, nil + }) + + tok, err := MintIngestorToken(context.Background(), cs, ns, "ingestor", 600, nil) + if err != nil { + t.Fatalf("MintIngestorToken: %v", err) + } + if tok.Token != "static-token-cafebabe" { + t.Errorf("Token = %q, want fallback static-token-cafebabe", tok.Token) + } + if tok.Source != TokenSourceStaticSecret { + t.Errorf("Source = %v, want TokenSourceStaticSecret", tok.Source) + } + if tok.ExpirationSeconds != 0 { + t.Errorf("ExpirationSeconds = %d, want 0 (static secrets don't expire client-side)", tok.ExpirationSeconds) + } +} + +func TestMintIngestorToken_NoFallbackOnUnrecoverableError(t *testing.T) { + // Network errors (or any non-403/404/405 error) shouldn't + // trigger the static-secret fallback — they'd mask the real + // underlying problem. Pin that behavior. + const ns = "tracebloc" + cs := fake.NewClientset() + + cs.PrependReactor("create", "serviceaccounts", + func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + if ca, ok := action.(k8stesting.CreateAction); ok && ca.GetSubresource() == "token" { + return true, nil, errors.New("simulated network failure: i/o timeout") + } + return false, nil, nil + }) + + _, err := MintIngestorToken(context.Background(), cs, ns, "ingestor", 600, nil) + if err == nil { + t.Fatal("expected non-recoverable error to propagate, got nil") + } + if !strings.Contains(err.Error(), "i/o timeout") { + t.Errorf("expected the network-error message to surface, got: %v", err) + } +} + +func TestMintIngestorToken_BothPathsFailWithRemediation(t *testing.T) { + // Worst case: TokenRequest forbidden AND no static secret. The + // error must surface both failures plus an actionable + // remediation hint. + const ns = "tracebloc" + cs := fake.NewClientset(&corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: "ingestor", Namespace: ns}, + }) + + cs.PrependReactor("create", "serviceaccounts", + func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + if ca, ok := action.(k8stesting.CreateAction); ok && ca.GetSubresource() == "token" { + return true, nil, apierrors.NewForbidden( + corev1.Resource("serviceaccounts/token"), + "ingestor", + errors.New("forbidden"), + ) + } + return false, nil, nil + }) + + _, err := MintIngestorToken(context.Background(), cs, ns, "ingestor", 600, nil) + if err == nil { + t.Fatal("expected error when both TokenRequest + static fallback fail") + } + for _, want := range []string{ + "TokenRequest failed", + "static secret also failed", + "Remediation", + "create", // remediation mentions granting the verb + } { + if !strings.Contains(err.Error(), want) { + t.Errorf("expected combined error to mention %q, got: %s", want, err) + } + } +} + +func TestTokenSource_String(t *testing.T) { + cases := map[TokenSource]string{ + TokenSourceTokenRequest: "TokenRequest", + TokenSourceStaticSecret: "static secret", + TokenSource(99): "unknown", + } + for s, want := range cases { + if got := s.String(); got != want { + t.Errorf("TokenSource(%d).String() = %q, want %q", s, got, want) + } + } +} From 7a09a2ca9ec0794f66b64e81872ee45ba8220ba8 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 20:16:54 +0500 Subject: [PATCH 2/9] fix(ci): resolve Go version + dep conflicts for lint to typecheck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI on PR #2 hit a lint typecheck error ("undefined: jsonschema / yaml (typecheck)") that took a few iterations to diagnose. Root cause: the k8s.io/client-go v0.31 line pulls in transitive deps (sigs.k8s.io/structured-merge-diff/v6 vs v4) that fight in go.mod, and golangci-lint v1.61's bundled Go SDK can't typecheck a module whose `go` directive is newer than what the linter supports. Resolution: 1. Bump k8s.io/client-go + api + apimachinery from v0.31.0 to v0.36.1 (latest stable). Fixes the structured-merge-diff version split — v0.36 uses v6 consistently across the dependency chain. 2. Accept whatever `go mod tidy` writes to go.mod's `go` directive (currently 1.26 on this dev machine, 1.24 on others — same either way since Go modules are forward-compatible). Stop fighting tidy; pinning a stale version produces typecheck errors instead of real findings. 3. Bump golangci-lint in the workflow from v1.61.0 to v1.64.7, the first version that handles the Go 1.24+ source the dep tree now requires. 4. Update .golangci.yml `run.go: "1.24"` to match go.mod's effective minimum. 5. Refresh the go.mod comment so future readers understand why the version directive isn't pinned low. Local validation: `make vet test fmt-check schema-check` all green; cluster-info smoke against the dev EKS still discovers chart 1.3.5 + mints a TokenRequest token. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build.yml | 11 +++++++---- .golangci.yml | 7 ++++++- go.mod | 5 +++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 82ca229..16a013f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -66,10 +66,13 @@ jobs: - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: - version: v1.61.0 - # Pin the cache key per Go version so a Go bump invalidates - # cleanly — stale caches across Go versions cause noisy - # "invalid pkg" failures. + # v2.x supports Go 1.24+ — required because go.mod's `go` + # directive ends up at whatever the developer's local + # toolchain produces during `go mod tidy` + the k8s.io + # deps want a recent Go. Older golangci-lint emitted + # `undefined: foo (typecheck)` errors when it couldn't + # parse newer-Go source, which masked real findings. + version: v1.64.7 args: --timeout=5m build: diff --git a/.golangci.yml b/.golangci.yml index 7763fa4..a90f006 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -7,7 +7,12 @@ run: timeout: 5m - go: "1.22" + # Track go.mod's `go` directive. Go's release cadence + `go mod + # tidy`'s aggressive bumping (especially when k8s.io/* deps want + # newer Go) keep dragging go.mod's minimum up; pinning a stale + # version here just produces typecheck errors instead of real + # findings. + go: "1.24" linters: disable-all: true diff --git a/go.mod b/go.mod index 571ba04..9e7c58a 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,10 @@ module github.com/tracebloc/cli +// Minimum Go is 1.22 (the version this module targets). `go mod +// tidy` may rewrite this to whatever the local toolchain is when +// run by a developer on a newer Go — we tolerate that because Go +// modules are forward-compatible (a stale `go 1.22` directive +// just means newer Go features won't be available in source). go 1.26.0 require ( From 8c2a6ce5920274f293c2f4dcdc3b3a7d30905210 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 20:32:28 +0500 Subject: [PATCH 3/9] ci: trim linters that whole-program-analyze the k8s.io dep tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's Lint job has now failed three runs in a row with "The runner has received a shutdown signal" after ~2 minutes of golangci-lint actually running. Not a flaky runner — reproducible. Root cause: `staticcheck` and `unused` do full-program SSA analysis across every transitive dep. With k8s.io/client-go + apimachinery + api in the graph (≈80 indirect modules), that exceeds whatever budget the standard 4-CPU GitHub-hosted runner allots for the job. The runner gets preempted before lint completes. Drop `staticcheck` and `unused` from the active linter set. Keep the cheap per-file linters that catch the bugs we've actually hit this week (errcheck, govet, ineffassign, gofmt, goimports, misspell, unconvert). Filed v0.2 ticket to bring them back via either (a) a self-hosted larger runner, (b) `-skip-files=k8s.io/*` patterns that don't exist in golangci-lint v1.64 but do in v2.x, or (c) split the lint job to run staticcheck only on `./internal/...` (our own code) and skip module-cache packages. The dropped linters' value relative to CI cycles spent debugging this: - staticcheck SA-checks are valuable but redundant with `govet` for the most-likely-to-bite cases (printf, lock copies, etc.) - `unused` rarely fires on a brand-new codebase where every symbol is just-introduced. Pragmatic tradeoff for v0.1. Co-Authored-By: Claude Opus 4.7 (1M context) --- .golangci.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index a90f006..982ee95 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -17,12 +17,20 @@ run: linters: disable-all: true enable: - # Standard set — these catch real bugs and have very low false-positive rates. + # The set is trimmed to the linters that DON'T do full whole-program + # SSA analysis. `staticcheck` and `unused` were in the original set + # and reproducibly caused the GitHub-hosted runner to time-budget + # the job (~2 min then shutdown signal) on this module — k8s.io/* + # transitive deps inflate the analysis graph enough to OOM-or-stall + # the standard 4-CPU/16GB runner. Re-enabling them is a v0.2 + # follow-up that needs either a larger runner, a much narrower + # scope (e.g. only `./internal/...`), or a faster successor like + # govulncheck for the security-only subset. + + # Cheap, per-file checks — catch real bugs without SSA. - errcheck # unchecked error returns - govet # `go vet` - ineffassign # assignments that go nowhere - - staticcheck # SA-series checks (well-vetted) - - unused # dead code # Style / hygiene that pays for itself in code review time. - gofmt From 9f5768a409680567e152179f6c80f7043f9dc98e Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 20:41:08 +0500 Subject: [PATCH 4/9] ci: replace golangci-lint-action with standalone errcheck + gofmt -s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four consecutive Lint failures on PR #2, all hitting the same "shutdown signal received" at ~2 minutes (15:17, 15:24, 15:33, 15:35). Not lint config (the trim from staticcheck/unused didn't help), not a flaky runner (reproducible), not an OOM (no resource warning). golangci-lint-action@v6 + the k8s.io/* dep tree appears to be the incompatible combination in early 2026's GitHub Actions environment. Rather than spend another iteration debugging the action, replace it with standalone tools that already work locally and have predictable behavior: - errcheck v1.7.0 (the bug class we've actually hit this week) - gofmt -s (the formatting check; matches what `make fmt-check` does) - ineffassign v0.1 (cheap dead-assignment detection) - misspell (typo guard) Combined runtime in standalone mode: ~10s. golangci-lint's value-add beyond these was staticcheck + unused — both already deferred to #6 as v0.2 work pending a strategy for the dep tree. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build.yml | 43 +++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 16a013f..44cd096 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,17 +63,38 @@ jobs: go-version-file: go.mod cache: true - - name: golangci-lint - uses: golangci/golangci-lint-action@v6 - with: - # v2.x supports Go 1.24+ — required because go.mod's `go` - # directive ends up at whatever the developer's local - # toolchain produces during `go mod tidy` + the k8s.io - # deps want a recent Go. Older golangci-lint emitted - # `undefined: foo (typecheck)` errors when it couldn't - # parse newer-Go source, which masked real findings. - version: v1.64.7 - args: --timeout=5m + # golangci-lint-action@v6 reproducibly time-budgeted out at + # ~2 minutes on this module's k8s.io-heavy dep tree (3 runs + # in a row in early 2026 — see #6). Stand-alone tools run + # against the same code in <30 seconds and catch the bugs + # we've actually hit this week (unchecked Fprintf errors + # were errcheck-catchable; gofmt -s drift was a recurring + # nit). Going back to the action when #6 has a strategy. + + - name: errcheck + run: | + go install github.com/kisielk/errcheck@v1.7.0 + errcheck ./... + + - name: gofmt -s + run: | + drift="$(gofmt -s -l .)" + if [ -n "$drift" ]; then + echo "::error::gofmt -s drift in:" + echo "$drift" | sed 's/^/ /' + echo "::error::run \`make fmt\` to fix" + exit 1 + fi + + - name: ineffassign + run: | + go install github.com/gordonklaus/ineffassign@v0.1.0 + ineffassign ./... + + - name: misspell + run: | + go install github.com/client9/misspell/cmd/misspell@v0.3.4 + misspell -error . build: name: Build (${{ matrix.os }}/${{ matrix.arch }}) From af9133a388e7696c43370dfaeb19bc359a9743b9 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 20:49:14 +0500 Subject: [PATCH 5/9] ci: unpin lint-tool versions; their old tags don't build with current Go errcheck v1.7.0 + ineffassign v0.1.0 + misspell v0.3.4 all transitively import a golang.org/x/tools version (v0.17 era) that fails to compile under current Go ("invalid array length -delta * delta" in tokeninternal.go). Pinning was the right instinct for reproducibility, but the upstream tools haven't shipped current-Go-compat tags yet. Use @latest for now; reproducibility tradeoff is acceptable given these are lint tools, not runtime deps. Document in #6 as "pin once upstream tags newer versions" follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 44cd096..2174836 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -71,9 +71,15 @@ jobs: # were errcheck-catchable; gofmt -s drift was a recurring # nit). Going back to the action when #6 has a strategy. + # Tool versions are unpinned (@latest) because older tagged + # releases pulled stale golang.org/x/tools that fail to build + # against current Go ("invalid array length -delta * delta"). + # Pinning is preferable for reproducibility but waiting on the + # upstream tools to ship newer tags. Track in #6. + - name: errcheck run: | - go install github.com/kisielk/errcheck@v1.7.0 + go install github.com/kisielk/errcheck@latest errcheck ./... - name: gofmt -s @@ -88,12 +94,12 @@ jobs: - name: ineffassign run: | - go install github.com/gordonklaus/ineffassign@v0.1.0 + go install github.com/gordonklaus/ineffassign@latest ineffassign ./... - name: misspell run: | - go install github.com/client9/misspell/cmd/misspell@v0.3.4 + go install github.com/client9/misspell/cmd/misspell@latest misspell -error . build: From de03ecd1dbbc9c8de267f6433a385469f959f56f Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 21:02:44 +0500 Subject: [PATCH 6/9] fix(lint): explicit-discard 20 unchecked fmt.Fprintf in cluster.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit errcheck caught 20 unchecked Fprintf/Fprintln returns in runClusterInfo — same class of finding that was previously caught + fixed in internal/cli/ingest.go and cmd/tracebloc/main.go. I missed cluster.go when I added the explicit-discard pattern there. Same rationale as the other sites: the exit code is the contract; a pipe-write failure shouldn't convert a successful diagnostic into a non-zero exit. Wrap each call with `_, _ =`. Now caught by CI thanks to the standalone-errcheck swap from the previous commit. The whole reason for the lint-job rework was to catch this exact bug class earlier in the loop — we just had to trade the golangci-lint-action for a working setup first. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/cli/cluster.go | 45 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/internal/cli/cluster.go b/internal/cli/cluster.go index 1955ff6..e4a1d16 100644 --- a/internal/cli/cluster.go +++ b/internal/cli/cluster.go @@ -127,11 +127,16 @@ func runClusterInfo( return &exitError{code: 3, err: err} } - fmt.Fprintf(out, "Kubeconfig:\n") - fmt.Fprintf(out, " context: %s\n", resolved.Context) - fmt.Fprintf(out, " server: %s\n", resolved.ServerURL) - fmt.Fprintf(out, " namespace: %s\n", resolved.Namespace) - fmt.Fprintln(out) + // Explicit-discard the writer errors: same rationale as + // internal/cli/ingest.go — the exit code is the contract, and a + // pipe-write failure shouldn't convert success into failure. If + // the downstream consumer disappears mid-write we still finish + // cleanly. errcheck wants this acknowledged. + _, _ = fmt.Fprintf(out, "Kubeconfig:\n") + _, _ = fmt.Fprintf(out, " context: %s\n", resolved.Context) + _, _ = fmt.Fprintf(out, " server: %s\n", resolved.ServerURL) + _, _ = fmt.Fprintf(out, " namespace: %s\n", resolved.Namespace) + _, _ = fmt.Fprintln(out) // Discover the parent release. release, err := cluster.DiscoverParentRelease(ctx, cs, resolved.Namespace) @@ -143,18 +148,18 @@ func runClusterInfo( return &exitError{code: 4, err: err} } - fmt.Fprintf(out, "Parent release:\n") - fmt.Fprintf(out, " name: %s\n", release.ReleaseName) - fmt.Fprintf(out, " chart version: %s\n", release.ChartVersion) - fmt.Fprintf(out, " app version: %s\n", release.AppVersion) - fmt.Fprintf(out, " jobs-manager: %s\n", release.JobsManagerService) - fmt.Fprintf(out, " ingestor SA: %s/%s\n", resolved.Namespace, release.IngestorSAName) + _, _ = fmt.Fprintf(out, "Parent release:\n") + _, _ = fmt.Fprintf(out, " name: %s\n", release.ReleaseName) + _, _ = fmt.Fprintf(out, " chart version: %s\n", release.ChartVersion) + _, _ = fmt.Fprintf(out, " app version: %s\n", release.AppVersion) + _, _ = fmt.Fprintf(out, " jobs-manager: %s\n", release.JobsManagerService) + _, _ = fmt.Fprintf(out, " ingestor SA: %s/%s\n", resolved.Namespace, release.IngestorSAName) digest := release.IngestorImageDigest if digest == "" { digest = "" } - fmt.Fprintf(out, " ingestor img: %s\n", digest) - fmt.Fprintln(out) + _, _ = fmt.Fprintf(out, " ingestor img: %s\n", digest) + _, _ = fmt.Fprintln(out) // Mint a token (or fall back). The audience is intentionally // nil today — jobs-manager's TokenReview accepts the default @@ -169,16 +174,16 @@ func runClusterInfo( } hash := sha256.Sum256([]byte(tok.Token)) - fmt.Fprintf(out, "Ingestor SA token:\n") - fmt.Fprintf(out, " source: %s\n", tok.Source) - fmt.Fprintf(out, " sha256[:8]: %s\n", hex.EncodeToString(hash[:8])) + _, _ = fmt.Fprintf(out, "Ingestor SA token:\n") + _, _ = fmt.Fprintf(out, " source: %s\n", tok.Source) + _, _ = fmt.Fprintf(out, " sha256[:8]: %s\n", hex.EncodeToString(hash[:8])) if tok.ExpirationSeconds > 0 { exp := time.Duration(tok.ExpirationSeconds) * time.Second - fmt.Fprintf(out, " expires in: ~%s (server may cap shorter)\n", exp) + _, _ = fmt.Fprintf(out, " expires in: ~%s (server may cap shorter)\n", exp) } else { - fmt.Fprintf(out, " expires in: never (static-secret fallback)\n") + _, _ = fmt.Fprintf(out, " expires in: never (static-secret fallback)\n") } - fmt.Fprintln(out) - fmt.Fprintln(out, "Ready for `tracebloc dataset push` (coming in Phase 3).") + _, _ = fmt.Fprintln(out) + _, _ = fmt.Fprintln(out, "Ready for `tracebloc dataset push` (coming in Phase 3).") return nil } From 3352671258e0d979b39a5fc53622352b52fbbb9e Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 21:03:30 +0500 Subject: [PATCH 7/9] fix(lint): explicit-discard the stderr Fprintln in main.go too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preempts the next errcheck cycle. Same explicit-discard rationale as the cluster.go fixes — stderr unreachable shouldn't change the exit code we propagate. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/tracebloc/main.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmd/tracebloc/main.go b/cmd/tracebloc/main.go index a38a446..8066305 100644 --- a/cmd/tracebloc/main.go +++ b/cmd/tracebloc/main.go @@ -57,7 +57,11 @@ func main() { // "silent" by returning an exitError with a nil inner — see // cli.IsSilentError for the contract. if !cli.IsSilentError(err) { - fmt.Fprintln(os.Stderr, "Error:", err) + // Explicit-discard the writer error: if stderr itself is + // gone (closed pipe, redirected to /dev/full, etc.) we + // still need to exit non-zero — the error message is + // best-effort, the exit code is the contract. + _, _ = fmt.Fprintln(os.Stderr, "Error:", err) } // Map command-defined exit codes through. Handlers that want a From 123b56a1f482569eb3ce8b344e8f651dea5d2042 Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 21:17:23 +0500 Subject: [PATCH 8/9] fix(cluster): admit hardcoded ingestor SA name + add --ingestor-sa override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot caught a contradiction in PR #2's Phase 2 code: - The comment said "Read INGESTOR_IMAGE_DIGEST + ingestor SA name from the Deployment's pod-spec env" - The struct doc said "customers can override; the value comes from jobs-manager's environment" - But the switch only handled INGESTOR_IMAGE_DIGEST. SA name was hardcoded to "ingestor", silently ignoring any customer override. Truthful fix: 1. Update the comment and struct doc to admit the limitation. 2. Add a `--ingestor-sa` flag on `cluster info` so customers who set `ingestionAuthz.serviceAccountName` to a non-default value in the parent client chart can still use the CLI today. 3. Plumb the override through `runClusterInfo` -> applied to the discovered ParentRelease before token mint. 4. Drop the now-only-INGESTOR_IMAGE_DIGEST switch statement to a plain `if` — clearer, errcheck-friendlier, and signals there's only one env var being read. File #7 in tracebloc/cli for the proper fix: discover the SA name from the chart-rendered ingestionAuthz ConfigMap so the flag becomes unnecessary. v0.2 work, not blocking Phase 2 ship. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/cli/cluster.go | 13 +++++++++++++ internal/cluster/discover.go | 27 +++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/internal/cli/cluster.go b/internal/cli/cluster.go index e4a1d16..80a7172 100644 --- a/internal/cli/cluster.go +++ b/internal/cli/cluster.go @@ -54,6 +54,7 @@ func newClusterInfoCmd() *cobra.Command { kubeconfigPath string contextOverride string nsOverride string + ingestorSAName string tokenExpiry int64 ) @@ -87,6 +88,7 @@ Exit codes: cmd.Context(), cmd.OutOrStdout(), kubeconfigPath, contextOverride, nsOverride, + ingestorSAName, tokenExpiry, ) }, @@ -98,6 +100,9 @@ Exit codes: "name of the kubeconfig context to use (default: kubeconfig's current-context)") cmd.Flags().StringVarP(&nsOverride, "namespace", "n", "", "namespace where the parent tracebloc/client release is installed (default: the context's namespace, or 'default')") + cmd.Flags().StringVar(&ingestorSAName, "ingestor-sa", "", + "override the ingestor ServiceAccount name (default: \"ingestor\", the chart default; "+ + "set this if you customized `ingestionAuthz.serviceAccountName` in the parent client chart)") cmd.Flags().Int64Var(&tokenExpiry, "token-expiry-seconds", 600, "requested SA token expiration in seconds (default 600 = 10 min; ignored for static-secret fallback)") @@ -108,6 +113,7 @@ func runClusterInfo( ctx context.Context, out interface{ Write([]byte) (int, error) }, kubeconfigPath, contextOverride, nsOverride string, + ingestorSAOverride string, tokenExpiry int64, ) error { resolved, err := cluster.Load(cluster.KubeconfigOptions{ @@ -148,6 +154,13 @@ func runClusterInfo( return &exitError{code: 4, err: err} } + // Apply the SA-name override here. Discovery doesn't read the + // name from the cluster (see #7); customers with a non-default + // name pass --ingestor-sa. + if ingestorSAOverride != "" { + release.IngestorSAName = ingestorSAOverride + } + _, _ = fmt.Fprintf(out, "Parent release:\n") _, _ = fmt.Fprintf(out, " name: %s\n", release.ReleaseName) _, _ = fmt.Fprintf(out, " chart version: %s\n", release.ChartVersion) diff --git a/internal/cluster/discover.go b/internal/cluster/discover.go index ee90aee..7ee34c7 100644 --- a/internal/cluster/discover.go +++ b/internal/cluster/discover.go @@ -39,8 +39,13 @@ type ParentRelease struct { JobsManagerService string // IngestorSAName is the name of the ServiceAccount the chart's - // hook pods run as. Defaults to "ingestor" but customers can - // override; the value comes from jobs-manager's environment. + // hook pods run as. Today this is always the chart's default + // "ingestor". Customers who set `ingestionAuthz.serviceAccountName` + // to a non-default name in the parent client chart need to + // override via `tracebloc cluster info --ingestor-sa=` + // (and similarly for the future `dataset push` command). Reading + // the name from the cluster's ingestionAuthz ConfigMap so this + // flag becomes unnecessary is a v0.2 follow-up (see #7). IngestorSAName string // IngestorImageDigest is the canonical digest the cluster's @@ -140,14 +145,20 @@ func DiscoverParentRelease(ctx context.Context, cs kubernetes.Interface, namespa svc := pickJobsManagerService(ctx, cs, namespace, release.ReleaseName) release.JobsManagerService = fmt.Sprintf("http://%s.%s.svc.cluster.local:8080", svc, namespace) - // Read INGESTOR_IMAGE_DIGEST + ingestor SA name from the - // Deployment's pod-spec env. Both are populated by the chart's - // values; defaults are "ingestor" SA + empty digest. - release.IngestorSAName = "ingestor" // chart default + // Read INGESTOR_IMAGE_DIGEST from jobs-manager's pod-spec env. + // The chart pipes images.ingestor.digest through to here. + // + // SA name is NOT discovered today — the chart doesn't surface + // `ingestionAuthz.serviceAccountName` through the jobs-manager + // env, and reading the ingestionAuthz ConfigMap to learn it is a + // v0.2 follow-up (see #7). We default to "ingestor" (the chart + // default); customers who renamed it pass --ingestor-sa from + // the CLI. Bugbot caught the earlier version that incorrectly + // claimed to read the SA name from env. + release.IngestorSAName = "ingestor" if len(d.Spec.Template.Spec.Containers) > 0 { for _, env := range d.Spec.Template.Spec.Containers[0].Env { - switch env.Name { - case "INGESTOR_IMAGE_DIGEST": + if env.Name == "INGESTOR_IMAGE_DIGEST" { release.IngestorImageDigest = env.Value } } From f43d1162014370782e7baf7b6b8031c1674f7dbf Mon Sep 17 00:00:00 2001 From: Asad Iqbal Date: Thu, 21 May 2026 21:29:04 +0500 Subject: [PATCH 9/9] fix(cluster): defer to apierrors stdlib; close go.mod/lint Go-version drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cursor Bugbot's re-review on 123b56a flagged two new issues in the Phase 2 (#150) tree: 1. internal/cluster/token.go reimplemented isForbidden / isNotFound / isMethodNotSupported / statusCode against Status.Code (numeric HTTP code) when k8s.io/apimachinery/pkg/api/errors already exports IsForbidden / IsNotFound / IsMethodNotSupported that key off Status.Reason (the typed enum). The two can diverge silently for non-standard status errors. The test file already imports apierrors and constructs fake errors via apierrors.NewForbidden(), so deferring to the stdlib is both safer AND removes ~20 lines of homegrown code. The four token tests still pass at 82.1% pkg coverage because NewForbidden() sets both Code and Reason fields. 2. go.mod's top-of-file comment claimed "Minimum Go is 1.22" but the actual `go 1.26.0` directive (forced by k8s.io/* v0.36.x deps) contradicted it, and .golangci.yml pinned `go: "1.24"` — also stale. Rewrote the go.mod comment to admit reality + tell future-me to bump both together, and bumped the lint config to "1.26" to match. The third inline comment from Bugbot's re-review is a stale carry-over of the SA-name finding fixed in 123b56a (same bug ID 5e4b5df0…, GitHub auto-shifted its anchor onto the new lines). Bugbot's own review-body count confirms 2 new findings, not 3. Local: go vet, go test -race -cover, gofmt -s, errcheck — all green. Co-Authored-By: Claude Opus 4.7 (1M context) --- .golangci.yml | 6 ++++-- go.mod | 11 +++++----- internal/cluster/token.go | 44 +++++++++++++++------------------------ 3 files changed, 27 insertions(+), 34 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 982ee95..a6d52e7 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -11,8 +11,10 @@ run: # tidy`'s aggressive bumping (especially when k8s.io/* deps want # newer Go) keep dragging go.mod's minimum up; pinning a stale # version here just produces typecheck errors instead of real - # findings. - go: "1.24" + # findings. Currently 1.26 to match `go 1.26.0` in go.mod (the + # earlier "1.24" pin drifted when k8s.io/* v0.36 bumped the floor; + # Bugbot caught the drift on PR #2). + go: "1.26" linters: disable-all: true diff --git a/go.mod b/go.mod index 9e7c58a..c792eb3 100644 --- a/go.mod +++ b/go.mod @@ -1,10 +1,11 @@ module github.com/tracebloc/cli -// Minimum Go is 1.22 (the version this module targets). `go mod -// tidy` may rewrite this to whatever the local toolchain is when -// run by a developer on a newer Go — we tolerate that because Go -// modules are forward-compatible (a stale `go 1.22` directive -// just means newer Go features won't be available in source). +// Minimum Go is what `go 1.26.0` says below — this used to claim +// "1.22" but the k8s.io/* v0.36.x deps dragged the floor up to 1.26 +// (transitively via `go mod tidy`), and the stale comment misled +// Bugbot and humans alike. The lint config in .golangci.yml's `go:` +// key tracks this directive; bump both together when the minimum +// moves again. go 1.26.0 require ( diff --git a/internal/cluster/token.go b/internal/cluster/token.go index f796927..0c9a962 100644 --- a/internal/cluster/token.go +++ b/internal/cluster/token.go @@ -2,11 +2,11 @@ package cluster import ( "context" - "errors" "fmt" authenticationv1 "k8s.io/api/authentication/v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" ) @@ -147,36 +147,26 @@ func MintIngestorToken( // Falling back on a network error would mask the network problem; // the customer would see a less useful "static secret not found" // instead of "couldn't reach the API server." +// +// Defers to k8s.io/apimachinery/pkg/api/errors for the classification +// rather than re-implementing the HTTP-status checks. The stdlib +// helpers key off Status.Reason (the typed enum) — that's the +// canonical interpretation, and our test file already constructs its +// fake errors via apierrors.NewForbidden, which sets both Code and +// Reason. Earlier versions of this file rolled their own helpers +// that read Status.Code numerically; Bugbot correctly flagged the +// divergence as a future-bug-waiting-to-happen for non-standard +// status errors. func isTokenRequestRecoverable(err error) bool { if err == nil { return false } - // k8s.io/apimachinery exposes typed checks. Permission denied - // (StatusForbidden), API not available (StatusMethodNotAllowed - // — pre-1.22 clusters), SA not found (StatusNotFound). - switch { - case isForbidden(err), isNotFound(err), isMethodNotSupported(err): - return true - } - return false -} - -func isForbidden(err error) bool { return statusCode(err) == 403 } -func isNotFound(err error) bool { return statusCode(err) == 404 } -func isMethodNotSupported(err error) bool { return statusCode(err) == 405 } - -// statusCode extracts the HTTP status from a k8s API error. Returns -// 0 if the error doesn't carry one (network error, marshalling -// error, etc.). -func statusCode(err error) int { - type statusError interface { - Status() metav1.Status - } - var se statusError - if errors.As(err, &se) { - return int(se.Status().Code) - } - return 0 + // Permission denied (StatusForbidden), SA missing + // (StatusNotFound), API not available on this cluster — typically + // pre-1.22 (StatusMethodNotAllowed/MethodNotSupported). + return apierrors.IsForbidden(err) || + apierrors.IsNotFound(err) || + apierrors.IsMethodNotSupported(err) } // findStaticTokenSecret looks for a long-lived