diff --git a/client/Chart.yaml b/client/Chart.yaml
index 052c031..e93a3f5 100644
--- a/client/Chart.yaml
+++ b/client/Chart.yaml
@@ -2,8 +2,8 @@ apiVersion: v2
 name: client
 description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift
 type: application
-version: 1.4.5
-appVersion: "1.4.5"
+version: 1.5.0
+appVersion: "1.5.0"
 keywords:
   - tracebloc
   - kubernetes
diff --git a/client/templates/jobs-manager-deployment.yaml b/client/templates/jobs-manager-deployment.yaml
index 95dea16..9e67c3f 100644
--- a/client/templates/jobs-manager-deployment.yaml
+++ b/client/templates/jobs-manager-deployment.yaml
@@ -76,6 +76,21 @@ spec:
                 secretKeyRef:
                   name: {{ include "tracebloc.secretName" . }}
                   key: CLIENT_PASSWORD
+            # client-runtime#79: shared HMAC secret used to MINT the per-pod
+            # REQUESTS_PROXY_TOKEN. Same Secret key the requests-proxy reads to
+            # verify. When present, spin_job mints stateless signed tokens (no
+            # pod_tokens table). Never injected into training pods.
+            - name: POD_TOKEN_SIGNING_SECRET
+              valueFrom:
+                secretKeyRef:
+                  name: {{ include "tracebloc.secretName" . }}
+                  key: POD_TOKEN_SIGNING_SECRET
+            # Backstop lifetime (seconds) for a minted token. `int64` forces
+            # integer formatting: numbers loaded from a values file may reach the
+            # template as floats, which would print as e.g. "2.592e+06" once
+            # they have seven or more digits (30 days = 2592000).
+            - name: POD_TOKEN_TTL_SECONDS
+              value: {{ .Values.podTokenTtlSeconds | default 604800 | int64 | quote }}
             - name: CLIENT_PVC
               value: {{ include "tracebloc.clientDataPvc" . | quote }}
             - name: CLIENT_LOGS_PVC
diff --git a/client/templates/requests-proxy-deployment.yaml b/client/templates/requests-proxy-deployment.yaml
index cde610c..8750d4b 100644
--- a/client/templates/requests-proxy-deployment.yaml
+++ b/client/templates/requests-proxy-deployment.yaml
@@ -77,6 +77,15 @@ spec:
               value: "experiments"
             - name: FLOPS_QUEUE_NAME
               value: "flops"
+            # client-runtime#79: shared HMAC secret used to VERIFY the per-pod
+            # bearer token statelessly (signature + expiry), the same key the
+            # jobs-manager uses to mint it. When present, the proxy skips the
+            # pod_tokens table entirely.
+            - name: POD_TOKEN_SIGNING_SECRET
+              valueFrom:
+                secretKeyRef:
+                  name: {{ include "tracebloc.secretName" . }}
+                  key: POD_TOKEN_SIGNING_SECRET
       {{- if include "tracebloc.useImagePullSecrets" . }}
       imagePullSecrets:
         - name: {{ include "tracebloc.registrySecretName" . }}
diff --git a/client/templates/secrets.yaml b/client/templates/secrets.yaml
index 65242b4..1cd1713 100644
--- a/client/templates/secrets.yaml
+++ b/client/templates/secrets.yaml
@@ -2,6 +2,34 @@
 {{- $clientPassword := required "clientPassword is required (set via --set clientPassword=... or values file)" .Values.clientPassword -}}
 {{- if regexMatch "^<.*>$" $clientId -}}{{- fail "clientId looks like a placeholder (e.g. \"\"); set a real value" -}}{{- end -}}
 {{- if regexMatch "^<.*>$" $clientPassword -}}{{- fail "clientPassword looks like a placeholder (e.g. \"\"); set a real value" -}}{{- end -}}
+{{- /*
+  Pod-proxy token signing secret (client-runtime#79). Stability is critical:
+  a token minted before an upgrade must still verify after, so we must NOT
+  regenerate the secret on every `helm upgrade`. Resolution order:
+    1. explicit .Values.podTokenSigningSecret (operator pin / rotation), else
+    2. a non-empty value already stored in the live Secret (preserve across
+       upgrades), else
+    3. a freshly generated random secret (first install).
+  `lookup` returns an empty dict whenever Helm renders without cluster access
+  (`helm template`, client-side dry-run), so step 2 is skipped and a NEW secret
+  is generated on every such render. That is harmless for previews, but any
+  workflow that applies `helm template` output must pin
+  .Values.podTokenSigningSecret, or live tokens stop verifying on each apply.
+  The `get ... | default dict` chain keeps the lookup nil-safe without relying
+  on `and` short-circuiting, which Helm builds on Go < 1.18 do not provide.
+*/ -}}
+{{- $secretName := include "tracebloc.secretName" . -}}
+{{- $existingSecret := lookup "v1" "Secret" .Release.Namespace $secretName | default dict -}}
+{{- $existingData := get $existingSecret "data" | default dict -}}
+{{- $storedSecret := get $existingData "POD_TOKEN_SIGNING_SECRET" | default "" | b64dec -}}
+{{- $podTokenSecret := "" -}}
+{{- if .Values.podTokenSigningSecret -}}
+{{- $podTokenSecret = .Values.podTokenSigningSecret -}}
+{{- else if $storedSecret -}}
+{{- $podTokenSecret = $storedSecret -}}
+{{- else -}}
+{{- $podTokenSecret = randAlphaNum 48 -}}
+{{- end -}}
 apiVersion: v1
 kind: Secret
 metadata:
@@ -13,6 +41,7 @@ type: Opaque
 data:
   CLIENT_ID: {{ $clientId | b64enc | quote }}
   CLIENT_PASSWORD: {{ $clientPassword | b64enc | quote }}
+  POD_TOKEN_SIGNING_SECRET: {{ $podTokenSecret | b64enc | quote }}
 {{- if and (ne .Values.resourceMonitor false) (ne .Values.nodeAgents.namespace.name .Release.Namespace) }}
 ---
 # Mirrored into the node-agents namespace so the resource-monitor DaemonSet
diff --git a/client/values.yaml b/client/values.yaml
index 5dbccd1..4f5aaea 100644
--- a/client/values.yaml
+++ b/client/values.yaml
@@ -349,6 +349,30 @@ podDisruptionBudget:
 clientId: ""
 clientPassword: ""
 
+# -- Pod-proxy token signing (client-runtime#79)
+# Shared HMAC secret used by the jobs-manager to MINT and the requests-proxy to
+# VERIFY the per-pod REQUESTS_PROXY_TOKEN. When set (non-empty), the proxy
+# validates tokens statelessly (signature + expiry) and the legacy pod_tokens
+# table is not used — eliminating the token-revoke-while-live race class.
+#
+# Leave empty to AUTO-GENERATE a stable secret: the chart generates one on
+# first install and reuses it on every subsequent upgrade (via `lookup`), so
+# tokens minted before an upgrade still verify after. Set explicitly only to
+# pin a known value or to rotate (rotating invalidates all live tokens at once,
+# forcing pods to be respawned). Never injected into training pods.
+#
+# NOTE: auto-generation depends on `lookup`, which needs cluster access. If you
+# render with `helm template` and apply the output yourself, pin this value
+# explicitly; otherwise a new secret is generated on every render.
+podTokenSigningSecret: ""
+
+# -- Backstop lifetime (seconds) for a signed pod-proxy token. The real bound
+# on a token's usefulness is the pod lifecycle (the pod is deleted on stop);
+# this just caps a leaked token. Set to comfortably exceed the longest expected
+# training-job duration, or a long-running job will 401 mid-run when its token
+# expires. Default 7 days.
+podTokenTtlSeconds: 604800
+
 # -- Docker registry credentials (optional; only used when dockerRegistry is set and create is true)
 # Omit dockerRegistry entirely, or set create: false, for public images (no imagePullSecrets).
 # When create is true, secret name is {{ .Release.Name }}-regcred.