Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions client/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ apiVersion: v2
name: client
description: A unified Helm chart for tracebloc on AKS, EKS, bare-metal, and OpenShift
type: application
version: 1.4.5
appVersion: "1.4.5"
version: 1.5.0
appVersion: "1.5.0"
keywords:
- tracebloc
- kubernetes
Expand Down
11 changes: 11 additions & 0 deletions client/templates/jobs-manager-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,17 @@ spec:
secretKeyRef:
name: {{ include "tracebloc.secretName" . }}
key: CLIENT_PASSWORD
# client-runtime#79: shared HMAC secret used to MINT the per-pod
# REQUESTS_PROXY_TOKEN. Same Secret key the requests-proxy reads to
# verify. When present, spin_job mints stateless signed tokens (no
# pod_tokens table). Never injected into training pods.
# Read from the Secret named by the tracebloc.secretName helper. The
# reference is not marked `optional`, so the container cannot start if
# that Secret or this key is missing.
- name: POD_TOKEN_SIGNING_SECRET
valueFrom:
secretKeyRef:
name: {{ include "tracebloc.secretName" . }}
key: POD_TOKEN_SIGNING_SECRET
# Token lifetime in seconds, passed to the container as a string.
# Cast through int64 before quoting: numbers read from values.yaml are
# decoded as floats, and quoting a float >= 1000000 directly renders
# scientific notation (e.g. 1209600 -> "1.2096e+06"), which is not a valid
# integer string. Unset or 0 falls back to 604800 (7 days).
- name: POD_TOKEN_TTL_SECONDS
  value: {{ .Values.podTokenTtlSeconds | default 604800 | int64 | quote }}
- name: CLIENT_PVC
value: {{ include "tracebloc.clientDataPvc" . | quote }}
- name: CLIENT_LOGS_PVC
Expand Down
9 changes: 9 additions & 0 deletions client/templates/requests-proxy-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@ spec:
value: "experiments"
- name: FLOPS_QUEUE_NAME
value: "flops"
# client-runtime#79: shared HMAC secret used to VERIFY the per-pod
# bearer token statelessly (signature + expiry), the same key the
# jobs-manager uses to mint it. When present, the proxy skips the
# pod_tokens table entirely.
# Read from the Secret named by the tracebloc.secretName helper. The
# reference is not marked `optional`, so the container cannot start if
# that Secret or this key is missing.
- name: POD_TOKEN_SIGNING_SECRET
valueFrom:
secretKeyRef:
name: {{ include "tracebloc.secretName" . }}
key: POD_TOKEN_SIGNING_SECRET
{{- if include "tracebloc.useImagePullSecrets" . }}
imagePullSecrets:
- name: {{ include "tracebloc.registrySecretName" . }}
Expand Down
22 changes: 22 additions & 0 deletions client/templates/secrets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,27 @@
{{- $clientPassword := required "clientPassword is required (set via --set clientPassword=... or values file)" .Values.clientPassword -}}
{{- if regexMatch "^<.*>$" $clientId -}}{{- fail "clientId looks like a placeholder (e.g. \"<CLIENT_ID>\"); set a real value" -}}{{- end -}}
{{- if regexMatch "^<.*>$" $clientPassword -}}{{- fail "clientPassword looks like a placeholder (e.g. \"<CLIENT_PASSWORD>\"); set a real value" -}}{{- end -}}
{{- /*
Pod-proxy token signing secret (client-runtime#79).

Stability is critical: a token minted before an upgrade must still verify
after it, so the secret must NOT be regenerated on every `helm upgrade`.
Resolution order (the first NON-EMPTY value wins):
  1. explicit .Values.podTokenSigningSecret (operator pin / rotation), else
  2. the value already stored in the live Secret (preserve across upgrades), else
  3. a freshly generated 48-character random secret (first install).

An empty value is treated as absent at every step, so a live Secret that
carries the key with an empty value falls through to generation instead of
pinning an empty signing secret on every later upgrade.

`lookup` returns an empty dict during `helm template`/dry-run; that's fine —
resolution falls through to generate, and the rendered value is only consumed
at a real install/upgrade.

Result: $podTokenSecret holds the plain-text (not base64) secret.
*/ -}}
{{- $secretName := include "tracebloc.secretName" . -}}
{{- $existingSecret := (lookup "v1" "Secret" .Release.Namespace $secretName) -}}
{{- /* toString: the value is later piped to b64enc, which needs a string even if the pin was written as an unquoted number. */ -}}
{{- $podTokenSecret := .Values.podTokenSigningSecret | default "" | toString -}}
{{- if and (not $podTokenSecret) $existingSecret $existingSecret.data -}}
  {{- /* Stored value is decoded with b64dec; a missing key yields "" and falls through to generation. */ -}}
  {{- $podTokenSecret = index $existingSecret.data "POD_TOKEN_SIGNING_SECRET" | default "" | b64dec -}}
{{- end -}}
{{- if not $podTokenSecret -}}
  {{- $podTokenSecret = randAlphaNum 48 -}}
{{- end -}}
apiVersion: v1
kind: Secret
metadata:
Expand All @@ -13,6 +34,7 @@ type: Opaque
data:
CLIENT_ID: {{ $clientId | b64enc | quote }}
CLIENT_PASSWORD: {{ $clientPassword | b64enc | quote }}
POD_TOKEN_SIGNING_SECRET: {{ $podTokenSecret | b64enc | quote }}
{{- if and (ne .Values.resourceMonitor false) (ne .Values.nodeAgents.namespace.name .Release.Namespace) }}
---
# Mirrored into the node-agents namespace so the resource-monitor DaemonSet
Expand Down
20 changes: 20 additions & 0 deletions client/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,26 @@ podDisruptionBudget:
clientId: ""
clientPassword: ""

# -- Pod-proxy token signing (client-runtime#79)
# Shared HMAC secret used by the jobs-manager to MINT and the requests-proxy to
# VERIFY the per-pod REQUESTS_PROXY_TOKEN. When set (non-empty), the proxy
# validates tokens statelessly (signature + expiry) and the legacy pod_tokens
# table is not used — eliminating the token-revoke-while-live race class.
#
# Leave empty to AUTO-GENERATE a stable secret: the chart generates one on
# first install and reuses it on every subsequent upgrade (via `lookup`), so
# tokens minted before an upgrade still verify after. Set explicitly only to
# pin a known value or to rotate (rotating invalidates all live tokens at once,
# forcing pods to be respawned). Never injected into training pods.
#
# NOTE: reuse depends on `lookup` reaching the cluster. Under `helm template`
# or a client-side dry-run, `lookup` returns nothing, so every render produces
# a new random value. If you deploy by applying rendered manifests instead of
# running `helm install`/`helm upgrade`, set this explicitly.
podTokenSigningSecret: ""

# -- Backstop lifetime (seconds) for a signed pod-proxy token. The real bound
# on a token's usefulness is the pod lifecycle (the pod is deleted on stop);
# this just caps a leaked token. Set to comfortably exceed the longest expected
# training-job duration, or a long-running job will 401 mid-run when its token
# expires. Default 7 days.
# Passed to the jobs-manager as the POD_TOKEN_TTL_SECONDS env var. An unset or
# 0 value falls back to 604800 in the template; it does not disable expiry.
podTokenTtlSeconds: 604800

# -- Docker registry credentials (optional; only used when dockerRegistry is set and create is true)
# Omit dockerRegistry entirely, or set create: false, for public images (no imagePullSecrets).
# When create is true, secret name is {{ .Release.Name }}-regcred.
Expand Down
Loading