Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion client/tests/mysql_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,18 @@ tests:
path: spec.template.spec.containers[0].livenessProbe.failureThreshold
value: 5

- it: should reference the data-plane PriorityClass when enabled
- it: should NOT set priorityClassName by default (PriorityClass dropped)
template: templates/mysql-deployment.yaml
asserts:
- notExists:
path: spec.template.spec.priorityClassName

- it: should reference the PriorityClass when a name is set
template: templates/mysql-deployment.yaml
set:
priorityClass:
create: true
name: tracebloc-data-plane
asserts:
- equal:
path: spec.template.spec.priorityClassName
Expand Down
12 changes: 11 additions & 1 deletion client/tests/priority_class_pdb_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,18 @@ set:
clientId: "test-id"
clientPassword: "test"
tests:
- it: should create the data-plane PriorityClass by default
- it: should NOT render the PriorityClass by default (dropped)
template: templates/priority-class.yaml
asserts:
- hasDocuments:
count: 0

- it: should render the PriorityClass when explicitly enabled
template: templates/priority-class.yaml
set:
priorityClass:
create: true
name: tracebloc-data-plane
asserts:
- isKind:
of: PriorityClass
Expand Down
29 changes: 15 additions & 14 deletions client/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -313,22 +313,23 @@ resources:
memory: "512Mi"

# -- PriorityClass for the data-plane (mysql).
# Cluster-scoped resource. Created with helm.sh/resource-policy: keep so a
# release uninstall does not yank it from sibling releases that share it.
# Value 1,000,000 sits well above default (0) so the scheduler will preempt
# noisy training jobs to keep mysql scheduled, but well below
# system-cluster-critical (2,000,000,000).
# DROPPED BY DEFAULT (create: false, name: ""). mysql doesn't need it: its
# memory requests==limits (so it's among the last evicted under memory
# pressure), its data lives on a PVC (eviction = a transient restart, never
# data loss), and a PodDisruptionBudget guards voluntary disruptions. Dropping
# this cluster-scoped, fixed-name object also lets multiple tracebloc
# namespaces coexist in one cluster (BYO) and removes the one-client-per-
# cluster install collision.
#
# create: false — chart does not template the PriorityClass; you (or your
# GitOps tool / shared platform) manage it out-of-band.
# The mysql pod still references `name`, so make sure the
# PriorityClass exists at install time.
# name: "" — disable the priorityClassName reference entirely. mysql
# falls back to the cluster default priority (0) and loses
# the OOM-protection this chart's mysql tuning relies on.
# Opt back in ONLY on a heavily-contended cluster where you want the scheduler
# to preempt noisy training jobs to keep mysql scheduled:
# create: true, name: <some-name>, value: ~1000000 (above default 0, below
# system-cluster-critical 2,000,000,000).
# For a PriorityClass your platform manages out-of-band, use create: false +
# name: <existing-name> (mysql references it without the chart templating it).
priorityClass:
create: true
name: tracebloc-data-plane
create: false
name: ""
value: 1000000

# -- PodDisruptionBudgets.
Expand Down
78 changes: 61 additions & 17 deletions scripts/install-k8s.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,15 @@ function ConvertTo-WorkspaceName {
return $sanitized
}

# Best-effort chart version of the installed client release (e.g. "1.4.4").
#
# Parameters:
#   -Namespace  Namespace whose Helm releases are listed (default "tracebloc").
# Returns:
#   The version string, or "" when helm is not available, the helm call
#   fails (e.g. cluster unreachable), or no "client-<ver>" entry is listed.
#
# Never throws: the result is only used for display and for the diagnose
# bundle, so a failed lookup must degrade to "" rather than abort the caller.
# Scans helm's table output for "client-<ver>" (the CHART column), so no JSON
# parsing is needed.
function Get-ChartVersion {
    param([string]$Namespace = "tracebloc")

    # Without helm on PATH there is nothing to query.
    if (-not (Get-Command helm -ErrorAction SilentlyContinue)) { return "" }

    try {
        $out = (helm list -n $Namespace 2>$null) | Out-String
    } catch {
        # The helm call surfaced a terminating error (possible when
        # $ErrorActionPreference is 'Stop'); treat it as "version unknown".
        return ""
    }

    # $Matches is only read after a successful -match, so a stale value from
    # an earlier match can never leak out.
    if ($out -match 'client-([0-9][^\s]*)') { return $Matches[1] }
    return ""
}

# =============================================================================
# CONFIGURATION
# =============================================================================
Expand Down Expand Up @@ -1006,7 +1015,6 @@ function Install-ClientHelm {
}
$valuesFile = Join-Path $HOST_DATA_DIR "values.yaml"

$defaultNamespace = "default"
$defaultClientId = ""
$defaultClientPassword = ""

Expand All @@ -1027,21 +1035,15 @@ function Install-ClientHelm {
}
}

# -- Workspace name prompt --
PromptHeader "Choose a workspace name"
Hint "This identifies your tracebloc client on this machine."
Write-Host ""
Hint "Examples: myteam, vision-lab, lukas"
Write-Host ""
$nsInput = Read-Host " Workspace name [$defaultNamespace]"
$rawName = if ($nsInput) { $nsInput } else { $defaultNamespace }
$TB_NAMESPACE = ConvertTo-WorkspaceName -Input_ $rawName
# -- Namespace (fixed; not prompted) --
# The on-prem client is one-per-machine and is identified to the backend by
# its credentials (clientId), not by this name -- so we don't ask the user to
# invent one. It's just the local k8s namespace / Helm release name.
# Advanced / GitOps setups can override with TB_NAMESPACE=<name>.
$rawNs = if ($env:TB_NAMESPACE) { $env:TB_NAMESPACE } else { "tracebloc" }
$TB_NAMESPACE = ConvertTo-WorkspaceName -Input_ $rawNs
$script:TB_NAMESPACE = $TB_NAMESPACE # share with Wait-ForClientReady / Print-Summary

if ($TB_NAMESPACE -ne $rawName) {
Info "Using workspace: $TB_NAMESPACE"
}

# -- Step 4/4: Connect to tracebloc network --
Step 4 4 "Connect to tracebloc network"

Expand Down Expand Up @@ -1101,6 +1103,43 @@ function Install-ClientHelm {
$defaultClientId = ""; $defaultClientPassword = ""
}

# -- One-client-per-machine guard --
# A machine runs exactly one tracebloc client: it shares this cluster and the
# host's CPU/RAM/GPU, and the platform counts each client as separate
# capacity. If a DIFFERENT client is already installed here, a re-install
# would silently re-point the machine -- so we stop and let the operator
# decide. The same clientId is a normal re-run/upgrade and passes through.
# Check ANY namespace: a fresh install lands in 'tracebloc', but an install
# from an older installer version may be in a different namespace. Enumerate
# client-chart releases and read each clientId (ConvertFrom-Json -- no jq).
$existingId = ""; $existingNs = ""
$listJson = (helm list -A -o json 2>$null) | Out-String
if ($LASTEXITCODE -eq 0 -and $listJson.Trim()) {
try {
foreach ($rel in ($listJson | ConvertFrom-Json)) {
if ($rel.chart -and $rel.chart.StartsWith("client-")) {
$vals = (helm get values $rel.name -n $rel.namespace 2>$null) | Out-String
if ($vals -match 'clientId:\s*"([^"]+)"') { $existingId = $Matches[1].Trim(); $existingNs = $rel.namespace; break }
}
}
} catch { }
}
if ($existingId -and $existingId -ne $TB_CLIENT_ID) {
Write-Host ""
Warn "This machine already runs the tracebloc client '$existingId' (namespace '$existingNs')."
Hint "tracebloc runs one client per machine -- it shares this cluster and host"
Hint "resources, and the platform counts each client as separate capacity."
Write-Host ""
Hint "You entered a different Client ID ('$TB_CLIENT_ID'). Pick one:"
Hint " - Repair / update '$existingId' -> re-run with that same Client ID"
Hint " - Switch to '$TB_CLIENT_ID' -> remove the current client first:"
Hint " k3d cluster delete $CLUSTER_NAME (wipes this client + its local data)"
Hint " then re-run this installer"
Hint " - Run both clients -> install on a separate machine"
Write-Host ""
Err "Refusing to replace the existing client. See the options above."
}

$passwordEscaped = $TB_CLIENT_PASSWORD -replace "'", "''"

$gpuVal = ""
Expand Down Expand Up @@ -1257,6 +1296,8 @@ function Print-Summary {
Write-Host " " -NoNewline; Write-Host "$([char]0x2714) Connected to tracebloc" -ForegroundColor Green
Write-Host ""
Write-Host " Workspace : " -NoNewline; Write-Host $ns -ForegroundColor Cyan
$cver = Get-ChartVersion -Namespace $ns; if (-not $cver) { $cver = "unknown" }
Write-Host " Version : " -NoNewline; Write-Host $cver -ForegroundColor Cyan
Write-Host " Mode : " -NoNewline; Write-Host $mode -ForegroundColor Cyan
Write-Host ""
Write-Host " Your client is live. Confirm it shows as Online:"
Expand Down Expand Up @@ -1460,8 +1501,6 @@ function Invoke-DiagnoseBundle {
$d = Join-Path $work "tracebloc-diagnose-$ts"
New-Item -ItemType Directory -Path (Join-Path $d "logs") -Force | Out-Null

Info "Collecting diagnostics -- this is safe; credentials are redacted before the file is written."

# Namespace discovery (TB_NAMESPACE isn't set on a standalone diagnose run).
$ns = $TB_NAMESPACE
if (-not $ns) {
Expand All @@ -1470,9 +1509,14 @@ function Invoke-DiagnoseBundle {
}
if (-not $ns) { $ns = "default" }

# Surface the client version first -- the #1 thing support needs to know.
$cver = Get-ChartVersion -Namespace $ns; if (-not $cver) { $cver = "unknown" }
Info "tracebloc client version: $cver (namespace: $ns)"
Info "Collecting diagnostics -- this is safe; credentials are redacted before the file is written."

# host / versions
$h = @("# tracebloc diagnose ($ts)", "OS: Windows ARCH: $(Get-WindowsArch)",
"CLIENT_ENV: $($env:CLIENT_ENV) CLUSTER_NAME: $cn NAMESPACE: $ns", "## versions",
"CLIENT_ENV: $($env:CLIENT_ENV) CLUSTER_NAME: $cn NAMESPACE: $ns", "CLIENT VERSION: $cver", "## versions",
(k3d version 2>&1 | Out-String), (kubectl version --client 2>&1 | Out-String),
(helm version --short 2>&1 | Out-String), (docker version 2>&1 | Out-String))
try { $cs = Get-CimInstance Win32_ComputerSystem -ErrorAction Stop; $h += "CPUs=$($cs.NumberOfLogicalProcessors) MemBytes=$($cs.TotalPhysicalMemory)" } catch {}
Expand Down
2 changes: 2 additions & 0 deletions scripts/install-k8s.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#
# Environment variable overrides (optional):
# CLUSTER_NAME=myapp default: tracebloc
# TB_NAMESPACE=myns default: tracebloc (k8s namespace + local label;
# not prompted — the client is identified by its credentials)
# SERVERS=1 default: 1 (control-plane nodes)
# AGENTS=1 default: 1 (worker nodes)
# K8S_VERSION=v1.29.4-k3s1 default: latest stable k3s
Expand Down
10 changes: 10 additions & 0 deletions scripts/lib/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ hint() { echo -e " ${DIM}$*${RESET}"; }
# ── Utility ──────────────────────────────────────────────────────────────────
has() { command -v "$1" &>/dev/null; }

# Best-effort chart version of the installed client release (e.g. "1.4.4").
# Arguments: $1 - namespace to inspect (default: $TB_NAMESPACE, else "tracebloc")
# Outputs:   the version on stdout; nothing if helm is missing, the helm call
#            fails (e.g. cluster unreachable), or no "client-<ver>" is listed
# Returns:   always 0. A lookup that finds nothing is not an error, so callers
#            running under `set -e` / `set -o pipefail` are never aborted by it.
# Scans helm's table output for "client-<ver>" (the CHART column), so it
# needs no jq.
_chart_version() {
  local ns="${1:-${TB_NAMESPACE:-tracebloc}}"
  local listing=""
  # Kept in a variable so the bracket expression is used as a regex verbatim.
  local chart_re='client-([0-9][^[:space:]]*)'

  has helm || return 0

  # A failing helm call simply leaves us with whatever (usually no) output.
  listing="$(helm list -n "$ns" 2>/dev/null)" || true

  # Leftmost match in the listing == first match on the first matching line.
  if [[ "$listing" =~ $chart_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
  return 0
}

# ── macOS: Docker Desktop architecture vs machine (for wrong-arch UX) ────────
# Call early on macOS to fail fast with clear instructions if Docker.app
# is for the wrong architecture (e.g. Intel Docker on Apple Silicon).
Expand Down Expand Up @@ -361,6 +370,7 @@ Commands:

Advanced configuration (environment variables):
CLUSTER_NAME Cluster name (default: tracebloc)
TB_NAMESPACE Namespace / workspace label (default: tracebloc)
SERVERS Control-plane nodes (default: 1)
AGENTS Worker nodes (default: 1)
K8S_VERSION k3s image tag (default: v1.29.4-k3s1)
Expand Down
8 changes: 6 additions & 2 deletions scripts/lib/diagnose.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ run_diagnose() {
if [[ -z "$outdir" || ! -d "$outdir" ]]; then echo " diagnose: cannot create a temp directory" >&2; return 1; fi
d="$outdir/tracebloc-diagnose-$ts"; mkdir -p "$d/logs"

info "Collecting diagnostics — this is safe; credentials are redacted before the file is written."

# Namespace discovery — TB_NAMESPACE isn't set on a standalone diagnose run,
# so find the namespace of the jobs-manager pod (falls back to "default").
ns="${TB_NAMESPACE:-}"
Expand All @@ -59,12 +57,18 @@ run_diagnose() {
fi
[[ -z "$ns" ]] && ns="default"

# Surface the client version first — the #1 thing support needs to know.
local cver; cver="$(_chart_version "$ns")"
info "tracebloc client version: ${cver:-unknown} (namespace: $ns)"
info "Collecting diagnostics — this is safe; credentials are redacted before the file is written."

# ── host / versions ──
{
echo "# tracebloc diagnose ($ts)"
echo "OS: $(uname -s) $(uname -r)"
echo "ARCH: $(uname -m)"
echo "CLIENT_ENV: ${CLIENT_ENV:-<unset>} CLUSTER_NAME: $cn NAMESPACE: $ns"
echo "CLIENT VERSION: ${cver:-unknown}"
echo; echo "## versions"
has k3d && k3d version
has kubectl && kubectl version --client 2>/dev/null
Expand Down
62 changes: 47 additions & 15 deletions scripts/lib/install-client-helm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,12 @@ install_client_helm() {
if [[ -n "${TRACEBLOC_VALUES_FILE:-}" ]]; then
[[ -f "$TRACEBLOC_VALUES_FILE" ]] || error "TRACEBLOC_VALUES_FILE not found: $TRACEBLOC_VALUES_FILE"
values_file="$TRACEBLOC_VALUES_FILE"
TB_NAMESPACE="${TB_NAMESPACE:-default}"
TB_NAMESPACE="${TB_NAMESPACE:-tracebloc}"
info "Dev mode: using caller-provided values file"
log "Using values file: $values_file (namespace: $TB_NAMESPACE)"
else

local use_existing=""
local default_namespace="default"
local default_client_id=""
local default_client_password=""

Expand All @@ -179,19 +178,12 @@ install_client_helm() {
fi
fi

# ── Workspace name prompt ────────────────────────────────────────────────
prompt_header "Choose a workspace name"
hint "This identifies your tracebloc client on this machine."
echo ""
hint "Examples: berlin-team, vision-lab, ml-mardan"
echo ""
read -r -p " Workspace name [${default_namespace}]: " TB_NAMESPACE_INPUT
local raw_name="${TB_NAMESPACE_INPUT:-$default_namespace}"
TB_NAMESPACE=$(_sanitize_workspace_name "$raw_name")

if [[ "$TB_NAMESPACE" != "$raw_name" ]]; then
info "Using workspace: ${BOLD}${TB_NAMESPACE}${RESET}"
fi
# ── Namespace (fixed; not prompted) ──────────────────────────────────────
# The on-prem client is one-per-machine and is identified to the backend by
# its credentials (clientId), not by this name — so we don't ask the user to
# invent one. It's just the local k8s namespace / Helm release name.
# Advanced / GitOps setups can override with TB_NAMESPACE=<name>.
TB_NAMESPACE=$(_sanitize_workspace_name "${TB_NAMESPACE:-tracebloc}")

# ── Step 4/4: Connect to tracebloc network ──────────────────────────────
step 4 4 "Connect to tracebloc network"
Expand Down Expand Up @@ -255,6 +247,46 @@ install_client_helm() {
default_client_id=""; default_client_password=""
done

# ── One-client-per-machine guard ─────────────────────────────────────────
# A machine runs exactly one tracebloc client: it shares this cluster and the
# host's CPU/RAM/GPU, and the platform counts each client as separate
# capacity. If a DIFFERENT client is already installed here, a re-install
# would silently re-point the machine — so we stop and let the operator
# decide. The same clientId is a normal re-run/upgrade and passes through.
# Check ANY namespace: a fresh install lands in `tracebloc`, but an install
# from an older installer version may be in a different namespace. Enumerate
# client-chart releases and read each clientId. jq is already used elsewhere
# in the installer; if it's somehow absent, fall back to the `tracebloc` ns.
local existing_id="" existing_ns="" _gvf _rel _ns _id
_gvf="$(mktemp)"
if has jq; then
while IFS=$'\t' read -r _rel _ns; do
[[ -z "$_rel" ]] && continue
if helm get values "$_rel" -n "$_ns" > "$_gvf" 2>/dev/null; then
_id="$(_extract_yaml_value "$_gvf" clientId)"
[[ -n "$_id" ]] && { existing_id="$_id"; existing_ns="$_ns"; break; }
fi
done < <(helm list -A -o json 2>/dev/null | jq -r '.[] | select((.chart // "") | startswith("client-")) | "\(.name)\t\(.namespace)"')
elif helm get values "$TB_NAMESPACE" -n "$TB_NAMESPACE" > "$_gvf" 2>/dev/null; then
existing_id="$(_extract_yaml_value "$_gvf" clientId)"; existing_ns="$TB_NAMESPACE"
fi
rm -f "$_gvf"
if [[ -n "$existing_id" && "$existing_id" != "$TB_CLIENT_ID" ]]; then
echo ""
warn "This machine already runs the tracebloc client '${existing_id}' (namespace '${existing_ns}')."
hint "tracebloc runs one client per machine — it shares this cluster and host"
hint "resources, and the platform counts each client as separate capacity."
echo ""
hint "You entered a different Client ID ('${TB_CLIENT_ID}'). Pick one:"
hint " • Repair / update '${existing_id}' → re-run with that same Client ID"
hint " • Switch to '${TB_CLIENT_ID}' → remove the current client first:"
hint " k3d cluster delete ${CLUSTER_NAME:-tracebloc} (wipes this client + its local data)"
hint " then re-run this installer"
hint " • Run both clients → install on a separate machine"
echo ""
error "Refusing to replace the existing client. See the options above."
fi

TB_CLIENT_PASSWORD_ESCAPED="${TB_CLIENT_PASSWORD//\'/\'\'}"

# ── GPU limits ──────────────────────────────────────────────────────────
Expand Down
2 changes: 2 additions & 0 deletions scripts/lib/summary.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ print_summary() {
[[ "$GPU_VENDOR" == "nvidia" ]] && mode="NVIDIA GPU"
[[ "$GPU_VENDOR" == "amd" ]] && mode="AMD GPU"
local ns="${TB_NAMESPACE:-default}"
local cver; cver="$(_chart_version "$ns")"
local line="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

echo ""
Expand All @@ -99,6 +100,7 @@ print_summary() {
echo -e " ${BOLD}${GREEN}✔ Connected to tracebloc${RESET}"
echo ""
echo -e " ${BOLD}Workspace${RESET} : ${CYAN}${ns}${RESET}"
echo -e " ${BOLD}Version${RESET} : ${CYAN}${cver:-unknown}${RESET}"
echo -e " ${BOLD}Mode${RESET} : ${CYAN}${mode}${RESET}"
echo ""
echo -e " Your client is live. Confirm it shows as ${BOLD}🟢 Online${RESET}:"
Expand Down
11 changes: 11 additions & 0 deletions scripts/tests/diagnose.bats
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,14 @@ setup() {
# Finding 2 (security review): `helm get manifest` (base64 Secrets) is NOT collected
! tar -xzOf "$tgz" 2>/dev/null | grep -q 'get manifest'
}

@test "run_diagnose: surfaces + records the client version" {
  # Stubs: helm is the only tool reported as installed, and every helm call
  # prints one release row whose CHART column is client-1.4.4.
  has() { case "$1" in helm) return 0 ;; *) return 1 ;; esac; } # only helm present
  helm() { echo "tracebloc tracebloc 1 now deployed client-1.4.4 1.4.4"; }

  run run_diagnose
  [ "$status" -eq 0 ]

  # The version is announced on the console ...
  # `|| false`: on Bash 3.2 a failing [[ ]] that is not the last command of a
  # test does not fail it, so force a non-zero status explicitly.
  [[ "$output" == *"client version: 1.4.4"* ]] || false

  # ... and recorded inside the generated bundle.
  tgz="$(ls "$HOST_DATA_DIR"/tracebloc-diagnose-*.tgz 2>/dev/null | head -1)"
  [ -n "$tgz" ]
  tar -xzOf "$tgz" 2>/dev/null | grep -q 'CLIENT VERSION: 1.4.4'
}
Loading
Loading