Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions kubernetes/session-master-rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ServiceAccount + Role + RoleBinding for the session-master. Grants CoreV1Api
# access to pods and services (get, list, watch, create, delete; no update or
# patch) and read access to pods/status, scoped to the sessions namespace. No
# ClusterRole, no cross-namespace access.
# Identity for the session-master. The RoleBinding named session-master lists
# this account as its only subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: session-master-sa
  namespace: sessions
---
# Namespaced permissions for the session-master, limited to `sessions`.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: session-master
  namespace: sessions
rules:
  # Core API group (""): read, create and delete pods and services.
  # No update or patch verbs are granted.
  - apiGroups:
      - ""
    resources:
      - pods
      - services
    verbs:
      - get
      - list
      - watch
      - create
      - delete
  # Read-only access to the pods/status subresource.
  - apiGroups:
      - ""
    resources:
      - pods/status
    verbs:
      - get
---
# Binds the session-master Role to the session-master-sa ServiceAccount; both
# live in the sessions namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: session-master
  namespace: sessions
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: session-master
subjects:
  - kind: ServiceAccount
    name: session-master-sa
    namespace: sessions
27 changes: 27 additions & 0 deletions kubernetes/session-master-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Per-session master Service template. The session-manager renders this per
# session, substituting ${SESSION_ID} and ${MASTER_JOB_UID} before applying.
# ownerReferences target the master Job so K8s cascade-GC reaps this Service
# when the Job is deleted or TTL-reaped after the master exits.
#
# Substituted values that stand alone as a scalar are quoted: a rendered value
# that looks like a number, boolean or null (e.g. an all-digit session id)
# would otherwise parse as a non-string, and label/selector values must be
# strings.
apiVersion: v1
kind: Service
metadata:
  name: session-master-${SESSION_ID}
  namespace: sessions
  labels:
    app: session-master
    sessionId: "${SESSION_ID}"
  ownerReferences:
    - apiVersion: batch/v1
      kind: Job
      name: session-master-${SESSION_ID}
      uid: "${MASTER_JOB_UID}"
      controller: true
      blockOwnerDeletion: true
spec:
  type: ClusterIP
  # Selects pods carrying both labels below.
  selector:
    app: session-master
    sessionId: "${SESSION_ID}"
  ports:
    - port: 80
      targetPort: 80
60 changes: 60 additions & 0 deletions kubernetes/session-master-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Per-session master Job template. The session-manager renders this per
# session, substituting ${SESSION_ID} and ${SESSIONS_IMAGE_TAG} (CI-supplied
# image tag, e.g. web_api_gpu:sessions-<sha>) before applying.
#
# Substituted values that stand alone as a scalar are quoted: labels and env
# `value` fields must be strings, and an unquoted rendered value such as an
# all-digit session id would parse as an integer instead.
apiVersion: batch/v1
kind: Job
metadata:
  name: session-master-${SESSION_ID}
  namespace: sessions
  labels:
    app: session-master
    sessionId: "${SESSION_ID}"
spec:
  # No retries: a failed master pod fails the Job.
  backoffLimit: 0
  # The finished Job becomes eligible for automatic deletion after 300 seconds.
  ttlSecondsAfterFinished: 300
  template:
    metadata:
      labels:
        app: session-master
        sessionId: "${SESSION_ID}"
    spec:
      serviceAccountName: session-master-sa
      restartPolicy: Never
      containers:
        - name: session-master
          image: "${SESSIONS_IMAGE_TAG}"
          command: ["zetta", "session-master"]
          env:
            - name: SESSION_ID
              value: "${SESSION_ID}"
            # The pod's own name and UID, exposed via the downward API.
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_UID
              valueFrom:
                fieldRef:
                  fieldPath: metadata.uid
            - name: WORKLOAD_NAMESPACE
              value: sessions
            - name: SESSIONS_IMAGE_TAG
              value: "${SESSIONS_IMAGE_TAG}"
            # Template paths under the /etc/sessions ConfigMap mount below.
            - name: SESSION_WORKER_TEMPLATE_PATH
              value: /etc/sessions/session-worker-template.yaml
            - name: SESSION_WORKER_SERVICE_TEMPLATE_PATH
              value: /etc/sessions/session-worker-service.yaml
            - name: OAUTH_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: sessions-oauth
                  key: client-id
          resources:
            requests:
              cpu: "0.1"
              memory: "256Mi"
            limits:
              cpu: "0.5"
              memory: "512Mi"
          volumeMounts:
            - name: session-templates
              mountPath: /etc/sessions
      volumes:
        # ConfigMap mounted at /etc/sessions in the container above.
        - name: session-templates
          configMap:
            name: session-templates
36 changes: 36 additions & 0 deletions kubernetes/session-reconcile-cronjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Daily reconcile backstop. Runs at 06:00 UTC; concurrencyPolicy: Forbid
# ensures only one scan runs at a time. Finds orphaned or stale sessions that
# cascade-GC missed and terminates them.
#
# ${SESSIONS_IMAGE_TAG} is a placeholder and must be substituted before this
# manifest is applied.
# NOTE(review): presumably rendered by the same step as the other session
# templates; confirm.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: session-reconcile
  namespace: sessions
spec:
  schedule: "0 6 * * *"
  # Pin the schedule to UTC. Without timeZone the schedule is interpreted in
  # the kube-controller-manager's local time zone, so "06:00 UTC" would only
  # hold by accident. Requires a cluster that supports .spec.timeZone
  # (stable since Kubernetes v1.27).
  timeZone: "Etc/UTC"
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: session-reconcile-sa
          restartPolicy: Never
          containers:
            - name: session-reconcile
              image: "${SESSIONS_IMAGE_TAG}"
              command: ["zetta", "session-reconcile"]
              env:
                - name: WORKLOAD_NAMESPACE
                  value: sessions
                # Both Firestore settings are optional: the pod still starts
                # if the sessions-firestore Secret or either key is missing.
                - name: SESSIONS_FIRESTORE_PROJECT
                  valueFrom:
                    secretKeyRef:
                      name: sessions-firestore
                      key: project
                      optional: true
                - name: SESSIONS_FIRESTORE_DATABASE
                  valueFrom:
                    secretKeyRef:
                      name: sessions-firestore
                      key: database
                      optional: true
35 changes: 35 additions & 0 deletions kubernetes/session-reconcile-rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ServiceAccount + Role + RoleBinding for the session-reconcile CronJob. Distinct
# from session-manager-sa. Grants BatchV1Api read+delete on jobs and CoreV1Api
# read+delete on pods and services, scoped to the sessions namespace.
# Identity for the session-reconcile CronJob. The RoleBinding named
# session-reconcile lists this account as its only subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: session-reconcile-sa
  namespace: sessions
---
# Namespaced permissions for the reconcile scan, limited to `sessions`.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: session-reconcile
  namespace: sessions
rules:
  # batch group: read and delete Jobs.
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - get
      - list
      - delete
  # Core API group (""): read and delete pods and services.
  - apiGroups:
      - ""
    resources:
      - pods
      - services
    verbs:
      - get
      - list
      - delete
---
# Binds the session-reconcile Role to the session-reconcile-sa ServiceAccount;
# both live in the sessions namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: session-reconcile
  namespace: sessions
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: session-reconcile
subjects:
  - kind: ServiceAccount
    name: session-reconcile-sa
    namespace: sessions
28 changes: 28 additions & 0 deletions kubernetes/session-worker-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Worker Service template. The session-master renders this at boot, substituting
# ${SESSION_ID}, ${MASTER_POD_NAME}, and ${MASTER_POD_UID}, then calls
# CoreV1Api().create_namespaced_service(...). Provides stable cluster DNS
# session-worker-<sessionId>.sessions.svc.cluster.local. ownerReferences target
# the master Pod so K8s cascade-GC reaps the Service when the master is gone.
#
# Substituted values that stand alone as a scalar are quoted: a rendered value
# that looks like a number, boolean or null (e.g. an all-digit session id)
# would otherwise parse as a non-string, and label/selector values must be
# strings.
apiVersion: v1
kind: Service
metadata:
  name: session-worker-${SESSION_ID}
  namespace: sessions
  labels:
    app: session-worker
    sessionId: "${SESSION_ID}"
  ownerReferences:
    - apiVersion: v1
      kind: Pod
      name: "${MASTER_POD_NAME}"
      uid: "${MASTER_POD_UID}"
      controller: true
      blockOwnerDeletion: true
spec:
  type: ClusterIP
  # Selects pods carrying both labels below.
  selector:
    app: session-worker
    sessionId: "${SESSION_ID}"
  ports:
    - port: 80
      targetPort: 80
39 changes: 39 additions & 0 deletions kubernetes/session-worker-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Worker Pod template. The session-master renders this at boot, substituting
# ${SESSION_ID}, ${INITIAL_PRELOAD}, ${MASTER_POD_NAME}, ${MASTER_POD_UID}, and
# ${SESSIONS_IMAGE_TAG}, then calls CoreV1Api().create_namespaced_pod(...).
#
# ownerReferences target the master Pod so K8s cascade-GC reaps the worker when
# the master is gone.
#
# Substituted values that stand alone as a scalar are quoted: labels and env
# `value` fields must be strings, and an unquoted rendered value such as
# `true`, `1`, or an empty string would parse as a boolean, integer, or null.
apiVersion: v1
kind: Pod
metadata:
  name: session-worker-${SESSION_ID}
  namespace: sessions
  labels:
    app: session-worker
    sessionId: "${SESSION_ID}"
  ownerReferences:
    - apiVersion: v1
      kind: Pod
      name: "${MASTER_POD_NAME}"
      uid: "${MASTER_POD_UID}"
      controller: true
      blockOwnerDeletion: true
spec:
  restartPolicy: Never
  containers:
    - name: session-worker
      image: "${SESSIONS_IMAGE_TAG}"
      # Serves on 0.0.0.0:80, matching containerPort below.
      command: ["hypercorn", "app.worker:app", "--bind", "0.0.0.0:80"]
      env:
        - name: SESSION_ID
          value: "${SESSION_ID}"
        # Single-quoted so a rendered value containing double quotes still
        # parses as one string.
        # NOTE(review): assumes the rendered value never contains a single
        # quote; confirm against the renderer.
        - name: INITIAL_PRELOAD
          value: '${INITIAL_PRELOAD}'
        - name: OAUTH_CLIENT_ID
          valueFrom:
            secretKeyRef:
              name: sessions-oauth
              key: client-id
      ports:
        - containerPort: 80
53 changes: 53 additions & 0 deletions tests/unit/run/test_check_run_id_conflict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pytest

from zetta_utils.run import RunInfo, RunState, _check_run_id_conflict


class _FakeRunDB:
def __init__(self):
self._rows: dict[str, dict] = {}

def __contains__(self, key):
return key in self._rows

def __getitem__(self, key):
run_id, _cols = key
return self._rows.get(run_id, {})

def __setitem__(self, key, value):
run_id, _cols = key
self._rows.setdefault(run_id, {}).update(value)


@pytest.fixture
def fake_run_db(mocker):
    """Patch ``zetta_utils.run.RUN_DB`` with a fresh ``_FakeRunDB`` and return it."""
    stub_db = _FakeRunDB()
    mocker.patch("zetta_utils.run.RUN_DB", new=stub_db)
    return stub_db


def test_no_existing_row(fake_run_db):
    """With an empty fake DB, the conflict check is expected to return without raising."""
    unused_run_id = "fresh"
    _check_run_id_conflict(unused_run_id)


def test_existing_row_raises_without_allowed(fake_run_db):
    """A stored RUNNING row must raise ``ValueError`` ("already exists") when no prior state is allowed."""
    state_col = RunInfo.STATE.value
    existing_row = {state_col: RunState.RUNNING.value}
    fake_run_db[("running-id", (state_col,))] = existing_row

    with pytest.raises(ValueError, match="already exists"):
        _check_run_id_conflict("running-id")


def test_existing_queued_with_allowed_does_not_raise(fake_run_db):
    """A stored QUEUED row is accepted when ``allowed_prior_state="queued"`` is passed."""
    state_col = RunInfo.STATE.value
    existing_row = {state_col: RunState.QUEUED.value}
    fake_run_db[("queued-id", (state_col,))] = existing_row

    _check_run_id_conflict("queued-id", allowed_prior_state="queued")


def test_mismatched_prior_state_raises(fake_run_db):
    """A stored RUNNING row must raise ``ValueError`` ("state=") when only "queued" is allowed."""
    state_col = RunInfo.STATE.value
    existing_row = {state_col: RunState.RUNNING.value}
    fake_run_db[("mismatch-id", (state_col,))] = existing_row

    with pytest.raises(ValueError, match="state="):
        _check_run_id_conflict("mismatch-id", allowed_prior_state="queued")
Loading
Loading