Skip to content

Commit 4d603ad

Browse files
committed
feat(supervisor): add compute checkpoint/restore support
- Fix instance creation URL from /api/sandboxes to /api/instances - Pass name: runnerId when creating compute instances - Add snapshot(), deleteInstance(), and restore() methods to ComputeWorkloadManager - Add /api/v1/compute/snapshot-complete callback endpoint to WorkloadServer - Handle suspend requests in compute mode via fire-and-forget snapshot with callback - Handle restore in compute mode by calling gateway restore API directly - Wire computeManager into WorkloadServer for compute mode suspend/restore
1 parent e4915c4 commit 4d603ad

3 files changed

Lines changed: 269 additions & 22 deletions

File tree

apps/supervisor/src/index.ts

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,11 @@ class ManagedSupervisor {
3636
private readonly metricsServer?: HttpServer;
3737
private readonly workloadServer: WorkloadServer;
3838
private readonly workloadManager: WorkloadManager;
39+
private readonly computeManager?: ComputeWorkloadManager;
3940
private readonly logger = new SimpleStructuredLogger("managed-supervisor");
4041
private readonly resourceMonitor: ResourceMonitor;
4142
private readonly checkpointClient?: CheckpointClient;
43+
private readonly isComputeMode: boolean;
4244

4345
private readonly podCleaner?: PodCleaner;
4446
private readonly failedPodHandler?: FailedPodHandler;
@@ -78,16 +80,22 @@ class ManagedSupervisor {
7880
: new DockerResourceMonitor(new Docker())
7981
: new NoopResourceMonitor();
8082

81-
this.workloadManager = env.COMPUTE_GATEWAY_URL
82-
? new ComputeWorkloadManager({
83-
...workloadManagerOptions,
84-
gatewayUrl: env.COMPUTE_GATEWAY_URL,
85-
gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN,
86-
gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS,
87-
})
88-
: this.isKubernetes
83+
this.isComputeMode = !!env.COMPUTE_GATEWAY_URL;
84+
85+
if (env.COMPUTE_GATEWAY_URL) {
86+
const computeManager = new ComputeWorkloadManager({
87+
...workloadManagerOptions,
88+
gatewayUrl: env.COMPUTE_GATEWAY_URL,
89+
gatewayAuthToken: env.COMPUTE_GATEWAY_AUTH_TOKEN,
90+
gatewayTimeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS,
91+
});
92+
this.computeManager = computeManager;
93+
this.workloadManager = computeManager;
94+
} else {
95+
this.workloadManager = this.isKubernetes
8996
? new KubernetesWorkloadManager(workloadManagerOptions)
9097
: new DockerWorkloadManager(workloadManagerOptions);
98+
}
9199

92100
if (this.isKubernetes) {
93101
if (env.POD_CLEANER_ENABLED) {
@@ -215,6 +223,22 @@ class ManagedSupervisor {
215223
if (checkpoint) {
216224
this.logger.log("Restoring run", { runId: message.run.id });
217225

226+
if (this.isComputeMode && this.computeManager) {
227+
try {
228+
const didRestore = await this.computeManager.restore(checkpoint.location);
229+
230+
if (didRestore) {
231+
this.logger.log("Compute restore successful", { runId: message.run.id });
232+
} else {
233+
this.logger.error("Compute restore failed", { runId: message.run.id });
234+
}
235+
} catch (error) {
236+
this.logger.error("Failed to restore run (compute)", { error });
237+
}
238+
239+
return;
240+
}
241+
218242
if (!this.checkpointClient) {
219243
this.logger.error("No checkpoint client", { runId: message.run.id });
220244
return;
@@ -309,6 +333,7 @@ class ManagedSupervisor {
309333
host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL,
310334
workerClient: this.workerSession.httpClient,
311335
checkpointClient: this.checkpointClient,
336+
computeManager: this.computeManager,
312337
});
313338

314339
this.workloadServer.on("runConnected", this.onRunConnected.bind(this));

apps/supervisor/src/workloadManager/compute.ts

Lines changed: 120 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ export class ComputeWorkloadManager implements WorkloadManager {
6060
}
6161

6262
if (this.opts.snapshotPollIntervalSeconds) {
63-
envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String(this.opts.snapshotPollIntervalSeconds);
63+
envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String(
64+
this.opts.snapshotPollIntervalSeconds
65+
);
6466
}
6567

6668
if (this.opts.additionalEnvVars) {
@@ -78,7 +80,7 @@ export class ComputeWorkloadManager implements WorkloadManager {
7880
// Strip image digest — resolve by tag, not digest
7981
const imageRef = opts.image.split("@")[0]!;
8082

81-
const url = `${this.opts.gatewayUrl}/api/sandboxes`;
83+
const url = `${this.opts.gatewayUrl}/api/instances`;
8284

8385
// Wide event: single canonical log line emitted in finally
8486
const event: Record<string, unknown> = {
@@ -111,6 +113,7 @@ export class ComputeWorkloadManager implements WorkloadManager {
111113
headers,
112114
signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs),
113115
body: JSON.stringify({
116+
name: runnerId,
114117
image: imageRef,
115118
env: envVars,
116119
cpu: opts.machine.cpu,
@@ -153,12 +156,125 @@ export class ComputeWorkloadManager implements WorkloadManager {
153156
return;
154157
}
155158

156-
event.sandboxId = data.id;
159+
event.instanceId = data.id;
157160
event.ok = true;
158161
} finally {
159162
event.durationMs = Math.round(performance.now() - startMs);
160163
event.ok ??= false;
161-
this.logger.info("create sandbox", event);
164+
this.logger.info("create instance", event);
165+
}
166+
}
167+
168+
private get authHeaders(): Record<string, string> {
169+
const headers: Record<string, string> = {
170+
"Content-Type": "application/json",
171+
};
172+
if (this.opts.gatewayAuthToken) {
173+
headers["Authorization"] = `Bearer ${this.opts.gatewayAuthToken}`;
174+
}
175+
return headers;
176+
}
177+
178+
async snapshot(opts: {
179+
runnerId: string;
180+
callbackUrl: string;
181+
metadata: Record<string, string>;
182+
}): Promise<boolean> {
183+
const url = `${this.opts.gatewayUrl}/api/instances/${opts.runnerId}/snapshot`;
184+
185+
const [error, response] = await tryCatch(
186+
fetch(url, {
187+
method: "POST",
188+
headers: this.authHeaders,
189+
signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs),
190+
body: JSON.stringify({
191+
callback: {
192+
url: opts.callbackUrl,
193+
metadata: opts.metadata,
194+
},
195+
}),
196+
})
197+
);
198+
199+
if (error) {
200+
this.logger.error("snapshot request failed", {
201+
runnerId: opts.runnerId,
202+
error: error instanceof Error ? error.message : String(error),
203+
});
204+
return false;
162205
}
206+
207+
if (response.status !== 202) {
208+
this.logger.error("snapshot request rejected", {
209+
runnerId: opts.runnerId,
210+
status: response.status,
211+
});
212+
return false;
213+
}
214+
215+
this.logger.info("snapshot request accepted", { runnerId: opts.runnerId });
216+
return true;
217+
}
218+
219+
async deleteInstance(runnerId: string): Promise<boolean> {
220+
const url = `${this.opts.gatewayUrl}/api/instances/${runnerId}`;
221+
222+
const [error, response] = await tryCatch(
223+
fetch(url, {
224+
method: "DELETE",
225+
headers: this.authHeaders,
226+
signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs),
227+
})
228+
);
229+
230+
if (error) {
231+
this.logger.error("delete instance failed", {
232+
runnerId,
233+
error: error instanceof Error ? error.message : String(error),
234+
});
235+
return false;
236+
}
237+
238+
if (!response.ok) {
239+
this.logger.error("delete instance rejected", {
240+
runnerId,
241+
status: response.status,
242+
});
243+
return false;
244+
}
245+
246+
this.logger.info("delete instance success", { runnerId });
247+
return true;
248+
}
249+
250+
async restore(snapshotId: string): Promise<boolean> {
251+
const url = `${this.opts.gatewayUrl}/api/snapshots/${snapshotId}/restore`;
252+
253+
const [error, response] = await tryCatch(
254+
fetch(url, {
255+
method: "POST",
256+
headers: this.authHeaders,
257+
signal: AbortSignal.timeout(this.opts.gatewayTimeoutMs),
258+
})
259+
);
260+
261+
if (error) {
262+
this.logger.error("restore request failed", {
263+
snapshotId,
264+
error: error instanceof Error ? error.message : String(error),
265+
});
266+
return false;
267+
}
268+
269+
if (!response.ok) {
270+
this.logger.error("restore request rejected", {
271+
snapshotId,
272+
status: response.status,
273+
});
274+
return false;
275+
}
276+
277+
this.logger.info("restore request success", { snapshotId });
278+
return true;
163279
}
164280
}

0 commit comments

Comments
 (0)