fix: review fixes - COMPUTE checkpoint type, memory_gb standardization, OTLP endpoint, snapshot concurrency

nicktrn · nicktrn · commit 05a6721def4c · 2026-03-28T18:55:38.000Z
diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json
@@ -14,10 +14,11 @@
   },
   "dependencies": {
     "@aws-sdk/client-ecr": "^3.839.0",
-    "@kubernetes/client-node": "^1.0.0",
     "@internal/compute": "workspace:*",
+    "@kubernetes/client-node": "^1.0.0",
     "@trigger.dev/core": "workspace:*",
     "dockerode": "^4.0.6",
+    "p-limit": "^6.2.0",
     "prom-client": "^15.1.0",
     "socket.io": "4.7.4",
     "std-env": "^3.8.0",
diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts
@@ -27,7 +27,7 @@ import { PodCleaner } from "./services/podCleaner.js";
 import { FailedPodHandler } from "./services/failedPodHandler.js";
 import { getWorkerToken } from "./workerToken.js";
 import { OtlpTraceService } from "./services/otlpTraceService.js";
-import { extractTraceparent } from "./util.js";
+import { extractTraceparent, getRestoreRunnerId } from "./util.js";
 
 if (env.METRICS_COLLECT_DEFAULTS) {
   collectDefaultMetrics({ register });
@@ -96,7 +96,7 @@ class ManagedSupervisor {
 
       if (env.COMPUTE_TRACE_SPANS_ENABLED) {
         this.tracing = new OtlpTraceService({
-          endpointUrl: env.OTEL_EXPORTER_OTLP_ENDPOINT,
+          endpointUrl: env.COMPUTE_TRACE_OTLP_ENDPOINT,
         });
       }
 
@@ -273,10 +273,7 @@ class ManagedSupervisor {
 
           if (this.computeManager) {
             try {
-              // Derive runnerId unique per restore cycle (matches iceman's pattern)
-              const runIdShort = message.run.friendlyId.replace("run_", "");
-              const checkpointSuffix = checkpoint.id.slice(-8);
-              const runnerId = `runner-${runIdShort}-${checkpointSuffix}`;
+              const runnerId = getRestoreRunnerId(message.run.friendlyId, checkpoint.id);
 
               const didRestore = await this.computeManager.restore({
                 snapshotId: checkpoint.location,
diff --git a/apps/supervisor/src/util.ts b/apps/supervisor/src/util.ts
@@ -35,3 +35,10 @@ export function getRunnerId(runId: string, attemptNumber?: number) {
 
   return parts.join("-");
 }
+
+/** Builds the runnerId for a restore cycle: `runner-<runFriendlyId without its first "run_">-<last 8 chars of checkpointId>`. */
+export function getRestoreRunnerId(runFriendlyId: string, checkpointId: string) {
+  const runIdShort = runFriendlyId.replace("run_", ""); // string pattern: removes only the first "run_" occurrence, wherever it appears
+  const checkpointSuffix = checkpointId.slice(-8); // last 8 characters; the whole id when it is shorter than 8
+  return `runner-${runIdShort}-${checkpointSuffix}`;
+}
diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts
@@ -154,6 +154,8 @@ export class ComputeWorkloadManager implements WorkloadManager {
         event.error = error instanceof Error ? error.message : String(error);
         event.errorType =
           error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch";
+        // Intentional: errors are captured in the wide event, not thrown. This matches
+        // the Docker/K8s managers. The run will eventually time out if scheduling fails.
         return;
       }
 
@@ -293,7 +295,7 @@ export class ComputeWorkloadManager implements WorkloadManager {
         name: opts.runnerId,
         metadata,
         cpu: opts.machine.cpu,
-        memory_mb: opts.machine.memory * 1024,
+        memory_gb: opts.machine.memory,
       })
     );
 
diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts
@@ -1,6 +1,7 @@
 import { type Namespace, Server, type Socket } from "socket.io";
 import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger";
 import EventEmitter from "node:events";
+import pLimit from "p-limit";
 import { z } from "zod";
 import {
   type SupervisorHttpClient,
@@ -119,8 +120,10 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
   // hours later after a checkpoint/restore cycle. Using a capped map avoids unbounded
   // growth while keeping recent contexts available. Oldest entries are evicted first.
   private static readonly MAX_TRACE_CONTEXTS = 10_000;
+  private static readonly SNAPSHOT_CONCURRENCY = 10;
   private readonly runTraceContexts = new Map<string, RunTraceContext>();
   private readonly snapshotDelayWheel?: TimerWheel<DelayedSnapshot>;
+  private readonly snapshotLimit = pLimit(WorkloadServer.SNAPSHOT_CONCURRENCY);
 
   constructor(opts: WorkloadServerOptions) {
     super();
@@ -137,7 +140,7 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
       this.snapshotDelayWheel = new TimerWheel<DelayedSnapshot>({
         delayMs: this.computeManager.snapshotDelayMs,
         onExpire: (item) => {
-          this.dispatchComputeSnapshot(item.data).catch((error) => {
+          this.snapshotLimit(() => this.dispatchComputeSnapshot(item.data)).catch((error) => {
             this.logger.error("Compute snapshot dispatch failed", {
               runId: item.data.runFriendlyId,
               runnerId: item.data.runnerId,
@@ -513,7 +516,7 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
             body: {
               success: true,
               checkpoint: {
-                type: "KUBERNETES",
+                type: "COMPUTE",
                 location: body.snapshot_id,
               },
             },
@@ -820,7 +823,10 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
   }
 
   registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) {
-    // Evict oldest entries if we've hit the cap
+    // Evict oldest entries if we've hit the cap. This is best-effort: on a busy
+    // supervisor, entries for long-lived runs may be evicted before their snapshot
+    // callback arrives, causing those snapshot spans to be silently dropped.
+    // That's acceptable - trace spans are observability sugar, not correctness.
     if (this.runTraceContexts.size >= WorkloadServer.MAX_TRACE_CONTEXTS) {
       const firstKey = this.runTraceContexts.keys().next().value;
       if (firstKey) {
diff --git a/apps/webapp/app/v3/services/computeTemplateCreation.server.ts b/apps/webapp/app/v3/services/computeTemplateCreation.server.ts
@@ -169,7 +169,7 @@ export class ComputeTemplateCreationService {
       await this.client.templates.create({
         image: stripImageDigest(imageReference),
         cpu: machine.cpu,
-        memory_mb: machine.memory * 1024,
+        memory_gb: machine.memory,
         background: options?.background,
       });
       return { success: true };
diff --git a/internal-packages/compute/src/types.ts b/internal-packages/compute/src/types.ts
@@ -5,7 +5,7 @@ import { z } from "zod";
 export const TemplateCreateRequestSchema = z.object({
   image: z.string(),
   cpu: z.number(),
-  memory_mb: z.number(),
+  memory_gb: z.number(),
   background: z.boolean().optional(),
   callback: z
     .object({
@@ -58,6 +58,6 @@ export const SnapshotRestoreRequestSchema = z.object({
   name: z.string(),
   metadata: z.record(z.string()),
   cpu: z.number(),
-  memory_mb: z.number(),
+  memory_gb: z.number(),
 });
 export type SnapshotRestoreRequest = z.infer<typeof SnapshotRestoreRequestSchema>;
diff --git a/internal-packages/database/prisma/migrations/20260328000000_add_compute_checkpoint_type/migration.sql b/internal-packages/database/prisma/migrations/20260328000000_add_compute_checkpoint_type/migration.sql
@@ -0,0 +1,2 @@
+-- AlterEnum
+ALTER TYPE "TaskRunCheckpointType" ADD VALUE 'COMPUTE';
diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma
@@ -1100,6 +1100,7 @@ model TaskRunCheckpoint {
 enum TaskRunCheckpointType {
   DOCKER
   KUBERNETES
+  COMPUTE
 }
 
 /// A Waitpoint blocks a run from continuing until it's completed
diff --git a/packages/core/src/v3/schemas/runEngine.ts b/packages/core/src/v3/schemas/runEngine.ts
@@ -177,7 +177,8 @@ export type CompleteRunAttemptResult = z.infer<typeof CompleteRunAttemptResult>;
 export const CheckpointTypeEnum = {
   DOCKER: "DOCKER",
   KUBERNETES: "KUBERNETES",
-} satisfies Enum<DB_TYPES.CheckpointType>;
+  COMPUTE: "COMPUTE",
+} satisfies Enum<DB_TYPES.TaskRunCheckpointType>;
 export type CheckpointTypeEnum = (typeof CheckpointTypeEnum)[keyof typeof CheckpointTypeEnum];
 
 export const CheckpointType = z.enum(Object.values(CheckpointTypeEnum) as [CheckpointTypeEnum]);
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+-- AlterEnum`
	`2`	`+ALTER TYPE "TaskRunCheckpointType" ADD VALUE 'COMPUTE';`
Original file line number	Diff line number	Diff line change
`@@ -1100,6 +1100,7 @@ model TaskRunCheckpoint {`
`1100`	`1100`	`enum TaskRunCheckpointType {`
`1101`	`1101`	`DOCKER`
`1102`	`1102`	`KUBERNETES`
	`1103`	`+ COMPUTE`
`1103`	`1104`	`}`
`1104`	`1105`
`1105`	`1106`	`/// A Waitpoint blocks a run from continuing until it's completed`