11import { type Namespace , Server , type Socket } from "socket.io" ;
22import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger" ;
33import EventEmitter from "node:events" ;
4+ import pLimit from "p-limit" ;
45import { z } from "zod" ;
56import {
67 type SupervisorHttpClient ,
@@ -119,8 +120,10 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
119120 // hours later after a checkpoint/restore cycle. Using a capped map avoids unbounded
120121 // growth while keeping recent contexts available. Oldest entries are evicted first.
121122 private static readonly MAX_TRACE_CONTEXTS = 10_000 ;
123+ private static readonly SNAPSHOT_CONCURRENCY = 10 ;
122124 private readonly runTraceContexts = new Map < string , RunTraceContext > ( ) ;
123125 private readonly snapshotDelayWheel ?: TimerWheel < DelayedSnapshot > ;
126+ private readonly snapshotLimit = pLimit ( WorkloadServer . SNAPSHOT_CONCURRENCY ) ;
124127
125128 constructor ( opts : WorkloadServerOptions ) {
126129 super ( ) ;
@@ -137,7 +140,7 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
137140 this . snapshotDelayWheel = new TimerWheel < DelayedSnapshot > ( {
138141 delayMs : this . computeManager . snapshotDelayMs ,
139142 onExpire : ( item ) => {
140- this . dispatchComputeSnapshot ( item . data ) . catch ( ( error ) => {
143+ this . snapshotLimit ( ( ) => this . dispatchComputeSnapshot ( item . data ) ) . catch ( ( error ) => {
141144 this . logger . error ( "Compute snapshot dispatch failed" , {
142145 runId : item . data . runFriendlyId ,
143146 runnerId : item . data . runnerId ,
@@ -513,7 +516,7 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
513516 body : {
514517 success : true ,
515518 checkpoint : {
516- type : "KUBERNETES " ,
519+ type : "COMPUTE " ,
517520 location : body . snapshot_id ,
518521 } ,
519522 } ,
@@ -820,7 +823,10 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
820823 }
821824
822825 registerRunTraceContext ( runFriendlyId : string , ctx : RunTraceContext ) {
823- // Evict oldest entries if we've hit the cap
826+ // Evict oldest entries if we've hit the cap. This is best-effort: on a busy
827+ // supervisor, entries for long-lived runs may be evicted before their snapshot
828+ // callback arrives, causing those snapshot spans to be silently dropped.
829+ // That's acceptable - trace spans are observability sugar, not correctness.
824830 if ( this . runTraceContexts . size >= WorkloadServer . MAX_TRACE_CONTEXTS ) {
825831 const firstKey = this . runTraceContexts . keys ( ) . next ( ) . value ;
826832 if ( firstKey ) {
0 commit comments