@@ -66,6 +66,86 @@ function isBusinessSpan(spanName: string): boolean {
6666 return ALLOWED_SPAN_PREFIXES . some ( ( prefix ) => spanName . startsWith ( prefix ) )
6767}
6868
/**
 * Turn the value of `OTEL_EXPORTER_OTLP_HEADERS` into a header map.
 *
 * Expected shape is `key1=value1,key2=value2`. Handling, entry by entry:
 *   - whitespace around the entry, the key and the value is stripped;
 *   - the FIRST `=` separates key from value, so values may contain `=`;
 *   - entries that are blank, have no `=`, or start with `=` are skipped;
 *   - the value is percent-decoded when possible, otherwise kept verbatim;
 *   - when a key repeats, the last occurrence wins.
 *
 * @param raw - Raw env var contents; an empty string yields an empty map.
 * @returns Header names mapped to their (decoded) values.
 */
function parseOtlpHeadersEnv(raw: string): Record<string, string> {
  const headers: Record<string, string> = {}
  if (!raw) return headers

  // Percent-decode, falling back to the input when it is not valid encoding.
  const decodeOrKeep = (text: string): string => {
    try {
      return decodeURIComponent(text)
    } catch {
      return text
    }
  }

  const entries = raw
    .split(',')
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0)

  entries.forEach((entry) => {
    const separator = entry.indexOf('=')
    // -1 means "no separator"; 0 means "empty key". Neither is usable.
    if (separator < 1) return

    const name = entry.substring(0, separator).trim()
    if (!name) return

    headers[name] = decodeOrKeep(entry.substring(separator + 1).trim())
  })

  return headers
}
98+
/**
 * Normalize an OTLP base URL to the full traces-signal endpoint.
 *
 * The result is meant to be used as the complete POST target for the
 * HTTP trace exporter, so a host-only endpoint needs `/v1/traces`
 * added here.
 *
 * Rules:
 *   - Empty input is returned unchanged.
 *   - Input that does not parse as a URL is returned unchanged; whoever
 *     consumes the value is left to report the problem.
 *   - If the URL already has a non-root path, it is returned unchanged
 *     (operator intent: "post to exactly this URL").
 *   - Otherwise `/v1/traces` becomes the path. When the input carries a
 *     query string or fragment, the path is placed BEFORE them instead
 *     of being glued onto the end of the raw string.
 *
 * @param url - Configured endpoint, with or without a path.
 * @returns The URL to hand to the trace exporter.
 */
function normalizeOtlpTracesUrl(url: string): string {
  if (!url) return url
  try {
    const parsed = new URL(url)
    if (parsed.pathname && parsed.pathname !== '/') return url

    // Common case (no query/fragment): plain string append keeps the
    // operator's spelling of the origin untouched.
    if (!/[?#]/.test(url)) {
      return `${url.replace(/\/$/, '')}/v1/traces`
    }

    // A query or fragment is present: appending to the raw string would
    // put the path inside it (`https://h?x=1/v1/traces`), so rebuild the
    // URL with the path in its proper position.
    parsed.pathname = '/v1/traces'
    return parsed.href
  } catch {
    return url
  }
}
125+
/**
 * Resolve the root trace sampling ratio from the environment.
 *
 * Lookup order:
 *   1. `TELEMETRY_SAMPLING_RATIO`
 *   2. `OTEL_TRACES_SAMPLER_ARG` (only consulted when the first is unset
 *      or empty)
 *   3. the built-in default for the endpoint kind
 *
 * The chosen value must be a complete numeric literal. Anything else —
 * whitespace only, trailing garbage such as `0.5abc` or `10%`, `NaN`,
 * `Infinity` — is treated as invalid and the default is used instead.
 * Valid numbers are clamped into `[0, 1]`.
 *
 * NOTE(review): this is intended to mirror the Go side's
 * `samplerFromEnv()`, which is not visible from this file — confirm the
 * two stay in agreement when either changes.
 *
 * @param isLocalEndpoint - Whether the exporter targets a local backend;
 *   selects which default applies when the env gives no usable value.
 * @returns A sampling ratio between 0 and 1 inclusive.
 */
function resolveSamplingRatio(isLocalEndpoint: boolean): number {
  // Local dev samples everything so manual verification is deterministic.
  // Production currently also samples everything and relies on backend
  // retention to cap storage. The two are named separately so they can
  // diverge without touching the logic below.
  const localDefault = 1.0
  const remoteDefault = 1.0
  const fallback = isLocalEndpoint ? localDefault : remoteDefault

  const raw = (
    process.env.TELEMETRY_SAMPLING_RATIO ||
    process.env.OTEL_TRACES_SAMPLER_ARG ||
    ''
  ).trim()
  if (!raw) return fallback

  // Number() requires the whole string to be numeric, unlike parseFloat,
  // which would silently accept the numeric prefix of a malformed value.
  const parsed = Number(raw)
  if (!Number.isFinite(parsed)) return fallback

  if (parsed <= 0) return 0
  if (parsed >= 1) return 1
  return parsed
}
148+
69149/**
70150 * MothershipOriginSpanProcessor tags every span this process creates with
71151 * `mothership.origin` and prepends a `sim: ` prefix to the span name on
@@ -106,11 +186,16 @@ async function initializeOpenTelemetry() {
106186 telemetryConfig = DEFAULT_TELEMETRY_CONFIG
107187 }
108188
109- // Prefer process.env directly: @t 3-oss/env-nextjs sometimes returns
110- // undefined for server vars that aren't listed in experimental__runtimeEnv,
111- // and TELEMETRY_ENDPOINT isn't mapped there.
189+ // Endpoint resolution: prefer the OTel spec env var, fall back to
190+ // our legacy TELEMETRY_ENDPOINT so existing deploys keep working
191+ // during rollout. Read process.env directly because
192+ // @t 3-oss/env-nextjs sometimes returns undefined for server vars
193+ // that aren't listed in experimental__runtimeEnv.
112194 const resolvedEndpoint =
113- process . env . TELEMETRY_ENDPOINT || env . TELEMETRY_ENDPOINT || telemetryConfig . endpoint
195+ process . env . OTEL_EXPORTER_OTLP_ENDPOINT ||
196+ process . env . TELEMETRY_ENDPOINT ||
197+ env . TELEMETRY_ENDPOINT ||
198+ telemetryConfig . endpoint
114199 telemetryConfig = {
115200 ...telemetryConfig ,
116201 endpoint : resolvedEndpoint ,
@@ -187,9 +272,24 @@ async function initializeOpenTelemetry() {
187272 } ,
188273 } )
189274
275+ // Parse OTEL_EXPORTER_OTLP_HEADERS per the OTel spec: comma-
276+ // separated `key=value` pairs, values optionally URL-encoded. This
277+ // is how managed backends (Honeycomb, Grafana Cloud, New Relic)
278+ // receive their API keys without needing a vendor-specific code
279+ // path — flip the secret, redeploy, traces land in the new place.
280+ const otlpHeaders = parseOtlpHeadersEnv ( process . env . OTEL_EXPORTER_OTLP_HEADERS || '' )
281+
282+ // The @opentelemetry /exporter-trace-otlp-http exporter treats the
283+ // `url` option as the complete POST target and does NOT append the
284+ // `/v1/traces` signal path. The Go SDK, by contrast, does append
285+ // it when only a host is given. Normalize here so operators can
286+ // set the same `OTEL_EXPORTER_OTLP_ENDPOINT=https://api.honeycomb.io`
287+ // for both services and have it Just Work.
288+ const exporterUrl = normalizeOtlpTracesUrl ( telemetryConfig . endpoint )
289+
190290 const exporter = new OTLPTraceExporter ( {
191- url : telemetryConfig . endpoint ,
192- headers : { } ,
291+ url : exporterUrl ,
292+ headers : otlpHeaders ,
193293 timeoutMillis : Math . min ( telemetryConfig . batchSettings . exportTimeoutMillis , 10000 ) ,
194294 keepAlive : false ,
195295 } )
@@ -244,14 +344,27 @@ async function initializeOpenTelemetry() {
244344 } )
245345 )
246346
247- // Dev / self-hosted OTLP backends (Jaeger/Tempo on localhost) should
248- // capture every trace so manual verification is deterministic. Keep 10%
249- // for production cloud endpoints.
347+ // Sampling ratio resolution, in priority order:
348+ // 1. `TELEMETRY_SAMPLING_RATIO` (our explicit, matches Go side)
349+ // 2. `OTEL_TRACES_SAMPLER_ARG` (OTel spec env var)
350+ // 3. 1.0 for local endpoints (so dev traces are deterministic)
351+ // 4. 1.0 otherwise (production wants every mothership request —
352+ // retention happens at the backend)
353+ //
354+ // `1.0` is the right default for mothership: every request is
355+ // support-critical and we rely on the backend's retention (1 day
356+ // in prod) to cap storage, not upstream sampling.
250357 const isLocalEndpoint = / l o c a l h o s t | 1 2 7 \. 0 \. 0 \. 1 / i. test ( telemetryConfig . endpoint )
251- const samplingRatio = isLocalEndpoint ? 1.0 : 0.1
358+ const samplingRatio = resolveSamplingRatio ( isLocalEndpoint )
252359 const rootRatioSampler = new TraceIdRatioBasedSampler ( samplingRatio )
253360 const sampler = createBusinessSpanSampler ( rootRatioSampler )
254361
362+ logger . info ( 'OpenTelemetry sampler configured' , {
363+ samplingRatio,
364+ endpoint : telemetryConfig . endpoint ,
365+ origin : MOTHERSHIP_ORIGIN ,
366+ } )
367+
255368 // Order matters: the origin-prefix processor must run BEFORE the batch
256369 // processor so the renamed span and the mothership.origin attribute are
257370 // captured on export.
0 commit comments