Skip to content

Commit e5d0a49

Browse files
committed
Otel
1 parent 62a351a commit e5d0a49

File tree

1 file changed

+123
-10
lines changed

1 file changed

+123
-10
lines changed

apps/sim/instrumentation-node.ts

Lines changed: 123 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,86 @@ function isBusinessSpan(spanName: string): boolean {
6666
return ALLOWED_SPAN_PREFIXES.some((prefix) => spanName.startsWith(prefix))
6767
}
6868

69+
/**
 * Turn the contents of `OTEL_EXPORTER_OTLP_HEADERS` into a header map.
 *
 * The input is a comma-separated list of `key=value` entries. Each entry
 * is split on its FIRST `=`, so a value may itself contain `=` (e.g.
 * base64 padding). Whitespace around entries, keys and values is
 * dropped. Values are URL-decoded when they decode cleanly and kept
 * verbatim otherwise. Entries with no `=`, or with nothing before it,
 * are ignored. When a key repeats, the last occurrence wins.
 *
 * @param raw - Raw env var contents; an empty string yields an empty map.
 * @returns Header names mapped to their (decoded) values.
 */
function parseOtlpHeadersEnv(raw: string): Record<string, string> {
  const headers: Record<string, string> = {}
  if (!raw) return headers

  // A malformed percent-sequence (e.g. a bare `%`) makes
  // decodeURIComponent throw; treat such text as not URL-encoded.
  const decodeOrKeep = (text: string): string => {
    try {
      return decodeURIComponent(text)
    } catch {
      return text
    }
  }

  raw
    .split(',')
    .map((entry) => entry.trim())
    .forEach((entry) => {
      const sep = entry.indexOf('=')
      // sep === -1: blank entry or no `=` at all; sep === 0: empty key.
      if (sep <= 0) return
      const name = entry.slice(0, sep).trim()
      if (!name) return
      headers[name] = decodeOrKeep(entry.slice(sep + 1).trim())
    })

  return headers
}
98+
99+
/**
 * Normalize an OTLP base URL to the full traces-signal endpoint.
 *
 * The caller hands the result to the OTLP HTTP trace exporter as its
 * `url` option, which is used as the POST target as given, so a bare
 * host must have the `/v1/traces` signal path added here.
 *
 * Rules:
 *   - Empty input is returned unchanged.
 *   - If the URL already has a non-root path, it is returned unchanged
 *     (operator intent: "post to exactly this URL").
 *   - Otherwise the path is set to `/v1/traces`. The path is set on the
 *     parsed URL rather than appended to the raw string, so a query
 *     string or fragment on a root-path URL stays after the path
 *     instead of ending up in front of it. The returned string is the
 *     parser's serialization, so equivalent spellings may be
 *     canonicalized (e.g. a default port is omitted).
 *   - Malformed URLs pass through unchanged; the exporter is left to
 *     surface the error.
 *
 * @param url - Endpoint as configured by the operator.
 * @returns The URL to use for trace export.
 */
function normalizeOtlpTracesUrl(url: string): string {
  if (!url) return url

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return url
  }

  // Any explicit path is taken as the complete, intended POST target.
  if (parsed.pathname && parsed.pathname !== '/') return url

  parsed.pathname = '/v1/traces'
  return parsed.toString()
}
125+
126+
/**
 * Resolve the trace sampling ratio from the environment.
 *
 * Reads `TELEMETRY_SAMPLING_RATIO` first and falls back to
 * `OTEL_TRACES_SAMPLER_ARG` when the former is unset or empty. The
 * chosen value is trimmed and must be numeric in its entirety:
 * strings with trailing junk such as `"0.5abc"` or `"50%"` are
 * rejected rather than partially parsed, so a typo cannot silently
 * become a different ratio. Valid values are clamped to `[0, 1]`.
 *
 * NOTE(review): this is meant to mirror the Go side's
 * `samplerFromEnv()`, which is not visible from here — confirm the
 * two stay in agreement.
 *
 * @param isLocalEndpoint - Whether the telemetry endpoint is local.
 *   Selects which default applies when the env gives no usable value.
 * @returns A ratio in `[0, 1]`.
 */
function resolveSamplingRatio(isLocalEndpoint: boolean): number {
  // Both defaults are 100% today: local dev wants deterministic manual
  // verification, and production relies on backend retention rather
  // than sampling to cap storage cost. They are named separately so
  // either can be tuned without touching the other.
  const localDefault = 1.0
  const remoteDefault = 1.0

  const raw = (
    process.env.TELEMETRY_SAMPLING_RATIO ||
    process.env.OTEL_TRACES_SAMPLER_ARG ||
    ''
  ).trim()

  // The emptiness guard matters: Number('') is 0, which would turn a
  // whitespace-only value into "sample nothing".
  if (raw) {
    const parsed = Number(raw)
    if (Number.isFinite(parsed)) {
      if (parsed <= 0) return 0
      if (parsed >= 1) return 1
      return parsed
    }
  }

  return isLocalEndpoint ? localDefault : remoteDefault
}
148+
69149
/**
70150
* MothershipOriginSpanProcessor tags every span this process creates with
71151
* `mothership.origin` and prepends a `sim: ` prefix to the span name on
@@ -106,11 +186,16 @@ async function initializeOpenTelemetry() {
106186
telemetryConfig = DEFAULT_TELEMETRY_CONFIG
107187
}
108188

109-
// Prefer process.env directly: @t3-oss/env-nextjs sometimes returns
110-
// undefined for server vars that aren't listed in experimental__runtimeEnv,
111-
// and TELEMETRY_ENDPOINT isn't mapped there.
189+
// Endpoint resolution: prefer the OTel spec env var, fall back to
190+
// our legacy TELEMETRY_ENDPOINT so existing deploys keep working
191+
// during rollout. Read process.env directly because
192+
// @t3-oss/env-nextjs sometimes returns undefined for server vars
193+
// that aren't listed in experimental__runtimeEnv.
112194
const resolvedEndpoint =
113-
process.env.TELEMETRY_ENDPOINT || env.TELEMETRY_ENDPOINT || telemetryConfig.endpoint
195+
process.env.OTEL_EXPORTER_OTLP_ENDPOINT ||
196+
process.env.TELEMETRY_ENDPOINT ||
197+
env.TELEMETRY_ENDPOINT ||
198+
telemetryConfig.endpoint
114199
telemetryConfig = {
115200
...telemetryConfig,
116201
endpoint: resolvedEndpoint,
@@ -187,9 +272,24 @@ async function initializeOpenTelemetry() {
187272
},
188273
})
189274

275+
// Parse OTEL_EXPORTER_OTLP_HEADERS per the OTel spec: comma-
276+
// separated `key=value` pairs, values optionally URL-encoded. This
277+
// is how managed backends (Honeycomb, Grafana Cloud, New Relic)
278+
// receive their API keys without needing a vendor-specific code
279+
// path — flip the secret, redeploy, traces land in the new place.
280+
const otlpHeaders = parseOtlpHeadersEnv(process.env.OTEL_EXPORTER_OTLP_HEADERS || '')
281+
282+
// The @opentelemetry/exporter-trace-otlp-http exporter treats the
283+
// `url` option as the complete POST target and does NOT append the
284+
// `/v1/traces` signal path. The Go SDK, by contrast, does append
285+
// it when only a host is given. Normalize here so operators can
286+
// set the same `OTEL_EXPORTER_OTLP_ENDPOINT=https://api.honeycomb.io`
287+
// for both services and have it Just Work.
288+
const exporterUrl = normalizeOtlpTracesUrl(telemetryConfig.endpoint)
289+
190290
const exporter = new OTLPTraceExporter({
191-
url: telemetryConfig.endpoint,
192-
headers: {},
291+
url: exporterUrl,
292+
headers: otlpHeaders,
193293
timeoutMillis: Math.min(telemetryConfig.batchSettings.exportTimeoutMillis, 10000),
194294
keepAlive: false,
195295
})
@@ -244,14 +344,27 @@ async function initializeOpenTelemetry() {
244344
})
245345
)
246346

247-
// Dev / self-hosted OTLP backends (Jaeger/Tempo on localhost) should
248-
// capture every trace so manual verification is deterministic. Keep 10%
249-
// for production cloud endpoints.
347+
// Sampling ratio resolution, in priority order:
348+
// 1. `TELEMETRY_SAMPLING_RATIO` (our explicit, matches Go side)
349+
// 2. `OTEL_TRACES_SAMPLER_ARG` (OTel spec env var)
350+
// 3. 1.0 for local endpoints (so dev traces are deterministic)
351+
// 4. 1.0 otherwise (production wants every mothership request —
352+
// retention happens at the backend)
353+
//
354+
// `1.0` is the right default for mothership: every request is
355+
// support-critical and we rely on the backend's retention (1 day
356+
// in prod) to cap storage, not upstream sampling.
250357
const isLocalEndpoint = /localhost|127\.0\.0\.1/i.test(telemetryConfig.endpoint)
251-
const samplingRatio = isLocalEndpoint ? 1.0 : 0.1
358+
const samplingRatio = resolveSamplingRatio(isLocalEndpoint)
252359
const rootRatioSampler = new TraceIdRatioBasedSampler(samplingRatio)
253360
const sampler = createBusinessSpanSampler(rootRatioSampler)
254361

362+
logger.info('OpenTelemetry sampler configured', {
363+
samplingRatio,
364+
endpoint: telemetryConfig.endpoint,
365+
origin: MOTHERSHIP_ORIGIN,
366+
})
367+
255368
// Order matters: the origin-prefix processor must run BEFORE the batch
256369
// processor so the renamed span and the mothership.origin attribute are
257370
// captured on export.

0 commit comments

Comments
 (0)