Skip to content

Commit a54470e

Browse files
committed
update otel
1 parent e5d0a49 commit a54470e

File tree

3 files changed

+230
-19
lines changed

3 files changed

+230
-19
lines changed

apps/sim/instrumentation-node.ts

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,21 @@
77
* OTel `service.name = "mothership"` so every request shows up as one
88
* service in the OTLP backend. To keep the two halves distinguishable:
99
*
10-
* - Every span emitted by this process is prefixed with `sim: ` on
11-
* start, and gets a `mothership.origin = "sim"` attribute.
12-
* - The Go side does the same with `go: ` / `mothership.origin = "go"`.
10+
* - Every span emitted by the mothership lifecycle on this process is
11+
* prefixed with `sim-mothership: ` on start, and gets a
12+
* `mothership.origin = "sim-mothership"` attribute.
13+
* - The Go side does the same with `go-mothership: ` /
14+
* `mothership.origin = "go-mothership"`.
1315
*
14-
* So in Jaeger/Tempo, filtering by `mothership.origin` (exact) or by
15-
* operation name prefix (`sim:` / `go:`) cleanly splits the two halves.
16+
* The `-mothership` suffix on the origin is deliberate: this Sim process
17+
* hosts plenty of non-mothership code (workflow executor, block runtime,
18+
* indexer clients) that may emit its own traces in the future. Making
19+
* the origin value explicit means a later "sim" origin can't collide
20+
* with the mothership side.
21+
*
22+
* So in any OTLP backend, filter by `mothership.origin` (exact) or by
23+
* operation name prefix (`sim-mothership:` / `go-mothership:`) to
24+
* cleanly split the two halves.
1625
*/
1726

1827
import type { Attributes, Context, Link, SpanKind } from '@opentelemetry/api'
@@ -31,9 +40,18 @@ diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.ERROR)
3140

3241
const logger = createLogger('OTelInstrumentation')
3342

34-
const MOTHERSHIP_ORIGIN = 'sim' as const
43+
// Origin value lives on every mothership span as `mothership.origin`.
44+
// Longer form intentionally used (vs. plain "sim") so non-mothership
45+
// code running in this same Sim process can't collide if it later
46+
// starts emitting its own traces.
47+
const MOTHERSHIP_ORIGIN = 'sim-mothership' as const
3548
const SPAN_NAME_PREFIX = `${MOTHERSHIP_ORIGIN}: `
3649

50+
// Short slug used only for `service.instance.id`. Kept as plain "sim"
51+
// so the instance id reads as `mothership-sim` — concise, already
52+
// scoped by `service.name = "mothership"` as the container.
53+
const SERVICE_INSTANCE_SLUG = 'sim' as const
54+
3755
const DEFAULT_TELEMETRY_CONFIG = {
3856
endpoint: env.TELEMETRY_ENDPOINT || 'https://telemetry.simstudio.ai/v1/traces',
3957
// Joint Sim+Go service surface in Jaeger/Tempo. See header comment.
@@ -147,18 +165,29 @@ function resolveSamplingRatio(isLocalEndpoint: boolean): number {
147165
}
148166

149167
/**
150-
* MothershipOriginSpanProcessor tags every span this process creates with
151-
* `mothership.origin` and prepends a `sim: ` prefix to the span name on
152-
* start, before any downstream processor (BatchSpanProcessor) reads it.
168+
* MothershipOriginSpanProcessor tags mothership-lifecycle spans with
169+
* `mothership.origin` and prepends the origin prefix to the span name
170+
* on start, before any downstream processor (BatchSpanProcessor)
171+
* reads it.
153172
*
154-
* Implemented as its own processor rather than a resource attribute so
155-
* the backend span/operation list (which keys on span name) is visually
156-
* split between sim and go even when both share service.name.
173+
* Gated on `isBusinessSpan(name)` so only spans that already match
174+
* the mothership allowlist get the label. The sampler drops
175+
* non-mothership roots anyway, but keeping the tagger conditional
176+
* means that if the sampler is ever relaxed (or a different
177+
* instrumentation stream is added alongside mothership), unrelated
178+
* spans won't accidentally inherit the mothership origin.
179+
*
180+
* Implemented as its own processor rather than a resource attribute
181+
* so the backend span/operation list (which keys on span name) is
182+
* visually split between sim and go even when both share service.name.
157183
*/
158184
class MothershipOriginSpanProcessor implements SpanProcessor {
159185
onStart(span: Span): void {
160-
span.setAttribute('mothership.origin', MOTHERSHIP_ORIGIN)
161186
const name = span.name
187+
if (!isBusinessSpan(name)) {
188+
return
189+
}
190+
span.setAttribute('mothership.origin', MOTHERSHIP_ORIGIN)
162191
if (!name.startsWith(SPAN_NAME_PREFIX)) {
163192
span.updateName(`${SPAN_NAME_PREFIX}${name}`)
164193
}
@@ -326,10 +355,12 @@ async function initializeOpenTelemetry() {
326355
// multi-second cross-machine clock drift within one group, and its
327356
// adjuster emits spurious "parent is not in the trace; skipping
328357
// clock skew adjustment" warnings on every cross-process child.
329-
// Stable per-origin instance ID (`mothership-sim` / `mothership-go`)
330-
// is enough to split the groups cleanly; Jaeger still shows both
331-
// under the single `mothership` service in its service picker.
332-
const serviceInstanceId = `${telemetryConfig.serviceName}-${MOTHERSHIP_ORIGIN}`
358+
// Using the short slug (`sim` / `go`) keeps the instance id as
359+
// `mothership-sim` / `mothership-go` — already scoped by
360+
// `service.name = "mothership"` as the container. The longer
361+
// `mothership.origin = "sim-mothership"` value does the
362+
// disambiguation at the attribute level.
363+
const serviceInstanceId = `${telemetryConfig.serviceName}-${SERVICE_INSTANCE_SLUG}`
333364
const resource = defaultResource().merge(
334365
resourceFromAttributes({
335366
[ATTR_SERVICE_NAME]: telemetryConfig.serviceName,

apps/sim/lib/copilot/chat/post.ts

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,34 @@ function buildOnComplete(params: {
374374
requestId: string
375375
workspaceId?: string
376376
notifyWorkspaceStatus: boolean
377+
/**
378+
* Root agent span for this request. When present, the final
379+
* assistant message + invoked tool calls are recorded as
380+
* `gen_ai.output.messages` on it before persistence runs. Keeps
381+
* the Honeycomb Gen AI view complete across both the Sim root
382+
* span and the Go-side `llm.stream` spans.
383+
*/
384+
otelRoot?: {
385+
setOutputMessages: (output: {
386+
assistantText?: string
387+
toolCalls?: Array<{ id: string; name: string; arguments?: Record<string, unknown> }>
388+
}) => void
389+
}
377390
}) {
378-
const { chatId, userMessageId, requestId, workspaceId, notifyWorkspaceStatus } = params
391+
const { chatId, userMessageId, requestId, workspaceId, notifyWorkspaceStatus, otelRoot } = params
379392

380393
return async (result: OrchestratorResult) => {
394+
if (otelRoot && result.success) {
395+
otelRoot.setOutputMessages({
396+
assistantText: result.content,
397+
toolCalls: result.toolCalls?.map((tc) => ({
398+
id: tc.id,
399+
name: tc.name,
400+
arguments: tc.params,
401+
})),
402+
})
403+
}
404+
381405
if (!chatId) return
382406

383407
try {
@@ -601,6 +625,11 @@ export async function handleUnifiedChatPost(req: NextRequest) {
601625
runId,
602626
transport: 'stream',
603627
})
628+
// Emit `gen_ai.input.messages` on the root agent span for OTel
629+
// GenAI spec compliance (Honeycomb's Gen AI view keys off this).
630+
// Gated on OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT
631+
// internally — safe to always call.
632+
otelRoot.setInputMessages({ userMessage: body.message })
604633

605634
// Wrap the rest of the handler so every nested withCopilotSpan /
606635
// withDbSpan (persistUserMessage, createRunSegment, resolveBranch DB
@@ -799,6 +828,7 @@ export async function handleUnifiedChatPost(req: NextRequest) {
799828
requestId: tracker.requestId,
800829
workspaceId,
801830
notifyWorkspaceStatus: branch.notifyWorkspaceStatus,
831+
otelRoot,
802832
}),
803833
onError: buildOnError({
804834
chatId: actualChatId,

apps/sim/lib/copilot/request/otel.ts

Lines changed: 151 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,136 @@ import type { RequestTraceV1Outcome } from '@/lib/copilot/generated/request-trac
1414
import { TraceSpan } from '@/lib/copilot/generated/trace-spans-v1'
1515
import { contextFromRequestHeaders } from '@/lib/copilot/request/go/propagation'
1616

17+
/**
18+
* OTel GenAI experimental semantic conventions env var. When set to a
19+
* truthy value, each `gen_ai.*` span carries the full input and
20+
* output conversation content as attributes. Mirrors the Go-side
21+
* gate in `copilot/internal/providers/telemetry.go` so operators
22+
* control both halves with one variable.
23+
*
24+
* Spec: https://opentelemetry.io/docs/specs/semconv/gen-ai/
25+
*/
26+
const GENAI_CAPTURE_ENV = 'OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT'
27+
28+
/**
29+
* Attribute-size cap for `gen_ai.{input,output}.messages`. Most OTLP
30+
* backends reject attributes larger than ~64 KiB, so we truncate
31+
* proactively to keep the rest of the span alive if a conversation
32+
* runs long. Matches the Go-side cap to keep truncation behavior
33+
* symmetrical between the two halves.
34+
*/
35+
const GENAI_MESSAGE_ATTR_MAX_BYTES = 60 * 1024
36+
37+
function isGenAIMessageCaptureEnabled(): boolean {
38+
const raw = (process.env[GENAI_CAPTURE_ENV] || '').toLowerCase().trim()
39+
return raw === 'true' || raw === '1' || raw === 'yes'
40+
}
41+
42+
/**
43+
* Canonical OTel GenAI message shape used for both input and output
44+
* attributes. Kept minimal — only the three part types we actually
45+
* emit: `text`, `tool_call`, and `tool_call_response`. Adding more
46+
* part types is cheap, but every additional shape here has to be
47+
* mirrored in the Go serializer.
48+
*/
49+
interface GenAIAgentPart {
50+
type: 'text' | 'tool_call' | 'tool_call_response'
51+
content?: string
52+
id?: string
53+
name?: string
54+
arguments?: Record<string, unknown>
55+
response?: string
56+
}
57+
58+
interface GenAIAgentMessage {
59+
role: 'system' | 'user' | 'assistant' | 'tool'
60+
parts: GenAIAgentPart[]
61+
}
62+
63+
function marshalAgentMessages(messages: GenAIAgentMessage[]): string | undefined {
64+
if (messages.length === 0) return undefined
65+
const json = JSON.stringify(messages)
66+
if (json.length <= GENAI_MESSAGE_ATTR_MAX_BYTES) return json
67+
// Simple tail-preserving truncation: drop from the front until we
68+
// fit. Matches the Go side's behavior. The last message is
69+
// usually the most diagnostic for span-level outcome.
70+
let remaining = messages.slice()
71+
while (remaining.length > 1) {
72+
remaining = remaining.slice(1)
73+
const candidate = JSON.stringify(remaining)
74+
if (candidate.length <= GENAI_MESSAGE_ATTR_MAX_BYTES) return candidate
75+
}
76+
// Single message still over cap — truncate the text part in place
77+
// with a marker so the partial content is still readable.
78+
const only = remaining[0]
79+
for (const part of only.parts) {
80+
if (part.type === 'text' && part.content) {
81+
const headroom = GENAI_MESSAGE_ATTR_MAX_BYTES - 1024
82+
if (part.content.length > headroom) {
83+
part.content = `${part.content.slice(0, headroom)}\n\n[truncated: capture cap ${GENAI_MESSAGE_ATTR_MAX_BYTES} bytes]`
84+
}
85+
}
86+
}
87+
const final = JSON.stringify([only])
88+
return final.length <= GENAI_MESSAGE_ATTR_MAX_BYTES ? final : undefined
89+
}
90+
91+
export interface CopilotAgentInputMessages {
92+
userMessage?: string
93+
systemPrompt?: string
94+
}
95+
96+
export interface CopilotAgentOutputMessages {
97+
assistantText?: string
98+
toolCalls?: Array<{
99+
id: string
100+
name: string
101+
arguments?: Record<string, unknown>
102+
}>
103+
}
104+
105+
function setAgentInputMessages(span: Span, input: CopilotAgentInputMessages): void {
106+
if (!isGenAIMessageCaptureEnabled()) return
107+
const messages: GenAIAgentMessage[] = []
108+
if (input.systemPrompt) {
109+
messages.push({
110+
role: 'system',
111+
parts: [{ type: 'text', content: input.systemPrompt }],
112+
})
113+
}
114+
if (input.userMessage) {
115+
messages.push({
116+
role: 'user',
117+
parts: [{ type: 'text', content: input.userMessage }],
118+
})
119+
}
120+
const serialized = marshalAgentMessages(messages)
121+
if (serialized) {
122+
span.setAttribute('gen_ai.input.messages', serialized)
123+
}
124+
}
125+
126+
function setAgentOutputMessages(span: Span, output: CopilotAgentOutputMessages): void {
127+
if (!isGenAIMessageCaptureEnabled()) return
128+
const parts: GenAIAgentPart[] = []
129+
if (output.assistantText) {
130+
parts.push({ type: 'text', content: output.assistantText })
131+
}
132+
for (const tc of output.toolCalls ?? []) {
133+
parts.push({
134+
type: 'tool_call',
135+
id: tc.id,
136+
name: tc.name,
137+
...(tc.arguments ? { arguments: tc.arguments } : {}),
138+
})
139+
}
140+
if (parts.length === 0) return
141+
const serialized = marshalAgentMessages([{ role: 'assistant', parts }])
142+
if (serialized) {
143+
span.setAttribute('gen_ai.output.messages', serialized)
144+
}
145+
}
146+
17147
/**
18148
* Reuse the generated RequestTraceV1Outcome string values for every
19149
* lifecycle outcome field. This keeps our OTel attributes, internal
@@ -262,6 +392,20 @@ export interface CopilotOtelRoot {
262392
span: Span
263393
context: Context
264394
finish: (outcome?: CopilotLifecycleOutcome, error?: unknown) => void
395+
/**
396+
* Record `gen_ai.input.messages` on the root agent span. Gated on
397+
* `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` — no-op when
398+
* capture is disabled. Safe to call multiple times; the latest
399+
* call wins.
400+
*/
401+
setInputMessages: (input: CopilotAgentInputMessages) => void
402+
/**
403+
* Record `gen_ai.output.messages` on the root agent span. Gated on
404+
* the same env var as `setInputMessages`. Typically called from the
405+
* stream finalize callback once the assistant's final content and
406+
* invoked tool calls are known.
407+
*/
408+
setOutputMessages: (output: CopilotAgentOutputMessages) => void
265409
}
266410

267411
export function startCopilotOtelRoot(scope: CopilotOtelScope): CopilotOtelRoot {
@@ -300,7 +444,13 @@ export function startCopilotOtelRoot(scope: CopilotOtelScope): CopilotOtelRoot {
300444
span.end()
301445
}
302446

303-
return { span, context: rootContext, finish }
447+
return {
448+
span,
449+
context: rootContext,
450+
finish,
451+
setInputMessages: (input) => setAgentInputMessages(span, input),
452+
setOutputMessages: (output) => setAgentOutputMessages(span, output),
453+
}
304454
}
305455

306456
export async function withCopilotOtelContext<T>(

0 commit comments

Comments (0)