Skip to content

Commit e729a82

Browse files
committed
feat: add multimodal embeddings (text + image + video) to VoyageAI integration
- New tool: voyageai_multimodal_embeddings using voyage-multimodal-3.5 model
- New API route: /api/tools/voyageai/multimodal-embeddings for server-side file handling
- Supports text, image files/URLs, video files/URLs in a single embedding
- Uses file-upload subBlocks with basic/advanced mode for images and video
- Internal proxy pattern: downloads UserFiles via downloadFileFromStorage, converts to base64
- URL validation via validateUrlWithDNS for SSRF protection
- 14 new unit tests (tool metadata, body, response transform)
- 5 new integration tests (text-only, image URL, text+image, dimensions, auth)
- 8 new block tests (multimodal operation, params, subBlocks)
1 parent 1bf99c0 commit e729a82

File tree

9 files changed

+846
-6
lines changed

9 files changed

+846
-6
lines changed
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
import { createLogger } from '@sim/logger'
2+
import { type NextRequest, NextResponse } from 'next/server'
3+
import { z } from 'zod'
4+
import { checkInternalAuth } from '@/lib/auth/hybrid'
5+
import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
6+
import { generateRequestId } from '@/lib/core/utils/request'
7+
import { RawFileInputArraySchema, RawFileInputSchema } from '@/lib/uploads/utils/file-schemas'
8+
import { processSingleFileToUserFile } from '@/lib/uploads/utils/file-utils'
9+
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
10+
11+
// Next.js route segment config: render this route dynamically on every request.
export const dynamic = 'force-dynamic'

// Logger scoped to this route.
const logger = createLogger('VoyageAIMultimodalAPI')

/**
 * Shape of the JSON body accepted by this route.
 *
 * Only `apiKey` is mandatory. Every other field may be omitted or sent as
 * `null`; `model` falls back to `voyage-multimodal-3.5` when absent.
 */
const MultimodalEmbeddingsSchema = z.object({
  apiKey: z.string().min(1, 'API key is required'),
  // Free-form text to embed.
  input: z.string().nullish(),
  // One raw file reference or an array of them.
  imageFiles: z.union([RawFileInputSchema, RawFileInputArraySchema]).nullish(),
  // Image URLs supplied as a single string.
  imageUrls: z.string().nullish(),
  // A single raw file reference for video.
  videoFile: RawFileInputSchema.nullish(),
  videoUrl: z.string().nullish(),
  model: z.string().optional().default('voyage-multimodal-3.5'),
  inputType: z.enum(['query', 'document']).nullish(),
})
25+
26+
/** Raw file reference type accepted by `processSingleFileToUserFile`. */
type MultimodalRawFile = Parameters<typeof processSingleFileToUserFile>[0]

/**
 * Turns the `imageUrls` request field into a list of URL strings.
 *
 * Accepted encodings:
 * - a JSON array (`["https://a", "https://b"]`); non-string entries are
 *   stringified so that URL validation rejects them instead of the handler
 *   crashing,
 * - a JSON string (`"https://a"`), treated as a single URL,
 * - anything else: split on commas and newlines.
 *
 * Empty entries are dropped in every case.
 */
function parseImageUrls(raw: string): string[] {
  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    parsed = undefined
  }

  if (Array.isArray(parsed)) {
    return parsed
      .map((entry) => (typeof entry === 'string' ? entry.trim() : String(entry)))
      .filter(Boolean)
  }

  if (typeof parsed === 'string') {
    const single = parsed.trim()
    return single ? [single] : []
  }

  // Not JSON, or JSON of an unusable shape (number, object, ...): treat the
  // raw text as a comma/newline separated list.
  return raw
    .split(/[,\n]/)
    .map((u) => u.trim())
    .filter(Boolean)
}

/**
 * Resolves a raw file reference to a `data:` URL.
 *
 * Uses the file's inline base64 payload when present; otherwise downloads the
 * file via `downloadFileFromStorage` and base64-encodes the resulting buffer.
 *
 * @param rawFile - Raw file reference from the request body.
 * @param fallbackMimeType - MIME type used when the file carries no `type`.
 * @param label - Media kind, used only in the log message.
 * @param requestId - Request id used for log correlation.
 * @returns A `data:<mime>;base64,<payload>` string.
 */
async function fileToDataUrl(
  rawFile: MultimodalRawFile,
  fallbackMimeType: string,
  label: 'image' | 'video',
  requestId: string
): Promise<string> {
  const userFile = processSingleFileToUserFile(rawFile, requestId, logger)
  let base64 = userFile.base64
  if (!base64) {
    const buffer = await downloadFileFromStorage(userFile, requestId, logger)
    base64 = buffer.toString('base64')
    logger.info(`[${requestId}] Converted ${label} to base64 (${buffer.length} bytes)`)
  }
  const mimeType = userFile.type || fallbackMimeType
  return `data:${mimeType};base64,${base64}`
}

/**
 * POST handler: builds a single multimodal input (text, images, video) and
 * requests its embedding from the VoyageAI `multimodalembeddings` endpoint.
 *
 * Responses:
 * - 401 when `checkInternalAuth` does not succeed,
 * - 400 for schema violations, unprocessable files, rejected URLs, or when no
 *   content item could be built,
 * - the upstream status code when VoyageAI answers with a non-OK response,
 * - 500 for any other failure,
 * - 200 with `{ success: true, output: { embeddings, model, usage } }` otherwise.
 */
export async function POST(request: NextRequest) {
  const requestId = generateRequestId()

  try {
    const authResult = await checkInternalAuth(request, { requireWorkflowId: false })
    if (!authResult.success) {
      logger.warn(`[${requestId}] Unauthorized multimodal embeddings attempt`)
      return NextResponse.json(
        { success: false, error: authResult.error || 'Authentication required' },
        { status: 401 }
      )
    }

    const body = await request.json()
    const params = MultimodalEmbeddingsSchema.parse(body)

    // Content items are appended in a fixed order: text, image files,
    // image URLs, video file, video URL.
    const content: Array<Record<string, string>> = []

    // Add text content
    if (params.input?.trim()) {
      content.push({ type: 'text', text: params.input })
    }

    // Process image files → base64
    if (params.imageFiles) {
      const files = Array.isArray(params.imageFiles) ? params.imageFiles : [params.imageFiles]
      for (const rawFile of files) {
        try {
          content.push({
            type: 'image_base64',
            image_base64: await fileToDataUrl(rawFile, 'image/jpeg', 'image', requestId),
          })
        } catch (error) {
          logger.error(`[${requestId}] Failed to process image file:`, error)
          return NextResponse.json(
            {
              success: false,
              error: `Failed to process image file: ${error instanceof Error ? error.message : 'Unknown error'}`,
            },
            { status: 400 }
          )
        }
      }
    }

    // Process image URLs
    if (params.imageUrls?.trim()) {
      for (const url of parseImageUrls(params.imageUrls)) {
        const validation = await validateUrlWithDNS(url, 'imageUrl')
        if (!validation.isValid) {
          return NextResponse.json(
            { success: false, error: `Invalid image URL: ${validation.error}` },
            { status: 400 }
          )
        }
        content.push({ type: 'image_url', image_url: url })
      }
    }

    // Process video file → base64
    if (params.videoFile) {
      try {
        content.push({
          type: 'video_base64',
          video_base64: await fileToDataUrl(params.videoFile, 'video/mp4', 'video', requestId),
        })
      } catch (error) {
        logger.error(`[${requestId}] Failed to process video file:`, error)
        return NextResponse.json(
          {
            success: false,
            error: `Failed to process video file: ${error instanceof Error ? error.message : 'Unknown error'}`,
          },
          { status: 400 }
        )
      }
    }

    // Process video URL. The trimmed value is what gets validated and sent,
    // so surrounding whitespace cannot leak into the upstream request.
    const videoUrl = params.videoUrl?.trim()
    if (videoUrl) {
      const validation = await validateUrlWithDNS(videoUrl, 'videoUrl')
      if (!validation.isValid) {
        return NextResponse.json(
          { success: false, error: `Invalid video URL: ${validation.error}` },
          { status: 400 }
        )
      }
      content.push({ type: 'video_url', video_url: videoUrl })
    }

    if (content.length === 0) {
      return NextResponse.json(
        { success: false, error: 'At least one input (text, image, or video) is required' },
        { status: 400 }
      )
    }

    logger.info(`[${requestId}] Calling VoyageAI multimodal embeddings`, {
      contentTypes: content.map((c) => c.type),
      model: params.model,
    })

    // Build VoyageAI request: all content items form one input.
    const voyageBody: Record<string, unknown> = {
      inputs: [{ content }],
      model: params.model,
    }
    if (params.inputType) {
      voyageBody.input_type = params.inputType
    }

    const voyageResponse = await fetch('https://api.voyageai.com/v1/multimodalembeddings', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(voyageBody),
    })

    if (!voyageResponse.ok) {
      const errorText = await voyageResponse.text()
      logger.error(`[${requestId}] VoyageAI API error: ${voyageResponse.status}`, { errorText })
      return NextResponse.json(
        { success: false, error: `VoyageAI API error: ${voyageResponse.status} - ${errorText}` },
        { status: voyageResponse.status }
      )
    }

    const data = await voyageResponse.json()

    // Fail with a clear message (handled by the catch below) rather than an
    // opaque TypeError when the payload has no embeddings array.
    if (!Array.isArray(data?.data)) {
      throw new Error('VoyageAI response did not contain an embeddings array')
    }

    logger.info(`[${requestId}] Multimodal embeddings generated successfully`, {
      embeddingsCount: data.data.length,
      totalTokens: data.usage?.total_tokens,
    })

    return NextResponse.json({
      success: true,
      output: {
        embeddings: data.data.map((item: { embedding: number[] }) => item.embedding),
        model: data.model,
        usage: {
          text_tokens: data.usage?.text_tokens,
          image_pixels: data.usage?.image_pixels,
          video_pixels: data.usage?.video_pixels,
          total_tokens: data.usage?.total_tokens,
        },
      },
    })
  } catch (error) {
    if (error instanceof z.ZodError) {
      logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
      return NextResponse.json(
        { success: false, error: 'Invalid request data', details: error.errors },
        { status: 400 }
      )
    }

    const errorMessage = error instanceof Error ? error.message : 'Unknown error'
    logger.error(`[${requestId}] Multimodal embeddings failed:`, error)
    return NextResponse.json(
      { success: false, error: `Multimodal embeddings failed: ${errorMessage}` },
      { status: 500 }
    )
  }
}

apps/sim/blocks/blocks/voyageai.test.ts

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,12 @@ describe('VoyageAIBlock', () => {
4646
expect(VoyageAIBlock.bgColor).toBe('#1A1A2E')
4747
})
4848

49-
it('should list both tool IDs in access', () => {
50-
expect(VoyageAIBlock.tools.access).toEqual(['voyageai_embeddings', 'voyageai_rerank'])
49+
it('should list all tool IDs in access', () => {
50+
expect(VoyageAIBlock.tools.access).toEqual([
51+
'voyageai_embeddings',
52+
'voyageai_multimodal_embeddings',
53+
'voyageai_rerank',
54+
])
5155
})
5256

5357
it('should have tools.config.tool and tools.config.params functions', () => {
@@ -159,6 +163,31 @@ describe('VoyageAIBlock', () => {
159163
expect(modelIds).toContain('voyage-law-2')
160164
})
161165

166+
it('should have multimodal-specific subBlocks with correct conditions', () => {
167+
const mmBlocks = VoyageAIBlock.subBlocks.filter(
168+
(sb) =>
169+
sb.condition &&
170+
typeof sb.condition === 'object' &&
171+
'value' in sb.condition &&
172+
sb.condition.value === 'multimodal_embeddings'
173+
)
174+
const ids = mmBlocks.map((sb) => sb.id)
175+
expect(ids).toContain('multimodalInput')
176+
expect(ids).toContain('imageFiles')
177+
expect(ids).toContain('imageFilesRef')
178+
expect(ids).toContain('videoFile')
179+
expect(ids).toContain('videoFileRef')
180+
expect(ids).toContain('multimodalModel')
181+
})
182+
183+
it('should have multimodal models in the dropdown', () => {
184+
const modelBlock = VoyageAIBlock.subBlocks.find((sb) => sb.id === 'multimodalModel') as any
185+
expect(modelBlock).toBeDefined()
186+
const modelIds = modelBlock.options.map((o: any) => o.id)
187+
expect(modelIds).toContain('voyage-multimodal-3.5')
188+
expect(modelIds).toContain('voyage-multimodal-3')
189+
})
190+
162191
it('should have all rerank models in the dropdown', () => {
163192
const modelBlock = VoyageAIBlock.subBlocks.find((sb) => sb.id === 'rerankModel') as any
164193
expect(modelBlock).toBeDefined()
@@ -200,6 +229,12 @@ describe('VoyageAIBlock', () => {
200229
expect(toolFunction({ operation: 'embeddings' })).toBe('voyageai_embeddings')
201230
})
202231

232+
it('should return voyageai_multimodal_embeddings for multimodal_embeddings operation', () => {
233+
expect(toolFunction({ operation: 'multimodal_embeddings' })).toBe(
234+
'voyageai_multimodal_embeddings'
235+
)
236+
})
237+
203238
it('should return voyageai_rerank for rerank operation', () => {
204239
expect(toolFunction({ operation: 'rerank' })).toBe('voyageai_rerank')
205240
})
@@ -398,5 +433,75 @@ describe('VoyageAIBlock', () => {
398433
expect(result.embeddingModel).toBeUndefined()
399434
})
400435
})
436+
437+
describe('multimodal_embeddings operation', () => {
438+
it('should pass text input and model', () => {
439+
const result = paramsFunction({
440+
operation: 'multimodal_embeddings',
441+
apiKey: 'va-key',
442+
multimodalInput: 'describe this image',
443+
multimodalModel: 'voyage-multimodal-3.5',
444+
})
445+
expect(result.apiKey).toBe('va-key')
446+
expect(result.input).toBe('describe this image')
447+
expect(result.model).toBe('voyage-multimodal-3.5')
448+
})
449+
450+
it('should pass image URLs', () => {
451+
const result = paramsFunction({
452+
operation: 'multimodal_embeddings',
453+
apiKey: 'va-key',
454+
imageUrls: 'https://example.com/img.jpg',
455+
multimodalModel: 'voyage-multimodal-3.5',
456+
})
457+
expect(result.imageUrls).toBe('https://example.com/img.jpg')
458+
})
459+
460+
it('should pass video URL', () => {
461+
const result = paramsFunction({
462+
operation: 'multimodal_embeddings',
463+
apiKey: 'va-key',
464+
videoUrl: 'https://example.com/video.mp4',
465+
multimodalModel: 'voyage-multimodal-3.5',
466+
})
467+
expect(result.videoUrl).toBe('https://example.com/video.mp4')
468+
})
469+
470+
it('should pass inputType for multimodal', () => {
471+
const result = paramsFunction({
472+
operation: 'multimodal_embeddings',
473+
apiKey: 'va-key',
474+
multimodalInput: 'test',
475+
multimodalModel: 'voyage-multimodal-3.5',
476+
multimodalInputType: 'query',
477+
})
478+
expect(result.inputType).toBe('query')
479+
})
480+
481+
it('should omit empty optional fields', () => {
482+
const result = paramsFunction({
483+
operation: 'multimodal_embeddings',
484+
apiKey: 'va-key',
485+
multimodalModel: 'voyage-multimodal-3.5',
486+
})
487+
expect(result.input).toBeUndefined()
488+
expect(result.imageFiles).toBeUndefined()
489+
expect(result.imageUrls).toBeUndefined()
490+
expect(result.videoFile).toBeUndefined()
491+
expect(result.videoUrl).toBeUndefined()
492+
})
493+
494+
it('should not include text embedding or rerank fields', () => {
495+
const result = paramsFunction({
496+
operation: 'multimodal_embeddings',
497+
apiKey: 'va-key',
498+
multimodalModel: 'voyage-multimodal-3.5',
499+
embeddingModel: 'should not appear',
500+
query: 'should not appear',
501+
})
502+
expect(result.embeddingModel).toBeUndefined()
503+
expect(result.query).toBeUndefined()
504+
})
505+
})
401506
})
402507
})

0 commit comments

Comments
 (0)