Merge pull request #448 from cloudflare/vision-inputs

threepointone · web-flow · commit 5f603ffdc296 · 2026-03-19T12:54:13.000Z
Support image inputs for vision chat models
diff --git a/.changeset/fix-image-inputs.md b/.changeset/fix-image-inputs.md
@@ -0,0 +1,10 @@
+---
+"workers-ai-provider": patch
+"@cloudflare/tanstack-ai": patch
+---
+
+Fix image inputs for vision-capable chat models
+
+- Handle `Uint8Array`, base64 string, and base64 data URL image data (previously only `Uint8Array` was handled); `URL` objects are rejected with an explicit error
+- Send images as OpenAI-compatible `image_url` content parts inline in messages, enabling vision for models like Llama 4 Scout and Kimi K2.5
+- Works with both the binding and REST API paths
diff --git a/packages/tanstack-ai/README.md b/packages/tanstack-ai/README.md
@@ -68,6 +68,39 @@ const adapter = createWorkersAiChat("@cf/meta/llama-4-scout-17b-16e-instruct", {
 });
 ```
 
+### Vision (Image Inputs)
+
+Send images to vision-capable chat models:
+
+```typescript
+const adapter = createWorkersAiChat("@cf/meta/llama-4-scout-17b-16e-instruct", {
+	accountId: "your-account-id",
+	apiKey: "your-api-key",
+});
+
+const response = chat({
+	adapter,
+	stream: true,
+	messages: [
+		{
+			role: "user",
+			content: [
+				{ type: "text", content: "What's in this image?" },
+				{ type: "image", source: { type: "data", value: base64String, mimeType: "image/png" } },
+			],
+		},
+	],
+});
+```
+
+URL sources are also supported:
+
+```typescript
+{ type: "image", source: { type: "url", value: "https://example.com/photo.jpg" } }
+```
+
+Works with all configuration modes (binding, REST, and AI Gateway).
+
 ### Image Generation
 
 ```typescript
diff --git a/packages/tanstack-ai/src/utils/create-fetcher.ts b/packages/tanstack-ai/src/utils/create-fetcher.ts
@@ -274,7 +274,10 @@ export function createGatewayFetch(
  * Normalize messages before passing to Workers AI binding.
  *
  * The binding has strict schema validation that may differ from the OpenAI API:
- * - `content` must be a string (not null)
+ * - `content` must not be null
+ *
+ * Content arrays (with image_url parts) are passed through as-is since the
+ * Workers AI binding accepts them at runtime for vision-capable models.
  */
 function normalizeMessagesForBinding(
 	messages: Record<string, unknown>[],
diff --git a/packages/tanstack-ai/test/binding-fetch.test.ts b/packages/tanstack-ai/test/binding-fetch.test.ts
@@ -598,4 +598,34 @@ describe("createWorkersAiBindingFetch", () => {
 			json_schema: { name: "test", schema: {} },
 		});
 	});
+
+	it("should pass content arrays through to binding for vision models", async () => {
+		const binding = mockBinding(vi.fn().mockResolvedValue({ response: "A red square" }));
+
+		const fetcher = createWorkersAiBindingFetch(binding);
+
+		const base64 = btoa("fake-png-bytes");
+
+		await fetcher("https://api.openai.com/v1/chat/completions", {
+			method: "POST",
+			body: JSON.stringify({
+				model: "@cf/meta/llama-4-scout-17b-16e-instruct",
+				messages: [
+					{
+						role: "user",
+						content: [
+							{ type: "text", text: "Describe this" },
+							{ type: "image_url", image_url: { url: `data:image/png;base64,${base64}` } },
+						],
+					},
+				],
+			}),
+		});
+
+		const [, inputs] = binding.run.mock.calls[0]!;
+		expect(inputs.messages[0].content).toEqual([
+			{ type: "text", text: "Describe this" },
+			{ type: "image_url", image_url: { url: `data:image/png;base64,${base64}` } },
+		]);
+	});
 });
diff --git a/packages/tanstack-ai/test/message-builder.test.ts b/packages/tanstack-ai/test/message-builder.test.ts
@@ -145,7 +145,6 @@ describe("message building (via chatStream)", () => {
 			binding,
 		);
 
-		// Should preserve image parts as OpenAI multi-modal content array
 		expect(messages[0].content).toEqual([
 			{ type: "text", text: "Part 1" },
 			{ type: "image_url", image_url: { url: "https://example.com/img.png" } },
diff --git a/packages/workers-ai-provider/README.md b/packages/workers-ai-provider/README.md
@@ -111,6 +111,29 @@ for await (const chunk of result.textStream) {
 }
 ```
 
+## Vision (Image Inputs)
+
+Send images to vision-capable models like Llama 4 Scout and Kimi K2.5:
+
+```ts
+import { generateText } from "ai";
+
+const { text } = await generateText({
+	model: workersai("@cf/meta/llama-4-scout-17b-16e-instruct"),
+	messages: [
+		{
+			role: "user",
+			content: [
+				{ type: "text", text: "What's in this image?" },
+				{ type: "image", image: imageUint8Array },
+			],
+		},
+	],
+});
+```
+
+Images can be provided as `Uint8Array`, base64 strings, or base64-encoded data URLs (e.g. `data:image/png;base64,...`). Multiple images per message are supported. Works with both the binding and REST API configurations.
+
 ## Tool Calling
 
 ```ts
diff --git a/packages/workers-ai-provider/src/convert-to-workersai-chat-messages.ts b/packages/workers-ai-provider/src/convert-to-workersai-chat-messages.ts
@@ -1,20 +1,62 @@
-import type { LanguageModelV3Prompt, SharedV3ProviderOptions } from "@ai-sdk/provider";
-import type { WorkersAIChatPrompt } from "./workersai-chat-prompt";
+import type {
+	LanguageModelV3DataContent,
+	LanguageModelV3Prompt,
+} from "@ai-sdk/provider";
+import type { WorkersAIContentPart, WorkersAIChatPrompt } from "./workersai-chat-prompt";
+
+/**
+ * Convert a LanguageModelV3DataContent value to a Uint8Array, or return null for an unrecognised value.
+ *
+ * Handles:
+ *   - Uint8Array  → returned as-is
+ *   - string      → decoded from base64 (a leading `data:...,` prefix is stripped first)
+ *   - URL         → throws an Error (image bytes must be supplied inline, not by reference)
+ */
+function toUint8Array(data: LanguageModelV3DataContent): Uint8Array | null {
+	if (data instanceof Uint8Array) {
+		return data;
+	}
+
+	if (typeof data === "string") {
+		let base64 = data;
+		if (base64.startsWith("data:")) {
+			const commaIndex = base64.indexOf(",");
+			if (commaIndex >= 0) {
+				base64 = base64.slice(commaIndex + 1);
+			}
+		}
+		const binaryString = atob(base64);
+		const bytes = new Uint8Array(binaryString.length);
+		for (let i = 0; i < binaryString.length; i++) {
+			bytes[i] = binaryString.charCodeAt(i);
+		}
+		return bytes;
+	}
+
+	if (data instanceof URL) {
+		throw new Error(
+			"URL image sources are not supported by Workers AI. " +
+				"Provide image data as a Uint8Array or base64 string instead.",
+		);
+	}
+
+	return null;
+}
+
+function uint8ArrayToBase64(bytes: Uint8Array): string {
+	let binary = "";
+	const chunkSize = 8192;
+	for (let i = 0; i < bytes.length; i += chunkSize) {
+		const chunk = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
+		binary += String.fromCharCode(...chunk);
+	}
+	return btoa(binary);
+}
 
 export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
 	messages: WorkersAIChatPrompt;
-	images: {
-		mediaType: string | undefined;
-		image: Uint8Array;
-		providerOptions: SharedV3ProviderOptions | undefined;
-	}[];
 } {
 	const messages: WorkersAIChatPrompt = [];
-	const images: {
-		mediaType: string | undefined;
-		image: Uint8Array;
-		providerOptions: SharedV3ProviderOptions | undefined;
-	}[] = [];
 
 	for (const { role, content } of prompt) {
 		switch (role) {
@@ -25,6 +67,7 @@ export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
 
 			case "user": {
 				const textParts: string[] = [];
+				const imageParts: { image: Uint8Array; mediaType: string | undefined }[] = [];
 
 				for (const part of content) {
 					switch (part.type) {
@@ -33,23 +76,36 @@ export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
 							break;
 						}
 						case "file": {
-							if (part.data instanceof Uint8Array) {
-								images.push({
-									image: part.data,
+							const imageBytes = toUint8Array(part.data);
+							if (imageBytes) {
+								imageParts.push({
+									image: imageBytes,
 									mediaType: part.mediaType,
-									providerOptions: part.providerOptions,
 								});
 							}
-							// Don't push empty strings for image parts
 							break;
 						}
 					}
 				}
 
-				messages.push({
-					content: textParts.join("\n"),
-					role: "user",
-				});
+				if (imageParts.length > 0) {
+					const contentArray: WorkersAIContentPart[] = [];
+					if (textParts.length > 0) {
+						contentArray.push({ type: "text", text: textParts.join("\n") });
+					}
+					for (const img of imageParts) {
+						const base64 = uint8ArrayToBase64(img.image);
+						const mediaType = img.mediaType || "image/png";
+						contentArray.push({
+							type: "image_url",
+							image_url: { url: `data:${mediaType};base64,${base64}` },
+						});
+					}
+					messages.push({ content: contentArray, role: "user" });
+				} else {
+					messages.push({ content: textParts.join("\n"), role: "user" });
+				}
+
 				break;
 			}
 
@@ -144,5 +200,5 @@ export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
 		}
 	}
 
-	return { images, messages };
+	return { messages };
 }
diff --git a/packages/workers-ai-provider/src/utils.ts b/packages/workers-ai-provider/src/utils.ts
@@ -10,7 +10,7 @@ import type { WorkersAIChatPrompt } from "./workersai-chat-prompt";
  * Normalize messages before passing to the Workers AI binding.
  *
  * The binding has strict schema validation that differs from the OpenAI API:
- * - `content` must be a string (not null)
+ * - `content` must not be null
  */
 export function normalizeMessagesForBinding(messages: WorkersAIChatPrompt): WorkersAIChatPrompt {
 	return messages.map((msg) => {
diff --git a/packages/workers-ai-provider/src/workersai-chat-language-model.ts b/packages/workers-ai-provider/src/workersai-chat-language-model.ts
@@ -117,33 +117,27 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
 
 	/**
 	 * Build the inputs object for `binding.run()`, shared by doGenerate and doStream.
+	 *
+	 * Images are embedded inline in messages as OpenAI-compatible content
+	 * arrays with `image_url` parts. Both the REST API and the binding
+	 * accept this format at runtime.
+	 *
+	 * The binding path additionally normalises null content to empty strings.
 	 */
 	private buildRunInputs(
 		args: ReturnType<typeof this.getArgs>["args"],
 		messages: ReturnType<typeof convertToWorkersAIChatMessages>["messages"],
-		images: ReturnType<typeof convertToWorkersAIChatMessages>["images"],
 		options?: { stream?: boolean },
 	) {
-		if (images.length > 1) {
-			throw new Error("Multiple images are not yet supported as input");
-		}
-
-		const imagePart = images[0];
-
-		// Only normalize messages for the binding path (REST API doesn't need it)
-		const finalMessages = this.config.isBinding
-			? normalizeMessagesForBinding(messages)
-			: messages;
-
 		return {
 			max_tokens: args.max_tokens,
-			messages: finalMessages,
+			messages: this.config.isBinding
+				? normalizeMessagesForBinding(messages)
+				: messages,
 			temperature: args.temperature,
 			tools: args.tools,
 			...(args.tool_choice ? { tool_choice: args.tool_choice } : {}),
 			top_p: args.top_p,
-			...(imagePart ? { image: Array.from(imagePart.image) } : {}),
-			// Only include response_format when actually set
 			...(args.response_format ? { response_format: args.response_format } : {}),
 			...(options?.stream ? { stream: true } : {}),
 		};
@@ -179,14 +173,16 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
 		options: Parameters<LanguageModelV3["doGenerate"]>[0],
 	): Promise<Awaited<ReturnType<LanguageModelV3["doGenerate"]>>> {
 		const { args, warnings } = this.getArgs(options);
-		const { messages, images } = convertToWorkersAIChatMessages(options.prompt);
+		const { messages } = convertToWorkersAIChatMessages(options.prompt);
 
-		const inputs = this.buildRunInputs(args, messages, images);
+		const inputs = this.buildRunInputs(args, messages);
 		const runOptions = this.getRunOptions();
 
 		const output = await this.config.binding.run(
 			args.model as keyof AiModels,
-			inputs,
+			// Content arrays for vision are valid at runtime but not in the
+			// binding's strict TypeScript definitions (which expect string content).
+			inputs as AiModels[keyof AiModels]["inputs"],
 			runOptions,
 		);
 
@@ -226,14 +222,14 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
 		options: Parameters<LanguageModelV3["doStream"]>[0],
 	): Promise<Awaited<ReturnType<LanguageModelV3["doStream"]>>> {
 		const { args, warnings } = this.getArgs(options);
-		const { messages, images } = convertToWorkersAIChatMessages(options.prompt);
+		const { messages } = convertToWorkersAIChatMessages(options.prompt);
 
-		const inputs = this.buildRunInputs(args, messages, images, { stream: true });
+		const inputs = this.buildRunInputs(args, messages, { stream: true });
 		const runOptions = this.getRunOptions();
 
 		const response = await this.config.binding.run(
 			args.model as keyof AiModels,
-			inputs,
+			inputs as AiModels[keyof AiModels]["inputs"],
 			runOptions,
 		);
 
diff --git a/packages/workers-ai-provider/src/workersai-chat-prompt.ts b/packages/workers-ai-provider/src/workersai-chat-prompt.ts
@@ -6,14 +6,18 @@ export type WorkersAIChatMessage =
 	| WorkersAIAssistantMessage
 	| WorkersAIToolMessage;
 
+export type WorkersAIContentPart =
+	| { type: "text"; text: string }
+	| { type: "image_url"; image_url: { url: string } };
+
 export interface WorkersAISystemMessage {
 	role: "system";
 	content: string;
 }
 
 export interface WorkersAIUserMessage {
 	role: "user";
-	content: string;
+	content: string | WorkersAIContentPart[];
 }
 
 export interface WorkersAIAssistantMessage {
diff --git a/packages/workers-ai-provider/test/convert-to-workersai-chat-messages.test.ts b/packages/workers-ai-provider/test/convert-to-workersai-chat-messages.test.ts
diff --git a/packages/workers-ai-provider/test/e2e/fixtures/binding-worker/src/index.ts b/packages/workers-ai-provider/test/e2e/fixtures/binding-worker/src/index.ts
diff --git a/packages/workers-ai-provider/test/e2e/workers-ai-binding.e2e.test.ts b/packages/workers-ai-provider/test/e2e/workers-ai-binding.e2e.test.ts
diff --git a/packages/workers-ai-provider/test/e2e/workers-ai-rest.e2e.test.ts b/packages/workers-ai-provider/test/e2e/workers-ai-rest.e2e.test.ts
diff --git a/packages/workers-ai-provider/test/utils.test.ts b/packages/workers-ai-provider/test/utils.test.ts

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ import type { WorkersAIChatPrompt } from "./workersai-chat-prompt";`
`10`	`10`	`* Normalize messages before passing to the Workers AI binding.`
`11`	`11`	`*`
`12`	`12`	`* The binding has strict schema validation that differs from the OpenAI API:`
`13`		- * - `content` must be a string (not null)
	`13`	+ * - `content` must not be null
`14`	`14`	`*/`
`15`	`15`	`export function normalizeMessagesForBinding(messages: WorkersAIChatPrompt): WorkersAIChatPrompt {`
`16`	`16`	`return messages.map((msg) => {`