Skip to content

Commit 5f603ff

Browse files
Merge pull request #448 from cloudflare/vision-inputs
Support image inputs for vision chat models
2 parents dbadcd0 + 054ccb8 commit 5f603ff

File tree

15 files changed

+489
-61
lines changed

15 files changed

+489
-61
lines changed

.changeset/fix-image-inputs.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
"workers-ai-provider": patch
3+
"@cloudflare/tanstack-ai": patch
4+
---
5+
6+
Fix image inputs for vision-capable chat models
7+
8+
- Handle all `LanguageModelV3DataContent` variants (Uint8Array, base64 string, data URL) instead of only Uint8Array
9+
- Send images as OpenAI-compatible `image_url` content parts inline in messages, enabling vision for models like Llama 4 Scout and Kimi K2.5
10+
- Works with both the binding and REST API paths

packages/tanstack-ai/README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,39 @@ const adapter = createWorkersAiChat("@cf/meta/llama-4-scout-17b-16e-instruct", {
6868
});
6969
```
7070

71+
### Vision (Image Inputs)
72+
73+
Send images to vision-capable chat models:
74+
75+
```typescript
76+
const adapter = createWorkersAiChat("@cf/meta/llama-4-scout-17b-16e-instruct", {
77+
accountId: "your-account-id",
78+
apiKey: "your-api-key",
79+
});
80+
81+
const response = chat({
82+
adapter,
83+
stream: true,
84+
messages: [
85+
{
86+
role: "user",
87+
content: [
88+
{ type: "text", content: "What's in this image?" },
89+
{ type: "image", source: { type: "data", value: base64String, mimeType: "image/png" } },
90+
],
91+
},
92+
],
93+
});
94+
```
95+
96+
URL sources are also supported:
97+
98+
```typescript
99+
{ type: "image", source: { type: "url", value: "https://example.com/photo.jpg" } }
100+
```
101+
102+
Works with all configuration modes (binding, REST, and AI Gateway).
103+
71104
### Image Generation
72105

73106
```typescript

packages/tanstack-ai/src/utils/create-fetcher.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,10 @@ export function createGatewayFetch(
274274
* Normalize messages before passing to Workers AI binding.
275275
*
276276
* The binding has strict schema validation that may differ from the OpenAI API:
277-
* - `content` must be a string (not null)
277+
* - `content` must not be null
278+
*
279+
* Content arrays (with image_url parts) are passed through as-is since the
280+
* Workers AI binding accepts them at runtime for vision-capable models.
278281
*/
279282
function normalizeMessagesForBinding(
280283
messages: Record<string, unknown>[],

packages/tanstack-ai/test/binding-fetch.test.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,4 +598,34 @@ describe("createWorkersAiBindingFetch", () => {
598598
json_schema: { name: "test", schema: {} },
599599
});
600600
});
601+
602+
it("should pass content arrays through to binding for vision models", async () => {
603+
const binding = mockBinding(vi.fn().mockResolvedValue({ response: "A red square" }));
604+
605+
const fetcher = createWorkersAiBindingFetch(binding);
606+
607+
const base64 = btoa("fake-png-bytes");
608+
609+
await fetcher("https://api.openai.com/v1/chat/completions", {
610+
method: "POST",
611+
body: JSON.stringify({
612+
model: "@cf/meta/llama-4-scout-17b-16e-instruct",
613+
messages: [
614+
{
615+
role: "user",
616+
content: [
617+
{ type: "text", text: "Describe this" },
618+
{ type: "image_url", image_url: { url: `data:image/png;base64,${base64}` } },
619+
],
620+
},
621+
],
622+
}),
623+
});
624+
625+
const [, inputs] = binding.run.mock.calls[0]!;
626+
expect(inputs.messages[0].content).toEqual([
627+
{ type: "text", text: "Describe this" },
628+
{ type: "image_url", image_url: { url: `data:image/png;base64,${base64}` } },
629+
]);
630+
});
601631
});

packages/tanstack-ai/test/message-builder.test.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ describe("message building (via chatStream)", () => {
145145
binding,
146146
);
147147

148-
// Should preserve image parts as OpenAI multi-modal content array
149148
expect(messages[0].content).toEqual([
150149
{ type: "text", text: "Part 1" },
151150
{ type: "image_url", image_url: { url: "https://example.com/img.png" } },

packages/workers-ai-provider/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,29 @@ for await (const chunk of result.textStream) {
111111
}
112112
```
113113

114+
## Vision (Image Inputs)
115+
116+
Send images to vision-capable models like Llama 4 Scout and Kimi K2.5:
117+
118+
```ts
119+
import { generateText } from "ai";
120+
121+
const { text } = await generateText({
122+
model: workersai("@cf/meta/llama-4-scout-17b-16e-instruct"),
123+
messages: [
124+
{
125+
role: "user",
126+
content: [
127+
{ type: "text", text: "What's in this image?" },
128+
{ type: "image", image: imageUint8Array },
129+
],
130+
},
131+
],
132+
});
133+
```
134+
135+
Images can be provided as `Uint8Array`, base64 strings, or data URLs. Multiple images per message are supported. Works with both the binding and REST API configurations.
136+
114137
## Tool Calling
115138

116139
```ts

packages/workers-ai-provider/src/convert-to-workersai-chat-messages.ts

Lines changed: 78 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,62 @@
1-
import type { LanguageModelV3Prompt, SharedV3ProviderOptions } from "@ai-sdk/provider";
2-
import type { WorkersAIChatPrompt } from "./workersai-chat-prompt";
1+
import type {
2+
LanguageModelV3DataContent,
3+
LanguageModelV3Prompt,
4+
} from "@ai-sdk/provider";
5+
import type { WorkersAIContentPart, WorkersAIChatPrompt } from "./workersai-chat-prompt";
6+
7+
/**
 * Normalise any LanguageModelV3DataContent value to a Uint8Array.
 *
 * Handles:
 * - Uint8Array → returned as-is (same reference, no copy)
 * - string     → decoded from base64 or base64url, with or without a
 *                data-URL prefix. A data URL whose header lacks the
 *                `;base64` marker carries a percent-encoded payload
 *                (RFC 2397) and is percent-decoded instead.
 * - URL        → not supported (Workers AI needs raw bytes, not a reference).
 *                http(s) URLs passed as plain strings are rejected the same way.
 *
 * @param data - Image/file payload taken from a prompt part.
 * @returns The raw bytes, or `null` when `data` is none of the shapes above.
 * @throws Error when `data` is a URL reference, a data URL without a `,`
 *         separator, or a string that is not decodable base64.
 */
function toUint8Array(data: LanguageModelV3DataContent): Uint8Array | null {
	if (data instanceof Uint8Array) {
		return data;
	}

	const urlNotSupportedMessage =
		"URL image sources are not supported by Workers AI. " +
		"Provide image data as a Uint8Array or base64 string instead.";

	if (data instanceof URL) {
		throw new Error(urlNotSupportedMessage);
	}

	if (typeof data !== "string") {
		return null;
	}

	// A remote URL smuggled in as a string can never be valid base64 (it
	// contains ':'), so fail with the explicit message rather than letting
	// atob() raise an opaque InvalidCharacterError.
	if (/^https?:\/\//i.test(data)) {
		throw new Error(urlNotSupportedMessage);
	}

	let payload = data;
	if (data.startsWith("data:")) {
		const commaIndex = data.indexOf(",");
		if (commaIndex < 0) {
			throw new Error(
				"Malformed data URL: missing ',' separator between the header and the payload.",
			);
		}
		const header = data.slice("data:".length, commaIndex);
		payload = data.slice(commaIndex + 1);

		// Without the ";base64" marker the payload is percent-encoded text,
		// not base64, so it must not be handed to atob().
		if (!/;\s*base64\s*$/i.test(header)) {
			return percentDecodeToBytes(payload);
		}
	}

	// Accept the URL-safe alphabet too by mapping it onto the standard one.
	const standardBase64 = payload.replace(/-/g, "+").replace(/_/g, "/");

	let binaryString: string;
	try {
		binaryString = atob(standardBase64);
	} catch {
		throw new Error(
			"Image data is not valid base64. " +
				"Provide image data as a Uint8Array, a base64 string, or a base64 data URL.",
		);
	}

	// atob() yields one character per byte (code units 0-255).
	const bytes = new Uint8Array(binaryString.length);
	for (let i = 0; i < binaryString.length; i++) {
		bytes[i] = binaryString.charCodeAt(i);
	}
	return bytes;
}

/**
 * Percent-decode the payload of a non-base64 data URL into bytes.
 *
 * Each `%XX` escape becomes the single byte 0xXX; every other run of
 * characters (including a stray `%` not followed by two hex digits) is
 * emitted as its UTF-8 encoding.
 */
function percentDecodeToBytes(text: string): Uint8Array {
	const encoder = new TextEncoder();
	const bytes: number[] = [];
	// Tokenise into: a valid escape, a run of non-'%' characters, or a lone '%'.
	for (const token of text.match(/%[0-9a-fA-F]{2}|[^%]+|%/g) ?? []) {
		if (token.length === 3 && token.startsWith("%")) {
			bytes.push(Number.parseInt(token.slice(1), 16));
		} else {
			for (const byte of encoder.encode(token)) {
				bytes.push(byte);
			}
		}
	}
	return Uint8Array.from(bytes);
}
45+
46+
/**
 * Encode raw bytes as a standard (padded) base64 string.
 *
 * The bytes are first turned into a binary string in slices of 8192 bytes,
 * so no single `String.fromCharCode` call receives more than 8192 arguments
 * (presumably to stay clear of engine argument-count limits on large images),
 * and the joined result is handed to `btoa`.
 *
 * @param bytes - The bytes to encode; an empty array yields an empty string.
 * @returns The base64 representation of `bytes`.
 */
function uint8ArrayToBase64(bytes: Uint8Array): string {
	const sliceLength = 8192;
	const pieces: string[] = [];

	let offset = 0;
	while (offset < bytes.length) {
		const end = Math.min(offset + sliceLength, bytes.length);
		pieces.push(String.fromCharCode(...bytes.subarray(offset, end)));
		offset += sliceLength;
	}

	return btoa(pieces.join(""));
}
355

456
export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
557
messages: WorkersAIChatPrompt;
6-
images: {
7-
mediaType: string | undefined;
8-
image: Uint8Array;
9-
providerOptions: SharedV3ProviderOptions | undefined;
10-
}[];
1158
} {
1259
const messages: WorkersAIChatPrompt = [];
13-
const images: {
14-
mediaType: string | undefined;
15-
image: Uint8Array;
16-
providerOptions: SharedV3ProviderOptions | undefined;
17-
}[] = [];
1860

1961
for (const { role, content } of prompt) {
2062
switch (role) {
@@ -25,6 +67,7 @@ export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
2567

2668
case "user": {
2769
const textParts: string[] = [];
70+
const imageParts: { image: Uint8Array; mediaType: string | undefined }[] = [];
2871

2972
for (const part of content) {
3073
switch (part.type) {
@@ -33,23 +76,36 @@ export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
3376
break;
3477
}
3578
case "file": {
36-
if (part.data instanceof Uint8Array) {
37-
images.push({
38-
image: part.data,
79+
const imageBytes = toUint8Array(part.data);
80+
if (imageBytes) {
81+
imageParts.push({
82+
image: imageBytes,
3983
mediaType: part.mediaType,
40-
providerOptions: part.providerOptions,
4184
});
4285
}
43-
// Don't push empty strings for image parts
4486
break;
4587
}
4688
}
4789
}
4890

49-
messages.push({
50-
content: textParts.join("\n"),
51-
role: "user",
52-
});
91+
if (imageParts.length > 0) {
92+
const contentArray: WorkersAIContentPart[] = [];
93+
if (textParts.length > 0) {
94+
contentArray.push({ type: "text", text: textParts.join("\n") });
95+
}
96+
for (const img of imageParts) {
97+
const base64 = uint8ArrayToBase64(img.image);
98+
const mediaType = img.mediaType || "image/png";
99+
contentArray.push({
100+
type: "image_url",
101+
image_url: { url: `data:${mediaType};base64,${base64}` },
102+
});
103+
}
104+
messages.push({ content: contentArray, role: "user" });
105+
} else {
106+
messages.push({ content: textParts.join("\n"), role: "user" });
107+
}
108+
53109
break;
54110
}
55111

@@ -144,5 +200,5 @@ export function convertToWorkersAIChatMessages(prompt: LanguageModelV3Prompt): {
144200
}
145201
}
146202

147-
return { images, messages };
203+
return { messages };
148204
}

packages/workers-ai-provider/src/utils.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import type { WorkersAIChatPrompt } from "./workersai-chat-prompt";
1010
* Normalize messages before passing to the Workers AI binding.
1111
*
1212
* The binding has strict schema validation that differs from the OpenAI API:
13-
* - `content` must be a string (not null)
13+
* - `content` must not be null
1414
*/
1515
export function normalizeMessagesForBinding(messages: WorkersAIChatPrompt): WorkersAIChatPrompt {
1616
return messages.map((msg) => {

packages/workers-ai-provider/src/workersai-chat-language-model.ts

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -117,33 +117,27 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
117117

118118
/**
119119
* Build the inputs object for `binding.run()`, shared by doGenerate and doStream.
120+
*
121+
* Images are embedded inline in messages as OpenAI-compatible content
122+
* arrays with `image_url` parts. Both the REST API and the binding
123+
* accept this format at runtime.
124+
*
125+
* The binding path additionally normalises null content to empty strings.
120126
*/
121127
private buildRunInputs(
122128
args: ReturnType<typeof this.getArgs>["args"],
123129
messages: ReturnType<typeof convertToWorkersAIChatMessages>["messages"],
124-
images: ReturnType<typeof convertToWorkersAIChatMessages>["images"],
125130
options?: { stream?: boolean },
126131
) {
127-
if (images.length > 1) {
128-
throw new Error("Multiple images are not yet supported as input");
129-
}
130-
131-
const imagePart = images[0];
132-
133-
// Only normalize messages for the binding path (REST API doesn't need it)
134-
const finalMessages = this.config.isBinding
135-
? normalizeMessagesForBinding(messages)
136-
: messages;
137-
138132
return {
139133
max_tokens: args.max_tokens,
140-
messages: finalMessages,
134+
messages: this.config.isBinding
135+
? normalizeMessagesForBinding(messages)
136+
: messages,
141137
temperature: args.temperature,
142138
tools: args.tools,
143139
...(args.tool_choice ? { tool_choice: args.tool_choice } : {}),
144140
top_p: args.top_p,
145-
...(imagePart ? { image: Array.from(imagePart.image) } : {}),
146-
// Only include response_format when actually set
147141
...(args.response_format ? { response_format: args.response_format } : {}),
148142
...(options?.stream ? { stream: true } : {}),
149143
};
@@ -179,14 +173,16 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
179173
options: Parameters<LanguageModelV3["doGenerate"]>[0],
180174
): Promise<Awaited<ReturnType<LanguageModelV3["doGenerate"]>>> {
181175
const { args, warnings } = this.getArgs(options);
182-
const { messages, images } = convertToWorkersAIChatMessages(options.prompt);
176+
const { messages } = convertToWorkersAIChatMessages(options.prompt);
183177

184-
const inputs = this.buildRunInputs(args, messages, images);
178+
const inputs = this.buildRunInputs(args, messages);
185179
const runOptions = this.getRunOptions();
186180

187181
const output = await this.config.binding.run(
188182
args.model as keyof AiModels,
189-
inputs,
183+
// Content arrays for vision are valid at runtime but not in the
184+
// binding's strict TypeScript definitions (which expect string content).
185+
inputs as AiModels[keyof AiModels]["inputs"],
190186
runOptions,
191187
);
192188

@@ -226,14 +222,14 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
226222
options: Parameters<LanguageModelV3["doStream"]>[0],
227223
): Promise<Awaited<ReturnType<LanguageModelV3["doStream"]>>> {
228224
const { args, warnings } = this.getArgs(options);
229-
const { messages, images } = convertToWorkersAIChatMessages(options.prompt);
225+
const { messages } = convertToWorkersAIChatMessages(options.prompt);
230226

231-
const inputs = this.buildRunInputs(args, messages, images, { stream: true });
227+
const inputs = this.buildRunInputs(args, messages, { stream: true });
232228
const runOptions = this.getRunOptions();
233229

234230
const response = await this.config.binding.run(
235231
args.model as keyof AiModels,
236-
inputs,
232+
inputs as AiModels[keyof AiModels]["inputs"],
237233
runOptions,
238234
);
239235

packages/workers-ai-provider/src/workersai-chat-prompt.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,18 @@ export type WorkersAIChatMessage =
66
| WorkersAIAssistantMessage
77
| WorkersAIToolMessage;
88

9+
/** Plain-text segment of a multi-part message content array. */
type WorkersAITextContentPart = { type: "text"; text: string };

/** Image segment of a multi-part message content array, referenced by URL string (e.g. a `data:` URL). */
type WorkersAIImageUrlContentPart = { type: "image_url"; image_url: { url: string } };

/**
 * One element of a multi-part message content array, discriminated by
 * `type`: either a text segment or an OpenAI-style `image_url` segment.
 */
export type WorkersAIContentPart = WorkersAITextContentPart | WorkersAIImageUrlContentPart;
12+
913
export interface WorkersAISystemMessage {
1014
role: "system";
1115
content: string;
1216
}
1317

1418
export interface WorkersAIUserMessage {
1519
role: "user";
16-
content: string;
20+
content: string | WorkersAIContentPart[];
1721
}
1822

1923
export interface WorkersAIAssistantMessage {

0 commit comments

Comments
 (0)