diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts index 15ceb04c..60dfb883 100644 --- a/packages/ai/src/providers/anthropic.ts +++ b/packages/ai/src/providers/anthropic.ts @@ -22,6 +22,7 @@ import type { } from "../types.js"; import { AssistantMessageEventStream } from "../utils/event-stream.js"; import { parseStreamingJson } from "../utils/json-parse.js"; +import { sanitizeSurrogates } from "../utils/sanitize-unicode.js"; import { validateToolArguments } from "../utils/validation.js"; import { transformMessages } from "./transorm-messages.js"; @@ -284,7 +285,7 @@ function buildParams( if (context.systemPrompt) { params.system.push({ type: "text", - text: context.systemPrompt, + text: sanitizeSurrogates(context.systemPrompt), cache_control: { type: "ephemeral", }, @@ -295,7 +296,7 @@ function buildParams( params.system = [ { type: "text", - text: context.systemPrompt, + text: sanitizeSurrogates(context.systemPrompt), cache_control: { type: "ephemeral", }, @@ -349,7 +350,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages"> if (msg.content.trim().length > 0) { params.push({ role: "user", - content: msg.content, + content: sanitizeSurrogates(msg.content), }); } } else { @@ -357,7 +358,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages"> if (item.type === "text") { return { type: "text", - text: item.text, + text: sanitizeSurrogates(item.text), }; } else { return { @@ -391,13 +392,13 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages"> if (block.text.trim().length === 0) continue; blocks.push({ type: "text", - text: block.text, + text: sanitizeSurrogates(block.text), }); } else if (block.type === "thinking") { if (block.thinking.trim().length === 0) continue; blocks.push({ type: "thinking", - thinking: block.thinking, + thinking: sanitizeSurrogates(block.thinking), signature: block.thinkingSignature || "", }); } else if (block.type 
=== "toolCall") { @@ -422,7 +423,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages"> toolResults.push({ type: "tool_result", tool_use_id: sanitizeToolCallId(msg.toolCallId), - content: msg.output, + content: sanitizeSurrogates(msg.output), is_error: msg.isError, }); @@ -433,7 +434,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages"> toolResults.push({ type: "tool_result", tool_use_id: sanitizeToolCallId(nextMsg.toolCallId), - content: nextMsg.output, + content: sanitizeSurrogates(nextMsg.output), is_error: nextMsg.isError, }); j++; diff --git a/packages/ai/src/providers/google.ts b/packages/ai/src/providers/google.ts index 9f85814e..2a4ecd44 100644 --- a/packages/ai/src/providers/google.ts +++ b/packages/ai/src/providers/google.ts @@ -22,6 +22,7 @@ import type { ToolCall, } from "../types.js"; import { AssistantMessageEventStream } from "../utils/event-stream.js"; +import { sanitizeSurrogates } from "../utils/sanitize-unicode.js"; import { validateToolArguments } from "../utils/validation.js"; import { transformMessages } from "./transorm-messages.js"; @@ -278,7 +279,7 @@ function buildParams( const config: GenerateContentConfig = { ...(Object.keys(generationConfig).length > 0 && generationConfig), - ...(context.systemPrompt && { systemInstruction: context.systemPrompt }), + ...(context.systemPrompt && { systemInstruction: sanitizeSurrogates(context.systemPrompt) }), ...(context.tools && context.tools.length > 0 && { tools: convertTools(context.tools) }), }; @@ -323,12 +324,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context) if (typeof msg.content === "string") { contents.push({ role: "user", - parts: [{ text: msg.content }], + parts: [{ text: sanitizeSurrogates(msg.content) }], }); } else { const parts: Part[] = msg.content.map((item) => { if (item.type === "text") { - return { text: item.text }; + return { text: sanitizeSurrogates(item.text) }; } else { return { 
inlineData: { @@ -350,12 +351,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context) for (const block of msg.content) { if (block.type === "text") { - parts.push({ text: block.text }); + parts.push({ text: sanitizeSurrogates(block.text) }); } else if (block.type === "thinking") { const thinkingPart: Part = { thought: true, thoughtSignature: block.thinkingSignature, - text: block.thinking, + text: sanitizeSurrogates(block.thinking), }; parts.push(thinkingPart); } else if (block.type === "toolCall") { @@ -383,7 +384,7 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context) id: msg.toolCallId, name: msg.toolName, response: { - result: msg.output, + result: sanitizeSurrogates(msg.output), isError: msg.isError, }, }, diff --git a/packages/ai/src/providers/openai-completions.ts b/packages/ai/src/providers/openai-completions.ts index 22a2c74d..3edbd634 100644 --- a/packages/ai/src/providers/openai-completions.ts +++ b/packages/ai/src/providers/openai-completions.ts @@ -22,6 +22,7 @@ import type { } from "../types.js"; import { AssistantMessageEventStream } from "../utils/event-stream.js"; import { parseStreamingJson } from "../utils/json-parse.js"; +import { sanitizeSurrogates } from "../utils/sanitize-unicode.js"; import { validateToolArguments } from "../utils/validation.js"; import { transformMessages } from "./transorm-messages.js"; @@ -310,7 +311,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context): const useDeveloperRole = model.reasoning && !model.baseUrl.includes("cerebras.ai") && !model.baseUrl.includes("api.x.ai"); const role = useDeveloperRole ? 
"developer" : "system"; - params.push({ role: role, content: context.systemPrompt }); + params.push({ role: role, content: sanitizeSurrogates(context.systemPrompt) }); } for (const msg of transformedMessages) { @@ -318,14 +319,14 @@ function convertMessages(model: Model<"openai-completions">, context: Context): if (typeof msg.content === "string") { params.push({ role: "user", - content: msg.content, + content: sanitizeSurrogates(msg.content), }); } else { const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => { if (item.type === "text") { return { type: "text", - text: item.text, + text: sanitizeSurrogates(item.text), } satisfies ChatCompletionContentPartText; } else { return { @@ -354,7 +355,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context): const textBlocks = msg.content.filter((b) => b.type === "text") as TextContent[]; if (textBlocks.length > 0) { assistantMsg.content = textBlocks.map((b) => { - return { type: "text", text: b.text }; + return { type: "text", text: sanitizeSurrogates(b.text) }; }); } @@ -386,7 +387,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context): } else if (msg.role === "toolResult") { params.push({ role: "tool", - content: msg.output, + content: sanitizeSurrogates(msg.output), tool_call_id: msg.toolCallId, }); } diff --git a/packages/ai/src/providers/openai-responses.ts b/packages/ai/src/providers/openai-responses.ts index 51004506..4ce68cad 100644 --- a/packages/ai/src/providers/openai-responses.ts +++ b/packages/ai/src/providers/openai-responses.ts @@ -26,6 +26,7 @@ import type { } from "../types.js"; import { AssistantMessageEventStream } from "../utils/event-stream.js"; import { parseStreamingJson } from "../utils/json-parse.js"; +import { sanitizeSurrogates } from "../utils/sanitize-unicode.js"; import { validateToolArguments } from "../utils/validation.js"; import { transformMessages } from "./transorm-messages.js"; @@ -364,7 
+365,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re const role = model.reasoning ? "developer" : "system"; messages.push({ role, - content: context.systemPrompt, + content: sanitizeSurrogates(context.systemPrompt), }); } @@ -373,14 +374,14 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re if (typeof msg.content === "string") { messages.push({ role: "user", - content: [{ type: "input_text", text: msg.content }], + content: [{ type: "input_text", text: sanitizeSurrogates(msg.content) }], }); } else { const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => { if (item.type === "text") { return { type: "input_text", - text: item.text, + text: sanitizeSurrogates(item.text), } satisfies ResponseInputText; } else { return { @@ -414,7 +415,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re output.push({ type: "message", role: "assistant", - content: [{ type: "output_text", text: textBlock.text, annotations: [] }], + content: [{ type: "output_text", text: sanitizeSurrogates(textBlock.text), annotations: [] }], status: "completed", id: textBlock.textSignature || "msg_" + Math.random().toString(36).substring(2, 15), } satisfies ResponseOutputMessage); @@ -436,7 +437,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re messages.push({ type: "function_call_output", call_id: msg.toolCallId.split("|")[0], - output: msg.output, + output: sanitizeSurrogates(msg.output), }); } } diff --git a/packages/ai/src/utils/sanitize-unicode.ts b/packages/ai/src/utils/sanitize-unicode.ts new file mode 100644 index 00000000..d869ee9d --- /dev/null +++ b/packages/ai/src/utils/sanitize-unicode.ts @@ -0,0 +1,25 @@ +/** + * Removes unpaired Unicode surrogate characters from a string. 
+ * + * Unpaired surrogates (high surrogates 0xD800-0xDBFF without matching low surrogates 0xDC00-0xDFFF, + * or vice versa) cause JSON serialization errors in many API providers. + * + * Valid emoji and other characters outside the Basic Multilingual Plane use properly paired + * surrogates and will NOT be affected by this function. + * + * @param text - The text to sanitize + * @returns The sanitized text with unpaired surrogates removed + * + * @example + * // Valid emoji (properly paired surrogates) are preserved + * sanitizeSurrogates("Hello πŸ™ˆ World") // => "Hello πŸ™ˆ World" + * + * // Unpaired high surrogate is removed + * const unpaired = String.fromCharCode(0xD83D); // high surrogate without low + * sanitizeSurrogates(`Text ${unpaired} here`) // => "Text  here" (note: only the surrogate is removed, both spaces remain) + */ +export function sanitizeSurrogates(text: string): string { + // Replace unpaired high surrogates (0xD800-0xDBFF not followed by low surrogate) + // Replace unpaired low surrogates (0xDC00-0xDFFF not preceded by high surrogate) + return text.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, ""); +} diff --git a/packages/ai/test/surrogate-pairs.test.ts b/packages/ai/test/surrogate-pairs.test.ts new file mode 100644 --- /dev/null +++ b/packages/ai/test/surrogate-pairs.test.ts @@ -0,0 +1,330 @@ +import { describe, expect, it } from "vitest"; +import { complete, getModel } from "../src/index.js"; +import type { Context, Model, OptionsForApi, ToolResultMessage } from "../src/types.js"; + +async function testEmojiInToolResults(llm: Model, options: OptionsForApi = {}) { + // Simulate a tool that returns emoji + const context: Context = { + systemPrompt: "You are a helpful assistant.", + messages: [ + { + role: "user", + content: "Use the test tool", + }, + { + role: "assistant", + content: [ + { + type: "toolCall", + id: "test_1", + name: "test_tool", + arguments: {}, + }, + ], + api: llm.api, + provider: llm.provider, + model: llm.id, + usage: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, + }, + stopReason: "toolUse", + }, + ], + tools: [ + { + name: "test_tool", + description: "A test tool", + parameters: {} as any, + }, + ], + }; + + // Add tool result with various problematic Unicode characters + const toolResult: ToolResultMessage = { + role: "toolResult", + toolCallId: "test_1", + toolName: "test_tool", + output: `Test with emoji πŸ™ˆ and 
other characters: +- Monkey emoji: πŸ™ˆ +- Thumbs up: πŸ‘ +- Heart: ❀️ +- Thinking face: πŸ€” +- Rocket: πŸš€ +- Mixed text: Mario Zechner wann? Wo? Bin grad Γ€ußersr eventuninformiert πŸ™ˆ +- Japanese: こんにけは +- Chinese: δ½ ε₯½ +- Mathematical symbols: βˆ‘βˆ«βˆ‚βˆš +- Special quotes: "curly" 'quotes'`, + isError: false, + }; + + context.messages.push(toolResult); + + // Add follow-up user message + context.messages.push({ + role: "user", + content: "Summarize the tool result briefly.", + }); + + // This should not throw a surrogate pair error + const response = await complete(llm, context, options); + + expect(response.stopReason).not.toBe("error"); + expect(response.errorMessage).toBeFalsy(); + expect(response.content.length).toBeGreaterThan(0); +} + +async function testRealWorldLinkedInData(llm: Model, options: OptionsForApi = {}) { + const context: Context = { + systemPrompt: "You are a helpful assistant.", + messages: [ + { + role: "user", + content: "Use the linkedin tool to get comments", + }, + { + role: "assistant", + content: [ + { + type: "toolCall", + id: "linkedin_1", + name: "linkedin_skill", + arguments: {}, + }, + ], + api: llm.api, + provider: llm.provider, + model: llm.id, + usage: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, + }, + stopReason: "toolUse", + }, + ], + tools: [ + { + name: "linkedin_skill", + description: "Get LinkedIn comments", + parameters: {} as any, + }, + ], + }; + + // Real-world tool result from LinkedIn with emoji + const toolResult: ToolResultMessage = { + role: "toolResult", + toolCallId: "linkedin_1", + toolName: "linkedin_skill", + output: `Post: Hab einen "Generative KI fΓΌr Nicht-Techniker" Workshop gebaut. 
+Unanswered Comments: 2 + +=> { + "comments": [ + { + "author": "Matthias Neumayer's graphic link", + "text": "Leider nehmen das viel zu wenige Leute ernst" + }, + { + "author": "Matthias Neumayer's graphic link", + "text": "Mario Zechner wann? Wo? Bin grad Γ€ußersr eventuninformiert πŸ™ˆ" + } + ] +}`, + isError: false, + }; + + context.messages.push(toolResult); + + context.messages.push({ + role: "user", + content: "How many comments are there?", + }); + + // This should not throw a surrogate pair error + const response = await complete(llm, context, options); + + expect(response.stopReason).not.toBe("error"); + expect(response.errorMessage).toBeFalsy(); + expect(response.content.some((b) => b.type === "text")).toBe(true); +} + +async function testUnpairedHighSurrogate(llm: Model, options: OptionsForApi = {}) { + const context: Context = { + systemPrompt: "You are a helpful assistant.", + messages: [ + { + role: "user", + content: "Use the test tool", + }, + { + role: "assistant", + content: [ + { + type: "toolCall", + id: "test_2", + name: "test_tool", + arguments: {}, + }, + ], + api: llm.api, + provider: llm.provider, + model: llm.id, + usage: { + input: 0, + output: 0, + cacheRead: 0, + cacheWrite: 0, + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, + }, + stopReason: "toolUse", + }, + ], + tools: [ + { + name: "test_tool", + description: "A test tool", + parameters: {} as any, + }, + ], + }; + + // Construct a string with an intentionally unpaired high surrogate + // This simulates what might happen if text processing corrupts emoji + const unpairedSurrogate = String.fromCharCode(0xd83d); // High surrogate without low surrogate + + const toolResult: ToolResultMessage = { + role: "toolResult", + toolCallId: "test_2", + toolName: "test_tool", + output: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized`, + isError: false, + }; + + context.messages.push(toolResult); + + context.messages.push({ + role: "user", + 
content: "What did the tool return?", + }); + + // This should not throw a surrogate pair error + // The unpaired surrogate should be sanitized before sending to API + const response = await complete(llm, context, options); + + expect(response.stopReason).not.toBe("error"); + expect(response.errorMessage).toBeFalsy(); + expect(response.content.length).toBeGreaterThan(0); +} + +describe("AI Providers Unicode Surrogate Pair Tests", () => { + describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider Unicode Handling", () => { + const llm = getModel("google", "gemini-2.5-flash"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); + + describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider Unicode Handling", () => { + const llm = getModel("openai", "gpt-4o-mini"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); + + describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider Unicode Handling", () => { + const llm = getModel("openai", "gpt-5-mini"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + 
}); + + describe.skipIf(!process.env.ANTHROPIC_OAUTH_TOKEN)("Anthropic Provider Unicode Handling", () => { + const llm = getModel("anthropic", "claude-3-5-haiku-20241022"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); + + describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider Unicode Handling", () => { + const llm = getModel("xai", "grok-3"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); + + describe.skipIf(!process.env.GROQ_API_KEY)("Groq Provider Unicode Handling", () => { + const llm = getModel("groq", "openai/gpt-oss-20b"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); + + describe.skipIf(!process.env.CEREBRAS_API_KEY)("Cerebras Provider Unicode Handling", () => { + const llm = getModel("cerebras", "gpt-oss-120b"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool 
results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); + + describe.skipIf(!process.env.ZAI_API_KEY)("zAI Provider Unicode Handling", () => { + const llm = getModel("zai", "glm-4.5-air"); + + it("should handle emoji in tool results", async () => { + await testEmojiInToolResults(llm); + }); + + it("should handle real-world LinkedIn comment data with emoji", async () => { + await testRealWorldLinkedInData(llm); + }); + + it("should handle unpaired high surrogate (0xD83D) in tool results", async () => { + await testUnpairedHighSurrogate(llm); + }); + }); +});