Add Unicode surrogate sanitization for all providers

Fixes issue where unpaired Unicode surrogates in tool results cause JSON serialization errors in API providers, particularly Anthropic.

- Add sanitizeSurrogates() utility function to remove unpaired surrogates
- Apply sanitization in all provider convertMessages() functions:
  - User message text content (string and text blocks)
  - Assistant message text and thinking blocks
  - Tool result output
  - System prompts
- Valid emoji (properly paired surrogates) are preserved
- Add comprehensive test suite covering all 8 providers

Previously only Google and Groq handled unpaired surrogates correctly.
Now all providers (Anthropic, OpenAI Completions/Responses, Google, xAI, Groq, Cerebras, zAI) sanitize text before API submission.
This commit is contained in:
Mario Zechner 2025-10-13 14:26:54 +02:00
parent 949cd4efd8
commit 4e7a340460
6 changed files with 420 additions and 24 deletions

View file

@ -22,6 +22,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -284,7 +285,7 @@ function buildParams(
if (context.systemPrompt) {
params.system.push({
type: "text",
text: context.systemPrompt,
text: sanitizeSurrogates(context.systemPrompt),
cache_control: {
type: "ephemeral",
},
@ -295,7 +296,7 @@ function buildParams(
params.system = [
{
type: "text",
text: context.systemPrompt,
text: sanitizeSurrogates(context.systemPrompt),
cache_control: {
type: "ephemeral",
},
@ -349,7 +350,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
if (msg.content.trim().length > 0) {
params.push({
role: "user",
content: msg.content,
content: sanitizeSurrogates(msg.content),
});
}
} else {
@ -357,7 +358,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
if (item.type === "text") {
return {
type: "text",
text: item.text,
text: sanitizeSurrogates(item.text),
};
} else {
return {
@ -391,13 +392,13 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
if (block.text.trim().length === 0) continue;
blocks.push({
type: "text",
text: block.text,
text: sanitizeSurrogates(block.text),
});
} else if (block.type === "thinking") {
if (block.thinking.trim().length === 0) continue;
blocks.push({
type: "thinking",
thinking: block.thinking,
thinking: sanitizeSurrogates(block.thinking),
signature: block.thinkingSignature || "",
});
} else if (block.type === "toolCall") {
@ -422,7 +423,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
toolResults.push({
type: "tool_result",
tool_use_id: sanitizeToolCallId(msg.toolCallId),
content: msg.output,
content: sanitizeSurrogates(msg.output),
is_error: msg.isError,
});
@ -433,7 +434,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
toolResults.push({
type: "tool_result",
tool_use_id: sanitizeToolCallId(nextMsg.toolCallId),
content: nextMsg.output,
content: sanitizeSurrogates(nextMsg.output),
is_error: nextMsg.isError,
});
j++;

View file

@ -22,6 +22,7 @@ import type {
ToolCall,
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -278,7 +279,7 @@ function buildParams(
const config: GenerateContentConfig = {
...(Object.keys(generationConfig).length > 0 && generationConfig),
...(context.systemPrompt && { systemInstruction: context.systemPrompt }),
...(context.systemPrompt && { systemInstruction: sanitizeSurrogates(context.systemPrompt) }),
...(context.tools && context.tools.length > 0 && { tools: convertTools(context.tools) }),
};
@ -323,12 +324,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
if (typeof msg.content === "string") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
parts: [{ text: sanitizeSurrogates(msg.content) }],
});
} else {
const parts: Part[] = msg.content.map((item) => {
if (item.type === "text") {
return { text: item.text };
return { text: sanitizeSurrogates(item.text) };
} else {
return {
inlineData: {
@ -350,12 +351,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
for (const block of msg.content) {
if (block.type === "text") {
parts.push({ text: block.text });
parts.push({ text: sanitizeSurrogates(block.text) });
} else if (block.type === "thinking") {
const thinkingPart: Part = {
thought: true,
thoughtSignature: block.thinkingSignature,
text: block.thinking,
text: sanitizeSurrogates(block.thinking),
};
parts.push(thinkingPart);
} else if (block.type === "toolCall") {
@ -383,7 +384,7 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
id: msg.toolCallId,
name: msg.toolName,
response: {
result: msg.output,
result: sanitizeSurrogates(msg.output),
isError: msg.isError,
},
},

View file

@ -22,6 +22,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -310,7 +311,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
const useDeveloperRole =
model.reasoning && !model.baseUrl.includes("cerebras.ai") && !model.baseUrl.includes("api.x.ai");
const role = useDeveloperRole ? "developer" : "system";
params.push({ role: role, content: context.systemPrompt });
params.push({ role: role, content: sanitizeSurrogates(context.systemPrompt) });
}
for (const msg of transformedMessages) {
@ -318,14 +319,14 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
if (typeof msg.content === "string") {
params.push({
role: "user",
content: msg.content,
content: sanitizeSurrogates(msg.content),
});
} else {
const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => {
if (item.type === "text") {
return {
type: "text",
text: item.text,
text: sanitizeSurrogates(item.text),
} satisfies ChatCompletionContentPartText;
} else {
return {
@ -354,7 +355,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
const textBlocks = msg.content.filter((b) => b.type === "text") as TextContent[];
if (textBlocks.length > 0) {
assistantMsg.content = textBlocks.map((b) => {
return { type: "text", text: b.text };
return { type: "text", text: sanitizeSurrogates(b.text) };
});
}
@ -386,7 +387,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
} else if (msg.role === "toolResult") {
params.push({
role: "tool",
content: msg.output,
content: sanitizeSurrogates(msg.output),
tool_call_id: msg.toolCallId,
});
}

View file

@ -26,6 +26,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -364,7 +365,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
const role = model.reasoning ? "developer" : "system";
messages.push({
role,
content: context.systemPrompt,
content: sanitizeSurrogates(context.systemPrompt),
});
}
@ -373,14 +374,14 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
if (typeof msg.content === "string") {
messages.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
content: [{ type: "input_text", text: sanitizeSurrogates(msg.content) }],
});
} else {
const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
if (item.type === "text") {
return {
type: "input_text",
text: item.text,
text: sanitizeSurrogates(item.text),
} satisfies ResponseInputText;
} else {
return {
@ -414,7 +415,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
output.push({
type: "message",
role: "assistant",
content: [{ type: "output_text", text: textBlock.text, annotations: [] }],
content: [{ type: "output_text", text: sanitizeSurrogates(textBlock.text), annotations: [] }],
status: "completed",
id: textBlock.textSignature || "msg_" + Math.random().toString(36).substring(2, 15),
} satisfies ResponseOutputMessage);
@ -436,7 +437,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
messages.push({
type: "function_call_output",
call_id: msg.toolCallId.split("|")[0],
output: msg.output,
output: sanitizeSurrogates(msg.output),
});
}
}

View file

@ -0,0 +1,25 @@
/**
 * Removes unpaired Unicode surrogate code units from a string.
 *
 * JavaScript strings are sequences of UTF-16 code units, so they may contain
 * lone surrogates: high surrogates (0xD800-0xDBFF) not followed by a low
 * surrogate (0xDC00-0xDFFF), or low surrogates not preceded by a high one.
 * Such ill-formed strings cannot be serialized as valid JSON/UTF-8 and cause
 * errors in many API providers (e.g. Anthropic's "no low surrogate in string").
 *
 * Properly paired surrogates — emoji and other characters outside the Basic
 * Multilingual Plane — are NOT affected by this function.
 *
 * @param text - The text to sanitize
 * @returns The text with every unpaired surrogate removed
 *
 * @example
 * // Valid emoji (properly paired surrogates) are preserved
 * sanitizeSurrogates("Hello 🙈 World") // => "Hello 🙈 World"
 *
 * // An unpaired high surrogate is removed. Note that only the surrogate
 * // itself is dropped — surrounding whitespace is kept, so the two adjacent
 * // spaces remain:
 * const unpaired = String.fromCharCode(0xD83D); // high surrogate without low
 * sanitizeSurrogates(`Text ${unpaired} here`) // => "Text  here"
 */
export function sanitizeSurrogates(text: string): string {
	// A high surrogate is kept only when immediately followed by a low
	// surrogate; a low surrogate only when immediately preceded by a high one.
	// Anything else in the surrogate range is a lone half and is dropped.
	return text.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, "");
}

View file

@ -0,0 +1,367 @@
import { describe, expect, it } from "vitest";
import { getModel } from "../src/models.js";
import { complete } from "../src/stream.js";
import type { Api, Context, Model, OptionsForApi, ToolResultMessage } from "../src/types.js";
/**
* Test for Unicode surrogate pair handling in tool results.
*
* Issue: When tool results contain emoji or other characters outside the Basic Multilingual Plane,
* they may be incorrectly serialized as unpaired surrogates, causing "no low surrogate in string"
* errors when sent to the API provider.
*
* Example error from Anthropic:
* "The request body is not valid JSON: no low surrogate in string: line 1 column 197667"
*/
/**
 * Sends a conversation whose tool result is full of emoji and other non-ASCII
 * text to the given model, then asserts the completion finishes without a
 * serialization error (no "error" stop reason, no error message, non-empty
 * content).
 *
 * @param llm - Model under test
 * @param options - Provider-specific completion options (optional)
 */
async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, options: OptionsForApi<TApi> = {}) {
	// Simulate a tool that returns emoji
	const context: Context = {
		systemPrompt: "You are a helpful assistant.",
		messages: [
			{
				role: "user",
				content: "Use the test tool",
			},
			{
				// Pre-recorded assistant turn that issued the tool call, so the
				// tool result appended below is a legal next message.
				role: "assistant",
				content: [
					{
						type: "toolCall",
						id: "test_1",
						name: "test_tool",
						arguments: {},
					},
				],
				api: llm.api,
				provider: llm.provider,
				model: llm.id,
				// Zeroed usage/cost — the values are irrelevant to this test.
				usage: {
					input: 0,
					output: 0,
					cacheRead: 0,
					cacheWrite: 0,
					cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
				},
				stopReason: "toolUse",
			},
		],
		tools: [
			{
				name: "test_tool",
				description: "A test tool",
				parameters: {} as any,
			},
		],
	};
	// Add tool result with various problematic Unicode characters.
	// NOTE(review): the "Heart:" and "Mathematical symbols:" entries appear to
	// have lost their characters somewhere upstream — confirm against the
	// original fixture.
	const toolResult: ToolResultMessage = {
		role: "toolResult",
		toolCallId: "test_1",
		toolName: "test_tool",
		output: `Test with emoji 🙈 and other characters:
- Monkey emoji: 🙈
- Thumbs up: 👍
- Heart:
- Thinking face: 🤔
- Rocket: 🚀
- Mixed text: Mario Zechner wann? Wo? Bin grad äußersr eventuninformiert 🙈
- Japanese: こんにちは
- Chinese: 你好
- Mathematical symbols:
- Special quotes: "curly" 'quotes'`,
		isError: false,
	};
	context.messages.push(toolResult);
	// Add follow-up user message
	context.messages.push({
		role: "user",
		content: "Summarize the tool result briefly.",
	});
	// This should not throw a surrogate pair error
	const response = await complete(llm, context, options);
	expect(response.stopReason).not.toBe("error");
	expect(response.errorMessage).toBeFalsy();
	expect(response.content.length).toBeGreaterThan(0);
}
/**
 * Replays a real-world LinkedIn tool result — comments containing emoji,
 * umlauts, and apostrophes — and asserts the completion succeeds and yields
 * at least one text block.
 *
 * @param llm - Model under test
 * @param options - Provider-specific completion options (optional)
 */
async function testRealWorldLinkedInData<TApi extends Api>(llm: Model<TApi>, options: OptionsForApi<TApi> = {}) {
	const context: Context = {
		systemPrompt: "You are a helpful assistant.",
		messages: [
			{
				role: "user",
				content: "Use the linkedin tool to get comments",
			},
			{
				// Pre-recorded assistant turn that issued the tool call.
				role: "assistant",
				content: [
					{
						type: "toolCall",
						id: "linkedin_1",
						name: "linkedin_skill",
						arguments: {},
					},
				],
				api: llm.api,
				provider: llm.provider,
				model: llm.id,
				// Zeroed usage/cost — the values are irrelevant to this test.
				usage: {
					input: 0,
					output: 0,
					cacheRead: 0,
					cacheWrite: 0,
					cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
				},
				stopReason: "toolUse",
			},
		],
		tools: [
			{
				name: "linkedin_skill",
				description: "Get LinkedIn comments",
				parameters: {} as any,
			},
		],
	};
	// Real-world tool result from LinkedIn with emoji
	const toolResult: ToolResultMessage = {
		role: "toolResult",
		toolCallId: "linkedin_1",
		toolName: "linkedin_skill",
		output: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
Unanswered Comments: 2
=> {
"comments": [
{
"author": "Matthias Neumayer's graphic link",
"text": "Leider nehmen das viel zu wenige Leute ernst"
},
{
"author": "Matthias Neumayer's graphic link",
"text": "Mario Zechner wann? Wo? Bin grad äußersr eventuninformiert 🙈"
}
]
}`,
		isError: false,
	};
	context.messages.push(toolResult);
	context.messages.push({
		role: "user",
		content: "How many comments are there?",
	});
	// This should not throw a surrogate pair error
	const response = await complete(llm, context, options);
	expect(response.stopReason).not.toBe("error");
	expect(response.errorMessage).toBeFalsy();
	expect(response.content.some((b) => b.type === "text")).toBe(true);
}
/**
 * Verifies that a tool result containing a lone high surrogate (0xD83D with no
 * trailing low surrogate) does not break the provider call: the surrogate must
 * be sanitized before the request is serialized, so the completion finishes
 * without an error.
 *
 * @param llm - Model under test
 * @param options - Provider-specific completion options (optional)
 */
async function testUnpairedHighSurrogate<TApi extends Api>(llm: Model<TApi>, options: OptionsForApi<TApi> = {}) {
	// A high surrogate with no matching low surrogate — invalid on its own.
	// This simulates what might happen if text processing corrupts emoji.
	const loneHighSurrogate = String.fromCharCode(0xd83d);

	// Zeroed usage/cost record for the pre-recorded assistant turn; the
	// values are irrelevant to this test.
	const zeroUsage = {
		input: 0,
		output: 0,
		cacheRead: 0,
		cacheWrite: 0,
		cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
	};

	// Build the entire conversation up front: user request, assistant tool
	// call, the corrupted tool result, and a follow-up user question.
	const context: Context = {
		systemPrompt: "You are a helpful assistant.",
		messages: [
			{ role: "user", content: "Use the test tool" },
			{
				role: "assistant",
				content: [{ type: "toolCall", id: "test_2", name: "test_tool", arguments: {} }],
				api: llm.api,
				provider: llm.provider,
				model: llm.id,
				usage: zeroUsage,
				stopReason: "toolUse",
			},
			{
				role: "toolResult",
				toolCallId: "test_2",
				toolName: "test_tool",
				output: `Text with unpaired surrogate: ${loneHighSurrogate} <- should be sanitized`,
				isError: false,
			},
			{ role: "user", content: "What did the tool return?" },
		],
		tools: [{ name: "test_tool", description: "A test tool", parameters: {} as any }],
	};

	// The unpaired surrogate must be stripped before submission, so the
	// request must not fail with a surrogate pair serialization error.
	const response = await complete(llm, context, options);
	expect(response.stopReason).not.toBe("error");
	expect(response.errorMessage).toBeFalsy();
	expect(response.content.length).toBeGreaterThan(0);
}
// One suite per provider. Each suite is skipped unless the matching API key
// (or token) is present in the environment, so the file degrades gracefully
// in CI runs without credentials. All suites run the same three scenarios:
// emoji in tool results, a real-world LinkedIn fixture, and an intentionally
// unpaired high surrogate.
describe("AI Providers Unicode Surrogate Pair Tests", () => {
	describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider Unicode Handling", () => {
		const llm = getModel("google", "gemini-2.5-flash");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider Unicode Handling", () => {
		const llm = getModel("openai", "gpt-4o-mini");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider Unicode Handling", () => {
		const llm = getModel("openai", "gpt-5-mini");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	// NOTE(review): gated on ANTHROPIC_OAUTH_TOKEN rather than an API key —
	// confirm this matches how the Anthropic provider authenticates here.
	describe.skipIf(!process.env.ANTHROPIC_OAUTH_TOKEN)("Anthropic Provider Unicode Handling", () => {
		const llm = getModel("anthropic", "claude-3-5-haiku-20241022");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider Unicode Handling", () => {
		const llm = getModel("xai", "grok-3");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	describe.skipIf(!process.env.GROQ_API_KEY)("Groq Provider Unicode Handling", () => {
		const llm = getModel("groq", "openai/gpt-oss-20b");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	describe.skipIf(!process.env.CEREBRAS_API_KEY)("Cerebras Provider Unicode Handling", () => {
		const llm = getModel("cerebras", "gpt-oss-120b");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
	describe.skipIf(!process.env.ZAI_API_KEY)("zAI Provider Unicode Handling", () => {
		const llm = getModel("zai", "glm-4.5-air");
		it("should handle emoji in tool results", async () => {
			await testEmojiInToolResults(llm);
		});
		it("should handle real-world LinkedIn comment data with emoji", async () => {
			await testRealWorldLinkedInData(llm);
		});
		it("should handle unpaired high surrogate (0xD83D) in tool results", async () => {
			await testUnpairedHighSurrogate(llm);
		});
	});
});