Add Unicode surrogate sanitization for all providers

Fixes an issue where unpaired Unicode surrogates in tool results cause JSON serialization errors in API providers, particularly Anthropic.

- Add sanitizeSurrogates() utility function to remove unpaired surrogates
- Apply sanitization in all provider convertMessages() functions:
  - User message text content (string and text blocks)
  - Assistant message text and thinking blocks
  - Tool result output
  - System prompts
- Valid emoji (properly paired surrogates) are preserved
- Add comprehensive test suite covering all 8 providers

Previously only Google and Groq handled unpaired surrogates correctly.
Now all providers (Anthropic, OpenAI Completions/Responses, Google, xAI, Groq, Cerebras, zAI) sanitize text before API submission.
This commit is contained in:
Mario Zechner 2025-10-13 14:26:54 +02:00
parent 949cd4efd8
commit 4e7a340460
6 changed files with 420 additions and 24 deletions

View file

@ -22,6 +22,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -284,7 +285,7 @@ function buildParams(
if (context.systemPrompt) {
params.system.push({
type: "text",
text: context.systemPrompt,
text: sanitizeSurrogates(context.systemPrompt),
cache_control: {
type: "ephemeral",
},
@ -295,7 +296,7 @@ function buildParams(
params.system = [
{
type: "text",
text: context.systemPrompt,
text: sanitizeSurrogates(context.systemPrompt),
cache_control: {
type: "ephemeral",
},
@ -349,7 +350,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
if (msg.content.trim().length > 0) {
params.push({
role: "user",
content: msg.content,
content: sanitizeSurrogates(msg.content),
});
}
} else {
@ -357,7 +358,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
if (item.type === "text") {
return {
type: "text",
text: item.text,
text: sanitizeSurrogates(item.text),
};
} else {
return {
@ -391,13 +392,13 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
if (block.text.trim().length === 0) continue;
blocks.push({
type: "text",
text: block.text,
text: sanitizeSurrogates(block.text),
});
} else if (block.type === "thinking") {
if (block.thinking.trim().length === 0) continue;
blocks.push({
type: "thinking",
thinking: block.thinking,
thinking: sanitizeSurrogates(block.thinking),
signature: block.thinkingSignature || "",
});
} else if (block.type === "toolCall") {
@ -422,7 +423,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
toolResults.push({
type: "tool_result",
tool_use_id: sanitizeToolCallId(msg.toolCallId),
content: msg.output,
content: sanitizeSurrogates(msg.output),
is_error: msg.isError,
});
@ -433,7 +434,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
toolResults.push({
type: "tool_result",
tool_use_id: sanitizeToolCallId(nextMsg.toolCallId),
content: nextMsg.output,
content: sanitizeSurrogates(nextMsg.output),
is_error: nextMsg.isError,
});
j++;

View file

@ -22,6 +22,7 @@ import type {
ToolCall,
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -278,7 +279,7 @@ function buildParams(
const config: GenerateContentConfig = {
...(Object.keys(generationConfig).length > 0 && generationConfig),
...(context.systemPrompt && { systemInstruction: context.systemPrompt }),
...(context.systemPrompt && { systemInstruction: sanitizeSurrogates(context.systemPrompt) }),
...(context.tools && context.tools.length > 0 && { tools: convertTools(context.tools) }),
};
@ -323,12 +324,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
if (typeof msg.content === "string") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
parts: [{ text: sanitizeSurrogates(msg.content) }],
});
} else {
const parts: Part[] = msg.content.map((item) => {
if (item.type === "text") {
return { text: item.text };
return { text: sanitizeSurrogates(item.text) };
} else {
return {
inlineData: {
@ -350,12 +351,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
for (const block of msg.content) {
if (block.type === "text") {
parts.push({ text: block.text });
parts.push({ text: sanitizeSurrogates(block.text) });
} else if (block.type === "thinking") {
const thinkingPart: Part = {
thought: true,
thoughtSignature: block.thinkingSignature,
text: block.thinking,
text: sanitizeSurrogates(block.thinking),
};
parts.push(thinkingPart);
} else if (block.type === "toolCall") {
@ -383,7 +384,7 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
id: msg.toolCallId,
name: msg.toolName,
response: {
result: msg.output,
result: sanitizeSurrogates(msg.output),
isError: msg.isError,
},
},

View file

@ -22,6 +22,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -310,7 +311,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
const useDeveloperRole =
model.reasoning && !model.baseUrl.includes("cerebras.ai") && !model.baseUrl.includes("api.x.ai");
const role = useDeveloperRole ? "developer" : "system";
params.push({ role: role, content: context.systemPrompt });
params.push({ role: role, content: sanitizeSurrogates(context.systemPrompt) });
}
for (const msg of transformedMessages) {
@ -318,14 +319,14 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
if (typeof msg.content === "string") {
params.push({
role: "user",
content: msg.content,
content: sanitizeSurrogates(msg.content),
});
} else {
const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => {
if (item.type === "text") {
return {
type: "text",
text: item.text,
text: sanitizeSurrogates(item.text),
} satisfies ChatCompletionContentPartText;
} else {
return {
@ -354,7 +355,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
const textBlocks = msg.content.filter((b) => b.type === "text") as TextContent[];
if (textBlocks.length > 0) {
assistantMsg.content = textBlocks.map((b) => {
return { type: "text", text: b.text };
return { type: "text", text: sanitizeSurrogates(b.text) };
});
}
@ -386,7 +387,7 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
} else if (msg.role === "toolResult") {
params.push({
role: "tool",
content: msg.output,
content: sanitizeSurrogates(msg.output),
tool_call_id: msg.toolCallId,
});
}

View file

@ -26,6 +26,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -364,7 +365,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
const role = model.reasoning ? "developer" : "system";
messages.push({
role,
content: context.systemPrompt,
content: sanitizeSurrogates(context.systemPrompt),
});
}
@ -373,14 +374,14 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
if (typeof msg.content === "string") {
messages.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
content: [{ type: "input_text", text: sanitizeSurrogates(msg.content) }],
});
} else {
const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
if (item.type === "text") {
return {
type: "input_text",
text: item.text,
text: sanitizeSurrogates(item.text),
} satisfies ResponseInputText;
} else {
return {
@ -414,7 +415,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
output.push({
type: "message",
role: "assistant",
content: [{ type: "output_text", text: textBlock.text, annotations: [] }],
content: [{ type: "output_text", text: sanitizeSurrogates(textBlock.text), annotations: [] }],
status: "completed",
id: textBlock.textSignature || "msg_" + Math.random().toString(36).substring(2, 15),
} satisfies ResponseOutputMessage);
@ -436,7 +437,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
messages.push({
type: "function_call_output",
call_id: msg.toolCallId.split("|")[0],
output: msg.output,
output: sanitizeSurrogates(msg.output),
});
}
}

View file

@ -0,0 +1,25 @@
/**
 * Strips lone (unpaired) UTF-16 surrogate code units out of a string.
 *
 * A surrogate is considered paired only when a high surrogate (0xD800-0xDBFF) is
 * immediately followed by a low surrogate (0xDC00-0xDFFF). Any high surrogate without
 * such a follower, and any low surrogate without such a predecessor, is dropped.
 * Lone surrogates cause JSON serialization errors in many API providers, which is
 * why text is passed through this function before being sent.
 *
 * Well-formed pairs (emoji and other characters outside the Basic Multilingual
 * Plane) pass through untouched, as does every non-surrogate code unit.
 *
 * @param text - The text to sanitize
 * @returns A copy of `text` without lone surrogates; the input itself when it has none
 *
 * @example
 * // A correctly paired emoji survives
 * sanitizeSurrogates("Hello 🙈 World") // => "Hello 🙈 World"
 *
 * // A lone high surrogate is removed; the characters around it are kept as-is,
 * // so both surrounding spaces remain
 * const lone = String.fromCharCode(0xD83D);
 * sanitizeSurrogates(`Text ${lone} here`) // => "Text  here"
 */
export function sanitizeSurrogates(text: string): string {
	const segments: string[] = [];
	let segmentStart = 0;
	let pos = 0;

	while (pos < text.length) {
		const unit = text.charCodeAt(pos);
		const isHigh = unit >= 0xd800 && unit <= 0xdbff;
		const isLow = unit >= 0xdc00 && unit <= 0xdfff;

		if (isHigh) {
			// charCodeAt past the end yields NaN, which fails both range comparisons.
			const following = text.charCodeAt(pos + 1);
			if (following >= 0xdc00 && following <= 0xdfff) {
				// Well-formed pair: keep both code units and step over them together.
				pos += 2;
				continue;
			}
		}

		if (isHigh || isLow) {
			// Lone surrogate. A low surrogate can only be reached here when the code
			// unit before it was not a high surrogate; otherwise the pair branch
			// above would already have consumed it.
			segments.push(text.slice(segmentStart, pos));
			segmentStart = pos + 1;
		}

		pos += 1;
	}

	if (segmentStart === 0) {
		// Nothing was dropped, so the original string is already clean.
		return text;
	}

	segments.push(text.slice(segmentStart));
	return segments.join("");
}