Add Unicode surrogate sanitization for all providers

Fixes an issue where unpaired Unicode surrogates in tool results cause JSON serialization errors in API providers, particularly Anthropic.

- Add sanitizeSurrogates() utility function to remove unpaired surrogates
- Apply sanitization in all provider convertMessages() functions:
  - User message text content (string and text blocks)
  - Assistant message text and thinking blocks
  - Tool result output
  - System prompts
- Valid emoji (properly paired surrogates) are preserved
- Add comprehensive test suite covering all 8 providers

Previously, only Google and Groq handled unpaired surrogates correctly.
Now all providers (Anthropic, OpenAI Completions/Responses, Google, xAI, Groq, Cerebras, zAI) sanitize text before API submission.
This commit is contained in:
Mario Zechner 2025-10-13 14:26:54 +02:00
parent 949cd4efd8
commit 4e7a340460
6 changed files with 420 additions and 24 deletions

View file

@ -26,6 +26,7 @@ import type {
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { parseStreamingJson } from "../utils/json-parse.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@ -364,7 +365,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
const role = model.reasoning ? "developer" : "system";
messages.push({
role,
content: context.systemPrompt,
content: sanitizeSurrogates(context.systemPrompt),
});
}
@ -373,14 +374,14 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
if (typeof msg.content === "string") {
messages.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
content: [{ type: "input_text", text: sanitizeSurrogates(msg.content) }],
});
} else {
const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
if (item.type === "text") {
return {
type: "input_text",
text: item.text,
text: sanitizeSurrogates(item.text),
} satisfies ResponseInputText;
} else {
return {
@ -414,7 +415,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
output.push({
type: "message",
role: "assistant",
content: [{ type: "output_text", text: textBlock.text, annotations: [] }],
content: [{ type: "output_text", text: sanitizeSurrogates(textBlock.text), annotations: [] }],
status: "completed",
id: textBlock.textSignature || "msg_" + Math.random().toString(36).substring(2, 15),
} satisfies ResponseOutputMessage);
@ -436,7 +437,7 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
messages.push({
type: "function_call_output",
call_id: msg.toolCallId.split("|")[0],
output: msg.output,
output: sanitizeSurrogates(msg.output),
});
}
}