Add Unicode surrogate sanitization for all providers

Fixes an issue where unpaired Unicode surrogates in tool results cause JSON serialization errors in API providers, particularly Anthropic.

- Add sanitizeSurrogates() utility function to remove unpaired surrogates
- Apply sanitization in all provider convertMessages() functions:
  - User message text content (string and text blocks)
  - Assistant message text and thinking blocks
  - Tool result output
  - System prompts
- Valid emoji (properly paired surrogates) are preserved
- Add comprehensive test suite covering all 8 providers

Previously, only Google and Groq handled unpaired surrogates correctly.
Now all providers (Anthropic, OpenAI Completions/Responses, Google, xAI, Groq, Cerebras, zAI) sanitize text before API submission.
This commit is contained in:
Mario Zechner 2025-10-13 14:26:54 +02:00
parent 949cd4efd8
commit 4e7a340460
6 changed files with 420 additions and 24 deletions

View file

@@ -22,6 +22,7 @@ import type {
ToolCall,
} from "../types.js";
import { AssistantMessageEventStream } from "../utils/event-stream.js";
import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
import { validateToolArguments } from "../utils/validation.js";
import { transformMessages } from "./transorm-messages.js";
@@ -278,7 +279,7 @@ function buildParams(
const config: GenerateContentConfig = {
...(Object.keys(generationConfig).length > 0 && generationConfig),
...(context.systemPrompt && { systemInstruction: context.systemPrompt }),
...(context.systemPrompt && { systemInstruction: sanitizeSurrogates(context.systemPrompt) }),
...(context.tools && context.tools.length > 0 && { tools: convertTools(context.tools) }),
};
@@ -323,12 +324,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
if (typeof msg.content === "string") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
parts: [{ text: sanitizeSurrogates(msg.content) }],
});
} else {
const parts: Part[] = msg.content.map((item) => {
if (item.type === "text") {
return { text: item.text };
return { text: sanitizeSurrogates(item.text) };
} else {
return {
inlineData: {
@@ -350,12 +351,12 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
for (const block of msg.content) {
if (block.type === "text") {
parts.push({ text: block.text });
parts.push({ text: sanitizeSurrogates(block.text) });
} else if (block.type === "thinking") {
const thinkingPart: Part = {
thought: true,
thoughtSignature: block.thinkingSignature,
text: block.thinking,
text: sanitizeSurrogates(block.thinking),
};
parts.push(thinkingPart);
} else if (block.type === "toolCall") {
@@ -383,7 +384,7 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
id: msg.toolCallId,
name: msg.toolName,
response: {
result: msg.output,
result: sanitizeSurrogates(msg.output),
isError: msg.isError,
},
},