From f29752ac82364584af426894771fddd933251e02 Mon Sep 17 00:00:00 2001 From: Mario Zechner Date: Sun, 31 Aug 2025 19:32:12 +0200 Subject: [PATCH] refactor(ai): Update API to support multiple thinking and text blocks BREAKING CHANGE: AssistantMessage now uses content array instead of separate fields - Changed AssistantMessage.content from string to array of content blocks - Removed separate thinking, toolCalls, and signature fields - Content blocks can be TextContent, ThinkingContent, or ToolCall types - Updated streaming events to include start/end events for text and thinking - Fixed multiTurn test to handle new content structure Note: Currently only Anthropic provider is updated to work with new API Other providers need to be updated to match the new interface --- packages/ai/src/providers/anthropic.ts | 122 ++++++++--------- packages/ai/src/types.ts | 78 +++++------ packages/ai/test/providers.test.ts | 174 +++++++++++-------------- 3 files changed, 167 insertions(+), 207 deletions(-) diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts index a7790045..5291dcfe 100644 --- a/packages/ai/src/providers/anthropic.ts +++ b/packages/ai/src/providers/anthropic.ts @@ -6,17 +6,7 @@ import type { Tool, } from "@anthropic-ai/sdk/resources/messages.js"; import { calculateCost } from "../models.js"; -import type { - AssistantMessage, - Context, - LLM, - LLMOptions, - Message, - Model, - StopReason, - ToolCall, - Usage, -} from "../types.js"; +import type { AssistantMessage, Context, LLM, LLMOptions, Message, Model, StopReason, Usage } from "../types.js"; export interface AnthropicLLMOptions extends LLMOptions { thinking?: { @@ -130,63 +120,65 @@ export class AnthropicLLM implements LLM { ); let blockType: "text" | "thinking" | "other" = "other"; + let blockContent = ""; for await (const event of stream) { if (event.type === "content_block_start") { if (event.content_block.type === "text") { blockType = "text"; + blockContent = ""; + options?.onEvent?.({ type: "text_start" }); } else if (event.content_block.type === "thinking") { blockType = "thinking"; + blockContent = ""; + options?.onEvent?.({ type: "thinking_start" }); } else { blockType = "other"; + blockContent = ""; } } if (event.type === "content_block_delta") { if (event.delta.type === "text_delta") { - options?.onText?.(event.delta.text, false); - blockType = "text"; // Ensure block type is set + options?.onEvent?.({ type: "text_delta", content: blockContent, delta: event.delta.text }); + blockContent += event.delta.text; } if (event.delta.type === "thinking_delta") { - options?.onThinking?.(event.delta.thinking, false); - blockType = "thinking"; // Ensure block type is set + options?.onEvent?.({ type: "thinking_delta", content: blockContent, delta: event.delta.thinking }); + blockContent += event.delta.thinking; } } if (event.type === "content_block_stop") { if (blockType === "text") { - options?.onText?.("", true); + options?.onEvent?.({ type: "text_end", content: blockContent }); } else if (blockType === "thinking") { - options?.onThinking?.("", true); + options?.onEvent?.({ type: "thinking_end", content: blockContent }); } blockType = "other"; } } const msg = await stream.finalMessage(); - const thinking = msg.content.some((block) => block.type === "thinking") - ? msg.content - .filter((block) => block.type === "thinking") - .map((block) => block.thinking) - .join("\n") - : undefined; - // This is kinda wrong if there is more than one thinking block. We do not use interleaved thinking though, so we should - // always have a single thinking block. - const thinkingSignature = msg.content.some((block) => block.type === "thinking") - ? msg.content - .filter((block) => block.type === "thinking") - .map((block) => block.signature) - .join("\n") - : undefined; - const content = msg.content.some((block) => block.type === "text") - ? msg.content - .filter((block) => block.type === "text") - .map((block) => block.text) - .join("\n") - : undefined; - const toolCalls: ToolCall[] = msg.content - .filter((block) => block.type === "tool_use") - .map((block) => ({ - id: block.id, - name: block.name, - arguments: block.input as Record, - })); + const blocks: AssistantMessage["content"] = []; + for (const block of msg.content) { + if (block.type === "text" && block.text) { + blocks.push({ + type: "text", + text: block.text, + }); + } else if (block.type === "thinking" && block.thinking) { + blocks.push({ + type: "thinking", + thinking: block.thinking, + thinkingSignature: block.signature, + }); + } else if (block.type === "tool_use") { + blocks.push({ + type: "toolCall", + id: block.id, + name: block.name, + arguments: block.input as Record, + }); + } + } + const usage: Usage = { input: msg.usage.input_tokens, output: msg.usage.output_tokens, @@ -204,10 +196,7 @@ export class AnthropicLLM implements LLM { return { role: "assistant", - content, - thinking, - thinkingSignature, - toolCalls, + content: blocks, provider: this.modelInfo.provider, model: this.modelInfo.id, usage, @@ -216,6 +205,7 @@ export class AnthropicLLM implements LLM { } catch (error) { return { role: "assistant", + content: [], provider: this.modelInfo.provider, model: this.modelInfo.id, usage: { @@ -270,28 +260,24 @@ export class AnthropicLLM implements LLM { } else if (msg.role === "assistant") { const blocks: ContentBlockParam[] = []; - if (msg.thinking && msg.thinkingSignature) { - blocks.push({ - type: "thinking", - thinking: msg.thinking, - signature: msg.thinkingSignature, - }); - } - - if (msg.content) { - blocks.push({ - type: "text", - text: msg.content, - }); - } - - if (msg.toolCalls) { - for (const toolCall of msg.toolCalls) { + for (const block of msg.content) { + if (block.type === "text") { + blocks.push({ + type: "text", + text: block.text, + }); + } else if (block.type === "thinking") { + blocks.push({ + type: "thinking", + thinking: block.thinking, + signature: block.thinkingSignature || "", + }); + } else if (block.type === "toolCall") { blocks.push({ type: "tool_use", - id: toolCall.id, - name: toolCall.name, - input: toolCall.arguments, + id: block.id, + name: block.name, + input: block.arguments, }); } } diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 4c75089a..2011da5a 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1,8 +1,7 @@ export interface LLMOptions { temperature?: number; maxTokens?: number; - onText?: (text: string, complete: boolean) => void; - onThinking?: (thinking: string, complete: boolean) => void; + onEvent?: (event: AssistantMessageEvent) => void; signal?: AbortSignal; } @@ -14,6 +13,13 @@ export interface LLM { export interface TextContent { type: "text"; text: string; + textSignature?: string; // e.g., for OpenAI responses, the message ID +} + +export interface ThinkingContent { + type: "thinking"; + thinking: string; + thinkingSignature?: string; // e.g., for OpenAI responses, the reasoning item ID } export interface ImageContent { @@ -22,6 +28,29 @@ export interface ImageContent { mimeType: string; // e.g., "image/jpeg", "image/png" } +export interface ToolCall { + type: "toolCall"; + id: string; + name: string; + arguments: Record; +} + +export interface Usage { + input: number; + output: number; + cacheRead: number; + cacheWrite: number; + cost: { + input: number; + output: number; + cacheRead: number; + cacheWrite: number; + total: number; + }; +} + +export type StopReason = "stop" | "length" | "toolUse" | "safety" | "error"; + export interface UserMessage { role: "user"; content: string | (TextContent | ImageContent)[]; @@ -29,18 +58,7 @@ export interface UserMessage { export interface AssistantMessage { role: "assistant"; - thinking?: string; - // Leaky abstraction: provider specific, does not translate to other providers - thinkingSignature?: string; - content?: string; - // Leaky abstraction: provider specific, does not translate to other providers - // e.g. OpenAI responses must include id for assistant responses - contentSignature?: string; - toolCalls?: { - id: string; - name: string; - arguments: Record; - }[]; + content: (TextContent | ThinkingContent | ToolCall)[]; provider: string; model: string; usage: Usage; @@ -70,37 +88,19 @@ export interface Context { tools?: Tool[]; } -export type Event = +export type AssistantMessageEvent = | { type: "start"; model: string; provider: string } - | { type: "text"; content: string; delta: string } - | { type: "thinking"; content: string; delta: string } + | { type: "text_start" } + | { type: "text_delta"; content: string; delta: string } + | { type: "text_end"; content: string } + | { type: "thinking_start" } + | { type: "thinking_delta"; content: string; delta: string } + | { type: "thinking_end"; content: string } | { type: "toolCall"; toolCall: ToolCall } | { type: "usage"; usage: Usage } | { type: "done"; reason: StopReason; message: AssistantMessage } | { type: "error"; error: Error }; -export interface ToolCall { - id: string; - name: string; - arguments: Record; -} - -export interface Usage { - input: number; - output: number; - cacheRead: number; - cacheWrite: number; - cost: { - input: number; - output: number; - cacheRead: number; - cacheWrite: number; - total: number; - }; -} - -export type StopReason = "stop" | "length" | "toolUse" | "safety" | "error"; - // Model interface for the unified model system export interface Model { id: string; diff --git a/packages/ai/test/providers.test.ts b/packages/ai/test/providers.test.ts index d2ac09ac..36dd28d7 100644 --- a/packages/ai/test/providers.test.ts +++ b/packages/ai/test/providers.test.ts @@ -47,7 +47,7 @@ async function basicTextGeneration(llm: LLM) { expect(response.usage.input).toBeGreaterThan(0); expect(response.usage.output).toBeGreaterThan(0); expect(response.error).toBeFalsy(); - expect(response.content).toContain("Hello test successful"); + expect(response.content.map(b => b.type == "text" ? b.text : "").join("\n")).toContain("Hello test successful"); context.messages.push(response); context.messages.push({ role: "user", content: "Now say 'Goodbye test successful'" }); @@ -59,7 +59,7 @@ async function basicTextGeneration(llm: LLM) { expect(secondResponse.usage.input).toBeGreaterThan(0); expect(secondResponse.usage.output).toBeGreaterThan(0); expect(secondResponse.error).toBeFalsy(); - expect(secondResponse.content).toContain("Goodbye test successful"); + expect(secondResponse.content.map(b => b.type == "text" ? b.text : "").join("\n")).toContain("Goodbye test successful"); } async function handleToolCall(llm: LLM) { @@ -74,14 +74,14 @@ async function handleToolCall(llm: LLM) { const response = await llm.complete(context); expect(response.stopReason).toBe("toolUse"); - expect(response.toolCalls).toBeTruthy(); - expect(response.toolCalls!.length).toBeGreaterThan(0); - const toolCall = response.toolCalls![0]; + expect(response.content.some(b => b.type == "toolCall")).toBeTruthy(); + const toolCall = response.content.find(b => b.type == "toolCall")!; expect(toolCall.name).toBe("calculator"); expect(toolCall.id).toBeTruthy(); } async function handleStreaming(llm: LLM) { + let textStarted = false; let textChunks = ""; let textCompleted = false; @@ -90,37 +90,50 @@ async function handleStreaming(llm: LLM) { }; const response = await llm.complete(context, { - onText: (chunk, complete) => { - textChunks += chunk; - if (complete) textCompleted = true; + onEvent: (event) => { + if (event.type === "text_start") { + textStarted = true; + } else if (event.type === "text_delta") { + textChunks += event.delta; + } else if (event.type === "text_end") { + textCompleted = true; + } } } as T); + expect(textStarted).toBe(true); expect(textChunks.length).toBeGreaterThan(0); expect(textCompleted).toBe(true); - expect(response.content).toBeTruthy(); + expect(response.content.some(b => b.type == "text")).toBeTruthy(); } -async function handleThinking(llm: LLM, options: T, requireThinking: boolean = true) { +async function handleThinking(llm: LLM, options: T) { + let thinkingStarted = false; let thinkingChunks = ""; + let thinkingCompleted = false; const context: Context = { messages: [{ role: "user", content: "What is 15 + 27? Think step by step." }] }; const response = await llm.complete(context, { - onThinking: (chunk) => { - thinkingChunks += chunk; + onEvent: (event) => { + if (event.type === "thinking_start") { + thinkingStarted = true; + } else if (event.type === "thinking_delta") { + thinkingChunks += event.delta; + } else if (event.type === "thinking_end") { + thinkingCompleted = true; + } }, ...options }); - expect(response.content).toBeTruthy(); - // For providers that should always return thinking when enabled - if (requireThinking) { - expect(thinkingChunks.length > 0 || !!response.thinking).toBe(true); - } + expect(thinkingStarted).toBe(true); + expect(thinkingChunks.length).toBeGreaterThan(0); + expect(thinkingCompleted).toBe(true); + expect(response.content.some(b => b.type == "thinking")).toBeTruthy(); } async function handleImage(llm: LLM) { @@ -157,8 +170,8 @@ async function handleImage(llm: LLM) { const response = await llm.complete(context); // Check the response mentions red and circle - expect(response.content).toBeTruthy(); - const lowerContent = response.content?.toLowerCase() || ""; + expect(response.content.length > 0).toBeTruthy(); + const lowerContent = response.content.find(b => b.type == "text")?.text || ""; expect(lowerContent).toContain("red"); expect(lowerContent).toContain("circle"); } @@ -175,74 +188,33 @@ async function multiTurn(llm: LLM, thinkingOptions: T) tools: [calculatorTool] }; - // First turn - should get thinking and/or tool calls - const firstResponse = await llm.complete(context, thinkingOptions); - - // Verify we got either thinking content or tool calls (or both) - const hasThinking = firstResponse.thinking !== undefined && firstResponse.thinking.length > 0; - const hasToolCalls = firstResponse.toolCalls && firstResponse.toolCalls.length > 0; - - expect(hasThinking || hasToolCalls).toBe(true); - - // If we got tool calls, verify they're correct - if (hasToolCalls) { - expect(firstResponse.toolCalls).toBeTruthy(); - expect(firstResponse.toolCalls!.length).toBeGreaterThan(0); - } - - // If we have thinking with tool calls, we should have thinkingSignature for proper multi-turn context - // Note: Some providers may not return thinking when tools are used - if (firstResponse.thinking && hasToolCalls) { - // For now, we'll just check if it exists when both are present - // Some providers may not support thinkingSignature yet - if (firstResponse.thinkingSignature !== undefined) { - expect(firstResponse.thinkingSignature).toBeTruthy(); - } - } - - // Add the assistant response to context - context.messages.push(firstResponse); - - // Process tool calls and add results - for (const toolCall of firstResponse.toolCalls || []) { - expect(toolCall.name).toBe("calculator"); - expect(toolCall.id).toBeTruthy(); - expect(toolCall.arguments).toBeTruthy(); - - const { a, b, operation } = toolCall.arguments; - let result: number; - switch (operation) { - case "add": result = a + b; break; - case "multiply": result = a * b; break; - default: result = 0; - } - - context.messages.push({ - role: "toolResult", - content: `${result}`, - toolCallId: toolCall.id, - isError: false - }); - } - - // Second turn - complete the conversation - // Keep processing until we get a response with content (not just tool calls) - let finalResponse: AssistantMessage | undefined; - const maxTurns = 3; // Prevent infinite loops + // Collect all text content from all assistant responses + let allTextContent = ""; + let hasSeenThinking = false; + let hasSeenToolCalls = false; + const maxTurns = 5; // Prevent infinite loops for (let turn = 0; turn < maxTurns; turn++) { const response = await llm.complete(context, thinkingOptions); + + // Add the assistant response to context context.messages.push(response); - if (response.stopReason === "stop" && response.content) { - finalResponse = response; - break; - } + // Process content blocks + for (const block of response.content) { + if (block.type === "text") { + allTextContent += block.text + " "; + } else if (block.type === "thinking") { + hasSeenThinking = true; + } else if (block.type === "toolCall") { + hasSeenToolCalls = true; - // If we got more tool calls, process them - if (response.toolCalls) { - for (const toolCall of response.toolCalls) { - const { a, b, operation } = toolCall.arguments; + // Process the tool call + expect(block.name).toBe("calculator"); + expect(block.id).toBeTruthy(); + expect(block.arguments).toBeTruthy(); + + const { a, b, operation } = block.arguments; let result: number; switch (operation) { case "add": result = a + b; break; @@ -250,24 +222,30 @@ async function multiTurn(llm: LLM, thinkingOptions: T) default: result = 0; } + // Add tool result to context context.messages.push({ role: "toolResult", content: `${result}`, - toolCallId: toolCall.id, + toolCallId: block.id, isError: false }); } } + + // If we got a stop response with text content, we're likely done + expect(response.stopReason).not.toBe("error"); + if (response.stopReason === "stop") { + break; + } } - expect(finalResponse).toBeTruthy(); - expect(finalResponse!.content).toBeTruthy(); - expect(finalResponse!.role).toBe("assistant"); + // Verify we got either thinking content or tool calls (or both) + expect(hasSeenThinking || hasSeenToolCalls).toBe(true); - // The final response should reference the calculations - expect( - finalResponse!.content!.includes("714") || finalResponse!.content!.includes("887") - ).toBe(true); + // The accumulated text should reference both calculations + expect(allTextContent).toBeTruthy(); + expect(allTextContent.includes("714")).toBe(true); + expect(allTextContent.includes("887")).toBe(true); } describe("AI Providers E2E Tests", () => { @@ -343,7 +321,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -407,7 +385,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -435,7 +413,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -463,7 +441,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -491,7 +469,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -589,7 +567,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -617,7 +595,7 @@ describe("AI Providers E2E Tests", () => { }); it("should handle thinking mode", async () => { - await handleThinking(llm, {reasoningEffort: "medium"}, false); + await handleThinking(llm, {reasoningEffort: "medium"}); }); it("should handle multi-turn with thinking and tools", async () => { @@ -644,10 +622,6 @@ describe("AI Providers E2E Tests", () => { await handleStreaming(llm); }); - it("should handle thinking mode", async () => { - await handleThinking(llm, {thinking: {enabled: true}}, false); - }); - it("should handle multi-turn with thinking and tools", async () => { await multiTurn(llm, {thinking: {enabled: true}}); });