feat(ai): Add cross-provider message handoff support

- Add transformMessages utility to handle cross-provider compatibility - Convert thinking blocks to <thinking> tagged text when switching providers - Preserve native thinking blocks when staying with same provider/model - Add comprehensive handoff tests verifying all provider combinations - Fix OpenAI Completions to return partial results on abort - Update tool call ID format for Anthropic compatibility - Document cross-provider handoff capabilities in README
2026-04-17 10:02:23 +00:00 · 2025-09-01 18:43:49 +02:00 · 2025-09-01 18:43:49 +02:00 · 46b5800d36
commit 46b5800d36
parent bf1f410c2b
10 changed files with 828 additions and 130 deletions
--- a/packages/ai/test/abort.test.ts
+++ b/packages/ai/test/abort.test.ts
@ -6,22 +6,25 @@ import { AnthropicLLM } from "../src/providers/anthropic.js";
 import type { LLM, LLMOptions, Context } from "../src/types.js";
 import { getModel } from "../src/models.js";

-async function testAbortSignal<T extends LLMOptions>(llm: LLM<T>, options: T) {
-    const controller = new AbortController();
-
-    // Abort after 100ms
-    setTimeout(() => controller.abort(), 5000);
-
+async function testAbortSignal<T extends LLMOptions>(llm: LLM<T>, options: T = {} as T) {
    const context: Context = {
        messages: [{
            role: "user",
-            content: "What is 15 + 27? Think step by step. Then list 100 first names."
+            content: "What is 15 + 27? Think step by step. Then list 50 first names."
        }]
    };

-    const response = await llm.complete(context, {
+    let abortFired = false;
+    const controller = new AbortController();
+    const response = await llm.generate(context, {
        ...options,
-        signal: controller.signal
+        signal: controller.signal,
+        onEvent: (event) => {
+            // console.log(JSON.stringify(event, null, 2));
+            if (abortFired) return;
+            setTimeout(() => controller.abort(), 2000);
+            abortFired = true;
+        }
    });

    // If we get here without throwing, the abort didn't work
@ -29,15 +32,15 @@ async function testAbortSignal<T extends LLMOptions>(llm: LLM<T>, options: T) {
    expect(response.content.length).toBeGreaterThan(0);

    context.messages.push(response);
-    context.messages.push({ role: "user", content: "Please continue." });
+    context.messages.push({ role: "user", content: "Please continue, but only generate 5 names." });

    // Ensure we can still make requests after abort
-    const followUp = await llm.complete(context, options);
+    const followUp = await llm.generate(context, options);
    expect(followUp.stopReason).toBe("stop");
    expect(followUp.content.length).toBeGreaterThan(0);
 }

-async function testImmediateAbort<T extends LLMOptions>(llm: LLM<T>, options: T) {
+async function testImmediateAbort<T extends LLMOptions>(llm: LLM<T>, options: T = {} as T) {
    const controller = new AbortController();

    // Abort immediately
@ -47,7 +50,7 @@ async function testImmediateAbort<T extends LLMOptions>(llm: LLM<T>, options: T)
        messages: [{ role: "user", content: "Hello" }]
    };

-    const response = await llm.complete(context, {
+    const response = await llm.generate(context, {
        ...options,
        signal: controller.signal
    });
@ -75,15 +78,15 @@ describe("AI Providers Abort Tests", () => {
        let llm: OpenAICompletionsLLM;

        beforeAll(() => {
-            llm = new OpenAICompletionsLLM(getModel("openai", "gpt-5-mini")!, process.env.OPENAI_API_KEY!);
+            llm = new OpenAICompletionsLLM(getModel("openai", "gpt-4o-mini")!, process.env.OPENAI_API_KEY!);
        });

        it("should abort mid-stream", async () => {
-            await testAbortSignal(llm, { reasoningEffort: "medium"});
+            await testAbortSignal(llm);
        });

        it("should handle immediate abort", async () => {
-            await testImmediateAbort(llm, { reasoningEffort: "medium" });
+            await testImmediateAbort(llm);
        });
    });

--- a/packages/ai/test/handoff.test.ts
+++ b/packages/ai/test/handoff.test.ts
@ -0,0 +1,503 @@
+import { describe, it, expect, beforeAll } from "vitest";
+import { GoogleLLM } from "../src/providers/google.js";
+import { OpenAICompletionsLLM } from "../src/providers/openai-completions.js";
+import { OpenAIResponsesLLM } from "../src/providers/openai-responses.js";
+import { AnthropicLLM } from "../src/providers/anthropic.js";
+import type { LLM, Context, AssistantMessage, Tool, Message } from "../src/types.js";
+import { getModel } from "../src/models.js";
+
+// Tool for testing
+const weatherTool: Tool = {
+    name: "get_weather",
+    description: "Get the weather for a location",
+    parameters: {
+        type: "object",
+        properties: {
+            location: { type: "string", description: "City name" }
+        },
+        required: ["location"]
+    }
+};
+
+// Pre-built contexts representing typical outputs from each provider
+const providerContexts = {
+    // Anthropic-style message with thinking block
+    anthropic: {
+        message: {
+            role: "assistant",
+            content: [
+                {
+                    type: "thinking",
+                    thinking: "Let me calculate 17 * 23. That's 17 * 20 + 17 * 3 = 340 + 51 = 391",
+                    thinkingSignature: "signature_abc123"
+                },
+                {
+                    type: "text",
+                    text: "I'll help you with the calculation and check the weather. The result of 17 × 23 is 391. The capital of Austria is Vienna. Now let me check the weather for you."
+                },
+                {
+                    type: "toolCall",
+                    id: "toolu_01abc123",
+                    name: "get_weather",
+                    arguments: { location: "Tokyo" }
+                }
+            ],
+            provider: "anthropic",
+            model: "claude-3-5-haiku-latest",
+            usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
+            stopReason: "toolUse"
+        } as AssistantMessage,
+        toolResult: {
+            role: "toolResult" as const,
+            toolCallId: "toolu_01abc123",
+            toolName: "get_weather",
+            content: "Weather in Tokyo: 18°C, partly cloudy",
+            isError: false
+        },
+        facts: {
+            calculation: 391,
+            city: "Tokyo",
+            temperature: 18,
+            capital: "Vienna"
+        }
+    },
+
+    // Google-style message with thinking
+    google: {
+        message: {
+            role: "assistant",
+            content: [
+                {
+                    type: "thinking",
+                    thinking: "I need to multiply 19 * 24. Let me work through this: 19 * 24 = 19 * 20 + 19 * 4 = 380 + 76 = 456",
+                    thinkingSignature: undefined
+                },
+                {
+                    type: "text",
+                    text: "The multiplication of 19 × 24 equals 456. The capital of France is Paris. Let me check the weather in Berlin for you."
+                },
+                {
+                    type: "toolCall",
+                    id: "call_gemini_123",
+                    name: "get_weather",
+                    arguments: { location: "Berlin" }
+                }
+            ],
+            provider: "google",
+            model: "gemini-2.5-flash",
+            usage: { input: 120, output: 60, cacheRead: 0, cacheWrite: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
+            stopReason: "toolUse"
+        } as AssistantMessage,
+        toolResult: {
+            role: "toolResult" as const,
+            toolCallId: "call_gemini_123",
+            toolName: "get_weather",
+            content: "Weather in Berlin: 22°C, sunny",
+            isError: false
+        },
+        facts: {
+            calculation: 456,
+            city: "Berlin",
+            temperature: 22,
+            capital: "Paris"
+        }
+    },
+
+    // OpenAI Completions style (with reasoning_content)
+    openaiCompletions: {
+        message: {
+            role: "assistant",
+            content: [
+                {
+                    type: "thinking",
+                    thinking: "Let me calculate 21 * 25. That's 21 * 25 = 525",
+                    thinkingSignature: "reasoning_content"
+                },
+                {
+                    type: "text",
+                    text: "The result of 21 × 25 is 525. The capital of Spain is Madrid. I'll check the weather in London now."
+                },
+                {
+                    type: "toolCall",
+                    id: "call_abc123",
+                    name: "get_weather",
+                    arguments: { location: "London" }
+                }
+            ],
+            provider: "openai",
+            model: "gpt-4o-mini",
+            usage: { input: 110, output: 55, cacheRead: 0, cacheWrite: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
+            stopReason: "toolUse"
+        } as AssistantMessage,
+        toolResult: {
+            role: "toolResult" as const,
+            toolCallId: "call_abc123",
+            toolName: "get_weather",
+            content: "Weather in London: 15°C, rainy",
+            isError: false
+        },
+        facts: {
+            calculation: 525,
+            city: "London",
+            temperature: 15,
+            capital: "Madrid"
+        }
+    },
+
+    // OpenAI Responses style (with complex tool call IDs)
+    openaiResponses: {
+        message: {
+            role: "assistant",
+            content: [
+                {
+                    type: "thinking",
+                    thinking: "Calculating 18 * 27: 18 * 27 = 486",
+                    thinkingSignature: '{"type":"reasoning","id":"rs_2b2342acdde","summary":[{"type":"summary_text","text":"Calculating 18 * 27: 18 * 27 = 486"}]}'
+                },
+                {
+                    type: "text",
+                    text: "The calculation of 18 × 27 gives us 486. The capital of Italy is Rome. Let me check Sydney's weather.",
+                    textSignature: "msg_response_456"
+                },
+                {
+                    type: "toolCall",
+                    id: "call_789_item_012",  // Anthropic requires alphanumeric, dash, and underscore only
+                    name: "get_weather",
+                    arguments: { location: "Sydney" }
+                }
+            ],
+            provider: "openai",
+            model: "gpt-5-mini",
+            usage: { input: 115, output: 58, cacheRead: 0, cacheWrite: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
+            stopReason: "toolUse"
+        } as AssistantMessage,
+        toolResult: {
+            role: "toolResult" as const,
+            toolCallId: "call_789_item_012",  // Match the updated ID format
+            toolName: "get_weather",
+            content: "Weather in Sydney: 25°C, clear",
+            isError: false
+        },
+        facts: {
+            calculation: 486,
+            city: "Sydney",
+            temperature: 25,
+            capital: "Rome"
+        }
+    },
+
+    // Aborted message (stopReason: 'error')
+    aborted: {
+        message: {
+            role: "assistant",
+            content: [
+                {
+                    type: "thinking",
+                    thinking: "Let me start calculating 20 * 30...",
+                    thinkingSignature: "partial_sig"
+                },
+                {
+                    type: "text",
+                    text: "I was about to calculate 20 × 30 which is"
+                }
+            ],
+            provider: "test",
+            model: "test-model",
+            usage: { input: 50, output: 25, cacheRead: 0, cacheWrite: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
+            stopReason: "error",
+            error: "Request was aborted"
+        } as AssistantMessage,
+        toolResult: null,
+        facts: {
+            calculation: 600,
+            city: "none",
+            temperature: 0,
+            capital: "none"
+        }
+    }
+};
+
+/**
+ * Test that a provider can handle contexts from different sources
+ */
+async function testProviderHandoff(
+    targetProvider: LLM<any>,
+    sourceLabel: string,
+    sourceContext: typeof providerContexts[keyof typeof providerContexts]
+): Promise<boolean> {
+    // Build conversation context
+    const messages: Message[] = [
+        {
+            role: "user",
+            content: "Please do some calculations, tell me about capitals, and check the weather."
+        },
+        sourceContext.message
+    ];
+
+    // Add tool result if present
+    if (sourceContext.toolResult) {
+        messages.push(sourceContext.toolResult);
+    }
+
+    // Ask follow-up question
+    messages.push({
+        role: "user",
+        content: `Based on our conversation, please answer:
+                 1) What was the multiplication result?
+                 2) Which city's weather did we check?
+                 3) What was the temperature?
+                 4) What capital city was mentioned?
+                 Please include the specific numbers and names.`
+    });
+
+    const context: Context = {
+        messages,
+        tools: [weatherTool]
+    };
+
+    try {
+        const response = await targetProvider.generate(context, {});
+
+        // Check for error
+        if (response.stopReason === "error") {
+            console.log(`[${sourceLabel} → ${targetProvider.getModel().provider}] Failed with error: ${response.error}`);
+            return false;
+        }
+
+        // Extract text from response
+        const responseText = response.content
+            .filter(b => b.type === "text")
+            .map(b => b.text)
+            .join(" ")
+            .toLowerCase();
+
+        // For aborted messages, we don't expect to find the facts
+        if (sourceContext.message.stopReason === "error") {
+            const hasToolCalls = response.content.some(b => b.type === "toolCall");
+            const hasThinking = response.content.some(b => b.type === "thinking");
+            const hasText = response.content.some(b => b.type === "text");
+
+            expect(response.stopReason === "stop" || response.stopReason === "toolUse").toBe(true);
+            expect(hasThinking || hasText || hasToolCalls).toBe(true);
+            console.log(`[${sourceLabel} → ${targetProvider.getModel().provider}] Handled aborted message successfully, tool calls: ${hasToolCalls}, thinking: ${hasThinking}, text: ${hasText}`);
+            return true;
+        }
+
+        // Check if response contains our facts
+        const hasCalculation = responseText.includes(sourceContext.facts.calculation.toString());
+        const hasCity = sourceContext.facts.city !== "none" && responseText.includes(sourceContext.facts.city.toLowerCase());
+        const hasTemperature = sourceContext.facts.temperature > 0 && responseText.includes(sourceContext.facts.temperature.toString());
+        const hasCapital = sourceContext.facts.capital !== "none" && responseText.includes(sourceContext.facts.capital.toLowerCase());
+
+        const success = hasCalculation && hasCity && hasTemperature && hasCapital;
+
+        console.log(`[${sourceLabel} → ${targetProvider.getModel().provider}] Handoff test:`);
+        if (!success) {
+            console.log(`  Calculation (${sourceContext.facts.calculation}): ${hasCalculation ? '✓' : '✗'}`);
+            console.log(`  City (${sourceContext.facts.city}): ${hasCity ? '✓' : '✗'}`);
+            console.log(`  Temperature (${sourceContext.facts.temperature}): ${hasTemperature ? '✓' : '✗'}`);
+            console.log(`  Capital (${sourceContext.facts.capital}): ${hasCapital ? '✓' : '✗'}`);
+        } else {
+            console.log(`  ✓ All facts found`);
+        }
+
+        return success;
+    } catch (error) {
+        console.error(`[${sourceLabel} → ${targetProvider.getModel().provider}] Exception:`, error);
+        return false;
+    }
+}
+
+describe("Cross-Provider Handoff Tests", () => {
+    describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider Handoff", () => {
+        let provider: AnthropicLLM;
+
+        beforeAll(() => {
+            const model = getModel("anthropic", "claude-3-5-haiku-20241022");
+            if (model) {
+                provider = new AnthropicLLM(model, process.env.ANTHROPIC_API_KEY!);
+            }
+        });
+
+        it("should handle contexts from all providers", async () => {
+            if (!provider) {
+                console.log("Anthropic provider not available, skipping");
+                return;
+            }
+
+            console.log("\nTesting Anthropic with pre-built contexts:\n");
+
+            const contextTests = [
+                { label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
+                { label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
+                { label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
+                { label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
+                { label: "Aborted", context: providerContexts.aborted, sourceModel: null }
+            ];
+
+            let successCount = 0;
+            let skippedCount = 0;
+
+            for (const { label, context, sourceModel } of contextTests) {
+                // Skip testing same model against itself
+                if (sourceModel && sourceModel === provider.getModel().id) {
+                    console.log(`[${label} → ${provider.getModel().provider}] Skipping same-model test`);
+                    skippedCount++;
+                    continue;
+                }
+                const success = await testProviderHandoff(provider, label, context);
+                if (success) successCount++;
+            }
+
+            const totalTests = contextTests.length - skippedCount;
+            console.log(`\nAnthropic success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
+
+            // All non-skipped handoffs should succeed
+            expect(successCount).toBe(totalTests);
+        });
+    });
+
+    describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider Handoff", () => {
+        let provider: GoogleLLM;
+
+        beforeAll(() => {
+            const model = getModel("google", "gemini-2.5-flash");
+            if (model) {
+                provider = new GoogleLLM(model, process.env.GEMINI_API_KEY!);
+            }
+        });
+
+        it("should handle contexts from all providers", async () => {
+            if (!provider) {
+                console.log("Google provider not available, skipping");
+                return;
+            }
+
+            console.log("\nTesting Google with pre-built contexts:\n");
+
+            const contextTests = [
+                { label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
+                { label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
+                { label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
+                { label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
+                { label: "Aborted", context: providerContexts.aborted, sourceModel: null }
+            ];
+
+            let successCount = 0;
+            let skippedCount = 0;
+
+            for (const { label, context, sourceModel } of contextTests) {
+                // Skip testing same model against itself
+                if (sourceModel && sourceModel === provider.getModel().id) {
+                    console.log(`[${label} → ${provider.getModel().provider}] Skipping same-model test`);
+                    skippedCount++;
+                    continue;
+                }
+                const success = await testProviderHandoff(provider, label, context);
+                if (success) successCount++;
+            }
+
+            const totalTests = contextTests.length - skippedCount;
+            console.log(`\nGoogle success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
+
+            // All non-skipped handoffs should succeed
+            expect(successCount).toBe(totalTests);
+        });
+    });
+
+    describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider Handoff", () => {
+        let provider: OpenAICompletionsLLM;
+
+        beforeAll(() => {
+            const model = getModel("openai", "gpt-4o-mini");
+            if (model) {
+                provider = new OpenAICompletionsLLM(model, process.env.OPENAI_API_KEY!);
+            }
+        });
+
+        it("should handle contexts from all providers", async () => {
+            if (!provider) {
+                console.log("OpenAI Completions provider not available, skipping");
+                return;
+            }
+
+            console.log("\nTesting OpenAI Completions with pre-built contexts:\n");
+
+            const contextTests = [
+                { label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
+                { label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
+                { label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
+                { label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
+                { label: "Aborted", context: providerContexts.aborted, sourceModel: null }
+            ];
+
+            let successCount = 0;
+            let skippedCount = 0;
+
+            for (const { label, context, sourceModel } of contextTests) {
+                // Skip testing same model against itself
+                if (sourceModel && sourceModel === provider.getModel().id) {
+                    console.log(`[${label} → ${provider.getModel().provider}] Skipping same-model test`);
+                    skippedCount++;
+                    continue;
+                }
+                const success = await testProviderHandoff(provider, label, context);
+                if (success) successCount++;
+            }
+
+            const totalTests = contextTests.length - skippedCount;
+            console.log(`\nOpenAI Completions success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
+
+            // All non-skipped handoffs should succeed
+            expect(successCount).toBe(totalTests);
+        });
+    });
+
+    describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider Handoff", () => {
+        let provider: OpenAIResponsesLLM;
+
+        beforeAll(() => {
+            const model = getModel("openai", "gpt-5-mini");
+            if (model) {
+                provider = new OpenAIResponsesLLM(model, process.env.OPENAI_API_KEY!);
+            }
+        });
+
+        it("should handle contexts from all providers", async () => {
+            if (!provider) {
+                console.log("OpenAI Responses provider not available, skipping");
+                return;
+            }
+
+            console.log("\nTesting OpenAI Responses with pre-built contexts:\n");
+
+            const contextTests = [
+                { label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
+                { label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
+                { label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
+                { label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
+                { label: "Aborted", context: providerContexts.aborted, sourceModel: null }
+            ];
+
+            let successCount = 0;
+            let skippedCount = 0;
+
+            for (const { label, context, sourceModel } of contextTests) {
+                // Skip testing same model against itself
+                if (sourceModel && sourceModel === provider.getModel().id) {
+                    console.log(`[${label} → ${provider.getModel().provider}] Skipping same-model test`);
+                    skippedCount++;
+                    continue;
+                }
+                const success = await testProviderHandoff(provider, label, context);
+                if (success) successCount++;
+            }
+
+            const totalTests = contextTests.length - skippedCount;
+            console.log(`\nOpenAI Responses success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
+
+            // All non-skipped handoffs should succeed
+            expect(successCount).toBe(totalTests);
+        });
+    });
+});
--- a/packages/ai/test/providers.test.ts
+++ b/packages/ai/test/providers.test.ts
@ -40,11 +40,11 @@ async function basicTextGeneration<T extends LLMOptions>(llm: LLM<T>) {
                ]
            };

-            const response = await llm.complete(context);
+            const response = await llm.generate(context);

            expect(response.role).toBe("assistant");
            expect(response.content).toBeTruthy();
-            expect(response.usage.input).toBeGreaterThan(0);
+            expect(response.usage.input + response.usage.cacheRead).toBeGreaterThan(0);
            expect(response.usage.output).toBeGreaterThan(0);
            expect(response.error).toBeFalsy();
            expect(response.content.map(b => b.type == "text" ? b.text : "").join("")).toContain("Hello test successful");
@ -52,7 +52,7 @@ async function basicTextGeneration<T extends LLMOptions>(llm: LLM<T>) {
            context.messages.push(response);
            context.messages.push({ role: "user", content: "Now say 'Goodbye test successful'" });

-            const secondResponse = await llm.complete(context);
+            const secondResponse = await llm.generate(context);

            expect(secondResponse.role).toBe("assistant");
            expect(secondResponse.content).toBeTruthy();
@ -72,7 +72,7 @@ async function handleToolCall<T extends LLMOptions>(llm: LLM<T>) {
        tools: [calculatorTool]
    };

-    const response = await llm.complete(context);
+    const response = await llm.generate(context);
    expect(response.stopReason).toBe("toolUse");
    expect(response.content.some(b => b.type == "toolCall")).toBeTruthy();
    const toolCall = response.content.find(b => b.type == "toolCall")!;
@ -89,7 +89,7 @@ async function handleStreaming<T extends LLMOptions>(llm: LLM<T>) {
        messages: [{ role: "user", content: "Count from 1 to 3" }]
    };

-    const response = await llm.complete(context, {
+    const response = await llm.generate(context, {
        onEvent: (event) => {
            if (event.type === "text_start") {
                textStarted = true;
@ -113,14 +113,15 @@ async function handleThinking<T extends LLMOptions>(llm: LLM<T>, options: T) {
    let thinkingCompleted = false;

    const context: Context = {
-        messages: [{ role: "user", content: "What is 15 + 27? Think step by step." }]
+        messages: [{ role: "user", content: `Think about ${(Math.random() * 255) | 0} + 27. Think step by step. Then output the result.` }]
    };

-    const response = await llm.complete(context, {
+    const response = await llm.generate(context, {
       onEvent: (event) => {
            if (event.type === "thinking_start") {
                thinkingStarted = true;
            } else if (event.type === "thinking_delta") {
+                expect(event.content.endsWith(event.delta)).toBe(true);
                thinkingChunks += event.delta;
            } else if (event.type === "thinking_end") {
                thinkingCompleted = true;
@ -130,6 +131,7 @@ async function handleThinking<T extends LLMOptions>(llm: LLM<T>, options: T) {
    });


+    expect(response.stopReason, `Error: ${(response as any).error}`).toBe("stop");
    expect(thinkingStarted).toBe(true);
    expect(thinkingChunks.length).toBeGreaterThan(0);
    expect(thinkingCompleted).toBe(true);
@ -160,14 +162,14 @@ async function handleImage<T extends LLMOptions>(llm: LLM<T>) {
            {
                role: "user",
                content: [
-                    { type: "text", text: "What do you see in this image? Please describe the shape and color." },
+                    { type: "text", text: "What do you see in this image? Please describe the shape (circle, rectangle, square, triangle, ...) and color (red, blue, green, ...)." },
                    imageContent,
                ],
            },
        ],
    };

-    const response = await llm.complete(context);
+    const response = await llm.generate(context);

    // Check the response mentions red and circle
    expect(response.content.length > 0).toBeTruthy();
@ -195,7 +197,7 @@ async function multiTurn<T extends LLMOptions>(llm: LLM<T>, thinkingOptions: T)
    const maxTurns = 5; // Prevent infinite loops

    for (let turn = 0; turn < maxTurns; turn++) {
-        const response = await llm.complete(context, thinkingOptions);
+        const response = await llm.generate(context, thinkingOptions);

        // Add the assistant response to context
        context.messages.push(response);
@ -325,12 +327,12 @@ describe("AI Providers E2E Tests", () => {
            await handleStreaming(llm);
        });

-        it("should handle thinking mode", async () => {
-            await handleThinking(llm, {reasoningEffort: "medium"});
+        it("should handle thinking mode", {retry: 2}, async () => {
+            await handleThinking(llm, {reasoningEffort: "high"});
        });

        it("should handle multi-turn with thinking and tools", async () => {
-            await multiTurn(llm, {reasoningEffort: "medium"});
+            await multiTurn(llm, {reasoningEffort: "high"});
        });

        it("should handle image input", async () => {
@ -370,34 +372,6 @@ describe("AI Providers E2E Tests", () => {
        });
    });

-    describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider (Haiku 3.5)", () => {
-        let llm: AnthropicLLM;
-
-        beforeAll(() => {
-            llm = createLLM("anthropic", "claude-3-5-haiku-latest");
-        });
-
-        it("should complete basic text generation", async () => {
-            await basicTextGeneration(llm);
-        });
-
-        it("should handle tool calling", async () => {
-            await handleToolCall(llm);
-        });
-
-        it("should handle streaming", async () => {
-            await handleStreaming(llm);
-        });
-
-        it("should handle multi-turn with thinking and tools", async () => {
-            await multiTurn(llm, {thinking: {enabled: true}});
-        });
-
-        it("should handle image input", async () => {
-            await handleImage(llm);
-        });
-    });
-
    describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider (grok-code-fast-1 via OpenAI Completions)", () => {
        let llm: OpenAICompletionsLLM;

@ -505,7 +479,7 @@ describe("AI Providers E2E Tests", () => {
            await handleThinking(llm, {reasoningEffort: "medium"});
        });

-        it("should handle multi-turn with thinking and tools", async () => {
+        it("should handle multi-turn with thinking and tools", { retry: 2 }, async () => {
            await multiTurn(llm, {reasoningEffort: "medium"});
        });

@ -611,4 +585,34 @@ describe("AI Providers E2E Tests", () => {
            await multiTurn(llm, {reasoningEffort: "medium"});
        });
    });
+
+    /*
+    describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider (Haiku 3.5)", () => {
+        let llm: AnthropicLLM;
+
+        beforeAll(() => {
+            llm = createLLM("anthropic", "claude-3-5-haiku-latest");
+        });
+
+        it("should complete basic text generation", async () => {
+            await basicTextGeneration(llm);
+        });
+
+        it("should handle tool calling", async () => {
+            await handleToolCall(llm);
+        });
+
+        it("should handle streaming", async () => {
+            await handleStreaming(llm);
+        });
+
+        it("should handle multi-turn with thinking and tools", async () => {
+            await multiTurn(llm, {thinking: {enabled: true}});
+        });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
+    });
+    */
 });