From 0ce59236dc21732af543b04e455dff42cf26804a Mon Sep 17 00:00:00 2001
From: Mario Zechner <badlogicgames@gmail.com>
Date: Mon, 19 Jan 2026 00:14:24 +0100
Subject: [PATCH] test(ai): remove handoff test closes #258

---
 packages/ai/test/handoff.test.ts | 573 -------------------------------
 1 file changed, 573 deletions(-)
 delete mode 100644 packages/ai/test/handoff.test.ts
diff --git a/packages/ai/test/handoff.test.ts b/packages/ai/test/handoff.test.ts
deleted file mode 100644
index 457f0148..00000000
--- a/packages/ai/test/handoff.test.ts
+++ /dev/null
@@ -1,573 +0,0 @@
-import { Type } from "@sinclair/typebox";
-import { describe, expect, it } from "vitest";
-import { getModel } from "../src/models.js";
-import { complete } from "../src/stream.js";
-import type { Api, AssistantMessage, Context, Message, Model, Tool, ToolResultMessage } from "../src/types.js";
-
-// Tool for testing
-const weatherSchema = Type.Object({
-	location: Type.String({ description: "City name" }),
-});
-
-const weatherTool: Tool<typeof weatherSchema> = {
-	name: "get_weather",
-	description: "Get the weather for a location",
-	parameters: weatherSchema,
-};
-
-// Pre-built contexts representing typical outputs from each provider
-const providerContexts = {
-	// Anthropic-style message with thinking block
-	anthropic: {
-		message: {
-			role: "assistant",
-			api: "anthropic-messages",
-			content: [
-				{
-					type: "thinking",
-					thinking: "Let me calculate 17 * 23. That's 17 * 20 + 17 * 3 = 340 + 51 = 391",
-					thinkingSignature: "signature_abc123",
-				},
-				{
-					type: "text",
-					text: "I'll help you with the calculation and check the weather. The result of 17 × 23 is 391. The capital of Austria is Vienna. Now let me check the weather for you.",
-				},
-				{
-					type: "toolCall",
-					id: "toolu_01abc123",
-					name: "get_weather",
-					arguments: { location: "Tokyo" },
-				},
-			],
-			provider: "anthropic",
-			model: "claude-3-5-haiku-latest",
-			usage: {
-				input: 100,
-				output: 50,
-				cacheRead: 0,
-				cacheWrite: 0,
-				totalTokens: 150,
-				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
-			},
-			stopReason: "toolUse",
-			timestamp: Date.now(),
-		} satisfies AssistantMessage,
-		toolResult: {
-			role: "toolResult" as const,
-			toolCallId: "toolu_01abc123",
-			toolName: "get_weather",
-			content: [{ type: "text", text: "Weather in Tokyo: 18°C, partly cloudy" }],
-			isError: false,
-			timestamp: Date.now(),
-		} satisfies ToolResultMessage,
-		facts: {
-			calculation: 391,
-			city: "Tokyo",
-			temperature: 18,
-			capital: "Vienna",
-		},
-	},
-
-	// Google-style message with thinking
-	google: {
-		message: {
-			role: "assistant",
-			api: "google-generative-ai",
-			content: [
-				{
-					type: "thinking",
-					thinking:
-						"I need to multiply 19 * 24. Let me work through this: 19 * 24 = 19 * 20 + 19 * 4 = 380 + 76 = 456",
-					thinkingSignature: undefined,
-				},
-				{
-					type: "text",
-					text: "The multiplication of 19 × 24 equals 456. The capital of France is Paris. Let me check the weather in Berlin for you.",
-				},
-				{
-					type: "toolCall",
-					id: "call_gemini_123",
-					name: "get_weather",
-					arguments: { location: "Berlin" },
-				},
-			],
-			provider: "google",
-			model: "gemini-2.5-flash",
-			usage: {
-				input: 120,
-				output: 60,
-				cacheRead: 0,
-				cacheWrite: 0,
-				totalTokens: 180,
-				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
-			},
-			stopReason: "toolUse",
-			timestamp: Date.now(),
-		} satisfies AssistantMessage,
-		toolResult: {
-			role: "toolResult" as const,
-			toolCallId: "call_gemini_123",
-			toolName: "get_weather",
-			content: [{ type: "text", text: "Weather in Berlin: 22°C, sunny" }],
-			isError: false,
-			timestamp: Date.now(),
-		} satisfies ToolResultMessage,
-		facts: {
-			calculation: 456,
-			city: "Berlin",
-			temperature: 22,
-			capital: "Paris",
-		},
-	},
-
-	// OpenAI Completions style (with reasoning_content)
-	openaiCompletions: {
-		message: {
-			role: "assistant",
-			api: "openai-completions",
-			content: [
-				{
-					type: "thinking",
-					thinking: "Let me calculate 21 * 25. That's 21 * 25 = 525",
-					thinkingSignature: "reasoning_content",
-				},
-				{
-					type: "text",
-					text: "The result of 21 × 25 is 525. The capital of Spain is Madrid. I'll check the weather in London now.",
-				},
-				{
-					type: "toolCall",
-					id: "call_abc123",
-					name: "get_weather",
-					arguments: { location: "London" },
-				},
-			],
-			provider: "openai",
-			model: "gpt-4o-mini",
-			usage: {
-				input: 110,
-				output: 55,
-				cacheRead: 0,
-				cacheWrite: 0,
-				totalTokens: 165,
-				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
-			},
-			stopReason: "toolUse",
-			timestamp: Date.now(),
-		} satisfies AssistantMessage,
-		toolResult: {
-			role: "toolResult" as const,
-			toolCallId: "call_abc123",
-			toolName: "get_weather",
-			content: [{ type: "text", text: "Weather in London: 15°C, rainy" }],
-			isError: false,
-			timestamp: Date.now(),
-		} satisfies ToolResultMessage,
-		facts: {
-			calculation: 525,
-			city: "London",
-			temperature: 15,
-			capital: "Madrid",
-		},
-	},
-
-	// OpenAI Responses style (with complex tool call IDs)
-	openaiResponses: {
-		message: {
-			role: "assistant",
-			api: "openai-responses",
-			content: [
-				{
-					type: "thinking",
-					thinking: "Calculating 18 * 27: 18 * 27 = 486",
-					thinkingSignature:
-						'{"type":"reasoning","id":"rs_2b2342acdde","summary":[{"type":"summary_text","text":"Calculating 18 * 27: 18 * 27 = 486"}]}',
-				},
-				{
-					type: "text",
-					text: "The calculation of 18 × 27 gives us 486. The capital of Italy is Rome. Let me check Sydney's weather.",
-					textSignature: "msg_response_456",
-				},
-				{
-					type: "toolCall",
-					id: "call_789_item_012", // Anthropic requires alphanumeric, dash, and underscore only
-					name: "get_weather",
-					arguments: { location: "Sydney" },
-				},
-			],
-			provider: "openai",
-			model: "gpt-5-mini",
-			usage: {
-				input: 115,
-				output: 58,
-				cacheRead: 0,
-				cacheWrite: 0,
-				totalTokens: 173,
-				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
-			},
-			stopReason: "toolUse",
-			timestamp: Date.now(),
-		} satisfies AssistantMessage,
-		toolResult: {
-			role: "toolResult" as const,
-			toolCallId: "call_789_item_012", // Match the updated ID format
-			toolName: "get_weather",
-			content: [{ type: "text", text: "Weather in Sydney: 25°C, clear" }],
-			isError: false,
-			timestamp: Date.now(),
-		} satisfies ToolResultMessage,
-		facts: {
-			calculation: 486,
-			city: "Sydney",
-			temperature: 25,
-			capital: "Rome",
-		},
-	},
-
-	// Aborted message (stopReason: 'error')
-	aborted: {
-		message: {
-			role: "assistant",
-			api: "anthropic-messages",
-			content: [
-				{
-					type: "thinking",
-					thinking: "Let me start calculating 20 * 30...",
-					thinkingSignature: "partial_sig",
-				},
-				{
-					type: "text",
-					text: "I was about to calculate 20 × 30 which is",
-				},
-			],
-			provider: "test",
-			model: "test-model",
-			usage: {
-				input: 50,
-				output: 25,
-				cacheRead: 0,
-				cacheWrite: 0,
-				totalTokens: 75,
-				cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
-			},
-			stopReason: "error",
-			errorMessage: "Request was aborted",
-			timestamp: Date.now(),
-		} satisfies AssistantMessage,
-		toolResult: null,
-		facts: {
-			calculation: 600,
-			city: "none",
-			temperature: 0,
-			capital: "none",
-		},
-	},
-};
-
-/**
- * Test that a provider can handle contexts from different sources
- */
-async function testProviderHandoff<TApi extends Api>(
-	targetModel: Model<TApi>,
-	sourceLabel: string,
-	sourceContext: (typeof providerContexts)[keyof typeof providerContexts],
-): Promise<boolean> {
-	// Build conversation context
-	let assistantMessage: AssistantMessage = sourceContext.message;
-	let toolResult: ToolResultMessage | undefined | null = sourceContext.toolResult;
-
-	// If target is Mistral, convert tool call IDs to Mistral format
-	if (targetModel.provider === "mistral" && assistantMessage.content.some((c) => c.type === "toolCall")) {
-		// Clone the message to avoid mutating the original
-		assistantMessage = {
-			...assistantMessage,
-			content: assistantMessage.content.map((content) => {
-				if (content.type === "toolCall") {
-					// Generate a Mistral-style tool call ID (uppercase letters and numbers)
-					const mistralId = "T7TcP5RVB"; // Using the format we know works
-					return {
-						...content,
-						id: mistralId,
-					};
-				}
-				return content;
-			}),
-		} as AssistantMessage;
-
-		// Also update the tool result if present
-		if (toolResult) {
-			toolResult = {
-				...toolResult,
-				toolCallId: "T7TcP5RVB", // Match the tool call ID
-			};
-		}
-	}
-
-	const messages: Message[] = [
-		{
-			role: "user",
-			content: "Please do some calculations, tell me about capitals, and check the weather.",
-			timestamp: Date.now(),
-		},
-		assistantMessage,
-	];
-
-	// Add tool result if present
-	if (toolResult) {
-		messages.push(toolResult);
-	}
-
-	// Ask follow-up question
-	messages.push({
-		role: "user",
-		content: `Based on our conversation, please answer:
-                 1) What was the multiplication result?
-                 2) Which city's weather did we check?
-                 3) What was the temperature?
-                 4) What capital city was mentioned?
-                 Please include the specific numbers and names.`,
-		timestamp: Date.now(),
-	});
-
-	const context: Context = {
-		messages,
-		tools: [weatherTool],
-	};
-
-	try {
-		const response = await complete(targetModel, context, {});
-
-		// Check for error
-		if (response.stopReason === "error") {
-			console.log(`[${sourceLabel} → ${targetModel.provider}] Failed with error: ${response.errorMessage}`);
-			return false;
-		}
-
-		// Extract text from response
-		const responseText = response.content
-			.filter((b) => b.type === "text")
-			.map((b) => b.text)
-			.join(" ")
-			.toLowerCase();
-
-		// For aborted messages, we don't expect to find the facts
-		if (sourceContext.message.stopReason === "error") {
-			const hasToolCalls = response.content.some((b) => b.type === "toolCall");
-			const hasThinking = response.content.some((b) => b.type === "thinking");
-			const hasText = response.content.some((b) => b.type === "text");
-
-			expect(response.stopReason === "stop" || response.stopReason === "toolUse").toBe(true);
-			expect(hasThinking || hasText || hasToolCalls).toBe(true);
-			console.log(
-				`[${sourceLabel} → ${targetModel.provider}] Handled aborted message successfully, tool calls: ${hasToolCalls}, thinking: ${hasThinking}, text: ${hasText}`,
-			);
-			return true;
-		}
-
-		// Check if response contains our facts
-		const hasCalculation = responseText.includes(sourceContext.facts.calculation.toString());
-		const hasCity =
-			sourceContext.facts.city !== "none" && responseText.includes(sourceContext.facts.city.toLowerCase());
-		const hasTemperature =
-			sourceContext.facts.temperature > 0 && responseText.includes(sourceContext.facts.temperature.toString());
-		const hasCapital =
-			sourceContext.facts.capital !== "none" && responseText.includes(sourceContext.facts.capital.toLowerCase());
-
-		const success = hasCalculation && hasCity && hasTemperature && hasCapital;
-
-		console.log(`[${sourceLabel} → ${targetModel.provider}] Handoff test:`);
-		if (!success) {
-			console.log(`  Calculation (${sourceContext.facts.calculation}): ${hasCalculation ? "✓" : "✗"}`);
-			console.log(`  City (${sourceContext.facts.city}): ${hasCity ? "✓" : "✗"}`);
-			console.log(`  Temperature (${sourceContext.facts.temperature}): ${hasTemperature ? "✓" : "✗"}`);
-			console.log(`  Capital (${sourceContext.facts.capital}): ${hasCapital ? "✓" : "✗"}`);
-		} else {
-			console.log(`  ✓ All facts found`);
-		}
-
-		return success;
-	} catch (error) {
-		console.error(`[${sourceLabel} → ${targetModel.provider}] Exception:`, error);
-		return false;
-	}
-}
-
-describe("Cross-Provider Handoff Tests", () => {
-	describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider Handoff", () => {
-		const model = getModel("anthropic", "claude-3-5-haiku-20241022");
-
-		it("should handle contexts from all providers", async () => {
-			console.log("\nTesting Anthropic with pre-built contexts:\n");
-
-			const contextTests = [
-				{ label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
-				{ label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
-				{ label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
-				{ label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
-				{ label: "Aborted", context: providerContexts.aborted, sourceModel: null },
-			];
-
-			let successCount = 0;
-			let skippedCount = 0;
-
-			for (const { label, context, sourceModel } of contextTests) {
-				// Skip testing same model against itself
-				if (sourceModel && sourceModel === model.id) {
-					console.log(`[${label} → ${model.provider}] Skipping same-model test`);
-					skippedCount++;
-					continue;
-				}
-				const success = await testProviderHandoff(model, label, context);
-				if (success) successCount++;
-			}
-
-			const totalTests = contextTests.length - skippedCount;
-			console.log(`\nAnthropic success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
-
-			// All non-skipped handoffs should succeed
-			expect(successCount).toBe(totalTests);
-		});
-	});
-
-	describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider Handoff", () => {
-		const model = getModel("google", "gemini-2.5-flash");
-
-		it("should handle contexts from all providers", async () => {
-			console.log("\nTesting Google with pre-built contexts:\n");
-
-			const contextTests = [
-				{ label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
-				{ label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
-				{ label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
-				{ label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
-				{ label: "Aborted", context: providerContexts.aborted, sourceModel: null },
-			];
-
-			let successCount = 0;
-			let skippedCount = 0;
-
-			for (const { label, context, sourceModel } of contextTests) {
-				// Skip testing same model against itself
-				if (sourceModel && sourceModel === model.id) {
-					console.log(`[${label} → ${model.provider}] Skipping same-model test`);
-					skippedCount++;
-					continue;
-				}
-				const success = await testProviderHandoff(model, label, context);
-				if (success) successCount++;
-			}
-
-			const totalTests = contextTests.length - skippedCount;
-			console.log(`\nGoogle success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
-
-			// All non-skipped handoffs should succeed
-			expect(successCount).toBe(totalTests);
-		});
-	});
-
-	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider Handoff", () => {
-		const { compat: _compat, ...baseModel } = getModel("openai", "gpt-4o-mini");
-		void _compat;
-		const model: Model<"openai-completions"> = {
-			...baseModel,
-			api: "openai-completions",
-		};
-
-		it("should handle contexts from all providers", async () => {
-			console.log("\nTesting OpenAI Completions with pre-built contexts:\n");
-
-			const contextTests = [
-				{ label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
-				{ label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
-				{ label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
-				{ label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
-				{ label: "Aborted", context: providerContexts.aborted, sourceModel: null },
-			];
-
-			let successCount = 0;
-			let skippedCount = 0;
-
-			for (const { label, context, sourceModel } of contextTests) {
-				// Skip testing same model against itself
-				if (sourceModel && sourceModel === model.id) {
-					console.log(`[${label} → ${model.provider}] Skipping same-model test`);
-					skippedCount++;
-					continue;
-				}
-				const success = await testProviderHandoff(model, label, context);
-				if (success) successCount++;
-			}
-
-			const totalTests = contextTests.length - skippedCount;
-			console.log(`\nOpenAI Completions success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
-
-			// All non-skipped handoffs should succeed
-			expect(successCount).toBe(totalTests);
-		});
-	});
-
-	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider Handoff", () => {
-		const model = getModel("openai", "gpt-5-mini");
-
-		it("should handle contexts from all providers", async () => {
-			console.log("\nTesting OpenAI Responses with pre-built contexts:\n");
-
-			const contextTests = [
-				{ label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
-				{ label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
-				{ label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
-				{ label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
-				{ label: "Aborted", context: providerContexts.aborted, sourceModel: null },
-			];
-
-			let successCount = 0;
-			let skippedCount = 0;
-
-			for (const { label, context, sourceModel } of contextTests) {
-				// Skip testing same model against itself
-				if (sourceModel && sourceModel === model.id) {
-					console.log(`[${label} → ${model.provider}] Skipping same-model test`);
-					skippedCount++;
-					continue;
-				}
-				const success = await testProviderHandoff(model, label, context);
-				if (success) successCount++;
-			}
-
-			const totalTests = contextTests.length - skippedCount;
-			console.log(`\nOpenAI Responses success rate: ${successCount}/${totalTests} (${skippedCount} skipped)\n`);
-
-			// All non-skipped handoffs should succeed
-			expect(successCount).toBe(totalTests);
-		});
-	});
-
-	describe.skipIf(!process.env.MISTRAL_API_KEY)("Mistral Provider Handoff", () => {
-		const model = getModel("mistral", "devstral-medium-latest");
-
-		it("should handle contexts from all providers", async () => {
-			console.log("\nTesting Mistral with pre-built contexts:\n");
-
-			const contextTests = [
-				{ label: "Anthropic-style", context: providerContexts.anthropic, sourceModel: "claude-3-5-haiku-20241022" },
-				{ label: "Google-style", context: providerContexts.google, sourceModel: "gemini-2.5-flash" },
-				{ label: "OpenAI-Completions", context: providerContexts.openaiCompletions, sourceModel: "gpt-4o-mini" },
-				{ label: "OpenAI-Responses", context: providerContexts.openaiResponses, sourceModel: "gpt-5-mini" },
-				{ label: "Aborted", context: providerContexts.aborted, sourceModel: null },
-			];
-
-			let successCount = 0;
-			const totalTests = contextTests.length;
-
-			for (const { label, context } of contextTests) {
-				const success = await testProviderHandoff(model, label, context);
-				if (success) successCount++;
-			}
-
-			console.log(`\nMistral success rate: ${successCount}/${totalTests}\n`);
-
-			// All handoffs should succeed
-			expect(successCount).toBe(totalTests);
-		}, 60000);
-	});
-});