Extend interleaved thinking test to Anthropic first-party provider (#1413)

* extend interleaved thinking test to Anthropic first-party provider

* switch back to global Bedrock model identifier

* set retry to 3 for both the Bedrock and Anthropic tests

* enable Bedrock Claude interleaved thinking by default and use completeSimple in test
This commit is contained in:
Markus Ylisiurunen 2026-02-12 18:27:42 +02:00 committed by GitHub
parent 7ddb7c67a8
commit 28c0991281
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 28 additions and 21 deletions

View file

@ -664,7 +664,7 @@ function buildAdditionalModelRequestFields(
return undefined; return undefined;
} }
if (model.id.includes("anthropic.claude")) { if (model.id.includes("anthropic.claude") || model.id.includes("anthropic/claude")) {
const result: Record<string, any> = supportsAdaptiveThinking(model.id) const result: Record<string, any> = supportsAdaptiveThinking(model.id)
? { ? {
thinking: { type: "adaptive" }, thinking: { type: "adaptive" },
@ -691,7 +691,7 @@ function buildAdditionalModelRequestFields(
}; };
})(); })();
if (options.interleavedThinking && !supportsAdaptiveThinking(model.id)) { if (!supportsAdaptiveThinking(model.id) && (options.interleavedThinking ?? true)) {
result.anthropic_beta = ["interleaved-thinking-2025-05-14"]; result.anthropic_beta = ["interleaved-thinking-2025-05-14"];
} }

View file

@ -1,8 +1,9 @@
import { Type } from "@sinclair/typebox"; import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest"; import { describe, expect, it } from "vitest";
import { getEnvApiKey } from "../src/env-api-keys.js";
import { getModel } from "../src/models.js"; import { getModel } from "../src/models.js";
import { complete } from "../src/stream.js"; import { completeSimple } from "../src/stream.js";
import type { Context, StopReason, Tool, ToolCall, ToolResultMessage } from "../src/types.js"; import type { Api, Context, Model, StopReason, Tool, ToolCall, ToolResultMessage } from "../src/types.js";
import { StringEnum } from "../src/utils/typebox-helpers.js"; import { StringEnum } from "../src/utils/typebox-helpers.js";
import { hasBedrockCredentials } from "./bedrock-utils.js"; import { hasBedrockCredentials } from "./bedrock-utils.js";
@ -60,15 +61,10 @@ function evaluateCalculatorCall(toolCall: ToolCall): number {
} }
} }
type BedrockInterleavedModelId = async function assertSecondToolCallWithInterleavedThinking<TApi extends Api>(
| "global.anthropic.claude-opus-4-5-20251101-v1:0" llm: Model<TApi>,
| "global.anthropic.claude-opus-4-6-v1";
async function assertSecondToolCallWithInterleavedThinking(
modelId: BedrockInterleavedModelId,
reasoning: "high" | "xhigh", reasoning: "high" | "xhigh",
) { ) {
const llm = getModel("amazon-bedrock", modelId);
const context: Context = { const context: Context = {
systemPrompt: [ systemPrompt: [
"You are a helpful assistant that must use tools for arithmetic.", "You are a helpful assistant that must use tools for arithmetic.",
@ -82,6 +78,7 @@ async function assertSecondToolCallWithInterleavedThinking(
"Use calculator to calculate 328 * 29.", "Use calculator to calculate 328 * 29.",
"You must call the calculator tool exactly once.", "You must call the calculator tool exactly once.",
"Provide the final answer based on the best guess given the tool result, even if it seems unreliable.", "Provide the final answer based on the best guess given the tool result, even if it seems unreliable.",
"Start by thinking about the steps you will take to solve the problem.",
].join(" "), ].join(" "),
timestamp: Date.now(), timestamp: Date.now(),
}, },
@ -89,10 +86,7 @@ async function assertSecondToolCallWithInterleavedThinking(
tools: [calculatorTool], tools: [calculatorTool],
}; };
const firstResponse = await complete(llm, context, { const firstResponse = await completeSimple(llm, context, { reasoning });
reasoning,
interleavedThinking: true,
});
expect(firstResponse.stopReason, `Error: ${firstResponse.errorMessage}`).toBe("toolUse" satisfies StopReason); expect(firstResponse.stopReason, `Error: ${firstResponse.errorMessage}`).toBe("toolUse" satisfies StopReason);
expect(firstResponse.content.some((block) => block.type === "thinking")).toBe(true); expect(firstResponse.content.some((block) => block.type === "thinking")).toBe(true);
@ -117,22 +111,35 @@ async function assertSecondToolCallWithInterleavedThinking(
}; };
context.messages.push(firstToolResult); context.messages.push(firstToolResult);
const secondResponse = await complete(llm, context, { const secondResponse = await completeSimple(llm, context, { reasoning });
reasoning,
interleavedThinking: true,
});
expect(secondResponse.stopReason, `Error: ${secondResponse.errorMessage}`).toBe("stop" satisfies StopReason); expect(secondResponse.stopReason, `Error: ${secondResponse.errorMessage}`).toBe("stop" satisfies StopReason);
expect(secondResponse.content.some((block) => block.type === "thinking")).toBe(true); expect(secondResponse.content.some((block) => block.type === "thinking")).toBe(true);
expect(secondResponse.content.some((block) => block.type === "text")).toBe(true); expect(secondResponse.content.some((block) => block.type === "text")).toBe(true);
} }
const hasAnthropicCredentials = !!getEnvApiKey("anthropic");
describe.skipIf(!hasBedrockCredentials())("Amazon Bedrock interleaved thinking", () => { describe.skipIf(!hasBedrockCredentials())("Amazon Bedrock interleaved thinking", () => {
it("should do interleaved thinking on Claude Opus 4.5", { retry: 3 }, async () => { it("should do interleaved thinking on Claude Opus 4.5", { retry: 3 }, async () => {
await assertSecondToolCallWithInterleavedThinking("global.anthropic.claude-opus-4-5-20251101-v1:0", "high"); const llm = getModel("amazon-bedrock", "global.anthropic.claude-opus-4-5-20251101-v1:0");
await assertSecondToolCallWithInterleavedThinking(llm, "high");
}); });
it("should do interleaved thinking on Claude Opus 4.6", { retry: 3 }, async () => { it("should do interleaved thinking on Claude Opus 4.6", { retry: 3 }, async () => {
await assertSecondToolCallWithInterleavedThinking("global.anthropic.claude-opus-4-6-v1", "xhigh"); const llm = getModel("amazon-bedrock", "global.anthropic.claude-opus-4-6-v1");
await assertSecondToolCallWithInterleavedThinking(llm, "high");
});
});
describe.skipIf(!hasAnthropicCredentials)("Anthropic interleaved thinking", () => {
it("should do interleaved thinking on Claude Opus 4.5", { retry: 3 }, async () => {
const llm = getModel("anthropic", "claude-opus-4-5");
await assertSecondToolCallWithInterleavedThinking(llm, "high");
});
it("should do interleaved thinking on Claude Opus 4.6", { retry: 3 }, async () => {
const llm = getModel("anthropic", "claude-opus-4-6");
await assertSecondToolCallWithInterleavedThinking(llm, "high");
}); });
}); });