mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-21 19:00:44 +00:00
Extend interleaved thinking test to Anthropic first-party provider (#1413)
* extend interleaved thinking test to Anthropic first-party provider
* switch back to global Bedrock model identifier
* set retry to 3 for both
* enable Bedrock Claude interleaved thinking by default and use completeSimple in test
This commit is contained in:
parent
7ddb7c67a8
commit
28c0991281
2 changed files with 28 additions and 21 deletions
|
|
@ -664,7 +664,7 @@ function buildAdditionalModelRequestFields(
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (model.id.includes("anthropic.claude")) {
|
if (model.id.includes("anthropic.claude") || model.id.includes("anthropic/claude")) {
|
||||||
const result: Record<string, any> = supportsAdaptiveThinking(model.id)
|
const result: Record<string, any> = supportsAdaptiveThinking(model.id)
|
||||||
? {
|
? {
|
||||||
thinking: { type: "adaptive" },
|
thinking: { type: "adaptive" },
|
||||||
|
|
@ -691,7 +691,7 @@ function buildAdditionalModelRequestFields(
|
||||||
};
|
};
|
||||||
})();
|
})();
|
||||||
|
|
||||||
if (options.interleavedThinking && !supportsAdaptiveThinking(model.id)) {
|
if (!supportsAdaptiveThinking(model.id) && (options.interleavedThinking ?? true)) {
|
||||||
result.anthropic_beta = ["interleaved-thinking-2025-05-14"];
|
result.anthropic_beta = ["interleaved-thinking-2025-05-14"];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,9 @@
|
||||||
import { Type } from "@sinclair/typebox";
|
import { Type } from "@sinclair/typebox";
|
||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it } from "vitest";
|
||||||
|
import { getEnvApiKey } from "../src/env-api-keys.js";
|
||||||
import { getModel } from "../src/models.js";
|
import { getModel } from "../src/models.js";
|
||||||
import { complete } from "../src/stream.js";
|
import { completeSimple } from "../src/stream.js";
|
||||||
import type { Context, StopReason, Tool, ToolCall, ToolResultMessage } from "../src/types.js";
|
import type { Api, Context, Model, StopReason, Tool, ToolCall, ToolResultMessage } from "../src/types.js";
|
||||||
import { StringEnum } from "../src/utils/typebox-helpers.js";
|
import { StringEnum } from "../src/utils/typebox-helpers.js";
|
||||||
import { hasBedrockCredentials } from "./bedrock-utils.js";
|
import { hasBedrockCredentials } from "./bedrock-utils.js";
|
||||||
|
|
||||||
|
|
@ -60,15 +61,10 @@ function evaluateCalculatorCall(toolCall: ToolCall): number {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type BedrockInterleavedModelId =
|
async function assertSecondToolCallWithInterleavedThinking<TApi extends Api>(
|
||||||
| "global.anthropic.claude-opus-4-5-20251101-v1:0"
|
llm: Model<TApi>,
|
||||||
| "global.anthropic.claude-opus-4-6-v1";
|
|
||||||
|
|
||||||
async function assertSecondToolCallWithInterleavedThinking(
|
|
||||||
modelId: BedrockInterleavedModelId,
|
|
||||||
reasoning: "high" | "xhigh",
|
reasoning: "high" | "xhigh",
|
||||||
) {
|
) {
|
||||||
const llm = getModel("amazon-bedrock", modelId);
|
|
||||||
const context: Context = {
|
const context: Context = {
|
||||||
systemPrompt: [
|
systemPrompt: [
|
||||||
"You are a helpful assistant that must use tools for arithmetic.",
|
"You are a helpful assistant that must use tools for arithmetic.",
|
||||||
|
|
@ -82,6 +78,7 @@ async function assertSecondToolCallWithInterleavedThinking(
|
||||||
"Use calculator to calculate 328 * 29.",
|
"Use calculator to calculate 328 * 29.",
|
||||||
"You must call the calculator tool exactly once.",
|
"You must call the calculator tool exactly once.",
|
||||||
"Provide the final answer based on the best guess given the tool result, even if it seems unreliable.",
|
"Provide the final answer based on the best guess given the tool result, even if it seems unreliable.",
|
||||||
|
"Start by thinking about the steps you will take to solve the problem.",
|
||||||
].join(" "),
|
].join(" "),
|
||||||
timestamp: Date.now(),
|
timestamp: Date.now(),
|
||||||
},
|
},
|
||||||
|
|
@ -89,10 +86,7 @@ async function assertSecondToolCallWithInterleavedThinking(
|
||||||
tools: [calculatorTool],
|
tools: [calculatorTool],
|
||||||
};
|
};
|
||||||
|
|
||||||
const firstResponse = await complete(llm, context, {
|
const firstResponse = await completeSimple(llm, context, { reasoning });
|
||||||
reasoning,
|
|
||||||
interleavedThinking: true,
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(firstResponse.stopReason, `Error: ${firstResponse.errorMessage}`).toBe("toolUse" satisfies StopReason);
|
expect(firstResponse.stopReason, `Error: ${firstResponse.errorMessage}`).toBe("toolUse" satisfies StopReason);
|
||||||
expect(firstResponse.content.some((block) => block.type === "thinking")).toBe(true);
|
expect(firstResponse.content.some((block) => block.type === "thinking")).toBe(true);
|
||||||
|
|
@ -117,22 +111,35 @@ async function assertSecondToolCallWithInterleavedThinking(
|
||||||
};
|
};
|
||||||
context.messages.push(firstToolResult);
|
context.messages.push(firstToolResult);
|
||||||
|
|
||||||
const secondResponse = await complete(llm, context, {
|
const secondResponse = await completeSimple(llm, context, { reasoning });
|
||||||
reasoning,
|
|
||||||
interleavedThinking: true,
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(secondResponse.stopReason, `Error: ${secondResponse.errorMessage}`).toBe("stop" satisfies StopReason);
|
expect(secondResponse.stopReason, `Error: ${secondResponse.errorMessage}`).toBe("stop" satisfies StopReason);
|
||||||
expect(secondResponse.content.some((block) => block.type === "thinking")).toBe(true);
|
expect(secondResponse.content.some((block) => block.type === "thinking")).toBe(true);
|
||||||
expect(secondResponse.content.some((block) => block.type === "text")).toBe(true);
|
expect(secondResponse.content.some((block) => block.type === "text")).toBe(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const hasAnthropicCredentials = !!getEnvApiKey("anthropic");
|
||||||
|
|
||||||
describe.skipIf(!hasBedrockCredentials())("Amazon Bedrock interleaved thinking", () => {
|
describe.skipIf(!hasBedrockCredentials())("Amazon Bedrock interleaved thinking", () => {
|
||||||
it("should do interleaved thinking on Claude Opus 4.5", { retry: 3 }, async () => {
|
it("should do interleaved thinking on Claude Opus 4.5", { retry: 3 }, async () => {
|
||||||
await assertSecondToolCallWithInterleavedThinking("global.anthropic.claude-opus-4-5-20251101-v1:0", "high");
|
const llm = getModel("amazon-bedrock", "global.anthropic.claude-opus-4-5-20251101-v1:0");
|
||||||
|
await assertSecondToolCallWithInterleavedThinking(llm, "high");
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should do interleaved thinking on Claude Opus 4.6", { retry: 3 }, async () => {
|
it("should do interleaved thinking on Claude Opus 4.6", { retry: 3 }, async () => {
|
||||||
await assertSecondToolCallWithInterleavedThinking("global.anthropic.claude-opus-4-6-v1", "xhigh");
|
const llm = getModel("amazon-bedrock", "global.anthropic.claude-opus-4-6-v1");
|
||||||
|
await assertSecondToolCallWithInterleavedThinking(llm, "high");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe.skipIf(!hasAnthropicCredentials)("Anthropic interleaved thinking", () => {
|
||||||
|
it("should do interleaved thinking on Claude Opus 4.5", { retry: 3 }, async () => {
|
||||||
|
const llm = getModel("anthropic", "claude-opus-4-5");
|
||||||
|
await assertSecondToolCallWithInterleavedThinking(llm, "high");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should do interleaved thinking on Claude Opus 4.6", { retry: 3 }, async () => {
|
||||||
|
const llm = getModel("anthropic", "claude-opus-4-6");
|
||||||
|
await assertSecondToolCallWithInterleavedThinking(llm, "high");
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue