🤖 fix: use effort param only for Claude Opus 4.5 (#743)

ammar-agent · web-flow · commit c5475e19c4ec · 2025-11-24T20:00:25.000-06:00
## Summary

The `effort` parameter is a new feature specific to Claude Opus 4.5 for
controlling how much computational work the model applies to each task.
Other Anthropic models (Sonnet 4.5, Haiku 4.5, Opus 4.1, etc.) must
continue to use the `thinking.budgetTokens` approach for extended
thinking.

## Changes

- Add `ANTHROPIC_THINKING_BUDGETS` mapping to `thinking.ts` for budget-based
models, and document the existing `ANTHROPIC_EFFORT` mapping as Opus 4.5-only
- Update `buildProviderOptions` to detect Opus 4.5 and use the `effort`
parameter
- Update `calculateEffectiveMaxOutputTokens` to skip budget adjustment
for Opus 4.5 (effort-based models don't need this)
- Add unit tests for provider options builder covering both Opus 4.5
(effort) and other models (thinking.budgetTokens)
- Add integration tests (`sendMessage.reasoning.test.ts`) for reasoning
with both Sonnet 4.5 and Opus 4.5

## Testing

- Unit tests pass: `bun test src/common/utils/ai/providerOptions.test.ts`
- All existing tests pass: `make test` (1199 pass)
- Integration tests added for regression prevention (run with
`TEST_INTEGRATION=1`)

_Generated with `mux`_
diff --git a/src/common/types/thinking.ts b/src/common/types/thinking.ts
@@ -14,17 +14,33 @@ export type ThinkingLevel = "off" | "low" | "medium" | "high";
 export type ThinkingLevelOn = Exclude<ThinkingLevel, "off">;
 
 /**
- * Anthropic effort level mapping
+ * Anthropic thinking token budget mapping
  *
- * Maps our unified thinking levels to Anthropic's effort parameter:
- * - off: No effort specified (undefined)
- * - low: Most efficient - significant token savings
- * - medium: Balanced approach with moderate token savings
- * - high: Maximum capability (default behavior)
+ * These heuristics balance thinking depth with response time and cost.
+ * Used for models that support extended thinking with budgetTokens
+ * (e.g., Sonnet 4.5, Haiku 4.5, Opus 4.1, etc.)
  *
- * The effort parameter controls all token spend including thinking,
- * text responses, and tool calls. Unlike budget_tokens, it doesn't require
- * thinking to be explicitly enabled.
+ * - off: No extended thinking
+ * - low: Quick thinking for straightforward tasks (4K tokens)
+ * - medium: Standard thinking for moderate complexity (10K tokens)
+ * - high: Deep thinking for complex problems (20K tokens)
+ */
+export const ANTHROPIC_THINKING_BUDGETS: Record<ThinkingLevel, number> = {
+  off: 0,
+  low: 4000,
+  medium: 10000,
+  high: 20000,
+};
+
+/**
+ * Anthropic Opus 4.5 effort parameter mapping
+ *
+ * The effort parameter is a new feature ONLY available for Claude Opus 4.5.
+ * It controls how much computational work the model applies to each task.
+ *
+ * Other Anthropic models must use the thinking.budgetTokens approach instead.
+ *
+ * @see https://www.anthropic.com/news/claude-opus-4-5
  */
 export const ANTHROPIC_EFFORT: Record<ThinkingLevel, "low" | "medium" | "high" | undefined> = {
   off: undefined,
diff --git a/src/common/utils/ai/providerOptions.test.ts b/src/common/utils/ai/providerOptions.test.ts
@@ -0,0 +1,119 @@
+/**
+ * Tests for provider options builder
+ */
+
+import { describe, test, expect, mock } from "bun:test";
+import { buildProviderOptions } from "./providerOptions";
+import type { ThinkingLevel } from "@/common/types/thinking";
+
+// Mock the log module to avoid console noise
+void mock.module("@/node/services/log", () => ({
+  log: {
+    debug: (): void => undefined,
+    info: (): void => undefined,
+    warn: (): void => undefined,
+    error: (): void => undefined,
+  },
+}));
+
+// Mock enforceThinkingPolicy to pass through
+void mock.module("@/browser/utils/thinking/policy", () => ({
+  enforceThinkingPolicy: (_model: string, level: ThinkingLevel) => level,
+}));
+
+describe("buildProviderOptions - Anthropic", () => {
+  describe("Opus 4.5 (effort parameter)", () => {
+    test("should use effort parameter for claude-opus-4-5", () => {
+      const result = buildProviderOptions("anthropic:claude-opus-4-5", "medium");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+          effort: "medium",
+        },
+      });
+    });
+
+    test("should use effort parameter for claude-opus-4-5-20251101", () => {
+      const result = buildProviderOptions("anthropic:claude-opus-4-5-20251101", "high");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+          effort: "high",
+        },
+      });
+    });
+
+    test("should omit effort when thinking is off for Opus 4.5", () => {
+      const result = buildProviderOptions("anthropic:claude-opus-4-5", "off");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+        },
+      });
+    });
+  });
+
+  describe("Other Anthropic models (thinking/budgetTokens)", () => {
+    test("should use thinking.budgetTokens for claude-sonnet-4-5", () => {
+      const result = buildProviderOptions("anthropic:claude-sonnet-4-5", "medium");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+          thinking: {
+            type: "enabled",
+            budgetTokens: 10000,
+          },
+        },
+      });
+    });
+
+    test("should use thinking.budgetTokens for claude-opus-4-1", () => {
+      const result = buildProviderOptions("anthropic:claude-opus-4-1", "high");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+          thinking: {
+            type: "enabled",
+            budgetTokens: 20000,
+          },
+        },
+      });
+    });
+
+    test("should use thinking.budgetTokens for claude-haiku-4-5", () => {
+      const result = buildProviderOptions("anthropic:claude-haiku-4-5", "low");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+          thinking: {
+            type: "enabled",
+            budgetTokens: 4000,
+          },
+        },
+      });
+    });
+
+    test("should omit thinking when thinking is off for non-Opus 4.5", () => {
+      const result = buildProviderOptions("anthropic:claude-sonnet-4-5", "off");
+
+      expect(result).toEqual({
+        anthropic: {
+          disableParallelToolUse: false,
+          sendReasoning: true,
+        },
+      });
+    });
+  });
+});
diff --git a/src/common/utils/ai/providerOptions.ts b/src/common/utils/ai/providerOptions.ts
@@ -12,6 +12,7 @@ import type { MuxProviderOptions } from "@/common/types/providerOptions";
 import type { ThinkingLevel } from "@/common/types/thinking";
 import {
   ANTHROPIC_EFFORT,
+  ANTHROPIC_THINKING_BUDGETS,
   GEMINI_THINKING_BUDGETS,
   OPENAI_REASONING_EFFORT,
   OPENROUTER_REASONING_EFFORT,
@@ -83,19 +84,53 @@ export function buildProviderOptions(
 
   // Build Anthropic-specific options
   if (provider === "anthropic") {
-    const effort = ANTHROPIC_EFFORT[effectiveThinking];
+    // Extract model name from model string (e.g., "anthropic:claude-opus-4-5" -> "claude-opus-4-5")
+    const [, modelName] = modelString.split(":");
+
+    // Check if this is Opus 4.5 (supports effort parameter)
+    // Opus 4.5 uses the new "effort" parameter for reasoning control
+    // All other Anthropic models use the "thinking" parameter with budgetTokens
+    const isOpus45 = modelName?.includes("opus-4-5") ?? false;
+
+    if (isOpus45) {
+      // Opus 4.5: Use effort parameter for reasoning control
+      const effort = ANTHROPIC_EFFORT[effectiveThinking];
+      log.debug("buildProviderOptions: Anthropic Opus 4.5 config", {
+        effort,
+        thinkingLevel: effectiveThinking,
+      });
+
+      const options: ProviderOptions = {
+        anthropic: {
+          disableParallelToolUse: false, // Always enable concurrent tool execution
+          sendReasoning: true, // Include reasoning traces in requests sent to the model
+          // Use effort parameter (Opus 4.5 only) to control token spend
+          // SDK auto-adds beta header "effort-2025-11-24" when effort is set
+          ...(effort && { effort }),
+        },
+      };
+      log.debug("buildProviderOptions: Returning Anthropic Opus 4.5 options", options);
+      return options;
+    }
+
+    // Other Anthropic models: Use thinking parameter with budgetTokens
+    const budgetTokens = ANTHROPIC_THINKING_BUDGETS[effectiveThinking];
     log.debug("buildProviderOptions: Anthropic config", {
-      effort,
+      budgetTokens,
       thinkingLevel: effectiveThinking,
     });
 
     const options: ProviderOptions = {
       anthropic: {
         disableParallelToolUse: false, // Always enable concurrent tool execution
         sendReasoning: true, // Include reasoning traces in requests sent to the model
-        // Use effort parameter to control token spend (thinking, text, and tool calls)
-        // SDK auto-adds beta header "effort-2025-11-24" when effort is set
-        ...(effort && { effort }),
+        // Conditionally add thinking configuration (non-Opus 4.5 models)
+        ...(budgetTokens > 0 && {
+          thinking: {
+            type: "enabled",
+            budgetTokens,
+          },
+        }),
       },
     };
     log.debug("buildProviderOptions: Returning Anthropic options", options);
diff --git a/tests/ipcMain/sendMessage.reasoning.test.ts b/tests/ipcMain/sendMessage.reasoning.test.ts
@@ -0,0 +1,60 @@
+/**
+ * Integration tests for reasoning/thinking functionality across Anthropic models.
+ * Smoke-tests streaming for Sonnet 4.5 (`thinking.budgetTokens`) and Opus 4.5 (`effort`).
+ */
+
+import { shouldRunIntegrationTests, validateApiKeys } from "./setup";
+import { sendMessage, assertStreamSuccess, waitForStreamSuccess } from "./helpers";
+import { createSharedRepo, cleanupSharedRepo, withSharedWorkspace } from "./sendMessageTestHelpers";
+import { KNOWN_MODELS } from "@/common/constants/knownModels";
+
+const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
+
+if (shouldRunIntegrationTests()) {
+  validateApiKeys(["ANTHROPIC_API_KEY"]);
+}
+
+beforeAll(createSharedRepo);
+afterAll(cleanupSharedRepo);
+
+describeIntegration("Anthropic reasoning parameter tests", () => {
+  test.concurrent(
+    "Sonnet 4.5 with thinking (budgetTokens)",
+    async () => {
+      await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
+        const result = await sendMessage(
+          env.mockIpcRenderer,
+          workspaceId,
+          "What is 2+2? Answer in one word.",
+          { model: KNOWN_MODELS.SONNET.id, thinkingLevel: "low" }
+        );
+        expect(result.success).toBe(true);
+
+        const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 30000);
+        assertStreamSuccess(collector);
+        expect(collector.getDeltas().length).toBeGreaterThan(0);
+      });
+    },
+    60000
+  );
+
+  test.concurrent(
+    "Opus 4.5 with thinking (effort)",
+    async () => {
+      await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
+        const result = await sendMessage(
+          env.mockIpcRenderer,
+          workspaceId,
+          "What is 4+4? Answer in one word.",
+          { model: KNOWN_MODELS.OPUS.id, thinkingLevel: "low" }
+        );
+        expect(result.success).toBe(true);
+
+        const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 60000);
+        assertStreamSuccess(collector);
+        expect(collector.getDeltas().length).toBeGreaterThan(0);
+      });
+    },
+    90000
+  );
+});