33 changes: 32 additions & 1 deletion js/llm.ts
@@ -11,6 +11,7 @@ import {
   ChatCompletionMessageParam,
   ChatCompletionTool,
 } from "openai/resources";
+import type { ReasoningEffort } from "openai/resources/shared";
 import { makePartial, ScorerWithPartial } from "./partial";
 import { renderMessages } from "./render-messages";

@@ -23,6 +24,9 @@ const COT_SUFFIX =
 export type LLMArgs = {
   maxTokens?: number;
   temperature?: number;
+  reasoningEffort?: ReasoningEffort;
+  reasoningEnabled?: boolean;
+  reasoningBudget?: number;
 } & OpenAIAuth;

 /**
@@ -113,17 +117,35 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     classificationTools: classificationTools,
     maxTokens,
     temperature,
+    reasoningEffort,
+    reasoningEnabled,
+    reasoningBudget,
     cache,
     ...remainingRenderArgs
   } = remaining;

-  const extraArgs: { temperature?: number; max_tokens?: number } = {};
+  const extraArgs: {
+    temperature?: number;
+    max_tokens?: number;
+    reasoning_effort?: ReasoningEffort;
+    reasoning_enabled?: boolean;
+    reasoning_budget?: number;
+  } = {};
   if (temperature !== undefined) {
     extraArgs.temperature = temperature;
   }
   if (maxTokens !== undefined) {
     extraArgs.max_tokens = maxTokens;
   }
+  if (reasoningEffort !== undefined) {
+    extraArgs.reasoning_effort = reasoningEffort;
+  }
+  if (reasoningEnabled !== undefined) {
+    extraArgs.reasoning_enabled = reasoningEnabled;
+  }
+  if (reasoningBudget !== undefined) {
+    extraArgs.reasoning_budget = reasoningBudget;
+  }

   const renderArgs = {
     output,
@@ -216,6 +238,9 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   useCoT: useCoTArg,
   temperature,
   maxTokens: maxTokensArg,
+  reasoningEffort,
+  reasoningEnabled,
+  reasoningBudget,
 }: {
   name: string;
   promptTemplate: string;
@@ -224,6 +249,9 @@
   useCoT?: boolean;
   temperature?: number;
   maxTokens?: number;
+  reasoningEffort?: ReasoningEffort;
+  reasoningEnabled?: boolean;
+  reasoningBudget?: number;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -252,6 +280,9 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       model,
       maxTokens,
       temperature,
+      reasoningEffort,
+      reasoningEnabled,
+      reasoningBudget,
       __choices: choiceStrings,
       ...runtimeArgs,

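For orientation (not part of this diff): a sketch of how a caller might use the new options, assuming the public `autoevals` package export of `LLMClassifierFromTemplate`. The scorer name, prompt, choices, and model are placeholders; per the mapping above, `reasoningEffort` is forwarded to the request as `reasoning_effort`.

import { LLMClassifierFromTemplate } from "autoevals";

// Hypothetical grader exercising the new reasoning knobs.
// reasoningEffort targets OpenAI reasoning models; reasoningEnabled and
// reasoningBudget target providers with extended thinking (e.g., Claude).
const grader = LLMClassifierFromTemplate({
  name: "relevance",
  promptTemplate: "Evaluate: {{output}}",
  choiceScores: { good: 1, bad: 0 },
  model: "o3-mini",
  reasoningEffort: "medium",
});

Because all three options are optional, existing scorers are unaffected: the snake_case fields are only attached to the request when a value is provided.
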
2 changes: 2 additions & 0 deletions js/oai.ts
@@ -4,6 +4,7 @@ import {
   ChatCompletionTool,
   ChatCompletionToolChoiceOption,
 } from "openai/resources";
+import { ReasoningEffort } from "openai/resources/shared";
 import { AzureOpenAI, OpenAI } from "openai";

 export interface CachedLLMParams {
@@ -17,6 +18,7 @@
   tool_choice?: ChatCompletionToolChoiceOption;
   temperature?: number;
   max_tokens?: number;
+  reasoning_effort?: ReasoningEffort;
   span_info?: {
     spanAttributes?: Record<string, string>;
   };
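As a reading aid (also not part of the diff), the request-shaped value that flows through this layer would look roughly like the sketch below. It assumes the usual `model`/`messages` fields sitting above this hunk; note that only `reasoning_effort` joins `CachedLLMParams` here, while `reasoning_enabled` and `reasoning_budget` ride along via the classifier's extra args.

import type { CachedLLMParams } from "./oai";

// Illustrative params object; all values are placeholders.
const params: CachedLLMParams = {
  model: "o3-mini",
  messages: [{ role: "user", content: "Evaluate: hello" }],
  max_tokens: 512,
  reasoning_effort: "low",
};
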
95 changes: 95 additions & 0 deletions js/reasoning-effort.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import { expect, test, describe } from "vitest";
import { LLMClassifierFromTemplate } from "./llm";
import { Score } from "./score";

describe("reasoning parameters", () => {
  test("accepts reasoningEffort in LLMArgs", () => {
    // This test just verifies that the type system accepts reasoningEffort.
    // We don't actually call the API, to avoid requiring credentials in tests.
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "o3-mini",
      reasoningEffort: "high",
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });

  test("accepts all valid reasoningEffort values", () => {
    const validValues: Array<
      "minimal" | "low" | "medium" | "high" | null | undefined
    > = ["minimal", "low", "medium", "high", null, undefined];

    for (const value of validValues) {
      const classifier = LLMClassifierFromTemplate({
        name: "test",
        promptTemplate: "Evaluate: {{output}}",
        choiceScores: { good: 1, bad: 0 },
        model: "o3-mini",
        reasoningEffort: value,
      });

      expect(classifier).toBeDefined();
    }
  });

  test("reasoningEffort can be passed at runtime", () => {
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "o3-mini",
    });

    // TypeScript should allow passing reasoningEffort at runtime.
    // This verifies the type allows it (an actual API call would require credentials).
    expect(classifier).toBeDefined();
  });

  test("accepts reasoningEnabled parameter", () => {
    // Test that the type system accepts reasoningEnabled
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "claude-3-5-sonnet-20241022",
      reasoningEnabled: true,
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });

  test("accepts reasoningBudget parameter", () => {
    // Test that the type system accepts reasoningBudget
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "claude-3-5-sonnet-20241022",
      reasoningBudget: 2048,
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });

  test("accepts all reasoning parameters together", () => {
    // Test that all reasoning parameters can be used together
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "o3-mini",
      reasoningEffort: "high",
      reasoningEnabled: true,
      reasoningBudget: 4096,
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });
});
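The runtime test above only asserts that construction succeeds. A sketch of what a per-call override might look like (placeholder strings; a real call needs OpenAI credentials):

import { LLMClassifierFromTemplate } from "autoevals";

const classifier = LLMClassifierFromTemplate({
  name: "test",
  promptTemplate: "Evaluate: {{output}}",
  choiceScores: { good: 1, bad: 0 },
  model: "o3-mini",
});

// Runtime args are spread into the request (`...runtimeArgs` in llm.ts),
// so reasoningEffort can also be supplied, or overridden, per invocation.
const score = await classifier({
  output: "The capital of France is Paris.",
  expected: "Paris",
  reasoningEffort: "low",
});
console.log(score.score); // 1 for "good", 0 for "bad" per choiceScores
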
21 changes: 21 additions & 0 deletions py/autoevals/llm.py
@@ -168,6 +168,9 @@ def __init__(
         render_args=None,
         max_tokens=None,
         temperature=None,
+        reasoning_effort=None,
+        reasoning_enabled=None,
+        reasoning_budget=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -189,6 +192,15 @@
         if max_tokens is not None:
             self.extra_args["max_tokens"] = max(max_tokens, 5)

+        if reasoning_effort is not None:
+            self.extra_args["reasoning_effort"] = reasoning_effort
+
+        if reasoning_enabled is not None:
+            self.extra_args["reasoning_enabled"] = reasoning_enabled
+
+        if reasoning_budget is not None:
+            self.extra_args["reasoning_budget"] = reasoning_budget
+
         self.render_args = {}
         if render_args:
             self.render_args.update(render_args)
@@ -311,6 +323,9 @@ class LLMClassifier(OpenAILLMClassifier):
         use_cot: Enable chain of thought reasoning. Defaults to True.
         max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
         temperature: Controls randomness (0-1). If not specified, uses the model's default.
+        reasoning_effort: Controls reasoning depth for o-series models (e.g., "low", "medium", "high").
+        reasoning_enabled: Enable extended thinking for supported models (e.g., Claude). Defaults to None.
+        reasoning_budget: Token allocation for the model's internal reasoning. Defaults to None.
         engine: Deprecated by OpenAI. Use model instead.
         api_key: Deprecated. Use client instead.
         base_url: Deprecated. Use client instead.
@@ -329,6 +344,9 @@
         use_cot=True,
         max_tokens=None,
         temperature=None,
+        reasoning_effort=None,
+        reasoning_enabled=None,
+        reasoning_budget=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -356,6 +374,9 @@ def __init__(
             classification_tools=build_classification_tools(use_cot, choice_strings),
             max_tokens=max_tokens,
             temperature=temperature,
+            reasoning_effort=reasoning_effort,
+            reasoning_enabled=reasoning_enabled,
+            reasoning_budget=reasoning_budget,
             engine=engine,
             api_key=api_key,
             base_url=base_url,
121 changes: 121 additions & 0 deletions py/autoevals/test_reasoning_effort.py
@@ -0,0 +1,121 @@
"""Tests for reasoning parameter support."""

import pytest

from autoevals.llm import LLMClassifier


def test_reasoning_effort_in_constructor():
"""Test that LLMClassifier accepts reasoning_effort parameter."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
reasoning_effort="high",
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_effort") == "high"


def test_reasoning_effort_values():
"""Test that all valid reasoning_effort values are accepted."""
valid_values = ["minimal", "low", "medium", "high", None]

for value in valid_values:
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
reasoning_effort=value,
)

assert classifier is not None
if value is not None:
assert classifier.extra_args.get("reasoning_effort") == value
else:
assert "reasoning_effort" not in classifier.extra_args


def test_reasoning_effort_not_set_by_default():
"""Test that reasoning_effort is not set when not provided."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
)

assert "reasoning_effort" not in classifier.extra_args


def test_reasoning_enabled_in_constructor():
"""Test that LLMClassifier accepts reasoning_enabled parameter."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
reasoning_enabled=True,
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_enabled") is True


def test_reasoning_budget_in_constructor():
"""Test that LLMClassifier accepts reasoning_budget parameter."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
reasoning_budget=2048,
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_budget") == 2048


def test_all_reasoning_parameters():
"""Test that all reasoning parameters can be used together."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
reasoning_effort="high",
reasoning_enabled=True,
reasoning_budget=4096,
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_effort") == "high"
assert classifier.extra_args.get("reasoning_enabled") is True
assert classifier.extra_args.get("reasoning_budget") == 4096


def test_reasoning_enabled_not_set_by_default():
"""Test that reasoning_enabled is not set when not provided."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
)

assert "reasoning_enabled" not in classifier.extra_args


def test_reasoning_budget_not_set_by_default():
"""Test that reasoning_budget is not set when not provided."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
)

assert "reasoning_budget" not in classifier.extra_args