diff --git a/js/llm.ts b/js/llm.ts
index 7a63480..2b220dd 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -11,6 +11,7 @@ import {
   ChatCompletionMessageParam,
   ChatCompletionTool,
 } from "openai/resources";
+import type { ReasoningEffort } from "openai/resources/shared";

 import { makePartial, ScorerWithPartial } from "./partial";
 import { renderMessages } from "./render-messages";
@@ -23,6 +24,9 @@ const COT_SUFFIX =
 export type LLMArgs = {
   maxTokens?: number;
   temperature?: number;
+  reasoningEffort?: ReasoningEffort;
+  reasoningEnabled?: boolean;
+  reasoningBudget?: number;
 } & OpenAIAuth;

 /**
@@ -113,17 +117,35 @@ export async function OpenAIClassifier(
     classificationTools: classificationTools,
     maxTokens,
     temperature,
+    reasoningEffort,
+    reasoningEnabled,
+    reasoningBudget,
     cache,
     ...remainingRenderArgs
   } = remaining;

-  const extraArgs: { temperature?: number; max_tokens?: number } = {};
+  const extraArgs: {
+    temperature?: number;
+    max_tokens?: number;
+    reasoning_effort?: ReasoningEffort;
+    reasoning_enabled?: boolean;
+    reasoning_budget?: number;
+  } = {};
   if (temperature !== undefined) {
     extraArgs.temperature = temperature;
   }
   if (maxTokens !== undefined) {
     extraArgs.max_tokens = maxTokens;
   }
+  if (reasoningEffort !== undefined) {
+    extraArgs.reasoning_effort = reasoningEffort;
+  }
+  if (reasoningEnabled !== undefined) {
+    extraArgs.reasoning_enabled = reasoningEnabled;
+  }
+  if (reasoningBudget !== undefined) {
+    extraArgs.reasoning_budget = reasoningBudget;
+  }

   const renderArgs = {
     output,
@@ -216,6 +238,9 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   useCoT: useCoTArg,
   temperature,
   maxTokens: maxTokensArg,
+  reasoningEffort,
+  reasoningEnabled,
+  reasoningBudget,
 }: {
   name: string;
   promptTemplate: string;
@@ -224,6 +249,9 @@
   useCoT?: boolean;
   temperature?: number;
   maxTokens?: number;
+  reasoningEffort?: ReasoningEffort;
+  reasoningEnabled?: boolean;
+  reasoningBudget?: number;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -252,6 +280,9 @@
       model,
       maxTokens,
       temperature,
+      reasoningEffort,
+      reasoningEnabled,
+      reasoningBudget,
       __choices: choiceStrings,
       ...runtimeArgs,

diff --git a/js/oai.ts b/js/oai.ts
index bc0a762..268908f 100644
--- a/js/oai.ts
+++ b/js/oai.ts
@@ -4,6 +4,7 @@ import {
   ChatCompletionTool,
   ChatCompletionToolChoiceOption,
 } from "openai/resources";
+import { ReasoningEffort } from "openai/resources/shared";
 import { AzureOpenAI, OpenAI } from "openai";

 export interface CachedLLMParams {
@@ -17,6 +18,9 @@
   tool_choice?: ChatCompletionToolChoiceOption;
   temperature?: number;
   max_tokens?: number;
+  reasoning_effort?: ReasoningEffort;
+  reasoning_enabled?: boolean;
+  reasoning_budget?: number;
   span_info?: {
     spanAttributes?: Record<string, string>;
   };
diff --git a/js/reasoning-effort.test.ts b/js/reasoning-effort.test.ts
new file mode 100644
index 0000000..33e699e
--- /dev/null
+++ b/js/reasoning-effort.test.ts
@@ -0,0 +1,96 @@
+import { expect, test, describe } from "vitest";
+
+import { LLMClassifierFromTemplate } from "./llm";
+
+describe("reasoning parameters", () => {
+  test("accepts reasoningEffort in LLMArgs", () => {
+    // This test just verifies that the type system accepts reasoningEffort.
+    // We don't actually call the API, to avoid requiring credentials in tests.
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Evaluate: {{output}}",
+      choiceScores: { good: 1, bad: 0 },
+      model: "o3-mini",
+      reasoningEffort: "high",
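+      // Construction only: the value is type-checked here and is forwarded to
+      // the request as reasoning_effort when the scorer is actually invoked.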
+    });
+
+    expect(classifier).toBeDefined();
+    expect(typeof classifier).toBe("function");
+  });
+
+  test("accepts all valid reasoningEffort values", () => {
+    const validValues: Array<
+      "minimal" | "low" | "medium" | "high" | null | undefined
+    > = ["minimal", "low", "medium", "high", null, undefined];
+
+    for (const value of validValues) {
+      const classifier = LLMClassifierFromTemplate({
+        name: "test",
+        promptTemplate: "Evaluate: {{output}}",
+        choiceScores: { good: 1, bad: 0 },
+        model: "o3-mini",
+        reasoningEffort: value,
+      });
+
+      expect(classifier).toBeDefined();
+    }
+  });
+
+  test("reasoningEffort can be passed at runtime", () => {
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Evaluate: {{output}}",
+      choiceScores: { good: 1, bad: 0 },
+      model: "o3-mini",
+    });
+
+    // TypeScript should also accept reasoningEffort in the runtime args of the
+    // returned scorer; verified at the type level only (a real call would
+    // require credentials).
+    expect(classifier).toBeDefined();
+  });
+
+  test("accepts reasoningEnabled parameter", () => {
+    // Test that the type system accepts reasoningEnabled
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Evaluate: {{output}}",
+      choiceScores: { good: 1, bad: 0 },
+      model: "claude-3-5-sonnet-20241022",
+      reasoningEnabled: true,
+    });
+
+    expect(classifier).toBeDefined();
+    expect(typeof classifier).toBe("function");
+  });
+
+  test("accepts reasoningBudget parameter", () => {
+    // Test that the type system accepts reasoningBudget
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Evaluate: {{output}}",
+      choiceScores: { good: 1, bad: 0 },
+      model: "claude-3-5-sonnet-20241022",
+      reasoningBudget: 2048,
+    });
+
+    expect(classifier).toBeDefined();
+    expect(typeof classifier).toBe("function");
+  });
+
+  test("accepts all reasoning parameters together", () => {
+    // Test that all reasoning parameters can be used together
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Evaluate: {{output}}",
+      choiceScores: { good: 1, bad: 0 },
+      model: "o3-mini",
+      reasoningEffort: "high",
+      reasoningEnabled: true,
+      reasoningBudget: 4096,
+    });
+
+    expect(classifier).toBeDefined();
+    expect(typeof classifier).toBe("function");
+  });
+});
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index 2431429..0bbc7d4 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -168,6 +168,9 @@ def __init__(
         render_args=None,
         max_tokens=None,
         temperature=None,
+        reasoning_effort=None,
+        reasoning_enabled=None,
+        reasoning_budget=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -189,6 +192,15 @@
         if max_tokens is not None:
             self.extra_args["max_tokens"] = max(max_tokens, 5)

+        if reasoning_effort is not None:
+            self.extra_args["reasoning_effort"] = reasoning_effort
+
+        if reasoning_enabled is not None:
+            self.extra_args["reasoning_enabled"] = reasoning_enabled
+
+        if reasoning_budget is not None:
+            self.extra_args["reasoning_budget"] = reasoning_budget
+
         self.render_args = {}
         if render_args:
             self.render_args.update(render_args)
@@ -311,6 +323,11 @@ class LLMClassifier(OpenAILLMClassifier):
         use_cot: Enable chain of thought reasoning. Defaults to True.
         max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
         temperature: Controls randomness (0-1). If not specified, uses the model's default.
+        reasoning_effort: Controls reasoning depth for o-series models (e.g., "low", "medium", "high").
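+            Passed through to the completion request unchanged, so set it only
+            for models that accept it. Defaults to None.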
+        reasoning_enabled: Enable extended thinking for supported models (e.g., Claude). Defaults to None.
+        reasoning_budget: Token budget for the model's internal reasoning. Defaults to None.
         engine: Deprecated by OpenAI. Use model instead.
         api_key: Deprecated. Use client instead.
         base_url: Deprecated. Use client instead.
@@ -329,6 +344,9 @@ def __init__(
         use_cot=True,
         max_tokens=None,
         temperature=None,
+        reasoning_effort=None,
+        reasoning_enabled=None,
+        reasoning_budget=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -356,6 +374,9 @@
             classification_tools=build_classification_tools(use_cot, choice_strings),
             max_tokens=max_tokens,
             temperature=temperature,
+            reasoning_effort=reasoning_effort,
+            reasoning_enabled=reasoning_enabled,
+            reasoning_budget=reasoning_budget,
             engine=engine,
             api_key=api_key,
             base_url=base_url,
diff --git a/py/autoevals/test_reasoning_effort.py b/py/autoevals/test_reasoning_effort.py
new file mode 100644
index 0000000..c18950a
--- /dev/null
+++ b/py/autoevals/test_reasoning_effort.py
@@ -0,0 +1,121 @@
+"""Tests for reasoning parameter support."""
+
+from autoevals.llm import LLMClassifier
+
+
+def test_reasoning_effort_in_constructor():
+    """Test that LLMClassifier accepts the reasoning_effort parameter."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="o3-mini",
+        reasoning_effort="high",
+    )
+
+    assert classifier is not None
+    assert classifier.extra_args.get("reasoning_effort") == "high"
+
+
+def test_reasoning_effort_values():
+    """Test that all valid reasoning_effort values are accepted."""
+    valid_values = ["minimal", "low", "medium", "high", None]
+
+    for value in valid_values:
+        classifier = LLMClassifier(
+            name="test",
+            prompt_template="Evaluate: {{output}}",
+            choice_scores={"good": 1, "bad": 0},
+            model="o3-mini",
+            reasoning_effort=value,
+        )
+
+        assert classifier is not None
+        if value is not None:
+            assert classifier.extra_args.get("reasoning_effort") == value
+        else:
+            assert "reasoning_effort" not in classifier.extra_args
+
+
+def test_reasoning_effort_not_set_by_default():
+    """Test that reasoning_effort is not set when not provided."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="o3-mini",
+    )
+
+    assert "reasoning_effort" not in classifier.extra_args
+
+
+def test_reasoning_enabled_in_constructor():
+    """Test that LLMClassifier accepts the reasoning_enabled parameter."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="claude-3-5-sonnet-20241022",
+        reasoning_enabled=True,
+    )
+
+    assert classifier is not None
+    assert classifier.extra_args.get("reasoning_enabled") is True
+
+
+def test_reasoning_budget_in_constructor():
+    """Test that LLMClassifier accepts the reasoning_budget parameter."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="claude-3-5-sonnet-20241022",
+        reasoning_budget=2048,
+    )
+
+    assert classifier is not None
+    assert classifier.extra_args.get("reasoning_budget") == 2048
+
+
+def test_all_reasoning_parameters():
+    """Test that all reasoning parameters can be used together."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="o3-mini",
+        reasoning_effort="high",
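+        # OpenAI-style effort plus Anthropic-style enabled/budget: combined here
+        # only to exercise the constructor; a real model typically takes one family.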
+        reasoning_enabled=True,
+        reasoning_budget=4096,
+    )
+
+    assert classifier is not None
+    assert classifier.extra_args.get("reasoning_effort") == "high"
+    assert classifier.extra_args.get("reasoning_enabled") is True
+    assert classifier.extra_args.get("reasoning_budget") == 4096
+
+
+def test_reasoning_enabled_not_set_by_default():
+    """Test that reasoning_enabled is not set when not provided."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="claude-3-5-sonnet-20241022",
+    )
+
+    assert "reasoning_enabled" not in classifier.extra_args
+
+
+def test_reasoning_budget_not_set_by_default():
+    """Test that reasoning_budget is not set when not provided."""
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Evaluate: {{output}}",
+        choice_scores={"good": 1, "bad": 0},
+        model="claude-3-5-sonnet-20241022",
+    )
+
+    assert "reasoning_budget" not in classifier.extra_args
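
---

Usage sketch (reviewer note, not part of the diff): the new parameters only
need to be set at construction time; each classifier copies whatever is
provided into extra_args, and those keys ride along on the completion request
unchanged. A minimal example, assuming credentials are configured and using
placeholder prompts and model names:

    from autoevals.llm import LLMClassifier

    # OpenAI-style: dial reasoning depth with reasoning_effort.
    effort_grader = LLMClassifier(
        name="relevance",
        prompt_template="Is this answer relevant? {{output}}",
        choice_scores={"relevant": 1, "irrelevant": 0},
        model="o3-mini",
        reasoning_effort="low",
    )

    # Anthropic-style extended thinking: switch it on and cap its token budget.
    thinking_grader = LLMClassifier(
        name="relevance",
        prompt_template="Is this answer relevant? {{output}}",
        choice_scores={"relevant": 1, "irrelevant": 0},
        model="claude-3-5-sonnet-20241022",
        reasoning_enabled=True,
        reasoning_budget=2048,
    )

    result = effort_grader(output="Paris is the capital of France.")
    print(result.score)  # 1 for "relevant", 0 for "irrelevant"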