33 changes: 32 additions & 1 deletion js/llm.ts
@@ -11,6 +11,7 @@ import {
   ChatCompletionMessageParam,
   ChatCompletionTool,
 } from "openai/resources";
+import type { ReasoningEffort } from "openai/resources/shared";
 import { makePartial, ScorerWithPartial } from "./partial";
 import { renderMessages } from "./render-messages";

@@ -23,6 +24,9 @@ const COT_SUFFIX =
 export type LLMArgs = {
   maxTokens?: number;
   temperature?: number;
+  reasoningEffort?: ReasoningEffort;
+  reasoningEnabled?: boolean;
+  reasoningBudget?: number;
 } & OpenAIAuth;

 /**
@@ -113,17 +117,35 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     classificationTools: classificationTools,
     maxTokens,
     temperature,
+    reasoningEffort,
+    reasoningEnabled,
+    reasoningBudget,
     cache,
     ...remainingRenderArgs
   } = remaining;

-  const extraArgs: { temperature?: number; max_tokens?: number } = {};
+  const extraArgs: {
+    temperature?: number;
+    max_tokens?: number;
+    reasoning_effort?: ReasoningEffort;
+    reasoning_enabled?: boolean;
+    reasoning_budget?: number;
+  } = {};
   if (temperature !== undefined) {
     extraArgs.temperature = temperature;
   }
   if (maxTokens !== undefined) {
     extraArgs.max_tokens = maxTokens;
   }
+  if (reasoningEffort !== undefined) {
+    extraArgs.reasoning_effort = reasoningEffort;
+  }
+  if (reasoningEnabled !== undefined) {
+    extraArgs.reasoning_enabled = reasoningEnabled;
+  }
+  if (reasoningBudget !== undefined) {
+    extraArgs.reasoning_budget = reasoningBudget;
+  }

   const renderArgs = {
     output,
@@ -216,6 +238,9 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   useCoT: useCoTArg,
   temperature,
   maxTokens: maxTokensArg,
+  reasoningEffort,
+  reasoningEnabled,
+  reasoningBudget,
 }: {
   name: string;
   promptTemplate: string;
@@ -224,6 +249,9 @@
   useCoT?: boolean;
   temperature?: number;
   maxTokens?: number;
+  reasoningEffort?: ReasoningEffort;
+  reasoningEnabled?: boolean;
+  reasoningBudget?: number;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -252,6 +280,9 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       model,
       maxTokens,
       temperature,
+      reasoningEffort,
+      reasoningEnabled,
+      reasoningBudget,
       __choices: choiceStrings,
       ...runtimeArgs,

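For orientation (not part of this diff): a sketch of how a caller might use the new options, assuming the public `autoevals` package export of `LLMClassifierFromTemplate`. The scorer name, prompt, choices, and model are placeholders; per the mapping above, `reasoningEffort` is forwarded to the request as `reasoning_effort`.

import { LLMClassifierFromTemplate } from "autoevals";

// Hypothetical grader exercising the new reasoning knobs.
// reasoningEffort targets OpenAI reasoning models; reasoningEnabled and
// reasoningBudget target providers with extended thinking (e.g., Claude).
const grader = LLMClassifierFromTemplate({
  name: "relevance",
  promptTemplate: "Evaluate: {{output}}",
  choiceScores: { good: 1, bad: 0 },
  model: "o3-mini",
  reasoningEffort: "medium",
});

Because all three options are optional, existing scorers are unaffected: the snake_case fields are only attached to the request when a value is provided.
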
2 changes: 2 additions & 0 deletions js/oai.ts
@@ -4,6 +4,7 @@ import {
   ChatCompletionTool,
   ChatCompletionToolChoiceOption,
 } from "openai/resources";
+import { ReasoningEffort } from "openai/resources/shared";
 import { AzureOpenAI, OpenAI } from "openai";

 export interface CachedLLMParams {
@@ -17,6 +18,7 @@
   tool_choice?: ChatCompletionToolChoiceOption;
   temperature?: number;
   max_tokens?: number;
+  reasoning_effort?: ReasoningEffort;
   span_info?: {
     spanAttributes?: Record<string, string>;
   };
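As a reading aid (also not part of the diff), the request-shaped value that flows through this layer would look roughly like the sketch below. It assumes the usual `model`/`messages` fields sitting above this hunk; note that only `reasoning_effort` joins `CachedLLMParams` here, while `reasoning_enabled` and `reasoning_budget` ride along via the classifier's extra args.

import type { CachedLLMParams } from "./oai";

// Illustrative params object; all values are placeholders.
const params: CachedLLMParams = {
  model: "o3-mini",
  messages: [{ role: "user", content: "Evaluate: hello" }],
  max_tokens: 512,
  reasoning_effort: "low",
};
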
95 changes: 95 additions & 0 deletions js/reasoning-effort.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import { expect, test, describe } from "vitest";
import { LLMClassifierFromTemplate } from "./llm";
import { Score } from "./score";

describe("reasoning parameters", () => {
  test("accepts reasoningEffort in LLMArgs", () => {
    // This test just verifies that the type system accepts reasoningEffort.
    // We don't actually call the API, to avoid requiring credentials in tests.
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "o3-mini",
      reasoningEffort: "high",
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });

  test("accepts all valid reasoningEffort values", () => {
    const validValues: Array<
      "minimal" | "low" | "medium" | "high" | null | undefined
    > = ["minimal", "low", "medium", "high", null, undefined];

    for (const value of validValues) {
      const classifier = LLMClassifierFromTemplate({
        name: "test",
        promptTemplate: "Evaluate: {{output}}",
        choiceScores: { good: 1, bad: 0 },
        model: "o3-mini",
        reasoningEffort: value,
      });

      expect(classifier).toBeDefined();
    }
  });

  test("reasoningEffort can be passed at runtime", () => {
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "o3-mini",
    });

    // TypeScript should allow passing reasoningEffort at runtime.
    // This verifies the type allows it (an actual API call would require credentials).
    expect(classifier).toBeDefined();
  });

  test("accepts reasoningEnabled parameter", () => {
    // Test that the type system accepts reasoningEnabled
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "claude-3-5-sonnet-20241022",
      reasoningEnabled: true,
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });

  test("accepts reasoningBudget parameter", () => {
    // Test that the type system accepts reasoningBudget
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "claude-3-5-sonnet-20241022",
      reasoningBudget: 2048,
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });

  test("accepts all reasoning parameters together", () => {
    // Test that all reasoning parameters can be used together
    const classifier = LLMClassifierFromTemplate({
      name: "test",
      promptTemplate: "Evaluate: {{output}}",
      choiceScores: { good: 1, bad: 0 },
      model: "o3-mini",
      reasoningEffort: "high",
      reasoningEnabled: true,
      reasoningBudget: 4096,
    });

    expect(classifier).toBeDefined();
    expect(typeof classifier).toBe("function");
  });
});
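The runtime test above only asserts that construction succeeds. A sketch of what a per-call override might look like (placeholder strings; a real call needs OpenAI credentials):

import { LLMClassifierFromTemplate } from "autoevals";

const classifier = LLMClassifierFromTemplate({
  name: "test",
  promptTemplate: "Evaluate: {{output}}",
  choiceScores: { good: 1, bad: 0 },
  model: "o3-mini",
});

// Runtime args are spread into the request (`...runtimeArgs` in llm.ts),
// so reasoningEffort can also be supplied, or overridden, per invocation.
const score = await classifier({
  output: "The capital of France is Paris.",
  expected: "Paris",
  reasoningEffort: "low",
});
console.log(score.score); // 1 for "good", 0 for "bad" per choiceScores
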
21 changes: 21 additions & 0 deletions py/autoevals/llm.py
@@ -168,6 +168,9 @@ def __init__(
         render_args=None,
         max_tokens=None,
         temperature=None,
+        reasoning_effort=None,
+        reasoning_enabled=None,
+        reasoning_budget=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -189,6 +192,15 @@
         if max_tokens is not None:
             self.extra_args["max_tokens"] = max(max_tokens, 5)

+        if reasoning_effort is not None:
+            self.extra_args["reasoning_effort"] = reasoning_effort
+
+        if reasoning_enabled is not None:
+            self.extra_args["reasoning_enabled"] = reasoning_enabled
+
+        if reasoning_budget is not None:
+            self.extra_args["reasoning_budget"] = reasoning_budget
+
         self.render_args = {}
         if render_args:
             self.render_args.update(render_args)
@@ -311,6 +323,9 @@ class LLMClassifier(OpenAILLMClassifier):
         use_cot: Enable chain of thought reasoning. Defaults to True.
         max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
         temperature: Controls randomness (0-1). If not specified, uses the model's default.
+        reasoning_effort: Controls reasoning depth for o-series models (e.g., "low", "medium", "high").
+        reasoning_enabled: Enable extended thinking for supported models (e.g., Claude). Defaults to None.
+        reasoning_budget: Token allocation for the model's internal reasoning. Defaults to None.
         engine: Deprecated by OpenAI. Use model instead.
         api_key: Deprecated. Use client instead.
         base_url: Deprecated. Use client instead.
@@ -329,6 +344,9 @@
         use_cot=True,
         max_tokens=None,
         temperature=None,
+        reasoning_effort=None,
+        reasoning_enabled=None,
+        reasoning_budget=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -356,6 +374,9 @@ def __init__(
             classification_tools=build_classification_tools(use_cot, choice_strings),
             max_tokens=max_tokens,
             temperature=temperature,
+            reasoning_effort=reasoning_effort,
+            reasoning_enabled=reasoning_enabled,
+            reasoning_budget=reasoning_budget,
             engine=engine,
             api_key=api_key,
             base_url=base_url,
121 changes: 121 additions & 0 deletions py/autoevals/test_reasoning_effort.py
@@ -0,0 +1,121 @@
"""Tests for reasoning parameter support."""

import pytest

from autoevals.llm import LLMClassifier


def test_reasoning_effort_in_constructor():
"""Test that LLMClassifier accepts reasoning_effort parameter."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
reasoning_effort="high",
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_effort") == "high"


def test_reasoning_effort_values():
"""Test that all valid reasoning_effort values are accepted."""
valid_values = ["minimal", "low", "medium", "high", None]

for value in valid_values:
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
reasoning_effort=value,
)

assert classifier is not None
if value is not None:
assert classifier.extra_args.get("reasoning_effort") == value
else:
assert "reasoning_effort" not in classifier.extra_args


def test_reasoning_effort_not_set_by_default():
"""Test that reasoning_effort is not set when not provided."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
)

assert "reasoning_effort" not in classifier.extra_args


def test_reasoning_enabled_in_constructor():
"""Test that LLMClassifier accepts reasoning_enabled parameter."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
reasoning_enabled=True,
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_enabled") is True


def test_reasoning_budget_in_constructor():
"""Test that LLMClassifier accepts reasoning_budget parameter."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
reasoning_budget=2048,
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_budget") == 2048


def test_all_reasoning_parameters():
"""Test that all reasoning parameters can be used together."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="o3-mini",
reasoning_effort="high",
reasoning_enabled=True,
reasoning_budget=4096,
)

assert classifier is not None
assert classifier.extra_args.get("reasoning_effort") == "high"
assert classifier.extra_args.get("reasoning_enabled") is True
assert classifier.extra_args.get("reasoning_budget") == 4096


def test_reasoning_enabled_not_set_by_default():
"""Test that reasoning_enabled is not set when not provided."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
)

assert "reasoning_enabled" not in classifier.extra_args


def test_reasoning_budget_not_set_by_default():
"""Test that reasoning_budget is not set when not provided."""
classifier = LLMClassifier(
name="test",
prompt_template="Evaluate: {{output}}",
choice_scores={"good": 1, "bad": 0},
model="claude-3-5-sonnet-20241022",
)

assert "reasoning_budget" not in classifier.extra_args