From 61177b7505062b19fbb285ed22427f9d0fa7510b Mon Sep 17 00:00:00 2001
From: Stephen Belanger
Date: Wed, 14 Jan 2026 09:15:54 +0800
Subject: [PATCH] Add default model configuration object to init()

The `defaultModel` option to init() (JS) and the `default_model` argument
(Python) now accept either a string or a configuration object, so default
models can be set per evaluation type:

```typescript
// String form (backward compatible): sets the default completion model
init({ defaultModel: "gpt-4-turbo" });

// Object form: configure completion and embedding defaults independently
init({
  defaultModel: {
    completion: "claude-3-5-sonnet-20241022",
    embedding: "text-embedding-3-large",
  },
});
```

Changes:
- `defaultModel` (JS) / `default_model` (Python) in init() accepts a string
  or an object
- The object form supports:
  - `completion`: Default model for LLM-as-a-judge evaluations
  - `embedding`: Default model for embedding-based evaluations
- The object form only updates the models that are explicitly provided; the
  string form sets the completion model and resets the embedding model to
  its default
- All embedding-based scorers now fall back to the configured default
  embedding model
- Added getDefaultEmbeddingModel() (JS) and get_default_embedding_model()
  (Python)
- Fully backward compatible with existing string `defaultModel` usage
- Added tests for both languages

Default values:
- Completion: "gpt-4o" (unchanged)
- Embedding: "text-embedding-ada-002"

Co-Authored-By: Claude Sonnet 4.5
---
 js/init-models.test.ts           | 103 +++++++++++++++++++++++++++++++
 js/oai.ts                        |  85 ++++++++++++++++++++++---
 js/ragas.ts                      |   4 +-
 js/string.ts                     |   4 +-
 py/autoevals/oai.py              |  83 ++++++++++++++++++++-----
 py/autoevals/ragas.py            |  21 ++++---
 py/autoevals/string.py           |   6 +-
 py/autoevals/test_init_models.py |  94 ++++++++++++++++++++++++++++
 8 files changed, 364 insertions(+), 36 deletions(-)
 create mode 100644 js/init-models.test.ts
 create mode 100644 py/autoevals/test_init_models.py

diff --git a/js/init-models.test.ts b/js/init-models.test.ts
new file mode 100644
index 0000000..92e2672
--- /dev/null
+++ b/js/init-models.test.ts
@@ -0,0 +1,103 @@
+import { expect, test, describe, beforeEach } from "vitest";
+import { init, getDefaultModel, getDefaultEmbeddingModel } from "./oai";
+import { OpenAI } from "openai";
+
+describe("init with defaultModel parameter", () => {
+  beforeEach(() => {
+    // Reset to defaults
+    init();
+  });
+
+  test("string form sets completion model (backward compatible)", () => {
+    init({
+      defaultModel: "gpt-4-turbo",
+    });
+
+    expect(getDefaultModel()).toBe("gpt-4-turbo");
+    expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002"); // Default
+  });
+
+  test("object form can set completion model only", () => {
+    init({
+      defaultModel: {
+        completion: "gpt-4-turbo",
+      },
+    });
+
+    expect(getDefaultModel()).toBe("gpt-4-turbo");
+  });
+
+  test("object form can set embedding model only", () => {
+    init({
+      defaultModel: {
+        embedding: "text-embedding-3-large",
+      },
+    });
+
+    expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large");
+    // Completion model should remain at default since we didn't update it
+    expect(getDefaultModel()).toBe("gpt-4o");
+  });
+
+  test("object form can set both models", () => {
+    init({
+      defaultModel: {
+        completion: "claude-3-5-sonnet-20241022",
+        embedding: "text-embedding-3-large",
+      },
+    });
+
+    expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022");
+    expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large");
+  });
+
+  test("partial updates preserve unspecified models", () => {
+    // First set completion model
+    init({
+      defaultModel: {
+        completion: "gpt-4-turbo",
+      },
+    });
+
+    expect(getDefaultModel()).toBe("gpt-4-turbo");
+    expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002");
+
+    // Then set only embedding model - completion should remain unchanged
+    init({
+      defaultModel: {
+        embedding: "text-embedding-3-large",
+      },
+    });
+
+    
expect(getDefaultModel()).toBe("gpt-4-turbo"); // Should still be gpt-4-turbo + expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large"); + }); + + test("falls back to defaults when not set", () => { + init(); + + expect(getDefaultModel()).toBe("gpt-4o"); + expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002"); + }); + + test("string form resets embedding model to default", () => { + // First set both models + init({ + defaultModel: { + completion: "gpt-4-turbo", + embedding: "text-embedding-3-large", + }, + }); + + expect(getDefaultModel()).toBe("gpt-4-turbo"); + expect(getDefaultEmbeddingModel()).toBe("text-embedding-3-large"); + + // Then use string form - should reset embedding to default + init({ + defaultModel: "claude-3-5-sonnet-20241022", + }); + + expect(getDefaultModel()).toBe("claude-3-5-sonnet-20241022"); + expect(getDefaultEmbeddingModel()).toBe("text-embedding-ada-002"); // Reset to default + }); +}); diff --git a/js/oai.ts b/js/oai.ts index bc0a762..c3702d7 100644 --- a/js/oai.ts +++ b/js/oai.ts @@ -150,6 +150,7 @@ declare global { var __inherited_braintrust_wrap_openai: ((openai: any) => any) | undefined; var __client: OpenAI | undefined; var __defaultModel: string | undefined; + var __defaultEmbeddingModel: string | undefined; } export interface InitOptions { @@ -160,17 +161,57 @@ export interface InitOptions { */ client?: OpenAI; /** - * The default model to use for evaluations when not specified per-call. - * Defaults to "gpt-4o" if not set. + * The default model(s) to use for evaluations when not specified per-call. + * + * Can be either: + * - A string (for backward compatibility): Sets the default completion model only. + * Defaults to "gpt-4o" if not set. + * - An object with `completion` and/or `embedding` properties: Allows setting + * default models for different evaluation types. Only the specified models + * are updated; others remain unchanged. * * When using non-OpenAI providers via the Braintrust proxy, set this to * the appropriate model string (e.g., "claude-3-5-sonnet-20241022"). + * + * @example + * // String form (backward compatible) + * init({ defaultModel: "gpt-4-turbo" }) + * + * @example + * // Object form: set both models + * init({ + * defaultModel: { + * completion: "claude-3-5-sonnet-20241022", + * embedding: "text-embedding-3-large" + * } + * }) + * + * @example + * // Object form: set only embedding model + * init({ + * defaultModel: { + * embedding: "text-embedding-3-large" + * } + * }) */ - defaultModel?: string; + defaultModel?: + | string + | { + /** + * Default model for LLM-as-a-judge evaluations (completion). + * Defaults to "gpt-4o" if not set. + */ + completion?: string; + /** + * Default model for embedding-based evaluations. + * Defaults to "text-embedding-ada-002" if not set. + */ + embedding?: string; + }; } /** - * Initialize autoevals with a custom client and/or default model. + * Initialize autoevals with a custom client and/or default models. 
* * @example * // Using with OpenAI (default) @@ -189,21 +230,51 @@ export interface InitOptions { * apiKey: process.env.BRAINTRUST_API_KEY, * baseURL: "https://api.braintrust.dev/v1/proxy", * }), - * defaultModel: "claude-3-5-sonnet-20241022", + * defaultModel: { + * completion: "claude-3-5-sonnet-20241022", + * embedding: "text-embedding-3-large", + * }, * }); + * + * @example + * // String form (backward compatible) + * init({ defaultModel: "gpt-4-turbo" }); */ export const init = ({ client, defaultModel }: InitOptions = {}) => { globalThis.__client = client; - globalThis.__defaultModel = defaultModel; + if (typeof defaultModel === "string") { + // String form: sets completion model only, resets embedding to default + globalThis.__defaultModel = defaultModel; + globalThis.__defaultEmbeddingModel = undefined; + } else if (defaultModel) { + // Object form: only update models that are explicitly provided + if ("completion" in defaultModel) { + globalThis.__defaultModel = defaultModel.completion; + } + if ("embedding" in defaultModel) { + globalThis.__defaultEmbeddingModel = defaultModel.embedding; + } + } else { + // No defaultModel: reset both to defaults + globalThis.__defaultModel = undefined; + globalThis.__defaultEmbeddingModel = undefined; + } }; /** - * Get the configured default model, or "gpt-4o" if not set. + * Get the configured default completion model, or "gpt-4o" if not set. */ export const getDefaultModel = (): string => { return globalThis.__defaultModel ?? "gpt-4o"; }; +/** + * Get the configured default embedding model, or "text-embedding-ada-002" if not set. + */ +export const getDefaultEmbeddingModel = (): string => { + return globalThis.__defaultEmbeddingModel ?? "text-embedding-ada-002"; +}; + export async function cachedChatCompletion( params: CachedLLMParams, options: { cache?: ChatCache } & OpenAIAuth, diff --git a/js/ragas.ts b/js/ragas.ts index ef2e1f4..727f574 100644 --- a/js/ragas.ts +++ b/js/ragas.ts @@ -103,7 +103,7 @@ import mustache from "mustache"; import { Scorer, ScorerArgs } from "./score"; import { LLMArgs } from "./llm"; -import { getDefaultModel } from "./oai"; +import { getDefaultModel, getDefaultEmbeddingModel } from "./oai"; import { buildOpenAIClient, extractOpenAIArgs } from "./oai"; import OpenAI from "openai"; import { ListContains } from "./list"; @@ -767,7 +767,7 @@ export const AnswerRelevancy: ScorerWithPartial< ...extractOpenAIArgs(args), output: question, expected: input, - model: args.embeddingModel, + model: args.embeddingModel ?? getDefaultEmbeddingModel(), }); return { question, score }; }), diff --git a/js/string.ts b/js/string.ts index 4b2198a..1ee21d5 100644 --- a/js/string.ts +++ b/js/string.ts @@ -1,6 +1,6 @@ import { Scorer, ScorerArgs } from "./score"; import levenshtein from "js-levenshtein"; -import { OpenAIAuth, buildOpenAIClient } from "./oai"; +import { OpenAIAuth, buildOpenAIClient, getDefaultEmbeddingModel } from "./oai"; import cossim from "compute-cosine-similarity"; import { makePartial, ScorerWithPartial } from "./partial"; @@ -69,7 +69,7 @@ export const EmbeddingSimilarity: ScorerWithPartial< [output, expected].map((input) => openai.embeddings.create({ input, - model: args.model ?? "text-embedding-ada-002", + model: args.model ?? 
getDefaultEmbeddingModel(), }), ), ); diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py index 33eef02..147ce24 100644 --- a/py/autoevals/oai.py +++ b/py/autoevals/oai.py @@ -7,11 +7,25 @@ from collections.abc import Callable from contextvars import ContextVar from dataclasses import dataclass -from typing import Any, Optional, Protocol, TypeVar, Union, cast, runtime_checkable +from typing import Any, Optional, Protocol, TypedDict, TypeVar, Union, cast, runtime_checkable PROXY_URL = "https://api.braintrust.dev/v1/proxy" +class DefaultModelConfig(TypedDict, total=False): + """Configuration for default models used by Autoevals. + + This is used when passing the object form of `default_model` to `init()`. + + Attributes: + completion: Default model for LLM-as-a-judge evaluations. + embedding: Default model for embedding-based evaluations. + """ + + completion: str + embedding: str + + @runtime_checkable class ChatCompletions(Protocol): create: Callable[..., Any] @@ -198,6 +212,7 @@ def is_wrapped(self) -> bool: _client_var = ContextVar[Optional[LLMClient]]("client") _default_model_var = ContextVar[Optional[str]]("default_model") +_default_embedding_model_var = ContextVar[Optional[str]]("default_embedding_model") T = TypeVar("T") @@ -239,8 +254,12 @@ def resolve_client(client: Client, is_async: bool = False) -> LLMClient: return LLMClient(openai=client, is_async=is_async) -def init(client: Client | None = None, is_async: bool = False, default_model: str | None = None): - """Initialize Autoevals with an optional custom LLM client and default model. +def init( + client: Client | None = None, + is_async: bool = False, + default_model: str | DefaultModelConfig | None = None, +): + """Initialize Autoevals with an optional custom LLM client and default models. This function sets up the global client context for Autoevals to use. If no client is provided, the default OpenAI client will be used. @@ -253,21 +272,25 @@ def init(client: Client | None = None, is_async: bool = False, default_model: st - OpenAIV1: Wrapped in a new LLMClient instance (OpenAI SDK v1) is_async: Whether to create a client with async operations. Defaults to False. Deprecated: Use the `client` argument directly with your desired async/sync configuration. - default_model: The default model to use for evaluations when not specified per-call. - Defaults to "gpt-4o" if not set. When using non-OpenAI providers via the Braintrust - proxy, set this to the appropriate model string (e.g., "claude-3-5-sonnet-20241022"). + default_model: The default model(s) to use for evaluations when not specified per-call. + Can be either: + - A string (for backward compatibility): Sets the default completion model only. + Defaults to "gpt-4o" if not set. + - A dictionary with "completion" and/or "embedding" keys: Allows setting default + models for different evaluation types. Only the specified models are updated; + others remain unchanged. + + When using non-OpenAI providers via the Braintrust proxy, set this to the + appropriate model string (e.g., "claude-3-5-sonnet-20241022"). 
Example: - Using with OpenAI (default):: + String form (backward compatible):: - from openai import OpenAI from autoevals import init + init(default_model="gpt-4-turbo") - init(client=OpenAI()) - - Using with Anthropic via Braintrust proxy:: + Object form - set both models:: - import os from openai import OpenAI from autoevals import init @@ -276,18 +299,48 @@ def init(client: Client | None = None, is_async: bool = False, default_model: st api_key=os.environ["BRAINTRUST_API_KEY"], base_url="https://api.braintrust.dev/v1/proxy", ), - default_model="claude-3-5-sonnet-20241022", + default_model={ + "completion": "claude-3-5-sonnet-20241022", + "embedding": "text-embedding-3-large", + }, + ) + + Object form - set only embedding model:: + + init( + default_model={ + "embedding": "text-embedding-3-large", + } ) """ _client_var.set(resolve_client(client, is_async=is_async) if client else None) - _default_model_var.set(default_model) + + if isinstance(default_model, str): + # String form: sets completion model only, resets embedding to default + _default_model_var.set(default_model) + _default_embedding_model_var.set(None) + elif default_model: + # Object form: only update models that are explicitly provided + if "completion" in default_model: + _default_model_var.set(default_model["completion"]) + if "embedding" in default_model: + _default_embedding_model_var.set(default_model["embedding"]) + else: + # No default_model: reset both to defaults + _default_model_var.set(None) + _default_embedding_model_var.set(None) def get_default_model() -> str: - """Get the configured default model, or "gpt-4o" if not set.""" + """Get the configured default completion model, or "gpt-4o" if not set.""" return _default_model_var.get(None) or "gpt-4o" +def get_default_embedding_model() -> str: + """Get the configured default embedding model, or "text-embedding-ada-002" if not set.""" + return _default_embedding_model_var.get(None) or "text-embedding-ada-002" + + warned_deprecated_api_key_base_url = False diff --git a/py/autoevals/ragas.py b/py/autoevals/ragas.py index 794ab03..fd98719 100644 --- a/py/autoevals/ragas.py +++ b/py/autoevals/ragas.py @@ -113,7 +113,14 @@ def context_relevancy_scorer(output, expected, input, metadata): from . import Score from .list import ListContains from .llm import OpenAILLMScorer -from .oai import Client, _default_model_var, arun_cached_request, get_default_model, run_cached_request +from .oai import ( + Client, + _default_model_var, + arun_cached_request, + get_default_embedding_model, + get_default_model, + run_cached_request, +) from .string import EmbeddingSimilarity @@ -147,8 +154,6 @@ def _get_model(model: str | None) -> str: return DEFAULT_RAGAS_MODEL -DEFAULT_RAGAS_EMBEDDING_MODEL = "text-embedding-3-small" - ENTITY_PROMPT = """Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity. The output should be a well-formatted JSON instance that conforms to the JSON schema below. 
@@ -1133,7 +1138,7 @@ def __init__( model: str | None = None, strictness=3, temperature=0.5, - embedding_model=DEFAULT_RAGAS_EMBEDDING_MODEL, + embedding_model=None, client: Client | None = None, **kwargs, ): @@ -1196,7 +1201,9 @@ def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwar for _ in range(self.strictness) ] similarity = [ - EmbeddingSimilarity(client=self.client).eval(output=q["question"], expected=input, model=self.model) + EmbeddingSimilarity(client=self.client).eval( + output=q["question"], expected=input, model=self.embedding_model + ) for q in questions ] @@ -1234,7 +1241,7 @@ class AnswerSimilarity(OpenAILLMScorer): def __init__( self, pairwise_scorer=None, - model=DEFAULT_RAGAS_EMBEDDING_MODEL, + model=None, client: Client | None = None, **kwargs, ): @@ -1388,7 +1395,7 @@ def __init__( self.model = _get_model(model) self.answer_similarity = answer_similarity or AnswerSimilarity( - model=embedding_model if embedding_model is not None else DEFAULT_RAGAS_EMBEDDING_MODEL, + model=embedding_model, client=client, ) diff --git a/py/autoevals/string.py b/py/autoevals/string.py index a6dc39c..92d9c3a 100644 --- a/py/autoevals/string.py +++ b/py/autoevals/string.py @@ -24,7 +24,7 @@ from autoevals.partial import ScorerWithPartial from autoevals.value import normalize_value -from .oai import LLMClient, arun_cached_request, run_cached_request +from .oai import LLMClient, arun_cached_request, get_default_embedding_model, run_cached_request from .score import Score @@ -115,7 +115,7 @@ async def compare_texts(): def __init__( self, prefix="", - model=MODEL, + model=None, expected_min=0.7, api_key=None, base_url=None, @@ -124,7 +124,7 @@ def __init__( self.prefix = prefix self.expected_min = expected_min - self.extra_args = {"model": model} + self.extra_args = {"model": model if model is not None else get_default_embedding_model()} if api_key: self.extra_args["api_key"] = api_key if base_url: diff --git a/py/autoevals/test_init_models.py b/py/autoevals/test_init_models.py new file mode 100644 index 0000000..5a6242d --- /dev/null +++ b/py/autoevals/test_init_models.py @@ -0,0 +1,94 @@ +"""Tests for init() with default_model parameter supporting both string and object forms.""" + +import pytest + +from autoevals import init +from autoevals.oai import get_default_embedding_model, get_default_model + + +@pytest.fixture(autouse=True) +def reset_state(): + """Reset global state before each test.""" + init() + yield + init() + + +def test_string_form_sets_completion_model_backward_compatible(): + """Test that string form sets completion model (backward compatible).""" + init(default_model="gpt-4-turbo") + + assert get_default_model() == "gpt-4-turbo" + assert get_default_embedding_model() == "text-embedding-ada-002" # Default + + +def test_object_form_can_set_completion_model_only(): + """Test that object form can set only completion model.""" + init(default_model={"completion": "gpt-4-turbo"}) + + assert get_default_model() == "gpt-4-turbo" + + +def test_object_form_can_set_embedding_model_only(): + """Test that object form can set only embedding model.""" + init(default_model={"embedding": "text-embedding-3-large"}) + + assert get_default_embedding_model() == "text-embedding-3-large" + # Completion model should remain at default since we didn't update it + assert get_default_model() == "gpt-4o" + + +def test_object_form_can_set_both_models(): + """Test that object form can set both models.""" + init( + default_model={ + "completion": "claude-3-5-sonnet-20241022", + 
"embedding": "text-embedding-3-large", + } + ) + + assert get_default_model() == "claude-3-5-sonnet-20241022" + assert get_default_embedding_model() == "text-embedding-3-large" + + +def test_partial_updates_preserve_unspecified_models(): + """Test that partial updates preserve models that are not explicitly set.""" + # First set completion model + init(default_model={"completion": "gpt-4-turbo"}) + + assert get_default_model() == "gpt-4-turbo" + assert get_default_embedding_model() == "text-embedding-ada-002" + + # Then set only embedding model - completion should remain unchanged + init(default_model={"embedding": "text-embedding-3-large"}) + + assert get_default_model() == "gpt-4-turbo" # Should still be gpt-4-turbo + assert get_default_embedding_model() == "text-embedding-3-large" + + +def test_falls_back_to_defaults_when_not_set(): + """Test that defaults are used when default_model is not provided.""" + init() + + assert get_default_model() == "gpt-4o" + assert get_default_embedding_model() == "text-embedding-ada-002" + + +def test_string_form_resets_embedding_model_to_default(): + """Test that string form resets embedding model to default.""" + # First set both models + init( + default_model={ + "completion": "gpt-4-turbo", + "embedding": "text-embedding-3-large", + } + ) + + assert get_default_model() == "gpt-4-turbo" + assert get_default_embedding_model() == "text-embedding-3-large" + + # Then use string form - should reset embedding to default + init(default_model="claude-3-5-sonnet-20241022") + + assert get_default_model() == "claude-3-5-sonnet-20241022" + assert get_default_embedding_model() == "text-embedding-ada-002" # Reset to default