From 000aaf25deccd99ff86e76c4a63da92504ab3c79 Mon Sep 17 00:00:00 2001 From: lambda Date: Tue, 23 Dec 2025 23:00:10 +0530 Subject: [PATCH 01/14] initial spec --- Cargo.lock | 12 + Cargo.toml | 2 +- crates/rullm-anthropic/Cargo.toml | 7 + crates/rullm-anthropic/spec/implementation.md | 256 ++++++++++++++++++ crates/rullm-anthropic/spec/message-api.md | 130 +++++++++ crates/rullm-anthropic/src/lib.rs | 14 + crates/rullm-openai/Cargo.toml | 7 + .../rullm-openai/spec}/chat-completion.md | 0 .../rullm-openai/spec}/chat-completion2.md | 0 crates/rullm-openai/src/lib.rs | 26 ++ spec/chat-completion-comparison.md | 234 ++++++++++++++++ spec/chat-completion-comparison2.md | 69 +++++ 12 files changed, 756 insertions(+), 1 deletion(-) create mode 100644 crates/rullm-anthropic/Cargo.toml create mode 100644 crates/rullm-anthropic/spec/implementation.md create mode 100644 crates/rullm-anthropic/spec/message-api.md create mode 100644 crates/rullm-anthropic/src/lib.rs create mode 100644 crates/rullm-openai/Cargo.toml rename {spec => crates/rullm-openai/spec}/chat-completion.md (100%) rename {spec => crates/rullm-openai/spec}/chat-completion2.md (100%) create mode 100644 crates/rullm-openai/src/lib.rs create mode 100644 spec/chat-completion-comparison.md create mode 100644 spec/chat-completion-comparison2.md diff --git a/Cargo.lock b/Cargo.lock index ff8613cd..e6979e88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1723,6 +1723,14 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rullm-anthropic" +version = "0.1.0" + +[[package]] +name = "rullm-chat-completion" +version = "0.1.0" + [[package]] name = "rullm-cli" version = "0.1.0" @@ -1783,6 +1791,10 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "rullm-gemini" +version = "0.1.0" + [[package]] name = "rustc-demangle" version = "0.1.26" diff --git a/Cargo.toml b/Cargo.toml index 26ba3f6a..adb7bacc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["crates/rullm-core", "crates/rullm-cli"] +members = ["crates/rullm-core", "crates/rullm-cli", "crates/rullm-openai", "crates/rullm-anthropic"] resolver = "2" [workspace.package] diff --git a/crates/rullm-anthropic/Cargo.toml b/crates/rullm-anthropic/Cargo.toml new file mode 100644 index 00000000..984c5b2c --- /dev/null +++ b/crates/rullm-anthropic/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "rullm-anthropic" +version.workspace = true +edition.workspace = true +rust-version.workspace = true + +[dependencies] diff --git a/crates/rullm-anthropic/spec/implementation.md b/crates/rullm-anthropic/spec/implementation.md new file mode 100644 index 00000000..5f945428 --- /dev/null +++ b/crates/rullm-anthropic/spec/implementation.md @@ -0,0 +1,256 @@ +# Rust SDK port notes (implementation guidance) + +This document captures what we need to know to port the official Anthropic SDK to Rust. It is based on: +- The local `reference.md` spec +- The Go SDK (strong typed model, helpers, timeout logic) +- The Python and TypeScript SDKs (streaming helpers, ergonomics) + +The goal is to provide a Rust API that is feature-parity with the official SDKs while fitting the rullm workspace style. 
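+
+As a rough sketch of the ergonomics this port could aim for (all names here, `Client::from_env`, `messages()`, `MessageCreateParams`, are hypothetical placeholders pending the design below):
+
+```rust
+// Hypothetical target surface for the Rust port; not an existing API.
+use rullm_anthropic::{Client, MessageCreateParams};
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Would read ANTHROPIC_API_KEY / ANTHROPIC_BASE_URL, mirroring the official SDKs.
+    let client = Client::from_env()?;
+    let message = client
+        .messages()
+        .create(MessageCreateParams {
+            model: "claude-sonnet-4-5".into(),
+            max_tokens: 1024,
+            messages: vec![/* user/assistant turns */],
+            ..Default::default()
+        })
+        .await?;
+    println!("{:?}", message.content);
+    Ok(())
+}
+```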
+ +## 1) Scope and parity targets + +Minimum parity for a first pass (mirrors Go/TS/Python): +- Core API client with config and auth +- `messages`: + - create (non-stream) + - create with streaming + - stream helper (aggregates raw SSE events) + - count_tokens + - message batches (create/get/list/cancel/delete/results) +- `models` (list/get) +- `completions` (legacy API) +- `beta` resources (optional, via `anthropic-beta` header) + +Optional parity: +- Bedrock / Vertex clients (Go/Python include these) +- Helpers for prompt caching +- Deprecated model warnings + +## 2) Client configuration and auth + +Match official SDK behavior: +- Env vars: + - `ANTHROPIC_API_KEY` + - `ANTHROPIC_AUTH_TOKEN` + - `ANTHROPIC_BASE_URL` +- Base URL default: `https://api.anthropic.com` +- Headers: + - `anthropic-version: 2023-06-01` + - `X-Api-Key` OR `Authorization: Bearer ` +- Provide per-request overrides: + - timeout + - extra headers + - extra query params + - extra body fields + +Recommendation: +- Build a `Client` struct similar to Go: + - `Client::new(api_key, auth_token, base_url, options...)` + - `Client::from_env()` + - sub-services: `messages`, `models`, `completions`, `beta` + +## 3) HTTP layer and retries + +The official SDKs include retry support and expose: +- `max_retries` +- default timeout + +Rust port should: +- Use `reqwest` (already in workspace) +- Support global and per-request timeout +- Expose retry policy (even a simple fixed retry is OK initially) +- Surface `request-id` header in responses + +## 4) Serialization strategy (serde) + +The Messages API relies heavily on tagged unions. Use: +- `#[serde(tag = "type", rename_all = "snake_case")]` for unions with a `type` discriminator +- `#[serde(untagged)]` for unions like `string | [blocks]` +- `serde_json::Value` for tool input and JSON Schema fields + +Suggested core enums: +- `ContentBlock` (output) +- `ContentBlockParam` (input) +- `ToolUnion` / `ToolChoice` +- `ThinkingConfig` +- `Citation` and citation location variants +- `MessageStreamEvent` and delta variants + +Notes from SDKs: +- Go avoids the `string` shorthand for `messages.content` (requires blocks). +- Python/TS accept `string | [blocks]`. +- Rust can support both by using `#[serde(untagged)]` plus helpers that convert strings to text blocks. 
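+
+A minimal serde sketch of the `string | [blocks]` shorthand from the last note above (type names are placeholders, not the final public types):
+
+```rust
+use serde::{Deserialize, Serialize};
+
+// Stand-in for the full input content-block union described above.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum ContentBlockParam {
+    Text { text: String },
+    // image, document, tool_result, ... would follow the same pattern.
+}
+
+// `string | [blocks]`: untagged, so serde tries each representation in order.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum MessageContent {
+    Text(String),
+    Blocks(Vec<ContentBlockParam>),
+}
+
+impl From<&str> for MessageContent {
+    // Helper that turns the string shorthand into a single text block.
+    fn from(s: &str) -> Self {
+        MessageContent::Blocks(vec![ContentBlockParam::Text { text: s.to_owned() }])
+    }
+}
+```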
+ +## 5) Messages API design in Rust + +### 5.1 Request structs +Required: +- `model: String` (or `Model` wrapper) +- `max_tokens: u32` +- `messages: Vec` + +Optional: +- `system: String | Vec` +- `metadata` +- `stop_sequences` +- `temperature`, `top_p`, `top_k` +- `tools`, `tool_choice` +- `thinking` +- `service_tier` +- `stream` + +### 5.2 Response structs +`Message` includes: +- `id`, `type`, `role`, `model` +- `content: Vec` +- `stop_reason`, `stop_sequence` +- `usage` (input/output tokens, cache fields, service_tier, server_tool_use) + +### 5.3 Tooling types +Support both custom and server tools: +- Custom: `name`, `description?`, `input_schema` +- Server tools: `bash_20250124`, `text_editor_20250124/20250429/20250728`, `web_search_20250305` +Tool choice union: +- `auto`, `any`, `tool`, `none` +- `disable_parallel_tool_use` boolean + +### 5.4 Content blocks (input) +At minimum: +- `text` +- `image` (base64 or url) +- `document` (pdf base64 or url, plain text, or embedded content) +- `search_result` +- `tool_result` + +Advanced (for parity): +- `tool_use` (rare in input, but used for continuity) +- `server_tool_use` +- `web_search_tool_result` +- `thinking`, `redacted_thinking` + +## 6) Streaming support + +### 6.1 Raw SSE +Streaming sends SSE events, each with a JSON object containing a `type`: +- `message_start` +- `content_block_start` +- `content_block_delta` +- `content_block_stop` +- `message_delta` +- `message_stop` + +`content_block_delta` variants: +- `text_delta` +- `input_json_delta` (tool input) +- `citations_delta` +- `thinking_delta` +- `signature_delta` + +### 6.2 Stream helper (recommended) +Python/TS expose a higher-level stream helper that: +- Accumulates a `Message` snapshot +- Emits derived events (`text`, `citation`, `thinking`, `input_json`, etc.) +- Provides `text_stream` and `get_final_message()` helpers + +Rust port should consider: +- `MessageStream` wrapper that consumes raw SSE events +- `MessageStream::text_stream()` yields only text deltas +- `MessageStream::final_message()` returns accumulated message + +### 6.3 Partial JSON parsing +`input_json_delta` sends partial JSON strings. 
+- TS uses a partial JSON parser +- Python uses `jiter` partial parsing + +Rust options: +- Accumulate raw JSON text per tool-use block and parse on each delta +- Use a partial JSON parser crate if available + +Maintain both: +- `partial_json` text buffer +- best-effort parsed `serde_json::Value` snapshot + +## 7) Timeout behavior (important) + +Go and TS enforce a non-streaming timeout: +- Default non-streaming timeout: 10 minutes +- `expected_time = 1h * max_tokens / 128000` +- If `expected_time > 10 minutes`, or `max_tokens` exceeds a model-specific limit, require streaming + +Port should: +- Include a `MODEL_NONSTREAMING_TOKENS` map (from SDKs) +- Compute timeout and error when streaming is required +- Allow caller override for timeout + +## 8) Errors + +Error responses are structured: +- `error: { type, message }` +- `request_id` + +Types include: +- `invalid_request_error` +- `authentication_error` +- `billing_error` +- `permission_error` +- `not_found_error` +- `rate_limit_error` +- `timeout_error` +- `api_error` +- `overloaded_error` + +Rust error enum should: +- Preserve HTTP status +- Preserve `request_id` +- Keep raw body for debugging + +## 9) Pagination + +List endpoints use cursor params: +- `after_id`, `before_id`, `limit` + +Responses include: +- `data: []` +- `has_more` +- `first_id`, `last_id` + +Provide a `Page` with cursor helpers, similar to Go's pagination module. + +## 10) Deprecation warnings (optional) + +Python/TS warn on deprecated models using a known list. +Rust port can: +- Maintain a `DEPRECATED_MODELS` map +- Emit warnings (log or `eprintln!`) + +## 11) Beta support + +Go SDK includes `beta` services and uses the `anthropic-beta` header. +For parity: +- Allow optional `betas: Vec` header +- Provide beta resources where needed (models/messages/files) + +## 12) Suggested module layout + +``` +crates/rullm-anthropic/src/ + client.rs // Client config, auth, request builder + error.rs // Error types and mapping + resources/ + messages.rs // create, stream, count_tokens + message_batches.rs // create/get/list/cancel/delete/results + models.rs + completions.rs + beta/... + types/ + message.rs + content_block.rs + tool.rs + streaming.rs + streaming/ + sse.rs // SSE parser or reuse rullm-core + message_stream.rs // high-level aggregator +``` + +This layout mirrors the official SDKs while fitting Rust conventions. + diff --git a/crates/rullm-anthropic/spec/message-api.md b/crates/rullm-anthropic/spec/message-api.md new file mode 100644 index 00000000..585600d5 --- /dev/null +++ b/crates/rullm-anthropic/spec/message-api.md @@ -0,0 +1,130 @@ +# Anthropic Messages API (high-level) + +This is a concise, implementation-focused overview of the Messages API as used by the official SDKs (Go, Python, TypeScript) and the local `reference.md`. 
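+
+For orientation, a minimal create-message request body contains only the required fields described in the sections below; a sketch (the `serde_json::json!` form is purely illustrative):
+
+```rust
+// Illustrative only: the minimal body for POST /v1/messages.
+fn minimal_request() -> serde_json::Value {
+    serde_json::json!({
+        "model": "claude-sonnet-4-5",
+        "max_tokens": 1024,
+        "messages": [
+            { "role": "user", "content": "Hello, Claude" }
+        ]
+    })
+}
+```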
+ +## Endpoints + +- `POST /v1/messages` + - Create a message (non-streaming) + - For streaming: include `"stream": true` in the request body +- `POST /v1/messages/count_tokens` + - Count input tokens for a request without generating output +- `POST /v1/messages/batches` and related batch endpoints + - Asynchronous batch processing for multiple messages requests + +## Auth and headers + +- Base URL: `https://api.anthropic.com` (overridable via `ANTHROPIC_BASE_URL`) +- Auth: either `X-Api-Key` or `Authorization: Bearer ` +- Required version header: `anthropic-version: 2023-06-01` +- Responses include `request-id` header (useful for debugging) + +## Core request shape (create message) + +Required: +- `model: string` +- `max_tokens: number` +- `messages: MessageParam[]` + +Optional (most common): +- `system: string | TextBlockParam[]` +- `metadata: { user_id?: string }` +- `stop_sequences: string[]` +- `temperature: number` (0.0 to 1.0) +- `top_p: number` (0.0 to 1.0) +- `top_k: number` +- `tools: Tool[] | ServerTool[]` +- `tool_choice: ToolChoice` +- `thinking: ThinkingConfig` +- `service_tier: "auto" | "standard_only"` +- `stream: boolean` + +Notes: +- `system` is a top-level field; there is no `"system"` role in messages. +- Consecutive messages with the same role are allowed; the server will merge them. +- If the last input message is `assistant`, the response continues from that content. +- There is a documented limit of 100,000 messages per request. + +## MessageParam and content blocks + +`messages` is an array of `{ role, content }` where `role` is `user` or `assistant`. + +`content` can be: +- A string (shorthand for a single text block) +- An array of content blocks + +### Common input content block types + +- `text`: `{ type: "text", text, cache_control?, citations? }` +- `image`: `{ type: "image", source, cache_control? }` + - `source` is either base64 (`{ type: "base64", media_type, data }`) or URL (`{ type: "url", url }`) +- `document`: `{ type: "document", source, title?, context?, citations?, cache_control? }` + - `source` can be base64 PDF, URL PDF, plain text, or embedded content blocks +- `search_result`: `{ type: "search_result", source, title, content: TextBlockParam[], cache_control? }` +- `tool_result`: `{ type: "tool_result", tool_use_id, content?, is_error?, cache_control? }` + +Less common / advanced: +- `tool_use`, `server_tool_use`, `web_search_tool_result` +- `thinking`, `redacted_thinking` + +Cache control: +- `cache_control` uses `{"type":"ephemeral","ttl":"5m"|"1h"}` to mark caching breakpoints. + +## Tools and tool_choice + +`tools` can include: +- Custom client tools: `{ name, description?, input_schema }` +- Server tools (built-in types): `bash_20250124`, `text_editor_20250124/20250429/20250728`, `web_search_20250305` + +`tool_choice` controls tool usage: +- `{ type: "auto" | "any" | "none" }` +- `{ type: "tool", name }` +- `disable_parallel_tool_use` can be set to force a single tool use. 
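+
+In the Rust port, the `tool_choice` union above could be modeled roughly as follows (a sketch mirroring the wire format, not final types):
+
+```rust
+use serde::{Deserialize, Serialize};
+
+// Each variant serializes with a `type` tag matching the JSON shapes shown above.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum ToolChoice {
+    Auto {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        disable_parallel_tool_use: Option<bool>,
+    },
+    Any {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        disable_parallel_tool_use: Option<bool>,
+    },
+    None,
+    Tool {
+        name: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        disable_parallel_tool_use: Option<bool>,
+    },
+}
+```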
+ +## Response: Message object + +The response message includes: +- `id`, `type: "message"`, `role: "assistant"`, `model` +- `content: ContentBlock[]` +- `stop_reason`: `end_turn | max_tokens | stop_sequence | tool_use | pause_turn | refusal` +- `stop_sequence` (if applicable) +- `usage`: + - `input_tokens`, `output_tokens` + - `cache_creation_input_tokens`, `cache_read_input_tokens` + - `cache_creation` breakdown by TTL + - `service_tier: "standard" | "priority" | "batch"` + - `server_tool_use` counts (e.g., web search) + +Output `content` blocks can include: +- `text`, `tool_use`, `thinking`, `redacted_thinking`, `server_tool_use`, `web_search_tool_result` + +## Streaming behavior + +Streaming uses SSE with event objects that include a `type` field. Typical sequence: + +1. `message_start` (contains a message skeleton) +2. `content_block_start` (new block) +3. `content_block_delta` (block updates) + - `text_delta` + - `input_json_delta` (tool input streaming) + - `citations_delta` + - `thinking_delta` + - `signature_delta` +4. `content_block_stop` (block complete) +5. `message_delta` (stop_reason, stop_sequence, usage updates) +6. `message_stop` + +Implementations usually accumulate deltas into a final Message snapshot. + +## Count tokens + +`POST /v1/messages/count_tokens` accepts the same `messages`, `system`, `tools`, and `tool_choice` structure, but does not generate output. The response is: + +- `{ input_tokens: number }` + +This count includes text, images, documents, and tool definitions. + +## Message batches (optional in SDK) + +The batch API lets you submit multiple `/v1/messages` requests for asynchronous processing. Results can be retrieved later and include per-request status and output. + diff --git a/crates/rullm-anthropic/src/lib.rs b/crates/rullm-anthropic/src/lib.rs new file mode 100644 index 00000000..b93cf3ff --- /dev/null +++ b/crates/rullm-anthropic/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: u64, right: u64) -> u64 { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/crates/rullm-openai/Cargo.toml b/crates/rullm-openai/Cargo.toml new file mode 100644 index 00000000..ae54ba3e --- /dev/null +++ b/crates/rullm-openai/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "rullm-chat-completion" +version.workspace = true +edition.workspace = true +rust-version.workspace = true + +[dependencies] diff --git a/spec/chat-completion.md b/crates/rullm-openai/spec/chat-completion.md similarity index 100% rename from spec/chat-completion.md rename to crates/rullm-openai/spec/chat-completion.md diff --git a/spec/chat-completion2.md b/crates/rullm-openai/spec/chat-completion2.md similarity index 100% rename from spec/chat-completion2.md rename to crates/rullm-openai/spec/chat-completion2.md diff --git a/crates/rullm-openai/src/lib.rs b/crates/rullm-openai/src/lib.rs new file mode 100644 index 00000000..f1c3ba20 --- /dev/null +++ b/crates/rullm-openai/src/lib.rs @@ -0,0 +1,26 @@ +/// Different types of roles a message can have. +// see if it makes sense to add a Other(String) role here. +// incase some providers have a unique role. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum MessageRole { + User, + Assistant, + Tool, + System, +} + +pub enum ContentPart { + Text(String), + Binary(String), + ToolCall(()), + ToolResponse(()), +} + +pub struct Tool {} +pub struct ToolCall {} +pub struct ToolResponse {} + +pub struct ChatMessage {} + +pub struct ChatRequest {} +pub struct ChatResponse {} diff --git a/spec/chat-completion-comparison.md b/spec/chat-completion-comparison.md new file mode 100644 index 00000000..7fd61106 --- /dev/null +++ b/spec/chat-completion-comparison.md @@ -0,0 +1,234 @@ +# Chat completion APIs: A cross-provider comparison guide + +The AI API landscape has coalesced around OpenAI's design patterns, but significant differences remain beneath the surface. **Groq and OpenRouter offer near-perfect OpenAI compatibility**, while Anthropic and Google use distinct schemas that require code changes when switching providers. This guide maps the common ground and critical divergences developers need to navigate. + +## The OpenAI compatibility spectrum + +Three providers—Groq, OpenRouter, and OpenAI itself—share an identical request/response schema, making code portability straightforward. Anthropic and Google Gemini diverge significantly, each with unique terminology and structural choices. + +| Provider | OpenAI Compatible | Migration Complexity | +|----------|-------------------|---------------------| +| **OpenAI** | Baseline reference | N/A | +| **Groq** | Yes (drop-in) | Change base URL + API key | +| **OpenRouter** | Yes (drop-in) | Change base URL + API key | +| **Anthropic** | No | Requires schema rewrite | +| **Google Gemini** | No | Requires schema rewrite | + +To use OpenAI's Python SDK with Groq or OpenRouter, only the base URL changes: + +```python +from openai import OpenAI +client = OpenAI( + base_url="https://api.groq.com/openai/v1", # or "https://openrouter.ai/api/v1" + api_key="YOUR_API_KEY" +) +``` + +## Endpoints and authentication patterns + +All five providers use REST APIs with JSON payloads, but authentication headers and endpoint paths differ substantially. + +| Provider | Base URL | Endpoint | Auth Header | +|----------|----------|----------|-------------| +| **OpenAI** | `api.openai.com` | `/v1/chat/completions` | `Authorization: Bearer $KEY` | +| **Anthropic** | `api.anthropic.com` | `/v1/messages` | `x-api-key: $KEY` + `anthropic-version: 2023-06-01` | +| **Google Gemini** | `generativelanguage.googleapis.com` | `/v1beta/models/{model}:generateContent` | `x-goog-api-key: $KEY` or OAuth | +| **Groq** | `api.groq.com` | `/openai/v1/chat/completions` | `Authorization: Bearer $KEY` | +| **OpenRouter** | `openrouter.ai` | `/api/v1/chat/completions` | `Authorization: Bearer $KEY` | + +Anthropic uniquely requires a version header (`anthropic-version`) on every request. Google offers two authentication paths: API keys for Google AI Studio (simpler) or OAuth/service accounts for Vertex AI (enterprise). + +## Message structure diverges at the system prompt + +The most impactful difference across providers is **how system prompts are handled**. OpenAI, Groq, and OpenRouter include system instructions as a message with `role: "system"`. Anthropic separates it into a top-level `system` field. Google uses `systemInstruction` as a separate object. 
+ +**OpenAI/Groq/OpenRouter format:** +```json +{ + "model": "gpt-4o", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"} + ] +} +``` + +**Anthropic format:** +```json +{ + "model": "claude-sonnet-4-5", + "max_tokens": 1024, + "system": "You are a helpful assistant.", + "messages": [ + {"role": "user", "content": "Hello!"} + ] +} +``` + +**Google Gemini format:** +```json +{ + "systemInstruction": {"parts": [{"text": "You are a helpful assistant."}]}, + "contents": [ + {"role": "user", "parts": [{"text": "Hello!"}]} + ] +} +``` + +Note that Google uses `contents` instead of `messages`, `parts` instead of `content`, and `role: "model"` instead of `role: "assistant"`. These terminology differences require complete request restructuring. + +## Required versus optional parameters + +A subtle but critical difference: **Anthropic requires `max_tokens`** on every request, while OpenAI treats it as optional (defaulting to model maximum). This catches many developers migrating from OpenAI. + +| Parameter | OpenAI | Anthropic | Gemini | Groq | +|-----------|--------|-----------|--------|------| +| `max_tokens` | Optional | **Required** | Optional (`maxOutputTokens`) | Optional | +| `temperature` | 0-2 (default 1) | 0-1 (default 1) | 0-2 (default 1) | 0-2 (default 1) | +| `top_p` | ✓ | ✓ | ✓ (`topP`) | ✓ | +| `top_k` | ✗ | ✓ | ✓ (`topK`) | ✗ | +| `frequency_penalty` | ✓ | ✗ | ✓ | ✗ | +| `presence_penalty` | ✓ | ✗ | ✓ | ✗ | + +Groq notably **does not support** `frequency_penalty`, `presence_penalty`, `logprobs`, or `n > 1`—parameters common in OpenAI workflows. Requests using these will return 400 errors. + +## Response structures show similar divergence + +OpenAI, Groq, and OpenRouter return responses in an identical structure with a `choices` array. Anthropic returns `content` as an array of typed blocks. Google returns `candidates` with nested `content.parts`. + +**OpenAI/Groq/OpenRouter response:** +```json +{ + "id": "chatcmpl-abc123", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": "Hello!"}, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} +} +``` + +**Anthropic response:** +```json +{ + "id": "msg_01XFD...", + "content": [{"type": "text", "text": "Hello!"}], + "stop_reason": "end_turn", + "usage": {"input_tokens": 10, "output_tokens": 5} +} +``` + +**Google Gemini response:** +```json +{ + "candidates": [{ + "content": {"parts": [{"text": "Hello!"}], "role": "model"}, + "finishReason": "STOP" + }], + "usageMetadata": {"promptTokenCount": 10, "candidatesTokenCount": 5} +} +``` + +Finish reason values also differ: OpenAI uses `stop`, Anthropic uses `end_turn`, and Google uses `STOP`. Tool-triggered stops are `tool_calls` (OpenAI/Groq), `tool_use` (Anthropic), or indicated by function call content in Gemini. + +## Streaming implementations vary significantly + +All providers use Server-Sent Events (SSE), but event structure differs. OpenAI-compatible APIs send incremental `delta` objects and terminate with `data: [DONE]`. Anthropic uses **typed event streams** with explicit event names like `message_start`, `content_block_delta`, and `message_stop`. 
+ +**OpenAI/Groq streaming chunk:** +``` +data: {"choices":[{"delta":{"content":"Hello"}}]} +data: {"choices":[{"delta":{"content":" there"}}]} +data: [DONE] +``` + +**Anthropic streaming events:** +``` +event: message_start +data: {"type":"message_start","message":{...}} + +event: content_block_delta +data: {"type":"content_block_delta","delta":{"type":"text_delta","text":"Hello"}} + +event: message_stop +data: {"type":"message_stop"} +``` + +Anthropic's approach provides richer metadata (separate events for tool use, thinking blocks) but requires different parsing logic. Google streams partial `GenerateContentResponse` objects via `streamGenerateContent?alt=sse`. + +## Tool calling follows OpenAI's lead with variations + +Function/tool calling has achieved reasonable standardization, with all providers supporting JSON Schema-based tool definitions. The structure is nearly identical across OpenAI, Groq, and OpenRouter. Anthropic uses `input_schema` instead of `parameters`, and Google wraps tools in a `functionDeclarations` array. + +**OpenAI/Groq tool definition:** +```json +{"type": "function", "function": {"name": "get_weather", "parameters": {...}}} +``` + +**Anthropic tool definition:** +```json +{"name": "get_weather", "input_schema": {...}} +``` + +**Google Gemini tool definition:** +```json +{"functionDeclarations": [{"name": "get_weather", "parameters": {...}}]} +``` + +All providers support `auto`, `none`, and forced tool selection. Anthropic adds `any` (must use at least one tool). Parallel tool calls are supported by OpenAI, Groq, and Anthropic (default enabled). + +## Unique features worth noting + +Each provider offers distinctive capabilities beyond the baseline API: + +- **OpenAI**: Structured Outputs with strict JSON Schema enforcement (`json_schema` response format), Batch API with 50% discount +- **Anthropic**: Extended thinking for Claude 4/3.7 with configurable token budgets, prompt caching with 90% cost reduction on cache hits, computer use tools +- **Google Gemini**: Built-in Google Search grounding, native code execution, video/audio/document processing up to 2 hours of video +- **Groq**: Exceptional speed (**394-1000+ tokens/second**) via custom LPU hardware, timing metrics in usage response +- **OpenRouter**: Access to **400+ models** from all providers, automatic fallbacks, model routing with `:floor` (cheapest) and `:nitro` (fastest) suffixes, zero-markup pricing + +## Pricing models share structure but not rates + +All providers charge per-token with separate input/output rates. Anthropic charges 90% less for cached content. Groq offers 50% off for batch processing. OpenRouter passes through provider pricing with a 5.5% fee on credit purchases. + +| Provider | Example Model | Input (per 1M) | Output (per 1M) | +|----------|---------------|----------------|-----------------| +| OpenAI | GPT-4o | ~$2.50 | ~$10.00 | +| Anthropic | Claude 3.5 Sonnet | $3.00 | $15.00 | +| Google | Gemini 2.5 Flash | $0.15 | $0.60 | +| Groq | Llama 3.3 70B | $0.59 | $0.79 | + +Free tiers exist for Google AI Studio, Groq, and OpenRouter (with `:free` suffix models). + +## SDK availability and language support + +All providers offer first-party Python and TypeScript/JavaScript SDKs. Anthropic and Google provide the broadest language coverage. 
+ +| Provider | Python | TypeScript | Go | Java | Other | +|----------|--------|------------|----|----|-------| +| OpenAI | ✓ | ✓ | Beta | — | — | +| Anthropic | ✓ | ✓ | ✓ | ✓ | Ruby, C# (beta) | +| Google | ✓ | ✓ | ✓ | ✓ | Dart, Swift, Kotlin | +| Groq | ✓ | ✓ | — | — | OpenAI SDK compatible | +| OpenRouter | ✓ Beta | ✓ | — | — | OpenAI SDK compatible | + +For Groq and OpenRouter, using the OpenAI SDK with a modified base URL is the recommended approach, enabling code reuse across providers. + +## Practical migration strategies + +When building multi-provider applications, consider these patterns: + +1. **Use OpenAI-compatible providers for easy switching**: Groq and OpenRouter can share code paths with OpenAI. Abstract only the base URL and API key. + +2. **Create provider-specific adapters for Anthropic/Gemini**: The structural differences require transformation layers. Map `system` messages to Anthropic's top-level field, convert `assistant` to `model` for Gemini. + +3. **Normalize on the OpenAI response format**: Parse provider responses into a common structure. OpenRouter already does this for all 400+ models. + +4. **Handle parameter gaps gracefully**: Remove unsupported parameters (like `frequency_penalty` for Groq) rather than letting requests fail. + +5. **Consider OpenRouter as a unification layer**: For applications needing multiple model providers, OpenRouter provides a single API surface with automatic fallbacks and model routing. + +## Conclusion + +The chat completion API landscape centers on OpenAI's design patterns, with Groq and OpenRouter offering true compatibility and Anthropic/Google requiring adaptation layers. The key migration hurdles are system prompt handling, required parameters (Anthropic's `max_tokens`), and response parsing differences. For maximum flexibility, applications should abstract provider-specific code behind a common interface, or leverage OpenRouter's unified gateway to access all major models through a single, consistent API. diff --git a/spec/chat-completion-comparison2.md b/spec/chat-completion-comparison2.md new file mode 100644 index 00000000..216ee87a --- /dev/null +++ b/spec/chat-completion-comparison2.md @@ -0,0 +1,69 @@ +Architectural Convergence and Divergence in Modern Large Language Model Interfaces: A Comparative Analysis of Anthropic, Gemini, Groq, and OpenRouter1. Introduction: The Standardization of the "Chat" ParadigmThe rapid proliferation of Large Language Models (LLMs) has necessitated the evolution of Application Programming Interfaces (APIs) from simple text-completion endpoints to complex, state-aware conversational interfaces. In the nascent stages of the generative AI boom, the interaction model was predominantly "text-in, text-out"—a raw string completion paradigm where the model simply predicted the next sequence of tokens based on a provided prefix. However, as models grew in capability and application architectures shifted toward conversational agents, this primitive abstraction proved insufficient for managing the complexities of dialogue history, role-based instruction, and multi-turn reasoning.OpenAI’s introduction of the Chat Completions API marked a pivotal shift in this landscape, establishing a structural schema that organizes input not as a monolithic string, but as a structured list of message objects. This "Chat Completion" paradigm—characterized by the stateless exchange of JSON arrays containing distinct roles (System, User, Assistant)—has effectively become the lingua franca of the industry. 
It provides a semantic framework that allows developers to model complex interactions, inject system-level behavioral guardrails, and manage conversation state on the client side.However, while the high-level conceptual model of "Chat Completion" has been widely adopted, the underlying implementation details exhibit significant divergence. Competitors and alternative providers such as Anthropic, Google (Gemini), Groq, and OpenRouter have each interpreted this paradigm through the lens of their specific architectural priorities, safety philosophies, and infrastructure capabilities.This report provides an exhaustive technical analysis of these four providers. It moves beyond superficial feature comparisons to dissect the structural, operational, and semantic differences in their API designs. By examining how each provider implements the chat abstraction, handles authentication, manages complex capabilities like tool use and multimodality, and communicates operational metrics like rate limits, this research aims to equip software architects with the nuanced understanding required to build resilient, multi-provider AI systems. The analysis reveals that while the industry is converging on a shared mental model, the ecosystem remains fragmented in implementation, requiring sophisticated adaptation strategies to achieve true interoperability.2. API Design Philosophy and Architectural ParadigmsThe four providers analyzed—Anthropic, Gemini, Groq, and OpenRouter—represent distinct strategic positions in the AI market. Their API designs are not merely technical specifications but reflections of their broader organizational goals, ranging from safety-centric research to high-velocity inference and ecosystem aggregation.2.1 Anthropic: The Explicit Structure and Safety-First DesignAnthropic’s approach to API design is characterized by strictness and explicitness. The Messages API is designed to enforce "Constitutional AI" principles at the interface level. Unlike flexible schemas that might allow for ambiguous role assignments, Anthropic enforces a rigorous alternation between user and assistant roles. This design choice prevents "jailbreaking" techniques that rely on confusing the model about who is speaking.1Furthermore, Anthropic treats the system prompt not as just another message in the list, but as a top-level parameter. This architectural decision elevates the system instruction above the conversational flow, granting it a higher tiered authority in guiding the model's behavior. This distinct separation of concerns—separating the "rules" (system) from the "dialogue" (messages)—is a hallmark of Anthropic’s safety-first philosophy.2 The API also utilizes a versioning header (anthropic-version), forcing developers to pin their integration to a specific point in time (e.g., 2023-06-01). This indicates a priority on enterprise stability, ensuring that backend improvements do not silently break client-side parsing logic.32.2 Google Gemini: The Multimodal-Native IntegrationGoogle’s Gemini API, accessible via both Google AI Studio and Vertex AI, represents a departure from the text-centric view of LLMs. Gemini is built as a multimodal-native model, and its API schema reflects this. Instead of a standard messages list, Gemini employs a contents array composed of parts. This parts-based architecture is agnostic to data type, treating text, images, video, and audio as equivalent fundamental units of meaning.4The design is heavily influenced by the Google Cloud ecosystem. 
The integration with Vertex AI introduces complexity in authentication and routing (involving Project IDs and Location IDs in the URL) that is absent in simpler, key-based APIs. This signals that Gemini is designed not just as a standalone model, but as a component within a larger enterprise cloud infrastructure. The API’s ability to handle massive context windows (up to 2 million tokens) also influences its design, necessitating mechanisms for uploading and referencing large files rather than embedding them directly in the request payload.52.3 Groq: The Velocity-Centric Inference EngineGroq occupies a unique position as an infrastructure provider rather than a model trainer. Their core value proposition is the Language Processing Unit (LPU), a hardware architecture designed for ultra-low latency inference. Consequently, Groq’s API strategy is one of "frictionless adoption." They have made the strategic decision to adhere almost strictly to the OpenAI API specification.7By mimicking the endpoint structure (/v1/chat/completions), authentication methods, and payload schemas of the market leader, Groq eliminates the switching costs for developers. The philosophy here is "drop-in compatibility." If a developer has an application running on GPT-4, they should be able to switch to Llama-3 running on Groq by changing only the base URL and API key. This design choice highlights Groq's focus on speed and efficiency over architectural novelty.92.4 OpenRouter: The Normalization and Aggregation LayerOpenRouter serves as a meta-layer or gateway, sitting between the developer and model providers. Its architectural philosophy is "Normalization." The AI ecosystem is fragmented, with different providers using different schemas, tokenizers, and pricing models. OpenRouter abstracts this complexity by providing a unified, OpenAI-compatible interface that routes to dozens of underlying providers (including Anthropic, Google, and Groq).10The API design focuses on routing intelligence. Features like model: "auto" and "fallback" configurations allow the API to make dynamic decisions about which underlying model to call based on cost, latency, or uptime. Additionally, OpenRouter introduces headers like HTTP-Referer and X-Title to build a community-ranking system, incentivizing developers to identify their apps in exchange for visibility. This positions OpenRouter not just as a pipe, but as a marketplace.103. Authentication, Security, and Access ControlThe mechanism by which an API validates the identity of the requester is the first point of integration. While the concept of an "API Key" is universal, the transmission and management of these credentials vary significantly, impacting how client libraries must be configured.3.1 Header Specifications and TransmissionThe industry standard for RESTful APIs is the Authorization header using the Bearer scheme. Groq and OpenRouter adhere to this standard, simplifying integration with generic HTTP clients.Groq: Expects Authorization: Bearer . This allows the use of standard OpenAI client libraries, which are hardcoded to use this header format.8OpenRouter: Also uses Authorization: Bearer . However, it adds a layer of optional but recommended headers: HTTP-Referer (for site rankings) and X-Title (app name). While not strictly required for authentication, these headers play a role in the platform's ecosystem mechanics.10Anthropic deviates from this standard. It requires a custom header x-api-key for the credential. 
This seemingly minor difference breaks compatibility with generic OpenAI-compatible clients unless a proxy or adapter (like LiteLLM) is used. Additionally, the mandatory anthropic-version header is a security and stability feature. By requiring the client to declare the schema version they expect, Anthropic prevents "silent breaking" updates. If the API response format changes (e.g., how tool use is structured), older clients sending an older version header will continue to receive the legacy format, ensuring backward compatibility.23.2 The Complexity of Google Gemini AuthenticationGemini presents the most bifurcated authentication model, reflecting its dual targeting of hobbyists and enterprise users.Google AI Studio (Prototyping): Uses a simple API key transmitted via the x-goog-api-key header. This is akin to the Anthropic/OpenAI model and is designed for ease of use.5Vertex AI (Enterprise): Uses Google Cloud IAM (Identity and Access Management). Here, there is no static long-lived API key. Instead, the application must authenticate as a Google Cloud Service Account, obtain a short-lived OAuth 2.0 access token (e.g., via gcloud auth print-access-token), and pass that in the Authorization: Bearer header. This approach integrates deeply with enterprise security policies, allowing for granular permission scoping (e.g., a service account that can invoke models but not tune them).11This dichotomy means that code written for Gemini prototypes in AI Studio often requires significant refactoring to be deployed to a production Vertex AI environment, a friction point not present with the other providers.4. Request Structure: The Anatomy of a ConversationThe core of the Chat Completion API is the request body, specifically how the conversation history is structured. While all providers accept a list of messages, the schema of those messages—and specifically how "content" is defined—reveals deep architectural differences.4.1 The Message Object and Role DefinitionsThe "Standard" format, popularized by OpenAI and adopted by Groq and OpenRouter, expects a messages array where each object has a role (system, user, assistant) and content (string).Anthropic's Divergence:Anthropic’s Messages API extracts the system instruction from the message list entirely.JSON{ + "system": "You are a helpful assistant.", + "messages": [ + {"role": "user", "content": "Hello"} + ] +} +This structural change enforces a hierarchy. System instructions are not part of the "conversation"; they are the "constitution" governing the conversation. Inside the messages array, Anthropic strictly enforces alternating roles. A sequence of user, user is invalid and will result in a 400 error. The client is forced to merge consecutive messages from the same role. This strictness reduces ambiguity for the model but increases the validation burden on the client.1Gemini's Divergence:Gemini uses contents (plural) instead of messages, and role values are user and model (instead of assistant).JSON{ + "systemInstruction": { "parts": }, + "contents": [ + { + "role": "user", + "parts": [ { "text": "Hello" } ] + } + ] +} +The use of parts instead of content is foundational. It implies that a message is never just a string; it is a composite object that can contain text, images, video references, or function calls. 
While OpenAI/Anthropic support similar multimodal arrays, Gemini’s schema treats text as just one type of part among many, rather than the default.44.2 Handling of System InstructionsThe placement of system instructions is a key differentiator in prompt engineering strategies.Groq & OpenRouter: Support the standard {"role": "system", "content": "..."} message at the beginning of the array. This is treated as part of the context window.Anthropic: The top-level system parameter allows the model to cache these instructions separately (via Prompt Caching), potentially optimizing performance for agents that share a common persona across many users.2Gemini: Uses systemInstruction configuration. Similar to Anthropic, this separates the directive from the dialogue, but the syntax involves a nested parts object, adding verbosity to the request payload.44.3 Control Parameters and ConfigurationWhile all providers support standard sampling parameters like temperature and top_p, the parameter names and valid ranges differ.Max Tokens:Anthropic: max_tokens (Required). The API will error if this is missing.1Gemini: maxOutputTokens inside a generationConfig object.4Groq/OpenRouter: max_tokens or the newer max_completion_tokens.7Thinking/Reasoning:Anthropic (Claude 3.7) introduces a thinking block in the request, requiring a budget_tokens parameter. This explicitly reserves capacity for chain-of-thought generation before the final answer.1Gemini 2.0 supports thinking_config with levels (e.g., "low", "high"), integrating reasoning depth as a configuration toggle rather than just a token budget.125. Multimodality and Media HandlingThe processing of non-text inputs (images, video, audio) highlights the infrastructure differences between the providers.5.1 Image TransmissionAnthropic: Images are passed as content blocks with base64 encoding.JSON{ + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": "..." + } +} +This method is simple but bandwidth-intensive. It bloats the JSON payload size, potentially hitting HTTP body size limits for high-resolution images.1Groq: Adheres to the OpenAI image_url format, accepting either a public URL or a base64 string. Currently, Groq’s vision support is model-dependent (e.g., Llama 3.2 Vision).9Gemini: Offers the most robust solution for heavy media. While it supports inline_data (base64), its primary strength is file_data.JSON{ + "file_data": { + "mime_type": "video/mp4", + "file_uri": "gs://my-bucket/video.mp4" + } +} +Developers can upload files to Google Cloud Storage or the Gemini File API and pass the URI. This allows Gemini to process hours of video or audio, which would be impossible to transmit via base64. The model can "watch" a video and answer questions about specific timestamps, a capability unique to its architecture.45.2 Audio and VideoGemini is currently the only provider among the four to support native video and audio inputs in the main chat endpoint. Groq supports audio via a separate audio/transcriptions endpoint (using Whisper), but not as a multimodal input to the chat model itself.9 Anthropic allows for document inputs (PDFs) which are processed as images or text, but lacks native video support in the API.36. Tool Use and Function Calling: The Technical CoreTool Use (or Function Calling) is the critical capability for building agents. It allows the model to output structured JSON to call external APIs. 
This area exhibits the most significant schema fragmentation.6.1 Tool DefinitionsThe mechanism for telling the model what tools are available varies.Groq / OpenRouter: Use the OpenAI tools format.JSON"tools": +Anthropic: Uses a flatter structure.JSON"tools": +The key difference is input_schema vs parameters. While semantically identical, the key names differ, requiring adapters in client code.14Gemini: Wraps definitions in function_declarations.JSON"tools": [{ + "function_declarations": [{ + "name": "get_weather", + "parameters": {... } + }] +}] +Gemini allows specifying the schema as a subset of OpenAPI 3.0, but strict adherence is required.156.2 Invocation (The Model's Request)When the model decides to call a tool, the response format differs.Groq / OpenRouter: The message contains a tool_calls array. Each item has a unique id and function arguments.Anthropic: The model outputs a tool_use content block.JSON{ + "type": "tool_use", + "id": "toolu_01...", + "name": "get_weather", + "input": { "city": "London" } +} +Crucially, Anthropic allows text blocks (chain-of-thought) to precede the tool_use block in the same message. This allows the model to "explain" why it is calling the tool before doing so.2Gemini: The model outputs a functionCall part.JSON"parts": [{ + "functionCall": { + "name": "get_weather", + "args": { "city": "London" } + } +}] +Historically, Gemini did not generate unique IDs for function calls, relying on the order of execution. However, newer versions are adopting IDs to support parallel function calling.116.3 Result Submission ( The Client's Response)Completing the tool loop requires sending the result back to the model.Groq / OpenRouter: A dedicated message with role: "tool" is sent, referencing the tool_call_id.Anthropic: The result is sent in a user message containing a tool_result block.JSON{ + "role": "user", + "content": [{ + "type": "tool_result", + "tool_use_id": "toolu_01...", + "content": "25 C" + }] +} +This is a major semantic difference: in Anthropic's world, the User reports the tool result. There is no separate "Tool" role.16Gemini: Uses a functionResponse part, typically within a user (or sometimes function) role context. The API is strict about the order: the conversation history must show functionCall followed immediately by functionResponse.177. Response Structure and Streaming MechanicsFor real-time applications, the structure of the response and the mechanics of Server-Sent Events (SSE) are vital for managing latency and user experience.7.1 Static Response ObjectsGroq / OpenRouter: Return the standard choices array. Even if requesting a single completion, it is wrapped in a list. The content is accessed via choices.message.content.7Anthropic: Returns a top-level content array.JSON{ + "type": "message", + "role": "assistant", + "content": [ { "type": "text", "text": "Hello" } ] +} +This array structure is consistent with the request format, treating output as blocks.2Gemini: Returns candidates.JSON{ + "candidates": [{ + "content": { "parts": [{ "text": "Hello" }] }, + "finishReason": "STOP" + }] +} +The nesting is deeper: response.candidates.content.parts.text. Accessing the text requires traversing three layers of hierarchy.47.2 Streaming Event ProtocolsAll providers use SSE, but the event taxonomy differs.Groq / OpenRouter: Stream chunks containing delta objects. The structure mimics the static response but with partial strings. 
The stream ends with a `` message.10Anthropic: Implements a verbose event system.message_start: Metadata about the message (usage, ID).content_block_start: Indicates a new block (text or tool use) is beginning.content_block_delta: The actual content generation (text_delta).message_stop: Final usage stats.This verbosity allows clients to reconstruct complex, multi-block responses (e.g., text followed by a tool use) with high fidelity.1Gemini: Streams full GenerateContentResponse objects. A key nuance is that Gemini may emit "empty" chunks that contain only citation metadata or safety ratings, requiring the client to filter for actual text content to avoid displaying blanks.48. Operational Metrics: Rate Limits and Stop ReasonsObservability is handled via response headers and body fields.8.1 Rate Limit HeadersGroq: Uses the standard x-ratelimit-* headers (requests, tokens, reset time). This transparency allows clients to implement "token bucket" throttling algorithms easily.18Anthropic: Uses anthropic-ratelimit-* headers. They explicitly separate input-tokens limits from output-tokens limits. This distinction is crucial because output tokens are computationally more expensive and often have tighter limits.19Gemini: Does not consistently provide rate limit headers in the response for immediate backoff calculation. Developers must rely on Google Cloud Quota dashboards or handle 429 errors which contain retry-after information. The limits are enforced at the Project level, shared across all API keys in that project.208.2 Stop ReasonsUnderstanding why the model stopped is essential for debugging.Anthropic: end_turn (natural completion), max_tokens (cutoff), tool_use (calling a function).Gemini: STOP, MAX_TOKENS, SAFETY (content filter triggered), RECITATION. The RECITATION reason is unique to Google; it triggers if the model output is too similar to copyrighted training data, effectively blocking the response to prevent copyright infringement.22OpenRouter: Normalizes these codes. It maps provider-specific reasons to a standard set (e.g., mapping end_turn to stop) but preserves the native_finish_reason for advanced debugging.249. Common Features vs. Unique Differentiators9.1 CommonalitiesStatelessness: All API interactions are stateless; context must be re-sent.JSON Schema: All use JSON for payload transport.Roles: All distinguish between User and System/Model roles.Sampling: All support temperature and top_p.Security: All use TLS/SSL and API Key/Token authentication.9.2 Major Differences (Summary Table)FeatureAnthropicGoogle GeminiGroqOpenRouterSystem PromptTop-level parametersystemInstruction configMessage with role: systemMessage with role: systemInput Structuremessages (Strict Roles)contents with partsmessages (Standard)messages (Standard)Tool Responseuser role + tool_resultfunctionResponse parttool role messagetool role messageVideo InputNo (Frames as images)Native file_data (URI)No (Frames as images)Via Provider (if supported)Rate Limit Headersanthropic-ratelimit-*N/A (Cloud Quotas)x-ratelimit-*x-openrouter-creditsUnique FeaturePrompt Caching2M Context & GroundingLPU Inference SpeedModel Routing & Fallbacks10. Conclusion and Strategic RecommendationsThe "Chat Completion" API has evolved into a standard architectural pattern, but it is not a monolithic standard. 
While Groq and OpenRouter adhere closely to the OpenAI specification to minimize friction, Anthropic and Google Gemini have diverged to support their specific philosophies of safety and multimodality.For developers and architects, this fragmentation implies that a true "multi-provider" strategy requires more than just swapping base URLs. It necessitates an abstraction layer (adapter pattern) that can normalize the structural differences in:Tool Use Handshakes: Converting between tool roles and tool_result blocks.Multimodal Uploads: Handling base64 vs. Cloud Storage URIs.Rate Limit Handling: Parsing diverse header formats to manage backoff.Anthropic is the choice for workflows requiring strict adherence to complex instructions and safety, leveraging Prompt Caching for cost efficiency in long-context tasks. Gemini dominates in scenarios involving heavy media analysis (video/audio) and deep integration with the Google Cloud ecosystem. Groq provides the raw speed necessary for real-time, user-facing applications where latency is the primary KPI. OpenRouter acts as the unifying fabric, offering the path of least resistance for accessing the diverse capabilities of the open ecosystem without the operational overhead of managing individual provider idiosyncrasies.Understanding these nuances is the key to transitioning from a fragile, single-provider prototype to a robust, model-agnostic enterprise application.References1 (Anthropic)4 (Gemini)7 (Groq)10 (OpenRouter) From 8be2bbd180691cac6e5c097dd26b5106321be13a Mon Sep 17 00:00:00 2001 From: lambda Date: Thu, 25 Dec 2025 00:55:15 +0530 Subject: [PATCH 02/14] add spec for chat completion --- .../rullm-openai/spec/chat-completion-api.md | 103 ++++++ .../spec/chat-completion-difference.md | 331 ++++++++++++++++++ crates/rullm-openai/spec/implementation.md | 224 ++++++++++++ 3 files changed, 658 insertions(+) create mode 100644 crates/rullm-openai/spec/chat-completion-api.md create mode 100644 crates/rullm-openai/spec/chat-completion-difference.md create mode 100644 crates/rullm-openai/spec/implementation.md diff --git a/crates/rullm-openai/spec/chat-completion-api.md b/crates/rullm-openai/spec/chat-completion-api.md new file mode 100644 index 00000000..0e6435bb --- /dev/null +++ b/crates/rullm-openai/spec/chat-completion-api.md @@ -0,0 +1,103 @@ +# OpenAI Chat Completions API - High-Level Spec + +This is a high-level overview of the OpenAI Chat Completions REST API. For full +field-level details, see `chat-completion.md` and `chat-completion2.md`. + +## Positioning +- Endpoint family: `/v1/chat/completions` +- Status: supported but legacy; newer integrations often use the Responses API. +- Still required when you want classic chat-completion object shapes or stored + completion CRUD endpoints. 
+ +## Endpoints +- POST `/v1/chat/completions` - create a completion (optionally streaming) +- GET `/v1/chat/completions/{completion_id}` - retrieve stored completion +- GET `/v1/chat/completions` - list stored completions (pagination) +- POST `/v1/chat/completions/{completion_id}` - update stored completion metadata +- DELETE `/v1/chat/completions/{completion_id}` - delete stored completion +- GET `/v1/chat/completions/{completion_id}/messages` - list stored messages + +## Auth and Headers +- Authorization: `Authorization: Bearer ` +- Optional routing: `OpenAI-Organization`, `OpenAI-Project` +- Content-Type: `application/json` +- Useful response headers: `x-request-id`, `openai-processing-ms`, `x-ratelimit-*` + +## Core Request Shape +```json +{ + "model": "gpt-4o", + "messages": [...], + "stream": false +} +``` + +### Messages and Content +Messages are role-tagged objects. `content` is either a string or an array of +content parts. + +Roles (non-exhaustive): +- `system` (legacy instructions) +- `developer` (preferred for reasoning models) +- `user` +- `assistant` +- `tool` +- `function` (deprecated) + +Content parts (union by `type`): +- `text` `{ type: "text", text: "..." }` +- `image_url` `{ type: "image_url", image_url: { url, detail? } }` +- `input_audio` `{ type: "input_audio", input_audio: { data, format } }` +- `file` `{ type: "file", file: { file_id | file_data, filename? } }` +- `refusal` (assistant-only content part) + +Assistant messages may omit `content` and instead include `tool_calls`. +Tool responses use role `tool` and include `tool_call_id`. + +## Common Request Parameters (high-level) +- Sampling: `temperature`, `top_p`, `presence_penalty`, `frequency_penalty` +- Tokens: `max_completion_tokens`, `max_tokens` (deprecated) +- Output count: `n` +- Stopping: `stop` +- Logprobs: `logprobs`, `top_logprobs` +- Tools: `tools`, `tool_choice`, `parallel_tool_calls` +- Structured outputs: `response_format` (`json_schema` or `json_object`) +- Audio output: `modalities`, `audio` +- Web search: `web_search_options` +- Predicted outputs: `prediction` +- Prompt caching: `prompt_cache_key`, `prompt_cache_retention` +- Safety: `safety_identifier` (replaces `user`) +- Storage: `store`, `metadata` +- Service tiers: `service_tier` +- Reasoning: `reasoning_effort`, `verbosity` +- Streaming: `stream`, `stream_options` + +## Non-Streaming Response Shape +Chat completion object: +- `id`, `object: "chat.completion"`, `created`, `model` +- `choices[]`: each includes `message`, `finish_reason`, optional `logprobs` +- `message`: `role: assistant`, `content` or `refusal`, `tool_calls`, `audio`, + optional `annotations` (web search) +- `usage`: `prompt_tokens`, `completion_tokens`, `total_tokens` + details +- `service_tier`, `system_fingerprint` (deprecated) + +Finish reasons can include: `stop`, `length`, `tool_calls`, `content_filter`, +`function_call` (deprecated). + +## Streaming (SSE) +- Enable with `stream: true`. +- The server emits SSE events whose data is a `chat.completion.chunk` object. +- Each chunk has `choices[].delta` with partial data: + - `role`, `content`, `refusal`, `tool_calls`, `function_call` (deprecated) +- Tool call arguments arrive as streamed string fragments. +- `stream_options` supports: + - `include_usage` (final usage-only chunk) + - `include_obfuscation` (adds obfuscation fields to normalize payload sizes) +- Stream ends with `data: [DONE]` or connection close. 
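+
+A minimal sketch of consuming the stream described above, assuming each SSE `data:` line is handed in one at a time (the structs are trimmed to just the fields used here):
+
+```rust
+use serde::Deserialize;
+
+#[derive(Deserialize)]
+struct Delta {
+    content: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct ChunkChoice {
+    delta: Delta,
+}
+
+#[derive(Deserialize)]
+struct ChatCompletionChunk {
+    choices: Vec<ChunkChoice>,
+}
+
+/// Returns false once the stream is finished (`data: [DONE]`).
+fn accumulate_sse_line(line: &str, text: &mut String) -> bool {
+    let Some(data) = line.strip_prefix("data: ") else {
+        return true; // ignore blank keep-alives and comment lines
+    };
+    if data.trim() == "[DONE]" {
+        return false;
+    }
+    if let Ok(chunk) = serde_json::from_str::<ChatCompletionChunk>(data) {
+        for choice in chunk.choices {
+            if let Some(piece) = choice.delta.content {
+                text.push_str(&piece);
+            }
+        }
+    }
+    true
+}
+```
+
+A real client would also surface `refusal` and `tool_calls` deltas and, when `stream_options.include_usage` is set, pick up the final usage-only chunk instead of ignoring it.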
+ +## Errors and Rate Limits +- Errors return a top-level `error` object with fields like `message`, `type`, + `param`, `code`. +- Streaming may emit an error object inside the SSE `data` payload. +- Rate limit headers provide request and token budgets; clients should parse and + surface them. diff --git a/crates/rullm-openai/spec/chat-completion-difference.md b/crates/rullm-openai/spec/chat-completion-difference.md new file mode 100644 index 00000000..a0109679 --- /dev/null +++ b/crates/rullm-openai/spec/chat-completion-difference.md @@ -0,0 +1,331 @@ +# OpenAI Chat Completions API compatibility across major providers + +**All five providers—OpenRouter, Google Gemini, Groq, xAI (Grok), and MoonshotAI—explicitly claim OpenAI API compatibility**, but the devil is in the details. Each provider supports only the core `POST /v1/chat/completions` endpoint while omitting OpenAI's newer conversation management endpoints. Building a truly universal client requires understanding the subtle incompatibilities, parameter restrictions, and behavioral quirks unique to each provider. + +## The universal truth: core endpoint only + +None of the five providers support OpenAI's conversation management endpoints. The retrieve (`GET /v1/chat/completions/{id}`), list (`GET /v1/chat/completions`), update (`POST /v1/chat/completions/{id}`), delete (`DELETE /v1/chat/completions/{id}`), and list messages (`GET /v1/chat/completions/{id}/messages`) endpoints are universally unsupported. Every provider offers only the stateless `POST /v1/chat/completions` endpoint for creating completions, with some offering proprietary alternatives for conversation state management. + +## Provider comparison at a glance + +| Feature | OpenRouter | Gemini | Groq | xAI (Grok) | MoonshotAI | +|---------|------------|--------|------|------------|------------| +| **Compatibility Level** | Drop-in replacement | Beta, with caveats | "Mostly compatible" | Full (model-dependent) | Full | +| **Base URL** | `https://openrouter.ai/api/v1` | `https://generativelanguage.googleapis.com/v1beta/openai/` | `https://api.groq.com/openai/v1` | `https://api.x.ai/v1` | `https://api.moonshot.ai/v1` | +| **logprobs** | ✅ | ❌ | ❌ | ✅ | ❌ | +| **n > 1** | ✅ | ✅ | ❌ | ✅ | ✅ (with restrictions) | +| **Streaming + JSON** | ✅ | ✅ | ❌ | ❌ | ✅ | +| **presence_penalty** | ✅ | ✅ | ⚠️ Inactive | ⚠️ Non-reasoning only | ❓ Undocumented | +| **frequency_penalty** | ✅ | ✅ | ⚠️ Inactive | ⚠️ Non-reasoning only | ❓ Undocumented | + +--- + +## OpenRouter delivers the most complete compatibility + +OpenRouter explicitly positions itself as a "drop-in replacement for OpenAI" and comes closest to delivering on that promise. The platform normalizes requests across **300+ models** from different providers, transforming OpenAI-format tool calls for providers that don't natively support them. + +### Base configuration +```python +from openai import OpenAI +client = OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key="sk-or-v1-xxx", + default_headers={ + "HTTP-Referer": "https://your-app.com", + "X-Title": "Your App Name" + } +) +``` + +### Model naming convention +Models require organization prefixes: `openai/gpt-4o`, `anthropic/claude-3.5-sonnet`, `google/gemini-2.0-flash-exp`. Append suffixes for variants: `:free` for free tier, `:nitro` for speed, `:extended` for longer context. + +### Critical gotchas developers encounter + +**Token counting uses GPT-4o tokenizer universally**, not each model's native tokenizer. 
The API returns normalized token counts, but billing uses native counts—expect discrepancies when monitoring usage programmatically. + +**Streaming includes comment payloads** like `: OPENROUTER PROCESSING` to prevent connection timeouts. Some SSE client implementations fail to parse these correctly, causing crashes in applications like Frigate's embeddings maintainer. + +**Reasoning tokens aren't exposed in standard format**. DeepSeek R1 returns `reasoning_content` in the delta, which isn't part of OpenAI's schema—tools expecting standard streaming format won't display thinking indicators. + +**Response IDs differ**: OpenRouter returns `gen-xxxxxx` format versus OpenAI's `chatcmpl-xxx`. + +### Parameter support matrix +| Category | Supported | Notes | +|----------|-----------|-------| +| Sampling (temperature, top_p, penalties) | ✅ Full | | +| max_tokens | ✅ | max_completion_tokens undocumented | +| stop sequences | ✅ | | +| logprobs/top_logprobs | ✅ | | +| tools/tool_choice | ✅ | Transformed for non-OpenAI providers | +| parallel_tool_calls | ✅ | Default true | +| response_format (json_object, json_schema) | ✅ | | +| stream/stream_options | ✅ | include_usage works | +| reasoning_effort | ⚠️ | Model-specific, passed to provider | + +### Provider-specific extensions +OpenRouter adds powerful routing features unavailable elsewhere: +- **`models`**: Array of fallback models for automatic failover +- **`provider.order/only/ignore`**: Control which upstream providers serve requests +- **`provider.require_parameters`**: Ensure provider supports all requested parameters +- **Model variant suffixes**: `:exacto` for curated tool-calling providers + +--- + +## Google Gemini remains in beta with notable gaps + +Google launched OpenAI compatibility in November 2024, but explicitly states it's "still in beta while we extend feature support." The implementation covers core functionality but has sharp edges around schema validation and multi-turn tool calls. + +### Base configuration +```python +# Gemini API (consumer) +client = OpenAI( + api_key="YOUR_GEMINI_API_KEY", # From Google AI Studio + base_url="https://generativelanguage.googleapis.com/v1beta/openai/" +) + +# Vertex AI (enterprise) - requires OAuth token refresh +client = OpenAI( + api_key=credentials.token, # Expires in 1 hour! + base_url=f"https://aiplatform.googleapis.com/v1/projects/{project_id}/locations/global/endpoints/openapi" +) +``` + +### Model naming +Use bare model names for Gemini API (`gemini-2.5-flash`) or prefixed for Vertex AI (`google/gemini-2.5-flash`). + +### Critical gotchas developers encounter + +**Logprobs are completely unsupported** through the OpenAI compatibility layer—requests fail with "Unknown name 'logprobs': Cannot find field" despite Gemini's native API supporting them. + +**Gemini 3's thought_signature requirement** breaks multi-turn tool calls. The model returns a `thought_signature` field that must be passed back with tool responses, or requests fail with "function call is missing a thought_signature." Most OpenAI-compatible clients don't preserve this field. + +**Content filtering returns None messages**. When safety filters trigger, `choices[0].message` may be `None` with `finish_reason: content_filter`, crashing clients that expect a message object. + +**Schema validation is stricter than OpenAI**. Unknown fields like `type` inside function objects cause "Unknown name 'type' at 'tools[0].function'" errors. 
Union types in JSON schema (like `str|int`) that work on OpenAI return "Request contains an invalid argument." + +**The endpoint URL changed** from `/v1beta/chat/completions` to `/v1beta/openai/chat/completions`—breaking change for early adopters. + +### Parameter support matrix +| Category | Supported | Notes | +|----------|-----------|-------| +| Sampling (temperature, top_p, penalties) | ✅ | | +| max_tokens/max_completion_tokens | ✅ | | +| n (multiple completions) | ✅ | | +| stop sequences | ✅ | | +| logprobs/top_logprobs | ❌ | Fails with error | +| tools/tool_choice | ✅ | Stricter validation | +| response_format | ✅ | json_object and json_schema | +| reasoning_effort | ✅ | minimal/low/medium/high/none | +| web_search_options | ✅ | Maps to GoogleSearch tool | + +### Provider-specific extensions +Access Gemini-specific features via `extra_body`: +```python +response = client.chat.completions.create( + model="gemini-2.5-flash", + messages=[...], + extra_body={ + "google": { + "thinking_config": { + "thinking_budget": 8192, + "include_thoughts": True + } + } + } +) +``` + +--- + +## Groq trades features for speed + +Groq's documentation honestly states they're "**mostly compatible**" with OpenAI—not fully. The platform prioritizes inference speed on custom LPU hardware, but this comes with meaningful parameter restrictions. + +### Base configuration +```python +client = OpenAI( + base_url="https://api.groq.com/openai/v1", + api_key=os.environ.get("GROQ_API_KEY") +) +``` + +### Critical gotchas developers encounter + +**Multiple completions (`n > 1`) are not supported**—requests return 400 errors. This is a hard limitation, not a bug. + +**Logprobs are completely unsupported**—`logprobs`, `top_logprobs`, and `logit_bias` all return 400 errors. + +**JSON mode and streaming are mutually exclusive**. Setting `response_format: json_object` with `stream: true` returns "response_format does not support streaming." This breaks many agentic frameworks that expect both simultaneously. + +**Penalty parameters are documented but inactive**: "presence_penalty and frequency_penalty are not yet supported by any of our models." + +**Temperature 0 becomes 1e-8**—Groq converts exact zero to a near-zero value, potentially causing subtle determinism differences. + +**Streaming finish_reason bug** confirmed in November 2025: some models don't properly return `finish_reason` in streaming responses, breaking OpenAI specification compliance. 
+ +### Parameter support matrix +| Category | Supported | Notes | +|----------|-----------|-------| +| temperature/top_p | ✅ | temp 0 → 1e-8 | +| presence_penalty/frequency_penalty | ⚠️ | Documented but inactive | +| max_completion_tokens | ✅ | Preferred over max_tokens | +| n (multiple completions) | ❌ | Must be 1 | +| stop sequences | ✅ | Up to 4 sequences | +| logprobs/top_logprobs/logit_bias | ❌ | Returns 400 | +| tools/tool_choice | ✅ | Max 128 functions | +| parallel_tool_calls | ✅ | | +| response_format | ✅ | No streaming with JSON | +| stream | ✅ | Not with JSON mode | +| reasoning_effort | ✅ | Model-dependent | +| service_tier | ✅ | auto/on_demand/flex/performance | + +### Provider-specific extensions +Groq adds unique features for their infrastructure: +- **`reasoning_format`**: Control reasoning output (`hidden`, `raw`, `parsed`) +- **`service_tier`**: Priority levels (`flex` for lower priority/cost) +- **Built-in tools**: Web search, code execution, browser automation, Wolfram Alpha +- **Usage breakdown**: Response includes `queue_time`, `prompt_time`, `completion_time` + +--- + +## xAI Grok requires model-specific parameter handling + +xAI claims "full compatibility with the OpenAI REST API," but Grok-4 reasoning models restrict several common parameters. Successfully using Grok requires knowing which parameters work with which model generation. + +### Base configuration +```python +client = OpenAI( + api_key="xai-xxx", # Keys start with xai- prefix + base_url="https://api.x.ai/v1" +) +``` + +### Model-specific restrictions are the primary gotcha + +**Grok-4 (reasoning models) don't support**: +- `presence_penalty` (returns error) +- `frequency_penalty` (returns error) +- `stop` sequences (returns error) +- `reasoning_effort` (only grok-3-mini supports this) + +**Use `max_completion_tokens` instead of `max_tokens`** for Grok-4. Many tools auto-inject `max_tokens`, causing errors. + +**`stream_options` reported as unsupported** in some integrations (n8n), though basic `stream=true` works. The `stream_options.include_usage` parameter may cause "Argument not supported" errors. + +**Structured outputs don't work with streaming**—must choose one or the other. + +### Parameter support matrix +| Category | Supported | Notes | +|----------|-----------|-------| +| temperature/top_p | ✅ | | +| presence_penalty/frequency_penalty | ⚠️ | Grok-3 only, not reasoning models | +| max_completion_tokens | ✅ | Required for Grok-4 | +| max_tokens | ⚠️ | Works on Grok-2/3, not Grok-4 | +| n (multiple completions) | ✅ | | +| stop sequences | ⚠️ | Grok-3 only, not reasoning models | +| logprobs/top_logprobs | ✅ | | +| tools/tool_choice | ✅ | Max 128-200 functions | +| parallel_tool_calls | ✅ | Default enabled | +| response_format | ✅ | No streaming with json_schema | +| stream | ✅ | | +| reasoning_effort | ⚠️ | grok-3-mini only ("low"/"high") | + +### Provider-specific extensions +xAI offers unique search and agentic capabilities: +- **`search_parameters`**: Live web/X/news search (deprecating January 2026) +- **Deferred completions**: Submit request, retrieve result later via `request_id` +- **`x-grok-conv-id` header**: Optimize prompt caching with UUID +- **Reasoning content**: `use_encrypted_content: true` for encrypted reasoning traces + +--- + +## MoonshotAI restricts temperature range significantly + +MoonshotAI (Kimi) offers "full OpenAI compatibility" from a Chinese AI company, with both global (`api.moonshot.ai`) and China (`api.moonshot.cn`) endpoints. 
The main constraint is a **temperature ceiling of 1.0** versus OpenAI's 2.0. + +### Base configuration +```python +client = OpenAI( + api_key="MOONSHOT_API_KEY", + base_url="https://api.moonshot.ai/v1" # or api.moonshot.cn for China +) +``` + +### Critical gotchas developers encounter + +**Temperature maximum is 1.0**—values above 1 are clamped. Additionally, if `temperature < 0.3` and `n > 1`, MoonshotAI raises an exception. + +**Vision requires base64 only**—`image_url` with HTTP URLs doesn't work; images must be base64-encoded. + +**Reasoning content requires special access**. The `kimi-k2-thinking` model returns `reasoning_content` which isn't in OpenAI SDK types—use `hasattr(obj, "reasoning_content")` and `getattr()` to access it safely. + +**5-minute request timeout**—longer reasoning or generation returns 504 errors. + +### Parameter support matrix +| Category | Supported | Notes | +|----------|-----------|-------| +| temperature | ✅ | Range 0-1 only (recommend 0.6) | +| top_p | ✅ | | +| presence_penalty/frequency_penalty | ❓ | Undocumented | +| max_tokens | ✅ | Up to 32,000 for K2 | +| n | ✅ | Restricted with low temperature | +| stop sequences | ✅ | | +| logprobs/top_logprobs | ❓ | Undocumented | +| tools/tool_choice | ✅ | Up to 128 functions | +| response_format | ✅ | json_object confirmed | +| stream | ✅ | Recommended for thinking models | + +### Provider-specific extensions +MoonshotAI offers unique built-in capabilities: +- **`$web_search`**: Official built-in web search tool ($0.005/call) +- **`$date`**: Get current date +- **File API**: Upload documents for extraction and OCR +- **Automatic context caching**: No configuration needed, cached tokens cost 75% less + +--- + +## Building a universal client requires defensive coding + +Based on these findings, a universal Chat Completions client should implement: + +### Parameter validation by provider +```python +PROVIDER_LIMITS = { + "groq": {"n_max": 1, "logprobs": False, "json_streaming": False}, + "xai_reasoning": {"presence_penalty": False, "frequency_penalty": False, "stop": False}, + "moonshot": {"temperature_max": 1.0}, + "gemini": {"logprobs": False} +} +``` + +### Graceful degradation for unsupported features +Strip unsupported parameters rather than failing. For example, remove `logprobs` for Gemini/Groq/MoonshotAI, convert `max_tokens` to `max_completion_tokens` for Grok-4. + +### Handle response format variations +- OpenRouter adds `native_finish_reason` field +- Gemini may return `None` message on content filter +- xAI/MoonshotAI add `reasoning_content` field +- Groq adds `x_groq` timing metadata + +### Model name translation +Each provider has unique naming conventions: +- OpenRouter: `org/model` (e.g., `openai/gpt-4o`) +- Gemini: bare names (e.g., `gemini-2.5-flash`) +- Groq: vendor prefixes or short IDs (e.g., `llama-3.3-70b-versatile`) +- xAI: version suffixes (e.g., `grok-4`, `grok-4-0709`) +- MoonshotAI: product names (e.g., `kimi-k2-0905-preview`) + +--- + +## Conclusion + +For maximum compatibility with minimal friction, **OpenRouter provides the most complete OpenAI API implementation** with automatic transformation for diverse upstream providers. However, its normalized token counting and response format additions require awareness. + +**Gemini and Groq have the most significant feature gaps**—no logprobs, and Groq's inability to combine JSON mode with streaming breaks common agentic patterns. 
+ +**xAI requires model-aware parameter handling**—code that works with Grok-3 may fail on Grok-4 due to removed parameter support. + +**MoonshotAI's temperature restriction** is the most limiting factor, but otherwise provides solid compatibility for standard use cases. + +All providers achieve compatibility for the **80% case** of basic chat completions, streaming, and tool calling. The incompatibilities emerge in advanced features like logprobs, multiple completions, and structured output with streaming—exactly the features that power sophisticated AI applications. diff --git a/crates/rullm-openai/spec/implementation.md b/crates/rullm-openai/spec/implementation.md new file mode 100644 index 00000000..79ea1084 --- /dev/null +++ b/crates/rullm-openai/spec/implementation.md @@ -0,0 +1,224 @@ +# Rust Port Notes for OpenAI Chat Completions + +This document captures implementation guidance for a standalone Rust SDK for +OpenAI Chat Completions, based on: +- `crates/rullm-openai/spec/chat-completion.md` and `chat-completion2.md` +- openai-go and openai-node (generated from the OpenAPI spec by Stainless) +- codex-rs (Chat Completions streaming support inside the Codex CLI) + +The goal is to expose a reusable, standalone Chat Completions client, not tied +to the Codex CLI. + +## 1) API Surface to Implement +Match the OpenAI SDK patterns (Go/Node) at minimum: + +- `POST /chat/completions` (create, non-streaming) +- `POST /chat/completions` (create, streaming) +- `GET /chat/completions/{id}` (retrieve stored completion) +- `GET /chat/completions` (list stored completions) +- `POST /chat/completions/{id}` (update metadata) +- `DELETE /chat/completions/{id}` (delete) +- `GET /chat/completions/{id}/messages` (list stored messages) + +Recommended shape for Rust: +- `ChatCompletionsClient::create(params) -> ChatCompletion` +- `ChatCompletionsClient::stream(params) -> Stream` +- `ChatCompletionsClient::retrieve(id)` / `list(params)` / `update(id, params)` +- `ChatCompletionsClient::delete(id)` +- `ChatCompletionsClient::list_messages(id, params)` + +## 2) Core Type Map (Request/Response) + +### 2.1 Request Types +Define a `ChatCompletionCreateParams` struct that mirrors the Go/Node field set. +Include all current parameters, even if some are deprecated, to preserve API +compatibility: + +- Required: `model`, `messages` +- Sampling: `temperature`, `top_p`, `presence_penalty`, `frequency_penalty` +- Tokens: `max_completion_tokens`, `max_tokens` (deprecated) +- Output count: `n` +- Stopping: `stop` (string or array) +- Logprobs: `logprobs`, `top_logprobs` +- Tools: `tools`, `tool_choice`, `parallel_tool_calls` +- Structured outputs: `response_format` (json_schema/json_object/text) +- Audio output: `modalities`, `audio` +- Web search: `web_search_options` +- Predicted outputs: `prediction` +- Prompt caching: `prompt_cache_key`, `prompt_cache_retention` +- Safety: `safety_identifier` (replace `user`) +- Storage: `store`, `metadata` +- Service tier: `service_tier` +- Reasoning: `reasoning_effort`, `verbosity` +- Streaming: `stream`, `stream_options` + +Support `null` and omitted fields where the API allows them. + +### 2.2 Message Types +`messages` is a union by `role`. 
Suggested Rust modeling: + +- `enum ChatCompletionMessageParam` tagged by `role` + - `System`, `Developer`, `User`, `Assistant`, `Tool`, `Function (deprecated)` + +Common fields: +- `content` for most roles +- `name` optional for `system`, `developer`, `user`, `assistant` +- `tool_call_id` required for `tool` role +- `tool_calls` or `function_call` (deprecated) for assistant messages +- `audio` is allowed on assistant messages + +Content is a union: +- `String` +- `Array` + +### 2.3 Content Parts +`ContentPart` is a union by `type`. From openai-node/openai-go: +- `text` { text } +- `image_url` { image_url: { url, detail? } } +- `input_audio` { input_audio: { data, format } } +- `file` { file: { file_id | file_data, filename? } } +- `refusal` (assistant-only content part) + +### 2.4 Tools and Tool Calls +Define tool and tool call unions with explicit `type` tags: + +Tools (`tools` in request): +- `function` { function: FunctionDefinition } +- `custom` { custom: { name, description?, format? } } + - `format`: `text` or `grammar` (with `definition` and `syntax`) + +Tool calls (in responses and deltas): +- `function` { id, function: { name, arguments } } +- `custom` { id, custom: { name, input } } + +Tool choice options (`tool_choice`) are a union: +- string: `none`, `auto`, `required` +- named tool choice: `{ type: "function", function: { name } }` +- custom named tool choice: `{ type: "custom", custom: { name } }` +- allowed tools: `{ type: "allowed_tools", allowed_tools: { mode, tools } }` + +Also keep deprecated fields: +- `functions` and `function_call` (request) +- `function_call` (assistant message/stream delta) + +### 2.5 Response Types +Non-streaming response: `ChatCompletion`: +- `id`, `object: "chat.completion"`, `created`, `model` +- `choices[]`: `index`, `message`, `finish_reason`, optional `logprobs` +- `usage` (prompt/completion/total + details) +- `service_tier`, `system_fingerprint` (deprecated) + +Streaming response: `ChatCompletionChunk`: +- `id`, `object: "chat.completion.chunk"`, `created`, `model` +- `choices[]` with `delta` objects +- `usage` optional (final usage chunk if `include_usage`) + +`delta` fields can include: +- `role`, `content`, `refusal`, `tool_calls`, `function_call` (deprecated) +- Logprobs per choice + +### 2.6 Usage and Logprobs +Usage should include detail fields (when present): +- completion tokens: `accepted_prediction_tokens`, `rejected_prediction_tokens`, + `reasoning_tokens`, `audio_tokens` +- prompt tokens: `cached_tokens`, `audio_tokens` + +Logprobs include per-token info for both content and refusal. + +## 3) Serde Modeling Tips + +- Use `#[serde(tag = "role", rename_all = "snake_case")]` for message unions. +- Use `#[serde(tag = "type", rename_all = "snake_case")]` for content parts and + tool/tool_call unions. +- For `content`, `stop`, `tool_choice`, and `response_format`, use `#[serde(untagged)]` + enums to support string vs array or object unions. +- Preserve forward compatibility by: + - `#[serde(default)]` for optional fields + - `#[serde(flatten)]` to capture unknown fields in responses + - avoiding strict enum exhaustiveness where new variants may appear + +## 4) Streaming and SSE Handling + +### 4.1 SSE decoding +- The API uses `text/event-stream` with `data: {json}` and `data: [DONE]`. 
+- Implement a tolerant SSE parser that: + - buffers partial chunks + - ignores empty/comment lines + - ends on `[DONE]` or socket close + - treats `error` objects inside `data` as terminal errors + +### 4.2 Delta accumulation +Follow openai-go and codex-rs patterns: +- Concatenate `delta.content` and `delta.refusal` fragments in order. +- For `tool_calls`, merge by `index` and `id` and concatenate + `function.arguments` fragments. +- Handle missing indices (codex-rs maps by `id` or last index). +- Support multiple parallel tool calls (do not assume `index == 0`). +- Keep `finish_reason` per choice. +- Accumulate logprobs and usage (usage often reported only at the final chunk). + +Suggested helper: a `ChatCompletionAccumulator` (like openai-go) that merges +chunks into a full `ChatCompletion`, plus convenience helpers for detecting +when content or tool calls have just completed. + +### 4.3 Stream options +`stream_options` includes: +- `include_usage` (final usage-only chunk) +- `include_obfuscation` (extra fields on deltas, must be ignored if unknown) + +## 5) Structured Outputs and Parsing Helpers + +OpenAI SDKs provide helpers to parse structured outputs: +- `response_format` with `type: json_schema` +- `strict` in function definitions to enforce schema adherence + +Optional convenience in Rust: +- Provide a helper that parses `choice.message.content` into a typed struct + when `response_format` is `json_schema`. +- Provide a helper that parses tool call arguments into JSON when `strict` is + true or when the caller opts in. + +These are optional, but common in openai-node (`parse` and tool runner helpers). + +## 6) Error Handling and Resilience + +- Map HTTP error responses to a structured `ErrorObject` (message/type/param/code). +- Bubble `x-request-id` and rate limit headers up to the caller. +- Accept unknown enum values and ignore unknown fields. +- Do not hard-fail on unsupported parameters; let the API reject if needed. + +## 7) Notes from codex-rs + +codex-rs includes a dedicated chat completion SSE parser: +- It is robust to missing tool call indices +- It concatenates tool arguments across deltas +- It emits reasoning deltas when present (`delta.reasoning` may be a string or + nested object) +- It treats `finish_reason == length` as a context window error + +This logic is a good reference for a resilient streaming implementation. + +## 8) Gaps vs current rullm-core OpenAI types + +The current rullm-core types cover only a subset of the modern API. 
The Rust +port should add: +- `developer` role +- `audio` input and output types +- `file` content parts +- `refusal` content parts +- `custom` tools +- `tool_choice` variants for allowed tools +- `web_search_options` and `annotations` +- `prediction` and prompt caching fields +- `reasoning_effort`, `verbosity`, `service_tier` +- `prompt_cache_key`, `prompt_cache_retention`, `safety_identifier` +- `stream_options.include_usage` and `include_obfuscation` + +## 9) Tests to Include + +- Non-streaming: full response decode with tool calls and annotations +- Streaming: content deltas, refusal deltas, tool call argument assembly +- Streaming: `include_usage` final chunk with empty choices +- Tool choice unions and stop unions serialize correctly +- Content part unions (text, image_url, input_audio, file) + From 062cc3e8769febad535ec2ee33cb3ce4ec9da01f Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 15:40:44 +0530 Subject: [PATCH 03/14] add implementation plan --- .../spec/implementation-final.md | 298 ++++++++++ .../rullm-openai/spec/implementation-final.md | 508 ++++++++++++++++++ 2 files changed, 806 insertions(+) create mode 100644 crates/rullm-anthropic/spec/implementation-final.md create mode 100644 crates/rullm-openai/spec/implementation-final.md diff --git a/crates/rullm-anthropic/spec/implementation-final.md b/crates/rullm-anthropic/spec/implementation-final.md new file mode 100644 index 00000000..75f66746 --- /dev/null +++ b/crates/rullm-anthropic/spec/implementation-final.md @@ -0,0 +1,298 @@ +# Anthropic Messages Rust Client - Implementation Design + +This document proposes an idiomatic Rust client for the Anthropic Messages API. +The Messages API is Anthropic-specific, but some gateways expose Anthropic-compatible +endpoints; the design keeps an optional compat layer for that case. It is based on +`spec/message-api.md`, `spec/implementation.md`, and patterns in rullm-core. The +design emphasizes ergonomic builders, strong typing, streaming helpers, and clean +error handling. + +## 1) Goals and non-goals + +Goals +- Feature parity with official Anthropic SDKs for the Messages API. +- Excellent developer experience: easy defaults, expressive builders, helpers for + common tasks, and easy streaming consumption. +- Optional compat ergonomics for Anthropic-compatible gateways (not a claim that + other providers natively use the Messages API). +- Forward compatibility: tolerate unknown enum values and fields. + +Non-goals (initial release) +- A cross-provider abstraction layer for non-Anthropic APIs (OpenAI/Gemini/etc.). +- Full Bedrock/Vertex implementations (can be added later). + +## 2) Package layout (proposed) + +``` +crates/rullm-anthropic/src/ + client.rs // Client, ClientBuilder, RequestOptions + config.rs // env helpers, base url, auth modes + error.rs // AnthropicError, ErrorObject + messages/ // requests, responses, types + mod.rs + types.rs // content blocks, tools, streaming events + stream.rs // SSE parsing + accumulator + models.rs // list/get models + batches.rs // create/get/list/cancel/delete/results + completions.rs // legacy API + compat.rs // optional compat types + conversions (Anthropic-compatible gateways) + transport.rs // HttpTransport trait + reqwest impl + lib.rs // re-exports +``` + +Notes +- Keep the public surface in `lib.rs` small and intentional. +- Prefer `crate::` paths (avoid `super::`). +- Avoid `pub use` unless re-exporting external dependencies. 
+ +## 3) Client configuration and auth + +### 3.1 ClientBuilder +Provide a builder with explicit fields and env defaults: + +- `Client::builder()` -> `ClientBuilder` +- `Client::from_env()` -> uses: + - `ANTHROPIC_API_KEY` + - `ANTHROPIC_AUTH_TOKEN` + - `ANTHROPIC_BASE_URL` + +Auth modes: +- API key: `x-api-key: ` +- OAuth token: `Authorization: Bearer ` + +Required headers: +- `anthropic-version: 2023-06-01` +- `content-type: application/json` + +Recommended builder fields: +- `api_key: Option>` +- `auth_token: Option>` +- `base_url: Arc` (default `https://api.anthropic.com`) +- `timeout: Duration` (global default) +- `max_retries: u32` +- `beta: Vec>` (optional `anthropic-beta` header) +- `default_headers: HeaderMap` (merge-able) + +### 3.2 RequestOptions (per-request override) +A small options struct to keep the API uniform across clients (even if the +provider APIs differ): + +- `timeout: Option` +- `extra_headers: HeaderMap` +- `extra_query: Vec<(Arc, Arc)>` +- `extra_body: serde_json::Map` + +This mirrors `extra_headers/extra_query/extra_body` patterns in other SDKs and +makes the client usable with Anthropic-compatible gateways. + +## 4) Optional compat types (Anthropic-compatible gateways) + +The client can expose a minimal common interface for applications that talk to +Anthropic-compatible gateways. This is useful when the same app targets multiple +providers through a gateway that accepts the Anthropic Messages API format. + +Proposed compat types (align with rullm-core): +- `ChatRole` (System/User/Assistant/Tool) +- `ChatMessage { role, content }` +- `ChatRequest { messages, temperature, max_tokens, top_p, stream }` +- `ChatResponse { message, model, usage, finish_reason }` +- `ChatStreamEvent { Token(String), Done, Error(String) }` + +Conversions: +- `impl From for ChatRequest` (best-effort mapping) +- `impl TryFrom for MessagesRequest` (errors if unsupported fields) +- `impl From for ChatResponse` (extract first text block) + +This keeps Anthropic ergonomics while enabling Anthropic-compatible gateway use. + +## 5) Messages API surface + +### 5.1 Primary entry points +Expose a sub-client similar to official SDKs: + +- `Client::messages()` -> `MessagesClient` +- `MessagesClient::create(req, opts)` -> `Message` +- `MessagesClient::stream(req, opts)` -> `MessageStream` +- `MessagesClient::count_tokens(req, opts)` -> `CountTokensResponse` +- `MessagesClient::batches()` -> `BatchesClient` + +### 5.2 Builder ergonomics +Provide a builder for the request that favors clarity: + +``` +MessagesRequest::builder("claude-3-5-sonnet-20241022") + .max_tokens(1024) + .system("You are helpful") + .message(Message::user("Hello")) + .temperature(0.7) + .build()?; +``` + +Design notes +- `system` is top-level (no system role in messages). +- Accept `system` as `SystemContent` (string or text blocks). +- `messages` accept `MessageContent` (string shorthand or blocks). + +### 5.3 Type modeling overview + +Request +- `MessagesRequest { model, max_tokens, messages, system?, metadata?, stop_sequences?, temperature?, top_p?, top_k?, tools?, tool_choice?, thinking?, service_tier?, stream? 
}` + +Response +- `Message { id, type, role, model, content, stop_reason?, stop_sequence?, usage }` + +Use `serde` tagging: +- `#[serde(tag = "type", rename_all = "snake_case")]` for content blocks +- `#[serde(untagged)]` for `string | [blocks]` unions + +## 6) Content blocks and tools + +### 6.1 ContentBlockParam (input) +Support all common and advanced blocks: +- `text` +- `image` (base64 or url) +- `document` (pdf base64/url, plain text, or embedded blocks) +- `search_result` +- `tool_result` +- advanced: `tool_use`, `server_tool_use`, `web_search_tool_result`, + `thinking`, `redacted_thinking` + +### 6.2 ContentBlock (output) +Support output blocks: +- `text`, `tool_use`, `thinking`, `redacted_thinking`, `server_tool_use`, + `web_search_tool_result` + +### 6.3 Tools +Use a union for custom and server tools: +- Custom: `{ name, description?, input_schema }` +- Server tools: `bash_20250124`, `text_editor_20250124/20250429/20250728`, + `web_search_20250305` + +Tool choice union: +- `auto | any | none | tool(name)` +- `disable_parallel_tool_use: bool` + +## 7) Streaming design + +### 7.1 Raw SSE +Streaming uses SSE with event `type`: +- `message_start` +- `content_block_start` +- `content_block_delta` +- `content_block_stop` +- `message_delta` +- `message_stop` + +Implement a tolerant SSE parser: +- buffer partial chunks +- ignore empty/comment lines +- stop on stream close +- surface JSON parse errors as `AnthropicError::Serialization` + +### 7.2 MessageStream helper +Provide a higher-level stream wrapper that merges deltas into a full message. + +Proposed API: +- `MessageStream::events()` -> raw `StreamEvent` +- `MessageStream::text_stream()` -> `impl Stream, Error>>` +- `MessageStream::final_message()` -> `Result` (awaits completion) + +Use a `MessageAccumulator` internally: +- append text deltas +- merge tool input JSON fragments +- update usage/stop_reason + +### 7.3 Tool input JSON deltas +Maintain both: +- `partial_json: String` +- `parsed: Option` (best-effort) + +Parsing strategy: +- append fragment on each delta +- attempt `serde_json::from_str` after each update +- keep the last successful parse + +This avoids a hard dependency on a partial JSON parser while still offering +useful intermediate values. + +## 8) Timeout policy + +The official SDKs enforce a non-streaming timeout policy. Mirror it: + +- Default non-stream timeout: 10 minutes +- `expected_time = 1h * max_tokens / 128000` +- If `expected_time > 10m`, require streaming +- Maintain a `MODEL_NONSTREAMING_TOKENS` map (from SDKs) + +Expose this as: +- `ClientConfig::non_streaming_policy` +- `MessagesRequest::validate_non_streaming(&policy)` + +Allow opt-out via `RequestOptions::allow_long_non_streaming`. + +## 9) Error handling + +Use a structured error enum and preserve request_id: + +``` +enum AnthropicError { + Api { status: StatusCode, request_id: Option>, error: ErrorObject }, + Transport(reqwest::Error), + Serialization(String, Box), + Timeout, + InvalidRequest(String), +} +``` + +`ErrorObject` mirrors the response: +- `type`, `message` (plus optional `param` when present) + +Always surface `request-id` header in errors and responses. + +## 10) Rust ergonomics and idioms + +- Avoid panics in library code. No `unwrap`/`expect` in production paths. +- Use `Arc` and `Arc<[T]>` for immutable data cloned often. +- Prefer `From`/`TryFrom` for conversions rather than custom `to_*` methods. +- Provide `Option<&T>` accessors instead of `&Option`. +- Use `&str`/`&[T]` in accessors instead of `&String`/`&Vec`. 
+ +## 11) Example usage (final API shape) + +Non-streaming: +``` +let client = Client::from_env()?; +let req = MessagesRequest::builder("claude-3-5-sonnet-20241022") + .max_tokens(512) + .system("You are helpful") + .message(Message::user("Explain Rust lifetimes.")) + .temperature(0.7) + .build()?; + +let msg = client.messages().create(req, RequestOptions::default()).await?; +let text = msg.text(); // helper to join text blocks +``` + +Streaming: +``` +let stream = client.messages().stream(req, RequestOptions::default()).await?; +let mut text = String::new(); +let mut s = stream.text_stream(); +while let Some(chunk) = s.next().await { + text.push_str(&chunk?); +} +let final_msg = stream.final_message().await?; +``` + +## 12) Implementation notes for Anthropic-compatible gateways + +To keep this client usable with Anthropic-compatible gateways: +- Keep `RequestOptions`, `ClientBuilder`, and `transport::HttpTransport` in a + familiar shape across rullm crates. +- Provide `compat` conversions (ChatRequest/ChatResponse) for apps that target + a gateway exposing the Anthropic Messages API. +- Keep the `MessageStream` API consistent (text_stream + final_message). + +This yields a cohesive developer experience across Anthropic and any gateway +that implements the Anthropic Messages API while still exposing full Anthropic +functionality. diff --git a/crates/rullm-openai/spec/implementation-final.md b/crates/rullm-openai/spec/implementation-final.md new file mode 100644 index 00000000..f2b1e9d7 --- /dev/null +++ b/crates/rullm-openai/spec/implementation-final.md @@ -0,0 +1,508 @@ +# Idiomatic Rust Client Design for OpenAI Chat Completions (Multi-Provider) + +This document defines a Rust client design for the OpenAI **Chat Completions** +API with first-class support for OpenAI-compatible providers (OpenRouter, +Gemini, Groq, xAI, MoonshotAI). It prioritizes developer experience, forward +compatibility, and graceful handling of provider differences. + +This is a design spec only. It references the request/response shapes and +compatibility notes in `spec/chat-completion*.md`. + +--- + +## 1) Goals and Non-Goals + +**Goals** +- Ergonomic API for common use (`client.chat().model(...).user(...)`). +- Full coverage of Chat Completions parameters and response shapes. +- Streaming support with correct SSE parsing and delta accumulation. +- Forward-compatible JSON decoding (unknown fields and enum values tolerated). +- Provider-aware parameter handling with graceful degradation. +- Clean integration with Rust async ecosystems. + +**Non-Goals** +- Implement the Responses API (this client is Chat Completions focused). +- Enforce strict compile-time correctness for role/field combos (runtime + validation is optional and configurable). 
+ +--- + +## 2) Module Layout (Suggested) + +``` +crates/rullm-openai/ + src/ + client.rs // ChatCompletionsClient + HTTP wiring + config.rs // ClientConfig, ProviderProfile, CapabilityResolver + types.rs // Request/response structs, message/content/tool types + streaming.rs // SSE decoder + ChatCompletionStream + accumulator + error.rs // Error types + retry classification + compat.rs // Parameter policy + capability rules + util.rs // Small helpers (headers, url, serialization) +``` + +--- + +## 3) Client API Surface + +### 3.1 Primary Client + +``` +pub struct ChatCompletionsClient { /* cloneable */ } + +impl ChatCompletionsClient { + pub fn new(config: ClientConfig) -> Result; + + // Core endpoint (non-streaming) + pub async fn create( + &self, + req: ChatCompletionRequest, + ) -> Result, ClientError>; + + // Core endpoint (streaming) + pub async fn stream( + &self, + req: ChatCompletionRequest, + ) -> Result; + + // Stored completions (OpenAI only; gated by capability profile) + pub async fn retrieve(&self, id: &str) -> Result, ClientError>; + pub async fn list(&self, params: ListParams) -> Result, ClientError>; + pub async fn update(&self, id: &str, params: UpdateParams) -> Result, ClientError>; + pub async fn delete(&self, id: &str) -> Result, ClientError>; + pub async fn list_messages(&self, id: &str, params: ListParams) + -> Result, ClientError>; + + // DX convenience + pub fn chat(&self) -> ChatRequestBuilder; +} +``` + +### 3.2 Convenience Builder + +``` +let resp = client.chat() + .model("gpt-4o") + .system("You are concise.") + .user("Summarize this") + .temperature(0.2) + .send() + .await?; + +let stream = client.chat() + .model("gpt-4o") + .user("Stream this") + .stream() + .await?; +``` + +Design notes: +- The builder collects a `Vec` and converts to `Arc<[Message]>` + on `send`/`stream`. +- `send()` returns `ApiResponse`. +- `stream()` returns `ChatCompletionStream`. + +--- + +## 4) Configuration and Provider Profiles + +### 4.1 ClientConfig + +``` +pub struct ClientConfig { + pub api_key: Arc, + pub base_url: Url, + pub default_headers: HeaderMap, + pub timeout: Duration, + pub provider: ProviderProfile, + pub parameter_policy: ParameterPolicy, + pub capability_resolver: Arc, +} +``` + +### 4.2 ProviderProfile (Built-in) + +`ProviderProfile` supplies defaults and capability constraints. + +``` +pub enum ProviderKind { OpenAI, OpenRouter, Gemini, Groq, Xai, Moonshot, Custom } + +pub struct ProviderProfile { + pub kind: ProviderKind, + pub base_url: Url, + pub supports_stored_completions: bool, + pub capabilities: Capabilities, + pub model_rules: Vec, +} +``` + +**Built-in profiles** include known constraints (from +`chat-completion-difference.md`): +- Groq: `n=1`, no logprobs, JSON mode cannot stream. +- Gemini: no logprobs, stricter schema validation. +- xAI: reasoning models disallow penalties and stop; no JSON streaming. +- Moonshot: temperature max 1.0, image URLs base64 only. +- OpenRouter: accepts most params, adds comment SSE lines. + +### 4.3 CapabilityResolver + +``` +pub trait CapabilityResolver: Send + Sync { + fn capabilities_for(&self, model: &ModelId) -> Capabilities; +} +``` + +`Capabilities` is a simple struct with booleans + numeric limits, e.g. +`supports_logprobs`, `supports_streaming_json`, `temperature_max`, `supports_n`. 
+ +### 4.4 ParameterPolicy + +``` +pub enum ParameterPolicy { + StrictError, // reject unsupported parameters + WarnAndStrip, // drop unsupported parameters and emit warnings + PassThrough, // send as-is (let server reject) +} +``` + +The client emits a `CompatibilityReport` (warnings, applied transforms) via +`ApiResponse::meta` so users can log or test for mismatches. + +--- + +## 5) Core Types (Request/Response) + +### 5.1 Identifiers and Common Newtypes + +Use `Arc` for immutable strings: + +``` +pub struct ModelId(pub Arc); + +pub struct Role(pub Arc); +impl Role { + pub const SYSTEM: Role = Role::static_str("system"); + pub const DEVELOPER: Role = Role::static_str("developer"); + pub const USER: Role = Role::static_str("user"); + pub const ASSISTANT: Role = Role::static_str("assistant"); + pub const TOOL: Role = Role::static_str("tool"); +} +``` + +Using string newtypes avoids breaking when new roles appear. + +### 5.2 Messages and Content + +``` +#[derive(Serialize, Deserialize)] +pub struct Message { + pub role: Role, + pub content: Option, + pub name: Option>, + pub tool_calls: Option>, + pub tool_call_id: Option>, + pub audio: Option, + pub function_call: Option, // deprecated + #[serde(flatten)] + pub extra: Map, +} + +#[serde(untagged)] +pub enum MessageContent { + Text(Arc), + Parts(Arc<[ContentPart]>), +} +``` + +Convenience constructors: +- `Message::system(text)` +- `Message::developer(text)` +- `Message::user(text_or_parts)` +- `Message::assistant(text_or_parts)` +- `Message::tool(tool_call_id, content)` + +### 5.3 Content Parts + +``` +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentPart { + Text { text: Arc }, + ImageUrl { image_url: ImageUrlPart }, + InputAudio { input_audio: InputAudioPart }, + File { file: FilePart }, + Refusal { refusal: Arc }, + #[serde(other)] + Other, +} +``` + +Note: If `serde(other)` is too lossy, use a `RawContentPart` fallback that +preserves `type` and payload via `serde_json::Value`. 
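
One possible shape for that fallback, sketched with serde/serde_json and only the
`text` part modelled (an untagged wrapper around the known variants plays the role
of `RawContentPart` here):

```
use serde::{Deserialize, Serialize};
use serde_json::Value;

// Known variants stay internally tagged, without a lossy `Other` arm.
#[derive(Serialize, Deserialize, Debug)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum KnownContentPart {
    Text { text: String },
}

// Untagged wrapper: try the known variants first, otherwise keep the raw JSON.
#[derive(Serialize, Deserialize, Debug)]
#[serde(untagged)]
pub enum ContentPart {
    Known(KnownContentPart),
    /// Unknown `type` values keep their full payload, including `type` itself.
    Raw(Value),
}

fn main() -> serde_json::Result<()> {
    let parts: Vec<ContentPart> = serde_json::from_str(
        r#"[{"type":"text","text":"hi"},{"type":"image_url","image_url":{"url":"https://example.com/cat.png"}}]"#,
    )?;
    // First part decodes as Known(Text { .. }); the second is preserved as Raw JSON.
    println!("{parts:?}");
    // Round-trips without dropping the unknown part's payload.
    println!("{}", serde_json::to_string(&parts)?);
    Ok(())
}
```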
+ +### 5.4 Tools and Tool Calls + +``` +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ToolDefinition { + Function { function: FunctionDefinition }, + Custom { custom: CustomToolDefinition }, + #[serde(other)] + Other, +} + +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ToolCall { + Function { id: Arc, function: FunctionCall }, + Custom { id: Arc, custom: CustomToolCall }, + #[serde(other)] + Other, +} +``` + +Tool choice uses an untagged enum: +``` +#[serde(untagged)] +pub enum ToolChoice { + Mode(ToolChoiceMode), + Function { r#type: Arc, function: ToolChoiceFunction }, + Custom { r#type: Arc, custom: ToolChoiceCustom }, + AllowedTools { r#type: Arc, allowed_tools: AllowedToolsSpec }, +} +``` + +### 5.5 Request Struct + +``` +pub struct ChatCompletionRequest { + pub model: ModelId, + pub messages: Arc<[Message]>, + + // sampling + stopping + pub temperature: Option, + pub top_p: Option, + pub n: Option, + pub stop: Option, + pub presence_penalty: Option, + pub frequency_penalty: Option, + + // tokens + pub max_completion_tokens: Option, + pub max_tokens: Option, // deprecated + + // logprobs + pub logprobs: Option, + pub top_logprobs: Option, + pub logit_bias: Option>, + + // tools + pub tools: Option>, + pub tool_choice: Option, + pub parallel_tool_calls: Option, + pub functions: Option>, // deprecated + pub function_call: Option, // deprecated + + // response formatting + pub response_format: Option, + + // multimodal + audio + pub modalities: Option]> >, + pub audio: Option, + + // advanced features + pub stream: Option, + pub stream_options: Option, + pub prediction: Option, + pub web_search_options: Option, + pub reasoning_effort: Option>, + pub verbosity: Option>, + pub service_tier: Option>, + pub store: Option, + pub metadata: Option>, + + // identifiers + pub seed: Option, + pub user: Option>, // deprecated + pub safety_identifier: Option>, + pub prompt_cache_key: Option>, + pub prompt_cache_retention: Option>, + + // escape hatch + pub extra_body: Option>, +} +``` + +Notes: +- Use `Arc<[T]>` for immutable arrays (messages, tools, modalities). +- Use `Option` for nullable/omitted fields. +- `extra_body` for provider-specific extensions (Gemini thinking config, etc). + +### 5.6 Response Structs + +``` +pub struct ChatCompletion { + pub id: Arc, + pub object: Arc, + pub created: u64, + pub model: ModelId, + pub choices: Arc<[ChatChoice]>, + pub usage: Option, + pub service_tier: Option>, + pub system_fingerprint: Option>, + #[serde(flatten)] + pub extra: Map, +} + +pub struct ChatChoice { + pub index: u32, + pub message: Option, + pub finish_reason: Option>, + pub logprobs: Option, + #[serde(flatten)] + pub extra: Map, +} +``` + +Streaming chunk: + +``` +pub struct ChatCompletionChunk { + pub id: Arc, + pub object: Arc, + pub created: u64, + pub model: ModelId, + pub choices: Arc<[ChatChunkChoice]>, + pub usage: Option, + #[serde(flatten)] + pub extra: Map, +} +``` + +--- + +## 6) Streaming and Accumulation + +### 6.1 SSE Parser +- Accept `data:` lines only (ignore comments and empty lines). +- Terminate on `[DONE]` or EOF. +- Surface `error` objects embedded in SSE data. + +### 6.2 Accumulator + +Provide `ChatCompletionAccumulator` to merge chunks into a final +`ChatCompletion`: +- Concatenate `delta.content` fragments. +- Merge tool call arguments per `tool_call.id` (not just index). +- Track `finish_reason` per choice. +- Handle usage-only final chunk (`choices` empty). 
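
The merge rules can be sketched with simplified stand-in types; the real chunk and
delta structs carry more fields, but the keying and concatenation logic is the point:

```
use std::collections::HashMap;

// Simplified stand-ins for the real delta types (illustration only).
#[derive(Default)]
struct ToolCallDelta {
    id: Option<String>,
    index: Option<u32>,
    name: Option<String>,
    arguments: Option<String>,
}

#[derive(Default)]
struct ChoiceDelta {
    content: Option<String>,
    tool_calls: Vec<ToolCallDelta>,
    finish_reason: Option<String>,
}

#[derive(Default)]
struct AccumulatedChoice {
    content: String,
    finish_reason: Option<String>,
    /// Keyed by tool call id (index as fallback) so parallel calls stay separate.
    tool_calls: HashMap<String, (Option<String>, String)>, // key -> (name, argument buffer)
}

impl AccumulatedChoice {
    fn apply(&mut self, delta: ChoiceDelta) {
        if let Some(text) = delta.content {
            self.content.push_str(&text);
        }
        for call in delta.tool_calls {
            let key = call
                .id
                .clone()
                .or_else(|| call.index.map(|i| format!("index:{i}")))
                .unwrap_or_else(|| "index:0".to_string());
            let entry = self.tool_calls.entry(key).or_default();
            if call.name.is_some() {
                entry.0 = call.name;
            }
            if let Some(fragment) = call.arguments {
                entry.1.push_str(&fragment);
            }
        }
        if delta.finish_reason.is_some() {
            self.finish_reason = delta.finish_reason;
        }
    }
}
```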
+ +Stream API: +``` +pub struct ChatCompletionStream { + pub fn accumulator(self) -> ChatCompletionAccumulator; +} +``` + +--- + +## 7) Provider Compatibility Strategy + +### 7.1 Capability-aware Request Shaping + +Before sending, apply per-provider and per-model rules: +- Strip or reject unsupported fields (depending on `ParameterPolicy`). +- Transform deprecated/compat fields (e.g., `max_tokens` -> `max_completion_tokens`). +- Clamp values (e.g., Moonshot temperature <= 1.0). + +### 7.2 Compatibility Report + +``` +pub struct CompatibilityReport { + pub stripped_fields: Vec<&'static str>, + pub transformed_fields: Vec<(&'static str, &'static str)>, + pub warnings: Vec>, +} +``` + +`ApiResponse` includes `meta.compatibility: Option`. + +### 7.3 Response Variations +- Preserve provider extensions via `#[serde(flatten)] extra` on response types. +- Expose raw JSON for clients that need direct access: + `ApiResponse::raw_json()`. + +--- + +## 8) Error Handling and Metadata + +### 8.1 Error Types + +``` +pub enum ClientError { + Http(HttpError), + Api(ApiError), + Deserialize(DeserializeError), + Stream(StreamError), + Capability(CapabilityError), +} +``` + +`ApiError` wraps the server `error` object and includes the HTTP status code. + +### 8.2 Response Metadata + +``` +pub struct ResponseMeta { + pub request_id: Option>, + pub ratelimit: Option, + pub compatibility: Option, + pub latency_ms: Option, +} + +pub struct ApiResponse { + pub data: T, + pub meta: ResponseMeta, +} +``` + +--- + +## 9) Developer Experience Helpers + +- `ChatCompletion::first_text()` returns the first text content (if any). +- `ChatCompletion::tool_calls()` returns tool calls from the first choice. +- `MessageContent::text()` returns `Option<&str>`. +- `ToolCall::arguments_json()` parses JSON arguments to `serde_json::Value`. +- `ChatCompletion::parse_json()` for structured outputs. + +All helpers must avoid panics; return `Result` with detailed error types. + +--- + +## 10) Testing Plan (Minimal) + +- JSON decode for non-streaming response with tools and annotations. +- SSE stream parsing with content + tool call delta assembly. +- Usage-only final chunk when `include_usage` is set. +- Provider capability stripping and warnings. +- Unknown fields preserved via `extra`. + +--- + +## 11) Rust Idioms and Safety Notes + +- Avoid `unwrap()` in production code. +- Use `Arc<[T]>` and `Arc` for immutable shared data. +- Prefer `From/TryFrom` for conversions and `Result` for fallible APIs. +- Avoid wildcard enum imports and `super::` in module paths. +- No global mutable state; configuration is explicit. + +--- + +## 12) Summary of the Design Approach + +This design favors **flexible, forward-compatible types** with an ergonomic +builder for the common case. Provider differences are handled centrally via +capability profiles and parameter policies, making the client useful across +OpenAI-compatible endpoints without forcing users to learn each provider's +quirks. The streaming implementation is resilient, and the DX helpers make +structured output and tool calling pleasant without hiding critical behavior. 
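
As a closing illustration of the helper surface from section 9, the runnable sketch
below pulls the first choice's text out of a response body and decodes it into a
caller-supplied type. The JSON is a hand-written stand-in rather than a captured
response, and `first_text` is written as a free function here instead of the
proposed method.

```
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Weather {
    city: String,
    temperature_c: f64,
}

// Stand-in for `ChatCompletion::first_text()`: the first choice's text content.
fn first_text(response: &serde_json::Value) -> Option<&str> {
    response["choices"].get(0)?["message"]["content"].as_str()
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let body: serde_json::Value = serde_json::from_str(
        r#"{
            "id": "chatcmpl-123",
            "object": "chat.completion",
            "choices": [{
                "index": 0,
                "finish_reason": "stop",
                "message": { "role": "assistant",
                             "content": "{\"city\":\"Oslo\",\"temperature_c\":-3.5}" }
            }]
        }"#,
    )?;

    let text = first_text(&body).ok_or("no text content in first choice")?;
    // `parse_json()` in section 9 would wrap exactly this step.
    let parsed: Weather = serde_json::from_str(text)?;
    println!("{parsed:?}");
    Ok(())
}
```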
From 346fa06c9b8795b57a9c59a9ac63cce389da307c Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 17:45:10 +0530 Subject: [PATCH 04/14] update implementation plan --- .../spec/implementation-final.md | 80 ++++--------- .../rullm-openai/spec/implementation-final.md | 111 ++++-------------- 2 files changed, 46 insertions(+), 145 deletions(-) diff --git a/crates/rullm-anthropic/spec/implementation-final.md b/crates/rullm-anthropic/spec/implementation-final.md index 75f66746..fffc174f 100644 --- a/crates/rullm-anthropic/spec/implementation-final.md +++ b/crates/rullm-anthropic/spec/implementation-final.md @@ -1,11 +1,9 @@ # Anthropic Messages Rust Client - Implementation Design This document proposes an idiomatic Rust client for the Anthropic Messages API. -The Messages API is Anthropic-specific, but some gateways expose Anthropic-compatible -endpoints; the design keeps an optional compat layer for that case. It is based on -`spec/message-api.md`, `spec/implementation.md`, and patterns in rullm-core. The -design emphasizes ergonomic builders, strong typing, streaming helpers, and clean -error handling. +It is based on `spec/message-api.md`, `spec/implementation.md`, and patterns in +rullm-core. The design emphasizes ergonomic builders, strong typing, streaming +helpers, and clean error handling. ## 1) Goals and non-goals @@ -13,8 +11,6 @@ Goals - Feature parity with official Anthropic SDKs for the Messages API. - Excellent developer experience: easy defaults, expressive builders, helpers for common tasks, and easy streaming consumption. -- Optional compat ergonomics for Anthropic-compatible gateways (not a claim that - other providers natively use the Messages API). - Forward compatibility: tolerate unknown enum values and fields. Non-goals (initial release) @@ -34,8 +30,6 @@ crates/rullm-anthropic/src/ stream.rs // SSE parsing + accumulator models.rs // list/get models batches.rs // create/get/list/cancel/delete/results - completions.rs // legacy API - compat.rs // optional compat types + conversions (Anthropic-compatible gateways) transport.rs // HttpTransport trait + reqwest impl lib.rs // re-exports ``` @@ -82,32 +76,11 @@ provider APIs differ): - `extra_query: Vec<(Arc, Arc)>` - `extra_body: serde_json::Map` -This mirrors `extra_headers/extra_query/extra_body` patterns in other SDKs and -makes the client usable with Anthropic-compatible gateways. +This mirrors `extra_headers/extra_query/extra_body` patterns in other SDKs. -## 4) Optional compat types (Anthropic-compatible gateways) +## 4) Messages API surface -The client can expose a minimal common interface for applications that talk to -Anthropic-compatible gateways. This is useful when the same app targets multiple -providers through a gateway that accepts the Anthropic Messages API format. - -Proposed compat types (align with rullm-core): -- `ChatRole` (System/User/Assistant/Tool) -- `ChatMessage { role, content }` -- `ChatRequest { messages, temperature, max_tokens, top_p, stream }` -- `ChatResponse { message, model, usage, finish_reason }` -- `ChatStreamEvent { Token(String), Done, Error(String) }` - -Conversions: -- `impl From for ChatRequest` (best-effort mapping) -- `impl TryFrom for MessagesRequest` (errors if unsupported fields) -- `impl From for ChatResponse` (extract first text block) - -This keeps Anthropic ergonomics while enabling Anthropic-compatible gateway use. 
- -## 5) Messages API surface - -### 5.1 Primary entry points +### 4.1 Primary entry points Expose a sub-client similar to official SDKs: - `Client::messages()` -> `MessagesClient` @@ -116,7 +89,7 @@ Expose a sub-client similar to official SDKs: - `MessagesClient::count_tokens(req, opts)` -> `CountTokensResponse` - `MessagesClient::batches()` -> `BatchesClient` -### 5.2 Builder ergonomics +### 4.2 Builder ergonomics Provide a builder for the request that favors clarity: ``` @@ -133,7 +106,7 @@ Design notes - Accept `system` as `SystemContent` (string or text blocks). - `messages` accept `MessageContent` (string shorthand or blocks). -### 5.3 Type modeling overview +### 4.3 Type modeling overview Request - `MessagesRequest { model, max_tokens, messages, system?, metadata?, stop_sequences?, temperature?, top_p?, top_k?, tools?, tool_choice?, thinking?, service_tier?, stream? }` @@ -145,9 +118,9 @@ Use `serde` tagging: - `#[serde(tag = "type", rename_all = "snake_case")]` for content blocks - `#[serde(untagged)]` for `string | [blocks]` unions -## 6) Content blocks and tools +## 5) Content blocks and tools -### 6.1 ContentBlockParam (input) +### 5.1 ContentBlockParam (input) Support all common and advanced blocks: - `text` - `image` (base64 or url) @@ -157,12 +130,12 @@ Support all common and advanced blocks: - advanced: `tool_use`, `server_tool_use`, `web_search_tool_result`, `thinking`, `redacted_thinking` -### 6.2 ContentBlock (output) +### 5.2 ContentBlock (output) Support output blocks: - `text`, `tool_use`, `thinking`, `redacted_thinking`, `server_tool_use`, `web_search_tool_result` -### 6.3 Tools +### 5.3 Tools Use a union for custom and server tools: - Custom: `{ name, description?, input_schema }` - Server tools: `bash_20250124`, `text_editor_20250124/20250429/20250728`, @@ -172,9 +145,9 @@ Tool choice union: - `auto | any | none | tool(name)` - `disable_parallel_tool_use: bool` -## 7) Streaming design +## 6) Streaming design -### 7.1 Raw SSE +### 6.1 Raw SSE Streaming uses SSE with event `type`: - `message_start` - `content_block_start` @@ -189,7 +162,7 @@ Implement a tolerant SSE parser: - stop on stream close - surface JSON parse errors as `AnthropicError::Serialization` -### 7.2 MessageStream helper +### 6.2 MessageStream helper Provide a higher-level stream wrapper that merges deltas into a full message. Proposed API: @@ -202,7 +175,7 @@ Use a `MessageAccumulator` internally: - merge tool input JSON fragments - update usage/stop_reason -### 7.3 Tool input JSON deltas +### 6.3 Tool input JSON deltas Maintain both: - `partial_json: String` - `parsed: Option` (best-effort) @@ -215,7 +188,7 @@ Parsing strategy: This avoids a hard dependency on a partial JSON parser while still offering useful intermediate values. -## 8) Timeout policy +## 7) Timeout policy The official SDKs enforce a non-streaming timeout policy. Mirror it: @@ -230,7 +203,7 @@ Expose this as: Allow opt-out via `RequestOptions::allow_long_non_streaming`. -## 9) Error handling +## 8) Error handling Use a structured error enum and preserve request_id: @@ -249,7 +222,7 @@ enum AnthropicError { Always surface `request-id` header in errors and responses. -## 10) Rust ergonomics and idioms +## 9) Rust ergonomics and idioms - Avoid panics in library code. No `unwrap`/`expect` in production paths. - Use `Arc` and `Arc<[T]>` for immutable data cloned often. @@ -257,7 +230,7 @@ Always surface `request-id` header in errors and responses. - Provide `Option<&T>` accessors instead of `&Option`. 
- Use `&str`/`&[T]` in accessors instead of `&String`/`&Vec`. -## 11) Example usage (final API shape) +## 10) Example usage (final API shape) Non-streaming: ``` @@ -283,16 +256,3 @@ while let Some(chunk) = s.next().await { } let final_msg = stream.final_message().await?; ``` - -## 12) Implementation notes for Anthropic-compatible gateways - -To keep this client usable with Anthropic-compatible gateways: -- Keep `RequestOptions`, `ClientBuilder`, and `transport::HttpTransport` in a - familiar shape across rullm crates. -- Provide `compat` conversions (ChatRequest/ChatResponse) for apps that target - a gateway exposing the Anthropic Messages API. -- Keep the `MessageStream` API consistent (text_stream + final_message). - -This yields a cohesive developer experience across Anthropic and any gateway -that implements the Anthropic Messages API while still exposing full Anthropic -functionality. diff --git a/crates/rullm-openai/spec/implementation-final.md b/crates/rullm-openai/spec/implementation-final.md index f2b1e9d7..a183db1a 100644 --- a/crates/rullm-openai/spec/implementation-final.md +++ b/crates/rullm-openai/spec/implementation-final.md @@ -1,9 +1,10 @@ -# Idiomatic Rust Client Design for OpenAI Chat Completions (Multi-Provider) +# Idiomatic Rust Client Design for OpenAI Chat Completions (Core API) This document defines a Rust client design for the OpenAI **Chat Completions** -API with first-class support for OpenAI-compatible providers (OpenRouter, -Gemini, Groq, xAI, MoonshotAI). It prioritizes developer experience, forward -compatibility, and graceful handling of provider differences. +API with a provider-agnostic core that can target any OpenAI-compatible endpoint. +It prioritizes developer experience, forward compatibility, and flexible +authentication/header handling. Provider-specific capability gating is left to +higher-level crates. This is a design spec only. It references the request/response shapes and compatibility notes in `spec/chat-completion*.md`. @@ -17,7 +18,8 @@ compatibility notes in `spec/chat-completion*.md`. - Full coverage of Chat Completions parameters and response shapes. - Streaming support with correct SSE parsing and delta accumulation. - Forward-compatible JSON decoding (unknown fields and enum values tolerated). -- Provider-aware parameter handling with graceful degradation. +- Flexible authentication and extra headers. +- Provider-agnostic core (no built-in provider profiles or capability gating). - Clean integration with Rust async ecosystems. **Non-Goals** @@ -33,11 +35,10 @@ compatibility notes in `spec/chat-completion*.md`. 
crates/rullm-openai/ src/ client.rs // ChatCompletionsClient + HTTP wiring - config.rs // ClientConfig, ProviderProfile, CapabilityResolver + config.rs // ClientConfig, auth + headers types.rs // Request/response structs, message/content/tool types streaming.rs // SSE decoder + ChatCompletionStream + accumulator error.rs // Error types + retry classification - compat.rs // Parameter policy + capability rules util.rs // Small helpers (headers, url, serialization) ``` @@ -65,14 +66,6 @@ impl ChatCompletionsClient { req: ChatCompletionRequest, ) -> Result; - // Stored completions (OpenAI only; gated by capability profile) - pub async fn retrieve(&self, id: &str) -> Result, ClientError>; - pub async fn list(&self, params: ListParams) -> Result, ClientError>; - pub async fn update(&self, id: &str, params: UpdateParams) -> Result, ClientError>; - pub async fn delete(&self, id: &str) -> Result, ClientError>; - pub async fn list_messages(&self, id: &str, params: ListParams) - -> Result, ClientError>; - // DX convenience pub fn chat(&self) -> ChatRequestBuilder; } @@ -104,69 +97,35 @@ Design notes: --- -## 4) Configuration and Provider Profiles +## 4) Configuration and Authentication ### 4.1 ClientConfig ``` pub struct ClientConfig { - pub api_key: Arc, pub base_url: Url, + pub auth: AuthConfig, pub default_headers: HeaderMap, pub timeout: Duration, - pub provider: ProviderProfile, - pub parameter_policy: ParameterPolicy, - pub capability_resolver: Arc, } ``` -### 4.2 ProviderProfile (Built-in) - -`ProviderProfile` supplies defaults and capability constraints. +### 4.2 AuthConfig ``` -pub enum ProviderKind { OpenAI, OpenRouter, Gemini, Groq, Xai, Moonshot, Custom } - -pub struct ProviderProfile { - pub kind: ProviderKind, - pub base_url: Url, - pub supports_stored_completions: bool, - pub capabilities: Capabilities, - pub model_rules: Vec, +pub enum AuthConfig { + None, + BearerToken(Arc), + Header { name: HeaderName, value: HeaderValue }, + QueryParam { name: Arc, value: Arc }, } ``` -**Built-in profiles** include known constraints (from -`chat-completion-difference.md`): -- Groq: `n=1`, no logprobs, JSON mode cannot stream. -- Gemini: no logprobs, stricter schema validation. -- xAI: reasoning models disallow penalties and stop; no JSON streaming. -- Moonshot: temperature max 1.0, image URLs base64 only. -- OpenRouter: accepts most params, adds comment SSE lines. - -### 4.3 CapabilityResolver - -``` -pub trait CapabilityResolver: Send + Sync { - fn capabilities_for(&self, model: &ModelId) -> Capabilities; -} -``` - -`Capabilities` is a simple struct with booleans + numeric limits, e.g. -`supports_logprobs`, `supports_streaming_json`, `temperature_max`, `supports_n`. - -### 4.4 ParameterPolicy - -``` -pub enum ParameterPolicy { - StrictError, // reject unsupported parameters - WarnAndStrip, // drop unsupported parameters and emit warnings - PassThrough, // send as-is (let server reject) -} -``` - -The client emits a `CompatibilityReport` (warnings, applied transforms) via -`ApiResponse::meta` so users can log or test for mismatches. +Notes: +- Use `default_headers` for extra headers (e.g., `OpenAI-Organization`, + `OpenAI-Project`, OpenRouter `HTTP-Referer`/`X-Title`, or custom auth headers). +- This core client does not hard-code provider identities or capability rules. + Higher-level crates can layer provider-specific behavior on top. 
--- @@ -404,28 +363,12 @@ pub struct ChatCompletionStream { --- -## 7) Provider Compatibility Strategy - -### 7.1 Capability-aware Request Shaping - -Before sending, apply per-provider and per-model rules: -- Strip or reject unsupported fields (depending on `ParameterPolicy`). -- Transform deprecated/compat fields (e.g., `max_tokens` -> `max_completion_tokens`). -- Clamp values (e.g., Moonshot temperature <= 1.0). - -### 7.2 Compatibility Report - -``` -pub struct CompatibilityReport { - pub stripped_fields: Vec<&'static str>, - pub transformed_fields: Vec<(&'static str, &'static str)>, - pub warnings: Vec>, -} -``` - -`ApiResponse` includes `meta.compatibility: Option`. +## 7) Provider Extensions and Pass-through -### 7.3 Response Variations +- The core client sends requests as provided; it does not strip, clamp, or + transform parameters for specific providers. +- Provider-specific constraints should be handled by higher-level crates or + application code. - Preserve provider extensions via `#[serde(flatten)] extra` on response types. - Expose raw JSON for clients that need direct access: `ApiResponse::raw_json()`. @@ -442,7 +385,6 @@ pub enum ClientError { Api(ApiError), Deserialize(DeserializeError), Stream(StreamError), - Capability(CapabilityError), } ``` @@ -454,7 +396,6 @@ pub enum ClientError { pub struct ResponseMeta { pub request_id: Option>, pub ratelimit: Option, - pub compatibility: Option, pub latency_ms: Option, } From 4299b39e6064f964b7c0f8c6e5c2541d2d7a3406 Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 17:46:03 +0530 Subject: [PATCH 05/14] intial rullm-openai implementation --- Cargo.lock | 27 +- Cargo.toml | 2 +- crates/rullm-openai/Cargo.toml | 21 + crates/rullm-openai/src/client.rs | 373 +++++++++ crates/rullm-openai/src/config.rs | 293 +++++++ crates/rullm-openai/src/error.rs | 140 ++++ crates/rullm-openai/src/lib.rs | 108 ++- crates/rullm-openai/src/streaming.rs | 452 +++++++++++ crates/rullm-openai/src/types.rs | 1070 ++++++++++++++++++++++++++ 9 files changed, 2457 insertions(+), 29 deletions(-) create mode 100644 crates/rullm-openai/src/client.rs create mode 100644 crates/rullm-openai/src/config.rs create mode 100644 crates/rullm-openai/src/error.rs create mode 100644 crates/rullm-openai/src/streaming.rs create mode 100644 crates/rullm-openai/src/types.rs diff --git a/Cargo.lock b/Cargo.lock index e6979e88..9e951be3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1726,10 +1726,33 @@ dependencies = [ [[package]] name = "rullm-anthropic" version = "0.1.0" +dependencies = [ + "async-stream", + "bytes", + "futures", + "reqwest 0.11.27", + "serde", + "serde_json", + "thiserror 1.0.69", + "tokio", + "tokio-test", +] [[package]] name = "rullm-chat-completion" version = "0.1.0" +dependencies = [ + "async-stream", + "bytes", + "futures", + "reqwest 0.11.27", + "serde", + "serde_json", + "thiserror 1.0.69", + "tokio", + "tokio-test", + "tracing", +] [[package]] name = "rullm-cli" @@ -1791,10 +1814,6 @@ dependencies = [ "tracing-subscriber", ] -[[package]] -name = "rullm-gemini" -version = "0.1.0" - [[package]] name = "rustc-demangle" version = "0.1.26" diff --git a/Cargo.toml b/Cargo.toml index adb7bacc..e12ea588 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ rand = "0.8" reqwest = { version = "0.11", features = ["json", "stream"] } bytes = "1.0" log = "0.4" -serde = { version = "1.0", features = ["derive"] } +serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" thiserror = "1.0" anyhow = "1.0" diff --git 
a/crates/rullm-openai/Cargo.toml b/crates/rullm-openai/Cargo.toml index ae54ba3e..a7052f9c 100644 --- a/crates/rullm-openai/Cargo.toml +++ b/crates/rullm-openai/Cargo.toml @@ -5,3 +5,24 @@ edition.workspace = true rust-version.workspace = true [dependencies] +# HTTP client +reqwest = { workspace = true } +bytes = { workspace = true } + +# Async runtime +tokio = { workspace = true } +futures = { workspace = true } +async-stream = { workspace = true } + +# Serialization +serde = { workspace = true } +serde_json = { workspace = true } + +# Error handling +thiserror = { workspace = true } + +# Logging +tracing = { workspace = true } + +[dev-dependencies] +tokio-test = { workspace = true } diff --git a/crates/rullm-openai/src/client.rs b/crates/rullm-openai/src/client.rs new file mode 100644 index 00000000..3f5afc5b --- /dev/null +++ b/crates/rullm-openai/src/client.rs @@ -0,0 +1,373 @@ +use std::sync::Arc; + +use reqwest::header::{AUTHORIZATION, CONTENT_TYPE, HeaderValue}; + +use crate::config::{ApiResponse, AuthConfig, ClientConfig, ResponseMeta}; +use crate::error::{ApiError, ApiErrorBody, ClientError, DeserializeError, HttpError}; +use crate::streaming::ChatCompletionStream; +use crate::types::{ + ChatCompletion, ChatCompletionRequest, Message, ModelId, ResponseFormat, Stop, StreamOptions, + ToolChoice, ToolDefinition, +}; + +/// The main client for the Chat Completions API. +#[derive(Clone)] +pub struct ChatCompletionsClient { + http: reqwest::Client, + config: Arc, +} + +impl ChatCompletionsClient { + /// Create a new client with the given configuration. + pub fn new(config: ClientConfig) -> Result { + let mut builder = reqwest::Client::builder().timeout(config.timeout); + + // Add default headers + let mut headers = config.default_headers.clone(); + headers.insert(CONTENT_TYPE, HeaderValue::from_static("application/json")); + + builder = builder.default_headers(headers); + + let http = builder.build().map_err(|e| HttpError { + message: format!("Failed to build HTTP client: {}", e), + source: Some(e), + })?; + + Ok(Self { + http, + config: Arc::new(config), + }) + } + + /// Create a chat completion (non-streaming). + pub async fn create( + &self, + req: ChatCompletionRequest, + ) -> Result, ClientError> { + let url = format!("{}/chat/completions", self.config.base_url); + + let mut request = self.http.post(&url); + request = self.apply_auth(request); + + let body = serde_json::to_string(&req)?; + request = request.body(body); + + let response = request.send().await?; + let status = response.status().as_u16(); + let headers = response.headers().clone(); + let meta = ResponseMeta::from_headers(&headers); + + let raw_body = response.text().await?; + + if status >= 400 { + return Err(self.parse_error_response(status, &raw_body)); + } + + let completion: ChatCompletion = + serde_json::from_str(&raw_body).map_err(|e| DeserializeError { + message: format!("Failed to parse response: {}", e), + source: Some(e), + raw_body: Some(Arc::from(raw_body.as_str())), + })?; + + Ok(ApiResponse::with_raw_json( + completion, + meta, + Arc::from(raw_body), + )) + } + + /// Create a streaming chat completion. 
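+    ///
+    /// Minimal consumption sketch (illustrative; the model name and prompt are
+    /// placeholders, crate path follows the crate-level docs, error handling
+    /// is elided):
+    ///
+    /// ```rust,no_run
+    /// # use futures::StreamExt;
+    /// # use rullm_chat_completion::{ChatCompletionRequest, ChatCompletionsClient, ClientError, Message};
+    /// # async fn demo(client: ChatCompletionsClient) -> Result<(), ClientError> {
+    /// let req = ChatCompletionRequest::new("gpt-4o", vec![Message::user("Say hi")]);
+    /// let mut stream = client.stream(req).await?;
+    /// while let Some(chunk) = stream.next().await {
+    ///     let chunk = chunk?;
+    ///     // Each chunk carries an incremental delta for one or more choices.
+    ///     if let Some(text) = chunk.choices.first().and_then(|c| c.delta.content.as_ref()) {
+    ///         print!("{text}");
+    ///     }
+    /// }
+    /// # Ok(())
+    /// # }
+    /// ```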
+ pub async fn stream( + &self, + mut req: ChatCompletionRequest, + ) -> Result { + // Ensure streaming is enabled + req.stream = Some(true); + + let url = format!("{}/chat/completions", self.config.base_url); + + let mut request = self.http.post(&url); + request = self.apply_auth(request); + + let body = serde_json::to_string(&req)?; + request = request.body(body); + + let response = request.send().await?; + let status = response.status().as_u16(); + + if status >= 400 { + let raw_body = response.text().await?; + return Err(self.parse_error_response(status, &raw_body)); + } + + Ok(ChatCompletionStream::new(response.bytes_stream())) + } + + /// Get a convenience builder for chat requests. + pub fn chat(&self) -> ChatRequestBuilder { + ChatRequestBuilder::new(self.clone()) + } + + /// Apply authentication to a request. + fn apply_auth(&self, mut request: reqwest::RequestBuilder) -> reqwest::RequestBuilder { + match &self.config.auth { + AuthConfig::None => {} + AuthConfig::BearerToken(token) => { + let auth_value = format!("Bearer {}", token); + if let Ok(header) = HeaderValue::from_str(&auth_value) { + request = request.header(AUTHORIZATION, header); + } + } + AuthConfig::Header { name, value } => { + request = request.header(name.clone(), value.clone()); + } + AuthConfig::QueryParam { name, value } => { + request = request.query(&[(name.as_ref(), value.as_ref())]); + } + } + request + } + + /// Parse an error response. + fn parse_error_response(&self, status: u16, raw_body: &str) -> ClientError { + // Try to parse as an API error + if let Ok(wrapper) = serde_json::from_str::(raw_body) { + return ClientError::Api(ApiError { + status, + error: wrapper.error, + raw_body: Some(Arc::from(raw_body)), + }); + } + + // Fallback to a generic error + ClientError::Api(ApiError { + status, + error: ApiErrorBody { + message: Arc::from(raw_body), + error_type: None, + param: None, + code: None, + }, + raw_body: Some(Arc::from(raw_body)), + }) + } +} + +impl std::fmt::Debug for ChatCompletionsClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ChatCompletionsClient") + .field("base_url", &self.config.base_url) + .finish() + } +} + +/// Wrapper for parsing API error responses. +#[derive(serde::Deserialize)] +struct ErrorWrapper { + error: ApiErrorBody, +} + +/// Builder for creating chat completion requests with a fluent API. +pub struct ChatRequestBuilder { + client: ChatCompletionsClient, + model: Option, + messages: Vec, + temperature: Option, + top_p: Option, + n: Option, + stop: Option, + max_completion_tokens: Option, + tools: Option>, + tool_choice: Option, + response_format: Option, + stream_options: Option, + seed: Option, +} + +impl ChatRequestBuilder { + /// Create a new builder with the given client. + fn new(client: ChatCompletionsClient) -> Self { + Self { + client, + model: None, + messages: Vec::new(), + temperature: None, + top_p: None, + n: None, + stop: None, + max_completion_tokens: None, + tools: None, + tool_choice: None, + response_format: None, + stream_options: None, + seed: None, + } + } + + /// Set the model to use. + pub fn model(mut self, model: impl Into) -> Self { + self.model = Some(model.into()); + self + } + + /// Add a system message. + pub fn system(mut self, content: impl Into>) -> Self { + self.messages.push(Message::system(content)); + self + } + + /// Add a developer message. 
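+    ///
+    /// Note: on newer reasoning models the `developer` role supersedes
+    /// `system` instructions; use whichever role the target model expects.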
+ pub fn developer(mut self, content: impl Into>) -> Self { + self.messages.push(Message::developer(content)); + self + } + + /// Add a user message. + pub fn user(mut self, content: impl Into>) -> Self { + self.messages.push(Message::user(content)); + self + } + + /// Add an assistant message. + pub fn assistant(mut self, content: impl Into>) -> Self { + self.messages.push(Message::assistant(content)); + self + } + + /// Add a custom message. + pub fn message(mut self, message: Message) -> Self { + self.messages.push(message); + self + } + + /// Add multiple messages. + pub fn messages(mut self, messages: impl IntoIterator) -> Self { + self.messages.extend(messages); + self + } + + /// Set the temperature. + pub fn temperature(mut self, temperature: f32) -> Self { + self.temperature = Some(temperature); + self + } + + /// Set top-p sampling. + pub fn top_p(mut self, top_p: f32) -> Self { + self.top_p = Some(top_p); + self + } + + /// Set the number of completions to generate. + pub fn n(mut self, n: u32) -> Self { + self.n = Some(n); + self + } + + /// Set stop sequences. + pub fn stop(mut self, stop: Stop) -> Self { + self.stop = Some(stop); + self + } + + /// Set the maximum number of completion tokens. + pub fn max_completion_tokens(mut self, max: u32) -> Self { + self.max_completion_tokens = Some(max); + self + } + + /// Add a tool definition. + pub fn tool(mut self, tool: ToolDefinition) -> Self { + self.tools.get_or_insert_with(Vec::new).push(tool); + self + } + + /// Set tool definitions. + pub fn tools(mut self, tools: Vec) -> Self { + self.tools = Some(tools); + self + } + + /// Set the tool choice. + pub fn tool_choice(mut self, choice: ToolChoice) -> Self { + self.tool_choice = Some(choice); + self + } + + /// Set the response format. + pub fn response_format(mut self, format: ResponseFormat) -> Self { + self.response_format = Some(format); + self + } + + /// Enable usage reporting in stream. + pub fn include_usage(mut self) -> Self { + self.stream_options = Some(StreamOptions { + include_usage: Some(true), + }); + self + } + + /// Set a seed for deterministic sampling. + pub fn seed(mut self, seed: u64) -> Self { + self.seed = Some(seed); + self + } + + /// Build the request. + fn build_request(&self) -> Result { + let model = self.model.clone().ok_or_else(|| { + ClientError::Http(HttpError { + message: "Model is required".to_string(), + source: None, + }) + })?; + + Ok(ChatCompletionRequest { + model, + messages: self.messages.clone().into(), + temperature: self.temperature, + top_p: self.top_p, + n: self.n, + stop: self.stop.clone(), + presence_penalty: None, + frequency_penalty: None, + max_completion_tokens: self.max_completion_tokens, + max_tokens: None, + logprobs: None, + top_logprobs: None, + logit_bias: None, + tools: self.tools.as_ref().map(|t| t.clone().into()), + tool_choice: self.tool_choice.clone(), + parallel_tool_calls: None, + functions: None, + response_format: self.response_format.clone(), + modalities: None, + audio: None, + stream: None, + stream_options: self.stream_options.clone(), + prediction: None, + web_search_options: None, + reasoning_effort: None, + service_tier: None, + store: None, + metadata: None, + seed: self.seed, + user: None, + extra_body: serde_json::Map::new(), + }) + } + + /// Send the request and get a non-streaming response. + pub async fn send(self) -> Result, ClientError> { + let req = self.build_request()?; + self.client.create(req).await + } + + /// Send the request and get a streaming response. 
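+    ///
+    /// Sketch: collect the stream into a final message with the accumulator
+    /// (illustrative; model and prompt are placeholders):
+    ///
+    /// ```rust,no_run
+    /// # use rullm_chat_completion::{ChatCompletionsClient, ClientError};
+    /// # async fn demo(client: ChatCompletionsClient) -> Result<(), ClientError> {
+    /// let stream = client.chat().model("gpt-4o").user("Tell me a joke").stream().await?;
+    /// let completion = stream.accumulator().collect().await?;
+    /// println!("{}", completion.first_text().unwrap_or(""));
+    /// # Ok(())
+    /// # }
+    /// ```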
+ pub async fn stream(self) -> Result { + let req = self.build_request()?; + self.client.stream(req).await + } +} diff --git a/crates/rullm-openai/src/config.rs b/crates/rullm-openai/src/config.rs new file mode 100644 index 00000000..d6c4fd9e --- /dev/null +++ b/crates/rullm-openai/src/config.rs @@ -0,0 +1,293 @@ +use std::sync::Arc; +use std::time::Duration; + +use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; + +use crate::error::ClientError; + +/// Configuration for the Chat Completions client. +#[derive(Debug, Clone)] +pub struct ClientConfig { + /// Base URL for the API (e.g., "https://api.openai.com/v1") + pub base_url: Arc, + /// Authentication configuration + pub auth: AuthConfig, + /// Default headers to include in all requests + pub default_headers: HeaderMap, + /// Request timeout + pub timeout: Duration, +} + +impl Default for ClientConfig { + fn default() -> Self { + Self { + base_url: Arc::from("https://api.openai.com/v1"), + auth: AuthConfig::None, + default_headers: HeaderMap::new(), + timeout: Duration::from_secs(60), + } + } +} + +impl ClientConfig { + /// Create a new builder for ClientConfig. + pub fn builder() -> ClientConfigBuilder { + ClientConfigBuilder::new() + } +} + +/// Builder for creating a [`ClientConfig`]. +#[derive(Debug, Default)] +pub struct ClientConfigBuilder { + base_url: Option>, + auth: Option, + default_headers: HeaderMap, + timeout: Option, +} + +impl ClientConfigBuilder { + /// Create a new builder with default values. + pub fn new() -> Self { + Self::default() + } + + /// Set the base URL for the API. + pub fn base_url(mut self, url: impl Into>) -> Self { + self.base_url = Some(url.into()); + self + } + + /// Set the authentication configuration. + pub fn auth(mut self, auth: AuthConfig) -> Self { + self.auth = Some(auth); + self + } + + /// Set a bearer token for authentication. + pub fn bearer_token(mut self, token: impl Into>) -> Self { + self.auth = Some(AuthConfig::BearerToken(token.into())); + self + } + + /// Add a default header that will be included in all requests. + pub fn header(mut self, name: HeaderName, value: HeaderValue) -> Self { + self.default_headers.insert(name, value); + self + } + + /// Set the request timeout. + pub fn timeout(mut self, timeout: Duration) -> Self { + self.timeout = Some(timeout); + self + } + + /// Build the [`ClientConfig`]. + pub fn build(self) -> Result { + Ok(ClientConfig { + base_url: self + .base_url + .unwrap_or_else(|| Arc::from("https://api.openai.com/v1")), + auth: self.auth.unwrap_or(AuthConfig::None), + default_headers: self.default_headers, + timeout: self.timeout.unwrap_or(Duration::from_secs(60)), + }) + } +} + +/// Authentication configuration for the client. +#[derive(Debug, Clone)] +pub enum AuthConfig { + /// No authentication + None, + /// Bearer token authentication (Authorization: Bearer ) + BearerToken(Arc), + /// Custom header authentication + Header { + name: HeaderName, + value: HeaderValue, + }, + /// Query parameter authentication + QueryParam { name: Arc, value: Arc }, +} + +impl AuthConfig { + /// Create a bearer token auth config. + pub fn bearer(token: impl Into>) -> Self { + Self::BearerToken(token.into()) + } + + /// Create a custom header auth config. + pub fn header(name: HeaderName, value: HeaderValue) -> Self { + Self::Header { name, value } + } + + /// Create a query parameter auth config. 
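+    ///
+    /// Some OpenAI-compatible gateways expect the credential as a query
+    /// parameter rather than a header. Illustrative sketch (parameter name and
+    /// value are placeholders):
+    ///
+    /// ```rust
+    /// use rullm_chat_completion::AuthConfig;
+    ///
+    /// let auth = AuthConfig::query_param("api-key", "secret-value");
+    /// ```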
+ pub fn query_param(name: impl Into>, value: impl Into>) -> Self { + Self::QueryParam { + name: name.into(), + value: value.into(), + } + } +} + +/// Rate limit information from response headers. +#[derive(Debug, Clone, Default)] +pub struct RateLimitInfo { + /// Maximum requests allowed in the window + pub limit_requests: Option, + /// Maximum tokens allowed in the window + pub limit_tokens: Option, + /// Remaining requests in the current window + pub remaining_requests: Option, + /// Remaining tokens in the current window + pub remaining_tokens: Option, + /// Time until the request limit resets (in seconds) + pub reset_requests_secs: Option, + /// Time until the token limit resets (in seconds) + pub reset_tokens_secs: Option, +} + +impl RateLimitInfo { + /// Parse rate limit info from response headers. + pub fn from_headers(headers: &HeaderMap) -> Self { + Self { + limit_requests: parse_header_u32(headers, "x-ratelimit-limit-requests"), + limit_tokens: parse_header_u32(headers, "x-ratelimit-limit-tokens"), + remaining_requests: parse_header_u32(headers, "x-ratelimit-remaining-requests"), + remaining_tokens: parse_header_u32(headers, "x-ratelimit-remaining-tokens"), + reset_requests_secs: parse_reset_header(headers, "x-ratelimit-reset-requests"), + reset_tokens_secs: parse_reset_header(headers, "x-ratelimit-reset-tokens"), + } + } +} + +fn parse_header_u32(headers: &HeaderMap, name: &str) -> Option { + headers + .get(name) + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.parse().ok()) +} + +fn parse_reset_header(headers: &HeaderMap, name: &str) -> Option { + headers + .get(name) + .and_then(|v| v.to_str().ok()) + .and_then(parse_reset_duration) +} + +/// Parse a reset duration string like "1s", "1m30s", "500ms", etc. +fn parse_reset_duration(s: &str) -> Option { + let s = s.trim(); + + // Try parsing as a simple float (seconds) + if let Ok(secs) = s.parse::() { + return Some(secs); + } + + // Parse duration format like "1m30s", "500ms", "2s" + let mut total_secs = 0.0; + let mut current_num = String::new(); + + for c in s.chars() { + if c.is_ascii_digit() || c == '.' { + current_num.push(c); + } else { + if current_num.is_empty() { + continue; + } + let num: f64 = current_num.parse().ok()?; + current_num.clear(); + + match c { + 'h' => total_secs += num * 3600.0, + 'm' if s.contains("ms") => {} // handled separately + 'm' => total_secs += num * 60.0, + 's' => total_secs += num, + _ => {} + } + } + } + + // Handle milliseconds specially + if s.ends_with("ms") { + if let Some(ms_str) = s.strip_suffix("ms") { + if let Ok(ms) = ms_str.trim().parse::() { + return Some(ms / 1000.0); + } + } + } + + if total_secs > 0.0 { + Some(total_secs) + } else { + None + } +} + +/// Response metadata extracted from HTTP headers. +#[derive(Debug, Clone, Default)] +pub struct ResponseMeta { + /// Request ID from the x-request-id header + pub request_id: Option>, + /// Rate limit information + pub ratelimit: Option, + /// Processing time in milliseconds + pub latency_ms: Option, +} + +impl ResponseMeta { + /// Parse metadata from response headers. + pub fn from_headers(headers: &HeaderMap) -> Self { + Self { + request_id: headers + .get("x-request-id") + .and_then(|v| v.to_str().ok()) + .map(Arc::from), + ratelimit: Some(RateLimitInfo::from_headers(headers)), + latency_ms: headers + .get("openai-processing-ms") + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.parse().ok()), + } + } +} + +/// Wrapper for API responses that includes metadata. 
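+///
+/// Sketch of typical access (illustrative; assumes a completed chat request):
+///
+/// ```rust
+/// # use rullm_chat_completion::{ApiResponse, ChatCompletion};
+/// # fn inspect(response: &ApiResponse<ChatCompletion>) {
+/// if let Some(id) = &response.meta.request_id {
+///     println!("request id: {id}");
+/// }
+/// println!("{}", response.data.first_text().unwrap_or(""));
+/// # }
+/// ```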
+#[derive(Debug, Clone)]
+pub struct ApiResponse<T> {
+    /// The response data
+    pub data: T,
+    /// Response metadata
+    pub meta: ResponseMeta,
+    /// Raw JSON body for debugging
+    pub raw_json: Option<Arc<str>>,
+}
+
+impl<T> ApiResponse<T> {
+    /// Create a new API response.
+    pub fn new(data: T, meta: ResponseMeta) -> Self {
+        Self {
+            data,
+            meta,
+            raw_json: None,
+        }
+    }
+
+    /// Create an API response with raw JSON preserved.
+    pub fn with_raw_json(data: T, meta: ResponseMeta, raw_json: Arc<str>) -> Self {
+        Self {
+            data,
+            meta,
+            raw_json: Some(raw_json),
+        }
+    }
+
+    /// Map the data to a different type.
+    pub fn map<U>(self, f: impl FnOnce(T) -> U) -> ApiResponse<U> {
+        ApiResponse {
+            data: f(self.data),
+            meta: self.meta,
+            raw_json: self.raw_json,
+        }
+    }
+}
diff --git a/crates/rullm-openai/src/error.rs b/crates/rullm-openai/src/error.rs
new file mode 100644
index 00000000..d9979a8c
--- /dev/null
+++ b/crates/rullm-openai/src/error.rs
@@ -0,0 +1,140 @@
+use std::sync::Arc;
+
+use serde::{Deserialize, Serialize};
+
+/// The primary error type for the Chat Completions client.
+#[derive(Debug, thiserror::Error)]
+pub enum ClientError {
+    /// HTTP transport error (network, DNS, TLS, etc.)
+    #[error("HTTP error: {0}")]
+    Http(#[from] HttpError),
+
+    /// API returned an error response
+    #[error("API error: {0}")]
+    Api(#[from] ApiError),
+
+    /// Failed to deserialize the response
+    #[error("Deserialize error: {0}")]
+    Deserialize(#[from] DeserializeError),
+
+    /// Stream-specific errors
+    #[error("Stream error: {0}")]
+    Stream(#[from] StreamError),
+}
+
+/// HTTP transport error wrapping reqwest errors.
+#[derive(Debug, thiserror::Error)]
+#[error("{message}")]
+pub struct HttpError {
+    pub message: String,
+    pub source: Option<reqwest::Error>,
+}
+
+impl From<reqwest::Error> for HttpError {
+    fn from(err: reqwest::Error) -> Self {
+        Self {
+            message: err.to_string(),
+            source: Some(err),
+        }
+    }
+}
+
+impl From<reqwest::Error> for ClientError {
+    fn from(err: reqwest::Error) -> Self {
+        ClientError::Http(HttpError::from(err))
+    }
+}
+
+/// API error returned by the server.
+#[derive(Debug, thiserror::Error)]
+#[error("{error}")]
+pub struct ApiError {
+    /// HTTP status code
+    pub status: u16,
+    /// The error object from the API
+    pub error: ApiErrorBody,
+    /// Raw response body for debugging
+    pub raw_body: Option<Arc<str>>,
+}
+
+impl ApiError {
+    /// Returns true if this is a rate limit error (429).
+    pub fn is_rate_limit(&self) -> bool {
+        self.status == 429
+    }
+
+    /// Returns true if this is a server error (5xx).
+    pub fn is_server_error(&self) -> bool {
+        (500..600).contains(&self.status)
+    }
+
+    /// Returns true if this error is retryable.
+    pub fn is_retryable(&self) -> bool {
+        self.is_rate_limit() || self.is_server_error()
+    }
+}
+
+/// The error body returned by the OpenAI API.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ApiErrorBody {
+    pub message: Arc<str>,
+    #[serde(rename = "type")]
+    pub error_type: Option<Arc<str>>,
+    pub param: Option<Arc<str>>,
+    pub code: Option<Arc<str>>,
+}
+
+impl std::fmt::Display for ApiErrorBody {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)?;
+        if let Some(ref code) = self.code {
+            write!(f, " (code: {})", code)?;
+        }
+        Ok(())
+    }
+}
+
+/// Deserialization error.
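+///
+/// When a response body cannot be parsed, the raw body is preserved so callers
+/// can log or inspect it. Illustrative sketch:
+///
+/// ```rust
+/// use rullm_chat_completion::ClientError;
+///
+/// fn describe(err: &ClientError) {
+///     if let ClientError::Deserialize(e) = err {
+///         eprintln!("bad payload: {} (raw: {:?})", e.message, e.raw_body);
+///     }
+/// }
+/// ```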
+#[derive(Debug, thiserror::Error)] +#[error("Failed to deserialize response: {message}")] +pub struct DeserializeError { + pub message: String, + pub source: Option, + pub raw_body: Option>, +} + +impl From for DeserializeError { + fn from(err: serde_json::Error) -> Self { + Self { + message: err.to_string(), + source: Some(err), + raw_body: None, + } + } +} + +impl From for ClientError { + fn from(err: serde_json::Error) -> Self { + ClientError::Deserialize(DeserializeError::from(err)) + } +} + +/// Stream-specific errors. +#[derive(Debug, thiserror::Error)] +pub enum StreamError { + /// Invalid SSE line format + #[error("Invalid SSE line: {0}")] + InvalidSseLine(Arc), + + /// Error parsing SSE data as JSON + #[error("Failed to parse SSE data: {0}")] + ParseError(String), + + /// API error embedded in stream + #[error("API error in stream: {0}")] + ApiError(#[from] ApiError), + + /// Stream was closed unexpectedly + #[error("Stream closed unexpectedly")] + UnexpectedClose, +} diff --git a/crates/rullm-openai/src/lib.rs b/crates/rullm-openai/src/lib.rs index f1c3ba20..39db8a79 100644 --- a/crates/rullm-openai/src/lib.rs +++ b/crates/rullm-openai/src/lib.rs @@ -1,26 +1,86 @@ -/// Different types of roles a message can have. -// see if it makes sense to add a Other(String) role here. -// incase some providers have a unique role. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum MessageRole { - User, - Assistant, - Tool, - System, -} +//! # rullm-chat-completion +//! +//! An idiomatic Rust client for the OpenAI Chat Completions API. +//! +//! This library provides a provider-agnostic client for any OpenAI-compatible +//! chat completions endpoint, with full support for streaming, tools, and +//! structured outputs. +//! +//! ## Quick Start +//! +//! ```rust,no_run +//! use rullm_chat_completion::{ChatCompletionsClient, ClientConfig, AuthConfig}; +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let config = ClientConfig::builder() +//! .bearer_token("your-api-key") +//! .build()?; +//! +//! let client = ChatCompletionsClient::new(config)?; +//! +//! let response = client.chat() +//! .model("gpt-4o") +//! .system("You are a helpful assistant.") +//! .user("Hello!") +//! .send() +//! .await?; +//! +//! println!("{}", response.data.first_text().unwrap_or("No response")); +//! Ok(()) +//! } +//! ``` +//! +//! ## Streaming +//! +//! ```rust,no_run +//! use futures::StreamExt; +//! use rullm_chat_completion::{ChatCompletionsClient, ClientConfig}; +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let config = ClientConfig::builder() +//! .bearer_token("your-api-key") +//! .build()?; +//! +//! let client = ChatCompletionsClient::new(config)?; +//! +//! let mut stream = client.chat() +//! .model("gpt-4o") +//! .user("Tell me a story") +//! .stream() +//! .await?; +//! +//! while let Some(chunk) = stream.next().await { +//! let chunk = chunk?; +//! if let Some(content) = chunk.choices.first() +//! .and_then(|c| c.delta.content.as_ref()) +//! { +//! print!("{}", content); +//! } +//! } +//! Ok(()) +//! } +//! 
``` -pub enum ContentPart { - Text(String), - Binary(String), - ToolCall(()), - ToolResponse(()), -} +mod client; +mod config; +mod error; +mod streaming; +mod types; -pub struct Tool {} -pub struct ToolCall {} -pub struct ToolResponse {} - -pub struct ChatMessage {} - -pub struct ChatRequest {} -pub struct ChatResponse {} +// Re-export main types +pub use client::{ChatCompletionsClient, ChatRequestBuilder}; +pub use config::{ + ApiResponse, AuthConfig, ClientConfig, ClientConfigBuilder, RateLimitInfo, ResponseMeta, +}; +pub use error::{ApiError, ApiErrorBody, ClientError, DeserializeError, HttpError, StreamError}; +pub use streaming::{ChatCompletionAccumulator, ChatCompletionStream}; +pub use types::{ + ApproximateLocation, AssistantAudio, AudioConfig, ChatChoice, ChatChunkChoice, ChatCompletion, + ChatCompletionChunk, ChatCompletionRequest, ChunkDelta, CompletionTokensDetails, ContentPart, + FilePart, FunctionCall, FunctionCallDelta, FunctionDefinition, ImageUrlPart, InputAudioPart, + JsonSchemaFormat, Logprobs, Message, MessageContent, ModelId, Prediction, PromptTokensDetails, + ResponseFormat, Role, Stop, StreamOptions, TokenLogprob, ToolCall, ToolCallDelta, ToolChoice, + ToolChoiceFunction, ToolDefinition, TopLogprob, Usage, UserLocation, WebSearchOptions, +}; diff --git a/crates/rullm-openai/src/streaming.rs b/crates/rullm-openai/src/streaming.rs new file mode 100644 index 00000000..961f7043 --- /dev/null +++ b/crates/rullm-openai/src/streaming.rs @@ -0,0 +1,452 @@ +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use bytes::Bytes; +use futures::StreamExt; +use futures::stream::Stream; + +use crate::error::{ClientError, StreamError}; +use crate::types::{ + ChatChoice, ChatCompletion, ChatCompletionChunk, ChunkDelta, FunctionCall, Message, + MessageContent, ModelId, Role, ToolCall, ToolCallDelta, Usage, +}; + +/// A stream of chat completion chunks. +pub struct ChatCompletionStream { + inner: Pin> + Send>>, +} + +impl ChatCompletionStream { + /// Create a new stream from a byte stream. + pub fn new(byte_stream: S) -> Self + where + S: Stream> + Send + 'static, + { + let chunk_stream = parse_sse_stream(byte_stream); + Self { + inner: Box::pin(chunk_stream), + } + } + + /// Consume this stream and return an accumulator that collects chunks into a final response. + pub fn accumulator(self) -> ChatCompletionAccumulator { + ChatCompletionAccumulator::new(self) + } +} + +impl Stream for ChatCompletionStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.inner.as_mut().poll_next(cx) + } +} + +/// Parse an SSE byte stream into chat completion chunks. +fn parse_sse_stream( + byte_stream: S, +) -> impl Stream> +where + S: Stream> + Send + 'static, +{ + async_stream::try_stream! 
{ + let mut buffer = String::new(); + + futures::pin_mut!(byte_stream); + + while let Some(result) = byte_stream.next().await { + let bytes = result?; + let text = String::from_utf8_lossy(&bytes); + buffer.push_str(&text); + + // Process complete lines + while let Some(line_end) = buffer.find('\n') { + let line = buffer[..line_end].trim_end_matches('\r').to_string(); + buffer = buffer[line_end + 1..].to_string(); + + // Skip empty lines and comments + if line.is_empty() || line.starts_with(':') { + continue; + } + + // Parse SSE data lines + if let Some(data) = line.strip_prefix("data: ") { + let data = data.trim(); + + // Check for stream end + if data == "[DONE]" { + return; + } + + // Parse JSON + match serde_json::from_str::(data) { + Ok(chunk) => yield chunk, + Err(e) => { + // Check if it's an error object + if let Ok(error_wrapper) = serde_json::from_str::(data) { + Err(ClientError::Stream(StreamError::ApiError( + crate::error::ApiError { + status: 0, + error: error_wrapper.error, + raw_body: Some(Arc::from(data)), + }, + )))?; + } else { + Err(ClientError::Stream(StreamError::ParseError( + format!("Failed to parse SSE data: {}", e), + )))?; + } + } + } + } + } + } + + // Handle any remaining data in buffer + if !buffer.trim().is_empty() { + let data = buffer.trim(); + if let Some(data) = data.strip_prefix("data: ") { + if data != "[DONE]" && !data.is_empty() { + match serde_json::from_str::(data) { + Ok(chunk) => yield chunk, + Err(_) => { + // Ignore trailing incomplete data + } + } + } + } + } + } +} + +/// Wrapper for error objects in SSE data. +#[derive(serde::Deserialize)] +struct SseErrorWrapper { + error: crate::error::ApiErrorBody, +} + +/// Accumulator for collecting streaming chunks into a final response. +pub struct ChatCompletionAccumulator { + stream: ChatCompletionStream, + id: Option>, + object: Option>, + created: Option, + model: Option, + service_tier: Option>, + system_fingerprint: Option>, + choices: HashMap, + usage: Option, +} + +impl ChatCompletionAccumulator { + /// Create a new accumulator from a stream. + pub fn new(stream: ChatCompletionStream) -> Self { + Self { + stream, + id: None, + object: None, + created: None, + model: None, + service_tier: None, + system_fingerprint: None, + choices: HashMap::new(), + usage: None, + } + } + + /// Consume all chunks and return the final chat completion. + pub async fn collect(mut self) -> Result { + while let Some(chunk) = self.stream.next().await { + self.process_chunk(chunk?); + } + Ok(self.into_completion()) + } + + /// Process a chunk and update the accumulated state. 
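+    /// Metadata (`id`, `object`, `created`, `model`) is captured from the first
+    /// chunk, per-choice deltas are appended by choice index, and `usage` is
+    /// taken from whichever chunk carries it.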
+ fn process_chunk(&mut self, chunk: ChatCompletionChunk) { + // Update metadata from first chunk + if self.id.is_none() { + self.id = Some(chunk.id); + self.object = Some(chunk.object); + self.created = Some(chunk.created); + self.model = Some(chunk.model); + } + + // Update optional fields + if chunk.service_tier.is_some() { + self.service_tier = chunk.service_tier; + } + if chunk.system_fingerprint.is_some() { + self.system_fingerprint = chunk.system_fingerprint; + } + if chunk.usage.is_some() { + self.usage = chunk.usage; + } + + // Process choices + for choice in chunk.choices.iter() { + let accumulated = self + .choices + .entry(choice.index) + .or_insert_with(|| AccumulatedChoice::new(choice.index)); + accumulated.process_delta(&choice.delta); + + if choice.finish_reason.is_some() { + accumulated.finish_reason = choice.finish_reason.clone(); + } + if choice.logprobs.is_some() { + accumulated.logprobs = choice.logprobs.clone(); + } + } + } + + /// Convert the accumulated state into a final ChatCompletion. + fn into_completion(self) -> ChatCompletion { + let mut choices: Vec<(u32, ChatChoice)> = self + .choices + .into_iter() + .map(|(idx, acc)| (idx, acc.into_choice())) + .collect(); + choices.sort_by_key(|(idx, _)| *idx); + + ChatCompletion { + id: self.id.unwrap_or_else(|| Arc::from("")), + object: self.object.unwrap_or_else(|| Arc::from("chat.completion")), + created: self.created.unwrap_or(0), + model: self.model.unwrap_or_else(|| ModelId::new("")), + choices: choices.into_iter().map(|(_, c)| c).collect(), + usage: self.usage, + service_tier: self.service_tier, + system_fingerprint: self.system_fingerprint, + extra: serde_json::Map::new(), + } + } +} + +/// Accumulated state for a single choice. +struct AccumulatedChoice { + index: u32, + role: Option, + content: String, + refusal: Option, + tool_calls: HashMap, + finish_reason: Option>, + logprobs: Option, +} + +impl AccumulatedChoice { + fn new(index: u32) -> Self { + Self { + index, + role: None, + content: String::new(), + refusal: None, + tool_calls: HashMap::new(), + finish_reason: None, + logprobs: None, + } + } + + fn process_delta(&mut self, delta: &ChunkDelta) { + if delta.role.is_some() { + self.role = delta.role.clone(); + } + + if let Some(ref content) = delta.content { + self.content.push_str(content); + } + + if let Some(ref refusal) = delta.refusal { + self.refusal + .get_or_insert_with(String::new) + .push_str(refusal); + } + + if let Some(ref tool_calls) = delta.tool_calls { + for tc_delta in tool_calls.iter() { + let accumulated = self + .tool_calls + .entry(tc_delta.index) + .or_insert_with(|| AccumulatedToolCall::new(tc_delta.index)); + accumulated.process_delta(tc_delta); + } + } + } + + fn into_choice(self) -> ChatChoice { + // Build tool calls if any + let tool_calls = if self.tool_calls.is_empty() { + None + } else { + let mut calls: Vec<(u32, ToolCall)> = self + .tool_calls + .into_iter() + .map(|(idx, acc)| (idx, acc.into_tool_call())) + .collect(); + calls.sort_by_key(|(idx, _)| *idx); + Some(calls.into_iter().map(|(_, tc)| tc).collect::>()) + }; + + // Build content + let content = if self.content.is_empty() { + None + } else { + Some(MessageContent::Text(Arc::from(self.content))) + }; + + // Build message + let message = Some(Message { + role: self.role.unwrap_or_else(Role::assistant), + content, + name: None, + tool_calls, + tool_call_id: None, + audio: None, + function_call: None, + extra: serde_json::Map::new(), + }); + + ChatChoice { + index: self.index, + message, + finish_reason: 
self.finish_reason, + logprobs: self.logprobs, + extra: serde_json::Map::new(), + } + } +} + +/// Accumulated state for a tool call. +struct AccumulatedToolCall { + #[allow(dead_code)] + index: u32, + id: Option>, + call_type: Option>, + function_name: Option>, + function_arguments: String, +} + +impl AccumulatedToolCall { + fn new(index: u32) -> Self { + Self { + index, + id: None, + call_type: None, + function_name: None, + function_arguments: String::new(), + } + } + + fn process_delta(&mut self, delta: &ToolCallDelta) { + if delta.id.is_some() { + self.id = delta.id.clone(); + } + if delta.call_type.is_some() { + self.call_type = delta.call_type.clone(); + } + if let Some(ref func) = delta.function { + if func.name.is_some() { + self.function_name = func.name.clone(); + } + if let Some(ref args) = func.arguments { + self.function_arguments.push_str(args); + } + } + } + + fn into_tool_call(self) -> ToolCall { + ToolCall { + id: self.id.unwrap_or_else(|| Arc::from("")), + call_type: self.call_type.unwrap_or_else(|| Arc::from("function")), + function: FunctionCall { + name: self.function_name.unwrap_or_else(|| Arc::from("")), + arguments: Arc::from(self.function_arguments), + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Bytes; + use futures::stream; + + #[tokio::test] + async fn test_parse_simple_sse() { + let data = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{"content":" world"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]} + +data: [DONE] +"#; + + let byte_stream = stream::once(async move { Ok::<_, reqwest::Error>(Bytes::from(data)) }); + let mut stream = ChatCompletionStream::new(byte_stream); + + let mut chunks = Vec::new(); + while let Some(result) = stream.next().await { + chunks.push(result.unwrap()); + } + + assert_eq!(chunks.len(), 4); + assert_eq!( + chunks[0].choices[0].delta.role.as_ref().unwrap().0.as_ref(), + "assistant" + ); + assert_eq!( + chunks[1].choices[0] + .delta + .content + .as_ref() + .unwrap() + .as_ref(), + "Hello" + ); + assert_eq!( + chunks[2].choices[0] + .delta + .content + .as_ref() + .unwrap() + .as_ref(), + " world" + ); + assert_eq!( + chunks[3].choices[0] + .finish_reason + .as_ref() + .unwrap() + .as_ref(), + "stop" + ); + } + + #[tokio::test] + async fn test_accumulator() { + let data = r#"data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{"content":" world!"},"finish_reason":null}]} + +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"gpt-4","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]} + 
+data: [DONE] +"#; + + let byte_stream = stream::once(async move { Ok::<_, reqwest::Error>(Bytes::from(data)) }); + let stream = ChatCompletionStream::new(byte_stream); + let completion = stream.accumulator().collect().await.unwrap(); + + assert_eq!(completion.id.as_ref(), "chatcmpl-123"); + assert_eq!(completion.first_text(), Some("Hello world!")); + assert_eq!(completion.finish_reason(), Some("stop")); + } +} diff --git a/crates/rullm-openai/src/types.rs b/crates/rullm-openai/src/types.rs new file mode 100644 index 00000000..fb724daf --- /dev/null +++ b/crates/rullm-openai/src/types.rs @@ -0,0 +1,1070 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; + +// ============================================================================= +// Core Newtypes +// ============================================================================= + +/// Model identifier (e.g., "gpt-4o", "gpt-4-turbo"). +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ModelId(pub Arc); + +impl ModelId { + /// Create a new model ID. + pub fn new(id: impl Into>) -> Self { + Self(id.into()) + } +} + +impl>> From for ModelId { + fn from(s: T) -> Self { + Self(s.into()) + } +} + +impl std::ops::Deref for ModelId { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// Message role (system, user, assistant, tool, developer). +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Role(pub Arc); + +impl Role { + /// System role for legacy instructions. + pub const SYSTEM: &'static str = "system"; + /// Developer role for reasoning model instructions. + pub const DEVELOPER: &'static str = "developer"; + /// User role for user messages. + pub const USER: &'static str = "user"; + /// Assistant role for model responses. + pub const ASSISTANT: &'static str = "assistant"; + /// Tool role for tool call results. + pub const TOOL: &'static str = "tool"; + + /// Create a new role. + pub fn new(role: impl Into>) -> Self { + Self(role.into()) + } + + /// Create a system role. + pub fn system() -> Self { + Self(Arc::from(Self::SYSTEM)) + } + + /// Create a developer role. + pub fn developer() -> Self { + Self(Arc::from(Self::DEVELOPER)) + } + + /// Create a user role. + pub fn user() -> Self { + Self(Arc::from(Self::USER)) + } + + /// Create an assistant role. + pub fn assistant() -> Self { + Self(Arc::from(Self::ASSISTANT)) + } + + /// Create a tool role. + pub fn tool() -> Self { + Self(Arc::from(Self::TOOL)) + } +} + +impl std::ops::Deref for Role { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +// ============================================================================= +// Messages +// ============================================================================= + +/// A chat message. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Message { + /// The role of the message author. + pub role: Role, + /// The content of the message. + #[serde(skip_serializing_if = "Option::is_none")] + pub content: Option, + /// Optional name of the author. + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option>, + /// Tool calls made by the assistant. + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_calls: Option>, + /// ID of the tool call this message is responding to. + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_call_id: Option>, + /// Audio output from the assistant. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub audio: Option, + /// Deprecated function call. + #[serde(skip_serializing_if = "Option::is_none")] + pub function_call: Option, + /// Additional fields for forward compatibility. + #[serde(flatten)] + pub extra: Map, +} + +impl Message { + /// Create a system message. + pub fn system(text: impl Into>) -> Self { + Self { + role: Role::system(), + content: Some(MessageContent::Text(text.into())), + name: None, + tool_calls: None, + tool_call_id: None, + audio: None, + function_call: None, + extra: Map::new(), + } + } + + /// Create a developer message. + pub fn developer(text: impl Into>) -> Self { + Self { + role: Role::developer(), + content: Some(MessageContent::Text(text.into())), + name: None, + tool_calls: None, + tool_call_id: None, + audio: None, + function_call: None, + extra: Map::new(), + } + } + + /// Create a user message with text content. + pub fn user(text: impl Into>) -> Self { + Self { + role: Role::user(), + content: Some(MessageContent::Text(text.into())), + name: None, + tool_calls: None, + tool_call_id: None, + audio: None, + function_call: None, + extra: Map::new(), + } + } + + /// Create a user message with content parts. + pub fn user_parts(parts: impl Into>) -> Self { + Self { + role: Role::user(), + content: Some(MessageContent::Parts(parts.into())), + name: None, + tool_calls: None, + tool_call_id: None, + audio: None, + function_call: None, + extra: Map::new(), + } + } + + /// Create an assistant message with text content. + pub fn assistant(text: impl Into>) -> Self { + Self { + role: Role::assistant(), + content: Some(MessageContent::Text(text.into())), + name: None, + tool_calls: None, + tool_call_id: None, + audio: None, + function_call: None, + extra: Map::new(), + } + } + + /// Create an assistant message with tool calls. + pub fn assistant_tool_calls(tool_calls: impl Into>) -> Self { + Self { + role: Role::assistant(), + content: None, + name: None, + tool_calls: Some(tool_calls.into()), + tool_call_id: None, + audio: None, + function_call: None, + extra: Map::new(), + } + } + + /// Create a tool result message. + pub fn tool(tool_call_id: impl Into>, content: impl Into>) -> Self { + Self { + role: Role::tool(), + content: Some(MessageContent::Text(content.into())), + name: None, + tool_calls: None, + tool_call_id: Some(tool_call_id.into()), + audio: None, + function_call: None, + extra: Map::new(), + } + } +} + +/// Message content - either plain text or an array of content parts. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum MessageContent { + /// Plain text content. + Text(Arc), + /// Array of content parts (for multimodal messages). + Parts(Arc<[ContentPart]>), +} + +impl MessageContent { + /// Get the text content if this is a text message. + pub fn as_text(&self) -> Option<&str> { + match self { + MessageContent::Text(text) => Some(text), + MessageContent::Parts(_) => None, + } + } + + /// Get all text from the content, concatenating text parts. 
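+    ///
+    /// Sketch (illustrative values):
+    ///
+    /// ```rust
+    /// use rullm_chat_completion::{ContentPart, MessageContent};
+    ///
+    /// let content = MessageContent::Parts(
+    ///     vec![ContentPart::text("Hello, "), ContentPart::text("world")].into(),
+    /// );
+    /// assert_eq!(content.text().as_deref(), Some("Hello, world"));
+    /// ```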
+ pub fn text(&self) -> Option { + match self { + MessageContent::Text(text) => Some(text.to_string()), + MessageContent::Parts(parts) => { + let texts: Vec<&str> = parts + .iter() + .filter_map(|p| match p { + ContentPart::Text { text } => Some(text.as_ref()), + _ => None, + }) + .collect(); + if texts.is_empty() { + None + } else { + Some(texts.join("")) + } + } + } + } +} + +// ============================================================================= +// Content Parts +// ============================================================================= + +/// A content part in a multimodal message. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentPart { + /// Text content. + Text { text: Arc }, + /// Image URL content. + ImageUrl { image_url: ImageUrlPart }, + /// Audio input content. + InputAudio { input_audio: InputAudioPart }, + /// File content. + File { file: FilePart }, + /// Refusal content (assistant only). + Refusal { refusal: Arc }, +} + +impl ContentPart { + /// Create a text content part. + pub fn text(text: impl Into>) -> Self { + Self::Text { text: text.into() } + } + + /// Create an image URL content part. + pub fn image_url(url: impl Into>) -> Self { + Self::ImageUrl { + image_url: ImageUrlPart { + url: url.into(), + detail: None, + }, + } + } + + /// Create an image URL content part with detail level. + pub fn image_url_with_detail(url: impl Into>, detail: impl Into>) -> Self { + Self::ImageUrl { + image_url: ImageUrlPart { + url: url.into(), + detail: Some(detail.into()), + }, + } + } +} + +/// Image URL part details. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImageUrlPart { + /// The URL of the image. + pub url: Arc, + /// Detail level ("low", "high", "auto"). + #[serde(skip_serializing_if = "Option::is_none")] + pub detail: Option>, +} + +/// Audio input part details. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InputAudioPart { + /// Base64-encoded audio data. + pub data: Arc, + /// Audio format ("wav", "mp3", etc.). + pub format: Arc, +} + +/// File part details. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FilePart { + /// File ID if using file API. + #[serde(skip_serializing_if = "Option::is_none")] + pub file_id: Option>, + /// Base64-encoded file data. + #[serde(skip_serializing_if = "Option::is_none")] + pub file_data: Option>, + /// Optional filename. + #[serde(skip_serializing_if = "Option::is_none")] + pub filename: Option>, +} + +// ============================================================================= +// Audio +// ============================================================================= + +/// Audio output from the assistant. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AssistantAudio { + /// Audio ID. + pub id: Arc, + /// Expiration timestamp. + #[serde(skip_serializing_if = "Option::is_none")] + pub expires_at: Option, + /// Base64-encoded audio data. + #[serde(skip_serializing_if = "Option::is_none")] + pub data: Option>, + /// Transcript of the audio. + #[serde(skip_serializing_if = "Option::is_none")] + pub transcript: Option>, +} + +/// Audio configuration for the request. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AudioConfig { + /// Voice to use for audio output. + pub voice: Arc, + /// Audio output format. 
+ pub format: Arc, +} + +// ============================================================================= +// Tools +// ============================================================================= + +/// Tool definition. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ToolDefinition { + /// Function tool. + Function { function: FunctionDefinition }, +} + +/// Function definition for a tool. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FunctionDefinition { + /// The name of the function. + pub name: Arc, + /// A description of what the function does. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option>, + /// The parameters the function accepts (JSON Schema). + #[serde(skip_serializing_if = "Option::is_none")] + pub parameters: Option, + /// Whether to enable strict mode for parameters. + #[serde(skip_serializing_if = "Option::is_none")] + pub strict: Option, +} + +impl FunctionDefinition { + /// Create a new function definition. + pub fn new(name: impl Into>) -> Self { + Self { + name: name.into(), + description: None, + parameters: None, + strict: None, + } + } + + /// Set the description. + pub fn with_description(mut self, description: impl Into>) -> Self { + self.description = Some(description.into()); + self + } + + /// Set the parameters schema. + pub fn with_parameters(mut self, parameters: Value) -> Self { + self.parameters = Some(parameters); + self + } + + /// Enable strict mode. + pub fn with_strict(mut self, strict: bool) -> Self { + self.strict = Some(strict); + self + } +} + +/// A tool call from the assistant. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ToolCall { + /// The ID of the tool call. + pub id: Arc, + /// The type of tool call (always "function" for now). + #[serde(rename = "type")] + pub call_type: Arc, + /// The function being called. + pub function: FunctionCall, +} + +impl ToolCall { + /// Parse the function arguments as JSON. + pub fn arguments_json(&self) -> Result { + serde_json::from_str(&self.function.arguments) + } + + /// Parse the function arguments as a typed value. + pub fn arguments_as(&self) -> Result { + serde_json::from_str(&self.function.arguments) + } +} + +/// A function call (name and arguments). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FunctionCall { + /// The name of the function. + pub name: Arc, + /// The arguments to pass to the function (JSON string). + pub arguments: Arc, +} + +/// Tool choice specification. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum ToolChoice { + /// Simple mode ("auto", "none", "required"). + Mode(Arc), + /// Specific function to call. + Function { + #[serde(rename = "type")] + choice_type: Arc, + function: ToolChoiceFunction, + }, +} + +impl ToolChoice { + /// Auto mode - let the model decide. + pub fn auto() -> Self { + Self::Mode(Arc::from("auto")) + } + + /// None mode - don't call any tools. + pub fn none() -> Self { + Self::Mode(Arc::from("none")) + } + + /// Required mode - must call a tool. + pub fn required() -> Self { + Self::Mode(Arc::from("required")) + } + + /// Force a specific function. + pub fn function(name: impl Into>) -> Self { + Self::Function { + choice_type: Arc::from("function"), + function: ToolChoiceFunction { name: name.into() }, + } + } +} + +/// Function specification for tool choice. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ToolChoiceFunction { + /// The name of the function to call. 
+ pub name: Arc, +} + +// ============================================================================= +// Response Format +// ============================================================================= + +/// Response format specification. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ResponseFormat { + /// Plain text response. + Text, + /// JSON object response. + JsonObject, + /// JSON schema response. + JsonSchema { json_schema: JsonSchemaFormat }, +} + +impl ResponseFormat { + /// Create a text response format. + pub fn text() -> Self { + Self::Text + } + + /// Create a JSON object response format. + pub fn json_object() -> Self { + Self::JsonObject + } + + /// Create a JSON schema response format. + pub fn json_schema(name: impl Into>, schema: Value) -> Self { + Self::JsonSchema { + json_schema: JsonSchemaFormat { + name: name.into(), + description: None, + schema, + strict: None, + }, + } + } +} + +/// JSON schema format specification. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JsonSchemaFormat { + /// The name of the schema. + pub name: Arc, + /// Description of the schema. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option>, + /// The JSON schema. + pub schema: Value, + /// Whether to enable strict mode. + #[serde(skip_serializing_if = "Option::is_none")] + pub strict: Option, +} + +// ============================================================================= +// Stop Sequences +// ============================================================================= + +/// Stop sequence(s) for generation. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Stop { + /// Single stop sequence. + Single(Arc), + /// Multiple stop sequences. + Multiple(Arc<[Arc]>), +} + +impl Stop { + /// Create a single stop sequence. + pub fn single(s: impl Into>) -> Self { + Self::Single(s.into()) + } + + /// Create multiple stop sequences. + pub fn multiple(sequences: impl IntoIterator>>) -> Self { + Self::Multiple(sequences.into_iter().map(Into::into).collect()) + } +} + +// ============================================================================= +// Advanced Options +// ============================================================================= + +/// Stream options for streaming requests. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct StreamOptions { + /// Include usage information in the final chunk. + #[serde(skip_serializing_if = "Option::is_none")] + pub include_usage: Option, +} + +/// Prediction for predicted outputs. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Prediction { + /// The type of prediction (always "content"). + #[serde(rename = "type")] + pub prediction_type: Arc, + /// The predicted content. + pub content: MessageContent, +} + +/// Web search options. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WebSearchOptions { + /// Whether to enable web search. + #[serde(skip_serializing_if = "Option::is_none")] + pub search_context_size: Option>, + /// User location for search. + #[serde(skip_serializing_if = "Option::is_none")] + pub user_location: Option, +} + +/// User location for web search. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct UserLocation { + /// Approximate location type. + #[serde(rename = "type")] + pub location_type: Arc, + /// Approximate location. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub approximate: Option, +} + +/// Approximate location details. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ApproximateLocation { + /// City name. + #[serde(skip_serializing_if = "Option::is_none")] + pub city: Option>, + /// Country code. + #[serde(skip_serializing_if = "Option::is_none")] + pub country: Option>, + /// Region/state. + #[serde(skip_serializing_if = "Option::is_none")] + pub region: Option>, + /// Timezone. + #[serde(skip_serializing_if = "Option::is_none")] + pub timezone: Option>, +} + +// ============================================================================= +// Request +// ============================================================================= + +/// Chat completion request. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChatCompletionRequest { + /// Model to use for completion. + pub model: ModelId, + /// Messages for the conversation. + pub messages: Arc<[Message]>, + + // Sampling parameters + /// Temperature for sampling (0.0 to 2.0). + #[serde(skip_serializing_if = "Option::is_none")] + pub temperature: Option, + /// Top-p (nucleus) sampling. + #[serde(skip_serializing_if = "Option::is_none")] + pub top_p: Option, + /// Number of completions to generate. + #[serde(skip_serializing_if = "Option::is_none")] + pub n: Option, + /// Stop sequences. + #[serde(skip_serializing_if = "Option::is_none")] + pub stop: Option, + /// Presence penalty (-2.0 to 2.0). + #[serde(skip_serializing_if = "Option::is_none")] + pub presence_penalty: Option, + /// Frequency penalty (-2.0 to 2.0). + #[serde(skip_serializing_if = "Option::is_none")] + pub frequency_penalty: Option, + + // Token limits + /// Maximum tokens to generate (preferred). + #[serde(skip_serializing_if = "Option::is_none")] + pub max_completion_tokens: Option, + /// Maximum tokens (deprecated, use max_completion_tokens). + #[serde(skip_serializing_if = "Option::is_none")] + pub max_tokens: Option, + + // Logprobs + /// Whether to return log probabilities. + #[serde(skip_serializing_if = "Option::is_none")] + pub logprobs: Option, + /// Number of top log probabilities to return. + #[serde(skip_serializing_if = "Option::is_none")] + pub top_logprobs: Option, + /// Token bias map. + #[serde(skip_serializing_if = "Option::is_none")] + pub logit_bias: Option>, + + // Tools + /// Tool definitions. + #[serde(skip_serializing_if = "Option::is_none")] + pub tools: Option>, + /// Tool choice specification. + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_choice: Option, + /// Whether to allow parallel tool calls. + #[serde(skip_serializing_if = "Option::is_none")] + pub parallel_tool_calls: Option, + /// Deprecated function definitions. + #[serde(skip_serializing_if = "Option::is_none")] + pub functions: Option>, + + // Response formatting + /// Response format specification. + #[serde(skip_serializing_if = "Option::is_none")] + pub response_format: Option, + + // Multimodal + /// Output modalities (e.g., ["text", "audio"]). + #[serde(skip_serializing_if = "Option::is_none")] + pub modalities: Option]>>, + /// Audio configuration. + #[serde(skip_serializing_if = "Option::is_none")] + pub audio: Option, + + // Advanced + /// Enable streaming. + #[serde(skip_serializing_if = "Option::is_none")] + pub stream: Option, + /// Stream options. + #[serde(skip_serializing_if = "Option::is_none")] + pub stream_options: Option, + /// Predicted output. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub prediction: Option, + /// Web search options. + #[serde(skip_serializing_if = "Option::is_none")] + pub web_search_options: Option, + /// Reasoning effort level. + #[serde(skip_serializing_if = "Option::is_none")] + pub reasoning_effort: Option>, + /// Service tier preference. + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option>, + /// Whether to store the completion. + #[serde(skip_serializing_if = "Option::is_none")] + pub store: Option, + /// Metadata for stored completions. + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option>, + + // Identifiers + /// Seed for deterministic sampling. + #[serde(skip_serializing_if = "Option::is_none")] + pub seed: Option, + /// User identifier (deprecated). + #[serde(skip_serializing_if = "Option::is_none")] + pub user: Option>, + + // Escape hatch + /// Extra fields for provider-specific extensions. + #[serde(flatten)] + pub extra_body: Map, +} + +impl ChatCompletionRequest { + /// Create a new chat completion request. + pub fn new(model: impl Into, messages: impl Into>) -> Self { + Self { + model: model.into(), + messages: messages.into(), + temperature: None, + top_p: None, + n: None, + stop: None, + presence_penalty: None, + frequency_penalty: None, + max_completion_tokens: None, + max_tokens: None, + logprobs: None, + top_logprobs: None, + logit_bias: None, + tools: None, + tool_choice: None, + parallel_tool_calls: None, + functions: None, + response_format: None, + modalities: None, + audio: None, + stream: None, + stream_options: None, + prediction: None, + web_search_options: None, + reasoning_effort: None, + service_tier: None, + store: None, + metadata: None, + seed: None, + user: None, + extra_body: Map::new(), + } + } +} + +// ============================================================================= +// Response +// ============================================================================= + +/// Chat completion response. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChatCompletion { + /// Unique identifier for this completion. + pub id: Arc, + /// Object type (always "chat.completion"). + pub object: Arc, + /// Unix timestamp when the completion was created. + pub created: u64, + /// Model used for the completion. + pub model: ModelId, + /// Completion choices. + pub choices: Arc<[ChatChoice]>, + /// Token usage information. + #[serde(skip_serializing_if = "Option::is_none")] + pub usage: Option, + /// Service tier used. + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option>, + /// System fingerprint (deprecated). + #[serde(skip_serializing_if = "Option::is_none")] + pub system_fingerprint: Option>, + /// Additional fields for forward compatibility. + #[serde(flatten)] + pub extra: Map, +} + +impl ChatCompletion { + /// Get the first choice's text content. + pub fn first_text(&self) -> Option<&str> { + self.choices.first().and_then(|c| { + c.message + .as_ref() + .and_then(|m| m.content.as_ref()) + .and_then(|content| content.as_text()) + }) + } + + /// Get tool calls from the first choice. + pub fn tool_calls(&self) -> Option<&[ToolCall]> { + self.choices + .first() + .and_then(|c| c.message.as_ref()) + .and_then(|m| m.tool_calls.as_ref()) + .map(|tc| tc.as_ref()) + } + + /// Parse the first choice's content as JSON. + pub fn parse_json(&self) -> Result { + let text = self.first_text().unwrap_or(""); + serde_json::from_str(text) + } + + /// Get the finish reason for the first choice. 
+ pub fn finish_reason(&self) -> Option<&str> { + self.choices + .first() + .and_then(|c| c.finish_reason.as_ref()) + .map(|s| s.as_ref()) + } +} + +/// A completion choice. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChatChoice { + /// The index of this choice. + pub index: u32, + /// The generated message. + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, + /// The reason the model stopped generating. + #[serde(skip_serializing_if = "Option::is_none")] + pub finish_reason: Option>, + /// Log probabilities. + #[serde(skip_serializing_if = "Option::is_none")] + pub logprobs: Option, + /// Additional fields. + #[serde(flatten)] + pub extra: Map, +} + +/// Log probabilities. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Logprobs { + /// Log probabilities for each token. + pub content: Option>, + /// Refusal log probabilities. + pub refusal: Option>, +} + +/// Log probability for a single token. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TokenLogprob { + /// The token. + pub token: Arc, + /// Log probability of this token. + pub logprob: f64, + /// Byte representation. + pub bytes: Option>, + /// Top log probabilities. + pub top_logprobs: Option>, +} + +/// Top log probability entry. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TopLogprob { + /// The token. + pub token: Arc, + /// Log probability. + pub logprob: f64, + /// Byte representation. + pub bytes: Option>, +} + +/// Token usage information. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Usage { + /// Number of tokens in the prompt. + pub prompt_tokens: u32, + /// Number of tokens in the completion. + pub completion_tokens: u32, + /// Total number of tokens. + pub total_tokens: u32, + /// Detailed prompt token breakdown. + #[serde(skip_serializing_if = "Option::is_none")] + pub prompt_tokens_details: Option, + /// Detailed completion token breakdown. + #[serde(skip_serializing_if = "Option::is_none")] + pub completion_tokens_details: Option, +} + +/// Detailed prompt token breakdown. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PromptTokensDetails { + /// Cached tokens. + #[serde(skip_serializing_if = "Option::is_none")] + pub cached_tokens: Option, + /// Audio tokens. + #[serde(skip_serializing_if = "Option::is_none")] + pub audio_tokens: Option, +} + +/// Detailed completion token breakdown. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CompletionTokensDetails { + /// Reasoning tokens. + #[serde(skip_serializing_if = "Option::is_none")] + pub reasoning_tokens: Option, + /// Audio tokens. + #[serde(skip_serializing_if = "Option::is_none")] + pub audio_tokens: Option, + /// Accepted prediction tokens. + #[serde(skip_serializing_if = "Option::is_none")] + pub accepted_prediction_tokens: Option, + /// Rejected prediction tokens. + #[serde(skip_serializing_if = "Option::is_none")] + pub rejected_prediction_tokens: Option, +} + +// ============================================================================= +// Streaming Types +// ============================================================================= + +/// A streaming chunk from the chat completion. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChatCompletionChunk { + /// Unique identifier for this completion. + pub id: Arc, + /// Object type (always "chat.completion.chunk"). + pub object: Arc, + /// Unix timestamp when the chunk was created. + pub created: u64, + /// Model used for the completion. 
+ pub model: ModelId, + /// Chunk choices. + pub choices: Arc<[ChatChunkChoice]>, + /// Token usage (only in final chunk with include_usage). + #[serde(skip_serializing_if = "Option::is_none")] + pub usage: Option, + /// Service tier used. + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option>, + /// System fingerprint. + #[serde(skip_serializing_if = "Option::is_none")] + pub system_fingerprint: Option>, + /// Additional fields. + #[serde(flatten)] + pub extra: Map, +} + +/// A choice in a streaming chunk. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChatChunkChoice { + /// The index of this choice. + pub index: u32, + /// The delta (partial message). + pub delta: ChunkDelta, + /// The reason the model stopped generating. + #[serde(skip_serializing_if = "Option::is_none")] + pub finish_reason: Option>, + /// Log probabilities. + #[serde(skip_serializing_if = "Option::is_none")] + pub logprobs: Option, + /// Additional fields. + #[serde(flatten)] + pub extra: Map, +} + +/// Delta content in a streaming chunk. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ChunkDelta { + /// The role (usually only in first chunk). + #[serde(skip_serializing_if = "Option::is_none")] + pub role: Option, + /// Content fragment. + #[serde(skip_serializing_if = "Option::is_none")] + pub content: Option>, + /// Refusal fragment. + #[serde(skip_serializing_if = "Option::is_none")] + pub refusal: Option>, + /// Tool call fragments. + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_calls: Option>, + /// Deprecated function call. + #[serde(skip_serializing_if = "Option::is_none")] + pub function_call: Option, + /// Additional fields. + #[serde(flatten)] + pub extra: Map, +} + +/// Tool call delta in streaming. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ToolCallDelta { + /// The index of this tool call. + pub index: u32, + /// Tool call ID (usually only in first chunk for this call). + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option>, + /// Tool type. + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub call_type: Option>, + /// Function call delta. + #[serde(skip_serializing_if = "Option::is_none")] + pub function: Option, +} + +/// Function call delta in streaming. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FunctionCallDelta { + /// Function name (usually only in first chunk). + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option>, + /// Arguments fragment. 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub arguments: Option>,
+}

From 148331f18f6bffa4f738ec187d857ccd145bf889 Mon Sep 17 00:00:00 2001
From: lambda
Date: Sat, 3 Jan 2026 17:46:16 +0530
Subject: [PATCH 06/14] initial rullm-anthropic implementation

---
 crates/rullm-anthropic/Cargo.toml             |  11 +
 crates/rullm-anthropic/src/client.rs          | 234 +++++
 crates/rullm-anthropic/src/config.rs          | 366 +++++++
 crates/rullm-anthropic/src/error.rs           | 140 +++
 crates/rullm-anthropic/src/lib.rs             | 136 ++-
 crates/rullm-anthropic/src/messages/mod.rs    |  12 +
 crates/rullm-anthropic/src/messages/stream.rs | 623 ++++++++++++
 crates/rullm-anthropic/src/messages/types.rs  | 910 ++++++++++++++++++
 crates/rullm-anthropic/src/transport.rs       | 231 +++++
 9 files changed, 2651 insertions(+), 12 deletions(-)
 create mode 100644 crates/rullm-anthropic/src/client.rs
 create mode 100644 crates/rullm-anthropic/src/config.rs
 create mode 100644 crates/rullm-anthropic/src/error.rs
 create mode 100644 crates/rullm-anthropic/src/messages/mod.rs
 create mode 100644 crates/rullm-anthropic/src/messages/stream.rs
 create mode 100644 crates/rullm-anthropic/src/messages/types.rs
 create mode 100644 crates/rullm-anthropic/src/transport.rs

diff --git a/crates/rullm-anthropic/Cargo.toml b/crates/rullm-anthropic/Cargo.toml
index 984c5b2c..5655f4bb 100644
--- a/crates/rullm-anthropic/Cargo.toml
+++ b/crates/rullm-anthropic/Cargo.toml
@@ -5,3 +5,14 @@ edition.workspace = true
 rust-version.workspace = true
 
 [dependencies]
+tokio = { workspace = true }
+reqwest = { workspace = true }
+bytes = { workspace = true }
+serde = { workspace = true, features = ["rc"] }
+serde_json = { workspace = true }
+thiserror = { workspace = true }
+futures = { workspace = true }
+async-stream = { workspace = true }
+
+[dev-dependencies]
+tokio-test = { workspace = true }
diff --git a/crates/rullm-anthropic/src/client.rs b/crates/rullm-anthropic/src/client.rs
new file mode 100644
index 00000000..a281f295
--- /dev/null
+++ b/crates/rullm-anthropic/src/client.rs
@@ -0,0 +1,234 @@
+//! Main client implementation for the Anthropic API
+//!
+//! Provides `Client` and sub-clients like `MessagesClient`.
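+//!
+//! A minimal configuration sketch (the key, base URL, and beta flag below are
+//! placeholder values, not defaults of this crate): build a `ClientConfig` with
+//! `ClientBuilder` and hand it to `Client::new` instead of using `from_env`.
+//!
+//! ```no_run
+//! use rullm_anthropic::{Client, Message, MessagesRequest, RequestOptions};
+//!
+//! # async fn example() -> Result<(), rullm_anthropic::AnthropicError> {
+//! // An OAuth token, if also set, takes precedence over the API key.
+//! let config = Client::builder()
+//!     .api_key("your-api-key")
+//!     .base_url("https://api.anthropic.com")
+//!     .beta("example-beta-feature")
+//!     .build()?;
+//! let client = Client::new(config)?;
+//!
+//! let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 256)
+//!     .message(Message::user("Ping"))
+//!     .build();
+//! let response = client.messages().create(request, RequestOptions::default()).await?;
+//! println!("{}", response.text());
+//! # Ok(())
+//! # }
+//! ```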
+ +use crate::config::{ClientBuilder, ClientConfig, RequestOptions}; +use crate::error::Result; +use crate::messages::{ + CountTokensRequest, CountTokensResponse, MessageStream, MessagesRequest, MessagesResponse, + StreamEvent, parse_sse_stream, +}; +use crate::transport::HttpTransport; +use futures::Stream; +use std::sync::Arc; + +/// Main Anthropic API client +/// +/// # Example +/// +/// ```no_run +/// use rullm_anthropic::{Client, Message, MessagesRequest, RequestOptions}; +/// +/// # async fn example() -> Result<(), rullm_anthropic::AnthropicError> { +/// let client = Client::from_env()?; +/// +/// let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 1024) +/// .system("You are a helpful assistant.") +/// .message(Message::user("Hello!")) +/// .build(); +/// +/// let response = client.messages().create(request, RequestOptions::default()).await?; +/// println!("{}", response.text()); +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone)] +pub struct Client { + transport: Arc, +} + +impl Client { + /// Create a new client builder + pub fn builder() -> ClientBuilder { + ClientBuilder::new() + } + + /// Create a client from environment variables + /// + /// Reads from: + /// - `ANTHROPIC_API_KEY` - API key + /// - `ANTHROPIC_AUTH_TOKEN` - OAuth token (takes precedence) + /// - `ANTHROPIC_BASE_URL` - Base URL override + pub fn from_env() -> Result { + let config = ClientBuilder::from_env().build()?; + Self::new(config) + } + + /// Create a client with the given configuration + pub fn new(config: ClientConfig) -> Result { + let transport = HttpTransport::new(config)?; + Ok(Self { + transport: Arc::new(transport), + }) + } + + /// Get the messages sub-client + pub fn messages(&self) -> MessagesClient { + MessagesClient { + transport: self.transport.clone(), + } + } + + /// Get the base URL + pub fn base_url(&self) -> &str { + self.transport.base_url() + } +} + +/// Messages API sub-client +#[derive(Clone)] +pub struct MessagesClient { + transport: Arc, +} + +impl MessagesClient { + const MESSAGES_PATH: &'static str = "/v1/messages"; + const COUNT_TOKENS_PATH: &'static str = "/v1/messages/count_tokens"; + + /// Create a message (non-streaming) + /// + /// # Example + /// + /// ```no_run + /// use rullm_anthropic::{Client, Message, MessagesRequest, RequestOptions}; + /// + /// # async fn example() -> Result<(), rullm_anthropic::AnthropicError> { + /// let client = Client::from_env()?; + /// + /// let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 1024) + /// .message(Message::user("What is 2+2?")) + /// .temperature(0.0) + /// .build(); + /// + /// let response = client.messages().create(request, RequestOptions::default()).await?; + /// println!("Answer: {}", response.text()); + /// # Ok(()) + /// # } + /// ``` + pub async fn create( + &self, + request: MessagesRequest, + options: RequestOptions, + ) -> Result { + let (response, _meta) = self + .transport + .post_json(Self::MESSAGES_PATH, &request, &options) + .await?; + Ok(response) + } + + /// Create a streaming message + /// + /// Returns a `MessageStream` that provides: + /// - Raw event stream via `Stream` trait + /// - Text-only stream via `text_stream()` + /// - Final message via `final_message()` + /// + /// # Example + /// + /// ```no_run + /// use rullm_anthropic::{Client, Message, MessagesRequest, RequestOptions}; + /// use futures::StreamExt; + /// use std::pin::pin; + /// + /// # async fn example() -> Result<(), rullm_anthropic::AnthropicError> { + /// let client = Client::from_env()?; + /// let 
messages = client.messages(); + /// + /// let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 1024) + /// .message(Message::user("Tell me a story")) + /// .build(); + /// + /// let stream = messages.stream(request, RequestOptions::default()).await?; + /// let mut text_stream = pin!(stream.text_stream()); + /// + /// while let Some(chunk) = text_stream.next().await { + /// print!("{}", chunk?); + /// } + /// # Ok(()) + /// # } + /// ``` + pub async fn stream( + &self, + mut request: MessagesRequest, + options: RequestOptions, + ) -> Result> + Unpin>> { + // Force streaming + request.stream = Some(true); + + let (byte_stream, _meta) = self + .transport + .post_stream(Self::MESSAGES_PATH, &request, &options) + .await?; + + let event_stream = parse_sse_stream(byte_stream); + Ok(MessageStream::new(Box::pin(event_stream))) + } + + /// Count tokens for a request without sending it + /// + /// # Example + /// + /// ```no_run + /// use rullm_anthropic::{Client, CountTokensRequest, Message, RequestOptions}; + /// + /// # async fn example() -> Result<(), rullm_anthropic::AnthropicError> { + /// let client = Client::from_env()?; + /// + /// let request = CountTokensRequest::new( + /// "claude-3-5-sonnet-20241022", + /// vec![Message::user("Hello, how are you?")], + /// ); + /// + /// let count = client.messages().count_tokens(request, RequestOptions::default()).await?; + /// println!("Input tokens: {}", count.input_tokens); + /// # Ok(()) + /// # } + /// ``` + pub async fn count_tokens( + &self, + request: CountTokensRequest, + options: RequestOptions, + ) -> Result { + let (response, _meta) = self + .transport + .post_json(Self::COUNT_TOKENS_PATH, &request, &options) + .await?; + Ok(response) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::messages::{Message, Role}; + + #[test] + fn test_client_builder() { + let result = Client::builder().api_key("test-key").build(); + assert!(result.is_ok()); + } + + #[test] + fn test_messages_request_builder() { + let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 1024) + .system("You are helpful") + .message(Message::user("Hello")) + .temperature(0.7) + .build(); + + assert_eq!(request.model.as_ref(), "claude-3-5-sonnet-20241022"); + assert_eq!(request.max_tokens, 1024); + assert_eq!(request.messages.len(), 1); + assert!(request.temperature.is_some()); + } + + #[test] + fn test_message_helpers() { + let user_msg = Message::user("Hello"); + assert!(matches!(user_msg.role, Role::User)); + + let assistant_msg = Message::assistant("Hi there!"); + assert!(matches!(assistant_msg.role, Role::Assistant)); + } +} diff --git a/crates/rullm-anthropic/src/config.rs b/crates/rullm-anthropic/src/config.rs new file mode 100644 index 00000000..e1eaf4e3 --- /dev/null +++ b/crates/rullm-anthropic/src/config.rs @@ -0,0 +1,366 @@ +//! 
Configuration and client builder for the Anthropic client + +use crate::error::{AnthropicError, Result}; +use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; +use std::env; +use std::sync::Arc; +use std::time::Duration; + +/// Default base URL for the Anthropic API +pub const DEFAULT_BASE_URL: &str = "https://api.anthropic.com"; + +/// Default API version header value +pub const DEFAULT_API_VERSION: &str = "2023-06-01"; + +/// Default timeout for non-streaming requests (10 minutes as per SDK) +pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(600); + +/// Environment variable for API key +pub const ENV_API_KEY: &str = "ANTHROPIC_API_KEY"; + +/// Environment variable for OAuth token +pub const ENV_AUTH_TOKEN: &str = "ANTHROPIC_AUTH_TOKEN"; + +/// Environment variable for base URL override +pub const ENV_BASE_URL: &str = "ANTHROPIC_BASE_URL"; + +/// Authentication mode for the client +#[derive(Debug, Clone)] +pub enum AuthMode { + /// API key authentication (x-api-key header) + ApiKey(Arc), + /// OAuth token authentication (Bearer token) + OAuth(Arc), +} + +impl AuthMode { + /// Apply authentication headers to a HeaderMap + pub fn apply_to_headers(&self, headers: &mut HeaderMap) { + match self { + AuthMode::ApiKey(key) => { + if let Ok(value) = HeaderValue::from_str(key) { + headers.insert("x-api-key", value); + } + } + AuthMode::OAuth(token) => { + if let Ok(value) = HeaderValue::from_str(&format!("Bearer {}", token)) { + headers.insert("authorization", value); + } + } + } + } +} + +/// Per-request options to override client defaults +#[derive(Debug, Clone, Default)] +pub struct RequestOptions { + /// Override timeout for this request + pub timeout: Option, + + /// Extra headers to merge with default headers + pub extra_headers: HeaderMap, + + /// Extra query parameters + pub extra_query: Vec<(Arc, Arc)>, + + /// Extra body fields to merge (for advanced use) + pub extra_body: serde_json::Map, + + /// Allow long non-streaming requests (bypasses timeout policy) + pub allow_long_non_streaming: bool, +} + +impl RequestOptions { + /// Create a new RequestOptions with custom timeout + pub fn with_timeout(timeout: Duration) -> Self { + Self { + timeout: Some(timeout), + ..Default::default() + } + } + + /// Add an extra header + pub fn header(mut self, name: HeaderName, value: HeaderValue) -> Self { + self.extra_headers.insert(name, value); + self + } + + /// Add an extra query parameter + pub fn query(mut self, key: impl Into>, value: impl Into>) -> Self { + self.extra_query.push((key.into(), value.into())); + self + } + + /// Allow long non-streaming requests + pub fn allow_long_request(mut self) -> Self { + self.allow_long_non_streaming = true; + self + } +} + +/// Builder for constructing a Client +#[derive(Debug, Clone)] +pub struct ClientBuilder { + /// API key for authentication + api_key: Option>, + + /// OAuth token for authentication + auth_token: Option>, + + /// Base URL for API requests + base_url: Arc, + + /// Default timeout for requests + timeout: Duration, + + /// Maximum number of retries for failed requests + max_retries: u32, + + /// Beta features to enable + beta: Vec>, + + /// Default headers to include in all requests + default_headers: HeaderMap, +} + +impl Default for ClientBuilder { + fn default() -> Self { + Self::new() + } +} + +impl ClientBuilder { + /// Create a new ClientBuilder with default values + pub fn new() -> Self { + Self { + api_key: None, + auth_token: None, + base_url: Arc::from(DEFAULT_BASE_URL), + timeout: DEFAULT_TIMEOUT, + max_retries: 2, 
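+            // (2 retries and the 10-minute default timeout mirror the official SDK defaults.)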
+ beta: Vec::new(), + default_headers: HeaderMap::new(), + } + } + + /// Create a ClientBuilder from environment variables + /// + /// Reads from: + /// - `ANTHROPIC_API_KEY` - API key + /// - `ANTHROPIC_AUTH_TOKEN` - OAuth token (takes precedence over API key) + /// - `ANTHROPIC_BASE_URL` - Base URL override + pub fn from_env() -> Self { + let mut builder = Self::new(); + + if let Ok(key) = env::var(ENV_API_KEY) { + builder.api_key = Some(Arc::from(key)); + } + + if let Ok(token) = env::var(ENV_AUTH_TOKEN) { + builder.auth_token = Some(Arc::from(token)); + } + + if let Ok(url) = env::var(ENV_BASE_URL) { + builder.base_url = Arc::from(url); + } + + builder + } + + /// Set the API key + pub fn api_key(mut self, key: impl Into>) -> Self { + self.api_key = Some(key.into()); + self + } + + /// Set the OAuth token + pub fn auth_token(mut self, token: impl Into>) -> Self { + self.auth_token = Some(token.into()); + self + } + + /// Set the base URL + pub fn base_url(mut self, url: impl Into>) -> Self { + self.base_url = url.into(); + self + } + + /// Set the default timeout + pub fn timeout(mut self, timeout: Duration) -> Self { + self.timeout = timeout; + self + } + + /// Set the maximum number of retries + pub fn max_retries(mut self, retries: u32) -> Self { + self.max_retries = retries; + self + } + + /// Add a beta feature + pub fn beta(mut self, feature: impl Into>) -> Self { + self.beta.push(feature.into()); + self + } + + /// Add multiple beta features + pub fn betas(mut self, features: impl IntoIterator>>) -> Self { + self.beta.extend(features.into_iter().map(Into::into)); + self + } + + /// Add a default header + pub fn header(mut self, name: HeaderName, value: HeaderValue) -> Self { + self.default_headers.insert(name, value); + self + } + + /// Validate and build the configuration + pub fn build(self) -> Result { + // Determine auth mode - OAuth takes precedence + let auth = if let Some(token) = self.auth_token { + AuthMode::OAuth(token) + } else if let Some(key) = self.api_key { + AuthMode::ApiKey(key) + } else { + return Err(AnthropicError::configuration( + "No API key or OAuth token provided. 
Set ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN environment variable.", + )); + }; + + // Build default headers + let mut headers = self.default_headers; + + // Add required headers + headers.insert( + "anthropic-version", + HeaderValue::from_static(DEFAULT_API_VERSION), + ); + headers.insert("content-type", HeaderValue::from_static("application/json")); + + // Add beta header if features are specified + if !self.beta.is_empty() { + let beta_value = self.beta.join(","); + if let Ok(value) = HeaderValue::from_str(&beta_value) { + headers.insert("anthropic-beta", value); + } + } + + // Apply auth headers + auth.apply_to_headers(&mut headers); + + Ok(ClientConfig { + auth, + base_url: self.base_url, + timeout: self.timeout, + max_retries: self.max_retries, + default_headers: headers, + }) + } +} + +/// Validated client configuration +#[derive(Debug, Clone)] +pub struct ClientConfig { + /// Authentication mode + pub(crate) auth: AuthMode, + + /// Base URL for API requests + pub(crate) base_url: Arc, + + /// Default timeout for requests + pub(crate) timeout: Duration, + + /// Maximum number of retries + pub(crate) max_retries: u32, + + /// Default headers for all requests + pub(crate) default_headers: HeaderMap, +} + +impl ClientConfig { + /// Get the authentication mode + pub fn auth(&self) -> &AuthMode { + &self.auth + } + + /// Get the base URL + pub fn base_url(&self) -> &str { + &self.base_url + } + + /// Get the default timeout + pub fn timeout(&self) -> Duration { + self.timeout + } + + /// Get the maximum retries + pub fn max_retries(&self) -> u32 { + self.max_retries + } + + /// Get a reference to the default headers + pub fn headers(&self) -> &HeaderMap { + &self.default_headers + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_builder_with_api_key() { + let config = ClientBuilder::new() + .api_key("test-key") + .build() + .expect("should build"); + + assert!(matches!(config.auth, AuthMode::ApiKey(_))); + assert_eq!(config.base_url(), DEFAULT_BASE_URL); + } + + #[test] + fn test_builder_with_oauth() { + let config = ClientBuilder::new() + .auth_token("test-token") + .build() + .expect("should build"); + + assert!(matches!(config.auth, AuthMode::OAuth(_))); + } + + #[test] + fn test_oauth_takes_precedence() { + let config = ClientBuilder::new() + .api_key("api-key") + .auth_token("oauth-token") + .build() + .expect("should build"); + + assert!(matches!(config.auth, AuthMode::OAuth(_))); + } + + #[test] + fn test_builder_no_auth_fails() { + let result = ClientBuilder::new().build(); + assert!(result.is_err()); + } + + #[test] + fn test_builder_with_beta() { + let config = ClientBuilder::new() + .api_key("test-key") + .beta("feature-1") + .beta("feature-2") + .build() + .expect("should build"); + + let beta_header = config.default_headers.get("anthropic-beta"); + assert!(beta_header.is_some()); + assert_eq!( + beta_header.unwrap().to_str().unwrap(), + "feature-1,feature-2" + ); + } +} diff --git a/crates/rullm-anthropic/src/error.rs b/crates/rullm-anthropic/src/error.rs new file mode 100644 index 00000000..3e303fdd --- /dev/null +++ b/crates/rullm-anthropic/src/error.rs @@ -0,0 +1,140 @@ +//! 
Error types for the Anthropic client + +use reqwest::StatusCode; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use thiserror::Error; + +/// Main error type for the Anthropic client +#[derive(Error, Debug)] +pub enum AnthropicError { + /// API error returned by Anthropic + #[error("API error ({status}): {error}")] + Api { + status: StatusCode, + request_id: Option>, + error: ErrorObject, + }, + + /// HTTP transport error + #[error("Transport error: {0}")] + Transport(#[from] reqwest::Error), + + /// JSON serialization/deserialization error + #[error("Serialization error: {message}")] + Serialization { + message: Arc, + #[source] + source: Box, + }, + + /// Request timeout + #[error("Request timed out")] + Timeout, + + /// Invalid request configuration + #[error("Invalid request: {0}")] + InvalidRequest(Arc), + + /// Configuration error (missing API key, etc.) + #[error("Configuration error: {0}")] + Configuration(Arc), +} + +impl AnthropicError { + /// Create a serialization error + pub fn serialization( + message: impl Into>, + source: impl Into>, + ) -> Self { + Self::Serialization { + message: message.into(), + source: source.into(), + } + } + + /// Create an invalid request error + pub fn invalid_request(message: impl Into>) -> Self { + Self::InvalidRequest(message.into()) + } + + /// Create a configuration error + pub fn configuration(message: impl Into>) -> Self { + Self::Configuration(message.into()) + } + + /// Create an API error + pub fn api(status: StatusCode, request_id: Option>, error: ErrorObject) -> Self { + Self::Api { + status, + request_id, + error, + } + } + + /// Get the request ID if available + pub fn request_id(&self) -> Option<&str> { + match self { + Self::Api { request_id, .. } => request_id.as_deref(), + _ => None, + } + } + + /// Check if this is a retryable error + pub fn is_retryable(&self) -> bool { + match self { + Self::Api { status, error, .. 
} => { + // Rate limit (429) and overloaded (529) are retryable + *status == StatusCode::TOO_MANY_REQUESTS + || status.as_u16() == 529 + || error.error_type.as_ref() == "overloaded_error" + } + Self::Transport(e) => e.is_timeout() || e.is_connect(), + Self::Timeout => true, + _ => false, + } + } +} + +/// Error object returned by the Anthropic API +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ErrorObject { + /// Error type (e.g., "invalid_request_error", "authentication_error") + #[serde(rename = "type")] + pub error_type: Arc, + + /// Human-readable error message + pub message: Arc, + + /// Parameter that caused the error (if applicable) + #[serde(skip_serializing_if = "Option::is_none")] + pub param: Option>, +} + +impl std::fmt::Display for ErrorObject { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(param) = &self.param { + write!( + f, + "{}: {} (param: {})", + self.error_type, self.message, param + ) + } else { + write!(f, "{}: {}", self.error_type, self.message) + } + } +} + +/// API error response wrapper +#[derive(Debug, Clone, Deserialize)] +pub struct ApiErrorResponse { + /// Always "error" for error responses + #[serde(rename = "type")] + pub response_type: Arc, + + /// The error details + pub error: ErrorObject, +} + +/// Result type alias for Anthropic operations +pub type Result = std::result::Result; diff --git a/crates/rullm-anthropic/src/lib.rs b/crates/rullm-anthropic/src/lib.rs index b93cf3ff..874dfce4 100644 --- a/crates/rullm-anthropic/src/lib.rs +++ b/crates/rullm-anthropic/src/lib.rs @@ -1,14 +1,126 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right -} +//! Anthropic Messages API Rust Client +//! +//! A feature-complete, idiomatic Rust client for the Anthropic Messages API. +//! +//! # Features +//! +//! - Full Messages API support with all parameters +//! - Ergonomic builder patterns for requests +//! - Streaming support with high-level helpers +//! - Strong typing for all API objects +//! - Comprehensive error handling +//! +//! # Quick Start +//! +//! ```no_run +//! use rullm_anthropic::{Client, Message, MessagesRequest, RequestOptions}; +//! +//! # async fn example() -> Result<(), rullm_anthropic::AnthropicError> { +//! // Create client from environment +//! let client = Client::from_env()?; +//! +//! // Build a request +//! let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 1024) +//! .system("You are a helpful assistant.") +//! .message(Message::user("Hello!")) +//! .temperature(0.7) +//! .build(); +//! +//! // Send request +//! let response = client.messages().create(request, RequestOptions::default()).await?; +//! println!("{}", response.text()); +//! # Ok(()) +//! # } +//! ``` +//! +//! # Streaming +//! +//! ```no_run +//! use rullm_anthropic::{Client, Message, MessagesRequest, RequestOptions}; +//! use futures::StreamExt; +//! use std::pin::pin; +//! +//! # async fn example() -> Result<(), rullm_anthropic::AnthropicError> { +//! let client = Client::from_env()?; +//! let messages = client.messages(); +//! +//! let request = MessagesRequest::builder("claude-3-5-sonnet-20241022", 1024) +//! .message(Message::user("Tell me a story")) +//! .build(); +//! +//! let stream = messages.stream(request, RequestOptions::default()).await?; +//! let mut text_stream = pin!(stream.text_stream()); +//! +//! while let Some(chunk) = text_stream.next().await { +//! print!("{}", chunk?); +//! } +//! # Ok(()) +//! # } +//! ``` +//! +//! # Authentication +//! +//! 
The client supports two authentication methods: +//! +//! - **API Key**: Set `ANTHROPIC_API_KEY` environment variable +//! - **OAuth Token**: Set `ANTHROPIC_AUTH_TOKEN` environment variable (takes precedence) +//! +//! You can also configure authentication programmatically: +//! +//! ```no_run +//! use rullm_anthropic::Client; +//! +//! # fn example() -> Result<(), rullm_anthropic::AnthropicError> { +//! let client = Client::builder() +//! .api_key("your-api-key") +//! .build()?; +//! # Ok(()) +//! # } +//! ``` -#[cfg(test)] -mod tests { - use super::*; +pub mod client; +pub mod config; +pub mod error; +pub mod messages; +pub mod transport; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} +// Re-export main types at crate root for convenience +pub use client::{Client, MessagesClient}; +pub use config::{ClientBuilder, ClientConfig, RequestOptions}; +pub use error::{AnthropicError, ErrorObject, Result}; +pub use messages::{ + // System content + CacheControl, + // Content blocks + ContentBlock, + ContentBlockParam, + // Other types + CountTokensRequest, + CountTokensResponse, + // Tools + CustomTool, + // Streaming + Delta, + DocumentSource, + ImageSource, + // Request/Response + Message, + MessageContent, + MessageStream, + MessagesRequest, + MessagesRequestBuilder, + MessagesResponse, + Metadata, + Role, + ServerTool, + ServiceTier, + StopReason, + StreamEvent, + SystemBlock, + SystemContent, + ThinkingConfig, + Tool, + ToolChoice, + ToolResultContent, + Usage, +}; diff --git a/crates/rullm-anthropic/src/messages/mod.rs b/crates/rullm-anthropic/src/messages/mod.rs new file mode 100644 index 00000000..0ff2c680 --- /dev/null +++ b/crates/rullm-anthropic/src/messages/mod.rs @@ -0,0 +1,12 @@ +//! Messages API module +//! +//! Contains types for requests, responses, and streaming. + +pub mod stream; +pub mod types; + +pub use stream::{ + ContentBlockStartData, Delta, MessageAccumulator, MessageDeltaData, MessageStartData, + MessageStream, StreamErrorData, StreamEvent, parse_sse_stream, +}; +pub use types::*; diff --git a/crates/rullm-anthropic/src/messages/stream.rs b/crates/rullm-anthropic/src/messages/stream.rs new file mode 100644 index 00000000..5d8872b1 --- /dev/null +++ b/crates/rullm-anthropic/src/messages/stream.rs @@ -0,0 +1,623 @@ +//! Streaming support for the Messages API +//! +//! Provides SSE parsing, streaming event types, and message accumulation. 
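+//!
+//! A minimal sketch of driving `MessageAccumulator` by hand; the event values below
+//! are fabricated for illustration, whereas in practice the events come from
+//! `parse_sse_stream` or a `MessageStream`.
+//!
+//! ```
+//! use rullm_anthropic::messages::{ContentBlockStartData, Delta, MessageAccumulator, StreamEvent};
+//! use std::sync::Arc;
+//!
+//! let mut acc = MessageAccumulator::new();
+//!
+//! // A text block opens at index 0, then two deltas extend it.
+//! acc.process(&StreamEvent::ContentBlockStart {
+//!     index: 0,
+//!     content_block: ContentBlockStartData::Text { text: Arc::from("") },
+//! });
+//! acc.process(&StreamEvent::ContentBlockDelta {
+//!     index: 0,
+//!     delta: Delta::TextDelta { text: Arc::from("Hello, ") },
+//! });
+//! acc.process(&StreamEvent::ContentBlockDelta {
+//!     index: 0,
+//!     delta: Delta::TextDelta { text: Arc::from("world!") },
+//! });
+//!
+//! assert_eq!(acc.current_text(), "Hello, world!");
+//! ```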
+ +use crate::error::{AnthropicError, Result}; +use crate::messages::types::{ContentBlock, MessagesResponse, Role, StopReason, Usage}; +use futures::{Stream, StreamExt}; +use serde::{Deserialize, Serialize}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +// ============================================================================= +// Stream Event Types +// ============================================================================= + +/// Streaming event from the Messages API +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum StreamEvent { + /// Start of message + MessageStart { message: MessageStartData }, + + /// Start of content block + ContentBlockStart { + index: u32, + content_block: ContentBlockStartData, + }, + + /// Incremental content delta + ContentBlockDelta { index: u32, delta: Delta }, + + /// End of content block + ContentBlockStop { index: u32 }, + + /// Message delta (stop reason, usage) + MessageDelta { + delta: MessageDeltaData, + usage: Usage, + }, + + /// End of stream + MessageStop, + + /// Ping event (keep-alive) + Ping, + + /// Error event + Error { error: StreamErrorData }, +} + +/// Message start data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MessageStartData { + pub id: Arc, + #[serde(rename = "type")] + pub message_type: Arc, + pub role: Role, + pub content: Vec, + pub model: Arc, + pub stop_reason: Option, + pub stop_sequence: Option>, + pub usage: Usage, +} + +/// Content block start data +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentBlockStartData { + Text { text: Arc }, + ToolUse { id: Arc, name: Arc }, + Thinking { thinking: Arc }, + ServerToolUse { id: Arc, name: Arc }, +} + +/// Delta (incremental change) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum Delta { + /// Text delta + TextDelta { text: Arc }, + /// Tool input delta + InputJsonDelta { partial_json: Arc }, + /// Thinking delta + ThinkingDelta { thinking: Arc }, +} + +impl Delta { + /// Get text content if this is a text delta + pub fn as_text(&self) -> Option<&str> { + match self { + Self::TextDelta { text } => Some(text), + _ => None, + } + } +} + +/// Message delta data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MessageDeltaData { + pub stop_reason: Option, + pub stop_sequence: Option>, +} + +/// Stream error data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StreamErrorData { + #[serde(rename = "type")] + pub error_type: Arc, + pub message: Arc, +} + +// ============================================================================= +// SSE Parser +// ============================================================================= + +/// Parses Server-Sent Events from a byte stream +pub struct SseParser { + stream: S, + buffer: String, + event_queue: Vec, +} + +impl SseParser +where + S: Stream> + Unpin, +{ + /// Create a new SSE parser + pub fn new(stream: S) -> Self { + Self { + stream, + buffer: String::new(), + event_queue: Vec::new(), + } + } + + fn parse_events(&mut self) { + // Split by SSE event delimiter "\n\n" + while let Some(pos) = self.buffer.find("\n\n") { + let event_block = self.buffer[..pos].to_string(); + self.buffer.drain(..pos + 2); + + // Extract data lines + for line in event_block.lines() { + if let Some(data) = line.strip_prefix("data: ") { + // Skip [DONE] messages (though Anthropic doesn't use 
these) + let trimmed = data.trim(); + if !trimmed.is_empty() && trimmed != "[DONE]" { + self.event_queue.push(data.to_string()); + } + } + } + } + } +} + +impl Stream for SseParser +where + S: Stream> + Unpin, +{ + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + // Return queued events first + if !self.event_queue.is_empty() { + return Poll::Ready(Some(Ok(self.event_queue.remove(0)))); + } + + // Parse any complete events from buffer + self.parse_events(); + if !self.event_queue.is_empty() { + return Poll::Ready(Some(Ok(self.event_queue.remove(0)))); + } + + // Get more data from stream + match Pin::new(&mut self.stream).poll_next(cx) { + Poll::Ready(Some(Ok(bytes))) => { + match std::str::from_utf8(&bytes) { + Ok(text) => { + // Normalize CRLF to LF + let normalized = text.replace("\r\n", "\n"); + self.buffer.push_str(&normalized); + } + Err(e) => { + return Poll::Ready(Some(Err(AnthropicError::serialization( + "Invalid UTF-8 in SSE stream", + Box::new(e), + )))); + } + } + } + Poll::Ready(Some(Err(e))) => { + return Poll::Ready(Some(Err(AnthropicError::Transport(e)))); + } + Poll::Ready(None) => { + // Stream ended, parse remaining + self.parse_events(); + if !self.event_queue.is_empty() { + return Poll::Ready(Some(Ok(self.event_queue.remove(0)))); + } + return Poll::Ready(None); + } + Poll::Pending => return Poll::Pending, + } + } + } +} + +/// Parse raw SSE data into stream events +pub fn parse_sse_stream(stream: S) -> impl Stream> +where + S: Stream> + Unpin, +{ + SseParser::new(stream).map(|result| { + result.and_then(|data| { + serde_json::from_str::(&data).map_err(|e| { + AnthropicError::serialization(format!("Failed to parse event: {data}"), Box::new(e)) + }) + }) + }) +} + +// ============================================================================= +// Message Accumulator +// ============================================================================= + +/// Accumulates streaming events into a complete message +#[derive(Debug, Clone)] +pub struct MessageAccumulator { + id: Option>, + model: Option>, + role: Role, + content_blocks: Vec, + stop_reason: Option, + stop_sequence: Option>, + usage: Usage, +} + +#[derive(Debug, Clone)] +enum AccumulatingBlock { + Text(String), + ToolUse { + id: Arc, + name: Arc, + partial_json: String, + }, + Thinking(String), + ServerToolUse { + id: Arc, + name: Arc, + partial_json: String, + }, +} + +impl Default for MessageAccumulator { + fn default() -> Self { + Self::new() + } +} + +impl MessageAccumulator { + /// Create a new accumulator + pub fn new() -> Self { + Self { + id: None, + model: None, + role: Role::Assistant, + content_blocks: Vec::new(), + stop_reason: None, + stop_sequence: None, + usage: Usage::default(), + } + } + + /// Process a stream event + pub fn process(&mut self, event: &StreamEvent) { + match event { + StreamEvent::MessageStart { message } => { + self.id = Some(message.id.clone()); + self.model = Some(message.model.clone()); + self.role = message.role; + self.usage = message.usage.clone(); + } + StreamEvent::ContentBlockStart { + index, + content_block, + } => { + let idx = *index as usize; + // Ensure we have enough slots + while self.content_blocks.len() <= idx { + self.content_blocks + .push(AccumulatingBlock::Text(String::new())); + } + + self.content_blocks[idx] = match content_block { + ContentBlockStartData::Text { text } => { + AccumulatingBlock::Text(text.to_string()) + } + ContentBlockStartData::ToolUse { id, name } => AccumulatingBlock::ToolUse { + 
id: id.clone(), + name: name.clone(), + partial_json: String::new(), + }, + ContentBlockStartData::Thinking { thinking } => { + AccumulatingBlock::Thinking(thinking.to_string()) + } + ContentBlockStartData::ServerToolUse { id, name } => { + AccumulatingBlock::ServerToolUse { + id: id.clone(), + name: name.clone(), + partial_json: String::new(), + } + } + }; + } + StreamEvent::ContentBlockDelta { index, delta } => { + let idx = *index as usize; + if idx < self.content_blocks.len() { + match (&mut self.content_blocks[idx], delta) { + (AccumulatingBlock::Text(text), Delta::TextDelta { text: new_text }) => { + text.push_str(new_text); + } + ( + AccumulatingBlock::ToolUse { partial_json, .. }, + Delta::InputJsonDelta { + partial_json: new_json, + }, + ) => { + partial_json.push_str(new_json); + } + ( + AccumulatingBlock::Thinking(thinking), + Delta::ThinkingDelta { + thinking: new_thinking, + }, + ) => { + thinking.push_str(new_thinking); + } + ( + AccumulatingBlock::ServerToolUse { partial_json, .. }, + Delta::InputJsonDelta { + partial_json: new_json, + }, + ) => { + partial_json.push_str(new_json); + } + _ => {} + } + } + } + StreamEvent::MessageDelta { delta, usage } => { + self.stop_reason = delta.stop_reason; + self.stop_sequence = delta.stop_sequence.clone(); + self.usage = usage.clone(); + } + StreamEvent::ContentBlockStop { .. } | StreamEvent::MessageStop | StreamEvent::Ping => { + } + StreamEvent::Error { error } => { + // Log error but don't fail accumulation + eprintln!("Stream error: {}: {}", error.error_type, error.message); + } + } + } + + /// Get current text content (for streaming display) + pub fn current_text(&self) -> String { + self.content_blocks + .iter() + .filter_map(|block| match block { + AccumulatingBlock::Text(text) => Some(text.as_str()), + _ => None, + }) + .collect::>() + .join("") + } + + /// Build the final response + pub fn build(self) -> Result { + let id = self.id.ok_or_else(|| { + AnthropicError::serialization( + "Missing message ID in stream", + Box::new(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "missing id", + )), + ) + })?; + + let model = self.model.ok_or_else(|| { + AnthropicError::serialization( + "Missing model in stream", + Box::new(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "missing model", + )), + ) + })?; + + let content: Vec = self + .content_blocks + .into_iter() + .map(|block| match block { + AccumulatingBlock::Text(text) => ContentBlock::Text { + text: Arc::from(text), + }, + AccumulatingBlock::ToolUse { + id, + name, + partial_json, + } => { + let input = + serde_json::from_str(&partial_json).unwrap_or(serde_json::Value::Null); + ContentBlock::ToolUse { id, name, input } + } + AccumulatingBlock::Thinking(thinking) => ContentBlock::Thinking { + thinking: Arc::from(thinking), + }, + AccumulatingBlock::ServerToolUse { + id, + name, + partial_json, + } => { + let input = + serde_json::from_str(&partial_json).unwrap_or(serde_json::Value::Null); + ContentBlock::ServerToolUse { id, name, input } + } + }) + .collect(); + + Ok(MessagesResponse { + id, + response_type: Arc::from("message"), + role: self.role, + content, + model, + stop_reason: self.stop_reason, + stop_sequence: self.stop_sequence, + usage: self.usage, + }) + } +} + +// ============================================================================= +// Message Stream +// ============================================================================= + +/// High-level stream wrapper for consuming streaming responses +pub struct MessageStream +where + S: 
Stream> + Unpin, +{ + inner: S, + accumulator: MessageAccumulator, + finished: bool, +} + +impl MessageStream +where + S: Stream> + Unpin, +{ + /// Create a new message stream + pub fn new(stream: S) -> Self { + Self { + inner: stream, + accumulator: MessageAccumulator::new(), + finished: false, + } + } + + /// Get a stream of text chunks only + pub fn text_stream(self) -> impl Stream>> { + async_stream::stream! { + let mut stream = self; + while let Some(event) = stream.inner.next().await { + match event { + Ok(StreamEvent::ContentBlockDelta { delta: Delta::TextDelta { text }, .. }) => { + yield Ok(text); + } + Ok(event) => { + stream.accumulator.process(&event); + } + Err(e) => { + yield Err(e); + break; + } + } + } + } + } + + /// Get the final accumulated message after consuming the stream + pub async fn final_message(mut self) -> Result { + while let Some(event) = self.inner.next().await { + let event = event?; + self.accumulator.process(&event); + if matches!(event, StreamEvent::MessageStop) { + self.finished = true; + break; + } + } + self.accumulator.build() + } +} + +impl Stream for MessageStream +where + S: Stream> + Unpin, +{ + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.finished { + return Poll::Ready(None); + } + + match Pin::new(&mut self.inner).poll_next(cx) { + Poll::Ready(Some(Ok(event))) => { + self.accumulator.process(&event); + if matches!(event, StreamEvent::MessageStop) { + self.finished = true; + } + Poll::Ready(Some(Ok(event))) + } + Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))), + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use futures::stream; + + fn bytes_from_str(s: &str) -> bytes::Bytes { + bytes::Bytes::from(s.to_string()) + } + + #[tokio::test] + async fn test_sse_parser_single_event() { + let data = vec![Ok(bytes_from_str("data: {\"type\":\"ping\"}\n\n"))]; + let stream = stream::iter(data); + let mut parser = SseParser::new(stream); + + let result = parser.next().await; + assert!(result.is_some()); + assert_eq!(result.unwrap().unwrap(), "{\"type\":\"ping\"}"); + } + + #[tokio::test] + async fn test_sse_parser_multiple_events() { + let data = vec![Ok(bytes_from_str( + "data: {\"type\":\"ping\"}\n\ndata: {\"type\":\"message_stop\"}\n\n", + ))]; + let stream = stream::iter(data); + let parser = SseParser::new(stream); + let results: Vec<_> = parser.collect().await; + + assert_eq!(results.len(), 2); + assert!(results[0].is_ok()); + assert!(results[1].is_ok()); + } + + #[tokio::test] + async fn test_sse_parser_chunked() { + let data = vec![ + Ok(bytes_from_str("data: {\"type\":")), + Ok(bytes_from_str("\"ping\"}\n\n")), + ]; + let stream = stream::iter(data); + let mut parser = SseParser::new(stream); + + let result = parser.next().await; + assert!(result.is_some()); + assert_eq!(result.unwrap().unwrap(), "{\"type\":\"ping\"}"); + } + + #[tokio::test] + async fn test_accumulator_text() { + let mut acc = MessageAccumulator::new(); + + acc.process(&StreamEvent::MessageStart { + message: MessageStartData { + id: Arc::from("msg_123"), + message_type: Arc::from("message"), + role: Role::Assistant, + content: vec![], + model: Arc::from("claude-3-5-sonnet-20241022"), + stop_reason: None, + stop_sequence: None, + usage: Usage::default(), + }, + }); + + acc.process(&StreamEvent::ContentBlockStart { + index: 0, + content_block: ContentBlockStartData::Text { + text: Arc::from(""), + }, + }); + + 
acc.process(&StreamEvent::ContentBlockDelta { + index: 0, + delta: Delta::TextDelta { + text: Arc::from("Hello"), + }, + }); + + acc.process(&StreamEvent::ContentBlockDelta { + index: 0, + delta: Delta::TextDelta { + text: Arc::from(" world!"), + }, + }); + + assert_eq!(acc.current_text(), "Hello world!"); + + let response = acc.build().unwrap(); + assert_eq!(response.text(), "Hello world!"); + } +} diff --git a/crates/rullm-anthropic/src/messages/types.rs b/crates/rullm-anthropic/src/messages/types.rs new file mode 100644 index 00000000..d976f464 --- /dev/null +++ b/crates/rullm-anthropic/src/messages/types.rs @@ -0,0 +1,910 @@ +//! Types for the Anthropic Messages API +//! +//! This module contains comprehensive type definitions for requests, responses, +//! content blocks, tools, and streaming events. + +use serde::{Deserialize, Serialize}; +use std::sync::Arc; + +// ============================================================================= +// Request Types +// ============================================================================= + +/// Messages API request with all Anthropic parameters +#[derive(Debug, Clone, Serialize)] +pub struct MessagesRequest { + /// The model to use (e.g., "claude-3-5-sonnet-20241022") + pub model: Arc, + + /// The maximum number of tokens to generate + pub max_tokens: u32, + + /// Input messages for the conversation + pub messages: Vec, + + /// System prompt(s) to guide the model's behavior + #[serde(skip_serializing_if = "Option::is_none")] + pub system: Option, + + /// Metadata about the request + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option, + + /// Custom sequences that will cause the model to stop generating + #[serde(skip_serializing_if = "Option::is_none")] + pub stop_sequences: Option>>, + + /// Whether to incrementally stream the response + #[serde(skip_serializing_if = "Option::is_none")] + pub stream: Option, + + /// Amount of randomness injected into the response (0.0 to 1.0) + #[serde(skip_serializing_if = "Option::is_none")] + pub temperature: Option, + + /// Use nucleus sampling (0.0 to 1.0) + #[serde(skip_serializing_if = "Option::is_none")] + pub top_p: Option, + + /// Only sample from the top K options for each subsequent token + #[serde(skip_serializing_if = "Option::is_none")] + pub top_k: Option, + + /// Definitions of tools that the model may use + #[serde(skip_serializing_if = "Option::is_none")] + pub tools: Option>, + + /// How the model should use the provided tools + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_choice: Option, + + /// Configuration for extended thinking + #[serde(skip_serializing_if = "Option::is_none")] + pub thinking: Option, + + /// Service tier to use + #[serde(skip_serializing_if = "Option::is_none")] + pub service_tier: Option, +} + +/// Builder for constructing MessagesRequest +#[derive(Debug, Clone)] +pub struct MessagesRequestBuilder { + model: Arc, + max_tokens: u32, + messages: Vec, + system: Option, + metadata: Option, + stop_sequences: Option>>, + temperature: Option, + top_p: Option, + top_k: Option, + tools: Option>, + tool_choice: Option, + thinking: Option, + service_tier: Option, +} + +impl MessagesRequestBuilder { + /// Create a new builder with required fields + pub fn new(model: impl Into>, max_tokens: u32) -> Self { + Self { + model: model.into(), + max_tokens, + messages: Vec::new(), + system: None, + metadata: None, + stop_sequences: None, + temperature: None, + top_p: None, + top_k: None, + tools: None, + tool_choice: None, + thinking: 
None, + service_tier: None, + } + } + + /// Add a message to the conversation + pub fn message(mut self, message: Message) -> Self { + self.messages.push(message); + self + } + + /// Add multiple messages to the conversation + pub fn messages(mut self, messages: impl IntoIterator) -> Self { + self.messages.extend(messages); + self + } + + /// Set a simple text system prompt + pub fn system(mut self, content: impl Into>) -> Self { + self.system = Some(SystemContent::Text(content.into())); + self + } + + /// Set system prompt with blocks (for cache control) + pub fn system_blocks(mut self, blocks: Vec) -> Self { + self.system = Some(SystemContent::Blocks(blocks)); + self + } + + /// Set metadata + pub fn metadata(mut self, metadata: Metadata) -> Self { + self.metadata = Some(metadata); + self + } + + /// Set custom stop sequences + pub fn stop_sequences( + mut self, + sequences: impl IntoIterator>>, + ) -> Self { + self.stop_sequences = Some(sequences.into_iter().map(Into::into).collect()); + self + } + + /// Set temperature (0.0 to 1.0) + pub fn temperature(mut self, temp: f32) -> Self { + self.temperature = Some(temp); + self + } + + /// Set top_p for nucleus sampling + pub fn top_p(mut self, p: f32) -> Self { + self.top_p = Some(p); + self + } + + /// Set top_k sampling + pub fn top_k(mut self, k: u32) -> Self { + self.top_k = Some(k); + self + } + + /// Set available tools + pub fn tools(mut self, tools: Vec) -> Self { + self.tools = Some(tools); + self + } + + /// Set tool choice mode + pub fn tool_choice(mut self, choice: ToolChoice) -> Self { + self.tool_choice = Some(choice); + self + } + + /// Enable extended thinking + pub fn thinking(mut self, config: ThinkingConfig) -> Self { + self.thinking = Some(config); + self + } + + /// Set service tier + pub fn service_tier(mut self, tier: ServiceTier) -> Self { + self.service_tier = Some(tier); + self + } + + /// Build the request + pub fn build(self) -> MessagesRequest { + MessagesRequest { + model: self.model, + max_tokens: self.max_tokens, + messages: self.messages, + system: self.system, + metadata: self.metadata, + stop_sequences: self.stop_sequences, + stream: None, + temperature: self.temperature, + top_p: self.top_p, + top_k: self.top_k, + tools: self.tools, + tool_choice: self.tool_choice, + thinking: self.thinking, + service_tier: self.service_tier, + } + } +} + +impl MessagesRequest { + /// Create a new builder + pub fn builder(model: impl Into>, max_tokens: u32) -> MessagesRequestBuilder { + MessagesRequestBuilder::new(model, max_tokens) + } +} + +// ============================================================================= +// Message Types +// ============================================================================= + +/// A message in the conversation +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Message { + /// The role of the message sender + pub role: Role, + + /// The content of the message + pub content: MessageContent, +} + +impl Message { + /// Create a user message with text content + pub fn user(content: impl Into>) -> Self { + Self { + role: Role::User, + content: MessageContent::Text(content.into()), + } + } + + /// Create an assistant message with text content + pub fn assistant(content: impl Into>) -> Self { + Self { + role: Role::Assistant, + content: MessageContent::Text(content.into()), + } + } + + /// Create a user message with content blocks + pub fn user_with_blocks(blocks: Vec) -> Self { + Self { + role: Role::User, + content: MessageContent::Blocks(blocks), + } + } + + /// Create 
an assistant message with content blocks + pub fn assistant_with_blocks(blocks: Vec) -> Self { + Self { + role: Role::Assistant, + content: MessageContent::Blocks(blocks), + } + } +} + +/// Role of the message sender +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum Role { + /// User message + User, + /// Assistant message (model response) + Assistant, +} + +/// Message content can be text or array of content blocks +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum MessageContent { + /// Simple text content + Text(Arc), + /// Array of content blocks (for multimodal inputs, tool use, etc.) + Blocks(Vec), +} + +// ============================================================================= +// Content Block Types (Input) +// ============================================================================= + +/// Input content block (for request messages) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentBlockParam { + /// Text content + Text { + text: Arc, + #[serde(skip_serializing_if = "Option::is_none")] + cache_control: Option, + }, + + /// Image content + Image { + source: ImageSource, + #[serde(skip_serializing_if = "Option::is_none")] + cache_control: Option, + }, + + /// Document content (PDF, etc.) + Document { + source: DocumentSource, + #[serde(skip_serializing_if = "Option::is_none")] + cache_control: Option, + }, + + /// Tool result (response from a tool call) + ToolResult { + tool_use_id: Arc, + #[serde(skip_serializing_if = "Option::is_none")] + content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + is_error: Option, + #[serde(skip_serializing_if = "Option::is_none")] + cache_control: Option, + }, +} + +impl ContentBlockParam { + /// Create a text content block + pub fn text(content: impl Into>) -> Self { + Self::Text { + text: content.into(), + cache_control: None, + } + } + + /// Create a text content block with cache control + pub fn text_with_cache(content: impl Into>) -> Self { + Self::Text { + text: content.into(), + cache_control: Some(CacheControl::ephemeral()), + } + } + + /// Create an image from base64 data + pub fn image_base64(media_type: impl Into>, data: impl Into>) -> Self { + Self::Image { + source: ImageSource::Base64 { + media_type: media_type.into(), + data: data.into(), + }, + cache_control: None, + } + } + + /// Create an image from URL + pub fn image_url(url: impl Into>) -> Self { + Self::Image { + source: ImageSource::Url { url: url.into() }, + cache_control: None, + } + } + + /// Create a tool result + pub fn tool_result(tool_use_id: impl Into>, content: impl Into>) -> Self { + Self::ToolResult { + tool_use_id: tool_use_id.into(), + content: Some(ToolResultContent::Text(content.into())), + is_error: None, + cache_control: None, + } + } + + /// Create a tool error result + pub fn tool_error(tool_use_id: impl Into>, error: impl Into>) -> Self { + Self::ToolResult { + tool_use_id: tool_use_id.into(), + content: Some(ToolResultContent::Text(error.into())), + is_error: Some(true), + cache_control: None, + } + } +} + +/// Image source (base64 or URL) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ImageSource { + /// Base64-encoded image + Base64 { + media_type: Arc, + data: Arc, + }, + /// URL reference + Url { url: Arc }, +} + +/// Document source (base64, URL, or text) +#[derive(Debug, Clone, Serialize, Deserialize)] 
+#[serde(tag = "type", rename_all = "snake_case")] +pub enum DocumentSource { + /// Base64-encoded document + Base64 { + media_type: Arc, + data: Arc, + }, + /// URL reference + Url { url: Arc }, + /// Plain text document + Text { text: Arc }, +} + +/// Tool result content +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum ToolResultContent { + /// Simple text result + Text(Arc), + /// Multiple content blocks + Blocks(Vec), +} + +// ============================================================================= +// Content Block Types (Output) +// ============================================================================= + +/// Output content block (from response) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentBlock { + /// Text content + Text { text: Arc }, + + /// Tool use request + ToolUse { + id: Arc, + name: Arc, + input: serde_json::Value, + }, + + /// Thinking block (extended thinking) + Thinking { thinking: Arc }, + + /// Redacted thinking (for safety) + RedactedThinking { data: Arc }, + + /// Server tool use (bash, text_editor, web_search) + ServerToolUse { + id: Arc, + name: Arc, + input: serde_json::Value, + }, + + /// Web search tool result + WebSearchToolResult { + tool_use_id: Arc, + content: serde_json::Value, + }, +} + +impl ContentBlock { + /// Get text content if this is a text block + pub fn as_text(&self) -> Option<&str> { + match self { + Self::Text { text } => Some(text), + _ => None, + } + } + + /// Get tool use details if this is a tool use block + pub fn as_tool_use(&self) -> Option<(&str, &str, &serde_json::Value)> { + match self { + Self::ToolUse { id, name, input } => Some((id, name, input)), + _ => None, + } + } +} + +// ============================================================================= +// System Content Types +// ============================================================================= + +/// System prompt content (string or blocks) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum SystemContent { + /// Simple text system prompt + Text(Arc), + /// Array of system blocks (for cache control) + Blocks(Vec), +} + +/// A block in the system prompt +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SystemBlock { + /// Block type (always "text") + #[serde(rename = "type")] + pub block_type: Arc, + + /// Text content + pub text: Arc, + + /// Cache control + #[serde(skip_serializing_if = "Option::is_none")] + pub cache_control: Option, +} + +impl SystemBlock { + /// Create a text system block + pub fn text(content: impl Into>) -> Self { + Self { + block_type: Arc::from("text"), + text: content.into(), + cache_control: None, + } + } + + /// Create a text system block with cache control + pub fn text_with_cache(content: impl Into>) -> Self { + Self { + block_type: Arc::from("text"), + text: content.into(), + cache_control: Some(CacheControl::ephemeral()), + } + } +} + +/// Cache control configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CacheControl { + /// Cache type (currently only "ephemeral") + #[serde(rename = "type")] + pub cache_type: Arc, +} + +impl CacheControl { + /// Create an ephemeral cache control + pub fn ephemeral() -> Self { + Self { + cache_type: Arc::from("ephemeral"), + } + } +} + +// ============================================================================= +// Metadata Types +// ============================================================================= + +/// Request 
metadata +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Metadata { + /// An external identifier for the user + #[serde(skip_serializing_if = "Option::is_none")] + pub user_id: Option>, +} + +impl Metadata { + /// Create metadata with user ID + pub fn with_user_id(user_id: impl Into>) -> Self { + Self { + user_id: Some(user_id.into()), + } + } +} + +// ============================================================================= +// Tool Types +// ============================================================================= + +/// Tool definition (custom or server tool) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Tool { + /// Custom tool defined by user + Custom(CustomTool), + /// Server-provided tool + Server(ServerTool), +} + +/// Custom tool definition +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CustomTool { + /// Name of the tool + pub name: Arc, + + /// Description of what the tool does + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option>, + + /// JSON schema for the tool's input + pub input_schema: serde_json::Value, + + /// Cache control + #[serde(skip_serializing_if = "Option::is_none")] + pub cache_control: Option, +} + +impl CustomTool { + /// Create a new custom tool + pub fn new( + name: impl Into>, + description: impl Into>, + input_schema: serde_json::Value, + ) -> Self { + Self { + name: name.into(), + description: Some(description.into()), + input_schema, + cache_control: None, + } + } +} + +/// Server-provided tool (bash, text_editor, web_search) +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ServerTool { + /// Bash tool + #[serde(rename = "bash_20250124")] + Bash { + #[serde(skip_serializing_if = "Option::is_none")] + name: Option>, + }, + + /// Text editor tool (various versions) + #[serde(rename = "text_editor_20250124")] + TextEditor20250124 { + #[serde(skip_serializing_if = "Option::is_none")] + name: Option>, + }, + + #[serde(rename = "text_editor_20250429")] + TextEditor20250429 { + #[serde(skip_serializing_if = "Option::is_none")] + name: Option>, + }, + + /// Web search tool + #[serde(rename = "web_search_20250305")] + WebSearch { + #[serde(skip_serializing_if = "Option::is_none")] + name: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + allowed_domains: Option>>, + #[serde(skip_serializing_if = "Option::is_none")] + blocked_domains: Option>>, + #[serde(skip_serializing_if = "Option::is_none")] + max_uses: Option, + #[serde(skip_serializing_if = "Option::is_none")] + user_location: Option, + }, +} + +/// User location for web search +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct UserLocation { + #[serde(skip_serializing_if = "Option::is_none")] + pub city: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub region: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub country: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub timezone: Option>, +} + +/// Tool choice configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ToolChoice { + /// Let the model decide whether to use tools + Auto { + #[serde(skip_serializing_if = "Option::is_none")] + disable_parallel_tool_use: Option, + }, + /// Model must use at least one tool + Any { + #[serde(skip_serializing_if = "Option::is_none")] + disable_parallel_tool_use: Option, + }, + /// Model must not use any tools + None, + /// Model 
must use a specific tool + Tool { + name: Arc, + #[serde(skip_serializing_if = "Option::is_none")] + disable_parallel_tool_use: Option, + }, +} + +impl ToolChoice { + /// Create auto tool choice + pub fn auto() -> Self { + Self::Auto { + disable_parallel_tool_use: None, + } + } + + /// Create any tool choice + pub fn any() -> Self { + Self::Any { + disable_parallel_tool_use: None, + } + } + + /// Create none tool choice + pub fn none() -> Self { + Self::None + } + + /// Create tool-specific choice + pub fn tool(name: impl Into>) -> Self { + Self::Tool { + name: name.into(), + disable_parallel_tool_use: None, + } + } +} + +// ============================================================================= +// Thinking and Service Tier +// ============================================================================= + +/// Extended thinking configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ThinkingConfig { + /// Thinking type + #[serde(rename = "type")] + pub thinking_type: Arc, + + /// Budget tokens for thinking + pub budget_tokens: u32, +} + +impl ThinkingConfig { + /// Create enabled thinking config with budget + pub fn enabled(budget_tokens: u32) -> Self { + Self { + thinking_type: Arc::from("enabled"), + budget_tokens, + } + } +} + +/// Service tier for request routing +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ServiceTier { + /// Auto-select tier + Auto, + /// Standard tier + Standard, +} + +// ============================================================================= +// Response Types +// ============================================================================= + +/// Messages API response +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MessagesResponse { + /// Unique object identifier + pub id: Arc, + + /// Object type (always "message") + #[serde(rename = "type")] + pub response_type: Arc, + + /// Conversational role of the generated message + pub role: Role, + + /// Content blocks in the response + pub content: Vec, + + /// The model that handled the request + pub model: Arc, + + /// The reason we stopped generating + pub stop_reason: Option, + + /// Which custom stop sequence was generated (if any) + pub stop_sequence: Option>, + + /// Token usage information + pub usage: Usage, +} + +impl MessagesResponse { + /// Get all text content joined as a single string + pub fn text(&self) -> String { + self.content + .iter() + .filter_map(|block| block.as_text()) + .collect::>() + .join("") + } + + /// Get all tool use blocks + pub fn tool_uses(&self) -> Vec<(&str, &str, &serde_json::Value)> { + self.content + .iter() + .filter_map(|block| block.as_tool_use()) + .collect() + } +} + +/// Reason for stopping generation +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum StopReason { + /// Natural end of message + EndTurn, + /// Hit a custom stop sequence + StopSequence, + /// Reached max_tokens + MaxTokens, + /// Model wants to use a tool + ToolUse, +} + +/// Token usage information +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct Usage { + /// Number of input tokens + pub input_tokens: u32, + + /// Number of output tokens + pub output_tokens: u32, + + /// Number of tokens read from cache + #[serde(skip_serializing_if = "Option::is_none")] + pub cache_creation_input_tokens: Option, + + /// Number of tokens used to create cache + #[serde(skip_serializing_if = "Option::is_none")] + pub 
cache_read_input_tokens: Option<u32>,
+}
+
+// =============================================================================
+// Count Tokens Types
+// =============================================================================
+
+/// Count tokens request
+#[derive(Debug, Clone, Serialize)]
+pub struct CountTokensRequest {
+    /// The model to use for counting
+    pub model: Arc<str>,
+
+    /// Messages to count tokens for
+    pub messages: Vec<Message>,
+
+    /// System prompt
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system: Option<SystemContent>,
+
+    /// Tools to include in count
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<Tool>>,
+
+    /// Tool choice to include in count
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_choice: Option<ToolChoice>,
+
+    /// Thinking config to include in count
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub thinking: Option<ThinkingConfig>,
+}
+
+impl CountTokensRequest {
+    /// Create a new count tokens request with required fields
+    pub fn new(model: impl Into<Arc<str>>, messages: Vec<Message>) -> Self {
+        Self {
+            model: model.into(),
+            messages,
+            system: None,
+            tools: None,
+            tool_choice: None,
+            thinking: None,
+        }
+    }
+
+    /// Set the system prompt
+    pub fn with_system(mut self, system: SystemContent) -> Self {
+        self.system = Some(system);
+        self
+    }
+
+    /// Set the tools
+    pub fn with_tools(mut self, tools: Vec<Tool>) -> Self {
+        self.tools = Some(tools);
+        self
+    }
+
+    /// Set the tool choice
+    pub fn with_tool_choice(mut self, tool_choice: ToolChoice) -> Self {
+        self.tool_choice = Some(tool_choice);
+        self
+    }
+
+    /// Set the thinking config
+    pub fn with_thinking(mut self, thinking: ThinkingConfig) -> Self {
+        self.thinking = Some(thinking);
+        self
+    }
+}
+
+/// Count tokens response
+#[derive(Debug, Clone, Deserialize)]
+pub struct CountTokensResponse {
+    /// Number of input tokens
+    pub input_tokens: u32,
+}
diff --git a/crates/rullm-anthropic/src/transport.rs b/crates/rullm-anthropic/src/transport.rs
new file mode 100644
index 00000000..1458f06b
--- /dev/null
+++ b/crates/rullm-anthropic/src/transport.rs
@@ -0,0 +1,231 @@
+//! HTTP transport abstraction for the Anthropic client
+//!
+//! Provides a trait-based transport layer that can be mocked for testing.
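+//!
+//! # Example
+//!
+//! A minimal sketch of how the transport is expected to be driven. The module
+//! paths, `RequestOptions::default()`, and the hand-written JSON body are
+//! illustrative assumptions; the typed services in this crate are meant to wrap
+//! `post_json` with proper request/response structs.
+//!
+//! ```no_run
+//! # async fn run() -> std::result::Result<(), Box<dyn std::error::Error>> {
+//! use rullm_anthropic::config::{ClientBuilder, RequestOptions};
+//! use rullm_anthropic::transport::HttpTransport;
+//!
+//! let config = ClientBuilder::new().api_key("my-api-key").build()?;
+//! let transport = HttpTransport::new(config)?;
+//!
+//! // POST a serializable body and deserialize the JSON response.
+//! let body = serde_json::json!({
+//!     "model": "claude-sonnet-4-20250514",
+//!     "max_tokens": 64,
+//!     "messages": [{ "role": "user", "content": "Hello" }]
+//! });
+//! let (reply, meta): (serde_json::Value, _) = transport
+//!     .post_json("/v1/messages", &body, &RequestOptions::default())
+//!     .await?;
+//! println!("request-id: {:?}", meta.request_id);
+//! println!("{reply}");
+//! # Ok(())
+//! # }
+//! ```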
+
+use crate::config::{ClientConfig, RequestOptions};
+use crate::error::{AnthropicError, ApiErrorResponse, Result};
+use bytes::Bytes;
+use futures::Stream;
+use reqwest::header::HeaderMap;
+use reqwest::{Client, Response, StatusCode};
+use std::pin::Pin;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// HTTP transport for making requests to the Anthropic API
+pub struct HttpTransport {
+    client: Client,
+    config: ClientConfig,
+}
+
+impl HttpTransport {
+    /// Create a new HTTP transport with the given configuration
+    pub fn new(config: ClientConfig) -> Result<Self> {
+        let client = Client::builder()
+            .timeout(config.timeout)
+            .build()
+            .map_err(AnthropicError::Transport)?;
+
+        Ok(Self { client, config })
+    }
+
+    /// Get the base URL
+    pub fn base_url(&self) -> &str {
+        &self.config.base_url
+    }
+
+    /// Get the default headers
+    pub fn default_headers(&self) -> &HeaderMap {
+        &self.config.default_headers
+    }
+
+    /// Make a POST request and return the response
+    pub async fn post<T: serde::Serialize>(
+        &self,
+        path: &str,
+        body: &T,
+        options: &RequestOptions,
+    ) -> Result<Response> {
+        let url = format!("{}{}", self.config.base_url, path);
+
+        let mut request = self.client.post(&url);
+
+        // Apply default headers
+        request = request.headers(self.config.default_headers.clone());
+
+        // Apply extra headers from options
+        for (key, value) in &options.extra_headers {
+            request = request.header(key, value);
+        }
+
+        // Apply query parameters
+        for (key, value) in &options.extra_query {
+            request = request.query(&[(key.as_ref(), value.as_ref())]);
+        }
+
+        // Apply timeout override
+        if let Some(timeout) = options.timeout {
+            request = request.timeout(timeout);
+        }
+
+        // Send request with body
+        let response = request.json(body).send().await?;
+
+        Ok(response)
+    }
+
+    /// Make a POST request and parse the JSON response
+    pub async fn post_json<T: serde::Serialize, R: serde::de::DeserializeOwned>(
+        &self,
+        path: &str,
+        body: &T,
+        options: &RequestOptions,
+    ) -> Result<(R, ResponseMeta)> {
+        let response = self.post(path, body, options).await?;
+        let meta = ResponseMeta::from_response(&response);
+
+        if !response.status().is_success() {
+            return Err(parse_error_response(response, meta.request_id.clone()).await);
+        }
+
+        let data: R = response
+            .json()
+            .await
+            .map_err(|e| AnthropicError::serialization("Failed to parse response", Box::new(e)))?;
+
+        Ok((data, meta))
+    }
+
+    /// Make a streaming POST request
+    pub async fn post_stream<T: serde::Serialize>(
+        &self,
+        path: &str,
+        body: &T,
+        options: &RequestOptions,
+    ) -> Result<(
+        Pin<Box<dyn Stream<Item = reqwest::Result<Bytes>> + Send>>,
+        ResponseMeta,
+    )> {
+        let response = self.post(path, body, options).await?;
+        let meta = ResponseMeta::from_response(&response);
+
+        if !response.status().is_success() {
+            return Err(parse_error_response(response, meta.request_id.clone()).await);
+        }
+
+        Ok((Box::pin(response.bytes_stream()), meta))
+    }
+
+    /// Get the configuration
+    pub fn config(&self) -> &ClientConfig {
+        &self.config
+    }
+
+    /// Get the maximum retries
+    pub fn max_retries(&self) -> u32 {
+        self.config.max_retries
+    }
+
+    /// Get the default timeout
+    pub fn timeout(&self) -> Duration {
+        self.config.timeout
+    }
+}
+
+/// Metadata from an API response
+#[derive(Debug, Clone, Default)]
+pub struct ResponseMeta {
+    /// Request ID from the x-request-id header
+    pub request_id: Option<Arc<str>>,
+}
+
+impl ResponseMeta {
+    /// Extract metadata from a response
+    fn from_response(response: &Response) -> Self {
+        let request_id = response
+            .headers()
+            .get("x-request-id")
+            .and_then(|v| v.to_str().ok())
+            .map(Arc::from);
+
+        Self { request_id }
+    }
+}
+
+/// Parse an error
response from the API +async fn parse_error_response(response: Response, request_id: Option>) -> AnthropicError { + let status = response.status(); + + // Try to parse the error body + let error_result: std::result::Result = response.json().await; + + match error_result { + Ok(error_response) => AnthropicError::api(status, request_id, error_response.error), + Err(_) => { + // Couldn't parse error, create a generic one + AnthropicError::api( + status, + request_id, + crate::error::ErrorObject { + error_type: Arc::from(status_to_error_type(status)), + message: Arc::from(format!("HTTP {}", status)), + param: None, + }, + ) + } + } +} + +/// Map status code to error type string +fn status_to_error_type(status: StatusCode) -> &'static str { + match status { + StatusCode::BAD_REQUEST => "invalid_request_error", + StatusCode::UNAUTHORIZED => "authentication_error", + StatusCode::FORBIDDEN => "permission_error", + StatusCode::NOT_FOUND => "not_found_error", + StatusCode::TOO_MANY_REQUESTS => "rate_limit_error", + StatusCode::INTERNAL_SERVER_ERROR => "api_error", + StatusCode::SERVICE_UNAVAILABLE => "overloaded_error", + _ if status.as_u16() == 529 => "overloaded_error", + _ => "api_error", + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::ClientBuilder; + + #[test] + fn test_response_meta_default() { + let meta = ResponseMeta::default(); + assert!(meta.request_id.is_none()); + } + + #[test] + fn test_transport_creation() { + let config = ClientBuilder::new() + .api_key("test-key") + .build() + .expect("should build config"); + + let transport = HttpTransport::new(config); + assert!(transport.is_ok()); + } + + #[test] + fn test_status_to_error_type() { + assert_eq!( + status_to_error_type(StatusCode::BAD_REQUEST), + "invalid_request_error" + ); + assert_eq!( + status_to_error_type(StatusCode::UNAUTHORIZED), + "authentication_error" + ); + assert_eq!( + status_to_error_type(StatusCode::TOO_MANY_REQUESTS), + "rate_limit_error" + ); + } +} From 7d453bed75b4396851df4ded046e22a96667e09f Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 18:52:13 +0530 Subject: [PATCH 07/14] remove gemini provider support --- README.md | 3 +- crates/rullm-cli/src/aliases.rs | 2 +- crates/rullm-cli/src/args.rs | 6 +- crates/rullm-cli/src/auth.rs | 4 - crates/rullm-cli/src/cli_client.rs | 126 +----- crates/rullm-cli/src/client.rs | 1 - crates/rullm-cli/src/commands/auth.rs | 5 +- crates/rullm-cli/src/commands/chat/mod.rs | 2 +- crates/rullm-cli/src/commands/info.rs | 7 - crates/rullm-cli/src/commands/mod.rs | 2 +- crates/rullm-cli/src/provider.rs | 13 +- crates/rullm-core/Cargo.toml | 4 - crates/rullm-core/examples/README.md | 50 +-- crates/rullm-core/examples/basic_usage.rs | 5 +- crates/rullm-core/examples/gemini_stream.rs | 276 ------------- crates/rullm-core/examples/google_simple.rs | 135 ------- .../rullm-core/examples/test_all_providers.rs | 31 -- crates/rullm-core/src/config.rs | 18 +- crates/rullm-core/src/lib.rs | 9 +- .../rullm-core/src/providers/google/client.rs | 169 -------- .../rullm-core/src/providers/google/config.rs | 58 --- crates/rullm-core/src/providers/google/mod.rs | 30 -- .../rullm-core/src/providers/google/types.rs | 376 ------------------ crates/rullm-core/src/providers/mod.rs | 3 - 24 files changed, 20 insertions(+), 1315 deletions(-) delete mode 100644 crates/rullm-core/examples/gemini_stream.rs delete mode 100644 crates/rullm-core/examples/google_simple.rs delete mode 100644 crates/rullm-core/src/providers/google/client.rs delete mode 100644 
crates/rullm-core/src/providers/google/config.rs delete mode 100644 crates/rullm-core/src/providers/google/mod.rs delete mode 100644 crates/rullm-core/src/providers/google/types.rs diff --git a/README.md b/README.md index f730a811..958533ee 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ rullm "What is the capital of France?" # Use different models with aliases rullm --model gpt4 "Explain quantum computing" rullm --model claude "Write a poem about the ocean" -rullm --model gemini "What's the weather like?" # Use templates for structured queries ({{input}} placeholder is automatically filled) rullm -t code-review "Review this function" @@ -32,7 +31,7 @@ rullm chat --model claude # Disable streaming for buffered output rullm --no-streaming "Write a poem about the ocean" -rullm chat --no-streaming --model gemini +rullm chat --no-streaming --model claude # Set up your API keys rullm keys set openai diff --git a/crates/rullm-cli/src/aliases.rs b/crates/rullm-cli/src/aliases.rs index 7622815c..19614730 100644 --- a/crates/rullm-cli/src/aliases.rs +++ b/crates/rullm-cli/src/aliases.rs @@ -196,7 +196,7 @@ impl AliasResolver { } // Try to infer provider from model name patterns - for provider in [Provider::OpenAI, Provider::Anthropic, Provider::Google] { + for provider in [Provider::OpenAI, Provider::Anthropic] { for alias in provider.aliases() { // Check if the model starts with an alias followed by a separator if input.starts_with(&format!("{alias}-")) diff --git a/crates/rullm-cli/src/args.rs b/crates/rullm-cli/src/args.rs index 87dfa3c4..fbec0201 100644 --- a/crates/rullm-cli/src/args.rs +++ b/crates/rullm-cli/src/args.rs @@ -23,7 +23,7 @@ const CLI_EXAMPLES: &str = r#"EXAMPLES: rullm -t code-review "Review this code" # Use template for query rullm -t greeting "Hello" # Template with input parameter rullm chat # Start interactive chat - rullm chat -m gemini/gemini-pro # Chat with specific model + rullm chat -m anthropic/claude-3-sonnet # Chat with specific model rullm chat --no-streaming -m claude # Interactive chat without streaming"#; /// Helper function to remove quotes from values, eliminating duplication @@ -118,7 +118,7 @@ pub struct Cli { #[command(subcommand)] pub command: Option, - /// Model to use in format: provider/model-name (e.g., openai/gpt-4, gemini/gemini-pro, anthropic/claude-3-sonnet) + /// Model to use in format: provider/model-name (e.g., openai/gpt-4, anthropic/claude-3-sonnet) #[arg(short, long, add = ArgValueCompleter::new(model_completer))] pub model: Option, @@ -181,7 +181,7 @@ impl Models { pub fn model_completer(current: &OsStr) -> Vec { // Predefined providers or aliases - const PROVIDED: &[&str] = &["openai:", "anthropic:", "google:"]; + const PROVIDED: &[&str] = &["openai:", "anthropic:"]; let cli_config = CliConfig::load(); let cur_str = current.to_string_lossy(); diff --git a/crates/rullm-cli/src/auth.rs b/crates/rullm-cli/src/auth.rs index fe78c7a1..a5dbf9a6 100644 --- a/crates/rullm-cli/src/auth.rs +++ b/crates/rullm-cli/src/auth.rs @@ -97,8 +97,6 @@ pub struct AuthConfig { pub groq: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub openrouter: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub google: Option, } impl AuthConfig { @@ -164,7 +162,6 @@ impl AuthConfig { Provider::OpenAI => self.openai.as_ref(), Provider::Groq => self.groq.as_ref(), Provider::OpenRouter => self.openrouter.as_ref(), - Provider::Google => self.google.as_ref(), } } @@ -175,7 +172,6 @@ impl AuthConfig { Provider::OpenAI => &mut 
self.openai, Provider::Groq => &mut self.groq, Provider::OpenRouter => &mut self.openrouter, - Provider::Google => &mut self.google, } } diff --git a/crates/rullm-cli/src/cli_client.rs b/crates/rullm-cli/src/cli_client.rs index e22a692d..15d37d8a 100644 --- a/crates/rullm-cli/src/cli_client.rs +++ b/crates/rullm-cli/src/cli_client.rs @@ -6,11 +6,10 @@ use futures::StreamExt; use rullm_core::error::LlmError; use rullm_core::providers::anthropic::AnthropicConfig; -use rullm_core::providers::google::GoogleAiConfig; use rullm_core::providers::openai_compatible::{ OpenAICompatibleConfig, OpenAICompatibleProvider, OpenAIConfig, identities, }; -use rullm_core::providers::{AnthropicClient, GoogleClient, OpenAIClient}; +use rullm_core::providers::{AnthropicClient, OpenAIClient}; use std::pin::Pin; /// Claude Code identification text for OAuth requests @@ -56,11 +55,6 @@ pub enum CliClient { config: CliConfig, is_oauth: bool, }, - Google { - client: GoogleClient, - model: String, - config: CliConfig, - }, Groq { client: OpenAICompatibleProvider, model: String, @@ -106,21 +100,6 @@ impl CliClient { }) } - /// Create Google client - pub fn google( - api_key: impl Into, - model: impl Into, - config: CliConfig, - ) -> Result { - let client_config = GoogleAiConfig::new(api_key); - let client = GoogleClient::new(client_config)?; - Ok(Self::Google { - client, - model: model.into(), - config, - }) - } - /// Create Groq client pub fn groq( api_key: impl Into, @@ -219,51 +198,6 @@ impl CliClient { Ok(content) } - Self::Google { - client, - model, - config, - } => { - use rullm_core::providers::google::{ - Content, GenerateContentRequest, GenerationConfig, - }; - - let mut request = GenerateContentRequest::new(vec![Content::user(message)]); - - if config.temperature.is_some() || config.max_tokens.is_some() { - let gen_config = GenerationConfig { - temperature: config.temperature, - max_output_tokens: config.max_tokens, - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }; - request.generation_config = Some(gen_config); - } - - let response = client.generate_content(model, request).await?; - let content = response - .candidates - .first() - .map(|c| { - c.content - .parts - .iter() - .filter_map(|part| match part { - rullm_core::providers::google::Part::Text { text } => { - Some(text.clone()) - } - _ => None, - }) - .collect::>() - .join("") - }) - .ok_or_else(|| LlmError::model("No content in response"))?; - - Ok(content) - } Self::Groq { client, model, @@ -386,62 +320,6 @@ impl CliClient { } }))) } - Self::Google { - client, - model, - config, - } => { - use rullm_core::providers::google::{ - Content, GenerateContentRequest, GenerationConfig, - }; - - let contents: Vec = messages - .iter() - .map(|(role, content)| match role.as_str() { - "user" => Content::user(content), - _ => Content::model(content), - }) - .collect(); - - let mut request = GenerateContentRequest::new(contents); - if config.temperature.is_some() || config.max_tokens.is_some() { - request.generation_config = Some(GenerationConfig { - temperature: config.temperature, - max_output_tokens: config.max_tokens, - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - } - - let stream = client.stream_generate_content(model, request).await?; - Ok(Box::pin(stream.filter_map(|response_result| async move { - match response_result { - Ok(response) => response - .candidates - .first() - .map(|candidate| { - let text = candidate - .content - 
.parts - .iter() - .filter_map(|part| match part { - rullm_core::providers::google::Part::Text { text } => { - Some(text.clone()) - } - _ => None, - }) - .collect::>() - .join(""); - Ok(text) - }) - .filter(|s| matches!(s, Ok(t) if !t.is_empty())), - Err(e) => Some(Err(e)), - } - }))) - } Self::Groq { client, model, @@ -491,7 +369,6 @@ impl CliClient { match self { Self::OpenAI { .. } => "openai", Self::Anthropic { .. } => "anthropic", - Self::Google { .. } => "google", Self::Groq { .. } => "groq", Self::OpenRouter { .. } => "openrouter", } @@ -502,7 +379,6 @@ impl CliClient { match self { Self::OpenAI { model, .. } | Self::Anthropic { model, .. } - | Self::Google { model, .. } | Self::Groq { model, .. } | Self::OpenRouter { model, .. } => model, } diff --git a/crates/rullm-cli/src/client.rs b/crates/rullm-cli/src/client.rs index 98c7e2a5..e1b7856e 100644 --- a/crates/rullm-cli/src/client.rs +++ b/crates/rullm-cli/src/client.rs @@ -41,7 +41,6 @@ pub fn create_client( Provider::Groq => CliClient::groq(api_key, model_name, config), Provider::OpenRouter => CliClient::openrouter(api_key, model_name, config), Provider::Anthropic => CliClient::anthropic(api_key, model_name, config, is_oauth), - Provider::Google => CliClient::google(api_key, model_name, config), } } diff --git a/crates/rullm-cli/src/commands/auth.rs b/crates/rullm-cli/src/commands/auth.rs index 377385e1..b1fe2ea8 100644 --- a/crates/rullm-cli/src/commands/auth.rs +++ b/crates/rullm-cli/src/commands/auth.rs @@ -20,12 +20,12 @@ pub struct AuthArgs { pub enum AuthAction { /// Login to a provider (OAuth or API key) Login { - /// Provider name (anthropic, openai, groq, openrouter, google) + /// Provider name (anthropic, openai, groq, openrouter) provider: Option, }, /// Logout from a provider (remove stored credentials) Logout { - /// Provider name (anthropic, openai, groq, openrouter, google) + /// Provider name (anthropic, openai, groq, openrouter) provider: Option, }, /// List all credentials and environment variables @@ -222,7 +222,6 @@ fn format_provider_display(provider: &Provider) -> &'static str { Provider::OpenAI => "OpenAI", Provider::Groq => "Groq", Provider::OpenRouter => "OpenRouter", - Provider::Google => "Google", } } diff --git a/crates/rullm-cli/src/commands/chat/mod.rs b/crates/rullm-cli/src/commands/chat/mod.rs index 3c4af395..7718a41e 100644 --- a/crates/rullm-cli/src/commands/chat/mod.rs +++ b/crates/rullm-cli/src/commands/chat/mod.rs @@ -26,7 +26,7 @@ pub use prompt::ChatPrompt; #[derive(Args)] pub struct ChatArgs { - /// Model to use in format: provider:model-name (e.g., openai:gpt-4, gemini:gemini-pro, anthropic:claude-3-sonnet) + /// Model to use in format: provider:model-name (e.g., openai:gpt-4, anthropic:claude-3-sonnet) #[arg(short, long, add = ArgValueCompleter::new(model_completer))] pub model: Option, } diff --git a/crates/rullm-cli/src/commands/info.rs b/crates/rullm-cli/src/commands/info.rs index cb96f42d..f0d7018f 100644 --- a/crates/rullm-cli/src/commands/info.rs +++ b/crates/rullm-cli/src/commands/info.rs @@ -53,13 +53,6 @@ impl InfoArgs { ), output_level, ); - crate::output::note( - &format!( - "GOOGLE_AI_API_KEY = {}", - env_var_status("GOOGLE_AI_API_KEY") - ), - output_level, - ); crate::output::heading("\nVersion info:", output_level); crate::output::note( diff --git a/crates/rullm-cli/src/commands/mod.rs b/crates/rullm-cli/src/commands/mod.rs index a1964484..db7b491e 100644 --- a/crates/rullm-cli/src/commands/mod.rs +++ b/crates/rullm-cli/src/commands/mod.rs @@ -33,7 +33,7 @@ const 
CHAT_EXAMPLES: &str = r#"EXAMPLES: rullm chat # Start chat with default model rullm chat -m openai/gpt-4 # Chat with GPT-4 rullm chat -m claude # Chat using claude alias - rullm chat -m gemini/gemini-pro # Chat with Gemini Pro"#; + rullm chat -m anthropic/claude-3-sonnet # Chat with Claude Sonnet"#; const MODELS_EXAMPLES: &str = r#"EXAMPLES: rullm models list # List cached models diff --git a/crates/rullm-cli/src/provider.rs b/crates/rullm-cli/src/provider.rs index b6362e05..c197f6f5 100644 --- a/crates/rullm-cli/src/provider.rs +++ b/crates/rullm-cli/src/provider.rs @@ -8,7 +8,6 @@ pub enum Provider { Groq, OpenRouter, Anthropic, - Google, } impl std::fmt::Display for Provider { @@ -18,7 +17,6 @@ impl std::fmt::Display for Provider { Provider::Groq => "groq", Provider::OpenRouter => "openrouter", Provider::Anthropic => "anthropic", - Provider::Google => "google", }; write!(f, "{name}") } @@ -26,13 +24,7 @@ impl std::fmt::Display for Provider { impl ValueEnum for Provider { fn value_variants<'a>() -> &'a [Self] { - &[ - Self::OpenAI, - Self::Groq, - Self::OpenRouter, - Self::Anthropic, - Self::Google, - ] + &[Self::OpenAI, Self::Groq, Self::OpenRouter, Self::Anthropic] } fn to_possible_value(&self) -> Option { @@ -41,7 +33,6 @@ impl ValueEnum for Provider { Self::Groq => PossibleValue::new("groq"), Self::OpenRouter => PossibleValue::new("openrouter"), Self::Anthropic => PossibleValue::new("anthropic"), - Self::Google => PossibleValue::new("google"), }; Some(value) } @@ -54,7 +45,6 @@ impl Provider { Provider::Groq => &["groq"], Provider::OpenRouter => &["openrouter"], Provider::Anthropic => &["anthropic", "claude"], - Provider::Google => &["google", "gemini"], } } @@ -87,7 +77,6 @@ impl Provider { Provider::Groq => "GROQ_API_KEY", Provider::OpenRouter => "OPENROUTER_API_KEY", Provider::Anthropic => "ANTHROPIC_API_KEY", - Provider::Google => "GOOGLE_AI_API_KEY", } } } diff --git a/crates/rullm-core/Cargo.toml b/crates/rullm-core/Cargo.toml index d1c81129..349d7a4f 100644 --- a/crates/rullm-core/Cargo.toml +++ b/crates/rullm-core/Cargo.toml @@ -50,10 +50,6 @@ path = "examples/test_all_providers.rs" name = "anthropic_simple" path = "examples/anthropic_simple.rs" -[[example]] -name = "google_simple" -path = "examples/google_simple.rs" - [[example]] name = "openai_conversation" path = "examples/openai_conversation.rs" diff --git a/crates/rullm-core/examples/README.md b/crates/rullm-core/examples/README.md index f9cfbc50..970f2987 100644 --- a/crates/rullm-core/examples/README.md +++ b/crates/rullm-core/examples/README.md @@ -1,6 +1,6 @@ # LLM Provider Examples -This directory contains examples demonstrating how to use the OpenAI, Anthropic, and Google AI providers in the LLM library. +This directory contains examples demonstrating how to use the OpenAI and Anthropic providers in the LLM library. ## Prerequisites @@ -11,14 +11,10 @@ This directory contains examples demonstrating how to use the OpenAI, Anthropic, export OPENAI_ORGANIZATION="org-123" # Optional export OPENAI_PROJECT="proj-456" # Optional export OPENAI_BASE_URL="https://custom-endpoint.com/v1" # Optional - + # Anthropic export ANTHROPIC_API_KEY="sk-ant-your-actual-api-key" export ANTHROPIC_BASE_URL="https://custom-endpoint.com" # Optional - - # Google AI - export GOOGLE_AI_API_KEY="your-google-ai-api-key" - export GOOGLE_AI_BASE_URL="https://custom-endpoint.com" # Optional ``` 2. **Install dependencies:** @@ -118,41 +114,6 @@ let mut stream = provider - Creative content: 0.7-1.0 for variety - Balanced conversation: 0.6-0.7 -### 3. 
Google Gemini Streaming (`gemini_stream.rs`) - -**Run:** `cargo run --example gemini_stream` - -**Environment:** Requires `GOOGLE_API_KEY` - -Shows Gemini streaming with: -- **Technical explanations** with precision -- **Creative writing** using experimental models -- **Code analysis** and review capabilities -- **Model comparison** between Gemini variants -- **Sentence counting** and response analysis - -**Code snippet:** -```rust -let request = ChatRequestBuilder::new() - .system("You are a helpful AI assistant built by Google.") - .user("Explain machine learning in simple terms.") - .temperature(0.7) - .max_tokens(150) - .stream(true) - .build(); - -let mut stream = provider - .chat_completion_stream(request, "gemini-1.5-flash", None) - .await; - -// Handle streaming events... -``` - -**Models used:** -- `gemini-1.5-flash` (fast responses) -- `gemini-1.5-pro` (balanced performance) -- `gemini-2.0-flash-exp` (experimental features) - ### Streaming API Patterns **Event handling:** @@ -202,8 +163,7 @@ cargo build --examples # Test individual streaming examples cargo run --example openai_stream # Requires OPENAI_API_KEY -cargo run --example anthropic_stream # Requires ANTHROPIC_API_KEY -cargo run --example gemini_stream # Requires GOOGLE_API_KEY +cargo run --example anthropic_stream # Requires ANTHROPIC_API_KEY # Run lint checks cargo clippy --all-targets --all-features @@ -396,14 +356,13 @@ Comprehensive test that validates all LLM providers with health checks: # Set up your API keys export OPENAI_API_KEY="sk-..." export ANTHROPIC_API_KEY="sk-ant-..." -export GOOGLE_API_KEY="..." # Run the comprehensive test cargo run --example test_all_providers ``` **Features:** -- Tests OpenAI, Anthropic, and Google providers +- Tests OpenAI and Anthropic providers - Performs health checks - Provides detailed success/failure reporting - Gracefully handles missing API keys @@ -422,7 +381,6 @@ cargo run --example test_all_providers ├─────────────┼────────┤ │ OpenAI │ ✅ Pass │ │ Anthropic │ ✅ Pass │ -│ Google │ ✅ Pass │ └─────────────┴────────┘ 🎉 All providers are working correctly! diff --git a/crates/rullm-core/examples/basic_usage.rs b/crates/rullm-core/examples/basic_usage.rs index 694bca88..d5e0f86f 100644 --- a/crates/rullm-core/examples/basic_usage.rs +++ b/crates/rullm-core/examples/basic_usage.rs @@ -43,10 +43,7 @@ async fn main() -> Result<(), Box> { println!(" • These compat_types are minimal types for OpenAI-compatible providers"); println!(" • For full-featured OpenAI, use OpenAIClient with ChatCompletionRequest"); println!(" • For Anthropic, use AnthropicClient with MessagesRequest"); - println!(" • For Google, use GoogleClient with GenerateContentRequest"); - println!( - "\nSee provider-specific examples (openai_simple, anthropic_simple, google_simple) for details." 
- ); + println!("\nSee provider-specific examples (openai_simple, anthropic_simple) for details."); Ok(()) } diff --git a/crates/rullm-core/examples/gemini_stream.rs b/crates/rullm-core/examples/gemini_stream.rs deleted file mode 100644 index 9bbf4b41..00000000 --- a/crates/rullm-core/examples/gemini_stream.rs +++ /dev/null @@ -1,276 +0,0 @@ -use futures::StreamExt; -use rullm_core::providers::google::{ - Content, GenerateContentRequest, GenerationConfig, GoogleClient, Part, -}; - -// Helper to extract text from response -fn extract_text(response: &rullm_core::providers::google::GenerateContentResponse) -> String { - response - .candidates - .iter() - .flat_map(|candidate| &candidate.content.parts) - .filter_map(|part| match part { - Part::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join("") -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - println!("🔄 Google Gemini Streaming Chat Example"); - println!("=======================================\n"); - - // 1. Configuration from environment - // Set GOOGLE_API_KEY environment variable before running - let client = GoogleClient::from_env()?; - - // 2. Simple streaming chat with Gemini Flash - println!("💬 Simple streaming chat:"); - let request = GenerateContentRequest::new(vec![Content::user( - "Explain machine learning in simple terms.", - )]) - .with_system("You are a helpful AI assistant built by Google.".to_string()) - .with_generation_config(GenerationConfig { - temperature: Some(0.7), - max_output_tokens: Some(150), - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let mut stream = client - .stream_generate_content("gemini-1.5-flash", request) - .await?; - - print!("🤖 Gemini: "); - while let Some(response_result) = stream.next().await { - match response_result { - Ok(response) => { - let text = extract_text(&response); - if !text.is_empty() { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - } - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - println!("\n✅ Stream completed successfully"); - - // 3. Multi-turn technical conversation - println!("\n\n🗨️ Multi-turn technical conversation:"); - let conversation_request = GenerateContentRequest::new(vec![ - Content::user("What are the differences between Rust and Go?"), - Content::model("Rust focuses on memory safety and zero-cost abstractions, while Go emphasizes simplicity and built-in concurrency."), - Content::user("Which would you recommend for a web API?"), - ]) - .with_system("You are a technical expert who gives precise, helpful answers.".to_string()) - .with_generation_config(GenerationConfig { - temperature: Some(0.5), - max_output_tokens: Some(200), - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let mut conversation_stream = client - .stream_generate_content("gemini-1.5-pro", conversation_request) - .await?; - - print!("🤖 Expert Gemini: "); - while let Some(response_result) = conversation_stream.next().await { - match response_result { - Ok(response) => { - let text = extract_text(&response); - if !text.is_empty() { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - } - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - println!("\n✅ Technical conversation completed"); - - // 4. 
Creative writing with experimental Gemini 2.0 - println!("\n\n🎨 Creative writing stream (Gemini 2.0 experimental):"); - let creative_request = GenerateContentRequest::new(vec![Content::user( - "Write a short story about an AI that discovers it can paint digital masterpieces.", - )]) - .with_system("You are a creative writer who crafts engaging, vivid stories.".to_string()) - .with_generation_config(GenerationConfig { - temperature: Some(0.9), // Higher creativity - top_p: Some(0.95), - max_output_tokens: Some(250), - stop_sequences: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let mut creative_stream = client - .stream_generate_content("gemini-2.0-flash-exp", creative_request) - .await?; - - print!("✍️ Creative Story: "); - let mut char_count = 0; - while let Some(response_result) = creative_stream.next().await { - match response_result { - Ok(response) => { - let text = extract_text(&response); - if !text.is_empty() { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - char_count += text.len(); - } - } - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - println!("\n✅ Creative stream completed (~{char_count} characters)"); - - // 5. Code analysis with streaming - println!("\n\n💻 Code analysis stream:"); - let code_request = GenerateContentRequest::new(vec![Content::user( - "Review this Rust function and suggest improvements:\n\nfn fibonacci(n: u32) -> u32 {\n if n <= 1 {\n n\n } else {\n fibonacci(n - 1) + fibonacci(n - 2)\n }\n}", - )]) - .with_system("You are a code reviewer who provides detailed, constructive feedback.".to_string()) - .with_generation_config(GenerationConfig { - temperature: Some(0.3), // Lower temperature for technical accuracy - max_output_tokens: Some(300), - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let mut code_stream = client - .stream_generate_content("gemini-1.5-pro", code_request) - .await?; - - print!("🔍 Code Reviewer: "); - while let Some(response_result) = code_stream.next().await { - match response_result { - Ok(response) => { - let text = extract_text(&response); - if !text.is_empty() { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - } - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - println!("\n✅ Code review completed"); - - // 6. Model comparison streaming - println!("\n\n⚖️ Model comparison streaming:"); - let models = ["gemini-1.5-flash", "gemini-1.5-pro"]; - let question = "What makes quantum computing different from classical computing?"; - - for model in &models { - println!("\n📋 Streaming with {model}:"); - let request = GenerateContentRequest::new(vec![Content::user(question)]) - .with_generation_config(GenerationConfig { - temperature: Some(0.6), - max_output_tokens: Some(120), - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let mut stream = client.stream_generate_content(model, request).await?; - - print!("🤖 {model}: "); - while let Some(response_result) = stream.next().await { - match response_result { - Ok(response) => { - let text = extract_text(&response); - if !text.is_empty() { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - } - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - println!("\n✅ {model} completed"); - } - - // 7. 
Error handling demonstration - println!("\n\n⚠️ Error handling demonstration:"); - let invalid_request = - GenerateContentRequest::new(vec![Content::user("Test with invalid model.")]) - .with_generation_config(GenerationConfig { - temperature: Some(0.7), - stop_sequences: None, - max_output_tokens: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - match client - .stream_generate_content("gemini-invalid-model", invalid_request) - .await - { - Ok(mut error_stream) => { - while let Some(response_result) = error_stream.next().await { - match response_result { - Ok(response) => { - let text = extract_text(&response); - if !text.is_empty() { - print!("{text}"); - } - } - Err(error) => { - println!("🔴 Request error (as expected): {error}"); - break; - } - } - } - } - Err(error) => { - println!("🔴 Request error (as expected): {error}"); - } - } - - println!("\n\n🎯 Tips for using Google Gemini streaming:"); - println!("• Set GOOGLE_API_KEY environment variable"); - println!("• Use stream_generate_content() for streaming responses"); - println!("• Process GenerateContentResponse chunks as they arrive"); - println!( - "• Models: gemini-1.5-flash (fast), gemini-1.5-pro (balanced), gemini-2.0-flash-exp (experimental)" - ); - println!("• Gemini supports reasoning, code analysis, and creative tasks"); - println!("• Lower temperature (0.1-0.4) for factual/technical content"); - println!("• Higher temperature (0.7-1.0) for creative content"); - println!("• Use top_p for more controlled randomness"); - - Ok(()) -} diff --git a/crates/rullm-core/examples/google_simple.rs b/crates/rullm-core/examples/google_simple.rs deleted file mode 100644 index 12d0c9a4..00000000 --- a/crates/rullm-core/examples/google_simple.rs +++ /dev/null @@ -1,135 +0,0 @@ -use rullm_core::providers::google::{ - Content, GenerateContentRequest, GenerationConfig, GoogleClient, Part, -}; - -// Helper to extract text from response -fn extract_text(response: &rullm_core::providers::google::GenerateContentResponse) -> String { - response - .candidates - .iter() - .flat_map(|candidate| &candidate.content.parts) - .filter_map(|part| match part { - Part::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join("") -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - // 1. Basic Configuration using from_env - let client = GoogleClient::from_env()?; - - // 2. Simple Chat Completion - let request = GenerateContentRequest::new(vec![Content::user("What is 2 + 2?")]) - .with_system("You are a helpful assistant.".to_string()) - .with_generation_config(GenerationConfig { - temperature: Some(0.7), - max_output_tokens: Some(1024), - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let response = client.generate_content("gemini-1.5-flash", request).await?; - - println!("🤖 Assistant: {}", extract_text(&response)); - if let Some(usage) = &response.usage_metadata { - println!("📊 Tokens used: {}", usage.total_token_count); - } - - // 3. 
Multi-message conversation - let conversation_request = GenerateContentRequest::new(vec![ - Content::user("What is 5 * 7?"), - Content::model("5 * 7 = 35"), - Content::user("What about 6 * 8?"), - ]) - .with_system("You are a helpful math tutor.".to_string()) - .with_generation_config(GenerationConfig { - max_output_tokens: Some(100), - stop_sequences: None, - temperature: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let conversation_response = client - .generate_content("gemini-1.5-pro", conversation_request) - .await?; - - println!("\n💬 Conversation:"); - println!("Assistant: {}", extract_text(&conversation_response)); - - // 4. Different models comparison - let models = ["gemini-1.5-flash", "gemini-1.5-pro", "gemini-2.0-flash-exp"]; - let question = "Explain async/await in one sentence."; - - for model in &models { - let request = GenerateContentRequest::new(vec![Content::user(question)]) - .with_generation_config(GenerationConfig { - temperature: Some(0.5), - max_output_tokens: Some(50), - stop_sequences: None, - top_p: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - match client.generate_content(model, request).await { - Ok(response) => { - println!("\n🔬 {model} says:"); - println!("{}", extract_text(&response)); - } - Err(e) => { - println!("❌ Error with {model}: {e}"); - } - } - } - - // 5. Advanced parameters with Google-specific features - let creative_request = - GenerateContentRequest::new(vec![Content::user("Write a haiku about programming.")]) - .with_system("You are a creative writer.".to_string()) - .with_generation_config(GenerationConfig { - temperature: Some(1.0), // Higher creativity - top_p: Some(0.9), - max_output_tokens: Some(200), - stop_sequences: None, - top_k: None, - response_mime_type: None, - response_schema: None, - }); - - let creative_response = client - .generate_content("gemini-1.5-pro", creative_request) - .await?; - - println!("\n🎨 Creative Response:"); - println!("{}", extract_text(&creative_response)); - if let Some(candidate) = creative_response.candidates.first() { - if let Some(reason) = &candidate.finish_reason { - println!("Finish reason: {:?}", reason); - } - } - - // 6. Display safety ratings if available (Google AI specific) - if let Some(candidate) = creative_response.candidates.first() { - if let Some(safety_ratings) = &candidate.safety_ratings { - println!("🛡️ Safety ratings: {} checks", safety_ratings.len()); - } - } - - // 7. Health check - match client.health_check().await { - Ok(_) => println!("\n✅ Google AI is healthy"), - Err(e) => println!("\n❌ Health check failed: {e}"), - } - - Ok(()) -} diff --git a/crates/rullm-core/examples/test_all_providers.rs b/crates/rullm-core/examples/test_all_providers.rs index 921af2c5..5cbdae59 100644 --- a/crates/rullm-core/examples/test_all_providers.rs +++ b/crates/rullm-core/examples/test_all_providers.rs @@ -1,5 +1,4 @@ use rullm_core::providers::anthropic::{AnthropicClient, AnthropicConfig}; -use rullm_core::providers::google::{GoogleAiConfig, GoogleClient}; use rullm_core::providers::openai::OpenAIClient; use rullm_core::providers::openai_compatible::OpenAIConfig; use std::env; @@ -39,20 +38,6 @@ async fn main() -> Result<(), Box> { } println!(); - // 3. 
Test Google Provider - println!("🔍 Testing Google Provider..."); - match test_google_provider().await { - Ok(()) => { - println!("✅ Google: Health check passed"); - results.push(("Google", true)); - } - Err(e) => { - println!("❌ Google: Failed - {e}"); - results.push(("Google", false)); - } - } - println!(); - // Summary println!("📊 SUMMARY:"); println!("┌─────────────┬────────┐"); @@ -109,19 +94,3 @@ async fn test_anthropic_provider() -> Result<(), Box> { Ok(()) } - -async fn test_google_provider() -> Result<(), Box> { - let api_key = - env::var("GOOGLE_API_KEY").map_err(|_| "GOOGLE_API_KEY environment variable not set")?; - - let config = GoogleAiConfig::new(api_key); - let client = GoogleClient::new(config)?; - - // Test health check - match client.health_check().await { - Ok(_) => println!(" Health check: ✅ Passed"), - Err(e) => println!(" Health check: ⚠️ Warning - {e}"), - } - - Ok(()) -} diff --git a/crates/rullm-core/src/config.rs b/crates/rullm-core/src/config.rs index 8528f4ba..47f6de8e 100644 --- a/crates/rullm-core/src/config.rs +++ b/crates/rullm-core/src/config.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; -use crate::providers::{AnthropicConfig, GoogleAiConfig, OpenAICompatibleConfig, OpenAIConfig}; +use crate::providers::{AnthropicConfig, OpenAICompatibleConfig, OpenAIConfig}; /// Configuration trait for LLM providers pub trait ProviderConfig: Send + Sync { @@ -134,22 +134,6 @@ impl ConfigBuilder { Ok(config) } - /// Create Google AI config from environment - pub fn google_ai_from_env() -> Result { - let api_key = std::env::var("GOOGLE_AI_API_KEY").map_err(|_| { - crate::error::LlmError::configuration("GOOGLE_AI_API_KEY environment variable not set") - })?; - - let mut config = GoogleAiConfig::new(api_key); - - if let Ok(base_url) = std::env::var("GOOGLE_AI_BASE_URL") { - config = config.with_base_url(base_url); - } - - config.validate()?; - Ok(config) - } - /// Create Groq config from environment pub fn groq_from_env() -> Result { let api_key = std::env::var("GROQ_API_KEY").map_err(|_| { diff --git a/crates/rullm-core/src/lib.rs b/crates/rullm-core/src/lib.rs index 0f7700ab..28892c91 100644 --- a/crates/rullm-core/src/lib.rs +++ b/crates/rullm-core/src/lib.rs @@ -5,7 +5,7 @@ //! //! ## Features //! -//! - Multiple LLM Providers (OpenAI, Anthropic, Google AI) +//! - Multiple LLM Providers (OpenAI, Anthropic) //! - Tower middleware with connection pooling and async/await //! - Rate limiting, timeouts, and error handling //! - Dual APIs: Simple string-based API and advanced API with full control @@ -107,14 +107,12 @@ //! The library includes streaming examples for each provider: //! //! - `openai_stream.rs` - OpenAI GPT models streaming -//! - `anthropic_stream.rs` - Anthropic Claude models streaming -//! - `gemini_stream.rs` - Google Gemini models streaming +//! - `anthropic_stream.rs` - Anthropic Claude models streaming //! //! Run examples with: //! ```bash //! cargo run --example openai_stream # Requires OPENAI_API_KEY //! cargo run --example anthropic_stream # Requires ANTHROPIC_API_KEY -//! cargo run --example gemini_stream # Requires GOOGLE_API_KEY //! ``` //! //! ### Provider-Specific Streaming Features @@ -123,7 +121,6 @@ //! |----------|--------|--------------| //! | OpenAI | GPT-3.5, GPT-4 | Token counting, creative writing | //! | Anthropic | Claude 3 variants | Reasoning, code analysis | -//! | Google | Gemini 1.5/2.0 | Multimodal, experimental models | //! //! ## Error Handling //! 
@@ -155,7 +152,7 @@ pub mod providers; pub mod utils; // Concrete client exports -pub use providers::{AnthropicClient, GoogleClient, OpenAIClient, OpenAICompatibleProvider}; +pub use providers::{AnthropicClient, OpenAIClient, OpenAICompatibleProvider}; pub use config::{ConfigBuilder, HttpProviderConfig, ProviderConfig}; pub use error::LlmError; diff --git a/crates/rullm-core/src/providers/google/client.rs b/crates/rullm-core/src/providers/google/client.rs deleted file mode 100644 index 51207923..00000000 --- a/crates/rullm-core/src/providers/google/client.rs +++ /dev/null @@ -1,169 +0,0 @@ -use super::config::GoogleAiConfig; -use super::types::*; -use crate::config::ProviderConfig; -use crate::error::LlmError; -use crate::utils::sse::sse_lines; -use futures::Stream; -use futures::StreamExt; -use reqwest::Client; -use std::pin::Pin; - -/// Google Gemini client with full API support -#[derive(Clone)] -pub struct GoogleClient { - config: GoogleAiConfig, - client: Client, - base_url: String, -} - -impl GoogleClient { - /// Create a new Google client - pub fn new(config: GoogleAiConfig) -> Result { - config.validate()?; - let base_url = config - .base_url - .clone() - .unwrap_or_else(|| "https://generativelanguage.googleapis.com/v1beta".to_string()); - - Ok(Self { - config, - client: Client::new(), - base_url, - }) - } - - /// Create client from environment variables - pub fn from_env() -> Result { - let config = crate::config::ConfigBuilder::google_ai_from_env()?; - Self::new(config) - } - - /// Generate content - pub async fn generate_content( - &self, - model: &str, - request: GenerateContentRequest, - ) -> Result { - let url = format!( - "{}/models/{}:generateContent?key={}", - self.base_url, - model, - self.config.api_key() - ); - - let mut req = self.client.post(&url); - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.json(&request).send().await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "google", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let response_data: GenerateContentResponse = response.json().await.map_err(|e| { - LlmError::serialization("Failed to parse GenerateContentResponse", Box::new(e)) - })?; - - Ok(response_data) - } - - /// Stream generate content - pub async fn stream_generate_content( - &self, - model: &str, - request: GenerateContentRequest, - ) -> Result< - Pin> + Send>>, - LlmError, - > { - let url = format!( - "{}/models/{}:streamGenerateContent?alt=sse&key={}", - self.base_url, - model, - self.config.api_key() - ); - - let mut header_map = reqwest::header::HeaderMap::new(); - for (key, value) in self.config.headers() { - if let (Ok(name), Ok(val)) = ( - reqwest::header::HeaderName::from_bytes(key.as_bytes()), - reqwest::header::HeaderValue::from_str(&value), - ) { - header_map.insert(name, val); - } - } - - let response = self - .client - .post(&url) - .headers(header_map) - .json(&request) - .send() - .await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "google", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let byte_stream = response.bytes_stream(); - let sse_stream = 
sse_lines(byte_stream); - - Ok(Box::pin(sse_stream.map(|event_result| { - event_result.and_then(|data| { - serde_json::from_str::(&data).map_err(|e| { - LlmError::serialization( - format!("Failed to parse GenerateContentResponse: {}", e), - Box::new(e), - ) - }) - }) - }))) - } - - /// Health check - pub async fn health_check(&self) -> Result<(), LlmError> { - let url = format!("{}/models?key={}", self.base_url, self.config.api_key()); - - let mut req = self.client.get(&url); - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.send().await?; - - if response.status().is_success() { - Ok(()) - } else { - Err(LlmError::api( - "google", - "Health check failed", - Some(response.status().to_string()), - None, - )) - } - } -} diff --git a/crates/rullm-core/src/providers/google/config.rs b/crates/rullm-core/src/providers/google/config.rs deleted file mode 100644 index 236b3811..00000000 --- a/crates/rullm-core/src/providers/google/config.rs +++ /dev/null @@ -1,58 +0,0 @@ -use crate::config::ProviderConfig; -use crate::error::LlmError; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::time::Duration; - -/// Google AI configuration -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct GoogleAiConfig { - pub api_key: String, - pub base_url: Option, - pub timeout_seconds: u64, -} - -impl GoogleAiConfig { - pub fn new(api_key: impl Into) -> Self { - Self { - api_key: api_key.into(), - base_url: None, - timeout_seconds: 30, - } - } - - pub fn with_base_url(mut self, base_url: impl Into) -> Self { - self.base_url = Some(base_url.into()); - self - } -} - -impl ProviderConfig for GoogleAiConfig { - fn api_key(&self) -> &str { - &self.api_key - } - - fn base_url(&self) -> &str { - self.base_url - .as_deref() - .unwrap_or("https://generativelanguage.googleapis.com/v1beta") - } - - fn timeout(&self) -> Duration { - Duration::from_secs(self.timeout_seconds) - } - - fn headers(&self) -> HashMap { - let mut headers = HashMap::new(); - headers.insert("Content-Type".to_string(), "application/json".to_string()); - headers - } - - fn validate(&self) -> Result<(), LlmError> { - if self.api_key.is_empty() { - return Err(LlmError::configuration("Google AI API key is required")); - } - - Ok(()) - } -} diff --git a/crates/rullm-core/src/providers/google/mod.rs b/crates/rullm-core/src/providers/google/mod.rs deleted file mode 100644 index 13fcbb20..00000000 --- a/crates/rullm-core/src/providers/google/mod.rs +++ /dev/null @@ -1,30 +0,0 @@ -//! Google Gemini provider implementation with complete API support -//! -//! This module provides a feature-complete Google Gemini client that supports all -//! parameters and features available in the Google Gemini API. -//! -//! # Example -//! -//! ```no_run -//! use rullm_core::providers::google::{GoogleClient, GenerateContentRequest, Content}; -//! -//! # async fn example() -> Result<(), Box> { -//! let client = GoogleClient::from_env()?; -//! -//! let request = GenerateContentRequest::new(vec![ -//! Content::user("Hello!"), -//! ]); -//! -//! let response = client.generate_content("gemini-pro", request).await?; -//! println!("{:?}", response.candidates); -//! # Ok(()) -//! # } -//! 
``` - -pub mod client; -pub mod config; -pub mod types; - -pub use client::GoogleClient; -pub use config::GoogleAiConfig; -pub use types::*; diff --git a/crates/rullm-core/src/providers/google/types.rs b/crates/rullm-core/src/providers/google/types.rs deleted file mode 100644 index 20ba2ff9..00000000 --- a/crates/rullm-core/src/providers/google/types.rs +++ /dev/null @@ -1,376 +0,0 @@ -//! Complete Google Gemini API types -//! -//! This module contains comprehensive type definitions for the Google Gemini API. - -use serde::{Deserialize, Serialize}; - -/// Generate content request -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct GenerateContentRequest { - /// Contents of the conversation - pub contents: Vec, - - /// System instruction - #[serde(skip_serializing_if = "Option::is_none")] - pub system_instruction: Option, - - /// Generation configuration - #[serde(skip_serializing_if = "Option::is_none")] - pub generation_config: Option, - - /// Safety settings - #[serde(skip_serializing_if = "Option::is_none")] - pub safety_settings: Option>, - - /// Tool configuration - #[serde(skip_serializing_if = "Option::is_none")] - pub tools: Option>, - - /// Tool configuration - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_config: Option, -} - -/// Content block with role and parts -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Content { - /// Role of the content author - #[serde(skip_serializing_if = "Option::is_none")] - pub role: Option, // "user" or "model" - - /// Parts of the content - pub parts: Vec, -} - -/// A part of the content (text, inline data, function call, etc.) -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum Part { - /// Text part - Text { text: String }, - /// Inline data (image, etc.) - InlineData { inline_data: InlineData }, - /// Function call - FunctionCall { function_call: FunctionCall }, - /// Function response - FunctionResponse { function_response: FunctionResponse }, -} - -/// Inline data (base64-encoded image, etc.) 
-#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct InlineData { - /// MIME type - pub mime_type: String, - /// Base64-encoded data - pub data: String, -} - -/// Function call from the model -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionCall { - /// Name of the function - pub name: String, - /// Arguments as JSON - pub args: serde_json::Value, -} - -/// Function response to the model -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionResponse { - /// Name of the function that was called - pub name: String, - /// Response from the function - pub response: serde_json::Value, -} - -/// Generation configuration parameters -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct GenerationConfig { - /// Stop sequences - #[serde(skip_serializing_if = "Option::is_none")] - pub stop_sequences: Option>, - - /// Temperature (0.0 to 2.0) - #[serde(skip_serializing_if = "Option::is_none")] - pub temperature: Option, - - /// Max output tokens - #[serde(skip_serializing_if = "Option::is_none")] - pub max_output_tokens: Option, - - /// Top-p (nucleus sampling) - #[serde(skip_serializing_if = "Option::is_none")] - pub top_p: Option, - - /// Top-k - #[serde(skip_serializing_if = "Option::is_none")] - pub top_k: Option, - - /// Response MIME type (for JSON mode) - #[serde(skip_serializing_if = "Option::is_none")] - pub response_mime_type: Option, - - /// Response schema (for structured output) - #[serde(skip_serializing_if = "Option::is_none")] - pub response_schema: Option, -} - -/// Safety setting -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -pub struct SafetySetting { - /// Safety category - pub category: SafetyCategory, - /// Threshold for blocking - pub threshold: SafetyThreshold, -} - -/// Safety categories -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -pub enum SafetyCategory { - HarmCategoryHarassment, - HarmCategoryHateSpeech, - HarmCategorySexuallyExplicit, - HarmCategoryDangerousContent, -} - -/// Safety thresholds -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -pub enum SafetyThreshold { - BlockNone, - BlockOnlyHigh, - BlockMediumAndAbove, - BlockLowAndAbove, -} - -/// Tool definition -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct Tool { - /// Function declarations - #[serde(skip_serializing_if = "Option::is_none")] - pub function_declarations: Option>, -} - -/// Function declaration -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionDeclaration { - /// Name of the function - pub name: String, - /// Description - pub description: String, - /// Parameters as JSON schema - #[serde(skip_serializing_if = "Option::is_none")] - pub parameters: Option, -} - -/// Tool configuration -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ToolConfig { - /// Function calling config - pub function_calling_config: FunctionCallingConfig, -} - -/// Function calling configuration -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionCallingConfig { - /// Mode (AUTO, ANY, NONE) - pub mode: String, - /// Allowed function names (if mode is ANY) - #[serde(skip_serializing_if = "Option::is_none")] - pub allowed_function_names: Option>, -} - -/// Generate content response -#[derive(Debug, Clone, Serialize, Deserialize)] 
-#[serde(rename_all = "camelCase")] -pub struct GenerateContentResponse { - /// Candidates - pub candidates: Vec, - - /// Prompt feedback - #[serde(skip_serializing_if = "Option::is_none")] - pub prompt_feedback: Option, - - /// Usage metadata - #[serde(skip_serializing_if = "Option::is_none")] - pub usage_metadata: Option, -} - -/// A candidate response -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct Candidate { - /// Content of the candidate - pub content: Content, - - /// Finish reason - #[serde(skip_serializing_if = "Option::is_none")] - pub finish_reason: Option, - - /// Safety ratings - #[serde(skip_serializing_if = "Option::is_none")] - pub safety_ratings: Option>, - - /// Citation metadata - #[serde(skip_serializing_if = "Option::is_none")] - pub citation_metadata: Option, - - /// Token count - #[serde(skip_serializing_if = "Option::is_none")] - pub token_count: Option, - - /// Index - #[serde(skip_serializing_if = "Option::is_none")] - pub index: Option, -} - -/// Finish reasons -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "SCREAMING_SNAKE_CASE")] -pub enum FinishReason { - Stop, - MaxTokens, - Safety, - Recitation, - Other, -} - -/// Safety rating -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SafetyRating { - /// Category - pub category: SafetyCategory, - /// Probability - pub probability: String, - /// Blocked - #[serde(skip_serializing_if = "Option::is_none")] - pub blocked: Option, -} - -/// Citation metadata -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct CitationMetadata { - /// Citation sources - pub citation_sources: Vec, -} - -/// Citation source -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct CitationSource { - /// Start index - pub start_index: u32, - /// End index - pub end_index: u32, - /// URI - #[serde(skip_serializing_if = "Option::is_none")] - pub uri: Option, - /// License - #[serde(skip_serializing_if = "Option::is_none")] - pub license: Option, -} - -/// Prompt feedback (for blocked requests) -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct PromptFeedback { - /// Block reason - #[serde(skip_serializing_if = "Option::is_none")] - pub block_reason: Option, - - /// Safety ratings - #[serde(skip_serializing_if = "Option::is_none")] - pub safety_ratings: Option>, -} - -/// Usage metadata -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct UsageMetadata { - /// Prompt token count - pub prompt_token_count: u32, - /// Candidates token count - pub candidates_token_count: u32, - /// Total token count - pub total_token_count: u32, -} - -// Builder methods -impl GenerateContentRequest { - pub fn new(contents: Vec) -> Self { - Self { - contents, - system_instruction: None, - generation_config: None, - safety_settings: None, - tools: None, - tool_config: None, - } - } - - pub fn with_system(mut self, system: String) -> Self { - self.system_instruction = Some(Content { - role: None, - parts: vec![Part::Text { text: system }], - }); - self - } - - pub fn with_generation_config(mut self, config: GenerationConfig) -> Self { - self.generation_config = Some(config); - self - } -} - -impl Content { - pub fn user(text: impl Into) -> Self { - Self { - role: Some("user".to_string()), - parts: vec![Part::Text { text: text.into() }], - } - } - - pub fn model(text: impl Into) -> Self { - Self { - role: 
Some("model".to_string()), - parts: vec![Part::Text { text: text.into() }], - } - } - - pub fn user_with_parts(parts: Vec) -> Self { - Self { - role: Some("user".to_string()), - parts, - } - } - - pub fn model_with_parts(parts: Vec) -> Self { - Self { - role: Some("model".to_string()), - parts, - } - } -} - -impl Part { - pub fn text(text: impl Into) -> Self { - Self::Text { text: text.into() } - } - - pub fn image(mime_type: impl Into, data: impl Into) -> Self { - Self::InlineData { - inline_data: InlineData { - mime_type: mime_type.into(), - data: data.into(), - }, - } - } -} diff --git a/crates/rullm-core/src/providers/mod.rs b/crates/rullm-core/src/providers/mod.rs index 71bec32b..21defbae 100644 --- a/crates/rullm-core/src/providers/mod.rs +++ b/crates/rullm-core/src/providers/mod.rs @@ -1,16 +1,13 @@ // New feature-complete provider implementations pub mod anthropic; -pub mod google; pub mod openai; pub mod openai_compatible; // Used for Groq/OpenRouter // Export concrete clients pub use anthropic::AnthropicClient; -pub use google::GoogleClient; pub use openai::OpenAIClient; pub use openai_compatible::{OpenAICompatibleProvider, ProviderIdentity, identities}; // Export provider-specific configs pub use anthropic::AnthropicConfig; -pub use google::GoogleAiConfig; pub use openai_compatible::{OpenAICompatibleConfig, OpenAIConfig}; From 4e4a73b3eb60f68af7c85f7ca86ff91d9d80d6a5 Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 18:56:04 +0530 Subject: [PATCH 08/14] rename rullm-openai to rullm-chat-completion --- crates/{rullm-openai => rullm-chat-completion}/Cargo.toml | 0 .../spec/chat-completion-api.md | 0 .../spec/chat-completion-difference.md | 0 .../spec/chat-completion.md | 0 .../spec/chat-completion2.md | 0 .../spec/implementation-final.md | 0 .../spec/implementation.md | 0 crates/{rullm-openai => rullm-chat-completion}/src/client.rs | 0 crates/{rullm-openai => rullm-chat-completion}/src/config.rs | 0 crates/{rullm-openai => rullm-chat-completion}/src/error.rs | 0 crates/{rullm-openai => rullm-chat-completion}/src/lib.rs | 0 crates/{rullm-openai => rullm-chat-completion}/src/streaming.rs | 0 crates/{rullm-openai => rullm-chat-completion}/src/types.rs | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename crates/{rullm-openai => rullm-chat-completion}/Cargo.toml (100%) rename crates/{rullm-openai => rullm-chat-completion}/spec/chat-completion-api.md (100%) rename crates/{rullm-openai => rullm-chat-completion}/spec/chat-completion-difference.md (100%) rename crates/{rullm-openai => rullm-chat-completion}/spec/chat-completion.md (100%) rename crates/{rullm-openai => rullm-chat-completion}/spec/chat-completion2.md (100%) rename crates/{rullm-openai => rullm-chat-completion}/spec/implementation-final.md (100%) rename crates/{rullm-openai => rullm-chat-completion}/spec/implementation.md (100%) rename crates/{rullm-openai => rullm-chat-completion}/src/client.rs (100%) rename crates/{rullm-openai => rullm-chat-completion}/src/config.rs (100%) rename crates/{rullm-openai => rullm-chat-completion}/src/error.rs (100%) rename crates/{rullm-openai => rullm-chat-completion}/src/lib.rs (100%) rename crates/{rullm-openai => rullm-chat-completion}/src/streaming.rs (100%) rename crates/{rullm-openai => rullm-chat-completion}/src/types.rs (100%) diff --git a/crates/rullm-openai/Cargo.toml b/crates/rullm-chat-completion/Cargo.toml similarity index 100% rename from crates/rullm-openai/Cargo.toml rename to crates/rullm-chat-completion/Cargo.toml diff --git 
a/crates/rullm-openai/spec/chat-completion-api.md b/crates/rullm-chat-completion/spec/chat-completion-api.md similarity index 100% rename from crates/rullm-openai/spec/chat-completion-api.md rename to crates/rullm-chat-completion/spec/chat-completion-api.md diff --git a/crates/rullm-openai/spec/chat-completion-difference.md b/crates/rullm-chat-completion/spec/chat-completion-difference.md similarity index 100% rename from crates/rullm-openai/spec/chat-completion-difference.md rename to crates/rullm-chat-completion/spec/chat-completion-difference.md diff --git a/crates/rullm-openai/spec/chat-completion.md b/crates/rullm-chat-completion/spec/chat-completion.md similarity index 100% rename from crates/rullm-openai/spec/chat-completion.md rename to crates/rullm-chat-completion/spec/chat-completion.md diff --git a/crates/rullm-openai/spec/chat-completion2.md b/crates/rullm-chat-completion/spec/chat-completion2.md similarity index 100% rename from crates/rullm-openai/spec/chat-completion2.md rename to crates/rullm-chat-completion/spec/chat-completion2.md diff --git a/crates/rullm-openai/spec/implementation-final.md b/crates/rullm-chat-completion/spec/implementation-final.md similarity index 100% rename from crates/rullm-openai/spec/implementation-final.md rename to crates/rullm-chat-completion/spec/implementation-final.md diff --git a/crates/rullm-openai/spec/implementation.md b/crates/rullm-chat-completion/spec/implementation.md similarity index 100% rename from crates/rullm-openai/spec/implementation.md rename to crates/rullm-chat-completion/spec/implementation.md diff --git a/crates/rullm-openai/src/client.rs b/crates/rullm-chat-completion/src/client.rs similarity index 100% rename from crates/rullm-openai/src/client.rs rename to crates/rullm-chat-completion/src/client.rs diff --git a/crates/rullm-openai/src/config.rs b/crates/rullm-chat-completion/src/config.rs similarity index 100% rename from crates/rullm-openai/src/config.rs rename to crates/rullm-chat-completion/src/config.rs diff --git a/crates/rullm-openai/src/error.rs b/crates/rullm-chat-completion/src/error.rs similarity index 100% rename from crates/rullm-openai/src/error.rs rename to crates/rullm-chat-completion/src/error.rs diff --git a/crates/rullm-openai/src/lib.rs b/crates/rullm-chat-completion/src/lib.rs similarity index 100% rename from crates/rullm-openai/src/lib.rs rename to crates/rullm-chat-completion/src/lib.rs diff --git a/crates/rullm-openai/src/streaming.rs b/crates/rullm-chat-completion/src/streaming.rs similarity index 100% rename from crates/rullm-openai/src/streaming.rs rename to crates/rullm-chat-completion/src/streaming.rs diff --git a/crates/rullm-openai/src/types.rs b/crates/rullm-chat-completion/src/types.rs similarity index 100% rename from crates/rullm-openai/src/types.rs rename to crates/rullm-chat-completion/src/types.rs From e8677237eb143b0117f47eb32e2c0104244cafb0 Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 19:38:45 +0530 Subject: [PATCH 09/14] refactor: remove rullm-core and use provider crates directly --- Cargo.lock | 152 +----- Cargo.toml | 2 +- crates/rullm-anthropic/src/client.rs | 3 +- crates/rullm-cli/Cargo.toml | 7 +- crates/rullm-cli/src/aliases.rs | 44 +- crates/rullm-cli/src/cli_client.rs | 329 +++++------- crates/rullm-cli/src/client.rs | 5 +- crates/rullm-cli/src/commands/mod.rs | 6 +- crates/rullm-cli/src/error.rs | 41 ++ crates/rullm-cli/src/main.rs | 1 + crates/rullm-core/Cargo.toml | 59 -- crates/rullm-core/examples/README.md | 393 -------------- 
.../rullm-core/examples/anthropic_simple.rs | 142 ----- .../rullm-core/examples/anthropic_stream.rs | 240 --------- crates/rullm-core/examples/basic_usage.rs | 49 -- crates/rullm-core/examples/openai_basic.rs | 50 -- crates/rullm-core/examples/openai_config.rs | 162 ------ .../examples/openai_conversation.rs | 168 ------ crates/rullm-core/examples/openai_simple.rs | 131 ----- crates/rullm-core/examples/openai_stream.rs | 166 ------ .../rullm-core/examples/test_all_providers.rs | 96 ---- crates/rullm-core/src/compat_types.rs | 109 ---- crates/rullm-core/src/config.rs | 168 ------ crates/rullm-core/src/error.rs | 220 -------- crates/rullm-core/src/lib.rs | 172 ------ .../src/providers/anthropic/client.rs | 216 -------- .../src/providers/anthropic/config.rs | 85 --- .../rullm-core/src/providers/anthropic/mod.rs | 32 -- .../src/providers/anthropic/types.rs | 436 --------------- crates/rullm-core/src/providers/mod.rs | 13 - .../rullm-core/src/providers/openai/client.rs | 172 ------ crates/rullm-core/src/providers/openai/mod.rs | 32 -- .../rullm-core/src/providers/openai/types.rs | 502 ------------------ .../src/providers/openai_compatible/config.rs | 111 ---- .../src/providers/openai_compatible/mod.rs | 423 --------------- crates/rullm-core/src/utils/mod.rs | 4 - crates/rullm-core/src/utils/sse.rs | 259 --------- crates/rullm-core/src/utils/test_helpers.rs | 184 ------- 38 files changed, 230 insertions(+), 5154 deletions(-) create mode 100644 crates/rullm-cli/src/error.rs delete mode 100644 crates/rullm-core/Cargo.toml delete mode 100644 crates/rullm-core/examples/README.md delete mode 100644 crates/rullm-core/examples/anthropic_simple.rs delete mode 100644 crates/rullm-core/examples/anthropic_stream.rs delete mode 100644 crates/rullm-core/examples/basic_usage.rs delete mode 100644 crates/rullm-core/examples/openai_basic.rs delete mode 100644 crates/rullm-core/examples/openai_config.rs delete mode 100644 crates/rullm-core/examples/openai_conversation.rs delete mode 100644 crates/rullm-core/examples/openai_simple.rs delete mode 100644 crates/rullm-core/examples/openai_stream.rs delete mode 100644 crates/rullm-core/examples/test_all_providers.rs delete mode 100644 crates/rullm-core/src/compat_types.rs delete mode 100644 crates/rullm-core/src/config.rs delete mode 100644 crates/rullm-core/src/error.rs delete mode 100644 crates/rullm-core/src/lib.rs delete mode 100644 crates/rullm-core/src/providers/anthropic/client.rs delete mode 100644 crates/rullm-core/src/providers/anthropic/config.rs delete mode 100644 crates/rullm-core/src/providers/anthropic/mod.rs delete mode 100644 crates/rullm-core/src/providers/anthropic/types.rs delete mode 100644 crates/rullm-core/src/providers/mod.rs delete mode 100644 crates/rullm-core/src/providers/openai/client.rs delete mode 100644 crates/rullm-core/src/providers/openai/mod.rs delete mode 100644 crates/rullm-core/src/providers/openai/types.rs delete mode 100644 crates/rullm-core/src/providers/openai_compatible/config.rs delete mode 100644 crates/rullm-core/src/providers/openai_compatible/mod.rs delete mode 100644 crates/rullm-core/src/utils/mod.rs delete mode 100644 crates/rullm-core/src/utils/sse.rs delete mode 100644 crates/rullm-core/src/utils/test_helpers.rs diff --git a/Cargo.lock b/Cargo.lock index 9e951be3..71ea4405 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,18 +17,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" -[[package]] -name = 
"ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.3" @@ -131,17 +119,6 @@ dependencies = [ "syn", ] -[[package]] -name = "async-trait" -version = "0.1.88" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "atomic-waker" version = "1.1.2" @@ -1190,16 +1167,6 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" -[[package]] -name = "metrics" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3045b4193fbdc5b5681f32f11070da9be3609f189a79f3390706d42587f46bb5" -dependencies = [ - "ahash", - "portable-atomic", -] - [[package]] name = "mime" version = "0.3.17" @@ -1409,26 +1376,6 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "pin-project" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.2.16" @@ -1447,12 +1394,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "portable-atomic" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" - [[package]] name = "potential_utf" version = "0.1.2" @@ -1495,35 +1436,14 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - [[package]] name = "rand" version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "rand_chacha 0.9.0", - "rand_core 0.9.3", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", + "rand_chacha", + "rand_core", ] [[package]] @@ -1533,16 +1453,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", 
- "rand_core 0.9.3", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", + "rand_core", ] [[package]] @@ -1700,7 +1611,7 @@ dependencies = [ "sync_wrapper 1.0.2", "tokio", "tokio-native-tls", - "tower 0.5.2", + "tower", "tower-http", "tower-service", "url", @@ -1759,6 +1670,7 @@ name = "rullm-cli" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "atty", "base64 0.22.1", "chrono", @@ -1768,16 +1680,18 @@ dependencies = [ "futures", "hex", "owo-colors", - "rand 0.9.2", + "rand", "reedline", "reqwest 0.12.24", - "rullm-core", + "rullm-anthropic", + "rullm-chat-completion", "serde", "serde_json", "sha2", "strum 0.27.2", "strum_macros 0.27.2", "tempfile", + "thiserror 1.0.69", "tokio", "toml", "tracing", @@ -1786,34 +1700,6 @@ dependencies = [ "webbrowser", ] -[[package]] -name = "rullm-core" -version = "0.1.0" -dependencies = [ - "async-stream", - "async-trait", - "bytes", - "clap", - "futures", - "log", - "metrics", - "once_cell", - "rand 0.8.5", - "reqwest 0.11.27", - "serde", - "serde_json", - "strum 0.27.2", - "strum_macros 0.27.2", - "tempfile", - "thiserror 1.0.69", - "tokio", - "tokio-test", - "toml", - "tower 0.4.13", - "tower-service", - "tracing-subscriber", -] - [[package]] name = "rustc-demangle" version = "0.1.26" @@ -2411,23 +2297,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "tower" version = "0.5.2" @@ -2456,7 +2325,7 @@ dependencies = [ "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] @@ -2479,7 +2348,6 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ - "log", "pin-project-lite", "tracing-attributes", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index e12ea588..89944f41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["crates/rullm-core", "crates/rullm-cli", "crates/rullm-openai", "crates/rullm-anthropic"] +members = ["crates/rullm-cli", "crates/rullm-chat-completion", "crates/rullm-anthropic"] resolver = "2" [workspace.package] diff --git a/crates/rullm-anthropic/src/client.rs b/crates/rullm-anthropic/src/client.rs index a281f295..6d89badf 100644 --- a/crates/rullm-anthropic/src/client.rs +++ b/crates/rullm-anthropic/src/client.rs @@ -152,7 +152,8 @@ impl MessagesClient { &self, mut request: MessagesRequest, options: RequestOptions, - ) -> Result> + Unpin>> { + ) -> Result> + Unpin + Send + 'static>> + { // Force streaming request.stream = Some(true); diff --git a/crates/rullm-cli/Cargo.toml b/crates/rullm-cli/Cargo.toml index 96fdf084..4c94efde 100644 --- a/crates/rullm-cli/Cargo.toml +++ b/crates/rullm-cli/Cargo.toml @@ -8,10 +8,12 @@ rust-version.workspace = true default = [] [dependencies] -# Depend on the lib crate -rullm-core = { package 
= "rullm-core", path = "../rullm-core" } +# Provider crates +rullm-anthropic = { path = "../rullm-anthropic" } +rullm-chat-completion = { path = "../rullm-chat-completion" } futures.workspace = true +async-stream.workspace = true # CLI-specific dependencies clap.workspace = true @@ -29,6 +31,7 @@ strum_macros.workspace = true serde_json.workspace = true anyhow.workspace = true chrono.workspace = true +thiserror.workspace = true reedline.workspace = true tempfile.workspace = true diff --git a/crates/rullm-cli/src/aliases.rs b/crates/rullm-cli/src/aliases.rs index 19614730..4d890180 100644 --- a/crates/rullm-cli/src/aliases.rs +++ b/crates/rullm-cli/src/aliases.rs @@ -1,7 +1,7 @@ use crate::constants::ALIASES_CONFIG_FILE; +use crate::error::CliError; use super::provider::Provider; -use rullm_core::error::LlmError; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::Path; @@ -16,7 +16,7 @@ pub struct UserAliasConfig { impl UserAliasConfig { /// Load user aliases from a TOML file - pub fn load_from_file>(path: P) -> Result { + pub fn load_from_file>(path: P) -> Result { let path = path.as_ref(); if !path.exists() { @@ -24,7 +24,7 @@ impl UserAliasConfig { } let content = std::fs::read_to_string(path) - .map_err(|e| LlmError::validation(format!("Failed to read alias config: {e}")))?; + .map_err(|e| CliError::validation(format!("Failed to read alias config: {e}")))?; // Handle empty files gracefully if content.trim().is_empty() { @@ -32,22 +32,22 @@ impl UserAliasConfig { } toml::from_str(&content) - .map_err(|e| LlmError::validation(format!("Failed to parse alias config: {e}"))) + .map_err(|e| CliError::validation(format!("Failed to parse alias config: {e}"))) } /// Save user aliases to a TOML file - pub fn save_to_file>(&self, path: P) -> Result<(), LlmError> { + pub fn save_to_file>(&self, path: P) -> Result<(), CliError> { let path = path.as_ref(); // Create directory if it doesn't exist if let Some(parent) = path.parent() { std::fs::create_dir_all(parent).map_err(|e| { - LlmError::validation(format!("Failed to create config directory: {e}")) + CliError::validation(format!("Failed to create config directory: {e}")) })?; } let content = toml::to_string_pretty(self) - .map_err(|e| LlmError::validation(format!("Failed to serialize alias config: {e}")))?; + .map_err(|e| CliError::validation(format!("Failed to serialize alias config: {e}")))?; // Ensure we always have the [aliases] section even if empty let content = if self.aliases.is_empty() { @@ -57,11 +57,11 @@ impl UserAliasConfig { }; std::fs::write(path, content) - .map_err(|e| LlmError::validation(format!("Failed to write alias config: {e}"))) + .map_err(|e| CliError::validation(format!("Failed to write alias config: {e}"))) } /// Add a new alias - pub fn add_alias(&mut self, alias: &str, target: &str) -> Result<(), LlmError> { + pub fn add_alias(&mut self, alias: &str, target: &str) -> Result<(), CliError> { // Validate the target format Self::validate_target(target)?; @@ -76,21 +76,21 @@ impl UserAliasConfig { } /// Validate that a target is in valid provider:model format - fn validate_target(target: &str) -> Result<(), LlmError> { + fn validate_target(target: &str) -> Result<(), CliError> { if let Some((provider_str, model_name)) = target.split_once(':') { if Provider::from_alias(provider_str).is_none() { - return Err(LlmError::validation(format!( + return Err(CliError::validation(format!( "Invalid provider '{provider_str}' in target '{target}'" ))); } if model_name.trim().is_empty() { - return 
Err(LlmError::validation(format!( + return Err(CliError::validation(format!( "Model name cannot be empty in target '{target}'" ))); } Ok(()) } else { - Err(LlmError::validation(format!( + Err(CliError::validation(format!( "Target '{target}' must be in 'provider:model' format" ))) } @@ -138,9 +138,9 @@ impl AliasResolver { /// 3. Check default aliases /// 4. Try pattern inference /// 5. Error if unresolvable - pub fn resolve(&self, input: &str) -> Result<(Provider, String), LlmError> { + pub fn resolve(&self, input: &str) -> Result<(Provider, String), CliError> { if input.trim().is_empty() { - return Err(LlmError::validation("Input cannot be empty".to_string())); + return Err(CliError::validation("Input cannot be empty".to_string())); } let normalized_input = if self.case_insensitive { @@ -154,7 +154,7 @@ impl AliasResolver { if let Some(provider) = Provider::from_alias(provider_str) { return Ok((provider, model_name.to_string())); } else { - return Err(LlmError::validation(format!( + return Err(CliError::validation(format!( "Unknown provider prefix: '{provider_str}'" ))); } @@ -170,27 +170,27 @@ impl AliasResolver { } /// Parse a target string to (Provider, model) - fn parse_target(&self, target: &str) -> Result<(Provider, String), LlmError> { + fn parse_target(&self, target: &str) -> Result<(Provider, String), CliError> { if let Some((provider_str, model_name)) = target.split_once(':') { if let Some(provider) = Provider::from_alias(provider_str) { Ok((provider, model_name.to_string())) } else { - Err(LlmError::validation(format!( + Err(CliError::validation(format!( "Unknown provider '{provider_str}' in target '{target}'" ))) } } else { - Err(LlmError::validation(format!( + Err(CliError::validation(format!( "Invalid target format '{target}', expected 'provider:model'" ))) } } /// Fallback pattern inference using existing logic - fn infer_from_pattern(&self, input: &str) -> Result<(Provider, String), LlmError> { + fn infer_from_pattern(&self, input: &str) -> Result<(Provider, String), CliError> { // Check if input is just a provider name (should error) if Provider::from_alias(input).is_some() { - return Err(LlmError::validation(format!( + return Err(CliError::validation(format!( "Input '{input}' is a provider name, not a model. Use format 'provider:model' or a specific model alias." ))); } @@ -209,7 +209,7 @@ impl AliasResolver { } } - Err(LlmError::validation(format!( + Err(CliError::validation(format!( "Unable to determine provider for model: '{input}'. Use format 'provider:model' or a recognized alias." ))) } diff --git a/crates/rullm-cli/src/cli_client.rs b/crates/rullm-cli/src/cli_client.rs index 15d37d8a..c7ac565a 100644 --- a/crates/rullm-cli/src/cli_client.rs +++ b/crates/rullm-cli/src/cli_client.rs @@ -3,34 +3,30 @@ //! This module provides a simple enum wrapper for CLI usage that supports //! basic chat operations without exposing the full complexity of each provider's API. 
+use crate::error::CliError; use futures::StreamExt; -use rullm_core::error::LlmError; -use rullm_core::providers::anthropic::AnthropicConfig; -use rullm_core::providers::openai_compatible::{ - OpenAICompatibleConfig, OpenAICompatibleProvider, OpenAIConfig, identities, +use rullm_anthropic::{ + Client as AnthropicClient, Message as AnthropicMessage, MessagesRequest, RequestOptions, + SystemBlock, SystemContent, }; -use rullm_core::providers::{AnthropicClient, OpenAIClient}; +use rullm_chat_completion::{ChatCompletionsClient, ClientConfig, Message as ChatMessage}; use std::pin::Pin; /// Claude Code identification text for OAuth requests const CLAUDE_CODE_SPOOF_TEXT: &str = "You are Claude Code, Anthropic's official CLI for Claude."; /// Prepend Claude Code system block to an existing system prompt (for OAuth requests) -fn prepend_claude_code_system( - existing: Option, -) -> rullm_core::providers::anthropic::SystemPrompt { - use rullm_core::providers::anthropic::{SystemBlock, SystemPrompt}; - +fn prepend_claude_code_system(existing: Option) -> SystemContent { let spoof_block = SystemBlock::text_with_cache(CLAUDE_CODE_SPOOF_TEXT); match existing { - None => SystemPrompt::Blocks(vec![spoof_block]), - Some(SystemPrompt::Text(text)) => { - SystemPrompt::Blocks(vec![spoof_block, SystemBlock::text(text)]) + None => SystemContent::Blocks(vec![spoof_block]), + Some(SystemContent::Text(text)) => { + SystemContent::Blocks(vec![spoof_block, SystemBlock::text(text)]) } - Some(SystemPrompt::Blocks(mut blocks)) => { + Some(SystemContent::Blocks(mut blocks)) => { blocks.insert(0, spoof_block); - SystemPrompt::Blocks(blocks) + SystemContent::Blocks(blocks) } } } @@ -45,7 +41,7 @@ pub struct CliConfig { /// CLI adapter enum that wraps concrete provider clients pub enum CliClient { OpenAI { - client: OpenAIClient, + client: ChatCompletionsClient, model: String, config: CliConfig, }, @@ -56,12 +52,12 @@ pub enum CliClient { is_oauth: bool, }, Groq { - client: OpenAICompatibleProvider, + client: ChatCompletionsClient, model: String, config: CliConfig, }, OpenRouter { - client: OpenAICompatibleProvider, + client: ChatCompletionsClient, model: String, config: CliConfig, }, @@ -73,9 +69,12 @@ impl CliClient { api_key: impl Into, model: impl Into, config: CliConfig, - ) -> Result { - let client_config = OpenAIConfig::new(api_key); - let client = OpenAIClient::new(client_config)?; + ) -> Result { + let client_config = ClientConfig::builder() + .bearer_token(api_key.into()) + .build() + .map_err(|e| CliError::Other(e.to_string()))?; + let client = ChatCompletionsClient::new(client_config)?; Ok(Self::OpenAI { client, model: model.into(), @@ -89,11 +88,16 @@ impl CliClient { model: impl Into, config: CliConfig, use_oauth: bool, - ) -> Result { - let client_config = AnthropicConfig::new(api_key).with_oauth(use_oauth); - let client = AnthropicClient::new(client_config)?; + ) -> Result { + let api_key_str = api_key.into(); + let client_config = if use_oauth { + AnthropicClient::builder().auth_token(api_key_str).build()? + } else { + AnthropicClient::builder().api_key(api_key_str).build()? 
+ }; + let anthropic_client = AnthropicClient::new(client_config)?; Ok(Self::Anthropic { - client, + client: anthropic_client, model: model.into(), config, is_oauth: use_oauth, @@ -105,9 +109,13 @@ impl CliClient { api_key: impl Into, model: impl Into, config: CliConfig, - ) -> Result { - let client_config = OpenAICompatibleConfig::groq(api_key); - let client = OpenAICompatibleProvider::new(client_config, identities::GROQ)?; + ) -> Result { + let client_config = ClientConfig::builder() + .base_url("https://api.groq.com/openai/v1") + .bearer_token(api_key.into()) + .build() + .map_err(|e| CliError::Other(e.to_string()))?; + let client = ChatCompletionsClient::new(client_config)?; Ok(Self::Groq { client, model: model.into(), @@ -120,9 +128,13 @@ impl CliClient { api_key: impl Into, model: impl Into, config: CliConfig, - ) -> Result { - let client_config = OpenAICompatibleConfig::openrouter(api_key); - let client = OpenAICompatibleProvider::new(client_config, identities::OPENROUTER)?; + ) -> Result { + let client_config = ClientConfig::builder() + .base_url("https://openrouter.ai/api/v1") + .bearer_token(api_key.into()) + .build() + .map_err(|e| CliError::Other(e.to_string()))?; + let client = ChatCompletionsClient::new(client_config)?; Ok(Self::OpenRouter { client, model: model.into(), @@ -131,37 +143,38 @@ impl CliClient { } /// Simple chat - send a message and get a response - pub async fn chat(&self, message: &str) -> Result { + pub async fn chat(&self, message: &str) -> Result { match self { Self::OpenAI { client, model, config, + } + | Self::Groq { + client, + model, + config, + } + | Self::OpenRouter { + client, + model, + config, } => { - use rullm_core::providers::openai::{ChatCompletionRequest, ChatMessage}; - - let mut request = - ChatCompletionRequest::new(model, vec![ChatMessage::user(message)]); + let mut builder = client.chat().model(model.as_str()).user(message); if let Some(temp) = config.temperature { - request.temperature = Some(temp); + builder = builder.temperature(temp); } if let Some(max) = config.max_tokens { - request.max_tokens = Some(max); + builder = builder.max_completion_tokens(max); } - let response = client.chat_completion(request).await?; - let content = response - .choices - .first() - .and_then(|c| c.message.content.as_ref()) - .and_then(|c| match c { - rullm_core::providers::openai::MessageContent::Text(t) => Some(t.clone()), - _ => None, - }) - .ok_or_else(|| LlmError::model("No content in response"))?; - - Ok(content) + let response = builder.send().await?; + response + .data + .first_text() + .map(|s| s.to_string()) + .ok_or_else(|| CliError::Other("No content in response".to_string())) } Self::Anthropic { client, @@ -169,58 +182,24 @@ impl CliClient { config, is_oauth, } => { - use rullm_core::providers::anthropic::{Message, MessagesRequest}; - let max_tokens = config.max_tokens.unwrap_or(1024); - let mut request = - MessagesRequest::new(model, vec![Message::user(message)], max_tokens); + let mut builder = MessagesRequest::builder(model.as_str(), max_tokens) + .message(AnthropicMessage::user(message)); if let Some(temp) = config.temperature { - request.temperature = Some(temp); + builder = builder.temperature(temp); } if *is_oauth { - request.system = Some(prepend_claude_code_system(request.system.take())); + builder = builder.system_blocks(prepend_claude_code_system(None).into_blocks()); } - let response = client.messages(request).await?; - let content = response - .content - .iter() - .filter_map(|block| match block { - 
rullm_core::providers::anthropic::ContentBlock::Text { text } => { - Some(text.clone()) - } - _ => None, - }) - .collect::>() - .join(""); - - Ok(content) - } - Self::Groq { - client, - model, - config, - } - | Self::OpenRouter { - client, - model, - config, - } => { - use rullm_core::{ChatRequestBuilder, ChatRole}; - - let mut request = ChatRequestBuilder::new().add_message(ChatRole::User, message); - - if let Some(temp) = config.temperature { - request = request.temperature(temp); - } - if let Some(max) = config.max_tokens { - request = request.max_tokens(max); - } - - let response = client.chat_completion(request.build(), model).await?; - Ok(response.message.content) + let request = builder.build(); + let response = client + .messages() + .create(request, RequestOptions::default()) + .await?; + Ok(response.text()) } } } @@ -229,53 +208,52 @@ impl CliClient { pub async fn stream_chat_raw( &self, messages: Vec<(String, String)>, // (role, content) pairs - ) -> Result> + Send>>, LlmError> + ) -> Result> + Send>>, CliError> { match self { Self::OpenAI { client, model, config, + } + | Self::Groq { + client, + model, + config, + } + | Self::OpenRouter { + client, + model, + config, } => { - use rullm_core::providers::openai::{ChatCompletionRequest, ChatMessage, Role}; - - let msgs: Vec = messages - .iter() - .map(|(role, content)| { - let r = match role.as_str() { - "system" => Role::System, - "user" => Role::User, - "assistant" => Role::Assistant, - _ => Role::User, - }; - ChatMessage { - role: r, - content: Some(rullm_core::providers::openai::MessageContent::Text( - content.clone(), - )), - name: None, - tool_calls: None, - tool_call_id: None, - } - }) - .collect(); + let mut builder = client.chat().model(model.as_str()); + + for (role, content) in &messages { + let msg = match role.as_str() { + "system" => ChatMessage::system(content.as_str()), + "user" => ChatMessage::user(content.as_str()), + "assistant" => ChatMessage::assistant(content.as_str()), + _ => ChatMessage::user(content.as_str()), + }; + builder = builder.message(msg); + } - let mut request = ChatCompletionRequest::new(model, msgs); if let Some(temp) = config.temperature { - request.temperature = Some(temp); + builder = builder.temperature(temp); } if let Some(max) = config.max_tokens { - request.max_tokens = Some(max); + builder = builder.max_completion_tokens(max); } - let stream = client.chat_completion_stream(request).await?; + let stream = builder.stream().await?; Ok(Box::pin(stream.filter_map(|chunk_result| async move { match chunk_result { Ok(chunk) => chunk .choices .first() - .and_then(|choice| choice.delta.content.clone().map(Ok)), - Err(e) => Some(Err(e)), + .and_then(|choice| choice.delta.content.as_ref()) + .map(|content| Ok(content.to_string())), + Err(e) => Some(Err(CliError::ChatCompletion(e))), } }))) } @@ -285,81 +263,46 @@ impl CliClient { config, is_oauth, } => { - use rullm_core::providers::anthropic::{Message, MessagesRequest}; - - let msgs: Vec = messages + let msgs: Vec = messages .iter() - .filter_map(|(role, content)| { - match role.as_str() { - "user" => Some(Message::user(content)), - "assistant" => Some(Message::assistant(content)), - _ => None, // Skip system messages for now - } + .filter_map(|(role, content)| match role.as_str() { + "user" => Some(AnthropicMessage::user(content.as_str())), + "assistant" => Some(AnthropicMessage::assistant(content.as_str())), + _ => None, // Skip system messages for now }) .collect(); let max_tokens = config.max_tokens.unwrap_or(1024); - let mut request = 
MessagesRequest::new(model, msgs, max_tokens); - if let Some(temp) = config.temperature { - request.temperature = Some(temp); - } - - if *is_oauth { - request.system = Some(prepend_claude_code_system(request.system.take())); - } - - let stream = client.messages_stream(request).await?; - Ok(Box::pin(stream.filter_map(|event_result| async move { - match event_result { - Ok(rullm_core::providers::anthropic::StreamEvent::ContentBlockDelta { - delta: rullm_core::providers::anthropic::Delta::TextDelta { text }, - .. - }) => Some(Ok(text)), - Ok(_) => None, - Err(e) => Some(Err(e)), - } - }))) - } - Self::Groq { - client, - model, - config, - } - | Self::OpenRouter { - client, - model, - config, - } => { - use rullm_core::{ChatRequestBuilder, ChatRole, ChatStreamEvent}; - - let mut builder = ChatRequestBuilder::new(); - for (role, content) in messages { - let r = match role.as_str() { - "system" => ChatRole::System, - "user" => ChatRole::User, - "assistant" => ChatRole::Assistant, - _ => ChatRole::User, - }; - builder = builder.add_message(r, content); - } + let mut builder = + MessagesRequest::builder(model.as_str(), max_tokens).messages(msgs.into_iter()); if let Some(temp) = config.temperature { builder = builder.temperature(temp); } - if let Some(max) = config.max_tokens { - builder = builder.max_tokens(max); + + if *is_oauth { + builder = builder.system_blocks(prepend_claude_code_system(None).into_blocks()); } - let stream = client - .chat_completion_stream(builder.build(), model, None) - .await; - Ok(Box::pin(stream.filter_map(|event_result| async move { - match event_result { - Ok(ChatStreamEvent::Token(token)) => Some(Ok(token)), - Ok(_) => None, - Err(e) => Some(Err(e)), + let request = builder.build(); + let messages_client = client.messages(); + let stream = messages_client + .stream(request, RequestOptions::default()) + .await?; + + // Use text_stream() which provides a cleaner interface for text-only streaming + let text_stream = stream.text_stream(); + + Ok(Box::pin(async_stream::stream! 
{ + use std::pin::pin; + let mut stream = pin!(text_stream); + while let Some(result) = stream.next().await { + match result { + Ok(text) => yield Ok(text.to_string()), + Err(e) => yield Err(CliError::Anthropic(e)), + } } - }))) + })) } } } @@ -384,3 +327,17 @@ impl CliClient { } } } + +// Helper trait for SystemContent +trait SystemContentExt { + fn into_blocks(self) -> Vec; +} + +impl SystemContentExt for SystemContent { + fn into_blocks(self) -> Vec { + match self { + SystemContent::Text(text) => vec![SystemBlock::text(text)], + SystemContent::Blocks(blocks) => blocks, + } + } +} diff --git a/crates/rullm-cli/src/client.rs b/crates/rullm-cli/src/client.rs index e1b7856e..e69bac02 100644 --- a/crates/rullm-cli/src/client.rs +++ b/crates/rullm-cli/src/client.rs @@ -2,10 +2,9 @@ use super::provider::Provider; use crate::args::{Cli, CliConfig}; use crate::auth; use crate::cli_client::{CliClient, CliConfig as CoreCliConfig}; +use crate::error::CliError; use anyhow::{Context, Result}; -use rullm_core::LlmError; - pub fn create_client( provider: &Provider, api_key: &str, @@ -13,7 +12,7 @@ pub fn create_client( cli: &Cli, model_name: &str, is_oauth: bool, -) -> Result { +) -> Result { // Build CoreCliConfig based on CLI args let mut config = CoreCliConfig::default(); diff --git a/crates/rullm-cli/src/commands/mod.rs b/crates/rullm-cli/src/commands/mod.rs index db7b491e..3e3e6d32 100644 --- a/crates/rullm-cli/src/commands/mod.rs +++ b/crates/rullm-cli/src/commands/mod.rs @@ -1,9 +1,9 @@ use clap::Subcommand; use crate::cli_client::CliClient; +use crate::error::CliError; use anyhow::Result; use futures::StreamExt; -use rullm_core::LlmError; use std::io::{self, Write}; use crate::spinner::Spinner; @@ -117,7 +117,7 @@ pub async fn run_single_query( query: &str, system_prompt: Option<&str>, streaming: bool, -) -> Result<(), LlmError> { +) -> Result<(), CliError> { if streaming { // Use token-by-token streaming for real-time output if let Some(_system) = system_prompt { @@ -162,7 +162,7 @@ pub async fn run_single_query( print!("{token}"); io::stdout() .flush() - .map_err(|e| LlmError::unknown(e.to_string()))?; + .map_err(|e| CliError::unknown(e.to_string()))?; } Err(err) => { spinner.stop_and_replace(&format!("Error: {err}\n")); diff --git a/crates/rullm-cli/src/error.rs b/crates/rullm-cli/src/error.rs new file mode 100644 index 00000000..a18dd554 --- /dev/null +++ b/crates/rullm-cli/src/error.rs @@ -0,0 +1,41 @@ +//! CLI error types +//! +//! Unified error handling for the CLI that wraps errors from provider crates. + +use std::sync::Arc; +use thiserror::Error; + +use rullm_anthropic::AnthropicError; +use rullm_chat_completion::ClientError; + +/// Main error type for CLI operations +#[derive(Error, Debug)] +pub enum CliError { + /// Error from the Anthropic provider + #[error("Anthropic error: {0}")] + Anthropic(#[from] AnthropicError), + + /// Error from the chat completion provider (OpenAI, Groq, OpenRouter) + #[error("Chat completion error: {0}")] + ChatCompletion(#[from] ClientError), + + /// Validation error (invalid input, configuration, etc.) 
+ #[error("Validation error: {0}")] + Validation(Arc), + + /// Generic error + #[error("{0}")] + Other(String), +} + +impl CliError { + /// Create a validation error + pub fn validation(msg: impl Into>) -> Self { + Self::Validation(msg.into()) + } + + /// Create a generic error + pub fn unknown(msg: impl Into) -> Self { + Self::Other(msg.into()) + } +} diff --git a/crates/rullm-cli/src/main.rs b/crates/rullm-cli/src/main.rs index 3758ce4f..7d3553a7 100644 --- a/crates/rullm-cli/src/main.rs +++ b/crates/rullm-cli/src/main.rs @@ -9,6 +9,7 @@ mod client; mod commands; mod config; mod constants; +mod error; mod oauth; mod output; mod provider; diff --git a/crates/rullm-core/Cargo.toml b/crates/rullm-core/Cargo.toml deleted file mode 100644 index 349d7a4f..00000000 --- a/crates/rullm-core/Cargo.toml +++ /dev/null @@ -1,59 +0,0 @@ -[package] -name = "rullm-core" -version.workspace = true -edition.workspace = true -rust-version.workspace = true - -[dependencies] -tokio.workspace = true -tower.workspace = true -rand.workspace = true -reqwest.workspace = true -bytes.workspace = true -log.workspace = true -serde.workspace = true -serde_json.workspace = true -thiserror.workspace = true -tower-service.workspace = true -async-trait.workspace = true -futures.workspace = true -async-stream.workspace = true -clap.workspace = true -strum.workspace = true -strum_macros.workspace = true -toml.workspace = true -metrics.workspace = true -once_cell.workspace = true - -[dev-dependencies] -tokio-test.workspace = true -tracing-subscriber.workspace = true -tempfile.workspace = true - -[[example]] -name = "openai_basic" -path = "examples/openai_basic.rs" - -[[example]] -name = "openai_simple" -path = "examples/openai_simple.rs" - -[[example]] -name = "openai_config" -path = "examples/openai_config.rs" - -[[example]] -name = "test_all_providers" -path = "examples/test_all_providers.rs" - -[[example]] -name = "anthropic_simple" -path = "examples/anthropic_simple.rs" - -[[example]] -name = "openai_conversation" -path = "examples/openai_conversation.rs" - -[[example]] -name = "basic_usage" -path = "examples/basic_usage.rs" diff --git a/crates/rullm-core/examples/README.md b/crates/rullm-core/examples/README.md deleted file mode 100644 index 970f2987..00000000 --- a/crates/rullm-core/examples/README.md +++ /dev/null @@ -1,393 +0,0 @@ -# LLM Provider Examples - -This directory contains examples demonstrating how to use the OpenAI and Anthropic providers in the LLM library. - -## Prerequisites - -1. **Set up your API keys:** - ```bash - # OpenAI - export OPENAI_API_KEY="sk-your-actual-api-key-here" - export OPENAI_ORGANIZATION="org-123" # Optional - export OPENAI_PROJECT="proj-456" # Optional - export OPENAI_BASE_URL="https://custom-endpoint.com/v1" # Optional - - # Anthropic - export ANTHROPIC_API_KEY="sk-ant-your-actual-api-key" - export ANTHROPIC_BASE_URL="https://custom-endpoint.com" # Optional - ``` - -2. **Install dependencies:** - ```bash - cargo build - ``` - -## Streaming Examples - -The streaming API allows you to receive responses in real-time as tokens are generated, providing a more interactive experience for chat applications. - -### Overview - -All streaming examples use the `chat_completion_stream` method which returns a `StreamResult` - a stream of events including tokens, completion signals, and errors. The streaming API uses the same request builders as regular completions but with `.stream(true)` enabled. - -### 1. 
OpenAI Streaming (`openai_stream.rs`) - -**Run:** `cargo run --example openai_stream` - -**Environment:** Requires `OPENAI_API_KEY` - -Demonstrates OpenAI streaming with: -- **Simple streaming chat** with real-time token display -- **Multi-turn conversations** with context preservation -- **Creative writing** with high temperature settings -- **Error handling** for invalid models and network issues -- **Token counting** and performance metrics - -**Code snippet:** -```rust -let request = ChatRequestBuilder::new() - .system("You are a helpful assistant.") - .user("Tell me a short joke about programming.") - .temperature(0.7) - .max_tokens(100) - .stream(true) // Enable streaming - .build(); - -let mut stream = provider - .chat_completion_stream(request, "gpt-3.5-turbo", None) - .await; - -while let Some(event) = stream.next().await { - match event? { - ChatStreamEvent::Token(token) => { - print!("{}", token); - std::io::Write::flush(&mut std::io::stdout())?; - } - ChatStreamEvent::Done => { - println!("\n✅ Stream completed"); - break; - } - ChatStreamEvent::Error(error) => { - println!("\n❌ Stream error: {}", error); - break; - } - } -} -``` - -**Models used:** `gpt-3.5-turbo`, `gpt-4o-mini`, `gpt-4` - -### 2. Anthropic Claude Streaming (`anthropic_stream.rs`) - -**Run:** `cargo run --example anthropic_stream` - -**Environment:** Requires `ANTHROPIC_API_KEY` - -Shows Claude streaming with: -- **Philosophical conversations** demonstrating reasoning abilities -- **Creative storytelling** with vivid imagery -- **Code explanation** with technical accuracy -- **Model comparison** across Claude variants -- **Word counting** and content analysis - -**Code snippet:** -```rust -let request = ChatRequestBuilder::new() - .system("You are Claude, a helpful and thoughtful AI assistant.") - .user("Explain quantum computing in simple terms.") - .temperature(0.7) - .max_tokens(150) - .stream(true) - .build(); - -let mut stream = provider - .chat_completion_stream(request, "claude-3-haiku-20240307", None) - .await; - -// Handle streaming events... -``` - -**Models used:** `claude-3-haiku-20240307`, `claude-3-sonnet-20240229`, `claude-3-5-sonnet-20241022`, `claude-3-opus-20240229` - -**Temperature settings:** -- Technical content: 0.1-0.4 for accuracy -- Creative content: 0.7-1.0 for variety -- Balanced conversation: 0.6-0.7 - -### Streaming API Patterns - -**Event handling:** -```rust -while let Some(event) = stream.next().await { - match event? 
{ - ChatStreamEvent::Token(token) => { - // Display token immediately - print!("{}", token); - std::io::Write::flush(&mut std::io::stdout())?; - } - ChatStreamEvent::Done => { - // Stream completed successfully - println!("\n✅ Completed"); - break; - } - ChatStreamEvent::Error(error) => { - // Handle stream-specific errors - println!("\n❌ Error: {}", error); - break; - } - } -} -``` - -**Real-time display:** -- Use `print!()` instead of `println!()` for tokens -- Call `std::io::Write::flush()` after each token for immediate display -- Handle partial words and unicode characters gracefully - -**Error handling:** -- Network errors are yielded as `Err(LlmError)` -- API errors come as `ChatStreamEvent::Error(String)` -- Always check for both error types in production code - -**Performance tips:** -- Use faster models (flash variants) for better streaming experience -- Set appropriate `max_tokens` to prevent long responses -- Consider `top_p` parameter for controlled randomness -- Lower temperature (0.1-0.4) for consistent streaming - -### Testing Streaming Examples - -```bash -# Build all examples to verify compilation -cargo build --examples - -# Test individual streaming examples -cargo run --example openai_stream # Requires OPENAI_API_KEY -cargo run --example anthropic_stream # Requires ANTHROPIC_API_KEY - -# Run lint checks -cargo clippy --all-targets --all-features -``` - -## Examples - -### 1. Basic Usage (`openai_basic.rs`) - -**Run:** `cargo run --example openai_basic` - -Demonstrates: -- Using ConfigBuilder for environment-based configuration -- Basic chat completion request -- Token usage tracking - -```rust -// Configuration using ConfigBuilder (recommended) -let config = ConfigBuilder::openai_from_env()?; -let provider = OpenAIProvider::new(config)?; - -// Simple request -let request = ChatRequestBuilder::new() - .system("You are a helpful assistant.") - .user("What is 2+2?") - .temperature(0.7) - .build(); - -let response = provider.chat_completion(request, "gpt-4").await?; -``` - -### 2. Simple Examples (`openai_simple.rs`) - -**Run:** `cargo run --example openai_simple` - -Demonstrates: -- Multiple conversation patterns -- Different models comparison -- Advanced parameter usage (temperature, top_p, penalties) -- Error handling - -Key features: -- **Multi-message conversations** with context -- **Model comparison** between GPT-3.5-turbo and GPT-4o-mini -- **Creative writing** with high temperature settings -- **Parameter experimentation** (frequency_penalty, presence_penalty, top_p) - -### 3. 
Configuration Examples (`openai_config.rs`) - -**Run:** `cargo run --example openai_config` - -Demonstrates: -- Different configuration options -- Organization and project settings -- Custom base URLs (useful for proxies/Azure OpenAI) -- Configuration validation -- Error handling patterns -- Request builder patterns - -Key features: -- **Environment-based configuration** -- **Custom endpoints** for custom API URLs -- **Validation and error handling** -- **Health checks** and model availability -- **Request builder patterns** from minimal to full-featured - -## Usage Patterns - -### Configuration Options - -**Recommended: Use ConfigBuilder for environment-based config:** - -```rust -use llm_core::config::ConfigBuilder; - -// Automatically reads OPENAI_API_KEY, OPENAI_ORGANIZATION, OPENAI_PROJECT, OPENAI_BASE_URL -let config = ConfigBuilder::openai_from_env()?; -let provider = OpenAIProvider::new(config)?; -``` - -**Alternative: Manual configuration:** - -```rust -use llm_core::config::OpenAIConfig; - -// Manual configuration -let config = OpenAIConfig::new("sk-your-key") - .with_organization("org-123") - .with_project("proj-456") - .with_base_url("https://your-custom-endpoint.com/v1"); -``` - -### Request Building - -```rust -// Minimal -let request = ChatRequestBuilder::new("gpt-3.5-turbo") - .user("Hello!") - .build(); - -// Full-featured -let request = ChatRequestBuilder::new("gpt-4") - .system("You are a helpful assistant.") - .user("Question 1") - .assistant("Answer 1") - .user("Question 2") - .temperature(0.7) - .max_tokens(150) - .top_p(0.9) - .frequency_penalty(0.1) - .presence_penalty(0.1) - .stop_sequences(vec!["END".to_string()]) - .build(); -``` - -### Error Handling - -```rust -match provider.chat_completion(request).await { - Ok(response) => { - println!("Response: {}", response.message.content); - println!("Tokens: {}", response.usage.total_tokens); - } - Err(e) => { - eprintln!("Error: {}", e); - // Handle different error types - } -} -``` - -## API Reference - -### Core Types - -- **`OpenAIProvider`** - Main provider implementation -- **`OpenAIConfig`** - Configuration for OpenAI API -- **`ChatRequestBuilder`** - Fluent builder for chat requests -- **`ChatRequest`** - Chat completion request -- **`ChatResponse`** - Chat completion response - -### Key Methods - -- **`provider.chat_completion(request)`** - Send chat completion -- **`provider.health_check()`** - Test API connectivity -- **`config.validate()`** - Validate configuration - -### Supported Models - -- `gpt-4o` -- `gpt-4o-mini` -- `gpt-4-turbo` -- `gpt-4` -- `gpt-3.5-turbo` - -### Parameters - -| Parameter | Type | Description | -|-----------|------|-------------| -| `temperature` | `f32` | Controls randomness (0.0-2.0) | -| `max_tokens` | `u32` | Maximum response length | -| `top_p` | `f32` | Nucleus sampling (0.0-1.0) | -| `frequency_penalty` | `f32` | Reduce repetition (-2.0 to 2.0) | -| `presence_penalty` | `f32` | Encourage new topics (-2.0 to 2.0) | -| `stop` | `Vec` | Stop sequences | - -## Tips - -1. **Start with `openai_simple.rs`** for basic understanding -2. **Use `gpt-3.5-turbo`** for faster/cheaper testing -3. **Set lower temperature** (0.1-0.3) for consistent responses -4. **Set higher temperature** (0.8-1.0) for creative tasks -5. **Use `max_tokens`** to control costs -6. 
**Test with `health_check()`** before making requests - -## Troubleshooting - -**"API key not found"**: Make sure `OPENAI_API_KEY` environment variable is set - -**"Invalid API key"**: Ensure your API key starts with `sk-` and is valid - -**"Rate limit"**: Add delays between requests or reduce concurrency - -**"Model not found"**: Check that the model name is supported (see list above) - -## Test All Providers (`test_all_providers.rs`) - -Comprehensive test that validates all LLM providers with health checks: - -```bash -# Set up your API keys -export OPENAI_API_KEY="sk-..." -export ANTHROPIC_API_KEY="sk-ant-..." - -# Run the comprehensive test -cargo run --example test_all_providers -``` - -**Features:** -- Tests OpenAI and Anthropic providers -- Performs health checks -- Provides detailed success/failure reporting -- Gracefully handles missing API keys - -**Sample Output:** -``` -🚀 Testing All LLM Providers - -🔍 Testing OpenAI Provider... - Health check: ✅ Passed -✅ OpenAI: Health check passed - -📊 SUMMARY: -┌─────────────┬────────┐ -│ Provider │ Status │ -├─────────────┼────────┤ -│ OpenAI │ ✅ Pass │ -│ Anthropic │ ✅ Pass │ -└─────────────┴────────┘ - -🎉 All providers are working correctly! -``` - -Use this example for: -- Verifying your API keys work -- Testing network connectivity -- Validating provider implementations -- CI/CD pipeline health checks diff --git a/crates/rullm-core/examples/anthropic_simple.rs b/crates/rullm-core/examples/anthropic_simple.rs deleted file mode 100644 index 517ac641..00000000 --- a/crates/rullm-core/examples/anthropic_simple.rs +++ /dev/null @@ -1,142 +0,0 @@ -use rullm_core::providers::anthropic::{AnthropicClient, Message, MessagesRequest}; - -#[tokio::main] -async fn main() -> Result<(), Box> { - // 1. Basic Configuration using from_env - let client = AnthropicClient::from_env()?; - - // 2. Simple Chat Completion - let request = MessagesRequest::new( - "claude-3-haiku-20240307", - vec![Message::user("What is 2 + 2?")], - 1024, - ) - .with_system("You are a helpful assistant.") - .with_temperature(0.7); - - let response = client.messages(request).await?; - - // Extract text from content blocks - let text = response - .content - .iter() - .filter_map(|block| match block { - rullm_core::providers::anthropic::ContentBlock::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""); - - println!("🤖 Claude: {}", text); - println!( - "📊 Tokens used: {} input + {} output", - response.usage.input_tokens, response.usage.output_tokens - ); - - // 3. Multi-message conversation - let conversation_request = MessagesRequest::new( - "claude-3-sonnet-20240229", - vec![ - Message::user("What is 5 * 7?"), - Message::assistant("5 * 7 = 35"), - Message::user("What about 6 * 8?"), - ], - 100, - ) - .with_system("You are a helpful math tutor."); - - let conversation_response = client.messages(conversation_request).await?; - - let conversation_text = conversation_response - .content - .iter() - .filter_map(|block| match block { - rullm_core::providers::anthropic::ContentBlock::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""); - - println!("\n💬 Conversation:"); - println!("Claude: {}", conversation_text); - - // 4. 
Different Claude models comparison - let models = [ - "claude-3-haiku-20240307", - "claude-3-sonnet-20240229", - "claude-3-opus-20240229", - ]; - let question = "Explain async/await in one sentence."; - - for model in &models { - let request = - MessagesRequest::new(*model, vec![Message::user(question)], 50).with_temperature(0.5); - - match client.messages(request).await { - Ok(response) => { - let text = response - .content - .iter() - .filter_map(|block| match block { - rullm_core::providers::anthropic::ContentBlock::Text { text } => { - Some(text.as_str()) - } - _ => None, - }) - .collect::>() - .join(""); - - println!("\n🔬 {model} says:"); - println!("{}", text); - } - Err(e) => { - println!("❌ Error with {model}: {e}"); - // Note: Some models might not be available depending on your API access - } - } - } - - // 5. Advanced parameters with Anthropic-specific features - let creative_request = MessagesRequest::new( - "claude-3-5-sonnet-20241022", - vec![Message::user("Write a haiku about programming.")], - 200, - ) - .with_system("You are a creative writer.") - .with_temperature(1.0) // Higher creativity - .with_top_p(0.9); // Nucleus sampling - - let creative_response = client.messages(creative_request).await?; - - let creative_text = creative_response - .content - .iter() - .filter_map(|block| match block { - rullm_core::providers::anthropic::ContentBlock::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""); - - println!("\n🎨 Creative Response:"); - println!("{}", creative_text); - println!("Model: {}", creative_response.model); - if let Some(reason) = creative_response.stop_reason { - println!("Stop reason: {:?}", reason); - } - - // 6. Token estimation - let text = "This is a sample text for token estimation."; - let estimated_tokens = client - .count_tokens("claude-3-haiku-20240307", vec![Message::user(text)], None) - .await?; - println!("\n📏 Estimated tokens for '{text}': {estimated_tokens}"); - - // 7. Health check - match client.health_check().await { - Ok(_) => println!("\n✅ Anthropic API is healthy"), - Err(e) => println!("\n❌ Health check failed: {e}"), - } - - Ok(()) -} diff --git a/crates/rullm-core/examples/anthropic_stream.rs b/crates/rullm-core/examples/anthropic_stream.rs deleted file mode 100644 index cd74fda8..00000000 --- a/crates/rullm-core/examples/anthropic_stream.rs +++ /dev/null @@ -1,240 +0,0 @@ -use futures::StreamExt; -use rullm_core::providers::anthropic::{ - AnthropicClient, Delta, Message, MessagesRequest, StreamEvent, -}; - -#[tokio::main] -async fn main() -> Result<(), Box> { - println!("🔄 Anthropic Claude Streaming Chat Example"); - println!("==========================================\n"); - - // 1. Configuration from environment - // Set ANTHROPIC_API_KEY environment variable before running - let client = AnthropicClient::from_env()?; - - // 2. Simple streaming chat with Claude - println!("💬 Simple streaming chat:"); - let request = MessagesRequest::new( - "claude-3-haiku-20240307", - vec![Message::user("Explain quantum computing in simple terms.")], - 150, - ) - .with_system("You are Claude, a helpful and thoughtful AI assistant.") - .with_temperature(0.7); - - let mut stream = client.messages_stream(request).await?; - - print!("🤖 Claude: "); - while let Some(event_result) = stream.next().await { - match event_result { - Ok(event) => match event { - StreamEvent::ContentBlockDelta { - delta: Delta::TextDelta { text }, - .. 
- } => { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - StreamEvent::MessageStop => { - println!("\n✅ Stream completed successfully"); - break; - } - StreamEvent::Error { error } => { - println!("\n❌ Stream error: {}", error.message); - break; - } - _ => {} // Other events like MessageStart, ContentBlockStart, etc. - }, - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - - // 3. Multi-turn philosophical conversation - println!("\n\n🗨️ Multi-turn philosophical conversation:"); - let conversation_request = MessagesRequest::new( - "claude-3-sonnet-20240229", - vec![ - Message::user("What is consciousness?"), - Message::assistant( - "Consciousness is the subjective experience of being aware - the 'what it's like' quality of experience.", - ), - Message::user("Could an AI ever be truly conscious?"), - ], - 200, - ) - .with_system("You are a philosopher who enjoys exploring deep questions.") - .with_temperature(0.6); - - let mut conversation_stream = client.messages_stream(conversation_request).await?; - - print!("🤖 Philosopher Claude: "); - while let Some(event_result) = conversation_stream.next().await { - match event_result { - Ok(event) => match event { - StreamEvent::ContentBlockDelta { - delta: Delta::TextDelta { text }, - .. - } => { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - StreamEvent::MessageStop => { - println!("\n✅ Philosophical stream completed"); - break; - } - StreamEvent::Error { error } => { - println!("\n❌ Stream error: {}", error.message); - break; - } - _ => {} - }, - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - - // 4. Creative writing with Claude's storytelling capabilities - println!("\n\n🎨 Creative story stream:"); - let creative_request = MessagesRequest::new( - "claude-3-5-sonnet-20241022", - vec![Message::user( - "Write a short story about a lighthouse keeper who discovers something extraordinary.", - )], - 300, - ) - .with_system("You are a master storyteller with a gift for vivid imagery.") - .with_temperature(0.9) // Higher creativity - .with_top_p(0.95); - - let mut creative_stream = client.messages_stream(creative_request).await?; - - print!("✍️ Story: "); - let mut char_count = 0; - while let Some(event_result) = creative_stream.next().await { - match event_result { - Ok(event) => match event { - StreamEvent::ContentBlockDelta { - delta: Delta::TextDelta { text }, - .. - } => { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - char_count += text.len(); - } - StreamEvent::MessageStop => { - println!("\n✅ Story completed (~{char_count} characters)"); - break; - } - StreamEvent::Error { error } => { - println!("\n❌ Stream error: {}", error.message); - break; - } - _ => {} - }, - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - - // 5. 
Code explanation with streaming - println!("\n\n💻 Code explanation stream:"); - let code_request = MessagesRequest::new( - "claude-3-opus-20240229", - vec![Message::user( - "Explain this Rust code step by step:\n\nlet mut v = vec![1, 2, 3];\nv.iter().map(|x| x * 2).collect::>()", - )], - 250, - ) - .with_system("You are a programming mentor who explains code clearly and concisely.") - .with_temperature(0.3); // Lower temperature for technical accuracy - - let mut code_stream = client.messages_stream(code_request).await?; - - print!("🧑‍💻 Mentor: "); - while let Some(event_result) = code_stream.next().await { - match event_result { - Ok(event) => match event { - StreamEvent::ContentBlockDelta { - delta: Delta::TextDelta { text }, - .. - } => { - print!("{text}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - StreamEvent::MessageStop => { - println!("\n✅ Code explanation completed"); - break; - } - StreamEvent::Error { error } => { - println!("\n❌ Stream error: {}", error.message); - break; - } - _ => {} - }, - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - - // 6. Error handling demonstration - println!("\n\n⚠️ Error handling demonstration:"); - let invalid_request = MessagesRequest::new( - "claude-invalid-model", - vec![Message::user("Test with invalid model.")], - 100, - ) - .with_temperature(0.7); - - match client.messages_stream(invalid_request).await { - Ok(mut error_stream) => { - while let Some(event_result) = error_stream.next().await { - match event_result { - Ok(event) => match event { - StreamEvent::ContentBlockDelta { - delta: Delta::TextDelta { text }, - .. - } => { - print!("{text}"); - } - StreamEvent::Error { error } => { - println!("📡 Stream error event (as expected): {}", error.message); - break; - } - StreamEvent::MessageStop => { - println!("Unexpectedly completed"); - break; - } - _ => {} - }, - Err(error) => { - println!("🔴 Request error: {error}"); - break; - } - } - } - } - Err(error) => { - println!("🔴 Request error (as expected): {error}"); - } - } - - println!("\n\n🎯 Tips for using Anthropic Claude streaming:"); - println!("• Set ANTHROPIC_API_KEY environment variable"); - println!("• Process StreamEvent variants: ContentBlockDelta, MessageStop, Error"); - println!("• Extract text from Delta::TextDelta events"); - println!("• Claude models: haiku (fast), sonnet (balanced), opus (largest)"); - println!("• Claude supports reasoning, analysis, and creative writing"); - println!("• Lower temperature (0.1-0.4) for factual content"); - println!("• Higher temperature (0.7-1.0) for creative content"); - - Ok(()) -} diff --git a/crates/rullm-core/examples/basic_usage.rs b/crates/rullm-core/examples/basic_usage.rs deleted file mode 100644 index d5e0f86f..00000000 --- a/crates/rullm-core/examples/basic_usage.rs +++ /dev/null @@ -1,49 +0,0 @@ -use rullm_core::{ChatRequestBuilder, ChatRole}; - -// This example demonstrates the unified interface (compat_types) for OpenAI-compatible providers -// It shows the builder pattern without requiring actual provider implementations - -#[tokio::main] -async fn main() -> Result<(), Box> { - println!("LLM Library - Unified Interface Example (compat_types)"); - println!("======================================================="); - println!(); - println!("This demonstrates the builder pattern for creating OpenAI-compatible requests."); - println!( - "These types are used by OpenAICompatibleProvider for providers like Groq/OpenRouter.\n" - ); - - // This demonstrates the builder pattern for creating requests 
- let request = ChatRequestBuilder::new() - .add_message( - ChatRole::System, - "You are a helpful assistant that provides concise answers", - ) - .add_message(ChatRole::User, "What is the capital of France?") - .temperature(0.7) - .max_tokens(100) - .build(); - - println!("Created chat request:"); - println!(" Messages: {} total", request.messages.len()); - println!(" Temperature: {:?}", request.temperature); - println!(" Max tokens: {:?}", request.max_tokens); - println!(" Stream: {:?}", request.stream); - - for (i, message) in request.messages.iter().enumerate() { - println!( - " Message {}: {:?} - {}", - i + 1, - message.role, - message.content - ); - } - - println!("\n🔍 Key Points:"); - println!(" • These compat_types are minimal types for OpenAI-compatible providers"); - println!(" • For full-featured OpenAI, use OpenAIClient with ChatCompletionRequest"); - println!(" • For Anthropic, use AnthropicClient with MessagesRequest"); - println!("\nSee provider-specific examples (openai_simple, anthropic_simple) for details."); - - Ok(()) -} diff --git a/crates/rullm-core/examples/openai_basic.rs b/crates/rullm-core/examples/openai_basic.rs deleted file mode 100644 index a2ea1504..00000000 --- a/crates/rullm-core/examples/openai_basic.rs +++ /dev/null @@ -1,50 +0,0 @@ -use rullm_core::providers::openai::{ - ChatCompletionRequest, ChatMessage, ContentPart, MessageContent, OpenAIClient, -}; - -// Helper to extract text from MessageContent -fn extract_text(content: &MessageContent) -> String { - match content { - MessageContent::Text(text) => text.clone(), - MessageContent::Parts(parts) => parts - .iter() - .filter_map(|part| match part { - ContentPart::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""), - } -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - // Configure OpenAI client from environment - let client = OpenAIClient::from_env()?; - - // Build a simple chat request - let mut request = ChatCompletionRequest::new( - "gpt-4", - vec![ - ChatMessage::system("You are a helpful assistant that explains concepts clearly."), - ChatMessage::user("What is the difference between async and sync programming?"), - ], - ); - request.temperature = Some(0.7); - request.max_tokens = Some(300); - - // Make the request - let response = client.chat_completion(request).await?; - - println!("Model: {}", response.model); - println!( - "Response: {}", - extract_text(response.choices[0].message.content.as_ref().unwrap()) - ); - println!( - "Token usage - Prompt: {}, Completion: {}, Total: {}", - response.usage.prompt_tokens, response.usage.completion_tokens, response.usage.total_tokens - ); - - Ok(()) -} diff --git a/crates/rullm-core/examples/openai_config.rs b/crates/rullm-core/examples/openai_config.rs deleted file mode 100644 index 20457f8f..00000000 --- a/crates/rullm-core/examples/openai_config.rs +++ /dev/null @@ -1,162 +0,0 @@ -use rullm_core::config::ProviderConfig; -use rullm_core::providers::openai::{ - ChatCompletionRequest, ChatMessage, ContentPart, MessageContent, OpenAIClient, -}; -use rullm_core::providers::openai_compatible::OpenAIConfig; - -// Helper to extract text from MessageContent -fn extract_text(content: &MessageContent) -> String { - match content { - MessageContent::Text(text) => text.clone(), - MessageContent::Parts(parts) => parts - .iter() - .filter_map(|part| match part { - ContentPart::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""), - } -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - 
println!("=== OpenAI Client Configuration Examples ===\n"); - - // 1. Basic configuration - println!("1. Basic Configuration:"); - let basic_config = OpenAIConfig::new("your-api-key-here"); - println!(" Base URL: {}", basic_config.base_url()); - println!(" Timeout: {:?}", basic_config.timeout()); - println!(" Headers: {:?}", basic_config.headers()); - - // 2. Configuration with organization and project - println!("\n2. Configuration with Organization & Project:"); - let org_config = OpenAIConfig::new("your-api-key-here") - .with_organization("org-123") - .with_project("proj-456"); - - let headers = org_config.headers(); - if let Some(org) = headers.get("OpenAI-Organization") { - println!(" Organization: {org}"); - } - if let Some(project) = headers.get("OpenAI-Project") { - println!(" Project: {project}"); - } - - // 3. Configuration with custom base URL (for proxies, Azure OpenAI, etc.) - println!("\n3. Custom Base URL:"); - let custom_config = - OpenAIConfig::new("your-api-key-here").with_base_url("https://your-custom-endpoint.com/v1"); - println!(" Custom Base URL: {}", custom_config.base_url()); - - // 4. Configuration from environment (real example) - println!("\n4. Configuration from Environment:"); - match std::env::var("OPENAI_API_KEY") { - Ok(api_key) => { - let env_config = OpenAIConfig::new(api_key); - - // Validate configuration - match env_config.validate() { - Ok(_) => { - println!(" ✅ Configuration is valid"); - - // Create client and test - match OpenAIClient::new(env_config) { - Ok(client) => { - println!(" ✅ Client created successfully"); - - // Test health check - match client.health_check().await { - Ok(_) => println!(" ✅ Health check passed"), - Err(e) => println!(" ❌ Health check failed: {e}"), - } - - // Make a simple request - println!("\n Testing chat completion..."); - let mut test_request = ChatCompletionRequest::new( - "gpt-3.5-turbo", - vec![ChatMessage::user("Hello, test request")], - ); - test_request.temperature = Some(0.5); - test_request.max_tokens = Some(10); - - match client.chat_completion(test_request).await { - Ok(response) => { - println!( - " ✅ Test response: {}", - extract_text( - response.choices[0].message.content.as_ref().unwrap() - ) - ); - println!(" Tokens used: {}", response.usage.total_tokens); - } - Err(e) => println!(" ❌ Test request failed: {e}"), - } - } - Err(e) => println!(" ❌ Failed to create client: {e}"), - } - } - Err(e) => println!(" ❌ Invalid configuration: {e}"), - } - } - Err(_) => { - println!(" ⚠️ OPENAI_API_KEY not set - skipping real API test"); - println!(" Set OPENAI_API_KEY environment variable to test with real API"); - } - } - - // 5. Error handling examples - println!("\n5. Error Handling Examples:"); - - // Invalid API key format - let invalid_config = OpenAIConfig::new("invalid-key"); - match invalid_config.validate() { - Ok(_) => println!(" Unexpected: validation passed"), - Err(e) => println!(" ✅ Caught invalid API key: {e}"), - } - - // Empty API key - let empty_config = OpenAIConfig::new(""); - match empty_config.validate() { - Ok(_) => println!(" Unexpected: validation passed"), - Err(e) => println!(" ✅ Caught empty API key: {e}"), - } - - // 6. Request construction patterns - println!("\n6. 
Request Construction Patterns:"); - - // Minimal request - let minimal = ChatCompletionRequest::new("gpt-3.5-turbo", vec![ChatMessage::user("Hello")]); - println!(" Minimal request: {} message(s)", minimal.messages.len()); - - // Full-featured request - let mut full_request = ChatCompletionRequest::new( - "gpt-3.5-turbo", - vec![ - ChatMessage::system("You are a helpful assistant."), - ChatMessage::user("What's the weather like?"), - ChatMessage::assistant("I don't have access to current weather data."), - ChatMessage::user("That's okay, what can you help with?"), - ], - ); - full_request.temperature = Some(0.7); - full_request.max_tokens = Some(150); - full_request.top_p = Some(0.9); - full_request.frequency_penalty = Some(0.1); - full_request.presence_penalty = Some(0.1); - full_request.stop = Some(vec!["END".to_string()]); - - println!( - " Full request: {} message(s)", - full_request.messages.len() - ); - println!(" Temperature: {:?}", full_request.temperature); - println!(" Max tokens: {:?}", full_request.max_tokens); - println!(" Top P: {:?}", full_request.top_p); - println!(" Frequency penalty: {:?}", full_request.frequency_penalty); - println!(" Presence penalty: {:?}", full_request.presence_penalty); - println!(" Stop sequences: {:?}", full_request.stop); - - Ok(()) -} diff --git a/crates/rullm-core/examples/openai_conversation.rs b/crates/rullm-core/examples/openai_conversation.rs deleted file mode 100644 index fa630ff3..00000000 --- a/crates/rullm-core/examples/openai_conversation.rs +++ /dev/null @@ -1,168 +0,0 @@ -use rullm_core::providers::openai::{ - ChatCompletionRequest, ChatMessage, ContentPart, MessageContent, OpenAIClient, -}; -use std::io::{self, Write}; - -// Helper to extract text from MessageContent -fn extract_text(content: &MessageContent) -> String { - match content { - MessageContent::Text(text) => text.clone(), - MessageContent::Parts(parts) => parts - .iter() - .filter_map(|part| match part { - ContentPart::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""), - } -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - // Configure OpenAI client from environment - let client = OpenAIClient::from_env()?; - - // Health check - match client.health_check().await { - Ok(_) => println!("✅ Client is healthy\n"), - Err(e) => { - println!("❌ Health check failed: {e}"); - return Ok(()); - } - } - - println!("=== Multi-turn Conversation Example ==="); - println!("Type 'quit' or 'exit' to end the conversation\n"); - - // Start with system message and context - let mut conversation: Vec = vec![ChatMessage::system( - "You are a helpful programming assistant. 
Keep responses concise but informative.", - )]; - - // Interactive conversation loop - loop { - print!("\nYou: "); - io::stdout().flush()?; - - let mut input = String::new(); - io::stdin().read_line(&mut input)?; - let user_input = input.trim(); - - if user_input.is_empty() || user_input == "quit" || user_input == "exit" { - break; - } - - conversation.push(ChatMessage::user(user_input)); - - // Build request from conversation history - let mut request = ChatCompletionRequest::new("gpt-4o-mini", conversation.clone()); - request.temperature = Some(0.7); - request.max_tokens = Some(500); - - print!("Assistant: "); - io::stdout().flush()?; - - match client.chat_completion(request).await { - Ok(response) => { - let assistant_content = response.choices[0].message.content.as_ref().unwrap(); - - // Extract text from MessageContent - let assistant_text = match assistant_content { - rullm_core::providers::openai::MessageContent::Text(text) => text.clone(), - rullm_core::providers::openai::MessageContent::Parts(parts) => parts - .iter() - .filter_map(|part| match part { - rullm_core::providers::openai::ContentPart::Text { text } => { - Some(text.as_str()) - } - _ => None, - }) - .collect::>() - .join(""), - }; - - println!("{}", assistant_text); - - // Add assistant response to conversation history - conversation.push(ChatMessage::assistant(assistant_text)); - - // Show token usage - println!( - "\n📊 Tokens used: {} prompt + {} completion = {} total", - response.usage.prompt_tokens, - response.usage.completion_tokens, - response.usage.total_tokens - ); - } - Err(e) => { - println!("Error: {e}"); - } - } - } - - println!("\n=== Advanced Configuration Example ==="); - - // Example with different models and parameters - let models_to_test = ["gpt-3.5-turbo", "gpt-4o-mini"]; - let question = "Explain the concept of ownership in Rust in one sentence."; - - for &model in &models_to_test { - println!("\n🤖 Testing model: {model}"); - - let mut request = ChatCompletionRequest::new( - model, - vec![ - ChatMessage::system("You are a concise technical writer."), - ChatMessage::user(question), - ], - ); - request.temperature = Some(0.3); // Lower temperature for more consistent responses - request.max_tokens = Some(100); // Limit response length - request.top_p = Some(0.9); // Nucleus sampling - request.frequency_penalty = Some(0.1); // Reduce repetition - request.presence_penalty = Some(0.1); // Encourage diverse topics - - match client.chat_completion(request).await { - Ok(response) => { - println!( - "Response: {}", - extract_text(response.choices[0].message.content.as_ref().unwrap()) - ); - println!("Tokens: {}", response.usage.total_tokens); - } - Err(e) => { - println!("Error with {model}: {e}"); - } - } - } - - println!("\n=== Stop Sequences Example ==="); - - // Example using stop sequences - let mut request = ChatCompletionRequest::new( - "gpt-3.5-turbo", - vec![ - ChatMessage::system("You are a code generator. 
Always end code blocks with '// END'"), - ChatMessage::user("Write a simple hello world function in Rust"), - ], - ); - request.stop = Some(vec!["// END".to_string()]); // Stop generation at this sequence - request.temperature = Some(0.5); - - match client.chat_completion(request).await { - Ok(response) => { - println!("Code generation (stopped at '// END'):"); - println!( - "{}", - extract_text(response.choices[0].message.content.as_ref().unwrap()) - ); - println!("Finish reason: {}", response.choices[0].finish_reason); - } - Err(e) => { - println!("Error: {e}"); - } - } - - Ok(()) -} diff --git a/crates/rullm-core/examples/openai_simple.rs b/crates/rullm-core/examples/openai_simple.rs deleted file mode 100644 index aa8c6c9f..00000000 --- a/crates/rullm-core/examples/openai_simple.rs +++ /dev/null @@ -1,131 +0,0 @@ -use rullm_core::providers::openai::{ - ChatCompletionRequest, ChatMessage, ContentPart, MessageContent, OpenAIClient, -}; - -// Helper to extract text from MessageContent -fn extract_text(content: &MessageContent) -> String { - match content { - MessageContent::Text(text) => text.clone(), - MessageContent::Parts(parts) => parts - .iter() - .filter_map(|part| match part { - ContentPart::Text { text } => Some(text.as_str()), - _ => None, - }) - .collect::>() - .join(""), - } -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - // 1. Basic Configuration using from_env - let client = OpenAIClient::from_env()?; - - // 2. Simple Chat Completion - let request = ChatCompletionRequest::new( - "gpt-3.5-turbo", - vec![ - ChatMessage::system("You are a helpful assistant."), - ChatMessage::user("What is 2 + 2?"), - ], - ); - - let response = client.chat_completion(request).await?; - - println!( - "🤖 Assistant: {}", - extract_text(response.choices[0].message.content.as_ref().unwrap()) - ); - println!("📊 Tokens used: {}", response.usage.total_tokens); - - // 3. Multi-message conversation - let mut conversation_request = ChatCompletionRequest::new( - "gpt-4o-mini", - vec![ - ChatMessage::system("You are a helpful math tutor."), - ChatMessage::user("What is 5 * 7?"), - ChatMessage::assistant("5 * 7 = 35"), - ChatMessage::user("What about 6 * 8?"), - ], - ); - conversation_request.max_tokens = Some(100); - - let conversation_response = client.chat_completion(conversation_request).await?; - - println!("\n💬 Conversation:"); - println!( - "Assistant: {}", - extract_text( - conversation_response.choices[0] - .message - .content - .as_ref() - .unwrap() - ) - ); - - // 4. Different models comparison - let models = ["gpt-3.5-turbo", "gpt-4o-mini"]; - let question = "Explain async/await in one sentence."; - - for model in &models { - let mut request = ChatCompletionRequest::new(*model, vec![ChatMessage::user(question)]); - request.temperature = Some(0.5); - request.max_tokens = Some(50); - - match client.chat_completion(request).await { - Ok(response) => { - println!("\n🔬 {model} says:"); - println!( - "{}", - extract_text(response.choices[0].message.content.as_ref().unwrap()) - ); - } - Err(e) => { - println!("❌ Error with {model}: {e}"); - } - } - } - - // 5. 
Advanced parameters - let mut creative_request = ChatCompletionRequest::new( - "gpt-4", - vec![ - ChatMessage::system("You are a creative writer."), - ChatMessage::user("Write a haiku about programming."), - ], - ); - creative_request.temperature = Some(1.0); // Higher creativity - creative_request.top_p = Some(0.9); // Nucleus sampling - // creative_request.frequency_penalty = Some(0.2); // Reduce repetition - // creative_request.presence_penalty = Some(0.2); // Encourage diverse topics - // creative_request.stop = Some(vec!["END".to_string(), "STOP".to_string()]); // Stop sequences - - let creative_response = client.chat_completion(creative_request).await?; - - println!("\n🎨 Creative Response:"); - println!( - "{}", - extract_text( - creative_response.choices[0] - .message - .content - .as_ref() - .unwrap() - ) - ); - println!("Model: {}", creative_response.model); - println!( - "Finish reason: {}", - creative_response.choices[0].finish_reason - ); - - // 6. Health check - match client.health_check().await { - Ok(_) => println!("\n✅ OpenAI API is healthy"), - Err(e) => println!("\n❌ Health check failed: {e}"), - } - - Ok(()) -} diff --git a/crates/rullm-core/examples/openai_stream.rs b/crates/rullm-core/examples/openai_stream.rs deleted file mode 100644 index 9d4fc3e9..00000000 --- a/crates/rullm-core/examples/openai_stream.rs +++ /dev/null @@ -1,166 +0,0 @@ -use futures::StreamExt; -use rullm_core::providers::openai::{ChatCompletionRequest, ChatMessage, OpenAIClient}; - -#[tokio::main] -async fn main() -> Result<(), Box> { - println!("🔄 OpenAI Streaming Chat Example"); - println!("================================\n"); - - // 1. Configuration from environment - // Set OPENAI_API_KEY environment variable before running - let client = OpenAIClient::from_env()?; - - // 2. Simple streaming chat - println!("💬 Simple streaming chat:"); - let mut request = ChatCompletionRequest::new( - "gpt-3.5-turbo", - vec![ - ChatMessage::system("You are a helpful assistant."), - ChatMessage::user("Tell me a short joke about programming."), - ], - ); - request.temperature = Some(0.7); - request.max_tokens = Some(100); - request.stream = Some(true); // Enable streaming - - let mut stream = client.chat_completion_stream(request).await?; - - print!("🤖 Assistant: "); - while let Some(chunk_result) = stream.next().await { - match chunk_result { - Ok(chunk) => { - if let Some(choice) = chunk.choices.first() { - if let Some(content) = &choice.delta.content { - print!("{content}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - } - } - Err(e) => { - println!("\n❌ Stream error: {e}"); - break; - } - } - } - println!("\n✅ Stream completed successfully"); - - // 3. Multi-turn conversation streaming - println!("\n\n🗨️ Multi-turn conversation streaming:"); - let mut conversation_request = ChatCompletionRequest::new( - "gpt-4o-mini", - vec![ - ChatMessage::system("You are a coding tutor. 
Give concise explanations."), - ChatMessage::user("What is async/await?"), - ChatMessage::assistant( - "Async/await is a pattern for writing asynchronous code that looks synchronous.", - ), - ChatMessage::user("Can you give a simple example in Rust?"), - ], - ); - conversation_request.temperature = Some(0.5); - conversation_request.max_tokens = Some(150); - conversation_request.stream = Some(true); - - let mut conversation_stream = client.chat_completion_stream(conversation_request).await?; - - print!("🤖 Tutor: "); - while let Some(chunk_result) = conversation_stream.next().await { - match chunk_result { - Ok(chunk) => { - if let Some(choice) = chunk.choices.first() { - if let Some(content) = &choice.delta.content { - print!("{content}"); - std::io::Write::flush(&mut std::io::stdout())?; - } - } - } - Err(e) => { - println!("\n❌ Conversation stream error: {e}"); - break; - } - } - } - println!("\n✅ Conversation stream completed"); - - // 4. Creative writing with higher temperature - println!("\n\n🎨 Creative writing stream (high temperature):"); - let mut creative_request = ChatCompletionRequest::new( - "gpt-4", - vec![ - ChatMessage::system("You are a creative writer."), - ChatMessage::user("Write a very short story about a robot learning to dream."), - ], - ); - creative_request.temperature = Some(1.0); // Higher creativity - creative_request.top_p = Some(0.9); - creative_request.max_tokens = Some(200); - creative_request.stream = Some(true); - - let mut creative_stream = client.chat_completion_stream(creative_request).await?; - - print!("✍️ Story: "); - let mut token_count = 0; - while let Some(chunk_result) = creative_stream.next().await { - match chunk_result { - Ok(chunk) => { - if let Some(choice) = chunk.choices.first() { - if let Some(content) = &choice.delta.content { - print!("{content}"); - std::io::Write::flush(&mut std::io::stdout())?; - token_count += 1; - } - } - } - Err(e) => { - // Stream complete is returned as an error - if e.to_string().contains("Stream complete") { - break; - } - println!("\n❌ Creative stream error: {e}"); - break; - } - } - } - println!("\n✅ Creative stream completed ({token_count} chunks received)"); - - // 5. 
Error handling demonstration - println!("\n\n⚠️ Error handling demonstration:"); - let mut invalid_request = ChatCompletionRequest::new( - "invalid-model-name", - vec![ChatMessage::user("This request has an invalid model test.")], - ); - invalid_request.temperature = Some(0.7); - invalid_request.stream = Some(true); - - match client.chat_completion_stream(invalid_request).await { - Ok(mut error_stream) => { - while let Some(chunk_result) = error_stream.next().await { - match chunk_result { - Ok(chunk) => { - if let Some(choice) = chunk.choices.first() { - if let Some(content) = &choice.delta.content { - print!("{content}"); - } - } - } - Err(error) => { - println!("📡 Stream error (as expected): {error}"); - break; - } - } - } - } - Err(error) => { - println!("🔴 Request error (as expected): {error}"); - } - } - - println!("\n\n🎯 Tips for using OpenAI streaming:"); - println!("• Set OPENAI_API_KEY environment variable"); - println!("• Set request.stream = Some(true) to enable streaming"); - println!("• Process ChatCompletionChunk deltas as they arrive"); - println!("• Flush stdout for real-time output"); - println!("• Consider using lower max_tokens for faster streaming"); - - Ok(()) -} diff --git a/crates/rullm-core/examples/test_all_providers.rs b/crates/rullm-core/examples/test_all_providers.rs deleted file mode 100644 index 5cbdae59..00000000 --- a/crates/rullm-core/examples/test_all_providers.rs +++ /dev/null @@ -1,96 +0,0 @@ -use rullm_core::providers::anthropic::{AnthropicClient, AnthropicConfig}; -use rullm_core::providers::openai::OpenAIClient; -use rullm_core::providers::openai_compatible::OpenAIConfig; -use std::env; - -#[tokio::main] -async fn main() -> Result<(), Box> { - println!("🚀 Testing All LLM Providers\n"); - - // Test results tracking - let mut results = Vec::new(); - - // 1. Test OpenAI Provider - println!("🔍 Testing OpenAI Provider..."); - match test_openai_provider().await { - Ok(()) => { - println!("✅ OpenAI: Health check passed"); - results.push(("OpenAI", true)); - } - Err(e) => { - println!("❌ OpenAI: Failed - {e}"); - results.push(("OpenAI", false)); - } - } - println!(); - - // 2. Test Anthropic Provider - println!("🔍 Testing Anthropic Provider..."); - match test_anthropic_provider().await { - Ok(()) => { - println!("✅ Anthropic: Health check passed"); - results.push(("Anthropic", true)); - } - Err(e) => { - println!("❌ Anthropic: Failed - {e}"); - results.push(("Anthropic", false)); - } - } - println!(); - - // Summary - println!("📊 SUMMARY:"); - println!("┌─────────────┬────────┐"); - println!("│ Provider │ Status │"); - println!("├─────────────┼────────┤"); - for (provider, success) in &results { - let status = if *success { "✅ Pass" } else { "❌ Fail" }; - println!("│ {provider:11} │ {status:6} │"); - } - println!("└─────────────┴────────┘"); - - let successful_providers = results.iter().filter(|(_, success)| *success).count(); - let total_providers = results.len(); - - if successful_providers == total_providers { - println!("\n🎉 All providers are working correctly!"); - } else { - println!( - "\n⚠️ {successful_providers}/{total_providers} providers working. Check API keys and network connectivity." 
- ); - } - - Ok(()) -} - -async fn test_openai_provider() -> Result<(), Box> { - let api_key = - env::var("OPENAI_API_KEY").map_err(|_| "OPENAI_API_KEY environment variable not set")?; - - let config = OpenAIConfig::new(api_key); - let client = OpenAIClient::new(config)?; - - // Test health check - match client.health_check().await { - Ok(_) => println!(" Health check: ✅ Passed"), - Err(e) => println!(" Health check: ⚠️ Warning - {e}"), - } - - Ok(()) -} - -async fn test_anthropic_provider() -> Result<(), Box> { - let api_key = env::var("ANTHROPIC_API_KEY") - .map_err(|_| "ANTHROPIC_API_KEY environment variable not set")?; - - let config = AnthropicConfig::new(api_key); - let client = AnthropicClient::new(config)?; - - // Test health check - match client.health_check().await { - Ok(_) => println!(" Health check: ✅ Passed"), - Err(e) => println!(" Health check: ⚠️ Warning - {e}"), - } - - Ok(()) -} diff --git a/crates/rullm-core/src/compat_types.rs b/crates/rullm-core/src/compat_types.rs deleted file mode 100644 index 29dda7d9..00000000 --- a/crates/rullm-core/src/compat_types.rs +++ /dev/null @@ -1,109 +0,0 @@ -//! Compatibility types for OpenAI-compatible provider -//! -//! These are minimal types to support Groq/OpenRouter through the OpenAICompatibleProvider. -//! For full-featured usage, use the concrete provider clients directly. - -use serde::{Deserialize, Serialize}; - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase")] -pub enum ChatRole { - System, - User, - Assistant, - Tool, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatMessage { - pub role: ChatRole, - pub content: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatRequest { - pub messages: Vec, - pub temperature: Option, - pub max_tokens: Option, - pub top_p: Option, - pub stream: Option, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatResponse { - pub message: ChatMessage, - pub model: String, - pub usage: TokenUsage, - pub finish_reason: Option, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TokenUsage { - pub prompt_tokens: u32, - pub completion_tokens: u32, - pub total_tokens: u32, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum ChatStreamEvent { - Token(String), - Done, - Error(String), -} - -pub struct ChatRequestBuilder { - messages: Vec, - temperature: Option, - max_tokens: Option, - top_p: Option, -} - -impl ChatRequestBuilder { - pub fn new() -> Self { - Self { - messages: Vec::new(), - temperature: None, - max_tokens: None, - top_p: None, - } - } - - pub fn add_message(mut self, role: ChatRole, content: impl Into) -> Self { - self.messages.push(ChatMessage { - role, - content: content.into(), - }); - self - } - - pub fn temperature(mut self, temperature: f32) -> Self { - self.temperature = Some(temperature); - self - } - - pub fn max_tokens(mut self, max_tokens: u32) -> Self { - self.max_tokens = Some(max_tokens); - self - } - - pub fn top_p(mut self, top_p: f32) -> Self { - self.top_p = Some(top_p); - self - } - - pub fn build(self) -> ChatRequest { - ChatRequest { - messages: self.messages, - temperature: self.temperature, - max_tokens: self.max_tokens, - top_p: self.top_p, - stream: Some(false), - } - } -} - -impl Default for ChatRequestBuilder { - fn default() -> Self { - Self::new() - } -} diff --git a/crates/rullm-core/src/config.rs b/crates/rullm-core/src/config.rs deleted file mode 100644 index 47f6de8e..00000000 --- a/crates/rullm-core/src/config.rs +++ 
/dev/null @@ -1,168 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::time::Duration; - -use crate::providers::{AnthropicConfig, OpenAICompatibleConfig, OpenAIConfig}; - -/// Configuration trait for LLM providers -pub trait ProviderConfig: Send + Sync { - /// Get the API key for this provider - fn api_key(&self) -> &str; - - /// Get the base URL for API requests - fn base_url(&self) -> &str; - - /// Get default request timeout - fn timeout(&self) -> Duration; - - /// Get any additional headers required by the provider - fn headers(&self) -> HashMap; - - /// Validate the configuration - fn validate(&self) -> Result<(), crate::error::LlmError>; -} - -/// Generic configuration for HTTP-based providers -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HttpProviderConfig { - pub api_key: String, - pub base_url: String, - pub timeout_seconds: u64, - pub headers: HashMap, -} - -impl HttpProviderConfig { - pub fn new(api_key: impl Into, base_url: impl Into) -> Self { - Self { - api_key: api_key.into(), - base_url: base_url.into(), - timeout_seconds: 30, - headers: HashMap::new(), - } - } - - pub fn with_timeout(mut self, timeout_seconds: u64) -> Self { - self.timeout_seconds = timeout_seconds; - self - } - - pub fn with_header(mut self, key: impl Into, value: impl Into) -> Self { - self.headers.insert(key.into(), value.into()); - self - } -} - -impl ProviderConfig for HttpProviderConfig { - fn api_key(&self) -> &str { - &self.api_key - } - - fn base_url(&self) -> &str { - &self.base_url - } - - fn timeout(&self) -> Duration { - Duration::from_secs(self.timeout_seconds) - } - - fn headers(&self) -> HashMap { - self.headers.clone() - } - - fn validate(&self) -> Result<(), crate::error::LlmError> { - if self.api_key.is_empty() { - return Err(crate::error::LlmError::configuration("API key is required")); - } - - if self.base_url.is_empty() { - return Err(crate::error::LlmError::configuration( - "Base URL is required", - )); - } - - if !self.base_url.starts_with("http://") && !self.base_url.starts_with("https://") { - return Err(crate::error::LlmError::configuration( - "Base URL must be a valid HTTP/HTTPS URL", - )); - } - - Ok(()) - } -} - -/// Configuration builder for creating provider configs from environment variables -pub struct ConfigBuilder; - -impl ConfigBuilder { - /// Create OpenAI config from environment - pub fn openai_from_env() -> Result { - let api_key = std::env::var("OPENAI_API_KEY").map_err(|_| { - crate::error::LlmError::configuration("OPENAI_API_KEY environment variable not set") - })?; - - let mut config = OpenAIConfig::new(api_key); - - if let Ok(org) = std::env::var("OPENAI_ORGANIZATION") { - config = config.with_organization(org); - } - - if let Ok(project) = std::env::var("OPENAI_PROJECT") { - config = config.with_project(project); - } - - if let Ok(base_url) = std::env::var("OPENAI_BASE_URL") { - config = config.with_base_url(base_url); - } - - config.validate()?; - Ok(config) - } - - /// Create Anthropic config from environment - pub fn anthropic_from_env() -> Result { - let api_key = std::env::var("ANTHROPIC_API_KEY").map_err(|_| { - crate::error::LlmError::configuration("ANTHROPIC_API_KEY environment variable not set") - })?; - - let mut config = AnthropicConfig::new(api_key); - - if let Ok(base_url) = std::env::var("ANTHROPIC_BASE_URL") { - config = config.with_base_url(base_url); - } - - config.validate()?; - Ok(config) - } - - /// Create Groq config from environment - pub fn groq_from_env() -> Result { - let api_key = 
std::env::var("GROQ_API_KEY").map_err(|_| { - crate::error::LlmError::configuration("GROQ_API_KEY environment variable not set") - })?; - - let mut config = OpenAICompatibleConfig::groq(api_key); - - if let Ok(base_url) = std::env::var("GROQ_BASE_URL") { - config = config.with_base_url(base_url); - } - - config.validate()?; - Ok(config) - } - - /// Create OpenRouter config from environment - pub fn openrouter_from_env() -> Result { - let api_key = std::env::var("OPENROUTER_API_KEY").map_err(|_| { - crate::error::LlmError::configuration("OPENROUTER_API_KEY environment variable not set") - })?; - - let mut config = OpenAICompatibleConfig::openrouter(api_key); - - if let Ok(base_url) = std::env::var("OPENROUTER_BASE_URL") { - config = config.with_base_url(base_url); - } - - config.validate()?; - Ok(config) - } -} diff --git a/crates/rullm-core/src/error.rs b/crates/rullm-core/src/error.rs deleted file mode 100644 index 702299cd..00000000 --- a/crates/rullm-core/src/error.rs +++ /dev/null @@ -1,220 +0,0 @@ -use std::collections::HashMap; -use thiserror::Error; - -/// Main error type for the LLM library -#[derive(Error, Debug)] -pub enum LlmError { - /// Network-related errors - #[error("Network error: {message}")] - Network { - message: String, - #[source] - source: Option>, - }, - - /// Authentication errors - #[error("Authentication failed: {message}")] - Authentication { message: String }, - - /// Rate limiting errors - #[error("Rate limit exceeded: {message}. Retry after: {retry_after:?}")] - RateLimit { - message: String, - retry_after: Option, - }, - - /// Provider-specific API errors - #[error("API error from {provider}: {message} (code: {code:?})")] - Api { - provider: String, - message: String, - code: Option, - details: Option>, - }, - - /// Configuration errors - #[error("Configuration error: {message}")] - Configuration { message: String }, - - /// Validation errors for requests - #[error("Validation error: {message}")] - Validation { message: String }, - - /// Timeout errors - #[error("Request timed out after {duration:?}")] - Timeout { duration: std::time::Duration }, - - /// Serialization/deserialization errors - #[error("Serialization error: {message}")] - Serialization { - message: String, - #[source] - source: Box, - }, - - /// Model-specific errors (model not found, unsupported, etc.) - #[error("Model error: {message}")] - Model { message: String }, - - /// Resource errors (quota exceeded, insufficient credits, etc.) 
- #[error("Resource error: {message}")] - Resource { message: String }, - - /// Provider service unavailable - #[error("Service unavailable: {provider} is currently unavailable")] - ServiceUnavailable { provider: String }, - - /// Generic errors for cases not covered above - #[error("Unexpected error: {message}")] - Unknown { - message: String, - #[source] - source: Option>, - }, -} - -impl LlmError { - /// Create a network error - pub fn network(message: impl Into) -> Self { - Self::Network { - message: message.into(), - source: None, - } - } - - /// Create a network error with source - pub fn network_with_source( - message: impl Into, - source: impl Into>, - ) -> Self { - Self::Network { - message: message.into(), - source: Some(source.into()), - } - } - - /// Create an authentication error - pub fn authentication(message: impl Into) -> Self { - Self::Authentication { - message: message.into(), - } - } - - /// Create a rate limit error - pub fn rate_limit( - message: impl Into, - retry_after: Option, - ) -> Self { - Self::RateLimit { - message: message.into(), - retry_after, - } - } - - /// Create an API error - pub fn api( - provider: impl Into, - message: impl Into, - code: Option, - details: Option>, - ) -> Self { - Self::Api { - provider: provider.into(), - message: message.into(), - code, - details, - } - } - - /// Create a configuration error - pub fn configuration(message: impl Into) -> Self { - Self::Configuration { - message: message.into(), - } - } - - /// Create a validation error - pub fn validation(message: impl Into) -> Self { - Self::Validation { - message: message.into(), - } - } - - /// Create a timeout error - pub fn timeout(duration: std::time::Duration) -> Self { - Self::Timeout { duration } - } - - /// Create a serialization error - pub fn serialization( - message: impl Into, - source: impl Into>, - ) -> Self { - Self::Serialization { - message: message.into(), - source: source.into(), - } - } - - /// Create a model error - pub fn model(message: impl Into) -> Self { - Self::Model { - message: message.into(), - } - } - - /// Create a resource error - pub fn resource(message: impl Into) -> Self { - Self::Resource { - message: message.into(), - } - } - - /// Create a service unavailable error - pub fn service_unavailable(provider: impl Into) -> Self { - Self::ServiceUnavailable { - provider: provider.into(), - } - } - - /// Create an unknown error - pub fn unknown(message: impl Into) -> Self { - Self::Unknown { - message: message.into(), - source: None, - } - } - - /// Create an unknown error with source - pub fn unknown_with_source( - message: impl Into, - source: impl Into>, - ) -> Self { - Self::Unknown { - message: message.into(), - source: Some(source.into()), - } - } -} - -/// Convert from reqwest errors -impl From for LlmError { - fn from(err: reqwest::Error) -> Self { - if err.is_timeout() { - LlmError::timeout(std::time::Duration::from_secs(30)) // Default timeout - } else if err.is_connect() { - LlmError::network_with_source("Connection failed", err) - } else if err.is_request() { - LlmError::validation(format!("Invalid request: {err}")) - } else { - LlmError::network_with_source("HTTP request failed", err) - } - } -} - -/// Convert from serde_json errors -impl From for LlmError { - fn from(err: serde_json::Error) -> Self { - LlmError::serialization("JSON serialization failed", err) - } -} diff --git a/crates/rullm-core/src/lib.rs b/crates/rullm-core/src/lib.rs deleted file mode 100644 index 28892c91..00000000 --- a/crates/rullm-core/src/lib.rs +++ /dev/null 
@@ -1,172 +0,0 @@ -//! # rullm-core - Rust LLM Library -//! -//! A Rust library for interacting with Large Language Models (LLMs). -//! Built with Tower middleware, featuring rate limiting and error handling. -//! -//! ## Features -//! -//! - Multiple LLM Providers (OpenAI, Anthropic) -//! - Tower middleware with connection pooling and async/await -//! - Rate limiting, timeouts, and error handling -//! - Dual APIs: Simple string-based API and advanced API with full control -//! - Streaming support for token-by-token responses -//! - Test suite with examples -//! - Metrics, logging, and error handling -//! -//! ## Quick Start -//! -//! ### Simple API (Recommended) -//! -//! ```rust,no_run -//! use rullm_core::simple::{SimpleLlm, SimpleLlmClient}; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let client = SimpleLlmClient::openai("your-api-key")?; -//! let response = client.chat("What is the capital of France?").await?; -//! println!("Response: {}", response); -//! Ok(()) -//! } -//! ``` -//! -//! ### Advanced API (Full Control) -//! -//! ```rust,no_run -//! use rullm_core::{OpenAIConfig, OpenAIProvider, ChatCompletion, ChatRequestBuilder, ChatRole}; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let config = OpenAIConfig::new("your-api-key"); -//! let provider = OpenAIProvider::new(config)?; -//! -//! let request = ChatRequestBuilder::new() -//! .user("Hello, world!") -//! .temperature(0.7) -//! .max_tokens(100) -//! .build(); -//! -//! let response = provider.chat_completion(request, "gpt-3.5-turbo").await?; -//! println!("Response: {}", response.message.content); -//! Ok(()) -//! } -//! ``` -//! -//! ## Streaming API Overview -//! -//! The streaming API enables real-time token-by-token responses for interactive -//! chat applications and live user experiences. -//! -//! ### Core Streaming Types -//! -//! - [`ChatStreamEvent`] - Events emitted during streaming (Token, Done, Error) -//! - [`StreamResult`] - Type alias for `Pin>>>` -//! - [`ChatProvider::chat_completion_stream`] - Main streaming method for all providers -//! -//! ### Basic Streaming Usage -//! -//! ```rust,no_run -//! use rullm_core::{OpenAIProvider, OpenAIConfig, ChatCompletion, ChatRequestBuilder, ChatStreamEvent}; -//! use futures::StreamExt; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let config = OpenAIConfig::new("your-api-key"); -//! let provider = OpenAIProvider::new(config)?; -//! -//! let request = ChatRequestBuilder::new() -//! .user("Tell me a story") -//! .stream(true) // Enable streaming -//! .build(); -//! -//! let mut stream = provider -//! .chat_completion_stream(request, "gpt-3.5-turbo", None) -//! .await; -//! -//! while let Some(event) = stream.next().await { -//! match event? { -//! ChatStreamEvent::Token(token) => { -//! print!("{}", token); -//! std::io::Write::flush(&mut std::io::stdout())?; -//! } -//! ChatStreamEvent::Done => { -//! println!("\n✅ Stream completed"); -//! break; -//! } -//! ChatStreamEvent::Error(error) => { -//! println!("\n❌ Stream error: {}", error); -//! break; -//! } -//! } -//! } -//! Ok(()) -//! } -//! ``` -//! -//! ### Streaming Examples -//! -//! The library includes streaming examples for each provider: -//! -//! - `openai_stream.rs` - OpenAI GPT models streaming -//! - `anthropic_stream.rs` - Anthropic Claude models streaming -//! -//! Run examples with: -//! ```bash -//! cargo run --example openai_stream # Requires OPENAI_API_KEY -//! 
cargo run --example anthropic_stream # Requires ANTHROPIC_API_KEY -//! ``` -//! -//! ### Provider-Specific Streaming Features -//! -//! | Provider | Models | Key Features | -//! |----------|--------|--------------| -//! | OpenAI | GPT-3.5, GPT-4 | Token counting, creative writing | -//! | Anthropic | Claude 3 variants | Reasoning, code analysis | -//! -//! ## Error Handling -//! -//! All operations return [`Result`](LlmError) for error handling: -//! -//! ```rust,ignore -//! use rullm_core::{LlmError, OpenAIProvider, OpenAIConfig, ChatCompletion, ChatRequestBuilder}; -//! -//! # async fn example() -> Result<(), Box> { -//! # let config = OpenAIConfig::new("your-api-key"); -//! # let provider = OpenAIProvider::new(config)?; -//! # let request = ChatRequestBuilder::new().user("test").build(); -//! match provider.chat_completion(request, "gpt-4").await { -//! Ok(response) => println!("Success: {}", response.message.content), -//! Err(LlmError::Authentication { .. }) => println!("Invalid API key"), -//! Err(LlmError::RateLimit { retry_after, .. }) => { -//! println!("Rate limited, retry after: {:?}", retry_after); -//! } -//! Err(e) => println!("Other error: {}", e), -//! } -//! # Ok(()) -//! # } -//! ``` - -pub mod compat_types; -pub mod config; -pub mod error; -pub mod providers; -pub mod utils; - -// Concrete client exports -pub use providers::{AnthropicClient, OpenAIClient, OpenAICompatibleProvider}; - -pub use config::{ConfigBuilder, HttpProviderConfig, ProviderConfig}; -pub use error::LlmError; -pub use utils::sse::sse_lines; - -// Compatibility types for OpenAI-compatible providers -pub use compat_types::{ - ChatMessage, ChatRequest, ChatRequestBuilder, ChatResponse, ChatRole, ChatStreamEvent, - TokenUsage, -}; - -// Re-export test utilities for integration tests and examples -#[cfg(test)] -pub use utils::test_helpers; - -// Re-export commonly used types -pub use serde::{Deserialize, Serialize}; diff --git a/crates/rullm-core/src/providers/anthropic/client.rs b/crates/rullm-core/src/providers/anthropic/client.rs deleted file mode 100644 index 4bda4938..00000000 --- a/crates/rullm-core/src/providers/anthropic/client.rs +++ /dev/null @@ -1,216 +0,0 @@ -use super::config::AnthropicConfig; -use super::types::*; -use crate::config::ProviderConfig; -use crate::error::LlmError; -use crate::utils::sse::sse_lines; -use futures::Stream; -use futures::StreamExt; -use reqwest::Client; -use std::pin::Pin; - -/// Anthropic client with full Messages API support -#[derive(Clone)] -pub struct AnthropicClient { - config: AnthropicConfig, - client: Client, - base_url: String, -} - -impl AnthropicClient { - /// Create a new Anthropic client - pub fn new(config: AnthropicConfig) -> Result { - config.validate()?; - let base_url = config - .base_url - .clone() - .unwrap_or_else(|| "https://api.anthropic.com".to_string()); - - Ok(Self { - config, - client: Client::new(), - base_url, - }) - } - - /// Create client from environment variables - pub fn from_env() -> Result { - let config = crate::config::ConfigBuilder::anthropic_from_env()?; - Self::new(config) - } - - /// Send a messages request - pub async fn messages(&self, request: MessagesRequest) -> Result { - let url = format!("{}/v1/messages", self.base_url); - - let mut req = self.client.post(&url); - - // Add headers from config - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.json(&request).send().await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let 
error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "anthropic", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let response_data: MessagesResponse = response.json().await.map_err(|e| { - LlmError::serialization("Failed to parse MessagesResponse", Box::new(e)) - })?; - - Ok(response_data) - } - - /// Send a streaming messages request - pub async fn messages_stream( - &self, - mut request: MessagesRequest, - ) -> Result> + Send>>, LlmError> { - // Force streaming - request.stream = Some(true); - - let url = format!("{}/v1/messages", self.base_url); - - // Build headers - let mut header_map = reqwest::header::HeaderMap::new(); - for (key, value) in self.config.headers() { - if let (Ok(name), Ok(val)) = ( - reqwest::header::HeaderName::from_bytes(key.as_bytes()), - reqwest::header::HeaderValue::from_str(&value), - ) { - header_map.insert(name, val); - } - } - - let response = self - .client - .post(&url) - .headers(header_map) - .json(&request) - .send() - .await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "anthropic", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let byte_stream = response.bytes_stream(); - let sse_stream = sse_lines(byte_stream); - - Ok(Box::pin(sse_stream.map(|event_result| { - event_result.and_then(|data| { - serde_json::from_str::(&data).map_err(|e| { - LlmError::serialization( - format!("Failed to parse StreamEvent: {}", e), - Box::new(e), - ) - }) - }) - }))) - } - - /// Count tokens (requires a separate API call) - pub async fn count_tokens( - &self, - model: &str, - messages: Vec, - system: Option, - ) -> Result { - let url = format!("{}/v1/messages/count_tokens", self.base_url); - - let body = serde_json::json!({ - "model": model, - "messages": messages, - "system": system, - }); - - let mut req = self.client.post(&url); - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.json(&body).send().await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "anthropic", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let json: serde_json::Value = response.json().await.map_err(|e| { - LlmError::serialization("Failed to parse count_tokens response", Box::new(e)) - })?; - - let tokens = json["input_tokens"].as_u64().ok_or_else(|| { - LlmError::serialization( - "Missing input_tokens in response", - Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Invalid response format", - )), - ) - })? 
as u32; - - Ok(tokens) - } - - /// Health check - pub async fn health_check(&self) -> Result<(), LlmError> { - // Anthropic doesn't have a dedicated health endpoint - // We can do a minimal request to check connectivity - let url = format!("{}/v1/messages", self.base_url); - - let minimal_request = - MessagesRequest::new("claude-3-haiku-20240307", vec![Message::user("hi")], 1); - - let mut req = self.client.post(&url); - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.json(&minimal_request).send().await?; - - if response.status().is_success() { - Ok(()) - } else { - Err(LlmError::api( - "anthropic", - "Health check failed", - Some(response.status().to_string()), - None, - )) - } - } -} diff --git a/crates/rullm-core/src/providers/anthropic/config.rs b/crates/rullm-core/src/providers/anthropic/config.rs deleted file mode 100644 index baf54655..00000000 --- a/crates/rullm-core/src/providers/anthropic/config.rs +++ /dev/null @@ -1,85 +0,0 @@ -use crate::config::ProviderConfig; -use crate::error::LlmError; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::time::Duration; - -/// Anthropic-specific configuration -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AnthropicConfig { - pub api_key: String, - pub base_url: Option, - pub timeout_seconds: u64, - /// Whether to use OAuth authentication (Bearer token) instead of API key (x-api-key) - #[serde(default)] - pub use_oauth: bool, -} - -impl AnthropicConfig { - pub fn new(api_key: impl Into) -> Self { - Self { - api_key: api_key.into(), - base_url: None, - timeout_seconds: 30, - use_oauth: false, - } - } - - pub fn with_base_url(mut self, base_url: impl Into) -> Self { - self.base_url = Some(base_url.into()); - self - } - - pub fn with_oauth(mut self, use_oauth: bool) -> Self { - self.use_oauth = use_oauth; - self - } -} - -impl ProviderConfig for AnthropicConfig { - fn api_key(&self) -> &str { - &self.api_key - } - - fn base_url(&self) -> &str { - self.base_url - .as_deref() - .unwrap_or("https://api.anthropic.com") - } - - fn timeout(&self) -> Duration { - Duration::from_secs(self.timeout_seconds) - } - - fn headers(&self) -> HashMap { - let mut headers = HashMap::new(); - - if self.use_oauth { - // OAuth: use Bearer token + required beta headers - headers.insert( - "Authorization".to_string(), - format!("Bearer {}", self.api_key), - ); - headers.insert( - "anthropic-beta".to_string(), - "oauth-2025-04-20,claude-code-20250219,interleaved-thinking-2025-05-14,fine-grained-tool-streaming-2025-05-14".to_string(), - ); - headers.insert("anthropic-version".to_string(), "2023-06-01".to_string()); - } else { - // API key: use x-api-key header - headers.insert("x-api-key".to_string(), self.api_key.clone()); - headers.insert("anthropic-version".to_string(), "2023-06-01".to_string()); - } - - headers.insert("Content-Type".to_string(), "application/json".to_string()); - headers - } - - fn validate(&self) -> Result<(), LlmError> { - if self.api_key.is_empty() { - return Err(LlmError::configuration("Anthropic API key is required")); - } - - Ok(()) - } -} diff --git a/crates/rullm-core/src/providers/anthropic/mod.rs b/crates/rullm-core/src/providers/anthropic/mod.rs deleted file mode 100644 index 081ca01a..00000000 --- a/crates/rullm-core/src/providers/anthropic/mod.rs +++ /dev/null @@ -1,32 +0,0 @@ -//! Anthropic provider implementation with complete Messages API support -//! -//! This module provides a feature-complete Anthropic client that supports all -//! 
parameters and features available in the Anthropic Messages API. -//! -//! # Example -//! -//! ```no_run -//! use rullm_core::providers::anthropic::{AnthropicClient, MessagesRequest, Message}; -//! -//! # async fn example() -> Result<(), Box> { -//! let client = AnthropicClient::from_env()?; -//! -//! let request = MessagesRequest::new( -//! "claude-3-opus-20240229", -//! vec![Message::user("Hello!")], -//! 1024, -//! ); -//! -//! let response = client.messages(request).await?; -//! println!("{:?}", response.content); -//! # Ok(()) -//! # } -//! ``` - -pub mod client; -pub mod config; -pub mod types; - -pub use client::AnthropicClient; -pub use config::AnthropicConfig; -pub use types::*; diff --git a/crates/rullm-core/src/providers/anthropic/types.rs b/crates/rullm-core/src/providers/anthropic/types.rs deleted file mode 100644 index f0a9c54f..00000000 --- a/crates/rullm-core/src/providers/anthropic/types.rs +++ /dev/null @@ -1,436 +0,0 @@ -//! Complete Anthropic Messages API types -//! -//! This module contains comprehensive type definitions for the Anthropic Messages API, -//! including all parameters and features supported by Claude models. - -use serde::{Deserialize, Serialize}; - -/// Messages API request with all Anthropic parameters -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MessagesRequest { - /// The model to use (e.g., "claude-3-opus-20240229") - pub model: String, - - /// Input messages for the conversation - pub messages: Vec, - - /// The maximum number of tokens to generate - pub max_tokens: u32, - - /// System prompt(s) to guide the model's behavior - #[serde(skip_serializing_if = "Option::is_none")] - pub system: Option, - - /// Metadata about the request - #[serde(skip_serializing_if = "Option::is_none")] - pub metadata: Option, - - /// Custom sequences that will cause the model to stop generating - #[serde(skip_serializing_if = "Option::is_none")] - pub stop_sequences: Option>, - - /// Whether to incrementally stream the response - #[serde(skip_serializing_if = "Option::is_none")] - pub stream: Option, - - /// Amount of randomness injected into the response (0.0 to 1.0) - #[serde(skip_serializing_if = "Option::is_none")] - pub temperature: Option, - - /// Use nucleus sampling (0.0 to 1.0) - #[serde(skip_serializing_if = "Option::is_none")] - pub top_p: Option, - - /// Only sample from the top K options for each subsequent token - #[serde(skip_serializing_if = "Option::is_none")] - pub top_k: Option, - - /// Definitions of tools that the model may use - #[serde(skip_serializing_if = "Option::is_none")] - pub tools: Option>, - - /// How the model should use the provided tools - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_choice: Option, -} - -/// A message in the conversation -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Message { - /// The role of the message sender - pub role: Role, - - /// The content of the message - pub content: MessageContent, -} - -/// Role of the message sender -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase")] -pub enum Role { - /// User message - User, - /// Assistant message (model response) - Assistant, -} - -/// Message content can be text or array of content blocks -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum MessageContent { - /// Simple text content - Text(String), - /// Array of content blocks (for multimodal inputs, tool use, etc.) 
- Blocks(Vec), -} - -/// A block of content within a message -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ContentBlock { - /// Text content - Text { text: String }, - /// Image content - Image { source: ImageSource }, - /// Tool use (request to call a tool) - ToolUse { - id: String, - name: String, - input: serde_json::Value, - }, - /// Tool result (response from a tool) - ToolResult { - tool_use_id: String, - #[serde(skip_serializing_if = "Option::is_none")] - content: Option, - #[serde(skip_serializing_if = "Option::is_none")] - is_error: Option, - }, -} - -/// Image source (base64 or URL) -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ImageSource { - /// Base64-encoded image - Base64 { media_type: String, data: String }, -} - -/// System prompt can be a string or array of text blocks -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum SystemPrompt { - /// Simple text system prompt - Text(String), - /// Array of system text blocks - Blocks(Vec), -} - -/// A text block in the system prompt -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SystemBlock { - #[serde(rename = "type")] - pub block_type: String, // "text" - pub text: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub cache_control: Option, -} - -/// Cache control for prompt caching -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CacheControl { - #[serde(rename = "type")] - pub cache_type: String, // "ephemeral" -} - -impl CacheControl { - /// Create an ephemeral cache control - pub fn ephemeral() -> Self { - Self { - cache_type: "ephemeral".to_string(), - } - } -} - -impl SystemBlock { - /// Create a text system block - pub fn text(text: impl Into) -> Self { - Self { - block_type: "text".to_string(), - text: text.into(), - cache_control: None, - } - } - - /// Create a text system block with ephemeral cache control - pub fn text_with_cache(text: impl Into) -> Self { - Self { - block_type: "text".to_string(), - text: text.into(), - cache_control: Some(CacheControl::ephemeral()), - } - } -} - -/// Request metadata -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Metadata { - /// An external identifier for the user - #[serde(skip_serializing_if = "Option::is_none")] - pub user_id: Option, -} - -/// Tool definition -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Tool { - /// Name of the tool - pub name: String, - /// Description of what the tool does - pub description: String, - /// JSON schema for the tool's input - pub input_schema: serde_json::Value, -} - -/// Tool choice configuration -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum ToolChoice { - /// Let the model decide - Auto { - #[serde(rename = "type")] - choice_type: String, // "auto" - }, - /// Model must use a tool - Any { - #[serde(rename = "type")] - choice_type: String, // "any" - }, - /// Force a specific tool - Tool { - #[serde(rename = "type")] - choice_type: String, // "tool" - name: String, - }, -} - -/// Messages API response -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MessagesResponse { - /// Unique object identifier - pub id: String, - /// Object type (always "message") - #[serde(rename = "type")] - pub response_type: String, - /// Conversational role of the generated message - pub role: Role, - /// Content blocks in the response - pub content: Vec, - /// The model that handled the request - pub 
model: String, - /// The reason we stopped generating - pub stop_reason: Option, - /// Which custom stop sequence was generated (if any) - pub stop_sequence: Option, - /// Token usage information - pub usage: Usage, -} - -/// Reason for stopping generation -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum StopReason { - /// Natural end of message - EndTurn, - /// Hit a custom stop sequence - StopSequence, - /// Reached max_tokens - MaxTokens, - /// Model wants to use a tool - ToolUse, -} - -/// Token usage information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Usage { - /// Number of input tokens - pub input_tokens: u32, - /// Number of output tokens - pub output_tokens: u32, - /// Number of tokens read from cache - #[serde(skip_serializing_if = "Option::is_none")] - pub cache_creation_input_tokens: Option, - /// Number of tokens used to create cache - #[serde(skip_serializing_if = "Option::is_none")] - pub cache_read_input_tokens: Option, -} - -/// Streaming event types -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum StreamEvent { - /// Start of message - MessageStart { message: MessageStartData }, - /// Start of content block - ContentBlockStart { - index: u32, - content_block: ContentBlockStart, - }, - /// Incremental content - ContentBlockDelta { index: u32, delta: Delta }, - /// End of content block - ContentBlockStop { index: u32 }, - /// End of message - MessageDelta { - delta: MessageDeltaData, - usage: Usage, - }, - /// End of stream - MessageStop, - /// Ping event (keep-alive) - Ping, - /// Error event - Error { error: ErrorData }, -} - -/// Message start data -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MessageStartData { - pub id: String, - #[serde(rename = "type")] - pub message_type: String, - pub role: Role, - pub content: Vec, - pub model: String, - pub stop_reason: Option, - pub stop_sequence: Option, - pub usage: Usage, -} - -/// Content block start -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ContentBlockStart { - Text { text: String }, - ToolUse { id: String, name: String }, -} - -/// Delta (incremental change) -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum Delta { - /// Text delta - TextDelta { text: String }, - /// Tool input delta - InputJsonDelta { partial_json: String }, -} - -/// Message delta data -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MessageDeltaData { - pub stop_reason: Option, - pub stop_sequence: Option, -} - -/// Error data -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ErrorData { - #[serde(rename = "type")] - pub error_type: String, - pub message: String, -} - -// Builder for MessagesRequest -impl MessagesRequest { - pub fn new(model: impl Into, messages: Vec, max_tokens: u32) -> Self { - Self { - model: model.into(), - messages, - max_tokens, - system: None, - metadata: None, - stop_sequences: None, - stream: None, - temperature: None, - top_p: None, - top_k: None, - tools: None, - tool_choice: None, - } - } - - pub fn with_system(mut self, system: impl Into) -> Self { - self.system = Some(SystemPrompt::Text(system.into())); - self - } - - pub fn with_temperature(mut self, temperature: f32) -> Self { - self.temperature = Some(temperature); - self - } - - pub fn with_top_p(mut self, top_p: f32) -> Self { - self.top_p = Some(top_p); - self - 
} - - pub fn with_top_k(mut self, top_k: u32) -> Self { - self.top_k = Some(top_k); - self - } - - pub fn with_stop_sequences(mut self, stop_sequences: Vec) -> Self { - self.stop_sequences = Some(stop_sequences); - self - } - - pub fn with_tools(mut self, tools: Vec) -> Self { - self.tools = Some(tools); - self - } -} - -// Helper methods for creating messages -impl Message { - pub fn user(content: impl Into) -> Self { - Self { - role: Role::User, - content: MessageContent::Text(content.into()), - } - } - - pub fn assistant(content: impl Into) -> Self { - Self { - role: Role::Assistant, - content: MessageContent::Text(content.into()), - } - } - - pub fn user_with_blocks(blocks: Vec) -> Self { - Self { - role: Role::User, - content: MessageContent::Blocks(blocks), - } - } - - pub fn assistant_with_blocks(blocks: Vec) -> Self { - Self { - role: Role::Assistant, - content: MessageContent::Blocks(blocks), - } - } -} - -impl ContentBlock { - pub fn text(text: impl Into) -> Self { - Self::Text { text: text.into() } - } - - pub fn image_base64(media_type: impl Into, data: impl Into) -> Self { - Self::Image { - source: ImageSource::Base64 { - media_type: media_type.into(), - data: data.into(), - }, - } - } -} diff --git a/crates/rullm-core/src/providers/mod.rs b/crates/rullm-core/src/providers/mod.rs deleted file mode 100644 index 21defbae..00000000 --- a/crates/rullm-core/src/providers/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -// New feature-complete provider implementations -pub mod anthropic; -pub mod openai; -pub mod openai_compatible; // Used for Groq/OpenRouter - -// Export concrete clients -pub use anthropic::AnthropicClient; -pub use openai::OpenAIClient; -pub use openai_compatible::{OpenAICompatibleProvider, ProviderIdentity, identities}; - -// Export provider-specific configs -pub use anthropic::AnthropicConfig; -pub use openai_compatible::{OpenAICompatibleConfig, OpenAIConfig}; diff --git a/crates/rullm-core/src/providers/openai/client.rs b/crates/rullm-core/src/providers/openai/client.rs deleted file mode 100644 index 5e1c0d65..00000000 --- a/crates/rullm-core/src/providers/openai/client.rs +++ /dev/null @@ -1,172 +0,0 @@ -use super::types::*; -use crate::config::ProviderConfig; -use crate::error::LlmError; -use crate::providers::openai_compatible::OpenAIConfig; -use crate::utils::sse::sse_lines; -use futures::Stream; -use futures::StreamExt; -use reqwest::Client; -use std::pin::Pin; - -/// OpenAI client with full API support -#[derive(Clone)] -pub struct OpenAIClient { - config: OpenAIConfig, - client: Client, - base_url: String, -} - -impl OpenAIClient { - /// Create a new OpenAI client - pub fn new(config: OpenAIConfig) -> Result { - config.validate()?; - let base_url = config - .base_url - .clone() - .unwrap_or_else(|| "https://api.openai.com/v1".to_string()); - - Ok(Self { - config, - client: Client::new(), - base_url, - }) - } - - /// Create client from environment variables - pub fn from_env() -> Result { - let config = crate::config::ConfigBuilder::openai_from_env()?; - Self::new(config) - } - - /// Send a chat completion request - pub async fn chat_completion( - &self, - request: ChatCompletionRequest, - ) -> Result { - let url = format!("{}/chat/completions", self.base_url); - - let mut req = self.client.post(&url); - - // Add headers from config - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.json(&request).send().await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let 
error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "openai", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let response_data: ChatCompletionResponse = response.json().await.map_err(|e| { - LlmError::serialization("Failed to parse ChatCompletionResponse", Box::new(e)) - })?; - - Ok(response_data) - } - - /// Send a streaming chat completion request - pub async fn chat_completion_stream( - &self, - mut request: ChatCompletionRequest, - ) -> Result> + Send>>, LlmError> - { - // Force streaming - request.stream = Some(true); - - let url = format!("{}/chat/completions", self.base_url); - - // Build headers - let mut header_map = reqwest::header::HeaderMap::new(); - for (key, value) in self.config.headers() { - if let (Ok(name), Ok(val)) = ( - reqwest::header::HeaderName::from_bytes(key.as_bytes()), - reqwest::header::HeaderValue::from_str(&value), - ) { - header_map.insert(name, val); - } - } - header_map.insert( - reqwest::header::ACCEPT, - reqwest::header::HeaderValue::from_static("text/event-stream"), - ); - - let response = self - .client - .post(&url) - .headers(header_map) - .json(&request) - .send() - .await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let error_text = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - "openai", - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let byte_stream = response.bytes_stream(); - let sse_stream = sse_lines(byte_stream); - - Ok(Box::pin(sse_stream.map(|event_result| { - event_result.and_then(|data| { - // OpenAI sends "[DONE]" to signal end of stream - if data.trim() == "[DONE]" { - // We could return a special marker, but for now just skip it - // The stream will end naturally - return Err(LlmError::model("Stream complete")); - } - - serde_json::from_str::(&data).map_err(|e| { - LlmError::serialization( - format!("Failed to parse ChatCompletionChunk: {}", e), - Box::new(e), - ) - }) - }) - }))) - } - - /// Health check - pub async fn health_check(&self) -> Result<(), LlmError> { - let url = format!("{}/models", self.base_url); - - let mut req = self.client.get(&url); - for (key, value) in self.config.headers() { - req = req.header(key, value); - } - - let response = req.send().await?; - - if response.status().is_success() { - Ok(()) - } else { - Err(LlmError::api( - "openai", - "Health check failed", - Some(response.status().to_string()), - None, - )) - } - } -} diff --git a/crates/rullm-core/src/providers/openai/mod.rs b/crates/rullm-core/src/providers/openai/mod.rs deleted file mode 100644 index b53eca39..00000000 --- a/crates/rullm-core/src/providers/openai/mod.rs +++ /dev/null @@ -1,32 +0,0 @@ -//! OpenAI provider implementation with complete API support -//! -//! This module provides a feature-complete OpenAI client that supports all -//! parameters and features available in the OpenAI Chat Completions API. -//! -//! # Example -//! -//! ```no_run -//! use rullm_core::providers::openai::{OpenAIClient, ChatCompletionRequest, ChatMessage}; -//! -//! # async fn example() -> Result<(), Box> { -//! let client = OpenAIClient::from_env()?; -//! -//! let request = ChatCompletionRequest::new( -//! "gpt-4", -//! vec![ -//! ChatMessage::system("You are a helpful assistant"), -//! ChatMessage::user("Hello!"), -//! ], -//! ); -//! -//! let response = client.chat_completion(request).await?; -//! 
println!("{}", response.choices[0].message.content.as_ref().unwrap()); -//! # Ok(()) -//! # } -//! ``` - -pub mod client; -pub mod types; - -pub use client::OpenAIClient; -pub use types::*; diff --git a/crates/rullm-core/src/providers/openai/types.rs b/crates/rullm-core/src/providers/openai/types.rs deleted file mode 100644 index 7408a718..00000000 --- a/crates/rullm-core/src/providers/openai/types.rs +++ /dev/null @@ -1,502 +0,0 @@ -//! Complete OpenAI API types -//! -//! This module contains comprehensive type definitions for the OpenAI API, -//! including all parameters and features supported by OpenAI's chat completions endpoint. - -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; - -/// Chat completion request with all OpenAI parameters -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatCompletionRequest { - /// ID of the model to use - pub model: String, - - /// Messages comprising the conversation so far - pub messages: Vec, - - /// What sampling temperature to use, between 0 and 2 - #[serde(skip_serializing_if = "Option::is_none")] - pub temperature: Option, - - /// The maximum number of tokens to generate - #[serde(skip_serializing_if = "Option::is_none")] - pub max_tokens: Option, - - /// Nucleus sampling parameter - #[serde(skip_serializing_if = "Option::is_none")] - pub top_p: Option, - - /// How many chat completion choices to generate - #[serde(skip_serializing_if = "Option::is_none")] - pub n: Option, - - /// Whether to stream back partial progress - #[serde(skip_serializing_if = "Option::is_none")] - pub stream: Option, - - /// Up to 4 sequences where the API will stop generating - #[serde(skip_serializing_if = "Option::is_none")] - pub stop: Option>, - - /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency - #[serde(skip_serializing_if = "Option::is_none")] - pub frequency_penalty: Option, - - /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far - #[serde(skip_serializing_if = "Option::is_none")] - pub presence_penalty: Option, - - /// Modify the likelihood of specified tokens appearing in the completion - #[serde(skip_serializing_if = "Option::is_none")] - pub logit_bias: Option>, - - /// A unique identifier representing your end-user - #[serde(skip_serializing_if = "Option::is_none")] - pub user: Option, - - /// Whether to return log probabilities of the output tokens - #[serde(skip_serializing_if = "Option::is_none")] - pub logprobs: Option, - - /// An integer between 0 and 20 specifying the number of most likely tokens to return at each token position - #[serde(skip_serializing_if = "Option::is_none")] - pub top_logprobs: Option, - - /// This feature is in Beta. 
Seed for deterministic sampling - #[serde(skip_serializing_if = "Option::is_none")] - pub seed: Option, - - /// An object specifying the format that the model must output - #[serde(skip_serializing_if = "Option::is_none")] - pub response_format: Option, - - /// A list of tools the model may call - #[serde(skip_serializing_if = "Option::is_none")] - pub tools: Option>, - - /// Controls which (if any) function is called by the model - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_choice: Option, - - /// Whether to enable parallel function calling - #[serde(skip_serializing_if = "Option::is_none")] - pub parallel_tool_calls: Option, -} - -/// A message in the conversation -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatMessage { - /// The role of the message author - pub role: Role, - - /// The contents of the message - #[serde(skip_serializing_if = "Option::is_none")] - pub content: Option, - - /// The name of the author (optional) - #[serde(skip_serializing_if = "Option::is_none")] - pub name: Option, - - /// Tool calls generated by the model - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_calls: Option>, - - /// Tool call ID (for tool role messages) - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_call_id: Option, -} - -/// Message content can be text or array of content parts -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum MessageContent { - /// Text content - Text(String), - /// Array of content parts (for multimodal inputs) - Parts(Vec), -} - -/// A part of the message content (text or image) -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ContentPart { - /// Text content part - Text { text: String }, - /// Image URL content part - ImageUrl { image_url: ImageUrl }, -} - -/// Image URL with optional detail level -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ImageUrl { - /// The URL of the image or base64 encoded image - pub url: String, - /// The detail level of the image - #[serde(skip_serializing_if = "Option::is_none")] - pub detail: Option, -} - -/// Image detail level -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum ImageDetail { - /// Low detail (faster, cheaper) - Low, - /// High detail (slower, more expensive) - High, - /// Automatic detail selection - Auto, -} - -/// Role of the message author -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase")] -pub enum Role { - /// System message (instructions) - System, - /// User message - User, - /// Assistant message (model response) - Assistant, - /// Tool/function response - Tool, -} - -/// Response format specification -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ResponseFormat { - /// Return text (default) - Text, - /// Return valid JSON - JsonObject, - /// JSON schema (for structured outputs) - JsonSchema { json_schema: JsonSchema }, -} - -/// JSON schema for structured outputs -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct JsonSchema { - /// Name of the schema - pub name: String, - /// Optional description - #[serde(skip_serializing_if = "Option::is_none")] - pub description: Option, - /// JSON schema definition - pub schema: serde_json::Value, - /// Whether to enforce strict schema adherence - #[serde(skip_serializing_if = "Option::is_none")] - pub strict: Option, -} - -/// A tool (function) that 
the model can call -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Tool { - /// Type of tool (currently only "function") - #[serde(rename = "type")] - pub tool_type: String, - /// Function definition - pub function: FunctionDefinition, -} - -/// Function definition -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionDefinition { - /// Name of the function - pub name: String, - /// Description of what the function does - #[serde(skip_serializing_if = "Option::is_none")] - pub description: Option, - /// Parameters as JSON schema - pub parameters: serde_json::Value, -} - -/// Controls which function the model calls -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(untagged)] -pub enum ToolChoice { - /// Let the model decide - Auto, - /// Model must call a function - Required, - /// Model must not call any function - None, - /// Force a specific function - Specific { - #[serde(rename = "type")] - tool_type: String, - function: FunctionChoice, - }, -} - -/// Specific function to call -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionChoice { - /// Name of the function to call - pub name: String, -} - -/// Tool call made by the model -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ToolCall { - /// ID of the tool call - pub id: String, - /// Type of tool (currently only "function") - #[serde(rename = "type")] - pub tool_type: String, - /// Function being called - pub function: FunctionCall, -} - -/// Function call details -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionCall { - /// Name of the function - pub name: String, - /// Arguments as JSON string - pub arguments: String, -} - -/// Chat completion response -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatCompletionResponse { - /// Unique identifier for the completion - pub id: String, - /// Object type (always "chat.completion") - pub object: String, - /// Unix timestamp of when the completion was created - pub created: u64, - /// Model used for completion - pub model: String, - /// List of completion choices - pub choices: Vec, - /// Token usage information - pub usage: Usage, - /// System fingerprint (for reproducibility) - #[serde(skip_serializing_if = "Option::is_none")] - pub system_fingerprint: Option, -} - -/// A completion choice -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Choice { - /// Index of the choice - pub index: u32, - /// The generated message - pub message: ChatMessage, - /// Log probabilities (if requested) - #[serde(skip_serializing_if = "Option::is_none")] - pub logprobs: Option, - /// Reason why the model stopped generating - pub finish_reason: String, -} - -/// Log probability information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LogProbs { - /// Log probabilities for each token - pub content: Vec, -} - -/// Log probability for a single token -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TokenLogProb { - /// The token - pub token: String, - /// Log probability of the token - pub logprob: f64, - /// Bytes representation of the token - pub bytes: Option>, - /// Top alternative tokens and their log probabilities - pub top_logprobs: Vec, -} - -/// Top alternative token with its log probability -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TopLogProb { - /// The token - pub token: String, - /// Log probability of the token - pub logprob: f64, - /// Bytes representation of the token - pub bytes: Option>, -} - -/// Token usage statistics -#[derive(Debug, 
Clone, Serialize, Deserialize)] -pub struct Usage { - /// Number of tokens in the prompt - pub prompt_tokens: u32, - /// Number of tokens in the completion - pub completion_tokens: u32, - /// Total tokens used - pub total_tokens: u32, - /// Breakdown of prompt tokens (if available) - #[serde(skip_serializing_if = "Option::is_none")] - pub prompt_tokens_details: Option, - /// Breakdown of completion tokens (if available) - #[serde(skip_serializing_if = "Option::is_none")] - pub completion_tokens_details: Option, -} - -/// Detailed prompt token breakdown -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PromptTokensDetails { - /// Tokens from cached content - #[serde(skip_serializing_if = "Option::is_none")] - pub cached_tokens: Option, -} - -/// Detailed completion token breakdown -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CompletionTokensDetails { - /// Tokens generated for reasoning - #[serde(skip_serializing_if = "Option::is_none")] - pub reasoning_tokens: Option, -} - -/// Streaming chunk in chat completions -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChatCompletionChunk { - /// Unique identifier for the chunk - pub id: String, - /// Object type (always "chat.completion.chunk") - pub object: String, - /// Unix timestamp - pub created: u64, - /// Model used - pub model: String, - /// System fingerprint - #[serde(skip_serializing_if = "Option::is_none")] - pub system_fingerprint: Option, - /// List of delta choices - pub choices: Vec, -} - -/// A choice in a streaming chunk -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ChunkChoice { - /// Index of the choice - pub index: u32, - /// Delta content (incremental changes) - pub delta: Delta, - /// Log probabilities (if requested) - #[serde(skip_serializing_if = "Option::is_none")] - pub logprobs: Option, - /// Reason for stopping (present in final chunk) - #[serde(skip_serializing_if = "Option::is_none")] - pub finish_reason: Option, -} - -/// Delta content in streaming response -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Delta { - /// Role (only in first chunk) - #[serde(skip_serializing_if = "Option::is_none")] - pub role: Option, - /// Content delta - #[serde(skip_serializing_if = "Option::is_none")] - pub content: Option, - /// Tool calls delta - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_calls: Option>, -} - -/// Tool call delta in streaming response -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ToolCallDelta { - /// Index of the tool call - pub index: u32, - /// ID (only in first chunk for this tool call) - #[serde(skip_serializing_if = "Option::is_none")] - pub id: Option, - /// Type (only in first chunk) - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "type")] - pub tool_type: Option, - /// Function delta - #[serde(skip_serializing_if = "Option::is_none")] - pub function: Option, -} - -/// Function call delta -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FunctionCallDelta { - /// Name (only in first chunk) - #[serde(skip_serializing_if = "Option::is_none")] - pub name: Option, - /// Arguments delta (incremental JSON string) - #[serde(skip_serializing_if = "Option::is_none")] - pub arguments: Option, -} - -// Builder for ChatCompletionRequest -impl ChatCompletionRequest { - pub fn new(model: impl Into, messages: Vec) -> Self { - Self { - model: model.into(), - messages, - temperature: None, - max_tokens: None, - top_p: None, - n: None, - stream: None, - stop: None, - frequency_penalty: 
None, - presence_penalty: None, - logit_bias: None, - user: None, - logprobs: None, - top_logprobs: None, - seed: None, - response_format: None, - tools: None, - tool_choice: None, - parallel_tool_calls: None, - } - } -} - -// Helper methods for creating messages -impl ChatMessage { - pub fn system(content: impl Into) -> Self { - Self { - role: Role::System, - content: Some(MessageContent::Text(content.into())), - name: None, - tool_calls: None, - tool_call_id: None, - } - } - - pub fn user(content: impl Into) -> Self { - Self { - role: Role::User, - content: Some(MessageContent::Text(content.into())), - name: None, - tool_calls: None, - tool_call_id: None, - } - } - - pub fn assistant(content: impl Into) -> Self { - Self { - role: Role::Assistant, - content: Some(MessageContent::Text(content.into())), - name: None, - tool_calls: None, - tool_call_id: None, - } - } - - pub fn tool(tool_call_id: impl Into, content: impl Into) -> Self { - Self { - role: Role::Tool, - content: Some(MessageContent::Text(content.into())), - name: None, - tool_calls: None, - tool_call_id: Some(tool_call_id.into()), - } - } -} diff --git a/crates/rullm-core/src/providers/openai_compatible/config.rs b/crates/rullm-core/src/providers/openai_compatible/config.rs deleted file mode 100644 index 9eeac7cb..00000000 --- a/crates/rullm-core/src/providers/openai_compatible/config.rs +++ /dev/null @@ -1,111 +0,0 @@ -use crate::config::ProviderConfig; -use crate::error::LlmError; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::time::Duration; - -/// OpenAI-compatible configuration (supports OpenAI, Groq, OpenRouter, etc.) -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct OpenAICompatibleConfig { - pub api_key: String, - pub organization: Option, - pub project: Option, - pub base_url: Option, - pub timeout_seconds: u64, -} - -/// Type alias for backwards compatibility -pub type OpenAIConfig = OpenAICompatibleConfig; - -impl OpenAICompatibleConfig { - pub fn new(api_key: impl Into) -> Self { - Self { - api_key: api_key.into(), - organization: None, - project: None, - base_url: None, - timeout_seconds: 30, - } - } - - pub fn groq(api_key: impl Into) -> Self { - Self { - api_key: api_key.into(), - organization: None, - project: None, - base_url: Some("https://api.groq.com/openai/v1".to_string()), - timeout_seconds: 30, - } - } - - pub fn openrouter(api_key: impl Into) -> Self { - Self { - api_key: api_key.into(), - organization: None, - project: None, - base_url: Some("https://openrouter.ai/api/v1".to_string()), - timeout_seconds: 30, - } - } - - pub fn with_organization(mut self, org: impl Into) -> Self { - self.organization = Some(org.into()); - self - } - - pub fn with_project(mut self, project: impl Into) -> Self { - self.project = Some(project.into()); - self - } - - pub fn with_base_url(mut self, base_url: impl Into) -> Self { - self.base_url = Some(base_url.into()); - self - } -} - -impl ProviderConfig for OpenAICompatibleConfig { - fn api_key(&self) -> &str { - &self.api_key - } - - fn base_url(&self) -> &str { - self.base_url - .as_deref() - .unwrap_or("https://api.openai.com/v1") - } - - fn timeout(&self) -> Duration { - Duration::from_secs(self.timeout_seconds) - } - - fn headers(&self) -> HashMap { - let mut headers = HashMap::new(); - headers.insert( - "Authorization".to_string(), - format!("Bearer {}", self.api_key), - ); - headers.insert("Content-Type".to_string(), "application/json".to_string()); - - if let Some(org) = &self.organization { - 
headers.insert("OpenAI-Organization".to_string(), org.clone()); - } - - if let Some(project) = &self.project { - headers.insert("OpenAI-Project".to_string(), project.clone()); - } - - headers - } - - fn validate(&self) -> Result<(), LlmError> { - if self.api_key.is_empty() { - return Err(LlmError::configuration("API key is required")); - } - - // Relaxed validation: don't require 'sk-' prefix since Groq and OpenRouter use different formats - // OpenAI keys start with 'sk-', Groq uses 'gsk_', OpenRouter uses different format - - Ok(()) - } -} diff --git a/crates/rullm-core/src/providers/openai_compatible/mod.rs b/crates/rullm-core/src/providers/openai_compatible/mod.rs deleted file mode 100644 index c21dcae9..00000000 --- a/crates/rullm-core/src/providers/openai_compatible/mod.rs +++ /dev/null @@ -1,423 +0,0 @@ -pub mod config; - -pub use config::{OpenAICompatibleConfig, OpenAIConfig}; - -use crate::compat_types::{ - ChatMessage, ChatRequest, ChatResponse, ChatRole, ChatStreamEvent, TokenUsage, -}; -use crate::config::ProviderConfig; -use crate::error::LlmError; -use crate::utils::sse::sse_lines; -use futures::StreamExt; -use reqwest::Client; -use std::pin::Pin; - -/// Provider identity metadata -#[derive(Debug, Clone)] -pub struct ProviderIdentity { - pub name: &'static str, - pub aliases: &'static [&'static str], - pub env_key: &'static str, - pub default_base_url: &'static str, -} - -/// Predefined provider identities for OpenAI-compatible APIs -pub mod identities { - use super::ProviderIdentity; - - pub const OPENAI: ProviderIdentity = ProviderIdentity { - name: "openai", - aliases: &["openai", "gpt"], - env_key: "OPENAI_API_KEY", - default_base_url: "https://api.openai.com/v1", - }; - - pub const GROQ: ProviderIdentity = ProviderIdentity { - name: "groq", - aliases: &["groq"], - env_key: "GROQ_API_KEY", - default_base_url: "https://api.groq.com/openai/v1", - }; - - pub const OPENROUTER: ProviderIdentity = ProviderIdentity { - name: "openrouter", - aliases: &["openrouter"], - env_key: "OPENROUTER_API_KEY", - default_base_url: "https://openrouter.ai/api/v1", - }; -} - -/// Generic OpenAI-compatible provider implementation -#[derive(Clone)] -pub struct OpenAICompatibleProvider { - config: OpenAICompatibleConfig, - client: Client, - identity: ProviderIdentity, -} - -impl OpenAICompatibleProvider { - /// Create a new OpenAI-compatible provider with custom identity - pub fn new( - config: OpenAICompatibleConfig, - identity: ProviderIdentity, - ) -> Result { - config.validate()?; - let client = Client::new(); - Ok(Self { - config, - client, - identity, - }) - } - - /// Create an OpenAI provider - pub fn openai(config: OpenAICompatibleConfig) -> Result { - Self::new(config, identities::OPENAI) - } - - /// Create a Groq provider - pub fn groq(config: OpenAICompatibleConfig) -> Result { - Self::new(config, identities::GROQ) - } - - /// Create an OpenRouter provider - pub fn openrouter(config: OpenAICompatibleConfig) -> Result { - Self::new(config, identities::OPENROUTER) - } - - /// Convert our ChatRequest to OpenAI's API format - fn to_openai_request(&self, request: &ChatRequest, model: &str) -> serde_json::Value { - let mut openai_request = serde_json::json!({ - "model": model, - "messages": request.messages.iter().map(|msg| serde_json::json!({ - "role": msg.role, - "content": msg.content - })).collect::>() - }); - - if let Some(temp) = request.temperature { - openai_request["temperature"] = - serde_json::Value::Number(serde_json::Number::from_f64(temp as f64).unwrap()); - } - - if let 
Some(max_tokens) = request.max_tokens { - openai_request["max_tokens"] = - serde_json::Value::Number(serde_json::Number::from(max_tokens)); - } - - if let Some(top_p) = request.top_p { - openai_request["top_p"] = - serde_json::Value::Number(serde_json::Number::from_f64(top_p as f64).unwrap()); - } - - if let Some(stream) = request.stream { - openai_request["stream"] = serde_json::Value::Bool(stream); - } - - openai_request - } - - /// Parse OpenAI's response format into our ChatResponse - fn parse_openai_response(&self, response: serde_json::Value) -> Result { - let choices = response["choices"].as_array().ok_or_else(|| { - LlmError::serialization( - "Missing 'choices' in OpenAI response", - Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Invalid response format", - )), - ) - })?; - - let first_choice = choices.first().ok_or_else(|| { - LlmError::serialization( - "No choices in OpenAI response", - Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Empty choices array", - )), - ) - })?; - - let message = &first_choice["message"]; - let content = message["content"].as_str().ok_or_else(|| { - LlmError::serialization( - "Missing content in OpenAI response", - Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Missing content field", - )), - ) - })?; - - let role = message["role"].as_str().ok_or_else(|| { - LlmError::serialization( - "Missing role in OpenAI response", - Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Missing role field", - )), - ) - })?; - - let parsed_role = match role { - "assistant" => ChatRole::Assistant, - "user" => ChatRole::User, - "system" => ChatRole::System, - "tool" => ChatRole::Tool, - _ => { - return Err(LlmError::serialization( - format!("Unknown role: {role}"), - Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Invalid role", - )), - )); - } - }; - - let usage = &response["usage"]; - let token_usage = TokenUsage { - prompt_tokens: usage["prompt_tokens"].as_u64().unwrap_or(0) as u32, - completion_tokens: usage["completion_tokens"].as_u64().unwrap_or(0) as u32, - total_tokens: usage["total_tokens"].as_u64().unwrap_or(0) as u32, - }; - - let model = response["model"].as_str().unwrap_or("unknown").to_string(); - - let finish_reason = first_choice["finish_reason"] - .as_str() - .map(|s| s.to_string()); - - Ok(ChatResponse { - message: ChatMessage { - role: parsed_role, - content: content.to_string(), - }, - model, - usage: token_usage, - finish_reason, - }) - } - - /// Health check - pub async fn health_check(&self) -> Result<(), LlmError> { - let url = format!("{}/models", self.config.base_url()); - - let mut req = self.client.get(&url); - for (key, value) in self.config.headers() { - req = req.header(&key, &value); - } - let response = req.send().await?; - - if response.status().is_success() { - Ok(()) - } else { - Err(LlmError::api( - self.identity.name, - "Health check failed", - Some(response.status().to_string()), - None, - )) - } - } - - /// Chat completion - pub async fn chat_completion( - &self, - request: ChatRequest, - model: &str, - ) -> Result { - let url = format!("{}/chat/completions", self.config.base_url()); - let body = self.to_openai_request(&request, model); - - let mut req = self.client.post(&url); - for (key, value) in self.config.headers() { - req = req.header(&key, &value); - } - let response = req.json(&body).send().await?; - - if !response.status().is_success() { - let status = response.status().to_string(); - let error_text = response - .text() - 
.await - .unwrap_or_else(|_| "Unknown error".to_string()); - - return Err(LlmError::api( - self.identity.name, - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - } - - let response_json: serde_json::Value = response - .json() - .await - .map_err(|e| LlmError::serialization("Failed to parse JSON response", Box::new(e)))?; - - self.parse_openai_response(response_json) - } - - /// Chat completion stream - pub async fn chat_completion_stream( - &self, - request: ChatRequest, - model: &str, - _buffer_size: Option, - ) -> Pin> + Send>> { - let url = format!("{}/chat/completions", self.config.base_url()); - - // Create streaming request with stream: true - let mut streaming_request = request.clone(); - streaming_request.stream = Some(true); - let body = self.to_openai_request(&streaming_request, model); - - // Make the streaming HTTP request using reqwest Client directly - let client = reqwest::Client::new(); - let headers = self.config.headers(); - - // Convert HashMap to HeaderMap - let mut header_map = reqwest::header::HeaderMap::new(); - for (key, value) in headers { - if let (Ok(name), Ok(val)) = ( - reqwest::header::HeaderName::from_bytes(key.as_bytes()), - reqwest::header::HeaderValue::from_str(&value), - ) { - header_map.insert(name, val); - } - } - header_map.insert( - reqwest::header::ACCEPT, - reqwest::header::HeaderValue::from_static("text/event-stream"), - ); - - let response_future = client.post(&url).headers(header_map).json(&body).send(); - let provider_name = self.identity.name; - - Box::pin(async_stream::stream! { - // Handle the initial request - let response = match response_future.await { - Ok(resp) => { - if !resp.status().is_success() { - let status = resp.status().to_string(); - let error_text = resp - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - yield Err(LlmError::api( - provider_name, - format!("API Error: {status} - {error_text}"), - Some(status), - None, - )); - return; - } - resp - } - Err(e) => { - yield Err(LlmError::network(format!("Request failed: {e}"))); - return; - } - }; - - // Get the byte stream and parse SSE events - let byte_stream = response.bytes_stream(); - let mut sse_stream = sse_lines(byte_stream); - - while let Some(event_result) = sse_stream.next().await { - match event_result { - Ok(data) => { - // Parse the JSON chunk - match serde_json::from_str::(&data) { - Ok(chunk) => { - // Extract content from choices[0].delta.content - if let Some(choices) = chunk["choices"].as_array() { - if let Some(first_choice) = choices.first() { - if let Some(delta) = first_choice.get("delta") { - if let Some(content) = delta["content"].as_str() { - yield Ok(ChatStreamEvent::Token(content.to_string())); - } - } - } - } - } - Err(e) => { - yield Err(LlmError::serialization( - format!("Failed to parse chunk JSON: {e}"), - Box::new(e), - )); - return; - } - } - } - Err(e) => { - yield Err(e); - return; - } - } - } - - // Emit Done event when streaming completes - yield Ok(ChatStreamEvent::Done); - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::utils::test_helpers::fake_sse_response; - use futures::StreamExt; - - #[tokio::test] - async fn test_openai_compatible_stream_parsing() { - // Create fake OpenAI-style SSE events - let events = vec![ - r#"{"choices":[{"delta":{"content":"Hello"}}]}"#, - r#"{"choices":[{"delta":{"content":" "}}]}"#, - r#"{"choices":[{"delta":{"content":"world"}}]}"#, - r#"{"choices":[{"delta":{"content":"!"}}]}"#, - ]; - - // Create fake SSE stream - let fake_stream = 
fake_sse_response(&events, None); - - // Parse using our sse_lines function - let mut sse_stream = sse_lines(fake_stream); - let mut tokens = Vec::new(); - - // Process all events like the real implementation does - while let Some(event_result) = sse_stream.next().await { - match event_result { - Ok(data) => { - // Parse the JSON chunk - match serde_json::from_str::(&data) { - Ok(chunk) => { - // Extract content from choices[0].delta.content - if let Some(choices) = chunk["choices"].as_array() { - if let Some(first_choice) = choices.first() { - if let Some(delta) = first_choice.get("delta") { - if let Some(content) = delta["content"].as_str() { - tokens.push(content.to_string()); - } - } - } - } - } - Err(e) => panic!("Failed to parse chunk JSON: {e}"), - } - } - Err(e) => panic!("SSE parsing error: {e}"), - } - } - - // Verify we got the expected tokens - assert_eq!(tokens, vec!["Hello", " ", "world", "!"]); - - // Verify concatenated content - let full_content: String = tokens.join(""); - assert_eq!(full_content, "Hello world!"); - } -} diff --git a/crates/rullm-core/src/utils/mod.rs b/crates/rullm-core/src/utils/mod.rs deleted file mode 100644 index 245b7206..00000000 --- a/crates/rullm-core/src/utils/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub mod sse; - -#[cfg(test)] -pub mod test_helpers; diff --git a/crates/rullm-core/src/utils/sse.rs b/crates/rullm-core/src/utils/sse.rs deleted file mode 100644 index 61847ffb..00000000 --- a/crates/rullm-core/src/utils/sse.rs +++ /dev/null @@ -1,259 +0,0 @@ -use crate::error::LlmError; -use futures::Stream; -use std::pin::Pin; -use std::task::{Context, Poll}; - -/// Parses Server-Sent Events (SSE) from a byte stream, extracting data payloads -/// and filtering out [DONE] messages. -pub fn sse_lines(stream: S) -> impl Stream> -where - S: Stream> + Unpin, -{ - SseParser::new(stream) -} - -struct SseParser { - stream: S, - buffer: String, - event_queue: Vec, -} - -impl SseParser -where - S: Stream> + Unpin, -{ - fn new(stream: S) -> Self { - Self { - stream, - buffer: String::new(), - event_queue: Vec::new(), - } - } - - fn parse_events(&mut self) { - // Split by SSE event delimiter "\n\n" - while let Some(double_newline_pos) = self.buffer.find("\n\n") { - let event_block = self.buffer[..double_newline_pos].to_string(); - self.buffer.drain(..double_newline_pos + 2); - - // Process the event block - for line in event_block.lines() { - if let Some(data) = line.strip_prefix("data: ") { - // Skip [DONE] messages - if data.trim() != "[DONE]" { - self.event_queue.push(data.to_string()); - } - } - // Ignore lines without "data: " prefix - } - } - } -} - -impl Stream for SseParser -where - S: Stream> + Unpin, -{ - type Item = Result; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - loop { - // First, check if we have events in the queue - if !self.event_queue.is_empty() { - return Poll::Ready(Some(Ok(self.event_queue.remove(0)))); - } - - // Parse any complete events from the buffer - self.parse_events(); - if !self.event_queue.is_empty() { - return Poll::Ready(Some(Ok(self.event_queue.remove(0)))); - } - - // No complete events in buffer, try to get more data - match Pin::new(&mut self.stream).poll_next(cx) { - Poll::Ready(Some(Ok(bytes))) => { - // Add new bytes to buffer - match std::str::from_utf8(&bytes) { - Ok(text) => { - // Normalize CRLF to LF to handle Windows-style/HTTP CRLF delimiters - let normalized = text.replace("\r\n", "\n"); - self.buffer.push_str(&normalized); - // Continue loop to try parsing again - } - Err(e) 
=> { - return Poll::Ready(Some(Err(LlmError::serialization( - "Invalid UTF-8 in SSE stream", - Box::new(e), - )))); - } - } - } - Poll::Ready(Some(Err(e))) => { - return Poll::Ready(Some(Err(LlmError::network(format!("Stream error: {e}"))))); - } - Poll::Ready(None) => { - // Stream ended, parse any remaining events - self.parse_events(); - if !self.event_queue.is_empty() { - return Poll::Ready(Some(Ok(self.event_queue.remove(0)))); - } - return Poll::Ready(None); - } - Poll::Pending => return Poll::Pending, - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::utils::test_helpers::{ - fake_sse_response, fake_sse_response_chunked, fake_sse_response_with_done, - }; - use futures::{StreamExt, stream}; - - fn bytes_from_str(s: &str) -> bytes::Bytes { - bytes::Bytes::from(s.to_string()) - } - - #[tokio::test] - async fn test_single_event() { - let data = vec![Ok(bytes_from_str("data: hello\n\n"))]; - let stream = stream::iter(data); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let events: Vec = results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(events, vec!["hello"]); - } - - #[tokio::test] - async fn test_multi_event() { - let data = vec![Ok(bytes_from_str("data: foo\n\ndata: bar\n\n"))]; - let stream = stream::iter(data); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let events: Vec = results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(events, vec!["foo", "bar"]); - } - - #[tokio::test] - async fn test_done_filter() { - let data = vec![Ok(bytes_from_str( - "data: baz\n\ndata: [DONE]\n\ndata: qux\n\n", - ))]; - let stream = stream::iter(data); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let events: Vec = results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(events, vec!["baz", "qux"]); - } - - #[tokio::test] - async fn test_partial_chunks() { - let data = vec![ - Ok(bytes_from_str("data: split")), - Ok(bytes_from_str("-me\n\n")), - ]; - let stream = stream::iter(data); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let events: Vec = results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(events, vec!["split-me"]); - } - - #[tokio::test] - async fn test_empty_stream() { - let data: Vec> = vec![]; - let stream = stream::iter(data); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let events: Vec = results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(events, Vec::::new()); - } - - #[tokio::test] - async fn test_no_data_prefix() { - let data = vec![Ok(bytes_from_str("event: test\nid: 123\ndata: valid\n\n"))]; - let stream = stream::iter(data); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let events: Vec = results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(events, vec!["valid"]); - } - - // New tests using the fake SSE helpers - - #[tokio::test] - async fn test_fake_sse_helper_basic() { - let events = ["message1", "message2", "message3"]; - let stream = fake_sse_response(&events, None); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let parsed_events: Vec = - results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(parsed_events, vec!["message1", "message2", "message3"]); - } - - #[tokio::test] - async fn test_fake_sse_helper_with_done() { - let events = ["before_done"]; - let stream = 
fake_sse_response_with_done(&events); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let parsed_events: Vec = - results.into_iter().collect::, _>>().unwrap(); - - // Should only contain "before_done", [DONE] should be filtered out - assert_eq!(parsed_events, vec!["before_done"]); - } - - #[tokio::test] - async fn test_fake_sse_helper_chunked_boundaries() { - let events = ["chunk_test", "split_me"]; - let stream = fake_sse_response_chunked(&events, 7); // Small chunks to force splits - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let parsed_events: Vec = - results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(parsed_events, vec!["chunk_test", "split_me"]); - } - - #[tokio::test] - async fn test_realistic_openai_style_stream() { - // Simulate a realistic OpenAI-style streaming response - let events = [ - r#"{"choices": [{"delta": {"content": "Hello"}}]}"#, - r#"{"choices": [{"delta": {"content": " there"}}]}"#, - r#"{"choices": [{"delta": {"content": "!"}}]}"#, - ]; - let stream = fake_sse_response_with_done(&events); - - let sse_stream = sse_lines(stream); - let results: Vec> = sse_stream.collect().await; - let parsed_events: Vec = - results.into_iter().collect::, _>>().unwrap(); - - assert_eq!(parsed_events.len(), 3); - assert!(parsed_events[0].contains("Hello")); - assert!(parsed_events[1].contains(" there")); - assert!(parsed_events[2].contains("!")); - } -} diff --git a/crates/rullm-core/src/utils/test_helpers.rs b/crates/rullm-core/src/utils/test_helpers.rs deleted file mode 100644 index 38f0db43..00000000 --- a/crates/rullm-core/src/utils/test_helpers.rs +++ /dev/null @@ -1,184 +0,0 @@ -//! Test utilities for simulating Server-Sent Events (SSE) responses -//! -//! This module provides helpers for creating realistic SSE streams in unit tests, -//! allowing testing of streaming parsers with various edge cases and chunk boundaries. 
- -use futures::Stream; -use std::pin::Pin; -use std::task::{Context, Poll}; - -/// Configuration for fake SSE response generation -#[derive(Debug, Clone, Default)] -pub struct FakeSseConfig { - /// Whether to append a final "[DONE]" event - pub include_done: bool, - /// Split events across multiple chunks to test partial frame handling - pub chunk_size: Option, -} - -/// Creates a fake SSE response stream for testing -/// -/// # Arguments -/// * `events` - Array of event data (without "data: " prefix) -/// * `config` - Optional configuration for response behavior -/// -/// # Returns -/// A stream that yields `Result` compatible with SSE parsers -/// -/// # Examples -/// ``` -/// use futures::StreamExt; -/// use rullm_core::utils::test_helpers::fake_sse_response; -/// -/// #[tokio::test] -/// async fn test_basic_sse() { -/// let events = ["hello", "world"]; -/// let stream = fake_sse_response(&events, None); -/// let chunks: Vec<_> = stream.collect().await; -/// // Verify chunks contain properly formatted SSE data -/// } -/// ``` -pub fn fake_sse_response( - events: &[&str], - config: Option, -) -> impl Stream> { - let config = config.unwrap_or_default(); - - // Build the complete SSE response - let mut response = String::new(); - for event in events { - response.push_str(&format!("data: {event}\n\n")); - } - - if config.include_done { - response.push_str("data: [DONE]\n\n"); - } - - FakeSseStream::new(response, config) -} - -/// Internal stream implementation for fake SSE responses -pub struct FakeSseStream { - data: Vec, - position: usize, - chunk_size: Option, -} - -impl FakeSseStream { - fn new(response: String, config: FakeSseConfig) -> Self { - Self { - data: response.into_bytes(), - position: 0, - chunk_size: config.chunk_size, - } - } -} - -impl Stream for FakeSseStream { - type Item = Result; - - fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { - // Check if we've reached the end - if self.position >= self.data.len() { - return Poll::Ready(None); - } - - // Determine chunk size (use configured size or remaining data) - let chunk_size = self.chunk_size.unwrap_or(self.data.len() - self.position); - let end_pos = std::cmp::min(self.position + chunk_size, self.data.len()); - - // Extract the chunk - let chunk = self.data[self.position..end_pos].to_vec(); - self.position = end_pos; - - Poll::Ready(Some(Ok(bytes::Bytes::from(chunk)))) - } -} - -/// Creates a fake SSE response with events split across chunk boundaries -/// -/// This is particularly useful for testing partial frame handling in SSE parsers. 
-pub fn fake_sse_response_chunked( - events: &[&str], - chunk_size: usize, -) -> impl Stream> { - fake_sse_response( - events, - Some(FakeSseConfig { - chunk_size: Some(chunk_size), - ..Default::default() - }), - ) -} - -/// Creates a fake SSE response that includes a [DONE] event at the end -pub fn fake_sse_response_with_done( - events: &[&str], -) -> impl Stream> { - fake_sse_response( - events, - Some(FakeSseConfig { - include_done: true, - ..Default::default() - }), - ) -} - -#[cfg(test)] -mod tests { - use super::*; - use futures::StreamExt; - - #[tokio::test] - async fn test_basic_fake_sse_response() { - let events = ["hello", "world"]; - let stream = fake_sse_response(&events, None); - let chunks: Vec> = stream.collect().await; - - assert_eq!(chunks.len(), 1); - let chunk = chunks[0].as_ref().unwrap(); - let data = String::from_utf8(chunk.to_vec()).unwrap(); - assert_eq!(data, "data: hello\n\ndata: world\n\n"); - } - - #[tokio::test] - async fn test_fake_sse_response_with_done() { - let events = ["test"]; - let stream = fake_sse_response_with_done(&events); - let chunks: Vec> = stream.collect().await; - - assert_eq!(chunks.len(), 1); - let chunk = chunks[0].as_ref().unwrap(); - let data = String::from_utf8(chunk.to_vec()).unwrap(); - assert_eq!(data, "data: test\n\ndata: [DONE]\n\n"); - } - - #[tokio::test] - async fn test_fake_sse_response_chunked() { - let events = ["hello", "world"]; - let stream = fake_sse_response_chunked(&events, 5); // Small chunks - let chunks: Vec> = stream.collect().await; - - // Should have multiple chunks due to small chunk size - assert!(chunks.len() > 1); - - // Reconstruct the full response - let mut full_data = String::new(); - for chunk in chunks { - let bytes = chunk.unwrap(); - full_data.push_str(core::str::from_utf8(&bytes).unwrap()); - } - - assert_eq!(full_data, "data: hello\n\ndata: world\n\n"); - } - - #[tokio::test] - async fn test_empty_events() { - let events: &[&str] = &[]; - let stream = fake_sse_response(events, None); - let chunks: Vec> = stream.collect().await; - - // Should produce no chunks for empty events - assert!(chunks.is_empty()); - } -} From 449bc7291bd3a35ff650de420b5b09ee134b4317 Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 20:59:46 +0530 Subject: [PATCH 10/14] fix(cli): add required beta headers for Anthropic OAuth OAuth requests require anthropic-beta header with oauth-2025-04-20 and other beta features to authenticate successfully. --- crates/rullm-cli/src/cli_client.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/rullm-cli/src/cli_client.rs b/crates/rullm-cli/src/cli_client.rs index c7ac565a..ae51d587 100644 --- a/crates/rullm-cli/src/cli_client.rs +++ b/crates/rullm-cli/src/cli_client.rs @@ -91,7 +91,15 @@ impl CliClient { ) -> Result { let api_key_str = api_key.into(); let client_config = if use_oauth { - AnthropicClient::builder().auth_token(api_key_str).build()? + AnthropicClient::builder() + .auth_token(api_key_str) + .betas([ + "oauth-2025-04-20", + "claude-code-20250219", + "interleaved-thinking-2025-05-14", + "fine-grained-tool-streaming-2025-05-14", + ]) + .build()? } else { AnthropicClient::builder().api_key(api_key_str).build()? }; From b7308007fc7ea3b41308440447e91d25b1aade6a Mon Sep 17 00:00:00 2001 From: lambda Date: Sat, 3 Jan 2026 21:59:42 +0530 Subject: [PATCH 11/14] feat(cli): add Gemini provider support via OpenAI-compatible endpoint Uses Google's OpenAI-compatible API at generativelanguage.googleapis.com. 
Supports gemini and google aliases with GEMINI_API_KEY env var. --- crates/rullm-cli/src/auth.rs | 4 +++ crates/rullm-cli/src/cli_client.rs | 38 ++++++++++++++++++++++++++- crates/rullm-cli/src/client.rs | 1 + crates/rullm-cli/src/commands/auth.rs | 1 + crates/rullm-cli/src/provider.rs | 13 ++++++++- 5 files changed, 55 insertions(+), 2 deletions(-) diff --git a/crates/rullm-cli/src/auth.rs b/crates/rullm-cli/src/auth.rs index a5dbf9a6..49d5fea9 100644 --- a/crates/rullm-cli/src/auth.rs +++ b/crates/rullm-cli/src/auth.rs @@ -97,6 +97,8 @@ pub struct AuthConfig { pub groq: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub openrouter: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub gemini: Option, } impl AuthConfig { @@ -162,6 +164,7 @@ impl AuthConfig { Provider::OpenAI => self.openai.as_ref(), Provider::Groq => self.groq.as_ref(), Provider::OpenRouter => self.openrouter.as_ref(), + Provider::Gemini => self.gemini.as_ref(), } } @@ -172,6 +175,7 @@ impl AuthConfig { Provider::OpenAI => &mut self.openai, Provider::Groq => &mut self.groq, Provider::OpenRouter => &mut self.openrouter, + Provider::Gemini => &mut self.gemini, } } diff --git a/crates/rullm-cli/src/cli_client.rs b/crates/rullm-cli/src/cli_client.rs index ae51d587..ad54446d 100644 --- a/crates/rullm-cli/src/cli_client.rs +++ b/crates/rullm-cli/src/cli_client.rs @@ -61,6 +61,11 @@ pub enum CliClient { model: String, config: CliConfig, }, + Gemini { + client: ChatCompletionsClient, + model: String, + config: CliConfig, + }, } impl CliClient { @@ -150,6 +155,25 @@ impl CliClient { }) } + /// Create Gemini client (using OpenAI-compatible endpoint) + pub fn gemini( + api_key: impl Into, + model: impl Into, + config: CliConfig, + ) -> Result { + let client_config = ClientConfig::builder() + .base_url("https://generativelanguage.googleapis.com/v1beta/openai") + .bearer_token(api_key.into()) + .build() + .map_err(|e| CliError::Other(e.to_string()))?; + let client = ChatCompletionsClient::new(client_config)?; + Ok(Self::Gemini { + client, + model: model.into(), + config, + }) + } + /// Simple chat - send a message and get a response pub async fn chat(&self, message: &str) -> Result { match self { @@ -167,6 +191,11 @@ impl CliClient { client, model, config, + } + | Self::Gemini { + client, + model, + config, } => { let mut builder = client.chat().model(model.as_str()).user(message); @@ -233,6 +262,11 @@ impl CliClient { client, model, config, + } + | Self::Gemini { + client, + model, + config, } => { let mut builder = client.chat().model(model.as_str()); @@ -322,6 +356,7 @@ impl CliClient { Self::Anthropic { .. } => "anthropic", Self::Groq { .. } => "groq", Self::OpenRouter { .. } => "openrouter", + Self::Gemini { .. } => "gemini", } } @@ -331,7 +366,8 @@ impl CliClient { Self::OpenAI { model, .. } | Self::Anthropic { model, .. } | Self::Groq { model, .. } - | Self::OpenRouter { model, .. } => model, + | Self::OpenRouter { model, .. } + | Self::Gemini { model, .. 
} => model, } } } diff --git a/crates/rullm-cli/src/client.rs b/crates/rullm-cli/src/client.rs index e69bac02..c9693f8b 100644 --- a/crates/rullm-cli/src/client.rs +++ b/crates/rullm-cli/src/client.rs @@ -40,6 +40,7 @@ pub fn create_client( Provider::Groq => CliClient::groq(api_key, model_name, config), Provider::OpenRouter => CliClient::openrouter(api_key, model_name, config), Provider::Anthropic => CliClient::anthropic(api_key, model_name, config, is_oauth), + Provider::Gemini => CliClient::gemini(api_key, model_name, config), } } diff --git a/crates/rullm-cli/src/commands/auth.rs b/crates/rullm-cli/src/commands/auth.rs index b1fe2ea8..90c3b932 100644 --- a/crates/rullm-cli/src/commands/auth.rs +++ b/crates/rullm-cli/src/commands/auth.rs @@ -222,6 +222,7 @@ fn format_provider_display(provider: &Provider) -> &'static str { Provider::OpenAI => "OpenAI", Provider::Groq => "Groq", Provider::OpenRouter => "OpenRouter", + Provider::Gemini => "Gemini", } } diff --git a/crates/rullm-cli/src/provider.rs b/crates/rullm-cli/src/provider.rs index c197f6f5..f26863f5 100644 --- a/crates/rullm-cli/src/provider.rs +++ b/crates/rullm-cli/src/provider.rs @@ -8,6 +8,7 @@ pub enum Provider { Groq, OpenRouter, Anthropic, + Gemini, } impl std::fmt::Display for Provider { @@ -17,6 +18,7 @@ impl std::fmt::Display for Provider { Provider::Groq => "groq", Provider::OpenRouter => "openrouter", Provider::Anthropic => "anthropic", + Provider::Gemini => "gemini", }; write!(f, "{name}") } @@ -24,7 +26,13 @@ impl std::fmt::Display for Provider { impl ValueEnum for Provider { fn value_variants<'a>() -> &'a [Self] { - &[Self::OpenAI, Self::Groq, Self::OpenRouter, Self::Anthropic] + &[ + Self::OpenAI, + Self::Groq, + Self::OpenRouter, + Self::Anthropic, + Self::Gemini, + ] } fn to_possible_value(&self) -> Option { @@ -33,6 +41,7 @@ impl ValueEnum for Provider { Self::Groq => PossibleValue::new("groq"), Self::OpenRouter => PossibleValue::new("openrouter"), Self::Anthropic => PossibleValue::new("anthropic"), + Self::Gemini => PossibleValue::new("gemini"), }; Some(value) } @@ -45,6 +54,7 @@ impl Provider { Provider::Groq => &["groq"], Provider::OpenRouter => &["openrouter"], Provider::Anthropic => &["anthropic", "claude"], + Provider::Gemini => &["gemini", "google"], } } @@ -77,6 +87,7 @@ impl Provider { Provider::Groq => "GROQ_API_KEY", Provider::OpenRouter => "OPENROUTER_API_KEY", Provider::Anthropic => "ANTHROPIC_API_KEY", + Provider::Gemini => "GEMINI_API_KEY", } } } From aad324ae853bb35d56f7ae19c9d90cd92e0d83e6 Mon Sep 17 00:00:00 2001 From: lambda Date: Sun, 4 Jan 2026 00:09:16 +0530 Subject: [PATCH 12/14] docs: consolidate and clean up spec files --- .../spec/implementation-final.md | 258 ---------- crates/rullm-anthropic/spec/implementation.md | 256 ---------- .../spec/chat-completion-api.md | 103 ---- .../spec/chat-completion-comparison.md | 379 +++++++++++++++ .../spec/chat-completion.md | 42 ++ .../spec/chat-completion2.md | 289 ----------- .../spec/implementation-final.md | 449 ------------------ .../spec/implementation.md | 224 --------- spec/chat-completion-comparison.md | 234 --------- spec/chat-completion-comparison2.md | 69 --- 10 files changed, 421 insertions(+), 1882 deletions(-) delete mode 100644 crates/rullm-anthropic/spec/implementation-final.md delete mode 100644 crates/rullm-anthropic/spec/implementation.md delete mode 100644 crates/rullm-chat-completion/spec/chat-completion-api.md create mode 100644 crates/rullm-chat-completion/spec/chat-completion-comparison.md delete mode 100644 
crates/rullm-chat-completion/spec/chat-completion2.md delete mode 100644 crates/rullm-chat-completion/spec/implementation-final.md delete mode 100644 crates/rullm-chat-completion/spec/implementation.md delete mode 100644 spec/chat-completion-comparison.md delete mode 100644 spec/chat-completion-comparison2.md diff --git a/crates/rullm-anthropic/spec/implementation-final.md b/crates/rullm-anthropic/spec/implementation-final.md deleted file mode 100644 index fffc174f..00000000 --- a/crates/rullm-anthropic/spec/implementation-final.md +++ /dev/null @@ -1,258 +0,0 @@ -# Anthropic Messages Rust Client - Implementation Design - -This document proposes an idiomatic Rust client for the Anthropic Messages API. -It is based on `spec/message-api.md`, `spec/implementation.md`, and patterns in -rullm-core. The design emphasizes ergonomic builders, strong typing, streaming -helpers, and clean error handling. - -## 1) Goals and non-goals - -Goals -- Feature parity with official Anthropic SDKs for the Messages API. -- Excellent developer experience: easy defaults, expressive builders, helpers for - common tasks, and easy streaming consumption. -- Forward compatibility: tolerate unknown enum values and fields. - -Non-goals (initial release) -- A cross-provider abstraction layer for non-Anthropic APIs (OpenAI/Gemini/etc.). -- Full Bedrock/Vertex implementations (can be added later). - -## 2) Package layout (proposed) - -``` -crates/rullm-anthropic/src/ - client.rs // Client, ClientBuilder, RequestOptions - config.rs // env helpers, base url, auth modes - error.rs // AnthropicError, ErrorObject - messages/ // requests, responses, types - mod.rs - types.rs // content blocks, tools, streaming events - stream.rs // SSE parsing + accumulator - models.rs // list/get models - batches.rs // create/get/list/cancel/delete/results - transport.rs // HttpTransport trait + reqwest impl - lib.rs // re-exports -``` - -Notes -- Keep the public surface in `lib.rs` small and intentional. -- Prefer `crate::` paths (avoid `super::`). -- Avoid `pub use` unless re-exporting external dependencies. - -## 3) Client configuration and auth - -### 3.1 ClientBuilder -Provide a builder with explicit fields and env defaults: - -- `Client::builder()` -> `ClientBuilder` -- `Client::from_env()` -> uses: - - `ANTHROPIC_API_KEY` - - `ANTHROPIC_AUTH_TOKEN` - - `ANTHROPIC_BASE_URL` - -Auth modes: -- API key: `x-api-key: ` -- OAuth token: `Authorization: Bearer ` - -Required headers: -- `anthropic-version: 2023-06-01` -- `content-type: application/json` - -Recommended builder fields: -- `api_key: Option>` -- `auth_token: Option>` -- `base_url: Arc` (default `https://api.anthropic.com`) -- `timeout: Duration` (global default) -- `max_retries: u32` -- `beta: Vec>` (optional `anthropic-beta` header) -- `default_headers: HeaderMap` (merge-able) - -### 3.2 RequestOptions (per-request override) -A small options struct to keep the API uniform across clients (even if the -provider APIs differ): - -- `timeout: Option` -- `extra_headers: HeaderMap` -- `extra_query: Vec<(Arc, Arc)>` -- `extra_body: serde_json::Map` - -This mirrors `extra_headers/extra_query/extra_body` patterns in other SDKs. 
- -## 4) Messages API surface - -### 4.1 Primary entry points -Expose a sub-client similar to official SDKs: - -- `Client::messages()` -> `MessagesClient` -- `MessagesClient::create(req, opts)` -> `Message` -- `MessagesClient::stream(req, opts)` -> `MessageStream` -- `MessagesClient::count_tokens(req, opts)` -> `CountTokensResponse` -- `MessagesClient::batches()` -> `BatchesClient` - -### 4.2 Builder ergonomics -Provide a builder for the request that favors clarity: - -``` -MessagesRequest::builder("claude-3-5-sonnet-20241022") - .max_tokens(1024) - .system("You are helpful") - .message(Message::user("Hello")) - .temperature(0.7) - .build()?; -``` - -Design notes -- `system` is top-level (no system role in messages). -- Accept `system` as `SystemContent` (string or text blocks). -- `messages` accept `MessageContent` (string shorthand or blocks). - -### 4.3 Type modeling overview - -Request -- `MessagesRequest { model, max_tokens, messages, system?, metadata?, stop_sequences?, temperature?, top_p?, top_k?, tools?, tool_choice?, thinking?, service_tier?, stream? }` - -Response -- `Message { id, type, role, model, content, stop_reason?, stop_sequence?, usage }` - -Use `serde` tagging: -- `#[serde(tag = "type", rename_all = "snake_case")]` for content blocks -- `#[serde(untagged)]` for `string | [blocks]` unions - -## 5) Content blocks and tools - -### 5.1 ContentBlockParam (input) -Support all common and advanced blocks: -- `text` -- `image` (base64 or url) -- `document` (pdf base64/url, plain text, or embedded blocks) -- `search_result` -- `tool_result` -- advanced: `tool_use`, `server_tool_use`, `web_search_tool_result`, - `thinking`, `redacted_thinking` - -### 5.2 ContentBlock (output) -Support output blocks: -- `text`, `tool_use`, `thinking`, `redacted_thinking`, `server_tool_use`, - `web_search_tool_result` - -### 5.3 Tools -Use a union for custom and server tools: -- Custom: `{ name, description?, input_schema }` -- Server tools: `bash_20250124`, `text_editor_20250124/20250429/20250728`, - `web_search_20250305` - -Tool choice union: -- `auto | any | none | tool(name)` -- `disable_parallel_tool_use: bool` - -## 6) Streaming design - -### 6.1 Raw SSE -Streaming uses SSE with event `type`: -- `message_start` -- `content_block_start` -- `content_block_delta` -- `content_block_stop` -- `message_delta` -- `message_stop` - -Implement a tolerant SSE parser: -- buffer partial chunks -- ignore empty/comment lines -- stop on stream close -- surface JSON parse errors as `AnthropicError::Serialization` - -### 6.2 MessageStream helper -Provide a higher-level stream wrapper that merges deltas into a full message. - -Proposed API: -- `MessageStream::events()` -> raw `StreamEvent` -- `MessageStream::text_stream()` -> `impl Stream, Error>>` -- `MessageStream::final_message()` -> `Result` (awaits completion) - -Use a `MessageAccumulator` internally: -- append text deltas -- merge tool input JSON fragments -- update usage/stop_reason - -### 6.3 Tool input JSON deltas -Maintain both: -- `partial_json: String` -- `parsed: Option` (best-effort) - -Parsing strategy: -- append fragment on each delta -- attempt `serde_json::from_str` after each update -- keep the last successful parse - -This avoids a hard dependency on a partial JSON parser while still offering -useful intermediate values. - -## 7) Timeout policy - -The official SDKs enforce a non-streaming timeout policy. 
Mirror it: - -- Default non-stream timeout: 10 minutes -- `expected_time = 1h * max_tokens / 128000` -- If `expected_time > 10m`, require streaming -- Maintain a `MODEL_NONSTREAMING_TOKENS` map (from SDKs) - -Expose this as: -- `ClientConfig::non_streaming_policy` -- `MessagesRequest::validate_non_streaming(&policy)` - -Allow opt-out via `RequestOptions::allow_long_non_streaming`. - -## 8) Error handling - -Use a structured error enum and preserve request_id: - -``` -enum AnthropicError { - Api { status: StatusCode, request_id: Option>, error: ErrorObject }, - Transport(reqwest::Error), - Serialization(String, Box), - Timeout, - InvalidRequest(String), -} -``` - -`ErrorObject` mirrors the response: -- `type`, `message` (plus optional `param` when present) - -Always surface `request-id` header in errors and responses. - -## 9) Rust ergonomics and idioms - -- Avoid panics in library code. No `unwrap`/`expect` in production paths. -- Use `Arc` and `Arc<[T]>` for immutable data cloned often. -- Prefer `From`/`TryFrom` for conversions rather than custom `to_*` methods. -- Provide `Option<&T>` accessors instead of `&Option`. -- Use `&str`/`&[T]` in accessors instead of `&String`/`&Vec`. - -## 10) Example usage (final API shape) - -Non-streaming: -``` -let client = Client::from_env()?; -let req = MessagesRequest::builder("claude-3-5-sonnet-20241022") - .max_tokens(512) - .system("You are helpful") - .message(Message::user("Explain Rust lifetimes.")) - .temperature(0.7) - .build()?; - -let msg = client.messages().create(req, RequestOptions::default()).await?; -let text = msg.text(); // helper to join text blocks -``` - -Streaming: -``` -let stream = client.messages().stream(req, RequestOptions::default()).await?; -let mut text = String::new(); -let mut s = stream.text_stream(); -while let Some(chunk) = s.next().await { - text.push_str(&chunk?); -} -let final_msg = stream.final_message().await?; -``` diff --git a/crates/rullm-anthropic/spec/implementation.md b/crates/rullm-anthropic/spec/implementation.md deleted file mode 100644 index 5f945428..00000000 --- a/crates/rullm-anthropic/spec/implementation.md +++ /dev/null @@ -1,256 +0,0 @@ -# Rust SDK port notes (implementation guidance) - -This document captures what we need to know to port the official Anthropic SDK to Rust. It is based on: -- The local `reference.md` spec -- The Go SDK (strong typed model, helpers, timeout logic) -- The Python and TypeScript SDKs (streaming helpers, ergonomics) - -The goal is to provide a Rust API that is feature-parity with the official SDKs while fitting the rullm workspace style. 
- -## 1) Scope and parity targets - -Minimum parity for a first pass (mirrors Go/TS/Python): -- Core API client with config and auth -- `messages`: - - create (non-stream) - - create with streaming - - stream helper (aggregates raw SSE events) - - count_tokens - - message batches (create/get/list/cancel/delete/results) -- `models` (list/get) -- `completions` (legacy API) -- `beta` resources (optional, via `anthropic-beta` header) - -Optional parity: -- Bedrock / Vertex clients (Go/Python include these) -- Helpers for prompt caching -- Deprecated model warnings - -## 2) Client configuration and auth - -Match official SDK behavior: -- Env vars: - - `ANTHROPIC_API_KEY` - - `ANTHROPIC_AUTH_TOKEN` - - `ANTHROPIC_BASE_URL` -- Base URL default: `https://api.anthropic.com` -- Headers: - - `anthropic-version: 2023-06-01` - - `X-Api-Key` OR `Authorization: Bearer ` -- Provide per-request overrides: - - timeout - - extra headers - - extra query params - - extra body fields - -Recommendation: -- Build a `Client` struct similar to Go: - - `Client::new(api_key, auth_token, base_url, options...)` - - `Client::from_env()` - - sub-services: `messages`, `models`, `completions`, `beta` - -## 3) HTTP layer and retries - -The official SDKs include retry support and expose: -- `max_retries` -- default timeout - -Rust port should: -- Use `reqwest` (already in workspace) -- Support global and per-request timeout -- Expose retry policy (even a simple fixed retry is OK initially) -- Surface `request-id` header in responses - -## 4) Serialization strategy (serde) - -The Messages API relies heavily on tagged unions. Use: -- `#[serde(tag = "type", rename_all = "snake_case")]` for unions with a `type` discriminator -- `#[serde(untagged)]` for unions like `string | [blocks]` -- `serde_json::Value` for tool input and JSON Schema fields - -Suggested core enums: -- `ContentBlock` (output) -- `ContentBlockParam` (input) -- `ToolUnion` / `ToolChoice` -- `ThinkingConfig` -- `Citation` and citation location variants -- `MessageStreamEvent` and delta variants - -Notes from SDKs: -- Go avoids the `string` shorthand for `messages.content` (requires blocks). -- Python/TS accept `string | [blocks]`. -- Rust can support both by using `#[serde(untagged)]` plus helpers that convert strings to text blocks. 
- -## 5) Messages API design in Rust - -### 5.1 Request structs -Required: -- `model: String` (or `Model` wrapper) -- `max_tokens: u32` -- `messages: Vec` - -Optional: -- `system: String | Vec` -- `metadata` -- `stop_sequences` -- `temperature`, `top_p`, `top_k` -- `tools`, `tool_choice` -- `thinking` -- `service_tier` -- `stream` - -### 5.2 Response structs -`Message` includes: -- `id`, `type`, `role`, `model` -- `content: Vec` -- `stop_reason`, `stop_sequence` -- `usage` (input/output tokens, cache fields, service_tier, server_tool_use) - -### 5.3 Tooling types -Support both custom and server tools: -- Custom: `name`, `description?`, `input_schema` -- Server tools: `bash_20250124`, `text_editor_20250124/20250429/20250728`, `web_search_20250305` -Tool choice union: -- `auto`, `any`, `tool`, `none` -- `disable_parallel_tool_use` boolean - -### 5.4 Content blocks (input) -At minimum: -- `text` -- `image` (base64 or url) -- `document` (pdf base64 or url, plain text, or embedded content) -- `search_result` -- `tool_result` - -Advanced (for parity): -- `tool_use` (rare in input, but used for continuity) -- `server_tool_use` -- `web_search_tool_result` -- `thinking`, `redacted_thinking` - -## 6) Streaming support - -### 6.1 Raw SSE -Streaming sends SSE events, each with a JSON object containing a `type`: -- `message_start` -- `content_block_start` -- `content_block_delta` -- `content_block_stop` -- `message_delta` -- `message_stop` - -`content_block_delta` variants: -- `text_delta` -- `input_json_delta` (tool input) -- `citations_delta` -- `thinking_delta` -- `signature_delta` - -### 6.2 Stream helper (recommended) -Python/TS expose a higher-level stream helper that: -- Accumulates a `Message` snapshot -- Emits derived events (`text`, `citation`, `thinking`, `input_json`, etc.) -- Provides `text_stream` and `get_final_message()` helpers - -Rust port should consider: -- `MessageStream` wrapper that consumes raw SSE events -- `MessageStream::text_stream()` yields only text deltas -- `MessageStream::final_message()` returns accumulated message - -### 6.3 Partial JSON parsing -`input_json_delta` sends partial JSON strings. 
-- TS uses a partial JSON parser -- Python uses `jiter` partial parsing - -Rust options: -- Accumulate raw JSON text per tool-use block and parse on each delta -- Use a partial JSON parser crate if available - -Maintain both: -- `partial_json` text buffer -- best-effort parsed `serde_json::Value` snapshot - -## 7) Timeout behavior (important) - -Go and TS enforce a non-streaming timeout: -- Default non-streaming timeout: 10 minutes -- `expected_time = 1h * max_tokens / 128000` -- If `expected_time > 10 minutes`, or `max_tokens` exceeds a model-specific limit, require streaming - -Port should: -- Include a `MODEL_NONSTREAMING_TOKENS` map (from SDKs) -- Compute timeout and error when streaming is required -- Allow caller override for timeout - -## 8) Errors - -Error responses are structured: -- `error: { type, message }` -- `request_id` - -Types include: -- `invalid_request_error` -- `authentication_error` -- `billing_error` -- `permission_error` -- `not_found_error` -- `rate_limit_error` -- `timeout_error` -- `api_error` -- `overloaded_error` - -Rust error enum should: -- Preserve HTTP status -- Preserve `request_id` -- Keep raw body for debugging - -## 9) Pagination - -List endpoints use cursor params: -- `after_id`, `before_id`, `limit` - -Responses include: -- `data: []` -- `has_more` -- `first_id`, `last_id` - -Provide a `Page` with cursor helpers, similar to Go's pagination module. - -## 10) Deprecation warnings (optional) - -Python/TS warn on deprecated models using a known list. -Rust port can: -- Maintain a `DEPRECATED_MODELS` map -- Emit warnings (log or `eprintln!`) - -## 11) Beta support - -Go SDK includes `beta` services and uses the `anthropic-beta` header. -For parity: -- Allow optional `betas: Vec` header -- Provide beta resources where needed (models/messages/files) - -## 12) Suggested module layout - -``` -crates/rullm-anthropic/src/ - client.rs // Client config, auth, request builder - error.rs // Error types and mapping - resources/ - messages.rs // create, stream, count_tokens - message_batches.rs // create/get/list/cancel/delete/results - models.rs - completions.rs - beta/... - types/ - message.rs - content_block.rs - tool.rs - streaming.rs - streaming/ - sse.rs // SSE parser or reuse rullm-core - message_stream.rs // high-level aggregator -``` - -This layout mirrors the official SDKs while fitting Rust conventions. - diff --git a/crates/rullm-chat-completion/spec/chat-completion-api.md b/crates/rullm-chat-completion/spec/chat-completion-api.md deleted file mode 100644 index 0e6435bb..00000000 --- a/crates/rullm-chat-completion/spec/chat-completion-api.md +++ /dev/null @@ -1,103 +0,0 @@ -# OpenAI Chat Completions API - High-Level Spec - -This is a high-level overview of the OpenAI Chat Completions REST API. For full -field-level details, see `chat-completion.md` and `chat-completion2.md`. - -## Positioning -- Endpoint family: `/v1/chat/completions` -- Status: supported but legacy; newer integrations often use the Responses API. -- Still required when you want classic chat-completion object shapes or stored - completion CRUD endpoints. 
- -## Endpoints -- POST `/v1/chat/completions` - create a completion (optionally streaming) -- GET `/v1/chat/completions/{completion_id}` - retrieve stored completion -- GET `/v1/chat/completions` - list stored completions (pagination) -- POST `/v1/chat/completions/{completion_id}` - update stored completion metadata -- DELETE `/v1/chat/completions/{completion_id}` - delete stored completion -- GET `/v1/chat/completions/{completion_id}/messages` - list stored messages - -## Auth and Headers -- Authorization: `Authorization: Bearer ` -- Optional routing: `OpenAI-Organization`, `OpenAI-Project` -- Content-Type: `application/json` -- Useful response headers: `x-request-id`, `openai-processing-ms`, `x-ratelimit-*` - -## Core Request Shape -```json -{ - "model": "gpt-4o", - "messages": [...], - "stream": false -} -``` - -### Messages and Content -Messages are role-tagged objects. `content` is either a string or an array of -content parts. - -Roles (non-exhaustive): -- `system` (legacy instructions) -- `developer` (preferred for reasoning models) -- `user` -- `assistant` -- `tool` -- `function` (deprecated) - -Content parts (union by `type`): -- `text` `{ type: "text", text: "..." }` -- `image_url` `{ type: "image_url", image_url: { url, detail? } }` -- `input_audio` `{ type: "input_audio", input_audio: { data, format } }` -- `file` `{ type: "file", file: { file_id | file_data, filename? } }` -- `refusal` (assistant-only content part) - -Assistant messages may omit `content` and instead include `tool_calls`. -Tool responses use role `tool` and include `tool_call_id`. - -## Common Request Parameters (high-level) -- Sampling: `temperature`, `top_p`, `presence_penalty`, `frequency_penalty` -- Tokens: `max_completion_tokens`, `max_tokens` (deprecated) -- Output count: `n` -- Stopping: `stop` -- Logprobs: `logprobs`, `top_logprobs` -- Tools: `tools`, `tool_choice`, `parallel_tool_calls` -- Structured outputs: `response_format` (`json_schema` or `json_object`) -- Audio output: `modalities`, `audio` -- Web search: `web_search_options` -- Predicted outputs: `prediction` -- Prompt caching: `prompt_cache_key`, `prompt_cache_retention` -- Safety: `safety_identifier` (replaces `user`) -- Storage: `store`, `metadata` -- Service tiers: `service_tier` -- Reasoning: `reasoning_effort`, `verbosity` -- Streaming: `stream`, `stream_options` - -## Non-Streaming Response Shape -Chat completion object: -- `id`, `object: "chat.completion"`, `created`, `model` -- `choices[]`: each includes `message`, `finish_reason`, optional `logprobs` -- `message`: `role: assistant`, `content` or `refusal`, `tool_calls`, `audio`, - optional `annotations` (web search) -- `usage`: `prompt_tokens`, `completion_tokens`, `total_tokens` + details -- `service_tier`, `system_fingerprint` (deprecated) - -Finish reasons can include: `stop`, `length`, `tool_calls`, `content_filter`, -`function_call` (deprecated). - -## Streaming (SSE) -- Enable with `stream: true`. -- The server emits SSE events whose data is a `chat.completion.chunk` object. -- Each chunk has `choices[].delta` with partial data: - - `role`, `content`, `refusal`, `tool_calls`, `function_call` (deprecated) -- Tool call arguments arrive as streamed string fragments. -- `stream_options` supports: - - `include_usage` (final usage-only chunk) - - `include_obfuscation` (adds obfuscation fields to normalize payload sizes) -- Stream ends with `data: [DONE]` or connection close. 
- -## Errors and Rate Limits -- Errors return a top-level `error` object with fields like `message`, `type`, - `param`, `code`. -- Streaming may emit an error object inside the SSE `data` payload. -- Rate limit headers provide request and token budgets; clients should parse and - surface them. diff --git a/crates/rullm-chat-completion/spec/chat-completion-comparison.md b/crates/rullm-chat-completion/spec/chat-completion-comparison.md new file mode 100644 index 00000000..2f81db26 --- /dev/null +++ b/crates/rullm-chat-completion/spec/chat-completion-comparison.md @@ -0,0 +1,379 @@ +# Chat Completion APIs: Cross-Provider Comparison Guide + +The AI API landscape has coalesced around OpenAI's design patterns, but significant differences remain beneath the surface. This guide maps the common ground and critical divergences developers need to navigate when building multi-provider applications. + +## OpenAI Compatibility Spectrum + +Three providers—Groq, OpenRouter, and OpenAI itself—share an identical request/response schema, making code portability straightforward. Anthropic and Google Gemini diverge significantly, each with unique terminology and structural choices. + +| Provider | OpenAI Compatible | Migration Complexity | +|----------|-------------------|---------------------| +| **OpenAI** | Baseline reference | N/A | +| **Groq** | Yes (drop-in) | Change base URL + API key | +| **OpenRouter** | Yes (drop-in) | Change base URL + API key | +| **Anthropic** | No | Requires schema rewrite | +| **Google Gemini** | No | Requires schema rewrite | + +To use OpenAI's Python SDK with Groq or OpenRouter, only the base URL changes: + +```python +from openai import OpenAI +client = OpenAI( + base_url="https://api.groq.com/openai/v1", # or "https://openrouter.ai/api/v1" + api_key="YOUR_API_KEY" +) +``` + +### Design Philosophy by Provider + +- **Anthropic**: Safety-first with strict schema enforcement. Enforces alternating user/assistant roles to prevent jailbreaking. System prompts elevated to top-level parameter for higher authority. Version header required for enterprise stability. +- **Google Gemini**: Multimodal-native. Uses `parts`-based architecture treating text, images, video, and audio as equivalent units. Designed for enterprise cloud integration via Vertex AI. +- **Groq**: Velocity-centric. Mimics OpenAI specification exactly for frictionless adoption—drop-in compatibility prioritized over architectural novelty. +- **OpenRouter**: Normalization layer. Abstracts ecosystem fragmentation behind unified OpenAI-compatible interface with routing intelligence. + +## Endpoints and Authentication + +All providers use REST APIs with JSON payloads, but authentication headers and endpoint paths differ. 
+ +| Provider | Base URL | Endpoint | Auth Header | +|----------|----------|----------|-------------| +| **OpenAI** | `api.openai.com` | `/v1/chat/completions` | `Authorization: Bearer $KEY` | +| **Anthropic** | `api.anthropic.com` | `/v1/messages` | `x-api-key: $KEY` + `anthropic-version: 2023-06-01` | +| **Google Gemini** | `generativelanguage.googleapis.com` | `/v1beta/models/{model}:generateContent` | `x-goog-api-key: $KEY` or OAuth | +| **Groq** | `api.groq.com` | `/openai/v1/chat/completions` | `Authorization: Bearer $KEY` | +| **OpenRouter** | `openrouter.ai` | `/api/v1/chat/completions` | `Authorization: Bearer $KEY` | + +**Key differences:** +- Anthropic requires `anthropic-version` header on every request—forces clients to pin to specific schema version +- OpenRouter accepts optional `HTTP-Referer` and `X-Title` headers for community rankings + +### Google's Bifurcated Authentication + +Google offers two distinct authentication paths: + +1. **Google AI Studio (Prototyping)**: Simple API key via `x-goog-api-key` header +2. **Vertex AI (Enterprise)**: Google Cloud IAM with OAuth 2.0 access tokens via Service Accounts + +Code written for AI Studio often requires significant refactoring for Vertex AI deployment—a friction point absent with other providers. + +## Request Structure + +### Message Format + +The most impactful difference is **how system prompts are handled**. + +**OpenAI/Groq/OpenRouter format:** +```json +{ + "model": "gpt-4o", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"} + ] +} +``` + +**Anthropic format:** +```json +{ + "model": "claude-sonnet-4-5", + "max_tokens": 1024, + "system": "You are a helpful assistant.", + "messages": [ + {"role": "user", "content": "Hello!"} + ] +} +``` + +**Google Gemini format:** +```json +{ + "systemInstruction": {"parts": [{"text": "You are a helpful assistant."}]}, + "contents": [ + {"role": "user", "parts": [{"text": "Hello!"}]} + ] +} +``` + +**Terminology mapping:** +| Concept | OpenAI/Groq/OpenRouter | Anthropic | Gemini | +|---------|------------------------|-----------|--------| +| Message list | `messages` | `messages` | `contents` | +| Message content | `content` | `content` | `parts` | +| Assistant role | `assistant` | `assistant` | `model` | +| System prompt | Message with `role: system` | Top-level `system` | `systemInstruction` | + +**Anthropic strict alternation**: Anthropic enforces rigorous alternation between user and assistant roles. A sequence of `user, user` is invalid (400 error). Client must merge consecutive same-role messages. + +### Control Parameters + +| Parameter | OpenAI | Anthropic | Gemini | Groq | +|-----------|--------|-----------|--------|------| +| `max_tokens` | Optional | **Required** | Optional (`maxOutputTokens`) | Optional | +| `temperature` | 0-2 (default 1) | 0-1 (default 1) | 0-2 (default 1) | 0-2 (default 1) | +| `top_p` | ✓ | ✓ | ✓ (`topP`) | ✓ | +| `top_k` | ✗ | ✓ | ✓ (`topK`) | ✗ | +| `frequency_penalty` | ✓ | ✗ | ✓ | ✗ | +| `presence_penalty` | ✓ | ✗ | ✓ | ✗ | + +**Critical**: Anthropic requires `max_tokens` on every request—catches many developers migrating from OpenAI. + +**Groq limitations**: Does not support `frequency_penalty`, `presence_penalty`, `logprobs`, or `n > 1`. Requests using these return 400 errors. 
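+
+These gaps are easy to absorb client-side by normalizing the parameter map before dispatch. A minimal Rust sketch (the `Provider` enum, the default `max_tokens`, and the exact field handling are illustrative, not part of any provider SDK):
+
+```rust
+use serde_json::{json, Value};
+
+#[derive(Clone, Copy)]
+enum Provider {
+    OpenAi,
+    Anthropic,
+    Gemini,
+    Groq,
+}
+
+/// Adjust a generic parameter map so each provider will accept it.
+/// Field names follow the table above; values are placeholders.
+fn normalize_params(provider: Provider, params: &mut Value) {
+    let Some(obj) = params.as_object_mut() else { return };
+    match provider {
+        Provider::Anthropic => {
+            // Anthropic rejects requests that omit max_tokens, so supply a default.
+            obj.entry("max_tokens").or_insert(json!(1024));
+            // Anthropic has no frequency/presence penalties.
+            obj.remove("frequency_penalty");
+            obj.remove("presence_penalty");
+        }
+        Provider::Groq => {
+            // Groq returns 400 for these OpenAI-only knobs, so drop them,
+            // and fall back to a single choice because n > 1 is unsupported.
+            for key in ["frequency_penalty", "presence_penalty", "logprobs"] {
+                obj.remove(key);
+            }
+            if obj.get("n").and_then(Value::as_u64).unwrap_or(1) > 1 {
+                obj.insert("n".to_string(), json!(1));
+            }
+        }
+        Provider::OpenAi | Provider::Gemini => {}
+    }
+}
+
+fn main() {
+    let mut params = json!({ "temperature": 0.7, "frequency_penalty": 0.5, "n": 2 });
+    normalize_params(Provider::Groq, &mut params);
+    assert!(params.get("frequency_penalty").is_none());
+    assert_eq!(params["n"], json!(1));
+}
+```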
+ +### Thinking/Reasoning Configuration + +- **Anthropic (Claude 3.7+)**: `thinking` block with `budget_tokens` parameter—explicitly reserves capacity for chain-of-thought +- **Gemini 2.0**: `thinking_config` with levels (`"low"`, `"high"`)—reasoning depth as configuration toggle + +## Response Structure + +**OpenAI/Groq/OpenRouter response:** +```json +{ + "id": "chatcmpl-abc123", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": "Hello!"}, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} +} +``` + +**Anthropic response:** +```json +{ + "id": "msg_01XFD...", + "content": [{"type": "text", "text": "Hello!"}], + "stop_reason": "end_turn", + "usage": {"input_tokens": 10, "output_tokens": 5} +} +``` + +**Google Gemini response:** +```json +{ + "candidates": [{ + "content": {"parts": [{"text": "Hello!"}], "role": "model"}, + "finishReason": "STOP" + }], + "usageMetadata": {"promptTokenCount": 10, "candidatesTokenCount": 5} +} +``` + +### Stop/Finish Reasons + +| Reason | OpenAI/Groq | Anthropic | Gemini | +|--------|-------------|-----------|--------| +| Natural completion | `stop` | `end_turn` | `STOP` | +| Token limit | `length` | `max_tokens` | `MAX_TOKENS` | +| Tool call | `tool_calls` | `tool_use` | (function call in content) | +| Content filter | `content_filter` | — | `SAFETY` | +| Copyright | — | — | `RECITATION` | + +**Gemini-specific**: `RECITATION` triggers when output is too similar to copyrighted training data, blocking the response. + +## Streaming + +All providers use Server-Sent Events (SSE), but event structure differs significantly. + +**OpenAI/Groq streaming chunk:** +``` +data: {"choices":[{"delta":{"content":"Hello"}}]} +data: {"choices":[{"delta":{"content":" there"}}]} +data: [DONE] +``` + +**Anthropic streaming events:** +``` +event: message_start +data: {"type":"message_start","message":{...}} + +event: content_block_start +data: {"type":"content_block_start","index":0,"content_block":{"type":"text"}} + +event: content_block_delta +data: {"type":"content_block_delta","delta":{"type":"text_delta","text":"Hello"}} + +event: message_stop +data: {"type":"message_stop"} +``` + +Anthropic's verbose event system provides richer metadata—separate events for tool use, thinking blocks—but requires different parsing logic. + +**Gemini**: Streams partial `GenerateContentResponse` objects via `streamGenerateContent?alt=sse`. May emit "empty" chunks containing only citation metadata or safety ratings—client must filter for actual text content. + +## Tool Use / Function Calling + +All providers support JSON Schema-based tool definitions, but schemas differ. + +### Tool Definitions + +**OpenAI/Groq/OpenRouter:** +```json +{"type": "function", "function": {"name": "get_weather", "parameters": {...}}} +``` + +**Anthropic:** +```json +{"name": "get_weather", "input_schema": {...}} +``` + +**Google Gemini:** +```json +{"functionDeclarations": [{"name": "get_weather", "parameters": {...}}]} +``` + +Note: Anthropic uses `input_schema` instead of `parameters`. 
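+
+Because the JSON Schema payload is identical across providers, a single canonical tool definition can be re-shaped at request time. A rough Rust sketch (the `ToolDef` struct is hypothetical, not a published API; only the wrapping differs per provider):
+
+```rust
+use serde_json::{json, Value};
+
+/// Provider-neutral tool definition; `schema` holds a JSON Schema object.
+pub struct ToolDef {
+    pub name: String,
+    pub description: String,
+    pub schema: Value,
+}
+
+impl ToolDef {
+    /// OpenAI, Groq, OpenRouter: {"type": "function", "function": {...}}.
+    pub fn to_openai(&self) -> Value {
+        json!({
+            "type": "function",
+            "function": {
+                "name": self.name,
+                "description": self.description,
+                "parameters": self.schema
+            }
+        })
+    }
+
+    /// Anthropic: flat object, with `input_schema` instead of `parameters`.
+    pub fn to_anthropic(&self) -> Value {
+        json!({
+            "name": self.name,
+            "description": self.description,
+            "input_schema": self.schema
+        })
+    }
+
+    /// Gemini: wrapped in a `functionDeclarations` array.
+    pub fn to_gemini(&self) -> Value {
+        json!({
+            "functionDeclarations": [{
+                "name": self.name,
+                "description": self.description,
+                "parameters": self.schema
+            }]
+        })
+    }
+}
+```
+
+Tool results need the same treatment on the way back: only the wrapping role and the ID field names differ, as the following sections show.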
+ +### Tool Invocation (Model's Request) + +**OpenAI/Groq/OpenRouter** — `tool_calls` array with unique `id`: +```json +{"tool_calls": [{"id": "call_abc", "function": {"name": "get_weather", "arguments": "{\"city\":\"London\"}"}}]} +``` + +**Anthropic** — `tool_use` content block (can follow text blocks for chain-of-thought): +```json +{"type": "tool_use", "id": "toolu_01...", "name": "get_weather", "input": {"city": "London"}} +``` + +**Gemini** — `functionCall` part: +```json +{"parts": [{"functionCall": {"name": "get_weather", "args": {"city": "London"}}}]} +``` + +### Result Submission (Client's Response) + +**OpenAI/Groq/OpenRouter** — Dedicated `tool` role: +```json +{"role": "tool", "tool_call_id": "call_abc", "content": "25°C"} +``` + +**Anthropic** — `tool_result` block in `user` message: +```json +{"role": "user", "content": [{"type": "tool_result", "tool_use_id": "toolu_01...", "content": "25°C"}]} +``` + +**Gemini** — `functionResponse` part: +```json +{"role": "user", "parts": [{"functionResponse": {"name": "get_weather", "response": {"result": "25°C"}}}]} +``` + +**Key difference**: Anthropic has no separate "tool" role—the user reports tool results. + +### Tool Choice Options + +All providers support `auto`, `none`, and forced tool selection. Anthropic adds `any` (must use at least one tool). Parallel tool calls supported by OpenAI, Groq, and Anthropic (default enabled). + +## Multimodality + +### Image Transmission + +**Anthropic** — Base64 encoding (bandwidth-intensive): +```json +{"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": "..."}} +``` + +**Groq** — OpenAI `image_url` format (URL or base64): +```json +{"type": "image_url", "image_url": {"url": "https://..." }} +``` + +**Gemini** — Supports inline `inline_data` (base64) or `file_data` (Cloud Storage URI): +```json +{"file_data": {"mime_type": "video/mp4", "file_uri": "gs://my-bucket/video.mp4"}} +``` + +Gemini's `file_data` allows processing hours of video/audio—impossible via base64. + +### Media Support Matrix + +| Media Type | OpenAI | Anthropic | Gemini | Groq | +|------------|--------|-----------|--------|------| +| Images | ✓ | ✓ | ✓ | ✓ (model-dependent) | +| Video | ✗ | ✗ (frames as images) | ✓ Native | ✗ | +| Audio | ✓ | ✗ | ✓ Native | Separate endpoint | +| Documents (PDF) | ✓ | ✓ | ✓ | ✗ | + +Gemini is currently the only provider supporting native video and audio in the main chat endpoint. + +## Rate Limits + +Rate limit communication varies significantly. + +| Provider | Headers | Notes | +|----------|---------|-------| +| **Groq** | `x-ratelimit-*` | Standard (requests, tokens, reset time) | +| **Anthropic** | `anthropic-ratelimit-*` | Separates input-tokens from output-tokens limits | +| **OpenRouter** | `x-openrouter-credits` | Credit-based | +| **Gemini** | N/A | Uses Google Cloud Quota dashboards; 429 errors contain `retry-after` | + +## Pricing + +All providers charge per-token with separate input/output rates. + +| Provider | Example Model | Input (per 1M) | Output (per 1M) | +|----------|---------------|----------------|-----------------| +| OpenAI | GPT-4o | ~$2.50 | ~$10.00 | +| Anthropic | Claude 3.5 Sonnet | $3.00 | $15.00 | +| Google | Gemini 2.5 Flash | $0.15 | $0.60 | +| Groq | Llama 3.3 70B | $0.59 | $0.79 | + +**Discounts**: Anthropic offers 90% off for cached content. Groq offers 50% off for batch processing. + +**Free tiers**: Google AI Studio, Groq, and OpenRouter (`:free` suffix models). 
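+
+Because every provider bills input and output tokens separately, per-request cost is a small calculation over the usage block returned with each response. A sketch (the rates are hard-coded from the table above and will drift, so treat them as placeholders):
+
+```rust
+/// Price per one million tokens, in USD.
+struct Rates {
+    input_per_m: f64,
+    output_per_m: f64,
+}
+
+/// Cost of one request, given the token counts reported in the
+/// response's `usage` / `usageMetadata` block.
+fn request_cost(rates: &Rates, input_tokens: u64, output_tokens: u64) -> f64 {
+    (input_tokens as f64 / 1_000_000.0) * rates.input_per_m
+        + (output_tokens as f64 / 1_000_000.0) * rates.output_per_m
+}
+
+fn main() {
+    // Claude 3.5 Sonnet from the table above: $3.00 in, $15.00 out per 1M tokens.
+    let sonnet = Rates { input_per_m: 3.0, output_per_m: 15.0 };
+    // A 10k-token prompt with a 1k-token reply costs roughly $0.045.
+    let cost = request_cost(&sonnet, 10_000, 1_000);
+    assert!((cost - 0.045).abs() < 1e-9);
+}
+```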
+ +## Unique Provider Features + +- **OpenAI**: Structured Outputs with strict JSON Schema enforcement, Batch API with 50% discount +- **Anthropic**: Extended thinking with configurable token budgets, prompt caching (90% cost reduction), computer use tools +- **Google Gemini**: Built-in Google Search grounding, native code execution, 2M token context, video/audio processing up to 2 hours +- **Groq**: 394-1000+ tokens/second via LPU hardware, timing metrics in usage response +- **OpenRouter**: 400+ models, automatic fallbacks, model routing (`:floor` cheapest, `:nitro` fastest), zero-markup pricing + +## SDK Availability + +| Provider | Python | TypeScript | Go | Java | Other | +|----------|--------|------------|----|----|-------| +| OpenAI | ✓ | ✓ | Beta | — | — | +| Anthropic | ✓ | ✓ | ✓ | ✓ | Ruby, C# (beta) | +| Google | ✓ | ✓ | ✓ | ✓ | Dart, Swift, Kotlin | +| Groq | ✓ | ✓ | — | — | OpenAI SDK compatible | +| OpenRouter | ✓ Beta | ✓ | — | — | OpenAI SDK compatible | + +For Groq and OpenRouter, using the OpenAI SDK with modified base URL is recommended. + +## Migration Strategies + +### Practical Patterns + +1. **Use OpenAI-compatible providers for easy switching**: Groq and OpenRouter share code paths with OpenAI. Abstract only base URL and API key. + +2. **Create provider-specific adapters for Anthropic/Gemini**: Structural differences require transformation layers. Map `system` messages to Anthropic's top-level field, convert `assistant` to `model` for Gemini. + +3. **Normalize on OpenAI response format**: Parse provider responses into common structure. OpenRouter already does this for 400+ models. + +4. **Handle parameter gaps gracefully**: Remove unsupported parameters (like `frequency_penalty` for Groq) rather than letting requests fail. + +5. **Consider OpenRouter as unification layer**: For multi-provider needs, provides single API surface with automatic fallbacks. + +### Strategic Recommendations + +| Use Case | Recommended Provider | +|----------|---------------------| +| Safety-critical, complex instructions | Anthropic (strict schema, prompt caching) | +| Heavy media analysis (video/audio) | Gemini (native support, 2M context) | +| Real-time, latency-critical | Groq (LPU speed) | +| Multi-model access, reduced operational overhead | OpenRouter (unified gateway) | + +## Summary + +The Chat Completion API has evolved into a standard architectural pattern, but implementation remains fragmented. Key migration hurdles: + +1. **System prompt handling** — Message vs top-level parameter vs nested config +2. **Required parameters** — Anthropic's mandatory `max_tokens` +3. **Response parsing** — `choices` vs `content` vs `candidates` +4. **Tool use handshakes** — Different roles and ID handling + +For maximum flexibility, abstract provider-specific code behind a common interface, or leverage OpenRouter's unified gateway. diff --git a/crates/rullm-chat-completion/spec/chat-completion.md b/crates/rullm-chat-completion/spec/chat-completion.md index 4185c723..0cee74f7 100644 --- a/crates/rullm-chat-completion/spec/chat-completion.md +++ b/crates/rullm-chat-completion/spec/chat-completion.md @@ -44,6 +44,18 @@ POST /chat/completions Creates a model response for the given chat conversation. 
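+
+One way to contain these hurdles in application code is a thin adapter per provider behind a shared trait, with normalized request and response types owned by the caller. A shape-only sketch (every name here is hypothetical; a real client would be async and carry richer types):
+
+```rust
+/// Provider-neutral request and response shapes owned by the application.
+pub struct ChatRequest {
+    pub model: String,
+    pub system: Option<String>,
+    pub messages: Vec<(String, String)>, // (role, content) pairs
+    pub max_tokens: Option<u32>,
+}
+
+pub struct ChatResponse {
+    pub text: String,
+    pub input_tokens: u64,
+    pub output_tokens: u64,
+    pub finish_reason: String,
+}
+
+/// Each adapter owns its endpoint, auth header, request shape
+/// (`messages` vs `contents`), and response parsing
+/// (`choices` vs `content` vs `candidates`).
+pub trait ChatProvider {
+    fn name(&self) -> &str;
+    fn complete(
+        &self,
+        request: &ChatRequest,
+    ) -> Result<ChatResponse, Box<dyn std::error::Error>>;
+}
+```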
+### Stored Completion Endpoints (requires `store: true`) + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/chat/completions/{id}` | Retrieve stored completion | +| GET | `/chat/completions` | List stored completions (paginated) | +| GET | `/chat/completions/{id}/messages` | List messages from stored completion | +| POST | `/chat/completions/{id}` | Update metadata | +| DELETE | `/chat/completions/{id}` | Delete stored completion | + +Pagination uses `after`, `limit`, and `order` parameters. + --- ## Authentication @@ -59,6 +71,24 @@ Optional organization header: OpenAI-Organization: YOUR_ORG_ID ``` +Optional project header: +``` +OpenAI-Project: YOUR_PROJECT_ID +``` + +### Response Headers (Debugging) + +| Header | Description | +|--------|-------------| +| `x-request-id` | Request identifier for support/debugging | +| `openai-processing-ms` | Server processing time | + +### Backward Compatibility + +Clients should implement forward-compatible JSON decoding: +- Ignore unknown fields +- Don't exhaustively match enums without a fallback + --- ## Request Structure @@ -233,6 +263,7 @@ Result of a tool/function call. | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `reasoning_effort` | string | "medium" | Reasoning depth: `"minimal"` (gpt-5 only), `"low"`, `"medium"`, `"high"`. | +| `verbosity` | string | null | Output verbosity for supported models. | ### Optional Parameters - Multimodal @@ -805,11 +836,15 @@ Check `usage.completion_tokens_details`: ### Retry Strategy +**Transient errors (retry with backoff):** 429, 500, 502, 503 +**Permanent errors (do not retry):** 400, 401, 403, 404 + ``` 1. Wait: min(2^attempt * 1000ms, 60000ms) + random_jitter 2. Max attempts: 5 3. On 429: Check x-ratelimit-reset-* headers 4. On 5xx: Always retry +5. Always log x-request-id for diagnostics ``` --- @@ -927,6 +962,13 @@ Tokens refill on a rolling 60-second window, not all at once. - [ ] Service tier selection - [ ] Seed for reproducibility +### Client Requirements +- [ ] Keep-alive connections +- [ ] Configurable request timeout +- [ ] Forward-compatible JSON decoding (ignore unknown fields) +- [ ] SSE parser with both connection-close and `[DONE]` termination +- [ ] Parse `x-ratelimit-*` headers for adaptive throttling + --- ## Sources diff --git a/crates/rullm-chat-completion/spec/chat-completion2.md b/crates/rullm-chat-completion/spec/chat-completion2.md deleted file mode 100644 index 30d9c1f6..00000000 --- a/crates/rullm-chat-completion/spec/chat-completion2.md +++ /dev/null @@ -1,289 +0,0 @@ -# OpenAI Chat Completions API - -This document specifies the **Chat Completions** REST API under `POST /v1/chat/completions` (plus stored-completion CRUD endpoints), including request/response shapes, streaming (SSE), and related features (tools/function calling, structured outputs, multimodal inputs, audio, and web search). - ---- - -## 1) Positioning / when to use Chat Completions -OpenAI’s docs now emphasize the **Responses API** for many new integrations, but **Chat Completions** remains a supported API surface and is still the correct choice if you specifically want `/v1/chat/completions` semantics and object shapes. - ---- - -## 2) Base URL, auth, headers, and versioning - -### 2.1 Base URL -- Base URL: `https://api.openai.com/v1` - -### 2.2 Authentication -- Header auth: `Authorization: Bearer ` - -### 2.3 Core request headers -- `Content-Type: application/json` for JSON requests. 
-- Optional account routing headers: - - `OpenAI-Organization: ` - - `OpenAI-Project: ` - -### 2.4 Debugging / observability headers -- Responses include a request identifier header you should log (notably `x-request-id`) to correlate failures and support requests. -- Responses include timing information like `openai-processing-ms`. - -### 2.5 Backward compatibility expectations -- OpenAI publishes a backward-compatibility policy for the API; clients should be resilient to additive fields and new enum values. - -**Client requirement:** Implement JSON decoding as forward-compatible: ignore unknown fields; do not exhaustively match enums without a fallback. - ---- - -## 3) Error model, retries, and rate limits - -### 3.1 Error response shape (high-level) -Errors are returned with a top-level `error` object (containing fields like `message`, `type`, and sometimes `param`/`code`), alongside standard HTTP status codes. - -### 3.2 Recommended retry policy (client-side) -- Retry only **transient** failures (commonly `429`, `500`, `503`) using exponential backoff + jitter; do **not** blindly retry `400`/`401`/`403`/`404` because they’re usually permanent for that request. -- Always log `x-request-id` (and any client request id you add) for diagnostics. - -### 3.3 Rate limit headers -OpenAI documents rate limits and returns `x-ratelimit-*` headers (covering request and token budgets with limit/remaining/reset patterns). Your client should parse and surface these for adaptive throttling. - ---- - -## 4) Endpoint inventory (Chat Completions) - -### 4.1 Create (optionally stream) -- `POST /v1/chat/completions` — generate an assistant response; supports streaming via SSE; can optionally store the completion. - -### 4.2 Stored completion retrieval & management (requires `store: true` on create) -- `GET /v1/chat/completions/{completion_id}` — retrieve a stored completion. -- `GET /v1/chat/completions` — list stored completions (pagination + filters). -- `GET /v1/chat/completions/{completion_id}/messages` — list messages from a stored completion (pagination). -- `POST /v1/chat/completions/{completion_id}` — update metadata on a stored completion. -- `DELETE /v1/chat/completions/{completion_id}` — delete a stored completion. - ---- - -## 5) `POST /v1/chat/completions` — Create - -### 5.1 Primary use cases -- Standard “chat” generation: model responds to a conversation history you provide in `messages`. -- Tool/function calling: model asks your client to call functions; client executes and feeds results back. -- Streaming UI: token-by-token deltas over SSE. -- Structured outputs: force JSON or schema-conformant JSON. -- Multimodal input: images and audio in the conversation (model-dependent). -- Web search models: model performs a web search and returns citations/annotations. -- Persisting outputs for later retrieval: set `store: true` and then use stored completion endpoints. - ---- - -### 5.2 Request body (top-level fields) -**Required** -- `model: string` — model identifier. -- `messages: array` — conversation inputs (text and/or content parts). - -**Common optional fields (sampling / stopping / token limits)** -- `temperature?: number` — sampling temperature. -- `top_p?: number` — nucleus sampling. -- `n?: integer` — number of choices to generate. -- `stop?: string | string[] | null` — stop sequences; docs note model-specific support limitations (e.g., not supported by some “o” models). -- `max_completion_tokens?: integer | null` — cap for generated tokens (including reasoning tokens where applicable). 
-- `max_tokens?: integer | null` — deprecated; docs note incompatibility with some newer model families. -- `presence_penalty?: number | null`, `frequency_penalty?: number | null` - -**Logprobs** -- `logprobs?: boolean | null` — request logprobs in output. -- `top_logprobs?: integer | null` — number of top tokens to return (when `logprobs` is true). - -**Streaming** -- `stream?: boolean | null` — enable SSE. -- `stream_options?: object | null` — stream options; docs show `include_usage` behavior. - -**Tools / function calling** -- `tools?: array` — tool definitions (notably function tools). -- `tool_choice?: "none" | "auto" | "required" | object` — tool selection policy (including forcing a specific tool). -- `parallel_tool_calls?: boolean` — whether model may emit multiple tool calls in one turn. -- `functions` / `function_call` — deprecated tool/function fields. - -**Structured outputs** -- `response_format?: object` — JSON mode and schema mode. - -**Multimodal output** -- `modalities?: string[]` — request output modalities (model-dependent). -- `audio?: object | null` — audio output settings when requesting audio modality. - -**Other** -- `reasoning_effort?: string` — reasoning control for supported models (docs note model-specific constraints). -- `verbosity?: string` — output verbosity control for supported models. -- `prediction?: object` — “predicted output” optimization payload. -- `web_search_options?: object` — used with web-search models. -- `service_tier?: string` — service tier selection. -- `seed?: integer | null` — deprecated determinism hint. -- `logit_bias?: object | null` — token-level biasing. - -**Storing & metadata** -- `store?: boolean | null` — enable stored-completion retrieval; docs warn that some large inputs (e.g., large images) may be dropped when storing. -- `metadata?: object` — user-defined key/value metadata for stored items. -- `user?: string` — deprecated in favor of newer identifiers (docs reference `safety_identifier` and `prompt_cache_key`). - ---- - -### 5.3 `messages[]` — conversation schema (client-facing) -Chat Completions are **stateless**: you send the prior turns each request (or a summarized state). - -**Message object (conceptual)** -- `role: string` — typical roles include `system`, `user`, `assistant`, and tool-related roles (exact accepted roles depend on the API mode and model). -- `content: string | array` — either plain text or a list of typed content parts for multimodal inputs. - -**Content parts (multimodal)** -- Image input uses a content-part pattern documented in the Images/Vision guide for Chat Completions. -- Audio input uses a content-part pattern documented in the Audio guide for Chat Completions. - -**Client requirement:** Model capabilities differ; your client should not hard-code “text only” assumptions, and should treat message `content` as a tagged union. - ---- - -## 6) `POST /v1/chat/completions` — Response - -### 6.1 Non-streaming response object -A successful create returns a **chat completion object** containing identifiers and an array of `choices`. - -**Top-level (commonly present)** -- `id: string` -- `object: "chat.completion"` -- `created: integer` (unix seconds) -- `model: string` -- `choices: array` -- `usage: object` (token accounting) - -**Choice object (conceptual)** -- `index: integer` -- `message: { role, content, ... 
}` -- `finish_reason: string | null` -- `logprobs: object | null` (if requested/supported) - -**Client requirement:** Do not assume a single choice; handle `n > 1` by returning multiple candidate messages. - ---- - -## 7) Streaming (SSE) — `stream: true` - -### 7.1 Transport / framing -Chat Completions streaming uses **Server-Sent Events** (SSE) delivering **chat completion chunk** objects incrementally. - -The OpenAI API reference for legacy streaming explicitly describes “data-only SSE” messages and a terminal `data: [DONE]` sentinel; Chat Completions streaming is documented in terms of chunk objects and deltas. For robust clients, support both: end on connection close and/or `[DONE]` sentinel if present. - -### 7.2 Chunk object shape -Each event contains a `chat.completion.chunk` object with `choices[].delta` carrying incremental text/tool-call deltas. - -Key fields: -- `object: "chat.completion.chunk"` -- `id`, `created`, `model` -- `choices[]: { index, delta, finish_reason? }` -- Optional `usage` when using `stream_options` to include usage. - -### 7.3 Delta accumulation rules (client requirement) -- Concatenate `choices[i].delta.content` fragments in-order. -- Treat tool-call deltas as structured fragments that must be assembled into a complete tool call before execution. -- Terminate a choice when `finish_reason` becomes non-null. - ---- - -## 8) Tools / function calling (Chat Completions) - -### 8.1 Use case -Let the model request structured, executable actions (API calls, database queries, etc.) by emitting tool calls; your client executes them and returns tool outputs back into the conversation. - -### 8.2 Request fields (high-level) -- Provide available tools in `tools`. -- Control selection with `tool_choice`. -- Allow or disallow multiple calls with `parallel_tool_calls`. - -### 8.3 Response behavior (high-level) -- Assistant messages may include tool-call descriptors instead of (or in addition to) normal text content. -- Client must translate tool calls into actual executions and then append tool results as subsequent messages and call the API again. - ---- - -## 9) Structured outputs (`response_format`) - -### 9.1 Use case -- Enforce machine-parseable JSON output (basic JSON mode) or schema-conformant JSON (structured outputs) for deterministic integration with downstream code. - -### 9.2 Modes (documented) -- JSON mode via a `response_format` object (older JSON mode). -- Schema-based structured outputs via a `response_format` object (JSON Schema). - -**Client requirement:** Treat `response_format` as a tagged union; do not assume only one sub-variant. - ---- - -## 10) Multimodal inputs (images + audio) and audio outputs - -### 10.1 Image inputs (vision) -- Chat Completions supports image input via content parts as documented in the Images/Vision guide when `api-mode=chat`. -- Client should support: - - Remote URLs - - Base64 “data:” URLs - - Any per-image options described in the guide (e.g., detail level), model-dependent. - -### 10.2 Audio inputs and outputs -- Audio input: send base64-encoded audio as typed content parts (guide documents the required fields). -- Audio output: request audio via `modalities` + `audio` settings (voice/format), then decode audio bytes from the response. - -**Client requirement:** Audio and image support are model-dependent; surface capability errors clearly (do not silently fall back unless the caller asked you to). 
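To make the image content-part shape concrete, the sketch below builds a user message with one text part and one image part using `serde_json`. The exact option set (for example the `detail` level) is guide- and model-dependent, so treat the fields as illustrative.

```rust
use serde_json::{json, Value};

// A user message whose `content` is a list of typed parts rather than a string.
fn vision_message(image_url: &str) -> Value {
    json!({
        "role": "user",
        "content": [
            { "type": "text", "text": "Describe this image." },
            // `image_url.url` accepts a remote URL or a base64 "data:" URL.
            { "type": "image_url", "image_url": { "url": image_url, "detail": "auto" } }
        ]
    })
}

fn main() {
    let msg = vision_message("https://example.com/cat.png");
    println!("{}", serde_json::to_string_pretty(&msg).unwrap());
}
```

An `input_audio` part follows the same pattern, carrying base64 audio `data` plus a `format` field as described in the audio guide.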
- ---- - -## 11) Web search (Chat Completions) - -### 11.1 Use case -For web-search-capable models, allow the model to retrieve information from the web before responding, returning citation annotations suitable for UI rendering and attribution. - -### 11.2 Request fields -- Use `web_search_options` alongside a web-search model. - -### 11.3 Response fields (citations / annotations) -- The Web Search tool guide documents citation annotations (including URL citations) and how to render them. - -**Client requirement:** Preserve and expose annotations separately from text so callers can render citations reliably even if the visible text format changes. - ---- - -## 12) Stored completions (`store: true`) — retrieval, listing, message listing, metadata update, delete - -### 12.1 Use case -- Persist responses for later inspection, evaluation, or building UIs that revisit prior outputs without keeping your own transcript store. - -### 12.2 Key behaviors -- `store: true` on create enables subsequent retrieval/listing endpoints. -- Update endpoint is for metadata updates on stored items. -- Messages endpoint returns the stored message list with pagination controls. - -### 12.3 Pagination (high-level) -List endpoints support typical cursor pagination parameters like `after`, plus `limit` and `order`. - ---- - -## 13) Practical client requirements checklist - -### 13.1 HTTP layer -- Keep-alive connections; configurable request timeout; request body size limits; gzip/deflate handling as supported by your HTTP library. - -### 13.2 JSON decoding -- Tolerate unknown fields and enum expansions; treat tagged unions (`content`, `response_format`, tools) as extensible. - -### 13.3 Streaming -- Implement a correct SSE parser: - - parse `data:` lines into JSON chunks - - assemble `delta` fragments per `choice.index` - - handle both “connection close” and `[DONE]` style termination defensively. - -### 13.4 Tool calling -- Support multiple tool calls (including parallel), and tool-call assembly in both non-streaming and streaming modes. - -### 13.5 Errors & rate limits -- Parse error objects; map to typed errors; classify retryable vs permanent. -- Parse `x-ratelimit-*` headers for adaptive throttling and caller visibility. - ---- - -If you want, I can convert this into a single “contract” section (request/response JSON Schema-like tables for every field and nested object) purely as documentation—still no code. diff --git a/crates/rullm-chat-completion/spec/implementation-final.md b/crates/rullm-chat-completion/spec/implementation-final.md deleted file mode 100644 index a183db1a..00000000 --- a/crates/rullm-chat-completion/spec/implementation-final.md +++ /dev/null @@ -1,449 +0,0 @@ -# Idiomatic Rust Client Design for OpenAI Chat Completions (Core API) - -This document defines a Rust client design for the OpenAI **Chat Completions** -API with a provider-agnostic core that can target any OpenAI-compatible endpoint. -It prioritizes developer experience, forward compatibility, and flexible -authentication/header handling. Provider-specific capability gating is left to -higher-level crates. - -This is a design spec only. It references the request/response shapes and -compatibility notes in `spec/chat-completion*.md`. - ---- - -## 1) Goals and Non-Goals - -**Goals** -- Ergonomic API for common use (`client.chat().model(...).user(...)`). -- Full coverage of Chat Completions parameters and response shapes. -- Streaming support with correct SSE parsing and delta accumulation. 
-- Forward-compatible JSON decoding (unknown fields and enum values tolerated).
-- Flexible authentication and extra headers.
-- Provider-agnostic core (no built-in provider profiles or capability gating).
-- Clean integration with Rust async ecosystems.
-
-**Non-Goals**
-- Implement the Responses API (this client is Chat Completions focused).
-- Enforce strict compile-time correctness for role/field combos (runtime
-  validation is optional and configurable).
-
----
-
-## 2) Module Layout (Suggested)
-
-```
-crates/rullm-openai/
-  src/
-    client.rs     // ChatCompletionsClient + HTTP wiring
-    config.rs     // ClientConfig, auth + headers
-    types.rs      // Request/response structs, message/content/tool types
-    streaming.rs  // SSE decoder + ChatCompletionStream + accumulator
-    error.rs      // Error types + retry classification
-    util.rs       // Small helpers (headers, url, serialization)
-```
-
----
-
-## 3) Client API Surface
-
-### 3.1 Primary Client
-
-```
-pub struct ChatCompletionsClient { /* cloneable */ }
-
-impl ChatCompletionsClient {
-    pub fn new(config: ClientConfig) -> Result<Self, ClientError>;
-
-    // Core endpoint (non-streaming)
-    pub async fn create(
-        &self,
-        req: ChatCompletionRequest,
-    ) -> Result<ApiResponse<ChatCompletion>, ClientError>;
-
-    // Core endpoint (streaming)
-    pub async fn stream(
-        &self,
-        req: ChatCompletionRequest,
-    ) -> Result<ChatCompletionStream, ClientError>;
-
-    // DX convenience
-    pub fn chat(&self) -> ChatRequestBuilder;
-}
-```
-
-### 3.2 Convenience Builder
-
-```
-let resp = client.chat()
-    .model("gpt-4o")
-    .system("You are concise.")
-    .user("Summarize this")
-    .temperature(0.2)
-    .send()
-    .await?;
-
-let stream = client.chat()
-    .model("gpt-4o")
-    .user("Stream this")
-    .stream()
-    .await?;
-```
-
-Design notes:
-- The builder collects a `Vec<Message>` and converts to `Arc<[Message]>`
-  on `send`/`stream`.
-- `send()` returns `ApiResponse<ChatCompletion>`.
-- `stream()` returns `ChatCompletionStream`.
-
----
-
-## 4) Configuration and Authentication
-
-### 4.1 ClientConfig
-
-```
-pub struct ClientConfig {
-    pub base_url: Url,
-    pub auth: AuthConfig,
-    pub default_headers: HeaderMap,
-    pub timeout: Duration,
-}
-```
-
-### 4.2 AuthConfig
-
-```
-pub enum AuthConfig {
-    None,
-    BearerToken(Arc<str>),
-    Header { name: HeaderName, value: HeaderValue },
-    QueryParam { name: Arc<str>, value: Arc<str> },
-}
-```
-
-Notes:
-- Use `default_headers` for extra headers (e.g., `OpenAI-Organization`,
-  `OpenAI-Project`, OpenRouter `HTTP-Referer`/`X-Title`, or custom auth headers).
-- This core client does not hard-code provider identities or capability rules.
-  Higher-level crates can layer provider-specific behavior on top.
-
----
-
-## 5) Core Types (Request/Response)
-
-### 5.1 Identifiers and Common Newtypes
-
-Use `Arc<str>` for immutable strings:
-
-```
-pub struct ModelId(pub Arc<str>);
-
-pub struct Role(pub Arc<str>);
-impl Role {
-    pub const SYSTEM: Role = Role::static_str("system");
-    pub const DEVELOPER: Role = Role::static_str("developer");
-    pub const USER: Role = Role::static_str("user");
-    pub const ASSISTANT: Role = Role::static_str("assistant");
-    pub const TOOL: Role = Role::static_str("tool");
-}
-```
-
-Using string newtypes avoids breaking when new roles appear.
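`Role::static_str` above is shorthand: an `Arc<str>` cannot be built in a `const` context, so the role constants need a different backing type or lazy construction. One workable variant, shown here as an assumption rather than a settled choice, backs the newtype with `Cow<'static, str>`:

```rust
use std::borrow::Cow;

// Known roles as consts, unknown roles still representable at runtime.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Role(Cow<'static, str>);

impl Role {
    pub const SYSTEM: Role = Role(Cow::Borrowed("system"));
    pub const DEVELOPER: Role = Role(Cow::Borrowed("developer"));
    pub const USER: Role = Role(Cow::Borrowed("user"));
    pub const ASSISTANT: Role = Role(Cow::Borrowed("assistant"));
    pub const TOOL: Role = Role(Cow::Borrowed("tool"));

    // Accept any role string the API may introduce later.
    pub fn new(role: impl Into<String>) -> Self {
        Role(Cow::Owned(role.into()))
    }

    pub fn as_str(&self) -> &str {
        &self.0
    }
}

fn main() {
    assert_eq!(Role::USER.as_str(), "user");
    assert_eq!(Role::new("critic").as_str(), "critic");
}
```

Serializing the inner string keeps the wire format identical to a plain string role.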
- -### 5.2 Messages and Content - -``` -#[derive(Serialize, Deserialize)] -pub struct Message { - pub role: Role, - pub content: Option, - pub name: Option>, - pub tool_calls: Option>, - pub tool_call_id: Option>, - pub audio: Option, - pub function_call: Option, // deprecated - #[serde(flatten)] - pub extra: Map, -} - -#[serde(untagged)] -pub enum MessageContent { - Text(Arc), - Parts(Arc<[ContentPart]>), -} -``` - -Convenience constructors: -- `Message::system(text)` -- `Message::developer(text)` -- `Message::user(text_or_parts)` -- `Message::assistant(text_or_parts)` -- `Message::tool(tool_call_id, content)` - -### 5.3 Content Parts - -``` -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ContentPart { - Text { text: Arc }, - ImageUrl { image_url: ImageUrlPart }, - InputAudio { input_audio: InputAudioPart }, - File { file: FilePart }, - Refusal { refusal: Arc }, - #[serde(other)] - Other, -} -``` - -Note: If `serde(other)` is too lossy, use a `RawContentPart` fallback that -preserves `type` and payload via `serde_json::Value`. - -### 5.4 Tools and Tool Calls - -``` -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ToolDefinition { - Function { function: FunctionDefinition }, - Custom { custom: CustomToolDefinition }, - #[serde(other)] - Other, -} - -#[serde(tag = "type", rename_all = "snake_case")] -pub enum ToolCall { - Function { id: Arc, function: FunctionCall }, - Custom { id: Arc, custom: CustomToolCall }, - #[serde(other)] - Other, -} -``` - -Tool choice uses an untagged enum: -``` -#[serde(untagged)] -pub enum ToolChoice { - Mode(ToolChoiceMode), - Function { r#type: Arc, function: ToolChoiceFunction }, - Custom { r#type: Arc, custom: ToolChoiceCustom }, - AllowedTools { r#type: Arc, allowed_tools: AllowedToolsSpec }, -} -``` - -### 5.5 Request Struct - -``` -pub struct ChatCompletionRequest { - pub model: ModelId, - pub messages: Arc<[Message]>, - - // sampling + stopping - pub temperature: Option, - pub top_p: Option, - pub n: Option, - pub stop: Option, - pub presence_penalty: Option, - pub frequency_penalty: Option, - - // tokens - pub max_completion_tokens: Option, - pub max_tokens: Option, // deprecated - - // logprobs - pub logprobs: Option, - pub top_logprobs: Option, - pub logit_bias: Option>, - - // tools - pub tools: Option>, - pub tool_choice: Option, - pub parallel_tool_calls: Option, - pub functions: Option>, // deprecated - pub function_call: Option, // deprecated - - // response formatting - pub response_format: Option, - - // multimodal + audio - pub modalities: Option]> >, - pub audio: Option, - - // advanced features - pub stream: Option, - pub stream_options: Option, - pub prediction: Option, - pub web_search_options: Option, - pub reasoning_effort: Option>, - pub verbosity: Option>, - pub service_tier: Option>, - pub store: Option, - pub metadata: Option>, - - // identifiers - pub seed: Option, - pub user: Option>, // deprecated - pub safety_identifier: Option>, - pub prompt_cache_key: Option>, - pub prompt_cache_retention: Option>, - - // escape hatch - pub extra_body: Option>, -} -``` - -Notes: -- Use `Arc<[T]>` for immutable arrays (messages, tools, modalities). -- Use `Option` for nullable/omitted fields. -- `extra_body` for provider-specific extensions (Gemini thinking config, etc). 
- -### 5.6 Response Structs - -``` -pub struct ChatCompletion { - pub id: Arc, - pub object: Arc, - pub created: u64, - pub model: ModelId, - pub choices: Arc<[ChatChoice]>, - pub usage: Option, - pub service_tier: Option>, - pub system_fingerprint: Option>, - #[serde(flatten)] - pub extra: Map, -} - -pub struct ChatChoice { - pub index: u32, - pub message: Option, - pub finish_reason: Option>, - pub logprobs: Option, - #[serde(flatten)] - pub extra: Map, -} -``` - -Streaming chunk: - -``` -pub struct ChatCompletionChunk { - pub id: Arc, - pub object: Arc, - pub created: u64, - pub model: ModelId, - pub choices: Arc<[ChatChunkChoice]>, - pub usage: Option, - #[serde(flatten)] - pub extra: Map, -} -``` - ---- - -## 6) Streaming and Accumulation - -### 6.1 SSE Parser -- Accept `data:` lines only (ignore comments and empty lines). -- Terminate on `[DONE]` or EOF. -- Surface `error` objects embedded in SSE data. - -### 6.2 Accumulator - -Provide `ChatCompletionAccumulator` to merge chunks into a final -`ChatCompletion`: -- Concatenate `delta.content` fragments. -- Merge tool call arguments per `tool_call.id` (not just index). -- Track `finish_reason` per choice. -- Handle usage-only final chunk (`choices` empty). - -Stream API: -``` -pub struct ChatCompletionStream { - pub fn accumulator(self) -> ChatCompletionAccumulator; -} -``` - ---- - -## 7) Provider Extensions and Pass-through - -- The core client sends requests as provided; it does not strip, clamp, or - transform parameters for specific providers. -- Provider-specific constraints should be handled by higher-level crates or - application code. -- Preserve provider extensions via `#[serde(flatten)] extra` on response types. -- Expose raw JSON for clients that need direct access: - `ApiResponse::raw_json()`. - ---- - -## 8) Error Handling and Metadata - -### 8.1 Error Types - -``` -pub enum ClientError { - Http(HttpError), - Api(ApiError), - Deserialize(DeserializeError), - Stream(StreamError), -} -``` - -`ApiError` wraps the server `error` object and includes the HTTP status code. - -### 8.2 Response Metadata - -``` -pub struct ResponseMeta { - pub request_id: Option>, - pub ratelimit: Option, - pub latency_ms: Option, -} - -pub struct ApiResponse { - pub data: T, - pub meta: ResponseMeta, -} -``` - ---- - -## 9) Developer Experience Helpers - -- `ChatCompletion::first_text()` returns the first text content (if any). -- `ChatCompletion::tool_calls()` returns tool calls from the first choice. -- `MessageContent::text()` returns `Option<&str>`. -- `ToolCall::arguments_json()` parses JSON arguments to `serde_json::Value`. -- `ChatCompletion::parse_json()` for structured outputs. - -All helpers must avoid panics; return `Result` with detailed error types. - ---- - -## 10) Testing Plan (Minimal) - -- JSON decode for non-streaming response with tools and annotations. -- SSE stream parsing with content + tool call delta assembly. -- Usage-only final chunk when `include_usage` is set. -- Provider capability stripping and warnings. -- Unknown fields preserved via `extra`. - ---- - -## 11) Rust Idioms and Safety Notes - -- Avoid `unwrap()` in production code. -- Use `Arc<[T]>` and `Arc` for immutable shared data. -- Prefer `From/TryFrom` for conversions and `Result` for fallible APIs. -- Avoid wildcard enum imports and `super::` in module paths. -- No global mutable state; configuration is explicit. 
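As a concrete reference for the accumulation rules in Section 6.2, here is a minimal sketch that merges text deltas per choice index and records finish reasons; the chunk shapes are simplified stand-ins for the real types, and tool-call and usage merging are omitted.

```rust
use std::collections::BTreeMap;

// Simplified stand-ins for streamed chunk types (text-only).
struct ChunkChoice {
    index: u32,
    delta_content: Option<String>,
    finish_reason: Option<String>,
}

struct Chunk {
    choices: Vec<ChunkChoice>,
}

#[derive(Default)]
struct TextAccumulator {
    content: BTreeMap<u32, String>,
    finish: BTreeMap<u32, String>,
}

impl TextAccumulator {
    fn push(&mut self, chunk: &Chunk) {
        for choice in &chunk.choices {
            if let Some(fragment) = &choice.delta_content {
                // Concatenate fragments in arrival order, keyed by choice index.
                self.content.entry(choice.index).or_default().push_str(fragment);
            }
            if let Some(reason) = &choice.finish_reason {
                self.finish.insert(choice.index, reason.clone());
            }
        }
    }
}

fn main() {
    let chunks = vec![
        Chunk { choices: vec![ChunkChoice { index: 0, delta_content: Some("Hel".into()), finish_reason: None }] },
        Chunk { choices: vec![ChunkChoice { index: 0, delta_content: Some("lo".into()), finish_reason: Some("stop".into()) }] },
        Chunk { choices: vec![] }, // usage-only final chunk: nothing to merge
    ];

    let mut acc = TextAccumulator::default();
    for chunk in &chunks {
        acc.push(chunk);
    }
    assert_eq!(acc.content[&0], "Hello");
    assert_eq!(acc.finish[&0], "stop");
}
```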
- ---- - -## 12) Summary of the Design Approach - -This design favors **flexible, forward-compatible types** with an ergonomic -builder for the common case. Provider differences are handled centrally via -capability profiles and parameter policies, making the client useful across -OpenAI-compatible endpoints without forcing users to learn each provider's -quirks. The streaming implementation is resilient, and the DX helpers make -structured output and tool calling pleasant without hiding critical behavior. diff --git a/crates/rullm-chat-completion/spec/implementation.md b/crates/rullm-chat-completion/spec/implementation.md deleted file mode 100644 index 79ea1084..00000000 --- a/crates/rullm-chat-completion/spec/implementation.md +++ /dev/null @@ -1,224 +0,0 @@ -# Rust Port Notes for OpenAI Chat Completions - -This document captures implementation guidance for a standalone Rust SDK for -OpenAI Chat Completions, based on: -- `crates/rullm-openai/spec/chat-completion.md` and `chat-completion2.md` -- openai-go and openai-node (generated from the OpenAPI spec by Stainless) -- codex-rs (Chat Completions streaming support inside the Codex CLI) - -The goal is to expose a reusable, standalone Chat Completions client, not tied -to the Codex CLI. - -## 1) API Surface to Implement -Match the OpenAI SDK patterns (Go/Node) at minimum: - -- `POST /chat/completions` (create, non-streaming) -- `POST /chat/completions` (create, streaming) -- `GET /chat/completions/{id}` (retrieve stored completion) -- `GET /chat/completions` (list stored completions) -- `POST /chat/completions/{id}` (update metadata) -- `DELETE /chat/completions/{id}` (delete) -- `GET /chat/completions/{id}/messages` (list stored messages) - -Recommended shape for Rust: -- `ChatCompletionsClient::create(params) -> ChatCompletion` -- `ChatCompletionsClient::stream(params) -> Stream` -- `ChatCompletionsClient::retrieve(id)` / `list(params)` / `update(id, params)` -- `ChatCompletionsClient::delete(id)` -- `ChatCompletionsClient::list_messages(id, params)` - -## 2) Core Type Map (Request/Response) - -### 2.1 Request Types -Define a `ChatCompletionCreateParams` struct that mirrors the Go/Node field set. -Include all current parameters, even if some are deprecated, to preserve API -compatibility: - -- Required: `model`, `messages` -- Sampling: `temperature`, `top_p`, `presence_penalty`, `frequency_penalty` -- Tokens: `max_completion_tokens`, `max_tokens` (deprecated) -- Output count: `n` -- Stopping: `stop` (string or array) -- Logprobs: `logprobs`, `top_logprobs` -- Tools: `tools`, `tool_choice`, `parallel_tool_calls` -- Structured outputs: `response_format` (json_schema/json_object/text) -- Audio output: `modalities`, `audio` -- Web search: `web_search_options` -- Predicted outputs: `prediction` -- Prompt caching: `prompt_cache_key`, `prompt_cache_retention` -- Safety: `safety_identifier` (replace `user`) -- Storage: `store`, `metadata` -- Service tier: `service_tier` -- Reasoning: `reasoning_effort`, `verbosity` -- Streaming: `stream`, `stream_options` - -Support `null` and omitted fields where the API allows them. - -### 2.2 Message Types -`messages` is a union by `role`. 
Suggested Rust modeling: - -- `enum ChatCompletionMessageParam` tagged by `role` - - `System`, `Developer`, `User`, `Assistant`, `Tool`, `Function (deprecated)` - -Common fields: -- `content` for most roles -- `name` optional for `system`, `developer`, `user`, `assistant` -- `tool_call_id` required for `tool` role -- `tool_calls` or `function_call` (deprecated) for assistant messages -- `audio` is allowed on assistant messages - -Content is a union: -- `String` -- `Array` - -### 2.3 Content Parts -`ContentPart` is a union by `type`. From openai-node/openai-go: -- `text` { text } -- `image_url` { image_url: { url, detail? } } -- `input_audio` { input_audio: { data, format } } -- `file` { file: { file_id | file_data, filename? } } -- `refusal` (assistant-only content part) - -### 2.4 Tools and Tool Calls -Define tool and tool call unions with explicit `type` tags: - -Tools (`tools` in request): -- `function` { function: FunctionDefinition } -- `custom` { custom: { name, description?, format? } } - - `format`: `text` or `grammar` (with `definition` and `syntax`) - -Tool calls (in responses and deltas): -- `function` { id, function: { name, arguments } } -- `custom` { id, custom: { name, input } } - -Tool choice options (`tool_choice`) are a union: -- string: `none`, `auto`, `required` -- named tool choice: `{ type: "function", function: { name } }` -- custom named tool choice: `{ type: "custom", custom: { name } }` -- allowed tools: `{ type: "allowed_tools", allowed_tools: { mode, tools } }` - -Also keep deprecated fields: -- `functions` and `function_call` (request) -- `function_call` (assistant message/stream delta) - -### 2.5 Response Types -Non-streaming response: `ChatCompletion`: -- `id`, `object: "chat.completion"`, `created`, `model` -- `choices[]`: `index`, `message`, `finish_reason`, optional `logprobs` -- `usage` (prompt/completion/total + details) -- `service_tier`, `system_fingerprint` (deprecated) - -Streaming response: `ChatCompletionChunk`: -- `id`, `object: "chat.completion.chunk"`, `created`, `model` -- `choices[]` with `delta` objects -- `usage` optional (final usage chunk if `include_usage`) - -`delta` fields can include: -- `role`, `content`, `refusal`, `tool_calls`, `function_call` (deprecated) -- Logprobs per choice - -### 2.6 Usage and Logprobs -Usage should include detail fields (when present): -- completion tokens: `accepted_prediction_tokens`, `rejected_prediction_tokens`, - `reasoning_tokens`, `audio_tokens` -- prompt tokens: `cached_tokens`, `audio_tokens` - -Logprobs include per-token info for both content and refusal. - -## 3) Serde Modeling Tips - -- Use `#[serde(tag = "role", rename_all = "snake_case")]` for message unions. -- Use `#[serde(tag = "type", rename_all = "snake_case")]` for content parts and - tool/tool_call unions. -- For `content`, `stop`, `tool_choice`, and `response_format`, use `#[serde(untagged)]` - enums to support string vs array or object unions. -- Preserve forward compatibility by: - - `#[serde(default)]` for optional fields - - `#[serde(flatten)]` to capture unknown fields in responses - - avoiding strict enum exhaustiveness where new variants may appear - -## 4) Streaming and SSE Handling - -### 4.1 SSE decoding -- The API uses `text/event-stream` with `data: {json}` and `data: [DONE]`. 
-- Implement a tolerant SSE parser that: - - buffers partial chunks - - ignores empty/comment lines - - ends on `[DONE]` or socket close - - treats `error` objects inside `data` as terminal errors - -### 4.2 Delta accumulation -Follow openai-go and codex-rs patterns: -- Concatenate `delta.content` and `delta.refusal` fragments in order. -- For `tool_calls`, merge by `index` and `id` and concatenate - `function.arguments` fragments. -- Handle missing indices (codex-rs maps by `id` or last index). -- Support multiple parallel tool calls (do not assume `index == 0`). -- Keep `finish_reason` per choice. -- Accumulate logprobs and usage (usage often reported only at the final chunk). - -Suggested helper: a `ChatCompletionAccumulator` (like openai-go) that merges -chunks into a full `ChatCompletion`, plus convenience helpers for detecting -when content or tool calls have just completed. - -### 4.3 Stream options -`stream_options` includes: -- `include_usage` (final usage-only chunk) -- `include_obfuscation` (extra fields on deltas, must be ignored if unknown) - -## 5) Structured Outputs and Parsing Helpers - -OpenAI SDKs provide helpers to parse structured outputs: -- `response_format` with `type: json_schema` -- `strict` in function definitions to enforce schema adherence - -Optional convenience in Rust: -- Provide a helper that parses `choice.message.content` into a typed struct - when `response_format` is `json_schema`. -- Provide a helper that parses tool call arguments into JSON when `strict` is - true or when the caller opts in. - -These are optional, but common in openai-node (`parse` and tool runner helpers). - -## 6) Error Handling and Resilience - -- Map HTTP error responses to a structured `ErrorObject` (message/type/param/code). -- Bubble `x-request-id` and rate limit headers up to the caller. -- Accept unknown enum values and ignore unknown fields. -- Do not hard-fail on unsupported parameters; let the API reject if needed. - -## 7) Notes from codex-rs - -codex-rs includes a dedicated chat completion SSE parser: -- It is robust to missing tool call indices -- It concatenates tool arguments across deltas -- It emits reasoning deltas when present (`delta.reasoning` may be a string or - nested object) -- It treats `finish_reason == length` as a context window error - -This logic is a good reference for a resilient streaming implementation. - -## 8) Gaps vs current rullm-core OpenAI types - -The current rullm-core types cover only a subset of the modern API. 
The Rust -port should add: -- `developer` role -- `audio` input and output types -- `file` content parts -- `refusal` content parts -- `custom` tools -- `tool_choice` variants for allowed tools -- `web_search_options` and `annotations` -- `prediction` and prompt caching fields -- `reasoning_effort`, `verbosity`, `service_tier` -- `prompt_cache_key`, `prompt_cache_retention`, `safety_identifier` -- `stream_options.include_usage` and `include_obfuscation` - -## 9) Tests to Include - -- Non-streaming: full response decode with tool calls and annotations -- Streaming: content deltas, refusal deltas, tool call argument assembly -- Streaming: `include_usage` final chunk with empty choices -- Tool choice unions and stop unions serialize correctly -- Content part unions (text, image_url, input_audio, file) - diff --git a/spec/chat-completion-comparison.md b/spec/chat-completion-comparison.md deleted file mode 100644 index 7fd61106..00000000 --- a/spec/chat-completion-comparison.md +++ /dev/null @@ -1,234 +0,0 @@ -# Chat completion APIs: A cross-provider comparison guide - -The AI API landscape has coalesced around OpenAI's design patterns, but significant differences remain beneath the surface. **Groq and OpenRouter offer near-perfect OpenAI compatibility**, while Anthropic and Google use distinct schemas that require code changes when switching providers. This guide maps the common ground and critical divergences developers need to navigate. - -## The OpenAI compatibility spectrum - -Three providers—Groq, OpenRouter, and OpenAI itself—share an identical request/response schema, making code portability straightforward. Anthropic and Google Gemini diverge significantly, each with unique terminology and structural choices. - -| Provider | OpenAI Compatible | Migration Complexity | -|----------|-------------------|---------------------| -| **OpenAI** | Baseline reference | N/A | -| **Groq** | Yes (drop-in) | Change base URL + API key | -| **OpenRouter** | Yes (drop-in) | Change base URL + API key | -| **Anthropic** | No | Requires schema rewrite | -| **Google Gemini** | No | Requires schema rewrite | - -To use OpenAI's Python SDK with Groq or OpenRouter, only the base URL changes: - -```python -from openai import OpenAI -client = OpenAI( - base_url="https://api.groq.com/openai/v1", # or "https://openrouter.ai/api/v1" - api_key="YOUR_API_KEY" -) -``` - -## Endpoints and authentication patterns - -All five providers use REST APIs with JSON payloads, but authentication headers and endpoint paths differ substantially. - -| Provider | Base URL | Endpoint | Auth Header | -|----------|----------|----------|-------------| -| **OpenAI** | `api.openai.com` | `/v1/chat/completions` | `Authorization: Bearer $KEY` | -| **Anthropic** | `api.anthropic.com` | `/v1/messages` | `x-api-key: $KEY` + `anthropic-version: 2023-06-01` | -| **Google Gemini** | `generativelanguage.googleapis.com` | `/v1beta/models/{model}:generateContent` | `x-goog-api-key: $KEY` or OAuth | -| **Groq** | `api.groq.com` | `/openai/v1/chat/completions` | `Authorization: Bearer $KEY` | -| **OpenRouter** | `openrouter.ai` | `/api/v1/chat/completions` | `Authorization: Bearer $KEY` | - -Anthropic uniquely requires a version header (`anthropic-version`) on every request. Google offers two authentication paths: API keys for Google AI Studio (simpler) or OAuth/service accounts for Vertex AI (enterprise). - -## Message structure diverges at the system prompt - -The most impactful difference across providers is **how system prompts are handled**. 
OpenAI, Groq, and OpenRouter include system instructions as a message with `role: "system"`. Anthropic separates it into a top-level `system` field. Google uses `systemInstruction` as a separate object. - -**OpenAI/Groq/OpenRouter format:** -```json -{ - "model": "gpt-4o", - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello!"} - ] -} -``` - -**Anthropic format:** -```json -{ - "model": "claude-sonnet-4-5", - "max_tokens": 1024, - "system": "You are a helpful assistant.", - "messages": [ - {"role": "user", "content": "Hello!"} - ] -} -``` - -**Google Gemini format:** -```json -{ - "systemInstruction": {"parts": [{"text": "You are a helpful assistant."}]}, - "contents": [ - {"role": "user", "parts": [{"text": "Hello!"}]} - ] -} -``` - -Note that Google uses `contents` instead of `messages`, `parts` instead of `content`, and `role: "model"` instead of `role: "assistant"`. These terminology differences require complete request restructuring. - -## Required versus optional parameters - -A subtle but critical difference: **Anthropic requires `max_tokens`** on every request, while OpenAI treats it as optional (defaulting to model maximum). This catches many developers migrating from OpenAI. - -| Parameter | OpenAI | Anthropic | Gemini | Groq | -|-----------|--------|-----------|--------|------| -| `max_tokens` | Optional | **Required** | Optional (`maxOutputTokens`) | Optional | -| `temperature` | 0-2 (default 1) | 0-1 (default 1) | 0-2 (default 1) | 0-2 (default 1) | -| `top_p` | ✓ | ✓ | ✓ (`topP`) | ✓ | -| `top_k` | ✗ | ✓ | ✓ (`topK`) | ✗ | -| `frequency_penalty` | ✓ | ✗ | ✓ | ✗ | -| `presence_penalty` | ✓ | ✗ | ✓ | ✗ | - -Groq notably **does not support** `frequency_penalty`, `presence_penalty`, `logprobs`, or `n > 1`—parameters common in OpenAI workflows. Requests using these will return 400 errors. - -## Response structures show similar divergence - -OpenAI, Groq, and OpenRouter return responses in an identical structure with a `choices` array. Anthropic returns `content` as an array of typed blocks. Google returns `candidates` with nested `content.parts`. - -**OpenAI/Groq/OpenRouter response:** -```json -{ - "id": "chatcmpl-abc123", - "choices": [{ - "index": 0, - "message": {"role": "assistant", "content": "Hello!"}, - "finish_reason": "stop" - }], - "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} -} -``` - -**Anthropic response:** -```json -{ - "id": "msg_01XFD...", - "content": [{"type": "text", "text": "Hello!"}], - "stop_reason": "end_turn", - "usage": {"input_tokens": 10, "output_tokens": 5} -} -``` - -**Google Gemini response:** -```json -{ - "candidates": [{ - "content": {"parts": [{"text": "Hello!"}], "role": "model"}, - "finishReason": "STOP" - }], - "usageMetadata": {"promptTokenCount": 10, "candidatesTokenCount": 5} -} -``` - -Finish reason values also differ: OpenAI uses `stop`, Anthropic uses `end_turn`, and Google uses `STOP`. Tool-triggered stops are `tool_calls` (OpenAI/Groq), `tool_use` (Anthropic), or indicated by function call content in Gemini. - -## Streaming implementations vary significantly - -All providers use Server-Sent Events (SSE), but event structure differs. OpenAI-compatible APIs send incremental `delta` objects and terminate with `data: [DONE]`. Anthropic uses **typed event streams** with explicit event names like `message_start`, `content_block_delta`, and `message_stop`. 
- -**OpenAI/Groq streaming chunk:** -``` -data: {"choices":[{"delta":{"content":"Hello"}}]} -data: {"choices":[{"delta":{"content":" there"}}]} -data: [DONE] -``` - -**Anthropic streaming events:** -``` -event: message_start -data: {"type":"message_start","message":{...}} - -event: content_block_delta -data: {"type":"content_block_delta","delta":{"type":"text_delta","text":"Hello"}} - -event: message_stop -data: {"type":"message_stop"} -``` - -Anthropic's approach provides richer metadata (separate events for tool use, thinking blocks) but requires different parsing logic. Google streams partial `GenerateContentResponse` objects via `streamGenerateContent?alt=sse`. - -## Tool calling follows OpenAI's lead with variations - -Function/tool calling has achieved reasonable standardization, with all providers supporting JSON Schema-based tool definitions. The structure is nearly identical across OpenAI, Groq, and OpenRouter. Anthropic uses `input_schema` instead of `parameters`, and Google wraps tools in a `functionDeclarations` array. - -**OpenAI/Groq tool definition:** -```json -{"type": "function", "function": {"name": "get_weather", "parameters": {...}}} -``` - -**Anthropic tool definition:** -```json -{"name": "get_weather", "input_schema": {...}} -``` - -**Google Gemini tool definition:** -```json -{"functionDeclarations": [{"name": "get_weather", "parameters": {...}}]} -``` - -All providers support `auto`, `none`, and forced tool selection. Anthropic adds `any` (must use at least one tool). Parallel tool calls are supported by OpenAI, Groq, and Anthropic (default enabled). - -## Unique features worth noting - -Each provider offers distinctive capabilities beyond the baseline API: - -- **OpenAI**: Structured Outputs with strict JSON Schema enforcement (`json_schema` response format), Batch API with 50% discount -- **Anthropic**: Extended thinking for Claude 4/3.7 with configurable token budgets, prompt caching with 90% cost reduction on cache hits, computer use tools -- **Google Gemini**: Built-in Google Search grounding, native code execution, video/audio/document processing up to 2 hours of video -- **Groq**: Exceptional speed (**394-1000+ tokens/second**) via custom LPU hardware, timing metrics in usage response -- **OpenRouter**: Access to **400+ models** from all providers, automatic fallbacks, model routing with `:floor` (cheapest) and `:nitro` (fastest) suffixes, zero-markup pricing - -## Pricing models share structure but not rates - -All providers charge per-token with separate input/output rates. Anthropic charges 90% less for cached content. Groq offers 50% off for batch processing. OpenRouter passes through provider pricing with a 5.5% fee on credit purchases. - -| Provider | Example Model | Input (per 1M) | Output (per 1M) | -|----------|---------------|----------------|-----------------| -| OpenAI | GPT-4o | ~$2.50 | ~$10.00 | -| Anthropic | Claude 3.5 Sonnet | $3.00 | $15.00 | -| Google | Gemini 2.5 Flash | $0.15 | $0.60 | -| Groq | Llama 3.3 70B | $0.59 | $0.79 | - -Free tiers exist for Google AI Studio, Groq, and OpenRouter (with `:free` suffix models). - -## SDK availability and language support - -All providers offer first-party Python and TypeScript/JavaScript SDKs. Anthropic and Google provide the broadest language coverage. 
- -| Provider | Python | TypeScript | Go | Java | Other | -|----------|--------|------------|----|----|-------| -| OpenAI | ✓ | ✓ | Beta | — | — | -| Anthropic | ✓ | ✓ | ✓ | ✓ | Ruby, C# (beta) | -| Google | ✓ | ✓ | ✓ | ✓ | Dart, Swift, Kotlin | -| Groq | ✓ | ✓ | — | — | OpenAI SDK compatible | -| OpenRouter | ✓ Beta | ✓ | — | — | OpenAI SDK compatible | - -For Groq and OpenRouter, using the OpenAI SDK with a modified base URL is the recommended approach, enabling code reuse across providers. - -## Practical migration strategies - -When building multi-provider applications, consider these patterns: - -1. **Use OpenAI-compatible providers for easy switching**: Groq and OpenRouter can share code paths with OpenAI. Abstract only the base URL and API key. - -2. **Create provider-specific adapters for Anthropic/Gemini**: The structural differences require transformation layers. Map `system` messages to Anthropic's top-level field, convert `assistant` to `model` for Gemini. - -3. **Normalize on the OpenAI response format**: Parse provider responses into a common structure. OpenRouter already does this for all 400+ models. - -4. **Handle parameter gaps gracefully**: Remove unsupported parameters (like `frequency_penalty` for Groq) rather than letting requests fail. - -5. **Consider OpenRouter as a unification layer**: For applications needing multiple model providers, OpenRouter provides a single API surface with automatic fallbacks and model routing. - -## Conclusion - -The chat completion API landscape centers on OpenAI's design patterns, with Groq and OpenRouter offering true compatibility and Anthropic/Google requiring adaptation layers. The key migration hurdles are system prompt handling, required parameters (Anthropic's `max_tokens`), and response parsing differences. For maximum flexibility, applications should abstract provider-specific code behind a common interface, or leverage OpenRouter's unified gateway to access all major models through a single, consistent API. diff --git a/spec/chat-completion-comparison2.md b/spec/chat-completion-comparison2.md deleted file mode 100644 index 216ee87a..00000000 --- a/spec/chat-completion-comparison2.md +++ /dev/null @@ -1,69 +0,0 @@ -Architectural Convergence and Divergence in Modern Large Language Model Interfaces: A Comparative Analysis of Anthropic, Gemini, Groq, and OpenRouter1. Introduction: The Standardization of the "Chat" ParadigmThe rapid proliferation of Large Language Models (LLMs) has necessitated the evolution of Application Programming Interfaces (APIs) from simple text-completion endpoints to complex, state-aware conversational interfaces. In the nascent stages of the generative AI boom, the interaction model was predominantly "text-in, text-out"—a raw string completion paradigm where the model simply predicted the next sequence of tokens based on a provided prefix. However, as models grew in capability and application architectures shifted toward conversational agents, this primitive abstraction proved insufficient for managing the complexities of dialogue history, role-based instruction, and multi-turn reasoning.OpenAI’s introduction of the Chat Completions API marked a pivotal shift in this landscape, establishing a structural schema that organizes input not as a monolithic string, but as a structured list of message objects. This "Chat Completion" paradigm—characterized by the stateless exchange of JSON arrays containing distinct roles (System, User, Assistant)—has effectively become the lingua franca of the industry. 
It provides a semantic framework that allows developers to model complex interactions, inject system-level behavioral guardrails, and manage conversation state on the client side.However, while the high-level conceptual model of "Chat Completion" has been widely adopted, the underlying implementation details exhibit significant divergence. Competitors and alternative providers such as Anthropic, Google (Gemini), Groq, and OpenRouter have each interpreted this paradigm through the lens of their specific architectural priorities, safety philosophies, and infrastructure capabilities.This report provides an exhaustive technical analysis of these four providers. It moves beyond superficial feature comparisons to dissect the structural, operational, and semantic differences in their API designs. By examining how each provider implements the chat abstraction, handles authentication, manages complex capabilities like tool use and multimodality, and communicates operational metrics like rate limits, this research aims to equip software architects with the nuanced understanding required to build resilient, multi-provider AI systems. The analysis reveals that while the industry is converging on a shared mental model, the ecosystem remains fragmented in implementation, requiring sophisticated adaptation strategies to achieve true interoperability.2. API Design Philosophy and Architectural ParadigmsThe four providers analyzed—Anthropic, Gemini, Groq, and OpenRouter—represent distinct strategic positions in the AI market. Their API designs are not merely technical specifications but reflections of their broader organizational goals, ranging from safety-centric research to high-velocity inference and ecosystem aggregation.2.1 Anthropic: The Explicit Structure and Safety-First DesignAnthropic’s approach to API design is characterized by strictness and explicitness. The Messages API is designed to enforce "Constitutional AI" principles at the interface level. Unlike flexible schemas that might allow for ambiguous role assignments, Anthropic enforces a rigorous alternation between user and assistant roles. This design choice prevents "jailbreaking" techniques that rely on confusing the model about who is speaking.1Furthermore, Anthropic treats the system prompt not as just another message in the list, but as a top-level parameter. This architectural decision elevates the system instruction above the conversational flow, granting it a higher tiered authority in guiding the model's behavior. This distinct separation of concerns—separating the "rules" (system) from the "dialogue" (messages)—is a hallmark of Anthropic’s safety-first philosophy.2 The API also utilizes a versioning header (anthropic-version), forcing developers to pin their integration to a specific point in time (e.g., 2023-06-01). This indicates a priority on enterprise stability, ensuring that backend improvements do not silently break client-side parsing logic.32.2 Google Gemini: The Multimodal-Native IntegrationGoogle’s Gemini API, accessible via both Google AI Studio and Vertex AI, represents a departure from the text-centric view of LLMs. Gemini is built as a multimodal-native model, and its API schema reflects this. Instead of a standard messages list, Gemini employs a contents array composed of parts. This parts-based architecture is agnostic to data type, treating text, images, video, and audio as equivalent fundamental units of meaning.4The design is heavily influenced by the Google Cloud ecosystem. 
The integration with Vertex AI introduces complexity in authentication and routing (involving Project IDs and Location IDs in the URL) that is absent in simpler, key-based APIs. This signals that Gemini is designed not just as a standalone model, but as a component within a larger enterprise cloud infrastructure. The API’s ability to handle massive context windows (up to 2 million tokens) also influences its design, necessitating mechanisms for uploading and referencing large files rather than embedding them directly in the request payload.52.3 Groq: The Velocity-Centric Inference EngineGroq occupies a unique position as an infrastructure provider rather than a model trainer. Their core value proposition is the Language Processing Unit (LPU), a hardware architecture designed for ultra-low latency inference. Consequently, Groq’s API strategy is one of "frictionless adoption." They have made the strategic decision to adhere almost strictly to the OpenAI API specification.7By mimicking the endpoint structure (/v1/chat/completions), authentication methods, and payload schemas of the market leader, Groq eliminates the switching costs for developers. The philosophy here is "drop-in compatibility." If a developer has an application running on GPT-4, they should be able to switch to Llama-3 running on Groq by changing only the base URL and API key. This design choice highlights Groq's focus on speed and efficiency over architectural novelty.92.4 OpenRouter: The Normalization and Aggregation LayerOpenRouter serves as a meta-layer or gateway, sitting between the developer and model providers. Its architectural philosophy is "Normalization." The AI ecosystem is fragmented, with different providers using different schemas, tokenizers, and pricing models. OpenRouter abstracts this complexity by providing a unified, OpenAI-compatible interface that routes to dozens of underlying providers (including Anthropic, Google, and Groq).10The API design focuses on routing intelligence. Features like model: "auto" and "fallback" configurations allow the API to make dynamic decisions about which underlying model to call based on cost, latency, or uptime. Additionally, OpenRouter introduces headers like HTTP-Referer and X-Title to build a community-ranking system, incentivizing developers to identify their apps in exchange for visibility. This positions OpenRouter not just as a pipe, but as a marketplace.103. Authentication, Security, and Access ControlThe mechanism by which an API validates the identity of the requester is the first point of integration. While the concept of an "API Key" is universal, the transmission and management of these credentials vary significantly, impacting how client libraries must be configured.3.1 Header Specifications and TransmissionThe industry standard for RESTful APIs is the Authorization header using the Bearer scheme. Groq and OpenRouter adhere to this standard, simplifying integration with generic HTTP clients.Groq: Expects Authorization: Bearer . This allows the use of standard OpenAI client libraries, which are hardcoded to use this header format.8OpenRouter: Also uses Authorization: Bearer . However, it adds a layer of optional but recommended headers: HTTP-Referer (for site rankings) and X-Title (app name). While not strictly required for authentication, these headers play a role in the platform's ecosystem mechanics.10Anthropic deviates from this standard. It requires a custom header x-api-key for the credential. 
This seemingly minor difference breaks compatibility with generic OpenAI-compatible clients unless a proxy or adapter (like LiteLLM) is used. Additionally, the mandatory anthropic-version header is a security and stability feature. By requiring the client to declare the schema version they expect, Anthropic prevents "silent breaking" updates. If the API response format changes (e.g., how tool use is structured), older clients sending an older version header will continue to receive the legacy format, ensuring backward compatibility.23.2 The Complexity of Google Gemini AuthenticationGemini presents the most bifurcated authentication model, reflecting its dual targeting of hobbyists and enterprise users.Google AI Studio (Prototyping): Uses a simple API key transmitted via the x-goog-api-key header. This is akin to the Anthropic/OpenAI model and is designed for ease of use.5Vertex AI (Enterprise): Uses Google Cloud IAM (Identity and Access Management). Here, there is no static long-lived API key. Instead, the application must authenticate as a Google Cloud Service Account, obtain a short-lived OAuth 2.0 access token (e.g., via gcloud auth print-access-token), and pass that in the Authorization: Bearer header. This approach integrates deeply with enterprise security policies, allowing for granular permission scoping (e.g., a service account that can invoke models but not tune them).11This dichotomy means that code written for Gemini prototypes in AI Studio often requires significant refactoring to be deployed to a production Vertex AI environment, a friction point not present with the other providers.4. Request Structure: The Anatomy of a ConversationThe core of the Chat Completion API is the request body, specifically how the conversation history is structured. While all providers accept a list of messages, the schema of those messages—and specifically how "content" is defined—reveals deep architectural differences.4.1 The Message Object and Role DefinitionsThe "Standard" format, popularized by OpenAI and adopted by Groq and OpenRouter, expects a messages array where each object has a role (system, user, assistant) and content (string).Anthropic's Divergence:Anthropic’s Messages API extracts the system instruction from the message list entirely.JSON{ - "system": "You are a helpful assistant.", - "messages": [ - {"role": "user", "content": "Hello"} - ] -} -This structural change enforces a hierarchy. System instructions are not part of the "conversation"; they are the "constitution" governing the conversation. Inside the messages array, Anthropic strictly enforces alternating roles. A sequence of user, user is invalid and will result in a 400 error. The client is forced to merge consecutive messages from the same role. This strictness reduces ambiguity for the model but increases the validation burden on the client.1Gemini's Divergence:Gemini uses contents (plural) instead of messages, and role values are user and model (instead of assistant).JSON{ - "systemInstruction": { "parts": }, - "contents": [ - { - "role": "user", - "parts": [ { "text": "Hello" } ] - } - ] -} -The use of parts instead of content is foundational. It implies that a message is never just a string; it is a composite object that can contain text, images, video references, or function calls. 
While OpenAI/Anthropic support similar multimodal arrays, Gemini’s schema treats text as just one type of part among many, rather than the default.44.2 Handling of System InstructionsThe placement of system instructions is a key differentiator in prompt engineering strategies.Groq & OpenRouter: Support the standard {"role": "system", "content": "..."} message at the beginning of the array. This is treated as part of the context window.Anthropic: The top-level system parameter allows the model to cache these instructions separately (via Prompt Caching), potentially optimizing performance for agents that share a common persona across many users.2Gemini: Uses systemInstruction configuration. Similar to Anthropic, this separates the directive from the dialogue, but the syntax involves a nested parts object, adding verbosity to the request payload.44.3 Control Parameters and ConfigurationWhile all providers support standard sampling parameters like temperature and top_p, the parameter names and valid ranges differ.Max Tokens:Anthropic: max_tokens (Required). The API will error if this is missing.1Gemini: maxOutputTokens inside a generationConfig object.4Groq/OpenRouter: max_tokens or the newer max_completion_tokens.7Thinking/Reasoning:Anthropic (Claude 3.7) introduces a thinking block in the request, requiring a budget_tokens parameter. This explicitly reserves capacity for chain-of-thought generation before the final answer.1Gemini 2.0 supports thinking_config with levels (e.g., "low", "high"), integrating reasoning depth as a configuration toggle rather than just a token budget.125. Multimodality and Media HandlingThe processing of non-text inputs (images, video, audio) highlights the infrastructure differences between the providers.5.1 Image TransmissionAnthropic: Images are passed as content blocks with base64 encoding.JSON{ - "type": "image", - "source": { - "type": "base64", - "media_type": "image/jpeg", - "data": "..." - } -} -This method is simple but bandwidth-intensive. It bloats the JSON payload size, potentially hitting HTTP body size limits for high-resolution images.1Groq: Adheres to the OpenAI image_url format, accepting either a public URL or a base64 string. Currently, Groq’s vision support is model-dependent (e.g., Llama 3.2 Vision).9Gemini: Offers the most robust solution for heavy media. While it supports inline_data (base64), its primary strength is file_data.JSON{ - "file_data": { - "mime_type": "video/mp4", - "file_uri": "gs://my-bucket/video.mp4" - } -} -Developers can upload files to Google Cloud Storage or the Gemini File API and pass the URI. This allows Gemini to process hours of video or audio, which would be impossible to transmit via base64. The model can "watch" a video and answer questions about specific timestamps, a capability unique to its architecture.45.2 Audio and VideoGemini is currently the only provider among the four to support native video and audio inputs in the main chat endpoint. Groq supports audio via a separate audio/transcriptions endpoint (using Whisper), but not as a multimodal input to the chat model itself.9 Anthropic allows for document inputs (PDFs) which are processed as images or text, but lacks native video support in the API.36. Tool Use and Function Calling: The Technical CoreTool Use (or Function Calling) is the critical capability for building agents. It allows the model to output structured JSON to call external APIs. 
## 6. Tool Use and Function Calling: The Technical Core

Tool Use (or Function Calling) is the critical capability for building agents. It allows the model to output structured JSON to call external APIs. This area exhibits the most significant schema fragmentation.

### 6.1 Tool Definitions

The mechanism for telling the model what tools are available varies.

**Groq / OpenRouter:** Use the OpenAI `tools` format.

```json
"tools": [{
  "type": "function",
  "function": {
    "name": "get_weather",
    "description": "...",
    "parameters": { ... }
  }
}]
```

**Anthropic:** Uses a flatter structure.

```json
"tools": [{
  "name": "get_weather",
  "description": "...",
  "input_schema": { ... }
}]
```

The key difference is `input_schema` vs `parameters`. While semantically identical, the key names differ, requiring adapters in client code.[14]

**Gemini:** Wraps definitions in `function_declarations`.

```json
"tools": [{
  "function_declarations": [{
    "name": "get_weather",
    "parameters": { ... }
  }]
}]
```

Gemini allows specifying the schema as a subset of OpenAPI 3.0, but strict adherence is required.[15]

### 6.2 Invocation (The Model's Request)

When the model decides to call a tool, the response format differs.

**Groq / OpenRouter:** The message contains a `tool_calls` array. Each item has a unique `id` and `function` arguments.

**Anthropic:** The model outputs a `tool_use` content block.

```json
{
  "type": "tool_use",
  "id": "toolu_01...",
  "name": "get_weather",
  "input": { "city": "London" }
}
```

Crucially, Anthropic allows text blocks (chain-of-thought) to precede the `tool_use` block in the same message, so the model can "explain" why it is calling the tool before doing so.[2]

**Gemini:** The model outputs a `functionCall` part.

```json
"parts": [{
  "functionCall": {
    "name": "get_weather",
    "args": { "city": "London" }
  }
}]
```

Historically, Gemini did not generate unique IDs for function calls, relying on execution order. Newer versions are adopting IDs to support parallel function calling.[11]

### 6.3 Result Submission (The Client's Response)

Completing the tool loop requires sending the result back to the model.

**Groq / OpenRouter:** A dedicated message with `role: "tool"` is sent, referencing the `tool_call_id`.

**Anthropic:** The result is sent in a `user` message containing a `tool_result` block.

```json
{
  "role": "user",
  "content": [{
    "type": "tool_result",
    "tool_use_id": "toolu_01...",
    "content": "25 C"
  }]
}
```

This is a major semantic difference: in Anthropic's world, the user reports the tool result. There is no separate "tool" role.[16]

**Gemini:** Uses a `functionResponse` part, typically within a `user` (or sometimes `function`) role context. The API is strict about ordering: the conversation history must show a `functionCall` followed immediately by its `functionResponse`.[17]

## 7. Response Structure and Streaming Mechanics

For real-time applications, the structure of the response and the mechanics of Server-Sent Events (SSE) are vital for managing latency and user experience.

### 7.1 Static Response Objects

**Groq / OpenRouter:** Return the standard `choices` array. Even a single completion is wrapped in a list. The content is accessed via `choices[0].message.content`.[7]

**Anthropic:** Returns a top-level `content` array.

```json
{
  "type": "message",
  "role": "assistant",
  "content": [ { "type": "text", "text": "Hello" } ]
}
```

This array structure is consistent with the request format, treating output as blocks.[2]

**Gemini:** Returns `candidates`.

```json
{
  "candidates": [{
    "content": { "parts": [{ "text": "Hello" }] },
    "finishReason": "STOP"
  }]
}
```

The nesting is deeper: `response.candidates[0].content.parts[0].text`. Accessing the text requires traversing three layers of hierarchy.[4]
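Since the three response shapes differ only in where the text lives, a client can normalize them with a few JSON Pointer lookups. The `extract_text` helper below is an assumed sketch (plain `serde_json`, not a rullm function):

```rust
// Illustrative helper: pull the assistant text out of each provider's response shape.
use serde_json::Value;

fn extract_text(provider: &str, response: &Value) -> Option<String> {
    let text = match provider {
        // OpenAI-compatible (Groq, OpenRouter): choices[0].message.content
        "groq" | "openrouter" => response.pointer("/choices/0/message/content")?,
        // Anthropic: first block of the top-level content array
        "anthropic" => response.pointer("/content/0/text")?,
        // Gemini: candidates[0].content.parts[0].text
        "gemini" => response.pointer("/candidates/0/content/parts/0/text")?,
        _ => return None,
    };
    text.as_str().map(|s| s.to_string())
}
```

A typed SDK would deserialize into structs instead; the pointers are only meant to make the nesting differences visible.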
### 7.2 Streaming Event Protocols

All providers use SSE, but the event taxonomy differs.

**Groq / OpenRouter:** Stream chunks containing `delta` objects. The structure mimics the static response but with partial strings. The stream ends with a `[DONE]` message.[10]

**Anthropic:** Implements a verbose event system:

- `message_start`: Metadata about the message (usage, ID).
- `content_block_start`: Indicates a new block (text or tool use) is beginning.
- `content_block_delta`: The actual content generation (`text_delta`).
- `message_stop`: Marks the end of the stream; final usage stats arrive in the preceding `message_delta`.

This verbosity allows clients to reconstruct complex, multi-block responses (e.g., text followed by a tool use) with high fidelity.[1]

**Gemini:** Streams full `GenerateContentResponse` objects. A key nuance is that Gemini may emit "empty" chunks containing only citation metadata or safety ratings, so the client must filter for actual text content to avoid displaying blanks.[4]

## 8. Operational Metrics: Rate Limits and Stop Reasons

Observability is handled via response headers and body fields.

### 8.1 Rate Limit Headers

- **Groq:** Uses the standard `x-ratelimit-*` headers (requests, tokens, reset time). This transparency lets clients implement "token bucket" throttling algorithms easily.[18]
- **Anthropic:** Uses `anthropic-ratelimit-*` headers, explicitly separating input-token limits from output-token limits. The distinction matters because output tokens are computationally more expensive and often have tighter limits.[19]
- **Gemini:** Does not consistently provide rate limit headers in the response for immediate backoff calculation. Developers must rely on Google Cloud Quota dashboards or handle 429 errors, which carry retry-after information. Limits are enforced at the project level, shared across all API keys in that project.[20]

### 8.2 Stop Reasons

Understanding why the model stopped is essential for debugging.

- **Anthropic:** `end_turn` (natural completion), `max_tokens` (cutoff), `tool_use` (calling a function).
- **Gemini:** `STOP`, `MAX_TOKENS`, `SAFETY` (content filter triggered), `RECITATION`. The `RECITATION` reason is unique to Google: it triggers if the model output is too similar to copyrighted training data, blocking the response to prevent copyright infringement.[22]
- **OpenRouter:** Normalizes these codes. It maps provider-specific reasons to a standard set (e.g., `end_turn` to `stop`) but preserves the `native_finish_reason` for advanced debugging.[24]

## 9. Common Features vs. Unique Differentiators

### 9.1 Commonalities

- **Statelessness:** All API interactions are stateless; context must be re-sent.
- **JSON transport:** All use JSON for payload transport.
- **Roles:** All distinguish between user and system/model roles.
- **Sampling:** All support `temperature` and `top_p`.
- **Security:** All use TLS/SSL and API key/token authentication.

### 9.2 Major Differences (Summary Table)

| Feature | Anthropic | Google Gemini | Groq | OpenRouter |
|---|---|---|---|---|
| System Prompt | Top-level `system` parameter | `systemInstruction` config | Message with `role: system` | Message with `role: system` |
| Input Structure | `messages` (strict roles) | `contents` with `parts` | `messages` (standard) | `messages` (standard) |
| Tool Response | `user` role + `tool_result` | `functionResponse` part | `tool` role message | `tool` role message |
| Video Input | No (frames as images) | Native `file_data` (URI) | No (frames as images) | Via provider (if supported) |
| Rate Limit Headers | `anthropic-ratelimit-*` | N/A (Cloud quotas) | `x-ratelimit-*` | `x-openrouter-credits` |
| Unique Feature | Prompt Caching | 2M context & grounding | LPU inference speed | Model routing & fallbacks |
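The table above is effectively a to-do list for an adapter layer. As one concrete slice of that work, a finish-reason normalization similar to what OpenRouter performs server-side might look like the following sketch (the `FinishReason` enum and mapping are hypothetical, not rullm types):

```rust
// Illustrative normalization of the provider-specific stop/finish reasons from section 8.2.
#[derive(Debug, PartialEq)]
enum FinishReason {
    Stop,
    MaxTokens,
    ToolUse,
    ContentFilter,
    Other(String),
}

fn normalize_finish_reason(provider: &str, raw: &str) -> FinishReason {
    match (provider, raw) {
        ("anthropic", "end_turn") | ("gemini", "STOP") | (_, "stop") => FinishReason::Stop,
        ("anthropic", "max_tokens") | ("gemini", "MAX_TOKENS") | (_, "length") => {
            FinishReason::MaxTokens
        }
        ("anthropic", "tool_use") | (_, "tool_calls") => FinishReason::ToolUse,
        // Gemini-only safety/recitation blocks collapse into one bucket here.
        ("gemini", "SAFETY") | ("gemini", "RECITATION") => FinishReason::ContentFilter,
        (_, other) => FinishReason::Other(other.to_string()),
    }
}
```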
## 10. Conclusion and Strategic Recommendations

The "Chat Completion" API has evolved into a standard architectural pattern, but it is not a monolithic standard. While Groq and OpenRouter adhere closely to the OpenAI specification to minimize friction, Anthropic and Google Gemini have diverged to support their specific philosophies of safety and multimodality.

For developers and architects, this fragmentation implies that a true "multi-provider" strategy requires more than swapping base URLs. It calls for an abstraction layer (adapter pattern) that normalizes the structural differences in:

- **Tool use handshakes:** Converting between `tool` roles and `tool_result` blocks.
- **Multimodal uploads:** Handling base64 vs. Cloud Storage URIs.
- **Rate limit handling:** Parsing diverse header formats to manage backoff.

Anthropic is the choice for workflows requiring strict adherence to complex instructions and safety, leveraging Prompt Caching for cost efficiency in long-context tasks. Gemini dominates in scenarios involving heavy media analysis (video/audio) and deep integration with the Google Cloud ecosystem. Groq provides the raw speed needed for real-time, user-facing applications where latency is the primary KPI. OpenRouter acts as the unifying fabric, offering the path of least resistance to the diverse capabilities of the open ecosystem without the operational overhead of managing individual provider idiosyncrasies.

Understanding these nuances is the key to moving from a fragile, single-provider prototype to a robust, model-agnostic enterprise application.

## References

- [1] Anthropic
- [4] Gemini
- [7] Groq
- [10] OpenRouter

From 0a6a0372b46f284d2968025e98eb48792bbc3f66 Mon Sep 17 00:00:00 2001
From: lambda
Date: Sun, 4 Jan 2026 00:34:40 +0530
Subject: [PATCH 13/14] fix(cli): add gemini to provider autocomplete and help text

---
 crates/rullm-cli/src/args.rs          | 2 +-
 crates/rullm-cli/src/commands/auth.rs | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/rullm-cli/src/args.rs b/crates/rullm-cli/src/args.rs
index fbec0201..d276bf0e 100644
--- a/crates/rullm-cli/src/args.rs
+++ b/crates/rullm-cli/src/args.rs
@@ -181,7 +181,7 @@ impl Models {
 
 pub fn model_completer(current: &OsStr) -> Vec {
     // Predefined providers or aliases
-    const PROVIDED: &[&str] = &["openai:", "anthropic:"];
+    const PROVIDED: &[&str] = &["openai:", "anthropic:", "gemini:"];
 
     let cli_config = CliConfig::load();
     let cur_str = current.to_string_lossy();
diff --git a/crates/rullm-cli/src/commands/auth.rs b/crates/rullm-cli/src/commands/auth.rs
index 90c3b932..523a3d38 100644
--- a/crates/rullm-cli/src/commands/auth.rs
+++ b/crates/rullm-cli/src/commands/auth.rs
@@ -20,12 +20,12 @@ pub struct AuthArgs {
 pub enum AuthAction {
     /// Login to a provider (OAuth or API key)
     Login {
-        /// Provider name (anthropic, openai, groq, openrouter)
+        /// Provider name (anthropic, openai, groq, openrouter, gemini)
         provider: Option,
     },
     /// Logout from a provider (remove stored credentials)
     Logout {
-        /// Provider name (anthropic, openai, groq, openrouter)
+        /// Provider name (anthropic, openai, groq, openrouter, gemini)
         provider: Option,
     },
     /// List all credentials and environment variables

From ceef28b0507f26da8ac0c2dfa8d52f6632bced1b Mon Sep 17 00:00:00 2001
From: lambda
Date: Sun, 4 Jan 2026 00:49:26 +0530
Subject: [PATCH 14/14] fix(cli): attach system messages to Anthropic request's system field

Previously system messages were silently filtered out. Now they're
extracted and passed via the top-level system field per the API spec.
---
 crates/rullm-cli/src/cli_client.rs | 50 ++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/crates/rullm-cli/src/cli_client.rs b/crates/rullm-cli/src/cli_client.rs
index ad54446d..4efadcec 100644
--- a/crates/rullm-cli/src/cli_client.rs
+++ b/crates/rullm-cli/src/cli_client.rs
@@ -15,6 +15,27 @@ use std::pin::Pin;
 /// Claude Code identification text for OAuth requests
 const CLAUDE_CODE_SPOOF_TEXT: &str = "You are Claude Code, Anthropic's official CLI for Claude.";
 
+/// Extract system messages from conversation, concatenating multiple with double newlines.
+/// Returns None if no system messages present.
+fn extract_system_content(messages: &[(String, String)]) -> Option<String> {
+    let system_messages: Vec<&str> = messages
+        .iter()
+        .filter_map(|(role, content)| {
+            if role == "system" {
+                Some(content.as_str())
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    if system_messages.is_empty() {
+        None
+    } else {
+        Some(system_messages.join("\n\n"))
+    }
+}
+
 /// Prepend Claude Code system block to an existing system prompt (for OAuth requests)
 fn prepend_claude_code_system(existing: Option<SystemContent>) -> SystemContent {
     let spoof_block = SystemBlock::text_with_cache(CLAUDE_CODE_SPOOF_TEXT);
@@ -305,12 +326,16 @@ impl CliClient {
                 config,
                 is_oauth,
             } => {
+                // Extract system messages first (they go in a top-level field, not in messages)
+                let user_system = extract_system_content(&messages);
+
+                // Filter to only user/assistant messages
                 let msgs: Vec<AnthropicMessage> = messages
                     .iter()
                     .filter_map(|(role, content)| match role.as_str() {
                         "user" => Some(AnthropicMessage::user(content.as_str())),
                         "assistant" => Some(AnthropicMessage::assistant(content.as_str())),
-                        _ => None, // Skip system messages for now
+                        _ => None,
                     })
                     .collect();
 
@@ -322,8 +347,27 @@ impl CliClient {
                     builder = builder.temperature(temp);
                 }
 
-                if *is_oauth {
-                    builder = builder.system_blocks(prepend_claude_code_system(None).into_blocks());
+                // Attach system content (combining with OAuth prefix if needed)
+                let system_content = match (user_system, *is_oauth) {
+                    (Some(text), true) => {
+                        // OAuth + user system: prepend Claude Code to user's system
+                        Some(prepend_claude_code_system(Some(SystemContent::Text(
+                            text.into(),
+                        ))))
+                    }
+                    (Some(text), false) => {
+                        // No OAuth + user system: just user's system
+                        Some(SystemContent::Text(text.into()))
+                    }
+                    (None, true) => {
+                        // OAuth + no user system: just Claude Code
+                        Some(prepend_claude_code_system(None))
+                    }
+                    (None, false) => None,
+                };
+
+                if let Some(content) = system_content {
+                    builder = builder.system_blocks(content.into_blocks());
                 }
 
                 let request = builder.build();