diff --git a/sdk/guides/critic.mdx b/sdk/guides/critic.mdx
new file mode 100644
index 00000000..e18658d5
--- /dev/null
+++ b/sdk/guides/critic.mdx
@@ -0,0 +1,384 @@
---
title: Critic (Experimental)
description: Real-time evaluation of agent actions using an external LLM-based critic model.
---

**This feature is highly experimental** and subject to change. The API, configuration, and behavior may evolve significantly based on feedback and testing. It is not recommended for production use without thorough evaluation.

This example is available on GitHub: [examples/01_standalone_sdk/34_critic_model_example.py](https://github.com/OpenHands/software-agent-sdk/blob/main/examples/01_standalone_sdk/34_critic_model_example.py)

The Critic feature enables real-time evaluation of agent actions and messages using an external LLM. By providing quality scores and feedback during agent execution, critics help assess agent performance and can inform decision-making or monitoring systems.

## What is a Critic?

A **critic** is an evaluation model that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. The critic evaluates the agent's decisions as the conversation progresses and provides:

- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
- **Detailed feedback**: Optional structured feedback with multiple probability dimensions (e.g., sentiment, task completion, errors)
- **Real-time evaluation**: Scores computed during agent execution, not just at completion

## When to Use Critics

Critics are useful for:

- **Quality monitoring**: Track agent performance in real time during execution
- **Early intervention**: Detect potential failures before task completion
- **Performance analysis**: Collect evaluation data for debugging and improvement
- **Multi-agent systems**: Use critic feedback to guide delegation or retry logic

Critics add latency to agent execution since each evaluation requires an API call. Consider the performance impact when deciding which actions to evaluate.

## Evaluation Modes

Critics support two evaluation modes:

1. **`finish_and_message` (default)**: Evaluates only `FinishAction` and agent `MessageEvent` events
   - Minimal performance impact
   - Focuses on task completion and agent responses
   - Recommended for most use cases

2. **`all_actions`**: Evaluates after every agent action
   - Higher evaluation frequency and more data
   - Significant performance overhead
   - Useful for detailed analysis or debugging
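The mode is chosen when the critic is constructed. As a minimal sketch (the server URL, API key, and model name below are placeholders; a complete, runnable example follows in the next section):

```python
from openhands.sdk.critic import APIBasedCritic

critic = APIBasedCritic(
    server_url="https://your-critic-server.com/vllm",  # placeholder endpoint
    api_key="your-critic-api-key",  # placeholder key
    model_name="critic",
    mode="finish_and_message",  # the default; use "all_actions" for per-action scores
)
```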
+""" + +import os +import re +import sys + +from openhands.sdk import LLM, Agent, Conversation, Tool +from openhands.sdk.critic import APIBasedCritic +from openhands.sdk.critic.base import CriticBase +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.task_tracker import TaskTrackerTool +from openhands.tools.terminal import TerminalTool + + +def get_required_env(name: str) -> str: + value = os.getenv(name) + if value: + return value + sys.exit( + f"Missing required environment variable: {name}. " + f"Set {name} before running this example." + ) + + +def get_default_critic(llm: LLM) -> CriticBase | None: + """Auto-configure critic for All-Hands LLM proxy. + + When the LLM base_url matches `llm-proxy.*.all-hands.dev`, returns an + APIBasedCritic configured with: + - server_url: {base_url}/vllm + - api_key: same as LLM + - model_name: "critic" + + Returns None if base_url doesn't match or api_key is not set. + """ + base_url = llm.base_url + api_key = llm.api_key + if base_url is None or api_key is None: + return None + + # Match: llm-proxy.{env}.all-hands.dev (e.g., staging, prod, eval) + pattern = r"^https?://llm-proxy\.[^./]+\.all-hands\.dev" + if not re.match(pattern, base_url): + return None + + return APIBasedCritic( + server_url=f"{base_url.rstrip('/')}/vllm", + api_key=api_key, + model_name="critic", + ) + + +llm_api_key = get_required_env("LLM_API_KEY") + +llm = LLM( + model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"), + api_key=llm_api_key, + base_url=os.getenv("LLM_BASE_URL", None), +) + +# Try auto-configuration for All-Hands proxy, fall back to explicit env vars +critic = get_default_critic(llm) +if critic is None: + critic = APIBasedCritic( + server_url=get_required_env("CRITIC_SERVER_URL"), + api_key=get_required_env("CRITIC_API_KEY"), + model_name=get_required_env("CRITIC_MODEL_NAME"), + ) + + +# Configure agent with critic +agent = Agent( + llm=llm, + tools=[ + Tool(name=TerminalTool.name), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), + ], + # Add critic to evaluate agent actions + critic=critic, +) + +cwd = os.getcwd() +conversation = Conversation(agent=agent, workspace=cwd) + +conversation.send_message( + "Create a file called GREETING.txt with a friendly greeting message." +) +conversation.run() + +print("\nAll done! 
## How It Works

1. **Event Capture**: When a critic is configured, the agent captures the relevant events (actions or messages, depending on the mode)
2. **History Assembly**: The critic receives the full conversation history up to the evaluation point
3. **Template Rendering**: The conversation history is rendered into the critic model's chat template format (Qwen3-4B-Instruct-2507)
4. **API Call**: The critic sends the rendered prompt to the external LLM server
5. **Score Extraction**: The response is parsed to extract quality scores and feedback
6. **Result Attachment**: The `CriticResult` is attached to the corresponding event for visualization and analysis

## Technical Details

### Chat Template Format

The critic uses the Qwen3-4B-Instruct-2507 chat template format:

```
<|im_start|>system
You are an AI assistant...<|im_end|>
<|im_start|>user
Task description...<|im_end|>
<|im_start|>assistant
Agent response...<|im_end|>
```

This format is automatically applied to the conversation history before it is sent to the critic server.
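To make the format concrete, here is an illustrative sketch of how conversation turns could be rendered into this template. This is not the SDK's internal implementation; the `render_chat_template` helper and the message dictionaries are hypothetical:

```python
def render_chat_template(messages: list[dict[str, str]]) -> str:
    """Render role/content turns into the Qwen3-style chat template."""
    rendered = []
    for message in messages:
        rendered.append(
            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>"
        )
    return "\n".join(rendered)


prompt = render_chat_template([
    {"role": "system", "content": "You are an AI assistant..."},
    {"role": "user", "content": "Task description..."},
    {"role": "assistant", "content": "Agent response..."},
])
```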
### Security

- API keys are stored as Pydantic `SecretStr` to prevent accidental logging or exposure
- Keys are validated to ensure they are non-empty
- Keys are never exposed in string representations of critic objects

### Performance Considerations

- **Latency**: Each evaluation adds ~100-500ms depending on server response time
- **Cost**: Each evaluation consumes critic model tokens (typically 500-2000 tokens per evaluation)
- **Parallelization**: Critic evaluations run sequentially; they do not currently run in parallel with agent actions

## Limitations and Known Issues

- **Experimental API**: The critic interface may change significantly in future versions
- **Model Dependency**: Requires access to a compatible critic model server (vLLM or similar)
- **Limited Feedback**: Critic quality depends entirely on the underlying model's training and capabilities
- **No Retroactive Correction**: Critics provide feedback but do not automatically correct agent behavior
- **Performance Impact**: Frequent evaluations can significantly slow down agent execution

## Custom Critic Implementations

You can implement custom critics by extending the `CriticBase` class:

```python
from collections.abc import Sequence

from openhands.sdk.critic.base import CriticBase, CriticResult
from openhands.sdk.event import LLMConvertibleEvent


class CustomCritic(CriticBase):
    """Custom critic with your own evaluation logic."""

    def evaluate(
        self,
        events: Sequence[LLMConvertibleEvent],
        git_patch: str | None = None,
    ) -> CriticResult:
        # Your custom evaluation logic here
        # Example: simple heuristic-based evaluation
        score = 0.5  # Default neutral score

        if events:
            last_event = events[-1]
            # Add your custom logic
            if "error" in str(last_event).lower():
                score = 0.2
            elif "success" in str(last_event).lower():
                score = 0.9

        return CriticResult(
            score=score,
            message="Custom evaluation feedback",
        )
```
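A custom critic is wired into the agent the same way as `APIBasedCritic`. A brief usage sketch that reuses the `CustomCritic` class above; the LLM setup mirrors the full example earlier and assumes `LLM_API_KEY` is set:

```python
import os

from openhands.sdk import LLM, Agent, Conversation, Tool
from openhands.tools.terminal import TerminalTool

llm = LLM(
    model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
    api_key=os.environ["LLM_API_KEY"],  # assumes the variable is set
)

agent = Agent(
    llm=llm,
    tools=[Tool(name=TerminalTool.name)],
    critic=CustomCritic(),  # the custom critic defined above
)

conversation = Conversation(agent=agent, workspace=os.getcwd())
```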
## Examples of Other Critic Types

The SDK includes several built-in critics for different use cases:

### PassCritic

Always returns a perfect score (1.0). Useful for testing or disabling evaluation:

```python
from openhands.sdk.critic import PassCritic

critic = PassCritic()  # Always returns score=1.0
```

### AgentFinishedCritic

Only evaluates when the agent finishes with a `FinishAction`:

```python
from openhands.sdk.critic import AgentFinishedCritic

critic = AgentFinishedCritic()
```

### EmptyPatchCritic

Checks whether the agent produced any code changes (a git patch):

```python
from openhands.sdk.critic import EmptyPatchCritic

critic = EmptyPatchCritic()  # Returns 0.0 if the patch is empty, 1.0 otherwise
```

## Troubleshooting

### Critic Evaluations Not Appearing

- Verify the critic is properly configured and passed to the `Agent`
- Check that events match the evaluation mode (in `finish_and_message` mode, scores appear only on finish and message events)
- Ensure the critic server is accessible and responding

### High Latency

- Consider using `finish_and_message` mode instead of `all_actions`
- Check network latency to the critic server
- Optimize critic model inference speed on the server side

### Low Quality Scores

- The critic model may need fine-tuning for your specific use case
- Verify the critic is receiving sufficient conversation context
- Check that the chat template format is correctly applied

### API Authentication Errors

- Verify `CRITIC_API_KEY` is set correctly
- Check that the API key has not expired
- Ensure the server URL is correct and accessible

## Next Steps

- **[Observability](/sdk/guides/observability)** - Monitor and log agent behavior
- **[Metrics](/sdk/guides/metrics)** - Collect performance metrics
- **[Stuck Detector](/sdk/guides/agent-stuck-detector)** - Detect unproductive agent patterns
- **[Iterative Refinement](/sdk/guides/iterative-refinement)** - Improve agent outputs through iteration