EntityProcess · christso · Nov 28, 2025 · Nov 29, 2025 · Nov 29, 2025
diff --git a/docs/examples/simple/README.md b/docs/examples/simple/README.md
@@ -68,3 +68,16 @@ simple/
 - **`python.instructions.md`**: Python coding guidelines
 - **`javascript.instructions.md`**: JavaScript coding guidelines
 - These instruction files can be referenced in eval files to provide context
+
+## Next Steps
+
+1. **Try running the examples**: Use `agentv eval evals/coding/example-eval.yaml`
+2. **Modify eval cases**: Experiment with your own test scenarios
+3. **Explore advanced examples**: See [../advanced/](../advanced/) for production patterns
+4. **Create your own evals**: Use these as templates for your domain
+
+## Related Documentation
+
+- [Advanced Examples](../advanced/README.md) - Production-ready scenarios and optimization
+- [AgentV Schema V2](../../features/schema-v2.md) - Full schema reference
+- [Local CLI Provider Guide](../../features/local-cli-provider.md) - Custom CLI integration
diff --git a/docs/examples/simple/evals/coding/example-eval.yaml b/docs/examples/simple/evals/coding/example-eval.yaml
@@ -2,7 +2,7 @@
 # Demonstrates schema features with real file references and minimal redundancy
 
 $schema: agentv-eval-v2
-description: Example showing basic features, conversation threading, multiple evaluators, and ACE optimization
+description: Example showing basic features, conversation threading, and multiple evaluators
 
 # File-level default target (optional)
 # Can be overridden per eval case via execution.target

diff --git a/docs/examples/simple/evals/incident-triage/incident-severity.yaml b/docs/examples/simple/evals/incident-triage/incident-severity.yaml
diff --git a/docs/examples/simple/evals/psychology/three-layer-listening.yaml b/docs/examples/simple/evals/psychology/three-layer-listening.yaml
@@ -0,0 +1,208 @@
+$schema: agentv-eval-v2
+description: >-
+  Comprehensive Evaluation Suite for Psychology Listening Skills.
+  TIER 1: Basic capability (Explicit Content & Emotion).
+  TIER 2: Advanced capability (Implicit Process, Cultural Subtext, & Resistance).
+target: default
+
+evalcases:
+  # ==============================================================================
+  # TIER 1: BASIC BENCHMARKS (Explicit Text)
+  # Objective: Ensure model can identify clearly stated facts and emotions.
+  # Failure Mode to Watch: "Over-analysis" (Hallucinating subtext where none exists).
+  # ==============================================================================
+
+  # Case 1: The Overwhelmed Wife (Adversarial: Passive Resignation)
+  - id: basic-overwhelmed-wife
+    outcome: >-
+      ZERO SCORE if output is not valid JSON with keys 'content', 'emotion', 'process'.
+      If valid JSON: Must identify 'Resentment' or 'Despair' in emotion (not just anger).
+      Must identify 'Testing the relationship' or 'Silent withdrawal' in process.
+      Must identify 'Imbalance of labor' in content.
+    execution:
+      evaluators:
+        - name: json_format_validator
+          type: code
+          script: uv run validate_json_format.py
+          cwd: ../../evaluators/scripts
+        - name: content_evaluator
+          type: llm_judge
+    input_messages:
+      - role: user
+        content: |-
+          Analyze the following client statement using the 3-Layer Listening method.
+
+          Client Statement:
+          "I used to fight with him about the chores. I used to scream when he played games while I cooked. Yesterday, I walked right past him. He was on his phone. The trash was overflowing. I didn't say a word. I just cooked dinner, ate with the kids, and went to bed. He didn't even look up. I wonder... if I stopped paying the internet bill, would he notice me then?"
+    expected_messages:
+      - role: assistant
+        content: |-
+          {
+            "content": "Client stopped verbally fighting about chores/neglect; observed husband's continued disengagement; considering passive-aggressive action (cutting internet).",
+            "emotion": "Resentment, Despair, Numbness/Detachment, Cold Anger.",
+            "process": "Shift from 'Pursuer' (screaming) to 'Distancer' (silence); Testing the relationship's viability; Using withdrawal as a final attempt to gain attention or preparing to detach completely."
+          }
+
+  # Case 2: The Traditional Mother (Adversarial: Guilt & Reframing)
+  - id: basic-traditional-mother
+    outcome: >-
+      ZERO SCORE if output is not valid JSON with keys 'content', 'emotion', 'process'.
+      If valid JSON: Must identify 'Financial exploitation' framed as 'Family values' in content.
+      Must identify 'Gaslighting' or 'Guilt-tripping' in process.
+      Must identify 'Daughter's boundaries' being violated in process.
+    execution:
+      evaluators:
+        - name: json_format_validator
+          type: code
+          script: uv run validate_json_format.py
+          cwd: ../../evaluators/scripts
+        - name: content_evaluator
+          type: llm_judge
+    input_messages:
+      - role: user
+        content: |-
+          Analyze the following client statement using the 3-Layer Listening method.
+
+          Client Statement:
+          "My daughter is doing so well, she saves so much money. Her brother is just starting out, he's struggling. As a family, we help each other, right? I simply suggested she help him with the down payment. It's an investment for the family's future. But she stopped talking to me! I don't understand. Isn't family about sharing? Why is she being so calculated and cold with her own flesh and blood?"
+    expected_messages:
+      - role: assistant
+        content: |-
+          {
+            "content": "Mother asked successful daughter to pay for struggling son's down payment; Daughter cut contact; Mother frames this as 'family help' and daughter as 'cold'.",
+            "emotion": "Confusion (feigned or genuine), Self-righteousness, Victimhood, Indignation.",
+            "process": "Manipulative Reframing: Defines 'financial exploitation' as 'family values'; Gaslighting: Labels daughter's boundary setting as 'being cold/calculated'; Enmeshment: Fails to see daughter as a separate entity with own rights."
+          }
+
+  # Case 3: The Returning Mother (Explicit Distress)
+  - id: basic-returning-mother
+    outcome: >-
+      ZERO SCORE if output is not valid JSON with keys 'content', 'emotion', 'process'.
+      If valid JSON: Must identify 'Motherhood to Workforce' in content.
+      Must identify 'Internalizing rejection as worthlessness' in process.
+      Must identify 'Validation and small wins' in process.
+    execution:
+      evaluators:
+        - name: json_format_validator
+          type: code
+          script: uv run validate_json_format.py
+          cwd: ../../evaluators/scripts
+        - name: content_evaluator
+          type: llm_judge
+    input_messages:
+      - role: user
+        content: |-
+          Analyze the following client statement using the 3-Layer Listening method.
+
+          Client Statement:
+          "I have a 3-year-old daughter who started kindergarten. I used to watch the child at home and felt useless, so I wanted to find a job. My husband supports me, but I sent many resumes with no response. I finally got an interview but didn't pass. I can't find a suitable job. I doubt myself—am I unable to do anything? I am in pain, unhappy every day, and don't know what to do."
+    expected_messages:
+      - role: assistant
+        content: |-
+          {
+            "content": "Transitioning from full-time Motherhood to Workforce; loss of previous role clarity.",
+            "emotion": "Pain, Unhappiness, Self-doubt, Anxiety, Confusion.",
+            "process": "Overgeneralization and Internalization: Views job rejection as proof of total personal incompetence ('unable to do anything'). Immediate need: Normalization of job market difficulties; Validation of her worth outside of employment; Establishing small, achievable goals."
+          }
+
+  # ==============================================================================
+  # TIER 2: ADVANCED BENCHMARKS (Implicit/Process)
+  # Objective: Test "Reasoning Gap." Model must infer unstated dynamics (Somatization, Triangulation, Resistance).
+  # Failure Mode to Watch: "Literalism" (Accepting face-value statements that contradict the subtext).
+  # ==============================================================================
+
+  # Case 4: The Alexithymic Executive (Adversarial: Denial of Distress)
+  # Challenge: Identify "Alexithymia" where the client denies emotion but reports symptoms.
+  - id: adv-somatic-mask-worker
+    outcome: >-
+      ZERO SCORE if output is not valid JSON with keys 'content', 'emotion', 'process'.
+      If valid JSON: Must identify 'Alexithymia' or 'Emotional detachment' in process.
+      Must identify 'Throat tightness' as 'Somatic Displacement' in process.
+      Must identify 'Denial of stress' in content.
+    execution:
+      evaluators:
+        - name: json_format_validator
+          type: code
+          script: uv run validate_json_format.py
+          cwd: ../../evaluators/scripts
+        - name: content_evaluator
+          type: llm_judge
+    input_messages:
+      - role: user
+        content: |-
+          Analyze the following client statement using the 3-Layer Listening method.
+
+          Client Statement:
+          "My life is perfect. Work is booming, just got a promotion. I'm thriving on the pressure. I just need you to recommend a specialist for this weird tightness in my throat. It's like a lump that won't go away. My wife keeps asking if I'm 'anxious', which is ridiculous. I don't do anxiety. I just have a throat issue."
+    expected_messages:
+      - role: assistant
+        content: |-
+          {
+            "content": "Reports 'perfect life' and career success; Denies anxiety/stress explicitly; Seeks medical fix for 'throat lump' (Globus sensation).",
+            "emotion": "Disconnected/Numb (Alexithymia), Defensive pride, Underlying panic (displaced to throat).",
+            "process": "Alexithymia: Inability to identify/verbalize feelings. Somatic Displacement: Stress is converted into physical symptoms (throat tightness) because emotional expression is viewed as 'ridiculous' or weak."
+          }
+
+  # Case 5: The Benevolent Gatekeeper (Adversarial: Dysfunctional Helping)
+  # Challenge: Identify that "Helping communicate" is actually "Preventing intimacy".
+  - id: adv-sandwich-husband
+    outcome: >-
+      ZERO SCORE if output is not valid JSON with keys 'content', 'emotion', 'process'.
+      If valid JSON: Must identify 'Gatekeeping' or 'Blocking direct communication' in process.
+      Must identify 'Distortion of reality' (translating criticism as care) in content.
+      Must identify 'Fear of conflict' in emotion.
+    execution:
+      evaluators:
+        - name: json_format_validator
+          type: code
+          script: uv run validate_json_format.py
+          cwd: ../../evaluators/scripts
+        - name: content_evaluator
+          type: llm_judge
+    input_messages:
+      - role: user
+        content: |-
+          Analyze the following client statement using the 3-Layer Listening method.
+
+          Client Statement:
+          "I'm the bridge in this family. When Mom says my wife's cooking is 'bland', I tell my wife that Mom is just worried about her salt intake. When my wife rolls her eyes at Mom's stories, I tell Mom she's just tired from work. I work so hard to translate for them so they don't get hurt. Why do I feel so exhausted if I'm doing such a good thing?"
+    expected_messages:
+      - role: assistant
+        content: |-
+          {
+            "content": "Client actively 'translates' (distorts) critical messages between wife and mother to prevent conflict; Reports exhaustion.",
+            "emotion": "Exhaustion, Anxiety, Fear of conflict, False pride in being the 'bridge'.",
+            "process": "Gatekeeping/Triangulation: By 'translating', the client prevents direct contact and authentic relationship (even conflict) between the women. He maintains the dysfunction by absorbing the tension, preventing resolution."
+          }
+
+  # Case 6: The Intellectualizer (Adversarial: Rationalization)
+  # Challenge: Identify "Intellectualization" where logic is used to avoid feeling/acting.
+  - id: adv-face-saving-resistance
+    outcome: >-
+      ZERO SCORE if output is not valid JSON with keys 'content', 'emotion', 'process'.
+      If valid JSON: Must identify 'Intellectualization' or 'Rationalization' in process.
+      Must identify 'Avoidance of vulnerability' in emotion.
+      Must identify 'Theoretical agreement vs. Practical avoidance' in content.
+    execution:
+      evaluators:
+        - name: json_format_validator
+          type: code
+          script: uv run validate_json_format.py
+          cwd: ../../evaluators/scripts
+        - name: content_evaluator
+          type: llm_judge
+    input_messages:
+      - role: user
+        content: |-
+          Analyze the following client statement using the 3-Layer Listening method (Context: Therapist suggested expressing anger to a parent).
+
+          Client Statement:
+          "I've read about this. It's the 'Empty Chair' technique, right? Gestalt therapy. I understand the theoretical mechanism—catharsis reduces cortisol levels. It's fascinating how the brain processes repressed anger. I'd love to discuss the efficacy rates of this method compared to CBT before we try it."
+    expected_messages:
+      - role: assistant
+        content: |-
+          {
+            "content": "Client discusses the theory/mechanism of the technique ('cortisol', 'Gestalt', 'efficacy rates') instead of doing it.",
+            "emotion": "Detachment, Superiority (intellectual), Fear of vulnerability/loss of control.",
+            "process": "Intellectualization: Moving the conversation to the 'head' (theory/data) to avoid the 'heart' (experiencing the anger). Using analysis as a shield against the therapeutic intervention."
+          }
diff --git a/docs/examples/simple/evaluators/scripts/validate_json_format.py b/docs/examples/simple/evaluators/scripts/validate_json_format.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+JSON Format Validator for AgentV
+Validates that the candidate answer is strictly valid JSON with required keys.
+Emits score 0.0 if the output is not a JSON object with all required keys, otherwise score 1.0.
+"""
+
+import json
+import sys
+from typing import Any
+
+
+def validate_json_format(candidate_answer: str, required_keys: list[str]) -> dict[str, Any]:
+    """
+    Validate that candidate_answer is a JSON object containing required_keys.
+
+    Args:
+        candidate_answer: The response to validate. A non-string value (e.g. None
+            from a null field) yields a failing result instead of raising.
+        required_keys: List of required top-level keys (e.g., ['content', 'emotion', 'process'])
+
+    Returns:
+        dict with 'passed' (bool), 'score' (0.0 or 1.0), and 'reasoning' (str) keys
+    """
+    def fail(reasoning: str) -> dict[str, Any]:
+        # Every rejection shares the same zero-score shape
+        return {"passed": False, "score": 0.0, "reasoning": reasoning}
+
+    # Guard before .strip(): None/dict/list would raise AttributeError, not score 0.0
+    if not isinstance(candidate_answer, str):
+        return fail(f"Output is not a string. Got: {type(candidate_answer).__name__}")
+
+    # Try to parse as JSON
+    try:
+        parsed = json.loads(candidate_answer.strip())
+    except json.JSONDecodeError as e:
+        return fail(f"Output is not valid JSON. Parse error: {str(e)}")
+
+    # Check if it's a dict (object)
+    if not isinstance(parsed, dict):
+        return fail(f"Output is valid JSON but not an object/dict. Got: {type(parsed).__name__}")
+
+    # Check for required keys
+    missing_keys = [key for key in required_keys if key not in parsed]
+    if missing_keys:
+        return fail(
+            f"Valid JSON but missing required keys: {', '.join(missing_keys)}. "
+            f"Has keys: {', '.join(parsed.keys())}"
+        )
+
+    # All checks passed
+    return {
+        "passed": True,
+        "score": 1.0,
+        "reasoning": f"Valid JSON with all required keys: {', '.join(required_keys)}"
+    }
+
+
+def main():
+    """Main entry point for AgentV code evaluator: eval JSON on stdin, result JSON on stdout."""
+    # AgentV passes eval data via stdin as JSON
+    try:
+        eval_data = json.load(sys.stdin)
+        if not isinstance(eval_data, dict):
+            # .get() below needs an object; a list/scalar payload is as unusable as bad JSON
+            raise ValueError(f"expected a JSON object, got {type(eval_data).__name__}")
+    except ValueError as e:  # json.JSONDecodeError subclasses ValueError
+        print(json.dumps({
+            "passed": False,
+            "score": 0.0,
+            "reasoning": f"Failed to parse input JSON: {str(e)}"
+        }))
+        sys.exit(1)
+
+    # Extract candidate answer; a missing or null field becomes "" (scored as invalid JSON)
+    candidate_answer = eval_data.get("candidate_answer")
+    candidate_answer = "" if candidate_answer is None else candidate_answer
+
+    # Validate against the default required keys for 3-layer listening
+    result = validate_json_format(candidate_answer, ["content", "emotion", "process"])
+    # Output result as JSON
+    print(json.dumps(result, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/...imple/optimizers/ace-code-generation.yaml → ...imple/optimizers/ace-incident-triage.yaml b/...imple/optimizers/ace-code-generation.yaml → ...imple/optimizers/ace-incident-triage.yaml
@@ -8,22 +8,17 @@ type: ace
 
 # Eval files to use for optimization
 # ACE will run these evals to measure prompt performance and guide improvements
+# Use eval files designed with explicit reasoning prompts for best ACE results
 eval_files:
-  - ../evals/example-eval.yaml
-  # - ../evals/code-generation-edge-cases.test.yaml
-  # - ../evals/code-review-security.test.yaml
+  - ../evals/incident-triage/incident-severity.yaml
 
 # ACE playbook configuration
 # Defines the optimization strategy and constraints
-playbook_path: ./playbooks/code-generation.json
+playbook_path: ./playbooks/incident-triage.json
 
 # Maximum optimization iterations across ALL eval cases
 max_epochs: 5
 
-# Number of analysis rounds per failed eval case
-# ACE analyzes failures and suggests prompt improvements
-max_reflector_rounds: 3
-
 # Allow ACE to create new sections in the prompt
 # When true, ACE can add new instructions/guidelines
 # When false, ACE can only modify existing content