From 4ff0c863abe5b35b4e82f38500310fa1ce0797ca Mon Sep 17 00:00:00 2001
From: Larris Xie <larris.xie@gmail.com>
Date: Sat, 13 Sep 2025 21:14:35 -0400
Subject: [PATCH 1/3] Small fixes

---
 backend/__init__.py         |  3 +++
 backend/agents/database.py  |  4 ++--
 backend/agents/run_suite.py | 45 -------------------------------------
 backend/agents/runner.py    |  6 +++--
 backend/tests/__init__.py   |  0
 backend/tests/agent_test.py | 14 +++++++-----
 6 files changed, 17 insertions(+), 55 deletions(-)
 create mode 100644 backend/__init__.py
 delete mode 100644 backend/agents/run_suite.py
 create mode 100644 backend/tests/__init__.py

diff --git a/backend/__init__.py b/backend/__init__.py
new file mode 100644
index 0000000..6e2b56f
--- /dev/null
+++ b/backend/__init__.py
@@ -0,0 +1,3 @@
+"""Backend package initializer."""
+
+
diff --git a/backend/agents/database.py b/backend/agents/database.py
index 2e49f3b..7a1831a 100644
--- a/backend/agents/database.py
+++ b/backend/agents/database.py
@@ -1,9 +1,9 @@
 from typing import Dict, Any, Optional
 from supabase import create_client
-from dotenv import load_dotenv
+from dotenv import load_dotenv, find_dotenv
 import os
 
-load_dotenv()
+load_dotenv(find_dotenv())
 SUPABASE_URL = os.getenv('SUPABASE_URL')
 SUPABASE_KEY = os.getenv('SUPABASE_KEY')
 supabase = create_client(SUPABASE_URL, SUPABASE_KEY) if SUPABASE_URL and SUPABASE_KEY else None
diff --git a/backend/agents/run_suite.py b/backend/agents/run_suite.py
deleted file mode 100644
index aba570e..0000000
--- a/backend/agents/run_suite.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-"""
-Bridge script to run agent suites from Node.js CICD pipeline
-This script is called by qai-pipeline.js to execute individual test suites
-"""
-import sys
-import asyncio
-from pathlib import Path
-
-# Add the agents directory to the Python path
-sys.path.append(str(Path(__file__).parent))
-
-from runner import run_qai_tests
-
-async def main():
-    if len(sys.argv) != 2:
-        print("Usage: python run_suite.py <suite_id>", file=sys.stderr)
-        sys.exit(1)
-    
-    try:
-        suite_id = int(sys.argv[1])
-        print(f"🚀 Starting agent execution for suite {suite_id}")
-        
-        # Run the agent with the database-backed suite
-        result = await run_qai_tests(suite_id=suite_id)
-        
-        print(f"✅ Agent execution completed for suite {suite_id}")
-        print(f"📊 Agent status: {result['agent_result']['status']}")
-        
-        # Exit with appropriate code
-        if result['agent_result']['status'] == 'success':
-            sys.exit(0)
-        else:
-            print(f"❌ Agent failed: {result['agent_result'].get('error', 'Unknown error')}", file=sys.stderr)
-            sys.exit(1)
-            
-    except ValueError:
-        print(f"Error: Invalid suite_id '{sys.argv[1]}'. Must be an integer.", file=sys.stderr)
-        sys.exit(1)
-    except Exception as e:
-        print(f"❌ Agent execution failed: {str(e)}", file=sys.stderr)
-        sys.exit(1)
-
-if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
diff --git a/backend/agents/runner.py b/backend/agents/runner.py
index 3f83c51..43cdb33 100644
--- a/backend/agents/runner.py
+++ b/backend/agents/runner.py
@@ -85,6 +85,8 @@ def _prepare_step_for_storage(item: Dict[str, Any]):
                 instructions=build_agent_instructions(tests, spec),
                 )
             
+            await computer.venv_install("recording_venv", [])
+            
             # Open the browser before starting agent steps
             try:
                 await computer.interface.left_click(536, 742)
@@ -107,7 +109,7 @@ def _prepare_step_for_storage(item: Dict[str, Any]):
                 # Start recording inside VM
                 try:
                     remote_dir = make_remote_recording_dir(suite_id, test_name)
-                    await computer.venv_exec("demo_venv", start_recording, output_dir=remote_dir, fps=5)
+                    await computer.venv_exec("recording_venv", start_recording, output_dir=remote_dir, fps=5)
                     print(f"[Agent {suite_id}] recording started for {test_name}")
                 except Exception as _e:
                     print(f"[Agent {suite_id}] recording start failed for {test_name}: {_e}")
@@ -145,7 +147,7 @@ def _prepare_step_for_storage(item: Dict[str, Any]):
                     
                     # Stop recording and get S3 URL
                     try:
-                        recording_stop = await computer.venv_exec("demo_venv", stop_recording)
+                        recording_stop = await computer.venv_exec("recording_venv", stop_recording)
                         if isinstance(recording_stop, dict):
                             upload = recording_stop.get("upload") or {}
                             resp = upload.get("response") or {}
diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/tests/agent_test.py b/backend/tests/agent_test.py
index a948852..5df3aa0 100644
--- a/backend/tests/agent_test.py
+++ b/backend/tests/agent_test.py
@@ -10,9 +10,9 @@ def main() -> None:
 
     specs = [
         {
-            "persona": "navigation",
+            "name": "navigation",
+            "suite_id": 1,
             "container_name": os.getenv("CUA_CONTAINER_NAME"),
-            "suite": "navigation",
             "tests": [
                 {
                     "name": "admissions-header",
@@ -42,9 +42,9 @@ def main() -> None:
             ],
         },
         {
-            "persona": "events",
-            "container_name": os.getenv("CUA_CONTAINER_NAME2") or os.getenv("CUA_CONTAINER_NAME"),
-            "suite": "events",
+            "name": "events",
+            "suite_id": 2,
+            "container_name": os.getenv("CUA_CONTAINER_NAME2"),
             "tests": [
                 {
                     "name": "first-event",
@@ -62,7 +62,9 @@ def main() -> None:
         },
     ]
 
-    summary = asyncio.run(run_agents(specs))
+    pr_name = os.getenv("PR_NAME", "local-dev")
+    pr_link = os.getenv("PR_LINK", "http://localhost")
+    summary = asyncio.run(run_agents(specs, pr_name, pr_link))
     print(summary)
 
 

From 9e80ce08e33437a029ee7ace59c71d022bb908a4 Mon Sep 17 00:00:00 2001
From: Larris Xie <larris.xie@gmail.com>
Date: Sat, 13 Sep 2025 22:28:44 -0400
Subject: [PATCH 2/3] Optimizes prompts

---
 backend/agents/prompts.py   | 57 ++++++++++++++++---------------------
 backend/agents/runner.py    | 33 +++++----------------
 backend/agents/utils.py     | 45 ++++++++++++++++++++++-------
 backend/tests/agent_test.py |  2 +-
 4 files changed, 67 insertions(+), 70 deletions(-)

diff --git a/backend/agents/prompts.py b/backend/agents/prompts.py
index 44b54f2..a601a21 100644
--- a/backend/agents/prompts.py
+++ b/backend/agents/prompts.py
@@ -4,60 +4,53 @@
 from dotenv import load_dotenv
 load_dotenv()
 
+
 def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
-	"""Build optimized instructions for autonomous agent testing"""
-	
-	base_url = os.getenv('DEPLOYMENT_URL', 'https://staging.example.com')
-	
-	# Create comprehensive testing instructions
+	"""Build optimized instructions for autonomous agent testing with concise STEP emissions."""
+	base_url = os.getenv("DEPLOYMENT_URL", "https://staging.example.com")
+
+	# Create concise, UI-friendly testing instructions
 	instructions = f"""
 You are an autonomous QA testing agent for web applications. Your goal is to thoroughly test the deployment at {base_url}.
 
 SUITE: {suite['name']}
 TOTAL TESTS: {len(tests)}
 
+COMMUNICATION RULES (MANDATORY):
+- Only output concise major steps in the exact format: "STEP: <short gerund phrase>".
+- Use action/gerund form ending with "-ing" (e.g., "Clicking Admissions", "Navigating to home").
+- Major steps are human-meaningful actions: navigation, key clicks that change pages, form submissions, and verifications.
+- Exclude micro steps (mouse moves, small scrolls, key-by-key typing) unless they are the core action.
+- Exclude continuations from the last step (ex. "Continuing to scroll").
+- Keep each step under 7 words when possible.
+- After completing each test scenario, output exactly one line: "RESULT: PASSED" or "RESULT: FAILED".
+
 TESTING APPROACH:
-1. Start by taking a screenshot to see the current state
-2. Navigate to the base URL: {base_url}
-3. Perform comprehensive exploratory testing
-4. Look for bugs, broken functionality, and edge cases
-5. Test user flows and interactions
-6. Pay special attention to recent changes that might have introduced issues
+1. Navigate to the base URL: {base_url}
+2. Execute each scenario's intent
+3. Verify the expected destination/state
+4. Document ONLY major actions using STEP lines in gerund form
 
 SPECIFIC TEST SCENARIOS:
 """
-	
+
 	for i, test in enumerate(tests, 1):
 		instructions += f"""
 {i}. {test['name']}
    Description: {test.get('summary', 'No description provided')}
    Priority: {'HIGH' if 'critical' in test.get('summary', '').lower() else 'MEDIUM'}
 """
-	
+
 	instructions += """
 
 TESTING GUIDELINES:
-- Be thorough and methodical in your approach
-- Take screenshots at key moments to document your findings
+- Be thorough, but keep communication to STEP lines only
+- Take screenshots at key moments for your own reasoning, but do not describe them
 - Test both happy paths and edge cases
-- Look for unexpected behaviors, errors, or broken functionality
-- Pay attention to UI/UX issues and usability problems
-- Test form submissions, navigation, and interactive elements
-- Check for responsive design and mobile compatibility if applicable
-- Document any bugs or issues you discover with clear descriptions
-
-SUCCESS CRITERIA:
-- Complete testing of all specified scenarios
-- Identify and document any bugs or issues found
-- Verify that core functionality works as expected
-- Provide clear feedback on the overall quality of the deployment
-
-Remember: You are looking for unexpected bugs and issues that developers might miss. Be creative in your testing approach and explore edge cases.
+- Prioritize actions that meaningfully change app state or page
 
 FINAL VERDICT FORMAT (MANDATORY):
-- After completing each test scenario, output exactly one line with no extra commentary:
-- RESULT: PASSED   (if the scenario executed successfully and no critical issues were found)
-- RESULT: FAILED   (if execution could not complete or a critical/blocking issue was found)
+- After each scenario, output exactly one line: RESULT: PASSED or RESULT: FAILED
 """
-	
+
 	return instructions
\ No newline at end of file
diff --git a/backend/agents/runner.py b/backend/agents/runner.py
index 43cdb33..29cd500 100644
--- a/backend/agents/runner.py
+++ b/backend/agents/runner.py
@@ -15,7 +15,7 @@
     set_suite_result_id,
 )
 from .prompts import build_agent_instructions
-from .utils import normalize_tests, make_remote_recording_dir, process_item
+from .utils import normalize_tests, make_remote_recording_dir, process_item, extract_major_steps
 from .record import start_recording, stop_recording
 
 class RunStatus(Enum):
@@ -29,7 +29,7 @@ class RunStatus(Enum):
 
 async def run_single_agent(spec: Dict[str, Any]) -> Dict[str, Any]:
     # Setup CUA agent
-    model = spec.get("model") or os.getenv("CUA_MODEL", "anthropic/claude-3-5-sonnet-20241022")
+    model = spec.get("model") or os.getenv("CUA_MODEL", "claude-sonnet-4-20250514")
     budget = spec.get("budget", 5.0)
     suite_id = spec.get("suite_id")
     
@@ -50,26 +50,7 @@ async def _execute() -> Dict[str, Any]:
         # Results from all tests from the suite
         suite_results: List[Dict[str, Any]] = []
         
-        def _prepare_step_for_storage(item: Dict[str, Any]):
-            t = item.get("type")
-            if t == "message":
-                try:
-                    content = item.get("content") or []
-                    for block in content:
-                        if isinstance(block, dict) and block.get("text"):
-                            return block["text"]
-                except Exception:
-                    return item
-            elif t in ("computer_call", "computer_call_output", "function_call", "function_call_output"):
-                pruned = dict(item)
-                if pruned.get("type") == "computer_call_output":
-                    output = pruned.get("output", {})
-                    if isinstance(output, dict) and "image_url" in output:
-                        output = dict(output)
-                        output["image_url"] = "[omitted]"
-                        pruned["output"] = output
-                return pruned
-            return item
+        
         
         async with Computer(
             os_type=os_type,
@@ -117,12 +98,12 @@ def _prepare_step_for_storage(item: Dict[str, Any]):
                 try:
                     async for result in agent.run(test_instructions):
                         for item in result.get("output", []):
-                            # Add agent's current step
+                            # Add agent's current condensed steps
                             test_agent_steps = process_item(item, suite_id, test_agent_steps)
-                            # Persist step immediately to DB
+                            # Persist condensed steps immediately to DB
                             if test_id is not None:
-                                step_payload = _prepare_step_for_storage(item)
-                                await append_test_step(test_id, step_payload)
+                                for step in extract_major_steps(item):
+                                    await append_test_step(test_id, step)
                             # Parse explicit verdict from agent message content
                             try:
                                 if isinstance(item, dict) and item.get("type") == "message":
diff --git a/backend/agents/utils.py b/backend/agents/utils.py
index d954d0f..fdb23ad 100644
--- a/backend/agents/utils.py
+++ b/backend/agents/utils.py
@@ -37,40 +37,63 @@ def make_remote_recording_dir(suite_id: str, test_name: str) -> str:
 
 def process_item(item: dict, suite_id: str, test_agent_steps: list[dict]) -> dict:
     item_type = item.get("type")
-                            
+    
     if item_type == "message":
         try:
             content = item.get("content") or []
             for block in content:
                 if isinstance(block, dict) and block.get("text"):
-                    print(f"[Agent {suite_id}] message: {block['text']}")
-                    test_agent_steps.append(block["text"])
+                    text = block["text"]
+                    for line in str(text).splitlines():
+                        candidate = line.strip()
+                        if candidate.upper().startswith("STEP:"):
+                            step_text = candidate.split(":", 1)[1].strip()
+                            step_text = step_text
+                            if step_text:
+                                print(f"[Agent {suite_id}] STEP: {step_text}")
+                                test_agent_steps.append(step_text)
         except Exception:
             pass
     
     elif item_type in ("computer_call", "computer_call_output", "function_call", "function_call_output"):
+        # Keep debug output, but do not append raw tool calls to steps
         pruned = dict(item)
         if pruned.get("type") == "computer_call_output":
             output = pruned.get("output", {})
             if isinstance(output, dict) and "image_url" in output:
-                output = dict(output)
-                output["image_url"] = "[omitted]"
-                pruned["output"] = output
                 print(f"[Agent {suite_id}] computer_call_output: screenshot captured")
-            
         elif pruned.get("type") == "computer_call":
             action = pruned.get("action", {}) or {}
             a_type = action.get("type", "unknown")
             a_args = {k: v for k, v in action.items() if k != "type"}
             print(f"[Agent {suite_id}] computer_call: {a_type}({a_args})")
-            
         elif pruned.get("type") == "function_call":
             fname = pruned.get("name", "<anon>")
             print(f"[Agent {suite_id}] function_call: {fname}")
-            
         elif pruned.get("type") == "function_call_output":
             print(f"[Agent {suite_id}] function_call_output: received")
             
-        test_agent_steps.append(pruned)
-            
     return test_agent_steps
+
+
+def extract_major_steps(item: dict) -> list[str]:
+    """Return the non-empty texts of the "STEP:" lines in a message item."""
+    steps: list[str] = []
+    try:
+        if item.get("type") != "message":
+            return steps
+        for block in item.get("content") or []:
+            if not (isinstance(block, dict) and block.get("text")):
+                continue
+            # Only "STEP: <text>" lines count; the marker is case-insensitive.
+            for line in str(block["text"]).splitlines():
+                candidate = line.strip()
+                if candidate.upper().startswith("STEP:"):
+                    step_text = candidate.split(":", 1)[1].strip()
+                    if step_text:
+                        steps.append(step_text)
+    except Exception:
+        # Best-effort parsing: keep whatever was collected before the failure.
+        pass
+    return steps
+
diff --git a/backend/tests/agent_test.py b/backend/tests/agent_test.py
index 5df3aa0..23677d6 100644
--- a/backend/tests/agent_test.py
+++ b/backend/tests/agent_test.py
@@ -20,7 +20,7 @@ def main() -> None:
                         {
                             "role": "user",
                             "content": (
-                                "Go to https://uwaterloo.ca. "
+                                "Go to http://myproject123.io. "
                                 "Click the header link labeled \"Admissions\". "
                                 "Confirm that the page you land on is related to Admissions."
                             ),

From 46408f1a63f9f7a06e237ad1578e1f31832a4811 Mon Sep 17 00:00:00 2001
From: Larris Xie <larris.xie@gmail.com>
Date: Sat, 13 Sep 2025 23:29:59 -0400
Subject: [PATCH 3/3] Run agent suites per result; remove CRUD endpoints from agents API

---
 backend/agents/database.py   |  61 +++++-
 backend/agents/main.py       | 368 +++++++++--------------------------
 backend/agents/runner.py     |  81 +++++++-
 backend/cicd/qai-pipeline.js |  85 ++------
 4 files changed, 235 insertions(+), 360 deletions(-)

diff --git a/backend/agents/database.py b/backend/agents/database.py
index fc7594e..f62a666 100644
--- a/backend/agents/database.py
+++ b/backend/agents/database.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, List
 from supabase import create_client
 from dotenv import load_dotenv, find_dotenv
 import os
@@ -149,3 +149,62 @@ async def get_result_id_for_suite(suite_id: int) -> Optional[int]:
 	except Exception as e:
 		print(f"[db] ❌ get_result_id_for_suite error: {str(e)}")
 		return None
+
+
+async def get_suites_with_tests_for_result(result_id: int) -> List[Dict[str, Any]]:
+	"""Fetch all suites (and their tests) for a given result_id, formatted for agent specs."""
+	try:
+		if not _has_client():
+			print(f"[db] Skipping get_suites_with_tests_for_result for result {result_id}: no client")
+			return []
+		# Fetch suites under the result
+		suites_resp = supabase.table('suites').select('id,name').eq('result_id', result_id).execute()
+		suites = suites_resp.data or []
+		specs: List[Dict[str, Any]] = []
+		for s in suites:
+			suite_id = s.get('id')
+			name = s.get('name', 'Untitled Suite')
+			if suite_id is None:
+				continue
+			# Fetch tests for this suite
+			tests_resp = supabase.table('tests').select('*').eq('suite_id', suite_id).execute()
+			tests = tests_resp.data or []
+			formatted_tests: List[Dict[str, Any]] = []
+			for t in tests:
+				formatted_tests.append({
+					'name': t.get('name', 'Untitled Test'),
+					'instructions': t.get('summary', '').split('\n') if t.get('summary') else ['Run basic test'],
+				})
+			specs.append({
+				'suite_id': suite_id,
+				'name': name,
+				'tests': formatted_tests,
+			})
+		return specs
+	except Exception as e:
+		print(f"[db] ❌ get_suites_with_tests_for_result error: {str(e)}")
+		return []
+
+
+async def get_result_basics(result_id: int) -> Optional[Dict[str, Any]]:
+	"""Fetch basic fields from results needed to run agents (pr_name, pr-link)."""
+	try:
+		if not _has_client():
+			return None
+		resp = supabase.table('results').select('id, pr_name, pr-link').eq('id', result_id).limit(1).execute()
+		if resp.data:
+			return resp.data[0]
+		return None
+	except Exception as e:
+		print(f"[db] ❌ get_result_basics error: {str(e)}")
+		return None
+
+
+async def update_result_fields(result_id: int, fields: Dict[str, Any]) -> None:
+	"""Generic update for a results row."""
+	try:
+		if not _has_client():
+			return
+		supabase.table('results').update(fields).eq('id', result_id).execute()
+	except Exception as e:
+		print(f"[db] ❌ update_result_fields error: {str(e)}")
diff --git a/backend/agents/main.py b/backend/agents/main.py
index a716318..30a451c 100644
--- a/backend/agents/main.py
+++ b/backend/agents/main.py
@@ -2,18 +2,11 @@
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import List, Dict, Any, Optional
-import asyncio
 import os
 from dotenv import load_dotenv
 
-from runner import run_single_agent, run_agents, run_qai_tests
+from runner import run_agents, run_qai_tests, run_suites_for_result
 from database import (
-    get_or_create_test,
-    create_result,
-    set_suite_result_id,
-    get_suite_with_tests,
-    update_test_fields,
-    append_test_step,
     _has_client
 )
 
@@ -74,257 +67,22 @@ class MultiAgentRunRequest(BaseModel):
     pr_name: str
     pr_link: str
 
+class RunResultRequest(BaseModel):
+    result_id: int
+
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
-    return {
+    db_connected = _has_client()
+    print(f"[API] Health check requested - Database connected: {db_connected}")
+    
+    response = {
         "status": "healthy",
-        "database_connected": _has_client(),
+        "database_connected": db_connected,
         "version": "1.0.0"
     }
-
-# Results endpoints (matching API.md)
-
-@app.post("/results")
-async def create_result_endpoint(request: CreateResultRequest):
-    """Create a new result (PR test run)"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        # Use the database function to create result
-        result_id = await create_result(request.prName, request.prLink, {}, "PENDING")
-        if result_id is None:
-            raise HTTPException(status_code=500, detail="Failed to create result")
-        
-        # Return response matching API.md format
-        return {
-            "success": True,
-            "message": "Result created successfully",
-            "data": {
-                "id": result_id,
-                "pr-link": request.prLink,
-                "pr-name": request.prName,
-                "res-success": False
-            }
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to create result: {str(e)}")
-
-@app.patch("/results/{result_id}")
-async def update_result_endpoint(result_id: int, request: UpdateResultRequest):
-    """Update result success status"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        response = supabase.table('results').update({
-            'res-success': request.resSuccess
-        }).eq('id', result_id).execute()
-        
-        if not response.data:
-            raise HTTPException(status_code=404, detail="Result not found")
-        
-        return {
-            "success": True,
-            "message": "Result updated successfully",
-            "data": response.data[0]
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to update result: {str(e)}")
-
-@app.get("/results")
-async def get_all_results():
-    """Get all results"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        response = supabase.table('results').select('*').order('created_at', desc=True).execute()
-        
-        return {
-            "success": True,
-            "data": response.data or []
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to fetch results: {str(e)}")
-
-# Suite endpoints
-
-@app.post("/suites")
-async def create_suite_endpoint(request: CreateSuiteRequest):
-    """Create a new test suite"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        suite_data = {
-            'result_id': request.resultId,
-            'name': request.name,
-            'suites-success': request.suitesSuccess
-        }
-        if request.s3Link:
-            suite_data['s3-link'] = request.s3Link
-        
-        response = supabase.table('suites').insert([suite_data]).execute()
-        
-        if not response.data:
-            raise HTTPException(status_code=500, detail="Failed to create suite")
-        
-        return {
-            "success": True,
-            "message": "Suite created successfully",
-            "data": response.data[0]
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to create suite: {str(e)}")
-
-@app.patch("/suites/{suite_id}")
-async def update_suite_endpoint(suite_id: int, request: UpdateSuiteRequest):
-    """Update suite success status and/or S3 link"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        update_data = {}
-        if request.suitesSuccess is not None:
-            update_data['suites-success'] = request.suitesSuccess
-        if request.s3Link is not None:
-            update_data['s3-link'] = request.s3Link
-        
-        if not update_data:
-            raise HTTPException(status_code=400, detail="No update data provided")
-        
-        response = supabase.table('suites').update(update_data).eq('id', suite_id).execute()
-        
-        if not response.data:
-            raise HTTPException(status_code=404, detail="Suite not found")
-        
-        return {
-            "success": True,
-            "message": "Suite updated successfully",
-            "data": response.data[0]
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to update suite: {str(e)}")
-
-@app.get("/results/{result_id}/suites")
-async def get_suites_for_result(result_id: int):
-    """Get suites for a specific result"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        response = supabase.table('suites').select('*').eq('result_id', result_id).order('created_at', desc=True).execute()
-        
-        return {
-            "success": True,
-            "data": response.data or []
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to fetch suites: {str(e)}")
-
-# Test endpoints
-
-@app.post("/tests")
-async def create_test_endpoint(request: CreateTestRequest):
-    """Create a new individual test"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        test_data = {
-            'suite_id': request.suiteId,
-            'name': request.name,
-            'summary': request.summary,
-            'test-success': request.testSuccess
-        }
-        
-        response = supabase.table('tests').insert([test_data]).execute()
-        
-        if not response.data:
-            raise HTTPException(status_code=500, detail="Failed to create test")
-        
-        return {
-            "success": True,
-            "message": "Test created successfully",
-            "data": response.data[0]
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to create test: {str(e)}")
-
-@app.patch("/tests/{test_id}")
-async def update_test_endpoint(test_id: int, request: UpdateTestRequest):
-    """Update test success status and/or summary"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        update_data = {}
-        if request.testSuccess is not None:
-            update_data['test-success'] = request.testSuccess
-        if request.summary is not None:
-            update_data['summary'] = request.summary
-        
-        if not update_data:
-            raise HTTPException(status_code=400, detail="No update data provided")
-        
-        response = supabase.table('tests').update(update_data).eq('id', test_id).execute()
-        
-        if not response.data:
-            raise HTTPException(status_code=404, detail="Test not found")
-        
-        return {
-            "success": True,
-            "message": "Test updated successfully",
-            "data": response.data[0]
-        }
-    except HTTPException:
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to update test: {str(e)}")
-
-@app.get("/suites/{suite_id}/tests")
-async def get_tests_for_suite(suite_id: int):
-    """Get tests for a specific suite"""
-    try:
-        if not _has_client():
-            raise HTTPException(status_code=500, detail="Database not configured")
-        
-        from database import supabase
-        
-        response = supabase.table('tests').select('*').eq('suite_id', suite_id).order('created_at', desc=True).execute()
-        
-        return {
-            "success": True,
-            "data": response.data or []
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to fetch tests: {str(e)}")
+    print(f"[API] Health check response: {response}")
+    return response
 
 # Agent execution endpoints
 
@@ -333,42 +91,48 @@ async def run_suite_endpoint(request: RunSuiteRequest):
     """
     Run a test suite by ID - This is the main endpoint called by the CICD pipeline
     """
+    suite_id = request.suite_id
+    print(f"[API] Starting suite execution for suite_id: {suite_id}")
+    print(f"[API] Request received at: {__import__('datetime').datetime.now().isoformat()}")
+    
     try:
-        result = await run_qai_tests(request.suite_id)
+        print(f"[API] Calling run_qai_tests for suite_id: {suite_id}")
+        result = await run_qai_tests(suite_id)
+        print(f"[API] run_qai_tests completed for suite_id: {suite_id}")
+        print(f"[API] Result status: {result.get('agent_result', {}).get('status', 'unknown')}")
         
         if result['agent_result']['status'] == 'success':
-            return {
+            print(f"[API] Suite {suite_id} executed successfully")
+            print(f"[API] Tests run: {result['agent_result'].get('tests_run', 0)}")
+            
+            response = {
                 "status": "success",
-                "message": f"Suite {request.suite_id} executed successfully",
+                "message": f"Suite {suite_id} executed successfully",
                 "data": result
             }
+            print(f"[API] Returning success response for suite_id: {suite_id}")
+            return response
         else:
+            error_msg = result['agent_result'].get('error', 'Unknown error')
+            print(f"[API] Suite execution failed for suite_id: {suite_id}")
+            print(f"[API] Error: {error_msg}")
             raise HTTPException(
                 status_code=500,
-                detail=f"Suite execution failed: {result['agent_result'].get('error', 'Unknown error')}"
+                detail=f"Suite execution failed: {error_msg}"
             )
-    except HTTPException:
+    except HTTPException as he:
+        print(f"[API] HTTPException raised for suite_id: {suite_id}")
+        print(f"[API] HTTPException detail: {he.detail}")
+        print(f"[API] HTTPException status_code: {he.status_code}")
         raise
     except Exception as e:
+        print(f"[API] Unexpected error for suite_id: {suite_id}")
+        print(f"[API] Exception type: {type(e).__name__}")
+        print(f"[API] Exception message: {str(e)}")
+        print(f"[API] Exception details: {e}")
         raise HTTPException(status_code=500, detail=f"Suite execution failed: {str(e)}")
-
-@app.post("/run-agent")
-async def run_agent_endpoint(request: AgentRunRequest):
-    """Run a single agent test (legacy endpoint)"""
-    try:
-        result = await run_single_agent(request.spec)
-        return {"status": "success", "result": result}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Agent execution failed: {str(e)}")
-
-@app.post("/run-agents")
-async def run_agents_endpoint(request: MultiAgentRunRequest):
-    """Run multiple agent tests (legacy endpoint)"""
-    try:
-        result = await run_agents(request.test_specs, request.pr_name, request.pr_link)
-        return {"status": "success", "result": result}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Agents execution failed: {str(e)}")
+    finally:
+        print(f"[API] Request completed for suite_id: {suite_id}")
 
 @app.get("/")
 async def root():
@@ -378,14 +142,56 @@ async def root():
         "version": "1.0.0",
         "endpoints": {
             "health": "/health",
-            "results": "/results",
-            "suites": "/suites", 
-            "tests": "/tests",
-            "run_suite": "/run-suite"
+            "run_suite": "/run-suite",
+            "run_result": "/run-result",
+            "run_agents": "/run-agents"
         }
     }
 
+
+@app.post("/run-result")
+async def run_result_endpoint(request: RunResultRequest):
+    """Run all suites and tests for a given result_id.
+
+    The HTTP call succeeds whenever the run finished; whether the tests
+    passed is reported in the summary's ``run_status``, not the envelope.
+    Raises HTTP 500 only when the run itself throws.
+    """
+    result_id = request.result_id
+    print(f"[API] Starting result execution for result_id: {result_id}")
+    try:
+        summary = await run_suites_for_result(result_id)
+        print(f"[API] Result {result_id} finished with run_status: {summary.get('run_status')}")
+    except Exception as e:
+        print(f"[API] run_result failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Result execution failed: {str(e)}")
+    # Every run_status maps to the same envelope, so no branching is needed.
+    return {
+        "status": "success",
+        "message": f"Result {result_id} executed",
+        "data": summary,
+    }
+
+
+@app.post("/run-agents")
+async def run_agents_endpoint(request: MultiAgentRunRequest):
+    """Execute the suites in ``request.test_specs`` with the PR name and link.
+
+    Any exception from the run is logged and surfaced as HTTP 500.
+    """
+    try:
+        outcome = await run_agents(request.test_specs, request.pr_name, request.pr_link)
+    except Exception as exc:
+        print(f"[API] run_agents failed: {exc}")
+        raise HTTPException(status_code=500, detail=f"Agents execution failed: {str(exc)}")
+    # Wrap the runner's summary in the status/message/data envelope.
+    return {"status": "success", "message": "Agents executed", "data": outcome}
+
 # For Vercel deployment
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "8000")))
\ No newline at end of file
+    port = int(os.getenv("PORT", "8000"))
+    print(f"[API] Starting QAI Agent Runner API on port {port}")
+    print(f"[API] Database connection status: {_has_client()}")
+    print(f"[API] CUA_API_KEY configured: {bool(os.getenv('CUA_API_KEY'))}")
+    uvicorn.run(app, host="0.0.0.0", port=port)
\ No newline at end of file
diff --git a/backend/agents/runner.py b/backend/agents/runner.py
index 93d7650..fd99596 100644
--- a/backend/agents/runner.py
+++ b/backend/agents/runner.py
@@ -13,10 +13,12 @@
     update_test_fields,
     create_result,
     set_suite_result_id,
+    get_suites_with_tests_for_result,
+    update_result_fields,
 )
-from .prompts import build_agent_instructions
-from .utils import normalize_tests, make_remote_recording_dir, process_item, extract_major_steps
-from .record import start_recording, stop_recording
+from prompts import build_agent_instructions
+from utils import normalize_tests, make_remote_recording_dir, process_item, extract_major_steps
+from record import start_recording, stop_recording
 
 class RunStatus(Enum):
     QUEUED = "QUEUED"
@@ -97,6 +99,7 @@ async def _execute() -> Dict[str, Any]:
                     
                 try:
                     async for result in agent.run(test_instructions):
+                        print(f"[agent] Streamed result for suite {suite_id}: {result}")
                         for item in result.get("output", []):
                             # Add agent's current condensed steps
                             test_agent_steps = process_item(item, suite_id, test_agent_steps)
@@ -169,22 +172,28 @@ async def run_qai_tests(suite_id: int) -> Dict[str, Any]:
     
     try:
         # Fetch suite and test data from database
+        print(f"[run_qai_tests] Fetching suite data for suite_id: {suite_id}")
         suite_data = await get_suite_with_tests(suite_id)
         if not suite_data:
+            print(f"[run_qai_tests] Suite {suite_id} not found in database")
             return {
                 'agent_result': {
                     'status': 'failed',
                     'error': f'Suite {suite_id} not found'
                 }
             }
+        print(f"[run_qai_tests] Suite data: {suite_data}")
+        
+        print(f"[run_qai_tests] Retrieved suite data: {suite_data.get('name', 'Unknown')} with {len(suite_data.get('tests', []))} tests")
         
         # Convert database format to agent spec format
         spec = {
             'suite_id': suite_id,
+            'name': suite_data.get('name'),  # Add suite name for build_agent_instructions
             'model': os.getenv("CUA_MODEL", "anthropic/claude-3-5-sonnet-20241022"),
             'budget': 5.0,
             'container_name': os.getenv("CUA_CONTAINER_NAME"),
-            'tests': suite_data.get('tests', [])
+            'tests': suite_data.get('tests') or []
         }
         
         # Run the agent
@@ -247,3 +256,67 @@ async def run_agents(test_specs: List[Dict[str, Any]], pr_name: str, pr_link: st
     }
     print(json.dumps(summary))
     return summary
+
+
+async def run_suites_for_result(result_id: int) -> Dict[str, Any]:
+    """
+    Fetch all suites/tests for a given result_id and run them together.
+    Updates the result row with overall summary and run_status (FAILED if any suite raises).
+    """
+    try:
+        # Load suite specs for this result
+        specs: List[Dict[str, Any]] = await get_suites_with_tests_for_result(result_id)
+        if not specs:
+            await update_result_fields(result_id, {"run_status": RunStatus.FAILED.value})
+            return {
+                "result_id": result_id,
+                "overall_result": {"passed_tests": 0, "failed_tests": 0, "total_tests": 0},
+                "run_status": RunStatus.FAILED.value,
+                "error": "No suites found for result"
+            }
+
+        # Run each suite's tests concurrently
+        tasks = [run_single_agent(spec) for spec in specs]
+        results: List[Any] = await asyncio.gather(*tasks, return_exceptions=True)
+
+        total_tests = 0
+        passed_tests = 0
+        crashed_suites = 0
+        for res in results:
+            if isinstance(res, Exception):
+                # Test count is unknown, but a crashed suite must never yield PASSED
+                crashed_suites += 1
+                print(f"[run_suites_for_result] Suite raised: {res!r}")
+                continue
+            outcomes = [bool(t.get("test_success")) for t in res]
+            total_tests += len(outcomes)
+            passed_tests += sum(outcomes)
+        failed_tests = total_tests - passed_tests
+
+        run_status = RunStatus.PASSED if total_tests > 0 and not (failed_tests or crashed_suites) else RunStatus.FAILED
+        overall_result = {
+            "passed_tests": passed_tests,
+            "failed_tests": failed_tests,
+            "total_tests": total_tests,
+        }
+
+        await update_result_fields(result_id, {
+            "overall_result": overall_result,
+            "run_status": run_status.value,
+        })
+
+        summary = {
+            "result_id": result_id,
+            "overall_result": overall_result,
+            "run_status": run_status.value,
+        }
+        print(json.dumps(summary))
+        return summary
+    except Exception as e:
+        await update_result_fields(result_id, {"run_status": RunStatus.FAILED.value})
+        return {
+            "result_id": result_id,
+            "overall_result": {"passed_tests": 0, "failed_tests": 0, "total_tests": 0},
+            "run_status": RunStatus.FAILED.value,
+            "error": str(e),
+        }
diff --git a/backend/cicd/qai-pipeline.js b/backend/cicd/qai-pipeline.js
index d8f80db..6ed576f 100644
--- a/backend/cicd/qai-pipeline.js
+++ b/backend/cicd/qai-pipeline.js
@@ -114,87 +114,24 @@ Generate focused test scenarios for autonomous agents.`
         throw new Error('No result_id available - database upload may have failed');
       }
       
-      // Get suites for this result to run agents
-      const { data: suites } = await this.supabase
-        .from('suites')
-        .select('id, name')
-        .eq('result_id', this.resultId);
-      
-      if (!suites || suites.length === 0) {
-        throw new Error('No suites found for this result');
-      }
-      
-      console.log(`🏃 Running ${suites.length} agent suites via API...`);
-      
-      const results = [];
+      // Call new single-shot endpoint to run all suites for this result
       const agentTimeout = parseInt(process.env.AGENT_TIMEOUT || '600000');
-      
-      for (const suite of suites) {
-        console.log(`🤖 Calling API for suite: ${suite.name} (ID: ${suite.id})`);
-        
-        try {
-          // Call the QAI API endpoint to run the suite
-          const response = await axios.post(
-            `${process.env.QAI_ENDPOINT}/run-suite`,
-            { suite_id: suite.id },
-            { 
-              timeout: agentTimeout + 60000, // API timeout + 1 minute buffer
-              headers: { 'Content-Type': 'application/json' }
-            }
-          );
-          
-          if (response.data.status === 'success') {
-            console.log(`✅ Suite ${suite.id} completed successfully via API`);
-            results.push({
-              suite_id: suite.id,
-              suite_name: suite.name,
-              success: true,
-              api_response: response.data
-            });
-          } else {
-            throw new Error(`API returned non-success status: ${response.data.status}`);
-          }
-          
-        } catch (error) {
-          console.error(`❌ Suite ${suite.id} API call failed: ${error.message}`);
-          
-          let errorMessage = error.message;
-          if (error.response) {
-            errorMessage = `HTTP ${error.response.status}: ${error.response.data?.detail || error.response.statusText}`;
-          } else if (error.code === 'ECONNREFUSED') {
-            errorMessage = 'Cannot connect to QAI API endpoint';
-          }
-          
-          results.push({
-            suite_id: suite.id,
-            suite_name: suite.name,
-            success: false,
-            error: errorMessage
-          });
+      console.log(`🏃 Calling /run-result for result_id=${this.resultId} ...`);
+      const response = await axios.post(
+        `${process.env.QAI_ENDPOINT}/run-result`,
+        { result_id: this.resultId },
+        {
+          timeout: agentTimeout + 60000,
+          headers: { 'Content-Type': 'application/json' }
         }
-      }
-      
-      console.log(`✅ Completed ${results.length} agent suite API calls`);
-      
-      this.saveFile('test-results.json', results);
+      );
 
-      const passed = results.filter(r => r.success).length;
-      const failed = results.length - passed;
-      
-      console.log(`📊 API Results: ${passed}/${results.length} suites passed`);
-      
-      if (failed > 0) {
-        console.log(`❌ Failed suites:`);
-        results.filter(r => !r.success).forEach(result => {
-          console.log(`   • ${result.suite_name}: ${result.error || 'Failed'}`);
-        });
+      if (response.data?.status !== 'success') {
+        throw new Error(`API returned non-success status: ${response.data?.status || 'unknown'}`);
       }
 
-      console.log(`💾 Database results updated by QAI API system`);
-      
       // Verify final database state
       const finalSuccess = await this.verifyFinalResults();
-      
       console.log(`::set-output name=success::${finalSuccess}`);
       return finalSuccess;
     } catch (error) {