Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions backend/agents/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
"""Build optimized instructions for autonomous agent testing with concise STEP emissions."""
base_url = os.getenv("DEPLOYMENT_URL", "https://staging.example.com")
base_url = os.getenv("DEPLOYMENT_URL", "https://www.larris.me/")

# Create concise, UI-friendly testing instructions
instructions = f"""
Expand All @@ -22,7 +22,7 @@ def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
- Major steps are human-meaningful actions: navigation, key clicks that change pages, form submissions, and verifications.
- Exclude micro steps (mouse moves, small scrolls, key-by-key typing) unless they are the core action.
- Exclude continuations from the last step (ex. "Continuing to scroll").
- Keep each step under 7 words when possible.
- Keep each step as simple as possible, under 7 words ideally.
- After completing each test scenario, output exactly one line: "RESULT: PASSED" or "RESULT: FAILED".

TESTING APPROACH:
Expand All @@ -46,7 +46,7 @@ def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
TESTING GUIDELINES:
- Be thorough, but keep communication to STEP lines only
- Take screenshots at key moments for your own reasoning, but do not describe them
- Test both happy paths and edge cases
- YOU ARE ON AN EXTREME TIME CRUNCH, test as EFFICIENTLY as possible, which could mean forming a conclusion PASS/FAIL faster instead of trying over and over again.
- Prioritize actions that meaningfully change app state or page

FINAL VERDICT FORMAT (MANDATORY):
Expand Down
12 changes: 0 additions & 12 deletions backend/agents/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,3 @@ def status():
except Exception:
running = False
return {"ok": True, "running": running, "path": data.get("path"), "pid": pid, "fps": data.get("fps")}


def _pid_alive(pid):
# Kept for compatibility if imported directly; not used by venv_exec paths
import os as _os
try:
_os.kill(pid, 0)
return True
except Exception:
return False


17 changes: 15 additions & 2 deletions backend/agents/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ class RunStatus(Enum):
async def run_single_agent(spec: Dict[str, Any]) -> Dict[str, Any]:
print(f"SPEC: {spec}")
# Setup CUA agent
model = spec.get("model") or os.getenv("CUA_MODEL", "claude-sonnet-4-20250514")
model = spec.get("model") or os.getenv("CUA_MODEL", "claude-opus-4-1-20250805") # claude-sonnet-4-20250514
budget = spec.get("budget", 5.0)
suite_id = spec.get("suite_id")

# Setup CUA computer
os_type = "linux"
provider_type = "cloud"
container_name = spec.get("container_name") or os.getenv("CUA_CONTAINER_NAME")
container_name = spec.get("container_name")
api_key = os.getenv("CUA_API_KEY")
if not api_key:
raise RuntimeError("CUA_API_KEY is required")
Expand Down Expand Up @@ -276,6 +276,19 @@ async def run_suites_for_result(result_id: int) -> Dict[str, Any]:
"run_status": RunStatus.FAILED.value,
"error": "No suites found for result"
}

# Assign containers per suite (CUA_CONTAINER_1..4)
container_envs: List[str] = []
for i in range(1, 5):
val = os.getenv(f"CUA_CONTAINER_{i}")
if val:
container_envs.append(val)
if not container_envs:
raise RuntimeError("No CUA_CONTAINER_[1-4] variables configured")

for idx, spec in enumerate(specs):
assigned = container_envs[idx] if idx < len(container_envs) else container_envs[-1]
spec["container_name"] = assigned

# Run each suite's tests concurrently
tasks = [run_single_agent(spec) for spec in specs]
Expand Down
102 changes: 0 additions & 102 deletions backend/cicd/DEPLOYMENT.md

This file was deleted.

19 changes: 16 additions & 3 deletions backend/cicd/qai-pipeline.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,13 @@ CHANGES: ${diff}

Generate focused test scenarios for autonomous agents.

For EACH scenario, also include a concise but rich summary (1-3 sentences) that gives the agent business context and the precise objective of the test. The summary should read like: "On <deployment url or app>, you are testing <feature or flow>; in this test, you <core action and intent> to validate <expected behavior/validation>".`
Constraints and guidance:
- Use at most 4 distinct test suites. Choose categories that best partition the behaviors changed by this PR (ex. Authentication, Navigation, New About Page, etc.).
- For EACH suite, prefer 2–3 high-value tests when meaningful, ideally E2E tests that a human would miss (think edge cases, race conditions, etc.). Aim for a total of ~6–10 scenarios overall, balancing coverage and noise.
- Do NOT create trivial or duplicative scenarios. Avoid superficial variations (e.g., same flow with only a color change). Deduplicate aggressively.
- If there is truly only one meaningful area to test, produce at least 2 complementary tests for that same persona (e.g., happy path vs clear edge/error path) rather than only one total scenario.

For EACH scenario, also include a concise but rich summary (1–3 sentences) that gives the agent context and the precise objective to carry out the test efficiently. The summary could read like: "On <deployment url>, you are testing <feature or flow>; in this test, you <core action and intent> to validate <expected behavior/validation>".`
}],
response_format: {
type: "json_schema",
Expand Down Expand Up @@ -87,7 +93,11 @@ For EACH scenario, also include a concise but rich summary (1-3 sentences) that
}
});

const parsedScenarios = completion.choices[0].message.parsed.scenarios;
let parsedScenarios = completion.choices[0].message.parsed.scenarios;
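// Hard cap: keep only the first 4 scenarios returned by the model.
// NOTE(review): this truncates individual scenarios, not suites (personas); the persona limit is applied later via limitedPersonas. Confirm this is intended, since the prompt asks for ~6–10 scenarios overall.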
if (Array.isArray(parsedScenarios) && parsedScenarios.length > 4) {
parsedScenarios = parsedScenarios.slice(0, 4);
}
const deploymentUrl = process.env.DEPLOYMENT_URL || 'the app';
const scenarios = parsedScenarios.map(s => ({
...s,
Expand Down Expand Up @@ -191,10 +201,13 @@ For EACH scenario, also include a concise but rich summary (1-3 sentences) that
groups[persona].push(scenario);
return groups;
}, {});
// Enforce max 4 suites (personas)
const limitedPersonas = Object.keys(personaGroups).slice(0, 4);

// Create suite records (one per persona/agent)
this.suiteIds = {};
for (const [persona, personaScenarios] of Object.entries(personaGroups)) {
for (const persona of limitedPersonas) {
const personaScenarios = personaGroups[persona];
const suiteRecord = {
result_id: this.resultId, // Foreign key to results table
name: `${persona} Agent Suite`
Expand Down
Loading