diff --git a/backend/agents/prompts.py b/backend/agents/prompts.py
index a601a21..8ebf50f 100644
--- a/backend/agents/prompts.py
+++ b/backend/agents/prompts.py
@@ -7,7 +7,7 @@ def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
     """Build optimized instructions for autonomous agent testing with concise STEP emissions."""
-    base_url = os.getenv("DEPLOYMENT_URL", "https://staging.example.com")
+    base_url = os.getenv("DEPLOYMENT_URL", "https://www.larris.me/")
 
     # Create concise, UI-friendly testing instructions
     instructions = f"""
@@ -22,7 +22,7 @@ def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
 - Major steps are human-meaningful actions: navigation, key clicks that change pages, form submissions, and verifications.
 - Exclude micro steps (mouse moves, small scrolls, key-by-key typing) unless they are the core action.
 - Exclude continuations from the last step (ex. "Continuing to scroll").
-- Keep each step under 7 words when possible.
+- Keep each step as simple as possible, ideally under 7 words.
 - After completing each test scenario, output exactly one line: "RESULT: PASSED" or "RESULT: FAILED".
 
 TESTING APPROACH:
@@ -46,7 +46,7 @@ def build_agent_instructions(tests: List[Dict], suite: Dict) -> str:
 TESTING GUIDELINES:
 - Be thorough, but keep communication to STEP lines only
 - Take screenshots at key moments for your own reasoning, but do not describe them
-- Test both happy paths and edge cases
+- YOU ARE ON AN EXTREME TIME CRUNCH: test as EFFICIENTLY as possible, which can mean reaching a PASS/FAIL verdict quickly instead of retrying the same action over and over.
 - Prioritize actions that meaningfully change app state or page
 
 FINAL VERDICT FORMAT (MANDATORY):
diff --git a/backend/agents/record.py b/backend/agents/record.py
index 1562d4d..62521db 100644
--- a/backend/agents/record.py
+++ b/backend/agents/record.py
@@ -213,15 +213,3 @@ def status():
     except Exception:
         running = False
     return {"ok": True, "running": running, "path": data.get("path"), "pid": pid, "fps": data.get("fps")}
-
-
-def _pid_alive(pid):
-    # Kept for compatibility if imported directly; not used by venv_exec paths
-    import os as _os
-    try:
-        _os.kill(pid, 0)
-        return True
-    except Exception:
-        return False
-
-
diff --git a/backend/agents/runner.py b/backend/agents/runner.py
index ebec2bd..0a37d58 100644
--- a/backend/agents/runner.py
+++ b/backend/agents/runner.py
@@ -32,14 +32,14 @@ class RunStatus(Enum):
 async def run_single_agent(spec: Dict[str, Any]) -> Dict[str, Any]:
     print(f"SPEC: {spec}")
     # Setup CUA agent
-    model = spec.get("model") or os.getenv("CUA_MODEL", "claude-sonnet-4-20250514")
+    model = spec.get("model") or os.getenv("CUA_MODEL", "claude-opus-4-1-20250805")  # was: claude-sonnet-4-20250514
     budget = spec.get("budget", 5.0)
     suite_id = spec.get("suite_id")
 
     # Setup CUA computer
     os_type = "linux"
     provider_type = "cloud"
-    container_name = spec.get("container_name") or os.getenv("CUA_CONTAINER_NAME")
+    container_name = spec.get("container_name")
     api_key = os.getenv("CUA_API_KEY")
     if not api_key:
         raise RuntimeError("CUA_API_KEY is required")
@@ -276,6 +276,19 @@ async def run_suites_for_result(result_id: int) -> Dict[str, Any]:
             "run_status": RunStatus.FAILED.value,
             "error": "No suites found for result"
         }
+
+    # Assign containers per suite (CUA_CONTAINER_1..4)
+    container_envs: List[str] = []
+    for i in range(1, 5):
+        val = os.getenv(f"CUA_CONTAINER_{i}")
+        if val:
+            container_envs.append(val)
+    if not container_envs:
+        raise RuntimeError("No CUA_CONTAINER_[1-4] variables configured")
+
+    for idx, spec in enumerate(specs):
+        assigned = container_envs[idx] if idx < len(container_envs) else container_envs[-1]
+        spec["container_name"] = assigned
 
     # Run each suite's tests concurrently
     tasks = [run_single_agent(spec) for spec in specs]
diff --git a/backend/cicd/DEPLOYMENT.md b/backend/cicd/DEPLOYMENT.md
deleted file mode 100644
index 851056e..0000000
--- a/backend/cicd/DEPLOYMENT.md
+++ /dev/null
@@ -1,102 +0,0 @@
-# QAI CI/CD Deployment Checklist
-
-## ✅ Ready to Test Once You Add Secrets
-
-Yes! The system is fully testable with a GitHub repo + PR once you add the required secrets.
-
-## Required GitHub Secrets
-
-Add these in your repo settings → Secrets and variables → Actions:
-
-1. **`OPENAI_API_KEY`** - Your OpenAI API key (starts with `sk-`)
-2. **`QAI_ENDPOINT`** - URL where your agent testing endpoint will be hosted
-3. **`GITHUB_TOKEN`** - ✅ Automatically provided by GitHub Actions
-
-## (Optional) Secrets
-
-- **`DEPLOYMENT_URL`** - Override the staging URL if needed
-- **`AGENT_TIMEOUT`** - Custom timeout in milliseconds (default: 300000)
-
-## Testing Locally
-
-1. Copy secrets to `.env`:
-```bash
-cd backend/cicd
-cp .env.example .env
-# Edit .env with your actual values
-```
-
-2. Install dependencies:
-```bash
-npm install
-```
-
-3. Test individual scripts:
-```bash
-# Requires a PR context (set GITHUB_REPOSITORY, etc.)
-node analyze-pr.js
-
-# Requires test-scenarios.json to exist
-node run-tests.js
-
-# Requires PR context
-node update-summary.js
-```
-
-## Integration Requirements
-
-### Agent Endpoint Format
-Your teammate's agent endpoint should:
-
-**Accept POST requests with:**
-```json
-{
-  "url": "https://staging-url.com",
-  "scenarios": [
-    {
-      "description": "Test login form",
-      "priority": "high",
-      "type": "ui",
-      "persona": "new_user",
-      "steps": ["Navigate to login", "Enter credentials", "Submit"]
-    }
-  ],
-  "timeout": 300000
-}
-```
-
-**Return response:**
-```json
-[
-  {
-    "scenario": { /* original scenario object */ },
-    "success": true,
-    "error": null,
-    "video_url": "https://s3.../recording.mp4",
-    "duration": 12500
-  }
-]
-```
-
-## How to Test
-
-1. **Create a test repo** with this CI/CD setup
-2. **Add the GitHub secrets**
-3. **Create a PR** with some code changes
-4. **Watch the action run** in Actions tab
-5. **Check outputs** in the action logs
-
-The workflow will:
-- ✅ Analyze your PR changes
-- ✅ Generate relevant test scenarios using LLM
-- ✅ Upload scenarios to database and call QAI API endpoint
-- ✅ Pass/fail the CI based on test results from the API
-- ✅ Update codebase summary if tests pass
-
-**Note:** The pipeline now calls the QAI API endpoint (`/run-suite`) instead of running agents locally in GitHub Actions. This means GitHub Actions only needs the `QAI_ENDPOINT` URL and doesn't require `CUA_API_KEY` or other agent-specific credentials.
-
-## Structured Output Benefits
-
-✅ **No JSON parsing failures** - Uses OpenAI's structured output with schema validation
-✅ **Guaranteed format** - Schema enforces required fields and types
-✅ **Type safety** - Enum values for priority/type prevent invalid data
\ No newline at end of file
diff --git a/backend/cicd/qai-pipeline.js b/backend/cicd/qai-pipeline.js
index d1efe64..64a72de 100644
--- a/backend/cicd/qai-pipeline.js
+++ b/backend/cicd/qai-pipeline.js
@@ -57,7 +57,13 @@ CHANGES: ${diff}
 
 Generate focused test scenarios for autonomous agents.
 
-For EACH scenario, also include a concise but rich summary (1-3 sentences) that gives the agent business context and the precise objective of the test. The summary should read like: "On <site>, you are testing <feature>; in this test, you <perform actions> to validate <expected behavior>".`
+Constraints and guidance:
+- Use at most 4 distinct test suites. Choose categories that best partition the behaviors changed by this PR (e.g., Authentication, Navigation, New About Page).
+- For EACH suite, prefer 2-3 high-value tests when meaningful, ideally E2E tests that a human would miss (think edge cases, race conditions, etc.). Aim for a total of ~6-10 scenarios overall, balancing coverage and noise.
+- Do NOT create trivial or duplicative scenarios. Avoid superficial variations (e.g., the same flow with only a color change). Deduplicate aggressively.
+- If there is truly only one meaningful area to test, produce at least 2 complementary tests for that same persona (e.g., happy path vs. a clear edge/error path) rather than a single scenario in total.
+
+For EACH scenario, also include a concise but rich summary (1-3 sentences) that gives the agent context and the precise objective so it can carry out the test efficiently. The summary could read like: "On <site>, you are testing <feature>; in this test, you <perform actions> to validate <expected behavior>".`
     }],
     response_format: {
       type: "json_schema",
@@ -87,7 +93,11 @@ For EACH scenario, also include a concise but rich summary (1-3 sentences) that
     }
   });
 
-  const parsedScenarios = completion.choices[0].message.parsed.scenarios;
+  let parsedScenarios = completion.choices[0].message.parsed.scenarios;
+  // Hard cap to 4 suites (personas) to match available containers
+  if (Array.isArray(parsedScenarios) && parsedScenarios.length > 4) {
+    parsedScenarios = parsedScenarios.slice(0, 4);
+  }
   const deploymentUrl = process.env.DEPLOYMENT_URL || 'the app';
   const scenarios = parsedScenarios.map(s => ({
     ...s,
@@ -191,10 +201,13 @@ For EACH scenario, also include a concise but rich summary (1-3 sentences) that
       groups[persona].push(scenario);
       return groups;
     }, {});
+    // Enforce max 4 suites (personas)
+    const limitedPersonas = Object.keys(personaGroups).slice(0, 4);
 
     // Create suite records (one per persona/agent)
     this.suiteIds = {};
-    for (const [persona, personaScenarios] of Object.entries(personaGroups)) {
+    for (const persona of limitedPersonas) {
+      const personaScenarios = personaGroups[persona];
       const suiteRecord = {
         result_id: this.resultId, // Foreign key to results table
         name: `${persona} Agent Suite`
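
For reference, here is a minimal standalone sketch of the per-suite container assignment that `run_suites_for_result` now performs, mirroring the runner.py hunk above. The `assign_containers` helper name and the simplified `specs` shape are illustrative assumptions, not part of the patch:

```python
# Sketch of the container assignment introduced in backend/agents/runner.py.
# Assumes CUA_CONTAINER_1..CUA_CONTAINER_4 are set in the environment.
import os
from typing import Any, Dict, List


def assign_containers(specs: List[Dict[str, Any]]) -> None:
    # Collect whichever of the four container env vars are configured.
    container_envs: List[str] = []
    for i in range(1, 5):
        val = os.getenv(f"CUA_CONTAINER_{i}")
        if val:
            container_envs.append(val)
    if not container_envs:
        raise RuntimeError("No CUA_CONTAINER_[1-4] variables configured")

    # One container per suite; extra suites reuse the last container rather
    # than failing, which pairs with the 4-suite cap in qai-pipeline.js.
    for idx, spec in enumerate(specs):
        spec["container_name"] = (
            container_envs[idx] if idx < len(container_envs) else container_envs[-1]
        )
```

With only `CUA_CONTAINER_1=c1` and `CUA_CONTAINER_2=c2` set, three suite specs would be assigned `c1`, `c2`, `c2`: an under-provisioned environment degrades to container sharing instead of raising.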