diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 92df8d6..6bed171 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -56,7 +56,7 @@ Here are all valid language + template combinations: | typescript | anthropic-computer-use | ts-anthropic-cua | ts-anthropic-cua | Yes | ANTHROPIC_API_KEY | | typescript | magnitude | ts-magnitude | ts-magnitude | Yes | ANTHROPIC_API_KEY | | typescript | openai-computer-use | ts-openai-cua | ts-openai-cua | Yes | OPENAI_API_KEY | -| typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY, OPENAI_API_KEY | +| typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY | | python | sample-app | py-sample-app | python-basic | No | - | | python | captcha-solver | py-captcha-solver | python-captcha-solver | No | - | | python | browser-use | py-browser-use | python-bu | Yes | OPENAI_API_KEY | @@ -154,14 +154,11 @@ echo "OPENAI_API_KEY=" > .env cd .. ``` -**ts-gemini-cua** (needs GOOGLE_API_KEY and OPENAI_API_KEY): +**ts-gemini-cua** (needs GOOGLE_API_KEY): ```bash cd ts-gemini-cua -cat > .env << EOF -GOOGLE_API_KEY= -OPENAI_API_KEY= -EOF +echo "GOOGLE_API_KEY=" > .env ../bin/kernel deploy index.ts --env-file .env cd .. 
``` @@ -214,7 +211,7 @@ kernel invoke ts-stagehand teamsize-task --payload '{"company": "Kernel"}' kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}' kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipedia.org/wiki/Special:Random"}' kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' -kernel invoke ts-gemini-cua gemini-cua-task +kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? You are done successfully when the items are moved."}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -232,8 +229,6 @@ kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navig If the human agrees, invoke each template and collect results. Present findings in this format: ### Testing Guidelines - -- **Timeout:** Cancel each invocation after 90 seconds if it has not completed. Mark the status as `TIMEOUT` in the results table. - **Parallel execution:** You may run multiple invocations in parallel to speed up testing. - **Error handling:** Capture any runtime errors and include them in the Notes column. @@ -258,7 +253,6 @@ If the human agrees, invoke each template and collect results. Present findings Status values: - **SUCCESS**: App started and returned a result - **FAILED**: App encountered a runtime error -- **TIMEOUT**: App did not complete within 90 seconds (cancelled) Notes should include brief error messages for failures or confirmation of successful output. 
diff --git a/README.md b/README.md index 22bcf5d..4d73dc3 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Create an API key from the [Kernel dashboard](https://dashboard.onkernel.com). - `browser-use` - Template with Browser Use SDK (Python only) - `anthropic-computer-use` - Anthropic Computer Use prompt loop - `openai-computer-use` - OpenAI Computer Use Agent sample - - `gemini-computer-use` - Gemini Computer Use Agent sample (TypeScript only) + - `gemini-computer-use` - Implements a Gemini computer use agent (TypeScript only) - `openagi-computer-use` - OpenAGI Lux computer-use models (Python only) - `magnitude` - Magnitude framework sample (TypeScript only) diff --git a/pkg/templates/typescript/gemini-computer-use/.env.example b/pkg/templates/typescript/gemini-computer-use/.env.example index a2973a1..0bb1de3 100644 --- a/pkg/templates/typescript/gemini-computer-use/.env.example +++ b/pkg/templates/typescript/gemini-computer-use/.env.example @@ -1,3 +1,2 @@ # Copy this file to .env and fill in your API keys GOOGLE_API_KEY=your_google_api_key_here -OPENAI_API_KEY=your_openai_api_key_here diff --git a/pkg/templates/typescript/gemini-computer-use/README.md b/pkg/templates/typescript/gemini-computer-use/README.md index e6ae943..bce8930 100644 --- a/pkg/templates/typescript/gemini-computer-use/README.md +++ b/pkg/templates/typescript/gemini-computer-use/README.md @@ -4,14 +4,13 @@ A Kernel application that demonstrates Computer Use Agent (CUA) capabilities usi ## What It Does -This app uses [Gemini 2.5's computer use model](https://blog.google/technology/google-deepmind/gemini-computer-use-model/) capabilities to autonomously navigate websites and complete tasks. The example task searches for Kernel's company page on YCombinator and writes a blog post about their product. 
+This app uses [Gemini 2.5's computer use model](https://blog.google/technology/google-deepmind/gemini-computer-use-model/) capabilities to autonomously navigate websites and complete tasks. The agent can interact with web pages just like a human would - clicking, typing, scrolling, and extracting information. ## Setup 1. **Add your API keys as environment variables:** - `KERNEL_API_KEY` - Get from [Kernel dashboard](https://dashboard.onkernel.com/sign-in) - `GOOGLE_API_KEY` - Get from [Google AI Studio](https://aistudio.google.com/apikey) - - `OPENAI_API_KEY` - Get from [OpenAI platform](https://platform.openai.com/api-keys) ## Running Locally @@ -25,9 +24,10 @@ This runs the agent without a Kernel invocation context and provides the browser ## Deploying to Kernel -1. **Deploy the application:** +1. **Copy the example env file, add your API keys, and deploy:** ```bash - kernel deploy index.ts --env GOOGLE_API_KEY=XXX --env OPENAI_API_KEY=XXX + cp .env.example .env + kernel deploy index.ts --env-file .env ``` 2. **Invoke the action:** @@ -37,6 +37,30 @@ This runs the agent without a Kernel invocation context and provides the browser The action creates a Kernel-managed browser and associates it with the invocation for tracking and monitoring. +## Alternative Model Providers + +Stagehand's CUA agent supports multiple model providers. You can switch from Gemini to OpenAI or Anthropic by changing the model configuration in `index.ts` and redeploying your Kernel app: + +**OpenAI Computer Use:** +```typescript +model: { + modelName: "openai/computer-use-preview", + apiKey: process.env.OPENAI_API_KEY +} +``` + +**Anthropic Claude Sonnet:** +```typescript +model: { + modelName: "anthropic/claude-sonnet-4-20250514", + apiKey: process.env.ANTHROPIC_API_KEY +} +``` + +When using alternative providers, make sure to: +1. Add the corresponding API key to your environment variables +2. 
Update the deploy command to include the new API key (e.g., `--env OPENAI_API_KEY=XXX`) + ## Documentation - [Kernel Documentation](https://docs.onkernel.com/quickstart) diff --git a/pkg/templates/typescript/gemini-computer-use/index.ts b/pkg/templates/typescript/gemini-computer-use/index.ts index 2d6997f..eee4015 100644 --- a/pkg/templates/typescript/gemini-computer-use/index.ts +++ b/pkg/templates/typescript/gemini-computer-use/index.ts @@ -7,29 +7,32 @@ const kernel = new Kernel({ const app = kernel.app('ts-gemini-cua'); +interface CuaTaskInput { + startingUrl?: string; + instruction?: string; +} + interface SearchQueryOutput { success: boolean; result: string; error?: string; } -// API Keys for LLM providers +// API Key for LLM provider // - GOOGLE_API_KEY: Required for Gemini 2.5 Computer Use Agent -// - OPENAI_API_KEY: Required for Stagehand's GPT-4o model // Set via environment variables or `kernel deploy --env-file .env` // See https://docs.onkernel.com/launch/deploy#environment-variables const GOOGLE_API_KEY = process.env.GOOGLE_API_KEY; -const OPENAI_API_KEY = process.env.OPENAI_API_KEY; - -if (!OPENAI_API_KEY) { - throw new Error('OPENAI_API_KEY is not set'); -} if (!GOOGLE_API_KEY) { throw new Error('GOOGLE_API_KEY is not set'); } -async function runStagehandTask(invocationId?: string): Promise { +async function runStagehandTask( + invocationId?: string, + startingUrl: string = "https://www.magnitasks.com/", + instruction: string = "Click the Tasks option in the left-side bar, and move the 5 items in the 'To Do' and 'In Progress' items to the 'Done' section of the Kanban board? You are done successfully when the items are moved." 
+): Promise { // Executes a Computer Use Agent (CUA) task using Gemini 2.5 and Stagehand const browserOptions = { @@ -49,11 +52,7 @@ async function runStagehandTask(invocationId?: string): Promise( +app.action( 'gemini-cua-task', - async (ctx: KernelContext): Promise => { - return runStagehandTask(ctx.invocation_id); + async (ctx: KernelContext, payload?: CuaTaskInput): Promise => { + return runStagehandTask( + ctx.invocation_id, + payload?.startingUrl, + payload?.instruction + ); }, );