From 1b39ca5d52d4144141313451fda9e83f3cbe0333 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Sun, 21 Dec 2025 09:18:05 -0500 Subject: [PATCH 1/3] Update Gemini cua template to reflect stagehand v3 > Updated to stagehand v3 sdk conventions > Added ability to specify starting url + instructions > Removed openai api key requirement that was in stagehand v2 version --- .../gemini-computer-use/.env.example | 1 - .../typescript/gemini-computer-use/index.ts | 56 +++++++++---------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/pkg/templates/typescript/gemini-computer-use/.env.example b/pkg/templates/typescript/gemini-computer-use/.env.example index a2973a1..0bb1de3 100644 --- a/pkg/templates/typescript/gemini-computer-use/.env.example +++ b/pkg/templates/typescript/gemini-computer-use/.env.example @@ -1,3 +1,2 @@ # Copy this file to .env and fill in your API keys GOOGLE_API_KEY=your_google_api_key_here -OPENAI_API_KEY=your_openai_api_key_here diff --git a/pkg/templates/typescript/gemini-computer-use/index.ts b/pkg/templates/typescript/gemini-computer-use/index.ts index 2d6997f..eee4015 100644 --- a/pkg/templates/typescript/gemini-computer-use/index.ts +++ b/pkg/templates/typescript/gemini-computer-use/index.ts @@ -7,29 +7,32 @@ const kernel = new Kernel({ const app = kernel.app('ts-gemini-cua'); +interface CuaTaskInput { + startingUrl?: string; + instruction?: string; +} + interface SearchQueryOutput { success: boolean; result: string; error?: string; } -// API Keys for LLM providers +// API Key for LLM provider // - GOOGLE_API_KEY: Required for Gemini 2.5 Computer Use Agent -// - OPENAI_API_KEY: Required for Stagehand's GPT-4o model // Set via environment variables or `kernel deploy --env-file .env` // See https://docs.onkernel.com/launch/deploy#environment-variables const GOOGLE_API_KEY = process.env.GOOGLE_API_KEY; -const OPENAI_API_KEY = process.env.OPENAI_API_KEY; - -if (!OPENAI_API_KEY) { - throw new Error('OPENAI_API_KEY is not set'); -} if 
(!GOOGLE_API_KEY) { throw new Error('GOOGLE_API_KEY is not set'); } -async function runStagehandTask(invocationId?: string): Promise { +async function runStagehandTask( + invocationId?: string, + startingUrl: string = "https://www.magnitasks.com/", + instruction: string = "Click the Tasks option in the left-side bar, and move the 5 items in the 'To Do' and 'In Progress' items to the 'Done' section of the Kanban board? You are done successfully when the items are moved." +): Promise { // Executes a Computer Use Agent (CUA) task using Gemini 2.5 and Stagehand const browserOptions = { @@ -49,11 +52,7 @@ async function runStagehandTask(invocationId?: string): Promise( +app.action( 'gemini-cua-task', - async (ctx: KernelContext): Promise => { - return runStagehandTask(ctx.invocation_id); + async (ctx: KernelContext, payload?: CuaTaskInput): Promise => { + return runStagehandTask( + ctx.invocation_id, + payload?.startingUrl, + payload?.instruction + ); }, ); From af7a754260c9be18384efaefbf246fc2093947f7 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Sun, 21 Dec 2025 09:18:52 -0500 Subject: [PATCH 2/3] Update readmes Update readmes for the general app and also the specific template to reflect the new situation and also to provide some initial insight into how you can use other model providers. --- README.md | 2 +- .../typescript/gemini-computer-use/README.md | 32 ++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 22bcf5d..4d73dc3 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Create an API key from the [Kernel dashboard](https://dashboard.onkernel.com). 
- `browser-use` - Template with Browser Use SDK (Python only) - `anthropic-computer-use` - Anthropic Computer Use prompt loop - `openai-computer-use` - OpenAI Computer Use Agent sample - - `gemini-computer-use` - Gemini Computer Use Agent sample (TypeScript only) + - `gemini-computer-use` - Implements a Gemini computer use agent (TypeScript only) - `openagi-computer-use` - OpenAGI Lux computer-use models (Python only) - `magnitude` - Magnitude framework sample (TypeScript only) diff --git a/pkg/templates/typescript/gemini-computer-use/README.md b/pkg/templates/typescript/gemini-computer-use/README.md index e6ae943..bce8930 100644 --- a/pkg/templates/typescript/gemini-computer-use/README.md +++ b/pkg/templates/typescript/gemini-computer-use/README.md @@ -4,14 +4,13 @@ A Kernel application that demonstrates Computer Use Agent (CUA) capabilities usi ## What It Does -This app uses [Gemini 2.5's computer use model](https://blog.google/technology/google-deepmind/gemini-computer-use-model/) capabilities to autonomously navigate websites and complete tasks. The example task searches for Kernel's company page on YCombinator and writes a blog post about their product. +This app uses [Gemini 2.5's computer use model](https://blog.google/technology/google-deepmind/gemini-computer-use-model/) capabilities to autonomously navigate websites and complete tasks. The agent can interact with web pages just like a human would - clicking, typing, scrolling, and extracting information. ## Setup 1. **Add your API keys as environment variables:** - `KERNEL_API_KEY` - Get from [Kernel dashboard](https://dashboard.onkernel.com/sign-in) - `GOOGLE_API_KEY` - Get from [Google AI Studio](https://aistudio.google.com/apikey) - - `OPENAI_API_KEY` - Get from [OpenAI platform](https://platform.openai.com/api-keys) ## Running Locally @@ -25,9 +24,10 @@ This runs the agent without a Kernel invocation context and provides the browser ## Deploying to Kernel -1. **Deploy the application:** +1. 
**Copy the example env file, add your API keys, and deploy:** ```bash - kernel deploy index.ts --env GOOGLE_API_KEY=XXX --env OPENAI_API_KEY=XXX + cp .env.example .env + kernel deploy index.ts --env-file .env ``` 2. **Invoke the action:** @@ -37,6 +37,30 @@ This runs the agent without a Kernel invocation context and provides the browser The action creates a Kernel-managed browser and associates it with the invocation for tracking and monitoring. +## Alternative Model Providers + +Stagehand's CUA agent supports multiple model providers. You can switch from Gemini to OpenAI or Anthropic by changing the model configuration in `index.ts` and redeploying your Kernel app: + +**OpenAI Computer Use:** +```typescript +model: { + modelName: "openai/computer-use-preview", + apiKey: process.env.OPENAI_API_KEY +} +``` + +**Anthropic Claude Sonnet:** +```typescript +model: { + modelName: "anthropic/claude-sonnet-4-20250514", + apiKey: process.env.ANTHROPIC_API_KEY +} +``` + +When using alternative providers, make sure to: +1. Add the corresponding API key to your environment variables +2. 
Update the deploy command to include the new API key (e.g., `--env OPENAI_API_KEY=XXX`) + ## Documentation - [Kernel Documentation](https://docs.onkernel.com/quickstart) From 5678806a9348c97137a308392f32aa197fd7ca04 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Sun, 21 Dec 2025 09:46:31 -0500 Subject: [PATCH 3/3] Update qa.md Updated qa.md with the changes made to the gemini computer use template --- .cursor/commands/qa.md | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 92df8d6..6bed171 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -56,7 +56,7 @@ Here are all valid language + template combinations: | typescript | anthropic-computer-use | ts-anthropic-cua | ts-anthropic-cua | Yes | ANTHROPIC_API_KEY | | typescript | magnitude | ts-magnitude | ts-magnitude | Yes | ANTHROPIC_API_KEY | | typescript | openai-computer-use | ts-openai-cua | ts-openai-cua | Yes | OPENAI_API_KEY | -| typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY, OPENAI_API_KEY | +| typescript | gemini-computer-use | ts-gemini-cua | ts-gemini-cua | Yes | GOOGLE_API_KEY | | python | sample-app | py-sample-app | python-basic | No | - | | python | captcha-solver | py-captcha-solver | python-captcha-solver | No | - | | python | browser-use | py-browser-use | python-bu | Yes | OPENAI_API_KEY | @@ -154,14 +154,11 @@ echo "OPENAI_API_KEY=" > .env cd .. ``` -**ts-gemini-cua** (needs GOOGLE_API_KEY and OPENAI_API_KEY): +**ts-gemini-cua** (needs GOOGLE_API_KEY): ```bash cd ts-gemini-cua -cat > .env << EOF -GOOGLE_API_KEY= -OPENAI_API_KEY= -EOF +echo "GOOGLE_API_KEY=" > .env ../bin/kernel deploy index.ts --env-file .env cd .. 
``` @@ -214,7 +211,7 @@ kernel invoke ts-stagehand teamsize-task --payload '{"company": "Kernel"}' kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}' kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipedia.org/wiki/Special:Random"}' kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' -kernel invoke ts-gemini-cua gemini-cua-task +kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? You are done successfully when the items are moved."}' # Python apps kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' @@ -232,8 +229,6 @@ kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navig If the human agrees, invoke each template and collect results. Present findings in this format: ### Testing Guidelines - -- **Timeout:** Cancel each invocation after 90 seconds if it has not completed. Mark the status as `TIMEOUT` in the results table. - **Parallel execution:** You may run multiple invocations in parallel to speed up testing. - **Error handling:** Capture any runtime errors and include them in the Notes column. @@ -258,7 +253,6 @@ If the human agrees, invoke each template and collect results. Present findings Status values: - **SUCCESS**: App started and returned a result - **FAILED**: App encountered a runtime error -- **TIMEOUT**: App did not complete within 90 seconds (cancelled) Notes should include brief error messages for failures or confirmation of successful output.