diff --git a/README.md b/README.md index 13ee3ad..9a6d507 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ observation = computer.observe() for _ in range(25): # max actions action = session.plan(observation) + print(f"Executing: {action}") if action.kind == "stop": break observation = computer.execute(action) diff --git a/src/generalagents/action.py b/src/generalagents/action.py index 453968d..1c23c0b 100644 --- a/src/generalagents/action.py +++ b/src/generalagents/action.py @@ -105,6 +105,7 @@ class ActionDrag: class ActionScroll: kind: Literal["scroll"] scroll_delta: int + coordinate: Coordinate @dataclass diff --git a/src/generalagents/agent.py b/src/generalagents/agent.py index 20c26fb..ee3a06d 100644 --- a/src/generalagents/agent.py +++ b/src/generalagents/agent.py @@ -14,6 +14,7 @@ def __init__( self, model: str, api_key: str, + base_url: str, instruction: str, temperature: float, max_previous_actions: int, @@ -23,7 +24,7 @@ def __init__( self.instruction = instruction self.max_previous_actions = max_previous_actions self.client = httpx.Client( - base_url="https://api.generalagents.com", + base_url=base_url, headers={"Authorization": f"Bearer {api_key}"}, ) self.previous_actions = [] @@ -31,8 +32,8 @@ def __init__( def plan(self, observation: Image.Image) -> Action: buffer = BytesIO() - observation.save(buffer, format="PNG") - image_url = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf8')}" + observation.save(buffer, format="WEBP") + image_url = f"data:image/webp;base64,{base64.b64encode(buffer.getvalue()).decode('utf8')}" data = { "model": self.model, @@ -47,7 +48,6 @@ def plan(self, observation: Image.Image) -> Action: action = res.json()["action"] self.previous_actions.append(action) - print(f"Received action {action}") return cattrs.structure(action, Action) # pyright: ignore [reportArgumentType] https://peps.python.org/pep-0747 @@ -56,6 +56,7 @@ def __init__( self, model: str, api_key: str = os.getenv("GENERALAGENTS_API_KEY", ""), + base_url: str = "https://api.generalagents.com", temperature: float = 0.3, max_previous_actions: int = 20, ): @@ -68,6 +69,7 @@ def __init__( raise ValueError(msg) self.model = model self.api_key = api_key + self.base_url = base_url.rstrip("/") self.temperature = temperature self.max_previous_actions = max_previous_actions @@ -75,6 +77,7 @@ def start(self, instruction: str) -> Session: return Session( self.model, api_key=self.api_key, + base_url=self.base_url, instruction=instruction, temperature=self.temperature, max_previous_actions=self.max_previous_actions, diff --git a/src/generalagents/macos/computer.py b/src/generalagents/macos/computer.py index caa1ad0..cdd6e82 100644 --- a/src/generalagents/macos/computer.py +++ b/src/generalagents/macos/computer.py @@ -76,7 +76,8 @@ def _execute_action(self, action: Action) -> None: pyautogui.moveTo(*self._scaled(start)) pyautogui.dragTo(*self._scaled(end), duration=0.5) - case ActionScroll(kind="scroll", scroll_delta=delta): + case ActionScroll(kind="scroll", scroll_delta=delta, coordinate=coord): + pyautogui.moveTo(*self._scaled(coord)) pyautogui.scroll(float(delta * self.scale_factor)) case ActionWait(kind="wait"): diff --git a/tests/test_structure.py b/tests/test_structure.py index 7658bed..d9b012e 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -66,8 +66,8 @@ def test_structure(): assert action == cattrs.structure(dict_, Action) # pyright: ignore [reportArgumentType] # Test ActionScroll - dict_ = {"kind": "scroll", "scroll_delta": -100} - action = ActionScroll(kind="scroll", scroll_delta=-100) + dict_ = {"kind": "scroll", "scroll_delta": -100, "coordinate": {"x": 100, "y": 200}} + action = ActionScroll(kind="scroll", scroll_delta=-100, coordinate=Coordinate(x=100, y=200)) assert action == cattrs.structure(dict_, Action) # pyright: ignore [reportArgumentType] # Test ActionWait