Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/generalagents/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

@dataclass
class Coordinate:
    """A screen position expressed as an (x, y) pair.

    Moving right increases ``x``; moving down increases ``y``.

    Attributes:
        x: Horizontal component of the position.
        y: Vertical component of the position.
    """

    x: int
    y: int

Expand Down Expand Up @@ -54,67 +59,89 @@ class Coordinate:

@dataclass
class ActionKeyPress:
    """Action that presses a combination of keyboard keys at the same time.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"key_press"``.
        keys: The key or keys to hold down together.
    """

    kind: Literal["key_press"]
    keys: list[KeyboardKey]


@dataclass
class ActionType:
    """Action that types a string of characters.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"type"``.
        text: The characters to type, in order.
    """

    kind: Literal["type"]
    text: str


@dataclass
class ActionLeftClick:
    """Action that performs a left mouse click at a given screen position.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"left_click"``.
        coordinate: Where on the screen the click happens.
    """

    kind: Literal["left_click"]
    coordinate: Coordinate


@dataclass
class ActionRightClick:
    """Action that performs a right mouse click at a given screen position.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"right_click"``.
        coordinate: Where on the screen the click happens.
    """

    kind: Literal["right_click"]
    coordinate: Coordinate


@dataclass
class ActionDoubleClick:
    """Action that double-clicks the left mouse button at a screen position.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"double_click"``.
        coordinate: Where on the screen the double click happens.
    """

    kind: Literal["double_click"]
    coordinate: Coordinate


@dataclass
class ActionTripleClick:
    """Action that triple-clicks the left mouse button at a screen position.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"triple_click"``.
        coordinate: Where on the screen the triple click happens.
    """

    kind: Literal["triple_click"]
    coordinate: Coordinate


@dataclass
class ActionMouseMove:
    """Action that repositions the mouse cursor without pressing any button.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"mouse_move"``.
        coordinate: The screen position the cursor is moved to.
    """

    kind: Literal["mouse_move"]
    coordinate: Coordinate


@dataclass
class ActionDrag:
    """Action that drags between two screen positions with the left button held.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"drag"``.
        drag_start: The position where the drag begins.
        drag_end: The position where the drag finishes.
    """

    kind: Literal["drag"]
    drag_start: Coordinate
    drag_end: Coordinate


@dataclass
class ActionScroll:
    """Action that scrolls the mouse wheel by a delta at a screen position.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"scroll"``.
        scroll_delta: Amount to scroll by. The sign convention and units
            are not defined in this class; check the executor.
        coordinate: The screen position at which the scroll is applied.
    """

    kind: Literal["scroll"]
    scroll_delta: int
    coordinate: Coordinate


@dataclass
class ActionWait:
    """Action that pauses so UI changes or animations can settle.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"wait"``.
    """

    kind: Literal["wait"]


@dataclass
class ActionStop:
    """Action signalling that the current task sequence is stopped or complete.

    Attributes:
        kind: Tag identifying this action variant; annotated as the
            literal ``"stop"``.
    """

    kind: Literal["stop"]


Expand Down
46 changes: 44 additions & 2 deletions src/generalagents/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,16 @@ def __init__(
temperature: float,
max_previous_actions: int,
):
""""""
"""A Session for interacting with the GeneralAgents API.

Args:
model: The model identifier to use for predictions.
api_key: The API key for authentication.
base_url: The base URL of the GeneralAgents API.
instruction: The instruction to guide the agent's behavior.
temperature: Sampling temperature for controlling randomness (0.0-1.0).
max_previous_actions: Maximum number of previous actions to include in context.
"""
self.model = model
self.instruction = instruction
self.max_previous_actions = max_previous_actions
Expand All @@ -36,6 +45,19 @@ def plan(
*,
allowed_action_kinds: list[ActionKind] | None = None,
) -> Action:
"""Plan the next action based on the current screen observation.

Args:
observation: Screenshot of the current screen state as a PIL Image.
allowed_action_kinds: Optional list of action kinds to restrict the model to.
If None, all action kinds are allowed.

Returns:
An Action object representing the next action to perform.

Raises:
httpx.HTTPStatusError: If the API request fails.
"""
buffer = BytesIO()
observation.save(buffer, format="WEBP")
image_url = f"data:image/webp;base64,{base64.b64encode(buffer.getvalue()).decode('utf8')}"
Expand Down Expand Up @@ -66,7 +88,19 @@ def __init__(
temperature: float = 0.3,
max_previous_actions: int = 20,
):
""""""
"""Initialize an Agent for computer control.

Args:
model: The model identifier to use for predictions.
api_key: The API key for authentication. Defaults to GENERALAGENTS_API_KEY
environment variable.
base_url: The base URL of the GeneralAgents API.
temperature: Sampling temperature for controlling randomness (0.0-1.0).
max_previous_actions: Maximum number of previous actions to include in context.

Raises:
ValueError: If no API key is provided and the environment variable is not set.
"""
if not api_key:
msg = (
"No API key provided, please set an environment variable "
Expand All @@ -80,6 +114,14 @@ def __init__(
self.max_previous_actions = max_previous_actions

def start(self, instruction: str) -> Session:
"""Start a new session with the specified instruction.

Args:
instruction: The instruction to guide the agent's behavior.

Returns:
A Session object configured with this agent's parameters.
"""
return Session(
self.model,
api_key=self.api_key,
Expand Down
19 changes: 18 additions & 1 deletion src/generalagents/macos/computer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,19 @@

class Computer:
def __init__(self, pause_after_action: float = 0.1, pause_for_wait: float = 0.1):
    """Store the pause durations and derive the normalised screen size.

    Args:
        pause_after_action: Seconds to pause once an action has been executed.
        pause_for_wait: Seconds to pause when a wait action is executed.
    """
    self.pause_after_action = pause_after_action
    self.pause_for_wait = pause_for_wait

    screen_width, screen_height = pyautogui.size()

    # pyautogui.size() may report scaled-down dimensions on high-DPI
    # (e.g. Retina) displays. Normalise by choosing a scale factor that
    # maps the longer screen edge onto 1200, then shrink both edges by it.
    longest_edge = max(screen_width, screen_height)
    self.scale_factor = Fraction(longest_edge, 1200)
    self.size = (
        round(screen_width / self.scale_factor),
        round(screen_height / self.scale_factor),
    )

Expand All @@ -44,7 +53,15 @@ def observe(self) -> Image.Image:
return Image.open(f.name).resize(self.size)

def execute(self, action: Action) -> Image.Image:
    """Run one control action, pause briefly, then capture the screen.

    Args:
        action: The action to carry out (e.g., mouse click, keyboard input).

    Returns:
        A screenshot taken after the action and the post-action pause,
        so the caller can see what the action changed.
    """
    self._execute_action(action)
    # Pause for pause_after_action seconds before the screenshot is taken.
    time.sleep(self.pause_after_action)
    screenshot = self.observe()
    return screenshot
Expand Down