From 657f71496777547c2f96fa1997324ca1a1e34ae5 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Thu, 3 Apr 2025 01:10:08 -0700 Subject: [PATCH 1/4] Doc strings added to: - agent.py - action.py - computer.py --- src/generalagents/action.py | 24 +++++++++++++++ src/generalagents/agent.py | 46 +++++++++++++++++++++++++++-- src/generalagents/macos/computer.py | 21 ++++++++++++- 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/src/generalagents/action.py b/src/generalagents/action.py index 1c23c0b..07c7633 100644 --- a/src/generalagents/action.py +++ b/src/generalagents/action.py @@ -4,6 +4,8 @@ @dataclass class Coordinate: + """Represents a point on the screen with x and y coordinates.""" + x: int y: int @@ -54,48 +56,64 @@ class Coordinate: @dataclass class ActionKeyPress: + """Represents pressing one or more keyboard keys simultaneously.""" + kind: Literal["key_press"] keys: list[KeyboardKey] @dataclass class ActionType: + """Represents typing a sequence of characters.""" + kind: Literal["type"] text: str @dataclass class ActionLeftClick: + """Represents a left mouse button click at a specific coordinate.""" + kind: Literal["left_click"] coordinate: Coordinate @dataclass class ActionRightClick: + """Represents a right mouse button click at a specific coordinate.""" + kind: Literal["right_click"] coordinate: Coordinate @dataclass class ActionDoubleClick: + """Represents a double click with the left mouse button at a specific coordinate.""" + kind: Literal["double_click"] coordinate: Coordinate @dataclass class ActionTripleClick: + """Represents a triple click with the left mouse button at a specific coordinate.""" + kind: Literal["triple_click"] coordinate: Coordinate @dataclass class ActionMouseMove: + """Represents moving the mouse cursor to a specific coordinate without clicking.""" + kind: Literal["mouse_move"] coordinate: Coordinate @dataclass class ActionDrag: + """Represents dragging the mouse from one coordinate to another while holding the left button.""" + kind: Literal["drag"] drag_start: Coordinate drag_end: Coordinate @@ -103,6 +121,8 @@ class ActionDrag: @dataclass class ActionScroll: + """Represents scrolling the mouse wheel at a specific coordinate with the given delta.""" + kind: Literal["scroll"] scroll_delta: int coordinate: Coordinate @@ -110,11 +130,15 @@ class ActionScroll: @dataclass class ActionWait: + """Represents a pause in execution to wait for UI changes or animations.""" + kind: Literal["wait"] @dataclass class ActionStop: + """Represents stopping or completing the current task sequence.""" + kind: Literal["stop"] diff --git a/src/generalagents/agent.py b/src/generalagents/agent.py index 3f17697..ec46831 100644 --- a/src/generalagents/agent.py +++ b/src/generalagents/agent.py @@ -19,7 +19,16 @@ def __init__( temperature: float, max_previous_actions: int, ): - """""" + """Initialize a Session for interacting with the GeneralAgents API. + + Args: + model: The model identifier to use for predictions. + api_key: The API key for authentication. + base_url: The base URL of the GeneralAgents API. + instruction: The instruction to guide the agent's behavior. + temperature: Sampling temperature for controlling randomness (0.0-1.0). + max_previous_actions: Maximum number of previous actions to include in context. + """ self.model = model self.instruction = instruction self.max_previous_actions = max_previous_actions @@ -36,6 +45,19 @@ def plan( *, allowed_action_kinds: list[ActionKind] | None = None, ) -> Action: + """Plan the next action based on the current screen observation. + + Args: + observation: Screenshot of the current screen state as a PIL Image. + allowed_action_kinds: Optional list of action kinds to restrict the model to. + If None, all action kinds are allowed. + + Returns: + An Action object representing the next action to perform. + + Raises: + httpx.HTTPStatusError: If the API request fails. + """ buffer = BytesIO() observation.save(buffer, format="WEBP") image_url = f"data:image/webp;base64,{base64.b64encode(buffer.getvalue()).decode('utf8')}" @@ -66,7 +88,19 @@ def __init__( temperature: float = 0.3, max_previous_actions: int = 20, ): - """""" + """Initialize an Agent for computer control. + + Args: + model: The model identifier to use for predictions. + api_key: The API key for authentication. Defaults to GENERALAGENTS_API_KEY + environment variable. + base_url: The base URL of the GeneralAgents API. + temperature: Sampling temperature for controlling randomness (0.0-1.0). + max_previous_actions: Maximum number of previous actions to include in context. + + Raises: + ValueError: If no API key is provided and the environment variable is not set. + """ if not api_key: msg = ( "No API key provided, please set an environment variable " @@ -80,6 +114,14 @@ def __init__( self.max_previous_actions = max_previous_actions def start(self, instruction: str) -> Session: + """Start a new session with the specified instruction. + + Args: + instruction: The instruction to guide the agent's behavior. + + Returns: + A Session object configured with this agent's parameters. + """ return Session( self.model, api_key=self.api_key, diff --git a/src/generalagents/macos/computer.py b/src/generalagents/macos/computer.py index cdd6e82..82eecc3 100644 --- a/src/generalagents/macos/computer.py +++ b/src/generalagents/macos/computer.py @@ -27,6 +27,17 @@ class Computer: def __init__(self, pause_after_action: float = 0.1, pause_for_wait: float = 0.1): + """Initialize a Computer interface for macOS control. + + Args: + pause_after_action: Time in seconds to wait after executing an action. + pause_for_wait: Time in seconds to wait when executing a wait action. + + Note: + The scale_factor is automatically calculated by dividing the screen size by 1200, + which helps normalize coordinates for different screen resolutions. This way, + actions specified for a 1200px reference screen can be scaled to the actual screen. + """ self.pause_after_action = pause_after_action self.pause_for_wait = pause_for_wait @@ -41,7 +52,15 @@ def observe(self) -> Image.Image: return Image.open(f.name).resize(self.size) def execute(self, action: Action) -> Image.Image: - """Execute a control action and observe the resulting state of the computer.""" + """Execute a control action and observe the resulting state of the computer. + + Args: + action: The action to execute (e.g., mouse click, keyboard input). + + Returns: + Image.Image: A screenshot of the screen after the action has been performed, + allowing observation of the effect of the action. + """ self._execute_action(action) time.sleep(self.pause_after_action) return self.observe() From 46991f0214ef90e5e8356a61db33f24c17cbbd09 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Wed, 9 Apr 2025 20:11:05 -0700 Subject: [PATCH 2/4] fixed PR comments --- src/generalagents/action.py | 27 +++++++++++++++------------ src/generalagents/macos/computer.py | 10 ++++------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/generalagents/action.py b/src/generalagents/action.py index 07c7633..e20008a 100644 --- a/src/generalagents/action.py +++ b/src/generalagents/action.py @@ -4,7 +4,10 @@ @dataclass class Coordinate: - """Represents a point on the screen with x and y coordinates.""" + """Represents a point on the screen with x and y coordinates. + + x is increasing rightwards and y is increasing downwards + """ x: int y: int @@ -56,7 +59,7 @@ class Coordinate: @dataclass class ActionKeyPress: - """Represents pressing one or more keyboard keys simultaneously.""" + """Press one or more keyboard keys simultaneously.""" kind: Literal["key_press"] keys: list[KeyboardKey] @@ -64,7 +67,7 @@ class ActionKeyPress: @dataclass class ActionType: - """Represents typing a sequence of characters.""" + """Type a sequence of characters.""" kind: Literal["type"] text: str @@ -72,7 +75,7 @@ class ActionType: @dataclass class ActionLeftClick: - """Represents a left mouse button click at a specific coordinate.""" + """Left click the mouse button at a specific coordinate.""" kind: Literal["left_click"] coordinate: Coordinate @@ -80,7 +83,7 @@ class ActionLeftClick: @dataclass class ActionRightClick: - """Represents a right mouse button click at a specific coordinate.""" + """Right click the mouse button at a specific coordinate.""" kind: Literal["right_click"] coordinate: Coordinate @@ -88,7 +91,7 @@ class ActionRightClick: @dataclass class ActionDoubleClick: - """Represents a double click with the left mouse button at a specific coordinate.""" + """Double click the left mouse button at a specific coordinate.""" kind: Literal["double_click"] coordinate: Coordinate @@ -96,7 +99,7 @@ class ActionDoubleClick: @dataclass class ActionTripleClick: - """Represents a triple click with the left mouse button at a specific coordinate.""" + """Triple click the left mouse button at a specific coordinate.""" kind: Literal["triple_click"] coordinate: Coordinate @@ -104,7 +107,7 @@ class ActionTripleClick: @dataclass class ActionMouseMove: - """Represents moving the mouse cursor to a specific coordinate without clicking.""" + """Move the mouse cursor to a specific coordinate without clicking.""" kind: Literal["mouse_move"] coordinate: Coordinate @@ -112,7 +115,7 @@ class ActionMouseMove: @dataclass class ActionDrag: - """Represents dragging the mouse from one coordinate to another while holding the left button.""" + """Drag the mouse from one coordinate to another while holding the left button.""" kind: Literal["drag"] drag_start: Coordinate @@ -121,7 +124,7 @@ class ActionDrag: @dataclass class ActionScroll: - """Represents scrolling the mouse wheel at a specific coordinate with the given delta.""" + """Scroll the mouse wheel at a specific coordinate with the given delta.""" kind: Literal["scroll"] scroll_delta: int @@ -130,14 +133,14 @@ class ActionScroll: @dataclass class ActionWait: - """Represents a pause in execution to wait for UI changes or animations.""" + """Pause execution to wait for UI changes or animations.""" kind: Literal["wait"] @dataclass class ActionStop: - """Represents stopping or completing the current task sequence.""" + """Stop or complete the current task sequence.""" kind: Literal["stop"] diff --git a/src/generalagents/macos/computer.py b/src/generalagents/macos/computer.py index c1f5326..fd05305 100644 --- a/src/generalagents/macos/computer.py +++ b/src/generalagents/macos/computer.py @@ -35,16 +35,14 @@ def __init__(self, pause_after_action: float = 0.1, pause_for_wait: float = 0.1) Args: pause_after_action: Time in seconds to wait after executing an action. pause_for_wait: Time in seconds to wait when executing a wait action. - - Note: - The scale_factor is automatically calculated by dividing the screen size by 1200, - which helps normalize coordinates for different screen resolutions. This way, - actions specified for a 1200px reference screen can be scaled to the actual screen. """ self.pause_after_action = pause_after_action self.pause_for_wait = pause_for_wait w, h = pyautogui.size() + + # On high-DPI displays (e.g. Retina), pyautogui.size() may return scaled-down dimensions. + # To standardize, we calculate a scale factor based on the maximum dimension and resize accordingly. self.scale_factor = Fraction(max(w, h), 1200) self.size = (round(w / self.scale_factor), round(h / self.scale_factor)) @@ -61,7 +59,7 @@ def execute(self, action: Action) -> Image.Image: action: The action to execute (e.g., mouse click, keyboard input). Returns: - Image.Image: A screenshot of the screen after the action has been performed, + A screenshot of the screen after the action has been performed, allowing observation of the effect of the action. """ self._execute_action(action) From 5e3c86e28ff04f6300561a867a140db6fb55eb1a Mon Sep 17 00:00:00 2001 From: Sebastian Date: Wed, 9 Apr 2025 20:13:17 -0700 Subject: [PATCH 3/4] initializer comment wording --- src/generalagents/agent.py | 2 +- src/generalagents/macos/computer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/generalagents/agent.py b/src/generalagents/agent.py index ec46831..893b6de 100644 --- a/src/generalagents/agent.py +++ b/src/generalagents/agent.py @@ -19,7 +19,7 @@ def __init__( temperature: float, max_previous_actions: int, ): - """Initialize a Session for interacting with the GeneralAgents API. + """A Session for interacting with the GeneralAgents API. Args: model: The model identifier to use for predictions. diff --git a/src/generalagents/macos/computer.py b/src/generalagents/macos/computer.py index fd05305..ed69fa5 100644 --- a/src/generalagents/macos/computer.py +++ b/src/generalagents/macos/computer.py @@ -30,7 +30,7 @@ class Computer: def __init__(self, pause_after_action: float = 0.1, pause_for_wait: float = 0.1): - """Initialize a Computer interface for macOS control. + """A Computer interface for macOS control. Args: pause_after_action: Time in seconds to wait after executing an action. From 4cf70fe5d9e6d0c99fcc4bdae03e738a0ead217c Mon Sep 17 00:00:00 2001 From: Sebastian Date: Thu, 10 Apr 2025 10:03:17 -0700 Subject: [PATCH 4/4] format --- src/generalagents/action.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/generalagents/action.py b/src/generalagents/action.py index e20008a..5ae0233 100644 --- a/src/generalagents/action.py +++ b/src/generalagents/action.py @@ -5,7 +5,7 @@ @dataclass class Coordinate: """Represents a point on the screen with x and y coordinates. - + x is increasing rightwards and y is increasing downwards """