diff --git a/src/generalagents/action.py b/src/generalagents/action.py index 1c23c0b..5ae0233 100644 --- a/src/generalagents/action.py +++ b/src/generalagents/action.py @@ -4,6 +4,11 @@ @dataclass class Coordinate: + """Represents a point on the screen with x and y coordinates. + + x increases rightwards and y increases downwards. + """ + x: int y: int @@ -54,48 +59,64 @@ class Coordinate: @dataclass class ActionKeyPress: + """Press one or more keyboard keys simultaneously.""" + kind: Literal["key_press"] keys: list[KeyboardKey] @dataclass class ActionType: + """Type a sequence of characters.""" + kind: Literal["type"] text: str @dataclass class ActionLeftClick: + """Click the left mouse button at a specific coordinate.""" + kind: Literal["left_click"] coordinate: Coordinate @dataclass class ActionRightClick: + """Click the right mouse button at a specific coordinate.""" + kind: Literal["right_click"] coordinate: Coordinate @dataclass class ActionDoubleClick: + """Double click the left mouse button at a specific coordinate.""" + kind: Literal["double_click"] coordinate: Coordinate @dataclass class ActionTripleClick: + """Triple click the left mouse button at a specific coordinate.""" + kind: Literal["triple_click"] coordinate: Coordinate @dataclass class ActionMouseMove: + """Move the mouse cursor to a specific coordinate without clicking.""" + kind: Literal["mouse_move"] coordinate: Coordinate @dataclass class ActionDrag: + """Drag the mouse from one coordinate to another while holding the left button.""" + kind: Literal["drag"] drag_start: Coordinate drag_end: Coordinate @@ -103,6 +124,8 @@ class ActionDrag: @dataclass class ActionScroll: + """Scroll the mouse wheel at a specific coordinate with the given delta.""" + kind: Literal["scroll"] scroll_delta: int coordinate: Coordinate @@ -110,11 +133,15 @@ class ActionScroll: @dataclass class ActionWait: + """Pause execution to wait for UI changes or animations.""" + kind: Literal["wait"] @dataclass class 
ActionStop: + """Stop or complete the current task sequence.""" + kind: Literal["stop"] diff --git a/src/generalagents/agent.py b/src/generalagents/agent.py index 3f17697..893b6de 100644 --- a/src/generalagents/agent.py +++ b/src/generalagents/agent.py @@ -19,7 +19,16 @@ def __init__( temperature: float, max_previous_actions: int, ): - """""" + """A Session for interacting with the GeneralAgents API. + + Args: + model: The model identifier to use for predictions. + api_key: The API key for authentication. + base_url: The base URL of the GeneralAgents API. + instruction: The instruction to guide the agent's behavior. + temperature: Sampling temperature for controlling randomness (0.0-1.0). + max_previous_actions: Maximum number of previous actions to include in context. + """ self.model = model self.instruction = instruction self.max_previous_actions = max_previous_actions @@ -36,6 +45,19 @@ def plan( *, allowed_action_kinds: list[ActionKind] | None = None, ) -> Action: + """Plan the next action based on the current screen observation. + + Args: + observation: Screenshot of the current screen state as a PIL Image. + allowed_action_kinds: Optional list of action kinds to restrict the model to. + If None, all action kinds are allowed. + + Returns: + An Action object representing the next action to perform. + + Raises: + httpx.HTTPStatusError: If the API request fails. + """ buffer = BytesIO() observation.save(buffer, format="WEBP") image_url = f"data:image/webp;base64,{base64.b64encode(buffer.getvalue()).decode('utf8')}" @@ -66,7 +88,19 @@ def __init__( temperature: float = 0.3, max_previous_actions: int = 20, ): - """""" + """Initialize an Agent for computer control. + + Args: + model: The model identifier to use for predictions. + api_key: The API key for authentication. Defaults to GENERALAGENTS_API_KEY + environment variable. + base_url: The base URL of the GeneralAgents API. + temperature: Sampling temperature for controlling randomness (0.0-1.0). 
+ max_previous_actions: Maximum number of previous actions to include in context. + + Raises: + ValueError: If no API key is provided and the environment variable is not set. + """ if not api_key: msg = ( "No API key provided, please set an environment variable " @@ -80,6 +114,14 @@ def __init__( self.max_previous_actions = max_previous_actions def start(self, instruction: str) -> Session: + """Start a new session with the specified instruction. + + Args: + instruction: The instruction to guide the agent's behavior. + + Returns: + A Session object configured with this agent's parameters. + """ return Session( self.model, api_key=self.api_key, diff --git a/src/generalagents/macos/computer.py b/src/generalagents/macos/computer.py index 7eebed0..ed69fa5 100644 --- a/src/generalagents/macos/computer.py +++ b/src/generalagents/macos/computer.py @@ -30,10 +30,19 @@ class Computer: def __init__(self, pause_after_action: float = 0.1, pause_for_wait: float = 0.1): + """A Computer interface for macOS control. + + Args: + pause_after_action: Time in seconds to wait after executing an action. + pause_for_wait: Time in seconds to wait when executing a wait action. + """ self.pause_after_action = pause_after_action self.pause_for_wait = pause_for_wait w, h = pyautogui.size() + + # On high-DPI displays (e.g. Retina), pyautogui.size() may return scaled-down dimensions. + # To standardize, we compute a scale factor that maps the longer screen dimension to 1200 and resize screenshots to the resulting size. self.scale_factor = Fraction(max(w, h), 1200) self.size = (round(w / self.scale_factor), round(h / self.scale_factor)) @@ -44,7 +53,15 @@ def observe(self) -> Image.Image: return Image.open(f.name).resize(self.size) def execute(self, action: Action) -> Image.Image: - """Execute a control action and observe the resulting state of the computer.""" + """Execute a control action and observe the resulting state of the computer. + + Args: + action: The action to execute (e.g., mouse click, keyboard input). 
+ + Returns: + A screenshot of the screen after the action has been performed, + allowing observation of the effect of the action. + """ self._execute_action(action) time.sleep(self.pause_after_action) return self.observe()