From 85106b5c3bcac46159dcb5decafccfe4734831fe Mon Sep 17 00:00:00 2001 From: msmps <7691252+msmps@users.noreply.github.com> Date: Wed, 28 Jan 2026 21:04:59 +0000 Subject: [PATCH 1/2] feat: add UI element detection to snapshots Add element detection for identifying interactive UI elements in terminal screens. The system uses visual style segmentation and pattern classification to identify buttons, inputs, and toggles. Key components (crates/pilotty-core/src/elements/): - grid: maps vt100 screen to typed cells with style info - segment: clusters adjacent cells by visual style - classify: pattern-matches clusters to element types - Element: output type with kind, position, text, confidence Detection patterns: - Buttons: [OK], <Cancel>, inverse-video text, underlined text - Inputs: underscore runs (____), cursor-containing regions - Toggles: [x], [ ], (o), ( ) checkbox/radio patterns Classification priority ensures pattern recognition takes precedence over cursor position (cursor on button = focused button, not input). Elements include row/col for direct use with click command. Confidence scores indicate detection certainty (1.0 = high, 0.6 = low). Known limitation: ncurses apps that style brackets differently from interior text (e.g., dialog) may not detect buttons correctly. 
--- .gitignore | 1 + Cargo.lock | 1 + Cargo.toml | 1 + README.md | 75 +- crates/pilotty-cli/src/daemon/server.rs | 223 ++++- crates/pilotty-cli/src/daemon/session.rs | 54 +- crates/pilotty-cli/src/daemon/terminal.rs | 171 ++++ crates/pilotty-core/Cargo.toml | 1 + crates/pilotty-core/src/elements/classify.rs | 853 ++++++++++++++++++ crates/pilotty-core/src/elements/grid.rs | 149 +++ crates/pilotty-core/src/elements/mod.rs | 170 ++++ crates/pilotty-core/src/elements/segment.rs | 208 +++++ crates/pilotty-core/src/elements/style.rs | 126 +++ crates/pilotty-core/src/lib.rs | 27 +- crates/pilotty-core/src/protocol.rs | 10 +- crates/pilotty-core/src/snapshot.rs | 109 ++- skills/pilotty/SKILL.md | 204 ++++- .../pilotty/references/element-detection.md | 280 ++++++ .../pilotty/templates/dialog-interaction.sh | 95 +- skills/pilotty/templates/element-detection.sh | 145 +++ 20 files changed, 2795 insertions(+), 108 deletions(-) create mode 100644 crates/pilotty-core/src/elements/classify.rs create mode 100644 crates/pilotty-core/src/elements/grid.rs create mode 100644 crates/pilotty-core/src/elements/mod.rs create mode 100644 crates/pilotty-core/src/elements/segment.rs create mode 100644 crates/pilotty-core/src/elements/style.rs create mode 100644 skills/pilotty/references/element-detection.md create mode 100755 skills/pilotty/templates/element-detection.sh diff --git a/.gitignore b/.gitignore index cc1d69b..ebcf97a 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ target/ # Agent/AI tooling .opencode/ .claude/ +.agents/ # Internal docs (not for public repo) docs/ diff --git a/Cargo.lock b/Cargo.lock index 971c859..e8eecf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -491,6 +491,7 @@ version = "0.0.4" dependencies = [ "serde", "serde_json", + "unicode-width", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2e89f7c..7d6705d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ anyhow = "1" tracing = "0.1" tracing-subscriber = { version = "0.3", features = 
["env-filter"] } regex = "1" +unicode-width = "0.2" # System libc = "0.2" diff --git a/README.md b/README.md index 18d7dc8..7b4da51 100644 --- a/README.md +++ b/README.md @@ -150,11 +150,82 @@ The `snapshot` command returns structured data about the terminal screen: "snapshot_id": 42, "size": { "cols": 80, "rows": 24 }, "cursor": { "row": 5, "col": 10, "visible": true }, - "text": "... plain text content ..." + "text": "Options: [x] Enable [ ] Debug\nActions: [OK] [Cancel]", + "elements": [ + { "kind": "toggle", "row": 0, "col": 9, "width": 3, "text": "[x]", "confidence": 1.0, "checked": true }, + { "kind": "toggle", "row": 0, "col": 22, "width": 3, "text": "[ ]", "confidence": 1.0, "checked": false }, + { "kind": "button", "row": 1, "col": 9, "width": 4, "text": "[OK]", "confidence": 0.8 }, + { "kind": "button", "row": 1, "col": 14, "width": 8, "text": "[Cancel]", "confidence": 0.8 } + ], + "content_hash": 12345678901234567890 } ``` -Use the cursor position and text content to understand the screen state and navigate using keyboard commands (Tab, Enter, arrow keys) or click at specific coordinates. +## UI Elements (Contextual) + +pilotty automatically detects interactive UI elements in terminal applications. Elements provide **read-only context** to help understand UI structure, with position data (row, col) for use with the click command. + +**Use keyboard navigation (`pilotty key Tab`, `pilotty key Enter`, `pilotty type "text"`) for reliable TUI interaction** rather than element-based actions, as UI element detection depends on visual patterns that may disappear after interaction. 
+ +### Element Kinds + +| Kind | Detection Patterns | Confidence | +|------|-------------------|------------| +| **button** | Inverse video, `[OK]`, `<Cancel>` | 1.0 / 0.8 | +| **input** | Cursor position, `____` underscores | 1.0 / 0.6 | +| **toggle** | `[x]`, `[ ]`, `☑`, `☐` | 1.0 | + +### Element Fields + +| Field | Description | +|-------|-------------| +| `kind` | Element type: `button`, `input`, or `toggle` | +| `row` | Row position (0-based) | +| `col` | Column position (0-based) | +| `width` | Width in terminal cells | +| `text` | Text content of the element | +| `confidence` | Detection confidence (0.0-1.0) | +| `focused` | Whether element has focus (only present if true) | +| `checked` | Toggle state (only present for toggles) | + +### Change Detection + +The `content_hash` field enables screen change detection between snapshots: + +```bash +# Get initial snapshot +SNAP1=$(pilotty snapshot) +HASH1=$(echo "$SNAP1" | jq -r '.content_hash') + +# Perform some action +pilotty key Tab + +# Check if screen changed +SNAP2=$(pilotty snapshot) +HASH2=$(echo "$SNAP2" | jq -r '.content_hash') + +if [ "$HASH1" != "$HASH2" ]; then + echo "Screen content changed" +fi +``` + +### Workflow Example + +```bash +# 1. Spawn a TUI with dialog elements +pilotty spawn dialog --yesno "Continue?" 10 40 + +# 2. Wait for dialog to render +pilotty wait-for "Continue" + +# 3. Get snapshot with elements (for context) +pilotty snapshot | jq '.elements' +# Shows detected buttons, helps understand UI structure + +# 4. 
Navigate and interact with keyboard (reliable approach) +pilotty key Tab # Move to next element +pilotty key Enter # Activate selected element +``` ## Sessions diff --git a/crates/pilotty-cli/src/daemon/server.rs b/crates/pilotty-cli/src/daemon/server.rs index 10493dd..c0fb344 100644 --- a/crates/pilotty-cli/src/daemon/server.rs +++ b/crates/pilotty-cli/src/daemon/server.rs @@ -615,15 +615,18 @@ async fn handle_snapshot( Err(e) => return Response::error(request_id, e), }; + let format = format.unwrap_or(SnapshotFormat::Full); + + // Full format includes UI element detection + let with_elements = matches!(format, SnapshotFormat::Full); + // Get snapshot data (drains PTY output first) - let snapshot = match sessions.get_snapshot_data(&session_id).await { + let snapshot = match sessions.get_snapshot_data(&session_id, with_elements).await { Ok(data) => data, Err(e) => return Response::error(request_id, e), }; let (cursor_row, cursor_col) = snapshot.cursor_pos; - let format = format.unwrap_or(SnapshotFormat::Full); - match format { SnapshotFormat::Text => { // Format as plain text with cursor indicator @@ -637,9 +640,10 @@ async fn handle_snapshot( }, ) } - SnapshotFormat::Full | SnapshotFormat::Compact => { - // Build ScreenState JSON + SnapshotFormat::Full => { + // Full: text + elements + metadata + content_hash let snapshot_id = sessions.next_snapshot_id(); + let screen_state = ScreenState { snapshot_id, size: TerminalSize { @@ -651,11 +655,29 @@ async fn handle_snapshot( col: cursor_col, visible: snapshot.cursor_visible, }, - text: if format == SnapshotFormat::Full { - Some(snapshot.text) - } else { - None + text: Some(snapshot.text), + elements: snapshot.elements, + content_hash: snapshot.content_hash, + }; + Response::success(request_id, ResponseData::ScreenState(screen_state)) + } + SnapshotFormat::Compact => { + // Compact: metadata only, no text, elements, or hash + let snapshot_id = sessions.next_snapshot_id(); + let screen_state = ScreenState { + 
snapshot_id, + size: TerminalSize { + cols: snapshot.size.cols, + rows: snapshot.size.rows, }, + cursor: CursorState { + row: cursor_row, + col: cursor_col, + visible: snapshot.cursor_visible, + }, + text: None, + elements: None, + content_hash: None, }; Response::success(request_id, ResponseData::ScreenState(screen_state)) } @@ -1016,16 +1038,20 @@ async fn handle_wait_for( Err(e) => return Response::error(request_id, e), }; - // Compile regex if needed + // Compile regex if needed. + // Limit compiled pattern size to prevent slow compilation. let compiled_regex = if use_regex { - match regex::Regex::new(&pattern) { + match regex::RegexBuilder::new(&pattern) + .size_limit(256 * 1024) // 256KB compiled size limit + .build() + { Ok(r) => Some(r), Err(e) => { return Response::error( request_id, ApiError::invalid_input_with_suggestion( format!("Invalid regex pattern: {}", e), - "Check your regex syntax. Common issues: unescaped special chars, unbalanced parentheses.", + "Check your regex syntax. 
Common issues: unescaped special chars, unbalanced parentheses, or pattern too complex.", ), ); } @@ -1054,8 +1080,8 @@ async fn handle_wait_for( ); } - // Get current screen text - let snapshot = match sessions.get_snapshot_data(&session_id).await { + // Get current screen text (no elements needed for wait_for) + let snapshot = match sessions.get_snapshot_data(&session_id, false).await { Ok(data) => data, Err(e) => return Response::error(request_id, e), }; @@ -2347,4 +2373,173 @@ mod tests { let _ = std::fs::remove_file(&socket_path); let _ = std::fs::remove_file(&pid_path); } + + #[tokio::test] + async fn test_snapshot_with_elements() { + use pilotty_core::elements::ElementKind; + + let temp_dir = std::env::temp_dir(); + let socket_path = temp_dir.join(format!("pilotty-elem-{}.sock", std::process::id())); + let pid_path = socket_path.with_extension("pid"); + + let server = DaemonServer::bind_to(socket_path.clone(), pid_path.clone()) + .await + .expect("Failed to bind server"); + + let server_handle = tokio::spawn(async move { + let _ = timeout(Duration::from_secs(5), server.run()).await; + }); + + tokio::time::sleep(Duration::from_millis(50)).await; + + let stream = UnixStream::connect(&socket_path) + .await + .expect("Failed to connect"); + let (reader, mut writer) = stream.into_split(); + let mut reader = BufReader::new(reader); + + // Spawn a session with output containing detectable elements: + // - [OK] and [Cancel] → Buttons (bracket pattern, confidence 0.8) + // - [x] and [ ] → Toggles (checkbox pattern, confidence 1.0) + let spawn_request = Request { + id: "spawn-elem".to_string(), + command: Command::Spawn { + command: vec![ + "printf".to_string(), + "Options: [x] Enable [ ] Debug\nActions: [OK] [Cancel]\n".to_string(), + ], + session_name: Some("elem-test".to_string()), + cwd: None, + }, + }; + let request_json = serde_json::to_string(&spawn_request).unwrap(); + writer + .write_all(request_json.as_bytes()) + .await + .expect("write"); + 
writer.write_all(b"\n").await.expect("newline"); + writer.flush().await.expect("flush"); + + let mut response_line = String::new(); + timeout(Duration::from_secs(2), reader.read_line(&mut response_line)) + .await + .expect("timeout") + .expect("read"); + + // Give printf time to complete + tokio::time::sleep(Duration::from_millis(200)).await; + + // Request snapshot with Full format (includes elements) + let snap_request = Request { + id: "snap-elem".to_string(), + command: Command::Snapshot { + session: Some("elem-test".to_string()), + format: Some(SnapshotFormat::Full), + }, + }; + let snap_json = serde_json::to_string(&snap_request).unwrap(); + writer.write_all(snap_json.as_bytes()).await.expect("write"); + writer.write_all(b"\n").await.expect("newline"); + writer.flush().await.expect("flush"); + + response_line.clear(); + timeout(Duration::from_secs(2), reader.read_line(&mut response_line)) + .await + .expect("timeout") + .expect("read"); + + let snap_response: Response = + serde_json::from_str(&response_line).expect("parse snap response"); + assert!(snap_response.success, "Snapshot should succeed"); + + // Verify ScreenState with elements + if let Some(ResponseData::ScreenState(screen_state)) = snap_response.data { + // Full format includes text + assert!( + screen_state.text.is_some(), + "Full format should include text" + ); + + // Full format SHOULD include elements + assert!( + screen_state.elements.is_some(), + "Full format should include elements" + ); + + // Full format SHOULD include content_hash + assert!( + screen_state.content_hash.is_some(), + "Full format should include content_hash" + ); + + let elements = screen_state.elements.unwrap(); + + // Should detect at least the toggles (checkboxes are high confidence) + // [x] -> Toggle checked=true, [ ] -> Toggle checked=false + let toggles: Vec<_> = elements + .iter() + .filter(|e| e.kind == ElementKind::Toggle) + .collect(); + assert!( + toggles.len() >= 2, + "Should detect at least 2 toggles, found 
{}", + toggles.len() + ); + + // Verify toggle states + let checked_toggle = toggles.iter().find(|t| t.checked == Some(true)); + let unchecked_toggle = toggles.iter().find(|t| t.checked == Some(false)); + assert!( + checked_toggle.is_some(), + "Should have a checked toggle ([x])" + ); + assert!( + unchecked_toggle.is_some(), + "Should have an unchecked toggle ([ ])" + ); + + // Check toggle confidence is 1.0 (checkbox pattern) + for toggle in &toggles { + assert!( + (toggle.confidence - 1.0).abs() < f32::EPSILON, + "Toggle confidence should be 1.0, got {}", + toggle.confidence + ); + } + + // May also detect [OK] and [Cancel] as buttons + let buttons: Vec<_> = elements + .iter() + .filter(|e| e.kind == ElementKind::Button) + .collect(); + // Buttons have 0.8 confidence (bracket pattern) + for button in &buttons { + assert!( + (button.confidence - 0.8).abs() < f32::EPSILON, + "Button confidence should be 0.8, got {}", + button.confidence + ); + } + + // Verify JSON serialization is clean (check raw response) + // - Non-focused elements should NOT have "focused" in their JSON + // - Buttons should NOT have "checked" in their JSON + let raw_json = &response_line; + // Count occurrences of "focused" - should only appear for focused elements + let focused_count = raw_json.matches("\"focused\"").count(); + let elements_with_focus = elements.iter().filter(|e| e.focused).count(); + assert_eq!( + focused_count, elements_with_focus, + "JSON should only include 'focused' for focused elements" + ); + } else { + panic!( + "Expected ScreenState response data, got: {:?}", + snap_response.data + ); + } + + server_handle.abort(); + let _ = std::fs::remove_file(&socket_path); + } } diff --git a/crates/pilotty-cli/src/daemon/session.rs b/crates/pilotty-cli/src/daemon/session.rs index 9edcdae..8039be0 100644 --- a/crates/pilotty-cli/src/daemon/session.rs +++ b/crates/pilotty-cli/src/daemon/session.rs @@ -9,8 +9,11 @@ use chrono::{DateTime, Utc}; use tokio::sync::{Mutex, RwLock}; use 
tracing::{debug, info}; +use pilotty_core::elements::classify::{detect, ClassifyContext}; +use pilotty_core::elements::Element; use pilotty_core::error::ApiError; use pilotty_core::protocol::SessionInfo; +use pilotty_core::snapshot::compute_content_hash; use crate::daemon::pty::{AsyncPtyHandle, PtySession, TermSize}; use crate::daemon::terminal::TerminalEmulator; @@ -56,6 +59,11 @@ pub struct SnapshotData { pub cursor_pos: (u16, u16), pub cursor_visible: bool, pub size: TermSize, + /// Detected UI elements (computed on demand). + pub elements: Option<Vec<Element>>, + /// Hash of screen content for change detection. + /// Present when `with_elements=true`. + pub content_hash: Option<u64>, } /// An active PTY session. @@ -88,21 +96,6 @@ impl Session { } } - /// Get the plain text content of the terminal screen. - pub async fn get_text(&self) -> String { - self.terminal.lock().await.get_text() - } - - /// Get the cursor position (row, col) - 0-indexed. - pub async fn cursor_position(&self) -> (u16, u16) { - self.terminal.lock().await.cursor_position() - } - - /// Check if the cursor is visible. - pub async fn cursor_visible(&self) -> bool { - self.terminal.lock().await.cursor_visible() - } - /// Check if terminal is in application cursor mode. pub async fn application_cursor(&self) -> bool { self.terminal.lock().await.application_cursor() @@ -382,7 +375,14 @@ impl SessionManager { /// /// Uses a read lock on sessions since all operations use interior mutability, /// avoiding potential deadlocks from holding a write lock during I/O. - pub async fn get_snapshot_data(&self, id: &SessionId) -> Result<SnapshotData, ApiError> { + /// + /// If `with_elements` is true, element detection runs to identify + /// UI elements like buttons, inputs, and toggles. 
+ pub async fn get_snapshot_data( + &self, + id: &SessionId, + with_elements: bool, + ) -> Result<SnapshotData, ApiError> { let sessions = self.sessions.read().await; let session = sessions .get(id) @@ -391,17 +391,33 @@ impl SessionManager { // Drain pending PTY output to update terminal state session.drain_pty_output().await; + // Lock terminal once for all reads + let terminal = session.terminal.lock().await; + // Get snapshot data - let text = session.get_text().await; - let cursor_pos = session.cursor_position().await; - let cursor_visible = session.cursor_visible().await; + let text = terminal.get_text(); + let cursor_pos = terminal.cursor_position(); + let cursor_visible = terminal.cursor_visible(); let size = session.size; + // Detect UI elements and compute content hash if requested + let (elements, content_hash) = if with_elements { + let (cursor_row, cursor_col) = cursor_pos; + let ctx = ClassifyContext::new().with_cursor(cursor_row, cursor_col); + let elems = detect(&*terminal, &ctx); + let hash = compute_content_hash(&text); + (Some(elems), Some(hash)) + } else { + (None, None) + }; + Ok(SnapshotData { text, cursor_pos, cursor_visible, size, + elements, + content_hash, }) } diff --git a/crates/pilotty-cli/src/daemon/terminal.rs b/crates/pilotty-cli/src/daemon/terminal.rs index af84669..f925c02 100644 --- a/crates/pilotty-cli/src/daemon/terminal.rs +++ b/crates/pilotty-cli/src/daemon/terminal.rs @@ -4,6 +4,8 @@ //! that can parse ANSI escape sequences from PTY output. use crate::daemon::pty::TermSize; +use pilotty_core::elements::grid::{ScreenCell, ScreenGrid}; +use pilotty_core::elements::style::{CellStyle, Color}; /// Terminal emulator that parses ANSI escape sequences. /// @@ -91,6 +93,49 @@ impl TerminalEmulator { } } +/// Convert vt100 color to core Color type. 
+fn convert_color(vt_color: vt100::Color) -> Color { + match vt_color { + vt100::Color::Default => Color::Default, + vt100::Color::Idx(idx) => Color::Indexed { index: idx }, + vt100::Color::Rgb(r, g, b) => Color::Rgb { r, g, b }, + } +} + +/// Convert vt100 cell to core ScreenCell. +fn convert_cell(vt_cell: &vt100::Cell) -> ScreenCell { + // Get the character from the cell contents + // vt100::Cell::contents() returns a String (may be empty for wide char continuations) + let contents = vt_cell.contents(); + let ch = contents.chars().next().unwrap_or(' '); + + let style = CellStyle { + bold: vt_cell.bold(), + underline: vt_cell.underline(), + inverse: vt_cell.inverse(), + fg_color: convert_color(vt_cell.fgcolor()), + bg_color: convert_color(vt_cell.bgcolor()), + }; + + ScreenCell::new(ch, style) +} + +impl ScreenGrid for TerminalEmulator { + fn rows(&self) -> u16 { + let (rows, _cols) = self.parser.screen().size(); + rows + } + + fn cols(&self) -> u16 { + let (_rows, cols) = self.parser.screen().size(); + cols + } + + fn cell(&self, row: u16, col: u16) -> Option<ScreenCell> { + self.parser.screen().cell(row, col).map(convert_cell) + } +} + #[cfg(test)] mod tests { use super::*; @@ -490,4 +535,130 @@ mod tests { "Should be normal mode after ESC[?1l" ); } + + // ScreenGrid implementation tests + + #[test] + fn test_screen_grid_dimensions() { + let term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + + assert_eq!(ScreenGrid::rows(&term), 24); + assert_eq!(ScreenGrid::cols(&term), 80); + } + + #[test] + fn test_screen_grid_cell_access() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + term.feed(b"Hello"); + + // Check cells with content via ScreenGrid trait + let cell_h = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert_eq!(cell_h.ch, 'H'); + + let cell_o = ScreenGrid::cell(&term, 0, 4).expect("Cell should exist"); + assert_eq!(cell_o.ch, 'o'); + + // Check empty cell + let cell_empty = ScreenGrid::cell(&term, 0, 
10).expect("Cell should exist"); + assert_eq!(cell_empty.ch, ' '); + } + + #[test] + fn test_screen_grid_out_of_bounds() { + let term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + + assert!(ScreenGrid::cell(&term, 0, 0).is_some()); + assert!(ScreenGrid::cell(&term, 23, 79).is_some()); + assert!(ScreenGrid::cell(&term, 24, 0).is_none()); // row out of bounds + assert!(ScreenGrid::cell(&term, 0, 80).is_none()); // col out of bounds + } + + #[test] + fn test_screen_grid_color_mapping_default() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + term.feed(b"A"); + + let cell = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert_eq!(cell.style.fg_color, Color::Default); + assert_eq!(cell.style.bg_color, Color::Default); + } + + #[test] + fn test_screen_grid_color_mapping_indexed() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + // Red foreground (color 1), blue background (color 4) + term.feed(b"\x1b[31;44mX"); + + let cell = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert_eq!(cell.style.fg_color, Color::Indexed { index: 1 }); + assert_eq!(cell.style.bg_color, Color::Indexed { index: 4 }); + } + + #[test] + fn test_screen_grid_color_mapping_rgb() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + // 24-bit RGB: ESC[38;2;255;128;64m for fg, ESC[48;2;0;0;0m for bg + term.feed(b"\x1b[38;2;255;128;64mR"); + + let cell = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert_eq!( + cell.style.fg_color, + Color::Rgb { + r: 255, + g: 128, + b: 64 + } + ); + } + + #[test] + fn test_screen_grid_style_bold() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + term.feed(b"N\x1b[1mB\x1b[0m"); + + let normal = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert!(!normal.style.bold); + + let bold = ScreenGrid::cell(&term, 0, 1).expect("Cell should exist"); + assert!(bold.style.bold); + } + + #[test] + 
fn test_screen_grid_style_underline() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + term.feed(b"N\x1b[4mU\x1b[0m"); + + let normal = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert!(!normal.style.underline); + + let underlined = ScreenGrid::cell(&term, 0, 1).expect("Cell should exist"); + assert!(underlined.style.underline); + } + + #[test] + fn test_screen_grid_style_inverse() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + // \x1b[7m = inverse on + term.feed(b"N\x1b[7mI\x1b[0m"); + + let normal = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert!(!normal.style.inverse); + + let inverse = ScreenGrid::cell(&term, 0, 1).expect("Cell should exist"); + assert!(inverse.style.inverse); + } + + #[test] + fn test_screen_grid_combined_styles() { + let mut term = TerminalEmulator::new(TermSize { cols: 80, rows: 24 }); + // Bold + underline + inverse + red fg + blue bg + term.feed(b"\x1b[1;4;7;31;44mS"); + + let cell = ScreenGrid::cell(&term, 0, 0).expect("Cell should exist"); + assert!(cell.style.bold); + assert!(cell.style.underline); + assert!(cell.style.inverse); + assert_eq!(cell.style.fg_color, Color::Indexed { index: 1 }); + assert_eq!(cell.style.bg_color, Color::Indexed { index: 4 }); + } } diff --git a/crates/pilotty-core/Cargo.toml b/crates/pilotty-core/Cargo.toml index 4178ed6..5e82a1a 100644 --- a/crates/pilotty-core/Cargo.toml +++ b/crates/pilotty-core/Cargo.toml @@ -8,3 +8,4 @@ description = "Core types and logic for pilotty" [dependencies] serde = { workspace = true } serde_json = { workspace = true } +unicode-width = { workspace = true } diff --git a/crates/pilotty-core/src/elements/classify.rs b/crates/pilotty-core/src/elements/classify.rs new file mode 100644 index 0000000..4f938d1 --- /dev/null +++ b/crates/pilotty-core/src/elements/classify.rs @@ -0,0 +1,853 @@ +//! Classification: converting clusters into interactive elements. +//! +//! 
The classifier applies priority-ordered rules to determine each cluster's +//! kind. Only interactive elements (Button, Input, Toggle) are returned; +//! non-interactive content stays in `snapshot.text`. +//! +//! # Rule Priority (highest to lowest) +//! +//! 1. Cursor position → Input (confidence: 1.0, focused: true) +//! 2. Checkbox patterns `[x]`, `[ ]`, `☑`, `☐` → Toggle (confidence: 1.0) +//! 3. Inverse video → Button (confidence: 1.0, focused: true) +//! 4. Bracket patterns `[OK]`, `` → Button (confidence: 0.8) +//! 5. Underscore field `____` → Input (confidence: 0.6) +//! +//! Non-interactive patterns (links, progress bars, errors, status indicators, +//! box-drawing, menu prefixes, static text) are filtered out. + +use unicode_width::UnicodeWidthStr; + +use crate::elements::segment::Cluster; +use crate::elements::{Element, ElementKind}; + +// ============================================================================ +// Constants +// ============================================================================ + +/// Maximum cluster text length to process for tokenization. +/// Protects against memory exhaustion from malicious terminal output. +/// Terminal lines rarely exceed this; longer text won't contain meaningful UI elements. +const MAX_CLUSTER_TEXT_LEN: usize = 4096; + +// ============================================================================ +// Token Extraction +// ============================================================================ + +/// A token extracted from a cluster's text. +/// +/// Tokens are sub-patterns within a cluster that match interactive elements: +/// - Bracketed tokens: `[OK]`, ``, `[ ]`, `[x]` +/// - Underscore runs: `____`, `__________` +#[derive(Debug, Clone, PartialEq, Eq)] +struct Token { + /// Text content of the token. + text: String, + /// Byte offset from start of cluster text (used to slice prefix for width calculation). 
+ byte_offset: usize, +} + +/// Calculate the display-width column offset for a token within cluster text. +/// +/// This handles CJK characters correctly (width 2) by computing the display +/// width of the text prefix before the token. +fn token_col_offset(text: &str, byte_offset: usize) -> u16 { + text.get(..byte_offset) + .map(|prefix| prefix.width().min(u16::MAX as usize) as u16) + .unwrap_or(0) +} + +/// Extract bracketed tokens from text. +/// +/// Finds patterns like `[OK]`, ``, `(Submit)`, `[ ]`, `[x]`. +/// Returns tokens with their byte offsets within the text (for display width calculation). +/// +/// Returns empty if text exceeds MAX_CLUSTER_TEXT_LEN to prevent memory exhaustion. +fn extract_bracketed_tokens(text: &str) -> Vec { + // Protect against memory exhaustion from extremely long input + if text.len() > MAX_CLUSTER_TEXT_LEN { + return Vec::new(); + } + + let mut tokens = Vec::new(); + + for (char_idx, ch) in text.char_indices() { + // Look for opening brackets + let close_bracket = match ch { + '[' => Some(']'), + '<' => Some('>'), + '(' => Some(')'), + '【' => Some('】'), + '「' => Some('」'), + _ => None, + }; + + if let Some(closer) = close_bracket { + // Find matching closer in the remainder of the string + if let Some(end_rel) = text[char_idx + ch.len_utf8()..].find(closer) { + let token_start = char_idx; + let token_end = char_idx + ch.len_utf8() + end_rel + closer.len_utf8(); + let token_text = &text[token_start..token_end]; + + // Only extract if it looks interactive (not just empty or single char) + if token_text.chars().count() >= 3 || is_unicode_checkbox(token_text) { + tokens.push(Token { + text: token_text.to_string(), + byte_offset: token_start, + }); + } + } + } + } + + // Deduplicate overlapping tokens by keeping only non-overlapping ones + let mut result = Vec::new(); + let mut last_end = 0; + for token in tokens { + if token.byte_offset >= last_end { + last_end = token.byte_offset + token.text.len(); + result.push(token); + } + } + 
+ result +} + +/// Check if text is a single unicode checkbox character. +fn is_unicode_checkbox(text: &str) -> bool { + matches!(text, "☑" | "☐" | "□" | "✓" | "✔" | "☒") +} + +/// Extract underscore runs from text. +/// +/// Finds patterns like `____`, `__________` (3+ underscores). +/// Returns tokens with their byte offsets within the text (for display width calculation). +/// +/// Returns empty if text exceeds MAX_CLUSTER_TEXT_LEN to prevent memory exhaustion. +fn extract_underscore_runs(text: &str) -> Vec { + // Protect against memory exhaustion from extremely long input + if text.len() > MAX_CLUSTER_TEXT_LEN { + return Vec::new(); + } + + let mut tokens = Vec::new(); + let mut in_run = false; + let mut run_start = 0; + + for (byte_idx, ch) in text.char_indices() { + if ch == '_' { + if !in_run { + in_run = true; + run_start = byte_idx; + } + } else if in_run { + // End of underscore run + let run_text = &text[run_start..byte_idx]; + if run_text.len() >= 3 { + tokens.push(Token { + text: run_text.to_string(), + byte_offset: run_start, + }); + } + in_run = false; + } + } + + // Handle run at end of string + if in_run { + let run_text = &text[run_start..]; + if run_text.len() >= 3 { + tokens.push(Token { + text: run_text.to_string(), + byte_offset: run_start, + }); + } + } + + tokens +} + +/// Context for classification decisions that depend on screen position. +#[derive(Debug, Clone, Copy, Default)] +pub struct ClassifyContext { + /// Optional cursor row (if known). Clusters at cursor position become Input. + pub cursor_row: Option, + /// Optional cursor column (if known). + pub cursor_col: Option, +} + +impl ClassifyContext { + /// Create a new context with no cursor information. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Set cursor position. 
+ #[must_use] + pub fn with_cursor(mut self, row: u16, col: u16) -> Self { + self.cursor_row = Some(row); + self.cursor_col = Some(col); + self + } +} + +/// Internal element data during classification. +/// +/// Used during classification to collect elements before converting +/// to the public Element type. +#[derive(Debug, Clone)] +struct DetectedElement { + kind: ElementKind, + row: u16, + col: u16, + width: u16, + text: String, + confidence: f32, + checked: Option, + focused: bool, +} + +impl DetectedElement { + /// Create a button element. + fn button(row: u16, col: u16, text: String, confidence: f32, focused: bool) -> Self { + Self { + kind: ElementKind::Button, + row, + col, + width: text.width().min(u16::MAX as usize) as u16, + text, + confidence, + checked: None, + focused, + } + } + + /// Create an input element. + fn input(row: u16, col: u16, text: String, confidence: f32, focused: bool) -> Self { + Self { + kind: ElementKind::Input, + row, + col, + width: text.width().min(u16::MAX as usize) as u16, + text, + confidence, + checked: None, + focused, + } + } + + /// Create a toggle element. + fn toggle(row: u16, col: u16, text: String, checked: bool) -> Self { + Self { + kind: ElementKind::Toggle, + row, + col, + width: text.width().min(u16::MAX as usize) as u16, + text, + confidence: 1.0, + checked: Some(checked), + focused: false, + } + } + + /// Convert to Element. 
+ fn into_element(self) -> Element { + let mut elem = Element::new( + self.kind, + self.row, + self.col, + self.width, + self.text, + self.confidence, + ); + if let Some(checked) = self.checked { + elem = elem.with_checked(checked); + } + if self.focused { + elem = elem.with_focused(true); + } + elem + } +} + +// ============================================================================ +// Pattern Detection Helpers +// ============================================================================ + +/// Check if text matches a single button bracket pattern: `[OK]`, ``, `(Confirm)` +/// +/// Requires: +/// - Exactly one pair of matching brackets +/// - At least one non-bracket character inside +/// - No brackets in the interior (to reject `[Yes] [No]`) +fn is_button_pattern(text: &str) -> bool { + let trimmed = text.trim(); + if trimmed.len() < 3 { + return false; + } + + let chars: Vec = trimmed.chars().collect(); + let first = chars[0]; + let last = chars[chars.len() - 1]; + + // Check for matching bracket pairs + let (opener, closer) = match (first, last) { + ('[', ']') => ('[', ']'), + ('<', '>') => ('<', '>'), + ('(', ')') => ('(', ')'), + ('【', '】') => ('【', '】'), + ('「', '」') => ('「', '」'), + _ => return false, + }; + + // Interior must have non-whitespace content (not just empty brackets) + let interior: String = chars[1..chars.len() - 1].iter().collect(); + + // Reject if interior contains more brackets (e.g., "[Yes] [No]") + if interior.contains(opener) || interior.contains(closer) { + return false; + } + + // Reject if it looks like a checkbox pattern + if is_checkbox_content(&interior) { + return false; + } + + // Reject if it looks like a progress bar inside brackets + if is_progress_bar_content(&interior) { + return false; + } + + // Must have actual label content + !interior.trim().is_empty() +} + +/// Helper to check if content inside brackets looks like progress bar content. 
+fn is_progress_bar_content(content: &str) -> bool { + if content.is_empty() { + return false; + } + + // Count progress-bar typical characters + let progress_chars: usize = content + .chars() + .filter(|&c| matches!(c, '=' | '>' | '-' | '#' | ' ' | '█' | '░')) + .count(); + + // If more than 80% of chars are progress-like, it's probably a progress bar + progress_chars * 10 >= content.len() * 8 +} + +/// Check if text matches checkbox patterns. +/// +/// Supported patterns: +/// - `[x]`, `[X]`, `[ ]` - ASCII checkboxes +/// - `[*]`, `[-]` - Alternative markers +/// - `☑`, `☐`, `✓`, `✗` - Unicode checkboxes +/// - `(x)`, `( )`, `(*)` - Parenthesized variants +fn is_checkbox_pattern(text: &str) -> Option { + let trimmed = text.trim(); + + // Single character unicode checkboxes + match trimmed { + "☑" | "✓" | "✔" | "☒" => return Some(true), + "☐" | "□" => return Some(false), + _ => {} + } + + // Bracketed checkboxes: [x], [ ], [*], [-], etc. + if trimmed.len() == 3 { + let chars: Vec = trimmed.chars().collect(); + if (chars[0] == '[' && chars[2] == ']') || (chars[0] == '(' && chars[2] == ')') { + return match chars[1] { + 'x' | 'X' | '*' | '✓' | '✔' => Some(true), + ' ' | '.' => Some(false), + '-' => Some(false), // indeterminate treated as unchecked + _ => None, + }; + } + } + + None +} + +/// Helper to check if content inside brackets looks like checkbox content. +fn is_checkbox_content(content: &str) -> bool { + let trimmed = content.trim(); + matches!(trimmed, "x" | "X" | " " | "*" | "-" | "✓" | "✔") +} + +/// Check if text looks like an input field placeholder. 
+/// +/// Patterns: `____`, `[ ]`, `: _____` +fn is_input_pattern(text: &str) -> bool { + let trimmed = text.trim(); + + // Series of underscores + if trimmed.chars().all(|c| c == '_') && trimmed.len() >= 3 { + return true; + } + + // Empty bracketed field with mostly spaces + if trimmed.starts_with('[') && trimmed.ends_with(']') && trimmed.len() >= 4 { + let inner: String = trimmed.chars().skip(1).take(trimmed.len() - 2).collect(); + if inner.trim().is_empty() && inner.len() >= 2 { + return true; + } + } + + // Colon followed by underscores: "Name: ___" + if let Some(colon_pos) = trimmed.find(':') { + let after_colon = trimmed[colon_pos + 1..].trim_start(); + if after_colon.chars().all(|c| c == '_') && after_colon.len() >= 3 { + return true; + } + } + + false +} + +// ============================================================================ +// Core Classification +// ============================================================================ + +/// Classify a text pattern into a detected element at the given position. +/// +/// This is the low-level classifier that doesn't consider tokenization. +/// Returns `None` for non-interactive patterns. +/// +/// Classification priority: +/// 1. Checkbox patterns → Toggle (state is unambiguous) +/// 2. Inverse video → Button (focused) - TUI convention for selection +/// 3. Bracket patterns → Button (with focus if cursor present) +/// 4. Underscore/labeled fields → Input (with focus if cursor present) +/// 5. 
Cursor on unrecognized text → Input (fallback for editable regions) +fn classify_text( + text: &str, + row: u16, + col: u16, + is_inverse: bool, + cursor_in_range: bool, +) -> Option { + // Rule 1: Checkbox patterns → Toggle + // Checkboxes have unambiguous visual state, highest confidence + if let Some(checked) = is_checkbox_pattern(text) { + return Some(DetectedElement::toggle(row, col, text.to_string(), checked)); + } + + // Rule 2: Inverse video → Button (focused) + // TUI convention: inverse video = selected/focused item + if is_inverse { + return Some(DetectedElement::button( + row, + col, + text.to_string(), + 1.0, + true, + )); + } + + // Rule 3: Bracket patterns → Button + // Cursor on button makes it focused, not an input + if is_button_pattern(text) { + return Some(DetectedElement::button( + row, + col, + text.to_string(), + if cursor_in_range { 1.0 } else { 0.8 }, + cursor_in_range, + )); + } + + // Rule 4: Underscore field → Input + if is_input_pattern(text) { + return Some(DetectedElement::input( + row, + col, + text.to_string(), + if cursor_in_range { 1.0 } else { 0.6 }, + cursor_in_range, + )); + } + + // Rule 5: Cursor on unrecognized pattern → Input (fallback) + // If cursor is here and we don't know what it is, assume editable + if cursor_in_range { + return Some(DetectedElement::input( + row, + col, + text.to_string(), + 1.0, + true, + )); + } + + None +} + +/// Check if cursor is within a range. +/// +/// Uses saturating arithmetic to prevent overflow when col + width exceeds u16::MAX. +fn cursor_in_range(ctx: &ClassifyContext, row: u16, col: u16, width: u16) -> bool { + if let (Some(cursor_row), Some(cursor_col)) = (ctx.cursor_row, ctx.cursor_col) { + cursor_row == row && cursor_col >= col && cursor_col < col.saturating_add(width) + } else { + false + } +} + +/// Extract elements from a cluster using tokenization. +/// +/// If the cluster contains bracketed tokens or underscore runs, those are +/// extracted as separate elements. 
The parent cluster is dropped if tokens +/// are found (tokens win, inherit parent's focus if inverse). +/// +/// This handles cases like: +/// - `"Save [OK] Cancel"` → extracts `[OK]` as Button +/// - `"Name: ____"` → extracts `____` as Input +fn extract_elements_from_cluster(cluster: &Cluster, ctx: &ClassifyContext) -> Vec { + let row = cluster.row; + let col = cluster.col; + let text = &cluster.text; + let is_inverse = cluster.style.is_inverse(); + + // First, try to classify the whole cluster + let cursor_hit = cursor_in_range(ctx, row, col, cluster.width); + let whole_cluster_elem = classify_text(text, row, col, is_inverse, cursor_hit); + + // Check if the whole cluster is already a "tight" interactive pattern + // (checkbox, bracketed button, or underscore-only input) + if let Some(ref elem) = whole_cluster_elem { + // If it's a toggle (checkbox pattern), return immediately + if elem.kind == ElementKind::Toggle { + return vec![elem.clone()]; + } + + // If it's a bracket button and the text is entirely the bracket pattern + if elem.kind == ElementKind::Button && is_button_pattern(text) { + return vec![elem.clone()]; + } + + // If it's an input and the text is entirely underscores + if elem.kind == ElementKind::Input && text.trim().chars().all(|c| c == '_') { + return vec![elem.clone()]; + } + } + + // Try to extract tokens from within the cluster + let mut elements = Vec::new(); + let parent_focused = is_inverse; // Tokens inherit focus from inverse parent + + // Extract bracketed tokens + for token in extract_bracketed_tokens(text) { + let token_col = col + token_col_offset(text, token.byte_offset); + let token_cursor_hit = cursor_in_range(ctx, row, token_col, token.text.width() as u16); + + // Classify the token text + // Note: tokens extracted from inverse clusters inherit focus + if let Some(mut elem) = classify_text(&token.text, row, token_col, false, token_cursor_hit) + { + if parent_focused && !elem.focused { + elem.focused = true; + // Upgrade 
confidence if inheriting focus + if elem.confidence < 1.0 { + elem.confidence = 1.0; + } + } + elements.push(elem); + } + } + + // Extract underscore runs (only if no bracketed tokens found) + if elements.is_empty() { + for token in extract_underscore_runs(text) { + let token_col = col + token_col_offset(text, token.byte_offset); + let token_cursor_hit = cursor_in_range(ctx, row, token_col, token.text.width() as u16); + + if let Some(mut elem) = + classify_text(&token.text, row, token_col, false, token_cursor_hit) + { + if parent_focused && !elem.focused { + elem.focused = true; + elem.confidence = 1.0; + } + elements.push(elem); + } + } + } + + // If tokens were found, return them (dedup rule: tokens win) + if !elements.is_empty() { + return elements; + } + + // No tokens found, return whole cluster classification if any + whole_cluster_elem.into_iter().collect() +} + +/// Classify clusters into interactive elements. +/// +/// Uses tokenization to extract sub-elements from clusters. If a cluster +/// contains bracketed tokens or underscore runs, those are extracted as +/// separate elements and the parent cluster is dropped (dedup rule). +/// +/// Only returns interactive elements (Button, Input, Toggle). +/// Non-interactive clusters are filtered out. +/// +/// Elements are sorted by position (row, then col) for consistent ordering. +#[must_use] +pub fn classify(clusters: Vec, ctx: &ClassifyContext) -> Vec { + let mut detected: Vec = Vec::new(); + + for cluster in clusters { + detected.extend(extract_elements_from_cluster(&cluster, ctx)); + } + + // Sort by position (row, then col) for consistent ordering + detected.sort_by(|a, b| (a.row, a.col).cmp(&(b.row, b.col))); + + // Convert to Elements + detected + .into_iter() + .map(|elem| elem.into_element()) + .collect() +} + +/// Convenience function: segment a grid and classify in one step. +/// +/// This is the main entry point for element detection. 
+#[must_use] +pub fn detect( + grid: &G, + ctx: &ClassifyContext, +) -> Vec { + let clusters = crate::elements::segment::segment(grid); + classify(clusters, ctx) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::elements::grid::test_support::SimpleGrid; + use crate::elements::segment::Cluster; + use crate::elements::style::CellStyle; + + fn cluster(text: &str) -> Cluster { + Cluster::new(0, 0, text.to_string(), CellStyle::default()) + } + + fn cluster_at(row: u16, col: u16, text: &str) -> Cluster { + Cluster::new(row, col, text.to_string(), CellStyle::default()) + } + + fn inverse_cluster(text: &str) -> Cluster { + Cluster::new(0, 0, text.to_string(), CellStyle::new().with_inverse(true)) + } + + fn classify_cluster(cluster: &Cluster, ctx: &ClassifyContext) -> Option { + extract_elements_from_cluster(cluster, ctx) + .into_iter() + .next() + } + + #[test] + fn button_bracket_patterns() { + let ctx = ClassifyContext::new(); + + let result = classify_cluster(&cluster("[OK]"), &ctx).unwrap(); + assert_eq!(result.kind, ElementKind::Button); + assert!((result.confidence - 0.8).abs() < f32::EPSILON); + + assert_eq!( + classify_cluster(&cluster(""), &ctx).unwrap().kind, + ElementKind::Button + ); + assert_eq!( + classify_cluster(&cluster("(Submit)"), &ctx).unwrap().kind, + ElementKind::Button + ); + } + + #[test] + fn toggle_checkbox_patterns() { + let ctx = ClassifyContext::new(); + + let checked = classify_cluster(&cluster("[x]"), &ctx).unwrap(); + assert_eq!(checked.kind, ElementKind::Toggle); + assert_eq!(checked.checked, Some(true)); + + let unchecked = classify_cluster(&cluster("[ ]"), &ctx).unwrap(); + assert_eq!(unchecked.kind, ElementKind::Toggle); + assert_eq!(unchecked.checked, Some(false)); + } + + #[test] + fn input_patterns() { + let ctx = ClassifyContext::new(); + + let underscore = classify_cluster(&cluster("_____"), &ctx).unwrap(); + assert_eq!(underscore.kind, ElementKind::Input); + assert!((underscore.confidence - 0.6).abs() < f32::EPSILON); + 
+ // Cursor position creates focused input + let ctx_cursor = ClassifyContext::new().with_cursor(0, 5); + let cursor_input = classify_cluster(&cluster_at(0, 0, "some text"), &ctx_cursor).unwrap(); + assert_eq!(cursor_input.kind, ElementKind::Input); + assert!(cursor_input.focused); + } + + #[test] + fn inverse_video_creates_focused_button() { + let ctx = ClassifyContext::new(); + let result = classify_cluster(&inverse_cluster("File"), &ctx).unwrap(); + assert_eq!(result.kind, ElementKind::Button); + assert!(result.focused); + assert!((result.confidence - 1.0).abs() < f32::EPSILON); + } + + #[test] + fn non_interactive_filtered() { + let ctx = ClassifyContext::new(); + assert!(classify_cluster(&cluster("Hello World"), &ctx).is_none()); + assert!(classify_cluster(&cluster("https://example.com"), &ctx).is_none()); + } + + #[test] + fn classify_returns_sorted_elements() { + let ctx = ClassifyContext::new(); + let clusters = vec![cluster("[OK]"), cluster("[Cancel]"), cluster("[ ]")]; + let elements = classify(clusters, &ctx); + + assert_eq!(elements.len(), 3); + assert_eq!(elements[0].kind, ElementKind::Button); + assert_eq!(elements[1].kind, ElementKind::Button); + assert_eq!(elements[2].kind, ElementKind::Toggle); + } + + #[test] + fn detect_full_pipeline() { + let mut grid = SimpleGrid::from_text(&["[OK] [Cancel] [ ]"], 20); + let inverse = CellStyle::new().with_inverse(true); + let bold = CellStyle::new().with_bold(true); + + grid.style_range(0, 0, 4, inverse); + grid.style_range(0, 5, 13, bold); + + let elements = detect(&grid, &ClassifyContext::new()); + let kinds: Vec = elements.iter().map(|e| e.kind).collect(); + + assert!(kinds.contains(&ElementKind::Button)); + assert!(kinds.contains(&ElementKind::Toggle)); + } + + #[test] + fn tokenizer_extracts_from_text() { + let tokens = extract_bracketed_tokens("Save [OK] [Cancel]"); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens[0].text, "[OK]"); + assert_eq!(tokens[1].text, "[Cancel]"); + } + + #[test] + fn 
dedup_extracts_button_from_text() { + let ctx = ClassifyContext::new(); + let elements = extract_elements_from_cluster(&cluster("Save [OK] Cancel"), &ctx); + + assert_eq!(elements.len(), 1); + assert_eq!(elements[0].text, "[OK]"); + assert_eq!(elements[0].col, 5); + } + + // ======================================================================== + // Security & Edge Case Tests + // ======================================================================== + + #[test] + fn extract_tokens_rejects_oversized_input() { + // Verify that extremely long text is rejected to prevent memory exhaustion + let huge_text = "[".repeat(MAX_CLUSTER_TEXT_LEN + 1); + assert!(extract_bracketed_tokens(&huge_text).is_empty()); + + let huge_underscores = "_".repeat(MAX_CLUSTER_TEXT_LEN + 1); + assert!(extract_underscore_runs(&huge_underscores).is_empty()); + } + + #[test] + fn cursor_in_range_handles_overflow() { + // Verify saturating_add prevents overflow panic + let ctx = ClassifyContext::new().with_cursor(0, u16::MAX); + + // Should not panic even with extreme values + assert!(!cursor_in_range(&ctx, 0, u16::MAX - 10, 100)); + + // Cursor near MAX should still work correctly + let ctx = ClassifyContext::new().with_cursor(0, u16::MAX - 5); + assert!(cursor_in_range(&ctx, 0, u16::MAX - 10, 10)); + } + + // ======================================================================== + // Unicode Width Tests + // ======================================================================== + + #[test] + fn element_width_cjk() { + // CJK characters should have width 2 each + let ctx = ClassifyContext::new(); + let elem = classify_cluster(&cluster("[确认]"), &ctx).unwrap(); + // [=1 + 确=2 + 认=2 + ]=1 = 6 + assert_eq!(elem.width, 6); + } + + #[test] + fn element_width_ascii() { + // ASCII characters should have width 1 each + let ctx = ClassifyContext::new(); + let elem = classify_cluster(&cluster("[OK]"), &ctx).unwrap(); + // [=1 + O=1 + K=1 + ]=1 = 4 + assert_eq!(elem.width, 4); + } + + #[test] + fn 
element_width_mixed() { + // Mixed ASCII and CJK + let ctx = ClassifyContext::new(); + let elem = classify_cluster(&cluster("[OK确认]"), &ctx).unwrap(); + // [=1 + O=1 + K=1 + 确=2 + 认=2 + ]=1 = 8 + assert_eq!(elem.width, 8); + } + + #[test] + fn token_col_with_cjk_prefix_bracketed() { + // CJK characters before a bracketed token should offset by display width, not char count + let ctx = ClassifyContext::new(); + // 确(width=2) + 认(width=2) = 4 columns before [OK] + let cluster = Cluster::new(0, 0, "确认[OK]".to_string(), CellStyle::default()); + let elements = extract_elements_from_cluster(&cluster, &ctx); + assert_eq!(elements.len(), 1); + assert_eq!(elements[0].text, "[OK]"); + assert_eq!(elements[0].col, 4); // Not 2 (char count)! + } + + #[test] + fn token_col_with_cjk_prefix_underscore() { + // CJK characters before an underscore run should offset by display width + let ctx = ClassifyContext::new(); + // 名(width=2) + 前(width=2) + :(width=1) = 5 columns before ____ + let cluster = Cluster::new(0, 0, "名前:____".to_string(), CellStyle::default()); + let elements = extract_elements_from_cluster(&cluster, &ctx); + assert_eq!(elements.len(), 1); + assert_eq!(elements[0].text, "____"); + assert_eq!(elements[0].col, 5); // Not 3 (char count)! + } + + #[test] + fn token_col_ascii_unchanged() { + // ASCII text should still work correctly (char count == display width) + let ctx = ClassifyContext::new(); + let cluster = Cluster::new(0, 5, "Save [OK] Cancel".to_string(), CellStyle::default()); + let elements = extract_elements_from_cluster(&cluster, &ctx); + assert_eq!(elements.len(), 1); + assert_eq!(elements[0].text, "[OK]"); + assert_eq!(elements[0].col, 10); // 5 (cluster col) + 5 (offset of [OK]) + } +} diff --git a/crates/pilotty-core/src/elements/grid.rs b/crates/pilotty-core/src/elements/grid.rs new file mode 100644 index 0000000..6ef8f09 --- /dev/null +++ b/crates/pilotty-core/src/elements/grid.rs @@ -0,0 +1,149 @@ +//! 
Screen grid abstraction for element detection segmentation. +//! +//! Defines the `ScreenGrid` trait for uniform access to terminal screen content. + +use crate::elements::style::CellStyle; + +/// A single terminal cell with its character and visual style. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ScreenCell { + /// The character in this cell (space for empty cells). + pub ch: char, + /// Visual style attributes. + pub style: CellStyle, +} + +impl ScreenCell { + /// Create a new screen cell. + #[must_use] + pub fn new(ch: char, style: CellStyle) -> Self { + Self { ch, style } + } +} + +/// Trait for accessing terminal screen content. +/// +/// This abstraction allows element detection to work with any terminal backend. +/// Uses 0-based coordinates matching the cursor API convention. +pub trait ScreenGrid { + /// Number of rows in the grid. + fn rows(&self) -> u16; + + /// Number of columns in the grid. + fn cols(&self) -> u16; + + /// Get cell at the given position. Returns `None` if out of bounds. + fn cell(&self, row: u16, col: u16) -> Option; +} + +#[cfg(test)] +pub(crate) mod test_support { + use super::*; + + /// A simple in-memory grid for testing. + #[derive(Debug, Clone)] + pub struct SimpleGrid { + cells: Vec, + rows: u16, + cols: u16, + } + + impl SimpleGrid { + /// Create a new grid filled with empty cells. + #[must_use] + pub fn new(rows: u16, cols: u16) -> Self { + let cell_count = rows as usize * cols as usize; + Self { + cells: vec![ScreenCell::new(' ', CellStyle::default()); cell_count], + rows, + cols, + } + } + + /// Create a grid from text lines. 
+ #[must_use] + pub fn from_text(lines: &[&str], cols: u16) -> Self { + let rows = lines.len() as u16; + let mut grid = Self::new(rows, cols); + + for (row_idx, line) in lines.iter().enumerate() { + for (col_idx, ch) in line.chars().enumerate() { + if col_idx < cols as usize { + if let Some(idx) = grid.index(row_idx as u16, col_idx as u16) { + grid.cells[idx] = ScreenCell::new(ch, CellStyle::default()); + } + } + } + } + + grid + } + + /// Apply a style to a range of cells in a row. + pub fn style_range(&mut self, row: u16, start_col: u16, end_col: u16, style: CellStyle) { + for col in start_col..end_col { + if let Some(idx) = self.index(row, col) { + self.cells[idx].style = style; + } + } + } + + fn index(&self, row: u16, col: u16) -> Option { + if row < self.rows && col < self.cols { + Some(row as usize * self.cols as usize + col as usize) + } else { + None + } + } + } + + impl ScreenGrid for SimpleGrid { + fn rows(&self) -> u16 { + self.rows + } + + fn cols(&self) -> u16 { + self.cols + } + + fn cell(&self, row: u16, col: u16) -> Option { + self.index(row, col).map(|i| self.cells[i].clone()) + } + } +} + +// Re-export for tests in other modules +#[cfg(test)] +pub(crate) use test_support::SimpleGrid; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn screen_cell_creation() { + let cell = ScreenCell::new('A', CellStyle::default()); + assert_eq!(cell.ch, 'A'); + } + + #[test] + fn simple_grid_from_text() { + let grid = SimpleGrid::from_text(&["Hello", "World"], 10); + assert_eq!(grid.rows(), 2); + assert_eq!(grid.cols(), 10); + assert_eq!(grid.cell(0, 0).unwrap().ch, 'H'); + assert_eq!(grid.cell(1, 0).unwrap().ch, 'W'); + } + + #[test] + fn simple_grid_style_range() { + let mut grid = SimpleGrid::from_text(&["[OK]"], 10); + let inverse = CellStyle::new().with_inverse(true); + + grid.style_range(0, 0, 4, inverse); + + assert!(grid.cell(0, 0).unwrap().style.inverse); + assert!(grid.cell(0, 3).unwrap().style.inverse); + assert!(!grid.cell(0, 
4).unwrap().style.inverse); + } +} diff --git a/crates/pilotty-core/src/elements/mod.rs b/crates/pilotty-core/src/elements/mod.rs new file mode 100644 index 0000000..4602451 --- /dev/null +++ b/crates/pilotty-core/src/elements/mod.rs @@ -0,0 +1,170 @@ +//! UI element detection types. +//! +//! This module provides types for detecting and classifying terminal UI elements. +//! It uses a heuristic pipeline that segments the terminal buffer by visual +//! style, then classifies segments into semantic kinds. +//! +//! # Element Kinds +//! +//! We use a simplified 3-kind model instead of many roles: +//! - **Button**: Clickable elements (bracketed text, inverse video) +//! - **Input**: Text entry fields (cursor position, underscore runs) +//! - **Toggle**: Checkbox/radio elements with on/off state +//! +//! # Detection Rules (priority order) +//! +//! 1. Cursor position → Input (confidence: 1.0, focused: true) +//! 2. Checkbox pattern `[x]`/`[ ]`/`☑`/`☐` → Toggle (confidence: 1.0) +//! 3. Inverse video → Button (confidence: 1.0, focused: true) +//! 4. Bracket pattern `[OK]`/`` → Button (confidence: 0.8) +//! 5. Underscore field `____` → Input (confidence: 0.6) +//! +//! Non-interactive elements (links, progress bars, status text) are filtered out. +//! They remain in `snapshot.text` for agents to read, not as elements. + +pub mod classify; +pub mod grid; +pub mod segment; +pub mod style; + +use serde::{Deserialize, Serialize}; + +/// Kind of interactive element. +/// +/// Simplified from 11 roles to 3 kinds based on what agents actually need: +/// - What kind is it? (button/input/toggle) +/// - Is it focused? +/// - What's the toggle state? (for toggles only) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ElementKind { + /// Clickable element (buttons, menu items, tabs). + /// Detected via: inverse video, bracket patterns `[OK]`, ``. + Button, + /// Text entry field. 
+ /// Detected via: cursor position, underscore runs `____`. + Input, + /// Checkbox or radio button with on/off state. + /// Detected via: `[x]`, `[ ]`, `☑`, `☐` patterns. + Toggle, +} + +/// A detected interactive UI element. +/// +/// # Coordinates +/// +/// All coordinates are 0-based (row, col) to match cursor API. +/// Height is always 1 in v1 (single-row elements only). +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Element { + /// Kind of interactive element. + pub kind: ElementKind, + + /// Row index (0-based, from top). + pub row: u16, + + /// Column index (0-based, from left). + pub col: u16, + + /// Width in terminal cells. + pub width: u16, + + /// Text content of the element. + pub text: String, + + /// Detection confidence (0.0-1.0). + /// - 1.0: High confidence (cursor, inverse video, checkbox pattern) + /// - 0.8: Medium confidence (bracket pattern) + /// - 0.6: Low confidence (underscore run) + pub confidence: f32, + + /// Whether this element currently has focus. + /// Orthogonal to kind, applies to any element type. + #[serde(default, skip_serializing_if = "is_false")] + pub focused: bool, + + /// Checked state for Toggle kind (None for non-toggles). + #[serde(skip_serializing_if = "Option::is_none")] + pub checked: Option, +} + +/// Helper for serde skip_serializing_if. +fn is_false(b: &bool) -> bool { + !*b +} + +impl Element { + /// Create a new element. + #[must_use] + pub fn new( + kind: ElementKind, + row: u16, + col: u16, + width: u16, + text: String, + confidence: f32, + ) -> Self { + Self { + kind, + row, + col, + width, + text, + confidence, + focused: false, + checked: None, + } + } + + /// Set checked state (for toggles). + #[must_use] + pub fn with_checked(mut self, checked: bool) -> Self { + self.checked = Some(checked); + self + } + + /// Set focused state. 
+ #[must_use] + pub fn with_focused(mut self, focused: bool) -> Self { + self.focused = focused; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn element_kind_serializes_to_snake_case() { + assert_eq!( + serde_json::to_string(&ElementKind::Button).unwrap(), + "\"button\"" + ); + assert_eq!( + serde_json::to_string(&ElementKind::Toggle).unwrap(), + "\"toggle\"" + ); + } + + #[test] + fn element_serialization_omits_optional_fields() { + let elem = Element::new(ElementKind::Button, 0, 0, 4, "OK".to_string(), 0.8); + let json = serde_json::to_string(&elem).unwrap(); + + // Buttons shouldn't have checked, unfocused elements shouldn't have focused + assert!(!json.contains("checked")); + assert!(!json.contains("focused")); + } + + #[test] + fn element_serialization_includes_set_fields() { + let elem = Element::new(ElementKind::Toggle, 0, 0, 3, "[x]".to_string(), 1.0) + .with_checked(true) + .with_focused(true); + let json = serde_json::to_string(&elem).unwrap(); + + assert!(json.contains("\"checked\":true")); + assert!(json.contains("\"focused\":true")); + } +} diff --git a/crates/pilotty-core/src/elements/segment.rs b/crates/pilotty-core/src/elements/segment.rs new file mode 100644 index 0000000..eae531d --- /dev/null +++ b/crates/pilotty-core/src/elements/segment.rs @@ -0,0 +1,208 @@ +//! Segmentation: grouping adjacent cells by visual style. +//! +//! Scans the terminal grid row by row, grouping adjacent cells with identical +//! visual styles into clusters for classification. + +use unicode_width::UnicodeWidthStr; + +use crate::elements::grid::ScreenGrid; +use crate::elements::style::CellStyle; + +/// A cluster of adjacent cells with identical visual style. +/// +/// Clusters are the intermediate representation between raw cells and +/// classified elements. Each cluster spans a contiguous horizontal region +/// of a single row. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Cluster { + /// Row index (0-based, from top). 
+ pub row: u16, + /// Column index (0-based, from left). + pub col: u16, + /// Width in terminal cells. + pub width: u16, + /// Text content of the cluster. + pub text: String, + /// Visual style shared by all cells in this cluster. + pub style: CellStyle, +} + +impl Cluster { + /// Create a new cluster. + #[must_use] + pub fn new(row: u16, col: u16, text: String, style: CellStyle) -> Self { + // Use unicode-width for proper terminal column alignment. + // CJK characters are width 2, zero-width chars are width 0. + let width = text.width().min(u16::MAX as usize) as u16; + Self { + row, + col, + width, + text, + style, + } + } + + /// Check if this cluster contains only whitespace. + #[must_use] + pub fn is_whitespace_only(&self) -> bool { + self.text.chars().all(|c| c.is_whitespace()) + } +} + +/// Segment a single row into clusters. +fn segment_row(grid: &G, row: u16) -> Vec { + let mut clusters = Vec::new(); + + if row >= grid.rows() { + return clusters; + } + + let mut current_text = String::new(); + let mut current_style: Option = None; + let mut start_col: u16 = 0; + + for col in 0..grid.cols() { + let Some(cell) = grid.cell(row, col) else { + continue; + }; + + match current_style { + Some(ref style) if *style == cell.style => { + // Same style, extend current cluster + current_text.push(cell.ch); + } + _ => { + // Style changed or first cell, finalize previous cluster + if let Some(style) = current_style.take() { + if !current_text.is_empty() { + clusters.push(Cluster::new( + row, + start_col, + std::mem::take(&mut current_text), + style, + )); + } + } + // Start new cluster + start_col = col; + current_style = Some(cell.style); + current_text.push(cell.ch); + } + } + } + + // Don't forget the last cluster + if let Some(style) = current_style { + if !current_text.is_empty() { + clusters.push(Cluster::new(row, start_col, current_text, style)); + } + } + + clusters +} + +/// Segment an entire grid into clusters. 
+fn segment_grid(grid: &G) -> Vec { + let mut clusters = Vec::new(); + + for row in 0..grid.rows() { + clusters.extend(segment_row(grid, row)); + } + + clusters +} + +/// Filter out whitespace-only clusters. +fn filter_whitespace(clusters: Vec) -> Vec { + clusters + .into_iter() + .filter(|c| !c.is_whitespace_only()) + .collect() +} + +/// Segment a grid and filter whitespace in one step. +/// +/// Convenience function that combines `segment_grid` and `filter_whitespace`. +#[must_use] +pub fn segment(grid: &G) -> Vec { + filter_whitespace(segment_grid(grid)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::elements::grid::test_support::SimpleGrid; + + #[test] + fn cluster_creation() { + let cluster = Cluster::new(5, 10, "Hello".to_string(), CellStyle::default()); + assert_eq!(cluster.row, 5); + assert_eq!(cluster.col, 10); + assert_eq!(cluster.width, 5); + assert_eq!(cluster.text, "Hello"); + assert!(!cluster.is_whitespace_only()); + } + + #[test] + fn segment_splits_by_style() { + let mut grid = SimpleGrid::from_text(&["AABBBCC"], 7); + let bold = CellStyle::new().with_bold(true); + let inverse = CellStyle::new().with_inverse(true); + + grid.style_range(0, 2, 5, bold); + grid.style_range(0, 5, 7, inverse); + + let clusters = segment_row(&grid, 0); + + assert_eq!(clusters.len(), 3); + assert_eq!(clusters[0].text, "AA"); + assert_eq!(clusters[0].col, 0); + assert_eq!(clusters[1].text, "BBB"); + assert!(clusters[1].style.bold); + assert_eq!(clusters[2].text, "CC"); + assert!(clusters[2].style.inverse); + } + + #[test] + fn segment_filters_whitespace() { + let mut grid = SimpleGrid::from_text(&["[OK] [Cancel]"], 20); + let inverse = CellStyle::new().with_inverse(true); + + grid.style_range(0, 0, 4, inverse); + grid.style_range(0, 9, 17, inverse); + + let clusters = segment(&grid); + + assert!(clusters.iter().all(|c| !c.is_whitespace_only())); + let texts: Vec<&str> = clusters.iter().map(|c| c.text.as_str()).collect(); + assert!(texts.contains(&"[OK]")); + 
assert!(texts.contains(&"[Cancel]")); + } + + // ======================================================================== + // Unicode Width Tests + // ======================================================================== + + #[test] + fn cluster_width_cjk() { + // CJK characters should have width 2 each + let cluster = Cluster::new(0, 0, "你好".to_string(), CellStyle::default()); + assert_eq!(cluster.width, 4); // 2 + 2 = 4 + } + + #[test] + fn cluster_width_ascii() { + // ASCII characters should have width 1 each + let cluster = Cluster::new(0, 0, "Hello".to_string(), CellStyle::default()); + assert_eq!(cluster.width, 5); + } + + #[test] + fn cluster_width_mixed() { + // Mixed ASCII and CJK + let cluster = Cluster::new(0, 0, "Hi你好".to_string(), CellStyle::default()); + // H=1 + i=1 + 你=2 + 好=2 = 6 + assert_eq!(cluster.width, 6); + } +} diff --git a/crates/pilotty-core/src/elements/style.rs b/crates/pilotty-core/src/elements/style.rs new file mode 100644 index 0000000..a600a5d --- /dev/null +++ b/crates/pilotty-core/src/elements/style.rs @@ -0,0 +1,126 @@ +//! Visual style types for element detection segmentation. +//! +//! These types represent cell styling independent of the vt100 crate, +//! allowing the core element detection types to remain vt100-agnostic. + +use serde::{Deserialize, Serialize}; + +/// Terminal color representation. +/// +/// Maps to standard terminal color modes: +/// - Default: terminal's default foreground/background +/// - Indexed: 256-color palette (0-255) +/// - Rgb: 24-bit true color +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum Color { + /// Terminal default color. + #[default] + Default, + /// 256-color palette index (0-255). + Indexed { index: u8 }, + /// 24-bit RGB color. + Rgb { r: u8, g: u8, b: u8 }, +} + +impl Color { + /// Create an indexed color. 
+ #[must_use] + pub fn indexed(index: u8) -> Self { + Self::Indexed { index } + } + + /// Create an RGB color. + #[must_use] + pub fn rgb(r: u8, g: u8, b: u8) -> Self { + Self::Rgb { r, g, b } + } +} + +/// Visual style attributes for a terminal cell. +/// +/// Used for segmentation: adjacent cells with identical styles are grouped +/// into clusters. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)] +pub struct CellStyle { + /// Bold text attribute. + pub bold: bool, + /// Underlined text attribute. + pub underline: bool, + /// Inverse video (swapped fg/bg). + pub inverse: bool, + /// Foreground color. + pub fg_color: Color, + /// Background color. + pub bg_color: Color, +} + +impl CellStyle { + /// Create a new cell style with default values. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Set bold attribute. + #[must_use] + pub fn with_bold(mut self, bold: bool) -> Self { + self.bold = bold; + self + } + + /// Set underline attribute. + #[must_use] + pub fn with_underline(mut self, underline: bool) -> Self { + self.underline = underline; + self + } + + /// Set inverse attribute. + #[must_use] + pub fn with_inverse(mut self, inverse: bool) -> Self { + self.inverse = inverse; + self + } + + /// Set foreground color. + #[must_use] + pub fn with_fg(mut self, color: Color) -> Self { + self.fg_color = color; + self + } + + /// Set background color. + #[must_use] + pub fn with_bg(mut self, color: Color) -> Self { + self.bg_color = color; + self + } + + /// Check if this style uses inverse video. + /// + /// Inverse video is a strong signal for selected menu items and tabs. 
+ #[must_use] + pub fn is_inverse(&self) -> bool { + self.inverse + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cell_style_default() { + let style = CellStyle::default(); + assert!(!style.bold); + assert!(!style.inverse); + assert_eq!(style.fg_color, Color::Default); + } + + #[test] + fn is_inverse_helper() { + assert!(!CellStyle::new().is_inverse()); + assert!(CellStyle::new().with_inverse(true).is_inverse()); + } +} diff --git a/crates/pilotty-core/src/lib.rs b/crates/pilotty-core/src/lib.rs index f8b3c45..6c98556 100644 --- a/crates/pilotty-core/src/lib.rs +++ b/crates/pilotty-core/src/lib.rs @@ -1,8 +1,31 @@ //! Core types and logic for pilotty. //! -//! This crate provides the shared data structures and algorithms used by both -//! the CLI/daemon and the MCP server. +//! This crate provides shared data structures and algorithms for AI-driven +//! terminal automation. It's used by both the CLI/daemon and MCP server. +//! +//! # Modules +//! +//! - [`error`]: API error types with actionable suggestions for AI consumers +//! - [`input`]: Terminal input encoding (keys, mouse, modifiers) +//! - [`protocol`]: JSON-line request/response protocol +//! - [`snapshot`]: Screen state capture and change detection +//! - [`elements`]: UI element detection +//! +//! # Element Detection +//! +//! pilotty detects interactive UI elements using a simplified 3-kind model +//! optimized for AI agents: +//! +//! | Kind | Detection | Confidence | +//! |------|-----------|------------| +//! | **Button** | Inverse video, `[OK]`, `` | 1.0 / 0.8 | +//! | **Input** | Cursor position, `____` underscores | 1.0 / 0.6 | +//! | **Toggle** | `[x]`, `[ ]`, `☑`, `☐` | 1.0 | +//! +//! Elements include row/col coordinates for use with the click command. +//! The `content_hash` field enables efficient change detection. 
+pub mod elements; pub mod error; pub mod input; pub mod protocol; diff --git a/crates/pilotty-core/src/protocol.rs b/crates/pilotty-core/src/protocol.rs index c6eac27..42154ea 100644 --- a/crates/pilotty-core/src/protocol.rs +++ b/crates/pilotty-core/src/protocol.rs @@ -79,12 +79,12 @@ pub enum Command { #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum SnapshotFormat { - /// Full JSON with all metadata. + /// Full JSON with all metadata including text and elements. #[default] Full, - /// Compact format with inline refs. + /// Compact format: omits text and elements, just metadata. Compact, - /// Plain text only. + /// Plain text only (no JSON structure). Text, } @@ -97,7 +97,7 @@ pub enum ScrollDirection { } /// A response from daemon to CLI. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Response { pub id: String, pub success: bool, @@ -128,7 +128,7 @@ impl Response { } /// Response payload variants. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] pub enum ResponseData { /// Full screen state snapshot. diff --git a/crates/pilotty-core/src/snapshot.rs b/crates/pilotty-core/src/snapshot.rs index bf8c884..9c0cbc0 100644 --- a/crates/pilotty-core/src/snapshot.rs +++ b/crates/pilotty-core/src/snapshot.rs @@ -1,7 +1,32 @@ -//! Screen state types. +//! Screen state capture and change detection. +//! +//! This module provides types for capturing terminal screen state, including +//! text content, cursor position, and detected UI elements. +//! +//! # Snapshot Formats +//! +//! The daemon supports two snapshot formats: +//! +//! | Format | Content | Use Case | +//! |--------|---------|----------| +//! | **Full** | text + elements + hash | Complete state for new screens | +//! 
| **Compact** | metadata only | Quick status checks | +//! +//! # Change Detection +//! +//! The `content_hash` field provides efficient change detection. Agents can +//! compare hashes across snapshots without parsing the full element list: +//! +//! ```ignore +//! if new_snapshot.content_hash != old_snapshot.content_hash { +//! // Screen changed, re-analyze elements +//! } +//! ``` use serde::{Deserialize, Serialize}; +use crate::elements::Element; + /// Terminal dimensions. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct TerminalSize { @@ -18,7 +43,7 @@ pub struct CursorState { } /// Complete screen state snapshot. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ScreenState { pub snapshot_id: u64, pub size: TerminalSize, @@ -26,6 +51,21 @@ pub struct ScreenState { /// Plain text content of the screen. #[serde(skip_serializing_if = "Option::is_none")] pub text: Option, + /// Detected interactive UI elements. + /// + /// Elements are detected using visual style segmentation and pattern + /// classification. Each element includes its position (row, col) for + /// interaction via the click command. + #[serde(skip_serializing_if = "Option::is_none")] + pub elements: Option>, + /// Hash of screen content for change detection. + /// + /// Computed from the screen text using a fast non-cryptographic hash. + /// Present when `elements` is requested (`with_elements=true`). + /// Agents can compare hashes across snapshots to detect screen changes + /// without parsing the full element list. + #[serde(skip_serializing_if = "Option::is_none")] + pub content_hash: Option, } impl ScreenState { @@ -39,6 +79,71 @@ impl ScreenState { visible: true, }, text: None, + elements: None, + content_hash: None, } } } + +/// Compute a content hash from screen text. +/// +/// Uses FNV-1a, a fast non-cryptographic hash suitable for change detection. 
+#[must_use] +pub fn compute_content_hash(text: &str) -> u64 { + // FNV-1a parameters for 64-bit + const FNV_OFFSET: u64 = 0xcbf29ce484222325; + const FNV_PRIME: u64 = 0x00000100000001B3; + + let mut hash = FNV_OFFSET; + for byte in text.bytes() { + hash ^= u64::from(byte); + hash = hash.wrapping_mul(FNV_PRIME); + } + hash +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn content_hash_deterministic() { + let text = "Hello, World!"; + let hash1 = compute_content_hash(text); + let hash2 = compute_content_hash(text); + assert_eq!(hash1, hash2); + } + + #[test] + fn content_hash_differs_for_different_text() { + let hash1 = compute_content_hash("Hello"); + let hash2 = compute_content_hash("World"); + assert_ne!(hash1, hash2); + } + + #[test] + fn content_hash_empty_string() { + // Empty string should return the FNV-1a offset basis + let hash = compute_content_hash(""); + assert_eq!(hash, 0xcbf29ce484222325); + } + + #[test] + fn content_hash_single_char_difference() { + // Even a single character difference should produce different hashes + let hash1 = compute_content_hash("test"); + let hash2 = compute_content_hash("tess"); + assert_ne!(hash1, hash2); + } + + #[test] + fn content_hash_unicode() { + // Unicode text should hash consistently + let text = "日本語テスト 🚀"; + let hash1 = compute_content_hash(text); + let hash2 = compute_content_hash(text); + assert_eq!(hash1, hash2); + // Should differ from ASCII + assert_ne!(hash1, compute_content_hash("ascii")); + } +} diff --git a/skills/pilotty/SKILL.md b/skills/pilotty/SKILL.md index de357f4..c9efb74 100644 --- a/skills/pilotty/SKILL.md +++ b/skills/pilotty/SKILL.md @@ -30,7 +30,7 @@ This is the #1 cause of agent failures. 
When in doubt: **flags first, then comma ```bash pilotty spawn vim file.txt # Start TUI app in managed session pilotty wait-for "file.txt" # Wait for app to be ready -pilotty snapshot # Get screen state with cursor position +pilotty snapshot # Get screen state with UI elements pilotty key i # Enter insert mode pilotty type "Hello, World!" # Type text pilotty key Escape # Exit insert mode @@ -41,9 +41,10 @@ pilotty kill # End session 1. **Spawn**: `pilotty spawn ` starts the app in a background PTY 2. **Wait**: `pilotty wait-for ` ensures the app is ready -3. **Snapshot**: `pilotty snapshot` returns screen state with text content and cursor position -4. **Interact**: Use keyboard commands (`key`, `type`) or click at coordinates (`click `) -5. **Re-snapshot**: After screen changes, snapshot again to see updated state +3. **Snapshot**: `pilotty snapshot` returns screen state with detected UI elements +4. **Understand**: Parse `elements[]` to identify buttons, inputs, toggles +5. **Interact**: Use keyboard commands (`key`, `type`) to navigate and interact +6. 
**Re-snapshot**: Check `content_hash` to detect screen changes ## Commands @@ -56,14 +57,14 @@ pilotty kill # Kill default session pilotty kill -s myapp # Kill specific session pilotty list-sessions # List all active sessions pilotty daemon # Manually start daemon (usually auto-starts) -pilotty stop # Stop daemon and all sessions +pilotty shutdown # Stop daemon and all sessions pilotty examples # Show end-to-end workflow example ``` ### Screen capture ```bash -pilotty snapshot # Full JSON with text content +pilotty snapshot # Full JSON with text content and elements pilotty snapshot --format compact # JSON without text field pilotty snapshot --format text # Plain text with cursor indicator pilotty snapshot -s myapp # Snapshot specific session @@ -125,16 +126,23 @@ PILOTTY_SOCKET_DIR="/tmp/pilotty" # Override socket directory RUST_LOG="debug" # Enable debug logging ``` -## Snapshot output +## Snapshot Output -The `snapshot` command returns structured JSON: +The `snapshot` command returns structured JSON with detected UI elements: ```json { "snapshot_id": 42, "size": { "cols": 80, "rows": 24 }, "cursor": { "row": 5, "col": 10, "visible": true }, - "text": "... plain text content ..." + "text": "Settings:\n [x] Notifications [ ] Dark mode\n [Save] [Cancel]", + "elements": [ + { "kind": "toggle", "row": 1, "col": 2, "width": 3, "text": "[x]", "confidence": 1.0, "checked": true }, + { "kind": "toggle", "row": 1, "col": 20, "width": 3, "text": "[ ]", "confidence": 1.0, "checked": false }, + { "kind": "button", "row": 2, "col": 2, "width": 6, "text": "[Save]", "confidence": 0.8 }, + { "kind": "button", "row": 2, "col": 10, "width": 8, "text": "[Cancel]", "confidence": 0.8 } + ], + "content_hash": 12345678901234567890 } ``` @@ -147,7 +155,85 @@ bash-3.2$ [_] The `[_]` shows cursor position. Use the text content to understand screen state and navigate with keyboard commands. 
-## Navigation approach +--- + +## Element Detection + +pilotty automatically detects interactive UI elements in terminal applications. Elements provide **read-only context** to help understand UI structure. + +### Element Kinds + +| Kind | Detection Patterns | Confidence | Fields | +|------|-------------------|------------|--------| +| **toggle** | `[x]`, `[ ]`, `[*]`, `☑`, `☐` | 1.0 | `checked: bool` | +| **button** | Inverse video, `[OK]`, `<OK>`, `(Submit)` | 1.0 / 0.8 | `focused: bool` (if true) | +| **input** | Cursor position, `____` underscores | 1.0 / 0.6 | `focused: bool` (if true) | + +### Element Fields + +| Field | Type | Description | +|-------|------|-------------| +| `kind` | string | Element type: `button`, `input`, or `toggle` | +| `row` | number | Row position (0-based from top) | +| `col` | number | Column position (0-based from left) | +| `width` | number | Width in terminal cells (CJK chars = 2) | +| `text` | string | Text content of the element | +| `confidence` | number | Detection confidence (0.0-1.0) | +| `focused` | bool | Whether element has focus (only present if true) | +| `checked` | bool | Toggle state (only present for toggles) | + +### Confidence Levels + +| Confidence | Meaning | +|------------|---------| +| **1.0** | High confidence: Cursor position, inverse video, checkbox patterns | +| **0.8** | Medium confidence: Bracket patterns `[OK]`, `<OK>` | +| **0.6** | Lower confidence: Underscore input fields `____` | + +### Change Detection + +The `content_hash` field enables efficient screen change detection: + +```bash +# Get initial state +SNAP1=$(pilotty snapshot) +HASH1=$(echo "$SNAP1" | jq -r '.content_hash') + +# Perform action +pilotty key Tab + +# Check if screen changed +SNAP2=$(pilotty snapshot) +HASH2=$(echo "$SNAP2" | jq -r '.content_hash') + +if [ "$HASH1" != "$HASH2" ]; then + echo "Screen changed - re-analyze elements" +fi +``` + +### Using Elements Effectively + +Elements are **read-only context** for understanding the UI.
Use **keyboard navigation** for reliable interaction: + +```bash +# 1. Get snapshot to understand UI structure +pilotty snapshot | jq '.elements' +# Output shows toggles (checked/unchecked) and buttons with positions + +# 2. Navigate and interact with keyboard (reliable approach) +pilotty key Tab # Move to next element +pilotty key Space # Toggle checkbox +pilotty key Enter # Activate button + +# 3. Verify state changed +pilotty snapshot | jq '.elements[] | select(.kind == "toggle")' +``` + +**Key insight**: Use elements to understand WHAT is on screen, use keyboard to interact with it. + +--- + +## Navigation Approach pilotty uses keyboard-first navigation, just like a human would: @@ -160,6 +246,7 @@ pilotty key Tab # Move to next element pilotty key Enter # Activate/select pilotty key Escape # Cancel/back pilotty key Up # Move up in list/menu +pilotty key Space # Toggle checkbox # 3. Type text when needed pilotty type "search term" @@ -169,7 +256,9 @@ pilotty key Enter pilotty click 5 10 # Click at row 5, col 10 ``` -**Key insight**: Parse the snapshot text to understand what's on screen, then use keyboard commands to navigate. This works reliably across all TUI applications. +**Key insight**: Parse the snapshot text and elements to understand what's on screen, then use keyboard commands to navigate. This works reliably across all TUI applications. + +--- ## Example: Edit file with vim @@ -197,22 +286,64 @@ pilotty key -s editor Enter pilotty list-sessions ``` -## Example: Dialog interaction +## Example: Dialog checklist interaction ```bash -# 1. Spawn dialog (--name before command) -pilotty spawn --name dialog dialog --yesno "Continue?" 10 40 +# 1. Spawn dialog checklist (--name before command) +pilotty spawn --name opts dialog --checklist "Select features:" 12 50 4 \ + "notifications" "Push notifications" on \ + "darkmode" "Dark mode theme" off \ + "autosave" "Auto-save documents" on \ + "telemetry" "Usage analytics" off + +# 2. 
Wait for dialog to render +sleep 0.5 -# 2. Get snapshot to see the dialog -pilotty snapshot -s dialog --format text -# Shows: < Yes > and < No > buttons +# 3. Get snapshot and examine elements +pilotty snapshot -s opts | jq '.elements[] | select(.kind == "toggle")' +# Shows toggle elements with checked state and positions -# 3. Navigate with keyboard -pilotty key -s dialog Tab # Move to next button -pilotty key -s dialog Enter # Activate selected button +# 4. Navigate to "darkmode" and toggle it +pilotty key -s opts Down # Move to second option +pilotty key -s opts Space # Toggle it on -# Or click at coordinates if you know the button position -pilotty click -s dialog 8 15 # Click at row 8, col 15 +# 5. Verify the change +pilotty snapshot -s opts | jq '.elements[] | select(.kind == "toggle") | {text, checked}' + +# 6. Confirm selection +pilotty key -s opts Enter + +# 7. Clean up +pilotty kill -s opts +``` + +## Example: Form filling with elements + +```bash +# 1. Spawn a form application +pilotty spawn --name form my-form-app + +# 2. Get snapshot to understand form structure +pilotty snapshot -s form | jq '.elements' +# Shows inputs, toggles, and buttons with positions for click command + +# 3. Tab to first input (likely already focused) +pilotty type -s form "myusername" + +# 4. Tab to password field +pilotty key -s form Tab +pilotty type -s form "mypassword" + +# 5. Tab to remember me and toggle +pilotty key -s form Tab +pilotty key -s form Space + +# 6. Tab to Login and activate +pilotty key -s form Tab +pilotty key -s form Enter + +# 7. Check result +pilotty snapshot -s form --format text ``` ## Example: Monitor with htop @@ -235,6 +366,8 @@ pilotty key -s monitor q # Quit pilotty kill -s monitor ``` +--- + ## Sessions Each session is isolated with its own: @@ -262,7 +395,7 @@ The first session spawned without `--name` is automatically named `default`. > **Important:** The `--name` flag must come **before** the command. 
Everything after the command is passed as arguments to that command. -## Daemon architecture +## Daemon Architecture pilotty uses a background daemon for session management: @@ -273,7 +406,7 @@ pilotty uses a background daemon for session management: You rarely need to manage the daemon manually. -## Error handling +## Error Handling Errors include actionable suggestions: @@ -293,7 +426,9 @@ Errors include actionable suggestions: } ``` -## Common patterns +--- + +## Common Patterns ### Wait then act @@ -310,6 +445,16 @@ pilotty snapshot --format text | grep "Error" # Check for errors pilotty key Enter # Then proceed ``` +### Check for specific element + +```bash +# Check if the first toggle is checked +pilotty snapshot | jq '.elements[] | select(.kind == "toggle") | {text, checked}' | head -1 + +# Find element at specific position +pilotty snapshot | jq '.elements[] | select(.row == 5 and .col == 10)' +``` + ### Retry on timeout ```bash @@ -319,7 +464,9 @@ pilotty wait-for "Ready" -t 5000 || { } ``` -## Deep-dive documentation +--- + +## Deep-dive Documentation For detailed patterns and edge cases, see: @@ -327,8 +474,9 @@ For detailed patterns and edge cases, see: |-----------|-------------| | [references/session-management.md](references/session-management.md) | Multi-session patterns, isolation, cleanup | | [references/key-input.md](references/key-input.md) | Complete key combinations reference | +| [references/element-detection.md](references/element-detection.md) | Detection rules, confidence, patterns | -## Ready-to-use templates +## Ready-to-use Templates Executable workflow scripts: @@ -337,10 +485,12 @@ Executable workflow scripts: | [templates/vim-workflow.sh](templates/vim-workflow.sh) | Edit file with vim, save, exit | | [templates/dialog-interaction.sh](templates/dialog-interaction.sh) | Handle dialog/whiptail prompts | | [templates/multi-session.sh](templates/multi-session.sh) | Parallel TUI orchestration | +| 
[templates/element-detection.sh](templates/element-detection.sh) | Element detection demo | Usage: ```bash ./templates/vim-workflow.sh /tmp/myfile.txt "File content here" ./templates/dialog-interaction.sh ./templates/multi-session.sh +./templates/element-detection.sh ``` diff --git a/skills/pilotty/references/element-detection.md b/skills/pilotty/references/element-detection.md new file mode 100644 index 0000000..15080dc --- /dev/null +++ b/skills/pilotty/references/element-detection.md @@ -0,0 +1,280 @@ +# Element Detection + +pilotty automatically detects interactive UI elements in terminal applications. Elements provide **read-only context** to help agents understand UI structure. + +## Overview + +pilotty analyzes terminal screen content and detects: +- **Toggles**: Checkboxes like `[x]`, `[ ]`, `[*]`, `☑`, `☐` +- **Buttons**: Action elements like `[OK]`, `<OK>`, `(Submit)` +- **Inputs**: Text fields marked by underscores `____` or cursor position + +Each detected element includes: +- Kind, position (row, col), width, text content +- Confidence score (0.0-1.0) +- State information (checked for toggles, focused for inputs/buttons) + +## Detection Rules + +### Priority Order (Highest to Lowest) + +1. **Cursor Position** - Input (confidence: 1.0, focused: true) +2. **Checkbox Patterns** - Toggle (confidence: 1.0) +3. **Inverse Video** - Button (confidence: 1.0, focused: true) +4. **Bracket Patterns** - Button (confidence: 0.8) +5.
**Underscore Fields** - Input (confidence: 0.6) + +### Toggle Detection + +Toggles are detected from checkbox patterns: + +| Pattern | State | Notes | +|---------|-------|-------| +| `[x]`, `[X]` | checked: true | Standard checked | +| `[ ]` | checked: false | Standard unchecked | +| `[*]` | checked: true | Dialog/ncurses style | +| `☑`, `✓`, `✔`, `☒` | checked: true | Unicode checkmarks | +| `☐`, `□` | checked: false | Unicode unchecked | + +Example detection: +```json +{ + "kind": "toggle", + "row": 5, + "col": 2, + "width": 3, + "text": "[x]", + "confidence": 1.0, + "checked": true +} +``` + +### Button Detection + +Buttons are detected from: + +1. **Inverse video** (highest confidence) + - Text with reversed foreground/background colors + - Common in dialog, whiptail, and ncurses apps + - Confidence: 1.0, focused: true + +2. **Bracket patterns** (medium confidence) + - Square brackets: `[OK]`, `[Cancel]`, `[Save]` + - Angle brackets: `<OK>`, `<Cancel>` + - Parentheses: `(Submit)`, `(Reset)` + - Confidence: 0.8 + +Example detection: +```json +{ + "kind": "button", + "row": 10, + "col": 5, + "width": 6, + "text": "[Save]", + "confidence": 0.8 +} +``` + +### Input Detection + +Inputs are detected from: + +1. **Cursor position** (highest confidence) + - The cell where the cursor is located + - Confidence: 1.0, focused: true + +2.
**Underscore runs** (lower confidence) + - 3+ consecutive underscores: `___`, `__________` + - Common in form-style TUIs + - Confidence: 0.6 + +Example detection: +```json +{ + "kind": "input", + "row": 8, + "col": 12, + "width": 10, + "text": "__________", + "confidence": 0.6 +} +``` + +## Non-Interactive Patterns (Filtered) + +The following patterns are recognized but NOT returned as interactive elements: + +| Pattern | Why Filtered | +|---------|--------------| +| `http://`, `https://` | Links are not clickable in most TUIs | +| `[====]`, `[####]` | Progress bars | +| `[ERROR]`, `[WARNING]`, `[INFO]` | Status indicators | +| `[1]`, `[2]`, `1)`, `a)` | Menu prefixes | +| `├`, `┤`, `│`, `┌`, `┐` | Box-drawing characters | + +## Element Fields Reference + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `kind` | string | Yes | `button`, `input`, or `toggle` | +| `row` | number | Yes | Row position (0-based from top) | +| `col` | number | Yes | Column position (0-based from left) | +| `width` | number | Yes | Width in terminal cells | +| `text` | string | Yes | Element text content | +| `confidence` | number | Yes | Detection confidence (0.0-1.0) | +| `focused` | bool | No | Present and true if element has focus | +| `checked` | bool | No | Present for toggles only | + +### Width Calculation + +Element width uses Unicode display width: +- ASCII characters: width 1 +- CJK characters (Chinese, Japanese, Korean): width 2 +- Emoji: width 2 +- Zero-width characters: width 0 + +This matches terminal column alignment. + +## Content Hash + +Each snapshot includes a `content_hash` field for change detection: + +```json +{ + "content_hash": 12345678901234567890, + ... +} +``` + +The hash is computed from the visible screen text content. 
Use it to: +- Detect if the screen changed between snapshots +- Avoid re-processing unchanged screens + +```bash +HASH1=$(pilotty snapshot | jq -r '.content_hash') +pilotty key Tab +HASH2=$(pilotty snapshot | jq -r '.content_hash') +[ "$HASH1" != "$HASH2" ] && echo "Screen changed" +``` + +## Best Practices + +### 1. Elements for Understanding, Keyboard for Interaction + +Elements tell you WHAT is on screen. Use keyboard to interact: + +```bash +# See what's on screen +pilotty snapshot | jq '.elements[] | {kind, text, row, col, checked}' + +# Navigate with keyboard +pilotty key Tab # Move between elements +pilotty key Space # Toggle checkboxes +pilotty key Enter # Activate buttons +``` + +### 2. Check Confidence Levels + +Higher confidence means more reliable detection: + +```bash +# Filter to high-confidence elements only +pilotty snapshot | jq '.elements[] | select(.confidence >= 0.8)' +``` + +### 3. Find Elements by Content or Position + +```bash +# Find element by text content +pilotty snapshot | jq '.elements[] | select(.text | contains("Save"))' + +# Find element at specific position +pilotty snapshot | jq '.elements[] | select(.row == 5 and .col == 10)' + +# Get first toggle +pilotty snapshot | jq '[.elements[] | select(.kind == "toggle")][0]' +``` + +## Limitations + +### What Detection Does NOT Find + +1. **Menu items without markers** - Plain text menus need keyboard navigation +2. **Custom widgets** - Non-standard UI patterns may not be recognized +3. **Color-only highlighting** - Elements must have text patterns or inverse video +4. **Disabled elements** - No distinction between enabled/disabled + +### What Detection Cannot Do + +1. **Click elements directly by name** - Use row/col with click command +2. **Track elements across screens** - Elements may move; use text content to re-find + +## Troubleshooting + +### No Elements Detected + +1. Check if the app uses standard patterns: + ```bash + pilotty snapshot --format text # View raw screen + ``` + +2. 
Look for high-confidence elements (inverse-video buttons, cursor inputs, and toggles all report confidence 1.0): + ```bash + pilotty snapshot | jq '.elements[] | select(.confidence == 1.0)' + ``` + +### Wrong Element Kind + +The classifier uses heuristics. If `[x]` is detected as a button instead of a toggle: +1. Check for surrounding context +2. Use `text` field to identify element purpose + +### Elements Missing After Action + +Element positions may change between snapshots. Track elements by: +- Text content (most reliable) +- Element kind +- Approximate row/column position + +## Example: Complete Workflow + +```bash +#!/bin/bash +SESSION="form" + +# 1. Spawn application +pilotty spawn --name $SESSION dialog --checklist "Options:" 15 50 4 \ + "opt1" "Feature A" on \ + "opt2" "Feature B" off \ + "opt3" "Feature C" on \ + "opt4" "Feature D" off + +sleep 0.5 + +# 2. Analyze initial state +echo "Initial state:" +pilotty snapshot -s $SESSION | jq '.elements[] | select(.kind == "toggle") | {text, checked}' + +# 3. Find unchecked toggles +UNCHECKED=$(pilotty snapshot -s $SESSION | jq '[.elements[] | select(.kind == "toggle" and .checked == false)] | length') +echo "Unchecked toggles: $UNCHECKED" + +# 4. Navigate and toggle opt2 +pilotty key -s $SESSION Down # Move to opt2 +pilotty key -s $SESSION Space # Toggle it + +# 5. Record content_hash after the toggle (compare with an earlier hash to verify the change) +HASH1=$(pilotty snapshot -s $SESSION | jq -r '.content_hash') +echo "Hash after toggle: $HASH1" + +# 6. Confirm and check final state +pilotty key -s $SESSION Enter +sleep 0.3 + +echo "Final state:" +pilotty snapshot -s $SESSION | jq '.elements[] | select(.kind == "toggle") | {text, checked}' + +# 7. 
Cleanup +pilotty kill -s $SESSION +``` diff --git a/skills/pilotty/templates/dialog-interaction.sh b/skills/pilotty/templates/dialog-interaction.sh index ae73233..0db4a18 100755 --- a/skills/pilotty/templates/dialog-interaction.sh +++ b/skills/pilotty/templates/dialog-interaction.sh @@ -1,6 +1,6 @@ #!/bin/bash # Template: Interact with dialog/whiptail prompts -# Demonstrates handling various dialog types +# Demonstrates handling various dialog types with element detection # # Usage: ./dialog-interaction.sh # Requires: dialog or whiptail installed @@ -16,26 +16,32 @@ if ! command -v dialog &> /dev/null; then exit 1 fi +# Cleanup on exit +cleanup() { + pilotty kill -s "$SESSION_NAME" 2>/dev/null || true +} +trap cleanup EXIT + echo "=== Dialog Interaction Demo ===" # --- Yes/No Dialog --- echo "" echo "1. Yes/No Dialog" -pilotty spawn --name "$SESSION_NAME" dialog --yesno "Do you want to continue?" 10 40 +pilotty spawn --name "$SESSION_NAME" dialog --yesno "Do you want to continue?" 10 40 >/dev/null # Wait for dialog to render -pilotty wait-for -s "$SESSION_NAME" "continue" -t 5000 +pilotty wait-for -s "$SESSION_NAME" "continue" -t 5000 >/dev/null -# Take snapshot to see buttons -echo "Snapshot:" -pilotty snapshot -s "$SESSION_NAME" --format compact +# Show detected elements +echo "Detected elements:" +pilotty snapshot -s "$SESSION_NAME" | jq -r '.elements[] | " \(.kind) \(.text) at (\(.row),\(.col))"' # Select Yes using keyboard (Enter selects the default button) -pilotty key -s "$SESSION_NAME" Enter # Select default (Yes) +pilotty key -s "$SESSION_NAME" Enter >/dev/null sleep 0.5 -echo "Selected: Yes" +echo "Selected: Yes (via Enter)" # --- Menu Dialog --- echo "" @@ -45,36 +51,45 @@ pilotty spawn --name "$SESSION_NAME" dialog --menu "Choose an option:" 15 50 4 \ 1 "Option One" \ 2 "Option Two" \ 3 "Option Three" \ - 4 "Exit" + 4 "Exit" >/dev/null -pilotty wait-for -s "$SESSION_NAME" "Choose" -t 5000 +pilotty wait-for -s "$SESSION_NAME" "Choose" -t 5000 >/dev/null 
-# Navigate with arrow keys (pilotty auto-detects application cursor mode) -pilotty key -s "$SESSION_NAME" Down # Move to option 2 -pilotty key -s "$SESSION_NAME" Down # Move to option 3 -pilotty key -s "$SESSION_NAME" Enter # Select +# Navigate with arrow keys +pilotty key -s "$SESSION_NAME" Down >/dev/null # Move to option 2 +pilotty key -s "$SESSION_NAME" Down >/dev/null # Move to option 3 +pilotty key -s "$SESSION_NAME" Enter >/dev/null # Select sleep 0.5 -echo "Selected: Option Three" +echo "Selected: Option Three (via arrow keys + Enter)" -# --- Checklist Dialog --- +# --- Checklist Dialog with Element Detection --- echo "" -echo "3. Checklist Dialog" +echo "3. Checklist Dialog (with element detection)" pilotty spawn --name "$SESSION_NAME" dialog --checklist "Select items:" 15 50 4 \ 1 "Item A" off \ 2 "Item B" off \ 3 "Item C" off \ - 4 "Item D" off + 4 "Item D" off >/dev/null + +pilotty wait-for -s "$SESSION_NAME" "Select" -t 5000 >/dev/null -pilotty wait-for -s "$SESSION_NAME" "Select" -t 5000 +# Show initial toggle states +echo "Initial toggle states:" +pilotty snapshot -s "$SESSION_NAME" | jq -r '.elements[] | select(.kind == "toggle") | " \(.text) at (\(.row),\(.col)) checked=\(.checked)"' # Toggle items with Space -pilotty key -s "$SESSION_NAME" Space # Toggle Item A -pilotty key -s "$SESSION_NAME" Down -pilotty key -s "$SESSION_NAME" Down -pilotty key -s "$SESSION_NAME" Space # Toggle Item C -pilotty key -s "$SESSION_NAME" Enter # Confirm +pilotty key -s "$SESSION_NAME" Space >/dev/null # Toggle Item A +pilotty key -s "$SESSION_NAME" Down >/dev/null +pilotty key -s "$SESSION_NAME" Down >/dev/null +pilotty key -s "$SESSION_NAME" Space >/dev/null # Toggle Item C + +# Show updated toggle states +echo "After toggling:" +pilotty snapshot -s "$SESSION_NAME" | jq -r '.elements[] | select(.kind == "toggle") | " \(.text) at (\(.row),\(.col)) checked=\(.checked)"' + +pilotty key -s "$SESSION_NAME" Enter >/dev/null # Confirm sleep 0.5 echo "Selected: Item A, 
Item C" @@ -83,13 +98,17 @@ echo "Selected: Item A, Item C" echo "" echo "4. Input Dialog" -pilotty spawn --name "$SESSION_NAME" dialog --inputbox "Enter your name:" 10 40 +pilotty spawn --name "$SESSION_NAME" dialog --inputbox "Enter your name:" 10 40 >/dev/null -pilotty wait-for -s "$SESSION_NAME" "name" -t 5000 +pilotty wait-for -s "$SESSION_NAME" "name" -t 5000 >/dev/null + +# Show detected input element +echo "Detected input element:" +pilotty snapshot -s "$SESSION_NAME" | jq -r '.elements[] | select(.kind == "input") | " \(.kind) at (\(.row),\(.col)) width=\(.width)"' # Type input pilotty type -s "$SESSION_NAME" "Agent Smith" -pilotty key -s "$SESSION_NAME" Enter +pilotty key -s "$SESSION_NAME" Enter >/dev/null sleep 0.5 echo "Entered: Agent Smith" @@ -98,22 +117,24 @@ echo "Entered: Agent Smith" echo "" echo "5. Message Box" -pilotty spawn --name "$SESSION_NAME" dialog --msgbox "Demo complete!" 10 40 +pilotty spawn --name "$SESSION_NAME" dialog --msgbox "Demo complete!" 10 40 >/dev/null -pilotty wait-for -s "$SESSION_NAME" "complete" -t 5000 +pilotty wait-for -s "$SESSION_NAME" "complete" -t 5000 >/dev/null -# Take final snapshot to see the OK button -pilotty snapshot -s "$SESSION_NAME" +# Show button element +echo "Detected button:" +pilotty snapshot -s "$SESSION_NAME" | jq -r '.elements[] | select(.kind == "button" or .kind == "input") | " \(.kind) \(.text) at (\(.row),\(.col))"' # Dismiss with Enter -pilotty key -s "$SESSION_NAME" Enter +pilotty key -s "$SESSION_NAME" Enter >/dev/null sleep 0.5 -# Cleanup -if pilotty list-sessions 2>/dev/null | grep -q "$SESSION_NAME"; then - pilotty kill -s "$SESSION_NAME" -fi - echo "" echo "=== Demo Complete ===" +echo "" +echo "Key takeaways:" +echo " - Use snapshot | jq '.elements' to see detected UI elements" +echo " - Toggles have 'checked' field for state tracking" +echo " - Use keyboard (Tab, Space, Enter, arrows) for reliable navigation" +echo " - content_hash can detect screen changes between snapshots" diff 
--git a/skills/pilotty/templates/element-detection.sh b/skills/pilotty/templates/element-detection.sh new file mode 100755 index 0000000..6b2ccb8 --- /dev/null +++ b/skills/pilotty/templates/element-detection.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Element Detection Template +# Demonstrates pilotty's element detection and interaction +# +# Usage: ./element-detection.sh + +set -e + +# Configuration +PILOTTY="${PILOTTY:-pilotty}" +SESSION="element-demo" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' + +# Cleanup on exit +cleanup() { + $PILOTTY kill -s "$SESSION" 2>/dev/null || true +} +trap cleanup EXIT + +echo -e "${BLUE}=== Element Detection Demo ===${NC}" +echo "" + +# ----------------------------------------------------------------------------- +# Step 1: Spawn a TUI with UI elements +# ----------------------------------------------------------------------------- +echo -e "${YELLOW}Step 1: Spawning dialog checklist...${NC}" + +$PILOTTY spawn --name "$SESSION" -- dialog --checklist "Select features to enable:" 15 60 5 \ + "notifications" "Push notifications" on \ + "darkmode" "Dark mode theme" off \ + "autosave" "Auto-save documents" on \ + "analytics" "Usage analytics" off \ + "updates" "Auto-updates" on >/dev/null + +sleep 0.5 + +# ----------------------------------------------------------------------------- +# Step 2: Get snapshot with elements +# ----------------------------------------------------------------------------- +echo -e "${YELLOW}Step 2: Getting snapshot with detected elements...${NC}" +echo "" + +SNAPSHOT=$($PILOTTY snapshot -s "$SESSION") + +# Show element summary +echo -e "${GREEN}Detected elements:${NC}" +echo "$SNAPSHOT" | jq -r '.elements[] | " \(.kind) \(.text) at (\(.row),\(.col)) conf=\(.confidence)"' +echo "" + +# ----------------------------------------------------------------------------- +# Step 3: Analyze toggles +# 
----------------------------------------------------------------------------- +echo -e "${YELLOW}Step 3: Analyzing toggle states...${NC}" +echo "" + +TOGGLES=$(echo "$SNAPSHOT" | jq '[.elements[] | select(.kind == "toggle")]') +CHECKED=$(echo "$TOGGLES" | jq '[.[] | select(.checked == true)] | length') +UNCHECKED=$(echo "$TOGGLES" | jq '[.[] | select(.checked == false)] | length') + +echo -e " Checked toggles: ${GREEN}$CHECKED${NC}" +echo -e " Unchecked toggles: ${RED}$UNCHECKED${NC}" +echo "" + +# Show each toggle +echo -e "${GREEN}Toggle details:${NC}" +echo "$TOGGLES" | jq -r '.[] | " \(.text) at (\(.row),\(.col)) checked=\(.checked)"' +echo "" + +# ----------------------------------------------------------------------------- +# Step 4: Toggle an unchecked option +# ----------------------------------------------------------------------------- +echo -e "${YELLOW}Step 4: Toggling 'darkmode' (currently off)...${NC}" + +# Get initial hash for change detection +HASH1=$(echo "$SNAPSHOT" | jq -r '.content_hash') + +# Navigate to darkmode (second option) and toggle +$PILOTTY key -s "$SESSION" Down >/dev/null # Move to darkmode +$PILOTTY key -s "$SESSION" Space >/dev/null # Toggle it + +sleep 0.2 + +# Get new snapshot and hash +SNAPSHOT2=$($PILOTTY snapshot -s "$SESSION") +HASH2=$(echo "$SNAPSHOT2" | jq -r '.content_hash') + +# Verify change +if [ "$HASH1" != "$HASH2" ]; then + echo -e " ${GREEN}Screen changed! 
(hash: $HASH1 -> $HASH2)${NC}" +else + echo -e " ${RED}No change detected${NC}" +fi +echo "" + +# Show updated toggle states +echo -e "${GREEN}Updated toggle states:${NC}" +echo "$SNAPSHOT2" | jq -r '.elements[] | select(.kind == "toggle") | " \(.text) at (\(.row),\(.col)) checked=\(.checked)"' +echo "" + +# ----------------------------------------------------------------------------- +# Step 5: Find and interact with button +# ----------------------------------------------------------------------------- +echo -e "${YELLOW}Step 5: Looking for action button...${NC}" + +BUTTON=$(echo "$SNAPSHOT2" | jq -r '.elements[] | select(.kind == "button" or .kind == "input") | "\(.text) at (\(.row),\(.col))"' | head -1) +if [ -n "$BUTTON" ]; then + echo -e " Found button: ${GREEN}$BUTTON${NC}" +else + echo -e " ${YELLOW}No button element detected, using keyboard to confirm${NC}" +fi +echo "" + +# ----------------------------------------------------------------------------- +# Step 6: Confirm selection +# ----------------------------------------------------------------------------- +echo -e "${YELLOW}Step 6: Confirming selection with Enter...${NC}" + +$PILOTTY key -s "$SESSION" Enter >/dev/null + +sleep 0.3 + +# Check final state +echo -e "${GREEN}Final screen state:${NC}" +$PILOTTY snapshot -s "$SESSION" --format text 2>/dev/null | head -5 || echo " (dialog closed)" +echo "" + +# ----------------------------------------------------------------------------- +# Summary +# ----------------------------------------------------------------------------- +echo -e "${BLUE}=== Summary ===${NC}" +echo "" +echo "This demo showed how to:" +echo " 1. Spawn a TUI application" +echo " 2. Get snapshot with detected elements" +echo " 3. Analyze element states (toggles, buttons)" +echo " 4. Use content_hash for change detection" +echo " 5. 
Navigate with keyboard based on element context" +echo "" +echo -e "${GREEN}Demo complete!${NC}" From 342009050f6212839bd31630e5ee0dca7adffca1 Mon Sep 17 00:00:00 2001 From: msmps <7691252+msmps@users.noreply.github.com> Date: Wed, 28 Jan 2026 22:03:36 +0000 Subject: [PATCH 2/2] docs: readme --- README.md | 19 +++++++++++++++---- npm/README.md | 38 +++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 7b4da51..c3a0b6e 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,16 @@

- pilotty logo + pilotty - Terminal automation CLI enabling AI agents to control TUI applications

pilotty

+

+ The terminal equivalent of agent-browser +

+

Terminal automation CLI for AI agents
- Like agent-browser, but for TUI applications. + Control vim, htop, lazygit, dialog, and any TUI programmatically

@@ -21,16 +25,23 @@ > [!NOTE] > **Built with AI, for AI.** This project was built with the support of an AI agent, planned thoroughly with a tight feedback loop and reviewed at each step. While we've tested extensively, edge cases may exist. Use in production at your own discretion, and please [report any issues](https://github.com/msmps/pilotty/issues) you find! -pilotty enables AI agents to interact with terminal applications (vim, htop, lazygit, dialog, etc.) through a simple CLI interface. It manages PTY sessions, captures terminal output, and provides keyboard/mouse input capabilities for navigating TUI applications. +pilotty enables AI agents to interact with terminal applications through a simple command-line interface. It manages pseudo-terminal (PTY) sessions with full VT100 terminal emulation, captures screen state, and provides keyboard/mouse input for navigating terminal user interfaces. Think of it as headless terminal automation for AI workflows. ## Features -- **PTY Management**: Spawn and manage terminal applications in background sessions +- **PTY (Pseudo-Terminal) Management**: Spawn and manage terminal applications in background sessions +- **Terminal Emulation**: Full VT100 emulation for accurate screen capture and state tracking - **Keyboard Navigation**: Interact with TUIs using Tab, Enter, arrow keys, and key combos - **AI-Friendly Output**: Clean JSON responses with actionable suggestions on errors - **Multi-Session**: Run multiple terminal apps simultaneously in isolated sessions - **Zero Config**: Daemon auto-starts on first command, auto-stops after 5 minutes idle +## Why pilotty? + +[agent-browser](https://github.com/vercel-labs/agent-browser) by Vercel Labs lets AI agents control web browsers. pilotty does the same for terminals. 
+ +**Origin story:** Built to solve a personal problem, pilotty was created to enable AI agents to interact with [OpenTUI](https://github.com/anomalyco/opentui) interfaces and control [OpenCode](https://github.com/anomalyco/opencode) programmatically. If you're building TUIs or working with terminal applications, pilotty lets AI navigate them just like a human would. + ## Installation ### npm (recommended) diff --git a/npm/README.md b/npm/README.md index 32395ce..a4a28b2 100644 --- a/npm/README.md +++ b/npm/README.md @@ -1,25 +1,26 @@

- pilotty logo + pilotty - Terminal automation CLI enabling AI agents to control TUI applications

pilotty

- Terminal automation CLI for AI agents
- Like agent-browser, but for TUI applications. + The terminal equivalent of agent-browser

---- +

+ Terminal automation CLI for AI agents
+ Control vim, htop, lazygit, dialog, and any TUI programmatically +

-pilotty enables AI agents to interact with terminal applications (vim, htop, lazygit, dialog, etc.) through a simple CLI interface. It manages PTY sessions, captures terminal output, and provides keyboard/mouse input capabilities for navigating TUI applications. +

+ npm version + License +

-## Features +--- -- **PTY Management**: Spawn and manage terminal applications in background sessions -- **Keyboard Navigation**: Interact with TUIs using Tab, Enter, arrow keys, and key combos -- **AI-Friendly Output**: Clean JSON responses with actionable suggestions on errors -- **Multi-Session**: Run multiple terminal apps simultaneously in isolated sessions -- **Zero Config**: Daemon auto-starts on first command, auto-stops after 5 minutes idle +pilotty enables AI agents to interact with terminal applications through a simple command-line interface. It manages pseudo-terminal (PTY) sessions with full VT100 terminal emulation, captures screen state, and provides keyboard/mouse input for navigating terminal user interfaces. ## Installation @@ -83,6 +84,17 @@ The `snapshot` command returns structured data about the terminal screen: Use the cursor position and text content to understand the screen state and navigate using keyboard commands (Tab, Enter, arrow keys) or click at specific coordinates. +## Documentation + +See the **[GitHub repository](https://github.com/msmps/pilotty)** for full documentation including: + +- All commands reference +- Session management +- Key combinations +- UI element detection +- AI agent workflow examples +- Daemon architecture + ## Building from Source ```bash @@ -94,10 +106,6 @@ cargo build --release Requires [Rust](https://rustup.rs) 1.70+. -## Documentation - -See the [GitHub repository](https://github.com/msmps/pilotty) for full documentation including all commands, key combinations, and AI agent workflow examples. - ## License MIT