diff --git a/mock_data.py b/mock_data.py index 145981f..7d8e23e 100644 --- a/mock_data.py +++ b/mock_data.py @@ -16,90 +16,105 @@ def _generate_jobs_data(now): "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=2), "image": "alice/pytorch:2.1", + "gpu": 2, }, { "name": "hyperparameter-tuning-job", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=5, minutes=15), "image": "david/optuna:3.4", + "gpu": 1, }, { "name": "image-preprocessing-batch", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, minutes=40), "image": "frank/opencv:4.8", + "gpu": 0, }, { "name": "distributed-training-resnet", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=7, minutes=30), "image": "bob/horovod:0.28", + "gpu": 4, }, { "name": "feature-extraction-pipeline", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=2, minutes=30), "image": "eve/sklearn:1.3", + "gpu": 0, }, { "name": "model-serving-warmup", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, minutes=15), "image": "carol/triton:23.10", + "gpu": 1, }, { "name": "batch-prediction-service", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=3), "image": "david/tensorflow:2.14", + "gpu": 2, }, { "name": "recommendation-engine-train", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, minutes=30), "image": "frank/lightgbm:4.1", + "gpu": 0, }, { "name": "speech-recognition-train", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=3, minutes=45), "image": "iris/whisper:large-v3", + "gpu": 2, }, { "name": "time-series-forecasting", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=5), "image": "jack/prophet:1.1", + "gpu": 0, }, { "name": "semantic-search-indexing", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=7, minutes=15), "image": "leo/elasticsearch:8.11", + "gpu": 0, }, 
{ "name": "document-embedding-job", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=9, minutes=30), "image": "nancy/sentence-transformers:2.2", + "gpu": 1, }, { "name": "reinforcement-learning-agent", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=10, minutes=45), "image": "peter/stable-baselines3:2.2", + "gpu": 2, }, { "name": "anomaly-detection-pipeline", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=13, minutes=15), "image": "rachel/isolation-forest:1.3", + "gpu": 0, }, { "name": "clickstream-analytics", "active": 1, "succeeded": 0, "failed": 0, "startTime": _time_ago(now, hours=15), "image": "tina/flink:1.18", + "gpu": 0, }, # Completed jobs { @@ -108,6 +123,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=3, minutes=30), "completionTime": _time_ago(now, hours=2, minutes=45), "image": "charlie/fastapi:0.104", + "gpu": 1, }, { "name": "model-evaluation-suite", @@ -115,6 +131,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=6, minutes=30), "completionTime": _time_ago(now, hours=6), "image": "emily/mlflow:2.8", + "gpu": 0, }, { "name": "nlp-sentiment-analysis", @@ -122,6 +139,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=9), "completionTime": _time_ago(now, hours=8, minutes=15), "image": "henry/transformers:4.35", + "gpu": 1, }, { "name": "database-backup-export", @@ -129,6 +147,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=11, minutes=30), "completionTime": _time_ago(now, hours=11), "image": "ivan/pgdump:16", + "gpu": 0, }, { "name": "log-aggregation-batch", @@ -136,6 +155,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=13, minutes=30), "completionTime": _time_ago(now, hours=12, minutes=38), "image": "judy/logstash:8.11", + "gpu": 0, }, { "name": "text-classification-job", @@ -143,6 +163,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=1, minutes=30), 
"completionTime": _time_ago(now, minutes=48), "image": "grace/bert-base:1.0", + "gpu": 1, }, { "name": "object-detection-yolo", @@ -150,6 +171,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=6, minutes=30), "completionTime": _time_ago(now, hours=5, minutes=5), "image": "karen/yolov8:2.0", + "gpu": 2, }, { "name": "ab-testing-analysis", @@ -157,6 +179,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=10, minutes=30), "completionTime": _time_ago(now, hours=10, minutes=12), "image": "oscar/scipy:1.11", + "gpu": 0, }, { "name": "data-lake-sync", @@ -164,6 +187,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=12), "completionTime": _time_ago(now, hours=11, minutes=8), "image": "quinn/delta-lake:3.0", + "gpu": 0, }, # Failed jobs { @@ -172,6 +196,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=4, minutes=30), "completionTime": _time_ago(now, hours=4, minutes=15), "image": "emily/spark:3.5", + "gpu": 0, }, { "name": "video-encoding-job", @@ -179,6 +204,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=10, minutes=30), "completionTime": _time_ago(now, hours=10, minutes=25), "image": "grace/ffmpeg:6.0", + "gpu": 1, }, { "name": "etl-customer-data", @@ -186,6 +212,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=8, minutes=30), "completionTime": _time_ago(now, hours=8, minutes=27), "image": "ivan/airflow:2.7", + "gpu": 0, }, { "name": "fraud-detection-model", @@ -193,6 +220,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=2, minutes=15), "completionTime": _time_ago(now, hours=2, minutes=10), "image": "henry/catboost:1.2", + "gpu": 1, }, { "name": "graph-neural-network", @@ -200,6 +228,7 @@ def _generate_jobs_data(now): "startTime": _time_ago(now, hours=8), "completionTime": _time_ago(now, hours=7, minutes=55), "image": "maria/pytorch-geometric:2.4", + "gpu": 2, }, { "name": "multilingual-translation", @@ -207,6 +236,7 @@ def 
_generate_jobs_data(now): "startTime": _time_ago(now, hours=14, minutes=30), "completionTime": _time_ago(now, hours=14, minutes=22), "image": "steve/marian-mt:3.1", + "gpu": 1, }, ] @@ -216,6 +246,43 @@ def _generate_jobs_data(now): def _build_jobs_items(jobs_data): jobs_items = [] for job_info in jobs_data: + # Build resources with GPU if specified + resources = {} + node_selector = {} + gpu_count = job_info.get("gpu", 0) + + if gpu_count > 0: + resources = { + "requests": { + "cpu": "20", + "memory": "128Gi", + "nvidia.com/gpu": str(gpu_count) + }, + "limits": { + "cpu": "20", + "memory": "128Gi", + "nvidia.com/gpu": str(gpu_count) + } + } + # Assign GPU type based on job requirements (like real jobs do) + if gpu_count >= 4: + node_selector = {"nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3"} + elif gpu_count >= 2: + node_selector = {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"} + else: + node_selector = {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-40GB"} + + pod_spec = { + "containers": [ + { + "image": job_info["image"], + "resources": resources + } + ] + } + if node_selector: + pod_spec["nodeSelector"] = node_selector + job = { "metadata": {"name": job_info["name"]}, "status": { @@ -227,11 +294,7 @@ def _build_jobs_items(jobs_data): "spec": { "completions": 1, "template": { - "spec": { - "containers": [ - {"image": job_info["image"]} - ] - } + "spec": pod_spec } } } @@ -251,6 +314,9 @@ def _generate_pod_suffix(): def _generate_pods_items(jobs_data): pods_items = [] + # GPU node assignment based on job GPU requirements + gpu_nodes = ['gpu-node-01', 'gpu-node-02', 'gpu-node-03'] + for job_info in jobs_data: # ~60% get 1 pod, ~30% get 2 pods, ~10% get 3 pods rand = random.random() @@ -262,12 +328,28 @@ def _generate_pods_items(jobs_data): phase = "Succeeded" else: phase = "Failed" + + # Assign node based on GPU requirements + gpu_req = job_info.get("gpu", 0) + if gpu_req > 0: + # Assign to a GPU node (prefer A100 for larger jobs) + if gpu_req >= 4: + 
# Node labels that may carry the GPU product name, in lookup priority order.
_GPU_TYPE_LABEL_KEYS = (
    'nvidia.com/gpu.product',
    'gpu.nvidia.com/product',
    'accelerator',
    'nvidia.com/gpu.machine',
    'node.kubernetes.io/instance-type',
)


def _load_items(raw_json):
    """Return the ``items`` list of a kubectl JSON listing, or [] if unusable."""
    try:
        payload = json.loads(raw_json)
    except (TypeError, ValueError):
        # Empty/None output or non-JSON text (e.g. kubectl printed an error).
        return []
    if not isinstance(payload, dict):
        return []
    items = payload.get('items')
    return items if isinstance(items, list) else []


def _node_gpu_count(capacity):
    """Return the GPU count from the first capacity key that mentions 'gpu'."""
    for key, value in capacity.items():
        if 'gpu' in key.lower():
            try:
                return int(value)
            except (TypeError, ValueError):
                return 0
    return 0


def _parse_gpu_nodes(nodes_json):
    """Build one record per GPU-bearing node from ``kubectl get nodes -o json``."""
    gpu_nodes = []
    for node in _load_items(nodes_json):
        try:
            metadata = node['metadata']
            node_name = metadata['name']
            labels = metadata.get('labels') or {}
            capacity = node.get('status', {}).get('capacity') or {}
        except (AttributeError, KeyError, TypeError):
            # Skip a malformed node instead of discarding all remaining nodes.
            continue

        gpu_count = _node_gpu_count(capacity)
        if gpu_count <= 0:
            continue

        gpu_type = 'GPU'  # generic label when no known product label exists
        for label_key in _GPU_TYPE_LABEL_KEYS:
            if label_key in labels:
                gpu_type = _shorten_gpu_name(labels[label_key]) or 'GPU'
                break

        gpu_nodes.append({
            'name': node_name,
            'gpu_type': gpu_type,
            'gpu_count': gpu_count,
            'allocated': 0,  # filled in from pod requests by the caller
        })
    return gpu_nodes


def _parse_gpu_requests(pods_json):
    """Sum GPU requests of Running/Pending pods.

    Returns:
        ``(total, by_node)`` where ``by_node`` maps node name to requested
        GPUs. Pods that are not scheduled yet are keyed under ``''``.
    """
    total = 0
    by_node = {}
    for pod in _load_items(pods_json):
        try:
            if pod['status'].get('phase') not in ('Running', 'Pending'):
                continue
            pod_spec = pod['spec']
            node_name = pod_spec.get('nodeName') or ''
            containers = pod_spec.get('containers') or []
        except (AttributeError, KeyError, TypeError):
            continue

        for container in containers:
            if not isinstance(container, dict):
                continue
            requests = (container.get('resources') or {}).get('requests') or {}
            for key, value in requests.items():
                if 'gpu' not in key.lower():
                    continue
                try:
                    count = int(value)
                except (TypeError, ValueError):
                    continue
                total += count
                by_node[node_name] = by_node.get(node_name, 0) + count
    return total, by_node


def get_gpu_info(ns, use_mock=False, mock_data=None):
    """Summarize GPU capacity and GPU requests across the cluster.

    Args:
        ns: Namespace being monitored. Not used by the queries here: nodes
            are listed cluster-wide and pods across all namespaces.
        use_mock: When true, return a fixed sample inventory and run no
            kubectl command.
        mock_data: Accepted for signature parity with the other fetchers;
            not read.

    Returns:
        dict with keys:
            ``nodes``: list of ``{'name', 'gpu_type', 'gpu_count', 'allocated'}``
            ``total_gpus``: sum of ``gpu_count`` over ``nodes``
            ``allocated_gpus``: GPUs requested by Running/Pending pods
                (includes Pending pods that have no node yet)
            ``gpu_types``: sorted list of distinct GPU type labels
            ``node_gpu_map``: node name -> GPU type label
    """
    if use_mock:
        return {
            'nodes': [
                {'name': 'gpu-node-01', 'gpu_type': 'H100-80GB', 'gpu_count': 8, 'allocated': 4},
                {'name': 'gpu-node-02', 'gpu_type': 'H100-80GB', 'gpu_count': 8, 'allocated': 0},
                {'name': 'gpu-node-03', 'gpu_type': 'A100-80GB', 'gpu_count': 8, 'allocated': 6},
                {'name': 'gpu-node-04', 'gpu_type': 'A100-80GB', 'gpu_count': 8, 'allocated': 2},
                {'name': 'gpu-node-05', 'gpu_type': 'A100-40GB', 'gpu_count': 4, 'allocated': 3},
            ],
            'total_gpus': 36,
            'allocated_gpus': 15,
            'gpu_types': ['H100-80GB', 'A100-80GB', 'A100-40GB'],
            'node_gpu_map': {
                'gpu-node-01': 'H100-80GB',
                'gpu-node-02': 'H100-80GB',
                'gpu-node-03': 'A100-80GB',
                'gpu-node-04': 'A100-80GB',
                'gpu-node-05': 'A100-40GB',
            }
        }

    gpu_nodes = _parse_gpu_nodes(run_cmd("kubectl get nodes -o json"))
    allocated_total, allocated_by_node = _parse_gpu_requests(
        run_cmd("kubectl get pods --all-namespaces -o json"))

    for node in gpu_nodes:
        node['allocated'] = allocated_by_node.get(node['name'], 0)

    return {
        'nodes': gpu_nodes,
        'total_gpus': sum(node['gpu_count'] for node in gpu_nodes),
        'allocated_gpus': allocated_total,
        # Sorted so the order is stable between refreshes.
        'gpu_types': sorted({node['gpu_type'] for node in gpu_nodes}),
        'node_gpu_map': {node['name']: node['gpu_type'] for node in gpu_nodes},
    }


def _shorten_gpu_name(name):
    """Shorten a GPU product name for display.

    Hyphens, underscores and runs of whitespace are treated as the same
    separator, so ``NVIDIA-GeForce-RTX-3090`` and ``NVIDIA GeForce RTX 3090``
    give the same result. Matching is case-insensitive.

    Args:
        name: Raw product string (e.g. a node label or nodeSelector value).
            ``None`` is treated as an empty string.

    Returns:
        A short label such as ``'A100-80GB'``, ``'T4'`` or ``'RTX 3090'``.
        Unrecognized names are upper-cased and cut to 10 characters plus
        ``'..'`` when longer than 12 characters.
    """
    import re  # local import: not known to be imported at module level

    name = re.sub(r'[\s_-]+', ' ', (name or '').upper()).strip()

    if 'A100' in name:
        if '80G' in name:
            return 'A100-80GB'
        if '40G' in name:
            return 'A100-40GB'
        return 'A100'
    if 'H100' in name:
        return 'H100-80GB' if '80G' in name else 'H100'
    if 'V100' in name:
        if '32G' in name:
            return 'V100-32GB'
        if '16G' in name:
            return 'V100-16GB'
        return 'V100'
    if 'T4' in name:
        return 'T4'
    if 'P100' in name:
        return 'P100'
    if 'P40' in name:
        return 'P40'
    if 'L40' in name:
        return 'L40S' if 'L40S' in name else 'L40'
    if 'RTX' in name:
        # Model may be plain digits (3090) or A-prefixed (A6000).
        match = re.search(r'RTX\s*(A?\d+)', name)
        return f'RTX {match.group(1)}' if match else 'RTX'

    if len(name) > 12:
        return name[:10] + '..'
    return name
def get_pod_logs(ns, pod_name, tail_lines=100, use_mock=False):
    """Fetch the most recent log lines of a pod.

    Args:
        ns: Namespace of the pod.
        pod_name: Name of the pod whose logs are requested.
        tail_lines: Number of lines to return, counted from the end of the
            log (same meaning as ``kubectl logs --tail``). ``0`` yields no
            lines; ``None`` or a negative value yields the whole log.
        use_mock: When true, return canned log text and run no command.

    Returns:
        The log text as one newline-joined string in mock mode; otherwise
        whatever ``run_cmd`` returns for the kubectl invocation.
    """
    show_all = tail_lines is None or tail_lines < 0

    if use_mock:
        log_messages = (
            "INFO: Starting application...",
            "INFO: Loading configuration from /etc/config/app.yaml",
            "INFO: Connecting to database...",
            "INFO: Database connection established",
            "INFO: Initializing worker threads...",
            "DEBUG: Worker pool size: 4",
            "INFO: Processing batch 1/10",
            "INFO: Processing batch 2/10",
            "WARNING: High memory usage detected (85%)",
            "INFO: Processing batch 3/10",
            "INFO: Processing batch 4/10",
            "DEBUG: Cache hit ratio: 0.87",
            "INFO: Processing batch 5/10",
            "ERROR: Failed to process item #42: timeout",
            "INFO: Retrying item #42...",
            "INFO: Processing batch 6/10",
            "INFO: Processing batch 7/10",
            "INFO: Processing batch 8/10",
            "DEBUG: Checkpoint saved",
            "INFO: Processing batch 9/10",
            "INFO: Processing batch 10/10",
            "INFO: All batches completed successfully",
            "INFO: Cleaning up resources...",
            "INFO: Application finished",
        )
        # Stamp every line first so a line keeps its timestamp no matter
        # how many lines are requested.
        stamped = [
            f"2026-01-21T10:{minute:02d}:00Z {msg}"
            for minute, msg in enumerate(log_messages)
        ]
        if show_all:
            return "\n".join(stamped)
        if tail_lines == 0:
            return ""  # stamped[-0:] would return everything
        return "\n".join(stamped[-tail_lines:])

    import shlex  # local import: only needed on the real-cluster path

    # run_cmd receives one command string (presumably executed by a shell —
    # confirm); quoting keeps odd namespace/pod values from being split or
    # interpreted. Valid Kubernetes names pass through quote() unchanged.
    cmd = f"kubectl -n {shlex.quote(str(ns))} logs {shlex.quote(str(pod_name))}"
    if not show_all:
        cmd += f" --tail={int(tail_lines)}"
    return run_cmd(cmd)
def generate_table(jobs, offset=0, max_rows=None, selected_index=0):
    """Render the jobs/pods table with one row highlighted.

    Args:
        jobs: Job records as accepted by ``build_row_index``.
        offset: Index of the first flattened row to show.
        max_rows: Maximum number of rows to show; a falsy value means no limit.
        selected_index: Index (in the flattened row list, not the visible
            window) of the row to draw in reverse video.

    Returns:
        ``(table, all_rows)``: the Rich table and the complete flattened
        row list produced by ``build_row_index``.
    """
    table = Table(box=box.SIMPLE_HEAD, expand=True, show_lines=False)
    table.add_column("Job / Pod Name", style="cyan", no_wrap=True)
    table.add_column("User", style="magenta")
    table.add_column("Status", justify="center")
    table.add_column("GPU", justify="center", style="yellow")
    table.add_column("Comp", justify="right")
    table.add_column("Duration", justify="right")

    all_rows = build_row_index(jobs)
    visible_rows = all_rows[offset:]
    if max_rows:
        visible_rows = visible_rows[:max_rows]

    for row_index, row in enumerate(visible_rows, start=offset):
        if row_index != selected_index:
            name_display = row['display_name']
        elif row['type'] == 'pod':
            # Pods can open the log viewer, so mark them with an arrow.
            name_display = f"[reverse]▶ {row['display_name'].strip()}[/reverse]"
        else:
            name_display = f"[reverse]{row['display_name']}[/reverse]"

        table.add_row(
            name_display,
            row['user'],
            row['status'],
            row['gpu'],
            row['completions'],
            row['duration']
        )

    return table, all_rows


def generate_cluster_resources(quota, gpu_info=None):
    """Render the "Cluster Quota" panel.

    Args:
        quota: Mapping with ``'cpu'``, ``'mem'`` and ``'gpu'`` entries, each
            holding a preformatted ``'str'`` value.
        gpu_info: Optional GPU inventory; when it has a non-empty ``'nodes'``
            list, one extra row per GPU type shows allocated/total, colored
            by utilization.

    Returns:
        A Rich ``Panel`` wrapping the two-column grid.
    """
    grid = Table.grid(expand=True)
    grid.add_column()
    grid.add_column(justify="right")

    grid.add_row("[bold]CPU[/bold]", quota['cpu']['str'])
    grid.add_row("[bold]MEM[/bold]", quota['mem']['str'])
    grid.add_row("[bold]GPU[/bold]", quota['gpu']['str'])

    nodes = gpu_info.get('nodes') if gpu_info else None
    if nodes:
        grid.add_row("", "")  # Spacer

        # Aggregate per GPU type, keeping first-seen order.
        usage_by_type = {}
        for node in nodes:
            usage = usage_by_type.setdefault(
                node['gpu_type'], {'total': 0, 'allocated': 0})
            usage['total'] += node['gpu_count']
            usage['allocated'] += node['allocated']

        for gpu_type, usage in usage_by_type.items():
            used = usage['allocated']
            total = usage['total']
            if total <= 0:
                color = "dim"
            else:
                pct = (used / total) * 100
                if pct >= 80:
                    color = "red"
                elif pct >= 50:
                    color = "yellow"
                else:
                    color = "green"
            grid.add_row(
                f" [cyan]{gpu_type}[/cyan]",
                f"[{color}]{used}/{total}[/{color}]"
            )

    return Panel(grid, title="Cluster Quota", border_style="blue")


def generate_log_viewer(logs, pod_name, scroll_offset=0, max_lines=None):
    """Render a scrollable, level-colored log panel for one pod.

    Log text is escaped before being wrapped in style tags, so brackets in
    a log line are shown literally instead of being parsed as Rich markup.

    Args:
        logs: Full log text; falsy input shows a placeholder line.
        pod_name: Pod name shown in the panel title.
        scroll_offset: Index of the first line to show; clamped to the
            available lines.
        max_lines: Maximum number of lines to show; a falsy value means
            no limit.

    Returns:
        A Rich ``Panel`` whose title carries the visible line range.
    """
    from rich.markup import escape  # local import: keeps this edit self-contained

    lines = logs.split('\n') if logs else ["No logs available"]
    total_lines = len(lines)

    # Clamp so an out-of-range offset cannot produce an empty view or an
    # inverted "start-end" range in the title.
    start = min(max(scroll_offset, 0), total_lines - 1)
    visible_lines = lines[start:]
    if max_lines:
        visible_lines = visible_lines[:max_lines]

    formatted_lines = []
    for line in visible_lines:
        line_upper = line.upper()
        if 'ERROR' in line_upper:
            style = "red"
        elif 'WARN' in line_upper:  # also covers WARNING
            style = "yellow"
        elif 'DEBUG' in line_upper:
            style = "dim"
        elif 'INFO' in line_upper:
            style = "green"
        else:
            style = None

        safe_line = escape(line)
        formatted_lines.append(
            f"[{style}]{safe_line}[/{style}]" if style else safe_line)

    log_text = "\n".join(formatted_lines)

    end = start + len(visible_lines)
    scroll_info = f" ({start + 1}-{end}/{total_lines})"

    return Panel(
        log_text,
        title=f"Logs: {escape(str(pod_name))}{scroll_info}",
        subtitle="[dim]↑/↓ Scroll | ESC/Backspace Close | r Refresh[/dim]",
        border_style="cyan",
        expand=True
    )
layout["header"].update(Panel( f"Kubernetes Monitor - Namespace: [bold green]{args.namespace}[/] {mode_str}", style="white on blue")) - layout["footer"].update(Panel("Press 'q' or Ctrl+C to exit", style="dim")) + layout["footer"].update(Panel( + "[cyan]↑/↓[/cyan] Navigate [cyan]Enter[/cyan] View Logs [cyan]q[/cyan] Quit", + style="dim")) old_settings = None if platform.system() != "Windows": @@ -453,10 +814,18 @@ def main(): last_fetch = 0 fetch_interval = 2 scroll_offset = 0 + selected_index = 0 # Track selected row + + # Log viewer state + viewing_logs = False + current_logs = "" + current_pod_name = "" + log_scroll_offset = 0 quota = get_quota(args.namespace, use_mock=args.mock, mock_data=mock_data) + gpu_info = get_gpu_info(args.namespace, use_mock=args.mock, mock_data=mock_data) jobs = get_jobs_pods(args.namespace, use_mock=args.mock, - mock_data=mock_data) + mock_data=mock_data, gpu_info=gpu_info) while True: # Input Handling - process all buffered input and use last nav key @@ -472,27 +841,52 @@ def main(): key = 'up' elif next2 == 'B': # Down arrow on Unix/Linux key = 'down' + else: + # Just ESC key (no bracket following) + key = 'escape' elif char.lower() == 'q': key = 'q' + elif char == '\n' or char == '\r': # Enter key + key = 'enter' + elif char == '\x7f' or char == '\x08': # Backspace + key = 'backspace' + elif char.lower() == 'r': + key = 'r' else: while msvcrt.kbhit(): key_input = msvcrt.getch() - if key_input == b'H': # Up arrow on Windows + if key_input == b'\xe0': # Extended key prefix on Windows + key_input = msvcrt.getch() + if key_input == b'H': # Up arrow on Windows + key = 'up' + elif key_input == b'P': # Down arrow on Windows + key = 'down' + elif key_input == b'H': # Up arrow on Windows (alternate) key = 'up' - elif key_input == b'P': # Down arrow on Windows + elif key_input == b'P': # Down arrow on Windows (alternate) key = 'down' + elif key_input == b'\r': # Enter key + key = 'enter' + elif key_input == b'\x1b': # Escape key + key = 
'escape' + elif key_input == b'\x08': # Backspace + key = 'backspace' else: decoded = key_input.decode('utf-8', errors='ignore').lower() if decoded == 'q': key = 'q' + elif decoded == 'r': + key = 'r' - if key == 'q': + # Handle quit + if key == 'q' and not viewing_logs: break cpu_total, cpu_per_core, mem, gpu = get_local_metrics() - # Calculate total rows needed for all jobs (1 row/job + 1 row/pod) - total_rows = sum(1 + len(job['pods']) for job in jobs) + # Build row index for selection tracking + all_rows = build_row_index(jobs) + total_rows = len(all_rows) # Calculate max visible rows (approximate based on available height) # Account for: @@ -500,32 +894,103 @@ def main(): available_height = console.height - 10 max_visible_rows = max(10, available_height) - # Calculate max scroll position with buffer to ensure - # last job's pods are visible - max_scroll = max(0, total_rows - max_visible_rows + 3) - - # Navigation - if key == 'up': - scroll_offset = max(0, scroll_offset - 1) - elif key == 'down': - scroll_offset = min(max_scroll, scroll_offset + 1) - - now = time.time() - if now - last_fetch > fetch_interval: - quota = get_quota(args.namespace, use_mock=args.mock, - mock_data=mock_data) - jobs = get_jobs_pods(args.namespace, use_mock=args.mock, - mock_data=mock_data) - last_fetch = now - - jobs_title = f"Jobs ({len(jobs)})" - - layout["cluster_resources"].update(generate_cluster_resources(quota)) - layout["local_resources"].update( - generate_local_resources(cpu_total, cpu_per_core, mem, gpu)) - layout["right"].update(Panel(generate_table( - jobs, offset=scroll_offset, max_rows=max_visible_rows), - title=jobs_title, border_style="green")) + if viewing_logs: + # Log viewer mode + log_lines = current_logs.split('\n') if current_logs else [] + max_log_scroll = max(0, len(log_lines) - max_visible_rows + 5) + + if key == 'escape' or key == 'backspace' or key == 'q': + viewing_logs = False + current_logs = "" + current_pod_name = "" + log_scroll_offset = 0 + elif key 
== 'up': + log_scroll_offset = max(0, log_scroll_offset - 1) + elif key == 'down': + log_scroll_offset = min(max_log_scroll, log_scroll_offset + 1) + elif key == 'r': + # Manual refresh logs + current_logs = get_pod_logs( + args.namespace, current_pod_name, + tail_lines=500, use_mock=args.mock + ) + + # Auto-refresh logs periodically + now = time.time() + if now - last_fetch > fetch_interval: + current_logs = get_pod_logs( + args.namespace, current_pod_name, + tail_lines=500, use_mock=args.mock + ) + last_fetch = now + + # Update layout with log viewer + layout["cluster_resources"].update(generate_cluster_resources(quota, gpu_info)) + layout["local_resources"].update( + generate_local_resources(cpu_total, cpu_per_core, mem, gpu)) + layout["right"].update(generate_log_viewer( + current_logs, current_pod_name, + scroll_offset=log_scroll_offset, + max_lines=max_visible_rows + )) + layout["footer"].update(Panel( + f"[cyan]↑/↓[/cyan] Scroll [cyan]r[/cyan] Refresh [cyan]ESC/Backspace[/cyan] Close [dim](auto-refresh {fetch_interval}s)[/dim] Viewing: [bold]{current_pod_name}[/bold]", + style="dim")) + else: + # Normal job/pod list mode + # Calculate max scroll position with buffer to ensure + # last job's pods are visible + max_scroll = max(0, total_rows - max_visible_rows + 3) + + # Navigation + if key == 'up': + selected_index = max(0, selected_index - 1) + # Auto-scroll to keep selection visible + if selected_index < scroll_offset: + scroll_offset = selected_index + elif key == 'down': + selected_index = min(total_rows - 1, selected_index + 1) + # Auto-scroll to keep selection visible + if selected_index >= scroll_offset + max_visible_rows: + scroll_offset = selected_index - max_visible_rows + 1 + elif key == 'enter': + # Open log viewer for selected pod + if total_rows > 0 and selected_index < len(all_rows): + selected_row = all_rows[selected_index] + if selected_row['type'] == 'pod' and selected_row['pod_name']: + viewing_logs = True + current_pod_name = 
selected_row['pod_name'] + log_scroll_offset = 0 + current_logs = get_pod_logs( + args.namespace, current_pod_name, + tail_lines=500, use_mock=args.mock + ) + + now = time.time() + if now - last_fetch > fetch_interval: + quota = get_quota(args.namespace, use_mock=args.mock, + mock_data=mock_data) + gpu_info = get_gpu_info(args.namespace, use_mock=args.mock, + mock_data=mock_data) + jobs = get_jobs_pods(args.namespace, use_mock=args.mock, + mock_data=mock_data, gpu_info=gpu_info) + last_fetch = now + + jobs_title = f"Jobs ({len(jobs)})" + + # Generate table with selection + table, _ = generate_table( + jobs, offset=scroll_offset, + max_rows=max_visible_rows, selected_index=selected_index + ) + + layout["cluster_resources"].update(generate_cluster_resources(quota, gpu_info)) + layout["local_resources"].update( + generate_local_resources(cpu_total, cpu_per_core, mem, gpu)) + layout["right"].update(Panel(table, title=jobs_title, border_style="green")) + layout["footer"].update(Panel( + "[cyan]↑/↓[/cyan] Navigate [cyan]Enter[/cyan] View Logs [cyan]q[/cyan] Quit", + style="dim")) time.sleep(0.1)