Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
07a3268
NCU profiling wrapper generation and execution
Jan 7, 2026
3c4b124
Refactor profiling components and add kernel_perf_util
Jan 7, 2026
11f4e79
Refactor profiling components and add kernel_perf_util
Jan 7, 2026
251f419
Refactor profiling components and add kernel_perf_util
Jan 7, 2026
b789660
update directory name and add package in pyproject
Jan 7, 2026
4d35d57
Remove kernel_perf_util directory
Jan 7, 2026
d871678
move gpu spec.py to future PR and fix import
Jan 7, 2026
db0c754
Add copyright header
Jan 7, 2026
cd29759
fix ruff
Jan 7, 2026
bbfa6cd
address previous comments
Jan 13, 2026
543453a
fix ruff
Jan 13, 2026
706c9cc
Add unified benchmarking module for kernel performance measurement
Jan 8, 2026
4febdd6
Introducing benchmarking infra for kernel performance
Jan 8, 2026
d92a7b7
fix ruff
Jan 9, 2026
2994315
fix ruff
Jan 9, 2026
1378fc3
address comments
Jan 14, 2026
45fec80
Diagnose module - prompt constructor
Jan 11, 2026
b640cde
Refactors the diagnose_prompt module into a modular architecture
Jan 13, 2026
e952123
fix diff issue
Jan 13, 2026
e7ba29a
fix ruff issue
Jan 13, 2026
72ac4d1
fix
Jan 15, 2026
e2c599e
fix ruff
Jan 15, 2026
d5e6edc
optimization prompt
Jan 13, 2026
054367f
add optimization orchestrator and add an API in the worker.py
Jan 14, 2026
8f7cce7
fix ruff
Jan 14, 2026
45ec33d
fix
Jan 14, 2026
dd55d1d
fix
Jan 15, 2026
04a4891
fix from e2e testing
Jan 18, 2026
f057055
integrating opt component into optimization worker. Add necessary uti…
Jan 18, 2026
7a0c656
refactor helper function and clean up comments
Jan 18, 2026
010c66c
fix missing arg of _call_llm
Jan 18, 2026
160774a
move bottleneck_analyzer to its own class in the opt_worker_component…
Jan 18, 2026
0535ff8
update ncu wrapper, profiler to capture the current kernel instead of…
Jan 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions kernel_perf_agent/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Kernel Performance Utilities

Low-level, reusable utilities for kernel optimization.
34 changes: 34 additions & 0 deletions kernel_perf_agent/kernel_opt/diagnose_prompt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Diagnose Prompt Module for Hardware Bottleneck Analysis.

This module provides prompt building utilities for the Judge LLM that
analyzes NCU profiling metrics to identify performance bottlenecks.
"""

from .gpu_specs import get_gpu_specs
from .judger_prompts import (
build_judge_optimization_prompt,
extract_judge_response,
validate_judge_response,
)

__all__ = [
"get_gpu_specs",
"build_judge_optimization_prompt",
"extract_judge_response",
"validate_judge_response",
]
142 changes: 142 additions & 0 deletions kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
GPU Specifications Database for Bottleneck Analysis

This module provides GPU hardware specifications needed for performance analysis
and bottleneck identification. It includes peak compute performance, memory bandwidth,
cache sizes, and SM counts for common NVIDIA GPUs.

"""

import subprocess
from typing import Any

from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import (
GPU_SPECS_DATABASE,
)

__all__ = ["GPU_SPECS_DATABASE", "query_gpu_name", "get_gpu_specs"]


def query_gpu_name() -> str | None:
"""
Query GPU name using nvidia-smi.

Returns:
GPU name string, or None if query fails
"""
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0:
# Take only the first GPU (nvidia-smi returns one line per GPU)
gpu_name = result.stdout.strip().split("\n")[0].strip()
return gpu_name
except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
pass
return None


def get_gpu_specs(gpu_name: str | None = None) -> dict[str, Any]:
    """
    Look up hardware specifications for a GPU, for bottleneck analysis.

    Resolution order: exact database key match, then case-insensitive
    substring match in either direction, then a warned fallback to the
    NVIDIA A100 entry. A defensive ``.copy()`` is always returned so
    callers cannot mutate the shared database.

    Args:
        gpu_name: GPU name to look up. When ``None``, the name is
            auto-detected with nvidia-smi via :func:`query_gpu_name`.

    Returns:
        Dictionary of specs (name, architecture, peak_fp32_tflops,
        peak_fp16_tflops, peak_bf16_tflops, peak_memory_bw_gbps,
        sm_count, max_threads_per_sm, l1_cache_kb, l2_cache_mb,
        memory_gb, memory_type).

    Examples:
        >>> specs = get_gpu_specs()  # Auto-detect
        >>> print(f"Peak BW: {specs['peak_memory_bw_gbps']} GB/s")

        >>> specs = get_gpu_specs("NVIDIA A100")
        >>> print(f"SM Count: {specs['sm_count']}")
    """
    resolved = gpu_name if gpu_name is not None else query_gpu_name()

    # Detection failed entirely: warn and hand back the A100 defaults.
    if resolved is None:
        print("⚠️ GPU auto-detection failed, using A100 specs as fallback")
        return GPU_SPECS_DATABASE["NVIDIA A100"].copy()

    # Fast path: the name is a database key as-is.
    exact = GPU_SPECS_DATABASE.get(resolved)
    if exact is not None:
        return exact.copy()

    # Fuzzy path: accept a match when either name contains the other,
    # case-insensitively (handles e.g. driver-reported suffixes).
    needle = resolved.lower()
    for candidate, specs in GPU_SPECS_DATABASE.items():
        haystack = candidate.lower()
        if needle in haystack or haystack in needle:
            print(f"ℹ️ Matched '{resolved}' to '{candidate}' (fuzzy match)")
            return specs.copy()

    # Nothing matched: warn loudly and fall back to the A100 entry.
    print(f"⚠️ Unknown GPU: '{resolved}', using A100 specs as fallback")
    print(f" Available GPUs: {', '.join(GPU_SPECS_DATABASE.keys())}")
    return GPU_SPECS_DATABASE["NVIDIA A100"].copy()


if __name__ == "__main__":
    # Smoke-test entry point: detect the local GPU and print its specs
    # plus the full list of database entries.
    print("GPU Specifications Module")
    print("=" * 60)

    # Auto-detect GPU
    detected_name = query_gpu_name()
    if detected_name:
        print(f"\nDetected GPU: {detected_name}")
    else:
        print("\nNo GPU detected (nvidia-smi not available)")
        # `raise SystemExit` instead of the site-module `exit()` builtin,
        # which is intended for the interactive REPL and is absent under
        # `python -S`.
        raise SystemExit(0)

    # Get specs (falls back to A100 on lookup failure)
    specs = get_gpu_specs()
    print(
        f"\nUsing specs for: {specs['name']} ({specs.get('architecture', 'Unknown')})"
    )
    print(f" - Peak Memory Bandwidth: {specs['peak_memory_bw_gbps']} GB/s")
    print(f" - Peak FP32 Performance: {specs['peak_fp32_tflops']} TFLOPS")
    print(f" - SM Count: {specs['sm_count']}")

    # Show all available GPUs; loop variable renamed so it no longer
    # shadows the `gpu_name` parameter naming used by get_gpu_specs.
    print(f"\n{'=' * 60}")
    print("Available GPU specifications in database:")
    for entry_name in sorted(GPU_SPECS_DATABASE.keys()):
        print(f" - {entry_name}")
82 changes: 82 additions & 0 deletions kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
GPU Specifications Database

This module contains the GPU hardware specifications database used for
performance analysis and bottleneck identification. Separated into its
own file to allow easier module overriding.

Sources: NVIDIA official specifications, manufacturer datasheets
"""

# Keyed by the marketing name reported by `nvidia-smi --query-gpu=name`;
# consumed by get_gpu_specs(), which also falls back to the "NVIDIA A100"
# entry when detection or lookup fails.
#
# Per-entry fields: peak compute (TFLOPS by dtype), memory bandwidth (GB/s),
# SM count/occupancy limits, cache sizes, and memory capacity/type.
# NOTE(review): figures appear to be vendor datasheet numbers for one
# specific SKU per family (e.g. A100 40GB at 1555 GB/s, H100 SXM at
# 3352 GB/s) — confirm against the intended board variant, since PCIe/SXM
# and capacity variants differ.
GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
    "NVIDIA A100": {
        "name": "NVIDIA A100",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        # FP16/BF16 figures are presumably Tensor Core peak, not CUDA-core
        # FP16 — TODO confirm and document whether sparsity is included.
        "peak_fp16_tflops": 312.0,
        "peak_bf16_tflops": 312.0,
        "peak_memory_bw_gbps": 1555,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 40,
        "memory_type": "HBM2e",
    },
    "NVIDIA H100": {
        "name": "NVIDIA H100",
        "architecture": "Hopper",
        "peak_fp32_tflops": 51.0,
        "peak_fp16_tflops": 989.0,
        "peak_bf16_tflops": 989.0,
        "peak_memory_bw_gbps": 3352,
        "sm_count": 132,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 80,
        "memory_type": "HBM3",
    },
    "NVIDIA RTX 4090": {
        "name": "NVIDIA RTX 4090",
        "architecture": "Ada Lovelace",
        "peak_fp32_tflops": 82.6,
        "peak_fp16_tflops": 165.0,
        "peak_bf16_tflops": 165.0,
        "peak_memory_bw_gbps": 1008,
        "sm_count": 128,
        # Consumer parts cap occupancy at 1536 threads/SM vs 2048 on
        # datacenter parts.
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 72,
        "memory_gb": 24,
        "memory_type": "GDDR6X",
    },
    "NVIDIA RTX 5080": {
        "name": "NVIDIA RTX 5080",
        "architecture": "Blackwell",
        "peak_fp32_tflops": 57.0,
        "peak_fp16_tflops": 114.0,
        "peak_bf16_tflops": 114.0,
        "peak_memory_bw_gbps": 960,
        "sm_count": 84,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 64,
        "memory_gb": 16,
        "memory_type": "GDDR7",
    },
}
Loading