[Optimization 3/n] Add Diagnosis Module (Prompt Builder for Hardware Bottleneck) #73
Changes from all commits (22 commits)
`@@ -0,0 +1,3 @@` (new file; appears to be a README for the utilities package)

```markdown
# Kernel Performance Utilities

Low-level, reusable utilities for kernel optimization.
```
`@@ -0,0 +1,34 @@` (new file; likely the `diagnose_prompt` package `__init__.py`, judging by the relative imports)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Diagnose Prompt Module for Hardware Bottleneck Analysis.

This module provides prompt-building utilities for the Judge LLM that
analyzes NCU profiling metrics to identify performance bottlenecks.
"""

from .gpu_specs import get_gpu_specs
from .judger_prompts import (
    build_judge_optimization_prompt,
    extract_judge_response,
    validate_judge_response,
)

__all__ = [
    "get_gpu_specs",
    "build_judge_optimization_prompt",
    "extract_judge_response",
    "validate_judge_response",
]
```
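For orientation, a minimal usage sketch of the re-exported helpers. The package path is inferred from this PR's imports; only `get_gpu_specs` is exercised, since the `judger_prompts` signatures do not appear in this diff:

```python
# Minimal usage sketch; the package path is inferred from this PR's imports.
from kernel_perf_agent.kernel_opt.diagnose_prompt import get_gpu_specs

specs = get_gpu_specs()  # auto-detects via nvidia-smi; falls back to A100 specs
print(specs["name"], specs["peak_memory_bw_gbps"], "GB/s")
```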
`@@ -0,0 +1,142 @@` (new file; likely `gpu_specs.py`, given the `__init__.py` import above)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
GPU Specifications Database for Bottleneck Analysis.

This module provides the GPU hardware specifications needed for performance
analysis and bottleneck identification: peak compute performance, memory
bandwidth, cache sizes, and SM counts for common NVIDIA GPUs.
"""

import subprocess
from typing import Any

from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import (
    GPU_SPECS_DATABASE,
)

__all__ = ["GPU_SPECS_DATABASE", "query_gpu_name", "get_gpu_specs"]


def query_gpu_name() -> str | None:
    """
    Query the GPU name using nvidia-smi.

    Returns:
        GPU name string, or None if the query fails.
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if result.returncode == 0:
            # Take only the first GPU (nvidia-smi prints one line per GPU)
            return result.stdout.strip().split("\n")[0].strip()
    except (subprocess.TimeoutExpired, OSError):
        # nvidia-smi missing from PATH, not executable, or timed out
        pass
    return None


def get_gpu_specs(gpu_name: str | None = None) -> dict[str, Any]:
    """
    Get GPU specifications for bottleneck analysis.

    Returns the hardware specifications needed for performance analysis,
    including peak compute performance, memory bandwidth, cache sizes,
    and SM counts.

    Args:
        gpu_name: GPU name (if None, auto-detect with nvidia-smi).

    Returns:
        Dictionary with GPU specifications containing:
            - name: GPU name
            - architecture: GPU architecture (e.g., "Ampere", "Hopper")
            - peak_fp32_tflops: peak FP32 compute performance in TFLOPS
            - peak_fp16_tflops: peak FP16 compute performance in TFLOPS
            - peak_bf16_tflops: peak BF16 compute performance in TFLOPS (0 if unsupported)
            - peak_memory_bw_gbps: peak memory bandwidth in GB/s
            - sm_count: number of streaming multiprocessors
            - max_threads_per_sm: maximum threads per SM
            - l1_cache_kb: L1 cache size in KB per SM
            - l2_cache_mb: total L2 cache size in MB
            - memory_gb: total GPU memory in GB
            - memory_type: memory type (e.g., "HBM2e", "GDDR6X")

    Examples:
        >>> specs = get_gpu_specs()  # Auto-detect
        >>> print(f"Peak BW: {specs['peak_memory_bw_gbps']} GB/s")

        >>> specs = get_gpu_specs("NVIDIA A100")
        >>> print(f"SM Count: {specs['sm_count']}")
    """
    # Auto-detect if not provided
    if gpu_name is None:
        gpu_name = query_gpu_name()

    # Return default if detection failed
    if gpu_name is None:
        print("⚠️ GPU auto-detection failed, using A100 specs as fallback")
        return GPU_SPECS_DATABASE["NVIDIA A100"].copy()

    # Try exact match
    if gpu_name in GPU_SPECS_DATABASE:
        return GPU_SPECS_DATABASE[gpu_name].copy()

    # Try fuzzy match: check whether either name contains the other
    gpu_name_lower = gpu_name.lower()
    for key, specs in GPU_SPECS_DATABASE.items():
        key_lower = key.lower()
        if gpu_name_lower in key_lower or key_lower in gpu_name_lower:
            print(f"ℹ️ Matched '{gpu_name}' to '{key}' (fuzzy match)")
            return specs.copy()
```

(continued below, after the review thread)
Comment on lines +103 to +109 (the fuzzy-match block):

Contributor: Curious if you've encountered this case before?

Author: Sometimes I'll just put "a100" or "h100" in my optimization workflow. What do you think, should we force the GPU name input to match exactly?

Contributor: Enum it.
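A minimal sketch of the reviewer's `Enum` suggestion, under stated assumptions: the four database keys from this PR, a hypothetical `SupportedGPU` enum and `get_gpu_specs_strict` helper that are not part of the change:

```python
from enum import Enum

# Module path inferred from this PR's imports.
from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import (
    GPU_SPECS_DATABASE,
)


class SupportedGPU(Enum):
    """Hypothetical enum of the database keys; not part of this PR."""

    A100 = "NVIDIA A100"
    H100 = "NVIDIA H100"
    RTX_4090 = "NVIDIA RTX 4090"
    RTX_5080 = "NVIDIA RTX 5080"


def get_gpu_specs_strict(gpu: SupportedGPU) -> dict[str, object]:
    # Exact match by construction: the enum value is the database key,
    # so a typo fails at the call site instead of fuzzy-matching.
    return GPU_SPECS_DATABASE[gpu.value].copy()


# Usage: get_gpu_specs_strict(SupportedGPU.H100)
```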
```python
    # gpu_specs.py, continued: fallback to A100 specs with a warning
    print(f"⚠️ Unknown GPU: '{gpu_name}', using A100 specs as fallback")
    print(f"   Available GPUs: {', '.join(GPU_SPECS_DATABASE.keys())}")
    return GPU_SPECS_DATABASE["NVIDIA A100"].copy()


if __name__ == "__main__":
    print("GPU Specifications Module")
    print("=" * 60)

    # Auto-detect GPU
    detected_name = query_gpu_name()
    if detected_name:
        print(f"\nDetected GPU: {detected_name}")
    else:
        print("\nNo GPU detected (nvidia-smi not available)")
        raise SystemExit(0)

    # Get specs
    specs = get_gpu_specs()
    print(
        f"\nUsing specs for: {specs['name']} ({specs.get('architecture', 'Unknown')})"
    )
    print(f"  - Peak Memory Bandwidth: {specs['peak_memory_bw_gbps']} GB/s")
    print(f"  - Peak FP32 Performance: {specs['peak_fp32_tflops']} TFLOPS")
    print(f"  - SM Count: {specs['sm_count']}")

    # Show all available GPUs
    print(f"\n{'=' * 60}")
    print("Available GPU specifications in database:")
    for gpu_name in sorted(GPU_SPECS_DATABASE.keys()):
        print(f"  - {gpu_name}")
```
`@@ -0,0 +1,82 @@` (new file; likely `gpu_specs_database.py`, the module imported above)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
GPU Specifications Database

This module contains the GPU hardware specifications database used for
performance analysis and bottleneck identification. Separated into its
own file to allow easier module overriding.

Sources: NVIDIA official specifications, manufacturer datasheets.
"""

GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
    "NVIDIA A100": {
        "name": "NVIDIA A100",
        "architecture": "Ampere",
        "peak_fp32_tflops": 19.5,
        "peak_fp16_tflops": 312.0,
        "peak_bf16_tflops": 312.0,
        "peak_memory_bw_gbps": 1555,
        "sm_count": 108,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 192,
        "l2_cache_mb": 40,
        "memory_gb": 40,
        "memory_type": "HBM2e",
    },
    "NVIDIA H100": {
        "name": "NVIDIA H100",
        "architecture": "Hopper",
        "peak_fp32_tflops": 51.0,
        "peak_fp16_tflops": 989.0,
        "peak_bf16_tflops": 989.0,
        "peak_memory_bw_gbps": 3352,
        "sm_count": 132,
        "max_threads_per_sm": 2048,
        "l1_cache_kb": 256,
        "l2_cache_mb": 50,
        "memory_gb": 80,
        "memory_type": "HBM3",
    },
    "NVIDIA RTX 4090": {
        "name": "NVIDIA RTX 4090",
        "architecture": "Ada Lovelace",
        "peak_fp32_tflops": 82.6,
        "peak_fp16_tflops": 165.0,
        "peak_bf16_tflops": 165.0,
        "peak_memory_bw_gbps": 1008,
        "sm_count": 128,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 72,
        "memory_gb": 24,
        "memory_type": "GDDR6X",
    },
    "NVIDIA RTX 5080": {
        "name": "NVIDIA RTX 5080",
        "architecture": "Blackwell",
        "peak_fp32_tflops": 57.0,
        "peak_fp16_tflops": 114.0,
        "peak_bf16_tflops": 114.0,
        "peak_memory_bw_gbps": 960,
        "sm_count": 84,
        "max_threads_per_sm": 1536,
        "l1_cache_kb": 128,
        "l2_cache_mb": 64,
        "memory_gb": 16,
        "memory_type": "GDDR7",
    },
}
```
Comment thread on the A100 fallback:

Contributor: Should we fall back to A100? Or does returning an empty dict make more sense?

Author: I agree returning an empty dict is cleaner, but it would also lead to a KeyError in the optimization flow. Should we decide to disable optimization when no gpu_name is found?

Contributor: I think that makes sense; if there are setup/detection issues, then we shouldn't optimize.
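If the thread lands on failing fast, one possible shape for that behavior is sketched below. This is not part of the PR; `get_gpu_specs_or_fail` is a hypothetical name, and the sketch reuses `query_gpu_name` and `GPU_SPECS_DATABASE` from the diff above:

```python
# Hypothetical variant reflecting the review discussion (not in the PR):
# raise instead of silently substituting A100 specs, so the optimization
# flow can skip kernels it cannot characterize.
def get_gpu_specs_or_fail(gpu_name: str | None = None) -> dict[str, object]:
    gpu_name = gpu_name or query_gpu_name()
    if gpu_name is None:
        raise RuntimeError("GPU detection failed; disabling optimization")
    if gpu_name not in GPU_SPECS_DATABASE:
        raise KeyError(f"Unsupported GPU: {gpu_name!r}")
    return GPU_SPECS_DATABASE[gpu_name].copy()
```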