From e6ea6db38a036f2f11a89ce39494022d53a2eca8 Mon Sep 17 00:00:00 2001 From: Alexis Girault Date: Mon, 31 Mar 2025 12:58:07 -0400 Subject: [PATCH 1/4] build: use sm list from nvcc Signed-off-by: Alexis Girault --- scripts/get_cuda_gencode.sh | 94 ++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/scripts/get_cuda_gencode.sh b/scripts/get_cuda_gencode.sh index 666d604..7c51526 100755 --- a/scripts/get_cuda_gencode.sh +++ b/scripts/get_cuda_gencode.sh @@ -17,50 +17,66 @@ if [ "$CUDA_VERSION_MAJOR" -lt 8 ]; then exit 1 fi -# Initialize with Pascal architecture -COMPUTE_LIST="60 61 62" -SM_LIST="60 61 62" - -# Add Volta (7.0) if CUDA >= 9.0 -if [ "$CUDA_VERSION_MAJOR" -ge 9 ]; then - COMPUTE_LIST="$COMPUTE_LIST 70 72" - SM_LIST="$SM_LIST 70 72" -fi +# Get the list of supported SM architectures (sm_XX) from nvcc +# Filter to only include SM >= 60 (Pascal) +SUPPORTED_SM_LIST=$("$NVCC" --list-gpu-code 2>/dev/null | sed 's/sm_//' | awk '$1 >= 60') -# Add Turing (7.5) if CUDA >= 10.0 -if [ "$CUDA_VERSION_MAJOR" -ge 10 ]; then - COMPUTE_LIST="$COMPUTE_LIST 75" - SM_LIST="$SM_LIST 75" -fi +# Initialize empty lists +COMPUTE_LIST="" +SM_LIST="" -# Add Ampere (8.0, 8.6, 8.7) if CUDA >= 11.1 -if [ "$CUDA_VERSION_MAJOR" -ge 11 ] && [ "$CUDA_VERSION_MINOR" -ge 1 ]; then - COMPUTE_LIST="$COMPUTE_LIST 80 86 87" - SM_LIST="$SM_LIST 80 86 87" -fi +if [ -n "$SUPPORTED_SM_LIST" ]; then + # Use the list of architectures supported by nvcc + SM_LIST=$SUPPORTED_SM_LIST + COMPUTE_LIST=$SUPPORTED_SM_LIST +else + echo "Warning: Could not determine supported architectures from nvcc, falling back to version-based detection" >&2 -# Add Ada Lovelace (8.9) if CUDA >= 11.8 -if [ "$CUDA_VERSION_MAJOR" -ge 11 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then - COMPUTE_LIST="$COMPUTE_LIST 89" - SM_LIST="$SM_LIST 89" -fi + # Initialize with Pascal architecture + COMPUTE_LIST="60 61 62" + SM_LIST="60 61 62" -# Add Hopper (9.0) if CUDA >= 12.0 -if [ "$CUDA_VERSION_MAJOR" -ge 12 ]; then - COMPUTE_LIST="$COMPUTE_LIST 90" - SM_LIST="$SM_LIST 90" -fi + # Add Volta (7.0) if CUDA >= 9.0 + if [ "$CUDA_VERSION_MAJOR" -ge 9 ]; then + COMPUTE_LIST="$COMPUTE_LIST 70 72" + SM_LIST="$SM_LIST 70 72" + fi -# Add Blackwell (10.0) if CUDA >= 12.6 -if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 6 ]; then - COMPUTE_LIST="$COMPUTE_LIST 100" - SM_LIST="$SM_LIST 100" -fi + # Add Turing (7.5) if CUDA >= 10.0 + if [ "$CUDA_VERSION_MAJOR" -ge 10 ]; then + COMPUTE_LIST="$COMPUTE_LIST 75" + SM_LIST="$SM_LIST 75" + fi + + # Add Ampere (8.0, 8.6, 8.7) if CUDA >= 11.1 + if [ "$CUDA_VERSION_MAJOR" -ge 11 ] && [ "$CUDA_VERSION_MINOR" -ge 1 ]; then + COMPUTE_LIST="$COMPUTE_LIST 80 86 87" + SM_LIST="$SM_LIST 80 86 87" + fi + + # Add Ada Lovelace (8.9) if CUDA >= 11.8 + if [ "$CUDA_VERSION_MAJOR" -ge 11 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then + COMPUTE_LIST="$COMPUTE_LIST 89" + SM_LIST="$SM_LIST 89" + fi + + # Add Hopper (9.0) if CUDA >= 12.0 + if [ "$CUDA_VERSION_MAJOR" -ge 12 ]; then + COMPUTE_LIST="$COMPUTE_LIST 90" + SM_LIST="$SM_LIST 90" + fi + + # Add Blackwell (10.0) if CUDA >= 12.6 + if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 6 ]; then + COMPUTE_LIST="$COMPUTE_LIST 100" + SM_LIST="$SM_LIST 100" + fi -# Add Blackwell (12.0) if CUDA >= 12.8 -if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then - COMPUTE_LIST="$COMPUTE_LIST 120" - SM_LIST="$SM_LIST 120" + # Add Blackwell (12.0) if CUDA >= 12.8 + if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then + COMPUTE_LIST="$COMPUTE_LIST 120" + SM_LIST="$SM_LIST 120" + fi fi # Generate NVCC flags @@ -73,4 +89,4 @@ for sm in $SM_LIST; do GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$sm,code=sm_$sm" done -echo "$GENCODE_FLAGS" \ No newline at end of file +echo "$GENCODE_FLAGS" From 72fa69c78d659d465d6b7a04f16f8ec373eacf0b Mon Sep 17 00:00:00 2001 From: Alexis Girault Date: Mon, 31 Mar 2025 13:03:31 -0400 Subject: [PATCH 2/4] build: correct blackwell sm use - 10.0 is from CTK 12.8+ - 10.1 was missing Signed-off-by: Alexis Girault --- scripts/get_cuda_gencode.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/scripts/get_cuda_gencode.sh b/scripts/get_cuda_gencode.sh index 7c51526..ee411b6 100755 --- a/scripts/get_cuda_gencode.sh +++ b/scripts/get_cuda_gencode.sh @@ -66,16 +66,10 @@ else SM_LIST="$SM_LIST 90" fi - # Add Blackwell (10.0) if CUDA >= 12.6 + # Add Blackwell (10.0, 10.1, 12.0) if CUDA >= 12.8 if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 6 ]; then - COMPUTE_LIST="$COMPUTE_LIST 100" - SM_LIST="$SM_LIST 100" - fi - - # Add Blackwell (12.0) if CUDA >= 12.8 - if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then - COMPUTE_LIST="$COMPUTE_LIST 120" - SM_LIST="$SM_LIST 120" + COMPUTE_LIST="$COMPUTE_LIST 100 101 120" + SM_LIST="$SM_LIST 100 101 120" fi fi From c521788ac86c057f79778bc48477b85df0e399b3 Mon Sep 17 00:00:00 2001 From: Alexis Girault Date: Mon, 31 Mar 2025 13:09:24 -0400 Subject: [PATCH 3/4] build: consolidate arch lists for ptx and sass The list was the same Signed-off-by: Alexis Girault --- scripts/get_cuda_gencode.sh | 43 ++++++++++++------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/scripts/get_cuda_gencode.sh b/scripts/get_cuda_gencode.sh index ee411b6..c59969b 100755 --- a/scripts/get_cuda_gencode.sh +++ b/scripts/get_cuda_gencode.sh @@ -19,68 +19,53 @@ fi # Get the list of supported SM architectures (sm_XX) from nvcc # Filter to only include SM >= 60 (Pascal) -SUPPORTED_SM_LIST=$("$NVCC" --list-gpu-code 2>/dev/null | sed 's/sm_//' | awk '$1 >= 60') +ARCH_LIST=$("$NVCC" --list-gpu-code 2>/dev/null | sed 's/sm_//' | awk '$1 >= 60') -# Initialize empty lists -COMPUTE_LIST="" -SM_LIST="" - -if [ -n "$SUPPORTED_SM_LIST" ]; then - # Use the list of architectures supported by nvcc - SM_LIST=$SUPPORTED_SM_LIST - COMPUTE_LIST=$SUPPORTED_SM_LIST -else +if [ -z "$ARCH_LIST" ]; then echo "Warning: Could not determine supported architectures from nvcc, falling back to version-based detection" >&2 # Initialize with Pascal architecture - COMPUTE_LIST="60 61 62" - SM_LIST="60 61 62" + ARCH_LIST="60 61 62" # Add Volta (7.0) if CUDA >= 9.0 if [ "$CUDA_VERSION_MAJOR" -ge 9 ]; then - COMPUTE_LIST="$COMPUTE_LIST 70 72" - SM_LIST="$SM_LIST 70 72" + ARCH_LIST="$ARCH_LIST 70 72" fi # Add Turing (7.5) if CUDA >= 10.0 if [ "$CUDA_VERSION_MAJOR" -ge 10 ]; then - COMPUTE_LIST="$COMPUTE_LIST 75" - SM_LIST="$SM_LIST 75" + ARCH_LIST="$ARCH_LIST 75" fi # Add Ampere (8.0, 8.6, 8.7) if CUDA >= 11.1 if [ "$CUDA_VERSION_MAJOR" -ge 11 ] && [ "$CUDA_VERSION_MINOR" -ge 1 ]; then - COMPUTE_LIST="$COMPUTE_LIST 80 86 87" - SM_LIST="$SM_LIST 80 86 87" + ARCH_LIST="$ARCH_LIST 80 86 87" fi # Add Ada Lovelace (8.9) if CUDA >= 11.8 if [ "$CUDA_VERSION_MAJOR" -ge 11 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then - COMPUTE_LIST="$COMPUTE_LIST 89" - SM_LIST="$SM_LIST 89" + ARCH_LIST="$ARCH_LIST 89" fi # Add Hopper (9.0) if CUDA >= 12.0 if [ "$CUDA_VERSION_MAJOR" -ge 12 ]; then - COMPUTE_LIST="$COMPUTE_LIST 90" - SM_LIST="$SM_LIST 90" + ARCH_LIST="$ARCH_LIST 90" fi # Add Blackwell (10.0, 10.1, 12.0) if CUDA >= 12.8 - if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 6 ]; then - COMPUTE_LIST="$COMPUTE_LIST 100 101 120" - SM_LIST="$SM_LIST 100 101 120" + if [ "$CUDA_VERSION_MAJOR" -ge 12 ] && [ "$CUDA_VERSION_MINOR" -ge 8 ]; then + ARCH_LIST="$ARCH_LIST 100 101 120" fi fi # Generate NVCC flags GENCODE_FLAGS="" -for compute in $COMPUTE_LIST; do - GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$compute,code=compute_$compute" +for arch in $ARCH_LIST; do + GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$arch,code=compute_$arch" done -for sm in $SM_LIST; do - GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$sm,code=sm_$sm" +for arch in $ARCH_LIST; do + GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$arch,code=sm_$arch" done echo "$GENCODE_FLAGS" From d1fde74a3f65ab59cf1a8958aa41e46505427175 Mon Sep 17 00:00:00 2001 From: Alexis Girault Date: Mon, 31 Mar 2025 13:14:26 -0400 Subject: [PATCH 4/4] build: only build ptx for last arch Signed-off-by: Alexis Girault --- scripts/get_cuda_gencode.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/get_cuda_gencode.sh b/scripts/get_cuda_gencode.sh index c59969b..946ea28 100755 --- a/scripts/get_cuda_gencode.sh +++ b/scripts/get_cuda_gencode.sh @@ -60,12 +60,14 @@ fi # Generate NVCC flags GENCODE_FLAGS="" -for arch in $ARCH_LIST; do - GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$arch,code=compute_$arch" -done +# Generate SM-specific code for all architectures for arch in $ARCH_LIST; do GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$arch,code=sm_$arch" done +# Generate PTX code only for the latest architecture +LATEST_ARCH=$(echo "$ARCH_LIST" | tr ' ' '\n' | sort -n | tail -1) +GENCODE_FLAGS="$GENCODE_FLAGS -gencode arch=compute_$LATEST_ARCH,code=compute_$LATEST_ARCH" + echo "$GENCODE_FLAGS"