From 472f3a7fd9b2a16cf9801ecf70210e11376710d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Wed, 17 Dec 2025 09:33:57 +0000
Subject: [PATCH 1/3] Improve CUDA capability handling

We computed a kernel's capabilities by taking the loose intersection of
the stated kernel capabilities (or the default) and the capabilities
reported to be supported by CMake/Torch. However, this led to issues
with e.g. capability 8.9, which is no longer in these lists, even
though it is fine to compile for.

To solve this issue, we now ignore the capabilities reported by
CMake/Torch and use our own list of capabilities instead. This list
contains all capabilities supported by a given CUDA version, minus some
very old capabilities that Torch does not support anyway. This behavior
is enabled through the new `BUILD_ALL_SUPPORTED_ARCHS` CMake option
(which is the default for the Nix and Windows builders).

When `BUILD_ALL_SUPPORTED_ARCHS` is not set, we try to detect the
capability of the user's CUDA GPU. This speeds up development, since
one then only has to compile for a single capability. If detection
fails for some reason, we fall back to using all capabilities, as if
`BUILD_ALL_SUPPORTED_ARCHS` were set.
---
 build2cmake/src/templates/cuda/kernel.cmake   |  2 +-
 build2cmake/src/templates/cuda/preamble.cmake | 24 ++++--
 build2cmake/src/templates/utils.cmake         | 77 +++++++++++++++----
 build2cmake/src/torch/cuda.rs                 |  8 --
 lib/torch-extension/arch.nix                  |  1 +
 pkgs/build2cmake/default.nix                  |  1 -
 scripts/windows/builder.ps1                   |  3 +
 7 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/build2cmake/src/templates/cuda/kernel.cmake b/build2cmake/src/templates/cuda/kernel.cmake
index 733492a5..04d67d2c 100644
--- a/build2cmake/src/templates/cuda/kernel.cmake
+++ b/build2cmake/src/templates/cuda/kernel.cmake
@@ -18,7 +18,7 @@ if(GPU_LANG STREQUAL "CUDA")
 {% if cuda_capabilities %}
   cuda_archs_loose_intersection({{kernel_name}}_ARCHS "{{ cuda_capabilities|join(";") }}" "${CUDA_ARCHS}")
 {% else %}
-  cuda_archs_loose_intersection({{kernel_name}}_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
+  set({{kernel_name}}_ARCHS "${CUDA_KERNEL_ARCHS}")
 {% endif %}
   message(STATUS "Capabilities for kernel {{kernel_name}}: {{ '${' + kernel_name + '_ARCHS}'}}")
   set_gencode_flags_for_srcs(SRCS {{'"${' + kernel_name + '_SRC}"'}} CUDA_ARCHS "{{ '${' + kernel_name + '_ARCHS}'}}")
diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake
index 05024e06..a87471df 100644
--- a/build2cmake/src/templates/cuda/preamble.cmake
+++ b/build2cmake/src/templates/cuda/preamble.cmake
@@ -9,8 +9,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR})  # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
-set(CUDA_SUPPORTED_ARCHS "{{ cuda_supported_archs }}")
-
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
@@ -50,6 +48,8 @@ if (NOT TARGET_DEVICE STREQUAL "cuda" AND
     return()
 endif()
 
+option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off)
+
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
   set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX")
@@ -91,12 +91,20 @@ endif()
 
 if(GPU_LANG STREQUAL "CUDA")
   clear_cuda_arches(CUDA_ARCH_FLAGS)
-  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
-  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
-  # Filter the target architectures by the supported supported archs
-  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
-  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+  cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+  message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}")
+
+  if(BUILD_ALL_SUPPORTED_ARCHS)
+    set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+  else()
+    try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")")
+    if(NOT SUCCESS)
+      message(WARNING "Failed to detect CUDA capability, using default capabilities.")
+      set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+    endif()
+  endif()
+
+  message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}")
 
   if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA")
     list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
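
The selection logic above condenses to the following flow (an illustrative
sketch, not the template itself; the architecture list shown is the
CUDA >= 13.0 default from the hunk above, and `try_run_python` is the
helper added to utils.cmake below):

    option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off)
    set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX")

    if(BUILD_ALL_SUPPORTED_ARCHS)
      # Builders (Nix, Windows) compile for every supported capability.
      set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
    else()
      # Development builds: ask torch for the capability of the local GPU.
      try_run_python(CUDA_KERNEL_ARCHS SUCCESS
        "import torch; cc=torch.cuda.get_device_capability(); print(f'{cc[0]}.{cc[1]}')")
      if(NOT SUCCESS)
        # No GPU or torch not importable: fall back to all capabilities.
        set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
      endif()
    endif()
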
diff --git a/build2cmake/src/templates/utils.cmake b/build2cmake/src/templates/utils.cmake
index d4d4cef2..bdcea367 100644
--- a/build2cmake/src/templates/utils.cmake
+++ b/build2cmake/src/templates/utils.cmake
@@ -42,6 +42,29 @@ function (run_python OUT EXPR ERR_MSG)
   set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
 endfunction()
 
+#
+# Run `EXPR` in python. The standard output of python is stored in `OUT` and
+# has trailing whitespace stripped. If an error is encountered when running
+# python, `SUCCESS` is set to FALSE. If successful, `SUCCESS` is set to TRUE.
+#
+function (try_run_python OUT SUCCESS EXPR)
+  execute_process(
+    COMMAND
+    "${Python3_EXECUTABLE}" "-c" "${EXPR}"
+    OUTPUT_VARIABLE PYTHON_OUT
+    RESULT_VARIABLE PYTHON_ERROR_CODE
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if(NOT PYTHON_ERROR_CODE EQUAL 0)
+    set(${SUCCESS} FALSE PARENT_SCOPE)
+    set(${OUT} "" PARENT_SCOPE)
+  else()
+    set(${SUCCESS} TRUE PARENT_SCOPE)
+    set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
+  endif()
+endfunction()
+
 # Run `EXPR` in python after importing `PKG`. Use the result of this to extend
 # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
 macro (append_cmake_prefix_path PKG EXPR)
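
A hypothetical call site, to illustrate the `OUT`/`SUCCESS` contract of the
new helper (the queried expression is just an example):

    try_run_python(TORCH_CUDA_VERSION TORCH_CUDA_OK
      "import torch; print(torch.version.cuda)")
    if(TORCH_CUDA_OK)
      message(STATUS "Torch was built against CUDA ${TORCH_CUDA_VERSION}")
    else()
      # On failure the helper sets OUT to the empty string.
      message(STATUS "Could not import torch; skipping version check")
    endif()
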
@@ -173,13 +196,13 @@ macro(clear_cuda_arches CUDA_ARCH_FLAGS)
 endmacro()
 
 #
-# Extract unique CUDA architectures from a list of compute capabilities codes in 
-# the form `<major><minor>[<letter>]`, convert them to the form sort 
-# `<major>.<minor>`, dedupes them and then sorts them in ascending order and 
+# Extract unique CUDA architectures from a list of compute capabilities codes in
+# the form `<major><minor>[<letter>]`, convert them to the form sort
+# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
 # stores them in `OUT_ARCHES`.
 #
 # Example:
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" 
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
 #   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
 #   OUT_ARCHES="7.5;...;9.0"
 function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
@@ -200,15 +223,15 @@
 endfunction()
 
 #
-# For a specific file set the `-gencode` flag in compile options conditionally 
-# for the CUDA language. 
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
 #
 # Example:
 #   set_gencode_flag_for_srcs(
 #     SRCS "foo.cu"
 #     ARCH "compute_75"
 #     CODE "sm_75")
-# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for 
+# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
 # `foo.cu` (only for the CUDA language).
 #
 macro(set_gencode_flag_for_srcs)
@@ -228,14 +251,14 @@ macro(set_gencode_flag_for_srcs)
 endmacro(set_gencode_flag_for_srcs)
 
 #
-# For a list of source files set the `-gencode` flags in the files specific 
+# For a list of source files set the `-gencode` flags in the files specific
 # compile options (specifically for the CUDA language).
 #
 # arguments are:
 #   SRCS: list of source files
 #   CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
 #   BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
-#     for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS 
+#     for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
 #     that is larger than BUILD_PTX_FOR_ARCH.
 #
 macro(set_gencode_flags_for_srcs)
@@ -383,12 +406,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
   endforeach()
 
   set(_CUDA_ARCHS ${_FINAL_ARCHS})
+  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
 endfunction()
 
 #
-# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form 
-# `gfx<version>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list. 
+# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
+# `gfx<version>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
 # The loose intersection is defined as:
 #   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
 # where `<=` is the version comparison operator.
@@ -404,28 +429,48 @@ endfunction()
 #
 function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
   list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)
-  
+
   # ROCm architectures are typically in format gfxNNN or gfxNNNx where N is a digit
   # and x is a letter. We can sort them by string comparison which works for this format.
   list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)
-  
+
   set(_ROCM_ARCHS)
-  
+
   # Find the intersection of supported architectures
   foreach(_SRC_ARCH ${SRC_ROCM_ARCHS})
     if(_SRC_ARCH IN_LIST TGT_ROCM_ARCHS)
       list(APPEND _ROCM_ARCHS ${_SRC_ARCH})
     endif()
   endforeach()
-  
+
   list(REMOVE_DUPLICATES _ROCM_ARCHS)
   set(${OUT_ROCM_ARCHS} ${_ROCM_ARCHS} PARENT_SCOPE)
 endfunction()
 
+function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS)
+  set(_CUDA_ARCHS "${CUDA_ARCHS}")
+
+  # Handle the +PTX suffix: replace each suffixed arch with its base arch.
+  foreach(_arch ${CUDA_ARCHS})
+    if(_arch MATCHES "\\+PTX$")
+      string(REPLACE "+PTX" "" _base "${_arch}")
+      list(REMOVE_ITEM _CUDA_ARCHS "${_arch}")
+      list(APPEND _CUDA_ARCHS "${_base}")
+    endif()
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
+endfunction()
+
+
+
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set 
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
 # the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
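
The expected behavior of the new `cuda_remove_ptx_suffixes` helper, as
follows from the implementation above (values illustrative):

    cuda_remove_ptx_suffixes(BASE_ARCHS "9.0;12.0+PTX;12.0;8.9")
    # The +PTX entry collapses into its base arch, the resulting
    # duplicate 12.0 is removed, and the list is sorted naturally:
    # BASE_ARCHS == "8.9;9.0;12.0"
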
diff --git a/build2cmake/src/torch/cuda.rs b/build2cmake/src/torch/cuda.rs
index d9753a79..be288bf3 100644
--- a/build2cmake/src/torch/cuda.rs
+++ b/build2cmake/src/torch/cuda.rs
@@ -17,13 +17,6 @@ static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
 static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
 static REGISTRATION_H: &str = include_str!("../templates/registration.h");
 static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
-static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
-
-fn cuda_supported_archs() -> String {
-    let supported_archs: Vec<String> = serde_json::from_str(CUDA_SUPPORTED_ARCHS_JSON)
-        .expect("Error parsing supported CUDA archs");
-    supported_archs.join(";")
-}
 
 pub fn write_torch_ext_cuda(
     env: &Environment,
@@ -414,7 +407,6 @@ pub fn render_preamble(
             cuda_maxver => cuda_maxver.map(|v| v.to_string()),
             torch_minver => torch_minver.map(|v| v.to_string()),
             torch_maxver => torch_maxver.map(|v| v.to_string()),
-            cuda_supported_archs => cuda_supported_archs(),
             platform => env::consts::OS
         },
         &mut *write,
diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix
index 3d4c58af..d26d7135 100644
--- a/lib/torch-extension/arch.nix
+++ b/lib/torch-extension/arch.nix
@@ -222,6 +222,7 @@ stdenv.mkDerivation (prevAttrs: {
   dontSetupCUDAToolkitCompilers = true;
 
   cmakeFlags = [
+    (lib.cmakeBool "BUILD_ALL_SUPPORTED_ARCHS" true)
     (lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python")
     # Fix: file RPATH_CHANGE could not write new RPATH, we are rewriting
     # rpaths anyway.
diff --git a/pkgs/build2cmake/default.nix b/pkgs/build2cmake/default.nix
index 846a3882..80f5c2a7 100644
--- a/pkgs/build2cmake/default.nix
+++ b/pkgs/build2cmake/default.nix
@@ -21,7 +21,6 @@ rustPlatform.buildRustPackage {
         || file.name == "Cargo.lock"
         || file.name == "pyproject.toml"
         || file.name == "pyproject_universal.toml"
-        || file.name == "cuda_supported_archs.json"
         || file.name == "python_dependencies.json"
         || (builtins.any file.hasExt [
           "cmake"
diff --git a/scripts/windows/builder.ps1 b/scripts/windows/builder.ps1
index 6f917e2e..40669fea 100644
--- a/scripts/windows/builder.ps1
+++ b/scripts/windows/builder.ps1
@@ -350,6 +350,9 @@ function Get-CMakeConfigureArgs {
         $kwargs = @("..", "-G", "Visual Studio 17 2022", "-A", $vsArch)
     }
 
+    # Build for all supported GPU archs, not just the detected arch.
+    $kwargs += "-DBUILD_ALL_SUPPORTED_ARCHS=ON"
+
     # Detect Python from current environment
     $pythonExe = (Get-Command python -ErrorAction SilentlyContinue).Source
     if ($pythonExe) {

From 62dbd2798ef935184ff534085b7beac809bc59d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Wed, 17 Dec 2025 13:09:55 +0000
Subject: [PATCH 2/3] Add comments to clarify some of the capability handling

---
 build2cmake/src/templates/cuda/preamble.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake
index a87471df..ddbbdde3 100644
--- a/build2cmake/src/templates/cuda/preamble.cmake
+++ b/build2cmake/src/templates/cuda/preamble.cmake
@@ -90,7 +90,12 @@ endif()
 
 if(GPU_LANG STREQUAL "CUDA")
+  # This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need
+  # to set our own set of capabilities.
   clear_cuda_arches(CUDA_ARCH_FLAGS)
+
+  # Get the capabilities without +PTX suffixes, so that we can use them as
+  # the target archs in the loose intersection with a kernel's capabilities.
   cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
   message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}")
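
Together with the first patch, the per-kernel selection these comments
describe amounts to the following (a sketch of the generated kernel.cmake
logic with the Jinja conditional spelled out as plain CMake;
EXAMPLE_KERNEL_CAPABILITIES is a hypothetical stand-in for a kernel's
declared capabilities):

    if(DEFINED EXAMPLE_KERNEL_CAPABILITIES)
      # Declared capabilities are loosely intersected with the base archs.
      cuda_archs_loose_intersection(example_ARCHS
        "${EXAMPLE_KERNEL_CAPABILITIES}" "${CUDA_ARCHS}")
    else()
      # Otherwise the kernel inherits the archs computed in the preamble.
      set(example_ARCHS "${CUDA_KERNEL_ARCHS}")
    endif()
    message(STATUS "Capabilities for kernel example: ${example_ARCHS}")
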
From ecefc61a96094039940962a877ae4cfbda8c65db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Wed, 17 Dec 2025 13:29:53 +0000
Subject: [PATCH 3/3] Simplify gencode flags clearing

---
 build2cmake/src/templates/cuda/preamble.cmake |  2 +-
 build2cmake/src/templates/utils.cmake         | 12 +++---------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake
index ddbbdde3..8f09adc4 100644
--- a/build2cmake/src/templates/cuda/preamble.cmake
+++ b/build2cmake/src/templates/cuda/preamble.cmake
@@ -92,7 +92,7 @@ endif()
 if(GPU_LANG STREQUAL "CUDA")
   # This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need
   # to set our own set of capabilities.
-  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  clear_gencode_flags()
 
   # Get the capabilities without +PTX suffixes, so that we can use them as
   # the target archs in the loose intersection with a kernel's capabilities.
diff --git a/build2cmake/src/templates/utils.cmake b/build2cmake/src/templates/utils.cmake
index bdcea367..53cffc11 100644
--- a/build2cmake/src/templates/utils.cmake
+++ b/build2cmake/src/templates/utils.cmake
@@ -175,20 +175,14 @@ macro(string_to_ver OUT_VER IN_STR)
 endmacro()
 
 #
-# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
-# `CUDA_ARCH_FLAGS`.
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`.
 #
 # Example:
 #   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
-#   clear_cuda_arches(CUDA_ARCH_FLAGS)
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   clear_gencode_flags()
 #   CMAKE_CUDA_FLAGS="-Wall"
 #
-macro(clear_cuda_arches CUDA_ARCH_FLAGS)
-  # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-  string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
-    ${CMAKE_CUDA_FLAGS})
-
+macro(clear_gencode_flags)
   # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
   # and passed back via the `CUDA_ARCHITECTURES` property.
   string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS