huggingface · danieldk · Dec 18, 2025 · Dec 17, 2025 · Dec 17, 2025 · Dec 17, 2025
diff --git a/build2cmake/src/templates/cuda/kernel.cmake b/build2cmake/src/templates/cuda/kernel.cmake
@@ -18,7 +18,7 @@ if(GPU_LANG STREQUAL "CUDA")
   {% if cuda_capabilities %}
     cuda_archs_loose_intersection({{kernel_name}}_ARCHS "{{ cuda_capabilities|join(";") }}" "${CUDA_ARCHS}")
   {% else %}
-    cuda_archs_loose_intersection({{kernel_name}}_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
+    set({{kernel_name}}_ARCHS "${CUDA_KERNEL_ARCHS}")
   {% endif %}
   message(STATUS "Capabilities for kernel {{kernel_name}}: {{ '${' + kernel_name + '_ARCHS}'}}")
   set_gencode_flags_for_srcs(SRCS {{'"${' + kernel_name + '_SRC}"'}} CUDA_ARCHS "{{ '${' + kernel_name + '_ARCHS}'}}")

diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake
@@ -9,8 +9,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
-set(CUDA_SUPPORTED_ARCHS "{{ cuda_supported_archs }}")
-
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
@@ -50,6 +48,8 @@ if (NOT TARGET_DEVICE STREQUAL "cuda" AND
     return()
 endif()
 
+option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off)
+
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX")
@@ -90,13 +90,26 @@ endif()
 
 
 if(GPU_LANG STREQUAL "CUDA")
-  clear_cuda_arches(CUDA_ARCH_FLAGS)
-  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
-  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
-  # Filter the target architectures by the supported supported archs
-  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
-  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+  # Clear -gencode arguments from `CMAKE_CUDA_FLAGS` so we can set our own capabilities.
+  clear_gencode_flags()
+
+  # Get the capabilities without +PTX suffixes, so that we can use them as
+  # the target archs in the loose intersection with a kernel's capabilities.
+  cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+  message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}")
+
+  if(BUILD_ALL_SUPPORTED_ARCHS)
+    set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+  else()
+    try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")")
+    # Output that is not a bare `<major>.<minor>` counts as a failed detection.
+    if(NOT SUCCESS OR NOT "${CUDA_KERNEL_ARCHS}" MATCHES "^[0-9]+\\.[0-9]+$")
+      message(WARNING "Failed to detect CUDA capability, using default capabilities.")
+      set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
+    endif()
+  endif()
+
+  message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}")
 
   if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA")
     list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")

diff --git a/build2cmake/src/templates/utils.cmake b/build2cmake/src/templates/utils.cmake
@@ -42,6 +42,29 @@ function (run_python OUT EXPR ERR_MSG)
   set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
 endfunction()
 
+#
+# Run `EXPR` in python without aborting the configure step; stderr is discarded.
+# If python exits with 0, `SUCCESS` is TRUE and `OUT` is its standard output
+# with trailing whitespace stripped.  Otherwise `SUCCESS` is FALSE, `OUT` empty.
+#
+function (try_run_python OUT SUCCESS EXPR)
+  execute_process(
+    COMMAND "${Python3_EXECUTABLE}" "-c" "${EXPR}"
+    OUTPUT_VARIABLE PYTHON_OUT
+    RESULT_VARIABLE PYTHON_ERROR_CODE
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if(PYTHON_ERROR_CODE EQUAL 0)
+    set(${SUCCESS} TRUE PARENT_SCOPE)
+    # Quoted: an empty result must define `OUT` as "", not unset it.
+    set(${OUT} "${PYTHON_OUT}" PARENT_SCOPE)
+  else()
+    set(${SUCCESS} FALSE PARENT_SCOPE)
+    set(${OUT} "" PARENT_SCOPE)
+  endif()
+endfunction()
+
 # Run `EXPR` in python after importing `PKG`. Use the result of this to extend
 # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
 macro (append_cmake_prefix_path PKG EXPR)
@@ -152,34 +175,28 @@ macro(string_to_ver OUT_VER IN_STR)
 endmacro()
 
 #
-# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
-# `CUDA_ARCH_FLAGS`.
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`.
 #
 # Example:
 #   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
-#   clear_cuda_arches(CUDA_ARCH_FLAGS)
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   clear_gencode_flags()
 #   CMAKE_CUDA_FLAGS="-Wall"
 #
-macro(clear_cuda_arches CUDA_ARCH_FLAGS)
-    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-    string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
+macro(clear_gencode_flags)
     # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
     # and passed back via the `CUDA_ARCHITECTURES` property.
-    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-      ${CMAKE_CUDA_FLAGS})
+    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS}")
 endmacro()
 
 #
-# Extract unique CUDA architectures from a list of compute capabilities codes in 
-# the form `<major><minor>[<letter>]`, convert them to the form sort 
-# `<major>.<minor>`, dedupes them and then sorts them in ascending order and 
+# Extract unique CUDA architectures from a list of compute capabilities codes in
+# the form `<major><minor>[<letter>]`, convert them to the form sort
+# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
 # stores them in `OUT_ARCHES`.
 #
 # Example:
-#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" 
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
 #   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
 #   OUT_ARCHES="7.5;...;9.0"
 function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
@@ -200,15 +217,15 @@ function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
 endfunction()
 
 #
-# For a specific file set the `-gencode` flag in compile options conditionally 
-# for the CUDA language. 
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
 #
 # Example:
 #   set_gencode_flag_for_srcs(
 #     SRCS "foo.cu"
 #     ARCH "compute_75"
 #     CODE "sm_75")
-#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for 
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
 #    `foo.cu` (only for the CUDA language).
 #
 macro(set_gencode_flag_for_srcs)
@@ -228,14 +245,14 @@ macro(set_gencode_flag_for_srcs)
 endmacro(set_gencode_flag_for_srcs)
 
 #
-# For a list of source files set the `-gencode` flags in the files specific 
+# For a list of source files set the `-gencode` flags in the files specific
 #  compile options (specifically for the CUDA language).
 #
 # arguments are:
 #  SRCS: list of source files
 #  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
 #  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
-#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS 
+#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
 #    that is larger than BUILD_PTX_FOR_ARCH.
 #
 macro(set_gencode_flags_for_srcs)
@@ -383,12 +400,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
   endforeach()
   set(_CUDA_ARCHS ${_FINAL_ARCHS})
 
+  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
 endfunction()
 
 #
-# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form 
-# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list. 
+# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
+# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
 # The loose intersection is defined as:
 #   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
 #  where `<=` is the version comparison operator.
@@ -404,28 +423,48 @@ endfunction()
 #
 function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
   list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)
-  
+
   # ROCm architectures are typically in format gfxNNN or gfxNNNx where N is a digit
   # and x is a letter. We can sort them by string comparison which works for this format.
   list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)
-  
+
   set(_ROCM_ARCHS)
-  
+
   # Find the intersection of supported architectures
   foreach(_SRC_ARCH ${SRC_ROCM_ARCHS})
     if(_SRC_ARCH IN_LIST TGT_ROCM_ARCHS)
       list(APPEND _ROCM_ARCHS ${_SRC_ARCH})
     endif()
   endforeach()
-  
+
   list(REMOVE_DUPLICATES _ROCM_ARCHS)
   set(${OUT_ROCM_ARCHS} ${_ROCM_ARCHS} PARENT_SCOPE)
 endfunction()
 
+#
+# Strip the `+PTX` suffix from every entry of `CUDA_ARCHS`, then dedupe the
+# result and sort it in ascending natural order into `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   cuda_remove_ptx_suffixes(OUT "7.5;8.0;12.0+PTX;8.0+PTX")
+#   OUT="7.5;8.0;12.0"
+#
+function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS)
+  set(_CUDA_ARCHS "${CUDA_ARCHS}")
+
+  # Only a trailing `+PTX` is removed; entries without it pass through.
+  list(TRANSFORM _CUDA_ARCHS REPLACE "\\+PTX$" "")
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  set(${OUT_CUDA_ARCHS} "${_CUDA_ARCHS}" PARENT_SCOPE)
+endfunction()
+
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set 
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
 # the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.

diff --git a/build2cmake/src/torch/cuda.rs b/build2cmake/src/torch/cuda.rs
@@ -17,13 +17,6 @@ static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
 static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
 static REGISTRATION_H: &str = include_str!("../templates/registration.h");
 static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
-static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
-
-fn cuda_supported_archs() -> String {
-    let supported_archs: Vec<String> = serde_json::from_str(CUDA_SUPPORTED_ARCHS_JSON)
-        .expect("Error parsing supported CUDA archs");
-    supported_archs.join(";")
-}
 
 pub fn write_torch_ext_cuda(
     env: &Environment,
@@ -414,7 +407,6 @@ pub fn render_preamble(
                 cuda_maxver => cuda_maxver.map(|v| v.to_string()),
                 torch_minver => torch_minver.map(|v| v.to_string()),
                 torch_maxver => torch_maxver.map(|v| v.to_string()),
-                cuda_supported_archs => cuda_supported_archs(),
                 platform => env::consts::OS
             },
             &mut *write,

diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix
@@ -222,6 +222,7 @@ stdenv.mkDerivation (prevAttrs: {
   dontSetupCUDAToolkitCompilers = true;
 
   cmakeFlags = [
+    (lib.cmakeBool "BUILD_ALL_SUPPORTED_ARCHS" true)
     (lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python")
     # Fix: file RPATH_CHANGE could not write new RPATH, we are rewriting
     # rpaths anyway.

diff --git a/pkgs/build2cmake/default.nix b/pkgs/build2cmake/default.nix
@@ -21,7 +21,6 @@ rustPlatform.buildRustPackage {
         || file.name == "Cargo.lock"
         || file.name == "pyproject.toml"
         || file.name == "pyproject_universal.toml"
-        || file.name == "cuda_supported_archs.json"
         || file.name == "python_dependencies.json"
         || (builtins.any file.hasExt [
           "cmake"

diff --git a/scripts/windows/builder.ps1 b/scripts/windows/builder.ps1
@@ -350,6 +350,9 @@ function Get-CMakeConfigureArgs {
         $kwargs = @("..", "-G", "Visual Studio 17 2022", "-A", $vsArch)
     }
 
+    # Build for all supported GPU archs, not just the detected arch (-D needs an explicit =value).
+    $kwargs += "-DBUILD_ALL_SUPPORTED_ARCHS=ON"
+
     # Detect Python from current environment
     $pythonExe = (Get-Command python -ErrorAction SilentlyContinue).Source
     if ($pythonExe) {