diff --git a/build2cmake/src/templates/cuda/kernel.cmake b/build2cmake/src/templates/cuda/kernel.cmake index 733492a5..04d67d2c 100644 --- a/build2cmake/src/templates/cuda/kernel.cmake +++ b/build2cmake/src/templates/cuda/kernel.cmake @@ -18,7 +18,7 @@ if(GPU_LANG STREQUAL "CUDA") {% if cuda_capabilities %} cuda_archs_loose_intersection({{kernel_name}}_ARCHS "{{ cuda_capabilities|join(";") }}" "${CUDA_ARCHS}") {% else %} - cuda_archs_loose_intersection({{kernel_name}}_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}") + set({{kernel_name}}_ARCHS "${CUDA_KERNEL_ARCHS}") {% endif %} message(STATUS "Capabilities for kernel {{kernel_name}}: {{ '${' + kernel_name + '_ARCHS}'}}") set_gencode_flags_for_srcs(SRCS {{'"${' + kernel_name + '_SRC}"'}} CUDA_ARCHS "{{ '${' + kernel_name + '_ARCHS}'}}") diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake index 05024e06..8f09adc4 100644 --- a/build2cmake/src/templates/cuda/preamble.cmake +++ b/build2cmake/src/templates/cuda/preamble.cmake @@ -9,8 +9,6 @@ include(FetchContent) file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") -set(CUDA_SUPPORTED_ARCHS "{{ cuda_supported_archs }}") - set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) @@ -50,6 +48,8 @@ if (NOT TARGET_DEVICE STREQUAL "cuda" AND return() endif() +option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off) + if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX") @@ -90,13 +90,26 @@ endif() if(GPU_LANG STREQUAL "CUDA") - clear_cuda_arches(CUDA_ARCH_FLAGS) - extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}") - message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") 
- # Filter the target architectures by the supported supported archs - # since for some files we will build for all CUDA_ARCHS. - cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") - message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") + # This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need + # to set our own set of capabilities. + clear_gencode_flags() + + # Get the capabilities without +PTX suffixes, so that we can use them as + # the target archs in the loose intersection with a kernel's capabilities. + cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}") + + if(BUILD_ALL_SUPPORTED_ARCHS) + set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + else() + try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")") + if(NOT SUCCESS) + message(WARNING "Failed to detect CUDA capability, using default capabilities.") + set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + endif() + endif() + + message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}") if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA") list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}") diff --git a/build2cmake/src/templates/utils.cmake b/build2cmake/src/templates/utils.cmake index d4d4cef2..53cffc11 100644 --- a/build2cmake/src/templates/utils.cmake +++ b/build2cmake/src/templates/utils.cmake @@ -42,6 +42,29 @@ function (run_python OUT EXPR ERR_MSG) set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) endfunction() +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If python fails, `SUCCESS` is set to FALSE +# and `OUT` to the empty string. If successful, `SUCCESS` is set to TRUE. 
+# +function (try_run_python OUT SUCCESS EXPR) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + set(${SUCCESS} FALSE PARENT_SCOPE) + set(${OUT} "" PARENT_SCOPE) + else() + set(${SUCCESS} TRUE PARENT_SCOPE) + set(${OUT} "${PYTHON_OUT}" PARENT_SCOPE) + endif() +endfunction() + # Run `EXPR` in python after importing `PKG`. Use the result of this to extend # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. macro (append_cmake_prefix_path PKG EXPR) @@ -152,20 +175,14 @@ macro(string_to_ver OUT_VER IN_STR) endmacro() # -# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in -# `CUDA_ARCH_FLAGS`. +# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`. # # Example: # CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" -# clear_cuda_arches(CUDA_ARCH_FLAGS) -# CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75" +# clear_gencode_flags() # CMAKE_CUDA_FLAGS="-Wall" # -macro(clear_cuda_arches CUDA_ARCH_FLAGS) - # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` - string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS - ${CMAKE_CUDA_FLAGS}) - +macro(clear_gencode_flags) # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified # and passed back via the `CUDA_ARCHITECTURES` property. 
string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS @@ -173,13 +190,13 @@ macro(clear_cuda_arches CUDA_ARCH_FLAGS) endmacro() # -# Extract unique CUDA architectures from a list of compute capabilities codes in -# the form `[]`, convert them to the form sort -# `.`, dedupes them and then sorts them in ascending order and +# Extract unique CUDA architectures from a list of compute capabilities codes in +# the form `[]`, convert them to the form sort +# `.`, dedupes them and then sorts them in ascending order and # stores them in `OUT_ARCHES`. # # Example: -# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" +# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" # extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS) # OUT_ARCHES="7.5;...;9.0" function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) @@ -200,15 +217,15 @@ function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) endfunction() # -# For a specific file set the `-gencode` flag in compile options conditionally -# for the CUDA language. +# For a specific file set the `-gencode` flag in compile options conditionally +# for the CUDA language. # # Example: # set_gencode_flag_for_srcs( # SRCS "foo.cu" # ARCH "compute_75" # CODE "sm_75") -# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for +# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for # `foo.cu` (only for the CUDA language). # macro(set_gencode_flag_for_srcs) @@ -228,14 +245,14 @@ macro(set_gencode_flag_for_srcs) endmacro(set_gencode_flag_for_srcs) # -# For a list of source files set the `-gencode` flags in the files specific +# For a list of source files set the `-gencode` flags in the files specific # compile options (specifically for the CUDA language). 
# # arguments are: # SRCS: list of source files # CUDA_ARCHS: list of CUDA architectures in the form `.[letter]` # BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built -# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS +# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS # that is larger than BUILD_PTX_FOR_ARCH. # macro(set_gencode_flags_for_srcs) @@ -383,12 +400,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR endforeach() set(_CUDA_ARCHS ${_FINAL_ARCHS}) + list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) endfunction() # -# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form -# `` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list. +# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form +# `` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list. # The loose intersection is defined as: # { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } # where `<=` is the version comparison operator. @@ -404,28 +423,48 @@ endfunction() # function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS) list(REMOVE_DUPLICATES SRC_ROCM_ARCHS) - + # ROCm architectures are typically in format gfxNNN or gfxNNNx where N is a digit # and x is a letter. We can sort them by string comparison which works for this format. 
list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING) - + set(_ROCM_ARCHS) - + # Find the intersection of supported architectures foreach(_SRC_ARCH ${SRC_ROCM_ARCHS}) if(_SRC_ARCH IN_LIST TGT_ROCM_ARCHS) list(APPEND _ROCM_ARCHS ${_SRC_ARCH}) endif() endforeach() - + list(REMOVE_DUPLICATES _ROCM_ARCHS) set(${OUT_ROCM_ARCHS} ${_ROCM_ARCHS} PARENT_SCOPE) endfunction() +function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS) + set(_CUDA_ARCHS "${CUDA_ARCHS}") + + # Replace every `<arch>+PTX` entry with its base `<arch>`; the result is deduplicated and sorted below. + foreach(_arch ${CUDA_ARCHS}) + if(_arch MATCHES "\\+PTX$") + string(REPLACE "+PTX" "" _base "${_arch}") + list(REMOVE_ITEM _CUDA_ARCHS "${_arch}") + list(APPEND _CUDA_ARCHS "${_base}") + endif() + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHS) + list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + + set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) +endfunction() + + + # # Override the GPU architectures detected by cmake/torch and filter them by # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in -# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set +# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set # the architectures on a per file basis. # # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. 
diff --git a/build2cmake/src/torch/cuda.rs b/build2cmake/src/torch/cuda.rs index d9753a79..be288bf3 100644 --- a/build2cmake/src/torch/cuda.rs +++ b/build2cmake/src/torch/cuda.rs @@ -17,13 +17,6 @@ static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake"); static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake"); static REGISTRATION_H: &str = include_str!("../templates/registration.h"); static HIPIFY: &str = include_str!("../templates/cuda/hipify.py"); -static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json"); - -fn cuda_supported_archs() -> String { - let supported_archs: Vec = serde_json::from_str(CUDA_SUPPORTED_ARCHS_JSON) - .expect("Error parsing supported CUDA archs"); - supported_archs.join(";") -} pub fn write_torch_ext_cuda( env: &Environment, @@ -414,7 +407,6 @@ pub fn render_preamble( cuda_maxver => cuda_maxver.map(|v| v.to_string()), torch_minver => torch_minver.map(|v| v.to_string()), torch_maxver => torch_maxver.map(|v| v.to_string()), - cuda_supported_archs => cuda_supported_archs(), platform => env::consts::OS }, &mut *write, diff --git a/lib/torch-extension/arch.nix b/lib/torch-extension/arch.nix index 3d4c58af..d26d7135 100644 --- a/lib/torch-extension/arch.nix +++ b/lib/torch-extension/arch.nix @@ -222,6 +222,7 @@ stdenv.mkDerivation (prevAttrs: { dontSetupCUDAToolkitCompilers = true; cmakeFlags = [ + (lib.cmakeBool "BUILD_ALL_SUPPORTED_ARCHS" true) (lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python") # Fix: file RPATH_CHANGE could not write new RPATH, we are rewriting # rpaths anyway. 
diff --git a/pkgs/build2cmake/default.nix b/pkgs/build2cmake/default.nix index 846a3882..80f5c2a7 100644 --- a/pkgs/build2cmake/default.nix +++ b/pkgs/build2cmake/default.nix @@ -21,7 +21,6 @@ rustPlatform.buildRustPackage { || file.name == "Cargo.lock" || file.name == "pyproject.toml" || file.name == "pyproject_universal.toml" - || file.name == "cuda_supported_archs.json" || file.name == "python_dependencies.json" || (builtins.any file.hasExt [ "cmake" diff --git a/scripts/windows/builder.ps1 b/scripts/windows/builder.ps1 index 6f917e2e..40669fea 100644 --- a/scripts/windows/builder.ps1 +++ b/scripts/windows/builder.ps1 @@ -350,6 +350,9 @@ function Get-CMakeConfigureArgs { $kwargs = @("..", "-G", "Visual Studio 17 2022", "-A", $vsArch) } + # Build for all supported GPU archs, not just the detected arch (cmake needs the explicit `=ON` value). + $kwargs += "-DBUILD_ALL_SUPPORTED_ARCHS=ON" + # Detect Python from current environment $pythonExe = (Get-Command python -ErrorAction SilentlyContinue).Source if ($pythonExe) {