Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build2cmake/src/templates/cuda/kernel.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ if(GPU_LANG STREQUAL "CUDA")
{% if cuda_capabilities %}
cuda_archs_loose_intersection({{kernel_name}}_ARCHS "{{ cuda_capabilities|join(";") }}" "${CUDA_ARCHS}")
{% else %}
cuda_archs_loose_intersection({{kernel_name}}_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
set({{kernel_name}}_ARCHS "${CUDA_KERNEL_ARCHS}")
{% endif %}
message(STATUS "Capabilities for kernel {{kernel_name}}: {{ '${' + kernel_name + '_ARCHS}'}}")
set_gencode_flags_for_srcs(SRCS {{'"${' + kernel_name + '_SRC}"'}} CUDA_ARCHS "{{ '${' + kernel_name + '_ARCHS}'}}")
Expand Down
31 changes: 22 additions & 9 deletions build2cmake/src/templates/cuda/preamble.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

set(CUDA_SUPPORTED_ARCHS "{{ cuda_supported_archs }}")

set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
Expand Down Expand Up @@ -50,6 +48,8 @@ if (NOT TARGET_DEVICE STREQUAL "cuda" AND
return()
endif()

option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off)

if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX")
Expand Down Expand Up @@ -90,13 +90,26 @@ endif()


if(GPU_LANG STREQUAL "CUDA")
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
# This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need
# to set our own set of capabilities.
clear_gencode_flags()

# Get the capabilities without +PTX suffixes, so that we can use them as
# the target archs in the loose intersection with a kernel's capabilities.
cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}")

if(BUILD_ALL_SUPPORTED_ARCHS)
set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
else()
try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")" "Failed to get CUDA capability")
if(NOT SUCCESS)
message(WARNING "Failed to detect CUDA capability, using default capabilities.")
set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}")
endif()
endif()

message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}")

if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA")
list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
Expand Down
89 changes: 64 additions & 25 deletions build2cmake/src/templates/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,29 @@ function (run_python OUT EXPR ERR_MSG)
set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
endfunction()

#
# Run `EXPR` in python without aborting the configure step when python fails.
#
# Arguments:
#   OUT:     name of the caller's variable that receives python's standard
#            output with trailing whitespace stripped. It is set to the empty
#            string when python fails.
#   SUCCESS: name of the caller's variable that is set to TRUE when python
#            exits with status 0 and to FALSE otherwise.
#   EXPR:    the python code passed to `python -c`.
#
# Anything python writes to standard error is discarded. Arguments after
# `EXPR` are accepted and ignored.
#
# Example:
#   try_run_python(ANSWER OK "print(6 * 7)")
#   # OK is TRUE and ANSWER is "42"
#
function (try_run_python OUT SUCCESS EXPR)
  execute_process(
    COMMAND
      "${Python3_EXECUTABLE}" "-c" "${EXPR}"
    OUTPUT_VARIABLE PYTHON_OUT
    RESULT_VARIABLE PYTHON_ERROR_CODE
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE)

  # `PYTHON_ERROR_CODE` holds the exit status, or a descriptive string when
  # the process could not be started; anything other than 0 is a failure.
  if(NOT PYTHON_ERROR_CODE EQUAL 0)
    set(${SUCCESS} FALSE PARENT_SCOPE)
    set(${OUT} "" PARENT_SCOPE)
  else()
    set(${SUCCESS} TRUE PARENT_SCOPE)
    # Quoted on purpose: with an unquoted expansion, empty output would turn
    # this into `set(<var> PARENT_SCOPE)`, which unsets the caller's variable
    # instead of assigning the empty string.
    set(${OUT} "${PYTHON_OUT}" PARENT_SCOPE)
  endif()
endfunction()

# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
macro (append_cmake_prefix_path PKG EXPR)
Expand Down Expand Up @@ -152,34 +175,28 @@ macro(string_to_ver OUT_VER IN_STR)
endmacro()

#
# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
# `CUDA_ARCH_FLAGS`.
# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`.
#
# Example:
# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
# clear_cuda_arches(CUDA_ARCH_FLAGS)
# CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
# clear_gencode_flags()
# CMAKE_CUDA_FLAGS="-Wall"
#
macro(clear_cuda_arches CUDA_ARCH_FLAGS)
# Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
${CMAKE_CUDA_FLAGS})

macro(clear_gencode_flags)
  # Strip every `-gencode arch=...` argument, together with the spaces that
  # follow it, from `CMAKE_CUDA_FLAGS`. This is a macro rather than a function
  # so that the assignment lands in the caller's scope.
  #
  # The input is quoted so that it always reaches `string()` as exactly one
  # argument: unquoted, an empty or undefined `CMAKE_CUDA_FLAGS` would expand
  # to no argument at all (an error, since `string(REGEX REPLACE)` requires an
  # input), and a value containing `;` would be split into several inputs
  # that are concatenated without any separator.
  string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS}")
endmacro()

#
# Extract unique CUDA architectures from a list of compute capabilities codes in
# the form `<major><minor>[<letter>]`, convert them to the form sort
# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
# Extract unique CUDA architectures from a list of compute capabilities codes in
# the form `<major><minor>[<letter>]`, convert them to the form
# `<major>.<minor>`, dedupe them, sort them in ascending order and store them
# in `OUT_ARCHES`.
#
# Example:
# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
# OUT_ARCHES="7.5;...;9.0"
function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
Expand All @@ -200,15 +217,15 @@ function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
endfunction()

#
# For a specific file set the `-gencode` flag in compile options conditionally
# for the CUDA language.
# For a specific file set the `-gencode` flag in compile options conditionally
# for the CUDA language.
#
# Example:
# set_gencode_flag_for_srcs(
# SRCS "foo.cu"
# ARCH "compute_75"
# CODE "sm_75")
# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
# `foo.cu` (only for the CUDA language).
#
macro(set_gencode_flag_for_srcs)
Expand All @@ -228,14 +245,14 @@ macro(set_gencode_flag_for_srcs)
endmacro(set_gencode_flag_for_srcs)

#
# For a list of source files set the `-gencode` flags in the files specific
# For a list of source files set the `-gencode` flags in the files specific
# compile options (specifically for the CUDA language).
#
# arguments are:
# SRCS: list of source files
# CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
# BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
# that is larger than BUILD_PTX_FOR_ARCH.
#
macro(set_gencode_flags_for_srcs)
Expand Down Expand Up @@ -383,12 +400,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endforeach()
set(_CUDA_ARCHS ${_FINAL_ARCHS})

list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()

#
# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
# The loose intersection is defined as:
# { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
# where `<=` is the version comparison operator.
Expand All @@ -404,28 +423,48 @@ endfunction()
#
function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
  # Normalise the requested architectures: drop repeats, then order them by
  # plain string comparison (names are expected to look like gfxNNN or
  # gfxNNNx, for which string ordering is considered good enough).
  list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)
  list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)

  # Keep each requested architecture that appears verbatim in the target
  # list. The requested list is already free of duplicates, so the matches
  # are unique as well and need no further deduplication.
  set(_matched)
  foreach(_candidate ${SRC_ROCM_ARCHS})
    if(NOT _candidate IN_LIST TGT_ROCM_ARCHS)
      continue()
    endif()
    list(APPEND _matched ${_candidate})
  endforeach()

  set(${OUT_ROCM_ARCHS} ${_matched} PARENT_SCOPE)
endfunction()

#
# Strip the `+PTX` marker from a list of CUDA architectures.
#
# Arguments:
#   OUT_CUDA_ARCHS: name of the caller's variable that receives the result.
#   CUDA_ARCHS:     list of architectures, some of which may end in `+PTX`.
#
# The result has no duplicates and is sorted in ascending natural order.
#
# Example:
#   cuda_remove_ptx_suffixes(BASE_ARCHS "8.0;9.0+PTX;7.5;9.0")
#   BASE_ARCHS="7.5;8.0;9.0"
#
function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS)
  set(_base_archs "${CUDA_ARCHS}")

  # Rewrite only the entries that end in `+PTX`; every other entry is left
  # exactly as it was given.
  list(TRANSFORM _base_archs REPLACE "\\+PTX" "" REGEX "\\+PTX$")

  # An entry may have been listed both with and without the suffix.
  list(REMOVE_DUPLICATES _base_archs)
  list(SORT _base_archs COMPARE NATURAL ORDER ASCENDING)

  set(${OUT_CUDA_ARCHS} ${_base_archs} PARENT_SCOPE)
endfunction()



#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# the architectures on a per file basis.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
Expand Down
8 changes: 0 additions & 8 deletions build2cmake/src/torch/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,6 @@ static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
static REGISTRATION_H: &str = include_str!("../templates/registration.h");
static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");

/// Returns the CUDA architectures listed in `CUDA_SUPPORTED_ARCHS_JSON` as a
/// single `;`-separated string.
///
/// # Panics
///
/// Panics if the embedded JSON cannot be parsed as an array of strings.
fn cuda_supported_archs() -> String {
    serde_json::from_str::<Vec<String>>(CUDA_SUPPORTED_ARCHS_JSON)
        .expect("Error parsing supported CUDA archs")
        .join(";")
}

pub fn write_torch_ext_cuda(
env: &Environment,
Expand Down Expand Up @@ -414,7 +407,6 @@ pub fn render_preamble(
cuda_maxver => cuda_maxver.map(|v| v.to_string()),
torch_minver => torch_minver.map(|v| v.to_string()),
torch_maxver => torch_maxver.map(|v| v.to_string()),
cuda_supported_archs => cuda_supported_archs(),
platform => env::consts::OS
},
&mut *write,
Expand Down
1 change: 1 addition & 0 deletions lib/torch-extension/arch.nix
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ stdenv.mkDerivation (prevAttrs: {
dontSetupCUDAToolkitCompilers = true;

cmakeFlags = [
(lib.cmakeBool "BUILD_ALL_SUPPORTED_ARCHS" true)
(lib.cmakeFeature "Python_EXECUTABLE" "${python3.withPackages (ps: [ torch ])}/bin/python")
# Fix: file RPATH_CHANGE could not write new RPATH, we are rewriting
# rpaths anyway.
Expand Down
1 change: 0 additions & 1 deletion pkgs/build2cmake/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ rustPlatform.buildRustPackage {
|| file.name == "Cargo.lock"
|| file.name == "pyproject.toml"
|| file.name == "pyproject_universal.toml"
|| file.name == "cuda_supported_archs.json"
|| file.name == "python_dependencies.json"
|| (builtins.any file.hasExt [
"cmake"
Expand Down
3 changes: 3 additions & 0 deletions scripts/windows/builder.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,9 @@ function Get-CMakeConfigureArgs {
$kwargs = @("..", "-G", "Visual Studio 17 2022", "-A", $vsArch)
}

# Build for all supported GPU archs, not just the detected arch. CMake only
# accepts cache definitions of the form -D<var>=<value>, so the value has to
# be spelled out; a bare -D<var> is rejected as a command-line parse error.
$kwargs += "-DBUILD_ALL_SUPPORTED_ARCHS=ON"

# Detect Python from current environment
$pythonExe = (Get-Command python -ErrorAction SilentlyContinue).Source
if ($pythonExe) {
Expand Down
Loading