diff --git a/examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt b/examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt new file mode 100644 index 000000000..d214fcad5 --- /dev/null +++ b/examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt @@ -0,0 +1,7 @@ +if (WITH_CUDA) + compile_cpp_example(native_cuda_kernels_reduction main.cpp) + + add_custom_target(cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu ALL COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/sum_reductiom_dynamic_shm.cu sum_reductiom_dynamic_shm.cu) + add_dependencies(examples_cpp_native_cuda_kernels cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu) + +endif() diff --git a/examples/cpp/32_native_cuda_kernels_reduction/Makefile b/examples/cpp/32_native_cuda_kernels_reduction/Makefile new file mode 100644 index 000000000..af0b6c1cc --- /dev/null +++ b/examples/cpp/32_native_cuda_kernels_reduction/Makefile @@ -0,0 +1,26 @@ +PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +ifndef OCCA_DIR + include $(PROJ_DIR)/../../../scripts/build/Makefile +else + include ${OCCA_DIR}/scripts/build/Makefile +endif + +#---[ COMPILATION ]------------------------------- +headers = $(wildcard $(incPath)/*.hpp) $(wildcard $(incPath)/*.tpp) +sources = $(wildcard $(srcPath)/*.cpp) + +objects = $(subst $(srcPath)/,$(objPath)/,$(sources:.cpp=.o)) + +${PROJ_DIR}/main: $(objects) $(headers) ${PROJ_DIR}/main.cpp + $(compiler) $(compilerFlags) -o ${PROJ_DIR}/main $(flags) $(objects) ${PROJ_DIR}/main.cpp $(paths) $(linkerFlags) + @if which install_name_tool > /dev/null 2>&1; then \ + install_name_tool -add_rpath "${OCCA_DIR}/lib" ${PROJ_DIR}/main; \ + fi + +$(objPath)/%.o:$(srcPath)/%.cpp $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.hpp))) $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.tpp))) + $(compiler) $(compilerFlags) -o $@ $(flags) -c $(paths) $< + +clean: + rm -f $(objPath)/*; + rm -f $(PROJ_DIR)/main +#================================================= 
diff --git a/examples/cpp/32_native_cuda_kernels_reduction/main.cpp b/examples/cpp/32_native_cuda_kernels_reduction/main.cpp new file mode 100644 index 000000000..106a28ea1 --- /dev/null +++ b/examples/cpp/32_native_cuda_kernels_reduction/main.cpp @@ -0,0 +1,84 @@ +#include <iostream> + +#include <occa.hpp> + +//---[ Internal Tools ]----------------- +// Note: These headers are not officially supported +// Please don't rely on it outside of the occa examples +#include <occa/internal/utils/cli.hpp> +#include <occa/internal/utils/testing.hpp> +//====================================== + +occa::json parseArgs(int argc, const char **argv); + +int main(int argc, const char **argv) { + occa::json args = parseArgs(argc, argv); + + int entries = 32 * 32; + + float *h_data = new float[entries]; + float h_result = 0; + float ref_result = 0; + + for (int i = 0; i < entries; ++i) { + h_data[i] = i; + } + + // Setup the platform and device IDs + occa::device device({{"mode", "CUDA"}, + {"device_id", (int)args["options/device-id"]}}); + + // Allocate memory on the device + occa::memory d_data = device.malloc<float>(entries); + occa::memory d_result = device.malloc<float>(1); + + // Compile a regular CUDA kernel at run-time + occa::json kernelProps({{"okl/enabled", false}, {"sharedMemBytes", 32 * 4}}); + + occa::kernel reduce = + device.buildKernel("sum_reductiom_dynamic_shm.cu", "reduce", kernelProps); + + // Copy memory to the device + d_data.copyFrom(h_data); + d_result.copyFrom(&h_result); + + // Set the kernel dimensions + reduce.setRunDims(32, 32); + + // Launch device kernel + reduce(d_data, d_result); + + // Copy result to the host + d_result.copyTo(&h_result); + + // Calculate reference + for (int i = 0; i < entries; ++i) { + ref_result += h_data[i]; + } + + // Assert values + printf("Ref result: %f, GPU result: %f\n", ref_result, h_result); + + // Free host memory + delete[] h_data; + + return 0; +} + +occa::json parseArgs(int argc, const char **argv) { + occa::cli::parser parser; + parser + .withDescription( + "Example of using a regular CUDA kernel instead of an OCCA 
kernel") + .addOption( + occa::cli::option('d', "device-id", "CUDA device ID (default: 0)") + .withArg() + .withDefaultValue(0)) + .addOption( + occa::cli::option('v', "verbose", "Compile kernels in verbose mode")); + + occa::json args = parser.parseArgs(argc, argv); + occa::settings()["kernel/verbose"] = args["options/verbose"]; + + return args; +} diff --git a/examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu b/examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu new file mode 100644 index 000000000..0e798a57f --- /dev/null +++ b/examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu @@ -0,0 +1,24 @@ +#define block_size 32 +extern "C" __global__ __launch_bounds__(block_size) void reduce( + float *g_idata, + float *res) +{ + { + int bid = (0) + blockIdx.x; + extern __shared__ float sdata[]; + { + int tid = (0) + threadIdx.x; + int i = bid * block_size + tid; + sdata[tid] = g_idata[i]; + __syncthreads(); + for (unsigned int s = block_size / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + if (tid == 0) + atomicAdd(res, sdata[0]); + } + } +} \ No newline at end of file diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 6c51ab1e9..ff1f78c86 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(18_nonblocking_streams) add_subdirectory(19_stream_tags) add_subdirectory(20_native_dpcpp_kernel) add_subdirectory(30_device_function) +add_subdirectory(32_native_cuda_kernels_reduction) # Don't force-compile OpenGL examples # add_subdirectory(16_finite_difference) diff --git a/src/occa/internal/modes/cuda/kernel.cpp b/src/occa/internal/modes/cuda/kernel.cpp index b90a7b454..5aed24f29 100644 --- a/src/occa/internal/modes/cuda/kernel.cpp +++ b/src/occa/internal/modes/cuda/kernel.cpp @@ -14,7 +14,9 @@ namespace occa { const occa::json &properties_) : 
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_), cuModule(cuModule_), - cuFunction(NULL) {} + cuFunction(NULL) { + sharedMemBytes = properties_.get("sharedMemBytes", 0); + } kernel::kernel(modeDevice_t *modeDevice_, const std::string &name_, @@ -23,7 +25,9 @@ namespace occa { const occa::json &properties_) : occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_), cuModule(NULL), - cuFunction(cuFunction_) {} + cuFunction(cuFunction_) { + sharedMemBytes = properties_.get("sharedMemBytes", 0); + } kernel::kernel(modeDevice_t *modeDevice_, const std::string &name_, @@ -33,7 +37,9 @@ namespace occa { const occa::json &properties_) : occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_), cuModule(cuModule_), - cuFunction(cuFunction_) {} + cuFunction(cuFunction_) { + sharedMemBytes = properties_.get("sharedMemBytes", 0); + } kernel::~kernel() { if (cuModule) { @@ -92,12 +98,13 @@ namespace occa { devicePtr->setCudaContext(); + if (sharedMemBytes > 0) { OCCA_CUDA_ERROR("Set max dynamic shm", cuFuncSetAttribute(cuFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes)); } OCCA_CUDA_ERROR("Launching Kernel", cuLaunchKernel(cuFunction, outerDims.x, outerDims.y, outerDims.z, innerDims.x, innerDims.y, innerDims.z, - 0, getCuStream(), - &(vArgs[0]), 0)); + sharedMemBytes, getCuStream(), + &(vArgs[0]), NULL)); } } } diff --git a/src/occa/internal/modes/cuda/kernel.hpp b/src/occa/internal/modes/cuda/kernel.hpp index 5424712e1..c51501606 100644 --- a/src/occa/internal/modes/cuda/kernel.hpp +++ b/src/occa/internal/modes/cuda/kernel.hpp @@ -19,6 +19,9 @@ namespace occa { mutable std::vector<void*> vArgs; + // Dynamic shared memory size + int sharedMemBytes = 0; + public: kernel(modeDevice_t *modeDevice_, const std::string &name_, diff --git a/src/occa/internal/modes/cuda/polyfill.hpp b/src/occa/internal/modes/cuda/polyfill.hpp index ac664ff41..2a0c9e991 100644 --- a/src/occa/internal/modes/cuda/polyfill.hpp +++ 
b/src/occa/internal/modes/cuda/polyfill.hpp @@ -21,7 +21,6 @@ namespace occa { typedef struct _CUdeviceptr* CUdeviceptr; typedef struct _CUevent* CUevent; typedef struct _CUfunction* CUfunction; - typedef struct _CUfunction_attribute* CUfunction_attribute; typedef struct _CUmodule* CUmodule; typedef struct _CUstream* CUstream; @@ -35,8 +34,6 @@ namespace occa { static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 0; static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 0; - static const CUfunction_attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = NULL; - enum CUresult { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE, @@ -98,6 +95,20 @@ namespace occa { OCCA_CUDA_IS_NOT_ENABLED }; + enum CUfunction_attribute { + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + CU_FUNC_ATTRIBUTE_MAX + }; + //---[ Methods ]---------------------- inline CUresult cuInit(unsigned int Flags) { return OCCA_CUDA_IS_NOT_ENABLED; @@ -107,6 +118,10 @@ namespace occa { return OCCA_CUDA_IS_NOT_ENABLED; } + inline CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) { + return OCCA_CUDA_IS_NOT_ENABLED; + } + inline CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,