diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 11569b0..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "lib/libconjugrad"] - path = lib/libconjugrad - url = https://bitbucket.org/soedinglab/libconjugrad.git diff --git a/README.md b/README.md index 577caba..bf47a56 100644 --- a/README.md +++ b/README.md @@ -41,9 +41,9 @@ For the padded version: We recommend compiling CCMpred on the machine that should run the computations so that it can be optimized for the appropriate CPU/GPU architecture. ### Downloading -If you want to compile the most recent version, use the follwing to clone both CCMpred and its submodules: +If you want to compile the most recent version, use the following to clone the latest version of CCMpred: - git clone --recursive https://github.com/soedinglab/CCMpred.git + git clone https://github.com/soedinglab/CCMpred.git ### Compilation With the sourcecode ready, simply run cmake with the default settings and libraries should be auto-detected: diff --git a/lib/libconjugrad b/lib/libconjugrad deleted file mode 160000 index e503ee8..0000000 --- a/lib/libconjugrad +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e503ee8a1e1d392339a1cd3ddd540468fe896cd1 diff --git a/lib/libconjugrad/.gitignore b/lib/libconjugrad/.gitignore new file mode 100644 index 0000000..61eede7 --- /dev/null +++ b/lib/libconjugrad/.gitignore @@ -0,0 +1,55 @@ +### C + +# Object files +*.o +*.ko +*.obj +*.elf + +#Binaries +/bin + +# Libraries +*.lib +*.a + +# Shared objects (inc. 
Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + + +### vim + +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +*.un~ +Session.vim +.netrwhist +*~ + + +### CMake + +CMakeCache.txt +CMakeFiles +Makefile +cmake_install.cmake +install_manifest.txt +_build + + +### Python + +__pycache__/ +*.py[cod] + diff --git a/lib/libconjugrad/.ycm_extra_conf.py b/lib/libconjugrad/.ycm_extra_conf.py new file mode 100644 index 0000000..e9441b4 --- /dev/null +++ b/lib/libconjugrad/.ycm_extra_conf.py @@ -0,0 +1,120 @@ + +import os +import ycm_core + +flags = [ +'-Wall', +'-Wextra', +'-Werror', +'-Wc++98-compat', +'-Wno-long-long', +'-Wno-variadic-macros', +'-fexceptions', +'-DNDEBUG', + +'-std=c11', +'-x', +'c', +'-I', +'./include' +] + + +# Set this to the absolute path to the folder (NOT the file!) containing the +# compile_commands.json file to use that instead of 'flags'. See here for +# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html +# +# Most projects will NOT need to set this to anything; you can just change the +# 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
+compilation_database_folder = '' + +if os.path.exists( compilation_database_folder ): + database = ycm_core.CompilationDatabase( compilation_database_folder ) +else: + database = None + +SOURCE_EXTENSIONS = [ '.cpp', '.cxx', '.cc', '.c', '.m', '.mm' ] + +def DirectoryOfThisScript(): + return os.path.dirname( os.path.abspath( __file__ ) ) + + +def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): + if not working_directory: + return list( flags ) + new_flags = [] + make_next_absolute = False + path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith( '/' ): + new_flag = os.path.join( working_directory, flag ) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith( path_flag ): + path = flag[ len( path_flag ): ] + new_flag = path_flag + os.path.join( working_directory, path ) + break + + if new_flag: + new_flags.append( new_flag ) + return new_flags + + +def IsHeaderFile( filename ): + extension = os.path.splitext( filename )[ 1 ] + return extension in [ '.h', '.hxx', '.hpp', '.hh' ] + + +def GetCompilationInfoForFile( filename ): + # The compilation_commands.json file generated by CMake does not have entries + # for header files. So we do our best by asking the db for flags for a + # corresponding source file, if any. If one exists, the flags for that file + # should be good enough. 
+ if IsHeaderFile( filename ): + basename = os.path.splitext( filename )[ 0 ] + for extension in SOURCE_EXTENSIONS: + replacement_file = basename + extension + if os.path.exists( replacement_file ): + compilation_info = database.GetCompilationInfoForFile( + replacement_file ) + if compilation_info.compiler_flags_: + return compilation_info + return None + return database.GetCompilationInfoForFile( filename ) + + +def FlagsForFile( filename, **kwargs ): + if database: + # Bear in mind that compilation_info.compiler_flags_ does NOT return a + # python list, but a "list-like" StringVec object + compilation_info = GetCompilationInfoForFile( filename ) + if not compilation_info: + return None + + final_flags = MakeRelativePathsInFlagsAbsolute( + compilation_info.compiler_flags_, + compilation_info.compiler_working_dir_ ) + + # NOTE: This is just for YouCompleteMe; it's highly likely that your project + # does NOT need to remove the stdlib flag. DO NOT USE THIS IN YOUR + # ycm_extra_conf IF YOU'RE NOT 100% SURE YOU NEED IT. 
+ try: + final_flags.remove( '-stdlib=libc++' ) + except ValueError: + pass + else: + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) + + return { + 'flags': final_flags, + 'do_cache': True + } diff --git a/lib/libconjugrad/CMakeLists.txt b/lib/libconjugrad/CMakeLists.txt new file mode 100644 index 0000000..dcb47ec --- /dev/null +++ b/lib/libconjugrad/CMakeLists.txt @@ -0,0 +1,64 @@ +cmake_minimum_required(VERSION 3.8) +include(CheckLanguage) + +project(libconjugrad) +set(LIBCONJUGRAD_MAJOR_VERSION 0) +set(LIBCONJUGRAD_MINOR_VERSION 0) +set(LIBCONJUGRAD_PATCH_VERSION 4) +set(LIBCONJUGRAD_VERSION ${LIBCONJUGRAD_MAJOR_VERSION}.${LIBCONJUGRAD_MINOR_VERSION}.${LIBCONJUGRAD_PATCH_VERSION}) + +if(CMAKE_VERSION VERSION_GREATER 2.8.11) + cmake_policy(SET CMP0022 OLD) +endif(CMAKE_VERSION VERSION_GREATER 2.8.11) + +set(CMAKE_C_FLAGS "-std=c99 -O3 -g -lm") +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) + +include_directories("${PROJECT_SOURCE_DIR}/include") + +set(SOURCES src/conjugrad.c src/debug.c) + +if(NOT DEFINED WITH_SIMD) + set(WITH_SIMD ansi) +endif(NOT DEFINED WITH_SIMD) + +if(NOT DEFINED WITH_CUDA) + set(WITH_CUDA ON) +endif(NOT DEFINED WITH_CUDA) + +if(WITH_SIMD STREQUAL ansi) + add_definitions(-DCONJUGRAD_SIMD=0) + list(APPEND SOURCES src/arithmetic_ansi.c) +elseif(WITH_SIMD STREQUAL sse) + add_definitions(-DCONJUGRAD_SIMD=1) + list(APPEND SOURCES src/arithmetic_simd.c) +elseif(WITH_SIMD STREQUAL avx) + add_definitions(-DCONJUGRAD_SIMD=2) + list(APPEND SOURCES src/arithmetic_simd.c) +endif(WITH_SIMD STREQUAL ansi) + +set(BUILD_SHARED_LIBS OFF) + +if(WITH_CUDA) + find_package(CUDA) + check_language(CUDA) + enable_language(CUDA) + + #list(APPEND SOURCES src/conjugrad_cuda.c src/help_functions.cu src/conjugrad_kernels.cu) + list(APPEND SOURCES src/conjugrad_cuda.c src/conjugrad_kernels.cu) + + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +endif(WITH_CUDA) + +add_library(conjugrad 
${SOURCES}) + +install(TARGETS conjugrad DESTINATION lib) + +add_executable(rosenbrock EXCLUDE_FROM_ALL samples/rosenbrock.c) +target_link_libraries(rosenbrock conjugrad) + +add_executable(polynomial EXCLUDE_FROM_ALL samples/polynomial.c) +target_link_libraries(polynomial conjugrad) + +add_executable(sphere EXCLUDE_FROM_ALL samples/sphere.c) +target_link_libraries(sphere conjugrad) diff --git a/lib/libconjugrad/include/arithmetic.h b/lib/libconjugrad/include/arithmetic.h new file mode 100644 index 0000000..cf618ff --- /dev/null +++ b/lib/libconjugrad/include/arithmetic.h @@ -0,0 +1,34 @@ +#ifndef CONJUGRAD_ARITHMETIC_H +#define CONJUGRAD_ARITHMETIC_H + +#include "conjugrad.h" + + +// set all vector elements to 0 +void vec0(conjugrad_float_t *dst, int n); + +// copy src onto dst +void veccpy(conjugrad_float_t *dst, conjugrad_float_t *src, int n); + +// dst += src +//void veciaddv(conjugrad_float_t *dst, conjugrad_float_t *src, int n); + +// dst *= f +void vecimulc(conjugrad_float_t *dst, conjugrad_float_t f, int n); + +// dst -= f * src +void vecifma(conjugrad_float_t *dst, conjugrad_float_t *src, conjugrad_float_t f, int n); + +// dst = f * b - a +void vecsfms(conjugrad_float_t *dst, conjugrad_float_t *a, conjugrad_float_t f, conjugrad_float_t *b, int n); + +// dst = sum_n v[n] * w[n] +conjugrad_float_t vecdot(conjugrad_float_t *v, conjugrad_float_t *w, int n); + +// dst = sum_n v[n]^2 +conjugrad_float_t vecnorm(conjugrad_float_t *v, int n); + + +conjugrad_float_t *vecalloc(int n); + +#endif diff --git a/lib/libconjugrad/include/conjugrad.h b/lib/libconjugrad/include/conjugrad.h new file mode 100644 index 0000000..b287a90 --- /dev/null +++ b/lib/libconjugrad/include/conjugrad.h @@ -0,0 +1,142 @@ +#include + +#ifndef __LIBCONJ_H__ +#define __LIBCONJ_H__ + +#ifndef CONJUGRAD_FLOAT +#define CONJUGRAD_FLOAT 64 +#endif + +#if CONJUGRAD_FLOAT == 32 +typedef float conjugrad_float_t; + +#define F0 0.0f +#define F001 0.01f +#define F02 0.2f +#define F05 0.5f +#define F08 
0.8f +#define F1 1.0f +#define F2 2.0f +#define FInf 0.0f/0.0f +#define fsqrt sqrtf +#define flog logf +#define fexp expf +#define fdlog __logf +#define fdexp __expf + + +#elif CONJUGRAD_FLOAT == 64 +typedef double conjugrad_float_t; + +#define F0 0.0 +#define F001 0.01 +#define F02 0.2 +#define F05 0.5 +#define F08 0.8 +#define F1 1.0 +#define F2 2.0 +#define FInf 0.0/0.0 +#define fsqrt sqrt +#define flog log +#define fexp exp +#define fdlog log +#define fdexp exp + +#else +#error "Only double-precision (CONJGRAD_FLOAT=64) or single-precision (CONJGRAD_FLOAT=32) supported" +#endif + +typedef struct { + int max_iterations; + int max_linesearch; + int k; + conjugrad_float_t epsilon; // Tolerance for convergence criterion + conjugrad_float_t ftol; // Tolerance for line search sufficient decrease criterion + conjugrad_float_t wolfe; // Tolerance for line search curvature criterion + conjugrad_float_t alpha_mul; // Line search backtracking factor in range (0,1) + conjugrad_float_t min_gnorm; // Value for initial gnorm that will be considered already minimized +} conjugrad_parameter_t; + + + +typedef conjugrad_float_t (*conjugrad_evaluate_t)( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +); + +typedef int (*conjugrad_progress_t)( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +); + +int conjugrad_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *d_fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +); + +int conjugrad( + int n, + conjugrad_float_t *x, + conjugrad_float_t *fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +); + +int linesearch( + int n, + conjugrad_float_t *x, + 
conjugrad_float_t *fx, + conjugrad_float_t *g, + conjugrad_float_t *s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +); + +conjugrad_parameter_t *conjugrad_init(); +conjugrad_float_t *conjugrad_malloc(int n); +void conjugrad_free(conjugrad_float_t *x); + + +void conjugrad_debug_numdiff( + int n, + conjugrad_float_t *x0, + int i, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_float_t epsilon, + bool have_extra_i, + int extra_i +); + + +enum { + CONJUGRAD_SUCCESS = 0, + CONJUGRAD_ALREADY_MINIMIZED, + + CONJUGRADERR_UNKNOWN = -1024, + CONJUGRADERR_MAXIMUMLINESEARCH, + CONJUGRADERR_MAXIMUMITERATION + +}; + + +#endif diff --git a/lib/libconjugrad/include/conjugrad_kernels.h b/lib/libconjugrad/include/conjugrad_kernels.h new file mode 100644 index 0000000..ec68aab --- /dev/null +++ b/lib/libconjugrad/include/conjugrad_kernels.h @@ -0,0 +1,50 @@ +#ifndef CONJUGRAD_KERNELS_H +#define CONJUGRAD_KERNELS_H + +#include "conjugrad.h" + +#define CHECK_ERR(a) if (cudaSuccess != (a)) { printf("CUDA error No. 
%d in %s at line %d\n", a, __FILE__, __LINE__); exit(EXIT_FAILURE); } + +#ifdef __cplusplus +extern "C" { +#endif + +void vecnorm_gpu( + conjugrad_float_t *d_g, + conjugrad_float_t *d_res, + int nvar +); + +void vecdot_gpu( + conjugrad_float_t *d_x, + conjugrad_float_t *d_y, + conjugrad_float_t *d_res, + int nvar +); + +void initialize_s_gpu( + conjugrad_float_t *d_s, + conjugrad_float_t *d_g, + int nvar +); + +void update_s_gpu( + conjugrad_float_t *d_old_s, + conjugrad_float_t *d_g, + conjugrad_float_t beta, + int nvar +); + +void update_x_gpu( + conjugrad_float_t *d_x, + conjugrad_float_t *d_s, + conjugrad_float_t alpha, + conjugrad_float_t prevalpha, + int nvar +); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/libconjugrad/include/help_functions.h b/lib/libconjugrad/include/help_functions.h new file mode 100644 index 0000000..f9aad7b --- /dev/null +++ b/lib/libconjugrad/include/help_functions.h @@ -0,0 +1,30 @@ +#include "conjugrad.h" + +#ifndef HELP_FUNCTIONS_H +#define HELP_FUNCTIONS_H + +#ifdef __cplusplus +extern "C" { +#endif + +__device__ void sum_reduction_function( + volatile conjugrad_float_t *s_data, + int tid +); + +__device__ void warp_reduction( + volatile conjugrad_float_t *s_data, + int tid +); + +__global__ void sum_reduction( + conjugrad_float_t *d_in, + conjugrad_float_t *d_out, + int nvar +); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/libconjugrad/samples/polynomial.c b/lib/libconjugrad/samples/polynomial.c new file mode 100644 index 0000000..a1ac6bd --- /dev/null +++ b/lib/libconjugrad/samples/polynomial.c @@ -0,0 +1,74 @@ +#include +#include +#include "conjugrad.h" + + +conjugrad_float_t evaluate( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +) { + (void)instance; + (void)n; + + conjugrad_float_t fx = 0.0; + + + conjugrad_float_t a = x[0]; + + //printf("eval step = %g, a = %g\n", alpha, a); + + conjugrad_float_t *g_a = &g[0]; + + *g_a = 0.; + + fx = (a-4)*(a-4); + + 
*g_a = 2 * a - 8; + + printf("fx = %g, da = %g\n", fx, g[0]); + + return fx; + + +} + +int progress( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +) { + (void)instance; + (void)x; + (void)g; + (void)n; + + printf("%d\t%d\t%g\t%g\t%g\t%g\n", k, ls, fx, xnorm, gnorm, step); + return true; +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + conjugrad_parameter_t *param = conjugrad_init(); + + + int n = 1; + conjugrad_float_t *x = conjugrad_malloc(n); + x[0] = 0; + conjugrad_float_t fx; + + int ret = conjugrad(n, x, &fx, evaluate, progress, NULL, param); + + printf("Return code %d\n", ret); + +} diff --git a/lib/libconjugrad/samples/rosenbrock.c b/lib/libconjugrad/samples/rosenbrock.c new file mode 100644 index 0000000..07d56a1 --- /dev/null +++ b/lib/libconjugrad/samples/rosenbrock.c @@ -0,0 +1,77 @@ +#include +#include +#include "conjugrad.h" + + +conjugrad_float_t evaluate( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +) { + (void)instance; + (void)n; + + conjugrad_float_t fx = 0.0; + + + conjugrad_float_t a = x[0]; + conjugrad_float_t b = x[1]; + + //printf("eval step = %g, a = %g, b = %g\n", alpha, a, b); + + conjugrad_float_t *g_a = &g[0]; + conjugrad_float_t *g_b = &g[1]; + + *g_a = *g_b = 0.; + + fx = (1-a) * (1-a) + 100 * (b - a*a) * (b - a*a); + + *g_a = -2 + 2*a - 400*a*b + 400*a*a*a; + *g_b = 200 * b - 200 * a * a; + + //printf("fx = %g, da = %g, db = %g\n", fx, g[0], g[1]); + + return fx; + + +} + +int progress( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +) { + (void)instance; + (void)x; + (void)g; + (void)n; + + 
printf("%d\t%d\t%g\t%g\t%g\t%g\n", k, ls, fx, xnorm, gnorm, step); + return true; +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + conjugrad_parameter_t *param = conjugrad_init(); + + int n = 2; + conjugrad_float_t *x = conjugrad_malloc(n); + x[0] = 0; + x[1] = 0; + conjugrad_float_t fx; + + int ret = conjugrad(n, x, &fx, evaluate, progress, NULL, param); + + printf("Return code %d\n", ret); + +} diff --git a/lib/libconjugrad/samples/sphere.c b/lib/libconjugrad/samples/sphere.c new file mode 100644 index 0000000..8b9fdf1 --- /dev/null +++ b/lib/libconjugrad/samples/sphere.c @@ -0,0 +1,78 @@ +#include +#include +#include "conjugrad.h" + + +conjugrad_float_t evaluate( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +) { + (void)instance; + (void)n; + + conjugrad_float_t fx = 0.0; + + + conjugrad_float_t a = x[0]; + conjugrad_float_t b = x[1]; + + // printf("eval step = %g, a = %g, b = %g\n", alpha, a, b); + + conjugrad_float_t *g_a = &g[0]; + conjugrad_float_t *g_b = &g[1]; + + *g_a = *g_b = 0.; + + fx = (a - 2) * (a - 2) + (b - 3) * (b - 3); + + *g_a = 2 * a - 4; + *g_b = 2 * b - 6; + + printf("fx = %g, da = %g, db = %g\n", fx, g[0], g[1]); + + return fx; + + +} + +int progress( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +) { + (void)instance; + (void)x; + (void)g; + (void)n; + + printf("%d\t%d\t%g\t%g\t%g\t%g\n", k, ls, fx, xnorm, gnorm, step); + return true; +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + conjugrad_parameter_t *param = conjugrad_init(); + + + int n = 2; + conjugrad_float_t *x = conjugrad_malloc(n); + x[0] = 0; + x[1] = 1; + conjugrad_float_t fx; + + int ret = conjugrad(n, x, &fx, evaluate, progress, NULL, param); + + printf("Return code %d\n", ret); + +} diff --git 
a/lib/libconjugrad/src/arithmetic_ansi.c b/lib/libconjugrad/src/arithmetic_ansi.c new file mode 100644 index 0000000..8efb73b --- /dev/null +++ b/lib/libconjugrad/src/arithmetic_ansi.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include "arithmetic.h" +#include "conjugrad.h" + +void vec0(conjugrad_float_t *dst, int n) { + memset(dst, 0, sizeof(conjugrad_float_t) * n); +} + +void veccpy(conjugrad_float_t *dst, conjugrad_float_t *src, int n) { + memcpy(dst, src, sizeof(conjugrad_float_t) * n); +} + +void vecimulc(conjugrad_float_t *dst, conjugrad_float_t f, int n) { + for(int i = 0; i < n; i++) { + dst[i] *= f; + } +} + +void vecifma(conjugrad_float_t *dst, conjugrad_float_t *src, conjugrad_float_t f, int n) { + for(int i = 0; i < n; i++) { + dst[i] += f * src[i]; + } +} + +void vecsfms(conjugrad_float_t *dst, conjugrad_float_t *a, conjugrad_float_t f, conjugrad_float_t *b, int n) { + for(int i = 0; i < n; i++) { + dst[i] = f * b[i] - a[i]; + } +} + +conjugrad_float_t vecnorm(conjugrad_float_t *v, int n) { + conjugrad_float_t sum = 0.; + for(int i = 0; i < n; i++) { + sum += v[i] * v[i]; + } + return sum; +} + +conjugrad_float_t vecdot(conjugrad_float_t *v, conjugrad_float_t *w, int n) { + conjugrad_float_t sum = 0.; + for(int i = 0; i < n; i++) { + sum += v[i] * w[i]; + } + return sum; +} + +conjugrad_float_t *vecalloc(int n) { + return (conjugrad_float_t *) malloc(sizeof(conjugrad_float_t) * n); +} diff --git a/lib/libconjugrad/src/arithmetic_simd.c b/lib/libconjugrad/src/arithmetic_simd.c new file mode 100644 index 0000000..60d8949 --- /dev/null +++ b/lib/libconjugrad/src/arithmetic_simd.c @@ -0,0 +1,211 @@ +#include +#include +#include +#include "arithmetic.h" +#include "conjugrad.h" + +#include + +#ifndef CONJUGRAD_SIMD +#define CONJUGRAD_SIMD 1 +#endif + +#if CONJUGRAD_FLOAT == 32 +typedef float fval; + +#if CONJUGRAD_SIMD == 1 // SSE +#define STRIDE 4 +typedef __m128 simd; +#define simdset1 _mm_set1_ps +#define simdload _mm_load_ps +#define simdmul 
_mm_mul_ps +#define simdadd _mm_add_ps +#define simdsub _mm_sub_ps +#define simdstore _mm_store_ps + +#elif CONJUGRAD_SIMD == 2 // AVX +#define STRIDE 8 +typedef __m256 simd; +#define simdset1 _mm256_set1_ps +#define simdload _mm256_load_ps +#define simdmul _mm256_mul_ps +#define simdadd _mm256_add_ps +#define simdsub _mm256_sub_ps +#define simdstore _mm256_store_ps +#endif // CONJUGRAD_SIMD + +#else // CONJUGRAD_FLOAT +typedef double fval; + +#if CONJUGRAD_SIMD == 1 // SSE +#define STRIDE 2 +typedef __m128d simd; +#define simdset1 _mm_set1_pd +#define simdload _mm_load_pd +#define simdmul _mm_mul_pd +#define simdadd _mm_add_pd +#define simdsub _mm_sub_pd +#define simdstore _mm_store_pd + +#elif CONJUGRAD_SIMD == 2 // AVX +#define STRIDE 4 +typedef __m256d simd; +#define simdset1 _mm256_set1_pd +#define simdload _mm256_load_pd +#define simdmul _mm256_mul_pd +#define simdadd _mm256_add_pd +#define simdsub _mm256_sub_pd +#define simdstore _mm256_store_pd +#endif // CONJUGRAD_SIMD +#endif // CONJUGRAD_FLOAT + +void vec0(conjugrad_float_t *dst, int n) { + memset(dst, 0, sizeof(conjugrad_float_t) * n); +} + +void veccpy(conjugrad_float_t *dst, conjugrad_float_t *src, int n) { + memcpy(dst, src, sizeof(conjugrad_float_t) * n); +} + +void vecimulc(conjugrad_float_t *dst, conjugrad_float_t f, int n) { + simd _dst; + simd _f = simdset1(f); + + fval *pdst = (fval *)dst; + + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i+= STRIDE) { + _dst = simdload(pdst); + _dst = simdmul(_dst, _f); + simdstore(pdst, _dst); + + pdst += STRIDE; + } + + for(int i = m; i < n; i++) { + dst[i] *= f; + } +} + +void vecifma(conjugrad_float_t *dst, conjugrad_float_t *src, conjugrad_float_t f, int n) { + simd _dst; + simd _src; + + simd _f = simdset1(f); + + fval *pdst = (fval *)dst; + fval *psrc = (fval *)src; + + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _dst = simdload(pdst); + _src = simdload(psrc); + _src = simdmul(_src, _f); + _dst = simdadd(_dst, _src); 
+ simdstore(pdst, _dst); + + pdst += STRIDE; + psrc += STRIDE; + } + + for(int i = m; i < n; i++) { + dst[i] += f * src[i]; + } +} + +void vecsfms(conjugrad_float_t *dst, conjugrad_float_t *a, conjugrad_float_t f, conjugrad_float_t *b, int n) { + simd _dst; + simd _a; + simd _b; + + simd _f = simdset1(f); + + fval *pdst = (fval *)dst; + fval *pa = (fval *)a; + fval *pb = (fval *)b; + + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _a = simdload(pa); + _b = simdload(pb); + _dst = simdmul(_b, _f); + _dst = simdsub(_dst, _a); + simdstore(pdst, _dst); + + pdst += STRIDE; + pa += STRIDE; + pb += STRIDE; + } + + for(int i = m; i < n; i++) { + dst[i] = a[i] + f * b[i]; + } + +} + +conjugrad_float_t vecnorm(conjugrad_float_t *v, int n) { + + simd _sum = simdset1(.0f); + simd _v; + + + fval *pv = (fval *)v; + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _v = simdload(pv); + _v = simdmul(_v, _v); + _sum = simdadd(_sum, _v); + pv += STRIDE; + } + + fval psum[STRIDE]; + simdstore(psum, _sum); + + fval sum = .0f; + for(int i = 0; i < STRIDE; i++) { + sum += psum[i]; + } + + for(int i = m; i < n; i++) { + sum += v[i] * v[i]; + } + + return sum; +} + +conjugrad_float_t vecdot(conjugrad_float_t *v, conjugrad_float_t *w, int n) { + simd _sum = simdset1(.0f); + simd _v; + simd _w; + + fval *pv = (fval *)v; + fval *pw = (fval *)w; + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _v = simdload(pv); + _w = simdload(pw); + _v = simdmul(_v, _w); + _sum = simdadd(_sum, _v); + pv += STRIDE; + pw += STRIDE; + } + + fval psum[STRIDE]; + simdstore(psum, _sum); + + fval sum = .0f; + for(int i = 0; i < STRIDE; i++) { + sum += psum[i]; + } + for(int i = m; i < n; i++) { + sum += v[i] * w[i]; + } + + return sum; +} + +conjugrad_float_t *vecalloc(int n) { + conjugrad_float_t *out; + posix_memalign((void **)&out, sizeof(conjugrad_float_t) * STRIDE, sizeof(conjugrad_float_t) * n); + return out; +} diff --git 
a/lib/libconjugrad/src/conjugrad.c b/lib/libconjugrad/src/conjugrad.c new file mode 100644 index 0000000..a258691 --- /dev/null +++ b/lib/libconjugrad/src/conjugrad.c @@ -0,0 +1,189 @@ +#include "conjugrad.h" +#include "arithmetic.h" + +#include +#include +#include +#include + + +void swap(conjugrad_float_t **a, conjugrad_float_t **b) { + conjugrad_float_t *c = *a; + *a = *b; + *b = c; +} + +int conjugrad( + int n, + conjugrad_float_t *x, + conjugrad_float_t *fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +) { + + conjugrad_float_t alpha, alphaprev, dg, dgprev, beta, gprevnorm, gnorm, xnorm; + conjugrad_float_t *g = conjugrad_malloc(n); + conjugrad_float_t *s = conjugrad_malloc(n); + int n_linesearch, n_iter = 0; + int ret = CONJUGRADERR_UNKNOWN; + conjugrad_float_t *k_last_fx = (conjugrad_float_t *)malloc(sizeof(conjugrad_float_t) * param->k); + conjugrad_float_t *check_fx = k_last_fx; + + vec0(s, n); + + *fx = proc_evaluate(instance, x, g, n); + + gnorm = vecnorm(g, n); + xnorm = vecnorm(x, n); + + if(gnorm <= param->min_gnorm || gnorm / xnorm <= param->epsilon) { + ret = CONJUGRAD_ALREADY_MINIMIZED; + goto conjugrad_exit; + } + + alpha = F1/fsqrt(gnorm); + + while(true) { + if(n_iter >= param->max_iterations) { + ret = CONJUGRADERR_MAXIMUMITERATION; + break; + }; + + // \delta x_n = - g_n + //vecimulc(g, -1, n); + + if(n_iter > 0) { + // fletcher-reeves: beta_n = ||x_n|| / ||x_{n-1}|| + beta = gnorm / gprevnorm; + + // s_n = \beta_n * s_{n-1} - g_n + vecsfms(s, g, beta, s, n); + dg = vecdot(s, g, n); + alpha = alphaprev * dgprev / dg; + + } else { + // s_0 = -g_0 + veccpy(s, g, n); + vecimulc(s, -1, n); + dg = vecdot(s, g, n); + } + + n_linesearch = linesearch(n, x, fx, g, s, &alpha, proc_evaluate, instance, param); + + gprevnorm = gnorm; + gnorm = vecnorm(g, n); + xnorm = vecnorm(x, n); + alphaprev = alpha; + dgprev = dg; + + if(n_linesearch < 0) { + ret = n_linesearch; + break; + 
} + + int pos = n_iter % param->k; + check_fx = k_last_fx + pos; + + if (n_iter >= param->k) { + conjugrad_float_t rel_change = (*check_fx - *fx) / *check_fx; + if (rel_change < param->epsilon) { + ret = CONJUGRAD_SUCCESS; + break; + } + } + + *check_fx = *fx; + + n_iter++; + proc_progress(instance, x, g, *fx, xnorm, gnorm, alpha, n, n_iter, n_linesearch); + + // convergence check + //if(xnorm < 1.0) { xnorm = 1.0; } + //if(gnorm / xnorm <= param->epsilon) { + // ret = CONJUGRAD_SUCCESS; + // break; + //} + } + + conjugrad_exit: + + conjugrad_free(g); + conjugrad_free(s); + free(k_last_fx); + return ret; +} + + +conjugrad_parameter_t *conjugrad_init() { + conjugrad_parameter_t *out = (conjugrad_parameter_t *)malloc(sizeof(conjugrad_parameter_t)); + out->max_linesearch = 100; + out->max_iterations = 1000; + out->epsilon = 1e-5; + out->ftol = 1e-4; + out->wolfe = 0.1; + out->alpha_mul = 0.5; + out->min_gnorm = 1e-8; + + return out; +} + +int linesearch( + int n, + conjugrad_float_t *x, + conjugrad_float_t *fx, + conjugrad_float_t *g, + conjugrad_float_t *s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +) { + + conjugrad_float_t fx_step; + int n_linesearch = 0; + + conjugrad_float_t dginit = vecdot(g, s, n); + conjugrad_float_t dgtest = dginit * param->ftol; + conjugrad_float_t dg; + conjugrad_float_t finit = *fx; + conjugrad_float_t prevalpha = 0; + + while(true) { + if(n_linesearch >= param->max_linesearch) { return CONJUGRADERR_MAXIMUMLINESEARCH; } + n_linesearch++; + + // do step + vecifma(x, s, *alpha - prevalpha, n); + + // x_{n+1} is available implicitly by x_{n+1} = x_n + alpha * s + fx_step = proc_evaluate(instance, x, g, n); + + // armijo condition + if(fx_step <= finit + *alpha * dgtest) { + + // wolfe condition (curvature) + dg = vecdot(s, g, n); + if(dg < param->wolfe * dginit) { + *fx = fx_step; + return n_linesearch; + } + + + } + + prevalpha = *alpha; + *alpha *= param->alpha_mul; + 
} +} + + + +conjugrad_float_t *conjugrad_malloc(int n) { + return vecalloc(n); +} + +void conjugrad_free(conjugrad_float_t *x) { + free(x); +} diff --git a/lib/libconjugrad/src/conjugrad_cuda.c b/lib/libconjugrad/src/conjugrad_cuda.c new file mode 100644 index 0000000..99b091d --- /dev/null +++ b/lib/libconjugrad/src/conjugrad_cuda.c @@ -0,0 +1,194 @@ +#include "conjugrad.h" +#include "conjugrad_kernels.h" +#include +#include +#include +#include +#include + + +int init_cg_mem_cuda(int nvar); + +int destroy_cg_mem_cuda(); + +int linesearch_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *fx, + conjugrad_float_t *d_g, + conjugrad_float_t *d_s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +); + +// global GPU pointer +conjugrad_float_t *d_g; +conjugrad_float_t *d_s; + + +int conjugrad_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +) { + conjugrad_float_t alpha, alphaprev, dg, dgprev, beta, gnorm, gprevnorm, xnorm; + // allocate memory on device + init_cg_mem_cuda(n); + int n_linesearch, n_iter = 0; + int ret = CONJUGRADERR_UNKNOWN; + conjugrad_float_t *k_last_fx = (conjugrad_float_t *)malloc(sizeof(conjugrad_float_t) * param->k); + conjugrad_float_t *check_fx = k_last_fx; + + + CHECK_ERR(cudaMemset(d_s, 0, sizeof(conjugrad_float_t) * n)); + + //call evaluate + *fx = proc_evaluate(instance, d_x, d_g, n); + + // get xnorm and gnorm + vecnorm_gpu(d_g, &gnorm, n); + vecnorm_gpu(d_x, &xnorm, n); + + if (gnorm / xnorm <= param->epsilon) { + ret = CONJUGRAD_ALREADY_MINIMIZED; + goto conjugrad_exit; + } + + alpha = F1 / fsqrt(gnorm); // F1 is 1.0 + + while (true) { + if (n_iter >= param->max_iterations) { + ret = CONJUGRADERR_MAXIMUMITERATION; + break; + } + + + + if (n_iter > 0) { + // fletcher-reeves: beta_n = ||x_n|| / ||x_{n-1}|| + // = ||g_n|| / 
||g_{n-1}|| + beta = gnorm / gprevnorm; + + // s_n = \delta x_n + \beta_n * s_{n-1} + // = \beta_n * s_{n-1} - g_n + update_s_gpu(d_s, d_g, beta, n); + vecdot_gpu(d_s, d_g, &dg, n); + alpha = alphaprev * dgprev / dg; + + } else { + // s_0 = \delta x_0 + initialize_s_gpu(d_s, d_g, n); + vecdot_gpu(d_s, d_g, &dg, n); + } + + // linesearch + n_linesearch = linesearch_gpu(n, d_x, fx, d_g, d_s, &alpha, proc_evaluate, instance, param); + + gprevnorm = gnorm; + vecnorm_gpu(d_g, &gnorm, n); + vecnorm_gpu(d_x, &xnorm, n); + alphaprev = alpha; + dgprev = dg; + + if(n_linesearch < 0) { + ret = n_linesearch; + break; + } + + int pos = n_iter % param->k; + check_fx = k_last_fx + pos; + + if (n_iter >= param->k) { + conjugrad_float_t rel_change = (*check_fx - *fx) / *fx; + if (rel_change < param->epsilon) { + ret = CONJUGRAD_SUCCESS; + break; + } + } + + *check_fx = *fx; + + n_iter++; + proc_progress(instance, d_x, d_g, *fx, xnorm, gnorm, alpha, n, n_iter, n_linesearch); + + // convergence check + //if (xnorm < F1) { xnorm = F1; } + //if (gnorm / xnorm <= 1e-3f) { + // ret = CONJUGRAD_SUCCESS; + // break; + //} + } + + conjugrad_exit: + + destroy_cg_mem_cuda(); + free(k_last_fx); + return ret; +} + + +int linesearch_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *fx, + conjugrad_float_t *d_g, + conjugrad_float_t *d_s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +) { + conjugrad_float_t fx_step; + conjugrad_float_t prevalpha; + prevalpha = F0; // 0.0 + + int n_linesearch = 0; + + conjugrad_float_t dginit; + vecdot_gpu(d_s, d_g, &dginit, n); + conjugrad_float_t dgtest = dginit * param->ftol; + conjugrad_float_t dg; + conjugrad_float_t finit = *fx; + + while (true) { + if (n_linesearch >= param->max_linesearch) { return CONJUGRADERR_MAXIMUMLINESEARCH; } + n_linesearch++; + + // do step + update_x_gpu(d_x, d_s, *alpha, prevalpha, n); + + // evaluate + fx_step = proc_evaluate(instance, d_x, d_g, n); + 
+		// armijo condition: sufficient decrease relative to finit
+		if (fx_step <= finit + *alpha * dgtest) {
+			vecdot_gpu(d_s, d_g, &dg, n);
+			// curvature (Wolfe) condition on the new directional derivative
+			if (dg < param->wolfe * dginit) {
+				*fx = fx_step;
+				return n_linesearch;
+			}
+		}
+		// reject the trial step; scale alpha and try again
+		prevalpha = *alpha;
+		*alpha *= param->alpha_mul;
+	}
+}
+
+
+// Allocate the file-scope device work buffers d_g (gradient) and d_s
+// (search direction), nvar elements each. Returns EXIT_SUCCESS.
+int init_cg_mem_cuda(int nvar) {
+	CHECK_ERR(cudaMalloc((void **) &d_g, sizeof(conjugrad_float_t) * nvar));
+	CHECK_ERR(cudaMalloc((void **) &d_s, sizeof(conjugrad_float_t) * nvar));
+	return EXIT_SUCCESS;
+}
+
+
+// Release the device buffers allocated by init_cg_mem_cuda.
+int destroy_cg_mem_cuda() {
+	CHECK_ERR(cudaFree(d_g));
+	CHECK_ERR(cudaFree(d_s));
+	return EXIT_SUCCESS;
+}
diff --git a/lib/libconjugrad/src/conjugrad_kernels.cu b/lib/libconjugrad/src/conjugrad_kernels.cu
new file mode 100644
index 0000000..f71c3b9
--- /dev/null
+++ b/lib/libconjugrad/src/conjugrad_kernels.cu
@@ -0,0 +1,250 @@
+// FIX: guard macro was misspelled "__cpluplus"; the extern "C" block never
+// activated, leaving the C API with C++-mangled names under nvcc.
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "conjugrad_kernels.h"
+// NOTE(review): the two system header names were lost in this patch;
+// <stdio.h>/<stdlib.h> match the usage below (CHECK_ERR, EXIT_SUCCESS) —
+// confirm against the original source.
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __cplusplus
+}
+#endif
+
+// forward declaration of device functions
+__device__ void sum_reduction_function(
+	volatile conjugrad_float_t *s_data,
+	int tid
+);
+
+__device__ void warp_reduction(
+	volatile conjugrad_float_t *s_data,
+	int tid
+);
+
+// Single-block kernel: sums nvar elements of d_in into d_out[0] using
+// shared memory (size = blockDim.x elements, passed at launch).
+__global__ void sum_reduction(
+	conjugrad_float_t *d_in,
+	conjugrad_float_t *d_out,
+	int nvar
+) {
+	int i = threadIdx.x;
+
+	extern __shared__ conjugrad_float_t s_data[];
+
+	s_data[threadIdx.x] = F0; // 0.0
+
+	// each thread accumulates a strided slice of the input
+	while (i < nvar) {
+		s_data[threadIdx.x] += d_in[i];
+		i += blockDim.x;
+	}
+	__syncthreads();
+
+	sum_reduction_function(s_data, threadIdx.x);
+
+	if (0 == threadIdx.x) {
+		d_out[0] = s_data[0];
+	}
+}
+
+// Tree reduction over shared memory; tails into warp_reduction for the
+// final 64 elements.
+__device__ void sum_reduction_function(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	for (unsigned int s = blockDim.x >> 1; s > 32; s >>= 1) {
+		if (threadIdx.x < s) {
+			s_data[threadIdx.x] += s_data[threadIdx.x + s];
+		}
+		__syncthreads();
+	}
+	if (threadIdx.x < 32) {
+		warp_reduction(s_data, threadIdx.x);
+	}
+}
+__device__ void warp_reduction(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	s_data[tid] +=
s_data[tid + 32]; + s_data[tid] += s_data[tid + 16]; + s_data[tid] += s_data[tid + 8]; + s_data[tid] += s_data[tid + 4]; + s_data[tid] += s_data[tid + 2]; + s_data[tid] += s_data[tid + 1]; +} + +__global__ void vecdot_inter( + conjugrad_float_t *d_x, + conjugrad_float_t *d_y, + conjugrad_float_t *d_vecdot_inter, + int nvar +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + extern __shared__ conjugrad_float_t s_vecdot_inter[]; + + s_vecdot_inter[threadIdx.x] = F0; // 0.0 + + while (i < nvar) { + s_vecdot_inter[threadIdx.x] = d_x[i] * d_y[i] + s_vecdot_inter[threadIdx.x]; + i += offset; + } + __syncthreads(); + + sum_reduction_function(s_vecdot_inter, threadIdx.x); + + if (0 == threadIdx.x) { + d_vecdot_inter[blockIdx.x] = s_vecdot_inter[0]; + } +} + +__global__ void update_s( + conjugrad_float_t *d_old_s, + conjugrad_float_t *d_g, + conjugrad_float_t beta, + int nvar +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + while (i < nvar) { + d_old_s[i] = beta * d_old_s[i] - d_g[i]; + i += offset; + } +} + +__global__ void initialize_s( + conjugrad_float_t *d_s, + conjugrad_float_t *d_g, + int nvar) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + while (i < nvar) { + d_s[i] = - d_g[i]; + i += offset; + } +} + +__global__ void update_x( + conjugrad_float_t *d_x, + conjugrad_float_t *d_s, + conjugrad_float_t alpha, + conjugrad_float_t prevalpha, + int nvar) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + while (i < nvar) { + d_x[i] = (alpha - prevalpha) * d_s[i] + d_x[i]; + i += offset; + } +} + +#ifdef __cpluplus +extern "C" { +#endif +void vecnorm_gpu( + conjugrad_float_t *d_x, + conjugrad_float_t *res, + int nvar +) { + unsigned int nblocks = 128; + unsigned int nthreads = 256; + size_t nbytes = sizeof(conjugrad_float_t) * nthreads; + + conjugrad_float_t *d_inter; + 
conjugrad_float_t *d_res;
+	CHECK_ERR(cudaMalloc((void **) &d_inter, sizeof(conjugrad_float_t) * nblocks));
+	CHECK_ERR(cudaMalloc((void **) &d_res, sizeof(conjugrad_float_t)));
+
+	// FIX: launch configuration was lost in this patch ("<<>>"); restored to
+	// <<<nblocks, nthreads, nbytes>>>, matching the intact sum_reduction launch
+	// below and the nbytes shared-memory sizing above.
+	vecdot_inter<<<nblocks, nthreads, nbytes>>>(d_x, d_x, d_inter, nvar);
+	cudaDeviceSynchronize();
+	cudaError_t err = cudaGetLastError();
+	CHECK_ERR(err);
+
+
+	// second pass: fold the 128 per-block partials into a single value
+	nbytes = sizeof(conjugrad_float_t) * nblocks;
+	sum_reduction<<<1, nblocks, nbytes>>>(d_inter, d_res, nblocks);
+	cudaDeviceSynchronize();
+	err = cudaGetLastError();
+	CHECK_ERR(err);
+
+
+	CHECK_ERR(cudaMemcpy(res, d_res, sizeof(conjugrad_float_t), cudaMemcpyDeviceToHost));
+
+	CHECK_ERR(cudaFree(d_inter));
+	CHECK_ERR(cudaFree(d_res));
+}
+
+// Host wrapper: *res = dot(d_x, d_y) via two-pass block reduction.
+void vecdot_gpu(
+	conjugrad_float_t *d_x,
+	conjugrad_float_t *d_y,
+	conjugrad_float_t *res,
+	int nvar
+) {
+	unsigned int nblocks = 128;
+	unsigned int nthreads = 256;
+	size_t nbytes = sizeof(conjugrad_float_t) * nthreads;
+
+	conjugrad_float_t *d_inter;
+	conjugrad_float_t *d_res;
+	CHECK_ERR(cudaMalloc((void **) &d_inter, sizeof(conjugrad_float_t) * nblocks));
+	CHECK_ERR(cudaMalloc((void **) &d_res, sizeof(conjugrad_float_t)));
+
+	// FIX: restored launch configuration (was garbled to "<<>>")
+	vecdot_inter<<<nblocks, nthreads, nbytes>>>(d_x, d_y, d_inter, nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+
+	nbytes = sizeof(conjugrad_float_t) * nblocks;
+	sum_reduction<<<1, nblocks, nbytes>>>(d_inter, d_res, nblocks);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+
+	CHECK_ERR(cudaMemcpy(res, d_res, sizeof(conjugrad_float_t), cudaMemcpyDeviceToHost));
+
+	CHECK_ERR(cudaFree(d_inter));
+	CHECK_ERR(cudaFree(d_res));
+}
+
+// Host wrapper for the initialize_s kernel (s <- -g).
+void initialize_s_gpu(
+	conjugrad_float_t *d_s,
+	conjugrad_float_t *d_g,
+	int nvar
+) {
+	int nblocks = 128;
+	int nthreads = 256;
+
+	// FIX: restored launch configuration (no shared memory needed here)
+	initialize_s<<<nblocks, nthreads>>>(d_s, d_g, nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+}
+
+// Host wrapper for the update_s kernel (s <- beta*s - g).
+void update_s_gpu(
+	conjugrad_float_t *d_old_s,
+	conjugrad_float_t *d_g,
+	conjugrad_float_t beta,
+	int nvar
+) {
+	unsigned int nblocks = 128;
+	unsigned int nthreads = 256;
+
+	// FIX: restored launch configuration (was garbled to "<<>>")
+	update_s<<<nblocks, nthreads>>>(d_old_s, d_g, beta,
nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+}
+
+// Host wrapper for the update_x kernel (x += (alpha - prevalpha) * s).
+void update_x_gpu(
+	conjugrad_float_t *d_x,
+	conjugrad_float_t *d_s,
+	conjugrad_float_t alpha,
+	conjugrad_float_t prevalpha,
+	int nvar
+) {
+	int nblocks = 128;
+	int nthreads = 256;
+	// FIX: restored launch configuration (was garbled to "<<>>")
+	update_x<<<nblocks, nthreads>>>(d_x, d_s, alpha, prevalpha, nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+}
+
+// FIX: guard macro was misspelled "__cpluplus"; closes the extern "C" block.
+#ifdef __cplusplus
+}
+#endif
diff --git a/lib/libconjugrad/src/debug.c b/lib/libconjugrad/src/debug.c
new file mode 100644
index 0000000..d437d0d
--- /dev/null
+++ b/lib/libconjugrad/src/debug.c
@@ -0,0 +1,52 @@
+// NOTE(review): the three system header names were lost in this patch; the
+// body uses printf, memcpy and bool, so <stdio.h>, <string.h> and <stdbool.h>
+// match — confirm against the original source.
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "conjugrad.h"
+
+// Central-difference gradient check for component i (optionally perturbing a
+// second component extra_i in lockstep): prints the numeric gradient
+// (f(x+eps) - f(x-eps)) / (2*eps) next to the symbolic gradient g[i]
+// returned by proc_evaluate. x0 is left unmodified.
+void conjugrad_debug_numdiff(
+	int n,
+	conjugrad_float_t *x0,
+	int i,
+	conjugrad_evaluate_t proc_evaluate,
+	void *instance,
+	conjugrad_float_t epsilon,
+	bool have_extra_i,
+	int extra_i
+) {
+
+	conjugrad_float_t *xA = conjugrad_malloc(n);
+	conjugrad_float_t *xB = conjugrad_malloc(n);
+	conjugrad_float_t *g = conjugrad_malloc(n);
+
+	memcpy(xA, x0, sizeof(conjugrad_float_t) * n);
+	memcpy(xB, x0, sizeof(conjugrad_float_t) * n);
+
+	xA[i] -= epsilon;
+	xB[i] += epsilon;
+
+	if(have_extra_i) {
+		xA[extra_i] -= epsilon;
+		xB[extra_i] += epsilon;
+	}
+
+	conjugrad_float_t fxA = proc_evaluate(instance, xA, g, n);
+	conjugrad_float_t fxB = proc_evaluate(instance, xB, g, n);
+	// final evaluation at x0 so g holds the symbolic gradient at x0
+	conjugrad_float_t fx = proc_evaluate(instance, x0, g, n);
+
+	conjugrad_float_t gNumeric = (fxB - fxA) / (2 * epsilon);
+	conjugrad_float_t gSymbolic = g[i];
+
+	printf("\t@%8d: gNum = %15g gSym = %15g", i, gNumeric, gSymbolic);
+
+	if(have_extra_i) {
+		printf("\t@%8d: gSym = %15g", extra_i, g[extra_i]);
+
+	}
+
+	printf("\n");
+
+	conjugrad_free(xA);
+	conjugrad_free(xB);
+	conjugrad_free(g);
+}
diff --git a/lib/libconjugrad/src/help_functions.cu b/lib/libconjugrad/src/help_functions.cu
new file mode 100644
index 0000000..2546246
--- /dev/null
+++ b/lib/libconjugrad/src/help_functions.cu
@@ -0,0 +1,59 @@
+#include "help_functions.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Single-block kernel: sums nvar elements of d_in into d_out[0] using a
+// dynamically sized shared-memory buffer (blockDim.x elements at launch).
+// NOTE(review): duplicates the reduction in conjugrad_kernels.cu — confirm
+// whether one copy should be removed.
+__global__ void sum_reduction(
+	conjugrad_float_t *d_in,
+	conjugrad_float_t *d_out,
+	int nvar
+) {
+	int i = threadIdx.x;
+
+	extern __shared__ conjugrad_float_t s_data[];
+
+	s_data[threadIdx.x] = F0; // 0.0
+
+	// each thread accumulates a strided slice of the input
+	while (i < nvar) {
+		s_data[threadIdx.x] += d_in[i];
+		i += blockDim.x;
+	}
+	__syncthreads();
+
+	sum_reduction_function(s_data, threadIdx.x);
+
+	if (0 == threadIdx.x) {
+		d_out[0] = s_data[0];
+	}
+}
+
+// Tree reduction over shared memory down to 64 elements, then hands off to
+// the unsynchronized warp_reduction for the final warp.
+__device__ void sum_reduction_function(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	for (unsigned int s = blockDim.x >> 1; s > 32; s >>= 1) {
+		if (threadIdx.x < s) {
+			s_data[threadIdx.x] += s_data[threadIdx.x + s];
+		}
+		__syncthreads();
+	}
+	if (threadIdx.x < 32) {
+		warp_reduction(s_data, threadIdx.x);
+	}
+}
+
+// Final intra-warp reduction relying on volatile shared memory instead of
+// explicit synchronization.
+// NOTE(review): reads s_data[tid + 32] unconditionally, so the shared buffer
+// must hold at least 64 elements (blockDim.x >= 64) — confirm all launch
+// sites; also, implicit warp-synchronous reduction is deprecated on modern
+// CUDA architectures (would need __syncwarp()).
+__device__ void warp_reduction(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	s_data[tid] += s_data[tid + 32];
+	s_data[tid] += s_data[tid + 16];
+	s_data[tid] += s_data[tid + 8];
+	s_data[tid] += s_data[tid + 4];
+	s_data[tid] += s_data[tid + 2];
+	s_data[tid] += s_data[tid + 1];
+}
+
+#ifdef __cplusplus
+}
+#endif