diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 11569b0..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "lib/libconjugrad"] - path = lib/libconjugrad - url = https://bitbucket.org/soedinglab/libconjugrad.git diff --git a/README.md b/README.md index 577caba..bf47a56 100644 --- a/README.md +++ b/README.md @@ -41,9 +41,9 @@ For the padded version: We recommend compiling CCMpred on the machine that should run the computations so that it can be optimized for the appropriate CPU/GPU architecture. ### Downloading -If you want to compile the most recent version, use the follwing to clone both CCMpred and its submodules: +If you want to compile the most recent version, use the following to clone the latest version of CCMpred: - git clone --recursive https://github.com/soedinglab/CCMpred.git + git clone https://github.com/soedinglab/CCMpred.git ### Compilation With the sourcecode ready, simply run cmake with the default settings and libraries should be auto-detected: diff --git a/lib/libconjugrad b/lib/libconjugrad deleted file mode 160000 index e503ee8..0000000 --- a/lib/libconjugrad +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e503ee8a1e1d392339a1cd3ddd540468fe896cd1 diff --git a/lib/libconjugrad/.gitignore b/lib/libconjugrad/.gitignore new file mode 100644 index 0000000..61eede7 --- /dev/null +++ b/lib/libconjugrad/.gitignore @@ -0,0 +1,55 @@ +### C + +# Object files +*.o +*.ko +*.obj +*.elf + +#Binaries +/bin + +# Libraries +*.lib +*.a + +# Shared objects (inc. 
Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + + +### vim + +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +*.un~ +Session.vim +.netrwhist +*~ + + +### CMake + +CMakeCache.txt +CMakeFiles +Makefile +cmake_install.cmake +install_manifest.txt +_build + + +### Python + +__pycache__/ +*.py[cod] + diff --git a/lib/libconjugrad/.ycm_extra_conf.py b/lib/libconjugrad/.ycm_extra_conf.py new file mode 100644 index 0000000..e9441b4 --- /dev/null +++ b/lib/libconjugrad/.ycm_extra_conf.py @@ -0,0 +1,120 @@ + +import os +import ycm_core + +flags = [ +'-Wall', +'-Wextra', +'-Werror', +'-Wc++98-compat', +'-Wno-long-long', +'-Wno-variadic-macros', +'-fexceptions', +'-DNDEBUG', + +'-std=c11', +'-x', +'c', +'-I', +'./include' +] + + +# Set this to the absolute path to the folder (NOT the file!) containing the +# compile_commands.json file to use that instead of 'flags'. See here for +# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html +# +# Most projects will NOT need to set this to anything; you can just change the +# 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
+compilation_database_folder = '' + +if os.path.exists( compilation_database_folder ): + database = ycm_core.CompilationDatabase( compilation_database_folder ) +else: + database = None + +SOURCE_EXTENSIONS = [ '.cpp', '.cxx', '.cc', '.c', '.m', '.mm' ] + +def DirectoryOfThisScript(): + return os.path.dirname( os.path.abspath( __file__ ) ) + + +def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): + if not working_directory: + return list( flags ) + new_flags = [] + make_next_absolute = False + path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith( '/' ): + new_flag = os.path.join( working_directory, flag ) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith( path_flag ): + path = flag[ len( path_flag ): ] + new_flag = path_flag + os.path.join( working_directory, path ) + break + + if new_flag: + new_flags.append( new_flag ) + return new_flags + + +def IsHeaderFile( filename ): + extension = os.path.splitext( filename )[ 1 ] + return extension in [ '.h', '.hxx', '.hpp', '.hh' ] + + +def GetCompilationInfoForFile( filename ): + # The compilation_commands.json file generated by CMake does not have entries + # for header files. So we do our best by asking the db for flags for a + # corresponding source file, if any. If one exists, the flags for that file + # should be good enough. 
+ if IsHeaderFile( filename ): + basename = os.path.splitext( filename )[ 0 ] + for extension in SOURCE_EXTENSIONS: + replacement_file = basename + extension + if os.path.exists( replacement_file ): + compilation_info = database.GetCompilationInfoForFile( + replacement_file ) + if compilation_info.compiler_flags_: + return compilation_info + return None + return database.GetCompilationInfoForFile( filename ) + + +def FlagsForFile( filename, **kwargs ): + if database: + # Bear in mind that compilation_info.compiler_flags_ does NOT return a + # python list, but a "list-like" StringVec object + compilation_info = GetCompilationInfoForFile( filename ) + if not compilation_info: + return None + + final_flags = MakeRelativePathsInFlagsAbsolute( + compilation_info.compiler_flags_, + compilation_info.compiler_working_dir_ ) + + # NOTE: This is just for YouCompleteMe; it's highly likely that your project + # does NOT need to remove the stdlib flag. DO NOT USE THIS IN YOUR + # ycm_extra_conf IF YOU'RE NOT 100% SURE YOU NEED IT. 
+ try: + final_flags.remove( '-stdlib=libc++' ) + except ValueError: + pass + else: + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) + + return { + 'flags': final_flags, + 'do_cache': True + } diff --git a/lib/libconjugrad/CMakeLists.txt b/lib/libconjugrad/CMakeLists.txt new file mode 100644 index 0000000..dcb47ec --- /dev/null +++ b/lib/libconjugrad/CMakeLists.txt @@ -0,0 +1,64 @@ +cmake_minimum_required(VERSION 3.8) +include(CheckLanguage) + +project(libconjugrad) +set(LIBCONJUGRAD_MAJOR_VERSION 0) +set(LIBCONJUGRAD_MINOR_VERSION 0) +set(LIBCONJUGRAD_PATCH_VERSION 4) +set(LIBCONJUGRAD_VERSION ${LIBCONJUGRAD_MAJOR_VERSION}.${LIBCONJUGRAD_MINOR_VERSION}.${LIBCONJUGRAD_PATCH_VERSION}) + +if(CMAKE_VERSION VERSION_GREATER 2.8.11) + cmake_policy(SET CMP0022 OLD) +endif(CMAKE_VERSION VERSION_GREATER 2.8.11) + +set(CMAKE_C_FLAGS "-std=c99 -O3 -g -lm") +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) + +include_directories("${PROJECT_SOURCE_DIR}/include") + +set(SOURCES src/conjugrad.c src/debug.c) + +if(NOT DEFINED WITH_SIMD) + set(WITH_SIMD ansi) +endif(NOT DEFINED WITH_SIMD) + +if(NOT DEFINED WITH_CUDA) + set(WITH_CUDA ON) +endif(NOT DEFINED WITH_CUDA) + +if(WITH_SIMD STREQUAL ansi) + add_definitions(-DCONJUGRAD_SIMD=0) + list(APPEND SOURCES src/arithmetic_ansi.c) +elseif(WITH_SIMD STREQUAL sse) + add_definitions(-DCONJUGRAD_SIMD=1) + list(APPEND SOURCES src/arithmetic_simd.c) +elseif(WITH_SIMD STREQUAL avx) + add_definitions(-DCONJUGRAD_SIMD=2) + list(APPEND SOURCES src/arithmetic_simd.c) +endif(WITH_SIMD STREQUAL ansi) + +set(BUILD_SHARED_LIBS OFF) + +if(WITH_CUDA) + find_package(CUDA) + check_language(CUDA) + enable_language(CUDA) + + #list(APPEND SOURCES src/conjugrad_cuda.c src/help_functions.cu src/conjugrad_kernels.cu) + list(APPEND SOURCES src/conjugrad_cuda.c src/conjugrad_kernels.cu) + + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +endif(WITH_CUDA) + +add_library(conjugrad 
${SOURCES}) + +install(TARGETS conjugrad DESTINATION lib) + +add_executable(rosenbrock EXCLUDE_FROM_ALL samples/rosenbrock.c) +target_link_libraries(rosenbrock conjugrad) + +add_executable(polynomial EXCLUDE_FROM_ALL samples/polynomial.c) +target_link_libraries(polynomial conjugrad) + +add_executable(sphere EXCLUDE_FROM_ALL samples/sphere.c) +target_link_libraries(sphere conjugrad) diff --git a/lib/libconjugrad/include/arithmetic.h b/lib/libconjugrad/include/arithmetic.h new file mode 100644 index 0000000..cf618ff --- /dev/null +++ b/lib/libconjugrad/include/arithmetic.h @@ -0,0 +1,34 @@ +#ifndef CONJUGRAD_ARITHMETIC_H +#define CONJUGRAD_ARITHMETIC_H + +#include "conjugrad.h" + + +// set all vector elements to 0 +void vec0(conjugrad_float_t *dst, int n); + +// copy src onto dst +void veccpy(conjugrad_float_t *dst, conjugrad_float_t *src, int n); + +// dst += src +//void veciaddv(conjugrad_float_t *dst, conjugrad_float_t *src, int n); + +// dst *= f +void vecimulc(conjugrad_float_t *dst, conjugrad_float_t f, int n); + +// dst -= f * src +void vecifma(conjugrad_float_t *dst, conjugrad_float_t *src, conjugrad_float_t f, int n); + +// dst = f * b - a +void vecsfms(conjugrad_float_t *dst, conjugrad_float_t *a, conjugrad_float_t f, conjugrad_float_t *b, int n); + +// dst = sum_n v[n] * w[n] +conjugrad_float_t vecdot(conjugrad_float_t *v, conjugrad_float_t *w, int n); + +// dst = sum_n v[n]^2 +conjugrad_float_t vecnorm(conjugrad_float_t *v, int n); + + +conjugrad_float_t *vecalloc(int n); + +#endif diff --git a/lib/libconjugrad/include/conjugrad.h b/lib/libconjugrad/include/conjugrad.h new file mode 100644 index 0000000..b287a90 --- /dev/null +++ b/lib/libconjugrad/include/conjugrad.h @@ -0,0 +1,142 @@ +#include + +#ifndef __LIBCONJ_H__ +#define __LIBCONJ_H__ + +#ifndef CONJUGRAD_FLOAT +#define CONJUGRAD_FLOAT 64 +#endif + +#if CONJUGRAD_FLOAT == 32 +typedef float conjugrad_float_t; + +#define F0 0.0f +#define F001 0.01f +#define F02 0.2f +#define F05 0.5f +#define F08 
0.8f +#define F1 1.0f +#define F2 2.0f +#define FInf 0.0f/0.0f +#define fsqrt sqrtf +#define flog logf +#define fexp expf +#define fdlog __logf +#define fdexp __expf + + +#elif CONJUGRAD_FLOAT == 64 +typedef double conjugrad_float_t; + +#define F0 0.0 +#define F001 0.01 +#define F02 0.2 +#define F05 0.5 +#define F08 0.8 +#define F1 1.0 +#define F2 2.0 +#define FInf 0.0/0.0 +#define fsqrt sqrt +#define flog log +#define fexp exp +#define fdlog log +#define fdexp exp + +#else +#error "Only double-precision (CONJGRAD_FLOAT=64) or single-precision (CONJGRAD_FLOAT=32) supported" +#endif + +typedef struct { + int max_iterations; + int max_linesearch; + int k; + conjugrad_float_t epsilon; // Tolerance for convergence criterion + conjugrad_float_t ftol; // Tolerance for line search sufficient decrease criterion + conjugrad_float_t wolfe; // Tolerance for line search curvature criterion + conjugrad_float_t alpha_mul; // Line search backtracking factor in range (0,1) + conjugrad_float_t min_gnorm; // Value for initial gnorm that will be considered already minimized +} conjugrad_parameter_t; + + + +typedef conjugrad_float_t (*conjugrad_evaluate_t)( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +); + +typedef int (*conjugrad_progress_t)( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +); + +int conjugrad_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *d_fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +); + +int conjugrad( + int n, + conjugrad_float_t *x, + conjugrad_float_t *fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +); + +int linesearch( + int n, + conjugrad_float_t *x, + 
conjugrad_float_t *fx, + conjugrad_float_t *g, + conjugrad_float_t *s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +); + +conjugrad_parameter_t *conjugrad_init(); +conjugrad_float_t *conjugrad_malloc(int n); +void conjugrad_free(conjugrad_float_t *x); + + +void conjugrad_debug_numdiff( + int n, + conjugrad_float_t *x0, + int i, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_float_t epsilon, + bool have_extra_i, + int extra_i +); + + +enum { + CONJUGRAD_SUCCESS = 0, + CONJUGRAD_ALREADY_MINIMIZED, + + CONJUGRADERR_UNKNOWN = -1024, + CONJUGRADERR_MAXIMUMLINESEARCH, + CONJUGRADERR_MAXIMUMITERATION + +}; + + +#endif diff --git a/lib/libconjugrad/include/conjugrad_kernels.h b/lib/libconjugrad/include/conjugrad_kernels.h new file mode 100644 index 0000000..ec68aab --- /dev/null +++ b/lib/libconjugrad/include/conjugrad_kernels.h @@ -0,0 +1,50 @@ +#ifndef CONJUGRAD_KERNELS_H +#define CONJUGRAD_KERNELS_H + +#include "conjugrad.h" + +#define CHECK_ERR(a) if (cudaSuccess != (a)) { printf("CUDA error No. 
%d in %s at line %d\n", a, __FILE__, __LINE__); exit(EXIT_FAILURE); } + +#ifdef __cplusplus +extern "C" { +#endif + +void vecnorm_gpu( + conjugrad_float_t *d_g, + conjugrad_float_t *d_res, + int nvar +); + +void vecdot_gpu( + conjugrad_float_t *d_x, + conjugrad_float_t *d_y, + conjugrad_float_t *d_res, + int nvar +); + +void initialize_s_gpu( + conjugrad_float_t *d_s, + conjugrad_float_t *d_g, + int nvar +); + +void update_s_gpu( + conjugrad_float_t *d_old_s, + conjugrad_float_t *d_g, + conjugrad_float_t beta, + int nvar +); + +void update_x_gpu( + conjugrad_float_t *d_x, + conjugrad_float_t *d_s, + conjugrad_float_t alpha, + conjugrad_float_t prevalpha, + int nvar +); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/libconjugrad/include/help_functions.h b/lib/libconjugrad/include/help_functions.h new file mode 100644 index 0000000..f9aad7b --- /dev/null +++ b/lib/libconjugrad/include/help_functions.h @@ -0,0 +1,30 @@ +#include "conjugrad.h" + +#ifndef HELP_FUNCTIONS_H +#define HELP_FUNCTIONS_H + +#ifdef __cplusplus +extern "C" { +#endif + +__device__ void sum_reduction_function( + volatile conjugrad_float_t *s_data, + int tid +); + +__device__ void warp_reduction( + volatile conjugrad_float_t *s_data, + int tid +); + +__global__ void sum_reduction( + conjugrad_float_t *d_in, + conjugrad_float_t *d_out, + int nvar +); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/libconjugrad/samples/polynomial.c b/lib/libconjugrad/samples/polynomial.c new file mode 100644 index 0000000..a1ac6bd --- /dev/null +++ b/lib/libconjugrad/samples/polynomial.c @@ -0,0 +1,74 @@ +#include +#include +#include "conjugrad.h" + + +conjugrad_float_t evaluate( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +) { + (void)instance; + (void)n; + + conjugrad_float_t fx = 0.0; + + + conjugrad_float_t a = x[0]; + + //printf("eval step = %g, a = %g\n", alpha, a); + + conjugrad_float_t *g_a = &g[0]; + + *g_a = 0.; + + fx = (a-4)*(a-4); + + 
*g_a = 2 * a - 8; + + printf("fx = %g, da = %g\n", fx, g[0]); + + return fx; + + +} + +int progress( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +) { + (void)instance; + (void)x; + (void)g; + (void)n; + + printf("%d\t%d\t%g\t%g\t%g\t%g\n", k, ls, fx, xnorm, gnorm, step); + return true; +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + conjugrad_parameter_t *param = conjugrad_init(); + + + int n = 1; + conjugrad_float_t *x = conjugrad_malloc(n); + x[0] = 0; + conjugrad_float_t fx; + + int ret = conjugrad(n, x, &fx, evaluate, progress, NULL, param); + + printf("Return code %d\n", ret); + +} diff --git a/lib/libconjugrad/samples/rosenbrock.c b/lib/libconjugrad/samples/rosenbrock.c new file mode 100644 index 0000000..07d56a1 --- /dev/null +++ b/lib/libconjugrad/samples/rosenbrock.c @@ -0,0 +1,77 @@ +#include +#include +#include "conjugrad.h" + + +conjugrad_float_t evaluate( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +) { + (void)instance; + (void)n; + + conjugrad_float_t fx = 0.0; + + + conjugrad_float_t a = x[0]; + conjugrad_float_t b = x[1]; + + //printf("eval step = %g, a = %g, b = %g\n", alpha, a, b); + + conjugrad_float_t *g_a = &g[0]; + conjugrad_float_t *g_b = &g[1]; + + *g_a = *g_b = 0.; + + fx = (1-a) * (1-a) + 100 * (b - a*a) * (b - a*a); + + *g_a = -2 + 2*a - 400*a*b + 400*a*a*a; + *g_b = 200 * b - 200 * a * a; + + //printf("fx = %g, da = %g, db = %g\n", fx, g[0], g[1]); + + return fx; + + +} + +int progress( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +) { + (void)instance; + (void)x; + (void)g; + (void)n; + + 
printf("%d\t%d\t%g\t%g\t%g\t%g\n", k, ls, fx, xnorm, gnorm, step); + return true; +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + conjugrad_parameter_t *param = conjugrad_init(); + + int n = 2; + conjugrad_float_t *x = conjugrad_malloc(n); + x[0] = 0; + x[1] = 0; + conjugrad_float_t fx; + + int ret = conjugrad(n, x, &fx, evaluate, progress, NULL, param); + + printf("Return code %d\n", ret); + +} diff --git a/lib/libconjugrad/samples/sphere.c b/lib/libconjugrad/samples/sphere.c new file mode 100644 index 0000000..8b9fdf1 --- /dev/null +++ b/lib/libconjugrad/samples/sphere.c @@ -0,0 +1,78 @@ +#include +#include +#include "conjugrad.h" + + +conjugrad_float_t evaluate( + void *instance, + const conjugrad_float_t *x, + conjugrad_float_t *g, + const int n +) { + (void)instance; + (void)n; + + conjugrad_float_t fx = 0.0; + + + conjugrad_float_t a = x[0]; + conjugrad_float_t b = x[1]; + + // printf("eval step = %g, a = %g, b = %g\n", alpha, a, b); + + conjugrad_float_t *g_a = &g[0]; + conjugrad_float_t *g_b = &g[1]; + + *g_a = *g_b = 0.; + + fx = (a - 2) * (a - 2) + (b - 3) * (b - 3); + + *g_a = 2 * a - 4; + *g_b = 2 * b - 6; + + printf("fx = %g, da = %g, db = %g\n", fx, g[0], g[1]); + + return fx; + + +} + +int progress( + void *instance, + const conjugrad_float_t *x, + const conjugrad_float_t *g, + const conjugrad_float_t fx, + const conjugrad_float_t xnorm, + const conjugrad_float_t gnorm, + const conjugrad_float_t step, + int n, + int k, + int ls +) { + (void)instance; + (void)x; + (void)g; + (void)n; + + printf("%d\t%d\t%g\t%g\t%g\t%g\n", k, ls, fx, xnorm, gnorm, step); + return true; +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + conjugrad_parameter_t *param = conjugrad_init(); + + + int n = 2; + conjugrad_float_t *x = conjugrad_malloc(n); + x[0] = 0; + x[1] = 1; + conjugrad_float_t fx; + + int ret = conjugrad(n, x, &fx, evaluate, progress, NULL, param); + + printf("Return code %d\n", ret); + +} diff --git 
a/lib/libconjugrad/src/arithmetic_ansi.c b/lib/libconjugrad/src/arithmetic_ansi.c new file mode 100644 index 0000000..8efb73b --- /dev/null +++ b/lib/libconjugrad/src/arithmetic_ansi.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include "arithmetic.h" +#include "conjugrad.h" + +void vec0(conjugrad_float_t *dst, int n) { + memset(dst, 0, sizeof(conjugrad_float_t) * n); +} + +void veccpy(conjugrad_float_t *dst, conjugrad_float_t *src, int n) { + memcpy(dst, src, sizeof(conjugrad_float_t) * n); +} + +void vecimulc(conjugrad_float_t *dst, conjugrad_float_t f, int n) { + for(int i = 0; i < n; i++) { + dst[i] *= f; + } +} + +void vecifma(conjugrad_float_t *dst, conjugrad_float_t *src, conjugrad_float_t f, int n) { + for(int i = 0; i < n; i++) { + dst[i] += f * src[i]; + } +} + +void vecsfms(conjugrad_float_t *dst, conjugrad_float_t *a, conjugrad_float_t f, conjugrad_float_t *b, int n) { + for(int i = 0; i < n; i++) { + dst[i] = f * b[i] - a[i]; + } +} + +conjugrad_float_t vecnorm(conjugrad_float_t *v, int n) { + conjugrad_float_t sum = 0.; + for(int i = 0; i < n; i++) { + sum += v[i] * v[i]; + } + return sum; +} + +conjugrad_float_t vecdot(conjugrad_float_t *v, conjugrad_float_t *w, int n) { + conjugrad_float_t sum = 0.; + for(int i = 0; i < n; i++) { + sum += v[i] * w[i]; + } + return sum; +} + +conjugrad_float_t *vecalloc(int n) { + return (conjugrad_float_t *) malloc(sizeof(conjugrad_float_t) * n); +} diff --git a/lib/libconjugrad/src/arithmetic_simd.c b/lib/libconjugrad/src/arithmetic_simd.c new file mode 100644 index 0000000..60d8949 --- /dev/null +++ b/lib/libconjugrad/src/arithmetic_simd.c @@ -0,0 +1,211 @@ +#include +#include +#include +#include "arithmetic.h" +#include "conjugrad.h" + +#include + +#ifndef CONJUGRAD_SIMD +#define CONJUGRAD_SIMD 1 +#endif + +#if CONJUGRAD_FLOAT == 32 +typedef float fval; + +#if CONJUGRAD_SIMD == 1 // SSE +#define STRIDE 4 +typedef __m128 simd; +#define simdset1 _mm_set1_ps +#define simdload _mm_load_ps +#define simdmul 
_mm_mul_ps +#define simdadd _mm_add_ps +#define simdsub _mm_sub_ps +#define simdstore _mm_store_ps + +#elif CONJUGRAD_SIMD == 2 // AVX +#define STRIDE 8 +typedef __m256 simd; +#define simdset1 _mm256_set1_ps +#define simdload _mm256_load_ps +#define simdmul _mm256_mul_ps +#define simdadd _mm256_add_ps +#define simdsub _mm256_sub_ps +#define simdstore _mm256_store_ps +#endif // CONJUGRAD_SIMD + +#else // CONJUGRAD_FLOAT +typedef double fval; + +#if CONJUGRAD_SIMD == 1 // SSE +#define STRIDE 2 +typedef __m128d simd; +#define simdset1 _mm_set1_pd +#define simdload _mm_load_pd +#define simdmul _mm_mul_pd +#define simdadd _mm_add_pd +#define simdsub _mm_sub_pd +#define simdstore _mm_store_pd + +#elif CONJUGRAD_SIMD == 2 // AVX +#define STRIDE 4 +typedef __m256d simd; +#define simdset1 _mm256_set1_pd +#define simdload _mm256_load_pd +#define simdmul _mm256_mul_pd +#define simdadd _mm256_add_pd +#define simdsub _mm256_sub_pd +#define simdstore _mm256_store_pd +#endif // CONJUGRAD_SIMD +#endif // CONJUGRAD_FLOAT + +void vec0(conjugrad_float_t *dst, int n) { + memset(dst, 0, sizeof(conjugrad_float_t) * n); +} + +void veccpy(conjugrad_float_t *dst, conjugrad_float_t *src, int n) { + memcpy(dst, src, sizeof(conjugrad_float_t) * n); +} + +void vecimulc(conjugrad_float_t *dst, conjugrad_float_t f, int n) { + simd _dst; + simd _f = simdset1(f); + + fval *pdst = (fval *)dst; + + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i+= STRIDE) { + _dst = simdload(pdst); + _dst = simdmul(_dst, _f); + simdstore(pdst, _dst); + + pdst += STRIDE; + } + + for(int i = m; i < n; i++) { + dst[i] *= f; + } +} + +void vecifma(conjugrad_float_t *dst, conjugrad_float_t *src, conjugrad_float_t f, int n) { + simd _dst; + simd _src; + + simd _f = simdset1(f); + + fval *pdst = (fval *)dst; + fval *psrc = (fval *)src; + + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _dst = simdload(pdst); + _src = simdload(psrc); + _src = simdmul(_src, _f); + _dst = simdadd(_dst, _src); 
+ simdstore(pdst, _dst); + + pdst += STRIDE; + psrc += STRIDE; + } + + for(int i = m; i < n; i++) { + dst[i] += f * src[i]; + } +} + +void vecsfms(conjugrad_float_t *dst, conjugrad_float_t *a, conjugrad_float_t f, conjugrad_float_t *b, int n) { + simd _dst; + simd _a; + simd _b; + + simd _f = simdset1(f); + + fval *pdst = (fval *)dst; + fval *pa = (fval *)a; + fval *pb = (fval *)b; + + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _a = simdload(pa); + _b = simdload(pb); + _dst = simdmul(_b, _f); + _dst = simdsub(_dst, _a); + simdstore(pdst, _dst); + + pdst += STRIDE; + pa += STRIDE; + pb += STRIDE; + } + + for(int i = m; i < n; i++) { + dst[i] = a[i] + f * b[i]; + } + +} + +conjugrad_float_t vecnorm(conjugrad_float_t *v, int n) { + + simd _sum = simdset1(.0f); + simd _v; + + + fval *pv = (fval *)v; + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _v = simdload(pv); + _v = simdmul(_v, _v); + _sum = simdadd(_sum, _v); + pv += STRIDE; + } + + fval psum[STRIDE]; + simdstore(psum, _sum); + + fval sum = .0f; + for(int i = 0; i < STRIDE; i++) { + sum += psum[i]; + } + + for(int i = m; i < n; i++) { + sum += v[i] * v[i]; + } + + return sum; +} + +conjugrad_float_t vecdot(conjugrad_float_t *v, conjugrad_float_t *w, int n) { + simd _sum = simdset1(.0f); + simd _v; + simd _w; + + fval *pv = (fval *)v; + fval *pw = (fval *)w; + int m = (n / STRIDE) * STRIDE; + for(int i = 0; i < m; i += STRIDE) { + _v = simdload(pv); + _w = simdload(pw); + _v = simdmul(_v, _w); + _sum = simdadd(_sum, _v); + pv += STRIDE; + pw += STRIDE; + } + + fval psum[STRIDE]; + simdstore(psum, _sum); + + fval sum = .0f; + for(int i = 0; i < STRIDE; i++) { + sum += psum[i]; + } + for(int i = m; i < n; i++) { + sum += v[i] * w[i]; + } + + return sum; +} + +conjugrad_float_t *vecalloc(int n) { + conjugrad_float_t *out; + posix_memalign((void **)&out, sizeof(conjugrad_float_t) * STRIDE, sizeof(conjugrad_float_t) * n); + return out; +} diff --git 
a/lib/libconjugrad/src/conjugrad.c b/lib/libconjugrad/src/conjugrad.c new file mode 100644 index 0000000..a258691 --- /dev/null +++ b/lib/libconjugrad/src/conjugrad.c @@ -0,0 +1,189 @@ +#include "conjugrad.h" +#include "arithmetic.h" + +#include +#include +#include +#include + + +void swap(conjugrad_float_t **a, conjugrad_float_t **b) { + conjugrad_float_t *c = *a; + *a = *b; + *b = c; +} + +int conjugrad( + int n, + conjugrad_float_t *x, + conjugrad_float_t *fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +) { + + conjugrad_float_t alpha, alphaprev, dg, dgprev, beta, gprevnorm, gnorm, xnorm; + conjugrad_float_t *g = conjugrad_malloc(n); + conjugrad_float_t *s = conjugrad_malloc(n); + int n_linesearch, n_iter = 0; + int ret = CONJUGRADERR_UNKNOWN; + conjugrad_float_t *k_last_fx = (conjugrad_float_t *)malloc(sizeof(conjugrad_float_t) * param->k); + conjugrad_float_t *check_fx = k_last_fx; + + vec0(s, n); + + *fx = proc_evaluate(instance, x, g, n); + + gnorm = vecnorm(g, n); + xnorm = vecnorm(x, n); + + if(gnorm <= param->min_gnorm || gnorm / xnorm <= param->epsilon) { + ret = CONJUGRAD_ALREADY_MINIMIZED; + goto conjugrad_exit; + } + + alpha = F1/fsqrt(gnorm); + + while(true) { + if(n_iter >= param->max_iterations) { + ret = CONJUGRADERR_MAXIMUMITERATION; + break; + }; + + // \delta x_n = - g_n + //vecimulc(g, -1, n); + + if(n_iter > 0) { + // fletcher-reeves: beta_n = ||x_n|| / ||x_{n-1}|| + beta = gnorm / gprevnorm; + + // s_n = \beta_n * s_{n-1} - g_n + vecsfms(s, g, beta, s, n); + dg = vecdot(s, g, n); + alpha = alphaprev * dgprev / dg; + + } else { + // s_0 = -g_0 + veccpy(s, g, n); + vecimulc(s, -1, n); + dg = vecdot(s, g, n); + } + + n_linesearch = linesearch(n, x, fx, g, s, &alpha, proc_evaluate, instance, param); + + gprevnorm = gnorm; + gnorm = vecnorm(g, n); + xnorm = vecnorm(x, n); + alphaprev = alpha; + dgprev = dg; + + if(n_linesearch < 0) { + ret = n_linesearch; + break; + 
} + + int pos = n_iter % param->k; + check_fx = k_last_fx + pos; + + if (n_iter >= param->k) { + conjugrad_float_t rel_change = (*check_fx - *fx) / *check_fx; + if (rel_change < param->epsilon) { + ret = CONJUGRAD_SUCCESS; + break; + } + } + + *check_fx = *fx; + + n_iter++; + proc_progress(instance, x, g, *fx, xnorm, gnorm, alpha, n, n_iter, n_linesearch); + + // convergence check + //if(xnorm < 1.0) { xnorm = 1.0; } + //if(gnorm / xnorm <= param->epsilon) { + // ret = CONJUGRAD_SUCCESS; + // break; + //} + } + + conjugrad_exit: + + conjugrad_free(g); + conjugrad_free(s); + free(k_last_fx); + return ret; +} + + +conjugrad_parameter_t *conjugrad_init() { + conjugrad_parameter_t *out = (conjugrad_parameter_t *)malloc(sizeof(conjugrad_parameter_t)); + out->max_linesearch = 100; + out->max_iterations = 1000; + out->epsilon = 1e-5; + out->ftol = 1e-4; + out->wolfe = 0.1; + out->alpha_mul = 0.5; + out->min_gnorm = 1e-8; + + return out; +} + +int linesearch( + int n, + conjugrad_float_t *x, + conjugrad_float_t *fx, + conjugrad_float_t *g, + conjugrad_float_t *s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +) { + + conjugrad_float_t fx_step; + int n_linesearch = 0; + + conjugrad_float_t dginit = vecdot(g, s, n); + conjugrad_float_t dgtest = dginit * param->ftol; + conjugrad_float_t dg; + conjugrad_float_t finit = *fx; + conjugrad_float_t prevalpha = 0; + + while(true) { + if(n_linesearch >= param->max_linesearch) { return CONJUGRADERR_MAXIMUMLINESEARCH; } + n_linesearch++; + + // do step + vecifma(x, s, *alpha - prevalpha, n); + + // x_{n+1} is available implicitly by x_{n+1} = x_n + alpha * s + fx_step = proc_evaluate(instance, x, g, n); + + // armijo condition + if(fx_step <= finit + *alpha * dgtest) { + + // wolfe condition (curvature) + dg = vecdot(s, g, n); + if(dg < param->wolfe * dginit) { + *fx = fx_step; + return n_linesearch; + } + + + } + + prevalpha = *alpha; + *alpha *= param->alpha_mul; + 
} +} + + + +conjugrad_float_t *conjugrad_malloc(int n) { + return vecalloc(n); +} + +void conjugrad_free(conjugrad_float_t *x) { + free(x); +} diff --git a/lib/libconjugrad/src/conjugrad_cuda.c b/lib/libconjugrad/src/conjugrad_cuda.c new file mode 100644 index 0000000..99b091d --- /dev/null +++ b/lib/libconjugrad/src/conjugrad_cuda.c @@ -0,0 +1,194 @@ +#include "conjugrad.h" +#include "conjugrad_kernels.h" +#include +#include +#include +#include +#include + + +int init_cg_mem_cuda(int nvar); + +int destroy_cg_mem_cuda(); + +int linesearch_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *fx, + conjugrad_float_t *d_g, + conjugrad_float_t *d_s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +); + +// global GPU pointer +conjugrad_float_t *d_g; +conjugrad_float_t *d_s; + + +int conjugrad_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *fx, + conjugrad_evaluate_t proc_evaluate, + conjugrad_progress_t proc_progress, + void *instance, + conjugrad_parameter_t *param +) { + conjugrad_float_t alpha, alphaprev, dg, dgprev, beta, gnorm, gprevnorm, xnorm; + // allocate memory on device + init_cg_mem_cuda(n); + int n_linesearch, n_iter = 0; + int ret = CONJUGRADERR_UNKNOWN; + conjugrad_float_t *k_last_fx = (conjugrad_float_t *)malloc(sizeof(conjugrad_float_t) * param->k); + conjugrad_float_t *check_fx = k_last_fx; + + + CHECK_ERR(cudaMemset(d_s, 0, sizeof(conjugrad_float_t) * n)); + + //call evaluate + *fx = proc_evaluate(instance, d_x, d_g, n); + + // get xnorm and gnorm + vecnorm_gpu(d_g, &gnorm, n); + vecnorm_gpu(d_x, &xnorm, n); + + if (gnorm / xnorm <= param->epsilon) { + ret = CONJUGRAD_ALREADY_MINIMIZED; + goto conjugrad_exit; + } + + alpha = F1 / fsqrt(gnorm); // F1 is 1.0 + + while (true) { + if (n_iter >= param->max_iterations) { + ret = CONJUGRADERR_MAXIMUMITERATION; + break; + } + + + + if (n_iter > 0) { + // fletcher-reeves: beta_n = ||x_n|| / ||x_{n-1}|| + // = ||g_n|| / 
||g_{n-1}|| + beta = gnorm / gprevnorm; + + // s_n = \delta x_n + \beta_n * s_{n-1} + // = \beta_n * s_{n-1} - g_n + update_s_gpu(d_s, d_g, beta, n); + vecdot_gpu(d_s, d_g, &dg, n); + alpha = alphaprev * dgprev / dg; + + } else { + // s_0 = \delta x_0 + initialize_s_gpu(d_s, d_g, n); + vecdot_gpu(d_s, d_g, &dg, n); + } + + // linesearch + n_linesearch = linesearch_gpu(n, d_x, fx, d_g, d_s, &alpha, proc_evaluate, instance, param); + + gprevnorm = gnorm; + vecnorm_gpu(d_g, &gnorm, n); + vecnorm_gpu(d_x, &xnorm, n); + alphaprev = alpha; + dgprev = dg; + + if(n_linesearch < 0) { + ret = n_linesearch; + break; + } + + int pos = n_iter % param->k; + check_fx = k_last_fx + pos; + + if (n_iter >= param->k) { + conjugrad_float_t rel_change = (*check_fx - *fx) / *fx; + if (rel_change < param->epsilon) { + ret = CONJUGRAD_SUCCESS; + break; + } + } + + *check_fx = *fx; + + n_iter++; + proc_progress(instance, d_x, d_g, *fx, xnorm, gnorm, alpha, n, n_iter, n_linesearch); + + // convergence check + //if (xnorm < F1) { xnorm = F1; } + //if (gnorm / xnorm <= 1e-3f) { + // ret = CONJUGRAD_SUCCESS; + // break; + //} + } + + conjugrad_exit: + + destroy_cg_mem_cuda(); + free(k_last_fx); + return ret; +} + + +int linesearch_gpu( + int n, + conjugrad_float_t *d_x, + conjugrad_float_t *fx, + conjugrad_float_t *d_g, + conjugrad_float_t *d_s, + conjugrad_float_t *alpha, + conjugrad_evaluate_t proc_evaluate, + void *instance, + conjugrad_parameter_t *param +) { + conjugrad_float_t fx_step; + conjugrad_float_t prevalpha; + prevalpha = F0; // 0.0 + + int n_linesearch = 0; + + conjugrad_float_t dginit; + vecdot_gpu(d_s, d_g, &dginit, n); + conjugrad_float_t dgtest = dginit * param->ftol; + conjugrad_float_t dg; + conjugrad_float_t finit = *fx; + + while (true) { + if (n_linesearch >= param->max_linesearch) { return CONJUGRADERR_MAXIMUMLINESEARCH; } + n_linesearch++; + + // do step + update_x_gpu(d_x, d_s, *alpha, prevalpha, n); + + // evaluate + fx_step = proc_evaluate(instance, d_x, d_g, n); + 
+		// armijo condition: sufficient decrease relative to finit
+		if (fx_step <= finit + *alpha * dgtest) {
+			vecdot_gpu(d_s, d_g, &dg, n);
+			// curvature (Wolfe) condition on the new directional derivative
+			if (dg < param->wolfe * dginit) {
+				*fx = fx_step;
+				return n_linesearch;
+			}
+		}
+		// reject the trial step; scale alpha and try again
+		prevalpha = *alpha;
+		*alpha *= param->alpha_mul;
+	}
+}
+
+
+// Allocate the file-scope device work buffers d_g (gradient) and d_s
+// (search direction), nvar elements each. Returns EXIT_SUCCESS.
+int init_cg_mem_cuda(int nvar) {
+	CHECK_ERR(cudaMalloc((void **) &d_g, sizeof(conjugrad_float_t) * nvar));
+	CHECK_ERR(cudaMalloc((void **) &d_s, sizeof(conjugrad_float_t) * nvar));
+	return EXIT_SUCCESS;
+}
+
+
+// Release the device buffers allocated by init_cg_mem_cuda.
+int destroy_cg_mem_cuda() {
+	CHECK_ERR(cudaFree(d_g));
+	CHECK_ERR(cudaFree(d_s));
+	return EXIT_SUCCESS;
+}
diff --git a/lib/libconjugrad/src/conjugrad_kernels.cu b/lib/libconjugrad/src/conjugrad_kernels.cu
new file mode 100644
index 0000000..f71c3b9
--- /dev/null
+++ b/lib/libconjugrad/src/conjugrad_kernels.cu
@@ -0,0 +1,250 @@
+// FIX: guard macro was misspelled "__cpluplus"; the extern "C" block never
+// activated, leaving the C API with C++-mangled names under nvcc.
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "conjugrad_kernels.h"
+// NOTE(review): the two system header names were lost in this patch;
+// <stdio.h>/<stdlib.h> match the usage below (CHECK_ERR, EXIT_SUCCESS) —
+// confirm against the original source.
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __cplusplus
+}
+#endif
+
+// forward declaration of device functions
+__device__ void sum_reduction_function(
+	volatile conjugrad_float_t *s_data,
+	int tid
+);
+
+__device__ void warp_reduction(
+	volatile conjugrad_float_t *s_data,
+	int tid
+);
+
+// Single-block kernel: sums nvar elements of d_in into d_out[0] using
+// shared memory (size = blockDim.x elements, passed at launch).
+__global__ void sum_reduction(
+	conjugrad_float_t *d_in,
+	conjugrad_float_t *d_out,
+	int nvar
+) {
+	int i = threadIdx.x;
+
+	extern __shared__ conjugrad_float_t s_data[];
+
+	s_data[threadIdx.x] = F0; // 0.0
+
+	// each thread accumulates a strided slice of the input
+	while (i < nvar) {
+		s_data[threadIdx.x] += d_in[i];
+		i += blockDim.x;
+	}
+	__syncthreads();
+
+	sum_reduction_function(s_data, threadIdx.x);
+
+	if (0 == threadIdx.x) {
+		d_out[0] = s_data[0];
+	}
+}
+
+// Tree reduction over shared memory; tails into warp_reduction for the
+// final 64 elements.
+__device__ void sum_reduction_function(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	for (unsigned int s = blockDim.x >> 1; s > 32; s >>= 1) {
+		if (threadIdx.x < s) {
+			s_data[threadIdx.x] += s_data[threadIdx.x + s];
+		}
+		__syncthreads();
+	}
+	if (threadIdx.x < 32) {
+		warp_reduction(s_data, threadIdx.x);
+	}
+}
+__device__ void warp_reduction(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	s_data[tid] +=
s_data[tid + 32]; + s_data[tid] += s_data[tid + 16]; + s_data[tid] += s_data[tid + 8]; + s_data[tid] += s_data[tid + 4]; + s_data[tid] += s_data[tid + 2]; + s_data[tid] += s_data[tid + 1]; +} + +__global__ void vecdot_inter( + conjugrad_float_t *d_x, + conjugrad_float_t *d_y, + conjugrad_float_t *d_vecdot_inter, + int nvar +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + extern __shared__ conjugrad_float_t s_vecdot_inter[]; + + s_vecdot_inter[threadIdx.x] = F0; // 0.0 + + while (i < nvar) { + s_vecdot_inter[threadIdx.x] = d_x[i] * d_y[i] + s_vecdot_inter[threadIdx.x]; + i += offset; + } + __syncthreads(); + + sum_reduction_function(s_vecdot_inter, threadIdx.x); + + if (0 == threadIdx.x) { + d_vecdot_inter[blockIdx.x] = s_vecdot_inter[0]; + } +} + +__global__ void update_s( + conjugrad_float_t *d_old_s, + conjugrad_float_t *d_g, + conjugrad_float_t beta, + int nvar +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + while (i < nvar) { + d_old_s[i] = beta * d_old_s[i] - d_g[i]; + i += offset; + } +} + +__global__ void initialize_s( + conjugrad_float_t *d_s, + conjugrad_float_t *d_g, + int nvar) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + while (i < nvar) { + d_s[i] = - d_g[i]; + i += offset; + } +} + +__global__ void update_x( + conjugrad_float_t *d_x, + conjugrad_float_t *d_s, + conjugrad_float_t alpha, + conjugrad_float_t prevalpha, + int nvar) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + const int offset = blockDim.x * gridDim.x; + + while (i < nvar) { + d_x[i] = (alpha - prevalpha) * d_s[i] + d_x[i]; + i += offset; + } +} + +#ifdef __cpluplus +extern "C" { +#endif +void vecnorm_gpu( + conjugrad_float_t *d_x, + conjugrad_float_t *res, + int nvar +) { + unsigned int nblocks = 128; + unsigned int nthreads = 256; + size_t nbytes = sizeof(conjugrad_float_t) * nthreads; + + conjugrad_float_t *d_inter; + 
conjugrad_float_t *d_res;
+	CHECK_ERR(cudaMalloc((void **) &d_inter, sizeof(conjugrad_float_t) * nblocks));
+	CHECK_ERR(cudaMalloc((void **) &d_res, sizeof(conjugrad_float_t)));
+
+	// FIX: launch configuration was lost in this patch ("<<>>"); restored to
+	// <<<nblocks, nthreads, nbytes>>>, matching the intact sum_reduction launch
+	// below and the nbytes shared-memory sizing above.
+	vecdot_inter<<<nblocks, nthreads, nbytes>>>(d_x, d_x, d_inter, nvar);
+	cudaDeviceSynchronize();
+	cudaError_t err = cudaGetLastError();
+	CHECK_ERR(err);
+
+
+	// second pass: fold the 128 per-block partials into a single value
+	nbytes = sizeof(conjugrad_float_t) * nblocks;
+	sum_reduction<<<1, nblocks, nbytes>>>(d_inter, d_res, nblocks);
+	cudaDeviceSynchronize();
+	err = cudaGetLastError();
+	CHECK_ERR(err);
+
+
+	CHECK_ERR(cudaMemcpy(res, d_res, sizeof(conjugrad_float_t), cudaMemcpyDeviceToHost));
+
+	CHECK_ERR(cudaFree(d_inter));
+	CHECK_ERR(cudaFree(d_res));
+}
+
+// Host wrapper: *res = dot(d_x, d_y) via two-pass block reduction.
+void vecdot_gpu(
+	conjugrad_float_t *d_x,
+	conjugrad_float_t *d_y,
+	conjugrad_float_t *res,
+	int nvar
+) {
+	unsigned int nblocks = 128;
+	unsigned int nthreads = 256;
+	size_t nbytes = sizeof(conjugrad_float_t) * nthreads;
+
+	conjugrad_float_t *d_inter;
+	conjugrad_float_t *d_res;
+	CHECK_ERR(cudaMalloc((void **) &d_inter, sizeof(conjugrad_float_t) * nblocks));
+	CHECK_ERR(cudaMalloc((void **) &d_res, sizeof(conjugrad_float_t)));
+
+	// FIX: restored launch configuration (was garbled to "<<>>")
+	vecdot_inter<<<nblocks, nthreads, nbytes>>>(d_x, d_y, d_inter, nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+
+	nbytes = sizeof(conjugrad_float_t) * nblocks;
+	sum_reduction<<<1, nblocks, nbytes>>>(d_inter, d_res, nblocks);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+
+	CHECK_ERR(cudaMemcpy(res, d_res, sizeof(conjugrad_float_t), cudaMemcpyDeviceToHost));
+
+	CHECK_ERR(cudaFree(d_inter));
+	CHECK_ERR(cudaFree(d_res));
+}
+
+// Host wrapper for the initialize_s kernel (s <- -g).
+void initialize_s_gpu(
+	conjugrad_float_t *d_s,
+	conjugrad_float_t *d_g,
+	int nvar
+) {
+	int nblocks = 128;
+	int nthreads = 256;
+
+	// FIX: restored launch configuration (no shared memory needed here)
+	initialize_s<<<nblocks, nthreads>>>(d_s, d_g, nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+}
+
+// Host wrapper for the update_s kernel (s <- beta*s - g).
+void update_s_gpu(
+	conjugrad_float_t *d_old_s,
+	conjugrad_float_t *d_g,
+	conjugrad_float_t beta,
+	int nvar
+) {
+	unsigned int nblocks = 128;
+	unsigned int nthreads = 256;
+
+	// FIX: restored launch configuration (was garbled to "<<>>")
+	update_s<<<nblocks, nthreads>>>(d_old_s, d_g, beta,
nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+}
+
+// Host wrapper for the update_x kernel (x += (alpha - prevalpha) * s).
+void update_x_gpu(
+	conjugrad_float_t *d_x,
+	conjugrad_float_t *d_s,
+	conjugrad_float_t alpha,
+	conjugrad_float_t prevalpha,
+	int nvar
+) {
+	int nblocks = 128;
+	int nthreads = 256;
+	// FIX: restored launch configuration (was garbled to "<<>>")
+	update_x<<<nblocks, nthreads>>>(d_x, d_s, alpha, prevalpha, nvar);
+	cudaDeviceSynchronize();
+	CHECK_ERR(cudaGetLastError());
+}
+
+// FIX: guard macro was misspelled "__cpluplus"; closes the extern "C" block.
+#ifdef __cplusplus
+}
+#endif
diff --git a/lib/libconjugrad/src/debug.c b/lib/libconjugrad/src/debug.c
new file mode 100644
index 0000000..d437d0d
--- /dev/null
+++ b/lib/libconjugrad/src/debug.c
@@ -0,0 +1,52 @@
+// NOTE(review): the three system header names were lost in this patch; the
+// body uses printf, memcpy and bool, so <stdio.h>, <string.h> and <stdbool.h>
+// match — confirm against the original source.
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "conjugrad.h"
+
+// Central-difference gradient check for component i (optionally perturbing a
+// second component extra_i in lockstep): prints the numeric gradient
+// (f(x+eps) - f(x-eps)) / (2*eps) next to the symbolic gradient g[i]
+// returned by proc_evaluate. x0 is left unmodified.
+void conjugrad_debug_numdiff(
+	int n,
+	conjugrad_float_t *x0,
+	int i,
+	conjugrad_evaluate_t proc_evaluate,
+	void *instance,
+	conjugrad_float_t epsilon,
+	bool have_extra_i,
+	int extra_i
+) {
+
+	conjugrad_float_t *xA = conjugrad_malloc(n);
+	conjugrad_float_t *xB = conjugrad_malloc(n);
+	conjugrad_float_t *g = conjugrad_malloc(n);
+
+	memcpy(xA, x0, sizeof(conjugrad_float_t) * n);
+	memcpy(xB, x0, sizeof(conjugrad_float_t) * n);
+
+	xA[i] -= epsilon;
+	xB[i] += epsilon;
+
+	if(have_extra_i) {
+		xA[extra_i] -= epsilon;
+		xB[extra_i] += epsilon;
+	}
+
+	conjugrad_float_t fxA = proc_evaluate(instance, xA, g, n);
+	conjugrad_float_t fxB = proc_evaluate(instance, xB, g, n);
+	// final evaluation at x0 so g holds the symbolic gradient at x0
+	conjugrad_float_t fx = proc_evaluate(instance, x0, g, n);
+
+	conjugrad_float_t gNumeric = (fxB - fxA) / (2 * epsilon);
+	conjugrad_float_t gSymbolic = g[i];
+
+	printf("\t@%8d: gNum = %15g gSym = %15g", i, gNumeric, gSymbolic);
+
+	if(have_extra_i) {
+		printf("\t@%8d: gSym = %15g", extra_i, g[extra_i]);
+
+	}
+
+	printf("\n");
+
+	conjugrad_free(xA);
+	conjugrad_free(xB);
+	conjugrad_free(g);
+}
diff --git a/lib/libconjugrad/src/help_functions.cu b/lib/libconjugrad/src/help_functions.cu
new file mode 100644
index 0000000..2546246
--- /dev/null
+++ b/lib/libconjugrad/src/help_functions.cu
@@ -0,0 +1,59 @@
+#include "help_functions.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Single-block kernel: sums nvar elements of d_in into d_out[0] using a
+// dynamically sized shared-memory buffer (blockDim.x elements at launch).
+// NOTE(review): duplicates the reduction in conjugrad_kernels.cu — confirm
+// whether one copy should be removed.
+__global__ void sum_reduction(
+	conjugrad_float_t *d_in,
+	conjugrad_float_t *d_out,
+	int nvar
+) {
+	int i = threadIdx.x;
+
+	extern __shared__ conjugrad_float_t s_data[];
+
+	s_data[threadIdx.x] = F0; // 0.0
+
+	// each thread accumulates a strided slice of the input
+	while (i < nvar) {
+		s_data[threadIdx.x] += d_in[i];
+		i += blockDim.x;
+	}
+	__syncthreads();
+
+	sum_reduction_function(s_data, threadIdx.x);
+
+	if (0 == threadIdx.x) {
+		d_out[0] = s_data[0];
+	}
+}
+
+// Tree reduction over shared memory down to 64 elements, then hands off to
+// the unsynchronized warp_reduction for the final warp.
+__device__ void sum_reduction_function(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	for (unsigned int s = blockDim.x >> 1; s > 32; s >>= 1) {
+		if (threadIdx.x < s) {
+			s_data[threadIdx.x] += s_data[threadIdx.x + s];
+		}
+		__syncthreads();
+	}
+	if (threadIdx.x < 32) {
+		warp_reduction(s_data, threadIdx.x);
+	}
+}
+
+// Final intra-warp reduction relying on volatile shared memory instead of
+// explicit synchronization.
+// NOTE(review): reads s_data[tid + 32] unconditionally, so the shared buffer
+// must hold at least 64 elements (blockDim.x >= 64) — confirm all launch
+// sites; also, implicit warp-synchronous reduction is deprecated on modern
+// CUDA architectures (would need __syncwarp()).
+__device__ void warp_reduction(
+	volatile conjugrad_float_t *s_data,
+	int tid
+) {
+	s_data[tid] += s_data[tid + 32];
+	s_data[tid] += s_data[tid + 16];
+	s_data[tid] += s_data[tid + 8];
+	s_data[tid] += s_data[tid + 4];
+	s_data[tid] += s_data[tid + 2];
+	s_data[tid] += s_data[tid + 1];
+}
+
+#ifdef __cplusplus
+}
+#endif