From cb2672bb864e0d741e5a675257087ac2a2d5f361 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Fri, 7 Nov 2025 09:54:54 +0100
Subject: [PATCH 1/3] add log for debug

---
 src/core/system.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/core/system.cu b/src/core/system.cu
index 72ccf70d..18f9944c 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -729,7 +729,9 @@ void calc_temperature() {
             Texcl_solute += ener;
         }
         if (ener > Ekinmax) {
-            printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3);
+            printf(">>> WARNING: hot atom %d, %f %f %f %f %f %f: %f %f %f %f %f\n", i, velocities[i].x, velocities[i].y, velocities[i].z, 
+            coords[i].x, coords[i].y, coords[i].z, ener, ener/Boltz/3, Ekinmax, mass_i, Ekinmax / (0.5 * mass_i));
+            // printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3);
         }
     }
 
@@ -744,7 +746,9 @@ void calc_temperature() {
             Texcl_solvent += ener;
         }
         if (ener > Ekinmax) {
-            printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3);
+            printf(">>> WARNING: hot atom %d, %f %f %f %f %f %f: %f %f %f %f %f\n", i, velocities[i].x, velocities[i].y, velocities[i].z, 
+            coords[i].x, coords[i].y, coords[i].z, ener, ener/Boltz/3, Ekinmax, mass_i, Ekinmax / (0.5 * mass_i));
+            // printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3);
         }
     }
 

From 7d1cc82f8c8549788b749c54e27eca866914dbff Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Wed, 12 Nov 2025 09:28:28 +0100
Subject: [PATCH 2/3] change double to float

---
 src/core/bonded.cu     |  36 ++++----
 src/core/bonded.h      |   8 +-
 src/core/nonbonded.cu  |  58 ++++++------
 src/core/nonbonded.h   |   6 +-
 src/core/parse.cu      |  32 ++++---
 src/core/qatoms.cu     | 136 +++++++++++++--------------
 src/core/qatoms.h      |  24 ++---
 src/core/restraints.cu |  40 ++++----
 src/core/shake.cu      |   6 +-
 src/core/solvent.cu    | 174 +++++++++++++++++------------------
 src/core/solvent.h     |  14 +--
 src/core/system.cu     |  74 +++++++--------
 src/core/system.h      | 202 ++++++++++++++++++++---------------------
 src/core/utils.cu      |  12 +--
 src/core/utils.h       |   6 +-
 15 files changed, 418 insertions(+), 410 deletions(-)

diff --git a/src/core/bonded.cu b/src/core/bonded.cu
index f86d773e..08d8791c 100644
--- a/src/core/bonded.cu
+++ b/src/core/bonded.cu
@@ -10,18 +10,18 @@
  * =============================================
  */
 
-double calc_angle_forces(int start, int end) {
+float calc_angle_forces(int start, int end) {
     int aii, aji, aki;
 
     coord_t ai, aj, ak;
     coord_t rji, rjk;
     coord_t di, dk;
 
-    double bji2inv, bjk2inv, bjiinv, bjkinv;
+    float bji2inv, bjk2inv, bjiinv, bjkinv;
     cangle_t cangle;
-    double cos_th, th, dth, dv, f1;
-    double ener;
-    double angle = 0;
+    float cos_th, th, dth, dv, f1;
+    float ener;
+    float angle = 0;
 
     for (int i = start; i < end; i++) {
         aii = angles[i].ai - 1;
@@ -102,12 +102,12 @@ double calc_angle_forces(int start, int end) {
     return angle;
 }
 
-double calc_bond_forces(int start, int end) {
+float calc_bond_forces(int start, int end) {
     int aii, aji;
     coord_t ai, aj, dx;
     cbond_t cbond;
-    double dx2, dx1, ddx, ener, ampl;
-    double bond = 0;
+    float dx2, dx1, ddx, ener, ampl;
+    float bond = 0;
 
     for (int i = start; i < end; i++) {
         aii = bonds[i].ai-1;
@@ -147,18 +147,18 @@ double calc_bond_forces(int start, int end) {
     return bond;
 }
 
-double calc_torsion_forces(int start, int end) {
+float calc_torsion_forces(int start, int end) {
     int aii, aji, aki, ali;
 
     coord_t ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double cos_phi, phi;
-    double arg, dv, f1;
-    double ener;
-    double torsion = 0;
+    float bj2inv, bk2inv, bjinv, bkinv;
+    float cos_phi, phi;
+    float arg, dv, f1;
+    float ener;
+    float torsion = 0;
 
     torsion_t t;
     ctorsion_t ctors;
@@ -281,18 +281,18 @@ double calc_torsion_forces(int start, int end) {
     return torsion;
 }
 
-double calc_improper2_forces(int start, int end) {
+float calc_improper2_forces(int start, int end) {
     int aii, aji, aki, ali;
 
     coord_t ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double cos_phi, phi, arg, ener, dv, f1;
+    float bj2inv, bk2inv, bjinv, bkinv;
+    float cos_phi, phi, arg, ener, dv, f1;
     coord_t di, dl, dpi, dpj, dpk, dpl; 
 
     improper_t imp;
     cimproper_t cimp;
-    double improper = 0;
+    float improper = 0;
 
     for (int i = start; i < end; i++) {
         imp = impropers[i];
diff --git a/src/core/bonded.h b/src/core/bonded.h
index 7a46e4cc..176dcc14 100644
--- a/src/core/bonded.h
+++ b/src/core/bonded.h
@@ -1,9 +1,9 @@
 #ifndef __BONDED_H__
 #define __BONDED_H__
 
-double calc_angle_forces(int start, int end);
-double calc_bond_forces(int start, int end);
-double calc_torsion_forces(int start, int end);
-double calc_improper2_forces(int start, int end);
+float calc_angle_forces(int start, int end);
+float calc_bond_forces(int start, int end);
+float calc_torsion_forces(int start, int end);
+float calc_improper2_forces(int start, int end);
 
 #endif /* __BONDED_H__ */
\ No newline at end of file
diff --git a/src/core/nonbonded.cu b/src/core/nonbonded.cu
index 9d49dab7..dc8341d7 100644
--- a/src/core/nonbonded.cu
+++ b/src/core/nonbonded.cu
@@ -13,8 +13,8 @@ dvel_t *DV_X;
 dvel_t *PP_MAT, *h_PP_MAT;
 bool pp_gpu_set = false;
 
-double *D_PP_Evdw, *D_PP_Ecoul, *h_PP_Evdw, *h_PP_Ecoul;
-double *D_PP_evdw_TOT, *D_PP_ecoul_TOT, PP_evdw_TOT, PP_ecoul_TOT;
+float *D_PP_Evdw, *D_PP_Ecoul, *h_PP_Evdw, *h_PP_Ecoul;
+float *D_PP_evdw_TOT, *D_PP_ecoul_TOT, PP_evdw_TOT, PP_ecoul_TOT;
 
 // Constants pointers
 ccharge_t *D_ccharges;
@@ -27,13 +27,13 @@ bool *D_excluded;
 
 void calc_nonbonded_pp_forces() {
     bool bond14, bond23;
-    double scaling;
+    float scaling;
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double crg_i, crg_j;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2a, ra, r6a;
+    float Vela, V_a, V_b;
+    float dva;
+    float crg_i, crg_j;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
     int i, j;
     catype_t ai_type, aj_type;
 
@@ -93,8 +93,8 @@ void calc_nonbonded_pp_forces_host() {
 
     int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t);
     int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double);
-    int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double);
+    int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(float);
+    int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(float);
 
     if (!pp_gpu_set) {
         pp_gpu_set = true;
@@ -112,12 +112,12 @@ void calc_nonbonded_pp_forces_host() {
         #endif
         check_cudaMalloc((void**) &D_PP_Ecoul, mem_size_PP_Ecoul);
 
-        check_cudaMalloc((void**) &D_PP_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**) &D_PP_ecoul_TOT, sizeof(double)); 
+        check_cudaMalloc((void**) &D_PP_evdw_TOT, sizeof(float));
+        check_cudaMalloc((void**) &D_PP_ecoul_TOT, sizeof(float)); 
 
         h_PP_MAT = (dvel_t*) malloc(mem_size_PP_MAT);
-        h_PP_Evdw = (double*) malloc(mem_size_PP_Evdw);
-        h_PP_Ecoul = (double*) malloc(mem_size_PP_Ecoul);
+        h_PP_Evdw = (float*) malloc(mem_size_PP_Evdw);
+        h_PP_Ecoul = (float*) malloc(mem_size_PP_Ecoul);
     }
 
     cudaMemcpy(X, coords, mem_size_X, cudaMemcpyHostToDevice);
@@ -148,8 +148,8 @@ void calc_nonbonded_pp_forces_host() {
 
     calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true);
 
-    cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost);
 
     E_nonbond_pp.Uvdw += PP_evdw_TOT;
     E_nonbond_pp.Ucoul += PP_ecoul_TOT;
@@ -161,17 +161,17 @@ void calc_nonbonded_pp_forces_host() {
  */
 
 __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
-    coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, double *Evdw, double *Ecoul, dvel_t *patom_a, dvel_t *patom_b,
+    coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, float *Evdw, float *Ecoul, dvel_t *patom_a, dvel_t *patom_b,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t D_topo) {
     
     bool bond14, bond23;
-    double scaling;
+    float scaling;
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double crg_i, crg_j;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2a, ra, r6a;
+    float Vela, V_a, V_b;
+    float dva;
+    float crg_i, crg_j;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
     catype_t ai_type, aj_type;
 
     bond23 = LJs[row * BLOCK_SIZE + column] == 3;
@@ -219,7 +219,7 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
 }
 
 __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
-    coord_t *X, double *Evdw, double *Ecoul, dvel_t *PP_MAT,
+    coord_t *X, float *Evdw, float *Ecoul, dvel_t *PP_MAT,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, int *D_LJ_matrix, bool *D_excluded, topo_t D_topo) {
     // Block index
     int bx = blockIdx.x;
@@ -232,8 +232,8 @@ __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
     int aStart = BLOCK_SIZE * by;
     int bStart = BLOCK_SIZE * bx;
 
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -272,7 +272,7 @@ __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
     int column = bx * BLOCK_SIZE + tx;
 
     if (bx != by || tx != ty) {
-        double evdw = 0, ecoul = 0;
+        float evdw = 0, ecoul = 0;
         calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a,
              &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
         Evdw_S[ty][tx] = evdw;
@@ -287,8 +287,8 @@ __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
     int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
+        float tot_Evdw = 0;
+        float tot_Ecoul = 0;
         for (int i = 0; i < BLOCK_SIZE; i++) {
             for (int j = 0; j < BLOCK_SIZE; j++) {
                 tot_Evdw += Evdw_S[i][j];
diff --git a/src/core/nonbonded.h b/src/core/nonbonded.h
index 0421082d..b961be56 100644
--- a/src/core/nonbonded.h
+++ b/src/core/nonbonded.h
@@ -27,11 +27,11 @@ extern bool *D_excluded;
 
 // P-P interactions
 __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj,
-    coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, double *Evdw, double *Ecoul, dvel_t *patom_a, dvel_t *patom_b,
+    coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, float *Evdw, float *Ecoul, dvel_t *patom_a, dvel_t *patom_b,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t topo);
 
 __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute,
-    coord_t *X, double *Evdw, double *Ecoul, dvel_t *PP_MAT,
+    coord_t *X, float *Evdw, float *Ecoul, dvel_t *PP_MAT,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, int *D_LJ_matrix, bool *D_excluded, topo_t topo);
 
 __global__ void calc_pp_dvel_vector(int n_patoms, dvel_t *DV_X, dvel_t *PP_MAT, p_atom_t *D_patoms);
@@ -39,6 +39,6 @@ __global__ void calc_pp_dvel_vector(int n_patoms, dvel_t *DV_X, dvel_t *PP_MAT,
 
 void clean_d_patoms();
 
-__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal);
+__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal);
 
 #endif /* __NONBONDED_H__ */
\ No newline at end of file
diff --git a/src/core/parse.cu b/src/core/parse.cu
index a11561cc..4019c2c2 100644
--- a/src/core/parse.cu
+++ b/src/core/parse.cu
@@ -210,7 +210,7 @@ void init_md(const char *filename) {
     #ifdef VERBOSE
     printf("reading in %d lambdas (%s in file)\n", n_lambdas, file.buffer[k][1]);
     #endif
-    lambdas = (double*) malloc(n_lambdas * sizeof(double));
+    lambdas = (float*) malloc(n_lambdas * sizeof(float));
     k++;
     for (int i = 0; i < n_lambdas; i++) {
         lambdas[i] = strtod(file.buffer[k][0], &eptr);
@@ -521,20 +521,28 @@ void init_excluded(const char *filename) {
     if (fp == NULL)
         exit(EXIT_FAILURE);
 
-    char line[8192];
-    
-    n_excluded = 0;
+// --- dynamically read a full line ---
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t read_len = getline(&line, &len, fp);
+    if (read_len == -1) {
+        fprintf(stderr, "Error: failed to read line from %s\n", path);
+        free(line);
+        fclose(fp);
+        exit(EXIT_FAILURE);
+    }
 
-    if (fgets(line, 8192, fp)) {
-        for (int i = 0; i < n_atoms; i++) {
-            bool excl = (line[i] == '1');
-            excluded[i] = excl;
-            if (excl) {
-                n_excluded++;
-            }
-        }
+    // --- parse ---
+    n_excluded = 0;
+    for (int i = 0; i < n_atoms && i < read_len; i++) {
+        bool excl = (line[i] == '1');
+        excluded[i] = excl;
+        if (excl) n_excluded++;
     }
 
+    printf("Number of excluded atoms: %d\n", n_excluded);
+
+    free(line);
     fclose(fp);
 }
 
diff --git a/src/core/qatoms.cu b/src/core/qatoms.cu
index b4a0307e..dba06732 100644
--- a/src/core/qatoms.cu
+++ b/src/core/qatoms.cu
@@ -10,30 +10,30 @@ calc_qw_t *QW_MAT, *h_QW_MAT;
 calc_qp_t *QP_MAT, *h_QP_MAT;
 dvel_t *QQ_MAT;
 
-double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul;
-double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT;
+float *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul;
+float *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT;
 
-double *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul;
-double *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT;
+float *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul;
+float *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT;
 
 // Constants pointers
 q_catype_t *D_qcatypes;
 q_atype_t *D_qatypes;
 q_charge_t *D_qcharges;
 q_atom_t *D_qatoms;
-double *D_lambdas;
+float *D_lambdas;
 
 bool qp_gpu_set = false;
 
 void calc_nonbonded_qp_forces() {
     int i, j;
     coord_t da;
-    double r2, r6, r;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2, r6, r;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
     q_catype_t qi_type;
     catype_t aj_type;
     bool bond23, bond14;
-    double scaling, Vel, V_a, V_b, dv;
+    float scaling, Vel, V_a, V_b, dv;
 
     for (int qi = 0; qi < n_qatoms; qi++) {
         for (int pj = 0; pj < n_patoms; pj++) {
@@ -98,7 +98,7 @@ void calc_nonbonded_qp_forces_host() {
     int mem_size_qatypes = n_qatoms * n_lambdas * sizeof(q_atype_t);
     int mem_size_qcharges = n_qatoms * n_lambdas * sizeof(q_charge_t);
     int mem_size_qatoms = n_qatoms * sizeof(q_atom_t);
-    int mem_size_lambdas = n_lambdas * sizeof(double);
+    int mem_size_lambdas = n_lambdas * sizeof(float);
 
     int mem_size_ccharges = n_ccharges * sizeof(ccharge_t);
     int mem_size_charges = n_atoms * sizeof(charge_t);
@@ -111,8 +111,8 @@ void calc_nonbonded_qp_forces_host() {
     int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
     int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
     //TODO make Evdw & Ecoul work for # of states > 2
-    int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
-    int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double);
+    int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(float);
+    int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(float);
 
     if (!qp_gpu_set){
         qp_gpu_set = true;
@@ -207,11 +207,11 @@ void calc_nonbonded_qp_forces_host() {
         #endif
         check_cudaMalloc((void**) &D_QP_Ecoul, mem_size_QP_Ecoul);
 
-        check_cudaMalloc((void**) &D_QP_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**) &D_QP_ecoul_TOT, sizeof(double)); 
+        check_cudaMalloc((void**) &D_QP_evdw_TOT, sizeof(float));
+        check_cudaMalloc((void**) &D_QP_ecoul_TOT, sizeof(float)); 
 
-        h_QP_Evdw = (double*) malloc(mem_size_QP_Evdw);
-        h_QP_Ecoul = (double*) malloc(mem_size_QP_Ecoul);
+        h_QP_Evdw = (float*) malloc(mem_size_QP_Evdw);
+        h_QP_Ecoul = (float*) malloc(mem_size_QP_Ecoul);
 
         h_QP_MAT = (calc_qp_t*) malloc(mem_size_QP_MAT);
     }
@@ -250,8 +250,8 @@ void calc_nonbonded_qp_forces_host() {
     for (int state = 0; state < min(2, n_lambdas); state++) {
         calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false);
 
-        cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost);
     
         EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT;
         EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT;
@@ -261,11 +261,11 @@ void calc_nonbonded_qp_forces_host() {
 void calc_nonbonded_qw_forces() {
     int i;
     coord_t dO, dH1, dH2;
-    double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
-    double dvO, dvH1, dvH2;
-    double V_a, V_b, VelO, VelH1, VelH2;
+    float r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
+    float dvO, dvH1, dvH2;
+    float V_a, V_b, VelO, VelH1, VelH2;
     q_catype_t qi_type;
-    double ai_aii, ai_bii;
+    float ai_aii, ai_bii;
 
     if (A_O == 0) {
         catype_t catype_ow;    // Atom type of first O, H atom
@@ -362,8 +362,8 @@ void calc_nonbonded_qw_forces_host() {
     int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
     int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
     //TODO make Evdw & Ecoul work for # of states > 2
-    int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
-    int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double);
+    int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(float);
+    int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(float);
 
     if (A_O == 0) {
         catype_t catype_ow;    // Atom type of first O atom
@@ -387,11 +387,11 @@ void calc_nonbonded_qw_forces_host() {
         #endif
         check_cudaMalloc((void**) &D_QW_Ecoul, mem_size_QW_Ecoul);
 
-        check_cudaMalloc((void**) &D_QW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**) &D_QW_ecoul_TOT, sizeof(double)); 
+        check_cudaMalloc((void**) &D_QW_evdw_TOT, sizeof(float));
+        check_cudaMalloc((void**) &D_QW_ecoul_TOT, sizeof(float)); 
 
-        h_QW_Evdw = (double*) malloc(mem_size_QW_Evdw);
-        h_QW_Ecoul = (double*) malloc(mem_size_QW_Ecoul);
+        h_QW_Evdw = (float*) malloc(mem_size_QW_Evdw);
+        h_QW_Ecoul = (float*) malloc(mem_size_QW_Ecoul);
 
         h_QW_MAT = (calc_qw_t*) malloc(mem_size_MAT);
     }
@@ -406,7 +406,7 @@ void calc_nonbonded_qw_forces_host() {
     threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
     grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y);
 
-    double evdw, ecoul;
+    float evdw, ecoul;
 
     calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, W, D_QW_Evdw, D_QW_Ecoul, QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo);
     calc_qw_dvel_vector_column<<<((n_waters+BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_W, QW_MAT);
@@ -432,8 +432,8 @@ void calc_nonbonded_qw_forces_host() {
     for (int state = 0; state < min(2, n_lambdas); state++) {
         calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q], &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false);
 
-        cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost);
     
         EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT;
         EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT;
@@ -442,15 +442,15 @@ void calc_nonbonded_qw_forces_host() {
 
 void calc_nonbonded_qq_forces() {
     int ai, aj;
-    double crg_i, crg_j;
-    double elscale, scaling;
+    float crg_i, crg_j;
+    float elscale, scaling;
     q_catype_t qi_type, qj_type;
     bool bond23, bond14;
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2a, ra, r6a;
+    float Vela, V_a, V_b;
+    float dva;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
 
     for (int state = 0; state < n_lambdas; state++) {
         for (int qi = 0; qi < n_qatoms; qi++) {
@@ -520,8 +520,8 @@ void calc_qangle_forces(int state) {
     int ic;
     int ai, aj, ak;
     coord_t rji, rjk;
-    double bji, bjk;
-    double cos_th, th, dth, ener, dv, f1;
+    float bji, bjk;
+    float cos_th, th, dth, ener, dv, f1;
     coord_t di, dk;
 
     for (int i = 0; i < n_qangles; i++) {
@@ -580,7 +580,7 @@ void calc_qangle_forces(int state) {
 void calc_qbond_forces(int state) {
     int ic;
     int ai, aj;
-    double b, db, ener, dv;
+    float b, db, ener, dv;
     coord_t rij;
 
     for (int i = 0; i < n_qbonds; i++) {
@@ -617,10 +617,10 @@ void calc_qtorsion_forces(int state) {
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double bj, bk, cos_phi, phi;
-    double arg, dv, f1;
-    double ener;
+    float bj2inv, bk2inv, bjinv, bkinv;
+    float bj, bk, cos_phi, phi;
+    float arg, dv, f1;
+    float ener;
 
     for (int i = 0; i < n_qtorsions; i++) {
         ic = q_torsions[i + n_qtorsions * state].code;
@@ -730,18 +730,18 @@ void calc_qtorsion_forces(int state) {
 
 // Q-W interactions
 
-__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O,
-    coord_t *Qs, coord_t *Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t *qw,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t D_topo) {
+__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, float crg_ow, float crg_hw, float A_O, float B_O,
+    coord_t *Qs, coord_t *Ws, float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t *qw,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t D_topo) {
 
     int j;
     coord_t dO, dH1, dH2;
-    double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
-    double dvO, dvH1, dvH2;
-    double V_a, V_b, VelO, VelH1, VelH2;
+    float r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
+    float dvO, dvH1, dvH2;
+    float V_a, V_b, VelO, VelH1, VelH2;
     q_atype_t qa_type;
     q_catype_t qi_type;
-    double ai_aii, ai_bii;
+    float ai_aii, ai_bii;
 
     j = 3 * column;
     dO.x = Ws[j].x - Qs[row].x;
@@ -810,9 +810,9 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb
     (*qw).H2.z += dvH2 * dH2.z;
 }
 
-__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
-    coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_qw_t *MAT,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t D_topo) {
+__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, float crg_ow, float crg_hw, float A_O, float B_O,
+    coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_qw_t *MAT,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t D_topo) {
     // Block index
     int bx = blockIdx.x;
     int by = blockIdx.y;
@@ -822,8 +822,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
     int ty = threadIdx.y;
 
     //TODO implement >2 states on GPU
-    __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-    __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];    
+    __shared__ float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+    __shared__ float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];    
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -870,8 +870,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d
     if (tx == 0 && ty == 0) {
         //TODO implement >2 states on GPU
         for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
+            float tot_Evdw = 0;
+            float tot_Ecoul = 0;
             for (int i = 0; i < BLOCK_SIZE; i++) {
                 for (int j = 0; j < BLOCK_SIZE; j++) {
                     tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
@@ -957,17 +957,17 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t *D
 // Q-P interactions
 
 __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-    coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t *qp,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas,
+    coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t *qp,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas,
     catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t D_topo) {
     
     coord_t da;
-    double r2, r6, r;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2, r6, r;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
     q_catype_t qi_type;
     catype_t aj_type;
     bool bond23, bond14;
-    double scaling, Vel, V_a, V_b, dv;
+    float scaling, Vel, V_a, V_b, dv;
 
     int j = D_patoms[pj].a-1;
 
@@ -1023,8 +1023,8 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in
 }
 
 __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute,
-    coord_t *X, double *Evdw, double *Ecoul, calc_qp_t *QP_MAT,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas,
+    coord_t *X, float *Evdw, float *Ecoul, calc_qp_t *QP_MAT,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas,
     int *D_LJ_matrix, bool *D_excluded, catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t D_topo) {
     
     // Block index
@@ -1036,8 +1036,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
     int ty = threadIdx.y;
 
     //TODO implement >2 states on GPU
-    __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
-    __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];    
+    __shared__ float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE];
+    __shared__ float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE];    
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -1090,8 +1090,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i
     if (tx == 0 && ty == 0) {
         //TODO implement >2 states on GPU
         for (int state = 0; state < min(2, n_lambdas); state++) {
-            double tot_Evdw = 0;
-            double tot_Ecoul = 0;
+            float tot_Evdw = 0;
+            float tot_Ecoul = 0;
             for (int i = 0; i < BLOCK_SIZE; i++) {
                 for (int j = 0; j < BLOCK_SIZE; j++) {
                     tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE];
diff --git a/src/core/qatoms.h b/src/core/qatoms.h
index f85890c8..465b6009 100644
--- a/src/core/qatoms.h
+++ b/src/core/qatoms.h
@@ -44,18 +44,18 @@ extern q_catype_t *D_qcatypes;
 extern q_atype_t *D_qatypes;
 extern q_charge_t *D_qcharges;
 extern q_atom_t *D_qatoms;
-extern double *D_lambdas;
+extern float *D_lambdas;
 extern int *D_LJ_matrix;
 extern bool *D_excluded;
 
 // Q-W interactions
-__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
-    coord_t *Qs, coord_t *Ws, double *Evdw, double *Ecoul, calc_qw_t *qw,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t topo);
+__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, float crg_ow, float crg_hw, float A_O, float B_O,
+    coord_t *Qs, coord_t *Ws, float *Evdw, float *Ecoul, calc_qw_t *qw,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t topo);
 
-__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O,
-    coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_qw_t *MAT,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t topo);
+__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, float crg_ow, float crg_hw, float A_O, float B_O,
+    coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_qw_t *MAT,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t topo);
 
 __global__ void calc_qw_dvel_vector_row(int n_qatoms, int n_waters, dvel_t *DV_X, dvel_t *DV_W, calc_qw_t *QW_MAT, q_atom_t *D_qatoms);
 
@@ -66,20 +66,20 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t *D
 // Q-P interactions
 
 __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms,
-    coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, double *Evdw, double *Ecoul, calc_qp_t *qp,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas,
+    coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, float *Evdw, float *Ecoul, calc_qp_t *qp,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas,
     catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t topo);
 
 __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute,
-    coord_t *X, double *Evdw, double *Ecoul, calc_qp_t *QP_MAT,
-    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas,
+    coord_t *X, float *Evdw, float *Ecoul, calc_qp_t *QP_MAT,
+    q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas,
     int *D_LJ_matrix, bool *D_excluded, catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t topo);
 
 __global__ void calc_qp_dvel_vector_row(int n_qatoms, int n_patoms, dvel_t *DV_X, calc_qp_t *QP_MAT, q_atom_t *D_qatoms);
 
 __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t *DV_X, calc_qp_t *QP_MAT, p_atom_t *D_patoms);
 
-__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal);
+__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal);
 
 void clean_d_qatoms();
 
diff --git a/src/core/restraints.cu b/src/core/restraints.cu
index 56d20ef5..132d4f83 100644
--- a/src/core/restraints.cu
+++ b/src/core/restraints.cu
@@ -6,9 +6,9 @@
 #include <stdio.h>
 
 void calc_radix_w_forces() {
-    double b, db, ener, dv, fexp;
+    float b, db, ener, dv, fexp;
     coord_t dr;
-    double shift;
+    float shift;
 
     if (md.radial_force != 0) {
         shift = sqrt(Boltz * Tfree / md.radial_force);
@@ -52,12 +52,12 @@ void calc_radix_w_forces() {
 
 void calc_polx_w_forces(int iteration) {
     int wi, imin, jw, ii, iis, jmin;
-    double tmin;
+    float tmin;
     coord_t rmu, rcu, f1O, f1H1, f1H2, f2;
-    double rm, rc;
-    double cos_th;
-    double avtdum, arg, f0, dv;
-    double ener;
+    float rm, rc;
+    float cos_th;
+    float avtdum, arg, f0, dv;
+    float ener;
 
     for (int is = 0; is < n_shells; is++) {
         wshells[is].n_inshell = 0;
@@ -129,8 +129,8 @@ void calc_polx_w_forces(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < n_shells; is++) {
             printf("SHELL %d\n", is);
-            wshells[is].avtheta /= (double) itdis_update;
-            wshells[is].avn_inshell /= (double) itdis_update;
+            wshells[is].avtheta /= (float) itdis_update;
+            wshells[is].avn_inshell /= (float) itdis_update;
             wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
                 wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI);
@@ -148,7 +148,7 @@ void calc_polx_w_forces(int iteration) {
         avtdum = 0;
         for (int il = 0; il < wshells[is].n_inshell; il++) {
             ii = nsort[il][is];
-            arg = 1 + ((1 - 2 * (double) (il+1)) / (double) wshells[is].n_inshell);
+            arg = 1 + ((1 - 2 * (float) (il+1)) / (float) wshells[is].n_inshell);
             theta0[il] = acos(arg);
             theta0[il] = theta0[il] - 3 * sin(theta0[il]) * wshells[is].cstb / 2;
             if (theta0[il] < 0) theta0[il] = 0;
@@ -184,7 +184,7 @@ void calc_polx_w_forces(int iteration) {
             if (cos_th > 1) cos_th = 1;
             if (cos_th < -1) cos_th = -1;
             f0 = sin(acos(cos_th));
-            if (abs(f0) < 1.0E-12) f0 = 1.0E-12;
+            if (abs(f0) < 1.0E-6) f0 = 1.0E-6;
             f0 = -1.0 / f0;
             f0 *= dv;
 
@@ -213,14 +213,14 @@ void calc_polx_w_forces(int iteration) {
             dvelocities[wi+2].z += f0 * (f1H2.z);
         }
 
-        wshells[is].avtheta     += avtdum / (double) wshells[is].n_inshell;
+        wshells[is].avtheta     += avtdum / (float) wshells[is].n_inshell;
         wshells[is].avn_inshell += wshells[is].n_inshell;
     }
 }
 
 void calc_pshell_forces() {
     coord_t dr;
-    double k, r2, ener;
+    float k, r2, ener;
 
     for (int i = 0; i < n_atoms_solute; i++) {
         if (shell[i] || excluded[i]) {
@@ -250,9 +250,9 @@ void calc_pshell_forces() {
 
 // sequence restraints (independent of Q-state)
 void calc_restrseq_forces() {
-    double k, mass, totmass;
+    float k, mass, totmass;
     coord_t dr;
-    double r2, ener;
+    float r2, ener;
 
     for (int s = 0; s < n_restrseqs; s++) {
         k = restrseqs[s].k;
@@ -349,7 +349,7 @@ void calc_restrseq_forces() {
 void calc_restrpos_forces() {
     int state, i;
     coord_t dr;
-    double lambda, ener, x2, y2, z2;
+    float lambda, ener, x2, y2, z2;
 
     for (int ir = 0; ir < n_restrspos; ir++) {
         state = restrspos[ir].ipsi-1;
@@ -396,7 +396,7 @@ void calc_restrpos_forces() {
 void calc_restrdis_forces() {
     int state, i, j;
     coord_t dr;
-    double lambda, b, db, dv, ener;
+    float lambda, b, db, dv, ener;
 
     for (int ir = 0; ir < n_restrdists; ir++) {
         state =  restrdists[ir].ipsi-1;
@@ -454,8 +454,8 @@ void calc_restrdis_forces() {
 void calc_restrang_forces() {
     int state, i, j, k;
     coord_t dr, dr2, di, dk;
-    double lambda, r2ij, r2jk, rij, rjk, cos_th, th;
-    double dth, dv, ener, f1;
+    float lambda, r2ij, r2jk, rij, rjk, cos_th, th;
+    float dth, dv, ener, f1;
 
     for (int ir = 0; ir < n_restrangs; ir++) {
         state = restrangs[ir].ipsi-1;
@@ -539,7 +539,7 @@ void calc_restrang_forces() {
 
 // extra half-harmonic wall restraints
 void calc_restrwall_forces() {
-    double k, b, db, ener, dv, fexp;
+    float k, b, db, ener, dv, fexp;
     coord_t dr;
 
     for (int ir = 0; ir < n_restrwalls; ir++) {
diff --git a/src/core/shake.cu b/src/core/shake.cu
index 645c0e3a..fa76d5bf 100644
--- a/src/core/shake.cu
+++ b/src/core/shake.cu
@@ -6,7 +6,7 @@
 
 int calc_shake_constraints(coord_t *coords, coord_t *xcoords) {
     int ai, aj, n_iterations, total_iterations, shake;
-    double xij2, diff, corr, scp, xxij2;
+    float xij2, diff, corr, scp, xxij2;
     coord_t xij, xxij;
 
     shake = 0;
@@ -102,8 +102,8 @@ void initial_shaking() {
 }
 
 void stop_cm_translation() {
-    double total_mass = 0;
-    double rmass = 0;
+    float total_mass = 0;
+    float rmass = 0;
     coord_t vcm;
     vcm.x = 0;
     vcm.y = 0;
diff --git a/src/core/solvent.cu b/src/core/solvent.cu
index 180152bc..33604898 100644
--- a/src/core/solvent.cu
+++ b/src/core/solvent.cu
@@ -16,20 +16,20 @@ dvel_t *DV_W;
 calc_ww_t *WW_MAT, *h_WW_MAT;
 calc_pw_t *PW_MAT, *h_PW_MAT;
 
-double *D_WW_Evdw, *D_WW_Ecoul, *h_WW_Evdw, *h_WW_Ecoul;
-double *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT;
+float *D_WW_Evdw, *D_WW_Ecoul, *h_WW_Evdw, *h_WW_Ecoul;
+float *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT;
 
-double *D_PW_Evdw, *D_PW_Ecoul, *h_PW_Evdw, *h_PW_Ecoul;
-double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT;
+float *D_PW_Evdw, *D_PW_Ecoul, *h_PW_Evdw, *h_PW_Ecoul;
+float *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT;
 
 // Constants pointers
 bool pw_gpu_set = false;
 
 //ONLY call if there are actually solvent atoms, or get segfaulted
 void calc_nonbonded_ww_forces() {
-    double rOX, rH1X, rH2X, r2;
+    float rOX, rH1X, rH2X, r2;
     coord_t dOX, dH1X, dH2X;
-    double Vel, V_a, V_b, dv;
+    float Vel, V_a, V_b, dv;
 
     // Initialize water constants
     if (A_OO == 0) {
@@ -47,11 +47,11 @@ void calc_nonbonded_ww_forces() {
         crg_hw = ccharge_hw.charge;
     }
 
-    // double ecoul_block = 0, evdw_block = 0;
+    // float ecoul_block = 0, evdw_block = 0;
 
     for (int i = n_atoms_solute; i < n_atoms; i+=3) {
         for (int j = i+3; j < n_atoms; j+=3) {
-            double ecoul = 0, evdw = 0;
+            float ecoul = 0, evdw = 0;
             // if (excluded[i] || excluded[j]) continue;
             // --- O - (O,H1,H2) ---
             dOX.x = coords[j].x - coords[i].x;
@@ -286,9 +286,9 @@ void calc_nonbonded_ww_forces() {
 
 __device__ __forceinline__ void calculate_unforce_bound(
     const int y, const int x, const coord_t &q, const coord_t &p,
-    const topo_t &D_topo, const double &crg_ow, const double &crg_hw,
-    const double &A_OO, const double &B_OO, double &evdw, double &ecoul,
-    double &dv, double &tmpx, double &tmpy, double &tmpz) {
+    const topo_t &D_topo, const float &crg_ow, const float &crg_hw,
+    const float &A_OO, const float &B_OO, float &evdw, float &ecoul,
+    float &dv, float &tmpx, float &tmpy, float &tmpz) {
     int belong_y = y / 3;
     int belong_x = x / 3;
     if (belong_y == belong_x) {
@@ -302,16 +302,16 @@ __device__ __forceinline__ void calculate_unforce_bound(
     tmpx = p.x - q.x;
     tmpy = p.y - q.y;
     tmpz = p.z - q.z;
-    // double inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2));
-    double inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz);
-    double inv_dis2 = inv_dis * inv_dis;
+    // float inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2));
+    float inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz);
+    float inv_dis2 = inv_dis * inv_dis;
 
     ecoul = inv_dis * D_topo.coulomb_constant * (y_is_o ? crg_ow : crg_hw) *
             (x_is_o ? crg_ow : crg_hw);
-    double v_a = 0, v_b = 0;
+    float v_a = 0, v_b = 0;
     if (y_is_o && x_is_o) {
-        double inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2;
-        double inv_dis12 = inv_dis6 * inv_dis6;
+        float inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2;
+        float inv_dis12 = inv_dis6 * inv_dis6;
         v_a = A_OO * inv_dis12;
         v_b = B_OO * inv_dis6;
         evdw = v_a - v_b;
@@ -324,10 +324,10 @@ __device__ __forceinline__ void calculate_unforce_bound(
 template<const int Thread_x, const int Thread_y, const int Block_x,
     const int Block_y>
 __global__ void
-calc_ww(const int N, const double crg_ow, const double crg_hw,
-        const double A_OO, const double B_OO, const topo_t D_topo,
+calc_ww(const int N, const float crg_ow, const float crg_hw,
+        const float A_OO, const float B_OO, const topo_t D_topo,
         coord_t *__restrict__ W, dvel_t *__restrict__ DV_W,
-        double *__restrict__ Evdw_TOT, double *__restrict__ ecoul_TOT) {
+        float *__restrict__ Evdw_TOT, float *__restrict__ ecoul_TOT) {
     // Calculate block boundaries
     int NX = N;
     int NY = (N + 1) / 2;
@@ -341,11 +341,11 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
     // Shared memory declarations with padding to avoid bank conflicts
     __shared__ coord_t p[2 * Thread_x * Block_x + 1];
     __shared__ coord_t q[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_x[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_y[2 * Thread_y * Block_y + 1];
-    __shared__ double sum_row_z[2 * Thread_y * Block_y + 1];
+    __shared__ float sum_row_x[2 * Thread_y * Block_y + 1];
+    __shared__ float sum_row_y[2 * Thread_y * Block_y + 1];
+    __shared__ float sum_row_z[2 * Thread_y * Block_y + 1];
 
-    __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32],
+    __shared__ float block_ecoul[(Thread_x * Thread_y + 31) / 32],
             block_evdw[(Thread_x * Thread_y + 31) / 32];
 
     // Thread indices
@@ -377,7 +377,7 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
     }
     __syncthreads();
     // Initialize column sums
-    double sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x];
+    float sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x];
     #pragma unroll
     for (int i = 0; i < Block_x; i++) {
         sum_col_x[i] = sum_col_x[Block_x + i] = 0;
@@ -386,7 +386,7 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
     }
 
     // Main computation loop with reduced thread divergence
-    double evdw_tot = 0, ecoul_tot = 0;
+    float evdw_tot = 0, ecoul_tot = 0;
     #pragma unroll
     for (int i = 0; i < Block_x; i++) {
         int i2 = i;
@@ -406,8 +406,8 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
             // Optimized condition check
             if (y >= x && (N % 2 == 0 || y != NY - 1)) {
                 int offset_y = y - block_y_left_begin;
-                double evdw = 0, ecoul = 0, dv = 0;
-                double tmpx = 0, tmpy = 0, tmpz = 0;
+                float evdw = 0, ecoul = 0, dv = 0;
+                float tmpx = 0, tmpy = 0, tmpz = 0;
 
                 int y2 = N - 1 - y;
                 int x2 = N - x;
@@ -417,9 +417,9 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
 
                 evdw_tot += evdw;
                 ecoul_tot += ecoul;
-                double v_x = dv * tmpx;
-                double v_y = dv * tmpy;
-                double v_z = dv * tmpz;
+                float v_x = dv * tmpx;
+                float v_y = dv * tmpy;
+                float v_z = dv * tmpz;
 
                 sum_col_x[Block_x + i2] += v_x;
                 sum_col_y[Block_x + i2] += v_y;
@@ -429,8 +429,8 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
                 sum_row_z[y_cal_num + offset_y] -= v_z;
             } else if (y < x) {
                 int offset_y = y - block_y_left_begin;
-                double evdw = 0, ecoul = 0, dv = 0;
-                double tmpx = 0, tmpy = 0, tmpz = 0;
+                float evdw = 0, ecoul = 0, dv = 0;
+                float tmpx = 0, tmpy = 0, tmpz = 0;
 
                 calculate_unforce_bound(y, x, q[offset_y], now_p0, D_topo, crg_ow,
                                      crg_hw, A_OO, B_OO, evdw, ecoul, dv, tmpx, tmpy,
@@ -438,9 +438,9 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
 
                 evdw_tot += evdw;
                 ecoul_tot += ecoul;
-                double v_x = dv * tmpx;
-                double v_y = dv * tmpy;
-                double v_z = dv * tmpz;
+                float v_x = dv * tmpx;
+                float v_y = dv * tmpy;
+                float v_z = dv * tmpz;
 
                 sum_col_x[i2] += v_x;
                 sum_col_y[i2] += v_y;
@@ -470,8 +470,8 @@ calc_ww(const int N, const double crg_ow, const double crg_hw,
 
     // Final reduction
     if (warp_id == 0) {
-        double val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0;
-        double val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_evdw[lane_id] : 0;
+        float val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0;
+        float val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_evdw[lane_id] : 0;
 
         #pragma unroll
         for (unsigned w = 16; w >= 1; w /= 2) {
@@ -546,8 +546,8 @@ void calc_nonbonded_ww_forces_host() {
 
         check_cudaMalloc((void **) &W, mem_size_W);
         check_cudaMalloc((void **) &DV_W, mem_size_DV_W);
-        check_cudaMalloc((void **) &D_WW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void **) &D_WW_ecoul_TOT, sizeof(double));
+        check_cudaMalloc((void **) &D_WW_evdw_TOT, sizeof(float));
+        check_cudaMalloc((void **) &D_WW_ecoul_TOT, sizeof(float));
     }
 
     WW_evdw_TOT = 0;
@@ -555,9 +555,9 @@ void calc_nonbonded_ww_forces_host() {
     cudaMemcpy(W, &coords[n_atoms_solute], mem_size_W, cudaMemcpyHostToDevice);
     cudaMemcpy(DV_W, &dvelocities[n_atoms_solute], mem_size_DV_W,
                cudaMemcpyHostToDevice);
-    cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(double),
+    cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(float),
                cudaMemcpyHostToDevice);
-    cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(double),
+    cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(float),
                cudaMemcpyHostToDevice);
 
     // Optimize thread block configuration
@@ -582,9 +582,9 @@ void calc_nonbonded_ww_forces_host() {
     cudaDeviceSynchronize();
     cudaMemcpy(&dvelocities[n_atoms_solute], DV_W, mem_size_DV_W,
                cudaMemcpyDeviceToHost);
-    cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(double),
+    cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(float),
                cudaMemcpyDeviceToHost);
-    cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(double),
+    cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(float),
                cudaMemcpyDeviceToHost);
     E_nonbond_ww.Uvdw += WW_evdw_TOT;
     E_nonbond_ww.Ucoul += WW_ecoul_TOT;
@@ -592,11 +592,11 @@ void calc_nonbonded_ww_forces_host() {
 
 void calc_nonbonded_pw_forces() {
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double qi, qj;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2a, ra, r6a;
+    float Vela, V_a, V_b;
+    float dva;
+    float qi, qj;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
     catype_t ai_type, aj_type;
     int i;
 
@@ -655,8 +655,8 @@ void calc_nonbonded_pw_forces_host() {
     int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE;
     int n_blocks_w = (3*n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double);
-    int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double);
+    int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(float);
+    int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(float);
 
     if (!pw_gpu_set) {
         pw_gpu_set = true;
@@ -684,8 +684,8 @@ void calc_nonbonded_pw_forces_host() {
         #endif
         check_cudaMalloc((void**) &D_PW_Ecoul, mem_size_PW_Ecoul);
 
-        check_cudaMalloc((void**) &D_PW_evdw_TOT, sizeof(double));
-        check_cudaMalloc((void**) &D_PW_ecoul_TOT, sizeof(double)); 
+        check_cudaMalloc((void**) &D_PW_evdw_TOT, sizeof(float));
+        check_cudaMalloc((void**) &D_PW_ecoul_TOT, sizeof(float)); 
 
         #ifdef DEBUG
         printf("All GPU solvent memory allocated\n");
@@ -730,8 +730,8 @@ void calc_nonbonded_pw_forces_host() {
 
     calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false);
 
-    cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost);
 
     E_nonbond_pw.Uvdw += PW_evdw_TOT;
     E_nonbond_pw.Ucoul += PW_ecoul_TOT;
@@ -754,12 +754,12 @@ __device__ void set_water(int n_waters, int row, int column, dvel_t *val, calc_w
     MAT[column + n_waters * row].H2 = val[2];
 }
 
-__device__ void calc_ww_dvel_matrix_incr(int row, int column, double crg_ow, double crg_hw, double A_OO, double B_OO,
-    coord_t *Xs, coord_t *Ys, double *Evdw, double *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t D_topo) {
-    double rOX, rH1X, rH2X, r2;
+__device__ void calc_ww_dvel_matrix_incr(int row, int column, float crg_ow, float crg_hw, float A_OO, float B_OO,
+    coord_t *Xs, coord_t *Ys, float *Evdw, float *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t D_topo) {
+    float rOX, rH1X, rH2X, r2;
     coord_t dOX, dH1X, dH2X;
-    double Vel, V_a, V_b, dv;
-    double tempX, tempY, tempZ;
+    float Vel, V_a, V_b, dv;
+    float tempX, tempY, tempZ;
     
     int i = 3 * row;
     int j = 3 * column;
@@ -991,8 +991,8 @@ __device__ void calc_ww_dvel_matrix_incr(int row, int column, double crg_ow, dou
     water_b[wj].z += (dv * dH2X.z);
 }
 
-__global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, double A_OO, double B_OO,
-    coord_t *W, double *Evdw, double *Ecoul, calc_ww_t *MAT, topo_t D_topo) {
+__global__ void calc_ww_dvel_matrix(int n_waters, float crg_ow, float crg_hw, float A_OO, float B_OO,
+    coord_t *W, float *Evdw, float *Ecoul, calc_ww_t *MAT, topo_t D_topo) {
     // Block index
     int bx = blockIdx.x;
     int by = blockIdx.y;
@@ -1001,8 +1001,8 @@ __global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw,
     int tx = threadIdx.x;
     int ty = threadIdx.y;
 
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -1057,7 +1057,7 @@ __global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw,
     // }
 
     if (bx != by || tx != ty) {
-        double evdw = 0, ecoul = 0;
+        float evdw = 0, ecoul = 0;
         calc_ww_dvel_matrix_incr(ty, tx, crg_ow, crg_hw, A_OO, B_OO, Xs, Ys, &evdw, &ecoul, water_a, water_b, D_topo);
         Evdw_S[ty][tx] = evdw;
         Ecoul_S[ty][tx] = ecoul;
@@ -1078,8 +1078,8 @@ __global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw,
     int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
+        float tot_Evdw = 0;
+        float tot_Ecoul = 0;
         for (int i = 0; i < BLOCK_SIZE; i++) {
             for (int j = 0; j < BLOCK_SIZE; j++) {
                 tot_Evdw += Evdw_S[i][j];
@@ -1153,15 +1153,15 @@ __global__ void calc_ww_dvel_vector(int n_waters, dvel_t *DV_W, calc_ww_t *MAT)
 // P-W interactions
 
 __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute,
-    coord_t *Xs, coord_t *Ws, bool *excluded_s, double *Evdw, double *Ecoul, calc_pw_t *pw,
+    coord_t *Xs, coord_t *Ws, bool *excluded_s, float *Evdw, float *Ecoul, calc_pw_t *pw,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t D_topo) {
 
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double qi, qj;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    float r2a, ra, r6a;
+    float Vela, V_a, V_b;
+    float dva;
+    float qi, qj;
+    float ai_aii, aj_aii, ai_bii, aj_bii;
     catype_t ai_type, aj_type;
     atype_t i_type, j_type;
 
@@ -1214,7 +1214,7 @@ __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int
 }
 
 __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters,
-    coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_pw_t *PW_MAT,
+    coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_pw_t *PW_MAT,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, bool *D_excluded, topo_t D_topo) {
     // Block index
     int bx = blockIdx.x;
@@ -1224,8 +1224,8 @@ __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_wate
     int tx = threadIdx.x;
     int ty = threadIdx.y;
 
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
 
     Ecoul_S[ty][tx] = 0;
     Evdw_S[ty][tx] = 0;
@@ -1280,9 +1280,9 @@ __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_wate
 
     // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty);
     // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms,
-    // coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw,
+    // coord_t *Ps, coord_t *Xs, float *Evdw, float *Ecoul, calc_pw_t *pw,
     // ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms)
-    double evdw = 0, ecoul = 0;
+    float evdw = 0, ecoul = 0;
     calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo);
     Evdw_S[ty][tx] = evdw;
     Ecoul_S[ty][tx] = ecoul;
@@ -1301,8 +1301,8 @@ __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_wate
     int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
     if (tx == 0 && ty == 0) {
-        double tot_Evdw = 0;
-        double tot_Ecoul = 0;
+        float tot_Evdw = 0;
+        float tot_Ecoul = 0;
         for (int i = 0; i < BLOCK_SIZE; i++) {
             for (int j = 0; j < BLOCK_SIZE; j++) {
                 tot_Evdw += Evdw_S[i][j];
@@ -1365,16 +1365,16 @@ __global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t *D
 }
 
 // General
-__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal) {
+__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal) {
     int tx = threadIdx.x;
     int ty = threadIdx.y;
 
-    __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
-    __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE];
+    __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE];
 
     //TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work
-    double coul_TOT = 0;
-    double vdw_TOT = 0;
+    float coul_TOT = 0;
+    float vdw_TOT = 0;
     for (int i = ty; i < rows; i+= BLOCK_SIZE) {
         for (int j = tx; j < columns; j+= BLOCK_SIZE) {
             if (i <= j || !upper_diagonal) {
@@ -1389,8 +1389,8 @@ __global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double
     __syncthreads();
 
     if (tx == 0 && ty == 0) {
-        double Evdw_temp = 0;
-        double Ecoul_temp = 0;
+        float Evdw_temp = 0;
+        float Ecoul_temp = 0;
 
         for (int i = 0; i < BLOCK_SIZE; i++) {
             for (int j = 0; j < BLOCK_SIZE; j++) {
diff --git a/src/core/solvent.h b/src/core/solvent.h
index 1234751d..80bfebd1 100644
--- a/src/core/solvent.h
+++ b/src/core/solvent.h
@@ -39,21 +39,21 @@ extern bool *D_excluded;
 // W-W interactions
 __device__ void set_water(int n_waters, int row, int column, dvel_t *val, calc_ww_t *MAT);
 
-__device__ void calc_ww_dvel_matrix_incr(int row, int column, double crg_ow, double crg_hw, double A_OO, double B_OO,
-    coord_t *Xs, coord_t *Ys, double *Evdw, double *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t topo);
+__device__ void calc_ww_dvel_matrix_incr(int row, int column, float crg_ow, float crg_hw, float A_OO, float B_OO,
+    coord_t *Xs, coord_t *Ys, float *Evdw, float *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t topo);
 
-__global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, double A_OO, double B_OO,
-    coord_t *X, double *Evdw, double *Ecoul, calc_ww_t *MAT, topo_t topo);
+__global__ void calc_ww_dvel_matrix(int n_waters, float crg_ow, float crg_hw, float A_OO, float B_OO,
+    coord_t *X, float *Evdw, float *Ecoul, calc_ww_t *MAT, topo_t topo);
     
 __global__ void calc_ww_dvel_vector(int n_waters, dvel_t *DV, calc_ww_t *MAT);
 
 // P-W interactions
 __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int wj, int n_atoms_solute,
-    coord_t *Ps, coord_t *Ws, bool *excluded_s, double *Evdw, double *Ecoul, calc_pw_t *pw,
+    coord_t *Ps, coord_t *Ws, bool *excluded_s, float *Evdw, float *Ecoul, calc_pw_t *pw,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t topo);
 
 __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters,
-    coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_pw_t *PW_MAT,
+    coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_pw_t *PW_MAT,
     ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, bool *D_excluded, topo_t topo);
 
 __global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t *DV_X, dvel_t *DV_W, calc_pw_t *PW_MAT, p_atom_t *D_patoms);
@@ -61,7 +61,7 @@ __global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t *DV_X
 __global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t *DV_X, dvel_t *DV_W, calc_pw_t *PW_MAT);
 
 // General
-__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal);
+__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal);
 
 void clean_d_solvent();
 
diff --git a/src/core/system.cu b/src/core/system.cu
index 18f9944c..6cd52ef3 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -28,7 +28,7 @@ int n_waters;
 int n_molecules = 0;
 
 char base_folder[1024];
-double dt, tau_T;
+float dt, tau_T;
 
 bool run_gpu = false;
 
@@ -85,7 +85,7 @@ int *LJ_matrix;
 bool *excluded;
 bool *heavy;
 int *molecules;
-double *winv;
+float *winv;
 cgrp_t *charge_groups;
 
 topo_t topo;
@@ -96,7 +96,7 @@ topo_t topo;
  */
 
 int n_lambdas;
-double *lambdas;
+float *lambdas;
 
 int n_qangcouples;
 int n_qangles;
@@ -340,23 +340,23 @@ void exclude_shaken_definitions() {
 coord_t* coords;
 vel_t* velocities;
 dvel_t* dvelocities;
-double Temp = 0;
-double Texcl = 0;
-double Tfree = 0;
-double Tscale = 1;
+float Temp = 0;
+float Texcl = 0;
+float Tfree = 0;
+float Tscale = 1;
 
 // Shake constrains
 coord_t* xcoords;
 
 // Water constants
-double A_O = 0, A_OO = 0, B_O, B_OO, crg_ow, crg_hw, mu_w = 0;
+float A_O = 0, A_OO = 0, B_O, B_OO, crg_ow, crg_hw, mu_w = 0;
 
 void init_velocities() {
     velocities = (vel_t*) malloc(n_atoms * sizeof(vel_t));
 
     // If not previous value set, use a Maxwell distribution to fill velocities
-    double kT = Boltz * md.initial_temperature;
-    double sd, mass;
+    float kT = Boltz * md.initial_temperature;
+    float sd, mass;
     for (int i = 0; i < n_atoms; i++) {
         mass = catypes[atypes[i].code - 1].m;
         sd = sqrt(kT / mass);
@@ -376,7 +376,7 @@ void init_xcoords() {
 }
 
 void init_inv_mass() {
-    winv = (double*) malloc(n_atoms * sizeof(double));
+    winv = (float*) malloc(n_atoms * sizeof(float));
     for (int ai = 0; ai < n_atoms; ai++) {
         winv[ai] = 1 / catypes[atypes[ai].code-1].m;
     }
@@ -402,11 +402,11 @@ restrdis_t *restrdists;
 restrang_t *restrangs;
 restrwall_t* restrwalls;
 
-double crgQtot = 0;
-double Dwmz, awmz;
+float crgQtot = 0;
+float Dwmz, awmz;
  
  // Shell layout. Defined once per run
-double *theta, *theta0, *tdum; //array size n_waters
+float *theta, *theta0, *tdum; //array size n_waters
 int n_max_inshell, n_shells;
 int **list_sh, **nsort; // array size (n_max_inshell, n_shells)
 shell_t* wshells;
@@ -427,7 +427,7 @@ void init_water_sphere() {
 //ONLY call if there are actually solvent atoms, or get segfaulted
 void init_wshells() {
     int n_inshell;
-    double drs, router, ri, dr, Vshell, rshell;
+    float drs, router, ri, dr, Vshell, rshell;
     if (mu_w == 0) {
         // Get water properties from first water molecule
         cbond_t cbondw = cbonds[bonds[n_atoms_solute].code-1];
@@ -479,9 +479,9 @@ void init_wshells() {
     n_max_inshell = n_waters; // Make largest a little bigger just in case
 
     // Initialize arrays needed for bookkeeping
-    theta = (double*) malloc(n_waters * sizeof(double));
-    theta0 = (double*) malloc(n_waters * sizeof(double));
-    tdum = (double*) malloc(n_waters * sizeof(double));
+    theta = (float*) malloc(n_waters * sizeof(float));
+    theta0 = (float*) malloc(n_waters * sizeof(float));
+    tdum = (float*) malloc(n_waters * sizeof(float));
 
     list_sh = (int**) malloc(n_max_inshell * sizeof(int*));
     nsort = (int**) malloc(n_max_inshell * sizeof(int*));
@@ -493,7 +493,7 @@ void init_wshells() {
 }
 
 void init_pshells() {
-    double mass, r2, rin2;
+    float mass, r2, rin2;
 
     heavy = (bool*) calloc(n_atoms, sizeof(bool));
     shell = (bool*) calloc(n_atoms, sizeof(bool));
@@ -529,7 +529,7 @@ void init_pshells() {
 }
 
 void init_pshells_with_charge_groups() {
-    double mass, r2, rin2;
+    float mass, r2, rin2;
 
     heavy = (bool*) calloc(n_atoms, sizeof(bool));
     shell = (bool*) calloc(n_atoms, sizeof(bool));
@@ -595,7 +595,7 @@ void init_shake() {
     int mol = 0;
     int shake;
     int n_solute_shake_constraints = 0;
-    double excl_shake = 0;
+    float excl_shake = 0;
 
     n_shake_constraints = 0;
     mol_n_shakes = (int*) calloc(n_molecules, sizeof(int));
@@ -704,18 +704,18 @@ void init_patoms() {
  * =============================================
  */
 
-double Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute;
-double Tscale_solute = 1, Tscale_solvent = 1;
+float Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute;
+float Tscale_solute = 1, Tscale_solvent = 1;
 
 void calc_temperature() {
     printf("Ndegf = %f, Ndegfree = %f, n_excluded = %d, Ndegfree_solvent = %f, Ndegfree_solute = %f\n", Ndegf, Ndegfree, n_excluded, Ndegfree_solvent, Ndegfree_solute);
     Temp = 0;
     Tfree = 0;
-    double Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0;
-    double Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0;
-    double Ekinmax = 1000.0 * Ndegf * Boltz * md.temperature / 2.0 / n_atoms;
-    double ener;
-    double mass_i;
+    float Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0;
+    float Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0;
+    float Ekinmax = 1000.0 * Ndegf * Boltz * md.temperature / 2.0 / n_atoms;
+    float ener;
+    float mass_i;
 
     Temp = 0;
     for (int i = 0; i < n_atoms_solute; i++) {
@@ -777,7 +777,7 @@ void calc_temperature() {
  */
 
 void calc_leapfrog() {
-    double mass_i, winv_i;
+    float mass_i, winv_i;
     for (int i = 0; i < n_atoms_solute; i++) {
         mass_i = catypes[atypes[i].code - 1].m;
 
@@ -1226,17 +1226,17 @@ void calc_integration_step(int iteration) {
 
     // Profiler info
 #ifdef __PROFILING__
-    printf("Elapsed time for bonded forces: %f\n", (end_bonded-start) / (double)CLOCKS_PER_SEC );
-    printf("Elapsed time for non-bonded forces: %f\n", (end_nonbonded-end_bonded) / (double)CLOCKS_PER_SEC);
-    printf("Elapsed time for pp interactions: %f\n", (end_pp-start_pp) / (double)CLOCKS_PER_SEC );
-    printf("Elapsed time for qq interaction: %f\n",  (end_qq-start_qq) / (double)CLOCKS_PER_SEC );
-    printf("Elapsed time for qp interaction: %f\n",  (end_qp-start_qp) / (double)CLOCKS_PER_SEC );
+    printf("Elapsed time for bonded forces: %f\n", (end_bonded-start) / (float)CLOCKS_PER_SEC );
+    printf("Elapsed time for non-bonded forces: %f\n", (end_nonbonded-end_bonded) / (float)CLOCKS_PER_SEC);
+    printf("Elapsed time for pp interactions: %f\n", (end_pp-start_pp) / (float)CLOCKS_PER_SEC );
+    printf("Elapsed time for qq interaction: %f\n",  (end_qq-start_qq) / (float)CLOCKS_PER_SEC );
+    printf("Elapsed time for qp interaction: %f\n",  (end_qp-start_qp) / (float)CLOCKS_PER_SEC );
     if (n_waters > 0) {
-        printf("Elapsed time for ww interactions: %f\n", (end_ww-start_ww) / (double)CLOCKS_PER_SEC );
-        printf("Elapsed time for pw interactions: %f\n", (end_pw-start_pw) / (double)CLOCKS_PER_SEC );
+        printf("Elapsed time for ww interactions: %f\n", (end_ww-start_ww) / (float)CLOCKS_PER_SEC );
+        printf("Elapsed time for pw interactions: %f\n", (end_pw-start_pw) / (float)CLOCKS_PER_SEC );
     }
     printf("---\n");
-    printf("Elapsed time for entire time-step: %f\n", (end_calculation-start) / (double)CLOCKS_PER_SEC);
+    printf("Elapsed time for entire time-step: %f\n", (end_calculation-start) / (float)CLOCKS_PER_SEC);
 #endif /* __PROFILING__ */
 
 }
diff --git a/src/core/system.h b/src/core/system.h
index 24161423..db2ec269 100644
--- a/src/core/system.h
+++ b/src/core/system.h
@@ -64,7 +64,7 @@ extern int n_molecules;
 
 extern char base_folder[1024];
 
-extern double dt;
+extern float dt;
 
 extern bool run_gpu;
 
@@ -76,29 +76,29 @@ extern bool run_gpu;
 struct md_t {
     // [MD]
     int steps;
-    double stepsize;
-    double temperature;
+    float stepsize;
+    float temperature;
     char thermostat[40];
-    double bath_coupling;
+    float bath_coupling;
     int random_seed;
-    double initial_temperature;
+    float initial_temperature;
     bool shake_solvent;
     bool shake_solute;
     bool shake_hydrogens;
     bool lrf;
     bool charge_groups;
     // [cut-offs]
-    double solute_solute;
-    double solvent_solvent;
-    double solute_solvent;
-    double q_atom;
+    float solute_solute;
+    float solvent_solvent;
+    float solute_solvent;
+    float q_atom;
     // [sphere]
-    double shell_radius; // Note: this is for the pshell
-    double shell_force;  // Note: this is for the pshell
+    float shell_radius; // Note: this is for the pshell
+    float shell_force;  // Note: this is for the pshell
     // [solvent]
-    double radial_force;
+    float radial_force;
     bool polarisation;
-    double polarisation_force;
+    float polarisation_force;
     // [intervals]
     int non_bond;
     int output;
@@ -121,9 +121,9 @@ extern bool separate_scaling;
  */
 
 struct coord_t {
-    double x;
-    double y;
-    double z;
+    float x;
+    float y;
+    float z;
 };
 
 struct bond_t {
@@ -134,8 +134,8 @@ struct bond_t {
 
 struct cbond_t {
     int code;
-    double kb;
-    double b0;
+    float kb;
+    float b0;
 };
 
 struct angle_t {
@@ -147,8 +147,8 @@ struct angle_t {
 
 struct cangle_t {
     int code;
-    double kth;
-    double th0;
+    float kth;
+    float th0;
 };
 
 struct torsion_t {
@@ -161,9 +161,9 @@ struct torsion_t {
 
 struct ctorsion_t {
     int code;
-    double k;
-    double n;
-    double d;
+    float k;
+    float n;
+    float d;
 };
 
 struct improper_t {
@@ -176,8 +176,8 @@ struct improper_t {
 
 struct cimproper_t {
     int code;
-    double k;
-    double phi0;
+    float k;
+    float phi0;
 };
 
 struct charge_t {
@@ -187,7 +187,7 @@ struct charge_t {
 
 struct ccharge_t {
     int code;
-    double charge;
+    float charge;
 };
 
 struct atype_t {
@@ -197,23 +197,23 @@ struct atype_t {
 
 struct catype_t {
     int code;
-    double m;
-    double aii_normal;
-    double bii_normal;
-    double aii_polar;
-    double bii_polar;
-    double aii_1_4;
-    double bii_1_4;
+    float m;
+    float aii_normal;
+    float bii_normal;
+    float aii_polar;
+    float bii_polar;
+    float aii_1_4;
+    float bii_1_4;
 };
 
 struct topo_t {
     int solvent_type;
-    double exclusion_radius;
-    double solvent_radius;
+    float exclusion_radius;
+    float solvent_radius;
     coord_t solute_center;
     coord_t solvent_center;
-    double el14_scale;
-    double coulomb_constant;
+    float el14_scale;
+    float coulomb_constant;
 };
 
 struct cgrp_t {
@@ -263,7 +263,7 @@ extern int *LJ_matrix;
 extern torsion_t *torsions;
 extern bool *excluded;
 extern bool *heavy;
-extern double *winv;
+extern float *winv;
 extern cgrp_t *charge_groups;
 
 extern int *molecules;
@@ -274,7 +274,7 @@ extern int *molecules;
  */
 
 extern int n_lambdas;
-extern double *lambdas;
+extern float *lambdas;
 
 struct q_angcouple_t {
     int acode;
@@ -303,45 +303,45 @@ struct q_bond_t {
 };
 
 struct q_cangle_t {
-    double kth;
-    double th0;
+    float kth;
+    float th0;
 };
 
 struct q_catype_t {
     char name[10];
-    double Ai;
-    double Bi;
-    double Ci;
-    double ai;
-    double Ai_14;
-    double Bi_14;
-    double m;
+    float Ai;
+    float Bi;
+    float Ci;
+    float ai;
+    float Ai_14;
+    float Bi_14;
+    float m;
 };
 
 struct q_cbond_t {
-    double kb;
-    double b0;
+    float kb;
+    float b0;
 };
 
 struct q_charge_t {
-    double q;
+    float q;
 };
 
 struct q_cimproper_t {
-    double k;
-    double phi0;
+    float k;
+    float phi0;
 };
 
 struct q_ctorsion_t {
-    double k;
-    double n;
-    double d;
+    float k;
+    float n;
+    float d;
 };
 
 struct q_elscale_t {
     int qi;
     int qj;
-    double mu;
+    float mu;
 };
 
 struct q_exclpair_t {
@@ -368,18 +368,18 @@ struct q_offdiag_t {
     int j;
     int qk;
     int ql;
-    double Aij;
-    double muij;
+    float Aij;
+    float muij;
 };
 
 struct q_shake_t {
     int ai;
     int aj;
-    double dist;
+    float dist;
 };
 
 struct q_softcore_t {
-    double s;
+    float s;
 };
 
 struct q_softpair_t {
@@ -451,7 +451,7 @@ extern q_torsion_t *q_torsions;
 struct restrseq_t {
     int ai;
     int aj;
-    double k;
+    float k;
     bool ih;
     int to_center; // Flag for restraining to geom. or mass center
 };
@@ -466,21 +466,21 @@ struct restrpos_t {
 struct restrdis_t {
     int ai, aj;
     int ipsi;
-    double d1, d2;
-    double k;
+    float d1, d2;
+    float k;
     char itext[20], jtext[20];
 };
 
 struct restrang_t {
     int ai, aj, ak;
     int ipsi;
-    double ang;
-    double k;
+    float ang;
+    float k;
 };
 
 struct restrwall_t {
     int ai, aj;
-    double d, k, aMorse, dMorse;
+    float d, k, aMorse, dMorse;
     bool ih;
 };
 
@@ -505,20 +505,20 @@ void init_restrseqs(char* filename);
 
 struct shell_t {
     int n_inshell;
-    double theta_corr;
-    double avtheta;
-    double avn_inshell;
-    double router;
-    double dr;
-    double cstb;
+    float theta_corr;
+    float avtheta;
+    float avn_inshell;
+    float router;
+    float dr;
+    float cstb;
 };
 
 // Total energy in the system. Defined once per run
-extern double crgQtot;
-extern double Dwmz, awmz;
+extern float crgQtot;
+extern float Dwmz, awmz;
 
 // Water shell layout. Defined once per run
-extern double *theta, *theta0, *tdum; //array size n_waters
+extern float *theta, *theta0, *tdum; //array size n_waters
 extern int n_max_inshell, n_shells;
 extern int **list_sh, **nsort; // array size (n_max_inshell, n_shells)
 extern shell_t* wshells;
@@ -534,7 +534,7 @@ void init_wshells();
 struct shake_bond_t {
     int ai;
     int aj;
-    double dist2;
+    float dist2;
     bool ready;
 };
 
@@ -553,42 +553,42 @@ struct p_atom_t {
 };
 
 struct vel_t {
-    double x;
-    double y;
-    double z;
+    float x;
+    float y;
+    float z;
 };
 
 struct dvel_t {
-    double x;
-    double y;
-    double z;
+    float x;
+    float y;
+    float z;
 };
 
 struct E_bonded_t {
-    double Ubond;
-    double Uangle;
-    double Utor;
-    double Uimp;
+    float Ubond;
+    float Uangle;
+    float Utor;
+    float Uimp;
 };
 
 struct E_nonbonded_t {
-    double Ucoul;
-    double Uvdw;
+    float Ucoul;
+    float Uvdw;
 };
 
 struct E_restraint_t {
-    double Uradx;
-    double Upolx;
-    double Ufix;
-    double Ushell;
-    double Upres;
-    double Urestr;
+    float Uradx;
+    float Upolx;
+    float Ufix;
+    float Ushell;
+    float Upres;
+    float Urestr;
 };
 
 struct energy_t {
-    double Ukin;
-    double Upot;
-    double Utot;
+    float Ukin;
+    float Upot;
+    float Utot;
 };
 
 extern p_atom_t *p_atoms;
@@ -601,8 +601,8 @@ extern E_bonded_t E_bond_p, E_bond_w, E_bond_q, *EQ_bond;
 extern E_nonbonded_t E_nonbond_pp, E_nonbond_pw, E_nonbond_ww, E_nonbond_qx;
 extern E_nonbonded_t *EQ_nonbond_qq, *EQ_nonbond_qp, *EQ_nonbond_qw, *EQ_nonbond_qx;
 extern E_restraint_t E_restraint, *EQ_restraint;
-extern double Temp, Tfree, Texcl;
-extern double A_O, A_OO, B_O, B_OO, crg_ow, crg_hw; // TODO: don't keep this in system.cu?
+extern float Temp, Tfree, Texcl;
+extern float A_O, A_OO, B_O, B_OO, crg_ow, crg_hw; // TODO: don't keep this in system.cu?
 
 void init_velocities();
 void init_dvelocities();
@@ -613,7 +613,7 @@ void init_energies();
  * =============================================
  */
 
-extern double Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute;
+extern float Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute;
 void calc_temperature();
 
 /* =============================================
diff --git a/src/core/utils.cu b/src/core/utils.cu
index a0fd20f2..009e764b 100644
--- a/src/core/utils.cu
+++ b/src/core/utils.cu
@@ -3,22 +3,22 @@
 
 // Get a value from a gaussian distributed random variable with
 // mean mean and standard deviation sd
-double gauss(double mean, double sd) {
-    double v1, v2, nd10;
+float gauss(float mean, float sd) {
+    float v1, v2, nd10;
 
-    v1 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. );
-    v2 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. );
+    v1 = ( (float)(rand()) + 1. )/( (float)(RAND_MAX) + 1. );
+    v2 = ( (float)(rand()) + 1. )/( (float)(RAND_MAX) + 1. );
     nd10 = cos(2 * M_PI * v2) * sqrt(-2. * log(v1));
 
     return sd * nd10 + mean;
 }
 
 
-double to_degrees(double radians) {
+float to_degrees(float radians) {
     return radians * (180.0 / M_PI);
 }
 
-double to_radians(double degrees) {
+float to_radians(float degrees) {
     return degrees * (M_PI / 180.0);
 }
 
diff --git a/src/core/utils.h b/src/core/utils.h
index 32d50900..4608fe8d 100644
--- a/src/core/utils.h
+++ b/src/core/utils.h
@@ -6,9 +6,9 @@
  * =============================================
  */
 
-double gauss(double mean, double sd);
-double to_degrees(double radians);
-double to_radians(double degrees);
+float gauss(float mean, float sd);
+float to_degrees(float radians);
+float to_radians(float degrees);
 
 /* =============================================
  * == DEVICE

From 9942e89ec47acc5013feab09bc7a18051331a9cb Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Wed, 12 Nov 2025 09:36:30 +0100
Subject: [PATCH 3/3] add comment

---
 src/core/restraints.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/core/restraints.cu b/src/core/restraints.cu
index 132d4f83..768ff40e 100644
--- a/src/core/restraints.cu
+++ b/src/core/restraints.cu
@@ -184,6 +184,7 @@ void calc_polx_w_forces(int iteration) {
             if (cos_th > 1) cos_th = 1;
             if (cos_th < -1) cos_th = -1;
             f0 = sin(acos(cos_th));
+            // After change double to float, need to change 1e-12 to 1e-6, because of precision
             if (abs(f0) < 1.0E-6) f0 = 1.0E-6;
             f0 = -1.0 / f0;
             f0 *= dv;