From cb2672bb864e0d741e5a675257087ac2a2d5f361 Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Fri, 7 Nov 2025 09:54:54 +0100 Subject: [PATCH 1/3] add log for debug --- src/core/system.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/core/system.cu b/src/core/system.cu index 72ccf70d..18f9944c 100644 --- a/src/core/system.cu +++ b/src/core/system.cu @@ -729,7 +729,9 @@ void calc_temperature() { Texcl_solute += ener; } if (ener > Ekinmax) { - printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3); + printf(">>> WARNING: hot atom %d, %f %f %f %f %f %f: %f %f %f %f %f\n", i, velocities[i].x, velocities[i].y, velocities[i].z, + coords[i].x, coords[i].y, coords[i].z, ener, ener/Boltz/3, Ekinmax, mass_i, Ekinmax / (0.5 * mass_i)); + // printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3); } } @@ -744,7 +746,9 @@ void calc_temperature() { Texcl_solvent += ener; } if (ener > Ekinmax) { - printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3); + printf(">>> WARNING: hot atom %d, %f %f %f %f %f %f: %f %f %f %f %f\n", i, velocities[i].x, velocities[i].y, velocities[i].z, + coords[i].x, coords[i].y, coords[i].z, ener, ener/Boltz/3, Ekinmax, mass_i, Ekinmax / (0.5 * mass_i)); + // printf(">>> WARNING: hot atom %d: %f\n", i, ener/Boltz/3); } } From 7d1cc82f8c8549788b749c54e27eca866914dbff Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Wed, 12 Nov 2025 09:28:28 +0100 Subject: [PATCH 2/3] change double to float --- src/core/bonded.cu | 36 ++++---- src/core/bonded.h | 8 +- src/core/nonbonded.cu | 58 ++++++------ src/core/nonbonded.h | 6 +- src/core/parse.cu | 32 ++++--- src/core/qatoms.cu | 136 +++++++++++++-------------- src/core/qatoms.h | 24 ++--- src/core/restraints.cu | 40 ++++---- src/core/shake.cu | 6 +- src/core/solvent.cu | 174 +++++++++++++++++------------------ src/core/solvent.h | 14 +-- src/core/system.cu | 74 +++++++-------- src/core/system.h | 202 ++++++++++++++++++++--------------------- src/core/utils.cu | 12 +-- src/core/utils.h | 6 +- 15 files changed, 418 insertions(+), 410 deletions(-) diff --git a/src/core/bonded.cu b/src/core/bonded.cu index f86d773e..08d8791c 100644 --- a/src/core/bonded.cu +++ b/src/core/bonded.cu @@ -10,18 +10,18 @@ * ============================================= */ -double calc_angle_forces(int start, int end) { +float calc_angle_forces(int start, int end) { int aii, aji, aki; coord_t ai, aj, ak; coord_t rji, rjk; coord_t di, dk; - double bji2inv, bjk2inv, bjiinv, bjkinv; + float bji2inv, bjk2inv, bjiinv, bjkinv; cangle_t cangle; - double cos_th, th, dth, dv, f1; - double ener; - double angle = 0; + float cos_th, th, dth, dv, f1; + float ener; + float angle = 0; for (int i = start; i < end; i++) { aii = angles[i].ai - 1; @@ -102,12 +102,12 @@ double calc_angle_forces(int start, int end) { return angle; } -double calc_bond_forces(int start, int end) { +float calc_bond_forces(int start, int end) { int aii, aji; coord_t ai, aj, dx; cbond_t cbond; - double dx2, dx1, ddx, ener, ampl; - double bond = 0; + float dx2, dx1, ddx, ener, ampl; + float bond = 0; for (int i = start; i < end; i++) { aii = bonds[i].ai-1; @@ -147,18 +147,18 @@ double calc_bond_forces(int start, int end) { return bond; } -double calc_torsion_forces(int start, int end) { +float calc_torsion_forces(int start, int end) { int aii, aji, aki, ali; coord_t ai, aj, ak, al; coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - double bj2inv, bk2inv, bjinv, bkinv; - double cos_phi, phi; - double arg, dv, f1; - double ener; - double torsion = 0; + float bj2inv, bk2inv, bjinv, bkinv; + float cos_phi, phi; + float arg, dv, f1; + float ener; + float torsion = 0; torsion_t t; ctorsion_t ctors; @@ -281,18 +281,18 @@ double calc_torsion_forces(int start, int end) { return torsion; } -double calc_improper2_forces(int start, int end) { +float calc_improper2_forces(int start, int end) { int aii, aji, aki, ali; coord_t ai, aj, ak, al; coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; - double bj2inv, bk2inv, bjinv, bkinv; - double cos_phi, phi, arg, ener, dv, f1; + float bj2inv, bk2inv, bjinv, bkinv; + float cos_phi, phi, arg, ener, dv, f1; coord_t di, dl, dpi, dpj, dpk, dpl; improper_t imp; cimproper_t cimp; - double improper = 0; + float improper = 0; for (int i = start; i < end; i++) { imp = impropers[i]; diff --git a/src/core/bonded.h b/src/core/bonded.h index 7a46e4cc..176dcc14 100644 --- a/src/core/bonded.h +++ b/src/core/bonded.h @@ -1,9 +1,9 @@ #ifndef __BONDED_H__ #define __BONDED_H__ -double calc_angle_forces(int start, int end); -double calc_bond_forces(int start, int end); -double calc_torsion_forces(int start, int end); -double calc_improper2_forces(int start, int end); +float calc_angle_forces(int start, int end); +float calc_bond_forces(int start, int end); +float calc_torsion_forces(int start, int end); +float calc_improper2_forces(int start, int end); #endif /* __BONDED_H__ */ \ No newline at end of file diff --git a/src/core/nonbonded.cu b/src/core/nonbonded.cu index 9d49dab7..dc8341d7 100644 --- a/src/core/nonbonded.cu +++ b/src/core/nonbonded.cu @@ -13,8 +13,8 @@ dvel_t *DV_X; dvel_t *PP_MAT, *h_PP_MAT; bool pp_gpu_set = false; -double *D_PP_Evdw, *D_PP_Ecoul, *h_PP_Evdw, *h_PP_Ecoul; -double *D_PP_evdw_TOT, *D_PP_ecoul_TOT, PP_evdw_TOT, PP_ecoul_TOT; +float *D_PP_Evdw, *D_PP_Ecoul, *h_PP_Evdw, *h_PP_Ecoul; +float *D_PP_evdw_TOT, *D_PP_ecoul_TOT, PP_evdw_TOT, PP_ecoul_TOT; // Constants pointers ccharge_t *D_ccharges; @@ -27,13 +27,13 @@ bool *D_excluded; void calc_nonbonded_pp_forces() { bool bond14, bond23; - double scaling; + float scaling; coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double crg_i, crg_j; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2a, ra, r6a; + float Vela, V_a, V_b; + float dva; + float crg_i, crg_j; + float ai_aii, aj_aii, ai_bii, aj_bii; int i, j; catype_t ai_type, aj_type; @@ -93,8 +93,8 @@ void calc_nonbonded_pp_forces_host() { int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t); int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; - int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double); - int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double); + int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(float); + int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(float); if (!pp_gpu_set) { pp_gpu_set = true; @@ -112,12 +112,12 @@ void calc_nonbonded_pp_forces_host() { #endif check_cudaMalloc((void**) &D_PP_Ecoul, mem_size_PP_Ecoul); - check_cudaMalloc((void**) &D_PP_evdw_TOT, sizeof(double)); - check_cudaMalloc((void**) &D_PP_ecoul_TOT, sizeof(double)); + check_cudaMalloc((void**) &D_PP_evdw_TOT, sizeof(float)); + check_cudaMalloc((void**) &D_PP_ecoul_TOT, sizeof(float)); h_PP_MAT = (dvel_t*) malloc(mem_size_PP_MAT); - h_PP_Evdw = (double*) malloc(mem_size_PP_Evdw); - h_PP_Ecoul = (double*) malloc(mem_size_PP_Ecoul); + h_PP_Evdw = (float*) malloc(mem_size_PP_Evdw); + h_PP_Ecoul = (float*) malloc(mem_size_PP_Ecoul); } cudaMemcpy(X, coords, mem_size_X, cudaMemcpyHostToDevice); @@ -148,8 +148,8 @@ void calc_nonbonded_pp_forces_host() { calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true); - cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost); E_nonbond_pp.Uvdw += PP_evdw_TOT; E_nonbond_pp.Ucoul += PP_ecoul_TOT; @@ -161,17 +161,17 @@ void calc_nonbonded_pp_forces_host() { */ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, - coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, double *Evdw, double *Ecoul, dvel_t *patom_a, dvel_t *patom_b, + coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, float *Evdw, float *Ecoul, dvel_t *patom_a, dvel_t *patom_b, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t D_topo) { bool bond14, bond23; - double scaling; + float scaling; coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double crg_i, crg_j; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2a, ra, r6a; + float Vela, V_a, V_b; + float dva; + float crg_i, crg_j; + float ai_aii, aj_aii, ai_bii, aj_bii; catype_t ai_type, aj_type; bond23 = LJs[row * BLOCK_SIZE + column] == 3; @@ -219,7 +219,7 @@ __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, } __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, - coord_t *X, double *Evdw, double *Ecoul, dvel_t *PP_MAT, + coord_t *X, float *Evdw, float *Ecoul, dvel_t *PP_MAT, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, int *D_LJ_matrix, bool *D_excluded, topo_t D_topo) { // Block index int bx = blockIdx.x; @@ -232,8 +232,8 @@ __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, int aStart = BLOCK_SIZE * by; int bStart = BLOCK_SIZE * bx; - __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; - __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -272,7 +272,7 @@ __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, int column = bx * BLOCK_SIZE + tx; if (bx != by || tx != ty) { - double evdw = 0, ecoul = 0; + float evdw = 0, ecoul = 0; calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a, &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); Evdw_S[ty][tx] = evdw; @@ -287,8 +287,8 @@ __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; if (tx == 0 && ty == 0) { - double tot_Evdw = 0; - double tot_Ecoul = 0; + float tot_Evdw = 0; + float tot_Ecoul = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { tot_Evdw += Evdw_S[i][j]; diff --git a/src/core/nonbonded.h b/src/core/nonbonded.h index 0421082d..b961be56 100644 --- a/src/core/nonbonded.h +++ b/src/core/nonbonded.h @@ -27,11 +27,11 @@ extern bool *D_excluded; // P-P interactions __device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, - coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, double *Evdw, double *Ecoul, dvel_t *patom_a, dvel_t *patom_b, + coord_t *Xs, coord_t *Ys, int *LJs, bool *excluded_s, float *Evdw, float *Ecoul, dvel_t *patom_a, dvel_t *patom_b, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t topo); __global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, - coord_t *X, double *Evdw, double *Ecoul, dvel_t *PP_MAT, + coord_t *X, float *Evdw, float *Ecoul, dvel_t *PP_MAT, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, int *D_LJ_matrix, bool *D_excluded, topo_t topo); __global__ void calc_pp_dvel_vector(int n_patoms, dvel_t *DV_X, dvel_t *PP_MAT, p_atom_t *D_patoms); @@ -39,6 +39,6 @@ __global__ void calc_pp_dvel_vector(int n_patoms, dvel_t *DV_X, dvel_t *PP_MAT, void clean_d_patoms(); -__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal); +__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal); #endif /* __NONBONDED_H__ */ \ No newline at end of file diff --git a/src/core/parse.cu b/src/core/parse.cu index a11561cc..4019c2c2 100644 --- a/src/core/parse.cu +++ b/src/core/parse.cu @@ -210,7 +210,7 @@ void init_md(const char *filename) { #ifdef VERBOSE printf("reading in %d lambdas (%s in file)\n", n_lambdas, file.buffer[k][1]); #endif - lambdas = (double*) malloc(n_lambdas * sizeof(double)); + lambdas = (float*) malloc(n_lambdas * sizeof(float)); k++; for (int i = 0; i < n_lambdas; i++) { lambdas[i] = strtod(file.buffer[k][0], &eptr); @@ -521,20 +521,28 @@ void init_excluded(const char *filename) { if (fp == NULL) exit(EXIT_FAILURE); - char line[8192]; - - n_excluded = 0; +// --- dynamically read a full line --- + char *line = NULL; + size_t len = 0; + ssize_t read_len = getline(&line, &len, fp); + if (read_len == -1) { + fprintf(stderr, "Error: failed to read line from %s\n", path); + free(line); + fclose(fp); + exit(EXIT_FAILURE); + } - if (fgets(line, 8192, fp)) { - for (int i = 0; i < n_atoms; i++) { - bool excl = (line[i] == '1'); - excluded[i] = excl; - if (excl) { - n_excluded++; - } - } + // --- parse --- + n_excluded = 0; + for (int i = 0; i < n_atoms && i < read_len; i++) { + bool excl = (line[i] == '1'); + excluded[i] = excl; + if (excl) n_excluded++; } + printf("Number of excluded atoms: %d\n", n_excluded); + + free(line); fclose(fp); } diff --git a/src/core/qatoms.cu b/src/core/qatoms.cu index b4a0307e..dba06732 100644 --- a/src/core/qatoms.cu +++ b/src/core/qatoms.cu @@ -10,30 +10,30 @@ calc_qw_t *QW_MAT, *h_QW_MAT; calc_qp_t *QP_MAT, *h_QP_MAT; dvel_t *QQ_MAT; -double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul; -double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT; +float *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul; +float *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT; -double *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul; -double *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT; +float *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul; +float *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT; // Constants pointers q_catype_t *D_qcatypes; q_atype_t *D_qatypes; q_charge_t *D_qcharges; q_atom_t *D_qatoms; -double *D_lambdas; +float *D_lambdas; bool qp_gpu_set = false; void calc_nonbonded_qp_forces() { int i, j; coord_t da; - double r2, r6, r; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2, r6, r; + float ai_aii, aj_aii, ai_bii, aj_bii; q_catype_t qi_type; catype_t aj_type; bool bond23, bond14; - double scaling, Vel, V_a, V_b, dv; + float scaling, Vel, V_a, V_b, dv; for (int qi = 0; qi < n_qatoms; qi++) { for (int pj = 0; pj < n_patoms; pj++) { @@ -98,7 +98,7 @@ void calc_nonbonded_qp_forces_host() { int mem_size_qatypes = n_qatoms * n_lambdas * sizeof(q_atype_t); int mem_size_qcharges = n_qatoms * n_lambdas * sizeof(q_charge_t); int mem_size_qatoms = n_qatoms * sizeof(q_atom_t); - int mem_size_lambdas = n_lambdas * sizeof(double); + int mem_size_lambdas = n_lambdas * sizeof(float); int mem_size_ccharges = n_ccharges * sizeof(ccharge_t); int mem_size_charges = n_atoms * sizeof(charge_t); @@ -111,8 +111,8 @@ void calc_nonbonded_qp_forces_host() { int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; //TODO make Evdw & Ecoul work for # of states > 2 - int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); - int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(float); + int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(float); if (!qp_gpu_set){ qp_gpu_set = true; @@ -207,11 +207,11 @@ void calc_nonbonded_qp_forces_host() { #endif check_cudaMalloc((void**) &D_QP_Ecoul, mem_size_QP_Ecoul); - check_cudaMalloc((void**) &D_QP_evdw_TOT, sizeof(double)); - check_cudaMalloc((void**) &D_QP_ecoul_TOT, sizeof(double)); + check_cudaMalloc((void**) &D_QP_evdw_TOT, sizeof(float)); + check_cudaMalloc((void**) &D_QP_ecoul_TOT, sizeof(float)); - h_QP_Evdw = (double*) malloc(mem_size_QP_Evdw); - h_QP_Ecoul = (double*) malloc(mem_size_QP_Ecoul); + h_QP_Evdw = (float*) malloc(mem_size_QP_Evdw); + h_QP_Ecoul = (float*) malloc(mem_size_QP_Ecoul); h_QP_MAT = (calc_qp_t*) malloc(mem_size_QP_MAT); } @@ -250,8 +250,8 @@ void calc_nonbonded_qp_forces_host() { for (int state = 0; state < min(2, n_lambdas); state++) { calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false); - cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost); EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT; EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT; @@ -261,11 +261,11 @@ void calc_nonbonded_qp_forces_host() { void calc_nonbonded_qw_forces() { int i; coord_t dO, dH1, dH2; - double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; - double dvO, dvH1, dvH2; - double V_a, V_b, VelO, VelH1, VelH2; + float r2O, rH1, rH2, r6O, rO, r2H1, r2H2; + float dvO, dvH1, dvH2; + float V_a, V_b, VelO, VelH1, VelH2; q_catype_t qi_type; - double ai_aii, ai_bii; + float ai_aii, ai_bii; if (A_O == 0) { catype_t catype_ow; // Atom type of first O, H atom @@ -362,8 +362,8 @@ void calc_nonbonded_qw_forces_host() { int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; //TODO make Evdw & Ecoul work for # of states > 2 - int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); - int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(float); + int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(float); if (A_O == 0) { catype_t catype_ow; // Atom type of first O atom @@ -387,11 +387,11 @@ void calc_nonbonded_qw_forces_host() { #endif check_cudaMalloc((void**) &D_QW_Ecoul, mem_size_QW_Ecoul); - check_cudaMalloc((void**) &D_QW_evdw_TOT, sizeof(double)); - check_cudaMalloc((void**) &D_QW_ecoul_TOT, sizeof(double)); + check_cudaMalloc((void**) &D_QW_evdw_TOT, sizeof(float)); + check_cudaMalloc((void**) &D_QW_ecoul_TOT, sizeof(float)); - h_QW_Evdw = (double*) malloc(mem_size_QW_Evdw); - h_QW_Ecoul = (double*) malloc(mem_size_QW_Ecoul); + h_QW_Evdw = (float*) malloc(mem_size_QW_Evdw); + h_QW_Ecoul = (float*) malloc(mem_size_QW_Ecoul); h_QW_MAT = (calc_qw_t*) malloc(mem_size_MAT); } @@ -406,7 +406,7 @@ void calc_nonbonded_qw_forces_host() { threads = dim3(BLOCK_SIZE, BLOCK_SIZE); grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); - double evdw, ecoul; + float evdw, ecoul; calc_qw_dvel_matrix<<>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, W, D_QW_Evdw, D_QW_Ecoul, QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo); calc_qw_dvel_vector_column<<<((n_waters+BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_W, QW_MAT); @@ -432,8 +432,8 @@ void calc_nonbonded_qw_forces_host() { for (int state = 0; state < min(2, n_lambdas); state++) { calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q], &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false); - cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost); EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT; EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT; @@ -442,15 +442,15 @@ void calc_nonbonded_qw_forces_host() { void calc_nonbonded_qq_forces() { int ai, aj; - double crg_i, crg_j; - double elscale, scaling; + float crg_i, crg_j; + float elscale, scaling; q_catype_t qi_type, qj_type; bool bond23, bond14; coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2a, ra, r6a; + float Vela, V_a, V_b; + float dva; + float ai_aii, aj_aii, ai_bii, aj_bii; for (int state = 0; state < n_lambdas; state++) { for (int qi = 0; qi < n_qatoms; qi++) { @@ -520,8 +520,8 @@ void calc_qangle_forces(int state) { int ic; int ai, aj, ak; coord_t rji, rjk; - double bji, bjk; - double cos_th, th, dth, ener, dv, f1; + float bji, bjk; + float cos_th, th, dth, ener, dv, f1; coord_t di, dk; for (int i = 0; i < n_qangles; i++) { @@ -580,7 +580,7 @@ void calc_qangle_forces(int state) { void calc_qbond_forces(int state) { int ic; int ai, aj; - double b, db, ener, dv; + float b, db, ener, dv; coord_t rij; for (int i = 0; i < n_qbonds; i++) { @@ -617,10 +617,10 @@ void calc_qtorsion_forces(int state) { coord_t rji, rjk, rkl, rnj, rnk, rki, rlj; coord_t di, dl, dpi, dpj, dpk, dpl; - double bj2inv, bk2inv, bjinv, bkinv; - double bj, bk, cos_phi, phi; - double arg, dv, f1; - double ener; + float bj2inv, bk2inv, bjinv, bkinv; + float bj, bk, cos_phi, phi; + float arg, dv, f1; + float ener; for (int i = 0; i < n_qtorsions; i++) { ic = q_torsions[i + n_qtorsions * state].code; @@ -730,18 +730,18 @@ void calc_qtorsion_forces(int state) { // Q-W interactions -__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O, - coord_t *Qs, coord_t *Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t *qw, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t D_topo) { +__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, float crg_ow, float crg_hw, float A_O, float B_O, + coord_t *Qs, coord_t *Ws, float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t *qw, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t D_topo) { int j; coord_t dO, dH1, dH2; - double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; - double dvO, dvH1, dvH2; - double V_a, V_b, VelO, VelH1, VelH2; + float r2O, rH1, rH2, r6O, rO, r2H1, r2H2; + float dvO, dvH1, dvH2; + float V_a, V_b, VelO, VelH1, VelH2; q_atype_t qa_type; q_catype_t qi_type; - double ai_aii, ai_bii; + float ai_aii, ai_bii; j = 3 * column; dO.x = Ws[j].x - Qs[row].x; @@ -810,9 +810,9 @@ __device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lamb (*qw).H2.z += dvH2 * dH2.z; } -__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O, - coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_qw_t *MAT, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t D_topo) { +__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, float crg_ow, float crg_hw, float A_O, float B_O, + coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_qw_t *MAT, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t D_topo) { // Block index int bx = blockIdx.x; int by = blockIdx.y; @@ -822,8 +822,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d int ty = threadIdx.y; //TODO implement >2 states on GPU - __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -870,8 +870,8 @@ __global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, d if (tx == 0 && ty == 0) { //TODO implement >2 states on GPU for (int state = 0; state < min(2, n_lambdas); state++) { - double tot_Evdw = 0; - double tot_Ecoul = 0; + float tot_Evdw = 0; + float tot_Ecoul = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; @@ -957,17 +957,17 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t *D // Q-P interactions __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms, - coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t *qp, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas, + coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t *qp, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas, catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t D_topo) { coord_t da; - double r2, r6, r; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2, r6, r; + float ai_aii, aj_aii, ai_bii, aj_bii; q_catype_t qi_type; catype_t aj_type; bool bond23, bond14; - double scaling, Vel, V_a, V_b, dv; + float scaling, Vel, V_a, V_b, dv; int j = D_patoms[pj].a-1; @@ -1023,8 +1023,8 @@ __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, in } __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, - coord_t *X, double *Evdw, double *Ecoul, calc_qp_t *QP_MAT, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas, + coord_t *X, float *Evdw, float *Ecoul, calc_qp_t *QP_MAT, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas, int *D_LJ_matrix, bool *D_excluded, catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t D_topo) { // Block index @@ -1036,8 +1036,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i int ty = threadIdx.y; //TODO implement >2 states on GPU - __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; - __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ float Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ float Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -1090,8 +1090,8 @@ __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, i if (tx == 0 && ty == 0) { //TODO implement >2 states on GPU for (int state = 0; state < min(2, n_lambdas); state++) { - double tot_Evdw = 0; - double tot_Ecoul = 0; + float tot_Evdw = 0; + float tot_Ecoul = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; diff --git a/src/core/qatoms.h b/src/core/qatoms.h index f85890c8..465b6009 100644 --- a/src/core/qatoms.h +++ b/src/core/qatoms.h @@ -44,18 +44,18 @@ extern q_catype_t *D_qcatypes; extern q_atype_t *D_qatypes; extern q_charge_t *D_qcharges; extern q_atom_t *D_qatoms; -extern double *D_lambdas; +extern float *D_lambdas; extern int *D_LJ_matrix; extern bool *D_excluded; // Q-W interactions -__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O, - coord_t *Qs, coord_t *Ws, double *Evdw, double *Ecoul, calc_qw_t *qw, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t topo); +__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, float crg_ow, float crg_hw, float A_O, float B_O, + coord_t *Qs, coord_t *Ws, float *Evdw, float *Ecoul, calc_qw_t *qw, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t topo); -__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O, - coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_qw_t *MAT, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, double *D_lambdas, topo_t topo); +__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, float crg_ow, float crg_hw, float A_O, float B_O, + coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_qw_t *MAT, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, q_atom_t *D_qatoms, float *D_lambdas, topo_t topo); __global__ void calc_qw_dvel_vector_row(int n_qatoms, int n_waters, dvel_t *DV_X, dvel_t *DV_W, calc_qw_t *QW_MAT, q_atom_t *D_qatoms); @@ -66,20 +66,20 @@ __global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t *D // Q-P interactions __device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms, - coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, double *Evdw, double *Ecoul, calc_qp_t *qp, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas, + coord_t *Qs, coord_t *Ps, int *LJs, bool *excluded_s, float *Evdw, float *Ecoul, calc_qp_t *qp, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas, catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t topo); __global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, - coord_t *X, double *Evdw, double *Ecoul, calc_qp_t *QP_MAT, - q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, double *D_lambdas, + coord_t *X, float *Evdw, float *Ecoul, calc_qp_t *QP_MAT, + q_catype_t *D_qcatypes, q_atype_t *D_qatypes, q_charge_t *D_qcharges, p_atom_t *D_patoms, q_atom_t *D_qatoms, float *D_lambdas, int *D_LJ_matrix, bool *D_excluded, catype_t *D_catypes, atype_t *D_atypes, ccharge_t *D_ccharges, charge_t *D_charges, topo_t topo); __global__ void calc_qp_dvel_vector_row(int n_qatoms, int n_patoms, dvel_t *DV_X, calc_qp_t *QP_MAT, q_atom_t *D_qatoms); __global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t *DV_X, calc_qp_t *QP_MAT, p_atom_t *D_patoms); -__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal); +__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal); void clean_d_qatoms(); diff --git a/src/core/restraints.cu b/src/core/restraints.cu index 56d20ef5..132d4f83 100644 --- a/src/core/restraints.cu +++ b/src/core/restraints.cu @@ -6,9 +6,9 @@ #include void calc_radix_w_forces() { - double b, db, ener, dv, fexp; + float b, db, ener, dv, fexp; coord_t dr; - double shift; + float shift; if (md.radial_force != 0) { shift = sqrt(Boltz * Tfree / md.radial_force); @@ -52,12 +52,12 @@ void calc_radix_w_forces() { void calc_polx_w_forces(int iteration) { int wi, imin, jw, ii, iis, jmin; - double tmin; + float tmin; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; - double rm, rc; - double cos_th; - double avtdum, arg, f0, dv; - double ener; + float rm, rc; + float cos_th; + float avtdum, arg, f0, dv; + float ener; for (int is = 0; is < n_shells; is++) { wshells[is].n_inshell = 0; @@ -129,8 +129,8 @@ void calc_polx_w_forces(int iteration) { if (iteration != 0 && iteration % itdis_update == 0) { for (int is = 0; is < n_shells; is++) { printf("SHELL %d\n", is); - wshells[is].avtheta /= (double) itdis_update; - wshells[is].avn_inshell /= (double) itdis_update; + wshells[is].avtheta /= (float) itdis_update; + wshells[is].avn_inshell /= (float) itdis_update; wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb); printf("average theta = %f, average in shell = %f, theta_corr = %f\n", wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI); @@ -148,7 +148,7 @@ void calc_polx_w_forces(int iteration) { avtdum = 0; for (int il = 0; il < wshells[is].n_inshell; il++) { ii = nsort[il][is]; - arg = 1 + ((1 - 2 * (double) (il+1)) / (double) wshells[is].n_inshell); + arg = 1 + ((1 - 2 * (float) (il+1)) / (float) wshells[is].n_inshell); theta0[il] = acos(arg); theta0[il] = theta0[il] - 3 * sin(theta0[il]) * wshells[is].cstb / 2; if (theta0[il] < 0) theta0[il] = 0; @@ -184,7 +184,7 @@ void calc_polx_w_forces(int iteration) { if (cos_th > 1) cos_th = 1; if (cos_th < -1) cos_th = -1; f0 = sin(acos(cos_th)); - if (abs(f0) < 1.0E-12) f0 = 1.0E-12; + if (abs(f0) < 1.0E-6) f0 = 1.0E-6; f0 = -1.0 / f0; f0 *= dv; @@ -213,14 +213,14 @@ void calc_polx_w_forces(int iteration) { dvelocities[wi+2].z += f0 * (f1H2.z); } - wshells[is].avtheta += avtdum / (double) wshells[is].n_inshell; + wshells[is].avtheta += avtdum / (float) wshells[is].n_inshell; wshells[is].avn_inshell += wshells[is].n_inshell; } } void calc_pshell_forces() { coord_t dr; - double k, r2, ener; + float k, r2, ener; for (int i = 0; i < n_atoms_solute; i++) { if (shell[i] || excluded[i]) { @@ -250,9 +250,9 @@ void calc_pshell_forces() { // sequence restraints (independent of Q-state) void calc_restrseq_forces() { - double k, mass, totmass; + float k, mass, totmass; coord_t dr; - double r2, ener; + float r2, ener; for (int s = 0; s < n_restrseqs; s++) { k = restrseqs[s].k; @@ -349,7 +349,7 @@ void calc_restrseq_forces() { void calc_restrpos_forces() { int state, i; coord_t dr; - double lambda, ener, x2, y2, z2; + float lambda, ener, x2, y2, z2; for (int ir = 0; ir < n_restrspos; ir++) { state = restrspos[ir].ipsi-1; @@ -396,7 +396,7 @@ void calc_restrpos_forces() { void calc_restrdis_forces() { int state, i, j; coord_t dr; - double lambda, b, db, dv, ener; + float lambda, b, db, dv, ener; for (int ir = 0; ir < n_restrdists; ir++) { state = restrdists[ir].ipsi-1; @@ -454,8 +454,8 @@ void calc_restrdis_forces() { void calc_restrang_forces() { int state, i, j, k; coord_t dr, dr2, di, dk; - double lambda, r2ij, r2jk, rij, rjk, cos_th, th; - double dth, dv, ener, f1; + float lambda, r2ij, r2jk, rij, rjk, cos_th, th; + float dth, dv, ener, f1; for (int ir = 0; ir < n_restrangs; ir++) { state = restrangs[ir].ipsi-1; @@ -539,7 +539,7 @@ void calc_restrang_forces() { // extra half-harmonic wall restraints void calc_restrwall_forces() { - double k, b, db, ener, dv, fexp; + float k, b, db, ener, dv, fexp; coord_t dr; for (int ir = 0; ir < n_restrwalls; ir++) { diff --git a/src/core/shake.cu b/src/core/shake.cu index 645c0e3a..fa76d5bf 100644 --- a/src/core/shake.cu +++ b/src/core/shake.cu @@ -6,7 +6,7 @@ int calc_shake_constraints(coord_t *coords, coord_t *xcoords) { int ai, aj, n_iterations, total_iterations, shake; - double xij2, diff, corr, scp, xxij2; + float xij2, diff, corr, scp, xxij2; coord_t xij, xxij; shake = 0; @@ -102,8 +102,8 @@ void initial_shaking() { } void stop_cm_translation() { - double total_mass = 0; - double rmass = 0; + float total_mass = 0; + float rmass = 0; coord_t vcm; vcm.x = 0; vcm.y = 0; diff --git a/src/core/solvent.cu b/src/core/solvent.cu index 180152bc..33604898 100644 --- a/src/core/solvent.cu +++ b/src/core/solvent.cu @@ -16,20 +16,20 @@ dvel_t *DV_W; calc_ww_t *WW_MAT, *h_WW_MAT; calc_pw_t *PW_MAT, *h_PW_MAT; -double *D_WW_Evdw, *D_WW_Ecoul, *h_WW_Evdw, *h_WW_Ecoul; -double *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT; +float *D_WW_Evdw, *D_WW_Ecoul, *h_WW_Evdw, *h_WW_Ecoul; +float *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT; -double *D_PW_Evdw, *D_PW_Ecoul, *h_PW_Evdw, *h_PW_Ecoul; -double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT; +float *D_PW_Evdw, *D_PW_Ecoul, *h_PW_Evdw, *h_PW_Ecoul; +float *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT; // Constants pointers bool pw_gpu_set = false; //ONLY call if there are actually solvent atoms, or get segfaulted void calc_nonbonded_ww_forces() { - double rOX, rH1X, rH2X, r2; + float rOX, rH1X, rH2X, r2; coord_t dOX, dH1X, dH2X; - double Vel, V_a, V_b, dv; + float Vel, V_a, V_b, dv; // Initialize water constants if (A_OO == 0) { @@ -47,11 +47,11 @@ void calc_nonbonded_ww_forces() { crg_hw = ccharge_hw.charge; } - // double ecoul_block = 0, evdw_block = 0; + // float ecoul_block = 0, evdw_block = 0; for (int i = n_atoms_solute; i < n_atoms; i+=3) { for (int j = i+3; j < n_atoms; j+=3) { - double ecoul = 0, evdw = 0; + float ecoul = 0, evdw = 0; // if (excluded[i] || excluded[j]) continue; // --- O - (O,H1,H2) --- dOX.x = coords[j].x - coords[i].x; @@ -286,9 +286,9 @@ void calc_nonbonded_ww_forces() { __device__ __forceinline__ void calculate_unforce_bound( const int y, const int x, const coord_t &q, const coord_t &p, - const topo_t &D_topo, const double &crg_ow, const double &crg_hw, - const double &A_OO, const double &B_OO, double &evdw, double &ecoul, - double &dv, double &tmpx, double &tmpy, double &tmpz) { + const topo_t &D_topo, const float &crg_ow, const float &crg_hw, + const float &A_OO, const float &B_OO, float &evdw, float &ecoul, + float &dv, float &tmpx, float &tmpy, float &tmpz) { int belong_y = y / 3; int belong_x = x / 3; if (belong_y == belong_x) { @@ -302,16 +302,16 @@ __device__ __forceinline__ void calculate_unforce_bound( tmpx = p.x - q.x; tmpy = p.y - q.y; tmpz = p.z - q.z; - // double inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2)); - double inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz); - double inv_dis2 = inv_dis * inv_dis; + // float inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2)); + float inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz); + float inv_dis2 = inv_dis * inv_dis; ecoul = inv_dis * D_topo.coulomb_constant * (y_is_o ? crg_ow : crg_hw) * (x_is_o ? crg_ow : crg_hw); - double v_a = 0, v_b = 0; + float v_a = 0, v_b = 0; if (y_is_o && x_is_o) { - double inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2; - double inv_dis12 = inv_dis6 * inv_dis6; + float inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2; + float inv_dis12 = inv_dis6 * inv_dis6; v_a = A_OO * inv_dis12; v_b = B_OO * inv_dis6; evdw = v_a - v_b; @@ -324,10 +324,10 @@ __device__ __forceinline__ void calculate_unforce_bound( template __global__ void -calc_ww(const int N, const double crg_ow, const double crg_hw, - const double A_OO, const double B_OO, const topo_t D_topo, +calc_ww(const int N, const float crg_ow, const float crg_hw, + const float A_OO, const float B_OO, const topo_t D_topo, coord_t *__restrict__ W, dvel_t *__restrict__ DV_W, - double *__restrict__ Evdw_TOT, double *__restrict__ ecoul_TOT) { + float *__restrict__ Evdw_TOT, float *__restrict__ ecoul_TOT) { // Calculate block boundaries int NX = N; int NY = (N + 1) / 2; @@ -341,11 +341,11 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, // Shared memory declarations with padding to avoid bank conflicts __shared__ coord_t p[2 * Thread_x * Block_x + 1]; __shared__ coord_t q[2 * Thread_y * Block_y + 1]; - __shared__ double sum_row_x[2 * Thread_y * Block_y + 1]; - __shared__ double sum_row_y[2 * Thread_y * Block_y + 1]; - __shared__ double sum_row_z[2 * Thread_y * Block_y + 1]; + __shared__ float sum_row_x[2 * Thread_y * Block_y + 1]; + __shared__ float sum_row_y[2 * Thread_y * Block_y + 1]; + __shared__ float sum_row_z[2 * Thread_y * Block_y + 1]; - __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32], + __shared__ float block_ecoul[(Thread_x * Thread_y + 31) / 32], block_evdw[(Thread_x * Thread_y + 31) / 32]; // Thread indices @@ -377,7 +377,7 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, } __syncthreads(); // Initialize column sums - double sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x]; + float sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x]; #pragma unroll for (int i = 0; i < Block_x; i++) { sum_col_x[i] = sum_col_x[Block_x + i] = 0; @@ -386,7 +386,7 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, } // Main computation loop with reduced thread divergence - double evdw_tot = 0, ecoul_tot = 0; + float evdw_tot = 0, ecoul_tot = 0; #pragma unroll for (int i = 0; i < Block_x; i++) { int i2 = i; @@ -406,8 +406,8 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, // Optimized condition check if (y >= x && (N % 2 == 0 || y != NY - 1)) { int offset_y = y - block_y_left_begin; - double evdw = 0, ecoul = 0, dv = 0; - double tmpx = 0, tmpy = 0, tmpz = 0; + float evdw = 0, ecoul = 0, dv = 0; + float tmpx = 0, tmpy = 0, tmpz = 0; int y2 = N - 1 - y; int x2 = N - x; @@ -417,9 +417,9 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, evdw_tot += evdw; ecoul_tot += ecoul; - double v_x = dv * tmpx; - double v_y = dv * tmpy; - double v_z = dv * tmpz; + float v_x = dv * tmpx; + float v_y = dv * tmpy; + float v_z = dv * tmpz; sum_col_x[Block_x + i2] += v_x; sum_col_y[Block_x + i2] += v_y; @@ -429,8 +429,8 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, sum_row_z[y_cal_num + offset_y] -= v_z; } else if (y < x) { int offset_y = y - block_y_left_begin; - double evdw = 0, ecoul = 0, dv = 0; - double tmpx = 0, tmpy = 0, tmpz = 0; + float evdw = 0, ecoul = 0, dv = 0; + float tmpx = 0, tmpy = 0, tmpz = 0; calculate_unforce_bound(y, x, q[offset_y], now_p0, D_topo, crg_ow, crg_hw, A_OO, B_OO, evdw, ecoul, dv, tmpx, tmpy, @@ -438,9 +438,9 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, evdw_tot += evdw; ecoul_tot += ecoul; - double v_x = dv * tmpx; - double v_y = dv * tmpy; - double v_z = dv * tmpz; + float v_x = dv * tmpx; + float v_y = dv * tmpy; + float v_z = dv * tmpz; sum_col_x[i2] += v_x; sum_col_y[i2] += v_y; @@ -470,8 +470,8 @@ calc_ww(const int N, const double crg_ow, const double crg_hw, // Final reduction if (warp_id == 0) { - double val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0; - double val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_evdw[lane_id] : 0; + float val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0; + float val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_evdw[lane_id] : 0; #pragma unroll for (unsigned w = 16; w >= 1; w /= 2) { @@ -546,8 +546,8 @@ void calc_nonbonded_ww_forces_host() { check_cudaMalloc((void **) &W, mem_size_W); check_cudaMalloc((void **) &DV_W, mem_size_DV_W); - check_cudaMalloc((void **) &D_WW_evdw_TOT, sizeof(double)); - check_cudaMalloc((void **) &D_WW_ecoul_TOT, sizeof(double)); + check_cudaMalloc((void **) &D_WW_evdw_TOT, sizeof(float)); + check_cudaMalloc((void **) &D_WW_ecoul_TOT, sizeof(float)); } WW_evdw_TOT = 0; @@ -555,9 +555,9 @@ void calc_nonbonded_ww_forces_host() { cudaMemcpy(W, &coords[n_atoms_solute], mem_size_W, cudaMemcpyHostToDevice); cudaMemcpy(DV_W, &dvelocities[n_atoms_solute], mem_size_DV_W, cudaMemcpyHostToDevice); - cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(double), + cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(double), + cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(float), cudaMemcpyHostToDevice); // Optimize thread block configuration @@ -582,9 +582,9 @@ void calc_nonbonded_ww_forces_host() { cudaDeviceSynchronize(); cudaMemcpy(&dvelocities[n_atoms_solute], DV_W, mem_size_DV_W, cudaMemcpyDeviceToHost); - cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(double), + cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(double), + cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost); E_nonbond_ww.Uvdw += WW_evdw_TOT; E_nonbond_ww.Ucoul += WW_ecoul_TOT; @@ -592,11 +592,11 @@ void calc_nonbonded_ww_forces_host() { void calc_nonbonded_pw_forces() { coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double qi, qj; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2a, ra, r6a; + float Vela, V_a, V_b; + float dva; + float qi, qj; + float ai_aii, aj_aii, ai_bii, aj_bii; catype_t ai_type, aj_type; int i; @@ -655,8 +655,8 @@ void calc_nonbonded_pw_forces_host() { int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; int n_blocks_w = (3*n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; - int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double); - int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double); + int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(float); + int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(float); if (!pw_gpu_set) { pw_gpu_set = true; @@ -684,8 +684,8 @@ void calc_nonbonded_pw_forces_host() { #endif check_cudaMalloc((void**) &D_PW_Ecoul, mem_size_PW_Ecoul); - check_cudaMalloc((void**) &D_PW_evdw_TOT, sizeof(double)); - check_cudaMalloc((void**) &D_PW_ecoul_TOT, sizeof(double)); + check_cudaMalloc((void**) &D_PW_evdw_TOT, sizeof(float)); + check_cudaMalloc((void**) &D_PW_ecoul_TOT, sizeof(float)); #ifdef DEBUG printf("All GPU solvent memory allocated\n"); @@ -730,8 +730,8 @@ void calc_nonbonded_pw_forces_host() { calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false); - cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(float), cudaMemcpyDeviceToHost); E_nonbond_pw.Uvdw += PW_evdw_TOT; E_nonbond_pw.Ucoul += PW_ecoul_TOT; @@ -754,12 +754,12 @@ __device__ void set_water(int n_waters, int row, int column, dvel_t *val, calc_w MAT[column + n_waters * row].H2 = val[2]; } -__device__ void calc_ww_dvel_matrix_incr(int row, int column, double crg_ow, double crg_hw, double A_OO, double B_OO, - coord_t *Xs, coord_t *Ys, double *Evdw, double *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t D_topo) { - double rOX, rH1X, rH2X, r2; +__device__ void calc_ww_dvel_matrix_incr(int row, int column, float crg_ow, float crg_hw, float A_OO, float B_OO, + coord_t *Xs, coord_t *Ys, float *Evdw, float *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t D_topo) { + float rOX, rH1X, rH2X, r2; coord_t dOX, dH1X, dH2X; - double Vel, V_a, V_b, dv; - double tempX, tempY, tempZ; + float Vel, V_a, V_b, dv; + float tempX, tempY, tempZ; int i = 3 * row; int j = 3 * column; @@ -991,8 +991,8 @@ __device__ void calc_ww_dvel_matrix_incr(int row, int column, double crg_ow, dou water_b[wj].z += (dv * dH2X.z); } -__global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, double A_OO, double B_OO, - coord_t *W, double *Evdw, double *Ecoul, calc_ww_t *MAT, topo_t D_topo) { +__global__ void calc_ww_dvel_matrix(int n_waters, float crg_ow, float crg_hw, float A_OO, float B_OO, + coord_t *W, float *Evdw, float *Ecoul, calc_ww_t *MAT, topo_t D_topo) { // Block index int bx = blockIdx.x; int by = blockIdx.y; @@ -1001,8 +1001,8 @@ __global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, int tx = threadIdx.x; int ty = threadIdx.y; - __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; - __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -1057,7 +1057,7 @@ __global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, // } if (bx != by || tx != ty) { - double evdw = 0, ecoul = 0; + float evdw = 0, ecoul = 0; calc_ww_dvel_matrix_incr(ty, tx, crg_ow, crg_hw, A_OO, B_OO, Xs, Ys, &evdw, &ecoul, water_a, water_b, D_topo); Evdw_S[ty][tx] = evdw; Ecoul_S[ty][tx] = ecoul; @@ -1078,8 +1078,8 @@ __global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; if (tx == 0 && ty == 0) { - double tot_Evdw = 0; - double tot_Ecoul = 0; + float tot_Evdw = 0; + float tot_Ecoul = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { tot_Evdw += Evdw_S[i][j]; @@ -1153,15 +1153,15 @@ __global__ void calc_ww_dvel_vector(int n_waters, dvel_t *DV_W, calc_ww_t *MAT) // P-W interactions __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute, - coord_t *Xs, coord_t *Ws, bool *excluded_s, double *Evdw, double *Ecoul, calc_pw_t *pw, + coord_t *Xs, coord_t *Ws, bool *excluded_s, float *Evdw, float *Ecoul, calc_pw_t *pw, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t D_topo) { coord_t da; - double r2a, ra, r6a; - double Vela, V_a, V_b; - double dva; - double qi, qj; - double ai_aii, aj_aii, ai_bii, aj_bii; + float r2a, ra, r6a; + float Vela, V_a, V_b; + float dva; + float qi, qj; + float ai_aii, aj_aii, ai_bii, aj_bii; catype_t ai_type, aj_type; atype_t i_type, j_type; @@ -1214,7 +1214,7 @@ __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int } __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters, - coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_pw_t *PW_MAT, + coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_pw_t *PW_MAT, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, bool *D_excluded, topo_t D_topo) { // Block index int bx = blockIdx.x; @@ -1224,8 +1224,8 @@ __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_wate int tx = threadIdx.x; int ty = threadIdx.y; - __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; - __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; Ecoul_S[ty][tx] = 0; Evdw_S[ty][tx] = 0; @@ -1280,9 +1280,9 @@ __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_wate // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms, - // coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw, + // coord_t *Ps, coord_t *Xs, float *Evdw, float *Ecoul, calc_pw_t *pw, // ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms) - double evdw = 0, ecoul = 0; + float evdw = 0, ecoul = 0; calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); Evdw_S[ty][tx] = evdw; Ecoul_S[ty][tx] = ecoul; @@ -1301,8 +1301,8 @@ __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_wate int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; if (tx == 0 && ty == 0) { - double tot_Evdw = 0; - double tot_Ecoul = 0; + float tot_Evdw = 0; + float tot_Ecoul = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { tot_Evdw += Evdw_S[i][j]; @@ -1365,16 +1365,16 @@ __global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t *D } // General -__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal) { +__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal) { int tx = threadIdx.x; int ty = threadIdx.y; - __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; - __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; //TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work - double coul_TOT = 0; - double vdw_TOT = 0; + float coul_TOT = 0; + float vdw_TOT = 0; for (int i = ty; i < rows; i+= BLOCK_SIZE) { for (int j = tx; j < columns; j+= BLOCK_SIZE) { if (i <= j || !upper_diagonal) { @@ -1389,8 +1389,8 @@ __global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double __syncthreads(); if (tx == 0 && ty == 0) { - double Evdw_temp = 0; - double Ecoul_temp = 0; + float Evdw_temp = 0; + float Ecoul_temp = 0; for (int i = 0; i < BLOCK_SIZE; i++) { for (int j = 0; j < BLOCK_SIZE; j++) { diff --git a/src/core/solvent.h b/src/core/solvent.h index 1234751d..80bfebd1 100644 --- a/src/core/solvent.h +++ b/src/core/solvent.h @@ -39,21 +39,21 @@ extern bool *D_excluded; // W-W interactions __device__ void set_water(int n_waters, int row, int column, dvel_t *val, calc_ww_t *MAT); -__device__ void calc_ww_dvel_matrix_incr(int row, int column, double crg_ow, double crg_hw, double A_OO, double B_OO, - coord_t *Xs, coord_t *Ys, double *Evdw, double *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t topo); +__device__ void calc_ww_dvel_matrix_incr(int row, int column, float crg_ow, float crg_hw, float A_OO, float B_OO, + coord_t *Xs, coord_t *Ys, float *Evdw, float *Ecoul, dvel_t *water_a, dvel_t *water_b, topo_t topo); -__global__ void calc_ww_dvel_matrix(int n_waters, double crg_ow, double crg_hw, double A_OO, double B_OO, - coord_t *X, double *Evdw, double *Ecoul, calc_ww_t *MAT, topo_t topo); +__global__ void calc_ww_dvel_matrix(int n_waters, float crg_ow, float crg_hw, float A_OO, float B_OO, + coord_t *X, float *Evdw, float *Ecoul, calc_ww_t *MAT, topo_t topo); __global__ void calc_ww_dvel_vector(int n_waters, dvel_t *DV, calc_ww_t *MAT); // P-W interactions __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int wj, int n_atoms_solute, - coord_t *Ps, coord_t *Ws, bool *excluded_s, double *Evdw, double *Ecoul, calc_pw_t *pw, + coord_t *Ps, coord_t *Ws, bool *excluded_s, float *Evdw, float *Ecoul, calc_pw_t *pw, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, topo_t topo); __global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters, - coord_t *X, coord_t *W, double *Evdw, double *Ecoul, calc_pw_t *PW_MAT, + coord_t *X, coord_t *W, float *Evdw, float *Ecoul, calc_pw_t *PW_MAT, ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms, bool *D_excluded, topo_t topo); __global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t *DV_X, dvel_t *DV_W, calc_pw_t *PW_MAT, p_atom_t *D_patoms); @@ -61,7 +61,7 @@ __global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t *DV_X __global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t *DV_X, dvel_t *DV_W, calc_pw_t *PW_MAT); // General -__global__ void calc_energy_sum(int rows, int columns, double *Evdw_TOT, double *Ecoul_TOT, double *Evdw, double *Ecoul, bool upper_diagonal); +__global__ void calc_energy_sum(int rows, int columns, float *Evdw_TOT, float *Ecoul_TOT, float *Evdw, float *Ecoul, bool upper_diagonal); void clean_d_solvent(); diff --git a/src/core/system.cu b/src/core/system.cu index 18f9944c..6cd52ef3 100644 --- a/src/core/system.cu +++ b/src/core/system.cu @@ -28,7 +28,7 @@ int n_waters; int n_molecules = 0; char base_folder[1024]; -double dt, tau_T; +float dt, tau_T; bool run_gpu = false; @@ -85,7 +85,7 @@ int *LJ_matrix; bool *excluded; bool *heavy; int *molecules; -double *winv; +float *winv; cgrp_t *charge_groups; topo_t topo; @@ -96,7 +96,7 @@ topo_t topo; */ int n_lambdas; -double *lambdas; +float *lambdas; int n_qangcouples; int n_qangles; @@ -340,23 +340,23 @@ void exclude_shaken_definitions() { coord_t* coords; vel_t* velocities; dvel_t* dvelocities; -double Temp = 0; -double Texcl = 0; -double Tfree = 0; -double Tscale = 1; +float Temp = 0; +float Texcl = 0; +float Tfree = 0; +float Tscale = 1; // Shake constrains coord_t* xcoords; // Water constants -double A_O = 0, A_OO = 0, B_O, B_OO, crg_ow, crg_hw, mu_w = 0; +float A_O = 0, A_OO = 0, B_O, B_OO, crg_ow, crg_hw, mu_w = 0; void init_velocities() { velocities = (vel_t*) malloc(n_atoms * sizeof(vel_t)); // If not previous value set, use a Maxwell distribution to fill velocities - double kT = Boltz * md.initial_temperature; - double sd, mass; + float kT = Boltz * md.initial_temperature; + float sd, mass; for (int i = 0; i < n_atoms; i++) { mass = catypes[atypes[i].code - 1].m; sd = sqrt(kT / mass); @@ -376,7 +376,7 @@ void init_xcoords() { } void init_inv_mass() { - winv = (double*) malloc(n_atoms * sizeof(double)); + winv = (float*) malloc(n_atoms * sizeof(float)); for (int ai = 0; ai < n_atoms; ai++) { winv[ai] = 1 / catypes[atypes[ai].code-1].m; } @@ -402,11 +402,11 @@ restrdis_t *restrdists; restrang_t *restrangs; restrwall_t* restrwalls; -double crgQtot = 0; -double Dwmz, awmz; +float crgQtot = 0; +float Dwmz, awmz; // Shell layout. Defined once per run -double *theta, *theta0, *tdum; //array size n_waters +float *theta, *theta0, *tdum; //array size n_waters int n_max_inshell, n_shells; int **list_sh, **nsort; // array size (n_max_inshell, n_shells) shell_t* wshells; @@ -427,7 +427,7 @@ void init_water_sphere() { //ONLY call if there are actually solvent atoms, or get segfaulted void init_wshells() { int n_inshell; - double drs, router, ri, dr, Vshell, rshell; + float drs, router, ri, dr, Vshell, rshell; if (mu_w == 0) { // Get water properties from first water molecule cbond_t cbondw = cbonds[bonds[n_atoms_solute].code-1]; @@ -479,9 +479,9 @@ void init_wshells() { n_max_inshell = n_waters; // Make largest a little bigger just in case // Initialize arrays needed for bookkeeping - theta = (double*) malloc(n_waters * sizeof(double)); - theta0 = (double*) malloc(n_waters * sizeof(double)); - tdum = (double*) malloc(n_waters * sizeof(double)); + theta = (float*) malloc(n_waters * sizeof(float)); + theta0 = (float*) malloc(n_waters * sizeof(float)); + tdum = (float*) malloc(n_waters * sizeof(float)); list_sh = (int**) malloc(n_max_inshell * sizeof(int*)); nsort = (int**) malloc(n_max_inshell * sizeof(int*)); @@ -493,7 +493,7 @@ void init_wshells() { } void init_pshells() { - double mass, r2, rin2; + float mass, r2, rin2; heavy = (bool*) calloc(n_atoms, sizeof(bool)); shell = (bool*) calloc(n_atoms, sizeof(bool)); @@ -529,7 +529,7 @@ void init_pshells() { } void init_pshells_with_charge_groups() { - double mass, r2, rin2; + float mass, r2, rin2; heavy = (bool*) calloc(n_atoms, sizeof(bool)); shell = (bool*) calloc(n_atoms, sizeof(bool)); @@ -595,7 +595,7 @@ void init_shake() { int mol = 0; int shake; int n_solute_shake_constraints = 0; - double excl_shake = 0; + float excl_shake = 0; n_shake_constraints = 0; mol_n_shakes = (int*) calloc(n_molecules, sizeof(int)); @@ -704,18 +704,18 @@ void init_patoms() { * ============================================= */ -double Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute; -double Tscale_solute = 1, Tscale_solvent = 1; +float Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute; +float Tscale_solute = 1, Tscale_solvent = 1; void calc_temperature() { printf("Ndegf = %f, Ndegfree = %f, n_excluded = %d, Ndegfree_solvent = %f, Ndegfree_solute = %f\n", Ndegf, Ndegfree, n_excluded, Ndegfree_solvent, Ndegfree_solute); Temp = 0; Tfree = 0; - double Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0; - double Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0; - double Ekinmax = 1000.0 * Ndegf * Boltz * md.temperature / 2.0 / n_atoms; - double ener; - double mass_i; + float Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0; + float Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0; + float Ekinmax = 1000.0 * Ndegf * Boltz * md.temperature / 2.0 / n_atoms; + float ener; + float mass_i; Temp = 0; for (int i = 0; i < n_atoms_solute; i++) { @@ -777,7 +777,7 @@ void calc_temperature() { */ void calc_leapfrog() { - double mass_i, winv_i; + float mass_i, winv_i; for (int i = 0; i < n_atoms_solute; i++) { mass_i = catypes[atypes[i].code - 1].m; @@ -1226,17 +1226,17 @@ void calc_integration_step(int iteration) { // Profiler info #ifdef __PROFILING__ - printf("Elapsed time for bonded forces: %f\n", (end_bonded-start) / (double)CLOCKS_PER_SEC ); - printf("Elapsed time for non-bonded forces: %f\n", (end_nonbonded-end_bonded) / (double)CLOCKS_PER_SEC); - printf("Elapsed time for pp interactions: %f\n", (end_pp-start_pp) / (double)CLOCKS_PER_SEC ); - printf("Elapsed time for qq interaction: %f\n", (end_qq-start_qq) / (double)CLOCKS_PER_SEC ); - printf("Elapsed time for qp interaction: %f\n", (end_qp-start_qp) / (double)CLOCKS_PER_SEC ); + printf("Elapsed time for bonded forces: %f\n", (end_bonded-start) / (float)CLOCKS_PER_SEC ); + printf("Elapsed time for non-bonded forces: %f\n", (end_nonbonded-end_bonded) / (float)CLOCKS_PER_SEC); + printf("Elapsed time for pp interactions: %f\n", (end_pp-start_pp) / (float)CLOCKS_PER_SEC ); + printf("Elapsed time for qq interaction: %f\n", (end_qq-start_qq) / (float)CLOCKS_PER_SEC ); + printf("Elapsed time for qp interaction: %f\n", (end_qp-start_qp) / (float)CLOCKS_PER_SEC ); if (n_waters > 0) { - printf("Elapsed time for ww interactions: %f\n", (end_ww-start_ww) / (double)CLOCKS_PER_SEC ); - printf("Elapsed time for pw interactions: %f\n", (end_pw-start_pw) / (double)CLOCKS_PER_SEC ); + printf("Elapsed time for ww interactions: %f\n", (end_ww-start_ww) / (float)CLOCKS_PER_SEC ); + printf("Elapsed time for pw interactions: %f\n", (end_pw-start_pw) / (float)CLOCKS_PER_SEC ); } printf("---\n"); - printf("Elapsed time for entire time-step: %f\n", (end_calculation-start) / (double)CLOCKS_PER_SEC); + printf("Elapsed time for entire time-step: %f\n", (end_calculation-start) / (float)CLOCKS_PER_SEC); #endif /* __PROFILING__ */ } diff --git a/src/core/system.h b/src/core/system.h index 24161423..db2ec269 100644 --- a/src/core/system.h +++ b/src/core/system.h @@ -64,7 +64,7 @@ extern int n_molecules; extern char base_folder[1024]; -extern double dt; +extern float dt; extern bool run_gpu; @@ -76,29 +76,29 @@ extern bool run_gpu; struct md_t { // [MD] int steps; - double stepsize; - double temperature; + float stepsize; + float temperature; char thermostat[40]; - double bath_coupling; + float bath_coupling; int random_seed; - double initial_temperature; + float initial_temperature; bool shake_solvent; bool shake_solute; bool shake_hydrogens; bool lrf; bool charge_groups; // [cut-offs] - double solute_solute; - double solvent_solvent; - double solute_solvent; - double q_atom; + float solute_solute; + float solvent_solvent; + float solute_solvent; + float q_atom; // [sphere] - double shell_radius; // Note: this is for the pshell - double shell_force; // Note: this is for the pshell + float shell_radius; // Note: this is for the pshell + float shell_force; // Note: this is for the pshell // [solvent] - double radial_force; + float radial_force; bool polarisation; - double polarisation_force; + float polarisation_force; // [intervals] int non_bond; int output; @@ -121,9 +121,9 @@ extern bool separate_scaling; */ struct coord_t { - double x; - double y; - double z; + float x; + float y; + float z; }; struct bond_t { @@ -134,8 +134,8 @@ struct bond_t { struct cbond_t { int code; - double kb; - double b0; + float kb; + float b0; }; struct angle_t { @@ -147,8 +147,8 @@ struct angle_t { struct cangle_t { int code; - double kth; - double th0; + float kth; + float th0; }; struct torsion_t { @@ -161,9 +161,9 @@ struct torsion_t { struct ctorsion_t { int code; - double k; - double n; - double d; + float k; + float n; + float d; }; struct improper_t { @@ -176,8 +176,8 @@ struct improper_t { struct cimproper_t { int code; - double k; - double phi0; + float k; + float phi0; }; struct charge_t { @@ -187,7 +187,7 @@ struct charge_t { struct ccharge_t { int code; - double charge; + float charge; }; struct atype_t { @@ -197,23 +197,23 @@ struct atype_t { struct catype_t { int code; - double m; - double aii_normal; - double bii_normal; - double aii_polar; - double bii_polar; - double aii_1_4; - double bii_1_4; + float m; + float aii_normal; + float bii_normal; + float aii_polar; + float bii_polar; + float aii_1_4; + float bii_1_4; }; struct topo_t { int solvent_type; - double exclusion_radius; - double solvent_radius; + float exclusion_radius; + float solvent_radius; coord_t solute_center; coord_t solvent_center; - double el14_scale; - double coulomb_constant; + float el14_scale; + float coulomb_constant; }; struct cgrp_t { @@ -263,7 +263,7 @@ extern int *LJ_matrix; extern torsion_t *torsions; extern bool *excluded; extern bool *heavy; -extern double *winv; +extern float *winv; extern cgrp_t *charge_groups; extern int *molecules; @@ -274,7 +274,7 @@ extern int *molecules; */ extern int n_lambdas; -extern double *lambdas; +extern float *lambdas; struct q_angcouple_t { int acode; @@ -303,45 +303,45 @@ struct q_bond_t { }; struct q_cangle_t { - double kth; - double th0; + float kth; + float th0; }; struct q_catype_t { char name[10]; - double Ai; - double Bi; - double Ci; - double ai; - double Ai_14; - double Bi_14; - double m; + float Ai; + float Bi; + float Ci; + float ai; + float Ai_14; + float Bi_14; + float m; }; struct q_cbond_t { - double kb; - double b0; + float kb; + float b0; }; struct q_charge_t { - double q; + float q; }; struct q_cimproper_t { - double k; - double phi0; + float k; + float phi0; }; struct q_ctorsion_t { - double k; - double n; - double d; + float k; + float n; + float d; }; struct q_elscale_t { int qi; int qj; - double mu; + float mu; }; struct q_exclpair_t { @@ -368,18 +368,18 @@ struct q_offdiag_t { int j; int qk; int ql; - double Aij; - double muij; + float Aij; + float muij; }; struct q_shake_t { int ai; int aj; - double dist; + float dist; }; struct q_softcore_t { - double s; + float s; }; struct q_softpair_t { @@ -451,7 +451,7 @@ extern q_torsion_t *q_torsions; struct restrseq_t { int ai; int aj; - double k; + float k; bool ih; int to_center; // Flag for restraining to geom. or mass center }; @@ -466,21 +466,21 @@ struct restrpos_t { struct restrdis_t { int ai, aj; int ipsi; - double d1, d2; - double k; + float d1, d2; + float k; char itext[20], jtext[20]; }; struct restrang_t { int ai, aj, ak; int ipsi; - double ang; - double k; + float ang; + float k; }; struct restrwall_t { int ai, aj; - double d, k, aMorse, dMorse; + float d, k, aMorse, dMorse; bool ih; }; @@ -505,20 +505,20 @@ void init_restrseqs(char* filename); struct shell_t { int n_inshell; - double theta_corr; - double avtheta; - double avn_inshell; - double router; - double dr; - double cstb; + float theta_corr; + float avtheta; + float avn_inshell; + float router; + float dr; + float cstb; }; // Total energy in the system. Defined once per run -extern double crgQtot; -extern double Dwmz, awmz; +extern float crgQtot; +extern float Dwmz, awmz; // Water shell layout. Defined once per run -extern double *theta, *theta0, *tdum; //array size n_waters +extern float *theta, *theta0, *tdum; //array size n_waters extern int n_max_inshell, n_shells; extern int **list_sh, **nsort; // array size (n_max_inshell, n_shells) extern shell_t* wshells; @@ -534,7 +534,7 @@ void init_wshells(); struct shake_bond_t { int ai; int aj; - double dist2; + float dist2; bool ready; }; @@ -553,42 +553,42 @@ struct p_atom_t { }; struct vel_t { - double x; - double y; - double z; + float x; + float y; + float z; }; struct dvel_t { - double x; - double y; - double z; + float x; + float y; + float z; }; struct E_bonded_t { - double Ubond; - double Uangle; - double Utor; - double Uimp; + float Ubond; + float Uangle; + float Utor; + float Uimp; }; struct E_nonbonded_t { - double Ucoul; - double Uvdw; + float Ucoul; + float Uvdw; }; struct E_restraint_t { - double Uradx; - double Upolx; - double Ufix; - double Ushell; - double Upres; - double Urestr; + float Uradx; + float Upolx; + float Ufix; + float Ushell; + float Upres; + float Urestr; }; struct energy_t { - double Ukin; - double Upot; - double Utot; + float Ukin; + float Upot; + float Utot; }; extern p_atom_t *p_atoms; @@ -601,8 +601,8 @@ extern E_bonded_t E_bond_p, E_bond_w, E_bond_q, *EQ_bond; extern E_nonbonded_t E_nonbond_pp, E_nonbond_pw, E_nonbond_ww, E_nonbond_qx; extern E_nonbonded_t *EQ_nonbond_qq, *EQ_nonbond_qp, *EQ_nonbond_qw, *EQ_nonbond_qx; extern E_restraint_t E_restraint, *EQ_restraint; -extern double Temp, Tfree, Texcl; -extern double A_O, A_OO, B_O, B_OO, crg_ow, crg_hw; // TODO: don't keep this in system.cu? +extern float Temp, Tfree, Texcl; +extern float A_O, A_OO, B_O, B_OO, crg_ow, crg_hw; // TODO: don't keep this in system.cu? void init_velocities(); void init_dvelocities(); @@ -613,7 +613,7 @@ void init_energies(); * ============================================= */ -extern double Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute; +extern float Ndegf, Ndegfree, Ndegf_solvent, Ndegf_solute, Ndegfree_solvent, Ndegfree_solute; void calc_temperature(); /* ============================================= diff --git a/src/core/utils.cu b/src/core/utils.cu index a0fd20f2..009e764b 100644 --- a/src/core/utils.cu +++ b/src/core/utils.cu @@ -3,22 +3,22 @@ // Get a value from a gaussian distributed random variable with // mean mean and standard deviation sd -double gauss(double mean, double sd) { - double v1, v2, nd10; +float gauss(float mean, float sd) { + float v1, v2, nd10; - v1 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. ); - v2 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. ); + v1 = ( (float)(rand()) + 1. )/( (float)(RAND_MAX) + 1. ); + v2 = ( (float)(rand()) + 1. )/( (float)(RAND_MAX) + 1. ); nd10 = cos(2 * M_PI * v2) * sqrt(-2. * log(v1)); return sd * nd10 + mean; } -double to_degrees(double radians) { +float to_degrees(float radians) { return radians * (180.0 / M_PI); } -double to_radians(double degrees) { +float to_radians(float degrees) { return degrees * (M_PI / 180.0); } diff --git a/src/core/utils.h b/src/core/utils.h index 32d50900..4608fe8d 100644 --- a/src/core/utils.h +++ b/src/core/utils.h @@ -6,9 +6,9 @@ * ============================================= */ -double gauss(double mean, double sd); -double to_degrees(double radians); -double to_radians(double degrees); +float gauss(float mean, float sd); +float to_degrees(float radians); +float to_radians(float degrees); /* ============================================= * == DEVICE From 9942e89ec47acc5013feab09bc7a18051331a9cb Mon Sep 17 00:00:00 2001 From: "shen.guo" Date: Wed, 12 Nov 2025 09:36:30 +0100 Subject: [PATCH 3/3] add comment --- src/core/restraints.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/restraints.cu b/src/core/restraints.cu index 132d4f83..768ff40e 100644 --- a/src/core/restraints.cu +++ b/src/core/restraints.cu @@ -184,6 +184,7 @@ void calc_polx_w_forces(int iteration) { if (cos_th > 1) cos_th = 1; if (cos_th < -1) cos_th = -1; f0 = sin(acos(cos_th)); + // After change double to float, need to change 1e-12 to 1e-6, because of precision if (abs(f0) < 1.0E-6) f0 = 1.0E-6; f0 = -1.0 / f0; f0 *= dv;