From a8e1c4fc730ac32e6c69d5407469f55b947f3c99 Mon Sep 17 00:00:00 2001 From: David McKenna Date: Thu, 9 Apr 2020 13:29:42 +0100 Subject: [PATCH 1/5] More recent versions of CUDA have become more strict with cudaMalloc and now require the length to be a size_t or will raise a CUDA error at runtime --- cdmt.cu | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cdmt.cu b/cdmt.cu index 3f43fef..aae1d8a 100644 --- a/cdmt.cu +++ b/cdmt.cu @@ -152,40 +152,40 @@ int main(int argc,char *argv[]) checkCudaErrors(cudaSetDevice(device)); // Allocate memory for complex timeseries - checkCudaErrors(cudaMalloc((void **) &cp1,sizeof(cufftComplex)*nbin*nfft*nsub)); - checkCudaErrors(cudaMalloc((void **) &cp2,sizeof(cufftComplex)*nbin*nfft*nsub)); - checkCudaErrors(cudaMalloc((void **) &cp1p,sizeof(cufftComplex)*nbin*nfft*nsub)); - checkCudaErrors(cudaMalloc((void **) &cp2p,sizeof(cufftComplex)*nbin*nfft*nsub)); + checkCudaErrors(cudaMalloc((void **) &cp1, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub)); + checkCudaErrors(cudaMalloc((void **) &cp2, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub)); + checkCudaErrors(cudaMalloc((void **) &cp1p,(size_t) sizeof(cufftComplex)*nbin*nfft*nsub)); + checkCudaErrors(cudaMalloc((void **) &cp2p,(size_t) sizeof(cufftComplex)*nbin*nfft*nsub)); // Allocate device memory for chirp - checkCudaErrors(cudaMalloc((void **) &dc,sizeof(cufftComplex)*nbin*nsub*ndm)); + checkCudaErrors(cudaMalloc((void **) &dc, (size_t) sizeof(cufftComplex)*nbin*nsub*ndm)); // Allocate device memory for block sums - checkCudaErrors(cudaMalloc((void **) &bs1,sizeof(float)*mblock*mchan)); - checkCudaErrors(cudaMalloc((void **) &bs2,sizeof(float)*mblock*mchan)); + checkCudaErrors(cudaMalloc((void **) &bs1, (size_t) sizeof(float)*mblock*mchan)); + checkCudaErrors(cudaMalloc((void **) &bs2, (size_t) sizeof(float)*mblock*mchan)); // Allocate device memory for channel averages and standard deviations - checkCudaErrors(cudaMalloc((void **) &zavg,sizeof(float)*mchan)); - checkCudaErrors(cudaMalloc((void **) &zstd,sizeof(float)*mchan)); + checkCudaErrors(cudaMalloc((void **) &zavg, (size_t) sizeof(float)*mchan)); + checkCudaErrors(cudaMalloc((void **) &zstd, (size_t) sizeof(float)*mchan)); // Allocate memory for redigitized output and header header=(char *) malloc(sizeof(char)*HEADERSIZE); for (i=0;i<4;i++) { h5buf[i]=(char *) malloc(sizeof(char)*nsamp*nsub); - checkCudaErrors(cudaMalloc((void **) &dh5buf[i],sizeof(char)*nsamp*nsub)); + checkCudaErrors(cudaMalloc((void **) &dh5buf[i], (size_t) sizeof(char)*nsamp*nsub)); } // Allocate output buffers fbuf=(float *) malloc(sizeof(float)*nsamp*nsub); - checkCudaErrors(cudaMalloc((void **) &dfbuf,sizeof(float)*nsamp*nsub)); + checkCudaErrors(cudaMalloc((void **) &dfbuf, (size_t) sizeof(float)*nsamp*nsub)); cbuf=(unsigned char *) malloc(sizeof(unsigned char)*msamp*mchan/ndec); - checkCudaErrors(cudaMalloc((void **) &dcbuf,sizeof(unsigned char)*msamp*mchan/ndec)); + checkCudaErrors(cudaMalloc((void **) &dcbuf, (size_t) sizeof(unsigned char)*msamp*mchan/ndec)); // Allocate DMs and copy to device dm=(float *) malloc(sizeof(float)*ndm); for (idm=0;idm Date: Wed, 13 May 2020 11:39:41 +0100 Subject: [PATCH 2/5] Update cuFFT calls to follow new best practices according to documentation --- cdmt.cu | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cdmt.cu b/cdmt.cu index aae1d8a..b207f14 100644 --- a/cdmt.cu +++ b/cdmt.cu @@ -151,6 +151,18 @@ int main(int argc,char *argv[]) // Set device checkCudaErrors(cudaSetDevice(device)); + // DMcK: cuFFT docs say it's best practice to plan before allocating memory + // cuda-memcheck fails initialisation before this block is run? + // Generate FFT plan (batch in-place forward FFT) + idist=nbin; odist=nbin; iembed=nbin; oembed=nbin; istride=1; ostride=1; + checkCudaErrors(cufftPlanMany(&ftc2cf,1,&nbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nfft*nsub)); + cudaDeviceSynchronize(); + + // Generate FFT plan (batch in-place backward FFT) + idist=mbin; odist=mbin; iembed=mbin; oembed=mbin; istride=1; ostride=1; + checkCudaErrors(cufftPlanMany(&ftc2cb,1,&mbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nchan*nfft*nsub)); + cudaDeviceSynchronize(); + // Allocate memory for complex timeseries checkCudaErrors(cudaMalloc((void **) &cp1, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub)); checkCudaErrors(cudaMalloc((void **) &cp2, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub)); @@ -188,14 +200,6 @@ int main(int argc,char *argv[]) checkCudaErrors(cudaMalloc((void **) &ddm, (size_t) sizeof(float)*ndm)); checkCudaErrors(cudaMemcpy(ddm,dm,sizeof(float)*ndm,cudaMemcpyHostToDevice)); - // Generate FFT plan (batch in-place forward FFT) - idist=nbin; odist=nbin; iembed=nbin; oembed=nbin; istride=1; ostride=1; - checkCudaErrors(cufftPlanMany(&ftc2cf,1,&nbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nfft*nsub)); - - // Generate FFT plan (batch in-place backward FFT) - idist=mbin; odist=mbin; iembed=mbin; oembed=mbin; istride=1; ostride=1; - checkCudaErrors(cufftPlanMany(&ftc2cb,1,&mbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nchan*nfft*nsub)); - // Compute chirp blocksize.x=32; blocksize.y=32; blocksize.z=1; gridsize.x=nsub/blocksize.x+1; gridsize.y=nchan/blocksize.y+1; gridsize.z=ndm/blocksize.z+1; From d61f9b97c7cf1c012d5c39f4d631f2af070a4a1c Mon Sep 17 00:00:00 2001 From: David McKenna Date: Wed, 13 May 2020 11:40:03 +0100 Subject: [PATCH 3/5] Print incorrectly pass flags before exiting for easier debugging --- cdmt.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cdmt.cu b/cdmt.cu index b207f14..bca724f 100644 --- a/cdmt.cu +++ b/cdmt.cu @@ -118,6 +118,7 @@ int main(int argc,char *argv[]) } } } else { + printf("Unknown option '%c'\n", arg); usage(); return 0; } From ce146d72f1c4870db5127ec0ad5683f73e5d9799 Mon Sep 17 00:00:00 2001 From: David McKenna Date: Wed, 13 May 2020 11:42:40 +0100 Subject: [PATCH 4/5] Print exit reason when near EOF --- cdmt.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cdmt.cu b/cdmt.cu index bca724f..a93f34b 100644 --- a/cdmt.cu +++ b/cdmt.cu @@ -239,8 +239,10 @@ int main(int argc,char *argv[]) startclock=clock(); for (i=0;i<4;i++) nread=fread(h5buf[i],sizeof(char),nsamp*nsub,rawfile[i])/nsub; - if (nread==0) + if (nread==0) { + printf("No data read from last file; assuming EOF, finishng up.\n"); break; + } printf("Block: %d: Read %d MB in %.2f s\n",iblock,sizeof(char)*nread*nsub*4/(1<<20),(float) (clock()-startclock)/CLOCKS_PER_SEC); // Copy buffers to device From b48b5d2c77789a6de7add0f06f8f87ccc621c101 Mon Sep 17 00:00:00 2001 From: David McKenna Date: Fri, 15 May 2020 12:41:36 +0100 Subject: [PATCH 5/5] Add support for skipping to a given time sample and only processing N time samples. Also prints the current location in the file (in hh:mm:ss.s / seconds) for further information --- cdmt.cu | 49 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/cdmt.cu b/cdmt.cu index a93f34b..95f4eae 100644 --- a/cdmt.cu +++ b/cdmt.cu @@ -11,6 +11,7 @@ #include #include #include +#include #define HEADERSIZE 4096 #define DMCONSTANT 2.41e-10 @@ -18,7 +19,7 @@ // Struct for header information struct header { int64_t headersize,buffersize; - unsigned int nchan,nsamp,nbit,nif,nsub; + unsigned int nchan,nsamp,nbit=8,nif,nsub; int machine_id,telescope_id,nbeam,ibeam,sumif; double tstart,tsamp,fch1,foff,fcen,bwchan; double src_raj,src_dej,az_start,za_start; @@ -44,7 +45,7 @@ void write_filterbank_header(struct header h,FILE *file); // Usage void usage() { - printf("cdmt -P -d -D -b -N -n -o \n\n"); + printf("cdmt -P -d -D -b -N -n -r