From a8e1c4fc730ac32e6c69d5407469f55b947f3c99 Mon Sep 17 00:00:00 2001
From: David McKenna <dmckenna@cp.dias.ie>
Date: Thu, 9 Apr 2020 13:29:42 +0100
Subject: [PATCH 1/5] More recent versions of CUDA have become more strict with
 cudaMalloc and now require the length to be a size_t or will raise a CUDA
 error at runtime

---
 cdmt.cu | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/cdmt.cu b/cdmt.cu
index 3f43fef..aae1d8a 100644
--- a/cdmt.cu
+++ b/cdmt.cu
@@ -152,40 +152,40 @@ int main(int argc,char *argv[])
   checkCudaErrors(cudaSetDevice(device));
 
   // Allocate memory for complex timeseries
-  checkCudaErrors(cudaMalloc((void **) &cp1,sizeof(cufftComplex)*nbin*nfft*nsub));
-  checkCudaErrors(cudaMalloc((void **) &cp2,sizeof(cufftComplex)*nbin*nfft*nsub));
-  checkCudaErrors(cudaMalloc((void **) &cp1p,sizeof(cufftComplex)*nbin*nfft*nsub));
-  checkCudaErrors(cudaMalloc((void **) &cp2p,sizeof(cufftComplex)*nbin*nfft*nsub));
+  checkCudaErrors(cudaMalloc((void **) &cp1, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub));
+  checkCudaErrors(cudaMalloc((void **) &cp2, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub));
+  checkCudaErrors(cudaMalloc((void **) &cp1p,(size_t) sizeof(cufftComplex)*nbin*nfft*nsub));
+  checkCudaErrors(cudaMalloc((void **) &cp2p,(size_t) sizeof(cufftComplex)*nbin*nfft*nsub));
 
   // Allocate device memory for chirp
-  checkCudaErrors(cudaMalloc((void **) &dc,sizeof(cufftComplex)*nbin*nsub*ndm));
+  checkCudaErrors(cudaMalloc((void **) &dc, (size_t) sizeof(cufftComplex)*nbin*nsub*ndm));
 
   // Allocate device memory for block sums
-  checkCudaErrors(cudaMalloc((void **) &bs1,sizeof(float)*mblock*mchan));
-  checkCudaErrors(cudaMalloc((void **) &bs2,sizeof(float)*mblock*mchan));
+  checkCudaErrors(cudaMalloc((void **) &bs1, (size_t) sizeof(float)*mblock*mchan));
+  checkCudaErrors(cudaMalloc((void **) &bs2, (size_t) sizeof(float)*mblock*mchan));
 
   // Allocate device memory for channel averages and standard deviations
-  checkCudaErrors(cudaMalloc((void **) &zavg,sizeof(float)*mchan));
-  checkCudaErrors(cudaMalloc((void **) &zstd,sizeof(float)*mchan));
+  checkCudaErrors(cudaMalloc((void **) &zavg, (size_t) sizeof(float)*mchan));
+  checkCudaErrors(cudaMalloc((void **) &zstd, (size_t) sizeof(float)*mchan));
 
   // Allocate memory for redigitized output and header
   header=(char *) malloc(sizeof(char)*HEADERSIZE);
   for (i=0;i<4;i++) {
     h5buf[i]=(char *) malloc(sizeof(char)*nsamp*nsub);
-    checkCudaErrors(cudaMalloc((void **) &dh5buf[i],sizeof(char)*nsamp*nsub));
+    checkCudaErrors(cudaMalloc((void **) &dh5buf[i], (size_t) sizeof(char)*nsamp*nsub));
   }
 
   // Allocate output buffers
   fbuf=(float *) malloc(sizeof(float)*nsamp*nsub);
-  checkCudaErrors(cudaMalloc((void **) &dfbuf,sizeof(float)*nsamp*nsub));
+  checkCudaErrors(cudaMalloc((void **) &dfbuf, (size_t) sizeof(float)*nsamp*nsub));
   cbuf=(unsigned char *) malloc(sizeof(unsigned char)*msamp*mchan/ndec);
-  checkCudaErrors(cudaMalloc((void **) &dcbuf,sizeof(unsigned char)*msamp*mchan/ndec));
+  checkCudaErrors(cudaMalloc((void **) &dcbuf, (size_t) sizeof(unsigned char)*msamp*mchan/ndec));
 
   // Allocate DMs and copy to device
   dm=(float *) malloc(sizeof(float)*ndm);
   for (idm=0;idm<ndm;idm++)
     dm[idm]=dm_start+(float) idm*dm_step;
-  checkCudaErrors(cudaMalloc((void **) &ddm,sizeof(float)*ndm));
+  checkCudaErrors(cudaMalloc((void **) &ddm, (size_t) sizeof(float)*ndm));
   checkCudaErrors(cudaMemcpy(ddm,dm,sizeof(float)*ndm,cudaMemcpyHostToDevice));
 
   // Generate FFT plan (batch in-place forward FFT)

From ff09d77b1e505035ceceb4f1eadd3feb3577b559 Mon Sep 17 00:00:00 2001
From: David McKenna <dmckenna@cp.dias.ie>
Date: Wed, 13 May 2020 11:39:41 +0100
Subject: [PATCH 2/5] Update cuFFT calls to follow new best practices according
 to documentation

---
 cdmt.cu | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/cdmt.cu b/cdmt.cu
index aae1d8a..b207f14 100644
--- a/cdmt.cu
+++ b/cdmt.cu
@@ -151,6 +151,18 @@ int main(int argc,char *argv[])
   // Set device
   checkCudaErrors(cudaSetDevice(device));
 
+  // DMcK: cuFFT docs say it's best practice to plan before allocating memory
+  // cuda-memcheck fails initialisation before this block is run?
+  // Generate FFT plan (batch in-place forward FFT)
+  idist=nbin;  odist=nbin;  iembed=nbin;  oembed=nbin;  istride=1;  ostride=1;
+  checkCudaErrors(cufftPlanMany(&ftc2cf,1,&nbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nfft*nsub));
+  cudaDeviceSynchronize();
+
+  // Generate FFT plan (batch in-place backward FFT)
+  idist=mbin;  odist=mbin;  iembed=mbin;  oembed=mbin;  istride=1;  ostride=1;
+  checkCudaErrors(cufftPlanMany(&ftc2cb,1,&mbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nchan*nfft*nsub));
+  cudaDeviceSynchronize();
+
   // Allocate memory for complex timeseries
   checkCudaErrors(cudaMalloc((void **) &cp1, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub));
   checkCudaErrors(cudaMalloc((void **) &cp2, (size_t) sizeof(cufftComplex)*nbin*nfft*nsub));
@@ -188,14 +200,6 @@ int main(int argc,char *argv[])
   checkCudaErrors(cudaMalloc((void **) &ddm, (size_t) sizeof(float)*ndm));
   checkCudaErrors(cudaMemcpy(ddm,dm,sizeof(float)*ndm,cudaMemcpyHostToDevice));
 
-  // Generate FFT plan (batch in-place forward FFT)
-  idist=nbin;  odist=nbin;  iembed=nbin;  oembed=nbin;  istride=1;  ostride=1;
-  checkCudaErrors(cufftPlanMany(&ftc2cf,1,&nbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nfft*nsub));
-
-  // Generate FFT plan (batch in-place backward FFT)
-  idist=mbin;  odist=mbin;  iembed=mbin;  oembed=mbin;  istride=1;  ostride=1;
-  checkCudaErrors(cufftPlanMany(&ftc2cb,1,&mbin,&iembed,istride,idist,&oembed,ostride,odist,CUFFT_C2C,nchan*nfft*nsub));
-
   // Compute chirp
   blocksize.x=32; blocksize.y=32; blocksize.z=1;
   gridsize.x=nsub/blocksize.x+1; gridsize.y=nchan/blocksize.y+1; gridsize.z=ndm/blocksize.z+1;

From d61f9b97c7cf1c012d5c39f4d631f2af070a4a1c Mon Sep 17 00:00:00 2001
From: David McKenna <dmckenna@cp.dias.ie>
Date: Wed, 13 May 2020 11:40:03 +0100
Subject: [PATCH 3/5] Print incorrectly passed flags before exiting for easier
 debugging

---
 cdmt.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cdmt.cu b/cdmt.cu
index b207f14..bca724f 100644
--- a/cdmt.cu
+++ b/cdmt.cu
@@ -118,6 +118,7 @@ int main(int argc,char *argv[])
       }
     }
   } else {
+    printf("Unknown option '%c'\n", arg);
     usage();
     return 0;
   }

From ce146d72f1c4870db5127ec0ad5683f73e5d9799 Mon Sep 17 00:00:00 2001
From: David McKenna <dmckenna@cp.dias.ie>
Date: Wed, 13 May 2020 11:42:40 +0100
Subject: [PATCH 4/5] Print exit reason when near EOF

---
 cdmt.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cdmt.cu b/cdmt.cu
index bca724f..a93f34b 100644
--- a/cdmt.cu
+++ b/cdmt.cu
@@ -239,8 +239,10 @@ int main(int argc,char *argv[])
     startclock=clock();
     for (i=0;i<4;i++)
       nread=fread(h5buf[i],sizeof(char),nsamp*nsub,rawfile[i])/nsub;
-    if (nread==0)
+    if (nread==0) {
+      printf("No data read from last file; assuming EOF, finishng up.\n");
       break;
+    }
     printf("Block: %d: Read %d MB in %.2f s\n",iblock,sizeof(char)*nread*nsub*4/(1<<20),(float) (clock()-startclock)/CLOCKS_PER_SEC);
 
     // Copy buffers to device

From b48b5d2c77789a6de7add0f06f8f87ccc621c101 Mon Sep 17 00:00:00 2001
From: David McKenna <dmckenna@cp.dias.ie>
Date: Fri, 15 May 2020 12:41:36 +0100
Subject: [PATCH 5/5] Add support for skipping to a given time sample and only
 processing N time samples. Also prints the current location in the file (in
 hh:mm:ss.s / seconds) for further information

---
 cdmt.cu | 49 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/cdmt.cu b/cdmt.cu
index a93f34b..95f4eae 100644
--- a/cdmt.cu
+++ b/cdmt.cu
@@ -11,6 +11,7 @@
 #include <helper_cuda.h>
 #include <getopt.h>
 #include <hdf5.h>
+#include <limits.h>
 
 #define HEADERSIZE 4096
 #define DMCONSTANT 2.41e-10
@@ -18,7 +19,7 @@
 // Struct for header information
 struct header {
   int64_t headersize,buffersize;
-  unsigned int nchan,nsamp,nbit,nif,nsub;
+  unsigned int nchan,nsamp,nbit=8,nif,nsub;
   int machine_id,telescope_id,nbeam,ibeam,sumif;
   double tstart,tsamp,fch1,foff,fcen,bwchan;
   double src_raj,src_dej,az_start,za_start;
@@ -44,7 +45,7 @@ void write_filterbank_header(struct header h,FILE *file);
 // Usage
 void usage()
 {
-  printf("cdmt -P <part> -d <DM start,step,num> -D <GPU device> -b <ndec> -N <forward FFT size> -n <overlap region> -o <outputname> <file.h5>\n\n");
+  printf("cdmt -P <part> -d <DM start,step,num> -D <GPU device> -b <ndec> -N <forward FFT size> -n <overlap region> -r <time steps to process> -s <time steps to skip> -o <outputname> <file.h5>\n\n");
   printf("Compute coherently dedispersed SIGPROC filterbank files from LOFAR complex voltage data in HDF5 format.\n");
   printf("-P <part>        Specify part number for input file [integer, default: 0]\n");
   printf("-D <GPU device>  Select GPU device [integer, default: 0]\n");
@@ -53,7 +54,8 @@ void usage()
   printf("-o <outputname>           Output filename [default: cdmt]\n");
   printf("-N <forward FFT size>     Forward FFT size [integer, default: 65536]\n");
   printf("-n <overlap region>       Overlap region [integer, default: 2048]\n");
-
+  printf("-s <bytes>       Number of time samples to skip in the filterbank before stating processing [integer, default: 0]\n");
+  printf("-r <bytes>       Number of time samples to read in total from the -s offset [integer, default: length of file]\n");
   return;
 }
 
@@ -75,15 +77,18 @@ int main(int argc,char *argv[])
   float *dm,*ddm,dm_start,dm_step;
   char fname[128],fheader[1024],*h5fname,obsid[128]="cdmt";
   int bytes_read;
+  long int ts_read=LONG_MAX,ts_skip=0;
+  long int total_ts_read=0,bytes_skip=0;
   int part=0,device=0;
   int arg=0;
   FILE **outfile;
+  double timeInSeconds;
 
   // Read options
   if (argc>1) {
-    while ((arg=getopt(argc,argv,"P:d:D:ho:b:N:n:"))!=-1) {
+    while ((arg=getopt(argc,argv,"P:d:D:ho:b:N:n:r:s:"))!=-1) {
       switch (arg) {
-	
+
       case 'n':
 	noverlap=atoi(optarg);
 	break;
@@ -99,7 +104,7 @@ int main(int argc,char *argv[])
       case 'o':
 	strcpy(obsid,optarg);
 	break;
-	
+
       case 'P':
 	part=atoi(optarg);
 	break;
@@ -107,11 +112,19 @@ int main(int argc,char *argv[])
       case 'D':
 	device=atoi(optarg);
 	break;
-	
+
       case 'd':
 	sscanf(optarg,"%f,%f,%d",&dm_start,&dm_step,&ndm);
 	break;
 
+      case 's':
+	ts_skip=atol(optarg);
+	break;
+
+      case 'r':
+	ts_read=atol(optarg);
+	break;
+
       case 'h':
 	usage();
 	return 0;
@@ -127,9 +140,19 @@ int main(int argc,char *argv[])
   // Read HDF5 header
   h5=read_h5_header(h5fname);
 
+  // Handle skip flag
+  if (ts_skip > 0) {
+  	// If it's not initialised by default...
+  	if (h5.nbit == 0) h5.nbit = 8;
+
+  	bytes_skip = (long int) (ts_skip * (float) h5.nsub * (float) h5.nbit / 8.0);
+  	// Account for the skipped time in the new header start time (tstart = MJD, tsamp = seconds; 1 byte = 8 bits = 1 sample per file by default)
+  	h5.tstart += (double) ts_skip * h5.tsamp / 86400.0;
+  }
 
   // Set number of subbands
   nsub=h5.nsub;
+  double timeOffset = h5.tsamp / nsub;
 
   // Adjust header for filterbank format
   h5.tsamp*=nchan*ndec;
@@ -231,6 +254,10 @@ int main(int argc,char *argv[])
   // Read files
   for (i=0;i<4;i++) {
     rawfile[i]=fopen(h5.rawfname[i],"r");
+    if (bytes_skip > 0) {
+  fseek(rawfile[i],bytes_skip,SEEK_SET);
+  printf("Skipping to timestep %ld (byte %ld ftell %ld)\n", ts_skip, bytes_skip, ftell(rawfile[i]));
+    }
   }
 
   // Loop over input file contents
@@ -243,6 +270,9 @@ int main(int argc,char *argv[])
       printf("No data read from last file; assuming EOF, finishng up.\n");
       break;
     }
+
+    // Count up the total time samples read
+    total_ts_read += nread;
     printf("Block: %d: Read %d MB in %.2f s\n",iblock,sizeof(char)*nread*nsub*4/(1<<20),(float) (clock()-startclock)/CLOCKS_PER_SEC);
 
     // Copy buffers to device
@@ -312,6 +342,11 @@ int main(int argc,char *argv[])
       fwrite(cbuf,sizeof(char),nread*nsub/ndec,outfile[idm]);
     }
     printf("Processed %d DMs in %.2f s\n",ndm,(float) (clock()-startclock)/CLOCKS_PER_SEC);
+    timeInSeconds = (double) ftell(rawfile[0]) * timeOffset;
+    printf("Current file position: %02ld:%02ld:%05.2lf (%1.2lfs)\n\n", (long int) (timeInSeconds / 3600.0), (long int) ((fmod(timeInSeconds, 3600.0)) / 60.0), fmod(timeInSeconds, 60.0), timeInSeconds);
+    // Exit when we pass the read length limit
+    if (total_ts_read > ts_read)
+      break;
   }
 
   // Close files