// 14 Oct 2008: Important correction to d_choldc_strip made // now moving to double precision!!!!!!! // I am now trying to "transpose" everything and make the code // return L^T in order to see if the memory related slowdowns go away... /* (C) Steven Gratton 2008. This program computes the Cholesky factorization of a matrix. Basic program layout inspired by the examples in the NVIDIA CUDA (TM) SDK. Basic algorithm based on that in Numerical Recipes for F90, that referencing Golub and Van Loan, Matrix Computations, Johns Hopkins University Press. I came up with the parallelization scheme for the G80 myself, but (unsurprisingly) it is rather similar to those discussed for general parallel computing, see e.g. SCALAPACK documentation. I */ /* This code is in an unfinished state and as such should not be used for anything serious or be redistributed. I plan to make a "proper" version available in the near future. Any comments/suggestions are welcome; please see my homepage at www.ast.cam.ac.uk/~stg20 for contact details. */ // You may need to increase stack size to get the program to run, // try "ulimit -s unlimited" or similar on linux. // block indices follow matrix conventions // i.e. (bp,bq) // refers to the bp'th block matrix in the bq'th column // thread x,y indices are always chosen to coalesce global memory reads // max block size is 22 because of 512 threads per block limit // WARNING: various options will now break if this is changed from 16 #define BLOCK_SIZE 16 // MAT_SIZE should be a multiple of BLOCK_SIZE; // for large matrices you may want to use diaginitialize... // for BLOCK_SIZE=16, largest matrix you can fit into 8800GTS // memory is 12496 (i.e. 596 megabytes) // for a GTX 260 it is 14880=16*930 (=844 MB) // seems cuda takes just under 50MB in overhead. // 768, 3072 and 12288 are relevant for healpix... // 9216=16x576 is too, being 3x3072 // ************************************************************** // // ******CHANGE THE NUMBER HERE TO TEST PERFORMANCE ISSUE!******* // #define DESIRED_MAT_SIZE (16*576) // e.g. for 8800 gts try (16*760) // for 8800gtx would be interesting to try (16*769) // ************************************************************** // // If DESIRED_MAT_SIZE/16 is a multiple of 20 (on an 8800GTS) consider // defining GLOBAL_PAD here to speed code back up (by a factor of 2). // The condition might be a multiple of 24 on an 8800GTX. // This will slow code down if (DESIRED_MAT_SIZE/16)%20=19 ... //#define GLOBAL_PAD #ifdef GLOBAL_PAD #define MAT_SIZE (DESIRED_MAT_SIZE+16) #define MAT_BLOCKS (MAT_SIZE/BLOCK_SIZE-1) #else #define MAT_SIZE DESIRED_MAT_SIZE #define MAT_BLOCKS (MAT_SIZE/BLOCK_SIZE) #endif // warning: "UNROLL" only set up for a block size of 16! #define COMPARE #define NOCPUCMP #define UNROLL //#define PRINTMATRIX //#define FULLINIT //#define PRINTINPUTMATRIX //#define MATDISP //#define CPU #include #include #include #include //a simple test case... void diaginitialize(double (*x)[MAT_SIZE]) { int i,j; for (i=0;iMAT_SIZE) vectors void initialize(double (*x)[MAT_SIZE]) { double tmp[MAT_SIZE]; int i,j,a,b; for (i=0;i=k)) { topleft[ty][tx]=(topleft[ty][tx])*fac; } __syncthreads(); if ((tx>=ty)&&(ty>k)) { topleft[ty][tx]=topleft[ty][tx]-topleft[k][ty]*topleft[k][tx]; } } __syncthreads(); if (tx>=ty) { m[ty+BLOCK_SIZE*boffset][tx+BLOCK_SIZE*boffset] =topleft[ty][tx]; } } // this kernel updates the strip below the "topleft" block __global__ void d_choldc_strip(double (*m)[MAT_SIZE], int blockoffset) { // +1 since blockoffset labels the "topleft" position // and boff is the working position... int boffy=blockoffset; int boffx = blockIdx.x+blockoffset+1; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ double topleftt[BLOCK_SIZE][BLOCK_SIZE+1]; __shared__ double workingmat[BLOCK_SIZE][BLOCK_SIZE+1]; // deliberately transposed... topleftt[tx][ty]=m[ty+blockoffset*BLOCK_SIZE][tx+blockoffset*BLOCK_SIZE]; workingmat[ty][tx]= m[ty+boffy*BLOCK_SIZE][tx+boffx*BLOCK_SIZE]; __syncthreads(); // now we forward-substitute for the new strip-elements... // one thread per column (a bit inefficient I'm afraid) if(ty==0) { for (int k=0;k=ty) { #ifdef UNROLL matrixprod+=topt[ty][0]*topt[tx][0]; matrixprod+=topt[ty][1]*topt[tx][1]; matrixprod+=topt[ty][2]*topt[tx][2]; matrixprod+=topt[ty][3]*topt[tx][3]; matrixprod+=topt[ty][4]*topt[tx][4]; matrixprod+=topt[ty][5]*topt[tx][5]; matrixprod+=topt[ty][6]*topt[tx][6]; matrixprod+=topt[ty][7]*topt[tx][7]; matrixprod+=topt[ty][8]*topt[tx][8]; matrixprod+=topt[ty][9]*topt[tx][9]; matrixprod+=topt[ty][10]*topt[tx][10]; matrixprod+=topt[ty][11]*topt[tx][11]; matrixprod+=topt[ty][12]*topt[tx][12]; matrixprod+=topt[ty][13]*topt[tx][13]; matrixprod+=topt[ty][14]*topt[tx][14]; matrixprod+=topt[ty][15]*topt[tx][15]; #else for (int kk=0;kk2) { higrid.x=i-2; higrid.y=1; higrid.z=1; dim3 stripgrid(i-1); d_choldc_topleft<<<1,threads>>>(d_mat,j-i); #ifdef MATDISP printf("after topleft for j-i=%d:\n",j-i); matdisp(d_mat); #endif d_choldc_strip<<>>(d_mat,j-i); #ifdef MATDISP matdisp(d_mat); #endif d_choldc_diagupdate<<>>(d_mat,j-i); /* printf("here,%i %i.\n",higrid.x,higrid.y); error=cudaGetLastError(); printf(" Error code %d: %s.\n",error,cudaGetErrorString(error)); */ #ifdef MATDISP matdisp(d_mat); #endif // printf("here,%i %i.\n",higrid.x,higrid.y); d_choldc_hiupdate<<>>(d_mat,j-i); /* error=cudaGetLastError(); printf(" Error code %d: %s.\n",error,cudaGetErrorString(error)); */ #ifdef MATDISP printf("After hiupdate:"); matdisp(d_mat); #endif i--; } if(j>1) { d_choldc_topleft<<<1,threads>>>(d_mat,j-2); #ifdef MATDISP matdisp(d_mat); #endif d_choldc_strip<<<1,threads>>>(d_mat,j-2); #ifdef MATDISP matdisp(d_mat); #endif d_choldc_diagupdate<<<1,threads>>>(d_mat,j-2); #ifdef MATDISP matdisp(d_mat); #endif } d_choldc_topleft<<<1,threads>>>(d_mat,j-1); #ifdef MATDISP matdisp(d_mat); #endif cudaThreadSynchronize(); gputime=clock()-gputime; printf("kernel time=%f s.\n",gputime/1.e6f); cudaMemcpy(mat,d_mat, mat_size,cudaMemcpyDeviceToHost); cudaFree(d_mat); cudaThreadSynchronize(); error=cudaGetLastError(); printf(" Error code %d: %s.\n",error,cudaGetErrorString(error)); } int main() { double x[MAT_SIZE][MAT_SIZE]; double diagonal[1][MAT_SIZE]; int i,j; clock_t maincputime,maingputime; cudaThreadExit(); printf("Initializing test matrices...\n"); #ifdef FULLINIT initialize(x); #else diaginitialize(x); #endif diagonalinitialize(diagonal,x); #ifdef PRINTINPUTMATRIX for (i=0;i