/* (C) Steven Gratton 2007. This code attempts to calculate average trajectories for the early universe in a model of eternal inflation. Basic layout is based on examples in the NVIDIA CUDA (TM) SDK. For more information see http://www.ast.cam.ac.uk/~stg20/cuda/index.html */ // GTS has 12 multiprocessors. // Want #blocks> 2*#multiprocessors. // Want #threads per block a multiple of 64 ideally say 256. // The threads of each block are split into warps of 32 threads. // time here is alpha*t of GT05 // r here is (alpha/beta)^(2/3) r of GT05 // the point of these rescalings is to make the // volume-weighted saddle point equation simple, // namely r''=r+3/r^2 // using the 2pi's that others use in alpha and beta, we have // alpha=8 sqrt(lambda/3), beta= (lambda/3)^(1/4) / pi // we find (alpha/beta)^(2/3) = (8pi)^(2/3) * (lambda/3) ^(1/6) // also need alpha^(1/6) beta^(1/3) = (8 lambda/(3 pi^2))^(1/6) // in these units the critical value for eternal inflation is r^3=6. // typical volume weighted paths occur over timescales of order 1 // non-volume-weighted- means and variancees are coming out right! 6/9/07 // I think the vol-weighted code is basically there but is // suffering from overflow errors... #include"math_constants.h" #include // NUM_RNGS=NUM_RNGS_PER_BLOCK*NUM_BLOCKS // NUM_RNGS_PER_BLOCK is the block size // We need the block size to be a power of 2 // for the sum over thread results in each block to work // You can still set DELTA_T appropriately to get any // desired endtime. // To sum over block results, we use a new kernel // and get each new block to do one timestep // So have to make sure NUM_BLOCKS < max num of threads per block // Again need NUM_BLOCKS to be a power of 2 // for the sum to work // Make sure sure you have enough threads per block to do the sum outputs!!! // typical values 64,256,16384,128,5000 #define NUM_RNGS_PER_BLOCK 64 #define NUM_BLOCKS 256 #define NUM_RNGS 16384 #define NUM_STEPS 128 #define NUM_HISTORIES_PER_RNG 5000 #define R_START 1.0f //#define R_END 1.0f // only used for constrained paths #define DELTA_T .01f // most testing done with lambda=1.e-12 #define LAMBDA 1.e-12f #define LG2_BANK_SIZE 4 #define BANK_SIZE 16 //#define LG2_BANK_SIZE 4 unsigned int xtest[NUM_RNGS]; unsigned int ctest[NUM_RNGS]; unsigned int atest[NUM_RNGS]; float rarray[NUM_RNGS]; __device__ float rblocksum[NUM_STEPS][NUM_BLOCKS]; __device__ float rsqblocksum[NUM_STEPS][NUM_BLOCKS]; __device__ float vblocksum[NUM_STEPS][NUM_BLOCKS]; __device__ float rvblocksum[NUM_STEPS][NUM_BLOCKS]; __device__ float rsqvblocksum[NUM_STEPS][NUM_BLOCKS]; // forward declaration of the device code __global__ void etinfd(unsigned int*,unsigned int*, unsigned int*); __global__ void sumoverblocksd(float*,float*,float*,float*,float*); // wrapper for device code void etinf(unsigned int* x,unsigned int* c,unsigned int* a, float* r,float* rsq,float* v,float* rv,float* rsqv) { cudaError_t cudastat; clock_t time1,time2; int size; size=NUM_RNGS*sizeof(unsigned int); time1=clock(); //load initial values unsigned int* xd; cudaMalloc((void**)&xd,size); cudaMemcpy(xd,x,size,cudaMemcpyHostToDevice); unsigned int* cd; cudaMalloc((void**)&cd,size); cudaMemcpy(cd,c,size,cudaMemcpyHostToDevice); unsigned int* ad; cudaMalloc((void**)&ad,size); cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice); //set up an array for output size=NUM_STEPS*sizeof(float); float* rd; cudaMalloc((void**)&rd,size); // cudaMemset(rd,0,size); //if we wanted to zero it size=NUM_STEPS*sizeof(float); float* rsqd; cudaMalloc((void**)&rsqd,size); size=NUM_STEPS*sizeof(float); float* vd; cudaMalloc((void**)&vd,size); size=NUM_STEPS*sizeof(float); float* rvd; cudaMalloc((void**)&rvd,size); size=NUM_STEPS*sizeof(float); float* rsqvd; cudaMalloc((void**)&rsqvd,size); dim3 dimBlock(NUM_RNGS_PER_BLOCK); dim3 dimGrid(NUM_BLOCKS); etinfd<<>>(xd,cd,ad); cudastat=cudaGetLastError(); printf("Error code=%i, %s.\n",cudastat,cudaGetErrorString(cudastat)); cudaThreadSynchronize(); //probably not necessary // warning, need at least 5 (at the moment) threads to do the outputs... sumoverblocksd<<>>(rd,rsqd,vd,rvd,rsqvd); cudastat=cudaGetLastError(); printf("Error code=%i, %s.\n",cudastat,cudaGetErrorString(cudastat)); cudaThreadSynchronize(); cudaMemcpy(r,rd,size,cudaMemcpyDeviceToHost); cudaMemcpy(rsq,rsqd,size,cudaMemcpyDeviceToHost); cudaMemcpy(v,vd,size,cudaMemcpyDeviceToHost); cudaMemcpy(rv,rvd,size,cudaMemcpyDeviceToHost); cudaMemcpy(rsqv,rsqvd,size,cudaMemcpyDeviceToHost); //Free device memory cudaFree(xd); cudaFree(cd); cudaFree(ad); cudaFree(rd); cudaFree(rsqd); cudaFree(vd); cudaFree(rvd); cudaFree(rsqvd); time2=clock(); printf("time1=%u, time2=%u.\n",time1,time2); cudastat=cudaGetLastError(); printf("Error code=%i, %s.\n",cudastat,cudaGetErrorString(cudastat)); } __global__ void etinfd (unsigned int* xd,unsigned int* cd, unsigned int* ad) { //Block index int bx=blockIdx.x; //Thread index int tx=threadIdx.x; //for loops unsigned long int ii=0; unsigned long int jj=0; //First element processed by the block int begin=NUM_RNGS_PER_BLOCK*bx; unsigned long long int x=xd[begin+tx]; unsigned int tmpx; unsigned int c=cd[begin+tx]; unsigned int a=ad[begin+tx]; float deltat=DELTA_T; float tend=deltat*NUM_STEPS; deltat=.5f*deltat; // since we take two Box-Muller steps per step float amp,nefolds,nefoldsbg,volprefac; // amp^6=alpha*beta^2=8 lambda/(3 pi^2) // using the Hodges' prefactor amp=__powf((2.f*LAMBDA/3.f)*CUDART_2_OVER_PI_F*CUDART_2_OVER_PI_F,1.f/6.f); amp=amp*sqrtf(deltat); #ifdef TESTING1 volprefac=5000.f; #else volprefac=CUDART_PI_F*__powf(8.f*CUDART_PI_F*LAMBDA/3.f,-1.f/3.f); // = 15468.34 for lambda=1e-12 #endif float r,vol; float rend,t; // trying out an automatic array to store intermediate data float rlocalsum[NUM_STEPS]; float rsqlocalsum[NUM_STEPS]; float vlocalsum[NUM_STEPS]; float rvlocalsum[NUM_STEPS]; float rsqvlocalsum[NUM_STEPS]; for (int kk=0;kk>32); x=x&0xffffffffull; // shift by one to avoid logf(0.) below... tmpx=x; nr=(float)tmpx; nr=nr+1.f; nr=nr/(UINT_MAX); x=x*a+c; c=(x>>32); x=x&0xffffffffull; tmpx=x; nphase=2.f*CUDART_PI_F*((float)(tmpx)/((float)(UINT_MAX))); nr=sqrtf(-2.f*logf(nr)); n1=nr*sinf(nphase); n2=nr*cosf(nphase); // printf("n1=%f, n2=%f.\n",n1,n2); nefolds=nefolds+deltat/r; t+=deltat; r=r*(1.f+deltat)+amp*n1; nefolds=nefolds+deltat/r; t+=deltat; r=r*(1.f+deltat)+amp*n2; // printf("tid=%d,r[%f]=%f.\n",tx,t,r); rlocalsum[ii]+=r; rsqlocalsum[ii]+=r*r; #ifdef EFOLDS_IN_RV // temporarily returning (a multiple of) the number of // efolds in the volume slots... rvlocalsum[ii]+=nefolds; rsqvlocalsum[ii]+=nefolds*nefolds; #else // subtracting off the "classical" expansion to try and keep the // numbers under control... // trying putting a big number in the vol to prevent overflow... // ugly; done by hand by trial and error --- can we do better? nefoldsbg=(1.f-expf(-t))/R_START; vol=expf(3.f*volprefac*(nefolds-nefoldsbg)-60.f-600.f*t); vlocalsum[ii]+=vol; rvlocalsum[ii]+=r*vol; rsqvlocalsum[ii]+=r*r*vol; #endif // printf("tid=%d,rlocalsum[%d]=%f.\n",tx,ii,r); } // over time steps } //over runs // now let's try averaging over the histories! // first within a block... // note that we have as many threads as RNGs per block // should think about doing multiple sums at once so as not to waste // half of them... __shared__ float rsumfromlocal[NUM_RNGS_PER_BLOCK]; __shared__ float rsqsumfromlocal[NUM_RNGS_PER_BLOCK]; __shared__ float vsumfromlocal[NUM_RNGS_PER_BLOCK]; __shared__ float rvsumfromlocal[NUM_RNGS_PER_BLOCK]; __shared__ float rsqvsumfromlocal[NUM_RNGS_PER_BLOCK]; for (ii=0;ii>1; for (int p=sumdiff;p>=1;p>>=1) { __syncthreads(); //might be a problem, see prog guide p.26 // no, the syncthreads is outside the if if (tx>=1; } __syncthreads(); // if(tx==0) printf("tid=%d, rsumfromlocal=%f\n",tx,rsumfromlocal[0]); if (tx==0) rblocksum[ii][bx]=rsumfromlocal[0]; if (tx==1) rsqblocksum[ii][bx]=rsqsumfromlocal[0]; if (tx==2) vblocksum[ii][bx]=vsumfromlocal[0]; if (tx==3) rvblocksum[ii][bx]=rvsumfromlocal[0]; if (tx==4) rsqvblocksum[ii][bx]=rsqvsumfromlocal[0]; // if(tx==0) printf("rblocksum[%d][%d]=%f\n",ii,bx,rblocksum[ii][bx]); } //now will sum over blocks using the kernel below } __global__ void sumoverblocksd(float* rav,float* rsqav, float* vav,float* rvav, float* rsqvav) { int tx=threadIdx.x; // the old block no. int bx=blockIdx.x; // the time step // each new block does one timestep __shared__ float rblocksumshared[NUM_BLOCKS]; __shared__ float rsum; __shared__ float rsqblocksumshared[NUM_BLOCKS]; __shared__ float rsqsum; __shared__ float vblocksumshared[NUM_BLOCKS]; __shared__ float vsum; __shared__ float rvblocksumshared[NUM_BLOCKS]; __shared__ float rvsum; __shared__ float rsqvblocksumshared[NUM_BLOCKS]; __shared__ float rsqvsum; // rblocksumshared[tx]=rblocksum[bx][tx]; // note we're deliberately accessing rblocksum with the // indices the other way round to how we wrote it... // don't think the above was well-received, so let's try... // careful with the spellings! ar__p ar for array, p for pointer // then in the middle we have v for vol, r for r, rsq for r^2 float* arrp=rblocksum[bx]; rblocksumshared[tx]=arrp[tx]; float* arrsqp=rsqblocksum[bx]; rsqblocksumshared[tx]=arrsqp[tx]; float* arvp=vblocksum[bx]; vblocksumshared[tx]=arvp[tx]; float* arrvp=rvblocksum[bx]; rvblocksumshared[tx]=arrvp[tx]; float* arrsqvp=rsqvblocksum[bx]; rsqvblocksumshared[tx]=arrsqvp[tx]; // printf("rblocksumshared[%d]=%f\n",tx,arrp[tx]); __syncthreads(); int sumdiff=NUM_BLOCKS>>1; for (int p=sumdiff;p>=1;p>>=1) { __syncthreads(); //might be a problem, see prog guide p.26 // no, the syncthreads is outside the if if (tx>=1; } __syncthreads(); if (tx==0) rsum=rblocksumshared[0]; if (tx==1) rsqsum=rsqblocksumshared[0]; if (tx==2) vsum=vblocksumshared[0]; if (tx==3) rvsum=rvblocksumshared[0]; if (tx==4) rsqvsum=rsqvblocksumshared[0]; // if (tx==0) printf("rsum[%d]=%f\n",bx,rsum); // now write it back... __syncthreads(); if (tx==0) rav[bx]=rsum; if (tx==1) rsqav[bx]=rsqsum; if (tx==2) vav[bx]=vsum; if (tx==3) rvav[bx]=rvsum; if (tx==4) rsqvav[bx]=rsqvsum; } void initialize(void) { FILE *fp; unsigned int begin=0u; unsigned long long int xinit=1ull; unsigned int cinit=0u; unsigned int fora,tmp1,tmp2; fp=fopen("midsafeprimesupto2tothe32.txt","r"); // use begin as a multiplier to generate the initial x's for // the other generators... fscanf(fp,"%u %u %u",&begin,&tmp1,&tmp2); for (unsigned int i=0;i>32; xinit=xinit&0xffffffffull; xtest[i]=(unsigned int) xinit; fscanf(fp,"%u %u %u",&fora,&tmp1,&tmp2); atest[i]=fora; xinit=xinit*begin+cinit; cinit=xinit>>32; xinit=xinit&0xffffffffull; ctest[i]=(unsigned int) ((((double)xinit)/UINT_MAX)*fora); // printf("%u: x=%llu, c=%u, a=%u.\n",i,xtest[i],ctest[i],atest[i]); } fclose(fp); } /* typedef struct av_s {float mean;float sd;} av_t; av_t average(float *arr,int n) { int ii; double av=0.; double var=0.; av_t myav; for (ii=0;ii