/* (C) Steven Gratton 2007.
   This code attempts to calculate average trajectories for the 
   early universe in a model of eternal inflation.
   Basic layout is based on examples in the NVIDIA CUDA (TM) SDK.
   
   For more information see http://www.ast.cam.ac.uk/~stg20/cuda/index.html
*/


// GTS has 12 multiprocessors.  
// Want #blocks> 2*#multiprocessors.
// Want #threads per block a multiple of 64 ideally say 256.
// The threads of each block are split into warps of 32 threads.

// time here is alpha*t of GT05
// r here is (alpha/beta)^(2/3) r of GT05
// the point of these rescalings is to make the 
// volume-weighted saddle point equation simple,
// namely r''=r+3/r^2


// using the 2pi's that others use in alpha and beta, we have
// alpha=8 sqrt(lambda/3), beta= (lambda/3)^(1/4) / pi
// we find (alpha/beta)^(2/3) = (8pi)^(2/3) * (lambda/3) ^(1/6)
// also need alpha^(1/6) beta^(1/3) = (8 lambda/(3 pi^2))^(1/6)

// in these units the critical value for eternal inflation is r^3=6.
// typical volume weighted paths occur over timescales of order 1

// non-volume-weighted- means and variancees are coming out right! 6/9/07
// I think the vol-weighted code is basically there but is
// suffering from overflow errors...


#include"math_constants.h"
#include <stdio.h>


// NUM_RNGS=NUM_RNGS_PER_BLOCK*NUM_BLOCKS
// NUM_RNGS_PER_BLOCK is the block size
// We need the block size to be a power of 2
// for the sum over thread results in each block to work
// You can still set DELTA_T appropriately to get any 
// desired endtime. 

// To sum over block results, we use a new kernel
// and get each new block to do one timestep
// So have to make sure NUM_BLOCKS < max num of threads per block
// Again need NUM_BLOCKS to be a power of 2
// for the sum to work

// Make sure sure you have enough threads per block to do the sum outputs!!!


// typical values 64,256,16384,128,5000

#define NUM_RNGS_PER_BLOCK 64
#define NUM_BLOCKS 256
#define NUM_RNGS 16384
#define NUM_STEPS 128
#define NUM_HISTORIES_PER_RNG 5000


#define R_START 1.0f
//#define R_END 1.0f
// only used for constrained paths

#define DELTA_T .01f


// most testing done with lambda=1.e-12
#define LAMBDA 1.e-12f
#define LG2_BANK_SIZE 4

#define BANK_SIZE 16
//#define LG2_BANK_SIZE 4

unsigned int xtest[NUM_RNGS];
unsigned int ctest[NUM_RNGS];
unsigned int atest[NUM_RNGS];

float rarray[NUM_RNGS];

__device__ float rblocksum[NUM_STEPS][NUM_BLOCKS];
__device__ float rsqblocksum[NUM_STEPS][NUM_BLOCKS];
__device__ float vblocksum[NUM_STEPS][NUM_BLOCKS];
__device__ float rvblocksum[NUM_STEPS][NUM_BLOCKS];
__device__ float rsqvblocksum[NUM_STEPS][NUM_BLOCKS];


// forward declaration of the device code
__global__ void etinfd(unsigned int*,unsigned int*,
		       unsigned int*);
__global__ void sumoverblocksd(float*,float*,float*,float*,float*);

// wrapper for device code
void etinf(unsigned int* x,unsigned int* c,unsigned int* a,
	   float* r,float* rsq,float* v,float* rv,float* rsqv)
{
    cudaError_t cudastat;
    clock_t time1,time2;
    int size;
    size=NUM_RNGS*sizeof(unsigned int);

    time1=clock();


    //load initial values
    unsigned int* xd;
    cudaMalloc((void**)&xd,size);
    cudaMemcpy(xd,x,size,cudaMemcpyHostToDevice);
	
    unsigned int* cd;
    cudaMalloc((void**)&cd,size);
    cudaMemcpy(cd,c,size,cudaMemcpyHostToDevice);
	
    unsigned int* ad;
    cudaMalloc((void**)&ad,size);
    cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice);
	

    //set up an array for output
    size=NUM_STEPS*sizeof(float);
    float* rd;
    cudaMalloc((void**)&rd,size);
//    cudaMemset(rd,0,size); //if we wanted to zero it

    size=NUM_STEPS*sizeof(float);
    float* rsqd;
    cudaMalloc((void**)&rsqd,size);
 
    size=NUM_STEPS*sizeof(float);
    float* vd;
    cudaMalloc((void**)&vd,size);
 
    size=NUM_STEPS*sizeof(float);
    float* rvd;
    cudaMalloc((void**)&rvd,size);
 
    size=NUM_STEPS*sizeof(float);
    float* rsqvd;
    cudaMalloc((void**)&rsqvd,size);


    dim3 dimBlock(NUM_RNGS_PER_BLOCK);
    dim3 dimGrid(NUM_BLOCKS);


    etinfd<<<dimGrid,dimBlock>>>(xd,cd,ad);	

    cudastat=cudaGetLastError();
    printf("Error code=%i, %s.\n",cudastat,cudaGetErrorString(cudastat));

    cudaThreadSynchronize(); //probably not necessary


// warning, need at least 5 (at the moment) threads to do the outputs...
    sumoverblocksd<<<NUM_STEPS,NUM_BLOCKS>>>(rd,rsqd,vd,rvd,rsqvd);	

    cudastat=cudaGetLastError();
    printf("Error code=%i, %s.\n",cudastat,cudaGetErrorString(cudastat));

    cudaThreadSynchronize();


    cudaMemcpy(r,rd,size,cudaMemcpyDeviceToHost);
    cudaMemcpy(rsq,rsqd,size,cudaMemcpyDeviceToHost);
   cudaMemcpy(v,vd,size,cudaMemcpyDeviceToHost);
   cudaMemcpy(rv,rvd,size,cudaMemcpyDeviceToHost);
   cudaMemcpy(rsqv,rsqvd,size,cudaMemcpyDeviceToHost);


    //Free device memory
    cudaFree(xd);
    cudaFree(cd);
    cudaFree(ad);
    cudaFree(rd);
    cudaFree(rsqd);
   cudaFree(vd);
   cudaFree(rvd);
   cudaFree(rsqvd);

    time2=clock();
	
    printf("time1=%u, time2=%u.\n",time1,time2);
  
    cudastat=cudaGetLastError();
    printf("Error code=%i, %s.\n",cudastat,cudaGetErrorString(cudastat));
 

}


__global__ void etinfd
(unsigned int* xd,unsigned int* cd,
 unsigned int* ad)
{
    //Block index
    int bx=blockIdx.x;

    //Thread index
    int tx=threadIdx.x;	

    //for loops
    unsigned long int ii=0;
    unsigned long int jj=0;

    //First element processed by the block
    int begin=NUM_RNGS_PER_BLOCK*bx;

    unsigned long long int x=xd[begin+tx];
    unsigned int tmpx;
    unsigned int c=cd[begin+tx];
    unsigned int a=ad[begin+tx];

    float deltat=DELTA_T;
    float tend=deltat*NUM_STEPS;
    deltat=.5f*deltat; // since we take two Box-Muller steps per step

    float amp,nefolds,nefoldsbg,volprefac;

//  amp^6=alpha*beta^2=8 lambda/(3 pi^2)
//  using the Hodges' prefactor
    amp=__powf((2.f*LAMBDA/3.f)*CUDART_2_OVER_PI_F*CUDART_2_OVER_PI_F,1.f/6.f);
    amp=amp*sqrtf(deltat);
    
#ifdef TESTING1
    volprefac=5000.f;
#else
    volprefac=CUDART_PI_F*__powf(8.f*CUDART_PI_F*LAMBDA/3.f,-1.f/3.f);
// = 15468.34 for lambda=1e-12 
#endif

    float r,vol;
    float rend,t;

//  trying out an automatic array to store intermediate data
    float rlocalsum[NUM_STEPS];
    float rsqlocalsum[NUM_STEPS];
    float vlocalsum[NUM_STEPS];
    float rvlocalsum[NUM_STEPS];
    float rsqvlocalsum[NUM_STEPS];
    
    
    for (int kk=0;kk<NUM_STEPS;kk++) 
    {
        rlocalsum[kk]=0.f;
        rsqlocalsum[kk]=0.f;
        vlocalsum[kk]=0.f;
        rvlocalsum[kk]=0.f;
        rsqvlocalsum[kk]=0.f;
    }
        

//looping over runs...
    for (jj=0;jj<NUM_HISTORIES_PER_RNG;jj++)
    {
	nefolds=0.f;
	vol=1.f;
	r=R_START;

	t=0.f;


// looping over time steps...
	for(ii=0;ii<NUM_STEPS;ii++){

	    float nr;
	    float nphase;
	    float n1,n2;
  

	    x=x*a+c;
	    c=(x>>32);
	    x=x&0xffffffffull;

// shift by one to avoid logf(0.) below...
	
	    tmpx=x;
	    nr=(float)tmpx;
	    nr=nr+1.f;
	    nr=nr/(UINT_MAX);

	    x=x*a+c;
	    c=(x>>32);
	    x=x&0xffffffffull;

	    tmpx=x;
	    nphase=2.f*CUDART_PI_F*((float)(tmpx)/((float)(UINT_MAX)));

	    nr=sqrtf(-2.f*logf(nr));

	    n1=nr*sinf(nphase);
	    n2=nr*cosf(nphase);

//	printf("n1=%f, n2=%f.\n",n1,n2);

	    nefolds=nefolds+deltat/r;

	    t+=deltat;
	    r=r*(1.f+deltat)+amp*n1;


	    nefolds=nefolds+deltat/r;

	    t+=deltat;
	    r=r*(1.f+deltat)+amp*n2;

//	printf("tid=%d,r[%f]=%f.\n",tx,t,r);

	    rlocalsum[ii]+=r;
	    rsqlocalsum[ii]+=r*r;

#ifdef EFOLDS_IN_RV	    
// temporarily returning (a multiple of) the number of 
// efolds in the volume slots...
        rvlocalsum[ii]+=nefolds;
	    rsqvlocalsum[ii]+=nefolds*nefolds;
#else
// subtracting off the "classical" expansion to try and keep the 
// numbers under control...


// trying putting a big number in the vol to prevent overflow...
// ugly; done by hand by trial and error --- can we do better?
        nefoldsbg=(1.f-expf(-t))/R_START;
        vol=expf(3.f*volprefac*(nefolds-nefoldsbg)-60.f-600.f*t);
        vlocalsum[ii]+=vol;
        rvlocalsum[ii]+=r*vol;
	    rsqvlocalsum[ii]+=r*r*vol;       
#endif

	    
//	    printf("tid=%d,rlocalsum[%d]=%f.\n",tx,ii,r);

	} // over time steps
    } //over runs

// now let's try averaging over the histories!

// first within a block...
// note that we have as many threads as RNGs per block
// should think about doing multiple sums at once so as not to waste
// half of them...

    __shared__ float rsumfromlocal[NUM_RNGS_PER_BLOCK];
    __shared__ float rsqsumfromlocal[NUM_RNGS_PER_BLOCK];
    __shared__ float vsumfromlocal[NUM_RNGS_PER_BLOCK];
    __shared__ float rvsumfromlocal[NUM_RNGS_PER_BLOCK];
    __shared__ float rsqvsumfromlocal[NUM_RNGS_PER_BLOCK];


    for (ii=0;ii<NUM_STEPS;ii++)
    {
	__syncthreads();
	rsumfromlocal[tx]=rlocalsum[ii];
	rsqsumfromlocal[tx]=rsqlocalsum[ii];
	vsumfromlocal[tx]=vlocalsum[ii];
	rvsumfromlocal[tx]=rvlocalsum[ii];
	rsqvsumfromlocal[tx]=rsqvlocalsum[ii];


	int sumdiff=NUM_RNGS_PER_BLOCK>>1;
	for (int p=sumdiff;p>=1;p>>=1)
	{
	    __syncthreads(); //might be a problem, see prog guide p.26
	    // no, the syncthreads is outside the if
	    if (tx<p) {
	        rsumfromlocal[tx]+=rsumfromlocal[tx+sumdiff];
	        rsqsumfromlocal[tx]+=rsqsumfromlocal[tx+sumdiff];
	        vsumfromlocal[tx]+=vsumfromlocal[tx+sumdiff];
	        rvsumfromlocal[tx]+=rvsumfromlocal[tx+sumdiff];
	        rsqvsumfromlocal[tx]+=rsqvsumfromlocal[tx+sumdiff];
	        
	        
        }
	    sumdiff>>=1;
	}
	__syncthreads();
 //   if(tx==0) printf("tid=%d, rsumfromlocal=%f\n",tx,rsumfromlocal[0]);
	if (tx==0) rblocksum[ii][bx]=rsumfromlocal[0];
    if (tx==1) rsqblocksum[ii][bx]=rsqsumfromlocal[0];
    if (tx==2) vblocksum[ii][bx]=vsumfromlocal[0];
    if (tx==3) rvblocksum[ii][bx]=rvsumfromlocal[0];
    if (tx==4) rsqvblocksum[ii][bx]=rsqvsumfromlocal[0];
 
 //   if(tx==0) printf("rblocksum[%d][%d]=%f\n",ii,bx,rblocksum[ii][bx]);

    }

//now will sum over blocks using the kernel below	


}

__global__ void sumoverblocksd(float* rav,float* rsqav, float* vav,float* rvav,
                                    float* rsqvav)
{

    int tx=threadIdx.x;	// the old block no.
    int bx=blockIdx.x;  // the time step

    // each new block does one timestep

    __shared__ float rblocksumshared[NUM_BLOCKS];
    __shared__ float rsum;
    __shared__ float rsqblocksumshared[NUM_BLOCKS];
    __shared__ float rsqsum;
    __shared__ float vblocksumshared[NUM_BLOCKS];
    __shared__ float vsum;
    __shared__ float rvblocksumshared[NUM_BLOCKS];
    __shared__ float rvsum;
    __shared__ float rsqvblocksumshared[NUM_BLOCKS];
    __shared__ float rsqvsum;


//   rblocksumshared[tx]=rblocksum[bx][tx];
        // note we're deliberately accessing rblocksum with the
        // indices the other way round to how we wrote it...

// don't think the above was well-received, so let's try...
// careful with the spellings! ar__p ar for array, p for pointer
// then in the middle we have v for vol, r for r, rsq for r^2

    float* arrp=rblocksum[bx];
    rblocksumshared[tx]=arrp[tx];
    float* arrsqp=rsqblocksum[bx];
    rsqblocksumshared[tx]=arrsqp[tx];
    float* arvp=vblocksum[bx];
    vblocksumshared[tx]=arvp[tx];
    float* arrvp=rvblocksum[bx];
    rvblocksumshared[tx]=arrvp[tx];
    float* arrsqvp=rsqvblocksum[bx];
    rsqvblocksumshared[tx]=arrsqvp[tx];
    
    
//    printf("rblocksumshared[%d]=%f\n",tx,arrp[tx]);
        
    __syncthreads();
     	     	
   	int sumdiff=NUM_BLOCKS>>1;
     	for (int p=sumdiff;p>=1;p>>=1)
    	{
    	    __syncthreads(); //might be a problem, see prog guide p.26
    	    // no, the syncthreads is outside the if
            if (tx<p) {
	            rblocksumshared[tx]+=rblocksumshared[tx+sumdiff];
	            rsqblocksumshared[tx]+=rsqblocksumshared[tx+sumdiff];
	            vblocksumshared[tx]+=vblocksumshared[tx+sumdiff];
	            rvblocksumshared[tx]+=rvblocksumshared[tx+sumdiff];
	            rsqvblocksumshared[tx]+=rsqvblocksumshared[tx+sumdiff];
            }
    	    sumdiff>>=1;
    	}
	__syncthreads();
	if (tx==0) rsum=rblocksumshared[0];
	if (tx==1) rsqsum=rsqblocksumshared[0];
	if (tx==2) vsum=vblocksumshared[0];
	if (tx==3) rvsum=rvblocksumshared[0];
	if (tx==4) rsqvsum=rsqvblocksumshared[0];
	
	
//	if (tx==0) printf("rsum[%d]=%f\n",bx,rsum);
//	now write it back...
	__syncthreads();
	
	if (tx==0) rav[bx]=rsum;
	if (tx==1) rsqav[bx]=rsqsum;
	if (tx==2) vav[bx]=vsum;
	if (tx==3) rvav[bx]=rvsum;
	if (tx==4) rsqvav[bx]=rsqvsum;
}
	

void initialize(void)
{
    FILE *fp;
    unsigned int begin=0u;
    unsigned long long int xinit=1ull;
    unsigned int cinit=0u;
    unsigned int fora,tmp1,tmp2;
    fp=fopen("midsafeprimesupto2tothe32.txt","r");


// use begin as a multiplier to generate the initial x's for 
// the other generators...

    fscanf(fp,"%u %u %u",&begin,&tmp1,&tmp2);

    for (unsigned int i=0;i<NUM_RNGS;i++)
    {

	xinit=xinit*begin+cinit;
	cinit=xinit>>32;
	xinit=xinit&0xffffffffull;
	xtest[i]=(unsigned int) xinit;
	fscanf(fp,"%u %u %u",&fora,&tmp1,&tmp2);
	atest[i]=fora;

	xinit=xinit*begin+cinit;
	cinit=xinit>>32;
	xinit=xinit&0xffffffffull;
	ctest[i]=(unsigned int) ((((double)xinit)/UINT_MAX)*fora);


//	printf("%u: x=%llu, c=%u, a=%u.\n",i,xtest[i],ctest[i],atest[i]);
    }
    fclose(fp);
}

/*
typedef struct av_s {float mean;float sd;} av_t;

av_t average(float *arr,int n)
{
    int ii;
    double av=0.;
    double var=0.;
    av_t myav;
    for (ii=0;ii<n;ii++)
    {
	av+=arr[ii];
//	printf("av[%d]=%f.\n",ii,av);
    }

    av=av/n;

    for (ii=0;ii<n;ii++){
	var+=(arr[ii]-av)*(arr[ii]-av);
//	printf("var[%d]=%f.\n",ii,var);
    }

    var=var/n;

    myav.mean=av;
    myav.sd=sqrt(var);

    return myav;
}
*/


int main(int argc,char* argv[])
{
    int ii,jj;
//    av_t av;
    float routputsum[NUM_STEPS];
    float rsqoutputsum[NUM_STEPS];
    float voutputsum[NUM_STEPS];
    float rvoutputsum[NUM_STEPS];
    float rsqvoutputsum[NUM_STEPS];

    volatile double tmp1,tmp2,tmp3;

 
    float totalno=(float)NUM_RNGS*(float)NUM_HISTORIES_PER_RNG;
    float invtotalno=1.f/totalno;
 
 
    initialize();
    etinf(xtest,ctest,atest,routputsum,rsqoutputsum,voutputsum
                ,rvoutputsum,rsqvoutputsum);
    for (ii=0;ii<NUM_STEPS;ii++){
	printf("At step %d, mean = %f, variance =%f.\n",ii,
	    routputsum[ii]*invtotalno,
	    (rsqoutputsum[ii]-routputsum[ii]*routputsum[ii]*invtotalno)*invtotalno);
    }

   printf("\n");

#ifdef EFOLDS_IN_RV
    for (ii=0;ii<NUM_STEPS;ii++){
	printf("At step %d, no. efolds = %f, variance =%f.\n",ii,
	    rvoutputsum[ii]*invtotalno,
	    (rsqvoutputsum[ii]-rvoutputsum[ii]*rvoutputsum[ii]*invtotalno)*invtotalno);
    }

   printf("\n");
#endif

#ifdef VOL_WEIGHT  
    for (ii=0;ii<NUM_STEPS;ii++){
#ifdef TESTING
	printf("With vol weighting, at step %d, weight=%.4e, rsum=%.4e, rsqsum=%.4e.\n",ii,
	    voutputsum[ii],
	    rvoutputsum[ii],
	    rsqvoutputsum[ii]);
    }

#else
    tmp1=voutputsum[ii];
    tmp2=rvoutputsum[ii]/voutputsum[ii];
    tmp3=rsqvoutputsum[ii]/voutputsum[ii];
    tmp3=tmp3-tmp2*tmp2;

	printf("With vol weighting, at step %d, weight=%.4e, r=%.4e, var(r)=%.4e.\n",ii,
	    tmp1,tmp2,tmp3);
    }
#endif
#endif
 
 
 //   av=average(rarray,NUM_RNGS);
 //   printf("mean=%f, s.d.=%e.\n",av.mean,av.sd);

}