// Copyright (C) 2008/2011 Steven Gratton
// Guided in part by samples from the AMD CAL SDK 
// Potentially links with AMD's Timer code

#include <iostream>
#include <iomanip>
#include <string>
#include <vector>

#include <time.h>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstring>

#include "cal.h"
#include "calcl.h"

#include "matmult.h"

//#include "Timer.h"

//#define PERFCOUNT

#ifdef PERFCOUNT
#include "cal_ext.h"
#include "cal_ext_counter.h"
#endif


// Smallest IL test program kept in this file: the il_ps_2_0 header followed
// directly by ret_dyn and end. It is not referenced by any code visible in
// this file.
std::string ILcheck=
  "il_ps_2_0\n"
  "ret_dyn\n"
  "end\n";

// Second IL test program. It declares two 2D float resources (ids 0 and 1)
// and a one-element constant buffer cb0, samples both resources at the
// window coordinate into r0/r1 and issues "mad g[0],r0,r1,cb0[0]".
// Only referenced from a commented-out assignment to kernel0 in main().
std::string ILcheck2=
"il_ps_2_0\n"
"dcl_input_position_interp(linear_noperspective) vWinCoord0.xy\n"
"dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
"dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
"dcl_cb cb0[1] ; k4max, pitch of mat2 as floats \n"
"sample_resource(0)_sampler(0) r0, vWinCoord0.xy\n"
"sample_resource(1)_sampler(0) r1, vWinCoord0.xy\n"
"mad g[0],r0,r1,cb0[0]\n"
  "ret_dyn\n"
  "end\n";

// Disabled: timer object that goes with the commented-out Timer.h include.
//static CPerfCounter timer;

// File-scope using-directive; the functions below use cout/cerr/endl
// unqualified and depend on it.
using namespace std;

#ifdef PERFCOUNT
// Entry points of the CAL counter extension. They are only compiled in when
// PERFCOUNT is defined and are filled in by main() through calExtGetProc
// before any of them is called.
static PFNCALCTXCREATECOUNTER  calCtxCreateCounterExt;
static PFNCALCTXDESTROYCOUNTER calCtxDestroyCounterExt;
static PFNCALCTXBEGINCOUNTER   calCtxBeginCounterExt;
static PFNCALCTXENDCOUNTER     calCtxEndCounterExt;
static PFNCALCTXGETCOUNTER     calCtxGetCounterExt;
#endif

// Reference CPU implementation of C = A * B for row-major float matrices.
//
//   m, k, n : A is m x k, B is k x n, C is m x n
//   a, b    : input matrices (read only)
//   c       : output matrix with room for m*n floats; every element is
//             overwritten
//
// Prints a banner and the processor time (in clock() ticks) spent in the
// triple loop.
void cpumatmult(int m,int k,int n,float* a,float* b,float* c)
{
  printf("*on the cpu...*\n");

  const clock_t start=clock();
  for (int i=0;i<m;i++)
    {
      // Row i of A has k entries, row i of C has n entries; the strides
      // differ whenever k != n, so C must be indexed with n.
      const float* arow=a+static_cast<size_t>(i)*k;
      float* crow=c+static_cast<size_t>(i)*n;
      for (int j=0;j<n;j++)
        {
          float tmp=0.f;
          for (int p=0;p<k;p++)
            {
              tmp+=arow[p]*b[static_cast<size_t>(p)*n+j];
            }
          crow[j]=tmp;
        }
    }
  const clock_t elapsed=clock()-start;

  // clock_t is not guaranteed to be unsigned int; convert explicitly so the
  // format specifier matches the argument.
  printf("Internal cputime=%ld.\n",static_cast<long>(elapsed));
}

// Fills a row-major height x width matrix with 1.0f in every element.
// Nothing is written when either dimension is not positive.
void maketestmat(int height,int width,float* mat)
{
  if (height<=0 || width<=0) return;

  // The matrix is contiguous, so a single linear sweep covers all rows.
  float* cursor=mat;
  float* const stop=mat+static_cast<size_t>(height)*static_cast<size_t>(width);
  while (cursor!=stop)
    {
      *cursor=1.f;
      ++cursor;
    }
}

// Prints a row-major height x width matrix, one matrix row per output line.
// Extra spacing is inserted after every fourth column and a blank line after
// every fourth row, so the output is grouped into 4 x 4 tiles.
void dispmat(int height,int width,float* mat)
{
  std::cout << "In dispmat." << std::endl;

  const float* cell=mat;   // walks the matrix in storage order
  for (int row=1;row<=height;++row)
    {
      for (int col=1;col<=width;++col)
        {
          printf("%10.6f ",*cell);
          ++cell;
          if (col%4==0) printf("  ");
        }
      printf("\n");
      if (row%4==0) printf("\n");
    }
  printf("\n");
}

// Prints the main diagonal of a row-major height x width matrix.
// The number of entries is min(height, width). Entries are grouped in fours
// and a line break is emitted after every 32 entries.
void dispdiag(int height,int width,float* mat)
{
  const int count=(width<=height)?width:height;

  for (int d=0;d<count;++d)
    {
      const int printed=d+1;
      printf("%10.6f ",mat[d*width+d]);
      if (printed%4==0) printf("  ");
      if (printed%32==0) printf("\n");
    }
  printf("\n");
}

// Prints the leading part of the main diagonal of a row-major
// height x width matrix: at most `maximum` entries, and never more than
// min(height, width). Entries are grouped in fours with a line break after
// every 32 entries.
void dispdiagpart(int height,int width,int maximum, float* mat)
{
  int count=(width<height)?width:height;
  if (maximum<count)
    {
      count=maximum;
    }

  for (int d=0;d<count;++d)
    {
      const int printed=d+1;
      printf("%10.6f ",mat[d*width+d]);
      if (printed%4==0) printf("  ");
      if (printed%32==0) printf("\n");
    }
  printf("\n");
}

// Prints the transpose of a row-major height x width matrix: each output
// line holds one stored column, so `width` lines of `height` values are
// printed. Values are grouped in fours horizontally and vertically, as in
// dispmat.
//
// height and width describe the matrix as stored (same convention as
// dispmat). For square matrices the output is the same as before; for
// non-square ones the loop bounds now match the indexing, so no element
// outside the height x width storage is read.
void dispmattr(int height,int width,float* mat)
{
  for (int col=0;col<width;col++){
    for (int row=0;row<height;row++){
      printf("%10.6f ",mat[row*width+col]);
      if (!((row+1)&0x3)) printf("  ");
    }
    printf("\n");
    if (!((col+1)&0x3)) printf("\n");
  }
  printf("\n");
}


int copytogpu(int width,int height,float* cpumat,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  cout << "pitch=" << gpupitch << endl;
  for (int i = 0; i < height; ++i)
    {
      float* tmp = &gpuptr[i * gpupitch*4];
      for(int j = 0; j < width; ++j)
	{
	  tmp[j]=cpumat[i*width+j];
	}
    }
  calResUnmap(gpumat);
  return (int) gpupitch;
}

int zeroongpu(int width,int height,CALresource gpumat)
{
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  memset(gpuptr,0,height*gpupitch*4*sizeof(float));
  calResUnmap(gpumat);
  return (int) gpupitch;
}

void copytocpu(int width,int height,float* cpumat,CALresource gpumat)
{
  cout << "In copytocpu." << endl;
  float* gpuptr=NULL;
  CALuint gpupitch=0;
  calResMap((CALvoid**)&gpuptr, &gpupitch, gpumat, 0);
  for (int i = 0; i < height; ++i)
    {
      float* tmp = &gpuptr[i * gpupitch*4];
      for(int j = 0; j < width; ++j)
	{
	  cpumat[i*width+j]=tmp[j];
	}
    }
  calResUnmap(gpumat);
}

int main(int argc, char** argv)
{
  //  multiply an m*k matrix by a k*n one to get a m*n one
  //  where m*k means m rows and k columns
  //  i.e. an array of width k and height m
  //  All arrays are stored row major
  //  It can get a bit confusing because "graphics-oriented"
  //  calls typically write width, height
  //  whereas matrix ones do height,width...

  int m=8192,k=8192,n=8192;
  int k4=k/4;
  int n4=n/4;
  int m8=m/8;
  int k8=k/8;
  int n8=n/8;
  int n16=n/16;

  if((m%8)!=0||(k%4)!=0||(n%16)!=0) 
{
    cout << "Sorry, matrices aren't appropriately sized." << endl; 
return -1;
 };

  float* cpumat0=new float[m*k];
  float* cpumat1=new float[k*n];
  float* cpumat2=new float[m*n];

  // a "matrix" call so height, width...
  maketestmat(m,k,cpumat0);
  maketestmat(k,n,cpumat1);

 
  float* cpumat3=new float[m*n];
 // cpumatmult(m,k,n,cpumat0,cpumat1,cpumat2);
//  dispdiag(m,n,cpumat2);
  delete[] cpumat3;
 
  std::string kernel0 = fastmult;
  // kernel0=ILcheck2;

  calInit();
  CALuint numDevices = 0;
  calDeviceGetCount(&numDevices);

  cout << "Num devices =" << numDevices << endl;

  CALdevice device = 0;
  calDeviceOpen(&device, 0);

  CALdeviceinfo info;
  calDeviceGetInfo(&info, 0);

cout << "info.target=" << info.target << endl;


#ifdef PERFCOUNT
  if (calExtSupported((CALextid)CAL_EXT_COUNTERS) != CAL_RESULT_OK)
    {
      return 1;
    }
        
  if (calExtGetProc((CALextproc*)&calCtxCreateCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxCreateCounter"))
    {
      return 1;
    }

  if (calExtGetProc((CALextproc*)&calCtxDestroyCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxDestroyCounter"))
    {
      return 1;
    }
    
  if (calExtGetProc((CALextproc*)&calCtxBeginCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxBeginCounter"))
    {
      return 1;
    }
    
  if (calExtGetProc((CALextproc*)&calCtxEndCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxEndCounter"))
    {
      return 1;
    }

  if (calExtGetProc((CALextproc*)&calCtxGetCounterExt, (CALextid)CAL_EXT_COUNTERS, "calCtxGetCounter"))
    {
      return 1;
    }
#endif

  CALcontext ctx = 0;
  calCtxCreate(&ctx, device);

  CALobject obj0 = NULL;
  CALimage image0 = NULL;
  CALlanguage lang0 = CAL_LANGUAGE_IL;

#ifdef LOCALBUILD
  if (calclCompile(&obj0, lang0, kernel0.c_str(), info.target) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel0 compilation failed. Exiting.\n");
      cout << "Compile error: " << calclGetErrorString() << endl;
      return 1;
    }
  else
    {
      cout << "kernel0 compiled fine" << endl;
    };

  if (calclLink(&image0, &obj0, 1) != CAL_RESULT_OK)
    {
      fprintf(stdout, "Kernel0 linking failed. Exiting.\n");
      return 1;
    }
#endif 


#ifdef AMDIMAGE
   FILE* fp;
   fpos_t sz;
   fp=fopen("/users/sgratton/documents/lexyacc/pmm_CAYMAN.elf","rb");
   fseek(fp,0l,SEEK_END);
   fgetpos(fp,&sz);
   rewind(fp);
   char* buff=(char*) malloc(sz);
   fread(buff,1,sz,fp);
   fclose(fp);

   printf("size=%d.\n",sz);

cout << "reading amd image" << endl;

   calImageRead(&image0,buff,sz);

   printf("Error string: %s\n",calGetErrorString());
#endif

#ifdef MYIMAGE
   FILE* fp;
   fpos_t sz;
   fp=fopen("/users/sgratton/documents/calmatmult/myfmmnewaddr.elf","rb");
   fseek(fp,0l,SEEK_END);
   fgetpos(fp,&sz);
   rewind(fp);
   char* buff=(char*) malloc(sz);
   fread(buff,1,sz,fp);
   fclose(fp);

   printf("size=%d.\n",sz);

cout << "reading my image" << endl;

   calImageRead(&image0,buff,sz);

   printf("Error string: %s\n",calGetErrorString());
#endif


  // "graphics" so width, height...
  CALresource gpumat0=0;
  if(calResAllocLocal2D(&gpumat0, device,k4,m, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("gpumat0 resource allocation failed.\n");
    }
  else
    {
      cout << "gpumat0 fine." << endl;
    }


  CALresource gpumat1=0;
  if(calResAllocLocal2D(&gpumat1, device, n4,k, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("gpumat1 resource allocation failed.\n");
    }
  else
    {
      cout << "gpumat1 fine." << endl;
    }

  CALresource mat2=0;
  if(calResAllocLocal2D(&mat2, device, n4,m, 
			CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER)
     !=CAL_RESULT_OK) 
    {
      printf("mat2 resource allocation failed.\n");
	printf("%s\n",calGetErrorString());
    }
  else
    {
      cout << "mat2 fine." << endl;
    }


  CALresource const0=0;
  if(calResAllocLocal1D(&const0, device, 1, 
			CAL_FORMAT_FLOAT_4, 0)
     !=CAL_RESULT_OK) 
    {
      printf("const0 resource allocation failed.\n");
    }
  else
    {
      cout << "const0 fine." << endl;
    }

  // "graphics" so width, height...
  copytogpu(k,m,cpumat0,gpumat0);
  copytogpu(n,k,cpumat1,gpumat1);
  int pitch2=0;
  pitch2=zeroongpu(n,m,mat2);
  cout <<"pitch 2="<<pitch2<<"."<<endl;

  union {int i; float f;} tmpunion;
  tmpunion.i=pitch2;

  float* constdata=NULL;
  CALuint constpitch=0;     
  calResMap((void**)&constdata,&constpitch,const0,0);
  constdata[0]=(float) k4;
  constdata[1]=tmpunion.f;
  calResUnmap(const0);

  CALmem mat0mem=0;
  CALmem mat1mem=0;
  CALmem mat2mem=0;
  CALmem const0mem=0;

  calCtxGetMem(&mat0mem, ctx, gpumat0);
  calCtxGetMem(&mat1mem, ctx, gpumat1);
  calCtxGetMem(&mat2mem, ctx, mat2);
  calCtxGetMem(&const0mem,ctx,const0);

  cout << "After GetMem, " << calGetErrorString() << "." << endl;

  CALmodule module0 = 0;

  calModuleLoad(&module0, ctx, image0);

  cout << "After ModuleLoad, " << calGetErrorString() << "." << endl;

  CALfunc func0 = 0;

  CALname matname0=0;
  CALname matname1=0;
  CALname matname2=0;
  CALname constname0=0;

  calModuleGetEntry(&func0, ctx, module0, "main");

  cout << "After GetEntry, " << calGetErrorString() << "." << endl;


  calModuleGetName(&matname0, ctx, module0, "i0");
  calModuleGetName(&matname1, ctx, module0, "i1");
  calModuleGetName(&matname2, ctx, module0, "g[]");
  calModuleGetName(&constname0, ctx, module0, "cb0");

  cout << "After GetName, " << calGetErrorString() << "." << endl;

  calCtxSetMem(ctx, matname0, mat0mem);
  calCtxSetMem(ctx, matname1, mat1mem);
  calCtxSetMem(ctx, matname2, mat2mem);
  calCtxSetMem(ctx, constname0, const0mem);

  cout << "After SetMem, " << calGetErrorString() << "." << endl;

  
  CALevent e = 0;

 // timer.Reset();

  cout << "Just before running, " << calGetErrorString() << "." << endl;

  volatile clock_t gputime;

#ifdef PERFCOUNT

  CALcounter idleCounter;
  if (calCtxCreateCounterExt(&idleCounter, ctx, CAL_COUNTER_IDLE) != CAL_RESULT_OK)
    {
      return 1;
    }

  CALcounter cacheCounter;
  if (calCtxCreateCounterExt(&cacheCounter, ctx, CAL_COUNTER_INPUT_CACHE_HIT_RATE) != CAL_RESULT_OK)
    {
      return 1;
    }
    
  if (calCtxBeginCounterExt(ctx, idleCounter) != CAL_RESULT_OK)
    {
      return 1;
    }
    
  if (calCtxBeginCounterExt(ctx, cacheCounter) != CAL_RESULT_OK)
    {
      return 1;
    }
#endif

  // "graphics" so width, height...
  //CALdomain domain0 = {1,1,4,4};
       CALdomain domain0 = {0,0,n16,m8};
  //  CALdomain domain0 = {0,1,1,1};

  CALuint xstart=0;
  CALuint ystart=0;
 CALuint xblock=n16;
  CALuint yblock=m8;


  if ((n16%xblock)!=0||(m8%yblock)!=0) {cout << "block sizes no good." << endl; return 1;}

  //  calCtxFlush(ctx);
  //  while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;

  //  calCtxRunProgram(&e, ctx, func0, &domain0);

  //  while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;

  gputime=clock();

  //timer.Start();

  
   
  //   while(ystart<m8)
   //     {
    //      xstart=0;
     //     while(xstart<n16)
      //      {
       //       CALdomain domain1 = {xstart, ystart, xstart+xblock, ystart+yblock};
	//      calCtxRunProgram(&e, ctx, func0, &domain1);
         //     xstart+=xblock;
	      //	      cout << domain1.x << domain1.y << domain1.width << domain1.height << endl;
           // };
      //    ystart+=yblock;
	  //	  cout<< "ystart=" << ystart << endl;
      //  };
 
  
cerr << "running prog..."<< endl;

                calCtxRunProgram(&e, ctx, func0, &domain0);

      while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);e=0;

cerr << "done running prog" << endl;

// timer.Stop();

gputime=clock()-gputime;

   cout << "After kernel0, " << calGetErrorString() << endl;

#ifdef PERFCOUNT
if (calCtxEndCounterExt(ctx, idleCounter) != CAL_RESULT_OK)
  {
    return 1;
  }

if (calCtxEndCounterExt(ctx, cacheCounter) != CAL_RESULT_OK)
  {
    return 1;
  }


CALfloat idlePercentage = 0.0f;
if (calCtxGetCounterExt(&idlePercentage, ctx, idleCounter) != CAL_RESULT_OK)
  {
    return 1;
  }

CALfloat cachePercentage = 0.0f;
if (calCtxGetCounterExt(&cachePercentage, ctx, cacheCounter) != CAL_RESULT_OK)
  {
    return 1;
  }
printf("Idle percentage: %0.2f%% Cache hit rate: %0.2f%%\n", idlePercentage * 100.0f, cachePercentage * 100.0f);

if (calCtxDestroyCounterExt(ctx, idleCounter) != CAL_RESULT_OK)
  {
    return 1;
  }

if (calCtxDestroyCounterExt(ctx, cacheCounter) != CAL_RESULT_OK)
  {
    return 1;
  }

#endif

 cout << "gpu time=" << gputime << "/" << CLOCKS_PER_SEC << "=" <<gputime/(float)CLOCKS_PER_SEC <<" s." <<endl;

cout << "gflops=" << (float) CLOCKS_PER_SEC*2*m*n*k/gputime /1.e9 << endl;

 //cout << "timer time=" << timer.GetElapsedTime() <<"s." << endl;

cout << "After calculation, " << calGetErrorString() << "." << endl;

// ambiguous, so chosen "graphics"...
copytocpu(n,m,cpumat2,mat2);

cout << "After copy, " << calGetErrorString() << "." << endl;

for (int ii=0;ii<5; ii++) cout << cpumat2[ii] << "  ";
cout << endl;

// "matrix" so height, width...
//dispmat(m,n,cpumat2);
//dispdiagpart(m,n,16,cpumat2);

 cout << "After dispmat." << endl;

delete[] cpumat0;
delete[] cpumat1;
delete[] cpumat2;
  
calModuleUnload(ctx, module0);

calclFreeImage(image0);
calclFreeObject(obj0);

calCtxReleaseMem(ctx,mat0mem);
calCtxReleaseMem(ctx,mat1mem);
calCtxReleaseMem(ctx,mat2mem);
calCtxReleaseMem(ctx,const0mem);

calResFree(gpumat0);
calResFree(gpumat1);
calResFree(mat2);
calResFree(const0);

calCtxDestroy(ctx);

calDeviceClose(device);

calShutdown();

return 0;
}
