/*
 *  Ray Trace Mandelbulb using CUDA
 *  '0' snaps angles to 0,0
 *  arrows to rotate the world
 *  wasd   move light
 *  +/-    adjust power
 *  []     adjust iterations
 */

using namespace std;

#define GL_GLEXT_PROTOTYPES
#ifdef __APPLE__
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
#include <vector>
#include <cuda.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include "ray.h"
#include "mb.h"

//  Global variables
//  th,ph are only reported in the status line drawn by display()
int th=0;                   //  Azimuth of view angle
int ph=0;                   //  Elevation of view angle
//  Th,Ph are turned into the light direction mb.L by display()
int Th=30;                  //  Azimuth of light angle
int Ph=30;                  //  Elevation of light angle
//  Host side image: display() copies the device result here and blits it
//  with glDrawPixels as GL_RGBA bytes.  It is not allocated in this section.
unsigned char* pixels=NULL; //  Pixel array
//  Passed by value to the RayTrace kernel on every frame
MandelBulb mb;              //  Mandelbulb structure
//  Set in main() to half of the value returned by InitGPU()
int nthread;                //  Threads per block

/*
 *  Report a fatal error and terminate
 *  Takes a printf style format plus arguments, writes the formatted
 *  message to stderr and exits with status 1.  Never returns.
 */
void Fatal(const char* format , ...)
{
   //  Forward the variadic arguments to vfprintf
   va_list ap;
   va_start(ap,format);
   vfprintf(stderr,format,ap);
   va_end(ap);
   //  Terminate with failure status
   exit(1);
}

/*
 *  Initialize fastest GPU device
 *
 *  Enumerates the CUDA devices, scores each one as
 *  multiProcessorCount*clockRate, makes the highest scoring device the
 *  current device and returns its maxThreadsPerBlock.
 *
 *  verbose  nonzero prints one line per device found
 *
 *  Every CUDA failure, including failure to select the device, is reported
 *  through Fatal(), which does not return.
 */
int InitGPU(int verbose)
{
   //  Initialize CUDA
   if (cuInit(0)) Fatal("Cannot initialize CUDA\n");

   //  Get number of CUDA devices
   int num;
   if (cudaGetDeviceCount(&num)) Fatal("Cannot get number of CUDA devices\n");
   if (num<1) Fatal("No CUDA devices found\n");

   //  Get fastest device
   //  The score is only used to rank devices against each other
   cudaDeviceProp prop;
   int   MaxDevice = -1;
   int   MaxGflops = -1;
   for (int dev=0;dev<num;dev++)
   {
      if (cudaGetDeviceProperties(&prop,dev)) Fatal("Error getting device %d properties\n",dev);
      int Gflops = prop.multiProcessorCount * prop.clockRate;
      if (verbose) printf("CUDA Device %d: %s Gflops %f Processors %d Threads/Block %d\n",dev,prop.name,1e-6*Gflops,prop.multiProcessorCount,prop.maxThreadsPerBlock);
      if(Gflops > MaxGflops)
      {
         MaxGflops = Gflops;
         MaxDevice = dev;
      }
   }

   //  Print and set device
   //  prop holds the last device enumerated, so fetch the winner again
   if (cudaGetDeviceProperties(&prop,MaxDevice)) Fatal("Error getting device %d properties\n",MaxDevice);
   printf("Fastest CUDA Device %d: %s\n",MaxDevice,prop.name);
   //  Do not continue on some other device if the selection is rejected
   if (cudaSetDevice(MaxDevice)) Fatal("Cannot select CUDA device %d\n",MaxDevice);

   //  Return max thread count
   return prop.maxThreadsPerBlock;
}

//  Normalize Vec3 (device)
//  Device side copy so that the sqrtf call is compiled as device code
//  Returns Vec3(1,0,0) when v*v is exactly zero
__device__ inline Vec3 d_normalize(Vec3 v)
{
   //  v*v is stored as a float and used as the squared length
   float len2 = v*v;
   //  Nothing to scale for a zero vector
   if (len2==0) return Vec3(1,0,0);
   //  Scale by the reciprocal of the length
   return (1/sqrtf(len2))*v;
}

//  Definitions to make device code
//  DEVICE expands to __device__ and every later use of the name normalize
//  is redirected to d_normalize; neither macro is undefined afterwards, so
//  both stay in effect for the rest of this file.
//  NOTE(review): presumably mandelbulb.h declares its functions with DEVICE
//  and calls normalize, so the same header can be built for host or device
//  - confirm against mandelbulb.h
#define DEVICE __device__
#define normalize d_normalize
#include "mandelbulb.h"

/*
 *  Ray trace the scene (CUDA kernel)
 *  Expects a one dimensional launch: each thread derives a flat index k
 *  from its block and thread index and hands it to RayTracePixel.
 *  Threads whose index is not below mb.wid*mb.hgt do nothing, so the
 *  launch may contain more threads than pixels.
 */
__global__ void RayTrace(unsigned char* pix,const MandelBulb mb)
{
   //  Flat index of this thread within the whole grid
   unsigned int k = threadIdx.x + blockDim.x*blockIdx.x;
   //  Surplus threads in the last block have nothing to trace
   if (k>=mb.wid*mb.hgt) return;
   RayTracePixel(pix,k,mb);
}

/*
 *  OpenGL (GLUT) calls this routine to display the scene
 *
 *  Sets the light direction mb.L from Th,Ph, ray traces one frame on the
 *  GPU into a temporary device buffer of mb.dim bytes, copies the result
 *  into the host pixels array, then draws the image and a status line.
 *  The time shown covers allocation, kernel, copy and free.
 *  CUDA failures, including a rejected kernel launch, go through Fatal().
 */
void display()
{
   double t = 0.001*glutGet(GLUT_ELAPSED_TIME);
   //  Set light direction
   mb.L.x = Sin(Th)*Cos(Ph);
   mb.L.y =         Sin(Ph);
   mb.L.z = Cos(Th)*Cos(Ph);

   //  Allocate pixel array on device
   unsigned char* devpix;
   if (cudaMalloc((void**)&devpix,mb.dim)) Fatal("Cannot allocate device memory for pixels\n");

   //  Ray trace scene
   //  One thread per pixel, block count rounded up to cover the remainder
   int nblock = mb.wid*mb.hgt/nthread;
   if (nblock*nthread<mb.wid*mb.hgt) nblock++;
   //  A grid of zero blocks is an invalid launch, so skip an empty image
   if (nblock>0)
   {
      RayTrace<<<nblock,nthread>>>(devpix,mb);
      //  A kernel launch returns no status; a rejected launch (bad
      //  configuration, too many resources for nthread threads) must be
      //  picked up here or the uninitialized buffer would be displayed
      cudaError_t err = cudaGetLastError();
      if (err!=cudaSuccess) Fatal("Cannot launch ray trace kernel: %s\n",cudaGetErrorString(err));
   }

   //  Copy pixels from device to host
   //  Errors raised while the kernel was running are also reported here
   if (cudaMemcpy(pixels,devpix,mb.dim,cudaMemcpyDeviceToHost)) Fatal("Cannot copy pixels from device to host\n");

   //  Free device memory
   if (cudaFree(devpix)) Fatal("Cannot free device memory for pixels\n");

   //  Time ray tracing
   t = 0.001*glutGet(GLUT_ELAPSED_TIME) - t;
   //  Blit scene to screen
   glWindowPos2i(0,0);
   glDrawPixels(mb.wid,mb.hgt,GL_RGBA,GL_UNSIGNED_BYTE,pixels);
   //  Display
   glWindowPos2i(5,5);
   Print("Size %dx%d Time %.3fs Angle %d,%d Power %d MaxIter %d",mb.wid,mb.hgt,t,th,ph,mb.n,mb.maxiter);
   //  Flush
   glFlush();
   glutSwapBuffers();
}

/*
 *  Start up GLUT and tell it what to do
 *
 *  Creates the window, registers the GLUT callbacks, initializes the
 *  Mandelbulb parameters and the GPU, then enters the GLUT main loop.
 *  reshape, special, key and Init are not defined in this section.
 */
int main(int argc,char* argv[])
{
   //  Initialize GLUT and process user parameters
   glutInit(&argc,argv);
   //  Request a double buffered RGB window at 800x600 (no depth buffer)
   glutInitWindowSize(800,600);
   glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
   //  Create the window
   glutCreateWindow("Mandelbulb - GPU");
   //  Set GLUT callbacks
   glutDisplayFunc(display);
   glutReshapeFunc(reshape);
   glutSpecialFunc(special);
   glutKeyboardFunc(key);
   //  Initialize Mandelbulb
   Init(mb);
   //  Initialize CUDA
   //  Use half of the selected device's maximum threads per block
   nthread = InitGPU(1)/2;
   //  Pass control to GLUT so it can interact with the user
   glutMainLoop();
   return 0;
}
