#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "utility.h"
#include "cuda/book.h"

#define MAXVERSION 5
#define MAXTHREAD 512

// Function prototypes
void processCmdLine(int argc, char ** argv, char **filename, int *v, int *n, int *m, int *t);

// Host-side sequential reduction, used as the correctness/timing baseline
int cpureduce0(int * A, int n, int t) {
    int sum = 0;
    int i;
    for (i=0; i<n; i++) {
        sum += A[i];
    }
    return sum;
}

//---------------------------------------------------------------------
// reduce version 1:
// Interleaved addressing with a divergent branch: at each step only
// every (2*stride)-th thread does an add, so threads within a warp
// diverge. (The original body is not recoverable in this copy; this is
// a sketch of the classic variant, matching how cpureduce() launches it.)
//---------------------------------------------------------------------
__global__ void reduce1(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads an element from global to shared memory before we reduce
    int tid = threadIdx.x;                           // Local index
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0) {
            s_A[tid] += s_A[tid + stride];
        }
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 2:
// Interleaved addressing with a strided index: the active threads are
// packed at low thread ids, removing the warp divergence of version 1.
// (Also a reconstructed sketch of the classic variant.)
//---------------------------------------------------------------------
__global__ void reduce2(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads an element from global to shared memory before we reduce
    int tid = threadIdx.x;                           // Local index
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        int index = 2 * stride * tid;
        if (index < blockDim.x) {
            s_A[index] += s_A[index + stride];
        }
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 3:
// Sequential addressing: each step folds the upper half of the shared
// array onto the lower half, so the active threads stay contiguous.
//---------------------------------------------------------------------
__global__ void reduce3(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads an element from global to shared memory before we reduce
    int tid = threadIdx.x;                           // Local index
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    int stride = blockDim.x/2;
    while (stride > 0) {
        if (tid < stride) {
            s_A[tid] += s_A[tid + stride];
        }
        stride = stride>>1;
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 4:
// Get twice the work done per block by having each thread start by
// loading and adding two elements from the original vector. Needs a
// matching change to the launch invocation (half as many blocks).
//---------------------------------------------------------------------
__global__ void reduce4(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads (and adds) two elements from global to shared memory
    int tid = threadIdx.x;                                  // Local index
    int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;    // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    if (i + blockDim.x < n) {
        s_A[tid] += A[i + blockDim.x];
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    int stride = blockDim.x/2;
    while (stride > 0) {
        if (tid < stride) {
            s_A[tid] += s_A[tid + stride];
        }
        stride = stride>>1;
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 5:
// Partial loop unrolling once a warp or fewer threads remain active.
//---------------------------------------------------------------------
__global__ void reduce5(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads (and adds) two elements from global to shared memory
    int tid = threadIdx.x;                                  // Local index
    int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;    // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    if (i + blockDim.x < n) {
        s_A[tid] += A[i + blockDim.x];
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory until only one warp is left active
    int stride = blockDim.x/2;
    while (stride > 32) {
        if (tid < stride) {
            s_A[tid] += s_A[tid + stride];
        }
        stride = stride>>1;
        __syncthreads();
    }

    // Last warp: unroll the remaining steps. The volatile pointer keeps the
    // compiler from caching shared-memory values in registers between the
    // unrolled adds (warp-synchronous, so no __syncthreads() in between).
    if (tid < 32) {
        volatile int *v_A = s_A;
        v_A[tid] += v_A[tid + 32];
        v_A[tid] += v_A[tid + 16];
        v_A[tid] += v_A[tid + 8];
        v_A[tid] += v_A[tid + 4];
        v_A[tid] += v_A[tid + 2];
        v_A[tid] += v_A[tid + 1];
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// Driver for the GPU reductions: copy the vector to the device, then
// repeatedly launch the selected kernel, ping-ponging between the two
// device buffers until fewer than one block's worth of partial sums
// remain; those are summed on the host.
//---------------------------------------------------------------------
int cpureduce(int *A, int n, int t, int v) {
    int numBlocks = (n + t - 1)/t;
    int result[MAXTHREAD];
    int *d_A;
    int *d_B;
    int *d_src;
    int *d_dst;
    int *d_temp;
    int sum = 0;
    int countToReduce = n;

    HANDLE_ERROR( cudaMalloc( (void **)&d_A, n * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void **)&d_B, numBlocks * sizeof(int) ) );
    HANDLE_ERROR( cudaMemcpy( d_A, A, n * sizeof(int), cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemset( d_B, 0, numBlocks*sizeof(int) ) );

    d_src = d_A;
    d_dst = d_B;

    if (v == 1) {
        while (countToReduce >= t) {
            reduce1<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce + t - 1)/t;
            d_temp = d_src;   // swap source and destination buffers
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 2) {
        while (countToReduce >= t) {
            reduce2<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 3) {
        while (countToReduce >= t) {
            reduce3<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 4) {
        numBlocks = (n/2 + t - 1)/t;   // each block now consumes 2*t elements
        while (countToReduce >= t) {
            reduce4<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce/2 + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 5) {
        numBlocks = (n/2 + t - 1)/t;   // each block now consumes 2*t elements
        while (countToReduce >= t) {
            reduce5<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce/2 + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    }

    // Copy the remaining partial sums back and finish on the host
    HANDLE_ERROR( cudaMemcpy( result, d_src, countToReduce * sizeof(int), cudaMemcpyDeviceToHost ) );

    cudaFree( d_A );
    cudaFree( d_B );

    int i;
    for (i=0; i<countToReduce; i++) {
        sum += result[i];
    }
    return sum;
}

int main(int argc, char **argv) {
    // Option variables; the defaults here are assumptions and are
    // overridden by the command line (see processCmdLine below).
    char *filename = NULL;   // -f: file of integers to reduce
    int v = 0;               // -v: reduction version (0 = host baseline)
    int n = 0;               // -n: length for random vector generation
    int m = 100;             // -m: max value for random generation (assumed)
    int t = MAXTHREAD;       // -t: threads per block
    int *A = NULL;
    int sum = 0;
    int proceed = 1;
    struct timeval cpustart, cpustop;

    processCmdLine(argc, argv, &filename, &v, &n, &m, &t);

    if (v < 0 || v > MAXVERSION) {
        fprintf(stderr, "Invalid version number\n");
        proceed = 0;
        goto errexit;
    }
    if (filename != NULL && n > 0) {
        fprintf(stderr, "Cannot specify both a filename and an n for random gen.\n");
        proceed = 0;
        goto errexit;
    }
    if (n > 0) {
        int inputok = randomintvector(n, m, &A);
        if (inputok < 0) {
            fprintf(stderr, "randomintvector failed\n");
            proceed = 0;
            goto errexit;
        }
    } else {
        int inputok = readintvector(filename, &n, &A);
        if (inputok < 0) {
            fprintf(stderr, "readintvector failed with filename: %s\n", filename);
            proceed = 0;
            goto errexit;
        }
    }

    // Check that n is sane (need at least one full block of work)
    proceed = n / t;

errexit:
    if (!proceed) {
        printf("Got the nogo signal\n");
        exit(1);
    }

    // At this point, cpu/host should have all the data and we should have
    // a viable problem to solve.
    gettimeofday(&cpustart, NULL);
    switch(v) {
        case 0: sum = cpureduce0(A, n, t); break;
        case 1: sum = cpureduce(A, n, t, 1); break;
        case 2: sum = cpureduce(A, n, t, 2); break;
        case 3: sum = cpureduce(A, n, t, 3); break;
        case 4: sum = cpureduce(A, n, t, 4); break;
        case 5: sum = cpureduce(A, n, t, 5); break;
    }
    gettimeofday(&cpustop, NULL);

    double cpuelapsed = timediff(&cpustart, &cpustop);
    printf("Delta: %f\n", cpuelapsed);
    printf("Reduction sum = %d\n", sum);

    return 0;
}

void processCmdLine(int argc, char ** argv, char ** filenameptr, int * vptr, int * nptr, int * mptr, int * tptr) {
    int c;

    while ((c=getopt(argc, argv, "f:v:n:m:t:")) != -1) {
        switch(c) {
            case 'f':
                *filenameptr = strdup(optarg);
                break;
            case 'v':
                *vptr = atoi(optarg);
                break;
            case 'n':
                *nptr = atoi(optarg);
                break;
            case 'm':
                *mptr = atoi(optarg);
                break;
            case 't':
                *tptr = atoi(optarg);
                break;
        }
    }
}

//---------------------------------------------------------------------
//---------------------------------------------------------------------
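//---------------------------------------------------------------------
// Example build and run (hypothetical file names and flags; the helpers
// from utility.h -- readintvector, randomintvector, timediff -- are
// assumed to be compiled in from a matching utility source file):
//
//   nvcc -O2 -o reduce reduce.cu utility.cu
//
//   # sum a random vector of 2^20 ints, 512 threads/block, using the
//   # warp-unrolled kernel (version 5)
//   ./reduce -n 1048576 -m 100 -t 512 -v 5
//
//   # sum a vector read from a file, using the sequential-addressing
//   # kernel (version 3)
//   ./reduce -f vector.txt -t 256 -v 3
//---------------------------------------------------------------------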