#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "utility.h"
#include "cuda/book.h"

#define MAXVERSION 5
#define MAXTHREAD 512

// Function prototypes
void processCmdLine(int argc, char ** argv, char **filename, int *v, int *n, int *m, int *t);

// Host-side sequential reduction, used as the correctness/timing baseline
int cpureduce0(int * A, int n, int t) {
    int sum = 0;
    int i;
    for (i=0; i<n; i++) {
        sum += A[i];
    }
    return sum;
}

//---------------------------------------------------------------------
// reduce version 1:
// Interleaved addressing with a divergent branch: at each step only
// every (2*stride)-th thread does an add, so threads within a warp
// diverge. (The original body is not recoverable in this copy; this is
// a sketch of the classic variant, matching how cpureduce() launches it.)
//---------------------------------------------------------------------
__global__ void reduce1(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads an element from global to shared memory before we reduce
    int tid = threadIdx.x;                           // Local index
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0) {
            s_A[tid] += s_A[tid + stride];
        }
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 2:
// Interleaved addressing with a strided index: the active threads are
// packed at low thread ids, removing the warp divergence of version 1.
// (Also a reconstructed sketch of the classic variant.)
//---------------------------------------------------------------------
__global__ void reduce2(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads an element from global to shared memory before we reduce
    int tid = threadIdx.x;                           // Local index
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        int index = 2 * stride * tid;
        if (index < blockDim.x) {
            s_A[index] += s_A[index + stride];
        }
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 3:
// Sequential addressing: each step folds the upper half of the shared
// array onto the lower half, so the active threads stay contiguous.
//---------------------------------------------------------------------
__global__ void reduce3(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads an element from global to shared memory before we reduce
    int tid = threadIdx.x;                           // Local index
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    int stride = blockDim.x/2;
    while (stride > 0) {
        if (tid < stride) {
            s_A[tid] += s_A[tid + stride];
        }
        stride = stride>>1;
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 4:
// Get twice the work done per block by having each thread start by
// loading and adding two elements from the original vector. Needs a
// matching change to the launch invocation (half as many blocks).
//---------------------------------------------------------------------
__global__ void reduce4(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads (and adds) two elements from global to shared memory
    int tid = threadIdx.x;                                  // Local index
    int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;    // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    if (i + blockDim.x < n) {
        s_A[tid] += A[i + blockDim.x];
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory
    int stride = blockDim.x/2;
    while (stride > 0) {
        if (tid < stride) {
            s_A[tid] += s_A[tid + stride];
        }
        stride = stride>>1;
        __syncthreads();
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// reduce version 5:
// Partial loop unrolling once a warp or fewer threads remain active.
//---------------------------------------------------------------------
__global__ void reduce5(int *A, int n, int *result) {
    __shared__ int s_A[MAXTHREAD];   // Declare block-shared memory region

    // Each thread loads (and adds) two elements from global to shared memory
    int tid = threadIdx.x;                                  // Local index
    int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;    // Global index
    if (i < n) {
        s_A[tid] = A[i];
    } else {
        s_A[tid] = 0;
    }
    if (i + blockDim.x < n) {
        s_A[tid] += A[i + blockDim.x];
    }
    __syncthreads();   // barrier before the reduction tree

    // Now reduce in shared memory until only one warp is left active
    int stride = blockDim.x/2;
    while (stride > 32) {
        if (tid < stride) {
            s_A[tid] += s_A[tid + stride];
        }
        stride = stride>>1;
        __syncthreads();
    }

    // Last warp: unroll the remaining steps. The volatile pointer keeps the
    // compiler from caching shared-memory values in registers between the
    // unrolled adds (warp-synchronous, so no __syncthreads() in between).
    if (tid < 32) {
        volatile int *v_A = s_A;
        v_A[tid] += v_A[tid + 32];
        v_A[tid] += v_A[tid + 16];
        v_A[tid] += v_A[tid + 8];
        v_A[tid] += v_A[tid + 4];
        v_A[tid] += v_A[tid + 2];
        v_A[tid] += v_A[tid + 1];
    }

    // Copy back block result to appropriate place in destination
    if (tid==0) result[blockIdx.x] = s_A[0];
}

//---------------------------------------------------------------------
// Driver for the GPU reductions: copy the vector to the device, then
// repeatedly launch the selected kernel, ping-ponging between the two
// device buffers until fewer than one block's worth of partial sums
// remain; those are summed on the host.
//---------------------------------------------------------------------
int cpureduce(int *A, int n, int t, int v) {
    int numBlocks = (n + t - 1)/t;
    int result[MAXTHREAD];
    int *d_A;
    int *d_B;
    int *d_src;
    int *d_dst;
    int *d_temp;
    int sum = 0;
    int countToReduce = n;

    HANDLE_ERROR( cudaMalloc( (void **)&d_A, n * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void **)&d_B, numBlocks * sizeof(int) ) );
    HANDLE_ERROR( cudaMemcpy( d_A, A, n * sizeof(int), cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemset( d_B, 0, numBlocks*sizeof(int) ) );

    d_src = d_A;
    d_dst = d_B;

    if (v == 1) {
        while (countToReduce >= t) {
            reduce1<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce + t - 1)/t;
            d_temp = d_src;   // swap source and destination buffers
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 2) {
        while (countToReduce >= t) {
            reduce2<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 3) {
        while (countToReduce >= t) {
            reduce3<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 4) {
        numBlocks = (n/2 + t - 1)/t;   // each block now consumes 2*t elements
        while (countToReduce >= t) {
            reduce4<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce/2 + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    } else if (v == 5) {
        numBlocks = (n/2 + t - 1)/t;   // each block now consumes 2*t elements
        while (countToReduce >= t) {
            reduce5<<<numBlocks, t>>>(d_src, countToReduce, d_dst);
            countToReduce = numBlocks;
            numBlocks = (countToReduce/2 + t - 1)/t;
            d_temp = d_src;
            d_src = d_dst;
            d_dst = d_temp;
        }
    }

    // Copy the remaining partial sums back and finish on the host
    HANDLE_ERROR( cudaMemcpy( result, d_src, countToReduce * sizeof(int), cudaMemcpyDeviceToHost ) );

    cudaFree( d_A );
    cudaFree( d_B );

    int i;
    for (i=0; i<countToReduce; i++) {
        sum += result[i];
    }
    return sum;
}

int main(int argc, char **argv) {
    // Option variables; the defaults here are assumptions and are
    // overridden by the command line (see processCmdLine below).
    char *filename = NULL;   // -f: file of integers to reduce
    int v = 0;               // -v: reduction version (0 = host baseline)
    int n = 0;               // -n: length for random vector generation
    int m = 100;             // -m: max value for random generation (assumed)
    int t = MAXTHREAD;       // -t: threads per block
    int *A = NULL;
    int sum = 0;
    int proceed = 1;
    struct timeval cpustart, cpustop;

    processCmdLine(argc, argv, &filename, &v, &n, &m, &t);

    if (v < 0 || v > MAXVERSION) {
        fprintf(stderr, "Invalid version number\n");
        proceed = 0;
        goto errexit;
    }
    if (filename != NULL && n > 0) {
        fprintf(stderr, "Cannot specify both a filename and an n for random gen.\n");
        proceed = 0;
        goto errexit;
    }
    if (n > 0) {
        int inputok = randomintvector(n, m, &A);
        if (inputok < 0) {
            fprintf(stderr, "randomintvector failed\n");
            proceed = 0;
            goto errexit;
        }
    } else {
        int inputok = readintvector(filename, &n, &A);
        if (inputok < 0) {
            fprintf(stderr, "readintvector failed with filename: %s\n", filename);
            proceed = 0;
            goto errexit;
        }
    }

    // Check that n is sane (need at least one full block of work)
    proceed = n / t;

errexit:
    if (!proceed) {
        printf("Got the nogo signal\n");
        exit(1);
    }

    // At this point, cpu/host should have all the data and we should have
    // a viable problem to solve.
    gettimeofday(&cpustart, NULL);
    switch(v) {
        case 0: sum = cpureduce0(A, n, t); break;
        case 1: sum = cpureduce(A, n, t, 1); break;
        case 2: sum = cpureduce(A, n, t, 2); break;
        case 3: sum = cpureduce(A, n, t, 3); break;
        case 4: sum = cpureduce(A, n, t, 4); break;
        case 5: sum = cpureduce(A, n, t, 5); break;
    }
    gettimeofday(&cpustop, NULL);

    double cpuelapsed = timediff(&cpustart, &cpustop);
    printf("Delta: %f\n", cpuelapsed);
    printf("Reduction sum = %d\n", sum);

    return 0;
}

void processCmdLine(int argc, char ** argv, char ** filenameptr, int * vptr, int * nptr, int * mptr, int * tptr) {
    int c;

    while ((c=getopt(argc, argv, "f:v:n:m:t:")) != -1) {
        switch(c) {
            case 'f':
                *filenameptr = strdup(optarg);
                break;
            case 'v':
                *vptr = atoi(optarg);
                break;
            case 'n':
                *nptr = atoi(optarg);
                break;
            case 'm':
                *mptr = atoi(optarg);
                break;
            case 't':
                *tptr = atoi(optarg);
                break;
        }
    }
}

//---------------------------------------------------------------------
//---------------------------------------------------------------------
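//---------------------------------------------------------------------
// Example build and run (hypothetical file names and flags; the helpers
// from utility.h -- readintvector, randomintvector, timediff -- are
// assumed to be compiled in from a matching utility source file):
//
//   nvcc -O2 -o reduce reduce.cu utility.cu
//
//   # sum a random vector of 2^20 ints, 512 threads/block, using the
//   # warp-unrolled kernel (version 5)
//   ./reduce -n 1048576 -m 100 -t 512 -v 5
//
//   # sum a vector read from a file, using the sequential-addressing
//   # kernel (version 3)
//   ./reduce -f vector.txt -t 256 -v 3
//---------------------------------------------------------------------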