extra/cuda/prefix-sum.cu

   1 #include <stdio.h>
   2 #include <stdlib.h>
   3 #include <cuda_runtime.h>
   4
   5 static const int LOG_BANK_COUNT = 4;
   6
   7 static inline __device__ __host__ unsigned shared_offset(unsigned i)
   8 {
   9     return i + (i >> LOG_BANK_COUNT);
  10 }
  11
  12 static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
  13 {
  14     return shared_offset(offset * (2*i + 1) - 1);
  15 }
  16
  17 static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
  18 {
  19     return shared_offset(offset * (2*i + 2) - 1);
  20 }
  21
  22 static inline __device__ __host__ unsigned lpot(unsigned x)
  23 {
  24     --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
  25 }
  26
  27 template<typename T>
  28 __global__ void prefix_sum_block(T *in, T *out, unsigned n)
  29 {
  30     extern __shared__ T temp[];
  31
  32     int idx = threadIdx.x;
  33     int blocksize = blockDim.x;
  34
  35     temp[shared_offset(idx            )] = (idx             < n) ? in[idx            ] : 0;
  36     temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
  37
  38     int offset, d;
  39     for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
  40         __syncthreads();
  41         if (idx < d) {
  42             unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
  43             temp[b] += temp[a];
  44         }
  45     }
  46
  47     if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
  48
  49     for (d = 1; d <= blocksize; d <<= 1) {
  50         offset >>= 1;
  51         __syncthreads();
  52
  53         if (idx < d) {
  54             unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
  55             unsigned t = temp[a];
  56             temp[a] = temp[b];
  57             temp[b] += t;
  58         }
  59     }
  60     __syncthreads();
  61
  62     if (idx             < n) out[idx            ] = temp[shared_offset(idx            )];
  63     if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
  64 }
  65
  66 template<typename T>
  67 void prefix_sum(T *in, T *out, unsigned n)
  68 {
  69     char *device_values;
  70     unsigned n_lpot = lpot(n);
  71     size_t n_pitch;
  72
  73     cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
  74     if (error != 0) {
  75         printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
  76         exit(1);
  77     }
  78
  79     cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
  80
  81     prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
  82         ((T*)device_values, (T*)(device_values + n_pitch), n);
  83
  84     cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
  85     cudaFree(device_values);
  86 }
  87
  88 int main()
  89 {
  90     sranddev();
  91
  92     static unsigned in_values[1024], out_values[1024];
  93
  94     for (int i = 0; i < 1024; ++i)
  95         in_values[i] = rand() >> 21;
  96
  97     prefix_sum(in_values, out_values, 1024);
  98
  99     for (int i = 0; i < 1024; ++i)
 100         printf("%5d => %5d\n", in_values[i], out_values[i]);
 101
 102     return 0;
 103 }