--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+static const int LOG_BANK_COUNT = 4;
+
+static inline __device__ __host__ unsigned shared_offset(unsigned i)
+{
+ return i + (i >> LOG_BANK_COUNT);
+}
+
+static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
+{
+ return shared_offset(offset * (2*i + 1) - 1);
+}
+
+static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
+{
+ return shared_offset(offset * (2*i + 2) - 1);
+}
+
+static inline __device__ __host__ unsigned lpot(unsigned x)
+{
+ --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
+}
+
+template<typename T>
+__global__ void prefix_sum_block(T *in, T *out, unsigned n)
+{
+ extern __shared__ T temp[];
+
+ int idx = threadIdx.x;
+ int blocksize = blockDim.x;
+
+ temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0;
+ temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
+
+ int offset, d;
+ for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
+ __syncthreads();
+ if (idx < d) {
+ unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+ temp[b] += temp[a];
+ }
+ }
+
+ if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
+
+ for (d = 1; d <= blocksize; d <<= 1) {
+ offset >>= 1;
+ __syncthreads();
+
+ if (idx < d) {
+ unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+ unsigned t = temp[a];
+ temp[a] = temp[b];
+ temp[b] += t;
+ }
+ }
+ __syncthreads();
+
+ if (idx < n) out[idx ] = temp[shared_offset(idx )];
+ if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
+}
+
+template<typename T>
+void prefix_sum(T *in, T *out, unsigned n)
+{
+ char *device_values;
+ unsigned n_lpot = lpot(n);
+ size_t n_pitch;
+
+ cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
+ if (error != 0) {
+ printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
+ exit(1);
+ }
+
+ cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
+
+ prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
+ ((T*)device_values, (T*)(device_values + n_pitch), n);
+
+ cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
+ cudaFree(device_values);
+}
+
+int main()
+{
+ sranddev();
+
+ static unsigned in_values[1024], out_values[1024];
+
+ for (int i = 0; i < 1024; ++i)
+ in_values[i] = rand() >> 21;
+
+ prefix_sum(in_values, out_values, 1024);
+
+ for (int i = 0; i < 1024; ++i)
+ printf("%5d => %5d\n", in_values[i], out_values[i]);
+
+ return 0;
+}
--- /dev/null
+ .version 1.4
+ .target sm_10, map_f64_to_f32
+ // compiled with /usr/local/cuda/bin/../open64/lib//be
+ // nvopencc 3.0 built on 2010-03-11
+
+ //-----------------------------------------------------------
+ // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Options:
+ //-----------------------------------------------------------
+ // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+ // -O3 (Optimization level)
+ // -g0 (Debug level)
+ // -m2 (Report advisories)
+ //-----------------------------------------------------------
+
+ .file 1 "<command-line>"
+ .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
+ .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+ .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+ .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
+ .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
+ .file 7 "/usr/local/cuda/bin/../include/device_types.h"
+ .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
+ .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
+ .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
+ .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+ .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
+ .file 13 "/usr/include/i386/_types.h"
+ .file 14 "/usr/include/time.h"
+ .file 15 "prefix-sum.cu"
+ .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
+ .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
+ .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
+ .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
+ .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
+ .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+ .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+ .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+ .file 24 "/usr/local/cuda/bin/../include/common_types.h"
+ .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+ .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+ .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+ .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+
+ .extern .shared .align 4 .b8 temp[];
+
+ .entry _Z16prefix_sum_blockIjEvPT_S1_j (
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
+ {
+ .reg .u32 %r<81>;
+ .reg .pred %p<11>;
+ .loc 15 28 0
+$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
+ ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+ cvt.s32.u16 %r2, %tid.x;
+ setp.lt.u32 %p1, %r2, %r1;
+ @!%p1 bra $Lt_0_7938;
+ .loc 15 35 0
+ ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+ mul24.lo.u32 %r4, %r2, 4;
+ add.u32 %r5, %r3, %r4;
+ ld.global.u32 %r6, [%r5+0];
+ bra.uni $Lt_0_7682;
+$Lt_0_7938:
+ mov.u32 %r6, 0;
+$Lt_0_7682:
+ mov.u32 %r7, temp;
+ shr.u32 %r8, %r2, 4;
+ add.u32 %r9, %r2, %r8;
+ mul.lo.u32 %r10, %r9, 4;
+ add.u32 %r11, %r10, %r7;
+ st.shared.u32 [%r11+0], %r6;
+ cvt.s32.u16 %r12, %ntid.x;
+ add.s32 %r13, %r12, %r2;
+ .loc 15 28 0
+ ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+ .loc 15 35 0
+ setp.lt.u32 %p2, %r13, %r1;
+ @!%p2 bra $Lt_0_8450;
+ .loc 15 36 0
+ ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+ mul.lo.u32 %r15, %r13, 4;
+ add.u32 %r16, %r14, %r15;
+ ld.global.u32 %r17, [%r16+0];
+ bra.uni $Lt_0_8194;
+$Lt_0_8450:
+ mov.u32 %r17, 0;
+$Lt_0_8194:
+ shr.u32 %r18, %r13, 4;
+ add.u32 %r19, %r13, %r18;
+ mul.lo.u32 %r20, %r19, 4;
+ add.u32 %r21, %r20, %r7;
+ st.shared.u32 [%r21+0], %r17;
+ .loc 15 39 0
+ mov.s32 %r22, %r12;
+ mov.u32 %r23, 0;
+ setp.le.s32 %p3, %r12, %r23;
+ mov.s32 %r24, 1;
+ @%p3 bra $Lt_0_13314;
+$Lt_0_9218:
+ //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
+ .loc 15 40 0
+ bar.sync 0;
+ setp.le.s32 %p4, %r22, %r2;
+ @%p4 bra $Lt_0_9474;
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+ .loc 15 43 0
+ mul24.lo.u32 %r25, %r2, 2;
+ add.u32 %r26, %r25, 1;
+ add.u32 %r27, %r25, 2;
+ mul.lo.u32 %r28, %r24, %r26;
+ mul.lo.u32 %r29, %r24, %r27;
+ sub.u32 %r30, %r29, 1;
+ shr.u32 %r31, %r30, 4;
+ add.u32 %r32, %r29, %r31;
+ mul.lo.u32 %r33, %r32, 4;
+ add.u32 %r34, %r33, %r7;
+ ld.shared.u32 %r35, [%r34+-4];
+ sub.u32 %r36, %r28, 1;
+ shr.u32 %r37, %r36, 4;
+ add.u32 %r38, %r28, %r37;
+ mul.lo.u32 %r39, %r38, 4;
+ add.u32 %r40, %r7, %r39;
+ ld.shared.u32 %r41, [%r40+-4];
+ add.u32 %r42, %r35, %r41;
+ st.shared.u32 [%r34+-4], %r42;
+$Lt_0_9474:
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+ .loc 15 39 0
+ shr.s32 %r22, %r22, 1;
+ shl.b32 %r24, %r24, 1;
+ mov.u32 %r43, 0;
+ setp.gt.s32 %p5, %r22, %r43;
+ @%p5 bra $Lt_0_9218;
+ bra.uni $Lt_0_8706;
+$Lt_0_13314:
+$Lt_0_8706:
+ mov.u32 %r44, 0;
+ setp.ne.s32 %p6, %r2, %r44;
+ @%p6 bra $Lt_0_10242;
+ .loc 15 47 0
+ mul24.lo.s32 %r45, %r12, 2;
+ mov.u32 %r46, 0;
+ sub.u32 %r47, %r45, 1;
+ shr.u32 %r48, %r47, 4;
+ add.u32 %r49, %r45, %r48;
+ mul.lo.u32 %r50, %r49, 4;
+ add.u32 %r51, %r7, %r50;
+ st.shared.u32 [%r51+-4], %r46;
+$Lt_0_10242:
+ mov.u32 %r52, 1;
+ setp.lt.s32 %p7, %r12, %r52;
+ @%p7 bra $Lt_0_10754;
+ mov.s32 %r22, 1;
+$Lt_0_11266:
+ //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
+ .loc 15 50 0
+ shr.s32 %r24, %r24, 1;
+ .loc 15 51 0
+ bar.sync 0;
+ setp.le.s32 %p8, %r22, %r2;
+ @%p8 bra $Lt_0_11522;
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+ .loc 15 55 0
+ mul24.lo.u32 %r53, %r2, 2;
+ add.u32 %r54, %r53, 1;
+ mul.lo.u32 %r55, %r24, %r54;
+ sub.u32 %r56, %r55, 1;
+ shr.u32 %r57, %r56, 4;
+ add.u32 %r58, %r55, %r57;
+ mul.lo.u32 %r59, %r58, 4;
+ add.u32 %r60, %r59, %r7;
+ ld.shared.u32 %r61, [%r60+-4];
+ .loc 15 56 0
+ add.u32 %r62, %r53, 2;
+ mul.lo.u32 %r63, %r24, %r62;
+ sub.u32 %r64, %r63, 1;
+ shr.u32 %r65, %r64, 4;
+ add.u32 %r66, %r63, %r65;
+ mul.lo.u32 %r67, %r66, 4;
+ add.u32 %r68, %r67, %r7;
+ ld.shared.u32 %r69, [%r68+-4];
+ st.shared.u32 [%r60+-4], %r69;
+ .loc 15 57 0
+ ld.shared.u32 %r70, [%r68+-4];
+ add.u32 %r71, %r70, %r61;
+ st.shared.u32 [%r68+-4], %r71;
+$Lt_0_11522:
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+ .loc 15 49 0
+ shl.b32 %r22, %r22, 1;
+ setp.le.s32 %p9, %r22, %r12;
+ @%p9 bra $Lt_0_11266;
+$Lt_0_10754:
+ .loc 15 60 0
+ bar.sync 0;
+ @!%p1 bra $Lt_0_12290;
+ .loc 15 62 0
+ ld.shared.u32 %r72, [%r11+0];
+ ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+ mul24.lo.u32 %r74, %r2, 4;
+ add.u32 %r75, %r73, %r74;
+ st.global.u32 [%r75+0], %r72;
+$Lt_0_12290:
+ @!%p2 bra $Lt_0_12802;
+ .loc 15 63 0
+ ld.shared.u32 %r76, [%r21+0];
+ ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+ mul.lo.u32 %r78, %r13, 4;
+ add.u32 %r79, %r77, %r78;
+ st.global.u32 [%r79+0], %r76;
+$Lt_0_12802:
+ .loc 15 64 0
+ exit;
+$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
+ } // _Z16prefix_sum_blockIjEvPT_S1_j
+