! Copyright (C) 2010 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien alien.c-types alien.data alien.parser
-alien.strings arrays assocs byte-arrays classes.struct
+USING: accessors alien alien.data alien.parser alien.strings
+alien.syntax arrays assocs byte-arrays classes.struct
combinators continuations cuda.ffi destructors fry io
io.backend io.encodings.string io.encodings.utf8 kernel lexer
-locals math math.parser namespaces opengl.gl.extensions
-prettyprint quotations sequences ;
+locals macros math math.parser namespaces nested-comments
+opengl.gl.extensions parser prettyprint quotations sequences
+words ;
+QUALIFIED-WITH: alien.c-types a
IN: cuda
SYMBOL: cuda-device
SYMBOL: cuda-launcher
SYMBOL: cuda-memory-hashtable
+SYMBOL: cuda-libraries
+cuda-libraries [ H{ } clone ] initialize
+
+SYMBOL: cuda-functions
+
+TUPLE: cuda-library name path ;
+
+: <cuda-library> ( name path -- obj )
+ \ cuda-library new
+ swap >>path
+ swap >>name ;
+
+: add-cuda-library ( name path -- )
+ normalize-path <cuda-library>
+ dup name>> cuda-libraries get set-at ;
+
+: cuda-library ( name -- cuda-library )
+ cuda-libraries get at ;
+
ERROR: throw-cuda-error n ;
: cuda-error ( n -- )
dup CUDA_SUCCESS = [ drop ] [ throw-cuda-error ] if ;
: cuda-version ( -- n )
- int <c-object> [ cuDriverGetVersion cuda-error ] keep *int ;
+ a:int <c-object> [ cuDriverGetVersion cuda-error ] keep a:*int ;
: init-cuda ( -- )
0 cuInit cuda-error ;
TUPLE: launcher
{ device integer initial: 0 }
{ device-flags initial: 0 }
-path block-shape shared-size grid ;
+path ;
+
+TUPLE: function-launcher
+dim-block
+dim-grid
+shared-size
+stream ;
: with-cuda-context ( flags device quot -- )
+ H{ } clone cuda-functions set
[
[ CUcontext <c-object> ] 2dip
- [ cuCtxCreate cuda-error ] 3keep 2drop *void*
+ [ cuCtxCreate cuda-error ] 3keep 2drop a:*void*
] dip
[ '[ _ @ ] ]
[ drop '[ _ cuCtxDestroy cuda-error ] ] 2bi
[
normalize-path
[ CUmodule <c-object> ] dip
- [ cuModuleLoad cuda-error ] 2keep drop *void*
+ [ cuModuleLoad cuda-error ] 2keep drop a:*void*
] dip
[ '[ _ @ ] ]
[ drop '[ _ cuModuleUnload cuda-error ] ] 2bi
<PRIVATE
: #cuda-devices ( -- n )
- int <c-object> [ cuDeviceGetCount cuda-error ] keep *int ;
+ a:int <c-object> [ cuDeviceGetCount cuda-error ] keep a:*int ;
: n>cuda-device ( n -- device )
- [ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ;
+ [ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop a:*int ;
: enumerate-cuda-devices ( -- devices )
#cuda-devices iota [ n>cuda-device ] map ;
[ 2drop utf8 alien>string ] 3bi ;
: cuda-device-capability ( n -- pair )
- [ int <c-object> int <c-object> ] dip
+ [ a:int <c-object> a:int <c-object> ] dip
[ cuDeviceComputeCapability cuda-error ]
- [ drop [ *int ] bi@ ] 3bi 2array ;
+ [ drop [ a:*int ] bi@ ] 3bi 2array ;
: cuda-device-memory ( n -- bytes )
- [ uint <c-object> ] dip
+ [ a:uint <c-object> ] dip
[ cuDeviceTotalMem cuda-error ]
- [ drop *uint ] 2bi ;
+ [ drop a:*uint ] 2bi ;
-: get-cuda-function* ( module string -- function )
+: get-function-ptr* ( module string -- function )
[ CUfunction <c-object> ] 2dip
- [ cuModuleGetFunction cuda-error ] 3keep 2drop *void* ;
+ [ cuModuleGetFunction cuda-error ] 3keep 2drop a:*void* ;
-: get-cuda-function ( string -- function )
- [ cuda-module get ] dip get-cuda-function* ;
+: get-function-ptr ( string -- function )
+ [ cuda-module get ] dip get-function-ptr* ;
: with-cuda-function ( string quot -- )
[
- get-cuda-function cuda-function set
+ get-function-ptr* cuda-function set
] dip call ; inline
+: cached-cuda-function ( string -- alien )
+ cuda-functions get [ get-function-ptr ] cache ;
+
: launch-function* ( function -- ) cuLaunch cuda-error ;
: launch-function ( -- ) cuda-function get cuLaunch cuda-error ;
: cuda-malloc ( n -- ptr )
[ CUdeviceptr <c-object> ] dip
[ cuMemAlloc cuda-error ] 2keep
- [ *int ] dip <cuda-memory> add-cuda-memory ;
+ [ a:*int ] dip <cuda-memory> add-cuda-memory ;
: cuda-free* ( ptr -- )
cuMemFree cuda-error ;
offset param-size ;
: cuda-device-attribute ( attribute dev -- n )
- [ int <c-object> ] 2dip
+ [ a:int <c-object> ] 2dip
[ cuDeviceGetAttribute cuda-error ]
- [ 2drop *int ] 3bi ;
+ [ 2drop a:*int ] 3bi ;
: function-block-shape* ( function x y z -- )
cuFuncSetBlockShape cuda-error ;
"CUDA Version: " write cuda-version number>string print nl
#cuda-devices iota [ nl ] [ cuda-device. ] interleave ;
+: c-type>cuda-setter ( c-type -- n cuda-type )
+ {
+ { [ dup a:int = ] [ drop 4 [ cuda-int* ] ] }
+ { [ dup a:uint = ] [ drop 4 [ cuda-int* ] ] }
+ { [ dup a:float = ] [ drop 4 [ cuda-float* ] ] }
+ { [ dup a:pointer? ] [ drop 4 [ ptr>> cuda-int* ] ] }
+ { [ dup a:void* = ] [ drop 4 [ ptr>> cuda-int* ] ] }
+ } cond ;
+
+: run-function-launcher ( function-launcher function -- )
+ swap
+ {
+ [ dim-block>> first3 function-block-shape* ]
+ [ shared-size>> function-shared-size* ]
+ [
+ dim-grid>> [
+ launch-function*
+ ] [
+ first2 launch-function-grid*
+ ] if-empty
+ ]
+ } 2cleave ;
+
+: cuda-argument-setter ( offset c-type -- offset' quot )
+ c-type>cuda-setter
+ [ over [ + ] dip ] dip
+ '[ swap _ swap _ call ] ;
+
+MACRO: cuda-arguments ( c-types -- quot: ( args... function -- ) )
+ [ 0 ] dip [ cuda-argument-setter ] map reverse
+ swap '[ _ param-size* ] suffix
+ '[ _ cleave ] ;
-: test-cuda0 ( -- )
- T{ launcher
- { path "vocab:cuda/hello.ptx" }
- { block-shape { 6 6 6 } }
- { shared-size 2 }
- { grid { 2 6 } }
- } [
- "helloWorld" [
- "Hello World!" [ - ] map-index
- malloc-device-string &dispose
-
- [ 1array set-parameters ]
- [ drop launch ]
- [ device>host utf8 alien>string . ] tri
- ] with-cuda-function
- ] with-cuda ;
+: define-cuda-word ( word string arguments -- )
+ [
+ '[
+ _ get-function-ptr
+ [ nip _ cuda-arguments ]
+ [ run-function-launcher ] 2bi
+ ]
+ ]
+ [ nip \ function-launcher suffix a:void function-effect ]
+ 2bi define-declared ;
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types alien.strings cuda cuda.syntax destructors
+io.encodings.utf8 kernel locals math prettyprint sequences ;
+IN: cuda.hello-world
+
+CUDA-LIBRARY: hello vocab:cuda/hello.ptx
+
+CUDA-FUNCTION: helloWorld ( char* string-ptr ) ;
+
+:: cuda-hello-world ( -- )
+ T{ launcher
+ { device 0 }
+ { path "vocab:cuda/hello.ptx" }
+ } [
+ "Hello World!" [ - ] map-index malloc-device-string &dispose dup :> str
+
+ T{ function-launcher
+ { dim-block { 6 1 1 } }
+ { dim-grid { 2 1 } }
+ { shared-size 0 }
+ }
+ helloWorld
+
+ ! <<< { 6 1 1 } { 2 1 } 1 >>> helloWorld
+
+ str device>host utf8 alien>string .
+ ] with-cuda ;
+
+MAIN: cuda-hello-world
--- /dev/null
+/*
+ World using CUDA
+**
+** The string "Hello World!" is mangled then restored using a common CUDA idiom
+**
+** Byron Galbraith
+** 2009-02-18
+*/
+#include <cuda.h>
+#include <stdio.h>
+
+// Prototypes
+extern "C" __global__ void helloWorld(char*);
+
+// Host function
+int
+main(int argc, char** argv)
+{
+ int i;
+
+ // desired output
+ char str[] = "Hello World!";
+
+ // mangle contents of output
+ // the null character is left intact for simplicity
+ for(i = 0; i < 12; i++)
+ str[i] -= i;
+
+ // allocate memory on the device
+ char *d_str;
+ size_t size = sizeof(str);
+ cudaMalloc((void**)&d_str, size);
+
+ // copy the string to the device
+ cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
+
+ // set the grid and block sizes
+ dim3 dimGrid(2); // one block per word
+ dim3 dimBlock(6); // one thread per character
+
+ // invoke the kernel
+ helloWorld<<< dimGrid, dimBlock >>>(d_str);
+
+ // retrieve the results from the device
+ cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
+
+ // free up the allocated memory on the device
+ cudaFree(d_str);
+
+ // everyone's favorite part
+ printf("%s\n", str);
+
+ return 0;
+}
+
+// Device kernel
+__global__ void
+helloWorld(char* str)
+{
+ // determine where in the thread grid we are
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // unmangle output
+ str[idx] += idx;
+}
--- /dev/null
+ .version 1.4
+ .target sm_10, map_f64_to_f32
+ // compiled with /usr/local/cuda/bin/../open64/lib//be
+ // nvopencc 3.0 built on 2010-03-11
+
+ //-----------------------------------------------------------
+ // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Options:
+ //-----------------------------------------------------------
+ // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+ // -O3 (Optimization level)
+ // -g0 (Debug level)
+ // -m2 (Report advisories)
+ //-----------------------------------------------------------
+
+ .file 1 "<command-line>"
+ .file 2 "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
+ .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+ .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+ .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
+ .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
+ .file 7 "/usr/local/cuda/bin/../include/device_types.h"
+ .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
+ .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
+ .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
+ .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+ .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
+ .file 13 "/usr/include/i386/_types.h"
+ .file 14 "/usr/include/time.h"
+ .file 15 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+ .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
+ .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
+ .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
+ .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
+ .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
+ .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+ .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+ .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+ .file 24 "/usr/local/cuda/bin/../include/common_types.h"
+ .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+ .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+ .file 27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+ .file 28 "hello.cu"
+
+
+ .entry helloWorld (
+ .param .u32 __cudaparm_helloWorld_str)
+ {
+ .reg .u16 %rh<4>;
+ .reg .u32 %r<9>;
+ .loc 28 58 0
+$LBB1_helloWorld:
+ .loc 28 64 0
+ mov.u16 %rh1, %ctaid.x;
+ mov.u16 %rh2, %ntid.x;
+ mul.wide.u16 %r1, %rh1, %rh2;
+ cvt.u32.u16 %r2, %tid.x;
+ add.u32 %r3, %r2, %r1;
+ ld.param.u32 %r4, [__cudaparm_helloWorld_str];
+ add.u32 %r5, %r4, %r3;
+ ld.global.s8 %r6, [%r5+0];
+ add.s32 %r7, %r6, %r3;
+ st.global.s8 [%r5+0], %r7;
+ .loc 28 65 0
+ exit;
+$LDWend_helloWorld:
+ } // helloWorld
+
--- /dev/null
+Doug Coleman
+Joe Groff
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+static const int LOG_BANK_COUNT = 4;
+
+static inline __device__ __host__ unsigned shared_offset(unsigned i)
+{
+ return i + (i >> LOG_BANK_COUNT);
+}
+
+static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
+{
+ return shared_offset(offset * (2*i + 1) - 1);
+}
+
+static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
+{
+ return shared_offset(offset * (2*i + 2) - 1);
+}
+
+static inline __device__ __host__ unsigned lpot(unsigned x)
+{
+ --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
+}
+
+template<typename T>
+__global__ void prefix_sum_block(T *in, T *out, unsigned n)
+{
+ extern __shared__ T temp[];
+
+ int idx = threadIdx.x;
+ int blocksize = blockDim.x;
+
+ temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0;
+ temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
+
+ int offset, d;
+ for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
+ __syncthreads();
+ if (idx < d) {
+ unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+ temp[b] += temp[a];
+ }
+ }
+
+ if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
+
+ for (d = 1; d <= blocksize; d <<= 1) {
+ offset >>= 1;
+ __syncthreads();
+
+ if (idx < d) {
+ unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+ unsigned t = temp[a];
+ temp[a] = temp[b];
+ temp[b] += t;
+ }
+ }
+ __syncthreads();
+
+ if (idx < n) out[idx ] = temp[shared_offset(idx )];
+ if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
+}
+
+template<typename T>
+void prefix_sum(T *in, T *out, unsigned n)
+{
+ char *device_values;
+ unsigned n_lpot = lpot(n);
+ size_t n_pitch;
+
+ cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
+ if (error != 0) {
+ printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
+ exit(1);
+ }
+
+ cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
+
+ prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
+ ((T*)device_values, (T*)(device_values + n_pitch), n);
+
+ cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
+ cudaFree(device_values);
+}
+
+int main()
+{
+ sranddev();
+
+ static unsigned in_values[1024], out_values[1024];
+
+ for (int i = 0; i < 1024; ++i)
+ in_values[i] = rand() >> 21;
+
+ prefix_sum(in_values, out_values, 1024);
+
+ for (int i = 0; i < 1024; ++i)
+ printf("%5d => %5d\n", in_values[i], out_values[i]);
+
+ return 0;
+}
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types cuda cuda.syntax locals ;
+IN: cuda.demos.prefix-sum
+
+CUDA-LIBRARY: prefix-sum vocab:cuda/demos/prefix-sum/prefix-sum.ptx
+
+CUDA-FUNCTION: prefix_sum_block ( uint* in, uint* out, uint n ) ;
+
+:: cuda-prefix-sum ( -- )
+ T{ launcher
+ { device 0 }
+ { path "vocab:cuda/demos/prefix-sum/prefix-sum.ptx" }
+ } [
+
+
+ ! { 1 1 1 } { 2 1 } 0 3<<< prefix_sum_block
+
+ ] with-cuda ;
+
+MAIN: cuda-prefix-sum
--- /dev/null
+ .version 1.4
+ .target sm_10, map_f64_to_f32
+ // compiled with /usr/local/cuda/bin/../open64/lib//be
+ // nvopencc 3.0 built on 2010-03-11
+
+ //-----------------------------------------------------------
+ // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Options:
+ //-----------------------------------------------------------
+ // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+ // -O3 (Optimization level)
+ // -g0 (Debug level)
+ // -m2 (Report advisories)
+ //-----------------------------------------------------------
+
+ .file 1 "<command-line>"
+ .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
+ .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+ .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+ .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
+ .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
+ .file 7 "/usr/local/cuda/bin/../include/device_types.h"
+ .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
+ .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
+ .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
+ .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+ .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
+ .file 13 "/usr/include/i386/_types.h"
+ .file 14 "/usr/include/time.h"
+ .file 15 "prefix-sum.cu"
+ .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
+ .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
+ .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
+ .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
+ .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
+ .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+ .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+ .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+ .file 24 "/usr/local/cuda/bin/../include/common_types.h"
+ .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+ .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+ .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+ .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+
+ .extern .shared .align 4 .b8 temp[];
+
+ .entry _Z16prefix_sum_blockIjEvPT_S1_j (
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
+ {
+ .reg .u32 %r<81>;
+ .reg .pred %p<11>;
+ .loc 15 28 0
+$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
+ ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+ cvt.s32.u16 %r2, %tid.x;
+ setp.lt.u32 %p1, %r2, %r1;
+ @!%p1 bra $Lt_0_7938;
+ .loc 15 35 0
+ ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+ mul24.lo.u32 %r4, %r2, 4;
+ add.u32 %r5, %r3, %r4;
+ ld.global.u32 %r6, [%r5+0];
+ bra.uni $Lt_0_7682;
+$Lt_0_7938:
+ mov.u32 %r6, 0;
+$Lt_0_7682:
+ mov.u32 %r7, temp;
+ shr.u32 %r8, %r2, 4;
+ add.u32 %r9, %r2, %r8;
+ mul.lo.u32 %r10, %r9, 4;
+ add.u32 %r11, %r10, %r7;
+ st.shared.u32 [%r11+0], %r6;
+ cvt.s32.u16 %r12, %ntid.x;
+ add.s32 %r13, %r12, %r2;
+ .loc 15 28 0
+ ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+ .loc 15 35 0
+ setp.lt.u32 %p2, %r13, %r1;
+ @!%p2 bra $Lt_0_8450;
+ .loc 15 36 0
+ ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+ mul.lo.u32 %r15, %r13, 4;
+ add.u32 %r16, %r14, %r15;
+ ld.global.u32 %r17, [%r16+0];
+ bra.uni $Lt_0_8194;
+$Lt_0_8450:
+ mov.u32 %r17, 0;
+$Lt_0_8194:
+ shr.u32 %r18, %r13, 4;
+ add.u32 %r19, %r13, %r18;
+ mul.lo.u32 %r20, %r19, 4;
+ add.u32 %r21, %r20, %r7;
+ st.shared.u32 [%r21+0], %r17;
+ .loc 15 39 0
+ mov.s32 %r22, %r12;
+ mov.u32 %r23, 0;
+ setp.le.s32 %p3, %r12, %r23;
+ mov.s32 %r24, 1;
+ @%p3 bra $Lt_0_13314;
+$Lt_0_9218:
+ //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
+ .loc 15 40 0
+ bar.sync 0;
+ setp.le.s32 %p4, %r22, %r2;
+ @%p4 bra $Lt_0_9474;
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+ .loc 15 43 0
+ mul24.lo.u32 %r25, %r2, 2;
+ add.u32 %r26, %r25, 1;
+ add.u32 %r27, %r25, 2;
+ mul.lo.u32 %r28, %r24, %r26;
+ mul.lo.u32 %r29, %r24, %r27;
+ sub.u32 %r30, %r29, 1;
+ shr.u32 %r31, %r30, 4;
+ add.u32 %r32, %r29, %r31;
+ mul.lo.u32 %r33, %r32, 4;
+ add.u32 %r34, %r33, %r7;
+ ld.shared.u32 %r35, [%r34+-4];
+ sub.u32 %r36, %r28, 1;
+ shr.u32 %r37, %r36, 4;
+ add.u32 %r38, %r28, %r37;
+ mul.lo.u32 %r39, %r38, 4;
+ add.u32 %r40, %r7, %r39;
+ ld.shared.u32 %r41, [%r40+-4];
+ add.u32 %r42, %r35, %r41;
+ st.shared.u32 [%r34+-4], %r42;
+$Lt_0_9474:
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+ .loc 15 39 0
+ shr.s32 %r22, %r22, 1;
+ shl.b32 %r24, %r24, 1;
+ mov.u32 %r43, 0;
+ setp.gt.s32 %p5, %r22, %r43;
+ @%p5 bra $Lt_0_9218;
+ bra.uni $Lt_0_8706;
+$Lt_0_13314:
+$Lt_0_8706:
+ mov.u32 %r44, 0;
+ setp.ne.s32 %p6, %r2, %r44;
+ @%p6 bra $Lt_0_10242;
+ .loc 15 47 0
+ mul24.lo.s32 %r45, %r12, 2;
+ mov.u32 %r46, 0;
+ sub.u32 %r47, %r45, 1;
+ shr.u32 %r48, %r47, 4;
+ add.u32 %r49, %r45, %r48;
+ mul.lo.u32 %r50, %r49, 4;
+ add.u32 %r51, %r7, %r50;
+ st.shared.u32 [%r51+-4], %r46;
+$Lt_0_10242:
+ mov.u32 %r52, 1;
+ setp.lt.s32 %p7, %r12, %r52;
+ @%p7 bra $Lt_0_10754;
+ mov.s32 %r22, 1;
+$Lt_0_11266:
+ //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
+ .loc 15 50 0
+ shr.s32 %r24, %r24, 1;
+ .loc 15 51 0
+ bar.sync 0;
+ setp.le.s32 %p8, %r22, %r2;
+ @%p8 bra $Lt_0_11522;
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+ .loc 15 55 0
+ mul24.lo.u32 %r53, %r2, 2;
+ add.u32 %r54, %r53, 1;
+ mul.lo.u32 %r55, %r24, %r54;
+ sub.u32 %r56, %r55, 1;
+ shr.u32 %r57, %r56, 4;
+ add.u32 %r58, %r55, %r57;
+ mul.lo.u32 %r59, %r58, 4;
+ add.u32 %r60, %r59, %r7;
+ ld.shared.u32 %r61, [%r60+-4];
+ .loc 15 56 0
+ add.u32 %r62, %r53, 2;
+ mul.lo.u32 %r63, %r24, %r62;
+ sub.u32 %r64, %r63, 1;
+ shr.u32 %r65, %r64, 4;
+ add.u32 %r66, %r63, %r65;
+ mul.lo.u32 %r67, %r66, 4;
+ add.u32 %r68, %r67, %r7;
+ ld.shared.u32 %r69, [%r68+-4];
+ st.shared.u32 [%r60+-4], %r69;
+ .loc 15 57 0
+ ld.shared.u32 %r70, [%r68+-4];
+ add.u32 %r71, %r70, %r61;
+ st.shared.u32 [%r68+-4], %r71;
+$Lt_0_11522:
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+ .loc 15 49 0
+ shl.b32 %r22, %r22, 1;
+ setp.le.s32 %p9, %r22, %r12;
+ @%p9 bra $Lt_0_11266;
+$Lt_0_10754:
+ .loc 15 60 0
+ bar.sync 0;
+ @!%p1 bra $Lt_0_12290;
+ .loc 15 62 0
+ ld.shared.u32 %r72, [%r11+0];
+ ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+ mul24.lo.u32 %r74, %r2, 4;
+ add.u32 %r75, %r73, %r74;
+ st.global.u32 [%r75+0], %r72;
+$Lt_0_12290:
+ @!%p2 bra $Lt_0_12802;
+ .loc 15 63 0
+ ld.shared.u32 %r76, [%r21+0];
+ ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+ mul.lo.u32 %r78, %r13, 4;
+ add.u32 %r79, %r77, %r78;
+ st.global.u32 [%r79+0], %r76;
+$Lt_0_12802:
+ .loc 15 64 0
+ exit;
+$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
+ } // _Z16prefix_sum_blockIjEvPT_S1_j
+
+++ /dev/null
-/*
- World using CUDA
-**
-** The string "Hello World!" is mangled then restored using a common CUDA idiom
-**
-** Byron Galbraith
-** 2009-02-18
-*/
-#include <cuda.h>
-#include <stdio.h>
-
-// Prototypes
-extern "C" __global__ void helloWorld(char*);
-
-// Host function
-int
-main(int argc, char** argv)
-{
- int i;
-
- // desired output
- char str[] = "Hello World!";
-
- // mangle contents of output
- // the null character is left intact for simplicity
- for(i = 0; i < 12; i++)
- str[i] -= i;
-
- // allocate memory on the device
- char *d_str;
- size_t size = sizeof(str);
- cudaMalloc((void**)&d_str, size);
-
- // copy the string to the device
- cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
-
- // set the grid and block sizes
- dim3 dimGrid(2); // one block per word
- dim3 dimBlock(6); // one thread per character
-
- // invoke the kernel
- helloWorld<<< dimGrid, dimBlock >>>(d_str);
-
- // retrieve the results from the device
- cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
-
- // free up the allocated memory on the device
- cudaFree(d_str);
-
- // everyone's favorite part
- printf("%s\n", str);
-
- return 0;
-}
-
-// Device kernel
-__global__ void
-helloWorld(char* str)
-{
- // determine where in the thread grid we are
- int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
- // unmangle output
- str[idx] += idx;
-}
+++ /dev/null
- .version 1.4
- .target sm_10, map_f64_to_f32
- // compiled with /usr/local/cuda/bin/../open64/lib//be
- // nvopencc 3.0 built on 2010-03-11
-
- //-----------------------------------------------------------
- // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Options:
- //-----------------------------------------------------------
- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
- // -O3 (Optimization level)
- // -g0 (Debug level)
- // -m2 (Report advisories)
- //-----------------------------------------------------------
-
- .file 1 "<command-line>"
- .file 2 "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
- .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
- .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
- .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
- .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
- .file 7 "/usr/local/cuda/bin/../include/device_types.h"
- .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
- .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
- .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
- .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
- .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
- .file 13 "/usr/include/i386/_types.h"
- .file 14 "/usr/include/time.h"
- .file 15 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
- .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
- .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
- .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
- .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
- .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
- .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
- .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
- .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
- .file 24 "/usr/local/cuda/bin/../include/common_types.h"
- .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
- .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
- .file 27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
- .file 28 "hello.cu"
-
-
- .entry helloWorld (
- .param .u32 __cudaparm_helloWorld_str)
- {
- .reg .u16 %rh<4>;
- .reg .u32 %r<9>;
- .loc 28 58 0
-$LBB1_helloWorld:
- .loc 28 64 0
- mov.u16 %rh1, %ctaid.x;
- mov.u16 %rh2, %ntid.x;
- mul.wide.u16 %r1, %rh1, %rh2;
- cvt.u32.u16 %r2, %tid.x;
- add.u32 %r3, %r2, %r1;
- ld.param.u32 %r4, [__cudaparm_helloWorld_str];
- add.u32 %r5, %r4, %r3;
- ld.global.s8 %r6, [%r5+0];
- add.s32 %r7, %r6, %r3;
- st.global.s8 [%r5+0], %r7;
- .loc 28 65 0
- exit;
-$LDWend_helloWorld:
- } // helloWorld
-
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <cuda_runtime.h>
-
-static const int LOG_BANK_COUNT = 4;
-
-static inline __device__ __host__ unsigned shared_offset(unsigned i)
-{
- return i + (i >> LOG_BANK_COUNT);
-}
-
-static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
-{
- return shared_offset(offset * (2*i + 1) - 1);
-}
-
-static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
-{
- return shared_offset(offset * (2*i + 2) - 1);
-}
-
-static inline __device__ __host__ unsigned lpot(unsigned x)
-{
- --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
-}
-
-template<typename T>
-__global__ void prefix_sum_block(T *in, T *out, unsigned n)
-{
- extern __shared__ T temp[];
-
- int idx = threadIdx.x;
- int blocksize = blockDim.x;
-
- temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0;
- temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
-
- int offset, d;
- for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
- __syncthreads();
- if (idx < d) {
- unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
- temp[b] += temp[a];
- }
- }
-
- if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
-
- for (d = 1; d <= blocksize; d <<= 1) {
- offset >>= 1;
- __syncthreads();
-
- if (idx < d) {
- unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
- unsigned t = temp[a];
- temp[a] = temp[b];
- temp[b] += t;
- }
- }
- __syncthreads();
-
- if (idx < n) out[idx ] = temp[shared_offset(idx )];
- if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
-}
-
-template<typename T>
-void prefix_sum(T *in, T *out, unsigned n)
-{
- char *device_values;
- unsigned n_lpot = lpot(n);
- size_t n_pitch;
-
- cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
- if (error != 0) {
- printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
- exit(1);
- }
-
- cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
-
- prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
- ((T*)device_values, (T*)(device_values + n_pitch), n);
-
- cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
- cudaFree(device_values);
-}
-
-int main()
-{
- sranddev();
-
- static unsigned in_values[1024], out_values[1024];
-
- for (int i = 0; i < 1024; ++i)
- in_values[i] = rand() >> 21;
-
- prefix_sum(in_values, out_values, 1024);
-
- for (int i = 0; i < 1024; ++i)
- printf("%5d => %5d\n", in_values[i], out_values[i]);
-
- return 0;
-}
+++ /dev/null
- .version 1.4
- .target sm_10, map_f64_to_f32
- // compiled with /usr/local/cuda/bin/../open64/lib//be
- // nvopencc 3.0 built on 2010-03-11
-
- //-----------------------------------------------------------
- // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Options:
- //-----------------------------------------------------------
- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
- // -O3 (Optimization level)
- // -g0 (Debug level)
- // -m2 (Report advisories)
- //-----------------------------------------------------------
-
- .file 1 "<command-line>"
- .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
- .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
- .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
- .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
- .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
- .file 7 "/usr/local/cuda/bin/../include/device_types.h"
- .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
- .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
- .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
- .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
- .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
- .file 13 "/usr/include/i386/_types.h"
- .file 14 "/usr/include/time.h"
- .file 15 "prefix-sum.cu"
- .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
- .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
- .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
- .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
- .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
- .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
- .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
- .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
- .file 24 "/usr/local/cuda/bin/../include/common_types.h"
- .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
- .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
- .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
- .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
-
- .extern .shared .align 4 .b8 temp[];
-
- .entry _Z16prefix_sum_blockIjEvPT_S1_j (
- .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
- .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
- .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
- {
- .reg .u32 %r<81>;
- .reg .pred %p<11>;
- .loc 15 28 0
-$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
- ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
- cvt.s32.u16 %r2, %tid.x;
- setp.lt.u32 %p1, %r2, %r1;
- @!%p1 bra $Lt_0_7938;
- .loc 15 35 0
- ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
- mul24.lo.u32 %r4, %r2, 4;
- add.u32 %r5, %r3, %r4;
- ld.global.u32 %r6, [%r5+0];
- bra.uni $Lt_0_7682;
-$Lt_0_7938:
- mov.u32 %r6, 0;
-$Lt_0_7682:
- mov.u32 %r7, temp;
- shr.u32 %r8, %r2, 4;
- add.u32 %r9, %r2, %r8;
- mul.lo.u32 %r10, %r9, 4;
- add.u32 %r11, %r10, %r7;
- st.shared.u32 [%r11+0], %r6;
- cvt.s32.u16 %r12, %ntid.x;
- add.s32 %r13, %r12, %r2;
- .loc 15 28 0
- ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
- .loc 15 35 0
- setp.lt.u32 %p2, %r13, %r1;
- @!%p2 bra $Lt_0_8450;
- .loc 15 36 0
- ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
- mul.lo.u32 %r15, %r13, 4;
- add.u32 %r16, %r14, %r15;
- ld.global.u32 %r17, [%r16+0];
- bra.uni $Lt_0_8194;
-$Lt_0_8450:
- mov.u32 %r17, 0;
-$Lt_0_8194:
- shr.u32 %r18, %r13, 4;
- add.u32 %r19, %r13, %r18;
- mul.lo.u32 %r20, %r19, 4;
- add.u32 %r21, %r20, %r7;
- st.shared.u32 [%r21+0], %r17;
- .loc 15 39 0
- mov.s32 %r22, %r12;
- mov.u32 %r23, 0;
- setp.le.s32 %p3, %r12, %r23;
- mov.s32 %r24, 1;
- @%p3 bra $Lt_0_13314;
-$Lt_0_9218:
- //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
- .loc 15 40 0
- bar.sync 0;
- setp.le.s32 %p4, %r22, %r2;
- @%p4 bra $Lt_0_9474;
- //<loop> Part of loop body line 39, head labeled $Lt_0_9218
- .loc 15 43 0
- mul24.lo.u32 %r25, %r2, 2;
- add.u32 %r26, %r25, 1;
- add.u32 %r27, %r25, 2;
- mul.lo.u32 %r28, %r24, %r26;
- mul.lo.u32 %r29, %r24, %r27;
- sub.u32 %r30, %r29, 1;
- shr.u32 %r31, %r30, 4;
- add.u32 %r32, %r29, %r31;
- mul.lo.u32 %r33, %r32, 4;
- add.u32 %r34, %r33, %r7;
- ld.shared.u32 %r35, [%r34+-4];
- sub.u32 %r36, %r28, 1;
- shr.u32 %r37, %r36, 4;
- add.u32 %r38, %r28, %r37;
- mul.lo.u32 %r39, %r38, 4;
- add.u32 %r40, %r7, %r39;
- ld.shared.u32 %r41, [%r40+-4];
- add.u32 %r42, %r35, %r41;
- st.shared.u32 [%r34+-4], %r42;
-$Lt_0_9474:
- //<loop> Part of loop body line 39, head labeled $Lt_0_9218
- .loc 15 39 0
- shr.s32 %r22, %r22, 1;
- shl.b32 %r24, %r24, 1;
- mov.u32 %r43, 0;
- setp.gt.s32 %p5, %r22, %r43;
- @%p5 bra $Lt_0_9218;
- bra.uni $Lt_0_8706;
-$Lt_0_13314:
-$Lt_0_8706:
- mov.u32 %r44, 0;
- setp.ne.s32 %p6, %r2, %r44;
- @%p6 bra $Lt_0_10242;
- .loc 15 47 0
- mul24.lo.s32 %r45, %r12, 2;
- mov.u32 %r46, 0;
- sub.u32 %r47, %r45, 1;
- shr.u32 %r48, %r47, 4;
- add.u32 %r49, %r45, %r48;
- mul.lo.u32 %r50, %r49, 4;
- add.u32 %r51, %r7, %r50;
- st.shared.u32 [%r51+-4], %r46;
-$Lt_0_10242:
- mov.u32 %r52, 1;
- setp.lt.s32 %p7, %r12, %r52;
- @%p7 bra $Lt_0_10754;
- mov.s32 %r22, 1;
-$Lt_0_11266:
- //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
- .loc 15 50 0
- shr.s32 %r24, %r24, 1;
- .loc 15 51 0
- bar.sync 0;
- setp.le.s32 %p8, %r22, %r2;
- @%p8 bra $Lt_0_11522;
- //<loop> Part of loop body line 47, head labeled $Lt_0_11266
- .loc 15 55 0
- mul24.lo.u32 %r53, %r2, 2;
- add.u32 %r54, %r53, 1;
- mul.lo.u32 %r55, %r24, %r54;
- sub.u32 %r56, %r55, 1;
- shr.u32 %r57, %r56, 4;
- add.u32 %r58, %r55, %r57;
- mul.lo.u32 %r59, %r58, 4;
- add.u32 %r60, %r59, %r7;
- ld.shared.u32 %r61, [%r60+-4];
- .loc 15 56 0
- add.u32 %r62, %r53, 2;
- mul.lo.u32 %r63, %r24, %r62;
- sub.u32 %r64, %r63, 1;
- shr.u32 %r65, %r64, 4;
- add.u32 %r66, %r63, %r65;
- mul.lo.u32 %r67, %r66, 4;
- add.u32 %r68, %r67, %r7;
- ld.shared.u32 %r69, [%r68+-4];
- st.shared.u32 [%r60+-4], %r69;
- .loc 15 57 0
- ld.shared.u32 %r70, [%r68+-4];
- add.u32 %r71, %r70, %r61;
- st.shared.u32 [%r68+-4], %r71;
-$Lt_0_11522:
- //<loop> Part of loop body line 47, head labeled $Lt_0_11266
- .loc 15 49 0
- shl.b32 %r22, %r22, 1;
- setp.le.s32 %p9, %r22, %r12;
- @%p9 bra $Lt_0_11266;
-$Lt_0_10754:
- .loc 15 60 0
- bar.sync 0;
- @!%p1 bra $Lt_0_12290;
- .loc 15 62 0
- ld.shared.u32 %r72, [%r11+0];
- ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
- mul24.lo.u32 %r74, %r2, 4;
- add.u32 %r75, %r73, %r74;
- st.global.u32 [%r75+0], %r72;
-$Lt_0_12290:
- @!%p2 bra $Lt_0_12802;
- .loc 15 63 0
- ld.shared.u32 %r76, [%r21+0];
- ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
- mul.lo.u32 %r78, %r13, 4;
- add.u32 %r79, %r77, %r78;
- st.global.u32 [%r79+0], %r76;
-$Lt_0_12802:
- .loc 15 64 0
- exit;
-$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
- } // _Z16prefix_sum_blockIjEvPT_S1_j
-
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.parser cuda kernel lexer parser ;
+IN: cuda.syntax
+
+SYNTAX: CUDA-LIBRARY: scan scan add-cuda-library ;
+
+SYNTAX: CUDA-FUNCTION:
+ scan [ create-in ] [ ] bi ";" scan-c-args drop define-cuda-word ;
+
+: 3<<< ( dim-block dim-grid shared-size -- function-launcher )
+ f function-launcher boa ;
+
+: 4<<< ( dim-block dim-grid shared-size stream -- function-launcher )
+ function-launcher boa ;