]> gitweb.factorcode.org Git - factor.git/commitdiff
CUDA-FUNCTION: works, splitting up CUDA into more vocabs
authorDoug Coleman <doug.coleman@gmail.com>
Sun, 18 Apr 2010 23:33:18 +0000 (18:33 -0500)
committerDoug Coleman <doug.coleman@gmail.com>
Sun, 18 Apr 2010 23:34:46 +0000 (18:34 -0500)
15 files changed:
extra/cuda/cuda.factor
extra/cuda/demos/hello-world/authors.txt [new file with mode: 0644]
extra/cuda/demos/hello-world/hello-world.factor [new file with mode: 0644]
extra/cuda/demos/hello-world/hello.cu [new file with mode: 0644]
extra/cuda/demos/hello-world/hello.ptx [new file with mode: 0644]
extra/cuda/demos/prefix-sum/authors.txt [new file with mode: 0644]
extra/cuda/demos/prefix-sum/prefix-sum.cu [new file with mode: 0644]
extra/cuda/demos/prefix-sum/prefix-sum.factor [new file with mode: 0644]
extra/cuda/demos/prefix-sum/prefix-sum.ptx [new file with mode: 0644]
extra/cuda/hello.cu [deleted file]
extra/cuda/hello.ptx [deleted file]
extra/cuda/prefix-sum.cu [deleted file]
extra/cuda/prefix-sum.ptx [deleted file]
extra/cuda/syntax/authors.txt [new file with mode: 0644]
extra/cuda/syntax/syntax.factor [new file with mode: 0644]

index 6b343fb1ccdca99498ad421d2ab818f782e7106a..d8b6f2e2ce3fa4f390c7bb5242cac17a139cbc4c 100644 (file)
@@ -1,11 +1,13 @@
 ! Copyright (C) 2010 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien alien.c-types alien.data alien.parser
-alien.strings arrays assocs byte-arrays classes.struct
+USING: accessors alien alien.data alien.parser alien.strings
+alien.syntax arrays assocs byte-arrays classes.struct
 combinators continuations cuda.ffi destructors fry io
 io.backend io.encodings.string io.encodings.utf8 kernel lexer
-locals math math.parser namespaces opengl.gl.extensions
-prettyprint quotations sequences ;
+locals macros math math.parser namespaces nested-comments
+opengl.gl.extensions parser prettyprint quotations sequences
+words ;
+QUALIFIED-WITH: alien.c-types a
 IN: cuda
 
 SYMBOL: cuda-device
@@ -15,13 +17,32 @@ SYMBOL: cuda-function
 SYMBOL: cuda-launcher
 SYMBOL: cuda-memory-hashtable
 
+SYMBOL: cuda-libraries
+cuda-libraries [ H{ } clone ] initialize
+
+SYMBOL: cuda-functions
+
+TUPLE: cuda-library name path ;
+
+: <cuda-library> ( name path -- obj )
+    \ cuda-library new
+        swap >>path
+        swap >>name ;
+
+: add-cuda-library ( name path -- )
+    normalize-path <cuda-library>
+    dup name>> cuda-libraries get set-at ;
+
+: cuda-library ( name -- cuda-library )
+    cuda-libraries get at ;
+
 ERROR: throw-cuda-error n ;
 
 : cuda-error ( n -- )
     dup CUDA_SUCCESS = [ drop ] [ throw-cuda-error ] if ;
 
 : cuda-version ( -- n )
-    int <c-object> [ cuDriverGetVersion cuda-error ] keep *int ;
+    a:int <c-object> [ cuDriverGetVersion cuda-error ] keep a:*int ;
 
 : init-cuda ( -- )
     0 cuInit cuda-error ;
@@ -29,12 +50,19 @@ ERROR: throw-cuda-error n ;
 TUPLE: launcher
 { device integer initial: 0 }
 { device-flags initial: 0 }
-path block-shape shared-size grid ;
+path ;
+
+TUPLE: function-launcher
+dim-block
+dim-grid
+shared-size
+stream ;
 
 : with-cuda-context ( flags device quot -- )
+    H{ } clone cuda-functions set
     [
         [ CUcontext <c-object> ] 2dip
-        [ cuCtxCreate cuda-error ] 3keep 2drop *void*
+        [ cuCtxCreate cuda-error ] 3keep 2drop a:*void*
     ] dip 
     [ '[ _ @ ] ]
     [ drop '[ _ cuCtxDestroy cuda-error ] ] 2bi
@@ -44,7 +72,7 @@ path block-shape shared-size grid ;
     [
         normalize-path
         [ CUmodule <c-object> ] dip
-        [ cuModuleLoad cuda-error ] 2keep drop *void*
+        [ cuModuleLoad cuda-error ] 2keep drop a:*void*
     ] dip
     [ '[ _ @ ] ]
     [ drop '[ _ cuModuleUnload cuda-error ] ] 2bi
@@ -74,10 +102,10 @@ path block-shape shared-size grid ;
 <PRIVATE
 
 : #cuda-devices ( -- n )
-    int <c-object> [ cuDeviceGetCount cuda-error ] keep *int ;
+    a:int <c-object> [ cuDeviceGetCount cuda-error ] keep a:*int ;
 
 : n>cuda-device ( n -- device )
-    [ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ;
+    [ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop a:*int ;
 
 : enumerate-cuda-devices ( -- devices )
     #cuda-devices iota [ n>cuda-device ] map ;
@@ -98,27 +126,30 @@ PRIVATE>
     [ 2drop utf8 alien>string ] 3bi ;
 
 : cuda-device-capability ( n -- pair )
-    [ int <c-object> int <c-object> ] dip
+    [ a:int <c-object> a:int <c-object> ] dip
     [ cuDeviceComputeCapability cuda-error ]
-    [ drop [ *int ] bi@ ] 3bi 2array ;
+    [ drop [ a:*int ] bi@ ] 3bi 2array ;
 
 : cuda-device-memory ( n -- bytes )
-    [ uint <c-object> ] dip
+    [ a:uint <c-object> ] dip
     [ cuDeviceTotalMem cuda-error ]
-    [ drop *uint ] 2bi ;
+    [ drop a:*uint ] 2bi ;
 
-: get-cuda-function* ( module string -- function )
+: get-function-ptr* ( module string -- function )
     [ CUfunction <c-object> ] 2dip
-    [ cuModuleGetFunction cuda-error ] 3keep 2drop *void* ;
+    [ cuModuleGetFunction cuda-error ] 3keep 2drop a:*void* ;
 
-: get-cuda-function ( string -- function )
-    [ cuda-module get ] dip get-cuda-function* ;
+: get-function-ptr ( string -- function )
+    [ cuda-module get ] dip get-function-ptr* ;
 
 : with-cuda-function ( string quot -- )
     [
-        get-cuda-function cuda-function set
+        get-function-ptr* cuda-function set
     ] dip call ; inline
 
+: cached-cuda-function ( string -- alien )
+    cuda-functions get [ get-function-ptr ] cache ;
+
 : launch-function* ( function -- ) cuLaunch cuda-error ;
 
 : launch-function ( -- ) cuda-function get cuLaunch cuda-error ;
@@ -157,7 +188,7 @@ M: cuda-memory byte-length length>> ;
 : cuda-malloc ( n -- ptr )
     [ CUdeviceptr <c-object> ] dip
     [ cuMemAlloc cuda-error ] 2keep
-    [ *int ] dip <cuda-memory> add-cuda-memory ;
+    [ a:*int ] dip <cuda-memory> add-cuda-memory ;
 
 : cuda-free* ( ptr -- )
     cuMemFree cuda-error ;
@@ -237,9 +268,9 @@ ERROR: bad-cuda-parameter parameter ;
     offset param-size ;
 
 : cuda-device-attribute ( attribute dev -- n )
-    [ int <c-object> ] 2dip
+    [ a:int <c-object> ] 2dip
     [ cuDeviceGetAttribute cuda-error ]
-    [ 2drop *int ] 3bi ;
+    [ 2drop a:*int ] 3bi ;
 
 : function-block-shape* ( function x y z -- )
     cuFuncSetBlockShape cuda-error ;
@@ -289,20 +320,46 @@ ERROR: bad-cuda-parameter parameter ;
     "CUDA Version: " write cuda-version number>string print nl
     #cuda-devices iota [ nl ] [ cuda-device. ] interleave ;
 
+: c-type>cuda-setter ( c-type -- n cuda-type )
+    {
+        { [ dup a:int = ] [ drop 4 [ cuda-int* ] ] }
+        { [ dup a:uint = ] [ drop 4 [ cuda-int* ] ] }
+        { [ dup a:float = ] [ drop 4 [ cuda-float* ] ] }
+        { [ dup a:pointer? ] [ drop 4 [ ptr>> cuda-int* ] ] }
+        { [ dup a:void* = ] [ drop 4 [ ptr>> cuda-int* ] ] }
+    } cond ;
+
+: run-function-launcher ( function-launcher function -- )
+    swap
+    {
+        [ dim-block>> first3 function-block-shape* ]
+        [ shared-size>> function-shared-size* ]
+        [
+            dim-grid>> [
+                launch-function*
+            ] [
+                first2 launch-function-grid*
+            ] if-empty
+        ]
+    } 2cleave ;
+
+: cuda-argument-setter ( offset c-type -- offset' quot )
+    c-type>cuda-setter
+    [ over [ + ] dip ] dip
+    '[ swap _ swap _ call ] ;
+
+MACRO: cuda-arguments ( c-types -- quot: ( args... function -- ) )
+    [ 0 ] dip [ cuda-argument-setter ] map reverse
+    swap '[ _ param-size* ] suffix
+    '[ _ cleave ] ;
 
-: test-cuda0 ( -- )
-    T{ launcher
-        { path "vocab:cuda/hello.ptx" }
-        { block-shape { 6 6 6 } }
-        { shared-size 2 }
-        { grid { 2 6 } }
-    } [
-        "helloWorld" [
-            "Hello World!" [ - ] map-index
-            malloc-device-string &dispose
-
-            [ 1array set-parameters ]
-            [ drop launch ]
-            [ device>host utf8 alien>string . ] tri
-        ] with-cuda-function
-    ] with-cuda ;
+: define-cuda-word ( word string arguments -- )
+    [
+        '[
+            _ get-function-ptr
+            [ nip _ cuda-arguments ]
+            [ run-function-launcher ] 2bi
+        ]
+    ]
+    [ nip \ function-launcher suffix a:void function-effect ]
+    2bi define-declared ;
diff --git a/extra/cuda/demos/hello-world/authors.txt b/extra/cuda/demos/hello-world/authors.txt
new file mode 100644 (file)
index 0000000..7c1b2f2
--- /dev/null
@@ -0,0 +1 @@
+Doug Coleman
diff --git a/extra/cuda/demos/hello-world/hello-world.factor b/extra/cuda/demos/hello-world/hello-world.factor
new file mode 100644 (file)
index 0000000..6a598dd
--- /dev/null
@@ -0,0 +1,30 @@
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types alien.strings cuda cuda.syntax destructors
+io.encodings.utf8 kernel locals math prettyprint sequences ;
+IN: cuda.hello-world
+
+CUDA-LIBRARY: hello vocab:cuda/hello.ptx
+
+CUDA-FUNCTION: helloWorld ( char* string-ptr ) ;
+
+:: cuda-hello-world ( -- )
+    T{ launcher
+        { device 0 }
+        { path "vocab:cuda/hello.ptx" }
+    } [
+        "Hello World!" [ - ] map-index malloc-device-string &dispose dup :> str
+
+        T{ function-launcher
+            { dim-block { 6 1 1 } }
+            { dim-grid { 2 1 } }
+            { shared-size 0 }
+        }
+        helloWorld
+
+        ! <<< { 6 1 1 } { 2 1 } 1 >>> helloWorld
+
+        str device>host utf8 alien>string .
+    ] with-cuda ;
+
+MAIN: cuda-hello-world
diff --git a/extra/cuda/demos/hello-world/hello.cu b/extra/cuda/demos/hello-world/hello.cu
new file mode 100644 (file)
index 0000000..1f3cd67
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ World using CUDA
+** 
+** The string "Hello World!" is mangled then restored using a common CUDA idiom
+**
+** Byron Galbraith
+** 2009-02-18
+*/
+#include <cuda.h>
+#include <stdio.h>
+
+// Prototypes
+extern "C" __global__ void helloWorld(char*);
+
+// Host function
+int
+main(int argc, char** argv)
+{
+  int i;
+
+  // desired output
+  char str[] = "Hello World!";
+
+  // mangle contents of output
+  // the null character is left intact for simplicity
+  for(i = 0; i < 12; i++)
+    str[i] -= i;
+
+  // allocate memory on the device 
+  char *d_str;
+  size_t size = sizeof(str);
+  cudaMalloc((void**)&d_str, size);
+
+  // copy the string to the device
+  cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
+
+  // set the grid and block sizes
+  dim3 dimGrid(2);   // one block per word  
+  dim3 dimBlock(6); // one thread per character
+  
+  // invoke the kernel
+  helloWorld<<< dimGrid, dimBlock >>>(d_str);
+
+  // retrieve the results from the device
+  cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
+
+  // free up the allocated memory on the device
+  cudaFree(d_str);
+  
+  // everyone's favorite part
+  printf("%s\n", str);
+
+  return 0;
+}
+
+// Device kernel
+__global__ void
+helloWorld(char* str)
+{
+  // determine where in the thread grid we are
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // unmangle output
+  str[idx] += idx;
+}
diff --git a/extra/cuda/demos/hello-world/hello.ptx b/extra/cuda/demos/hello-world/hello.ptx
new file mode 100644 (file)
index 0000000..049bb5e
--- /dev/null
@@ -0,0 +1,71 @@
+       .version 1.4
+       .target sm_10, map_f64_to_f32
+       // compiled with /usr/local/cuda/bin/../open64/lib//be
+       // nvopencc 3.0 built on 2010-03-11
+
+       //-----------------------------------------------------------
+       // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
+       //-----------------------------------------------------------
+
+       //-----------------------------------------------------------
+       // Options:
+       //-----------------------------------------------------------
+       //  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+       //  -O3 (Optimization level)
+       //  -g0 (Debug level)
+       //  -m2 (Report advisories)
+       //-----------------------------------------------------------
+
+       .file   1       "<command-line>"
+       .file   2       "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
+       .file   3       "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+       .file   4       "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+       .file   5       "/usr/local/cuda/bin/../include/host_defines.h"
+       .file   6       "/usr/local/cuda/bin/../include/builtin_types.h"
+       .file   7       "/usr/local/cuda/bin/../include/device_types.h"
+       .file   8       "/usr/local/cuda/bin/../include/driver_types.h"
+       .file   9       "/usr/local/cuda/bin/../include/texture_types.h"
+       .file   10      "/usr/local/cuda/bin/../include/vector_types.h"
+       .file   11      "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+       .file   12      "/usr/local/cuda/bin/../include/crt/storage_class.h"
+       .file   13      "/usr/include/i386/_types.h"
+       .file   14      "/usr/include/time.h"
+       .file   15      "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+       .file   16      "/usr/local/cuda/bin/../include/common_functions.h"
+       .file   17      "/usr/local/cuda/bin/../include/crt/func_macro.h"
+       .file   18      "/usr/local/cuda/bin/../include/math_functions.h"
+       .file   19      "/usr/local/cuda/bin/../include/device_functions.h"
+       .file   20      "/usr/local/cuda/bin/../include/math_constants.h"
+       .file   21      "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+       .file   22      "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+       .file   23      "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+       .file   24      "/usr/local/cuda/bin/../include/common_types.h"
+       .file   25      "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+       .file   26      "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+       .file   27      "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+       .file   28      "hello.cu"
+
+
+       .entry helloWorld (
+               .param .u32 __cudaparm_helloWorld_str)
+       {
+       .reg .u16 %rh<4>;
+       .reg .u32 %r<9>;
+       .loc    28      58      0
+$LBB1_helloWorld:
+       .loc    28      64      0
+       mov.u16         %rh1, %ctaid.x;
+       mov.u16         %rh2, %ntid.x;
+       mul.wide.u16    %r1, %rh1, %rh2;
+       cvt.u32.u16     %r2, %tid.x;
+       add.u32         %r3, %r2, %r1;
+       ld.param.u32    %r4, [__cudaparm_helloWorld_str];
+       add.u32         %r5, %r4, %r3;
+       ld.global.s8    %r6, [%r5+0];
+       add.s32         %r7, %r6, %r3;
+       st.global.s8    [%r5+0], %r7;
+       .loc    28      65      0
+       exit;
+$LDWend_helloWorld:
+       } // helloWorld
+
diff --git a/extra/cuda/demos/prefix-sum/authors.txt b/extra/cuda/demos/prefix-sum/authors.txt
new file mode 100644 (file)
index 0000000..2d6d456
--- /dev/null
@@ -0,0 +1,2 @@
+Doug Coleman
+Joe Groff
diff --git a/extra/cuda/demos/prefix-sum/prefix-sum.cu b/extra/cuda/demos/prefix-sum/prefix-sum.cu
new file mode 100644 (file)
index 0000000..a77a67f
--- /dev/null
@@ -0,0 +1,103 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+static const int LOG_BANK_COUNT = 4;
+
+static inline __device__ __host__ unsigned shared_offset(unsigned i)
+{
+    return i + (i >> LOG_BANK_COUNT);
+}
+
+static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
+{
+    return shared_offset(offset * (2*i + 1) - 1);
+}
+
+static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
+{
+    return shared_offset(offset * (2*i + 2) - 1);
+}
+
+static inline __device__ __host__ unsigned lpot(unsigned x)
+{
+    --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
+}
+
+template<typename T>
+__global__ void prefix_sum_block(T *in, T *out, unsigned n)
+{
+    extern __shared__ T temp[];
+
+    int idx = threadIdx.x;
+    int blocksize = blockDim.x;
+
+    temp[shared_offset(idx            )] = (idx             < n) ? in[idx            ] : 0;
+    temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
+
+    int offset, d;
+    for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
+        __syncthreads();
+        if (idx < d) {
+            unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+            temp[b] += temp[a];
+        }
+    }
+
+    if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
+
+    for (d = 1; d <= blocksize; d <<= 1) {
+        offset >>= 1;
+        __syncthreads();
+
+        if (idx < d) {
+            unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+            unsigned t = temp[a];
+            temp[a] = temp[b];
+            temp[b] += t;
+        }
+    }
+    __syncthreads();
+
+    if (idx             < n) out[idx            ] = temp[shared_offset(idx            )];
+    if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
+}
+
+template<typename T>
+void prefix_sum(T *in, T *out, unsigned n)
+{
+    char *device_values;
+    unsigned n_lpot = lpot(n);
+    size_t n_pitch;
+
+    cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
+    if (error != 0) {
+        printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
+        exit(1);
+    }
+
+    cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
+
+    prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
+        ((T*)device_values, (T*)(device_values + n_pitch), n);
+
+    cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
+    cudaFree(device_values);
+}
+
+int main()
+{
+    sranddev();
+
+    static unsigned in_values[1024], out_values[1024];
+
+    for (int i = 0; i < 1024; ++i)
+        in_values[i] = rand() >> 21;
+
+    prefix_sum(in_values, out_values, 1024);
+
+    for (int i = 0; i < 1024; ++i)
+        printf("%5d => %5d\n", in_values[i], out_values[i]);
+
+    return 0;
+}
diff --git a/extra/cuda/demos/prefix-sum/prefix-sum.factor b/extra/cuda/demos/prefix-sum/prefix-sum.factor
new file mode 100644 (file)
index 0000000..2cd8eba
--- /dev/null
@@ -0,0 +1,21 @@
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types cuda cuda.syntax locals ;
+IN: cuda.demos.prefix-sum
+
+CUDA-LIBRARY: prefix-sum vocab:cuda/demos/prefix-sum/prefix-sum.ptx
+
+CUDA-FUNCTION: prefix_sum_block ( uint* in, uint* out, uint n ) ;
+
+:: cuda-prefix-sum ( -- )
+    T{ launcher
+        { device 0 }
+        { path "vocab:cuda/demos/prefix-sum/prefix-sum.ptx" }
+    } [
+
+        
+        ! { 1 1 1 } { 2 1 } 0 3<<< prefix_sum_block
+
+    ] with-cuda ;
+
+MAIN: cuda-prefix-sum
diff --git a/extra/cuda/demos/prefix-sum/prefix-sum.ptx b/extra/cuda/demos/prefix-sum/prefix-sum.ptx
new file mode 100644 (file)
index 0000000..d189179
--- /dev/null
@@ -0,0 +1,222 @@
+       .version 1.4
+       .target sm_10, map_f64_to_f32
+       // compiled with /usr/local/cuda/bin/../open64/lib//be
+       // nvopencc 3.0 built on 2010-03-11
+
+       //-----------------------------------------------------------
+       // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
+       //-----------------------------------------------------------
+
+       //-----------------------------------------------------------
+       // Options:
+       //-----------------------------------------------------------
+       //  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+       //  -O3 (Optimization level)
+       //  -g0 (Debug level)
+       //  -m2 (Report advisories)
+       //-----------------------------------------------------------
+
+       .file   1       "<command-line>"
+       .file   2       "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
+       .file   3       "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+       .file   4       "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+       .file   5       "/usr/local/cuda/bin/../include/host_defines.h"
+       .file   6       "/usr/local/cuda/bin/../include/builtin_types.h"
+       .file   7       "/usr/local/cuda/bin/../include/device_types.h"
+       .file   8       "/usr/local/cuda/bin/../include/driver_types.h"
+       .file   9       "/usr/local/cuda/bin/../include/texture_types.h"
+       .file   10      "/usr/local/cuda/bin/../include/vector_types.h"
+       .file   11      "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+       .file   12      "/usr/local/cuda/bin/../include/crt/storage_class.h"
+       .file   13      "/usr/include/i386/_types.h"
+       .file   14      "/usr/include/time.h"
+       .file   15      "prefix-sum.cu"
+       .file   16      "/usr/local/cuda/bin/../include/common_functions.h"
+       .file   17      "/usr/local/cuda/bin/../include/crt/func_macro.h"
+       .file   18      "/usr/local/cuda/bin/../include/math_functions.h"
+       .file   19      "/usr/local/cuda/bin/../include/device_functions.h"
+       .file   20      "/usr/local/cuda/bin/../include/math_constants.h"
+       .file   21      "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+       .file   22      "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+       .file   23      "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+       .file   24      "/usr/local/cuda/bin/../include/common_types.h"
+       .file   25      "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+       .file   26      "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+       .file   27      "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+       .file   28      "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+
+       .extern .shared .align 4 .b8 temp[];
+
+       .entry _Z16prefix_sum_blockIjEvPT_S1_j (
+               .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
+               .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
+               .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
+       {
+       .reg .u32 %r<81>;
+       .reg .pred %p<11>;
+       .loc    15      28      0
+$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
+       ld.param.u32    %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+       cvt.s32.u16     %r2, %tid.x;
+       setp.lt.u32     %p1, %r2, %r1;
+       @!%p1 bra       $Lt_0_7938;
+       .loc    15      35      0
+       ld.param.u32    %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+       mul24.lo.u32    %r4, %r2, 4;
+       add.u32         %r5, %r3, %r4;
+       ld.global.u32   %r6, [%r5+0];
+       bra.uni         $Lt_0_7682;
+$Lt_0_7938:
+       mov.u32         %r6, 0;
+$Lt_0_7682:
+       mov.u32         %r7, temp;
+       shr.u32         %r8, %r2, 4;
+       add.u32         %r9, %r2, %r8;
+       mul.lo.u32      %r10, %r9, 4;
+       add.u32         %r11, %r10, %r7;
+       st.shared.u32   [%r11+0], %r6;
+       cvt.s32.u16     %r12, %ntid.x;
+       add.s32         %r13, %r12, %r2;
+       .loc    15      28      0
+       ld.param.u32    %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+       .loc    15      35      0
+       setp.lt.u32     %p2, %r13, %r1;
+       @!%p2 bra       $Lt_0_8450;
+       .loc    15      36      0
+       ld.param.u32    %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+       mul.lo.u32      %r15, %r13, 4;
+       add.u32         %r16, %r14, %r15;
+       ld.global.u32   %r17, [%r16+0];
+       bra.uni         $Lt_0_8194;
+$Lt_0_8450:
+       mov.u32         %r17, 0;
+$Lt_0_8194:
+       shr.u32         %r18, %r13, 4;
+       add.u32         %r19, %r13, %r18;
+       mul.lo.u32      %r20, %r19, 4;
+       add.u32         %r21, %r20, %r7;
+       st.shared.u32   [%r21+0], %r17;
+       .loc    15      39      0
+       mov.s32         %r22, %r12;
+       mov.u32         %r23, 0;
+       setp.le.s32     %p3, %r12, %r23;
+       mov.s32         %r24, 1;
+       @%p3 bra        $Lt_0_13314;
+$Lt_0_9218:
+ //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
+       .loc    15      40      0
+       bar.sync        0;
+       setp.le.s32     %p4, %r22, %r2;
+       @%p4 bra        $Lt_0_9474;
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+       .loc    15      43      0
+       mul24.lo.u32    %r25, %r2, 2;
+       add.u32         %r26, %r25, 1;
+       add.u32         %r27, %r25, 2;
+       mul.lo.u32      %r28, %r24, %r26;
+       mul.lo.u32      %r29, %r24, %r27;
+       sub.u32         %r30, %r29, 1;
+       shr.u32         %r31, %r30, 4;
+       add.u32         %r32, %r29, %r31;
+       mul.lo.u32      %r33, %r32, 4;
+       add.u32         %r34, %r33, %r7;
+       ld.shared.u32   %r35, [%r34+-4];
+       sub.u32         %r36, %r28, 1;
+       shr.u32         %r37, %r36, 4;
+       add.u32         %r38, %r28, %r37;
+       mul.lo.u32      %r39, %r38, 4;
+       add.u32         %r40, %r7, %r39;
+       ld.shared.u32   %r41, [%r40+-4];
+       add.u32         %r42, %r35, %r41;
+       st.shared.u32   [%r34+-4], %r42;
+$Lt_0_9474:
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+       .loc    15      39      0
+       shr.s32         %r22, %r22, 1;
+       shl.b32         %r24, %r24, 1;
+       mov.u32         %r43, 0;
+       setp.gt.s32     %p5, %r22, %r43;
+       @%p5 bra        $Lt_0_9218;
+       bra.uni         $Lt_0_8706;
+$Lt_0_13314:
+$Lt_0_8706:
+       mov.u32         %r44, 0;
+       setp.ne.s32     %p6, %r2, %r44;
+       @%p6 bra        $Lt_0_10242;
+       .loc    15      47      0
+       mul24.lo.s32    %r45, %r12, 2;
+       mov.u32         %r46, 0;
+       sub.u32         %r47, %r45, 1;
+       shr.u32         %r48, %r47, 4;
+       add.u32         %r49, %r45, %r48;
+       mul.lo.u32      %r50, %r49, 4;
+       add.u32         %r51, %r7, %r50;
+       st.shared.u32   [%r51+-4], %r46;
+$Lt_0_10242:
+       mov.u32         %r52, 1;
+       setp.lt.s32     %p7, %r12, %r52;
+       @%p7 bra        $Lt_0_10754;
+       mov.s32         %r22, 1;
+$Lt_0_11266:
+ //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
+       .loc    15      50      0
+       shr.s32         %r24, %r24, 1;
+       .loc    15      51      0
+       bar.sync        0;
+       setp.le.s32     %p8, %r22, %r2;
+       @%p8 bra        $Lt_0_11522;
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+       .loc    15      55      0
+       mul24.lo.u32    %r53, %r2, 2;
+       add.u32         %r54, %r53, 1;
+       mul.lo.u32      %r55, %r24, %r54;
+       sub.u32         %r56, %r55, 1;
+       shr.u32         %r57, %r56, 4;
+       add.u32         %r58, %r55, %r57;
+       mul.lo.u32      %r59, %r58, 4;
+       add.u32         %r60, %r59, %r7;
+       ld.shared.u32   %r61, [%r60+-4];
+       .loc    15      56      0
+       add.u32         %r62, %r53, 2;
+       mul.lo.u32      %r63, %r24, %r62;
+       sub.u32         %r64, %r63, 1;
+       shr.u32         %r65, %r64, 4;
+       add.u32         %r66, %r63, %r65;
+       mul.lo.u32      %r67, %r66, 4;
+       add.u32         %r68, %r67, %r7;
+       ld.shared.u32   %r69, [%r68+-4];
+       st.shared.u32   [%r60+-4], %r69;
+       .loc    15      57      0
+       ld.shared.u32   %r70, [%r68+-4];
+       add.u32         %r71, %r70, %r61;
+       st.shared.u32   [%r68+-4], %r71;
+$Lt_0_11522:
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+       .loc    15      49      0
+       shl.b32         %r22, %r22, 1;
+       setp.le.s32     %p9, %r22, %r12;
+       @%p9 bra        $Lt_0_11266;
+$Lt_0_10754:
+       .loc    15      60      0
+       bar.sync        0;
+       @!%p1 bra       $Lt_0_12290;
+       .loc    15      62      0
+       ld.shared.u32   %r72, [%r11+0];
+       ld.param.u32    %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+       mul24.lo.u32    %r74, %r2, 4;
+       add.u32         %r75, %r73, %r74;
+       st.global.u32   [%r75+0], %r72;
+$Lt_0_12290:
+       @!%p2 bra       $Lt_0_12802;
+       .loc    15      63      0
+       ld.shared.u32   %r76, [%r21+0];
+       ld.param.u32    %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+       mul.lo.u32      %r78, %r13, 4;
+       add.u32         %r79, %r77, %r78;
+       st.global.u32   [%r79+0], %r76;
+$Lt_0_12802:
+       .loc    15      64      0
+       exit;
+$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
+       } // _Z16prefix_sum_blockIjEvPT_S1_j
+
diff --git a/extra/cuda/hello.cu b/extra/cuda/hello.cu
deleted file mode 100644 (file)
index 1f3cd67..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- World using CUDA
-** 
-** The string "Hello World!" is mangled then restored using a common CUDA idiom
-**
-** Byron Galbraith
-** 2009-02-18
-*/
-#include <cuda.h>
-#include <stdio.h>
-
-// Prototypes
-extern "C" __global__ void helloWorld(char*);
-
-// Host function
-int
-main(int argc, char** argv)
-{
-  int i;
-
-  // desired output
-  char str[] = "Hello World!";
-
-  // mangle contents of output
-  // the null character is left intact for simplicity
-  for(i = 0; i < 12; i++)
-    str[i] -= i;
-
-  // allocate memory on the device 
-  char *d_str;
-  size_t size = sizeof(str);
-  cudaMalloc((void**)&d_str, size);
-
-  // copy the string to the device
-  cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
-
-  // set the grid and block sizes
-  dim3 dimGrid(2);   // one block per word  
-  dim3 dimBlock(6); // one thread per character
-  
-  // invoke the kernel
-  helloWorld<<< dimGrid, dimBlock >>>(d_str);
-
-  // retrieve the results from the device
-  cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
-
-  // free up the allocated memory on the device
-  cudaFree(d_str);
-  
-  // everyone's favorite part
-  printf("%s\n", str);
-
-  return 0;
-}
-
-// Device kernel
-__global__ void
-helloWorld(char* str)
-{
-  // determine where in the thread grid we are
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // unmangle output
-  str[idx] += idx;
-}
diff --git a/extra/cuda/hello.ptx b/extra/cuda/hello.ptx
deleted file mode 100644 (file)
index 049bb5e..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-       .version 1.4
-       .target sm_10, map_f64_to_f32
-       // compiled with /usr/local/cuda/bin/../open64/lib//be
-       // nvopencc 3.0 built on 2010-03-11
-
-       //-----------------------------------------------------------
-       // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
-       //-----------------------------------------------------------
-
-       //-----------------------------------------------------------
-       // Options:
-       //-----------------------------------------------------------
-       //  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
-       //  -O3 (Optimization level)
-       //  -g0 (Debug level)
-       //  -m2 (Report advisories)
-       //-----------------------------------------------------------
-
-       .file   1       "<command-line>"
-       .file   2       "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
-       .file   3       "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
-       .file   4       "/usr/local/cuda/bin/../include/crt/device_runtime.h"
-       .file   5       "/usr/local/cuda/bin/../include/host_defines.h"
-       .file   6       "/usr/local/cuda/bin/../include/builtin_types.h"
-       .file   7       "/usr/local/cuda/bin/../include/device_types.h"
-       .file   8       "/usr/local/cuda/bin/../include/driver_types.h"
-       .file   9       "/usr/local/cuda/bin/../include/texture_types.h"
-       .file   10      "/usr/local/cuda/bin/../include/vector_types.h"
-       .file   11      "/usr/local/cuda/bin/../include/device_launch_parameters.h"
-       .file   12      "/usr/local/cuda/bin/../include/crt/storage_class.h"
-       .file   13      "/usr/include/i386/_types.h"
-       .file   14      "/usr/include/time.h"
-       .file   15      "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
-       .file   16      "/usr/local/cuda/bin/../include/common_functions.h"
-       .file   17      "/usr/local/cuda/bin/../include/crt/func_macro.h"
-       .file   18      "/usr/local/cuda/bin/../include/math_functions.h"
-       .file   19      "/usr/local/cuda/bin/../include/device_functions.h"
-       .file   20      "/usr/local/cuda/bin/../include/math_constants.h"
-       .file   21      "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
-       .file   22      "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
-       .file   23      "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
-       .file   24      "/usr/local/cuda/bin/../include/common_types.h"
-       .file   25      "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
-       .file   26      "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
-       .file   27      "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
-       .file   28      "hello.cu"
-
-
-       .entry helloWorld (
-               .param .u32 __cudaparm_helloWorld_str)
-       {
-       .reg .u16 %rh<4>;
-       .reg .u32 %r<9>;
-       .loc    28      58      0
-$LBB1_helloWorld:
-       .loc    28      64      0
-       mov.u16         %rh1, %ctaid.x;
-       mov.u16         %rh2, %ntid.x;
-       mul.wide.u16    %r1, %rh1, %rh2;
-       cvt.u32.u16     %r2, %tid.x;
-       add.u32         %r3, %r2, %r1;
-       ld.param.u32    %r4, [__cudaparm_helloWorld_str];
-       add.u32         %r5, %r4, %r3;
-       ld.global.s8    %r6, [%r5+0];
-       add.s32         %r7, %r6, %r3;
-       st.global.s8    [%r5+0], %r7;
-       .loc    28      65      0
-       exit;
-$LDWend_helloWorld:
-       } // helloWorld
-
diff --git a/extra/cuda/prefix-sum.cu b/extra/cuda/prefix-sum.cu
deleted file mode 100644 (file)
index a77a67f..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <cuda_runtime.h>
-
-static const int LOG_BANK_COUNT = 4;
-
-static inline __device__ __host__ unsigned shared_offset(unsigned i)
-{
-    return i + (i >> LOG_BANK_COUNT);
-}
-
-static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
-{
-    return shared_offset(offset * (2*i + 1) - 1);
-}
-
-static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
-{
-    return shared_offset(offset * (2*i + 2) - 1);
-}
-
-static inline __device__ __host__ unsigned lpot(unsigned x)
-{
-    --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
-}
-
-template<typename T>
-__global__ void prefix_sum_block(T *in, T *out, unsigned n)
-{
-    extern __shared__ T temp[];
-
-    int idx = threadIdx.x;
-    int blocksize = blockDim.x;
-
-    temp[shared_offset(idx            )] = (idx             < n) ? in[idx            ] : 0;
-    temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
-
-    int offset, d;
-    for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
-        __syncthreads();
-        if (idx < d) {
-            unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
-            temp[b] += temp[a];
-        }
-    }
-
-    if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
-
-    for (d = 1; d <= blocksize; d <<= 1) {
-        offset >>= 1;
-        __syncthreads();
-
-        if (idx < d) {
-            unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
-            unsigned t = temp[a];
-            temp[a] = temp[b];
-            temp[b] += t;
-        }
-    }
-    __syncthreads();
-
-    if (idx             < n) out[idx            ] = temp[shared_offset(idx            )];
-    if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
-}
-
-template<typename T>
-void prefix_sum(T *in, T *out, unsigned n)
-{
-    char *device_values;
-    unsigned n_lpot = lpot(n);
-    size_t n_pitch;
-
-    cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
-    if (error != 0) {
-        printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
-        exit(1);
-    }
-
-    cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
-
-    prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
-        ((T*)device_values, (T*)(device_values + n_pitch), n);
-
-    cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
-    cudaFree(device_values);
-}
-
-int main()
-{
-    sranddev();
-
-    static unsigned in_values[1024], out_values[1024];
-
-    for (int i = 0; i < 1024; ++i)
-        in_values[i] = rand() >> 21;
-
-    prefix_sum(in_values, out_values, 1024);
-
-    for (int i = 0; i < 1024; ++i)
-        printf("%5d => %5d\n", in_values[i], out_values[i]);
-
-    return 0;
-}
diff --git a/extra/cuda/prefix-sum.ptx b/extra/cuda/prefix-sum.ptx
deleted file mode 100644 (file)
index d189179..0000000
+++ /dev/null
@@ -1,222 +0,0 @@
-       .version 1.4
-       .target sm_10, map_f64_to_f32
-       // compiled with /usr/local/cuda/bin/../open64/lib//be
-       // nvopencc 3.0 built on 2010-03-11
-
-       //-----------------------------------------------------------
-       // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
-       //-----------------------------------------------------------
-
-       //-----------------------------------------------------------
-       // Options:
-       //-----------------------------------------------------------
-       //  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
-       //  -O3 (Optimization level)
-       //  -g0 (Debug level)
-       //  -m2 (Report advisories)
-       //-----------------------------------------------------------
-
-       .file   1       "<command-line>"
-       .file   2       "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
-       .file   3       "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
-       .file   4       "/usr/local/cuda/bin/../include/crt/device_runtime.h"
-       .file   5       "/usr/local/cuda/bin/../include/host_defines.h"
-       .file   6       "/usr/local/cuda/bin/../include/builtin_types.h"
-       .file   7       "/usr/local/cuda/bin/../include/device_types.h"
-       .file   8       "/usr/local/cuda/bin/../include/driver_types.h"
-       .file   9       "/usr/local/cuda/bin/../include/texture_types.h"
-       .file   10      "/usr/local/cuda/bin/../include/vector_types.h"
-       .file   11      "/usr/local/cuda/bin/../include/device_launch_parameters.h"
-       .file   12      "/usr/local/cuda/bin/../include/crt/storage_class.h"
-       .file   13      "/usr/include/i386/_types.h"
-       .file   14      "/usr/include/time.h"
-       .file   15      "prefix-sum.cu"
-       .file   16      "/usr/local/cuda/bin/../include/common_functions.h"
-       .file   17      "/usr/local/cuda/bin/../include/crt/func_macro.h"
-       .file   18      "/usr/local/cuda/bin/../include/math_functions.h"
-       .file   19      "/usr/local/cuda/bin/../include/device_functions.h"
-       .file   20      "/usr/local/cuda/bin/../include/math_constants.h"
-       .file   21      "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
-       .file   22      "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
-       .file   23      "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
-       .file   24      "/usr/local/cuda/bin/../include/common_types.h"
-       .file   25      "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
-       .file   26      "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
-       .file   27      "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
-       .file   28      "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
-
-       .extern .shared .align 4 .b8 temp[];
-
-       .entry _Z16prefix_sum_blockIjEvPT_S1_j (
-               .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
-               .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
-               .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
-       {
-       .reg .u32 %r<81>;
-       .reg .pred %p<11>;
-       .loc    15      28      0
-$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
-       ld.param.u32    %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
-       cvt.s32.u16     %r2, %tid.x;
-       setp.lt.u32     %p1, %r2, %r1;
-       @!%p1 bra       $Lt_0_7938;
-       .loc    15      35      0
-       ld.param.u32    %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
-       mul24.lo.u32    %r4, %r2, 4;
-       add.u32         %r5, %r3, %r4;
-       ld.global.u32   %r6, [%r5+0];
-       bra.uni         $Lt_0_7682;
-$Lt_0_7938:
-       mov.u32         %r6, 0;
-$Lt_0_7682:
-       mov.u32         %r7, temp;
-       shr.u32         %r8, %r2, 4;
-       add.u32         %r9, %r2, %r8;
-       mul.lo.u32      %r10, %r9, 4;
-       add.u32         %r11, %r10, %r7;
-       st.shared.u32   [%r11+0], %r6;
-       cvt.s32.u16     %r12, %ntid.x;
-       add.s32         %r13, %r12, %r2;
-       .loc    15      28      0
-       ld.param.u32    %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
-       .loc    15      35      0
-       setp.lt.u32     %p2, %r13, %r1;
-       @!%p2 bra       $Lt_0_8450;
-       .loc    15      36      0
-       ld.param.u32    %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
-       mul.lo.u32      %r15, %r13, 4;
-       add.u32         %r16, %r14, %r15;
-       ld.global.u32   %r17, [%r16+0];
-       bra.uni         $Lt_0_8194;
-$Lt_0_8450:
-       mov.u32         %r17, 0;
-$Lt_0_8194:
-       shr.u32         %r18, %r13, 4;
-       add.u32         %r19, %r13, %r18;
-       mul.lo.u32      %r20, %r19, 4;
-       add.u32         %r21, %r20, %r7;
-       st.shared.u32   [%r21+0], %r17;
-       .loc    15      39      0
-       mov.s32         %r22, %r12;
-       mov.u32         %r23, 0;
-       setp.le.s32     %p3, %r12, %r23;
-       mov.s32         %r24, 1;
-       @%p3 bra        $Lt_0_13314;
-$Lt_0_9218:
- //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
-       .loc    15      40      0
-       bar.sync        0;
-       setp.le.s32     %p4, %r22, %r2;
-       @%p4 bra        $Lt_0_9474;
- //<loop> Part of loop body line 39, head labeled $Lt_0_9218
-       .loc    15      43      0
-       mul24.lo.u32    %r25, %r2, 2;
-       add.u32         %r26, %r25, 1;
-       add.u32         %r27, %r25, 2;
-       mul.lo.u32      %r28, %r24, %r26;
-       mul.lo.u32      %r29, %r24, %r27;
-       sub.u32         %r30, %r29, 1;
-       shr.u32         %r31, %r30, 4;
-       add.u32         %r32, %r29, %r31;
-       mul.lo.u32      %r33, %r32, 4;
-       add.u32         %r34, %r33, %r7;
-       ld.shared.u32   %r35, [%r34+-4];
-       sub.u32         %r36, %r28, 1;
-       shr.u32         %r37, %r36, 4;
-       add.u32         %r38, %r28, %r37;
-       mul.lo.u32      %r39, %r38, 4;
-       add.u32         %r40, %r7, %r39;
-       ld.shared.u32   %r41, [%r40+-4];
-       add.u32         %r42, %r35, %r41;
-       st.shared.u32   [%r34+-4], %r42;
-$Lt_0_9474:
- //<loop> Part of loop body line 39, head labeled $Lt_0_9218
-       .loc    15      39      0
-       shr.s32         %r22, %r22, 1;
-       shl.b32         %r24, %r24, 1;
-       mov.u32         %r43, 0;
-       setp.gt.s32     %p5, %r22, %r43;
-       @%p5 bra        $Lt_0_9218;
-       bra.uni         $Lt_0_8706;
-$Lt_0_13314:
-$Lt_0_8706:
-       mov.u32         %r44, 0;
-       setp.ne.s32     %p6, %r2, %r44;
-       @%p6 bra        $Lt_0_10242;
-       .loc    15      47      0
-       mul24.lo.s32    %r45, %r12, 2;
-       mov.u32         %r46, 0;
-       sub.u32         %r47, %r45, 1;
-       shr.u32         %r48, %r47, 4;
-       add.u32         %r49, %r45, %r48;
-       mul.lo.u32      %r50, %r49, 4;
-       add.u32         %r51, %r7, %r50;
-       st.shared.u32   [%r51+-4], %r46;
-$Lt_0_10242:
-       mov.u32         %r52, 1;
-       setp.lt.s32     %p7, %r12, %r52;
-       @%p7 bra        $Lt_0_10754;
-       mov.s32         %r22, 1;
-$Lt_0_11266:
- //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
-       .loc    15      50      0
-       shr.s32         %r24, %r24, 1;
-       .loc    15      51      0
-       bar.sync        0;
-       setp.le.s32     %p8, %r22, %r2;
-       @%p8 bra        $Lt_0_11522;
- //<loop> Part of loop body line 47, head labeled $Lt_0_11266
-       .loc    15      55      0
-       mul24.lo.u32    %r53, %r2, 2;
-       add.u32         %r54, %r53, 1;
-       mul.lo.u32      %r55, %r24, %r54;
-       sub.u32         %r56, %r55, 1;
-       shr.u32         %r57, %r56, 4;
-       add.u32         %r58, %r55, %r57;
-       mul.lo.u32      %r59, %r58, 4;
-       add.u32         %r60, %r59, %r7;
-       ld.shared.u32   %r61, [%r60+-4];
-       .loc    15      56      0
-       add.u32         %r62, %r53, 2;
-       mul.lo.u32      %r63, %r24, %r62;
-       sub.u32         %r64, %r63, 1;
-       shr.u32         %r65, %r64, 4;
-       add.u32         %r66, %r63, %r65;
-       mul.lo.u32      %r67, %r66, 4;
-       add.u32         %r68, %r67, %r7;
-       ld.shared.u32   %r69, [%r68+-4];
-       st.shared.u32   [%r60+-4], %r69;
-       .loc    15      57      0
-       ld.shared.u32   %r70, [%r68+-4];
-       add.u32         %r71, %r70, %r61;
-       st.shared.u32   [%r68+-4], %r71;
-$Lt_0_11522:
- //<loop> Part of loop body line 47, head labeled $Lt_0_11266
-       .loc    15      49      0
-       shl.b32         %r22, %r22, 1;
-       setp.le.s32     %p9, %r22, %r12;
-       @%p9 bra        $Lt_0_11266;
-$Lt_0_10754:
-       .loc    15      60      0
-       bar.sync        0;
-       @!%p1 bra       $Lt_0_12290;
-       .loc    15      62      0
-       ld.shared.u32   %r72, [%r11+0];
-       ld.param.u32    %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
-       mul24.lo.u32    %r74, %r2, 4;
-       add.u32         %r75, %r73, %r74;
-       st.global.u32   [%r75+0], %r72;
-$Lt_0_12290:
-       @!%p2 bra       $Lt_0_12802;
-       .loc    15      63      0
-       ld.shared.u32   %r76, [%r21+0];
-       ld.param.u32    %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
-       mul.lo.u32      %r78, %r13, 4;
-       add.u32         %r79, %r77, %r78;
-       st.global.u32   [%r79+0], %r76;
-$Lt_0_12802:
-       .loc    15      64      0
-       exit;
-$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
-       } // _Z16prefix_sum_blockIjEvPT_S1_j
-
diff --git a/extra/cuda/syntax/authors.txt b/extra/cuda/syntax/authors.txt
new file mode 100644 (file)
index 0000000..7c1b2f2
--- /dev/null
@@ -0,0 +1 @@
+Doug Coleman
diff --git a/extra/cuda/syntax/syntax.factor b/extra/cuda/syntax/syntax.factor
new file mode 100644 (file)
index 0000000..b8df30f
--- /dev/null
@@ -0,0 +1,15 @@
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.parser cuda kernel lexer parser ;
+IN: cuda.syntax
+
+SYNTAX: CUDA-LIBRARY: scan scan add-cuda-library ;
+
+SYNTAX: CUDA-FUNCTION:
+    scan [ create-in ] [ ] bi ";" scan-c-args drop define-cuda-word ;
+
+: 3<<< ( dim-block dim-grid shared-size -- function-launcher )
+    f function-launcher boa ;
+
+: 4<<< ( dim-block dim-grid shared-size stream -- function-launcher )
+    function-launcher boa ;