! (c)2009, 2010 Slava Pestov, Joe Groff bsd license
-USING: accessors alien alien.c-types alien.arrays alien.strings arrays
-byte-arrays cpu.architecture fry io io.encodings.binary
-io.files io.streams.memory kernel libc math sequences words
-byte-vectors ;
+USING: accessors alien alien.c-types alien.arrays alien.strings
+arrays byte-arrays cpu.architecture fry io io.encodings.binary
+io.files io.streams.memory kernel libc math sequences words ;
IN: alien.data
GENERIC: require-c-array ( c-type -- )
swap memory>byte-array
] [ [ + ] change-index drop ] 2bi ;
-M: byte-vector stream-write
- [ dup byte-length tail-slice ]
- [ [ [ byte-length ] bi@ + ] keep lengthen ]
- [ drop byte-length ]
- 2tri
- [ >c-ptr swap >c-ptr ] dip memcpy ;
-
M: value-type c-type-rep drop int-rep ;
M: value-type c-type-getter
unclip [ array-length ] dip [ <c-direct-array> ] 2curry ;
M: array c-type-unboxer-quot drop [ >c-ptr ] ;
-
M: biassoc assoc-size from>> assoc-size ;
-M: biassoc at* from>> at* ;
+M: biassoc at* from>> at* ; inline
-M: biassoc value-at* to>> at* ;
+M: biassoc value-at* to>> at* ; inline
: once-at ( value key assoc -- )
2dup key? [ 3drop ] [ set-at ] if ;
! Copyright (C) 2008, 2010 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays combinators hints kernel locals math
-math.order sequences ;
+math.order sequences sequences.private ;
IN: binary-search
<PRIVATE
:: (search) ( seq from to quot: ( elt -- <=> ) -- i elt )
from to + 2/ :> midpoint@
- midpoint@ seq nth :> midpoint
+ midpoint@ seq nth-unsafe :> midpoint
to from - 1 <= [
midpoint@ midpoint
[ optimized? not ] filter compile ;
"debug-compiler" get [
-
+
nl
"Compiling..." write flush
curry compose uncurry
- array-nth set-array-nth length>>
+ array-nth set-array-nth
wrap probe
" done" print flush
+ "io.streams.byte-array.fast" require
+
] unless
M: ##compare analyze-aliases*
call-next-method
dup useless-compare? [
- dst>> \ f type-number \ ##load-immediate new-insn
+ dst>> f \ ##load-constant new-insn
analyze-aliases*
] when ;
and ;
: emit-trivial-if ( -- )
- ds-pop \ f type-number cc/= ^^compare-imm ds-push ;
+ ds-pop f cc/= ^^compare-imm ds-push ;
: trivial-not-if? ( #if -- ? )
children>> first2
and ;
: emit-trivial-not-if ( -- )
- ds-pop \ f type-number cc= ^^compare-imm ds-push ;
+ ds-pop f cc= ^^compare-imm ds-push ;
: emit-actual-if ( #if -- )
! Inputs to the final instruction need to be copied because of
! loc>vreg sync
- ds-pop any-rep ^^copy \ f type-number cc/= ##compare-imm-branch emit-if ;
+ ds-pop any-rep ^^copy f cc/= ##compare-imm-branch emit-if ;
M: #if emit-node
{
-! Copyright (C) 2008, 2009 Slava Pestov.
+! Copyright (C) 2008, 2010 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays byte-arrays kernel layouts math
-namespaces sequences combinators splitting parser effects
-words cpu.architecture compiler.cfg.registers
+USING: accessors arrays byte-arrays combinators.short-circuit
+kernel layouts math namespaces sequences combinators splitting
+parser effects words cpu.architecture compiler.cfg.registers
compiler.cfg.instructions compiler.cfg.instructions.syntax ;
IN: compiler.cfg.hats
>>
+: immutable? ( obj -- ? )
+ { [ float? ] [ word? ] [ not ] } 1|| ; inline
+
: ^^load-literal ( obj -- dst )
[ next-vreg dup ] dip {
- { [ dup not ] [ drop \ f type-number ##load-immediate ] }
{ [ dup fixnum? ] [ tag-fixnum ##load-immediate ] }
- { [ dup float? ] [ ##load-constant ] }
+ { [ dup immutable? ] [ ##load-constant ] }
[ ##load-reference ]
} cond ;
def: dst/int-rep
constant: obj ;
+INSN: ##load-double
+def: dst/double-rep
+constant: val ;
+
INSN: ##peek
def: dst/int-rep
literal: loc ;
0 cc= ^^compare-imm
ds-push ;
-: tag-literal ( n -- tagged )
- literal>> [ tag-fixnum ] [ \ f type-number ] if* ;
-
: emit-fixnum-op ( insn -- )
[ 2inputs ] dip call ds-push ; inline
{ [ dup 0 [-inf,a] interval-subset? ] [ drop emit-fixnum-right-shift ] }
[ drop emit-fixnum-shift-general ]
} cond ;
-
+
: emit-fixnum-bitnot ( -- )
ds-pop ^^not tag-mask get ^^xor-imm ds-push ;
tri
] with-compilation-unit
-: each-def-rep ( ... insn vreg-quot: ( ... vreg rep -- ... ) -- ... )
+: each-def-rep ( insn vreg-quot: ( vreg rep -- ) -- )
[ [ defs-vreg ] [ defs-vreg-rep ] bi ] dip with when* ; inline
-: each-use-rep ( ... insn vreg-quot: ( ... vreg rep -- ... ) -- ... )
+: each-use-rep ( insn vreg-quot: ( vreg rep -- ) -- )
[ [ uses-vregs ] [ uses-vreg-reps ] bi ] dip 2each ; inline
-: each-temp-rep ( ... insn vreg-quot: ( ... vreg rep -- ... ) -- ... )
+: each-temp-rep ( insn vreg-quot: ( vreg rep -- ) -- )
[ [ temp-vregs ] [ temp-vreg-reps ] bi ] dip 2each ; inline
+: each-rep ( insn vreg-quot: ( vreg rep -- ) -- )
+ [ each-def-rep ] [ each-use-rep ] [ each-temp-rep ] 2tri ; inline
+
: with-vreg-reps ( ..a cfg vreg-quot: ( ..a vreg rep -- ..b ) -- ..b )
'[
[ basic-block set ] [
[
- _
- [ each-def-rep ]
- [ each-use-rep ]
- [ each-temp-rep ] 2tri
+ _ each-rep
] each-non-phi
] bi
] each-basic-block ; inline
-USING: tools.test cpu.architecture
-compiler.cfg.registers compiler.cfg.instructions
-compiler.cfg.representations.preferred ;
+USING: accessors compiler.cfg compiler.cfg.debugger
+compiler.cfg.instructions compiler.cfg.registers
+compiler.cfg.representations.preferred cpu.architecture kernel
+namespaces tools.test sequences arrays system ;
IN: compiler.cfg.representations
[ { double-rep double-rep } ] [
{ dst 5 }
{ src 3 }
} defs-vreg-rep
-] unit-test
\ No newline at end of file
+] unit-test
+
+: test-representations ( -- )
+ cfg new 0 get >>entry dup cfg set select-representations drop ;
+
+! Make sure cost calculation isn't completely wrong
+V{
+ T{ ##prologue }
+ T{ ##branch }
+} 0 test-bb
+
+V{
+ T{ ##peek f 1 D 0 }
+ T{ ##peek f 2 D 1 }
+ T{ ##add-float f 3 1 2 }
+ T{ ##replace f 3 D 0 }
+ T{ ##replace f 3 D 1 }
+ T{ ##replace f 3 D 2 }
+ T{ ##branch }
+} 1 test-bb
+
+V{
+ T{ ##epilogue }
+ T{ ##return }
+} 2 test-bb
+
+0 1 edge
+1 2 edge
+
+[ ] [ test-representations ] unit-test
+
+[ 1 ] [ 1 get instructions>> [ ##allot? ] count ] unit-test
+
+cpu x86.32? [
+
+ ! Make sure load-constant is converted into load-double
+ V{
+ T{ ##prologue }
+ T{ ##branch }
+ } 0 test-bb
+
+ V{
+ T{ ##peek f 1 D 0 }
+ T{ ##load-constant f 2 0.5 }
+ T{ ##add-float f 3 1 2 }
+ T{ ##replace f 3 D 0 }
+ T{ ##branch }
+ } 1 test-bb
+
+ V{
+ T{ ##epilogue }
+ T{ ##return }
+ } 2 test-bb
+
+ 0 1 edge
+ 1 2 edge
+
+ [ ] [ test-representations ] unit-test
+
+ [ t ] [ 1 get instructions>> second ##load-double? ] unit-test
+
+ ! Make sure phi nodes are handled in a sane way
+ V{
+ T{ ##prologue }
+ T{ ##branch }
+ } 0 test-bb
+
+ V{
+ T{ ##peek f 1 D 0 }
+ T{ ##compare-imm-branch f 1 2 }
+ } 1 test-bb
+
+ V{
+ T{ ##load-constant f 2 1.5 }
+ T{ ##branch }
+ } 2 test-bb
+
+ V{
+ T{ ##load-constant f 3 2.5 }
+ T{ ##branch }
+ } 3 test-bb
+
+ V{
+ T{ ##phi f 4 }
+ T{ ##peek f 5 D 0 }
+ T{ ##add-float f 6 4 5 }
+ T{ ##replace f 6 D 0 }
+ } 4 test-bb
+
+ V{
+ T{ ##epilogue }
+ T{ ##return }
+ } 5 test-bb
+
+ test-diamond
+ 4 5 edge
+
+ 2 get 2 2array
+ 3 get 3 2array 2array 4 get instructions>> first (>>inputs)
+
+ [ ] [ test-representations ] unit-test
+
+ [ t ] [ 2 get instructions>> first ##load-double? ] unit-test
+
+ [ t ] [ 3 get instructions>> first ##load-double? ] unit-test
+
+ [ t ] [ 4 get instructions>> first ##phi? ] unit-test
+] when
\ No newline at end of file
-! Copyright (C) 2009 Slava Pestov
+! Copyright (C) 2009, 2010 Slava Pestov
! See http://factorcode.org/license.txt for BSD license.
USING: kernel fry accessors sequences assocs sets namespaces
arrays combinators combinators.short-circuit math make locals
: possible ( vreg -- reps ) possibilities get at ;
: compute-possibilities ( cfg -- )
- H{ } clone [ '[ swap _ conjoin-at ] with-vreg-reps ] keep
- [ keys ] assoc-map possibilities set ;
+ H{ } clone [ '[ swap _ adjoin-at ] with-vreg-reps ] keep
+ [ members ] assoc-map possibilities set ;
! Compute vregs which must remain tagged for their lifetime.
SYMBOL: always-boxed
SYMBOL: costs
: init-costs ( -- )
- possibilities get [ [ 0 ] H{ } map>assoc ] assoc-map costs set ;
+ possibilities get [ drop H{ } clone ] assoc-map costs set ;
+
+: record-possibility ( rep vreg -- )
+ costs get at [ 0 or ] change-at ;
: increase-cost ( rep vreg -- )
! Increase cost of keeping vreg in rep, making a choice of rep less
! likely.
- [ basic-block get loop-nesting-at ] 2dip costs get at at+ ;
+ costs get at [ 0 or basic-block get loop-nesting-at 1 + + ] change-at ;
: maybe-increase-cost ( possible vreg preferred -- )
- pick eq? [ 2drop ] [ increase-cost ] if ;
+ pick eq? [ record-possibility ] [ increase-cost ] if ;
: representation-cost ( vreg preferred -- )
! 'preferred' is a representation that the instruction can accept with no cost.
[ '[ _ _ maybe-increase-cost ] ]
2bi each ;
+GENERIC: compute-insn-costs ( insn -- )
+
+M: ##load-constant compute-insn-costs
+ ! There's no cost to unboxing the result of a ##load-constant
+ drop ;
+
+M: insn compute-insn-costs [ representation-cost ] each-rep ;
+
: compute-costs ( cfg -- costs )
- init-costs [ representation-cost ] with-vreg-reps costs get ;
+ init-costs
+ [
+ [ basic-block set ]
+ [
+ [
+ compute-insn-costs
+ ] each-non-phi
+ ] bi
+ ] each-basic-block
+ costs get ;
! For every vreg, compute preferred representation, that minimizes costs.
: minimize-costs ( costs -- representations )
+ [ nip assoc-empty? not ] assoc-filter
[ >alist alist-min first ] assoc-map ;
: compute-representations ( cfg -- )
bi assoc-union
representations set ;
+! PHI nodes require special treatment
+! If the output of a phi instruction is only used as the input to another
+! phi instruction, then we want to use the same representation for both
+! if possible.
+SYMBOL: phis
+
+: collect-phis ( cfg -- )
+ H{ } clone phis set
+ [
+ phis get
+ '[ [ inputs>> values ] [ dst>> ] bi _ set-at ] each-phi
+ ] each-basic-block ;
+
+SYMBOL: work-list
+
+: add-to-work-list ( vregs -- )
+ work-list get push-all-front ;
+
+: rep-assigned ( vregs -- vregs' )
+ representations get '[ _ key? ] filter ;
+
+: rep-not-assigned ( vregs -- vregs' )
+ representations get '[ _ key? not ] filter ;
+
+: add-ready-phis ( -- )
+ phis get keys rep-assigned add-to-work-list ;
+
+: process-phi ( dst -- )
+ ! If dst = phi(src1,src2,...) and dst's representation has been
+ ! determined, assign that representation to each one of src1,...
+ ! that does not have a representation yet, and process those, too.
+ dup phis get at* [
+ [ rep-of ] [ rep-not-assigned ] bi*
+ [ [ set-rep-of ] with each ] [ add-to-work-list ] bi
+ ] [ 2drop ] if ;
+
+: remaining-phis ( -- )
+ phis get keys rep-not-assigned { } assert-sequence= ;
+
+: process-phis ( -- )
+ <hashed-dlist> work-list set
+ add-ready-phis
+ work-list get [ process-phi ] slurp-deque
+ remaining-phis ;
+
+: compute-phi-representations ( cfg -- )
+ collect-phis process-phis ;
+
! Insert conversions. This introduces new temporaries, so we need
! to rename opearands too.
: record-renaming ( from to -- )
2array renaming-set get push needs-renaming? on ;
-:: (compute-renaming-set) ( ..a vreg required quot: ( ..a vreg preferred required -- ..b ) -- ..b )
+:: (compute-renaming-set) ( vreg required quot: ( vreg preferred required -- new-vreg ) -- )
vreg rep-of :> preferred
preferred required eq?
[ vreg no-renaming ]
GENERIC: conversions-for-insn ( insn -- )
-SYMBOL: phi-mappings
+M: ##phi conversions-for-insn , ;
-! compiler.cfg.cssa inserts conversions which convert phi inputs into
-! the representation of the output. However, we still have to do some
-! processing here, because if the only node that uses the output of
-! the phi instruction is another phi instruction then this phi node's
-! output won't have a representation assigned.
-M: ##phi conversions-for-insn
- [ , ] [ [ inputs>> values ] [ dst>> ] bi phi-mappings get set-at ] bi ;
+! When a float is unboxed, we replace the ##load-constant with a ##load-double
+! if the architecture supports it
+: convert-to-load-double? ( insn -- ? )
+ {
+ [ drop load-double? ]
+ [ dst>> rep-of double-rep? ]
+ [ obj>> float? ]
+ } 1&& ;
! When a literal zeroes/ones vector is unboxed, we replace the ##load-reference
! with a ##zero-vector or ##fill-vector instruction since this is more efficient.
[ dst>> rep-of vector-rep? ]
[ obj>> B{ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 } = ]
} 1&& ;
+
: convert-to-fill-vector? ( insn -- ? )
{
[ dst>> rep-of vector-rep? ]
[ obj>> B{ 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 } = ]
} 1&& ;
+: (convert-to-load-double) ( insn -- dst val )
+ [ dst>> ] [ obj>> ] bi ; inline
+
: (convert-to-zero/fill-vector) ( insn -- dst rep )
dst>> dup rep-of ; inline
: conversions-for-load-insn ( insn -- ?insn )
{
+ {
+ [ dup convert-to-load-double? ]
+ [ (convert-to-load-double) ##load-double f ]
+ }
{
[ dup convert-to-zero-vector? ]
[ (convert-to-zero/fill-vector) ##zero-vector f ]
] change-instructions drop
] if ;
-! If the output of a phi instruction is only used as the input to another
-! phi instruction, then we want to use the same representation for both
-! if possible.
-SYMBOL: work-list
-
-: add-to-work-list ( vregs -- )
- work-list get push-all-front ;
-
-: rep-assigned ( vregs -- vregs' )
- representations get '[ _ key? ] filter ;
-
-: rep-not-assigned ( vregs -- vregs' )
- representations get '[ _ key? not ] filter ;
-
-: add-ready-phis ( -- )
- phi-mappings get keys rep-assigned add-to-work-list ;
-
-: process-phi-mapping ( dst -- )
- ! If dst = phi(src1,src2,...) and dst's representation has been
- ! determined, assign that representation to each one of src1,...
- ! that does not have a representation yet, and process those, too.
- dup phi-mappings get at* [
- [ rep-of ] [ rep-not-assigned ] bi*
- [ [ set-rep-of ] with each ] [ add-to-work-list ] bi
- ] [ 2drop ] if ;
-
-: remaining-phi-mappings ( -- )
- phi-mappings get keys rep-not-assigned
- [ [ int-rep ] dip set-rep-of ] each ;
-
-: process-phi-mappings ( -- )
- <hashed-dlist> work-list set
- add-ready-phis
- work-list get [ process-phi-mapping ] slurp-deque
- remaining-phi-mappings ;
-
: insert-conversions ( cfg -- )
- H{ } clone phi-mappings set
- [ conversions-for-block ] each-basic-block
- process-phi-mappings ;
+ [ conversions-for-block ] each-basic-block ;
PRIVATE>
{
[ compute-possibilities ]
[ compute-representations ]
+ [ compute-phi-representations ]
[ insert-conversions ]
[ ]
} cleave
[ value>> immediate-bitwise? ]
} 1&& ;
+: vreg-immediate-comparand? ( vreg -- ? )
+ vreg>expr {
+ [ constant-expr? ]
+ [ value>> immediate-comparand? ]
+ } 1&& ;
+
! Outputs f to mean no change
GENERIC: rewrite ( insn -- insn/f )
: ##branch-t? ( insn -- ? )
dup ##compare-imm-branch? [
- {
- [ cc>> cc/= eq? ]
- [ src2>> \ f type-number eq? ]
- } 1&&
+ { [ cc>> cc/= eq? ] [ src2>> not ] } 1&&
] [ drop f ] if ; inline
: general-compare-expr? ( insn -- ? )
: rewrite-redundant-comparison? ( insn -- ? )
{
[ src1>> vreg>expr general-compare-expr? ]
- [ src2>> \ f type-number = ]
- [ cc>> { cc= cc/= } member-eq? ]
+ [ src2>> not ]
+ [ cc>> { cc= cc/= } member? ]
} 1&& ; inline
: rewrite-redundant-comparison ( insn -- insn' )
} cond
swap cc= eq? [ [ negate-cc ] change-cc ] when ;
-ERROR: bad-comparison ;
-
: (fold-compare-imm) ( insn -- ? )
- [ [ src1>> vreg>constant ] [ src2>> ] bi ] [ cc>> ] bi
- pick integer?
- [ [ <=> ] dip evaluate-cc ]
- [
- 2nip {
- { cc= [ f ] }
- { cc/= [ t ] }
- [ bad-comparison ]
+ [ src1>> vreg>constant ] [ src2>> ] [ cc>> ] tri
+ 2over [ integer? ] both? [ [ <=> ] dip evaluate-cc ] [
+ {
+ { cc= [ eq? ] }
+ { cc/= [ eq? not ] }
} case
] if ;
M: ##compare-branch rewrite
{
- { [ dup src1>> vreg-immediate-arithmetic? ] [ t >compare-imm-branch ] }
- { [ dup src2>> vreg-immediate-arithmetic? ] [ f >compare-imm-branch ] }
+ { [ dup src1>> vreg-immediate-comparand? ] [ t >compare-imm-branch ] }
+ { [ dup src2>> vreg-immediate-comparand? ] [ f >compare-imm-branch ] }
{ [ dup self-compare? ] [ rewrite-self-compare-branch ] }
[ drop f ]
} cond ;
next-vreg \ ##compare-imm new-insn ; inline
: >boolean-insn ( insn ? -- insn' )
- [ dst>> ] dip
- {
- { t [ t \ ##load-constant new-insn ] }
- { f [ \ f type-number \ ##load-immediate new-insn ] }
- } case ;
+ [ dst>> ] dip \ ##load-constant new-insn ;
: rewrite-self-compare ( insn -- insn' )
dup (rewrite-self-compare) >boolean-insn ;
M: ##compare rewrite
{
- { [ dup src1>> vreg-immediate-arithmetic? ] [ t >compare-imm ] }
- { [ dup src2>> vreg-immediate-arithmetic? ] [ f >compare-imm ] }
+ { [ dup src1>> vreg-immediate-comparand? ] [ t >compare-imm ] }
+ { [ dup src2>> vreg-immediate-comparand? ] [ f >compare-imm ] }
{ [ dup self-compare? ] [ rewrite-self-compare ] }
[ drop f ]
} cond ;
: constant-fold ( insn -- insn' )
[ dst>> ]
- [ [ src1>> vreg>constant ] [ src2>> ] [ ] tri constant-fold* ] bi
+ [
+ [ src1>> vreg>constant \ f type-number or ]
+ [ src2>> ]
+ [ ]
+ tri constant-fold*
+ ] bi
\ ##load-immediate new-insn ; inline
: unary-constant-fold? ( insn -- ? )
[ drop f ]
} cond ;
-: insn>imm-insn ( insn op swap? -- )
+: insn>imm-insn ( insn op swap? -- new-insn )
swap [
[ [ dst>> ] [ src1>> ] [ src2>> ] tri ] dip
[ swap ] when vreg>constant
arithmetic-op?
[ vreg-immediate-arithmetic? ] [ vreg-immediate-bitwise? ] if ;
-: rewrite-arithmetic ( insn op -- ? )
+: rewrite-arithmetic ( insn op -- insn/f )
{
{ [ over src2>> over vreg-immediate? ] [ f insn>imm-insn ] }
[ 2drop f ]
} cond ; inline
-: rewrite-arithmetic-commutative ( insn op -- ? )
+: rewrite-arithmetic-commutative ( insn op -- insn/f )
{
{ [ over src2>> over vreg-immediate? ] [ f insn>imm-insn ] }
{ [ over src1>> over vreg-immediate? ] [ t insn>imm-insn ] }
accessors sequences compiler.cfg.predecessors locals compiler.cfg.dce
compiler.cfg.ssa.destruction compiler.cfg.loop-detection
compiler.cfg.representations compiler.cfg assocs vectors arrays
-layouts literals namespaces alien compiler.cfg.value-numbering.simd ;
+layouts literals namespaces alien compiler.cfg.value-numbering.simd
+system ;
IN: compiler.cfg.value-numbering.tests
: trim-temps ( insns -- insns )
T{ ##load-reference f 1 + }
T{ ##peek f 2 D 0 }
T{ ##compare f 4 2 1 cc> }
- T{ ##compare-imm f 6 4 $[ \ f type-number ] cc/= }
+ T{ ##compare-imm f 6 4 f cc/= }
T{ ##replace f 6 D 0 }
} value-numbering-step trim-temps
] unit-test
T{ ##load-reference f 1 + }
T{ ##peek f 2 D 0 }
T{ ##compare f 4 2 1 cc<= }
- T{ ##compare-imm f 6 4 $[ \ f type-number ] cc= }
+ T{ ##compare-imm f 6 4 f cc= }
T{ ##replace f 6 D 0 }
} value-numbering-step trim-temps
] unit-test
T{ ##peek f 8 D 0 }
T{ ##peek f 9 D -1 }
T{ ##compare-float-unordered f 12 8 9 cc< }
- T{ ##compare-imm f 14 12 $[ \ f type-number ] cc= }
+ T{ ##compare-imm f 14 12 f cc= }
T{ ##replace f 14 D 0 }
} value-numbering-step trim-temps
] unit-test
T{ ##peek f 29 D -1 }
T{ ##peek f 30 D -2 }
T{ ##compare f 33 29 30 cc<= }
- T{ ##compare-imm-branch f 33 $[ \ f type-number ] cc/= }
+ T{ ##compare-imm-branch f 33 f cc/= }
} value-numbering-step trim-temps
] unit-test
{
T{ ##peek f 1 D -1 }
T{ ##test-vector f 2 1 f float-4-rep vcc-any }
- T{ ##compare-imm-branch f 2 $[ \ f type-number ] cc/= }
+ T{ ##compare-imm-branch f 2 f cc/= }
} value-numbering-step trim-temps
] unit-test
} value-numbering-step trim-temps
] unit-test
+cpu x86.32? [
+ [
+ {
+ T{ ##peek f 0 D 0 }
+ T{ ##load-constant f 1 + }
+ T{ ##compare-imm f 2 0 + cc= }
+ }
+ ] [
+ {
+ T{ ##peek f 0 D 0 }
+ T{ ##load-constant f 1 + }
+ T{ ##compare f 2 0 1 cc= }
+ } value-numbering-step trim-temps
+ ] unit-test
+
+ [
+ {
+ T{ ##peek f 0 D 0 }
+ T{ ##load-constant f 1 + }
+ T{ ##compare-imm-branch f 0 + cc= }
+ }
+ ] [
+ {
+ T{ ##peek f 0 D 0 }
+ T{ ##load-constant f 1 + }
+ T{ ##compare-branch f 0 1 cc= }
+ } value-numbering-step trim-temps
+ ] unit-test
+] when
+
[
{
T{ ##peek f 0 D 0 }
} value-numbering-step trim-temps
] unit-test
+[
+ {
+ T{ ##peek f 0 D 0 }
+ T{ ##load-constant f 1 3.5 }
+ T{ ##compare-branch f 0 1 cc= }
+ }
+] [
+ {
+ T{ ##peek f 0 D 0 }
+ T{ ##load-constant f 1 3.5 }
+ T{ ##compare-branch f 0 1 cc= }
+ } value-numbering-step trim-temps
+] unit-test
+
[
{
T{ ##peek f 0 D 0 }
[
{
T{ ##peek f 0 D 0 }
- T{ ##load-constant f 1 3.5 }
- T{ ##compare-branch f 0 1 cc= }
+ T{ ##load-immediate f 1 100 }
+ T{ ##compare-imm-branch f 0 100 cc>= }
}
] [
{
T{ ##peek f 0 D 0 }
- T{ ##load-constant f 1 3.5 }
- T{ ##compare-branch f 0 1 cc= }
+ T{ ##load-immediate f 1 100 }
+ T{ ##compare-branch f 1 0 cc<= }
+ } value-numbering-step trim-temps
+] unit-test
+
+! Branch folding
+[
+ {
+ T{ ##load-immediate f 1 100 }
+ T{ ##load-immediate f 2 200 }
+ T{ ##load-constant f 3 t }
+ }
+] [
+ {
+ T{ ##load-immediate f 1 100 }
+ T{ ##load-immediate f 2 200 }
+ T{ ##compare f 3 1 2 cc<= }
} value-numbering-step trim-temps
] unit-test
[
{
- T{ ##peek f 0 D 0 }
T{ ##load-immediate f 1 100 }
- T{ ##compare-imm-branch f 0 100 cc>= }
+ T{ ##load-immediate f 2 200 }
+ T{ ##load-constant f 3 f }
}
] [
{
- T{ ##peek f 0 D 0 }
T{ ##load-immediate f 1 100 }
- T{ ##compare-branch f 1 0 cc<= }
+ T{ ##load-immediate f 2 200 }
+ T{ ##compare f 3 1 2 cc= }
+ } value-numbering-step trim-temps
+] unit-test
+
+[
+ {
+ T{ ##load-immediate f 1 100 }
+ T{ ##load-constant f 2 f }
+ }
+] [
+ {
+ T{ ##load-immediate f 1 100 }
+ T{ ##compare-imm f 2 1 f cc= }
+ } value-numbering-step trim-temps
+] unit-test
+
+[
+ {
+ T{ ##load-constant f 1 f }
+ T{ ##load-constant f 2 t }
+ }
+] [
+ {
+ T{ ##load-constant f 1 f }
+ T{ ##compare-imm f 2 1 f cc= }
} value-numbering-step trim-temps
] unit-test
} value-numbering-step
] unit-test
+! Stupid constant folding corner case
+[
+ {
+ T{ ##load-constant f 1 f }
+ T{ ##load-immediate f 2 $[ \ f type-number ] }
+ }
+] [
+ {
+ T{ ##load-constant f 1 f }
+ T{ ##and-imm f 2 1 15 }
+ } value-numbering-step
+] unit-test
+
! Displaced alien optimizations
3 vreg-counter set-global
{
T{ ##load-immediate f 1 10 }
T{ ##load-immediate f 2 20 }
- T{ ##load-immediate f 3 $[ \ f type-number ] }
+ T{ ##load-constant f 3 f }
}
] [
{
{
T{ ##load-immediate f 1 10 }
T{ ##load-immediate f 2 20 }
- T{ ##load-immediate f 3 $[ \ f type-number ] }
+ T{ ##load-constant f 3 f }
}
] [
{
[
{
T{ ##peek f 0 D 0 }
- T{ ##load-immediate f 1 $[ \ f type-number ] }
+ T{ ##load-constant f 1 f }
}
] [
{
[
{
T{ ##peek f 0 D 0 }
- T{ ##load-immediate f 1 $[ \ f type-number ] }
+ T{ ##load-constant f 1 f }
}
] [
{
[
{
T{ ##peek f 0 D 0 }
- T{ ##load-immediate f 1 $[ \ f type-number ] }
+ T{ ##load-constant f 1 f }
}
] [
{
{
T{ ##peek f 0 D 0 }
T{ ##compare f 1 0 0 cc<= }
- T{ ##compare-imm-branch f 1 $[ \ f type-number ] cc/= }
+ T{ ##compare-imm-branch f 1 f cc/= }
} test-branch-folding
] unit-test
T{ ##copy { dst 21 } { src 20 } { rep any-rep } }
T{ ##compare-imm-branch
{ src1 21 }
- { src2 $[ \ f type-number ] }
+ { src2 f }
{ cc cc/= }
}
} 1 test-bb
CODEGEN: ##load-immediate %load-immediate
CODEGEN: ##load-reference %load-reference
CODEGEN: ##load-constant %load-reference
+CODEGEN: ##load-double %load-double
CODEGEN: ##peek %peek
CODEGEN: ##replace %replace
CODEGEN: ##inc-d %inc-d
: rel-word-pic-tail ( word class -- )
[ add-literal ] dip rt-entry-point-pic-tail rel-fixup ;
-: rel-immediate ( literal class -- )
+: rel-literal ( literal class -- )
[ add-literal ] dip rt-literal rel-fixup ;
+: rel-float ( literal class -- )
+ [ add-literal ] dip rt-float rel-fixup ;
+
: rel-this ( class -- )
rt-this rel-fixup ;
rt-vm
rt-cards-offset
rt-decks-offset
- rt-exception-handler ;
+ rt-exception-handler
+ rt-float ;
: rc-absolute? ( n -- ? )
${
compile-test-cfg
execute( -- result ) ;
-! loading immediates
+! loading constants
[ f ] [
V{
- T{ ##load-immediate f 0 $[ \ f type-number ] }
+ T{ ##load-constant f 0 f }
} compile-test-bb
] unit-test
integer generalize-counter-interval
] unit-test
-[ T{ interval f { 0 t } { $[ most-positive-fixnum ] t } } ] [
+[ T{ interval f { 0 t } { $[ max-array-capacity ] t } } ] [
T{ interval f { 1 t } { 1 t } }
T{ interval f { 0 t } { 0 t } }
fixnum generalize-counter-interval
-! Copyright (C) 2008, 2009 Slava Pestov.
+! Copyright (C) 2008, 2010 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences accessors arrays fry math math.intervals
-layouts combinators namespaces locals
+USING: kernel classes.algebra sequences accessors arrays fry
+math math.intervals layouts combinators namespaces locals
stack-checker.inlining
compiler.tree
compiler.tree.combinators
compiler.tree.propagation.simple
compiler.tree.propagation.branches
compiler.tree.propagation.constraints ;
+FROM: sequences.private => array-capacity ;
IN: compiler.tree.propagation.recursive
: check-fixed-point ( node infos1 infos2 -- )
[ label>> calls>> [ node>> node-input-infos ] map flip ]
[ latest-input-infos ] bi ;
+: counter-class ( interval class -- class' )
+ dup fixnum class<= [
+ swap array-capacity-interval interval-subset?
+ [ drop array-capacity ] when
+ ] [ nip ] if ;
+
:: generalize-counter-interval ( interval initial-interval class -- interval' )
+ interval class counter-class :> class
{
{ [ interval initial-interval interval-subset? ] [ initial-interval ] }
{ [ interval empty-interval eq? ] [ initial-interval ] }
! Copyright (C) 2008, 2010 Slava Pestov, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: alien.c-types kernel sequences words fry generic accessors
-classes.tuple classes classes.algebra definitions
-stack-checker.dependencies quotations classes.tuple.private math
-math.partial-dispatch math.private math.intervals sets.private
-math.floats.private math.integers.private layouts math.order
-vectors hashtables combinators effects generalizations assocs
-sets combinators.short-circuit sequences.private locals growable
+USING: alien.c-types kernel sequences words fry generic
+generic.single accessors classes.tuple classes classes.algebra
+definitions stack-checker.dependencies quotations
+classes.tuple.private math math.partial-dispatch math.private
+math.intervals sets.private math.floats.private
+math.integers.private layouts math.order vectors hashtables
+combinators effects generalizations assocs sets
+combinators.short-circuit sequences.private locals growable
stack-checker namespaces compiler.tree.propagation.info ;
FROM: math => float ;
FROM: sets => set ;
[ \ push def>> ] [ f ] if
] "custom-inlining" set-word-prop
+! Speeds up fasta benchmark
+\ >fixnum [
+ in-d>> first value-info class>> fixnum \ f class-or class<=
+ [ [ dup [ \ >fixnum no-method ] unless ] ] [ f ] if
+] "custom-inlining" set-word-prop
+
! We want to constant-fold calls to heap-size, and recompile those
! calls when a C type is redefined
\ heap-size [
! Mapping from register class to machine registers
HOOK: machine-registers cpu ( -- assoc )
-HOOK: %load-immediate cpu ( reg obj -- )
+HOOK: %load-immediate cpu ( reg val -- )
HOOK: %load-reference cpu ( reg obj -- )
+HOOK: %load-double cpu ( reg val -- )
HOOK: %peek cpu ( vreg loc -- )
HOOK: %replace cpu ( vreg loc -- )
M: stack-params param-reg 2drop ;
-! Is this integer small enough to be an immediate operand for
-! %add-imm, %sub-imm, and %mul-imm?
+! Does this architecture support %load-double?
+HOOK: load-double? cpu ( -- ? )
+
+M: object load-double? f ;
+
+! Can this value be an immediate operand for %add-imm, %sub-imm,
+! or %mul-imm?
HOOK: immediate-arithmetic? cpu ( n -- ? )
-! Is this integer small enough to be an immediate operand for
-! %and-imm, %or-imm, and %xor-imm?
+! Can this value be an immediate operand for %and-imm, %or-imm,
+! or %xor-imm?
HOOK: immediate-bitwise? cpu ( n -- ? )
-! What c-type describes the implicit struct return pointer for large structs?
+! Can this value be an immediate operand for %compare-imm or
+! %compare-imm-branch?
+HOOK: immediate-comparand? cpu ( n -- ? )
+
+M: object immediate-comparand? ( n -- ? )
+ {
+ { [ dup integer? ] [ immediate-arithmetic? ] }
+ { [ dup not ] [ drop t ] }
+ [ drop f ]
+ } cond ;
+
+! What c-type describes the implicit struct return pointer for
+! large structs?
HOOK: struct-return-pointer-type cpu ( -- c-type )
! Is this structure small enough to be returned in registers?
M: ppc %load-immediate ( reg n -- ) swap LOAD ;
M: ppc %load-reference ( reg obj -- )
- [ 0 swap LOAD32 ] [ rc-absolute-ppc-2/2 rel-immediate ] bi* ;
+ [ 0 swap LOAD32 ] [ rc-absolute-ppc-2/2 rel-literal ] bi* ;
M: ppc %alien-global ( register symbol dll -- )
[ 0 swap LOAD32 ] 2dip rc-absolute-ppc-2/2 rel-dlsym ;
} case ;
: (%compare) ( src1 src2 -- ) [ 0 ] dip CMP ; inline
-: (%compare-imm) ( src1 src2 -- ) [ 0 ] 2dip CMPI ; inline
+: (%compare-imm) ( src1 src2 -- ) [ 0 ] [ ] [ \ f type-number or ] tri* CMPI ; inline
: (%compare-float-unordered) ( src1 src2 -- ) [ 0 ] dip FCMPU ; inline
: (%compare-float-ordered) ( src1 src2 -- ) [ 0 ] dip FCMPO ; inline
! See http://factorcode.org/license.txt for BSD license.
USING: locals alien alien.c-types alien.libraries alien.syntax
arrays kernel fry math namespaces sequences system layouts io
-vocabs.loader accessors init classes.struct combinators command-line
-make compiler compiler.units compiler.constants compiler.alien
-compiler.codegen compiler.codegen.fixup
-compiler.cfg.instructions compiler.cfg.builder
-compiler.cfg.intrinsics compiler.cfg.stack-frame
-cpu.x86.assembler cpu.x86.assembler.operands cpu.x86
-cpu.architecture vm ;
+vocabs.loader accessors init classes.struct combinators
+command-line make words compiler compiler.units
+compiler.constants compiler.alien compiler.codegen
+compiler.codegen.fixup compiler.cfg.instructions
+compiler.cfg.builder compiler.cfg.intrinsics
+compiler.cfg.stack-frame cpu.x86.assembler
+cpu.x86.assembler.operands cpu.x86 cpu.architecture vm ;
FROM: layouts => cell ;
IN: cpu.x86.32
M: x86.32 frame-reg EBP ;
M: x86.32 temp-reg ECX ;
+M: x86.32 immediate-comparand? ( n -- ? )
+ [ call-next-method ] [ word? ] bi or ;
+
+M: x86.32 load-double? ( -- ? ) t ;
+
+M: x86.32 %load-double ( dst val -- )
+ [ 0 [] MOVSD ] dip rc-absolute rel-float ;
+
M: x86.32 %mov-vm-ptr ( reg -- )
0 MOV 0 rc-absolute-cell rel-vm ;
M: x86 %load-immediate dup 0 = [ drop dup XOR ] [ MOV ] if ;
-M: x86 %load-reference swap 0 MOV rc-absolute-cell rel-immediate ;
+M: x86 %load-reference swap 0 MOV rc-absolute-cell rel-literal ;
HOOK: ds-reg cpu ( -- reg )
HOOK: rs-reg cpu ( -- reg )
M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
-:: %boolean ( dst temp word -- )
+:: (%boolean) ( dst temp insn -- )
dst \ f type-number MOV
- temp 0 MOV \ t rc-absolute-cell rel-immediate
- dst temp word execute ; inline
-
-: (%compare) ( src1 src2 cc -- )
- 2over [ { cc= cc/= } member? ] [ register? ] [ 0 = ] tri* and and
- [ drop dup TEST ]
- [ CMP ] if ;
+ temp 0 MOV \ t rc-absolute-cell rel-literal
+ dst temp insn execute ; inline
+
+: %boolean ( dst cc temp -- )
+ swap order-cc {
+ { cc< [ \ CMOVL (%boolean) ] }
+ { cc<= [ \ CMOVLE (%boolean) ] }
+ { cc> [ \ CMOVG (%boolean) ] }
+ { cc>= [ \ CMOVGE (%boolean) ] }
+ { cc= [ \ CMOVE (%boolean) ] }
+ { cc/= [ \ CMOVNE (%boolean) ] }
+ } case ;
M:: x86 %compare ( dst src1 src2 cc temp -- )
- src1 src2 cc (%compare)
- cc order-cc {
- { cc< [ dst temp \ CMOVL %boolean ] }
- { cc<= [ dst temp \ CMOVLE %boolean ] }
- { cc> [ dst temp \ CMOVG %boolean ] }
- { cc>= [ dst temp \ CMOVGE %boolean ] }
- { cc= [ dst temp \ CMOVE %boolean ] }
- { cc/= [ dst temp \ CMOVNE %boolean ] }
- } case ;
+ src1 src2 CMP
+ dst cc temp %boolean ;
-M: x86 %compare-imm ( dst src1 src2 cc temp -- )
- %compare ;
+: use-test? ( src1 src2 cc -- ? )
+ [ register? ] [ 0 = ] [ { cc= cc/= } member? ] tri* and and ;
-M:: x86 %compare-branch ( label src1 src2 cc -- )
- src1 src2 cc (%compare)
- cc order-cc {
- { cc< [ label JL ] }
- { cc<= [ label JLE ] }
- { cc> [ label JG ] }
- { cc>= [ label JGE ] }
- { cc= [ label JE ] }
- { cc/= [ label JNE ] }
+: (%compare-tagged) ( src1 src2 -- )
+ [ HEX: ffffffff CMP ] dip rc-absolute rel-literal ;
+
+: (%compare-imm) ( src1 src2 cc -- )
+ {
+ { [ 3dup use-test? ] [ 2drop dup TEST ] }
+ { [ over integer? ] [ drop CMP ] }
+ { [ over word? ] [ drop (%compare-tagged) ] }
+ { [ over not ] [ 2drop \ f type-number CMP ] }
+ } cond ;
+
+M:: x86 %compare-imm ( dst src1 src2 cc temp -- )
+ src1 src2 cc (%compare-imm)
+ dst cc temp %boolean ;
+
+: %branch ( label cc -- )
+ order-cc {
+ { cc< [ JL ] }
+ { cc<= [ JLE ] }
+ { cc> [ JG ] }
+ { cc>= [ JGE ] }
+ { cc= [ JE ] }
+ { cc/= [ JNE ] }
} case ;
-M: x86 %compare-imm-branch ( label src1 src2 cc -- )
- %compare-branch ;
+M:: x86 %compare-branch ( label src1 src2 cc -- )
+ src1 src2 CMP
+ label cc %branch ;
+
+M:: x86 %compare-imm-branch ( label src1 src2 cc -- )
+ src1 src2 cc (%compare-imm)
+ label cc %branch ;
M: x86 %add-float double-rep two-operand ADDSD ;
M: x86 %sub-float double-rep two-operand SUBSD ;
:: (%compare-float) ( dst src1 src2 cc temp compare -- )
cc {
- { cc< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA %boolean ] }
- { cc<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
- { cc> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA %boolean ] }
- { cc>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
- { cc= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= %boolean ] }
- { cc<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE %boolean ] }
- { cc<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP %boolean ] }
- { cc/< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
- { cc/<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB %boolean ] }
- { cc/> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
- { cc/>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB %boolean ] }
- { cc/= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= %boolean ] }
- { cc/<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE %boolean ] }
- { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP %boolean ] }
+ { cc< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA (%boolean) ] }
+ { cc<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE (%boolean) ] }
+ { cc> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA (%boolean) ] }
+ { cc>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE (%boolean) ] }
+ { cc= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= (%boolean) ] }
+ { cc<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE (%boolean) ] }
+ { cc<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP (%boolean) ] }
+ { cc/< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE (%boolean) ] }
+ { cc/<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB (%boolean) ] }
+ { cc/> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE (%boolean) ] }
+ { cc/>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB (%boolean) ] }
+ { cc/= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= (%boolean) ] }
+ { cc/<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE (%boolean) ] }
+ { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP (%boolean) ] }
} case ; inline
M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
:: %test-vector-mask ( dst temp mask vcc -- )
vcc {
- { vcc-any [ dst dst TEST dst temp \ CMOVNE %boolean ] }
- { vcc-none [ dst dst TEST dst temp \ CMOVE %boolean ] }
- { vcc-all [ dst mask CMP dst temp \ CMOVE %boolean ] }
- { vcc-notall [ dst mask CMP dst temp \ CMOVNE %boolean ] }
+ { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] }
+ { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] }
+ { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] }
+ { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] }
} case ;
: %move-vector-mask ( dst src rep -- mask )
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs byte-arrays byte-vectors classes
combinators definitions effects fry generic generic.single
-generic.standard hashtables io.binary io.streams.string kernel
-kernel.private math math.integers.private math.parser
-namespaces parser sbufs sequences splitting splitting.private strings
-vectors words ;
+generic.standard hashtables io.binary io.encodings
+io.streams.string kernel kernel.private math
+math.integers.private math.parser namespaces parser sbufs
+sequences splitting splitting.private strings vectors words ;
IN: hints
GENERIC: specializer-predicate ( spec -- quot )
M\ hashtable set-at { { object fixnum object } { object word object } } "specializer" set-word-prop
\ bignum/f { { bignum bignum } { bignum fixnum } { fixnum bignum } { fixnum fixnum } } "specializer" set-word-prop
+
+\ encode-string { string object object } "specializer" set-word-prop
! Copyright (C) 2008 Daniel Ehrenberg, Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: math.parser arrays io.encodings sequences kernel assocs
-hashtables io.encodings.ascii generic parser classes.tuple words
-words.symbol io io.files splitting namespaces math
-compiler.units accessors classes.singleton classes.mixin
-io.encodings.iana fry simple-flat-file lexer ;
+USING: arrays assocs biassocs kernel io.encodings math.parser
+sequences hashtables io.encodings.ascii generic parser
+classes.tuple words words.symbol io io.files splitting
+namespaces math compiler.units accessors classes.singleton
+classes.mixin io.encodings.iana fry simple-flat-file lexer ;
IN: io.encodings.8-bit
<PRIVATE
SYMBOL: 8-bit-encodings
8-bit-encodings [ H{ } clone ] initialize
-TUPLE: 8-bit biassoc ;
+TUPLE: 8-bit { biassoc biassoc read-only } ;
-: encode-8-bit ( char stream assoc -- )
- swapd value-at
- [ swap stream-write1 ] [ encode-error ] if* ; inline
+: 8-bit-encode ( char 8-bit -- byte )
+ biassoc>> value-at [ encode-error ] unless* ; inline
-M: 8-bit encode-char biassoc>> encode-8-bit ;
+M: 8-bit encode-char
+ swap [ 8-bit-encode ] dip stream-write1 ;
-: decode-8-bit ( stream assoc -- char/f )
- swap stream-read1
- [ swap at [ replacement-char ] unless* ]
- [ drop f ] if* ; inline
+M: 8-bit encode-string
+ swap [ '[ _ 8-bit-encode ] B{ } map-as ] dip stream-write ;
-M: 8-bit decode-char biassoc>> decode-8-bit ;
+M: 8-bit decode-char
+ swap stream-read1 dup
+ [ swap biassoc>> at [ replacement-char ] unless* ]
+ [ 2drop f ]
+ if ;
MIXIN: 8-bit-encoding
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: io io.encodings kernel math io.encodings.private ;
+USING: accessors byte-arrays io io.encodings
+io.encodings.private kernel math sequences ;
IN: io.encodings.ascii
-<PRIVATE
-: encode-if< ( char stream encoding max -- )
- nip 1 - pick < [ encode-error ] [ stream-write1 ] if ; inline
-
-: decode-if< ( stream encoding max -- character )
- nip swap stream-read1 dup
- [ [ nip ] [ > ] 2bi [ >fixnum ] [ drop replacement-char ] if ]
- [ 2drop f ] if ; inline
-PRIVATE>
-
SINGLETON: ascii
M: ascii encode-char
- 128 encode-if< ; inline
+ drop
+ over 127 <= [ stream-write1 ] [ encode-error ] if ; inline
+
+M: ascii encode-string
+ drop
+ [
+ dup aux>>
+ [ [ dup 127 <= [ encode-error ] unless ] B{ } map-as ]
+ [ >byte-array ]
+ if
+ ] dip
+ stream-write ;
M: ascii decode-char
- 128 decode-if< ; inline
+ drop
+ stream-read1 dup [
+ dup 127 <= [ >fixnum ] [ drop replacement-char ] if
+ ] when ; inline
: write-in-groups ( byte-array port -- )
[ binary-object <direct-uchar-array> ] dip
- [ buffer>> size>> <groups> ] [ '[ _ stream-write ] ] bi
+ [ buffer>> size>> <sliced-groups> ] [ '[ _ stream-write ] ] bi
each ;
M: output-port stream-write
HINTS: decoder-read-until { string input-port utf8 } { string input-port ascii } ;
HINTS: decoder-readln { input-port utf8 } { input-port ascii } ;
-
-HINTS: encoder-write { object output-port utf8 } { object output-port ascii } ;
--- /dev/null
+Slava Pestov
--- /dev/null
+! Copyright (C) 2010 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien byte-vectors io kernel libc math sequences ;
+IN: io.streams.byte-array.fast
+
+! This is split off from io.streams.byte-array because it uses
+! memcpy, which is a non-core word that only works after the
+! optimizing compiler has been loaded.
+
+M: byte-vector stream-write
+ [ dup byte-length tail-slice ]
+ [ [ [ byte-length ] bi@ + ] keep lengthen ]
+ [ drop byte-length ]
+ 2tri
+ [ >c-ptr swap >c-ptr ] dip memcpy ;
-! Copyright (C) 2008 Daniel Ehrenberg.
+! Copyright (C) 2008, 2010 Daniel Ehrenberg, Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces growable
strings io classes continuations destructors combinators
GENERIC: encode-char ( char stream encoding -- )
+GENERIC: encode-string ( string stream encoding -- )
+
+M: object encode-string [ encode-char ] 2curry each ; inline
+
GENERIC: <decoder> ( stream encoding -- newstream )
CONSTANT: replacement-char HEX: fffd
M: encoder stream-write1
>encoder< encode-char ;
-GENERIC# encoder-write 2 ( string stream encoding -- )
-
-M: string encoder-write
- [ encode-char ] 2curry each ;
-
M: encoder stream-write
- >encoder< encoder-write ;
+ >encoder< encode-string ;
M: encoder dispose stream>> dispose ;
! Copyright (C) 2006, 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
-USING: math math.order kernel sequences sbufs vectors growable io
-continuations namespaces io.encodings combinators strings ;
+USING: accessors byte-arrays math math.order kernel sequences
+sbufs vectors growable io continuations namespaces io.encodings
+combinators strings ;
IN: io.encodings.utf8
! Decoding UTF-8
! Encoding UTF-8
: encoded ( stream char -- )
- BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;
+ BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ; inline
-: char>utf8 ( stream char -- )
- {
+: char>utf8 ( char stream -- )
+ swap {
{ [ dup -7 shift zero? ] [ swap stream-write1 ] }
{ [ dup -11 shift zero? ] [
2dup -6 shift BIN: 11000000 bitor swap stream-write1
2dup -6 shift encoded
encoded
]
- } cond ;
+ } cond ; inline
M: utf8 encode-char
- drop swap char>utf8 ;
+ drop char>utf8 ;
+
+M: utf8 encode-string
+ drop
+ over aux>>
+ [ [ char>utf8 ] curry each ]
+ [ [ >byte-array ] dip stream-write ] if ;
PRIVATE>
! Copyright (C) 2003, 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math.private sequences kernel.private
-math sequences.private slots.private byte-arrays
-alien.accessors ;
+math sequences.private slots.private alien.accessors ;
IN: strings
<PRIVATE
n 2 * ALU "Homo sapiens alu" "ONE" write-repeat-fasta
initial-seed
+
n 3 * homo-sapiens-chars homo-sapiens-floats
"IUB ambiguity codes" "TWO" write-random-fasta
+
n 5 * IUB-chars IUB-floats
"Homo sapiens frequency" "THREE" write-random-fasta
+
drop
] with-file-writer
] ;
! Copyright (C) 2010 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien alien.c-types alien.data alien.parser
-alien.strings arrays assocs byte-arrays classes.struct
-combinators continuations cuda.ffi destructors fry io
-io.backend io.encodings.string io.encodings.utf8 kernel lexer
-locals math math.parser namespaces opengl.gl.extensions
-prettyprint quotations sequences ;
+USING: accessors alien alien.data alien.parser alien.strings
+alien.syntax arrays assocs byte-arrays classes.struct
+combinators continuations cuda.ffi cuda.memory cuda.utils
+destructors fry io io.backend io.encodings.string
+io.encodings.utf8 kernel lexer locals macros math math.parser
+namespaces nested-comments opengl.gl.extensions parser
+prettyprint quotations sequences words ;
+QUALIFIED-WITH: alien.c-types a
IN: cuda
-SYMBOL: cuda-device
-SYMBOL: cuda-context
-SYMBOL: cuda-module
-SYMBOL: cuda-function
-SYMBOL: cuda-launcher
-SYMBOL: cuda-memory-hashtable
-
-ERROR: throw-cuda-error n ;
-
-: cuda-error ( n -- )
- dup CUDA_SUCCESS = [ drop ] [ throw-cuda-error ] if ;
-
-: cuda-version ( -- n )
- int <c-object> [ cuDriverGetVersion cuda-error ] keep *int ;
-
-: init-cuda ( -- )
- 0 cuInit cuda-error ;
-
TUPLE: launcher
{ device integer initial: 0 }
-{ device-flags initial: 0 }
-path block-shape shared-size grid ;
+{ device-flags initial: 0 } ;
-: with-cuda-context ( flags device quot -- )
- [
- [ CUcontext <c-object> ] 2dip
- [ cuCtxCreate cuda-error ] 3keep 2drop *void*
- ] dip
- [ '[ _ @ ] ]
- [ drop '[ _ cuCtxDestroy cuda-error ] ] 2bi
- [ ] cleanup ; inline
+TUPLE: function-launcher
+dim-block dim-grid shared-size stream ;
-: with-cuda-module ( path quot -- )
- [
- normalize-path
- [ CUmodule <c-object> ] dip
- [ cuModuleLoad cuda-error ] 2keep drop *void*
- ] dip
+: with-cuda-context ( flags device quot -- )
+ H{ } clone cuda-modules set-global
+ H{ } clone cuda-functions set
+ [ create-context ] dip
[ '[ _ @ ] ]
- [ drop '[ _ cuModuleUnload cuda-error ] ] 2bi
+ [ drop '[ _ destroy-context ] ] 2bi
[ ] cleanup ; inline
-: with-cuda-program ( flags device path quot -- )
+: with-cuda-program ( flags device quot -- )
[ dup cuda-device set ] 2dip
- '[
- cuda-context set
- _ [
- cuda-module set
- _ call
- ] with-cuda-module
- ] with-cuda-context ; inline
+ '[ cuda-context set _ call ] with-cuda-context ; inline
: with-cuda ( launcher quot -- )
- [
- init-cuda
- H{ } clone cuda-memory-hashtable
- ] 2dip '[
+ init-cuda
+ [ H{ } clone cuda-memory-hashtable ] 2dip '[
_
[ cuda-launcher set ]
- [ [ device>> ] [ device-flags>> ] [ path>> ] tri ] bi
+ [ [ device>> ] [ device-flags>> ] bi ] bi
_ with-cuda-program
] with-variable ; inline
-<PRIVATE
-
-: #cuda-devices ( -- n )
- int <c-object> [ cuDeviceGetCount cuda-error ] keep *int ;
-
-: n>cuda-device ( n -- device )
- [ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ;
-
-: enumerate-cuda-devices ( -- devices )
- #cuda-devices iota [ n>cuda-device ] map ;
-
-: cuda-device-properties ( device -- properties )
- [ CUdevprop <c-object> ] dip
- [ cuDeviceGetProperties cuda-error ] 2keep drop
- CUdevprop memory>struct ;
-
-PRIVATE>
-
-: cuda-devices ( -- assoc )
- enumerate-cuda-devices [ dup cuda-device-properties ] { } map>assoc ;
-
-: cuda-device-name ( n -- string )
- [ 256 [ <byte-array> ] keep ] dip
- [ cuDeviceGetName cuda-error ]
- [ 2drop utf8 alien>string ] 3bi ;
-
-: cuda-device-capability ( n -- pair )
- [ int <c-object> int <c-object> ] dip
- [ cuDeviceComputeCapability cuda-error ]
- [ drop [ *int ] bi@ ] 3bi 2array ;
-
-: cuda-device-memory ( n -- bytes )
- [ uint <c-object> ] dip
- [ cuDeviceTotalMem cuda-error ]
- [ drop *uint ] 2bi ;
-
-: get-cuda-function* ( module string -- function )
- [ CUfunction <c-object> ] 2dip
- [ cuModuleGetFunction cuda-error ] 3keep 2drop *void* ;
-
-: get-cuda-function ( string -- function )
- [ cuda-module get ] dip get-cuda-function* ;
-
-: with-cuda-function ( string quot -- )
- [
- get-cuda-function cuda-function set
- ] dip call ; inline
-
-: launch-function* ( function -- ) cuLaunch cuda-error ;
-
-: launch-function ( -- ) cuda-function get cuLaunch cuda-error ;
-
-: launch-function-grid* ( function width height -- )
- cuLaunchGrid cuda-error ;
-
-: launch-function-grid ( width height -- )
- [ cuda-function get ] 2dip
- cuLaunchGrid cuda-error ;
-
-TUPLE: cuda-memory < disposable ptr length ;
-
-: <cuda-memory> ( ptr length -- obj )
- cuda-memory new-disposable
- swap >>length
- swap >>ptr ;
-
-: add-cuda-memory ( obj -- obj )
- dup dup ptr>> cuda-memory-hashtable get set-at ;
-
-: delete-cuda-memory ( obj -- )
- cuda-memory-hashtable delete-at ;
-
-ERROR: invalid-cuda-memory ptr ;
-
-: cuda-memory-length ( cuda-memory -- n )
- ptr>> cuda-memory-hashtable get ?at [
- length>>
- ] [
- invalid-cuda-memory
- ] if ;
-
-M: cuda-memory byte-length length>> ;
-
-: cuda-malloc ( n -- ptr )
- [ CUdeviceptr <c-object> ] dip
- [ cuMemAlloc cuda-error ] 2keep
- [ *int ] dip <cuda-memory> add-cuda-memory ;
-
-: cuda-free* ( ptr -- )
- cuMemFree cuda-error ;
-
-M: cuda-memory dispose ( ptr -- )
- ptr>> cuda-free* ;
-
-: host>device ( dest-ptr src-ptr -- )
- [ ptr>> ] dip dup length cuMemcpyHtoD cuda-error ;
-
-:: device>host ( ptr -- seq )
- ptr byte-length <byte-array>
- [ ptr [ ptr>> ] [ byte-length ] bi cuMemcpyDtoH cuda-error ] keep ;
-
-: memcpy-device>device ( dest-ptr src-ptr count -- )
- cuMemcpyDtoD cuda-error ;
-
-: memcpy-device>array ( dest-array dest-index src-ptr count -- )
- cuMemcpyDtoA cuda-error ;
-
-: memcpy-array>device ( dest-ptr src-array src-index count -- )
- cuMemcpyAtoD cuda-error ;
-
-: memcpy-array>host ( dest-ptr src-array src-index count -- )
- cuMemcpyAtoH cuda-error ;
-
-: memcpy-host>array ( dest-array dest-index src-ptr count -- )
- cuMemcpyHtoA cuda-error ;
-
-: memcpy-array>array ( dest-array dest-index src-array src-ptr count -- )
- cuMemcpyAtoA cuda-error ;
-
-: cuda-int* ( function offset value -- )
- cuParamSeti cuda-error ;
-
-: cuda-int ( offset value -- )
- [ cuda-function get ] 2dip cuda-int* ;
-
-: cuda-float* ( function offset value -- )
- cuParamSetf cuda-error ;
-
-: cuda-float ( offset value -- )
- [ cuda-function get ] 2dip cuda-float* ;
-
-: cuda-vector* ( function offset ptr n -- )
- cuParamSetv cuda-error ;
-
-: cuda-vector ( offset ptr n -- )
- [ cuda-function get ] 3dip cuda-vector* ;
-
-: param-size* ( function n -- )
- cuParamSetSize cuda-error ;
-
-: param-size ( n -- )
- [ cuda-function get ] dip param-size* ;
-
-: malloc-device-string ( string -- n )
- utf8 encode
- [ length cuda-malloc ] keep
- [ host>device ] [ drop ] 2bi ;
-
-ERROR: bad-cuda-parameter parameter ;
-
-:: set-parameters ( seq -- )
- cuda-function get :> function
- 0 :> offset!
- seq [
- [ offset ] dip
- {
- { [ dup cuda-memory? ] [ ptr>> cuda-int ] }
- { [ dup float? ] [ cuda-float ] }
- { [ dup integer? ] [ cuda-int ] }
- [ bad-cuda-parameter ]
- } cond
- offset 4 + offset!
- ] each
- offset param-size ;
-
-: cuda-device-attribute ( attribute dev -- n )
- [ int <c-object> ] 2dip
- [ cuDeviceGetAttribute cuda-error ]
- [ 2drop *int ] 3bi ;
-
-: function-block-shape* ( function x y z -- )
- cuFuncSetBlockShape cuda-error ;
-
-: function-block-shape ( x y z -- )
- [ cuda-function get ] 3dip
- cuFuncSetBlockShape cuda-error ;
-
-: function-shared-size* ( function n -- )
- cuFuncSetSharedSize cuda-error ;
-
-: function-shared-size ( n -- )
- [ cuda-function get ] dip
- cuFuncSetSharedSize cuda-error ;
-
-: launch ( -- )
- cuda-launcher get {
- [ block-shape>> first3 function-block-shape ]
- [ shared-size>> function-shared-size ]
+: c-type>cuda-setter ( c-type -- n cuda-type )
+ {
+ { [ dup a:int = ] [ drop 4 [ cuda-int* ] ] }
+ { [ dup a:uint = ] [ drop 4 [ cuda-int* ] ] }
+ { [ dup a:float = ] [ drop 4 [ cuda-float* ] ] }
+ { [ dup a:pointer? ] [ drop 4 [ ptr>> cuda-int* ] ] }
+ { [ dup a:void* = ] [ drop 4 [ ptr>> cuda-int* ] ] }
+ } cond ;
+
+: run-function-launcher ( function-launcher function -- )
+ swap
+ {
+ [ dim-block>> first3 function-block-shape* ]
+ [ shared-size>> function-shared-size* ]
[
- grid>> [
- launch-function
+ dim-grid>> [
+ launch-function*
] [
- first2 launch-function-grid
+ first2 launch-function-grid*
] if-empty
]
- } cleave ;
+ } 2cleave ;
-: cuda-device. ( n -- )
- {
- [ "Device: " write number>string print ]
- [ "Name: " write cuda-device-name print ]
- [ "Memory: " write cuda-device-memory number>string print ]
- [
- "Capability: " write
- cuda-device-capability [ number>string ] map " " join print
- ]
- [ "Properties: " write cuda-device-properties . ]
- [
- "CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: " write
- CU_DEVICE_ATTRIBUTE_GPU_OVERLAP swap
- cuda-device-attribute number>string print
- ]
- } cleave ;
+: cuda-argument-setter ( offset c-type -- offset' quot )
+ c-type>cuda-setter
+ [ over [ + ] dip ] dip
+ '[ swap _ swap _ call ] ;
-: cuda. ( -- )
- "CUDA Version: " write cuda-version number>string print nl
- #cuda-devices iota [ nl ] [ cuda-device. ] interleave ;
+MACRO: cuda-arguments ( c-types -- quot: ( args... function -- ) )
+ [ 0 ] dip [ cuda-argument-setter ] map reverse
+ swap '[ _ param-size* ] suffix
+ '[ _ cleave ] ;
-
-: test-cuda0 ( -- )
- T{ launcher
- { path "vocab:cuda/hello.ptx" }
- { block-shape { 6 6 6 } }
- { shared-size 2 }
- { grid { 2 6 } }
- } [
- "helloWorld" [
- "Hello World!" [ - ] map-index
- malloc-device-string &dispose
-
- [ 1array set-parameters ]
- [ drop launch ]
- [ device>host utf8 alien>string . ] tri
- ] with-cuda-function
- ] with-cuda ;
+: define-cuda-word ( word module-name function-name arguments -- )
+ [
+ '[
+ _ _ cached-function
+ [ nip _ cuda-arguments ]
+ [ run-function-launcher ] 2bi
+ ]
+ ]
+ [ 2nip \ function-launcher suffix a:void function-effect ]
+ 3bi define-declared ;
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types alien.strings cuda cuda.memory cuda.syntax
+destructors io io.encodings.utf8 kernel locals math sequences ;
+IN: cuda.demos.hello-world
+
+CUDA-LIBRARY: hello vocab:cuda/demos/hello-world/hello.ptx
+
+CUDA-FUNCTION: helloWorld ( char* string-ptr ) ;
+
+:: cuda-hello-world ( -- )
+ T{ launcher { device 0 } } [
+ "Hello World!" [ - ] map-index malloc-device-string
+ &dispose dup :> str
+
+ { 6 1 1 } { 2 1 } 1 3<<< helloWorld
+
+ str device>host utf8 alien>string print
+ ] with-cuda ;
+
+MAIN: cuda-hello-world
--- /dev/null
+/*
+ World using CUDA
+**
+** The string "Hello World!" is mangled then restored using a common CUDA idiom
+**
+** Byron Galbraith
+** 2009-02-18
+*/
+#include <cuda.h>
+#include <stdio.h>
+
+// Prototypes
+extern "C" __global__ void helloWorld(char*);
+
+// Host function
+int
+main(int argc, char** argv)
+{
+ int i;
+
+ // desired output
+ char str[] = "Hello World!";
+
+ // mangle contents of output
+ // the null character is left intact for simplicity
+ for(i = 0; i < 12; i++)
+ str[i] -= i;
+
+ // allocate memory on the device
+ char *d_str;
+ size_t size = sizeof(str);
+ cudaMalloc((void**)&d_str, size);
+
+ // copy the string to the device
+ cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
+
+ // set the grid and block sizes
+ dim3 dimGrid(2); // one block per word
+ dim3 dimBlock(6); // one thread per character
+
+ // invoke the kernel
+ helloWorld<<< dimGrid, dimBlock >>>(d_str);
+
+ // retrieve the results from the device
+ cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
+
+ // free up the allocated memory on the device
+ cudaFree(d_str);
+
+ // everyone's favorite part
+ printf("%s\n", str);
+
+ return 0;
+}
+
+// Device kernel
+__global__ void
+helloWorld(char* str)
+{
+ // determine where in the thread grid we are
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // unmangle output
+ str[idx] += idx;
+}
--- /dev/null
+ .version 1.4
+ .target sm_10, map_f64_to_f32
+ // compiled with /usr/local/cuda/bin/../open64/lib//be
+ // nvopencc 3.0 built on 2010-03-11
+
+ //-----------------------------------------------------------
+ // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Options:
+ //-----------------------------------------------------------
+ // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+ // -O3 (Optimization level)
+ // -g0 (Debug level)
+ // -m2 (Report advisories)
+ //-----------------------------------------------------------
+
+ .file 1 "<command-line>"
+ .file 2 "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
+ .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+ .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+ .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
+ .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
+ .file 7 "/usr/local/cuda/bin/../include/device_types.h"
+ .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
+ .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
+ .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
+ .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+ .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
+ .file 13 "/usr/include/i386/_types.h"
+ .file 14 "/usr/include/time.h"
+ .file 15 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+ .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
+ .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
+ .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
+ .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
+ .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
+ .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+ .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+ .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+ .file 24 "/usr/local/cuda/bin/../include/common_types.h"
+ .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+ .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+ .file 27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+ .file 28 "hello.cu"
+
+
+ .entry helloWorld (
+ .param .u32 __cudaparm_helloWorld_str)
+ {
+ .reg .u16 %rh<4>;
+ .reg .u32 %r<9>;
+ .loc 28 58 0
+$LBB1_helloWorld:
+ .loc 28 64 0
+ mov.u16 %rh1, %ctaid.x;
+ mov.u16 %rh2, %ntid.x;
+ mul.wide.u16 %r1, %rh1, %rh2;
+ cvt.u32.u16 %r2, %tid.x;
+ add.u32 %r3, %r2, %r1;
+ ld.param.u32 %r4, [__cudaparm_helloWorld_str];
+ add.u32 %r5, %r4, %r3;
+ ld.global.s8 %r6, [%r5+0];
+ add.s32 %r7, %r6, %r3;
+ st.global.s8 [%r5+0], %r7;
+ .loc 28 65 0
+ exit;
+$LDWend_helloWorld:
+ } // helloWorld
+
--- /dev/null
+Doug Coleman
+Joe Groff
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+static const int LOG_BANK_COUNT = 4;
+
+static inline __device__ __host__ unsigned shared_offset(unsigned i)
+{
+ return i + (i >> LOG_BANK_COUNT);
+}
+
+static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
+{
+ return shared_offset(offset * (2*i + 1) - 1);
+}
+
+static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
+{
+ return shared_offset(offset * (2*i + 2) - 1);
+}
+
+static inline __device__ __host__ unsigned lpot(unsigned x)
+{
+ --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
+}
+
+template<typename T>
+__global__ void prefix_sum_block(T *in, T *out, unsigned n)
+{
+ extern __shared__ T temp[];
+
+ int idx = threadIdx.x;
+ int blocksize = blockDim.x;
+
+ temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0;
+ temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
+
+ int offset, d;
+ for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
+ __syncthreads();
+ if (idx < d) {
+ unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+ temp[b] += temp[a];
+ }
+ }
+
+ if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
+
+ for (d = 1; d <= blocksize; d <<= 1) {
+ offset >>= 1;
+ __syncthreads();
+
+ if (idx < d) {
+ unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
+ unsigned t = temp[a];
+ temp[a] = temp[b];
+ temp[b] += t;
+ }
+ }
+ __syncthreads();
+
+ if (idx < n) out[idx ] = temp[shared_offset(idx )];
+ if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
+}
+
+template<typename T>
+void prefix_sum(T *in, T *out, unsigned n)
+{
+ char *device_values;
+ unsigned n_lpot = lpot(n);
+ size_t n_pitch;
+
+ cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
+ if (error != 0) {
+ printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
+ exit(1);
+ }
+
+ cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
+
+ prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
+ ((T*)device_values, (T*)(device_values + n_pitch), n);
+
+ cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
+ cudaFree(device_values);
+}
+
+int main()
+{
+ sranddev();
+
+ static unsigned in_values[1024], out_values[1024];
+
+ for (int i = 0; i < 1024; ++i)
+ in_values[i] = rand() >> 21;
+
+ prefix_sum(in_values, out_values, 1024);
+
+ for (int i = 0; i < 1024; ++i)
+ printf("%5d => %5d\n", in_values[i], out_values[i]);
+
+ return 0;
+}
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types cuda cuda.syntax locals ;
+IN: cuda.demos.prefix-sum
+
+CUDA-LIBRARY: prefix-sum vocab:cuda/demos/prefix-sum/prefix-sum.ptx
+
+CUDA-FUNCTION: prefix_sum_block ( uint* in, uint* out, uint n ) ;
+
+:: cuda-prefix-sum ( -- )
+ T{ launcher { device 0 } }
+ [
+ ! { 1 1 1 } { 2 1 } 0 3<<< prefix_sum_block
+ ] with-cuda ;
+
+MAIN: cuda-prefix-sum
--- /dev/null
+ .version 1.4
+ .target sm_10, map_f64_to_f32
+ // compiled with /usr/local/cuda/bin/../open64/lib//be
+ // nvopencc 3.0 built on 2010-03-11
+
+ //-----------------------------------------------------------
+ // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
+ //-----------------------------------------------------------
+
+ //-----------------------------------------------------------
+ // Options:
+ //-----------------------------------------------------------
+ // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
+ // -O3 (Optimization level)
+ // -g0 (Debug level)
+ // -m2 (Report advisories)
+ //-----------------------------------------------------------
+
+ .file 1 "<command-line>"
+ .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
+ .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
+ .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
+ .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
+ .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
+ .file 7 "/usr/local/cuda/bin/../include/device_types.h"
+ .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
+ .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
+ .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
+ .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
+ .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
+ .file 13 "/usr/include/i386/_types.h"
+ .file 14 "/usr/include/time.h"
+ .file 15 "prefix-sum.cu"
+ .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
+ .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
+ .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
+ .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
+ .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
+ .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
+ .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
+ .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
+ .file 24 "/usr/local/cuda/bin/../include/common_types.h"
+ .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
+ .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
+ .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
+ .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
+
+ .extern .shared .align 4 .b8 temp[];
+
+ .entry _Z16prefix_sum_blockIjEvPT_S1_j (
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
+ .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
+ {
+ .reg .u32 %r<81>;
+ .reg .pred %p<11>;
+ .loc 15 28 0
+$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
+ ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+ cvt.s32.u16 %r2, %tid.x;
+ setp.lt.u32 %p1, %r2, %r1;
+ @!%p1 bra $Lt_0_7938;
+ .loc 15 35 0
+ ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+ mul24.lo.u32 %r4, %r2, 4;
+ add.u32 %r5, %r3, %r4;
+ ld.global.u32 %r6, [%r5+0];
+ bra.uni $Lt_0_7682;
+$Lt_0_7938:
+ mov.u32 %r6, 0;
+$Lt_0_7682:
+ mov.u32 %r7, temp;
+ shr.u32 %r8, %r2, 4;
+ add.u32 %r9, %r2, %r8;
+ mul.lo.u32 %r10, %r9, 4;
+ add.u32 %r11, %r10, %r7;
+ st.shared.u32 [%r11+0], %r6;
+ cvt.s32.u16 %r12, %ntid.x;
+ add.s32 %r13, %r12, %r2;
+ .loc 15 28 0
+ ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
+ .loc 15 35 0
+ setp.lt.u32 %p2, %r13, %r1;
+ @!%p2 bra $Lt_0_8450;
+ .loc 15 36 0
+ ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
+ mul.lo.u32 %r15, %r13, 4;
+ add.u32 %r16, %r14, %r15;
+ ld.global.u32 %r17, [%r16+0];
+ bra.uni $Lt_0_8194;
+$Lt_0_8450:
+ mov.u32 %r17, 0;
+$Lt_0_8194:
+ shr.u32 %r18, %r13, 4;
+ add.u32 %r19, %r13, %r18;
+ mul.lo.u32 %r20, %r19, 4;
+ add.u32 %r21, %r20, %r7;
+ st.shared.u32 [%r21+0], %r17;
+ .loc 15 39 0
+ mov.s32 %r22, %r12;
+ mov.u32 %r23, 0;
+ setp.le.s32 %p3, %r12, %r23;
+ mov.s32 %r24, 1;
+ @%p3 bra $Lt_0_13314;
+$Lt_0_9218:
+ //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
+ .loc 15 40 0
+ bar.sync 0;
+ setp.le.s32 %p4, %r22, %r2;
+ @%p4 bra $Lt_0_9474;
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+ .loc 15 43 0
+ mul24.lo.u32 %r25, %r2, 2;
+ add.u32 %r26, %r25, 1;
+ add.u32 %r27, %r25, 2;
+ mul.lo.u32 %r28, %r24, %r26;
+ mul.lo.u32 %r29, %r24, %r27;
+ sub.u32 %r30, %r29, 1;
+ shr.u32 %r31, %r30, 4;
+ add.u32 %r32, %r29, %r31;
+ mul.lo.u32 %r33, %r32, 4;
+ add.u32 %r34, %r33, %r7;
+ ld.shared.u32 %r35, [%r34+-4];
+ sub.u32 %r36, %r28, 1;
+ shr.u32 %r37, %r36, 4;
+ add.u32 %r38, %r28, %r37;
+ mul.lo.u32 %r39, %r38, 4;
+ add.u32 %r40, %r7, %r39;
+ ld.shared.u32 %r41, [%r40+-4];
+ add.u32 %r42, %r35, %r41;
+ st.shared.u32 [%r34+-4], %r42;
+$Lt_0_9474:
+ //<loop> Part of loop body line 39, head labeled $Lt_0_9218
+ .loc 15 39 0
+ shr.s32 %r22, %r22, 1;
+ shl.b32 %r24, %r24, 1;
+ mov.u32 %r43, 0;
+ setp.gt.s32 %p5, %r22, %r43;
+ @%p5 bra $Lt_0_9218;
+ bra.uni $Lt_0_8706;
+$Lt_0_13314:
+$Lt_0_8706:
+ mov.u32 %r44, 0;
+ setp.ne.s32 %p6, %r2, %r44;
+ @%p6 bra $Lt_0_10242;
+ .loc 15 47 0
+ mul24.lo.s32 %r45, %r12, 2;
+ mov.u32 %r46, 0;
+ sub.u32 %r47, %r45, 1;
+ shr.u32 %r48, %r47, 4;
+ add.u32 %r49, %r45, %r48;
+ mul.lo.u32 %r50, %r49, 4;
+ add.u32 %r51, %r7, %r50;
+ st.shared.u32 [%r51+-4], %r46;
+$Lt_0_10242:
+ mov.u32 %r52, 1;
+ setp.lt.s32 %p7, %r12, %r52;
+ @%p7 bra $Lt_0_10754;
+ mov.s32 %r22, 1;
+$Lt_0_11266:
+ //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
+ .loc 15 50 0
+ shr.s32 %r24, %r24, 1;
+ .loc 15 51 0
+ bar.sync 0;
+ setp.le.s32 %p8, %r22, %r2;
+ @%p8 bra $Lt_0_11522;
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+ .loc 15 55 0
+ mul24.lo.u32 %r53, %r2, 2;
+ add.u32 %r54, %r53, 1;
+ mul.lo.u32 %r55, %r24, %r54;
+ sub.u32 %r56, %r55, 1;
+ shr.u32 %r57, %r56, 4;
+ add.u32 %r58, %r55, %r57;
+ mul.lo.u32 %r59, %r58, 4;
+ add.u32 %r60, %r59, %r7;
+ ld.shared.u32 %r61, [%r60+-4];
+ .loc 15 56 0
+ add.u32 %r62, %r53, 2;
+ mul.lo.u32 %r63, %r24, %r62;
+ sub.u32 %r64, %r63, 1;
+ shr.u32 %r65, %r64, 4;
+ add.u32 %r66, %r63, %r65;
+ mul.lo.u32 %r67, %r66, 4;
+ add.u32 %r68, %r67, %r7;
+ ld.shared.u32 %r69, [%r68+-4];
+ st.shared.u32 [%r60+-4], %r69;
+ .loc 15 57 0
+ ld.shared.u32 %r70, [%r68+-4];
+ add.u32 %r71, %r70, %r61;
+ st.shared.u32 [%r68+-4], %r71;
+$Lt_0_11522:
+ //<loop> Part of loop body line 47, head labeled $Lt_0_11266
+ .loc 15 49 0
+ shl.b32 %r22, %r22, 1;
+ setp.le.s32 %p9, %r22, %r12;
+ @%p9 bra $Lt_0_11266;
+$Lt_0_10754:
+ .loc 15 60 0
+ bar.sync 0;
+ @!%p1 bra $Lt_0_12290;
+ .loc 15 62 0
+ ld.shared.u32 %r72, [%r11+0];
+ ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+ mul24.lo.u32 %r74, %r2, 4;
+ add.u32 %r75, %r73, %r74;
+ st.global.u32 [%r75+0], %r72;
+$Lt_0_12290:
+ @!%p2 bra $Lt_0_12802;
+ .loc 15 63 0
+ ld.shared.u32 %r76, [%r21+0];
+ ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
+ mul.lo.u32 %r78, %r13, 4;
+ add.u32 %r79, %r77, %r78;
+ st.global.u32 [%r79+0], %r76;
+$Lt_0_12802:
+ .loc 15 64 0
+ exit;
+$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
+ } // _Z16prefix_sum_blockIjEvPT_S1_j
+
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.c-types alien.data alien.strings arrays assocs
+byte-arrays classes.struct combinators cuda.ffi cuda.utils io
+io.encodings.utf8 kernel math.parser prettyprint sequences ;
+IN: cuda.devices
+
+: #cuda-devices ( -- n )
+ int <c-object> [ cuDeviceGetCount cuda-error ] keep *int ;
+
+: n>cuda-device ( n -- device )
+ [ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ;
+
+: enumerate-cuda-devices ( -- devices )
+ #cuda-devices iota [ n>cuda-device ] map ;
+
+: cuda-device-properties ( device -- properties )
+ [ CUdevprop <c-object> ] dip
+ [ cuDeviceGetProperties cuda-error ] 2keep drop
+ CUdevprop memory>struct ;
+
+: cuda-devices ( -- assoc )
+ enumerate-cuda-devices [ dup cuda-device-properties ] { } map>assoc ;
+
+: cuda-device-name ( n -- string )
+ [ 256 [ <byte-array> ] keep ] dip
+ [ cuDeviceGetName cuda-error ]
+ [ 2drop utf8 alien>string ] 3bi ;
+
+: cuda-device-capability ( n -- pair )
+ [ int <c-object> int <c-object> ] dip
+ [ cuDeviceComputeCapability cuda-error ]
+ [ drop [ *int ] bi@ ] 3bi 2array ;
+
+: cuda-device-memory ( n -- bytes )
+ [ uint <c-object> ] dip
+ [ cuDeviceTotalMem cuda-error ]
+ [ drop *uint ] 2bi ;
+
+: cuda-device-attribute ( attribute dev -- n )
+ [ int <c-object> ] 2dip
+ [ cuDeviceGetAttribute cuda-error ]
+ [ 2drop *int ] 3bi ;
+
+: cuda-device. ( n -- )
+ {
+ [ "Device: " write number>string print ]
+ [ "Name: " write cuda-device-name print ]
+ [ "Memory: " write cuda-device-memory number>string print ]
+ [
+ "Capability: " write
+ cuda-device-capability [ number>string ] map " " join print
+ ]
+ [ "Properties: " write cuda-device-properties . ]
+ [
+ "CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: " write
+ CU_DEVICE_ATTRIBUTE_GPU_OVERLAP swap
+ cuda-device-attribute number>string print
+ ]
+ } cleave ;
+
+: cuda. ( -- )
+ "CUDA Version: " write cuda-version number>string print nl
+ #cuda-devices iota [ nl ] [ cuda-device. ] interleave ;
+
+++ /dev/null
-/*
- World using CUDA
-**
-** The string "Hello World!" is mangled then restored using a common CUDA idiom
-**
-** Byron Galbraith
-** 2009-02-18
-*/
-#include <cuda.h>
-#include <stdio.h>
-
-// Prototypes
-extern "C" __global__ void helloWorld(char*);
-
-// Host function
-int
-main(int argc, char** argv)
-{
- int i;
-
- // desired output
- char str[] = "Hello World!";
-
- // mangle contents of output
- // the null character is left intact for simplicity
- for(i = 0; i < 12; i++)
- str[i] -= i;
-
- // allocate memory on the device
- char *d_str;
- size_t size = sizeof(str);
- cudaMalloc((void**)&d_str, size);
-
- // copy the string to the device
- cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
-
- // set the grid and block sizes
- dim3 dimGrid(2); // one block per word
- dim3 dimBlock(6); // one thread per character
-
- // invoke the kernel
- helloWorld<<< dimGrid, dimBlock >>>(d_str);
-
- // retrieve the results from the device
- cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
-
- // free up the allocated memory on the device
- cudaFree(d_str);
-
- // everyone's favorite part
- printf("%s\n", str);
-
- return 0;
-}
-
-// Device kernel
-__global__ void
-helloWorld(char* str)
-{
- // determine where in the thread grid we are
- int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
- // unmangle output
- str[idx] += idx;
-}
+++ /dev/null
- .version 1.4
- .target sm_10, map_f64_to_f32
- // compiled with /usr/local/cuda/bin/../open64/lib//be
- // nvopencc 3.0 built on 2010-03-11
-
- //-----------------------------------------------------------
- // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Options:
- //-----------------------------------------------------------
- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
- // -O3 (Optimization level)
- // -g0 (Debug level)
- // -m2 (Report advisories)
- //-----------------------------------------------------------
-
- .file 1 "<command-line>"
- .file 2 "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
- .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
- .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
- .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
- .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
- .file 7 "/usr/local/cuda/bin/../include/device_types.h"
- .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
- .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
- .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
- .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
- .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
- .file 13 "/usr/include/i386/_types.h"
- .file 14 "/usr/include/time.h"
- .file 15 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
- .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
- .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
- .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
- .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
- .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
- .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
- .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
- .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
- .file 24 "/usr/local/cuda/bin/../include/common_types.h"
- .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
- .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
- .file 27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
- .file 28 "hello.cu"
-
-
- .entry helloWorld (
- .param .u32 __cudaparm_helloWorld_str)
- {
- .reg .u16 %rh<4>;
- .reg .u32 %r<9>;
- .loc 28 58 0
-$LBB1_helloWorld:
- .loc 28 64 0
- mov.u16 %rh1, %ctaid.x;
- mov.u16 %rh2, %ntid.x;
- mul.wide.u16 %r1, %rh1, %rh2;
- cvt.u32.u16 %r2, %tid.x;
- add.u32 %r3, %r2, %r1;
- ld.param.u32 %r4, [__cudaparm_helloWorld_str];
- add.u32 %r5, %r4, %r3;
- ld.global.s8 %r6, [%r5+0];
- add.s32 %r7, %r6, %r3;
- st.global.s8 [%r5+0], %r7;
- .loc 28 65 0
- exit;
-$LDWend_helloWorld:
- } // helloWorld
-
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors alien alien.data assocs byte-arrays cuda.ffi
+cuda.utils destructors io.encodings.string io.encodings.utf8
+kernel locals namespaces sequences ;
+QUALIFIED-WITH: alien.c-types a
+IN: cuda.memory
+
+SYMBOL: cuda-memory-hashtable
+
+TUPLE: cuda-memory < disposable ptr length ;
+
+: <cuda-memory> ( ptr length -- obj )
+ cuda-memory new-disposable
+ swap >>length
+ swap >>ptr ;
+
+: add-cuda-memory ( obj -- obj )
+ dup dup ptr>> cuda-memory-hashtable get set-at ;
+
+: delete-cuda-memory ( obj -- )
+ cuda-memory-hashtable delete-at ;
+
+ERROR: invalid-cuda-memory ptr ;
+
+: cuda-memory-length ( cuda-memory -- n )
+ ptr>> cuda-memory-hashtable get ?at [
+ length>>
+ ] [
+ invalid-cuda-memory
+ ] if ;
+
+M: cuda-memory byte-length length>> ;
+
+: cuda-malloc ( n -- ptr )
+ [ CUdeviceptr <c-object> ] dip
+ [ cuMemAlloc cuda-error ] 2keep
+ [ a:*int ] dip <cuda-memory> add-cuda-memory ;
+
+: cuda-free* ( ptr -- )
+ cuMemFree cuda-error ;
+
+M: cuda-memory dispose ( ptr -- )
+ ptr>> cuda-free* ;
+
+: memcpy-device>device ( dest-ptr src-ptr count -- )
+ cuMemcpyDtoD cuda-error ;
+
+: memcpy-device>array ( dest-array dest-index src-ptr count -- )
+ cuMemcpyDtoA cuda-error ;
+
+: memcpy-array>device ( dest-ptr src-array src-index count -- )
+ cuMemcpyAtoD cuda-error ;
+
+: memcpy-array>host ( dest-ptr src-array src-index count -- )
+ cuMemcpyAtoH cuda-error ;
+
+: memcpy-host>array ( dest-array dest-index src-ptr count -- )
+ cuMemcpyHtoA cuda-error ;
+
+: memcpy-array>array ( dest-array dest-index src-array src-ptr count -- )
+ cuMemcpyAtoA cuda-error ;
+
+: host>device ( dest-ptr src-ptr -- )
+ [ ptr>> ] dip dup length cuMemcpyHtoD cuda-error ;
+
+:: device>host ( ptr -- seq )
+ ptr byte-length <byte-array>
+ [ ptr [ ptr>> ] [ byte-length ] bi cuMemcpyDtoH cuda-error ] keep ;
+
+: malloc-device-string ( string -- n )
+ utf8 encode
+ [ length cuda-malloc ] keep
+ [ host>device ] [ drop ] 2bi ;
+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <cuda_runtime.h>
-
-static const int LOG_BANK_COUNT = 4;
-
-static inline __device__ __host__ unsigned shared_offset(unsigned i)
-{
- return i + (i >> LOG_BANK_COUNT);
-}
-
-static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i)
-{
- return shared_offset(offset * (2*i + 1) - 1);
-}
-
-static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i)
-{
- return shared_offset(offset * (2*i + 2) - 1);
-}
-
-static inline __device__ __host__ unsigned lpot(unsigned x)
-{
- --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x;
-}
-
-template<typename T>
-__global__ void prefix_sum_block(T *in, T *out, unsigned n)
-{
- extern __shared__ T temp[];
-
- int idx = threadIdx.x;
- int blocksize = blockDim.x;
-
- temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0;
- temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0;
-
- int offset, d;
- for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) {
- __syncthreads();
- if (idx < d) {
- unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
- temp[b] += temp[a];
- }
- }
-
- if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0;
-
- for (d = 1; d <= blocksize; d <<= 1) {
- offset >>= 1;
- __syncthreads();
-
- if (idx < d) {
- unsigned a = offset_a(offset, idx), b = offset_b(offset, idx);
- unsigned t = temp[a];
- temp[a] = temp[b];
- temp[b] += t;
- }
- }
- __syncthreads();
-
- if (idx < n) out[idx ] = temp[shared_offset(idx )];
- if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)];
-}
-
-template<typename T>
-void prefix_sum(T *in, T *out, unsigned n)
-{
- char *device_values;
- unsigned n_lpot = lpot(n);
- size_t n_pitch;
-
- cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2);
- if (error != 0) {
- printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2);
- exit(1);
- }
-
- cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice);
-
- prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>>
- ((T*)device_values, (T*)(device_values + n_pitch), n);
-
- cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost);
- cudaFree(device_values);
-}
-
-int main()
-{
- sranddev();
-
- static unsigned in_values[1024], out_values[1024];
-
- for (int i = 0; i < 1024; ++i)
- in_values[i] = rand() >> 21;
-
- prefix_sum(in_values, out_values, 1024);
-
- for (int i = 0; i < 1024; ++i)
- printf("%5d => %5d\n", in_values[i], out_values[i]);
-
- return 0;
-}
+++ /dev/null
- .version 1.4
- .target sm_10, map_f64_to_f32
- // compiled with /usr/local/cuda/bin/../open64/lib//be
- // nvopencc 3.0 built on 2010-03-11
-
- //-----------------------------------------------------------
- // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM)
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Options:
- //-----------------------------------------------------------
- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
- // -O3 (Optimization level)
- // -g0 (Debug level)
- // -m2 (Report advisories)
- //-----------------------------------------------------------
-
- .file 1 "<command-line>"
- .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu"
- .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
- .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
- .file 5 "/usr/local/cuda/bin/../include/host_defines.h"
- .file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
- .file 7 "/usr/local/cuda/bin/../include/device_types.h"
- .file 8 "/usr/local/cuda/bin/../include/driver_types.h"
- .file 9 "/usr/local/cuda/bin/../include/texture_types.h"
- .file 10 "/usr/local/cuda/bin/../include/vector_types.h"
- .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
- .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
- .file 13 "/usr/include/i386/_types.h"
- .file 14 "/usr/include/time.h"
- .file 15 "prefix-sum.cu"
- .file 16 "/usr/local/cuda/bin/../include/common_functions.h"
- .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
- .file 18 "/usr/local/cuda/bin/../include/math_functions.h"
- .file 19 "/usr/local/cuda/bin/../include/device_functions.h"
- .file 20 "/usr/local/cuda/bin/../include/math_constants.h"
- .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
- .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
- .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
- .file 24 "/usr/local/cuda/bin/../include/common_types.h"
- .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
- .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
- .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
- .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
-
- .extern .shared .align 4 .b8 temp[];
-
- .entry _Z16prefix_sum_blockIjEvPT_S1_j (
- .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in,
- .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out,
- .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n)
- {
- .reg .u32 %r<81>;
- .reg .pred %p<11>;
- .loc 15 28 0
-$LBB1__Z16prefix_sum_blockIjEvPT_S1_j:
- ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
- cvt.s32.u16 %r2, %tid.x;
- setp.lt.u32 %p1, %r2, %r1;
- @!%p1 bra $Lt_0_7938;
- .loc 15 35 0
- ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
- mul24.lo.u32 %r4, %r2, 4;
- add.u32 %r5, %r3, %r4;
- ld.global.u32 %r6, [%r5+0];
- bra.uni $Lt_0_7682;
-$Lt_0_7938:
- mov.u32 %r6, 0;
-$Lt_0_7682:
- mov.u32 %r7, temp;
- shr.u32 %r8, %r2, 4;
- add.u32 %r9, %r2, %r8;
- mul.lo.u32 %r10, %r9, 4;
- add.u32 %r11, %r10, %r7;
- st.shared.u32 [%r11+0], %r6;
- cvt.s32.u16 %r12, %ntid.x;
- add.s32 %r13, %r12, %r2;
- .loc 15 28 0
- ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n];
- .loc 15 35 0
- setp.lt.u32 %p2, %r13, %r1;
- @!%p2 bra $Lt_0_8450;
- .loc 15 36 0
- ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in];
- mul.lo.u32 %r15, %r13, 4;
- add.u32 %r16, %r14, %r15;
- ld.global.u32 %r17, [%r16+0];
- bra.uni $Lt_0_8194;
-$Lt_0_8450:
- mov.u32 %r17, 0;
-$Lt_0_8194:
- shr.u32 %r18, %r13, 4;
- add.u32 %r19, %r13, %r18;
- mul.lo.u32 %r20, %r19, 4;
- add.u32 %r21, %r20, %r7;
- st.shared.u32 [%r21+0], %r17;
- .loc 15 39 0
- mov.s32 %r22, %r12;
- mov.u32 %r23, 0;
- setp.le.s32 %p3, %r12, %r23;
- mov.s32 %r24, 1;
- @%p3 bra $Lt_0_13314;
-$Lt_0_9218:
- //<loop> Loop body line 39, nesting depth: 1, estimated iterations: unknown
- .loc 15 40 0
- bar.sync 0;
- setp.le.s32 %p4, %r22, %r2;
- @%p4 bra $Lt_0_9474;
- //<loop> Part of loop body line 39, head labeled $Lt_0_9218
- .loc 15 43 0
- mul24.lo.u32 %r25, %r2, 2;
- add.u32 %r26, %r25, 1;
- add.u32 %r27, %r25, 2;
- mul.lo.u32 %r28, %r24, %r26;
- mul.lo.u32 %r29, %r24, %r27;
- sub.u32 %r30, %r29, 1;
- shr.u32 %r31, %r30, 4;
- add.u32 %r32, %r29, %r31;
- mul.lo.u32 %r33, %r32, 4;
- add.u32 %r34, %r33, %r7;
- ld.shared.u32 %r35, [%r34+-4];
- sub.u32 %r36, %r28, 1;
- shr.u32 %r37, %r36, 4;
- add.u32 %r38, %r28, %r37;
- mul.lo.u32 %r39, %r38, 4;
- add.u32 %r40, %r7, %r39;
- ld.shared.u32 %r41, [%r40+-4];
- add.u32 %r42, %r35, %r41;
- st.shared.u32 [%r34+-4], %r42;
-$Lt_0_9474:
- //<loop> Part of loop body line 39, head labeled $Lt_0_9218
- .loc 15 39 0
- shr.s32 %r22, %r22, 1;
- shl.b32 %r24, %r24, 1;
- mov.u32 %r43, 0;
- setp.gt.s32 %p5, %r22, %r43;
- @%p5 bra $Lt_0_9218;
- bra.uni $Lt_0_8706;
-$Lt_0_13314:
-$Lt_0_8706:
- mov.u32 %r44, 0;
- setp.ne.s32 %p6, %r2, %r44;
- @%p6 bra $Lt_0_10242;
- .loc 15 47 0
- mul24.lo.s32 %r45, %r12, 2;
- mov.u32 %r46, 0;
- sub.u32 %r47, %r45, 1;
- shr.u32 %r48, %r47, 4;
- add.u32 %r49, %r45, %r48;
- mul.lo.u32 %r50, %r49, 4;
- add.u32 %r51, %r7, %r50;
- st.shared.u32 [%r51+-4], %r46;
-$Lt_0_10242:
- mov.u32 %r52, 1;
- setp.lt.s32 %p7, %r12, %r52;
- @%p7 bra $Lt_0_10754;
- mov.s32 %r22, 1;
-$Lt_0_11266:
- //<loop> Loop body line 47, nesting depth: 1, estimated iterations: unknown
- .loc 15 50 0
- shr.s32 %r24, %r24, 1;
- .loc 15 51 0
- bar.sync 0;
- setp.le.s32 %p8, %r22, %r2;
- @%p8 bra $Lt_0_11522;
- //<loop> Part of loop body line 47, head labeled $Lt_0_11266
- .loc 15 55 0
- mul24.lo.u32 %r53, %r2, 2;
- add.u32 %r54, %r53, 1;
- mul.lo.u32 %r55, %r24, %r54;
- sub.u32 %r56, %r55, 1;
- shr.u32 %r57, %r56, 4;
- add.u32 %r58, %r55, %r57;
- mul.lo.u32 %r59, %r58, 4;
- add.u32 %r60, %r59, %r7;
- ld.shared.u32 %r61, [%r60+-4];
- .loc 15 56 0
- add.u32 %r62, %r53, 2;
- mul.lo.u32 %r63, %r24, %r62;
- sub.u32 %r64, %r63, 1;
- shr.u32 %r65, %r64, 4;
- add.u32 %r66, %r63, %r65;
- mul.lo.u32 %r67, %r66, 4;
- add.u32 %r68, %r67, %r7;
- ld.shared.u32 %r69, [%r68+-4];
- st.shared.u32 [%r60+-4], %r69;
- .loc 15 57 0
- ld.shared.u32 %r70, [%r68+-4];
- add.u32 %r71, %r70, %r61;
- st.shared.u32 [%r68+-4], %r71;
-$Lt_0_11522:
- //<loop> Part of loop body line 47, head labeled $Lt_0_11266
- .loc 15 49 0
- shl.b32 %r22, %r22, 1;
- setp.le.s32 %p9, %r22, %r12;
- @%p9 bra $Lt_0_11266;
-$Lt_0_10754:
- .loc 15 60 0
- bar.sync 0;
- @!%p1 bra $Lt_0_12290;
- .loc 15 62 0
- ld.shared.u32 %r72, [%r11+0];
- ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
- mul24.lo.u32 %r74, %r2, 4;
- add.u32 %r75, %r73, %r74;
- st.global.u32 [%r75+0], %r72;
-$Lt_0_12290:
- @!%p2 bra $Lt_0_12802;
- .loc 15 63 0
- ld.shared.u32 %r76, [%r21+0];
- ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out];
- mul.lo.u32 %r78, %r13, 4;
- add.u32 %r79, %r77, %r78;
- st.global.u32 [%r79+0], %r76;
-$Lt_0_12802:
- .loc 15 64 0
- exit;
-$LDWend__Z16prefix_sum_blockIjEvPT_S1_j:
- } // _Z16prefix_sum_blockIjEvPT_S1_j
-
--- /dev/null
+! (c)2010 Joe Groff bsd license
+USING: accessors arrays combinators io kernel math math.parser
+roles sequences strings variants words ;
+FROM: roles => TUPLE: ;
+IN: cuda.ptx
+
+UNION: dim integer sequence ;
+UNION: ?integer POSTPONE: f integer ;
+UNION: ?string POSTPONE: f string ;
+
+VARIANT: ptx-type
+ .s8 .s16 .s32 .s64
+ .u8 .u16 .u32 .u64
+ .f16 .f32 .f64
+ .b8 .b16 .b32 .b64
+ .pred
+ .texref .samplerref .surfref
+ .v2: { { of ptx-type } }
+ .v4: { { of ptx-type } }
+ .struct: { { name string } } ;
+
+VARIANT: ptx-arch
+ sm_10 sm_11 sm_12 sm_13 sm_20 ;
+UNION: ?ptx-arch POSTPONE: f ptx-arch ;
+
+VARIANT: ptx-texmode
+ .texmode_unified .texmode_independent ;
+UNION: ?ptx-texmode POSTPONE: f ptx-texmode ;
+
+VARIANT: ptx-storage-space
+ .reg
+ .sreg
+ .const: { { bank ?integer } }
+ .global
+ .local
+ .param
+ .shared
+ .tex ;
+UNION: ?ptx-storage-space POSTPONE: f ptx-storage-space ;
+
+TUPLE: ptx-target
+ { arch ?ptx-arch }
+ { map_f64_to_f32? boolean }
+ { texmode ?ptx-texmode } ;
+
+TUPLE: ptx
+ { version string }
+ { target ptx-target }
+ body ;
+
+TUPLE: ptx-struct-definition
+ { name string }
+ members ;
+
+TUPLE: ptx-variable
+ { extern? boolean }
+ { visible? boolean }
+ { align ?integer }
+ { storage-space ptx-storage-space }
+ { type ptx-type }
+ { name string }
+ { parameter ?integer }
+ { dim dim }
+ { initializer ?string } ;
+
+TUPLE: ptx-predicate
+ { negated? boolean }
+ { variable string } ;
+UNION: ?ptx-predicate POSTPONE: f ptx-predicate ;
+
+TUPLE: ptx-instruction
+ { label ?string }
+ { predicate ?ptx-predicate } ;
+
+TUPLE: ptx-entry
+ { name string }
+ params
+ directives
+ body ;
+
+TUPLE: ptx-func < ptx-entry
+ { return ptx-variable } ;
+
+TUPLE: ptx-directive ;
+
+TUPLE: .file < ptx-directive
+ { info string } ;
+TUPLE: .loc < ptx-directive
+ { info string } ;
+TUPLE: .maxnctapersm < ptx-directive
+ { ncta integer } ;
+TUPLE: .minnctapersm < ptx-directive
+ { ncta integer } ;
+TUPLE: .maxnreg < ptx-directive
+ { n integer } ;
+TUPLE: .maxntid < ptx-directive
+ { dim dim } ;
+TUPLE: .pragma < ptx-directive
+ { pragma string } ;
+
+VARIANT: ptx-float-rounding-mode
+ .rn .rz .rm .rp .approx .full ;
+VARIANT: ptx-int-rounding-mode
+ .rni .rzi .rmi .rpi ;
+UNION: ?ptx-float-rounding-mode POSTPONE: f ptx-float-rounding-mode ;
+UNION: ?ptx-int-rounding-mode POSTPONE: f ptx-int-rounding-mode ;
+
+UNION: ptx-rounding-mode
+ ptx-float-rounding-mode ptx-int-rounding-mode ;
+UNION: ?ptx-rounding-mode POSTPONE: f ptx-rounding-mode ;
+
+TUPLE: ptx-typed-instruction < ptx-instruction
+ { type ptx-type }
+ { dest string } ;
+
+TUPLE: ptx-2op-instruction < ptx-typed-instruction
+ { a string } ;
+
+TUPLE: ptx-3op-instruction < ptx-typed-instruction
+ { a string }
+ { b string } ;
+
+TUPLE: ptx-4op-instruction < ptx-typed-instruction
+ { a string }
+ { b string }
+ { c string } ;
+
+TUPLE: ptx-5op-instruction < ptx-typed-instruction
+ { a string }
+ { b string }
+ { c string }
+ { d string } ;
+
+TUPLE: ptx-addsub-instruction < ptx-3op-instruction
+ { sat? boolean }
+ { cc? boolean } ;
+
+VARIANT: ptx-mul-mode
+ .wide ;
+UNION: ?ptx-mul-mode POSTPONE: f ptx-mul-mode ;
+
+TUPLE: ptx-mul-instruction < ptx-3op-instruction
+ { mode ?ptx-mul-mode } ;
+
+TUPLE: ptx-mad-instruction < ptx-4op-instruction
+ { mode ?ptx-mul-mode }
+ { sat? boolean } ;
+
+VARIANT: ptx-prmt-mode
+ .f4e .b4e .rc8 .ecl .ecr .rc16 ;
+UNION: ?ptx-prmt-mode POSTPONE: f ptx-prmt-mode ;
+
+ROLE: ptx-float-ftz
+ { ftz? boolean } ;
+ROLE: ptx-float-env < ptx-float-ftz
+ { round ?ptx-float-rounding-mode } ;
+
+VARIANT: ptx-testp-op
+ .finite .infinite .number .notanumber .normal .subnormal ;
+
+VARIANT: ptx-cmp-op
+ .eq .ne
+ .lt .le .gt .ge
+ .ls .hs
+ .equ .neu
+ .ltu .leu .gtu .geu
+ .num .nan ;
+
+VARIANT: ptx-op
+ .and .or .xor .cas .exch .add .inc .dec .min .max
+ .popc ;
+UNION: ?ptx-op POSTPONE: f ptx-op ;
+
+SINGLETONS: .lo .hi ;
+INSTANCE: .lo ptx-mul-mode
+INSTANCE: .lo ptx-cmp-op
+INSTANCE: .hi ptx-mul-mode
+INSTANCE: .hi ptx-cmp-op
+
+TUPLE: ptx-set-instruction < ptx-3op-instruction
+ { cmp-op ptx-cmp-op }
+ { bool-op ?ptx-op }
+ { c ?string }
+ { ftz? boolean } ;
+
+VARIANT: ptx-cache-op
+ .ca .cg .cs .lu .cv
+ .wb .wt ;
+UNION: ?ptx-cache-op POSTPONE: f ptx-cache-op ;
+
+TUPLE: ptx-ldst-instruction < ptx-2op-instruction
+ { volatile? boolean }
+ { storage-space ?ptx-storage-space }
+ { cache-op ?ptx-cache-op } ;
+
+VARIANT: ptx-cache-level
+ .L1 .L2 ;
+
+TUPLE: ptx-branch-instruction < ptx-instruction
+ { target string }
+ { uni? boolean } ;
+
+VARIANT: ptx-membar-level
+ .cta .gl .sys ;
+
+VARIANT: ptx-vote-mode
+ .all .any .uni .ballot ;
+
+TUPLE: ptx-instruction-not-supported-yet < ptx-instruction ;
+
+TUPLE: abs <{ ptx-2op-instruction ptx-float-ftz } ;
+TUPLE: add <{ ptx-addsub-instruction ptx-float-env } ;
+TUPLE: addc < ptx-addsub-instruction ;
+TUPLE: and < ptx-3op-instruction ;
+TUPLE: atom < ptx-3op-instruction
+ { storage-space ?ptx-storage-space }
+ { op ptx-op }
+ { c ?string } ;
+TUPLE: bar.arrive < ptx-instruction
+ { a string }
+ { b string } ;
+TUPLE: bar.red < ptx-2op-instruction
+ { op ptx-op }
+ { b ?string }
+ { c string } ;
+TUPLE: bar.sync < ptx-instruction
+ { a string }
+ { b ?string } ;
+TUPLE: bfe < ptx-4op-instruction ;
+TUPLE: bfi < ptx-5op-instruction ;
+TUPLE: bfind < ptx-2op-instruction
+ { shiftamt? boolean } ;
+TUPLE: bra < ptx-branch-instruction ;
+TUPLE: brev < ptx-2op-instruction ;
+TUPLE: brkpt < ptx-instruction ;
+TUPLE: call < ptx-branch-instruction
+ { return ?string }
+ params ;
+TUPLE: clz < ptx-2op-instruction ;
+TUPLE: cnot < ptx-2op-instruction ;
+TUPLE: copysign < ptx-3op-instruction ;
+TUPLE: cos <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: cvt < ptx-2op-instruction
+ { rounding-mode ?ptx-rounding-mode }
+ { ftz? boolean }
+ { sat? boolean }
+ { dest-type ptx-type } ;
+TUPLE: cvta < ptx-2op-instruction
+ { to? boolean }
+ { storage-space ?ptx-storage-space } ;
+TUPLE: div <{ ptx-3op-instruction ptx-float-env } ;
+TUPLE: ex2 <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: exit < ptx-instruction ;
+TUPLE: fma <{ ptx-mad-instruction ptx-float-env } ;
+TUPLE: isspacep < ptx-instruction
+ { storage-space ?ptx-storage-space }
+ { dest string }
+ { a string } ;
+TUPLE: ld < ptx-ldst-instruction ;
+TUPLE: ldu < ptx-ldst-instruction ;
+TUPLE: lg2 <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: mad <{ ptx-mad-instruction ptx-float-env } ;
+TUPLE: mad24 < ptx-mad-instruction ;
+TUPLE: max <{ ptx-3op-instruction ptx-float-ftz } ;
+TUPLE: membar < ptx-instruction
+ { level ptx-membar-level } ;
+TUPLE: min <{ ptx-3op-instruction ptx-float-ftz } ;
+TUPLE: mov < ptx-2op-instruction ;
+TUPLE: mul <{ ptx-mul-instruction ptx-float-env } ;
+TUPLE: mul24 < ptx-mul-instruction ;
+TUPLE: neg <{ ptx-2op-instruction ptx-float-ftz } ;
+TUPLE: not < ptx-2op-instruction ;
+TUPLE: or < ptx-3op-instruction ;
+TUPLE: pmevent < ptx-instruction
+ { a string } ;
+TUPLE: popc < ptx-2op-instruction ;
+TUPLE: prefetch < ptx-instruction
+ { a string }
+ { storage-space ?ptx-storage-space }
+ { level ptx-cache-level } ;
+TUPLE: prefetchu < ptx-instruction
+ { a string }
+ { level ptx-cache-level } ;
+TUPLE: prmt < ptx-4op-instruction
+ { mode ?ptx-prmt-mode } ;
+TUPLE: rcp <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: red < ptx-2op-instruction
+ { storage-space ?ptx-storage-space }
+ { op ptx-op } ;
+TUPLE: rem < ptx-3op-instruction ;
+TUPLE: ret < ptx-instruction ;
+TUPLE: rsqrt <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: sad < ptx-4op-instruction ;
+TUPLE: selp < ptx-4op-instruction ;
+TUPLE: set < ptx-set-instruction
+ { dest-type ptx-type } ;
+TUPLE: setp < ptx-set-instruction
+ { |dest ?string } ;
+TUPLE: shl < ptx-3op-instruction ;
+TUPLE: shr < ptx-3op-instruction ;
+TUPLE: sin <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: slct < ptx-4op-instruction
+ { dest-type ptx-type }
+ { ftz? boolean } ;
+TUPLE: sqrt <{ ptx-2op-instruction ptx-float-env } ;
+TUPLE: st < ptx-ldst-instruction ;
+TUPLE: sub <{ ptx-addsub-instruction ptx-float-env } ;
+TUPLE: subc < ptx-addsub-instruction ;
+TUPLE: suld < ptx-instruction-not-supported-yet ;
+TUPLE: sured < ptx-instruction-not-supported-yet ;
+TUPLE: sust < ptx-instruction-not-supported-yet ;
+TUPLE: suq < ptx-instruction-not-supported-yet ;
+TUPLE: testp < ptx-2op-instruction
+ { op ptx-testp-op } ;
+TUPLE: tex < ptx-instruction-not-supported-yet ;
+TUPLE: txq < ptx-instruction-not-supported-yet ;
+TUPLE: trap < ptx-instruction ;
+TUPLE: vabsdiff < ptx-instruction-not-supported-yet ;
+TUPLE: vadd < ptx-instruction-not-supported-yet ;
+TUPLE: vmad < ptx-instruction-not-supported-yet ;
+TUPLE: vmax < ptx-instruction-not-supported-yet ;
+TUPLE: vmin < ptx-instruction-not-supported-yet ;
+TUPLE: vset < ptx-instruction-not-supported-yet ;
+TUPLE: vshl < ptx-instruction-not-supported-yet ;
+TUPLE: vshr < ptx-instruction-not-supported-yet ;
+TUPLE: vsub < ptx-instruction-not-supported-yet ;
+TUPLE: vote < ptx-2op-instruction
+ { mode ptx-vote-mode } ;
+TUPLE: xor < ptx-3op-instruction ;
+
+GENERIC: ptx-element-label ( elt -- label )
+M: object ptx-element-label drop f ;
+
+GENERIC: (write-ptx-element) ( elt -- )
+
+: write-ptx-element ( elt -- )
+ dup ptx-element-label [ write ":" write ] when*
+ "\t" write (write-ptx-element)
+ ";" print ;
+
+: write-ptx ( ptx -- )
+ "\t.version " write dup version>> write ";" print
+ dup target>> write-ptx-element
+ body>> [ write-ptx-element ] each ;
+
+: write-ptx-symbol ( symbol/f -- )
+ [ name>> write ] when* ;
+
+M: f (write-ptx-element)
+ drop ;
+
+M: word (write-ptx-element)
+ name>> write ;
+
+M: .const (write-ptx-element)
+ ".const" write
+ bank>> [ "[" write number>string write "]" write ] when* ;
+M: .v2 (write-ptx-element)
+ ".v2" write of>> (write-ptx-element) ;
+M: .v4 (write-ptx-element)
+ ".v4" write of>> (write-ptx-element) ;
+M: .struct (write-ptx-element)
+ ".struct " write name>> write ;
+
+M: ptx-target (write-ptx-element)
+ ".target " write
+ [ arch>> [ name>> ] [ f ] if* ]
+ [ map_f64_to_f32?>> [ "map_f64_to_f32" ] [ f ] if ]
+ [ texmode>> [ name>> ] [ f ] if* ] tri
+ 3array sift ", " join write ;
+
+: write-ptx-dim ( dim -- )
+ {
+ { [ dup zero? ] [ drop "[]" write ] }
+ { [ dup sequence? ] [ [ "[" write number>string write "]" write ] each ] }
+ [ "[" write number>string write "]" write ]
+ } cond ;
+
+M: ptx-variable (write-ptx-element)
+ dup extern?>> [ ".extern " write ] when
+ dup visible?>> [ ".visible " write ] when
+ dup align>> [ ".align " write number>string write " " write ] when*
+ dup storage-space>> (write-ptx-element) " " write
+ dup type>> (write-ptx-element) " " write
+ dup name>> write
+ dup parameter>> [ "<" write number>string write ">" write ] when*
+ dup dim>> [ write-ptx-dim ] when*
+ dup initializer>> [ " = " write write ] when*
+ drop ;
+
+: write-params ( params -- )
+ "(" write unclip (write-ptx-element)
+ [ ", " write (write-ptx-element) ] each
+ ")" write ;
+
+: write-body ( params -- )
+ "\t{" print
+ [ write-ptx-element ] each
+ "\t}" write ;
+
+: write-entry ( entry -- )
+ dup name>> write " " write
+ dup params>> [ write-params ] when* nl
+ dup directives>> [ (write-ptx-element) ] each nl
+ dup body>> write-body
+ drop ;
+
+M: ptx-entry (write-ptx-element)
+ ".entry " write
+ write-entry ;
+
+M: ptx-func (write-ptx-element)
+ ".func " write
+ dup return>> [ "(" write (write-ptx-element) ") " write ] when*
+ write-entry ;
+
+M: .file (write-ptx-element)
+ ".file " write info>> write ;
+M: .loc (write-ptx-element)
+ ".loc " write info>> write ;
+M: .maxnctapersm (write-ptx-element)
+ ".maxnctapersm " write ncta>> number>string write ;
+M: .minnctapersm (write-ptx-element)
+ ".minnctapersm " write ncta>> number>string write ;
+M: .maxnreg (write-ptx-element)
+ ".maxnreg " write n>> number>string write ;
+M: .maxntid (write-ptx-element)
+ ".maxntid " write
+ dup sequence? [ [ number>string ] map ", " join write ] [ number>string write ] if ;
+M: .pragma (write-ptx-element)
+ ".pragma \"" write pragma>> write "\"" write ;
+
+M: ptx-instruction ptx-element-label
+ label>> ;
+
+: write-insn ( insn name -- insn )
+ over predicate>>
+ [ "@" write dup negated?>> [ "!" write ] when variable>> write " " write ] when*
+ write ;
+
+: write-2op ( insn -- )
+ dup type>> (write-ptx-element) " " write
+ dup dest>> write ", " write
+ dup a>> write
+ drop ;
+
+: write-3op ( insn -- )
+ dup write-2op ", " write
+ dup b>> write
+ drop ;
+
+: write-4op ( insn -- )
+ dup write-3op ", " write
+ dup c>> write
+ drop ;
+
+: write-5op ( insn -- )
+ dup write-4op ", " write
+ dup d>> write
+ drop ;
+
+: write-ftz ( insn -- )
+ ftz?>> [ ".ftz" write ] when ;
+
+: write-sat ( insn -- )
+ sat?>> [ ".sat" write ] when ;
+
+: write-float-env ( insn -- )
+ dup round>> (write-ptx-element)
+ write-ftz ;
+
+: write-int-addsub ( insn -- )
+ dup write-sat
+ dup cc?>> [ ".cc" write ] when
+ write-3op ;
+
+: write-addsub ( insn -- )
+ dup write-float-env
+ write-int-addsub ;
+
+: write-ldst ( insn -- )
+ dup volatile?>> [ ".volatile" write ] when
+ dup storage-space>> (write-ptx-element)
+ dup cache-op>> (write-ptx-element)
+ write-2op ;
+
+: (write-mul) ( insn -- )
+ dup mode>> (write-ptx-element)
+ drop ;
+
+: write-mul ( insn -- )
+ dup write-float-env
+ dup (write-mul)
+ write-3op ;
+
+: write-mad ( insn -- )
+ dup write-float-env
+ dup (write-mul)
+ dup write-sat
+ write-4op ;
+
+: write-uni ( insn -- )
+ uni?>> [ ".uni" write ] when ;
+
+: write-set ( insn -- )
+ dup cmp-op>> (write-ptx-element)
+ dup bool-op>> (write-ptx-element)
+ write-ftz ;
+
+M: abs (write-ptx-element)
+ "abs" write-insn
+ dup write-ftz
+ write-2op ;
+M: add (write-ptx-element)
+ "add" write-insn
+ write-addsub ;
+M: addc (write-ptx-element)
+ "addc" write-insn
+ write-int-addsub ;
+M: and (write-ptx-element)
+ "and" write-insn
+ write-3op ;
+M: atom (write-ptx-element)
+ "atom" write-insn
+ dup storage-space>> (write-ptx-element)
+ dup op>> (write-ptx-element)
+ dup write-3op
+ c>> [ ", " write write ] when* ;
+M: bar.arrive (write-ptx-element)
+ "bar.arrive " write-insn
+ dup a>> write ", " write
+ dup b>> write
+ drop ;
+M: bar.red (write-ptx-element)
+ "bar.red" write-insn
+ dup op>> (write-ptx-element)
+ dup write-2op
+ dup b>> [ ", " write write ] when*
+ ", " write c>> write ;
+M: bar.sync (write-ptx-element)
+ "bar.arrive " write-insn
+ dup a>> write
+ dup b>> [ ", " write write ] when*
+ drop ;
+M: bfe (write-ptx-element)
+ "bfe" write-insn
+ write-4op ;
+M: bfi (write-ptx-element)
+ "bfi" write-insn
+ write-5op ;
+M: bfind (write-ptx-element)
+ "bfind" write-insn
+ dup shiftamt?>> [ ".shiftamt" write ] when
+ write-2op ;
+M: bra (write-ptx-element)
+ "bra" write-insn
+ dup write-uni
+ " " write target>> write ;
+M: brev (write-ptx-element)
+ "brev" write-insn
+ write-2op ;
+M: brkpt (write-ptx-element)
+ "brkpt" write-insn drop ;
+M: call (write-ptx-element)
+ "call" write-insn " " write
+ dup return>> [ "(" write write "), " write ] when*
+ dup target>> write
+ dup params>> [ ", (" write ", " join write ")" write ] unless-empty
+ drop ;
+M: clz (write-ptx-element)
+ "clz" write-insn
+ write-2op ;
+M: cnot (write-ptx-element)
+ "cnot" write-insn
+ write-2op ;
+M: copysign (write-ptx-element)
+ "copysign" write-insn
+ write-3op ;
+M: cos (write-ptx-element)
+ "cos" write-insn
+ dup write-float-env
+ write-2op ;
+M: cvt (write-ptx-element)
+ "cvt" write-insn
+ dup rounding-mode>> (write-ptx-element)
+ dup write-ftz
+ dup write-sat
+ dup dest-type>> (write-ptx-element)
+ write-2op ;
+M: cvta (write-ptx-element)
+ "cvta" write-insn
+ dup to?>> [ ".to" write ] when
+ dup storage-space>> (write-ptx-element)
+ write-2op ;
+M: div (write-ptx-element)
+ "div" write-insn
+ dup write-float-env
+ write-3op ;
+M: ex2 (write-ptx-element)
+ "ex2" write-insn
+ dup write-float-env
+ write-2op ;
+M: exit (write-ptx-element)
+ "exit" write-insn drop ;
+M: fma (write-ptx-element)
+ "fma" write-insn
+ write-mad ;
+M: isspacep (write-ptx-element)
+ "isspacep" write-insn
+ dup storage-space>> (write-ptx-element)
+ " " write
+ dup dest>> write ", " write a>> write ;
+M: ld (write-ptx-element)
+ "ld" write-insn
+ write-ldst ;
+M: ldu (write-ptx-element)
+ "ldu" write-insn
+ write-ldst ;
+M: lg2 (write-ptx-element)
+ "lg2" write-insn
+ dup write-float-env
+ write-2op ;
+M: mad (write-ptx-element)
+ "mad" write-insn
+ write-mad ;
+M: mad24 (write-ptx-element)
+ "mad24" write-insn
+ dup (write-mul)
+ dup write-sat
+ write-4op ;
+M: max (write-ptx-element)
+ "max" write-insn
+ dup write-ftz
+ write-3op ;
+M: membar (write-ptx-element)
+ "membar" write-insn
+ dup level>> (write-ptx-element)
+ drop ;
+M: min (write-ptx-element)
+ "min" write-insn
+ dup write-ftz
+ write-3op ;
+M: mov (write-ptx-element)
+ "mov" write-insn
+ write-2op ;
+M: mul (write-ptx-element)
+ "mul" write-insn
+ write-mul ;
+M: mul24 (write-ptx-element)
+ "mul24" write-insn
+ dup (write-mul)
+ write-3op ;
+M: neg (write-ptx-element)
+ "neg" write-insn
+ dup write-ftz
+ write-2op ;
+M: not (write-ptx-element)
+ "not" write-insn
+ write-2op ;
+M: or (write-ptx-element)
+ "or" write-insn
+ write-3op ;
+M: pmevent (write-ptx-element)
+ "pmevent" write-insn " " write a>> write ;
+M: popc (write-ptx-element)
+ "popc" write-insn
+ write-2op ;
+M: prefetch (write-ptx-element)
+ "prefetch" write-insn
+ dup storage-space>> (write-ptx-element)
+ dup level>> (write-ptx-element)
+ " " write a>> write ;
+M: prefetchu (write-ptx-element)
+ "prefetchu" write-insn
+ dup level>> (write-ptx-element)
+ " " write a>> write ;
+M: prmt (write-ptx-element)
+ "prmt" write-insn
+ dup mode>> (write-ptx-element)
+ write-4op ;
+M: rcp (write-ptx-element)
+ "rcp" write-insn
+ dup write-float-env
+ write-3op ;
+M: red (write-ptx-element)
+ "red" write-insn
+ dup storage-space>> (write-ptx-element)
+ dup op>> (write-ptx-element)
+ write-2op ;
+M: rem (write-ptx-element)
+ "rem" write-insn
+ write-3op ;
+M: ret (write-ptx-element)
+ "ret" write-insn drop ;
+M: rsqrt (write-ptx-element)
+ "rsqrt" write-insn
+ dup write-float-env
+ write-2op ;
+M: sad (write-ptx-element)
+ "sad" write-insn
+ write-4op ;
+M: selp (write-ptx-element)
+ "selp" write-insn
+ write-4op ;
+M: set (write-ptx-element)
+ "set" write-insn
+ dup write-set
+ dup dest-type>> (write-ptx-element)
+ dup write-3op
+ c>> [ ", " write write ] when* ;
+M: setp (write-ptx-element)
+ "setp" write-insn
+ dup write-set
+ dup type>> (write-ptx-element) " " write
+ dup dest>> write
+ dup |dest>> [ "|" write write ] when* ", " write
+ dup a>> write ", " write
+ dup b>> write
+ c>> [ ", " write write ] when* ;
+M: shl (write-ptx-element)
+ "shl" write-insn
+ write-3op ;
+M: shr (write-ptx-element)
+ "shr" write-insn
+ write-3op ;
+M: sin (write-ptx-element)
+ "sin" write-insn
+ dup write-float-env
+ write-2op ;
+M: slct (write-ptx-element)
+ "slct" write-insn
+ dup write-ftz
+ dup dest-type>> (write-ptx-element)
+ write-4op ;
+M: sqrt (write-ptx-element)
+ "sqrt" write-insn
+ dup write-float-env
+ write-2op ;
+M: st (write-ptx-element)
+ "st" write-insn
+ write-ldst ;
+M: sub (write-ptx-element)
+ "sub" write-insn
+ write-addsub ;
+M: subc (write-ptx-element)
+ "subc" write-insn
+ write-int-addsub ;
+M: testp (write-ptx-element)
+ "testp" write-insn
+ dup op>> (write-ptx-element)
+ write-2op ;
+M: vote (write-ptx-element)
+ "vote" write-insn
+ dup mode>> (write-ptx-element)
+ write-2op ;
+M: xor (write-ptx-element)
+ "or" write-insn
+ write-3op ;
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien.parser cuda cuda.utils io.backend kernel lexer
+namespaces parser ;
+IN: cuda.syntax
+
+SYNTAX: CUDA-LIBRARY:
+ scan scan normalize-path
+ [ add-cuda-library ]
+ [ drop current-cuda-library set-global ] 2bi ;
+
+SYNTAX: CUDA-FUNCTION:
+ scan [ create-in current-cuda-library get ] [ ] bi
+ ";" scan-c-args drop define-cuda-word ;
+
+: 3<<< ( dim-block dim-grid shared-size -- function-launcher )
+ f function-launcher boa ;
+
+: 4<<< ( dim-block dim-grid shared-size stream -- function-launcher )
+ function-launcher boa ;
--- /dev/null
+Doug Coleman
--- /dev/null
+! Copyright (C) 2010 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors alien.c-types alien.data alien.strings arrays
+assocs byte-arrays classes.struct combinators cuda.ffi io
+io.backend io.encodings.utf8 kernel math.parser namespaces
+prettyprint sequences ;
+IN: cuda.utils
+
+SYMBOL: cuda-device
+SYMBOL: cuda-context
+SYMBOL: cuda-module
+SYMBOL: cuda-function
+SYMBOL: cuda-launcher
+
+SYMBOL: cuda-modules
+SYMBOL: cuda-functions
+
+ERROR: throw-cuda-error n ;
+
+: cuda-error ( n -- )
+ dup CUDA_SUCCESS = [ drop ] [ throw-cuda-error ] if ;
+
+: init-cuda ( -- )
+ 0 cuInit cuda-error ;
+
+: cuda-version ( -- n )
+ int <c-object> [ cuDriverGetVersion cuda-error ] keep *int ;
+
+: get-function-ptr* ( module string -- function )
+ [ CUfunction <c-object> ] 2dip
+ [ cuModuleGetFunction cuda-error ] 3keep 2drop *void* ;
+
+: get-function-ptr ( string -- function )
+ [ cuda-module get ] dip get-function-ptr* ;
+
+: with-cuda-function ( string quot -- )
+ [
+ get-function-ptr* cuda-function set
+ ] dip call ; inline
+
+: create-context ( flags device -- context )
+ [ CUcontext <c-object> ] 2dip
+ [ cuCtxCreate cuda-error ] 3keep 2drop *void* ;
+
+: destroy-context ( context -- ) cuCtxDestroy cuda-error ;
+
+SYMBOL: cuda-libraries
+cuda-libraries [ H{ } clone ] initialize
+
+SYMBOL: current-cuda-library
+
+TUPLE: cuda-library name path handle ;
+
+: <cuda-library> ( name path -- obj )
+ \ cuda-library new
+ swap >>path
+ swap >>name ;
+
+: add-cuda-library ( name path -- )
+ normalize-path <cuda-library>
+ dup name>> cuda-libraries get-global set-at ;
+
+: ?delete-at ( key assoc -- old/key ? )
+ 2dup delete-at* [ 2nip t ] [ 2drop f ] if ; inline
+
+ERROR: no-cuda-library name ;
+
+: load-module ( path -- module )
+ [ CUmodule <c-object> ] dip
+ [ cuModuleLoad cuda-error ] 2keep drop *void* ;
+
+: unload-module ( module -- )
+ cuModuleUnload cuda-error ;
+
+: load-cuda-library ( library -- handle )
+ path>> load-module ;
+
+: lookup-cuda-library ( name -- cuda-library )
+ cuda-libraries get ?at [ no-cuda-library ] unless ;
+
+: remove-cuda-library ( name -- library )
+ cuda-libraries get ?delete-at [ no-cuda-library ] unless ;
+
+: unload-cuda-library ( name -- )
+ remove-cuda-library handle>> unload-module ;
+
+
+: cached-module ( module-name -- alien )
+ lookup-cuda-library
+ cuda-modules get-global [ load-cuda-library ] cache ;
+
+: cached-function ( module-name function-name -- alien )
+ [ cached-module ] dip
+ 2array cuda-functions get [ first2 get-function-ptr* ] cache ;
+
+: launch-function* ( function -- ) cuLaunch cuda-error ;
+
+: launch-function ( -- ) cuda-function get cuLaunch cuda-error ;
+
+: cuda-int* ( function offset value -- )
+ cuParamSeti cuda-error ;
+
+: cuda-int ( offset value -- )
+ [ cuda-function get ] 2dip cuda-int* ;
+
+: cuda-float* ( function offset value -- )
+ cuParamSetf cuda-error ;
+
+: cuda-float ( offset value -- )
+ [ cuda-function get ] 2dip cuda-float* ;
+
+: cuda-vector* ( function offset ptr n -- )
+ cuParamSetv cuda-error ;
+
+: cuda-vector ( offset ptr n -- )
+ [ cuda-function get ] 3dip cuda-vector* ;
+
+: param-size* ( function n -- )
+ cuParamSetSize cuda-error ;
+
+: param-size ( n -- )
+ [ cuda-function get ] dip param-size* ;
+
+: launch-function-grid* ( function width height -- )
+ cuLaunchGrid cuda-error ;
+
+: launch-function-grid ( width height -- )
+ [ cuda-function get ] 2dip
+ cuLaunchGrid cuda-error ;
+
+: function-block-shape* ( function x y z -- )
+ cuFuncSetBlockShape cuda-error ;
+
+: function-block-shape ( x y z -- )
+ [ cuda-function get ] 3dip
+ cuFuncSetBlockShape cuda-error ;
+
+: function-shared-size* ( function n -- )
+ cuFuncSetSharedSize cuda-error ;
+
+: function-shared-size ( n -- )
+ [ cuda-function get ] dip
+ cuFuncSetSharedSize cuda-error ;
[ current-status "status" set-value ]
[ last-build-status "last-build" set-value ]
[ clean-build-status "last-clean-build" set-value ]
- [ heartbeat-timestamp>> "heartbeat-timestamp" set-value ]
+ [ current-timestamp>> "current-timestamp" set-value ]
[ packages-link "binaries" set-value ]
[ clean-image-link "clean-images" set-value ]
[ report-link "last-report" set-value ]
case RT_LITERAL:
op.store_value(next_literal());
break;
+ case RT_FLOAT:
+ op.store_float(next_literal());
+ break;
case RT_ENTRY_POINT:
op.store_value(parent->compute_entry_point_address(next_literal()));
break;
case RT_LITERAL:
op.store_value(slot_forwarder.visit_pointer(op.load_value(old_offset)));
break;
+ case RT_FLOAT:
+ op.store_float(slot_forwarder.visit_pointer(op.load_float(old_offset)));
+ break;
case RT_ENTRY_POINT:
case RT_ENTRY_POINT_PIC:
case RT_ENTRY_POINT_PIC_TAIL:
case RT_LITERAL:
op.store_value(data_visitor.visit_pointer(op.load_value(old_offset)));
break;
+ case RT_FLOAT:
+ op.store_float(data_visitor.visit_pointer(op.load_float(old_offset)));
+ break;
case RT_ENTRY_POINT:
case RT_ENTRY_POINT_PIC:
case RT_ENTRY_POINT_PIC_TAIL:
return load_value(pointer);
}
+cell instruction_operand::load_float()
+{
+ return (cell)load_value() - boxed_float_offset;
+}
+
+cell instruction_operand::load_float(cell pointer)
+{
+ return (cell)load_value(pointer) - boxed_float_offset;
+}
+
code_block *instruction_operand::load_code_block(cell relative_to)
{
return ((code_block *)load_value(relative_to) - 1);
}
}
+void instruction_operand::store_float(cell value)
+{
+ store_value((fixnum)value + boxed_float_offset);
+}
+
void instruction_operand::store_code_block(code_block *compiled)
{
store_value((cell)compiled->entry_point());
type since its used in a situation where relocation arguments cannot
be passed in, and so RT_DLSYM is inappropriate (Windows only) */
RT_EXCEPTION_HANDLER,
+ /* pointer to a float's payload */
+ RT_FLOAT,
+
};
enum relocation_class {
case RT_CARDS_OFFSET:
case RT_DECKS_OFFSET:
case RT_EXCEPTION_HANDLER:
+ case RT_FLOAT:
return 0;
default:
critical_error("Bad rel type",rel_type());
fixnum load_value_masked(cell mask, cell bits, cell shift);
fixnum load_value(cell relative_to);
fixnum load_value();
+ cell load_float(cell relative_to);
+ cell load_float();
code_block *load_code_block(cell relative_to);
code_block *load_code_block();
void store_value_2_2(fixnum value);
void store_value_masked(fixnum value, cell mask, cell shift);
void store_value(fixnum value);
+ void store_float(cell value);
void store_code_block(code_block *compiled);
};
cell object;
};
+const fixnum boxed_float_offset = 8 - FLOAT_TYPE;
+
/* Assembly code makes assumptions about the layout of this struct */
struct boxed_float : object {
static const cell type_number = FLOAT_TYPE;
void operator()(instruction_operand op)
{
- if(op.rel_type() == RT_LITERAL)
+ switch(op.rel_type())
+ {
+ case RT_LITERAL:
op.store_value(visitor->visit_pointer(op.load_value()));
+ break;
+ case RT_FLOAT:
+ op.store_float(visitor->visit_pointer(op.load_float()));
+ break;
+ default:
+ break;
+ }
}
};