cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
cpu.architecture kernel kernel.private math memory namespaces make
sequences words system layouts combinators math.order fry locals
+compiler.constants byte-arrays io macros quotations cpu.x86.features
+cpu.x86.features.private compiler compiler.units init
compiler.constants vm byte-arrays
compiler.cfg.registers
compiler.cfg.instructions
dst src byte-array-offset [+]
rep copy-register ;
+MACRO: available-reps ( alist -- )
+ ! Each SSE version adds new representations and supports
+ ! all old ones
+ unzip { } [ append ] accumulate rest swap suffix
+ [ [ 1quotation ] map ] bi@ zip
+ reverse [ { } ] suffix
+ '[ _ cond ] ;
+
M: x86 %broadcast-vector ( dst src rep -- )
{
{ float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] }
{ double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] }
} case ;
+M: x86 %broadcast-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep } }
+ } available-reps ;
+
M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
rep {
{
}
} case ;
+M: x86 %gather-vector-4-reps
+ {
+ { sse? { float-4-rep } }
+ } available-reps ;
+
M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
rep {
{
}
} case ;
+M: x86 %gather-vector-2-reps
+ {
+ { sse2? { double-2-rep } }
+ } available-reps ;
+
M: x86 %add-vector ( dst src1 src2 rep -- )
{
{ float-4-rep [ ADDPS ] }
{ uint-4-rep [ PADDD ] }
} case drop ;
+M: x86 %add-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+ } available-reps ;
+
+M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
+ {
+ { char-16-rep [ PADDSB ] }
+ { uchar-16-rep [ PADDUSB ] }
+ { short-8-rep [ PADDSW ] }
+ { ushort-8-rep [ PADDUSW ] }
+ } case drop ;
+
+M: x86 %saturated-add-vector-reps
+ {
+ { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
+ } available-reps ;
+
+M: x86 %add-sub-vector ( dst src1 src2 rep -- )
+ {
+ { float-4-rep [ ADDSUBPS ] }
+ { double-2-rep [ ADDSUBPD ] }
+ } case drop ;
+
+M: x86 %add-sub-vector-reps
+ {
+ { sse3? { float-4-rep double-2-rep } }
+ } available-reps ;
+
M: x86 %sub-vector ( dst src1 src2 rep -- )
{
{ float-4-rep [ SUBPS ] }
{ uint-4-rep [ PSUBD ] }
} case drop ;
+M: x86 %sub-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+ } available-reps ;
+
+M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
+ {
+ { char-16-rep [ PSUBSB ] }
+ { uchar-16-rep [ PSUBUSB ] }
+ { short-8-rep [ PSUBSW ] }
+ { ushort-8-rep [ PSUBUSW ] }
+ } case drop ;
+
+M: x86 %saturated-sub-vector-reps
+ {
+ { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
+ } available-reps ;
+
M: x86 %mul-vector ( dst src1 src2 rep -- )
{
{ float-4-rep [ MULPS ] }
{ double-2-rep [ MULPD ] }
- { int-4-rep [ PMULLW ] }
+ { short-8-rep [ PMULLW ] }
+ { ushort-8-rep [ PMULLW ] }
+ { int-4-rep [ PMULLD ] }
+ { uint-4-rep [ PMULLD ] }
} case drop ;
+M: x86 %mul-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+ } available-reps ;
+
+M: x86 %saturated-mul-vector-reps
+ ! No multiplication with saturation on x86
+ { } ;
+
M: x86 %div-vector ( dst src1 src2 rep -- )
{
{ float-4-rep [ DIVPS ] }
{ double-2-rep [ DIVPD ] }
} case drop ;
+M: x86 %div-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep } }
+ } available-reps ;
+
M: x86 %min-vector ( dst src1 src2 rep -- )
{
{ float-4-rep [ MINPS ] }
{ double-2-rep [ MINPD ] }
+ { uchar-16-rep [ PMINUB ] }
+ { short-8-rep [ PMINSW ] }
} case drop ;
+M: x86 %min-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+ } available-reps ;
+
M: x86 %max-vector ( dst src1 src2 rep -- )
{
{ float-4-rep [ MAXPS ] }
{ double-2-rep [ MAXPD ] }
+ { uchar-16-rep [ PMAXUB ] }
+ { short-8-rep [ PMAXSW ] }
} case drop ;
+M: x86 %max-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+ } available-reps ;
+
M: x86 %sqrt-vector ( dst src rep -- )
{
{ float-4-rep [ SQRTPS ] }
{ double-2-rep [ SQRTPD ] }
} case ;
+M: x86 %sqrt-vector-reps
+ {
+ { sse? { float-4-rep } }
+ { sse2? { double-2-rep } }
+ } available-reps ;
+
M: x86 %horizontal-add-vector ( dst src rep -- )
{
{ float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
{ double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
} case ;
+M: x86 %horizontal-add-vector-reps
+ {
+ { sse3? { float-4-rep double-2-rep } }
+ } available-reps ;
+
M: x86 %unbox-alien ( dst src -- )
alien-offset [+] MOV ;
#! set up by the caller.
stack-frame get total-size>> + stack@ ;
-: enable-sse2 ( -- )
- enable-float-intrinsics
- enable-fsqrt
- enable-float-min/max
- enable-sse2-simd ;
+enable-simd
+enable-min/max
+enable-fixnum-log2
-: enable-sse3 ( -- )
- enable-sse2
- enable-sse3-simd ;
+:: install-sse2-check ( -- )
+ [
+ sse-version 20 < [
+ "This image was built to use SSE2 but your CPU does not support it." print
+ "You will need to bootstrap Factor again." print
+ flush
+ 1 exit
+ ] when
+ ] "cpu.x86" add-init-hook ;
+
+: enable-sse2 ( version -- )
+ 20 >= [
+ enable-float-intrinsics
+ enable-fsqrt
+ enable-float-min/max
+ install-sse2-check
+ ] when ;
-enable-min/max
-enable-fixnum-log2
\ No newline at end of file
+: check-sse ( -- )
+ [ { sse_version } compile ] with-optimizer
+ "Checking for multimedia extensions: " write sse-version
+ [ sse-string write " detected" print ] [ enable-sse2 ] bi ;