1 ! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: alien.c-types arrays assocs combinators fry kernel locals
4 macros math math.vectors namespaces quotations sequences system
5 compiler.cfg.comparisons compiler.cfg.intrinsics
6 compiler.codegen.fixup cpu.architecture cpu.x86
7 cpu.x86.assembler cpu.x86.assembler.operands cpu.x86.features ;
10 ! Scalar floating point with SSE2
! Floating-point constant loads.  The value is converted with the
! alien.c-types constructor of matching width and handed to %load-vector
! together with the scalar representation.
M: x86 %load-float
    <float> float-rep %load-vector ;

M: x86 %load-double
    <double> double-rep %load-vector ;
! Register-to-register copies of scalar floats: the representation
! argument is discarded and MOVAPS is emitted for both precisions.
M: float-rep copy-register*
    drop MOVAPS ;

M: double-rep copy-register*
    drop MOVAPS ;

! Copies that involve memory use the scalar move of the matching width
! (MOVSS for single precision, MOVSD for double precision).
M: float-rep copy-memory*
    drop MOVSS ;

M: double-rep copy-memory*
    drop MOVSD ;
! Scalar double-precision arithmetic.  Each method first runs
! two-operand with double-rep and then emits the corresponding
! scalar-double SSE2 instruction on the values it leaves.
M: x86 %add-float
    double-rep two-operand ADDSD ;

M: x86 %sub-float
    double-rep two-operand SUBSD ;

M: x86 %mul-float
    double-rep two-operand MULSD ;

M: x86 %div-float
    double-rep two-operand DIVSD ;

M: x86 %min-float
    double-rep two-operand MINSD ;

M: x86 %max-float
    double-rep two-operand MAXSD ;
! Zero dst (XORPS dst,dst) unless dst and src are the same register, in
! which case nothing is emitted so the source value is not destroyed.
:: %clear-unless-in-place ( dst src -- )
    src dst = [ dst dst XORPS ] unless ;
! Scalar precision conversions.  dst is cleared first (unless the
! conversion is in place), then the conversion instruction is emitted
! with the same dst/src pair.
M: x86 %single>double-float
    2dup %clear-unless-in-place CVTSS2SD ;

M: x86 %double>single-float
    2dup %clear-unless-in-place CVTSD2SS ;
! Integer <-> double conversions go directly between general-purpose and
! SSE registers, so this backend reports that no stack frame is needed.
M: x86 integer-float-needs-stack-frame? f ;

! Zero dst unconditionally (src is an integer register, so it can never
! alias dst), then convert.
M: x86 %integer>float
    over dup XORPS CVTSI2SD ;

! Truncating double -> integer conversion.
M: x86 %float>integer
    CVTTSD2SI ;
! Scalar double comparisons.  The ordered variants pass COMISD and the
! unordered variants pass UCOMISD to the shared helpers; everything else
! is handled by (%compare-float) / (%compare-float-branch).

! Value-producing forms.
M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
    [ COMISD ] (%compare-float) ;

M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
    [ UCOMISD ] (%compare-float) ;

! Branching forms.
M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
    [ COMISD ] (%compare-float-branch) ;

M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
    [ UCOMISD ] (%compare-float-branch) ;
! Register-to-register copies of SIMD values: the two float vector
! representations use MOVAPS, every other vector-rep uses MOVDQA.  The
! representation argument itself is discarded.
M: float-4-rep copy-register*
    drop MOVAPS ;

M: double-2-rep copy-register*
    drop MOVAPS ;

M: vector-rep copy-register*
    drop MOVDQA ;
55 MACRO: available-reps ( alist -- )
    ! Each SSE version adds new representations and supports all earlier ones
58 unzip { } [ append ] accumulate rest swap suffix
59 [ [ 1quotation ] map ] bi@ zip
60 reverse [ { } ] suffix
63 M: x86 %alien-vector-reps
65 { sse? { float-4-rep } }
66 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
71 { double-2-rep [ dup XORPS ] }
72 { float-4-rep [ dup XORPS ] }
76 M: x86 %zero-vector-reps
78 { sse? { float-4-rep } }
79 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
84 { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
85 { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
89 M: x86 %fill-vector-reps
91 { sse? { float-4-rep } }
92 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
95 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
98 dst src1 float-4-rep %copy
104 dst src1 int-4-rep %copy
111 M: x86 %gather-vector-4-reps
113 ! Can't do this with sse1 since it will want to unbox
114 ! double-precision floats and convert to single precision
115 { sse2? { float-4-rep int-4-rep uint-4-rep } }
118 M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- )
120 dst src1 32-bit-version-of 0 PINSRD
121 dst src2 32-bit-version-of 1 PINSRD
122 dst src3 32-bit-version-of 2 PINSRD
123 dst src4 32-bit-version-of 3 PINSRD ;
125 M: x86 %gather-int-vector-4-reps
127 { sse4.1? { int-4-rep uint-4-rep } }
130 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
133 dst src1 double-2-rep %copy
137 dst src1 longlong-2-rep %copy
142 M: x86 %gather-vector-2-reps
144 { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } }
147 M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- )
152 M: x86.64 %gather-int-vector-2-reps
154 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
157 :: %select-vector-32 ( dst src n rep -- )
160 dst 32-bit-version-of src n PEXTRB
161 dst dst 8-bit-version-of MOVSX
164 dst 32-bit-version-of src n PEXTRB
167 dst 32-bit-version-of src n PEXTRW
168 dst dst 16-bit-version-of MOVSX
171 dst 32-bit-version-of src n PEXTRW
174 dst 32-bit-version-of src n PEXTRD
175 dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if
178 dst 32-bit-version-of src n PEXTRD
182 M: x86.32 %select-vector
185 M: x86.32 %select-vector-reps
187 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } }
190 M: x86.64 %select-vector
192 { longlong-2-rep [ PEXTRQ ] }
193 { ulonglong-2-rep [ PEXTRQ ] }
194 [ %select-vector-32 ]
197 M: x86.64 %select-vector-reps
199 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } }
202 : sse1-float-4-shuffle ( dst shuffle -- )
204 { { 0 1 2 3 } [ drop ] }
205 { { 0 1 0 1 } [ dup MOVLHPS ] }
206 { { 2 3 2 3 } [ dup MOVHLPS ] }
207 { { 0 0 1 1 } [ dup UNPCKLPS ] }
208 { { 2 2 3 3 } [ dup UNPCKHPS ] }
212 : float-4-shuffle ( dst shuffle -- )
215 { { 0 0 2 2 } [ dup MOVSLDUP ] }
216 { { 1 1 3 3 } [ dup MOVSHDUP ] }
217 [ sse1-float-4-shuffle ]
219 ] [ sse1-float-4-shuffle ] if ;
221 : int-4-shuffle ( dst shuffle -- )
223 { { 0 1 2 3 } [ drop ] }
224 { { 0 0 1 1 } [ dup PUNPCKLDQ ] }
225 { { 2 2 3 3 } [ dup PUNPCKHDQ ] }
226 { { 0 1 0 1 } [ dup PUNPCKLQDQ ] }
227 { { 2 3 2 3 } [ dup PUNPCKHQDQ ] }
! Shuffle two 64-bit lanes by expressing the shuffle in 32-bit lanes:
! 64-bit index i becomes the 32-bit index pair 2i, 2i+1, and the
! resulting four-element shuffle is passed to int-4-shuffle.
: longlong-2-shuffle ( dst shuffle -- )
    first2 [ 2 * ] bi@ [ dup 1 + ] bi@ 4array int-4-shuffle ;
! Widen a shuffle over 64-bit lanes into the equivalent shuffle over
! 32-bit lanes: every index i is replaced by the pair 2i, 2i+1.
: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle )
    [ 2 * dup 1 + 2array ] map concat ;
237 M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
239 dst shuffle rep signed-rep {
240 { double-2-rep [ >float-4-shuffle float-4-shuffle ] }
241 { float-4-rep [ float-4-shuffle ] }
242 { int-4-rep [ int-4-shuffle ] }
243 { longlong-2-rep [ longlong-2-shuffle ] }
246 M: x86 %shuffle-vector-imm-reps
248 { sse? { float-4-rep } }
249 { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
252 M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- )
253 dst src1 src2 rep two-operand
255 { double-2-rep [ >float-4-shuffle SHUFPS ] }
256 { float-4-rep [ SHUFPS ] }
259 M: x86 %shuffle-vector-halves-imm-reps
261 { sse? { float-4-rep } }
262 { sse2? { double-2-rep } }
265 M: x86 %shuffle-vector ( dst src shuffle rep -- )
268 M: x86 %shuffle-vector-reps
270 { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
273 M: x86 %merge-vector-head
276 { double-2-rep [ MOVLHPS ] }
277 { float-4-rep [ UNPCKLPS ] }
278 { longlong-2-rep [ PUNPCKLQDQ ] }
279 { int-4-rep [ PUNPCKLDQ ] }
280 { short-8-rep [ PUNPCKLWD ] }
281 { char-16-rep [ PUNPCKLBW ] }
284 M: x86 %merge-vector-tail
287 { double-2-rep [ UNPCKHPD ] }
288 { float-4-rep [ UNPCKHPS ] }
289 { longlong-2-rep [ PUNPCKHQDQ ] }
290 { int-4-rep [ PUNPCKHDQ ] }
291 { short-8-rep [ PUNPCKHWD ] }
292 { char-16-rep [ PUNPCKHBW ] }
295 M: x86 %merge-vector-reps
297 { sse? { float-4-rep } }
298 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
301 M: x86 %float-pack-vector
304 M: x86 %float-pack-vector-reps
306 { sse2? { double-2-rep } }
309 M: x86 %signed-pack-vector
312 { int-4-rep [ PACKSSDW ] }
313 { short-8-rep [ PACKSSWB ] }
316 M: x86 %signed-pack-vector-reps
318 { sse2? { short-8-rep int-4-rep } }
321 M: x86 %unsigned-pack-vector
324 { int-4-rep [ PACKUSDW ] }
325 { short-8-rep [ PACKUSWB ] }
328 M: x86 %unsigned-pack-vector-reps
330 { sse2? { short-8-rep } }
331 { sse4.1? { int-4-rep } }
334 M: x86 %tail>head-vector ( dst src rep -- )
336 { float-4-rep [ drop UNPCKHPD ] }
337 { double-2-rep [ drop UNPCKHPD ] }
338 [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ]
341 M: x86 %unpack-vector-head ( dst src rep -- )
343 { char-16-rep [ PMOVSXBW ] }
344 { uchar-16-rep [ PMOVZXBW ] }
345 { short-8-rep [ PMOVSXWD ] }
346 { ushort-8-rep [ PMOVZXWD ] }
347 { int-4-rep [ PMOVSXDQ ] }
348 { uint-4-rep [ PMOVZXDQ ] }
349 { float-4-rep [ CVTPS2PD ] }
352 M: x86 %unpack-vector-head-reps ( -- reps )
354 { sse2? { float-4-rep } }
355 { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
358 M: x86 %integer>float-vector ( dst src rep -- )
360 { int-4-rep [ CVTDQ2PS ] }
363 M: x86 %integer>float-vector-reps
365 { sse2? { int-4-rep } }
368 M: x86 %float>integer-vector ( dst src rep -- )
370 { float-4-rep [ CVTTPS2DQ ] }
373 M: x86 %float>integer-vector-reps
375 { sse2? { float-4-rep } }
! Choose between two instruction quotations by representation: `double`
! runs when rep is double-2-rep, `single` runs otherwise.  Whichever
! quotation runs sees dst and src on the stack.
: (%compare-float-vector) ( dst src rep double single -- )
    rot double-2-rep eq? -rot if ; inline
381 : %compare-float-vector ( dst src rep cc -- )
383 { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] }
384 { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] }
385 { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] }
386 { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] }
387 { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] }
388 { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] }
389 { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] }
390 { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] }
393 :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
394 rep signed-rep :> rep'
396 { longlong-2-rep [ int64 call ] }
397 { int-4-rep [ int32 call ] }
398 { short-8-rep [ int16 call ] }
399 { char-16-rep [ int8 call ] }
402 : %compare-int-vector ( dst src rep cc -- )
404 { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] }
405 { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] }
! Vector comparison entry point.  The operands go through two-operand
! first; rep and cc are then passed on to the float or the integer
! comparison emitter, depending on whether rep is a float vector rep.
M:: x86 %compare-vector ( dst src1 src2 rep cc -- )
    dst src1 src2 rep two-operand
    rep cc
    rep float-vector-rep?
    [ %compare-float-vector ]
    [ %compare-int-vector ] if ;
414 : %compare-vector-eq-reps ( -- reps )
416 { sse? { float-4-rep } }
417 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
418 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
421 : %compare-vector-ord-reps ( -- reps )
423 { sse? { float-4-rep } }
424 { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } }
425 { sse4.2? { longlong-2-rep } }
428 M: x86 %compare-vector-reps
430 { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] }
431 [ drop %compare-vector-ord-reps ]
! Map a float condition code to the packed comparisons that implement it.
!
!   cc   -- the condition code requested by the caller
!   ccs  -- array of { cc' swap? } pairs; every cc' is one of the eight
!           predicates %compare-float-vector has an instruction for
!           (cc< cc<= cc= cc<>= and their negations)
!   not? -- always f for float comparisons
!
! NOTE(review): judging by the cc> and cc<> rows, swap? asks the consumer
! to exchange the two operands and several pairs are OR-ed together;
! confirm against the code that consumes this table.
: %compare-float-vector-ccs ( cc -- ccs not? )
    {
        { cc<    [ { { cc<    f } }                f ] }
        { cc<=   [ { { cc<=   f } }                f ] }
        { cc>    [ { { cc<    t } }                f ] }
        { cc>=   [ { { cc<=   t } }                f ] }
        { cc=    [ { { cc=    f } }                f ] }
        { cc<>   [ { { cc<    f } { cc<    t } }   f ] }
        { cc<>=  [ { { cc<>=  f } }                f ] }
        { cc/<   [ { { cc/<   f } }                f ] }
        { cc/<=  [ { { cc/<=  f } }                f ] }
        { cc/>   [ { { cc/<   t } }                f ] }
        { cc/>=  [ { { cc/<=  t } }                f ] }
        { cc/=   [ { { cc/=   f } }                f ] }
        ! "Not less-or-greater" holds exactly when the operands are equal
        ! or unordered, so it is cc= combined with cc/<>=.  Using cc/=
        ! here would also accept ordered, unequal operands.
        { cc/<>  [ { { cc=    f } { cc/<>= f } }   f ] }
        { cc/<>= [ { { cc/<>= f } }                f ] }
    } case ;
452 : %compare-int-vector-ccs ( cc -- ccs not? )
454 { cc< [ { { cc> t } } f ] }
455 { cc<= [ { { cc> f } } t ] }
456 { cc> [ { { cc> f } } f ] }
457 { cc>= [ { { cc> t } } t ] }
458 { cc= [ { { cc= f } } f ] }
459 { cc/= [ { { cc= f } } t ] }
! Select the condition-code table by representation: float vector reps
! use the float table, all other reps use the integer table.  Only cc is
! left on the stack for the chosen helper.
M: x86 %compare-vector-ccs
    [ float-vector-rep? ] dip swap
    [ %compare-float-vector-ccs ]
    [ %compare-int-vector-ccs ] if ;
469 :: %test-vector-mask ( dst temp mask vcc -- )
471 { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] }
472 { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] }
473 { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] }
474 { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] }
477 : %move-vector-mask ( dst src rep -- mask )
479 { double-2-rep [ MOVMSKPS HEX: f ] }
480 { float-4-rep [ MOVMSKPS HEX: f ] }
481 [ drop PMOVMSKB HEX: ffff ]
! Turn a vector comparison result into a boolean in dst.
! %move-vector-mask emits the mask extraction into dst and returns the
! "all lanes set" constant; %test-vector-mask then emits the test for
! vcc using that constant.
M:: x86 %test-vector ( dst src temp rep vcc -- )
    dst temp
    dst src rep %move-vector-mask
    vcc %test-vector-mask ;
488 :: %test-vector-mask-branch ( label temp mask vcc -- )
490 { vcc-any [ temp temp TEST label JNE ] }
491 { vcc-none [ temp temp TEST label JE ] }
492 { vcc-all [ temp mask CMP label JE ] }
493 { vcc-notall [ temp mask CMP label JNE ] }
! Branching counterpart of %test-vector: the mask is extracted into temp
! and %test-vector-mask-branch emits the compare-and-jump to label for
! the requested vcc.
M:: x86 %test-vector-branch ( label src temp rep vcc -- )
    label temp
    temp src rep %move-vector-mask
    vcc %test-vector-mask-branch ;
500 M: x86 %test-vector-reps
502 { sse? { float-4-rep } }
503 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
506 M: x86 %add-vector ( dst src1 src2 rep -- )
509 { float-4-rep [ ADDPS ] }
510 { double-2-rep [ ADDPD ] }
511 { char-16-rep [ PADDB ] }
512 { uchar-16-rep [ PADDB ] }
513 { short-8-rep [ PADDW ] }
514 { ushort-8-rep [ PADDW ] }
515 { int-4-rep [ PADDD ] }
516 { uint-4-rep [ PADDD ] }
517 { longlong-2-rep [ PADDQ ] }
518 { ulonglong-2-rep [ PADDQ ] }
521 M: x86 %add-vector-reps
523 { sse? { float-4-rep } }
524 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
527 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
530 { char-16-rep [ PADDSB ] }
531 { uchar-16-rep [ PADDUSB ] }
532 { short-8-rep [ PADDSW ] }
533 { ushort-8-rep [ PADDUSW ] }
536 M: x86 %saturated-add-vector-reps
538 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
541 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
544 { float-4-rep [ ADDSUBPS ] }
545 { double-2-rep [ ADDSUBPD ] }
548 M: x86 %add-sub-vector-reps
550 { sse3? { float-4-rep double-2-rep } }
553 M: x86 %sub-vector ( dst src1 src2 rep -- )
556 { float-4-rep [ SUBPS ] }
557 { double-2-rep [ SUBPD ] }
558 { char-16-rep [ PSUBB ] }
559 { uchar-16-rep [ PSUBB ] }
560 { short-8-rep [ PSUBW ] }
561 { ushort-8-rep [ PSUBW ] }
562 { int-4-rep [ PSUBD ] }
563 { uint-4-rep [ PSUBD ] }
564 { longlong-2-rep [ PSUBQ ] }
565 { ulonglong-2-rep [ PSUBQ ] }
568 M: x86 %sub-vector-reps
570 { sse? { float-4-rep } }
571 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
574 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
577 { char-16-rep [ PSUBSB ] }
578 { uchar-16-rep [ PSUBUSB ] }
579 { short-8-rep [ PSUBSW ] }
580 { ushort-8-rep [ PSUBUSW ] }
583 M: x86 %saturated-sub-vector-reps
585 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
588 M: x86 %mul-vector ( dst src1 src2 rep -- )
591 { float-4-rep [ MULPS ] }
592 { double-2-rep [ MULPD ] }
593 { short-8-rep [ PMULLW ] }
594 { ushort-8-rep [ PMULLW ] }
595 { int-4-rep [ PMULLD ] }
596 { uint-4-rep [ PMULLD ] }
599 M: x86 %mul-vector-reps
601 { sse? { float-4-rep } }
602 { sse2? { double-2-rep short-8-rep ushort-8-rep } }
603 { sse4.1? { int-4-rep uint-4-rep } }
606 M: x86 %mul-high-vector ( dst src1 src2 rep -- )
609 { short-8-rep [ PMULHW ] }
610 { ushort-8-rep [ PMULHUW ] }
613 M: x86 %mul-high-vector-reps
615 { sse2? { short-8-rep ushort-8-rep } }
618 M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- )
621 { char-16-rep [ PMADDUBSW ] }
622 { uchar-16-rep [ PMADDUBSW ] }
623 { short-8-rep [ PMADDWD ] }
626 M: x86 %mul-horizontal-add-vector-reps
628 { sse2? { short-8-rep } }
629 { ssse3? { char-16-rep uchar-16-rep } }
632 M: x86 %div-vector ( dst src1 src2 rep -- )
635 { float-4-rep [ DIVPS ] }
636 { double-2-rep [ DIVPD ] }
639 M: x86 %div-vector-reps
641 { sse? { float-4-rep } }
642 { sse2? { double-2-rep } }
645 M: x86 %min-vector ( dst src1 src2 rep -- )
648 { char-16-rep [ PMINSB ] }
649 { uchar-16-rep [ PMINUB ] }
650 { short-8-rep [ PMINSW ] }
651 { ushort-8-rep [ PMINUW ] }
652 { int-4-rep [ PMINSD ] }
653 { uint-4-rep [ PMINUD ] }
654 { float-4-rep [ MINPS ] }
655 { double-2-rep [ MINPD ] }
658 M: x86 %min-vector-reps
660 { sse? { float-4-rep } }
661 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
662 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
665 M: x86 %max-vector ( dst src1 src2 rep -- )
668 { char-16-rep [ PMAXSB ] }
669 { uchar-16-rep [ PMAXUB ] }
670 { short-8-rep [ PMAXSW ] }
671 { ushort-8-rep [ PMAXUW ] }
672 { int-4-rep [ PMAXSD ] }
673 { uint-4-rep [ PMAXUD ] }
674 { float-4-rep [ MAXPS ] }
675 { double-2-rep [ MAXPD ] }
678 M: x86 %max-vector-reps
680 { sse? { float-4-rep } }
681 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
682 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
685 M: x86 %avg-vector ( dst src1 src2 rep -- )
688 { uchar-16-rep [ PAVGB ] }
689 { ushort-8-rep [ PAVGW ] }
692 M: x86 %avg-vector-reps
694 { sse2? { uchar-16-rep ushort-8-rep } }
700 { float-4-rep [ HEX: ff DPPS ] }
701 { double-2-rep [ HEX: ff DPPD ] }
704 M: x86 %dot-vector-reps
706 { sse4.1? { float-4-rep double-2-rep } }
712 { uchar-16-rep [ PSADBW ] }
715 M: x86 %sad-vector-reps
717 { sse2? { uchar-16-rep } }
720 M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
723 { float-4-rep [ HADDPS ] }
724 { double-2-rep [ HADDPD ] }
725 { int-4-rep [ PHADDD ] }
726 { short-8-rep [ PHADDW ] }
729 M: x86 %horizontal-add-vector-reps
731 { sse3? { float-4-rep double-2-rep } }
732 { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } }
735 M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- )
738 M: x86 %horizontal-shl-vector-imm-reps
740 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
743 M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- )
746 M: x86 %horizontal-shr-vector-imm-reps
748 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
751 M: x86 %abs-vector ( dst src rep -- )
753 { char-16-rep [ PABSB ] }
754 { short-8-rep [ PABSW ] }
755 { int-4-rep [ PABSD ] }
758 M: x86 %abs-vector-reps
760 { ssse3? { char-16-rep short-8-rep int-4-rep } }
763 M: x86 %sqrt-vector ( dst src rep -- )
765 { float-4-rep [ SQRTPS ] }
766 { double-2-rep [ SQRTPD ] }
769 M: x86 %sqrt-vector-reps
771 { sse? { float-4-rep } }
772 { sse2? { double-2-rep } }
775 M: x86 %and-vector ( dst src1 src2 rep -- )
778 { float-4-rep [ ANDPS ] }
779 { double-2-rep [ ANDPS ] }
783 M: x86 %and-vector-reps
785 { sse? { float-4-rep } }
786 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
789 M: x86 %andn-vector ( dst src1 src2 rep -- )
792 { float-4-rep [ ANDNPS ] }
793 { double-2-rep [ ANDNPS ] }
797 M: x86 %andn-vector-reps
799 { sse? { float-4-rep } }
800 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
803 M: x86 %or-vector ( dst src1 src2 rep -- )
806 { float-4-rep [ ORPS ] }
807 { double-2-rep [ ORPS ] }
811 M: x86 %or-vector-reps
813 { sse? { float-4-rep } }
814 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
817 M: x86 %xor-vector ( dst src1 src2 rep -- )
820 { float-4-rep [ XORPS ] }
821 { double-2-rep [ XORPS ] }
825 M: x86 %xor-vector-reps
827 { sse? { float-4-rep } }
828 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
831 M: x86 %shl-vector ( dst src1 src2 rep -- )
834 { short-8-rep [ PSLLW ] }
835 { ushort-8-rep [ PSLLW ] }
836 { int-4-rep [ PSLLD ] }
837 { uint-4-rep [ PSLLD ] }
838 { longlong-2-rep [ PSLLQ ] }
839 { ulonglong-2-rep [ PSLLQ ] }
842 M: x86 %shl-vector-reps
844 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
847 M: x86 %shr-vector ( dst src1 src2 rep -- )
850 { short-8-rep [ PSRAW ] }
851 { ushort-8-rep [ PSRLW ] }
852 { int-4-rep [ PSRAD ] }
853 { uint-4-rep [ PSRLD ] }
854 { ulonglong-2-rep [ PSRLQ ] }
857 M: x86 %shr-vector-reps
859 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
! Shifts by an immediate count are delegated unchanged to the
! variable-count shift methods, and report the same representations.
M: x86 %shl-vector-imm
    %shl-vector ;

M: x86 %shl-vector-imm-reps
    %shl-vector-reps ;

M: x86 %shr-vector-imm
    %shr-vector ;

M: x86 %shr-vector-imm-reps
    %shr-vector-reps ;
! Return the variant of reg whose width in bits equals the size of rep
! (rep-size is multiplied by 8 to get the bit count).
:: scalar-sized-reg ( reg rep -- reg' )
    reg rep rep-size 8 * n-bit-version-of ;
! Move an integer register into an SSE register; the representation
! argument is not needed and is dropped.
M: x86 %integer>scalar
    drop MOVD ;
872 :: %scalar>integer-32 ( dst src rep -- )
875 dst 32-bit-version-of src MOVD
876 dst dst 32-bit-version-of
877 2dup eq? [ 2drop ] [ MOVSX ] if
880 dst 32-bit-version-of src MOVD
883 dst 32-bit-version-of src MOVD
884 dst dst 16-bit-version-of MOVSX
886 { ushort-scalar-rep [
887 dst 32-bit-version-of src MOVD
888 dst dst 16-bit-version-of MOVZX
891 dst 32-bit-version-of src MOVD
892 dst { } 8 [| tmp-dst |
893 tmp-dst dst int-rep %copy
894 tmp-dst tmp-dst 8-bit-version-of MOVSX
895 dst tmp-dst int-rep %copy
896 ] with-small-register
899 dst 32-bit-version-of src MOVD
900 dst { } 8 [| tmp-dst |
901 tmp-dst dst int-rep %copy
902 tmp-dst tmp-dst 8-bit-version-of MOVZX
903 dst tmp-dst int-rep %copy
904 ] with-small-register
! On 32-bit x86 every supported scalar is handled by the shared 32-bit
! extraction routine.
M: x86.32 %scalar>integer ( dst src rep -- )
    %scalar>integer-32 ;
910 M: x86.64 %scalar>integer ( dst src rep -- )
912 { longlong-scalar-rep [ MOVD ] }
913 { ulonglong-scalar-rep [ MOVD ] }
914 [ %scalar>integer-32 ]
! Converting between a vector and a scalar held in an SSE register is a
! plain %copy in both directions.
M: x86 %vector>scalar
    %copy ;

M: x86 %scalar>vector
    %copy ;
921 enable-float-intrinsics
922 enable-float-functions