1 ! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: alien.c-types arrays assocs combinators fry kernel locals
4 macros math math.vectors namespaces quotations sequences system
5 compiler.cfg.comparisons compiler.cfg.intrinsics
6 compiler.codegen.fixup cpu.architecture cpu.x86
7 cpu.x86.assembler cpu.x86.assembler.operands cpu.x86.features ;
8 QUALIFIED-WITH: alien.c-types c
11 ! Scalar floating point with SSE2
! Scalar float constants: the value is wrapped with <ref> using the
! matching C type and then handed to %load-vector together with the
! corresponding scalar representation.
M: x86 %load-float
    c:float <ref> float-rep %load-vector ;

M: x86 %load-double
    c:double <ref> double-rep %load-vector ;
! Scalar float copies. Each method discards the top stack item
! (presumably the representation it dispatched on) and emits a move.
! Register copies use MOVAPS for both widths.
M: float-rep copy-register*
    drop MOVAPS ;

M: double-rep copy-register*
    drop MOVAPS ;

! copy-memory* selects the width-specific scalar move instead.
M: float-rep copy-memory*
    drop MOVSS ;

M: double-rep copy-memory*
    drop MOVSD ;
! Scalar arithmetic: every method passes double-rep to two-operand and
! then emits the scalar-double (...SD) form of the instruction.
M: x86 %add-float
    double-rep two-operand ADDSD ;

M: x86 %sub-float
    double-rep two-operand SUBSD ;

M: x86 %mul-float
    double-rep two-operand MULSD ;

M: x86 %div-float
    double-rep two-operand DIVSD ;

M: x86 %min-float
    double-rep two-operand MINSD ;

M: x86 %max-float
    double-rep two-operand MAXSD ;
: %clear-unless-in-place ( dst src -- )
    ! When src equals dst nothing is emitted. Otherwise dst is
    ! cleared by XORing it with itself.
    over = [ dup dup XORPS ] unless drop ;
! Precision conversions. The operand pair is duplicated so that
! %clear-unless-in-place runs on it first; the conversion instruction
! is then emitted on the same pair.
M: x86 %single>double-float
    2dup %clear-unless-in-place CVTSS2SD ;

M: x86 %double>single-float
    2dup %clear-unless-in-place CVTSD2SS ;
! The x86 backend always answers f to this query.
M: x86 integer-float-needs-stack-frame?
    f ;
M: x86 %integer>float
    ! First XOR the destination (the item below the top of the
    ! stack) with itself, then emit CVTSI2SD on the original pair.
    over dup XORPS CVTSI2SD ;

! Double to integer conversion is the single instruction CVTTSD2SI.
M: x86 %float>integer
    CVTTSD2SI ;
! Scalar double comparisons producing a value. Both delegate to
! (%compare-float), passing the comparison instruction as a quotation:
! COMISD for the ordered variant, UCOMISD for the unordered one.
M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
    [ COMISD ]
    (%compare-float) ;

M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
    [ UCOMISD ]
    (%compare-float) ;
! Branching forms of the scalar double comparisons. The instruction
! quotation (COMISD ordered, UCOMISD unordered) is passed on to
! (%compare-float-branch).
M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
    [ COMISD ]
    (%compare-float-branch) ;

M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
    [ UCOMISD ]
    (%compare-float-branch) ;
! Register copies for vector representations. The two float vector
! representations use MOVAPS; the vector-rep method uses MOVDQA.
! In each case the top stack item is dropped before the move.
M: float-4-rep copy-register*
    drop MOVAPS ;

M: double-2-rep copy-register*
    drop MOVAPS ;

M: vector-rep copy-register*
    drop MOVDQA ;
56 MACRO: available-reps ( alist -- )
57 ! Each SSE version adds new representations and supports
59 unzip { } [ append ] accumulate rest swap suffix
60 [ [ 1quotation ] map ] bi@ zip
61 reverse [ { } ] suffix
64 M: x86 %alien-vector-reps
66 { sse? { float-4-rep } }
67 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
72 { double-2-rep [ dup XORPS ] }
73 { float-4-rep [ dup XORPS ] }
77 M: x86 %zero-vector-reps
79 { sse? { float-4-rep } }
80 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
85 { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
86 { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
90 M: x86 %fill-vector-reps
92 { sse? { float-4-rep } }
93 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
96 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
99 dst src1 float-4-rep %copy
105 dst src1 int-4-rep %copy
112 M: x86 %gather-vector-4-reps
114 ! Can't do this with sse1 since it will want to unbox
115 ! double-precision floats and convert to single precision
116 { sse2? { float-4-rep int-4-rep uint-4-rep } }
119 M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- )
121 dst src1 32-bit-version-of 0 PINSRD
122 dst src2 32-bit-version-of 1 PINSRD
123 dst src3 32-bit-version-of 2 PINSRD
124 dst src4 32-bit-version-of 3 PINSRD ;
126 M: x86 %gather-int-vector-4-reps
128 { sse4.1? { int-4-rep uint-4-rep } }
131 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
134 dst src1 double-2-rep %copy
138 dst src1 longlong-2-rep %copy
143 M: x86 %gather-vector-2-reps
145 { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } }
148 M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- )
153 M: x86.64 %gather-int-vector-2-reps
155 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
158 :: %select-vector-32 ( dst src n rep -- )
161 dst 32-bit-version-of src n PEXTRB
162 dst dst 8-bit-version-of MOVSX
165 dst 32-bit-version-of src n PEXTRB
168 dst 32-bit-version-of src n PEXTRW
169 dst dst 16-bit-version-of MOVSX
172 dst 32-bit-version-of src n PEXTRW
175 dst 32-bit-version-of src n PEXTRD
176 dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if
179 dst 32-bit-version-of src n PEXTRD
183 M: x86.32 %select-vector
186 M: x86.32 %select-vector-reps
188 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } }
191 M: x86.64 %select-vector
193 { longlong-2-rep [ PEXTRQ ] }
194 { ulonglong-2-rep [ PEXTRQ ] }
195 [ %select-vector-32 ]
198 M: x86.64 %select-vector-reps
200 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } }
203 : sse1-float-4-shuffle ( dst shuffle -- )
205 { { 0 1 2 3 } [ drop ] }
206 { { 0 1 0 1 } [ dup MOVLHPS ] }
207 { { 2 3 2 3 } [ dup MOVHLPS ] }
208 { { 0 0 1 1 } [ dup UNPCKLPS ] }
209 { { 2 2 3 3 } [ dup UNPCKHPS ] }
213 : float-4-shuffle ( dst shuffle -- )
216 { { 0 0 2 2 } [ dup MOVSLDUP ] }
217 { { 1 1 3 3 } [ dup MOVSHDUP ] }
218 [ sse1-float-4-shuffle ]
220 ] [ sse1-float-4-shuffle ] if ;
222 : int-4-shuffle ( dst shuffle -- )
224 { { 0 1 2 3 } [ drop ] }
225 { { 0 0 1 1 } [ dup PUNPCKLDQ ] }
226 { { 2 2 3 3 } [ dup PUNPCKHDQ ] }
227 { { 0 1 0 1 } [ dup PUNPCKLQDQ ] }
228 { { 2 3 2 3 } [ dup PUNPCKHQDQ ] }
: longlong-2-shuffle ( dst shuffle -- )
    ! Turn the two lane indices a b into the four indices
    ! 2a 2a+1 2b 2b+1 and reuse the int-4 shuffle on that array.
    first2
    [ 2 * ] bi@
    [ dup 1 + ] bi@
    4array int-4-shuffle ;
: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle )
    ! Each index n becomes the pair { 2n 2n+1 }; the pairs are then
    ! joined into one flat sequence.
    [ 2 * dup 1 + 2array ] map
    concat ;
238 M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
240 dst shuffle rep signed-rep {
241 { double-2-rep [ >float-4-shuffle float-4-shuffle ] }
242 { float-4-rep [ float-4-shuffle ] }
243 { int-4-rep [ int-4-shuffle ] }
244 { longlong-2-rep [ longlong-2-shuffle ] }
247 M: x86 %shuffle-vector-imm-reps
249 { sse? { float-4-rep } }
250 { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
253 M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- )
254 dst src1 src2 rep two-operand
256 { double-2-rep [ >float-4-shuffle SHUFPS ] }
257 { float-4-rep [ SHUFPS ] }
260 M: x86 %shuffle-vector-halves-imm-reps
262 { sse? { float-4-rep } }
263 { sse2? { double-2-rep } }
266 M: x86 %shuffle-vector ( dst src shuffle rep -- )
269 M: x86 %shuffle-vector-reps
271 { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
274 M: x86 %merge-vector-head
277 { double-2-rep [ MOVLHPS ] }
278 { float-4-rep [ UNPCKLPS ] }
279 { longlong-2-rep [ PUNPCKLQDQ ] }
280 { int-4-rep [ PUNPCKLDQ ] }
281 { short-8-rep [ PUNPCKLWD ] }
282 { char-16-rep [ PUNPCKLBW ] }
285 M: x86 %merge-vector-tail
288 { double-2-rep [ UNPCKHPD ] }
289 { float-4-rep [ UNPCKHPS ] }
290 { longlong-2-rep [ PUNPCKHQDQ ] }
291 { int-4-rep [ PUNPCKHDQ ] }
292 { short-8-rep [ PUNPCKHWD ] }
293 { char-16-rep [ PUNPCKHBW ] }
296 M: x86 %merge-vector-reps
298 { sse? { float-4-rep } }
299 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
302 M: x86 %float-pack-vector
305 M: x86 %float-pack-vector-reps
307 { sse2? { double-2-rep } }
310 M: x86 %signed-pack-vector
313 { int-4-rep [ PACKSSDW ] }
314 { short-8-rep [ PACKSSWB ] }
317 M: x86 %signed-pack-vector-reps
319 { sse2? { short-8-rep int-4-rep } }
322 M: x86 %unsigned-pack-vector
325 { int-4-rep [ PACKUSDW ] }
326 { short-8-rep [ PACKUSWB ] }
329 M: x86 %unsigned-pack-vector-reps
331 { sse2? { short-8-rep } }
332 { sse4.1? { int-4-rep } }
335 M: x86 %tail>head-vector ( dst src rep -- )
337 { float-4-rep [ drop UNPCKHPD ] }
338 { double-2-rep [ drop UNPCKHPD ] }
339 [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ]
342 M: x86 %unpack-vector-head ( dst src rep -- )
344 { char-16-rep [ PMOVSXBW ] }
345 { uchar-16-rep [ PMOVZXBW ] }
346 { short-8-rep [ PMOVSXWD ] }
347 { ushort-8-rep [ PMOVZXWD ] }
348 { int-4-rep [ PMOVSXDQ ] }
349 { uint-4-rep [ PMOVZXDQ ] }
350 { float-4-rep [ CVTPS2PD ] }
353 M: x86 %unpack-vector-head-reps ( -- reps )
355 { sse2? { float-4-rep } }
356 { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
359 M: x86 %integer>float-vector ( dst src rep -- )
361 { int-4-rep [ CVTDQ2PS ] }
364 M: x86 %integer>float-vector-reps
366 { sse2? { int-4-rep } }
369 M: x86 %float>integer-vector ( dst src rep -- )
371 { float-4-rep [ CVTTPS2DQ ] }
374 M: x86 %float>integer-vector-reps
376 { sse2? { float-4-rep } }
: (%compare-float-vector) ( dst src rep double single -- )
    ! rep is replaced by the result of comparing it with double-2-rep;
    ! `double` runs when they are identical, `single` otherwise.
    ! The chosen quotation is left with dst src.
    [ [ double-2-rep eq? ] dip ] dip
    if ; inline
382 : %compare-float-vector ( dst src rep cc -- )
384 { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] }
385 { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] }
386 { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] }
387 { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] }
388 { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] }
389 { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] }
390 { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] }
391 { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] }
394 :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
395 rep signed-rep :> rep'
397 { longlong-2-rep [ int64 call ] }
398 { int-4-rep [ int32 call ] }
399 { short-8-rep [ int16 call ] }
400 { char-16-rep [ int8 call ] }
403 : %compare-int-vector ( dst src rep cc -- )
405 { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] }
406 { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] }
M: x86 %compare-vector ( dst src1 src2 rep cc -- )
    ! With cc set aside: run two-operand while keeping rep, then test
    ! whether rep is a float vector representation. The flag is moved
    ! above cc so the selected word receives dst src rep cc.
    [ [ two-operand ] keep dup float-vector-rep? ] dip swap
    [ %compare-float-vector ]
    [ %compare-int-vector ]
    if ;
415 : %compare-vector-eq-reps ( -- reps )
417 { sse? { float-4-rep } }
418 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
419 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
422 : %compare-vector-ord-reps ( -- reps )
424 { sse? { float-4-rep } }
425 { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } }
426 { sse4.2? { longlong-2-rep } }
429 M: x86 %compare-vector-reps
431 { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] }
432 [ drop %compare-vector-ord-reps ]
435 : %compare-float-vector-ccs ( cc -- ccs not? )
437 { cc< [ { { cc< f } } f ] }
438 { cc<= [ { { cc<= f } } f ] }
439 { cc> [ { { cc< t } } f ] }
440 { cc>= [ { { cc<= t } } f ] }
441 { cc= [ { { cc= f } } f ] }
442 { cc<> [ { { cc< f } { cc< t } } f ] }
443 { cc<>= [ { { cc<>= f } } f ] }
444 { cc/< [ { { cc/< f } } f ] }
445 { cc/<= [ { { cc/<= f } } f ] }
446 { cc/> [ { { cc/< t } } f ] }
447 { cc/>= [ { { cc/<= t } } f ] }
448 { cc/= [ { { cc/= f } } f ] }
449 { cc/<> [ { { cc/= f } { cc/<>= f } } f ] }
450 { cc/<>= [ { { cc/<>= f } } f ] }
453 : %compare-int-vector-ccs ( cc -- ccs not? )
455 { cc< [ { { cc> t } } f ] }
456 { cc<= [ { { cc> f } } t ] }
457 { cc> [ { { cc> f } } f ] }
458 { cc>= [ { { cc> t } } t ] }
459 { cc= [ { { cc= f } } f ] }
460 { cc/= [ { { cc= f } } t ] }
M: x86 %compare-vector-ccs
    ! Stack on entry is rep cc. rep is replaced by its
    ! float-vector-rep? flag, which then chooses the float or the
    ! integer table word; that word consumes cc.
    [ float-vector-rep? ] dip swap
    [ %compare-float-vector-ccs ]
    [ %compare-int-vector-ccs ]
    if ;
470 :: %test-vector-mask ( dst temp mask vcc -- )
472 { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] }
473 { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] }
474 { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] }
475 { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] }
478 : %move-vector-mask ( dst src rep -- mask )
480 { double-2-rep [ MOVMSKPS HEX: f ] }
481 { float-4-rep [ MOVMSKPS HEX: f ] }
482 [ drop PMOVMSKB HEX: ffff ]
M:: x86 %test-vector ( dst src temp rep vcc -- )
    ! %move-vector-mask is invoked on dst src rep and leaves its mask
    ! value on the stack, between temp and vcc, ready for
    ! %test-vector-mask.
    dst temp
    dst src rep %move-vector-mask
    vcc %test-vector-mask ;
489 :: %test-vector-mask-branch ( label temp mask vcc -- )
491 { vcc-any [ temp temp TEST label JNE ] }
492 { vcc-none [ temp temp TEST label JE ] }
493 { vcc-all [ temp mask CMP label JE ] }
494 { vcc-notall [ temp mask CMP label JNE ] }
M:: x86 %test-vector-branch ( label src temp rep vcc -- )
    ! %move-vector-mask is invoked with temp as its destination; the
    ! mask value it returns lands between temp and vcc for
    ! %test-vector-mask-branch.
    label temp
    temp src rep %move-vector-mask
    vcc %test-vector-mask-branch ;
501 M: x86 %test-vector-reps
503 { sse? { float-4-rep } }
504 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
507 M: x86 %add-vector ( dst src1 src2 rep -- )
510 { float-4-rep [ ADDPS ] }
511 { double-2-rep [ ADDPD ] }
512 { char-16-rep [ PADDB ] }
513 { uchar-16-rep [ PADDB ] }
514 { short-8-rep [ PADDW ] }
515 { ushort-8-rep [ PADDW ] }
516 { int-4-rep [ PADDD ] }
517 { uint-4-rep [ PADDD ] }
518 { longlong-2-rep [ PADDQ ] }
519 { ulonglong-2-rep [ PADDQ ] }
522 M: x86 %add-vector-reps
524 { sse? { float-4-rep } }
525 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
528 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
531 { char-16-rep [ PADDSB ] }
532 { uchar-16-rep [ PADDUSB ] }
533 { short-8-rep [ PADDSW ] }
534 { ushort-8-rep [ PADDUSW ] }
537 M: x86 %saturated-add-vector-reps
539 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
542 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
545 { float-4-rep [ ADDSUBPS ] }
546 { double-2-rep [ ADDSUBPD ] }
549 M: x86 %add-sub-vector-reps
551 { sse3? { float-4-rep double-2-rep } }
554 M: x86 %sub-vector ( dst src1 src2 rep -- )
557 { float-4-rep [ SUBPS ] }
558 { double-2-rep [ SUBPD ] }
559 { char-16-rep [ PSUBB ] }
560 { uchar-16-rep [ PSUBB ] }
561 { short-8-rep [ PSUBW ] }
562 { ushort-8-rep [ PSUBW ] }
563 { int-4-rep [ PSUBD ] }
564 { uint-4-rep [ PSUBD ] }
565 { longlong-2-rep [ PSUBQ ] }
566 { ulonglong-2-rep [ PSUBQ ] }
569 M: x86 %sub-vector-reps
571 { sse? { float-4-rep } }
572 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
575 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
578 { char-16-rep [ PSUBSB ] }
579 { uchar-16-rep [ PSUBUSB ] }
580 { short-8-rep [ PSUBSW ] }
581 { ushort-8-rep [ PSUBUSW ] }
584 M: x86 %saturated-sub-vector-reps
586 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
589 M: x86 %mul-vector ( dst src1 src2 rep -- )
592 { float-4-rep [ MULPS ] }
593 { double-2-rep [ MULPD ] }
594 { short-8-rep [ PMULLW ] }
595 { ushort-8-rep [ PMULLW ] }
596 { int-4-rep [ PMULLD ] }
597 { uint-4-rep [ PMULLD ] }
600 M: x86 %mul-vector-reps
602 { sse? { float-4-rep } }
603 { sse2? { double-2-rep short-8-rep ushort-8-rep } }
604 { sse4.1? { int-4-rep uint-4-rep } }
607 M: x86 %mul-high-vector ( dst src1 src2 rep -- )
610 { short-8-rep [ PMULHW ] }
611 { ushort-8-rep [ PMULHUW ] }
614 M: x86 %mul-high-vector-reps
616 { sse2? { short-8-rep ushort-8-rep } }
619 M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- )
622 { char-16-rep [ PMADDUBSW ] }
623 { uchar-16-rep [ PMADDUBSW ] }
624 { short-8-rep [ PMADDWD ] }
627 M: x86 %mul-horizontal-add-vector-reps
629 { sse2? { short-8-rep } }
630 { ssse3? { char-16-rep uchar-16-rep } }
633 M: x86 %div-vector ( dst src1 src2 rep -- )
636 { float-4-rep [ DIVPS ] }
637 { double-2-rep [ DIVPD ] }
640 M: x86 %div-vector-reps
642 { sse? { float-4-rep } }
643 { sse2? { double-2-rep } }
646 M: x86 %min-vector ( dst src1 src2 rep -- )
649 { char-16-rep [ PMINSB ] }
650 { uchar-16-rep [ PMINUB ] }
651 { short-8-rep [ PMINSW ] }
652 { ushort-8-rep [ PMINUW ] }
653 { int-4-rep [ PMINSD ] }
654 { uint-4-rep [ PMINUD ] }
655 { float-4-rep [ MINPS ] }
656 { double-2-rep [ MINPD ] }
659 M: x86 %min-vector-reps
661 { sse? { float-4-rep } }
662 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
663 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
666 M: x86 %max-vector ( dst src1 src2 rep -- )
669 { char-16-rep [ PMAXSB ] }
670 { uchar-16-rep [ PMAXUB ] }
671 { short-8-rep [ PMAXSW ] }
672 { ushort-8-rep [ PMAXUW ] }
673 { int-4-rep [ PMAXSD ] }
674 { uint-4-rep [ PMAXUD ] }
675 { float-4-rep [ MAXPS ] }
676 { double-2-rep [ MAXPD ] }
679 M: x86 %max-vector-reps
681 { sse? { float-4-rep } }
682 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
683 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
686 M: x86 %avg-vector ( dst src1 src2 rep -- )
689 { uchar-16-rep [ PAVGB ] }
690 { ushort-8-rep [ PAVGW ] }
693 M: x86 %avg-vector-reps
695 { sse2? { uchar-16-rep ushort-8-rep } }
701 { float-4-rep [ HEX: ff DPPS ] }
702 { double-2-rep [ HEX: ff DPPD ] }
705 M: x86 %dot-vector-reps
707 { sse4.1? { float-4-rep double-2-rep } }
713 { uchar-16-rep [ PSADBW ] }
716 M: x86 %sad-vector-reps
718 { sse2? { uchar-16-rep } }
721 M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
724 { float-4-rep [ HADDPS ] }
725 { double-2-rep [ HADDPD ] }
726 { int-4-rep [ PHADDD ] }
727 { short-8-rep [ PHADDW ] }
730 M: x86 %horizontal-add-vector-reps
732 { sse3? { float-4-rep double-2-rep } }
733 { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } }
736 M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- )
739 M: x86 %horizontal-shl-vector-imm-reps
741 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
744 M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- )
747 M: x86 %horizontal-shr-vector-imm-reps
749 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
752 M: x86 %abs-vector ( dst src rep -- )
754 { char-16-rep [ PABSB ] }
755 { short-8-rep [ PABSW ] }
756 { int-4-rep [ PABSD ] }
759 M: x86 %abs-vector-reps
761 { ssse3? { char-16-rep short-8-rep int-4-rep } }
764 M: x86 %sqrt-vector ( dst src rep -- )
766 { float-4-rep [ SQRTPS ] }
767 { double-2-rep [ SQRTPD ] }
770 M: x86 %sqrt-vector-reps
772 { sse? { float-4-rep } }
773 { sse2? { double-2-rep } }
776 M: x86 %and-vector ( dst src1 src2 rep -- )
779 { float-4-rep [ ANDPS ] }
780 { double-2-rep [ ANDPS ] }
784 M: x86 %and-vector-reps
786 { sse? { float-4-rep } }
787 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
790 M: x86 %andn-vector ( dst src1 src2 rep -- )
793 { float-4-rep [ ANDNPS ] }
794 { double-2-rep [ ANDNPS ] }
798 M: x86 %andn-vector-reps
800 { sse? { float-4-rep } }
801 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
804 M: x86 %or-vector ( dst src1 src2 rep -- )
807 { float-4-rep [ ORPS ] }
808 { double-2-rep [ ORPS ] }
812 M: x86 %or-vector-reps
814 { sse? { float-4-rep } }
815 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
818 M: x86 %xor-vector ( dst src1 src2 rep -- )
821 { float-4-rep [ XORPS ] }
822 { double-2-rep [ XORPS ] }
826 M: x86 %xor-vector-reps
828 { sse? { float-4-rep } }
829 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
832 M: x86 %shl-vector ( dst src1 src2 rep -- )
835 { short-8-rep [ PSLLW ] }
836 { ushort-8-rep [ PSLLW ] }
837 { int-4-rep [ PSLLD ] }
838 { uint-4-rep [ PSLLD ] }
839 { longlong-2-rep [ PSLLQ ] }
840 { ulonglong-2-rep [ PSLLQ ] }
843 M: x86 %shl-vector-reps
845 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
848 M: x86 %shr-vector ( dst src1 src2 rep -- )
851 { short-8-rep [ PSRAW ] }
852 { ushort-8-rep [ PSRLW ] }
853 { int-4-rep [ PSRAD ] }
854 { uint-4-rep [ PSRLD ] }
855 { ulonglong-2-rep [ PSRLQ ] }
858 M: x86 %shr-vector-reps
860 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
! The immediate-count shifts simply forward to the corresponding
! non-immediate methods, and report the same representation lists.
M: x86 %shl-vector-imm
    %shl-vector ;

M: x86 %shl-vector-imm-reps
    %shl-vector-reps ;

M: x86 %shr-vector-imm
    %shr-vector ;

M: x86 %shr-vector-imm-reps
    %shr-vector-reps ;
: scalar-sized-reg ( reg rep -- reg' )
    ! rep-size multiplied by eight is the width given to
    ! n-bit-version-of (presumably a byte count turned into bits).
    rep-size 8 *
    n-bit-version-of ;
! The top stack item is discarded and a MOVD is emitted on the
! remaining operands.
M: x86 %integer>scalar
    drop MOVD ;
873 :: %scalar>integer-32 ( dst src rep -- )
876 dst 32-bit-version-of src MOVD
877 dst dst 32-bit-version-of
878 2dup eq? [ 2drop ] [ MOVSX ] if
881 dst 32-bit-version-of src MOVD
884 dst 32-bit-version-of src MOVD
885 dst dst 16-bit-version-of MOVSX
887 { ushort-scalar-rep [
888 dst 32-bit-version-of src MOVD
889 dst dst 16-bit-version-of MOVZX
892 dst 32-bit-version-of src MOVD
893 dst { } 8 [| tmp-dst |
894 tmp-dst dst int-rep %copy
895 tmp-dst tmp-dst 8-bit-version-of MOVSX
896 dst tmp-dst int-rep %copy
897 ] with-small-register
900 dst 32-bit-version-of src MOVD
901 dst { } 8 [| tmp-dst |
902 tmp-dst dst int-rep %copy
903 tmp-dst tmp-dst 8-bit-version-of MOVZX
904 dst tmp-dst int-rep %copy
905 ] with-small-register
! On 32-bit x86 every case is handled by %scalar>integer-32.
M: x86.32 %scalar>integer ( dst src rep -- )
    %scalar>integer-32 ;
911 M: x86.64 %scalar>integer ( dst src rep -- )
913 { longlong-scalar-rep [ MOVD ] }
914 { ulonglong-scalar-rep [ MOVD ] }
915 [ %scalar>integer-32 ]
! Converting between a vector and a scalar is implemented as a plain
! %copy in both directions.
M: x86 %vector>scalar
    %copy ;

M: x86 %scalar>vector
    %copy ;
922 enable-float-intrinsics