1 ! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: alien.data arrays assocs combinators compiler.cfg.comparisons
4 compiler.cfg.intrinsics cpu.architecture cpu.x86 cpu.x86.assembler
5 cpu.x86.assembler.operands cpu.x86.features fry kernel locals macros
6 math math.vectors quotations sequences system ;
7 QUALIFIED-WITH: alien.c-types c
10 ! Scalar floating point with SSE2
! Load a single-precision constant: val is wrapped with c:float <ref>
! and handed to %load-vector together with float-rep.
M:: x86 %load-float ( dst val -- )
    dst val c:float <ref> float-rep %load-vector ;

! Double-precision counterpart: same path, using c:double and double-rep.
M:: x86 %load-double ( dst val -- )
    dst val c:double <ref> double-rep %load-vector ;
! Register-to-register copies of scalar floats. Both scalar reps use
! MOVAPS; the rep argument only selects the method and is not used
! in the body.
M:: float-rep copy-register* ( dst src rep -- )
    dst src MOVAPS ;

M:: double-rep copy-register* ( dst src rep -- )
    dst src MOVAPS ;
! Copies of scalar floats that involve memory: MOVSS for float-rep,
! MOVSD for double-rep. The rep argument only selects the method.
M:: float-rep copy-memory* ( dst src rep -- )
    dst src MOVSS ;

M:: double-rep copy-memory* ( dst src rep -- )
    dst src MOVSD ;
! Scalar double-precision arithmetic. Every method reduces its
! (dst src1 src2) operands to a (dst src) pair with two-operand, using
! double-rep, and then emits the matching SSE2 scalar instruction.
M:: x86 %add-float ( dst src1 src2 -- )
    dst src1 src2 double-rep two-operand ADDSD ;

M:: x86 %sub-float ( dst src1 src2 -- )
    dst src1 src2 double-rep two-operand SUBSD ;

M:: x86 %mul-float ( dst src1 src2 -- )
    dst src1 src2 double-rep two-operand MULSD ;

M:: x86 %div-float ( dst src1 src2 -- )
    dst src1 src2 double-rep two-operand DIVSD ;

M:: x86 %min-float ( dst src1 src2 -- )
    dst src1 src2 double-rep two-operand MINSD ;

M:: x86 %max-float ( dst src1 src2 -- )
    dst src1 src2 double-rep two-operand MAXSD ;
! Zero dst with XORPS dst,dst unless src and dst are equal, in which
! case nothing is emitted (clearing would destroy the source operand).
:: %clear-unless-in-place ( dst src -- )
    src dst = [ dst dst XORPS ] unless ;
! Float precision conversions. dst is cleared first when it differs
! from src (see %clear-unless-in-place), then the convert is emitted.
M:: x86 %single>double-float ( dst src -- )
    dst src %clear-unless-in-place
    dst src CVTSS2SD ;

M:: x86 %double>single-float ( dst src -- )
    dst src %clear-unless-in-place
    dst src CVTSD2SS ;
! On x86 this predicate answers f.
! NOTE(review): presumably because the conversions below work directly
! between integer and SSE registers -- confirm against the generic's docs.
34 M: x86 integer-float-needs-stack-frame? f ;
! Integer -> double conversion. dst is always zeroed with XORPS before
! CVTSI2SD is emitted.
! NOTE(review): the clear presumably exists because CVTSI2SD only writes
! the low lane of dst -- confirm.
M:: x86 %integer>float ( dst src -- )
    dst dst XORPS
    dst src CVTSI2SD ;
! Double -> integer conversion: passes its operands straight to
! CVTTSD2SI (the truncating convert in the x86 instruction set).
36 M: x86 %float>integer CVTTSD2SI ;
! Value-producing float comparisons. Both methods delegate to
! (%compare-float), which is defined outside this file, and differ only
! in the compare instruction passed as a quotation: COMISD for the
! ordered form, UCOMISD for the unordered form.
38 M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
39 [ COMISD ] (%compare-float) ;
41 M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
42 [ UCOMISD ] (%compare-float) ;
! Branching float comparisons. Both methods delegate to
! (%compare-float-branch), which is defined outside this file, and
! differ only in the compare instruction passed as a quotation:
! COMISD for the ordered form, UCOMISD for the unordered form.
44 M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
45 [ COMISD ] (%compare-float-branch) ;
47 M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
48 [ UCOMISD ] (%compare-float-branch) ;
! Register-to-register copies of SIMD values. The two float vector reps
! use MOVAPS; the vector-rep method uses MOVDQA. The rep argument only
! selects the method and is not used in the body.
M:: float-4-rep copy-register* ( dst src rep -- )
    dst src MOVAPS ;

M:: double-2-rep copy-register* ( dst src rep -- )
    dst src MOVAPS ;

M:: vector-rep copy-register* ( dst src rep -- )
    dst src MOVDQA ;
55 MACRO: available-reps ( alist -- quot )
56 ! Each SSE version adds new representations and supports
58 unzip { } [ append ] accumulate*
59 [ [ 1quotation ] map ] bi@ zip
60 reverse [ { } ] suffix
63 M: x86 %alien-vector-reps
65 { sse? { float-4-rep } }
66 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
71 { double-2-rep [ dup XORPS ] }
72 { float-4-rep [ dup XORPS ] }
76 M: x86 %zero-vector-reps
78 { sse? { float-4-rep } }
79 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
84 { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
85 { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
89 M: x86 %fill-vector-reps
91 { sse? { float-4-rep } }
92 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
95 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
98 dst src1 float-4-rep %copy
104 dst src1 int-4-rep %copy
111 M: x86 %gather-vector-4-reps
113 ! Can't do this with sse1 since it will want to unbox
114 ! double-precision floats and convert to single precision
115 { sse2? { float-4-rep int-4-rep uint-4-rep } }
118 M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- )
120 dst src1 32-bit-version-of 0 PINSRD
121 dst src2 32-bit-version-of 1 PINSRD
122 dst src3 32-bit-version-of 2 PINSRD
123 dst src4 32-bit-version-of 3 PINSRD ;
125 M: x86 %gather-int-vector-4-reps
127 { sse4.1? { int-4-rep uint-4-rep } }
130 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
133 dst src1 double-2-rep %copy
137 dst src1 longlong-2-rep %copy
142 M: x86 %gather-vector-2-reps
144 { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } }
147 M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- )
152 M: x86.64 %gather-int-vector-2-reps
154 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
157 :: %select-vector-32 ( dst src n rep -- )
160 dst 32-bit-version-of src n PEXTRB
161 dst dst 8-bit-version-of MOVSX
164 dst 32-bit-version-of src n PEXTRB
167 dst 32-bit-version-of src n PEXTRW
168 dst dst 16-bit-version-of MOVSX
171 dst 32-bit-version-of src n PEXTRW
174 dst 32-bit-version-of src n PEXTRD
175 dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if
178 dst 32-bit-version-of src n PEXTRD
182 M: x86.32 %select-vector
185 M: x86.32 %select-vector-reps
187 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } }
190 M: x86.64 %select-vector
192 { longlong-2-rep [ PEXTRQ ] }
193 { ulonglong-2-rep [ PEXTRQ ] }
194 [ %select-vector-32 ]
197 M: x86.64 %select-vector-reps
199 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } }
202 : sse1-float-4-shuffle ( dst shuffle -- )
204 { { 0 1 2 3 } [ drop ] }
205 { { 0 1 0 1 } [ dup MOVLHPS ] }
206 { { 2 3 2 3 } [ dup MOVHLPS ] }
207 { { 0 0 1 1 } [ dup UNPCKLPS ] }
208 { { 2 2 3 3 } [ dup UNPCKHPS ] }
212 : float-4-shuffle ( dst shuffle -- )
215 { { 0 0 2 2 } [ dup MOVSLDUP ] }
216 { { 1 1 3 3 } [ dup MOVSHDUP ] }
217 [ sse1-float-4-shuffle ]
219 ] [ sse1-float-4-shuffle ] if ;
221 : int-4-shuffle ( dst shuffle -- )
223 { { 0 1 2 3 } [ drop ] }
224 { { 0 0 1 1 } [ dup PUNPCKLDQ ] }
225 { { 2 2 3 3 } [ dup PUNPCKHDQ ] }
226 { { 0 1 0 1 } [ dup PUNPCKLQDQ ] }
227 { { 2 3 2 3 } [ dup PUNPCKHQDQ ] }
! Shuffle of two 64-bit lanes, expressed through int-4-shuffle.
! The first two shuffle indices a, b are expanded to the four 32-bit
! indices { 2a 2a+1 2b 2b+1 } (built with 4array) and passed on with dst.
231 : longlong-2-shuffle ( dst shuffle -- )
232 first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;
! Rewrite a shuffle over double lanes as a shuffle over float lanes:
! every index i becomes the pair { 2i 2i+1 } and the pairs are
! concatenated in order.
: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle )
    [ 2 * dup 1 + 2array ] map concat ;
237 M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
239 dst shuffle rep signed-rep {
240 { double-2-rep [ >float-4-shuffle float-4-shuffle ] }
241 { float-4-rep [ float-4-shuffle ] }
242 { int-4-rep [ int-4-shuffle ] }
243 { longlong-2-rep [ longlong-2-shuffle ] }
246 M: x86 %shuffle-vector-imm-reps
248 { sse? { float-4-rep } }
249 { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
252 M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- )
253 dst src1 src2 rep two-operand
255 { double-2-rep [ >float-4-shuffle SHUFPS ] }
256 { float-4-rep [ SHUFPS ] }
259 M: x86 %shuffle-vector-halves-imm-reps
261 { sse? { float-4-rep } }
262 { sse2? { double-2-rep } }
265 M: x86 %shuffle-vector ( dst src shuffle rep -- )
268 M: x86 %shuffle-vector-reps
270 { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
273 M: x86 %merge-vector-head
276 { double-2-rep [ MOVLHPS ] }
277 { float-4-rep [ UNPCKLPS ] }
278 { longlong-2-rep [ PUNPCKLQDQ ] }
279 { int-4-rep [ PUNPCKLDQ ] }
280 { short-8-rep [ PUNPCKLWD ] }
281 { char-16-rep [ PUNPCKLBW ] }
284 M: x86 %merge-vector-tail
287 { double-2-rep [ UNPCKHPD ] }
288 { float-4-rep [ UNPCKHPS ] }
289 { longlong-2-rep [ PUNPCKHQDQ ] }
290 { int-4-rep [ PUNPCKHDQ ] }
291 { short-8-rep [ PUNPCKHWD ] }
292 { char-16-rep [ PUNPCKHBW ] }
295 M: x86 %merge-vector-reps
297 { sse? { float-4-rep } }
298 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
301 M: x86 %float-pack-vector
304 M: x86 %float-pack-vector-reps
306 { sse2? { double-2-rep } }
309 M: x86 %signed-pack-vector
312 { int-4-rep [ PACKSSDW ] }
313 { short-8-rep [ PACKSSWB ] }
316 M: x86 %signed-pack-vector-reps
318 { sse2? { short-8-rep int-4-rep } }
321 M: x86 %unsigned-pack-vector
324 { int-4-rep [ PACKUSDW ] }
325 { short-8-rep [ PACKUSWB ] }
328 M: x86 %unsigned-pack-vector-reps
330 { sse2? { short-8-rep } }
331 { sse4.1? { int-4-rep } }
334 M: x86 %tail>head-vector ( dst src rep -- )
336 { float-4-rep [ drop UNPCKHPD ] }
337 { double-2-rep [ drop UNPCKHPD ] }
338 [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ]
341 M: x86 %unpack-vector-head ( dst src rep -- )
343 { char-16-rep [ PMOVSXBW ] }
344 { uchar-16-rep [ PMOVZXBW ] }
345 { short-8-rep [ PMOVSXWD ] }
346 { ushort-8-rep [ PMOVZXWD ] }
347 { int-4-rep [ PMOVSXDQ ] }
348 { uint-4-rep [ PMOVZXDQ ] }
349 { float-4-rep [ CVTPS2PD ] }
352 M: x86 %unpack-vector-head-reps ( -- reps )
354 { sse2? { float-4-rep } }
355 { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
358 M: x86 %integer>float-vector ( dst src rep -- )
360 { int-4-rep [ CVTDQ2PS ] }
363 M: x86 %integer>float-vector-reps
365 { sse2? { int-4-rep } }
368 M: x86 %float>integer-vector ( dst src rep -- )
370 { float-4-rep [ CVTTPS2DQ ] }
373 M: x86 %float>integer-vector-reps
375 { sse2? { float-4-rep } }
! Inline selector between two instruction quotations.
! Calls `double` when rep is double-2-rep (tested with eq?), otherwise
! calls `single`; dst and src are left on the stack for the chosen
! quotation to consume.
378 : (%compare-float-vector) ( dst src rep double single -- )
379 [ double-2-rep eq? ] 2dip if ; inline
381 : %compare-float-vector ( dst src rep cc -- )
383 { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] }
384 { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] }
385 { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] }
386 { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] }
387 { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] }
388 { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] }
389 { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] }
390 { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] }
393 :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
394 rep signed-rep :> rep'
396 { longlong-2-rep [ int64 call ] }
397 { int-4-rep [ int32 call ] }
398 { short-8-rep [ int16 call ] }
399 { char-16-rep [ int8 call ] }
402 : %compare-int-vector ( dst src rep cc -- )
404 { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] }
405 { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] }
! Vector comparison entry point.
! two-operand first reduces (dst src1 src2 rep) to the (dst src) pair
! expected by the helpers; rep and cc are then supplied again, and the
! float helper is chosen when rep satisfies float-vector-rep?, the
! integer helper otherwise.
M:: x86 %compare-vector ( dst src1 src2 rep cc -- )
    dst src1 src2 rep two-operand
    rep cc
    rep float-vector-rep?
    [ %compare-float-vector ]
    [ %compare-int-vector ] if ;
414 : %compare-vector-eq-reps ( -- reps )
416 { sse? { float-4-rep } }
417 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
418 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
421 : %compare-vector-ord-reps ( -- reps )
423 { sse? { float-4-rep } }
424 { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } }
425 { sse4.2? { longlong-2-rep } }
428 M: x86 %compare-vector-reps
430 { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] }
431 [ drop %compare-vector-ord-reps ]
434 : %compare-float-vector-ccs ( cc -- ccs not? )
436 { cc< [ { { cc< f } } f ] }
437 { cc<= [ { { cc<= f } } f ] }
438 { cc> [ { { cc< t } } f ] }
439 { cc>= [ { { cc<= t } } f ] }
440 { cc= [ { { cc= f } } f ] }
441 { cc<> [ { { cc< f } { cc< t } } f ] }
442 { cc<>= [ { { cc<>= f } } f ] }
443 { cc/< [ { { cc/< f } } f ] }
444 { cc/<= [ { { cc/<= f } } f ] }
445 { cc/> [ { { cc/< t } } f ] }
446 { cc/>= [ { { cc/<= t } } f ] }
447 { cc/= [ { { cc/= f } } f ] }
448 { cc/<> [ { { cc/= f } { cc/<>= f } } f ] }
449 { cc/<>= [ { { cc/<>= f } } f ] }
452 : %compare-int-vector-ccs ( cc -- ccs not? )
454 { cc< [ { { cc> t } } f ] }
455 { cc<= [ { { cc> f } } t ] }
456 { cc> [ { { cc> f } } f ] }
457 { cc>= [ { { cc> t } } t ] }
458 { cc= [ { { cc= f } } f ] }
459 { cc/= [ { { cc= f } } t ] }
! Look up how condition code cc is realised for rep: the float table is
! consulted when rep satisfies float-vector-rep?, the integer table
! otherwise. Only cc is passed on to the chosen table word.
M:: x86 %compare-vector-ccs ( rep cc -- ccs not? )
    cc
    rep float-vector-rep?
    [ %compare-float-vector-ccs ]
    [ %compare-int-vector-ccs ] if ;
469 :: %test-vector-mask ( dst temp mask vcc -- )
471 { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] }
472 { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] }
473 { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] }
474 { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] }
477 : (%move-vector-mask) ( dst src rep -- mask )
479 { double-2-rep [ MOVMSKPS 0xf ] }
480 { float-4-rep [ MOVMSKPS 0xf ] }
481 [ drop PMOVMSKB 0xffff ]
! Emit the mask-extraction instruction through (%move-vector-mask) and
! discard the mask constant that helper returns; only the emitted
! instruction is wanted here.
484 M: x86 %move-vector-mask ( dst src rep -- )
485 (%move-vector-mask) drop ;
487 M: x86 %move-vector-mask-reps
489 { sse? { float-4-rep } }
490 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
! Boolean vector test. The lane mask of src is first moved into dst by
! (%move-vector-mask), whose returned all-lanes constant is then passed,
! together with dst, temp and vcc, to %test-vector-mask.
M:: x86 %test-vector ( dst src temp rep vcc -- )
    dst temp
    dst src rep (%move-vector-mask)
    vcc %test-vector-mask ;
497 :: %test-vector-mask-branch ( label temp mask vcc -- )
499 { vcc-any [ temp temp TEST label JNE ] }
500 { vcc-none [ temp temp TEST label JE ] }
501 { vcc-all [ temp mask CMP label JE ] }
502 { vcc-notall [ temp mask CMP label JNE ] }
! Branching vector test. The lane mask of src is moved into temp by
! (%move-vector-mask); its returned all-lanes constant is then passed,
! together with label, temp and vcc, to %test-vector-mask-branch.
M:: x86 %test-vector-branch ( label src temp rep vcc -- )
    label temp
    temp src rep (%move-vector-mask)
    vcc %test-vector-mask-branch ;
509 M: x86 %test-vector-reps
511 { sse? { float-4-rep } }
512 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
515 M: x86 %add-vector ( dst src1 src2 rep -- )
518 { float-4-rep [ ADDPS ] }
519 { double-2-rep [ ADDPD ] }
520 { char-16-rep [ PADDB ] }
521 { uchar-16-rep [ PADDB ] }
522 { short-8-rep [ PADDW ] }
523 { ushort-8-rep [ PADDW ] }
524 { int-4-rep [ PADDD ] }
525 { uint-4-rep [ PADDD ] }
526 { longlong-2-rep [ PADDQ ] }
527 { ulonglong-2-rep [ PADDQ ] }
530 M: x86 %add-vector-reps
532 { sse? { float-4-rep } }
533 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
536 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
539 { char-16-rep [ PADDSB ] }
540 { uchar-16-rep [ PADDUSB ] }
541 { short-8-rep [ PADDSW ] }
542 { ushort-8-rep [ PADDUSW ] }
545 M: x86 %saturated-add-vector-reps
547 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
550 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
553 { float-4-rep [ ADDSUBPS ] }
554 { double-2-rep [ ADDSUBPD ] }
557 M: x86 %add-sub-vector-reps
559 { sse3? { float-4-rep double-2-rep } }
562 M: x86 %sub-vector ( dst src1 src2 rep -- )
565 { float-4-rep [ SUBPS ] }
566 { double-2-rep [ SUBPD ] }
567 { char-16-rep [ PSUBB ] }
568 { uchar-16-rep [ PSUBB ] }
569 { short-8-rep [ PSUBW ] }
570 { ushort-8-rep [ PSUBW ] }
571 { int-4-rep [ PSUBD ] }
572 { uint-4-rep [ PSUBD ] }
573 { longlong-2-rep [ PSUBQ ] }
574 { ulonglong-2-rep [ PSUBQ ] }
577 M: x86 %sub-vector-reps
579 { sse? { float-4-rep } }
580 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
583 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
586 { char-16-rep [ PSUBSB ] }
587 { uchar-16-rep [ PSUBUSB ] }
588 { short-8-rep [ PSUBSW ] }
589 { ushort-8-rep [ PSUBUSW ] }
592 M: x86 %saturated-sub-vector-reps
594 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
597 M: x86 %mul-vector ( dst src1 src2 rep -- )
600 { float-4-rep [ MULPS ] }
601 { double-2-rep [ MULPD ] }
602 { short-8-rep [ PMULLW ] }
603 { ushort-8-rep [ PMULLW ] }
604 { int-4-rep [ PMULLD ] }
605 { uint-4-rep [ PMULLD ] }
608 M: x86 %mul-vector-reps
610 { sse? { float-4-rep } }
611 { sse2? { double-2-rep short-8-rep ushort-8-rep } }
612 { sse4.1? { int-4-rep uint-4-rep } }
615 M: x86 %mul-high-vector ( dst src1 src2 rep -- )
618 { short-8-rep [ PMULHW ] }
619 { ushort-8-rep [ PMULHUW ] }
622 M: x86 %mul-high-vector-reps
624 { sse2? { short-8-rep ushort-8-rep } }
627 M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- )
630 { char-16-rep [ PMADDUBSW ] }
631 { uchar-16-rep [ PMADDUBSW ] }
632 { short-8-rep [ PMADDWD ] }
635 M: x86 %mul-horizontal-add-vector-reps
637 { sse2? { short-8-rep } }
638 { ssse3? { char-16-rep uchar-16-rep } }
641 M: x86 %div-vector ( dst src1 src2 rep -- )
644 { float-4-rep [ DIVPS ] }
645 { double-2-rep [ DIVPD ] }
648 M: x86 %div-vector-reps
650 { sse? { float-4-rep } }
651 { sse2? { double-2-rep } }
654 M: x86 %min-vector ( dst src1 src2 rep -- )
657 { char-16-rep [ PMINSB ] }
658 { uchar-16-rep [ PMINUB ] }
659 { short-8-rep [ PMINSW ] }
660 { ushort-8-rep [ PMINUW ] }
661 { int-4-rep [ PMINSD ] }
662 { uint-4-rep [ PMINUD ] }
663 { float-4-rep [ MINPS ] }
664 { double-2-rep [ MINPD ] }
667 M: x86 %min-vector-reps
669 { sse? { float-4-rep } }
670 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
671 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
674 M: x86 %max-vector ( dst src1 src2 rep -- )
677 { char-16-rep [ PMAXSB ] }
678 { uchar-16-rep [ PMAXUB ] }
679 { short-8-rep [ PMAXSW ] }
680 { ushort-8-rep [ PMAXUW ] }
681 { int-4-rep [ PMAXSD ] }
682 { uint-4-rep [ PMAXUD ] }
683 { float-4-rep [ MAXPS ] }
684 { double-2-rep [ MAXPD ] }
687 M: x86 %max-vector-reps
689 { sse? { float-4-rep } }
690 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
691 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
694 M: x86 %avg-vector ( dst src1 src2 rep -- )
697 { uchar-16-rep [ PAVGB ] }
698 { ushort-8-rep [ PAVGW ] }
701 M: x86 %avg-vector-reps
703 { sse2? { uchar-16-rep ushort-8-rep } }
709 { float-4-rep [ 0xff DPPS ] }
710 { double-2-rep [ 0xff DPPD ] }
713 M: x86 %dot-vector-reps
715 { sse4.1? { float-4-rep double-2-rep } }
721 { uchar-16-rep [ PSADBW ] }
724 M: x86 %sad-vector-reps
726 { sse2? { uchar-16-rep } }
729 M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
732 { float-4-rep [ HADDPS ] }
733 { double-2-rep [ HADDPD ] }
734 { int-4-rep [ PHADDD ] }
735 { short-8-rep [ PHADDW ] }
738 M: x86 %horizontal-add-vector-reps
740 { sse3? { float-4-rep double-2-rep } }
741 { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } }
744 M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- )
747 M: x86 %horizontal-shl-vector-imm-reps
749 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
752 M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- )
755 M: x86 %horizontal-shr-vector-imm-reps
757 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
760 M: x86 %abs-vector ( dst src rep -- )
762 { char-16-rep [ PABSB ] }
763 { short-8-rep [ PABSW ] }
764 { int-4-rep [ PABSD ] }
767 M: x86 %abs-vector-reps
769 { ssse3? { char-16-rep short-8-rep int-4-rep } }
772 M: x86 %sqrt-vector ( dst src rep -- )
774 { float-4-rep [ SQRTPS ] }
775 { double-2-rep [ SQRTPD ] }
778 M: x86 %sqrt-vector-reps
780 { sse? { float-4-rep } }
781 { sse2? { double-2-rep } }
784 M: x86 %and-vector ( dst src1 src2 rep -- )
787 { float-4-rep [ ANDPS ] }
788 { double-2-rep [ ANDPS ] }
792 M: x86 %and-vector-reps
794 { sse? { float-4-rep } }
795 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
798 M: x86 %andn-vector ( dst src1 src2 rep -- )
801 { float-4-rep [ ANDNPS ] }
802 { double-2-rep [ ANDNPS ] }
806 M: x86 %andn-vector-reps
808 { sse? { float-4-rep } }
809 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
812 M: x86 %or-vector ( dst src1 src2 rep -- )
815 { float-4-rep [ ORPS ] }
816 { double-2-rep [ ORPS ] }
820 M: x86 %or-vector-reps
822 { sse? { float-4-rep } }
823 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
826 M: x86 %xor-vector ( dst src1 src2 rep -- )
829 { float-4-rep [ XORPS ] }
830 { double-2-rep [ XORPS ] }
834 M: x86 %xor-vector-reps
836 { sse? { float-4-rep } }
837 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
840 M: x86 %shl-vector ( dst src1 src2 rep -- )
843 { short-8-rep [ PSLLW ] }
844 { ushort-8-rep [ PSLLW ] }
845 { int-4-rep [ PSLLD ] }
846 { uint-4-rep [ PSLLD ] }
847 { longlong-2-rep [ PSLLQ ] }
848 { ulonglong-2-rep [ PSLLQ ] }
851 M: x86 %shl-vector-reps
853 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
856 M: x86 %shr-vector ( dst src1 src2 rep -- )
859 { short-8-rep [ PSRAW ] }
860 { ushort-8-rep [ PSRLW ] }
861 { int-4-rep [ PSRAD ] }
862 { uint-4-rep [ PSRLD ] }
863 { ulonglong-2-rep [ PSRLQ ] }
866 M: x86 %shr-vector-reps
868 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
! Immediate-count vector shifts are plain aliases: each method forwards
! to the corresponding %shl-vector / %shr-vector word, and the supported
! representation lists are likewise those of the non-immediate forms.
871 M: x86 %shl-vector-imm %shl-vector ;
872 M: x86 %shl-vector-imm-reps %shl-vector-reps ;
873 M: x86 %shr-vector-imm %shr-vector ;
874 M: x86 %shr-vector-imm-reps %shr-vector-reps ;
! Move an integer into a scalar vector register with MOVD. The rep
! argument is ignored.
M:: x86 %integer>scalar ( dst src rep -- )
    dst src MOVD ;
878 :: %scalar>integer-32 ( dst src rep -- )
881 dst 32-bit-version-of src MOVD
882 dst dst 32-bit-version-of
883 2dup eq? [ 2drop ] [ MOVSX ] if
886 dst 32-bit-version-of src MOVD
889 dst 32-bit-version-of src MOVD
890 dst dst 16-bit-version-of MOVSX
892 { ushort-scalar-rep [
893 dst 32-bit-version-of src MOVD
894 dst dst 16-bit-version-of MOVZX
897 dst 32-bit-version-of src MOVD
898 dst { } 8 [| tmp-dst |
899 tmp-dst dst int-rep %copy
900 tmp-dst tmp-dst 8-bit-version-of MOVSX
901 dst tmp-dst int-rep %copy
902 ] with-small-register
905 dst 32-bit-version-of src MOVD
906 dst { } 8 [| tmp-dst |
907 tmp-dst dst int-rep %copy
908 tmp-dst tmp-dst 8-bit-version-of MOVZX
909 dst tmp-dst int-rep %copy
910 ] with-small-register
! On 32-bit x86 every scalar->integer extraction is handled by
! %scalar>integer-32; there is no separate 64-bit case here.
914 M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ;
916 M: x86.64 %scalar>integer ( dst src rep -- )
918 { longlong-scalar-rep [ MOVD ] }
919 { ulonglong-scalar-rep [ MOVD ] }
920 [ %scalar>integer-32 ]
! Converting between a vector and its scalar view emits only a %copy
! of the operands, in either direction.
923 M: x86 %vector>scalar %copy ;
925 M: x86 %scalar>vector %copy ;