1 ! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: alien.data arrays assocs combinators fry kernel locals
4 macros math math.vectors namespaces quotations sequences system
5 compiler.cfg.comparisons compiler.cfg.intrinsics
6 compiler.codegen.labels compiler.codegen.relocation
7 cpu.architecture cpu.x86 cpu.x86.assembler
8 cpu.x86.assembler.operands cpu.x86.features ;
9 QUALIFIED-WITH: alien.c-types c
12 ! Scalar floating point with SSE2
! Single precision: wrap the incoming value with `c:float <ref>` and pass
! the result on to %load-vector together with float-rep.
M: x86 %load-float
    c:float <ref>
    float-rep %load-vector ;

! Double precision: same scheme, using `c:double <ref>` and double-rep.
M: x86 %load-double
    c:double <ref>
    double-rep %load-vector ;
! Register-to-register copies for scalar float representations.
! The representation object on top of the stack is only used for dispatch:
! it is dropped and MOVAPS is emitted for both float-rep and double-rep.
M: float-rep copy-register* drop MOVAPS ;
M: double-rep copy-register* drop MOVAPS ;
! Copies of scalar floats that involve memory. The representation is
! dropped after dispatch; float-rep emits MOVSS and double-rep emits MOVSD.
M: float-rep copy-memory* drop MOVSS ;
M: double-rep copy-memory* drop MOVSD ;
! Scalar floating point arithmetic. Every method first runs
! `double-rep two-operand` and then emits the matching scalar-double
! instruction (ADDSD, SUBSD, MULSD, DIVSD, MINSD, MAXSD).
! NOTE(review): two-operand is defined outside this file; presumably it
! reduces ( dst src1 src2 ) to the destructive ( dst src ) form that the
! instruction expects -- confirm against cpu.x86.
M: x86 %add-float double-rep two-operand ADDSD ;
M: x86 %sub-float double-rep two-operand SUBSD ;
M: x86 %mul-float double-rep two-operand MULSD ;
M: x86 %div-float double-rep two-operand DIVSD ;
M: x86 %min-float double-rep two-operand MINSD ;
M: x86 %max-float double-rep two-operand MAXSD ;
! Zero dst by XORing it with itself, except when src and dst compare
! equal; in that case nothing is emitted.
:: %clear-unless-in-place ( dst src -- )
    src dst = [ dst dst XORPS ] unless ;
! Scalar precision conversions. The operand pair is duplicated so that
! %clear-unless-in-place can run on it first; the conversion instruction
! then consumes the original pair.
M: x86 %single>double-float
    2dup %clear-unless-in-place
    CVTSS2SD ;

M: x86 %double>single-float
    2dup %clear-unless-in-place
    CVTSD2SS ;
! On x86 this query always answers f.
M: x86 integer-float-needs-stack-frame? f ;
! Integer to scalar double. The destination is XORed with itself before
! CVTSI2SD is emitted on the original operand pair.
! NOTE(review): the clear is presumably there to break the dependency on
! the destination's previous contents -- confirm.
M: x86 %integer>float
    over dup XORPS
    CVTSI2SD ;
! Scalar double to integer: emits CVTTSD2SI directly on the operands.
M: x86 %float>integer CVTTSD2SI ;
! Scalar float comparisons. Each method only selects the comparison
! instruction, passed as a quotation, and delegates the rest to a helper
! defined outside this file: the "ordered" variants pass COMISD, the
! "unordered" variants pass UCOMISD.

! Boolean-producing forms, delegated to (%compare-float).
M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
    [ COMISD ] (%compare-float) ;

M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
    [ UCOMISD ] (%compare-float) ;

! Branching forms, delegated to (%compare-float-branch).
M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
    [ COMISD ] (%compare-float-branch) ;

M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
    [ UCOMISD ] (%compare-float-branch) ;
! Register-to-register copies for vector representations. The
! representation is dropped after dispatch. float-4-rep and double-2-rep
! emit MOVAPS; the method on vector-rep emits MOVDQA.
M: float-4-rep copy-register* drop MOVAPS ;
M: double-2-rep copy-register* drop MOVAPS ;
M: vector-rep copy-register* drop MOVDQA ;
57 MACRO: available-reps ( alist -- )
58 ! Each SSE version adds new representations and supports
60 unzip { } [ append ] accumulate rest swap suffix
61 [ [ 1quotation ] map ] bi@ zip
62 reverse [ { } ] suffix
65 M: x86 %alien-vector-reps
67 { sse? { float-4-rep } }
68 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
73 { double-2-rep [ dup XORPS ] }
74 { float-4-rep [ dup XORPS ] }
78 M: x86 %zero-vector-reps
80 { sse? { float-4-rep } }
81 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
86 { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
87 { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
91 M: x86 %fill-vector-reps
93 { sse? { float-4-rep } }
94 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
97 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
100 dst src1 float-4-rep %copy
106 dst src1 int-4-rep %copy
113 M: x86 %gather-vector-4-reps
115 ! Can't do this with sse1 since it will want to unbox
116 ! double-precision floats and convert to single precision
117 { sse2? { float-4-rep int-4-rep uint-4-rep } }
120 M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- )
122 dst src1 32-bit-version-of 0 PINSRD
123 dst src2 32-bit-version-of 1 PINSRD
124 dst src3 32-bit-version-of 2 PINSRD
125 dst src4 32-bit-version-of 3 PINSRD ;
127 M: x86 %gather-int-vector-4-reps
129 { sse4.1? { int-4-rep uint-4-rep } }
132 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
135 dst src1 double-2-rep %copy
139 dst src1 longlong-2-rep %copy
144 M: x86 %gather-vector-2-reps
146 { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } }
149 M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- )
154 M: x86.64 %gather-int-vector-2-reps
156 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
159 :: %select-vector-32 ( dst src n rep -- )
162 dst 32-bit-version-of src n PEXTRB
163 dst dst 8-bit-version-of MOVSX
166 dst 32-bit-version-of src n PEXTRB
169 dst 32-bit-version-of src n PEXTRW
170 dst dst 16-bit-version-of MOVSX
173 dst 32-bit-version-of src n PEXTRW
176 dst 32-bit-version-of src n PEXTRD
177 dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if
180 dst 32-bit-version-of src n PEXTRD
184 M: x86.32 %select-vector
187 M: x86.32 %select-vector-reps
189 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } }
192 M: x86.64 %select-vector
194 { longlong-2-rep [ PEXTRQ ] }
195 { ulonglong-2-rep [ PEXTRQ ] }
196 [ %select-vector-32 ]
199 M: x86.64 %select-vector-reps
201 { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } }
204 : sse1-float-4-shuffle ( dst shuffle -- )
206 { { 0 1 2 3 } [ drop ] }
207 { { 0 1 0 1 } [ dup MOVLHPS ] }
208 { { 2 3 2 3 } [ dup MOVHLPS ] }
209 { { 0 0 1 1 } [ dup UNPCKLPS ] }
210 { { 2 2 3 3 } [ dup UNPCKHPS ] }
214 : float-4-shuffle ( dst shuffle -- )
217 { { 0 0 2 2 } [ dup MOVSLDUP ] }
218 { { 1 1 3 3 } [ dup MOVSHDUP ] }
219 [ sse1-float-4-shuffle ]
221 ] [ sse1-float-4-shuffle ] if ;
223 : int-4-shuffle ( dst shuffle -- )
225 { { 0 1 2 3 } [ drop ] }
226 { { 0 0 1 1 } [ dup PUNPCKLDQ ] }
227 { { 2 2 3 3 } [ dup PUNPCKHDQ ] }
228 { { 0 1 0 1 } [ dup PUNPCKLQDQ ] }
229 { { 2 3 2 3 } [ dup PUNPCKHQDQ ] }
! Re-express a two-element shuffle as a four-element one and hand it to
! int-4-shuffle: element index i becomes the pair ( 2i, 2i+1 ).
:: longlong-2-shuffle ( dst shuffle -- )
    shuffle first2 :> ( lo hi )
    dst
    lo 2 * dup 1 +
    hi 2 * dup 1 +
    4array int-4-shuffle ;
! Widen a shuffle: every index i is replaced by the two consecutive
! indices 2i and 2i+1, and the pairs are concatenated in order.
: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle )
    [ 2 * dup 1 + 2array ] map
    concat ;
239 M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
241 dst shuffle rep signed-rep {
242 { double-2-rep [ >float-4-shuffle float-4-shuffle ] }
243 { float-4-rep [ float-4-shuffle ] }
244 { int-4-rep [ int-4-shuffle ] }
245 { longlong-2-rep [ longlong-2-shuffle ] }
248 M: x86 %shuffle-vector-imm-reps
250 { sse? { float-4-rep } }
251 { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
254 M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- )
255 dst src1 src2 rep two-operand
257 { double-2-rep [ >float-4-shuffle SHUFPS ] }
258 { float-4-rep [ SHUFPS ] }
261 M: x86 %shuffle-vector-halves-imm-reps
263 { sse? { float-4-rep } }
264 { sse2? { double-2-rep } }
267 M: x86 %shuffle-vector ( dst src shuffle rep -- )
270 M: x86 %shuffle-vector-reps
272 { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
275 M: x86 %merge-vector-head
278 { double-2-rep [ MOVLHPS ] }
279 { float-4-rep [ UNPCKLPS ] }
280 { longlong-2-rep [ PUNPCKLQDQ ] }
281 { int-4-rep [ PUNPCKLDQ ] }
282 { short-8-rep [ PUNPCKLWD ] }
283 { char-16-rep [ PUNPCKLBW ] }
286 M: x86 %merge-vector-tail
289 { double-2-rep [ UNPCKHPD ] }
290 { float-4-rep [ UNPCKHPS ] }
291 { longlong-2-rep [ PUNPCKHQDQ ] }
292 { int-4-rep [ PUNPCKHDQ ] }
293 { short-8-rep [ PUNPCKHWD ] }
294 { char-16-rep [ PUNPCKHBW ] }
297 M: x86 %merge-vector-reps
299 { sse? { float-4-rep } }
300 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
303 M: x86 %float-pack-vector
306 M: x86 %float-pack-vector-reps
308 { sse2? { double-2-rep } }
311 M: x86 %signed-pack-vector
314 { int-4-rep [ PACKSSDW ] }
315 { short-8-rep [ PACKSSWB ] }
318 M: x86 %signed-pack-vector-reps
320 { sse2? { short-8-rep int-4-rep } }
323 M: x86 %unsigned-pack-vector
326 { int-4-rep [ PACKUSDW ] }
327 { short-8-rep [ PACKUSWB ] }
330 M: x86 %unsigned-pack-vector-reps
332 { sse2? { short-8-rep } }
333 { sse4.1? { int-4-rep } }
336 M: x86 %tail>head-vector ( dst src rep -- )
338 { float-4-rep [ drop UNPCKHPD ] }
339 { double-2-rep [ drop UNPCKHPD ] }
340 [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ]
343 M: x86 %unpack-vector-head ( dst src rep -- )
345 { char-16-rep [ PMOVSXBW ] }
346 { uchar-16-rep [ PMOVZXBW ] }
347 { short-8-rep [ PMOVSXWD ] }
348 { ushort-8-rep [ PMOVZXWD ] }
349 { int-4-rep [ PMOVSXDQ ] }
350 { uint-4-rep [ PMOVZXDQ ] }
351 { float-4-rep [ CVTPS2PD ] }
354 M: x86 %unpack-vector-head-reps ( -- reps )
356 { sse2? { float-4-rep } }
357 { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
360 M: x86 %integer>float-vector ( dst src rep -- )
362 { int-4-rep [ CVTDQ2PS ] }
365 M: x86 %integer>float-vector-reps
367 { sse2? { int-4-rep } }
370 M: x86 %float>integer-vector ( dst src rep -- )
372 { float-4-rep [ CVTTPS2DQ ] }
375 M: x86 %float>integer-vector-reps
377 { sse2? { float-4-rep } }
! Choose between two quotations by representation: `double` is called when
! rep is double-2-rep (identity test with eq?), otherwise `single` is
! called. dst and src stay on the stack for the chosen quotation.
: (%compare-float-vector) ( dst src rep double single -- )
    [ [ double-2-rep eq? ] dip ] dip
    if ; inline
383 : %compare-float-vector ( dst src rep cc -- )
385 { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] }
386 { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] }
387 { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] }
388 { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] }
389 { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] }
390 { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] }
391 { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] }
392 { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] }
395 :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
396 rep signed-rep :> rep'
398 { longlong-2-rep [ int64 call ] }
399 { int-4-rep [ int32 call ] }
400 { short-8-rep [ int16 call ] }
401 { char-16-rep [ int8 call ] }
404 : %compare-int-vector ( dst src rep cc -- )
406 { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] }
407 { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] }
! Vector comparison entry point. two-operand runs first on
! ( dst src1 src2 rep ); its results are then followed by rep and cc.
! The float or integer emitter is chosen with float-vector-rep?.
M:: x86 %compare-vector ( dst src1 src2 rep cc -- )
    dst src1 src2 rep two-operand
    rep cc
    rep float-vector-rep?
    [ %compare-float-vector ]
    [ %compare-int-vector ] if ;
416 : %compare-vector-eq-reps ( -- reps )
418 { sse? { float-4-rep } }
419 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
420 { sse4.1? { longlong-2-rep ulonglong-2-rep } }
423 : %compare-vector-ord-reps ( -- reps )
425 { sse? { float-4-rep } }
426 { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } }
427 { sse4.2? { longlong-2-rep } }
430 M: x86 %compare-vector-reps
432 { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] }
433 [ drop %compare-vector-ord-reps ]
436 : %compare-float-vector-ccs ( cc -- ccs not? )
438 { cc< [ { { cc< f } } f ] }
439 { cc<= [ { { cc<= f } } f ] }
440 { cc> [ { { cc< t } } f ] }
441 { cc>= [ { { cc<= t } } f ] }
442 { cc= [ { { cc= f } } f ] }
443 { cc<> [ { { cc< f } { cc< t } } f ] }
444 { cc<>= [ { { cc<>= f } } f ] }
445 { cc/< [ { { cc/< f } } f ] }
446 { cc/<= [ { { cc/<= f } } f ] }
447 { cc/> [ { { cc/< t } } f ] }
448 { cc/>= [ { { cc/<= t } } f ] }
449 { cc/= [ { { cc/= f } } f ] }
450 { cc/<> [ { { cc/= f } { cc/<>= f } } f ] }
451 { cc/<>= [ { { cc/<>= f } } f ] }
454 : %compare-int-vector-ccs ( cc -- ccs not? )
456 { cc< [ { { cc> t } } f ] }
457 { cc<= [ { { cc> f } } t ] }
458 { cc> [ { { cc> f } } f ] }
459 { cc>= [ { { cc> t } } t ] }
460 { cc= [ { { cc= f } } f ] }
461 { cc/= [ { { cc= f } } t ] }
! Pick the condition-code table by representation. The second-from-top
! stack item is tested with float-vector-rep? and consumed; the top item
! is handed to the float or integer table word.
! NOTE(review): the inputs look like ( rep cc ) -- confirm against the
! generic's declaration.
M: x86 %compare-vector-ccs
    [ float-vector-rep? ] dip swap
    [ %compare-float-vector-ccs ]
    [ %compare-int-vector-ccs ] if ;
471 :: %test-vector-mask ( dst temp mask vcc -- )
473 { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] }
474 { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] }
475 { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] }
476 { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] }
479 : (%move-vector-mask) ( dst src rep -- mask )
481 { double-2-rep [ MOVMSKPS 0xf ] }
482 { float-4-rep [ MOVMSKPS 0xf ] }
483 [ drop PMOVMSKB 0xffff ]
! Runs (%move-vector-mask) on the operands and discards the mask value it
! leaves on the stack.
M: x86 %move-vector-mask ( dst src rep -- )
    (%move-vector-mask) drop ;
489 M: x86 %move-vector-mask-reps
491 { sse? { float-4-rep } }
492 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
! Boolean-producing vector test. (%move-vector-mask) runs on
! ( dst src rep ) and leaves a mask value, which is passed together with
! dst, temp and vcc to %test-vector-mask.
M:: x86 %test-vector ( dst src temp rep vcc -- )
    dst temp
    dst src rep (%move-vector-mask)
    vcc %test-vector-mask ;
499 :: %test-vector-mask-branch ( label temp mask vcc -- )
501 { vcc-any [ temp temp TEST label JNE ] }
502 { vcc-none [ temp temp TEST label JE ] }
503 { vcc-all [ temp mask CMP label JE ] }
504 { vcc-notall [ temp mask CMP label JNE ] }
! Branching vector test. (%move-vector-mask) runs with temp as its
! destination; the mask value it leaves is passed together with label,
! temp and vcc to %test-vector-mask-branch.
M:: x86 %test-vector-branch ( label src temp rep vcc -- )
    label temp
    temp src rep (%move-vector-mask)
    vcc %test-vector-mask-branch ;
511 M: x86 %test-vector-reps
513 { sse? { float-4-rep } }
514 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
517 M: x86 %add-vector ( dst src1 src2 rep -- )
520 { float-4-rep [ ADDPS ] }
521 { double-2-rep [ ADDPD ] }
522 { char-16-rep [ PADDB ] }
523 { uchar-16-rep [ PADDB ] }
524 { short-8-rep [ PADDW ] }
525 { ushort-8-rep [ PADDW ] }
526 { int-4-rep [ PADDD ] }
527 { uint-4-rep [ PADDD ] }
528 { longlong-2-rep [ PADDQ ] }
529 { ulonglong-2-rep [ PADDQ ] }
532 M: x86 %add-vector-reps
534 { sse? { float-4-rep } }
535 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
538 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
541 { char-16-rep [ PADDSB ] }
542 { uchar-16-rep [ PADDUSB ] }
543 { short-8-rep [ PADDSW ] }
544 { ushort-8-rep [ PADDUSW ] }
547 M: x86 %saturated-add-vector-reps
549 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
552 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
555 { float-4-rep [ ADDSUBPS ] }
556 { double-2-rep [ ADDSUBPD ] }
559 M: x86 %add-sub-vector-reps
561 { sse3? { float-4-rep double-2-rep } }
564 M: x86 %sub-vector ( dst src1 src2 rep -- )
567 { float-4-rep [ SUBPS ] }
568 { double-2-rep [ SUBPD ] }
569 { char-16-rep [ PSUBB ] }
570 { uchar-16-rep [ PSUBB ] }
571 { short-8-rep [ PSUBW ] }
572 { ushort-8-rep [ PSUBW ] }
573 { int-4-rep [ PSUBD ] }
574 { uint-4-rep [ PSUBD ] }
575 { longlong-2-rep [ PSUBQ ] }
576 { ulonglong-2-rep [ PSUBQ ] }
579 M: x86 %sub-vector-reps
581 { sse? { float-4-rep } }
582 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
585 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
588 { char-16-rep [ PSUBSB ] }
589 { uchar-16-rep [ PSUBUSB ] }
590 { short-8-rep [ PSUBSW ] }
591 { ushort-8-rep [ PSUBUSW ] }
594 M: x86 %saturated-sub-vector-reps
596 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
599 M: x86 %mul-vector ( dst src1 src2 rep -- )
602 { float-4-rep [ MULPS ] }
603 { double-2-rep [ MULPD ] }
604 { short-8-rep [ PMULLW ] }
605 { ushort-8-rep [ PMULLW ] }
606 { int-4-rep [ PMULLD ] }
607 { uint-4-rep [ PMULLD ] }
610 M: x86 %mul-vector-reps
612 { sse? { float-4-rep } }
613 { sse2? { double-2-rep short-8-rep ushort-8-rep } }
614 { sse4.1? { int-4-rep uint-4-rep } }
617 M: x86 %mul-high-vector ( dst src1 src2 rep -- )
620 { short-8-rep [ PMULHW ] }
621 { ushort-8-rep [ PMULHUW ] }
624 M: x86 %mul-high-vector-reps
626 { sse2? { short-8-rep ushort-8-rep } }
629 M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- )
632 { char-16-rep [ PMADDUBSW ] }
633 { uchar-16-rep [ PMADDUBSW ] }
634 { short-8-rep [ PMADDWD ] }
637 M: x86 %mul-horizontal-add-vector-reps
639 { sse2? { short-8-rep } }
640 { ssse3? { char-16-rep uchar-16-rep } }
643 M: x86 %div-vector ( dst src1 src2 rep -- )
646 { float-4-rep [ DIVPS ] }
647 { double-2-rep [ DIVPD ] }
650 M: x86 %div-vector-reps
652 { sse? { float-4-rep } }
653 { sse2? { double-2-rep } }
656 M: x86 %min-vector ( dst src1 src2 rep -- )
659 { char-16-rep [ PMINSB ] }
660 { uchar-16-rep [ PMINUB ] }
661 { short-8-rep [ PMINSW ] }
662 { ushort-8-rep [ PMINUW ] }
663 { int-4-rep [ PMINSD ] }
664 { uint-4-rep [ PMINUD ] }
665 { float-4-rep [ MINPS ] }
666 { double-2-rep [ MINPD ] }
669 M: x86 %min-vector-reps
671 { sse? { float-4-rep } }
672 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
673 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
676 M: x86 %max-vector ( dst src1 src2 rep -- )
679 { char-16-rep [ PMAXSB ] }
680 { uchar-16-rep [ PMAXUB ] }
681 { short-8-rep [ PMAXSW ] }
682 { ushort-8-rep [ PMAXUW ] }
683 { int-4-rep [ PMAXSD ] }
684 { uint-4-rep [ PMAXUD ] }
685 { float-4-rep [ MAXPS ] }
686 { double-2-rep [ MAXPD ] }
689 M: x86 %max-vector-reps
691 { sse? { float-4-rep } }
692 { sse2? { uchar-16-rep short-8-rep double-2-rep } }
693 { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
696 M: x86 %avg-vector ( dst src1 src2 rep -- )
699 { uchar-16-rep [ PAVGB ] }
700 { ushort-8-rep [ PAVGW ] }
703 M: x86 %avg-vector-reps
705 { sse2? { uchar-16-rep ushort-8-rep } }
711 { float-4-rep [ 0xff DPPS ] }
712 { double-2-rep [ 0xff DPPD ] }
715 M: x86 %dot-vector-reps
717 { sse4.1? { float-4-rep double-2-rep } }
723 { uchar-16-rep [ PSADBW ] }
726 M: x86 %sad-vector-reps
728 { sse2? { uchar-16-rep } }
731 M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
734 { float-4-rep [ HADDPS ] }
735 { double-2-rep [ HADDPD ] }
736 { int-4-rep [ PHADDD ] }
737 { short-8-rep [ PHADDW ] }
740 M: x86 %horizontal-add-vector-reps
742 { sse3? { float-4-rep double-2-rep } }
743 { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } }
746 M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- )
749 M: x86 %horizontal-shl-vector-imm-reps
751 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
754 M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- )
757 M: x86 %horizontal-shr-vector-imm-reps
759 { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
762 M: x86 %abs-vector ( dst src rep -- )
764 { char-16-rep [ PABSB ] }
765 { short-8-rep [ PABSW ] }
766 { int-4-rep [ PABSD ] }
769 M: x86 %abs-vector-reps
771 { ssse3? { char-16-rep short-8-rep int-4-rep } }
774 M: x86 %sqrt-vector ( dst src rep -- )
776 { float-4-rep [ SQRTPS ] }
777 { double-2-rep [ SQRTPD ] }
780 M: x86 %sqrt-vector-reps
782 { sse? { float-4-rep } }
783 { sse2? { double-2-rep } }
786 M: x86 %and-vector ( dst src1 src2 rep -- )
789 { float-4-rep [ ANDPS ] }
790 { double-2-rep [ ANDPS ] }
794 M: x86 %and-vector-reps
796 { sse? { float-4-rep } }
797 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
800 M: x86 %andn-vector ( dst src1 src2 rep -- )
803 { float-4-rep [ ANDNPS ] }
804 { double-2-rep [ ANDNPS ] }
808 M: x86 %andn-vector-reps
810 { sse? { float-4-rep } }
811 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
814 M: x86 %or-vector ( dst src1 src2 rep -- )
817 { float-4-rep [ ORPS ] }
818 { double-2-rep [ ORPS ] }
822 M: x86 %or-vector-reps
824 { sse? { float-4-rep } }
825 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
828 M: x86 %xor-vector ( dst src1 src2 rep -- )
831 { float-4-rep [ XORPS ] }
832 { double-2-rep [ XORPS ] }
836 M: x86 %xor-vector-reps
838 { sse? { float-4-rep } }
839 { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
842 M: x86 %shl-vector ( dst src1 src2 rep -- )
845 { short-8-rep [ PSLLW ] }
846 { ushort-8-rep [ PSLLW ] }
847 { int-4-rep [ PSLLD ] }
848 { uint-4-rep [ PSLLD ] }
849 { longlong-2-rep [ PSLLQ ] }
850 { ulonglong-2-rep [ PSLLQ ] }
853 M: x86 %shl-vector-reps
855 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
858 M: x86 %shr-vector ( dst src1 src2 rep -- )
861 { short-8-rep [ PSRAW ] }
862 { ushort-8-rep [ PSRLW ] }
863 { int-4-rep [ PSRAD ] }
864 { uint-4-rep [ PSRLD ] }
865 { ulonglong-2-rep [ PSRLQ ] }
868 M: x86 %shr-vector-reps
870 { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
! The immediate-count shift methods forward unchanged to the
! corresponding non-immediate words, and their -reps queries forward to
! the corresponding non-immediate -reps words.
M: x86 %shl-vector-imm %shl-vector ;
M: x86 %shl-vector-imm-reps %shl-vector-reps ;
M: x86 %shr-vector-imm %shr-vector ;
M: x86 %shr-vector-imm-reps %shr-vector-reps ;
! Multiply rep-size of the representation by 8 and pass the product, with
! the register, to n-bit-version-of.
:: scalar-sized-reg ( reg rep -- reg' )
    reg
    rep rep-size 8 *
    n-bit-version-of ;
! The top stack item is dropped and MOVD is emitted on the remaining
! operands.
M: x86 %integer>scalar drop MOVD ;
883 :: %scalar>integer-32 ( dst src rep -- )
886 dst 32-bit-version-of src MOVD
887 dst dst 32-bit-version-of
888 2dup eq? [ 2drop ] [ MOVSX ] if
891 dst 32-bit-version-of src MOVD
894 dst 32-bit-version-of src MOVD
895 dst dst 16-bit-version-of MOVSX
897 { ushort-scalar-rep [
898 dst 32-bit-version-of src MOVD
899 dst dst 16-bit-version-of MOVZX
902 dst 32-bit-version-of src MOVD
903 dst { } 8 [| tmp-dst |
904 tmp-dst dst int-rep %copy
905 tmp-dst tmp-dst 8-bit-version-of MOVSX
906 dst tmp-dst int-rep %copy
907 ] with-small-register
910 dst 32-bit-version-of src MOVD
911 dst { } 8 [| tmp-dst |
912 tmp-dst dst int-rep %copy
913 tmp-dst tmp-dst 8-bit-version-of MOVZX
914 dst tmp-dst int-rep %copy
915 ] with-small-register
! On x86.32 every representation is handled by %scalar>integer-32.
M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ;
921 M: x86.64 %scalar>integer ( dst src rep -- )
923 { longlong-scalar-rep [ MOVD ] }
924 { ulonglong-scalar-rep [ MOVD ] }
925 [ %scalar>integer-32 ]
! Both directions of the vector/scalar conversion are implemented as a
! plain %copy.
M: x86 %vector>scalar %copy ;

M: x86 %scalar>vector %copy ;
! Top-level call, executed when this file is loaded.
! NOTE(review): enable-float-intrinsics is defined elsewhere (presumably in
! compiler.cfg.intrinsics, which this file imports) -- confirm what it
! registers.
enable-float-intrinsics