basis/cpu/x86/x86.factor

   1 ! Copyright (C) 2005, 2009 Slava Pestov.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors assocs alien alien.c-types arrays strings
   4 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
   5 cpu.x86.features cpu.x86.features.private cpu.architecture kernel
   6 kernel.private math memory namespaces make sequences words system
   7 layouts combinators math.order fry locals compiler.constants
   8 byte-arrays io macros quotations compiler compiler.units init vm
   9 compiler.cfg.registers
  10 compiler.cfg.instructions
  11 compiler.cfg.intrinsics
  12 compiler.cfg.comparisons
  13 compiler.cfg.stack-frame
  14 compiler.codegen.fixup ;
  15 FROM: layouts => cell ;
  16 FROM: math => float ;
  17 IN: cpu.x86
  18
  19 ! Add some methods to the assembler to be more useful to the backend
  20 M: label JMP 0 JMP rc-relative label-fixup ;
  21 M: label JUMPcc [ 0 ] dip JUMPcc rc-relative label-fixup ;
  22
  23 M: x86 vector-regs float-regs ;
  24
  25 HOOK: stack-reg cpu ( -- reg )
  26
  27 HOOK: reserved-stack-space cpu ( -- n )
  28
  29 HOOK: extra-stack-space cpu ( stack-frame -- n )
  30
  31 : stack@ ( n -- op ) stack-reg swap [+] ;
  32
  33 : special@ ( n -- op )
  34     stack-frame get extra-stack-space +
  35     reserved-stack-space +
  36     stack@ ;
  37
  38 : spill@ ( n -- op ) spill-offset special@ ;
  39
  40 : gc-root@ ( n -- op ) gc-root-offset special@ ;
  41
  42 : decr-stack-reg ( n -- )
  43     dup 0 = [ drop ] [ stack-reg swap SUB ] if ;
  44
  45 : incr-stack-reg ( n -- )
  46     dup 0 = [ drop ] [ stack-reg swap ADD ] if ;
  47
  48 : align-stack ( n -- n' )
  49     os macosx? cpu x86.64? or [ 16 align ] when ;
  50
  51 M: x86 stack-frame-size ( stack-frame -- i )
  52     [ (stack-frame-size) ]
  53     [ extra-stack-space ] bi +
  54     reserved-stack-space +
  55     3 cells +
  56     align-stack ;
  57
  58 ! Must be a volatile register not used for parameter passing, for safe
  59 ! use in calls in and out of C
  60 HOOK: temp-reg cpu ( -- reg )
  61
  62 HOOK: pic-tail-reg cpu ( -- reg )
  63
  64 M: x86 %load-immediate dup 0 = [ drop dup XOR ] [ MOV ] if ;
  65
  66 M: x86 %load-reference swap 0 MOV rc-absolute-cell rel-immediate ;
  67
  68 HOOK: ds-reg cpu ( -- reg )
  69 HOOK: rs-reg cpu ( -- reg )
  70
  71 : reg-stack ( n reg -- op ) swap cells neg [+] ;
  72
  73 GENERIC: loc>operand ( loc -- operand )
  74
  75 M: ds-loc loc>operand n>> ds-reg reg-stack ;
  76 M: rs-loc loc>operand n>> rs-reg reg-stack ;
  77
  78 M: x86 %peek loc>operand MOV ;
  79 M: x86 %replace loc>operand swap MOV ;
  80 : (%inc) ( n reg -- ) swap cells dup 0 > [ ADD ] [ neg SUB ] if ; inline
  81 M: x86 %inc-d ( n -- ) ds-reg (%inc) ;
  82 M: x86 %inc-r ( n -- ) rs-reg (%inc) ;
  83
  84 M: x86 %call ( word -- ) 0 CALL rc-relative rel-word-pic ;
  85
  86 : xt-tail-pic-offset ( -- n )
  87     #! See the comment in vm/cpu-x86.hpp
  88     cell 4 + 1 + ; inline
  89
  90 M: x86 %jump ( word -- )
  91     pic-tail-reg 0 MOV xt-tail-pic-offset rc-absolute-cell rel-here
  92     0 JMP rc-relative rel-word-pic-tail ;
  93
  94 M: x86 %jump-label ( label -- ) 0 JMP rc-relative label-fixup ;
  95
  96 M: x86 %return ( -- ) 0 RET ;
  97
  98 : code-alignment ( align -- n )
  99     [ building get length dup ] dip align swap - ;
 100
 101 : align-code ( n -- )
 102     0 <repetition> % ;
 103
 104 :: (%slot-imm) ( obj slot tag -- op )
 105     obj slot tag slot-offset [+] ; inline
 106
 107 M: x86 %slot ( dst obj slot -- ) [+] MOV ;
 108 M: x86 %slot-imm ( dst obj slot tag -- ) (%slot-imm) MOV ;
 109 M: x86 %set-slot ( src obj slot -- ) [+] swap MOV ;
 110 M: x86 %set-slot-imm ( src obj slot tag -- ) (%slot-imm) swap MOV ;
 111
 112 :: two-operand ( dst src1 src2 rep -- dst src )
 113     dst src2 eq? dst src1 eq? not and [ "Cannot handle this case" throw ] when
 114     dst src1 rep %copy
 115     dst src2 ; inline
 116
 117 :: one-operand ( dst src rep -- dst )
 118     dst src rep %copy
 119     dst ; inline
 120
 121 M: x86 %add     2over eq? [ nip ADD ] [ [+] LEA ] if ;
 122 M: x86 %add-imm 2over eq? [ nip ADD ] [ [+] LEA ] if ;
 123 M: x86 %sub     int-rep two-operand SUB ;
 124 M: x86 %sub-imm 2over eq? [ nip SUB ] [ neg [+] LEA ] if ;
 125 M: x86 %mul     int-rep two-operand swap IMUL2 ;
 126 M: x86 %mul-imm IMUL3 ;
 127 M: x86 %and     int-rep two-operand AND ;
 128 M: x86 %and-imm int-rep two-operand AND ;
 129 M: x86 %or      int-rep two-operand OR ;
 130 M: x86 %or-imm  int-rep two-operand OR ;
 131 M: x86 %xor     int-rep two-operand XOR ;
 132 M: x86 %xor-imm int-rep two-operand XOR ;
 133 M: x86 %shl-imm int-rep two-operand SHL ;
 134 M: x86 %shr-imm int-rep two-operand SHR ;
 135 M: x86 %sar-imm int-rep two-operand SAR ;
 136
 137 M: x86 %min     int-rep two-operand [ CMP ] [ CMOVG ] 2bi ;
 138 M: x86 %max     int-rep two-operand [ CMP ] [ CMOVL ] 2bi ;
 139
 140 M: x86 %not     int-rep one-operand NOT ;
 141 M: x86 %neg     int-rep one-operand NEG ;
 142 M: x86 %log2    BSR ;
 143
 144 GENERIC: copy-register* ( dst src rep -- )
 145
 146 M: int-rep copy-register* drop MOV ;
 147 M: tagged-rep copy-register* drop MOV ;
 148 M: float-rep copy-register* drop MOVSS ;
 149 M: double-rep copy-register* drop MOVSD ;
 150 M: float-4-rep copy-register* drop MOVUPS ;
 151 M: double-2-rep copy-register* drop MOVUPD ;
 152 M: vector-rep copy-register* drop MOVDQU ;
 153
 154 M: x86 %copy ( dst src rep -- )
 155     2over eq? [ 3drop ] [
 156         [ [ dup spill-slot? [ n>> spill@ ] when ] bi@ ] dip
 157         copy-register*
 158     ] if ;
 159
 160 M: x86 %fixnum-add ( label dst src1 src2 -- )
 161     int-rep two-operand ADD JO ;
 162
 163 M: x86 %fixnum-sub ( label dst src1 src2 -- )
 164     int-rep two-operand SUB JO ;
 165
 166 M: x86 %fixnum-mul ( label dst src1 src2 -- )
 167     int-rep two-operand swap IMUL2 JO ;
 168
 169 M: x86 %unbox-alien ( dst src -- )
 170     alien-offset [+] MOV ;
 171
 172 M:: x86 %unbox-any-c-ptr ( dst src temp -- )
 173     [
 174         { "is-byte-array" "end" "start" } [ define-label ] each
 175         dst 0 MOV
 176         temp src MOV
 177         ! We come back here with displaced aliens
 178         "start" resolve-label
 179         ! Is the object f?
 180         temp \ f tag-number CMP
 181         "end" get JE
 182         ! Is the object an alien?
 183         temp header-offset [+] alien type-number tag-fixnum CMP
 184         "is-byte-array" get JNE
 185         ! If so, load the offset and add it to the address
 186         dst temp alien-offset [+] ADD
 187         ! Now recurse on the underlying alien
 188         temp temp underlying-alien-offset [+] MOV
 189         "start" get JMP
 190         "is-byte-array" resolve-label
 191         ! Add byte array address to address being computed
 192         dst temp ADD
 193         ! Add an offset to start of byte array's data
 194         dst byte-array-offset ADD
 195         "end" resolve-label
 196     ] with-scope ;
 197
 198 : alien@ ( reg n -- op ) cells alien tag-number - [+] ;
 199
 200 :: %allot-alien ( dst displacement base temp -- )
 201     dst 4 cells alien temp %allot
 202     dst 1 alien@ base MOV ! alien
 203     dst 2 alien@ \ f tag-number MOV ! expired
 204     dst 3 alien@ displacement MOV ! displacement
 205     ;
 206
 207 M:: x86 %box-alien ( dst src temp -- )
 208     [
 209         "end" define-label
 210         dst \ f tag-number MOV
 211         src 0 CMP
 212         "end" get JE
 213         dst src \ f tag-number temp %allot-alien
 214         "end" resolve-label
 215     ] with-scope ;
 216
 217 M:: x86 %box-displaced-alien ( dst displacement base displacement' base' base-class -- )
 218     [
 219         "end" define-label
 220         "ok" define-label
 221         ! If displacement is zero, return the base
 222         dst base MOV
 223         displacement 0 CMP
 224         "end" get JE
 225         ! Quickly use displacement' before its needed for real, as allot temporary
 226         dst 4 cells alien displacement' %allot
 227         ! If base is already a displaced alien, unpack it
 228         base' base MOV
 229         displacement' displacement MOV
 230         base \ f tag-number CMP
 231         "ok" get JE
 232         base header-offset [+] alien type-number tag-fixnum CMP
 233         "ok" get JNE
 234         ! displacement += base.displacement
 235         displacement' base 3 alien@ ADD
 236         ! base = base.base
 237         base' base 1 alien@ MOV
 238         "ok" resolve-label
 239         dst 1 alien@ base' MOV ! alien
 240         dst 2 alien@ \ f tag-number MOV ! expired
 241         dst 3 alien@ displacement' MOV ! displacement
 242         "end" resolve-label
 243     ] with-scope ;
 244
 245 ! The 'small-reg' mess is pretty crappy, but its only used on x86-32.
 246 ! On x86-64, all registers have 8-bit versions. However, a similar
 247 ! problem arises for shifts, where the shift count must be in CL, and
 248 ! so one day I will fix this properly by adding precoloring to the
 249 ! register allocator.
 250
 251 HOOK: has-small-reg? cpu ( reg size -- ? )
 252
 253 CONSTANT: have-byte-regs { EAX ECX EDX EBX }
 254
 255 M: x86.32 has-small-reg?
 256     {
 257         { 8 [ have-byte-regs memq? ] }
 258         { 16 [ drop t ] }
 259         { 32 [ drop t ] }
 260     } case ;
 261
 262 M: x86.64 has-small-reg? 2drop t ;
 263
 264 : small-reg-that-isn't ( exclude -- reg' )
 265     [ have-byte-regs ] dip
 266     [ native-version-of ] map
 267     '[ _ memq? not ] find nip ;
 268
 269 : with-save/restore ( reg quot -- )
 270     [ drop PUSH ] [ call ] [ drop POP ] 2tri ; inline
 271
 272 :: with-small-register ( dst exclude size quot: ( new-dst -- ) -- )
 273     ! If the destination register overlaps a small register with
 274     ! 'size' bits, we call the quot with that. Otherwise, we find a
 275     ! small register that is not in exclude, and call quot, saving and
 276     ! restoring the small register.
 277     dst size has-small-reg? [ dst quot call ] [
 278         exclude small-reg-that-isn't
 279         [ quot call ] with-save/restore
 280     ] if ; inline
 281
 282 M:: x86 %string-nth ( dst src index temp -- )
 283     ! We request a small-reg of size 8 since those of size 16 are
 284     ! a superset.
 285     "end" define-label
 286     dst { src index temp } 8 [| new-dst |
 287         ! Load the least significant 7 bits into new-dst.
 288         ! 8th bit indicates whether we have to load from
 289         ! the aux vector or not.
 290         temp src index [+] LEA
 291         new-dst 8-bit-version-of temp string-offset [+] MOV
 292         new-dst new-dst 8-bit-version-of MOVZX
 293         ! Do we have to look at the aux vector?
 294         new-dst HEX: 80 CMP
 295         "end" get JL
 296         ! Yes, this is a non-ASCII character. Load aux vector
 297         temp src string-aux-offset [+] MOV
 298         new-dst temp XCHG
 299         ! Compute index
 300         new-dst index ADD
 301         new-dst index ADD
 302         ! Load high 16 bits
 303         new-dst 16-bit-version-of new-dst byte-array-offset [+] MOV
 304         new-dst new-dst 16-bit-version-of MOVZX
 305         new-dst 7 SHL
 306         ! Compute code point
 307         new-dst temp XOR
 308         "end" resolve-label
 309         dst new-dst int-rep %copy
 310     ] with-small-register ;
 311
 312 M:: x86 %set-string-nth-fast ( ch str index temp -- )
 313     ch { index str temp } 8 [| new-ch |
 314         new-ch ch int-rep %copy
 315         temp str index [+] LEA
 316         temp string-offset [+] new-ch 8-bit-version-of MOV
 317     ] with-small-register ;
 318
 319 :: %alien-integer-getter ( dst src offset size quot -- )
 320     dst { src } size [| new-dst |
 321         new-dst dup size n-bit-version-of dup src offset [+] MOV
 322         quot call
 323         dst new-dst int-rep %copy
 324     ] with-small-register ; inline
 325
 326 : %alien-unsigned-getter ( dst src offset size -- )
 327     [ MOVZX ] %alien-integer-getter ; inline
 328
 329 : %alien-signed-getter ( dst src offset size -- )
 330     [ MOVSX ] %alien-integer-getter ; inline
 331
 332 :: %alien-integer-setter ( ptr offset value size -- )
 333     value { ptr } size [| new-value |
 334         new-value value int-rep %copy
 335         ptr offset [+] new-value size n-bit-version-of MOV
 336     ] with-small-register ; inline
 337
 338 M: x86 %alien-unsigned-1 8 %alien-unsigned-getter ;
 339 M: x86 %alien-unsigned-2 16 %alien-unsigned-getter ;
 340 M: x86 %alien-unsigned-4 32 [ 2drop ] %alien-integer-getter ;
 341
 342 M: x86 %alien-signed-1 8 %alien-signed-getter ;
 343 M: x86 %alien-signed-2 16 %alien-signed-getter ;
 344 M: x86 %alien-signed-4 32 %alien-signed-getter ;
 345
 346 M: x86 %alien-cell [+] MOV ;
 347 M: x86 %alien-float [+] MOVSS ;
 348 M: x86 %alien-double [+] MOVSD ;
 349 M: x86 %alien-vector [ [+] ] dip %copy ;
 350
 351 M: x86 %set-alien-integer-1 8 %alien-integer-setter ;
 352 M: x86 %set-alien-integer-2 16 %alien-integer-setter ;
 353 M: x86 %set-alien-integer-4 32 %alien-integer-setter ;
 354 M: x86 %set-alien-cell [ [+] ] dip MOV ;
 355 M: x86 %set-alien-float [ [+] ] dip MOVSS ;
 356 M: x86 %set-alien-double [ [+] ] dip MOVSD ;
 357 M: x86 %set-alien-vector [ [+] ] 2dip %copy ;
 358
 359 : shift-count? ( reg -- ? ) { ECX RCX } memq? ;
 360
 361 :: emit-shift ( dst src quot -- )
 362     src shift-count? [
 363         dst CL quot call
 364     ] [
 365         dst shift-count? [
 366             dst src XCHG
 367             src CL quot call
 368             dst src XCHG
 369         ] [
 370             ECX native-version-of [
 371                 CL src MOV
 372                 drop dst CL quot call
 373             ] with-save/restore
 374         ] if
 375     ] if ; inline
 376
 377 M: x86 %shl int-rep two-operand [ SHL ] emit-shift ;
 378 M: x86 %shr int-rep two-operand [ SHR ] emit-shift ;
 379 M: x86 %sar int-rep two-operand [ SAR ] emit-shift ;
 380
 381 : %mov-vm-ptr ( reg -- )
 382     0 MOV 0 rc-absolute-cell rel-vm ;
 383
 384 M: x86 %vm-field-ptr ( dst field -- )
 385     [ 0 MOV ] dip vm-field-offset rc-absolute-cell rel-vm ;
 386
 387 : load-allot-ptr ( nursery-ptr allot-ptr -- )
 388     [ drop "nursery" %vm-field-ptr ] [ swap [] MOV ] 2bi ;
 389
 390 : inc-allot-ptr ( nursery-ptr n -- )
 391     [ [] ] dip 8 align ADD ;
 392
 393 : store-header ( temp class -- )
 394     [ [] ] [ type-number tag-fixnum ] bi* MOV ;
 395
 396 : store-tagged ( dst tag -- )
 397     tag-number OR ;
 398
 399 M:: x86 %allot ( dst size class nursery-ptr -- )
 400     nursery-ptr dst load-allot-ptr
 401     dst class store-header
 402     dst class store-tagged
 403     nursery-ptr size inc-allot-ptr ;
 404
 405 HOOK: %mark-card cpu ( card temp -- )
 406 HOOK: %mark-deck cpu ( card temp -- )
 407
 408 :: (%write-barrier) ( src slot temp1 temp2 -- )
 409     temp1 src slot [+] LEA
 410     temp1 card-bits SHR
 411     temp1 temp2 %mark-card
 412     temp1 deck-bits card-bits - SHR
 413     temp1 temp2 %mark-deck ;
 414
 415 M: x86 %write-barrier ( src slot temp1 temp2 -- ) (%write-barrier) ;
 416
 417 M: x86 %write-barrier-imm ( src slot temp1 temp2 -- ) (%write-barrier) ;
 418
 419 M:: x86 %check-nursery ( label size temp1 temp2 -- )
 420     temp1 "nursery" %vm-field-ptr
 421     ! Load 'here' into temp2
 422     temp2 temp1 [] MOV
 423     temp2 size ADD
 424     ! Load 'end' into temp1
 425     temp1 temp1 2 cells [+] MOV
 426     temp2 temp1 CMP
 427     label JLE ;
 428
 429 M: x86 %save-gc-root ( gc-root register -- ) [ gc-root@ ] dip MOV ;
 430
 431 M: x86 %load-gc-root ( gc-root register -- ) swap gc-root@ MOV ;
 432
 433 M: x86 %alien-global ( dst symbol library -- )
 434     [ 0 MOV ] 2dip rc-absolute-cell rel-dlsym ;
 435
 436 M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
 437
 438 :: %boolean ( dst temp word -- )
 439     dst \ f tag-number MOV
 440     temp 0 MOV \ t rc-absolute-cell rel-immediate
 441     dst temp word execute ; inline
 442
 443 : (%compare) ( src1 src2 cc -- )
 444     2over [ { cc= cc/= } member? ] [ register? ] [ 0 = ] tri* and and
 445     [ drop dup TEST ]
 446     [ CMP ] if ;
 447
 448 M:: x86 %compare ( dst src1 src2 cc temp -- )
 449     src1 src2 cc (%compare)
 450     cc order-cc {
 451         { cc<  [ dst temp \ CMOVL %boolean ] }
 452         { cc<= [ dst temp \ CMOVLE %boolean ] }
 453         { cc>  [ dst temp \ CMOVG %boolean ] }
 454         { cc>= [ dst temp \ CMOVGE %boolean ] }
 455         { cc=  [ dst temp \ CMOVE %boolean ] }
 456         { cc/= [ dst temp \ CMOVNE %boolean ] }
 457     } case ;
 458
 459 M: x86 %compare-imm ( dst src1 src2 cc temp -- )
 460     %compare ;
 461
 462 M:: x86 %compare-branch ( label src1 src2 cc -- )
 463     src1 src2 cc (%compare)
 464     cc order-cc {
 465         { cc<  [ label JL ] }
 466         { cc<= [ label JLE ] }
 467         { cc>  [ label JG ] }
 468         { cc>= [ label JGE ] }
 469         { cc=  [ label JE ] }
 470         { cc/= [ label JNE ] }
 471     } case ;
 472
 473 M: x86 %compare-imm-branch ( label src1 src2 cc -- )
 474     %compare-branch ;
 475
 476 M: x86 %add-float double-rep two-operand ADDSD ;
 477 M: x86 %sub-float double-rep two-operand SUBSD ;
 478 M: x86 %mul-float double-rep two-operand MULSD ;
 479 M: x86 %div-float double-rep two-operand DIVSD ;
 480 M: x86 %min-float double-rep two-operand MINSD ;
 481 M: x86 %max-float double-rep two-operand MAXSD ;
 482 M: x86 %sqrt SQRTSD ;
 483
 484 M: x86 %single>double-float CVTSS2SD ;
 485 M: x86 %double>single-float CVTSD2SS ;
 486
 487 M: x86 %integer>float CVTSI2SD ;
 488 M: x86 %float>integer CVTTSD2SI ;
 489
 490 : %cmov-float= ( dst src -- )
 491     [
 492         "no-move" define-label
 493
 494         "no-move" get [ JNE ] [ JP ] bi
 495         MOV
 496         "no-move" resolve-label
 497     ] with-scope ;
 498
 499 : %cmov-float/= ( dst src -- )
 500     [
 501         "no-move" define-label
 502         "move" define-label
 503
 504         "move" get JP
 505         "no-move" get JE
 506         "move" resolve-label
 507         MOV
 508         "no-move" resolve-label
 509     ] with-scope ;
 510
 511 :: (%compare-float) ( dst src1 src2 cc temp compare -- )
 512     cc {
 513         { cc<    [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA  %boolean ] }
 514         { cc<=   [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
 515         { cc>    [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA  %boolean ] }
 516         { cc>=   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
 517         { cc=    [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= %boolean ] }
 518         { cc<>   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE %boolean ] }
 519         { cc<>=  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP %boolean ] }
 520         { cc/<   [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
 521         { cc/<=  [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB  %boolean ] }
 522         { cc/>   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
 523         { cc/>=  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB  %boolean ] }
 524         { cc/=   [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= %boolean ] }
 525         { cc/<>  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE  %boolean ] }
 526         { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP  %boolean ] }
 527     } case ; inline
 528
 529 M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
 530     \ COMISD (%compare-float) ;
 531
 532 M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
 533     \ UCOMISD (%compare-float) ;
 534
 535 : %jump-float= ( label -- )
 536     [
 537         "no-jump" define-label
 538         "no-jump" get JP
 539         JE
 540         "no-jump" resolve-label
 541     ] with-scope ;
 542
 543 : %jump-float/= ( label -- )
 544     [ JNE ] [ JP ] bi ;
 545
 546 :: (%compare-float-branch) ( label src1 src2 cc compare -- )
 547     cc {
 548         { cc<    [ src2 src1 \ compare execute( a b -- ) label JA  ] }
 549         { cc<=   [ src2 src1 \ compare execute( a b -- ) label JAE ] }
 550         { cc>    [ src1 src2 \ compare execute( a b -- ) label JA  ] }
 551         { cc>=   [ src1 src2 \ compare execute( a b -- ) label JAE ] }
 552         { cc=    [ src1 src2 \ compare execute( a b -- ) label %jump-float= ] }
 553         { cc<>   [ src1 src2 \ compare execute( a b -- ) label JNE ] }
 554         { cc<>=  [ src1 src2 \ compare execute( a b -- ) label JNP ] }
 555         { cc/<   [ src2 src1 \ compare execute( a b -- ) label JBE ] }
 556         { cc/<=  [ src2 src1 \ compare execute( a b -- ) label JB  ] }
 557         { cc/>   [ src1 src2 \ compare execute( a b -- ) label JBE ] }
 558         { cc/>=  [ src1 src2 \ compare execute( a b -- ) label JB  ] }
 559         { cc/=   [ src1 src2 \ compare execute( a b -- ) label %jump-float/= ] }
 560         { cc/<>  [ src1 src2 \ compare execute( a b -- ) label JE  ] }
 561         { cc/<>= [ src1 src2 \ compare execute( a b -- ) label JP  ] }
 562     } case ;
 563
 564 M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
 565     \ COMISD (%compare-float-branch) ;
 566
 567 M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
 568     \ UCOMISD (%compare-float-branch) ;
 569
 570 MACRO: available-reps ( alist -- )
 571     ! Each SSE version adds new representations and supports
 572     ! all old ones
 573     unzip { } [ append ] accumulate rest swap suffix
 574     [ [ 1quotation ] map ] bi@ zip
 575     reverse [ { } ] suffix
 576     '[ _ cond ] ;
 577
 578 M: x86 %alien-vector-reps
 579     {
 580         { sse? { float-4-rep } }
 581         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 582     } available-reps ;
 583
 584 M: x86 %zero-vector
 585     {
 586         { double-2-rep [ dup XORPD ] }
 587         { float-4-rep [ dup XORPS ] }
 588         [ drop dup PXOR ]
 589     } case ;
 590
 591 M: x86 %zero-vector-reps
 592     {
 593         { sse? { float-4-rep } }
 594         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 595     } available-reps ;
 596
 597 M: x86 %fill-vector
 598     {
 599         { double-2-rep [ dup [ XORPD ] [ CMPEQPD ] 2bi ] }
 600         { float-4-rep  [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
 601         [ drop dup PCMPEQB ]
 602     } case ;
 603
 604 M: x86 %fill-vector-reps
 605     {
 606         { sse? { float-4-rep } }
 607         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 608     } available-reps ;
 609
 610 ! M:: x86 %broadcast-vector ( dst src rep -- )
 611 !     rep unsign-rep {
 612 !         { float-4-rep [
 613 !             dst src float-4-rep %copy
 614 !             dst dst { 0 0 0 0 } SHUFPS
 615 !         ] }
 616 !         { double-2-rep [
 617 !             dst src MOVDDUP
 618 !         ] }
 619 !         { longlong-2-rep [
 620 !             dst src =
 621 !             [ dst dst PUNPCKLQDQ ]
 622 !             [ dst src { 0 1 0 1 } PSHUFD ]
 623 !             if
 624 !         ] }
 625 !         { int-4-rep [
 626 !             dst src { 0 0 0 0 } PSHUFD
 627 !         ] }
 628 !         { short-8-rep [
 629 !             dst src { 0 0 0 0 } PSHUFLW
 630 !             dst dst PUNPCKLQDQ
 631 !         ] }
 632 !         { char-16-rep [
 633 !             dst src char-16-rep %copy
 634 !             dst dst PUNPCKLBW
 635 !             dst dst { 0 0 0 0 } PSHUFLW
 636 !             dst dst PUNPCKLQDQ
 637 !         ] }
 638 !     } case ;
 639 !
 640 ! M: x86 %broadcast-vector-reps
 641 !     {
 642 !         ! Can't do this with sse1 since it will want to unbox
 643 !         ! a double-precision float and convert to single precision
 644 !         { sse2? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
 645 !     } available-reps ;
 646
 647 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
 648     rep unsign-rep {
 649         { float-4-rep [
 650             dst src1 float-4-rep %copy
 651             dst src2 UNPCKLPS
 652             src3 src4 UNPCKLPS
 653             dst src3 MOVLHPS
 654         ] }
 655         { int-4-rep [
 656             dst src1 int-4-rep %copy
 657             dst src2 PUNPCKLDQ
 658             src3 src4 PUNPCKLDQ
 659             dst src3 PUNPCKLQDQ
 660         ] }
 661     } case ;
 662
 663 M: x86 %gather-vector-4-reps
 664     {
 665         ! Can't do this with sse1 since it will want to unbox
 666         ! double-precision floats and convert to single precision
 667         { sse2? { float-4-rep int-4-rep uint-4-rep } }
 668     } available-reps ;
 669
 670 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
 671     rep unsign-rep {
 672         { double-2-rep [
 673             dst src1 double-2-rep %copy
 674             dst src2 UNPCKLPD
 675         ] }
 676         { longlong-2-rep [
 677             dst src1 longlong-2-rep %copy
 678             dst src2 PUNPCKLQDQ
 679         ] }
 680     } case ;
 681
 682 M: x86 %gather-vector-2-reps
 683     {
 684         { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } }
 685     } available-reps ;
 686
 687 : double-2-shuffle ( dst shuffle -- )
 688     {
 689         { { 0 1 } [ drop ] }
 690         { { 0 0 } [ dup UNPCKLPD ] }
 691         { { 1 1 } [ dup UNPCKHPD ] }
 692         [ dupd SHUFPD ]
 693     } case ;
 694
 695 : sse1-float-4-shuffle ( dst shuffle -- )
 696     {
 697         { { 0 1 2 3 } [ drop ] }
 698         { { 0 1 0 1 } [ dup MOVLHPS ] }
 699         { { 2 3 2 3 } [ dup MOVHLPS ] }
 700         { { 0 0 1 1 } [ dup UNPCKLPS ] }
 701         { { 2 2 3 3 } [ dup UNPCKHPS ] }
 702         [ dupd SHUFPS ]
 703     } case ;
 704
 705 : float-4-shuffle ( dst shuffle -- )
 706     sse3? [
 707         {
 708             { { 0 0 2 2 } [ dup MOVSLDUP ] }
 709             { { 1 1 3 3 } [ dup MOVSHDUP ] }
 710             [ sse1-float-4-shuffle ]
 711         } case
 712     ] [ sse1-float-4-shuffle ] if ;
 713
 714 : int-4-shuffle ( dst shuffle -- )
 715     {
 716         { { 0 1 2 3 } [ drop ] }
 717         { { 0 0 1 1 } [ dup PUNPCKLDQ ] }
 718         { { 2 2 3 3 } [ dup PUNPCKHDQ ] }
 719         { { 0 1 0 1 } [ dup PUNPCKLQDQ ] }
 720         { { 2 3 2 3 } [ dup PUNPCKHQDQ ] }
 721         [ dupd PSHUFD ]
 722     } case ;
 723
 724 : longlong-2-shuffle ( dst shuffle -- )
 725     first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;
 726
 727 M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
 728     dst src rep %copy
 729     dst shuffle rep unsign-rep {
 730         { double-2-rep [ double-2-shuffle ] }
 731         { float-4-rep [ float-4-shuffle ] }
 732         { int-4-rep [ int-4-shuffle ] }
 733         { longlong-2-rep [ longlong-2-shuffle ] }
 734     } case ;
 735
 736 M: x86 %shuffle-vector-imm-reps
 737     {
 738         { sse? { float-4-rep } }
 739         { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 740     } available-reps ;
 741
 742 M: x86 %shuffle-vector ( dst src shuffle rep -- )
 743     two-operand PSHUFB ;
 744
 745 M: x86 %shuffle-vector-reps
 746     {
 747         { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
 748     } available-reps ;
 749
 750 M: x86 %merge-vector-head
 751     [ two-operand ] keep
 752     unsign-rep {
 753         { double-2-rep   [ UNPCKLPD ] }
 754         { float-4-rep    [ UNPCKLPS ] }
 755         { longlong-2-rep [ PUNPCKLQDQ ] }
 756         { int-4-rep      [ PUNPCKLDQ ] }
 757         { short-8-rep    [ PUNPCKLWD ] }
 758         { char-16-rep    [ PUNPCKLBW ] }
 759     } case ;
 760
 761 M: x86 %merge-vector-tail
 762     [ two-operand ] keep
 763     unsign-rep {
 764         { double-2-rep   [ UNPCKHPD ] }
 765         { float-4-rep    [ UNPCKHPS ] }
 766         { longlong-2-rep [ PUNPCKHQDQ ] }
 767         { int-4-rep      [ PUNPCKHDQ ] }
 768         { short-8-rep    [ PUNPCKHWD ] }
 769         { char-16-rep    [ PUNPCKHBW ] }
 770     } case ;
 771
 772 M: x86 %merge-vector-reps
 773     {
 774         { sse? { float-4-rep } }
 775         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 776     } available-reps ;
 777
 778 M: x86 %signed-pack-vector
 779     [ two-operand ] keep
 780     {
 781         { int-4-rep    [ PACKSSDW ] }
 782         { short-8-rep  [ PACKSSWB ] }
 783     } case ;
 784
 785 M: x86 %signed-pack-vector-reps
 786     {
 787         { sse2? { short-8-rep int-4-rep } }
 788     } available-reps ;
 789
 790 M: x86 %unsigned-pack-vector
 791     [ two-operand ] keep
 792     unsign-rep {
 793         { int-4-rep   [ PACKUSDW ] }
 794         { short-8-rep [ PACKUSWB ] }
 795     } case ;
 796
 797 M: x86 %unsigned-pack-vector-reps
 798     {
 799         { sse2? { short-8-rep } }
 800         { sse4.1? { int-4-rep } }
 801     } available-reps ;
 802
 803 M: x86 %tail>head-vector ( dst src rep -- )
 804     dup {
 805         { float-4-rep [ drop MOVHLPS ] }
 806         { double-2-rep [ [ %copy ] [ drop UNPCKHPD ] 3bi ] }
 807         [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ]
 808     } case ;
 809
 810 M: x86 %unpack-vector-head ( dst src rep -- )
 811     {
 812         { char-16-rep  [ PMOVSXBW ] }
 813         { uchar-16-rep [ PMOVZXBW ] }
 814         { short-8-rep  [ PMOVSXWD ] }
 815         { ushort-8-rep [ PMOVZXWD ] }
 816         { int-4-rep    [ PMOVSXDQ ] }
 817         { uint-4-rep   [ PMOVZXDQ ] }
 818         { float-4-rep  [ CVTPS2PD ] }
 819     } case ;
 820
 821 M: x86 %unpack-vector-head-reps ( -- reps )
 822     {
 823         { sse2? { float-4-rep } }
 824         { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
 825     } available-reps ;
 826
 827 M: x86 %integer>float-vector ( dst src rep -- )
 828     {
 829         { int-4-rep [ CVTDQ2PS ] }
 830     } case ;
 831
 832 M: x86 %integer>float-vector-reps
 833     {
 834         { sse2? { int-4-rep } }
 835     } available-reps ;
 836
 837 M: x86 %float>integer-vector ( dst src rep -- )
 838     {
 839         { float-4-rep [ CVTTPS2DQ ] }
 840     } case ;
 841
 842 M: x86 %float>integer-vector-reps
 843     {
 844         { sse2? { float-4-rep } }
 845     } available-reps ;
 846
 847 : (%compare-float-vector) ( dst src rep double single -- )
 848     [ double-2-rep eq? ] 2dip if ; inline
 849 : %compare-float-vector ( dst src rep cc -- )
 850     {
 851         { cc<    [ [ CMPLTPD    ] [ CMPLTPS    ] (%compare-float-vector) ] }
 852         { cc<=   [ [ CMPLEPD    ] [ CMPLEPS    ] (%compare-float-vector) ] }
 853         { cc=    [ [ CMPEQPD    ] [ CMPEQPS    ] (%compare-float-vector) ] }
 854         { cc<>=  [ [ CMPORDPD   ] [ CMPORDPS   ] (%compare-float-vector) ] }
 855         { cc/<   [ [ CMPNLTPD   ] [ CMPNLTPS   ] (%compare-float-vector) ] }
 856         { cc/<=  [ [ CMPNLEPD   ] [ CMPNLEPS   ] (%compare-float-vector) ] }
 857         { cc/=   [ [ CMPNEQPD   ] [ CMPNEQPS   ] (%compare-float-vector) ] }
 858         { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] }
 859     } case ;
 860
 861 :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
 862     rep unsign-rep :> rep'
 863     dst src rep' {
 864         { longlong-2-rep [ int64 call ] }
 865         { int-4-rep      [ int32 call ] }
 866         { short-8-rep    [ int16 call ] }
 867         { char-16-rep    [ int8  call ] }
 868     } case ; inline
 869 : %compare-int-vector ( dst src rep cc -- )
 870     {
 871         { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] }
 872         { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] }
 873     } case ;
 874
 875 M: x86 %compare-vector ( dst src1 src2 rep cc -- )
 876     [ [ two-operand ] keep ] dip
 877     over float-vector-rep?
 878     [ %compare-float-vector ]
 879     [ %compare-int-vector ] if ;
 880
 881 : %compare-vector-eq-reps ( -- reps )
 882     {
 883         { sse? { float-4-rep } }
 884         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
 885         { sse4.1? { longlong-2-rep ulonglong-2-rep } }
 886     } available-reps ;
 887 : %compare-vector-ord-reps ( -- reps )
 888     {
 889         { sse? { float-4-rep } }
 890         { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } }
 891         { sse4.2? { longlong-2-rep } }
 892     } available-reps ;
 893
 894 M: x86 %compare-vector-reps
 895     {
 896         { [ dup { cc= cc/= cc/<>= cc<>= } memq? ] [ drop %compare-vector-eq-reps ] }
 897         [ drop %compare-vector-ord-reps ]
 898     } cond ;
 899
 900 : %compare-float-vector-ccs ( cc -- ccs not? )
 901     {
 902         { cc<    [ { { cc<  f   }              } f ] }
 903         { cc<=   [ { { cc<= f   }              } f ] }
 904         { cc>    [ { { cc<  t   }              } f ] }
 905         { cc>=   [ { { cc<= t   }              } f ] }
 906         { cc=    [ { { cc=  f   }              } f ] }
 907         { cc<>   [ { { cc<  f   } { cc<    t } } f ] }
 908         { cc<>=  [ { { cc<>= f  }              } f ] }
 909         { cc/<   [ { { cc/<  f  }              } f ] }
 910         { cc/<=  [ { { cc/<= f  }              } f ] }
 911         { cc/>   [ { { cc/<  t  }              } f ] }
 912         { cc/>=  [ { { cc/<= t  }              } f ] }
 913         { cc/=   [ { { cc/=  f  }              } f ] }
 914         { cc/<>  [ { { cc/=  f  } { cc/<>= f } } f ] }
 915         { cc/<>= [ { { cc/<>= f }              } f ] }
 916     } case ;
 917
 918 : %compare-int-vector-ccs ( cc -- ccs not? )
 919     order-cc {
 920         { cc<    [ { { cc> t } } f ] }
 921         { cc<=   [ { { cc> f } } t ] }
 922         { cc>    [ { { cc> f } } f ] }
 923         { cc>=   [ { { cc> t } } t ] }
 924         { cc=    [ { { cc= f } } f ] }
 925         { cc/=   [ { { cc= f } } t ] }
 926         { t      [ {           } t ] }
 927         { f      [ {           } f ] }
 928     } case ;
 929
 930 M: x86 %compare-vector-ccs
 931     swap float-vector-rep?
 932     [ %compare-float-vector-ccs ]
 933     [ %compare-int-vector-ccs ] if ;
 934
 935 :: %test-vector-mask ( dst temp mask vcc -- )
 936     vcc {
 937         { vcc-any    [ dst dst TEST dst temp \ CMOVNE %boolean ] }
 938         { vcc-none   [ dst dst TEST dst temp \ CMOVE  %boolean ] }
 939         { vcc-all    [ dst mask CMP dst temp \ CMOVE  %boolean ] }
 940         { vcc-notall [ dst mask CMP dst temp \ CMOVNE %boolean ] }
 941     } case ;
 942
 943 : %move-vector-mask ( dst src rep -- mask )
 944     {
 945         { double-2-rep [ MOVMSKPD HEX: 3 ] }
 946         { float-4-rep  [ MOVMSKPS HEX: f ] }
 947         [ drop PMOVMSKB HEX: ffff ]
 948     } case ;
 949
 950 M:: x86 %test-vector ( dst src temp rep vcc -- )
 951     dst src rep %move-vector-mask :> mask
 952     dst temp mask vcc %test-vector-mask ;
 953
 954 :: %test-vector-mask-branch ( label temp mask vcc -- )
 955     vcc {
 956         { vcc-any    [ temp temp TEST label JNE ] }
 957         { vcc-none   [ temp temp TEST label JE ] }
 958         { vcc-all    [ temp mask CMP label JE ] }
 959         { vcc-notall [ temp mask CMP label JNE ] }
 960     } case ;
 961
 962 M:: x86 %test-vector-branch ( label src temp rep vcc -- )
 963     temp src rep %move-vector-mask :> mask
 964     label temp mask vcc %test-vector-mask-branch ;
 965
 966 M: x86 %test-vector-reps
 967     {
 968         { sse? { float-4-rep } }
 969         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 970     } available-reps ;
 971
 972 M: x86 %add-vector ( dst src1 src2 rep -- )
 973     [ two-operand ] keep
 974     {
 975         { float-4-rep [ ADDPS ] }
 976         { double-2-rep [ ADDPD ] }
 977         { char-16-rep [ PADDB ] }
 978         { uchar-16-rep [ PADDB ] }
 979         { short-8-rep [ PADDW ] }
 980         { ushort-8-rep [ PADDW ] }
 981         { int-4-rep [ PADDD ] }
 982         { uint-4-rep [ PADDD ] }
 983         { longlong-2-rep [ PADDQ ] }
 984         { ulonglong-2-rep [ PADDQ ] }
 985     } case ;
 986
 987 M: x86 %add-vector-reps
 988     {
 989         { sse? { float-4-rep } }
 990         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 991     } available-reps ;
 992
 993 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
 994     [ two-operand ] keep
 995     {
 996         { char-16-rep [ PADDSB ] }
 997         { uchar-16-rep [ PADDUSB ] }
 998         { short-8-rep [ PADDSW ] }
 999         { ushort-8-rep [ PADDUSW ] }
1000     } case ;
1001
1002 M: x86 %saturated-add-vector-reps
1003     {
1004         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
1005     } available-reps ;
1006
1007 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
1008     [ two-operand ] keep
1009     {
1010         { float-4-rep [ ADDSUBPS ] }
1011         { double-2-rep [ ADDSUBPD ] }
1012     } case ;
1013
1014 M: x86 %add-sub-vector-reps
1015     {
1016         { sse3? { float-4-rep double-2-rep } }
1017     } available-reps ;
1018
1019 M: x86 %sub-vector ( dst src1 src2 rep -- )
1020     [ two-operand ] keep
1021     {
1022         { float-4-rep [ SUBPS ] }
1023         { double-2-rep [ SUBPD ] }
1024         { char-16-rep [ PSUBB ] }
1025         { uchar-16-rep [ PSUBB ] }
1026         { short-8-rep [ PSUBW ] }
1027         { ushort-8-rep [ PSUBW ] }
1028         { int-4-rep [ PSUBD ] }
1029         { uint-4-rep [ PSUBD ] }
1030         { longlong-2-rep [ PSUBQ ] }
1031         { ulonglong-2-rep [ PSUBQ ] }
1032     } case ;
1033
1034 M: x86 %sub-vector-reps
1035     {
1036         { sse? { float-4-rep } }
1037         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1038     } available-reps ;
1039
1040 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
1041     [ two-operand ] keep
1042     {
1043         { char-16-rep [ PSUBSB ] }
1044         { uchar-16-rep [ PSUBUSB ] }
1045         { short-8-rep [ PSUBSW ] }
1046         { ushort-8-rep [ PSUBUSW ] }
1047     } case ;
1048
1049 M: x86 %saturated-sub-vector-reps
1050     {
1051         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
1052     } available-reps ;
1053
1054 M: x86 %mul-vector ( dst src1 src2 rep -- )
1055     [ two-operand ] keep
1056     {
1057         { float-4-rep [ MULPS ] }
1058         { double-2-rep [ MULPD ] }
1059         { short-8-rep [ PMULLW ] }
1060         { ushort-8-rep [ PMULLW ] }
1061         { int-4-rep [ PMULLD ] }
1062         { uint-4-rep [ PMULLD ] }
1063     } case ;
1064
1065 M: x86 %mul-vector-reps
1066     {
1067         { sse? { float-4-rep } }
1068         { sse2? { double-2-rep short-8-rep ushort-8-rep } }
1069         { sse4.1? { int-4-rep uint-4-rep } }
1070     } available-reps ;
1071
1072 M: x86 %div-vector ( dst src1 src2 rep -- )
1073     [ two-operand ] keep
1074     {
1075         { float-4-rep [ DIVPS ] }
1076         { double-2-rep [ DIVPD ] }
1077     } case ;
1078
1079 M: x86 %div-vector-reps
1080     {
1081         { sse? { float-4-rep } }
1082         { sse2? { double-2-rep } }
1083     } available-reps ;
1084
1085 M: x86 %min-vector ( dst src1 src2 rep -- )
1086     [ two-operand ] keep
1087     {
1088         { char-16-rep [ PMINSB ] }
1089         { uchar-16-rep [ PMINUB ] }
1090         { short-8-rep [ PMINSW ] }
1091         { ushort-8-rep [ PMINUW ] }
1092         { int-4-rep [ PMINSD ] }
1093         { uint-4-rep [ PMINUD ] }
1094         { float-4-rep [ MINPS ] }
1095         { double-2-rep [ MINPD ] }
1096     } case ;
1097
1098 M: x86 %min-vector-reps
1099     {
1100         { sse? { float-4-rep } }
1101         { sse2? { uchar-16-rep short-8-rep double-2-rep } }
1102         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
1103     } available-reps ;
1104
1105 M: x86 %max-vector ( dst src1 src2 rep -- )
1106     [ two-operand ] keep
1107     {
1108         { char-16-rep [ PMAXSB ] }
1109         { uchar-16-rep [ PMAXUB ] }
1110         { short-8-rep [ PMAXSW ] }
1111         { ushort-8-rep [ PMAXUW ] }
1112         { int-4-rep [ PMAXSD ] }
1113         { uint-4-rep [ PMAXUD ] }
1114         { float-4-rep [ MAXPS ] }
1115         { double-2-rep [ MAXPD ] }
1116     } case ;
1117
1118 M: x86 %max-vector-reps
1119     {
1120         { sse? { float-4-rep } }
1121         { sse2? { uchar-16-rep short-8-rep double-2-rep } }
1122         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
1123     } available-reps ;
1124
1125 M: x86 %dot-vector
1126     [ two-operand ] keep
1127     {
1128         { float-4-rep [
1129             sse4.1?
1130             [ HEX: ff DPPS ]
1131             [ [ MULPS ] [ drop dup float-4-rep %horizontal-add-vector ] 2bi ]
1132             if
1133         ] }
1134         { double-2-rep [
1135             sse4.1?
1136             [ HEX: ff DPPD ]
1137             [ [ MULPD ] [ drop dup double-2-rep %horizontal-add-vector ] 2bi ]
1138             if
1139         ] }
1140     } case ;
1141
1142 M: x86 %dot-vector-reps
1143     {
1144         { sse3? { float-4-rep double-2-rep } }
1145     } available-reps ;
1146
1147 M: x86 %horizontal-add-vector ( dst src rep -- )
1148     {
1149         { float-4-rep [ [ float-4-rep %copy ] [ HADDPS ] [ HADDPS ] 2tri ] }
1150         { double-2-rep [ [ double-2-rep %copy ] [ HADDPD ] 2bi ] }
1151     } case ;
1152
1153 M: x86 %horizontal-add-vector-reps
1154     {
1155         { sse3? { float-4-rep double-2-rep } }
1156     } available-reps ;
1157
1158 M: x86 %horizontal-shl-vector ( dst src1 src2 rep -- )
1159     two-operand PSLLDQ ;
1160
1161 M: x86 %horizontal-shl-vector-reps
1162     {
1163         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1164     } available-reps ;
1165
1166 M: x86 %horizontal-shr-vector ( dst src1 src2 rep -- )
1167     two-operand PSRLDQ ;
1168
1169 M: x86 %horizontal-shr-vector-reps
1170     {
1171         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1172     } available-reps ;
1173
1174 M: x86 %abs-vector ( dst src rep -- )
1175     {
1176         { char-16-rep [ PABSB ] }
1177         { short-8-rep [ PABSW ] }
1178         { int-4-rep [ PABSD ] }
1179     } case ;
1180
1181 M: x86 %abs-vector-reps
1182     {
1183         { ssse3? { char-16-rep short-8-rep int-4-rep } }
1184     } available-reps ;
1185
1186 M: x86 %sqrt-vector ( dst src rep -- )
1187     {
1188         { float-4-rep [ SQRTPS ] }
1189         { double-2-rep [ SQRTPD ] }
1190     } case ;
1191
1192 M: x86 %sqrt-vector-reps
1193     {
1194         { sse? { float-4-rep } }
1195         { sse2? { double-2-rep } }
1196     } available-reps ;
1197
1198 M: x86 %and-vector ( dst src1 src2 rep -- )
1199     [ two-operand ] keep
1200     {
1201         { float-4-rep [ ANDPS ] }
1202         { double-2-rep [ ANDPD ] }
1203         [ drop PAND ]
1204     } case ;
1205
1206 M: x86 %and-vector-reps
1207     {
1208         { sse? { float-4-rep } }
1209         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1210     } available-reps ;
1211
1212 M: x86 %andn-vector ( dst src1 src2 rep -- )
1213     [ two-operand ] keep
1214     {
1215         { float-4-rep [ ANDNPS ] }
1216         { double-2-rep [ ANDNPD ] }
1217         [ drop PANDN ]
1218     } case ;
1219
1220 M: x86 %andn-vector-reps
1221     {
1222         { sse? { float-4-rep } }
1223         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1224     } available-reps ;
1225
1226 M: x86 %or-vector ( dst src1 src2 rep -- )
1227     [ two-operand ] keep
1228     {
1229         { float-4-rep [ ORPS ] }
1230         { double-2-rep [ ORPD ] }
1231         [ drop POR ]
1232     } case ;
1233
1234 M: x86 %or-vector-reps
1235     {
1236         { sse? { float-4-rep } }
1237         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1238     } available-reps ;
1239
1240 M: x86 %xor-vector ( dst src1 src2 rep -- )
1241     [ two-operand ] keep
1242     {
1243         { float-4-rep [ XORPS ] }
1244         { double-2-rep [ XORPD ] }
1245         [ drop PXOR ]
1246     } case ;
1247
1248 M: x86 %xor-vector-reps
1249     {
1250         { sse? { float-4-rep } }
1251         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1252     } available-reps ;
1253
1254 M: x86 %shl-vector ( dst src1 src2 rep -- )
1255     [ two-operand ] keep
1256     {
1257         { short-8-rep [ PSLLW ] }
1258         { ushort-8-rep [ PSLLW ] }
1259         { int-4-rep [ PSLLD ] }
1260         { uint-4-rep [ PSLLD ] }
1261         { longlong-2-rep [ PSLLQ ] }
1262         { ulonglong-2-rep [ PSLLQ ] }
1263     } case ;
1264
1265 M: x86 %shl-vector-reps
1266     {
1267         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1268     } available-reps ;
1269
1270 M: x86 %shr-vector ( dst src1 src2 rep -- )
1271     [ two-operand ] keep
1272     {
1273         { short-8-rep [ PSRAW ] }
1274         { ushort-8-rep [ PSRLW ] }
1275         { int-4-rep [ PSRAD ] }
1276         { uint-4-rep [ PSRLD ] }
1277         { ulonglong-2-rep [ PSRLQ ] }
1278     } case ;
1279
1280 M: x86 %shr-vector-reps
1281     {
1282         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
1283     } available-reps ;
1284
1285 : scalar-sized-reg ( reg rep -- reg' )
1286     rep-size 8 * n-bit-version-of ;
1287
1288 M: x86 %integer>scalar drop MOVD ;
1289
1290 M:: x86 %scalar>integer ( dst src rep -- )
1291     rep {
1292         { int-scalar-rep [
1293             dst 32-bit-version-of src MOVD
1294             dst dst 32-bit-version-of
1295             2dup eq? [ 2drop ] [ MOVSX ] if
1296         ] }
1297         { uint-scalar-rep [
1298             dst 32-bit-version-of src MOVD
1299         ] }
1300         { short-scalar-rep [
1301             dst 32-bit-version-of src MOVD
1302             dst dst 16-bit-version-of MOVSX
1303         ] }
1304         { ushort-scalar-rep [
1305             dst 32-bit-version-of src MOVD
1306             dst dst 16-bit-version-of MOVZX
1307         ] }
1308         { char-scalar-rep [
1309             dst 32-bit-version-of src MOVD
1310             dst { } 8 [| tmp-dst |
1311                 tmp-dst dst int-rep %copy
1312                 tmp-dst tmp-dst 8-bit-version-of MOVSX
1313                 dst tmp-dst int-rep %copy
1314             ] with-small-register
1315         ] }
1316         { uchar-scalar-rep [
1317             dst 32-bit-version-of src MOVD
1318             dst { } 8 [| tmp-dst |
1319                 tmp-dst dst int-rep %copy
1320                 tmp-dst tmp-dst 8-bit-version-of MOVZX
1321                 dst tmp-dst int-rep %copy
1322             ] with-small-register
1323         ] }
1324     } case ;
1325
1326 M: x86 %vector>scalar %copy ;
1327 M: x86 %scalar>vector %copy ;
1328
1329 M:: x86 %spill ( src rep dst -- ) dst src rep %copy ;
1330 M:: x86 %reload ( dst rep src -- ) dst src rep %copy ;
1331
1332 M: x86 %loop-entry 16 code-alignment [ NOP ] times ;
1333
1334 M:: x86 %save-context ( temp1 temp2 callback-allowed? -- )
1335     #! Save Factor stack pointers in case the C code calls a
1336     #! callback which does a GC, which must reliably trace
1337     #! all roots.
1338     temp1 "stack_chain" %vm-field-ptr
1339     temp1 temp1 [] MOV
1340     temp2 stack-reg cell neg [+] LEA
1341     temp1 [] temp2 MOV
1342     callback-allowed? [
1343         temp1 2 cells [+] ds-reg MOV
1344         temp1 3 cells [+] rs-reg MOV
1345     ] when ;
1346
1347 M: x86 value-struct? drop t ;
1348
1349 M: x86 immediate-arithmetic? ( n -- ? )
1350     HEX: -80000000 HEX: 7fffffff between? ;
1351
1352 M: x86 immediate-bitwise? ( n -- ? )
1353     HEX: -80000000 HEX: 7fffffff between? ;
1354
1355 : next-stack@ ( n -- operand )
1356     #! nth parameter from the next stack frame. Used to box
1357     #! input values to callbacks; the callback has its own
1358     #! stack frame set up, and we want to read the frame
1359     #! set up by the caller.
1360     stack-frame get total-size>> + stack@ ;
1361
1362 enable-simd
1363 enable-min/max
1364 enable-fixnum-log2
1365
1366 :: install-sse2-check ( -- )
1367     [
1368         sse-version 20 < [
1369             "This image was built to use SSE2 but your CPU does not support it." print
1370             "You will need to bootstrap Factor again." print
1371             flush
1372             1 exit
1373         ] when
1374     ] "cpu.x86" add-init-hook ;
1375
1376 : enable-sse2 ( version -- )
1377     20 >= [
1378         enable-float-intrinsics
1379         enable-float-functions
1380         enable-float-min/max
1381         enable-fsqrt
1382         install-sse2-check
1383     ] when ;
1384
1385 : check-sse ( -- )
1386     [ { sse_version } compile ] with-optimizer
1387     "Checking for multimedia extensions: " write sse-version
1388     [ sse-string write " detected" print ] [ enable-sse2 ] bi ;