basis/cpu/x86/x86.factor

   1 ! Copyright (C) 2005, 2009 Slava Pestov.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors assocs alien alien.c-types arrays strings
   4 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
   5 cpu.x86.features cpu.x86.features.private cpu.architecture kernel
   6 kernel.private math memory namespaces make sequences words system
   7 layouts combinators math.order math.vectors fry locals compiler.constants
   8 byte-arrays io macros quotations compiler compiler.units init vm
   9 compiler.cfg.registers
  10 compiler.cfg.instructions
  11 compiler.cfg.intrinsics
  12 compiler.cfg.comparisons
  13 compiler.cfg.stack-frame
  14 compiler.codegen.fixup ;
  15 FROM: layouts => cell ;
  16 FROM: math => float ;
  17 IN: cpu.x86
  18
  19 ! Add some methods to the assembler to be more useful to the backend
  20 M: label JMP 0 JMP rc-relative label-fixup ;
  21 M: label JUMPcc [ 0 ] dip JUMPcc rc-relative label-fixup ;
  22
  23 M: x86 vector-regs float-regs ;
  24
  25 HOOK: stack-reg cpu ( -- reg )
  26
  27 HOOK: reserved-stack-space cpu ( -- n )
  28
  29 HOOK: extra-stack-space cpu ( stack-frame -- n )
  30
  31 : stack@ ( n -- op ) stack-reg swap [+] ;
  32
  33 : special@ ( n -- op )
  34     stack-frame get extra-stack-space +
  35     reserved-stack-space +
  36     stack@ ;
  37
  38 : spill@ ( n -- op ) spill-offset special@ ;
  39
  40 : gc-root@ ( n -- op ) gc-root-offset special@ ;
  41
  42 : decr-stack-reg ( n -- )
  43     dup 0 = [ drop ] [ stack-reg swap SUB ] if ;
  44
  45 : incr-stack-reg ( n -- )
  46     dup 0 = [ drop ] [ stack-reg swap ADD ] if ;
  47
  48 : align-stack ( n -- n' ) 16 align ;
  49
  50 M: x86 stack-frame-size ( stack-frame -- i )
  51     [ (stack-frame-size) ]
  52     [ extra-stack-space ] bi +
  53     reserved-stack-space +
  54     3 cells +
  55     align-stack ;
  56
  57 ! Must be a volatile register not used for parameter passing, for safe
  58 ! use in calls in and out of C
  59 HOOK: temp-reg cpu ( -- reg )
  60
  61 HOOK: pic-tail-reg cpu ( -- reg )
  62
  63 M: x86 %load-immediate dup 0 = [ drop dup XOR ] [ MOV ] if ;
  64
  65 M: x86 %load-reference swap 0 MOV rc-absolute-cell rel-immediate ;
  66
  67 HOOK: ds-reg cpu ( -- reg )
  68 HOOK: rs-reg cpu ( -- reg )
  69
  70 : reg-stack ( n reg -- op ) swap cells neg [+] ;
  71
  72 GENERIC: loc>operand ( loc -- operand )
  73
  74 M: ds-loc loc>operand n>> ds-reg reg-stack ;
  75 M: rs-loc loc>operand n>> rs-reg reg-stack ;
  76
  77 M: x86 %peek loc>operand MOV ;
  78 M: x86 %replace loc>operand swap MOV ;
  79 : (%inc) ( n reg -- ) swap cells dup 0 > [ ADD ] [ neg SUB ] if ; inline
  80 M: x86 %inc-d ( n -- ) ds-reg (%inc) ;
  81 M: x86 %inc-r ( n -- ) rs-reg (%inc) ;
  82
  83 M: x86 %call ( word -- ) 0 CALL rc-relative rel-word-pic ;
  84
  85 : xt-tail-pic-offset ( -- n )
  86     #! See the comment in vm/cpu-x86.hpp
  87     cell 4 + 1 + ; inline
  88
  89 M: x86 %jump ( word -- )
  90     pic-tail-reg 0 MOV xt-tail-pic-offset rc-absolute-cell rel-here
  91     0 JMP rc-relative rel-word-pic-tail ;
  92
  93 M: x86 %jump-label ( label -- ) 0 JMP rc-relative label-fixup ;
  94
  95 M: x86 %return ( -- ) 0 RET ;
  96
  97 : code-alignment ( align -- n )
  98     [ building get length dup ] dip align swap - ;
  99
 100 : align-code ( n -- )
 101     0 <repetition> % ;
 102
 103 :: (%slot-imm) ( obj slot tag -- op )
 104     obj slot tag slot-offset [+] ; inline
 105
 106 M: x86 %slot ( dst obj slot -- ) [+] MOV ;
 107 M: x86 %slot-imm ( dst obj slot tag -- ) (%slot-imm) MOV ;
 108 M: x86 %set-slot ( src obj slot -- ) [+] swap MOV ;
 109 M: x86 %set-slot-imm ( src obj slot tag -- ) (%slot-imm) swap MOV ;
 110
 111 :: two-operand ( dst src1 src2 rep -- dst src )
 112     dst src2 eq? dst src1 eq? not and [ "Cannot handle this case" throw ] when
 113     dst src1 rep %copy
 114     dst src2 ; inline
 115
 116 :: one-operand ( dst src rep -- dst )
 117     dst src rep %copy
 118     dst ; inline
 119
 120 M: x86 %add     2over eq? [ nip ADD ] [ [+] LEA ] if ;
 121 M: x86 %add-imm 2over eq? [ nip ADD ] [ [+] LEA ] if ;
 122 M: x86 %sub     int-rep two-operand SUB ;
 123 M: x86 %sub-imm 2over eq? [ nip SUB ] [ neg [+] LEA ] if ;
 124 M: x86 %mul     int-rep two-operand swap IMUL2 ;
 125 M: x86 %mul-imm IMUL3 ;
 126 M: x86 %and     int-rep two-operand AND ;
 127 M: x86 %and-imm int-rep two-operand AND ;
 128 M: x86 %or      int-rep two-operand OR ;
 129 M: x86 %or-imm  int-rep two-operand OR ;
 130 M: x86 %xor     int-rep two-operand XOR ;
 131 M: x86 %xor-imm int-rep two-operand XOR ;
 132 M: x86 %shl-imm int-rep two-operand SHL ;
 133 M: x86 %shr-imm int-rep two-operand SHR ;
 134 M: x86 %sar-imm int-rep two-operand SAR ;
 135
 136 M: x86 %min     int-rep two-operand [ CMP ] [ CMOVG ] 2bi ;
 137 M: x86 %max     int-rep two-operand [ CMP ] [ CMOVL ] 2bi ;
 138
 139 M: x86 %not     int-rep one-operand NOT ;
 140 M: x86 %neg     int-rep one-operand NEG ;
 141 M: x86 %log2    BSR ;
 142
 143 ! A bit of logic to avoid using MOVSS/MOVSD for reg-reg moves
 144 ! since this induces partial register stalls
 145 GENERIC: copy-register* ( dst src rep -- )
 146 GENERIC: copy-memory* ( dst src rep -- )
 147
 148 M: int-rep copy-register* drop MOV ;
 149 M: tagged-rep copy-register* drop MOV ;
 150 M: float-rep copy-register* drop MOVAPS ;
 151 M: double-rep copy-register* drop MOVAPS ;
 152 M: float-4-rep copy-register* drop MOVAPS ;
 153 M: double-2-rep copy-register* drop MOVAPS ;
 154 M: vector-rep copy-register* drop MOVDQA ;
 155
 156 M: object copy-memory* copy-register* ;
 157 M: float-rep copy-memory* drop MOVSS ;
 158 M: double-rep copy-memory* drop MOVSD ;
 159
 160 M: x86 %copy ( dst src rep -- )
 161     2over eq? [ 3drop ] [
 162         [ [ dup spill-slot? [ n>> spill@ ] when ] bi@ ] dip
 163         2over [ register? ] both? [ copy-register* ] [ copy-memory* ] if
 164     ] if ;
 165
 166 M: x86 %fixnum-add ( label dst src1 src2 -- )
 167     int-rep two-operand ADD JO ;
 168
 169 M: x86 %fixnum-sub ( label dst src1 src2 -- )
 170     int-rep two-operand SUB JO ;
 171
 172 M: x86 %fixnum-mul ( label dst src1 src2 -- )
 173     int-rep two-operand swap IMUL2 JO ;
 174
 175 M: x86 %unbox-alien ( dst src -- )
 176     alien-offset [+] MOV ;
 177
 178 M:: x86 %unbox-any-c-ptr ( dst src -- )
 179     [
 180         "end" define-label
 181         dst dst XOR
 182         ! Is the object f?
 183         src \ f type-number CMP
 184         "end" get JE
 185         ! Compute tag in dst register
 186         dst src MOV
 187         dst tag-mask get AND
 188         ! Is the object an alien?
 189         dst alien type-number CMP
 190         ! Add an offset to start of byte array's data
 191         dst src byte-array-offset [+] LEA
 192         "end" get JNE
 193         ! If so, load the offset and add it to the address
 194         dst src alien-offset [+] MOV
 195         "end" resolve-label
 196     ] with-scope ;
 197
 198 : alien@ ( reg n -- op ) cells alien type-number - [+] ;
 199
 200 M:: x86 %box-alien ( dst src temp -- )
 201     [
 202         "end" define-label
 203         dst \ f type-number MOV
 204         src src TEST
 205         "end" get JE
 206         dst 5 cells alien temp %allot
 207         dst 1 alien@ \ f type-number MOV ! base
 208         dst 2 alien@ \ f type-number MOV ! expired
 209         dst 3 alien@ src MOV ! displacement
 210         dst 4 alien@ src MOV ! address
 211         "end" resolve-label
 212     ] with-scope ;
 213
 214 M:: x86 %box-displaced-alien ( dst displacement base temp base-class -- )
 215     ! This is ridiculous
 216     [
 217         "end" define-label
 218         "not-f" define-label
 219         "not-alien" define-label
 220
 221         ! If displacement is zero, return the base
 222         dst base MOV
 223         displacement displacement TEST
 224         "end" get JE
 225
 226         ! Displacement is non-zero, we're going to be allocating a new
 227         ! object
 228         dst 5 cells alien temp %allot
 229
 230         ! Set expired to f
 231         dst 2 alien@ \ f type-number MOV
 232
 233         ! Is base f?
 234         base \ f type-number CMP
 235         "not-f" get JNE
 236
 237         ! Yes, it is f. Fill in new object
 238         dst 1 alien@ base MOV
 239         dst 3 alien@ displacement MOV
 240         dst 4 alien@ displacement MOV
 241
 242         "end" get JMP
 243
 244         "not-f" resolve-label
 245
 246         ! Check base type
 247         temp base MOV
 248         temp tag-mask get AND
 249
 250         ! Is base an alien?
 251         temp alien type-number CMP
 252         "not-alien" get JNE
 253
 254         ! Yes, it is an alien. Set new alien's base to base.base
 255         temp base 1 alien@ MOV
 256         dst 1 alien@ temp MOV
 257
 258         ! Compute displacement
 259         temp base 3 alien@ MOV
 260         temp displacement ADD
 261         dst 3 alien@ temp MOV
 262
 263         ! Compute address
 264         temp base 4 alien@ MOV
 265         temp displacement ADD
 266         dst 4 alien@ temp MOV
 267
 268         ! We are done
 269         "end" get JMP
 270
 271         ! Is base a byte array? It has to be, by now...
 272         "not-alien" resolve-label
 273
 274         dst 1 alien@ base MOV
 275         dst 3 alien@ displacement MOV
 276         temp base MOV
 277         temp byte-array-offset ADD
 278         temp displacement ADD
 279         dst 4 alien@ temp MOV
 280
 281         "end" resolve-label
 282     ] with-scope ;
 283
 284 ! The 'small-reg' mess is pretty crappy, but its only used on x86-32.
 285 ! On x86-64, all registers have 8-bit versions. However, a similar
 286 ! problem arises for shifts, where the shift count must be in CL, and
 287 ! so one day I will fix this properly by adding precoloring to the
 288 ! register allocator.
 289
 290 HOOK: has-small-reg? cpu ( reg size -- ? )
 291
 292 CONSTANT: have-byte-regs { EAX ECX EDX EBX }
 293
 294 M: x86.32 has-small-reg?
 295     {
 296         { 8 [ have-byte-regs member-eq? ] }
 297         { 16 [ drop t ] }
 298         { 32 [ drop t ] }
 299     } case ;
 300
 301 M: x86.64 has-small-reg? 2drop t ;
 302
 303 : small-reg-that-isn't ( exclude -- reg' )
 304     [ have-byte-regs ] dip
 305     [ native-version-of ] map
 306     '[ _ member-eq? not ] find nip ;
 307
 308 : with-save/restore ( reg quot -- )
 309     [ drop PUSH ] [ call ] [ drop POP ] 2tri ; inline
 310
 311 :: with-small-register ( dst exclude size quot: ( new-dst -- ) -- )
 312     ! If the destination register overlaps a small register with
 313     ! 'size' bits, we call the quot with that. Otherwise, we find a
 314     ! small register that is not in exclude, and call quot, saving and
 315     ! restoring the small register.
 316     dst size has-small-reg? [ dst quot call ] [
 317         exclude small-reg-that-isn't
 318         [ quot call ] with-save/restore
 319     ] if ; inline
 320
 321 M:: x86 %string-nth ( dst src index temp -- )
 322     ! We request a small-reg of size 8 since those of size 16 are
 323     ! a superset.
 324     "end" define-label
 325     dst { src index temp } 8 [| new-dst |
 326         ! Load the least significant 7 bits into new-dst.
 327         ! 8th bit indicates whether we have to load from
 328         ! the aux vector or not.
 329         temp src index [+] LEA
 330         new-dst 8-bit-version-of temp string-offset [+] MOV
 331         new-dst new-dst 8-bit-version-of MOVZX
 332         ! Do we have to look at the aux vector?
 333         new-dst HEX: 80 CMP
 334         "end" get JL
 335         ! Yes, this is a non-ASCII character. Load aux vector
 336         temp src string-aux-offset [+] MOV
 337         new-dst temp XCHG
 338         ! Compute index
 339         new-dst index ADD
 340         new-dst index ADD
 341         ! Load high 16 bits
 342         new-dst 16-bit-version-of new-dst byte-array-offset [+] MOV
 343         new-dst new-dst 16-bit-version-of MOVZX
 344         new-dst 7 SHL
 345         ! Compute code point
 346         new-dst temp XOR
 347         "end" resolve-label
 348         dst new-dst int-rep %copy
 349     ] with-small-register ;
 350
 351 M:: x86 %set-string-nth-fast ( ch str index temp -- )
 352     ch { index str temp } 8 [| new-ch |
 353         new-ch ch int-rep %copy
 354         temp str index [+] LEA
 355         temp string-offset [+] new-ch 8-bit-version-of MOV
 356     ] with-small-register ;
 357
 358 :: %alien-integer-getter ( dst src offset size quot -- )
 359     dst { src } size [| new-dst |
 360         new-dst dup size n-bit-version-of dup src offset [+] MOV
 361         quot call
 362         dst new-dst int-rep %copy
 363     ] with-small-register ; inline
 364
 365 : %alien-unsigned-getter ( dst src offset size -- )
 366     [ MOVZX ] %alien-integer-getter ; inline
 367
 368 : %alien-signed-getter ( dst src offset size -- )
 369     [ MOVSX ] %alien-integer-getter ; inline
 370
 371 :: %alien-integer-setter ( ptr offset value size -- )
 372     value { ptr } size [| new-value |
 373         new-value value int-rep %copy
 374         ptr offset [+] new-value size n-bit-version-of MOV
 375     ] with-small-register ; inline
 376
 377 M: x86 %alien-unsigned-1 8 %alien-unsigned-getter ;
 378 M: x86 %alien-unsigned-2 16 %alien-unsigned-getter ;
 379 M: x86 %alien-unsigned-4 32 [ 2drop ] %alien-integer-getter ;
 380
 381 M: x86 %alien-signed-1 8 %alien-signed-getter ;
 382 M: x86 %alien-signed-2 16 %alien-signed-getter ;
 383 M: x86 %alien-signed-4 32 %alien-signed-getter ;
 384
 385 M: x86 %alien-cell [+] MOV ;
 386 M: x86 %alien-float [+] MOVSS ;
 387 M: x86 %alien-double [+] MOVSD ;
 388 M: x86 %alien-vector [ [+] ] dip %copy ;
 389
 390 M: x86 %set-alien-integer-1 8 %alien-integer-setter ;
 391 M: x86 %set-alien-integer-2 16 %alien-integer-setter ;
 392 M: x86 %set-alien-integer-4 32 %alien-integer-setter ;
 393 M: x86 %set-alien-cell [ [+] ] dip MOV ;
 394 M: x86 %set-alien-float [ [+] ] dip MOVSS ;
 395 M: x86 %set-alien-double [ [+] ] dip MOVSD ;
 396 M: x86 %set-alien-vector [ [+] ] 2dip %copy ;
 397
 398 : shift-count? ( reg -- ? ) { ECX RCX } member-eq? ;
 399
 400 :: emit-shift ( dst src quot -- )
 401     src shift-count? [
 402         dst CL quot call
 403     ] [
 404         dst shift-count? [
 405             dst src XCHG
 406             src CL quot call
 407             dst src XCHG
 408         ] [
 409             ECX native-version-of [
 410                 CL src MOV
 411                 drop dst CL quot call
 412             ] with-save/restore
 413         ] if
 414     ] if ; inline
 415
 416 M: x86 %shl int-rep two-operand [ SHL ] emit-shift ;
 417 M: x86 %shr int-rep two-operand [ SHR ] emit-shift ;
 418 M: x86 %sar int-rep two-operand [ SAR ] emit-shift ;
 419
 420 : %mov-vm-ptr ( reg -- )
 421     0 MOV 0 rc-absolute-cell rel-vm ;
 422
 423 M: x86 %vm-field-ptr ( dst field -- )
 424     [ 0 MOV ] dip vm-field-offset rc-absolute-cell rel-vm ;
 425
 426 : load-allot-ptr ( nursery-ptr allot-ptr -- )
 427     [ drop "nursery" %vm-field-ptr ] [ swap [] MOV ] 2bi ;
 428
 429 : inc-allot-ptr ( nursery-ptr n -- )
 430     [ [] ] dip data-alignment get align ADD ;
 431
 432 : store-header ( temp class -- )
 433     [ [] ] [ type-number tag-header ] bi* MOV ;
 434
 435 : store-tagged ( dst tag -- )
 436     type-number OR ;
 437
 438 M:: x86 %allot ( dst size class nursery-ptr -- )
 439     nursery-ptr dst load-allot-ptr
 440     dst class store-header
 441     dst class store-tagged
 442     nursery-ptr size inc-allot-ptr ;
 443
 444 HOOK: %mark-card cpu ( card temp -- )
 445 HOOK: %mark-deck cpu ( card temp -- )
 446
 447 :: (%write-barrier) ( src slot temp1 temp2 -- )
 448     temp1 src slot [+] LEA
 449     temp1 card-bits SHR
 450     temp1 temp2 %mark-card
 451     temp1 deck-bits card-bits - SHR
 452     temp1 temp2 %mark-deck ;
 453
 454 M: x86 %write-barrier ( src slot temp1 temp2 -- ) (%write-barrier) ;
 455
 456 M: x86 %write-barrier-imm ( src slot temp1 temp2 -- ) (%write-barrier) ;
 457
 458 M:: x86 %check-nursery ( label size temp1 temp2 -- )
 459     temp1 "nursery" %vm-field-ptr
 460     ! Load 'here' into temp2
 461     temp2 temp1 [] MOV
 462     temp2 size ADD
 463     ! Load 'end' into temp1
 464     temp1 temp1 2 cells [+] MOV
 465     temp2 temp1 CMP
 466     label JLE ;
 467
 468 M: x86 %save-gc-root ( gc-root register -- ) [ gc-root@ ] dip MOV ;
 469
 470 M: x86 %load-gc-root ( gc-root register -- ) swap gc-root@ MOV ;
 471
 472 M: x86 %alien-global ( dst symbol library -- )
 473     [ 0 MOV ] 2dip rc-absolute-cell rel-dlsym ;
 474
 475 M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
 476
 477 :: %boolean ( dst temp word -- )
 478     dst \ f type-number MOV
 479     temp 0 MOV \ t rc-absolute-cell rel-immediate
 480     dst temp word execute ; inline
 481
 482 : (%compare) ( src1 src2 cc -- )
 483     2over [ { cc= cc/= } member? ] [ register? ] [ 0 = ] tri* and and
 484     [ drop dup TEST ]
 485     [ CMP ] if ;
 486
 487 M:: x86 %compare ( dst src1 src2 cc temp -- )
 488     src1 src2 cc (%compare)
 489     cc order-cc {
 490         { cc<  [ dst temp \ CMOVL %boolean ] }
 491         { cc<= [ dst temp \ CMOVLE %boolean ] }
 492         { cc>  [ dst temp \ CMOVG %boolean ] }
 493         { cc>= [ dst temp \ CMOVGE %boolean ] }
 494         { cc=  [ dst temp \ CMOVE %boolean ] }
 495         { cc/= [ dst temp \ CMOVNE %boolean ] }
 496     } case ;
 497
 498 M: x86 %compare-imm ( dst src1 src2 cc temp -- )
 499     %compare ;
 500
 501 M:: x86 %compare-branch ( label src1 src2 cc -- )
 502     src1 src2 cc (%compare)
 503     cc order-cc {
 504         { cc<  [ label JL ] }
 505         { cc<= [ label JLE ] }
 506         { cc>  [ label JG ] }
 507         { cc>= [ label JGE ] }
 508         { cc=  [ label JE ] }
 509         { cc/= [ label JNE ] }
 510     } case ;
 511
 512 M: x86 %compare-imm-branch ( label src1 src2 cc -- )
 513     %compare-branch ;
 514
 515 M: x86 %add-float double-rep two-operand ADDSD ;
 516 M: x86 %sub-float double-rep two-operand SUBSD ;
 517 M: x86 %mul-float double-rep two-operand MULSD ;
 518 M: x86 %div-float double-rep two-operand DIVSD ;
 519 M: x86 %min-float double-rep two-operand MINSD ;
 520 M: x86 %max-float double-rep two-operand MAXSD ;
 521 M: x86 %sqrt SQRTSD ;
 522
 523 : %clear-unless-in-place ( dst src -- )
 524     over = [ drop ] [ dup XORPS ] if ;
 525
 526 M: x86 %single>double-float [ %clear-unless-in-place ] [ CVTSS2SD ] 2bi ;
 527 M: x86 %double>single-float [ %clear-unless-in-place ] [ CVTSD2SS ] 2bi ;
 528
 529 M: x86 %integer>float [ drop dup XORPS ] [ CVTSI2SD ] 2bi ;
 530 M: x86 %float>integer CVTTSD2SI ;
 531
 532 : %cmov-float= ( dst src -- )
 533     [
 534         "no-move" define-label
 535
 536         "no-move" get [ JNE ] [ JP ] bi
 537         MOV
 538         "no-move" resolve-label
 539     ] with-scope ;
 540
 541 : %cmov-float/= ( dst src -- )
 542     [
 543         "no-move" define-label
 544         "move" define-label
 545
 546         "move" get JP
 547         "no-move" get JE
 548         "move" resolve-label
 549         MOV
 550         "no-move" resolve-label
 551     ] with-scope ;
 552
 553 :: (%compare-float) ( dst src1 src2 cc temp compare -- )
 554     cc {
 555         { cc<    [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA  %boolean ] }
 556         { cc<=   [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
 557         { cc>    [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA  %boolean ] }
 558         { cc>=   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
 559         { cc=    [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= %boolean ] }
 560         { cc<>   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE %boolean ] }
 561         { cc<>=  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP %boolean ] }
 562         { cc/<   [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
 563         { cc/<=  [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB  %boolean ] }
 564         { cc/>   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
 565         { cc/>=  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB  %boolean ] }
 566         { cc/=   [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= %boolean ] }
 567         { cc/<>  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE  %boolean ] }
 568         { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP  %boolean ] }
 569     } case ; inline
 570
 571 M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
 572     \ COMISD (%compare-float) ;
 573
 574 M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
 575     \ UCOMISD (%compare-float) ;
 576
 577 : %jump-float= ( label -- )
 578     [
 579         "no-jump" define-label
 580         "no-jump" get JP
 581         JE
 582         "no-jump" resolve-label
 583     ] with-scope ;
 584
 585 : %jump-float/= ( label -- )
 586     [ JNE ] [ JP ] bi ;
 587
 588 :: (%compare-float-branch) ( label src1 src2 cc compare -- )
 589     cc {
 590         { cc<    [ src2 src1 \ compare execute( a b -- ) label JA  ] }
 591         { cc<=   [ src2 src1 \ compare execute( a b -- ) label JAE ] }
 592         { cc>    [ src1 src2 \ compare execute( a b -- ) label JA  ] }
 593         { cc>=   [ src1 src2 \ compare execute( a b -- ) label JAE ] }
 594         { cc=    [ src1 src2 \ compare execute( a b -- ) label %jump-float= ] }
 595         { cc<>   [ src1 src2 \ compare execute( a b -- ) label JNE ] }
 596         { cc<>=  [ src1 src2 \ compare execute( a b -- ) label JNP ] }
 597         { cc/<   [ src2 src1 \ compare execute( a b -- ) label JBE ] }
 598         { cc/<=  [ src2 src1 \ compare execute( a b -- ) label JB  ] }
 599         { cc/>   [ src1 src2 \ compare execute( a b -- ) label JBE ] }
 600         { cc/>=  [ src1 src2 \ compare execute( a b -- ) label JB  ] }
 601         { cc/=   [ src1 src2 \ compare execute( a b -- ) label %jump-float/= ] }
 602         { cc/<>  [ src1 src2 \ compare execute( a b -- ) label JE  ] }
 603         { cc/<>= [ src1 src2 \ compare execute( a b -- ) label JP  ] }
 604     } case ;
 605
 606 M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
 607     \ COMISD (%compare-float-branch) ;
 608
 609 M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
 610     \ UCOMISD (%compare-float-branch) ;
 611
 612 MACRO: available-reps ( alist -- )
 613     ! Each SSE version adds new representations and supports
 614     ! all old ones
 615     unzip { } [ append ] accumulate rest swap suffix
 616     [ [ 1quotation ] map ] bi@ zip
 617     reverse [ { } ] suffix
 618     '[ _ cond ] ;
 619
 620 M: x86 %alien-vector-reps
 621     {
 622         { sse? { float-4-rep } }
 623         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 624     } available-reps ;
 625
 626 M: x86 %zero-vector
 627     {
 628         { double-2-rep [ dup XORPS ] }
 629         { float-4-rep [ dup XORPS ] }
 630         [ drop dup PXOR ]
 631     } case ;
 632
 633 M: x86 %zero-vector-reps
 634     {
 635         { sse? { float-4-rep } }
 636         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 637     } available-reps ;
 638
 639 M: x86 %fill-vector
 640     {
 641         { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
 642         { float-4-rep  [ dup [ XORPS ] [ CMPEQPS ] 2bi ] }
 643         [ drop dup PCMPEQB ]
 644     } case ;
 645
 646 M: x86 %fill-vector-reps
 647     {
 648         { sse? { float-4-rep } }
 649         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 650     } available-reps ;
 651
 652 ! M:: x86 %broadcast-vector ( dst src rep -- )
 653 !     rep signed-rep {
 654 !         { float-4-rep [
 655 !             dst src float-4-rep %copy
 656 !             dst dst { 0 0 0 0 } SHUFPS
 657 !         ] }
 658 !         { double-2-rep [
 659 !             dst src MOVDDUP
 660 !         ] }
 661 !         { longlong-2-rep [
 662 !             dst src =
 663 !             [ dst dst PUNPCKLQDQ ]
 664 !             [ dst src { 0 1 0 1 } PSHUFD ]
 665 !             if
 666 !         ] }
 667 !         { int-4-rep [
 668 !             dst src { 0 0 0 0 } PSHUFD
 669 !         ] }
 670 !         { short-8-rep [
 671 !             dst src { 0 0 0 0 } PSHUFLW
 672 !             dst dst PUNPCKLQDQ
 673 !         ] }
 674 !         { char-16-rep [
 675 !             dst src char-16-rep %copy
 676 !             dst dst PUNPCKLBW
 677 !             dst dst { 0 0 0 0 } PSHUFLW
 678 !             dst dst PUNPCKLQDQ
 679 !         ] }
 680 !     } case ;
 681 !
 682 ! M: x86 %broadcast-vector-reps
 683 !     {
 684 !         ! Can't do this with sse1 since it will want to unbox
 685 !         ! a double-precision float and convert to single precision
 686 !         { sse2? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
 687 !     } available-reps ;
 688
 689 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
 690     rep signed-rep {
 691         { float-4-rep [
 692             dst src1 float-4-rep %copy
 693             dst src2 UNPCKLPS
 694             src3 src4 UNPCKLPS
 695             dst src3 MOVLHPS
 696         ] }
 697         { int-4-rep [
 698             dst src1 int-4-rep %copy
 699             dst src2 PUNPCKLDQ
 700             src3 src4 PUNPCKLDQ
 701             dst src3 PUNPCKLQDQ
 702         ] }
 703     } case ;
 704
 705 M: x86 %gather-vector-4-reps
 706     {
 707         ! Can't do this with sse1 since it will want to unbox
 708         ! double-precision floats and convert to single precision
 709         { sse2? { float-4-rep int-4-rep uint-4-rep } }
 710     } available-reps ;
 711
 712 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
 713     rep signed-rep {
 714         { double-2-rep [
 715             dst src1 double-2-rep %copy
 716             dst src2 MOVLHPS
 717         ] }
 718         { longlong-2-rep [
 719             dst src1 longlong-2-rep %copy
 720             dst src2 PUNPCKLQDQ
 721         ] }
 722     } case ;
 723
 724 M: x86 %gather-vector-2-reps
 725     {
 726         { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } }
 727     } available-reps ;
 728
 729 : sse1-float-4-shuffle ( dst shuffle -- )
 730     {
 731         { { 0 1 2 3 } [ drop ] }
 732         { { 0 1 0 1 } [ dup MOVLHPS ] }
 733         { { 2 3 2 3 } [ dup MOVHLPS ] }
 734         { { 0 0 1 1 } [ dup UNPCKLPS ] }
 735         { { 2 2 3 3 } [ dup UNPCKHPS ] }
 736         [ dupd SHUFPS ]
 737     } case ;
 738
 739 : float-4-shuffle ( dst shuffle -- )
 740     sse3? [
 741         {
 742             { { 0 0 2 2 } [ dup MOVSLDUP ] }
 743             { { 1 1 3 3 } [ dup MOVSHDUP ] }
 744             [ sse1-float-4-shuffle ]
 745         } case
 746     ] [ sse1-float-4-shuffle ] if ;
 747
 748 : int-4-shuffle ( dst shuffle -- )
 749     {
 750         { { 0 1 2 3 } [ drop ] }
 751         { { 0 0 1 1 } [ dup PUNPCKLDQ ] }
 752         { { 2 2 3 3 } [ dup PUNPCKHDQ ] }
 753         { { 0 1 0 1 } [ dup PUNPCKLQDQ ] }
 754         { { 2 3 2 3 } [ dup PUNPCKHQDQ ] }
 755         [ dupd PSHUFD ]
 756     } case ;
 757
 758 : longlong-2-shuffle ( dst shuffle -- )
 759     first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;
 760
 761 : >float-4-shuffle ( double-2-shuffle -- float-4-shuffle )
 762     [ 2 * { 0 1 } n+v ] map concat ;
 763
 764 M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
 765     dst src rep %copy
 766     dst shuffle rep signed-rep {
 767         { double-2-rep [ >float-4-shuffle float-4-shuffle ] }
 768         { float-4-rep [ float-4-shuffle ] }
 769         { int-4-rep [ int-4-shuffle ] }
 770         { longlong-2-rep [ longlong-2-shuffle ] }
 771     } case ;
 772
 773 M: x86 %shuffle-vector-imm-reps
 774     {
 775         { sse? { float-4-rep } }
 776         { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 777     } available-reps ;
 778
 779 M: x86 %shuffle-vector ( dst src shuffle rep -- )
 780     two-operand PSHUFB ;
 781
 782 M: x86 %shuffle-vector-reps
 783     {
 784         { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
 785     } available-reps ;
 786
 787 M: x86 %merge-vector-head
 788     [ two-operand ] keep
 789     signed-rep {
 790         { double-2-rep   [ MOVLHPS ] }
 791         { float-4-rep    [ UNPCKLPS ] }
 792         { longlong-2-rep [ PUNPCKLQDQ ] }
 793         { int-4-rep      [ PUNPCKLDQ ] }
 794         { short-8-rep    [ PUNPCKLWD ] }
 795         { char-16-rep    [ PUNPCKLBW ] }
 796     } case ;
 797
 798 M: x86 %merge-vector-tail
 799     [ two-operand ] keep
 800     signed-rep {
 801         { double-2-rep   [ UNPCKHPD ] }
 802         { float-4-rep    [ UNPCKHPS ] }
 803         { longlong-2-rep [ PUNPCKHQDQ ] }
 804         { int-4-rep      [ PUNPCKHDQ ] }
 805         { short-8-rep    [ PUNPCKHWD ] }
 806         { char-16-rep    [ PUNPCKHBW ] }
 807     } case ;
 808
 809 M: x86 %merge-vector-reps
 810     {
 811         { sse? { float-4-rep } }
 812         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 813     } available-reps ;
 814
 815 M: x86 %signed-pack-vector
 816     [ two-operand ] keep
 817     {
 818         { int-4-rep    [ PACKSSDW ] }
 819         { short-8-rep  [ PACKSSWB ] }
 820     } case ;
 821
 822 M: x86 %signed-pack-vector-reps
 823     {
 824         { sse2? { short-8-rep int-4-rep } }
 825     } available-reps ;
 826
 827 M: x86 %unsigned-pack-vector
 828     [ two-operand ] keep
 829     signed-rep {
 830         { int-4-rep   [ PACKUSDW ] }
 831         { short-8-rep [ PACKUSWB ] }
 832     } case ;
 833
 834 M: x86 %unsigned-pack-vector-reps
 835     {
 836         { sse2? { short-8-rep } }
 837         { sse4.1? { int-4-rep } }
 838     } available-reps ;
 839
 840 M: x86 %tail>head-vector ( dst src rep -- )
 841     dup {
 842         { float-4-rep [ drop UNPCKHPD ] }
 843         { double-2-rep [ drop UNPCKHPD ] }
 844         [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ]
 845     } case ;
 846
 847 M: x86 %unpack-vector-head ( dst src rep -- )
 848     {
 849         { char-16-rep  [ PMOVSXBW ] }
 850         { uchar-16-rep [ PMOVZXBW ] }
 851         { short-8-rep  [ PMOVSXWD ] }
 852         { ushort-8-rep [ PMOVZXWD ] }
 853         { int-4-rep    [ PMOVSXDQ ] }
 854         { uint-4-rep   [ PMOVZXDQ ] }
 855         { float-4-rep  [ CVTPS2PD ] }
 856     } case ;
 857
 858 M: x86 %unpack-vector-head-reps ( -- reps )
 859     {
 860         { sse2? { float-4-rep } }
 861         { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
 862     } available-reps ;
 863
 864 M: x86 %integer>float-vector ( dst src rep -- )
 865     {
 866         { int-4-rep [ CVTDQ2PS ] }
 867     } case ;
 868
 869 M: x86 %integer>float-vector-reps
 870     {
 871         { sse2? { int-4-rep } }
 872     } available-reps ;
 873
 874 M: x86 %float>integer-vector ( dst src rep -- )
 875     {
 876         { float-4-rep [ CVTTPS2DQ ] }
 877     } case ;
 878
 879 M: x86 %float>integer-vector-reps
 880     {
 881         { sse2? { float-4-rep } }
 882     } available-reps ;
 883
 884 : (%compare-float-vector) ( dst src rep double single -- )
 885     [ double-2-rep eq? ] 2dip if ; inline
 886 : %compare-float-vector ( dst src rep cc -- )
 887     {
 888         { cc<    [ [ CMPLTPD    ] [ CMPLTPS    ] (%compare-float-vector) ] }
 889         { cc<=   [ [ CMPLEPD    ] [ CMPLEPS    ] (%compare-float-vector) ] }
 890         { cc=    [ [ CMPEQPD    ] [ CMPEQPS    ] (%compare-float-vector) ] }
 891         { cc<>=  [ [ CMPORDPD   ] [ CMPORDPS   ] (%compare-float-vector) ] }
 892         { cc/<   [ [ CMPNLTPD   ] [ CMPNLTPS   ] (%compare-float-vector) ] }
 893         { cc/<=  [ [ CMPNLEPD   ] [ CMPNLEPS   ] (%compare-float-vector) ] }
 894         { cc/=   [ [ CMPNEQPD   ] [ CMPNEQPS   ] (%compare-float-vector) ] }
 895         { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] }
 896     } case ;
 897
 898 :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
 899     rep signed-rep :> rep'
 900     dst src rep' {
 901         { longlong-2-rep [ int64 call ] }
 902         { int-4-rep      [ int32 call ] }
 903         { short-8-rep    [ int16 call ] }
 904         { char-16-rep    [ int8  call ] }
 905     } case ; inline
 906 : %compare-int-vector ( dst src rep cc -- )
 907     {
 908         { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] }
 909         { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] }
 910     } case ;
 911
 912 M: x86 %compare-vector ( dst src1 src2 rep cc -- )
 913     [ [ two-operand ] keep ] dip
 914     over float-vector-rep?
 915     [ %compare-float-vector ]
 916     [ %compare-int-vector ] if ;
 917
 918 : %compare-vector-eq-reps ( -- reps )
 919     {
 920         { sse? { float-4-rep } }
 921         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
 922         { sse4.1? { longlong-2-rep ulonglong-2-rep } }
 923     } available-reps ;
 924 : %compare-vector-ord-reps ( -- reps )
 925     {
 926         { sse? { float-4-rep } }
 927         { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } }
 928         { sse4.2? { longlong-2-rep } }
 929     } available-reps ;
 930
 931 M: x86 %compare-vector-reps
 932     {
 933         { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] }
 934         [ drop %compare-vector-ord-reps ]
 935     } cond ;
 936
 937 : %compare-float-vector-ccs ( cc -- ccs not? )
 938     {
 939         { cc<    [ { { cc<  f   }              } f ] }
 940         { cc<=   [ { { cc<= f   }              } f ] }
 941         { cc>    [ { { cc<  t   }              } f ] }
 942         { cc>=   [ { { cc<= t   }              } f ] }
 943         { cc=    [ { { cc=  f   }              } f ] }
 944         { cc<>   [ { { cc<  f   } { cc<    t } } f ] }
 945         { cc<>=  [ { { cc<>= f  }              } f ] }
 946         { cc/<   [ { { cc/<  f  }              } f ] }
 947         { cc/<=  [ { { cc/<= f  }              } f ] }
 948         { cc/>   [ { { cc/<  t  }              } f ] }
 949         { cc/>=  [ { { cc/<= t  }              } f ] }
 950         { cc/=   [ { { cc/=  f  }              } f ] }
 951         { cc/<>  [ { { cc/=  f  } { cc/<>= f } } f ] }
 952         { cc/<>= [ { { cc/<>= f }              } f ] }
 953     } case ;
 954
 955 : %compare-int-vector-ccs ( cc -- ccs not? )
 956     order-cc {
 957         { cc<    [ { { cc> t } } f ] }
 958         { cc<=   [ { { cc> f } } t ] }
 959         { cc>    [ { { cc> f } } f ] }
 960         { cc>=   [ { { cc> t } } t ] }
 961         { cc=    [ { { cc= f } } f ] }
 962         { cc/=   [ { { cc= f } } t ] }
 963         { t      [ {           } t ] }
 964         { f      [ {           } f ] }
 965     } case ;
 966
 967 M: x86 %compare-vector-ccs
 968     swap float-vector-rep?
 969     [ %compare-float-vector-ccs ]
 970     [ %compare-int-vector-ccs ] if ;
 971
 972 :: %test-vector-mask ( dst temp mask vcc -- )
 973     vcc {
 974         { vcc-any    [ dst dst TEST dst temp \ CMOVNE %boolean ] }
 975         { vcc-none   [ dst dst TEST dst temp \ CMOVE  %boolean ] }
 976         { vcc-all    [ dst mask CMP dst temp \ CMOVE  %boolean ] }
 977         { vcc-notall [ dst mask CMP dst temp \ CMOVNE %boolean ] }
 978     } case ;
 979
 980 : %move-vector-mask ( dst src rep -- mask )
 981     {
 982         { double-2-rep [ MOVMSKPS HEX: f ] }
 983         { float-4-rep  [ MOVMSKPS HEX: f ] }
 984         [ drop PMOVMSKB HEX: ffff ]
 985     } case ;
 986
 987 M:: x86 %test-vector ( dst src temp rep vcc -- )
 988     dst src rep %move-vector-mask :> mask
 989     dst temp mask vcc %test-vector-mask ;
 990
 991 :: %test-vector-mask-branch ( label temp mask vcc -- )
 992     vcc {
 993         { vcc-any    [ temp temp TEST label JNE ] }
 994         { vcc-none   [ temp temp TEST label JE ] }
 995         { vcc-all    [ temp mask CMP label JE ] }
 996         { vcc-notall [ temp mask CMP label JNE ] }
 997     } case ;
 998
 999 M:: x86 %test-vector-branch ( label src temp rep vcc -- )
1000     temp src rep %move-vector-mask :> mask
1001     label temp mask vcc %test-vector-mask-branch ;
1002
1003 M: x86 %test-vector-reps
1004     {
1005         { sse? { float-4-rep } }
1006         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1007     } available-reps ;
1008
1009 M: x86 %add-vector ( dst src1 src2 rep -- )
1010     [ two-operand ] keep
1011     {
1012         { float-4-rep [ ADDPS ] }
1013         { double-2-rep [ ADDPD ] }
1014         { char-16-rep [ PADDB ] }
1015         { uchar-16-rep [ PADDB ] }
1016         { short-8-rep [ PADDW ] }
1017         { ushort-8-rep [ PADDW ] }
1018         { int-4-rep [ PADDD ] }
1019         { uint-4-rep [ PADDD ] }
1020         { longlong-2-rep [ PADDQ ] }
1021         { ulonglong-2-rep [ PADDQ ] }
1022     } case ;
1023
1024 M: x86 %add-vector-reps
1025     {
1026         { sse? { float-4-rep } }
1027         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1028     } available-reps ;
1029
1030 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
1031     [ two-operand ] keep
1032     {
1033         { char-16-rep [ PADDSB ] }
1034         { uchar-16-rep [ PADDUSB ] }
1035         { short-8-rep [ PADDSW ] }
1036         { ushort-8-rep [ PADDUSW ] }
1037     } case ;
1038
1039 M: x86 %saturated-add-vector-reps
1040     {
1041         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
1042     } available-reps ;
1043
1044 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
1045     [ two-operand ] keep
1046     {
1047         { float-4-rep [ ADDSUBPS ] }
1048         { double-2-rep [ ADDSUBPD ] }
1049     } case ;
1050
1051 M: x86 %add-sub-vector-reps
1052     {
1053         { sse3? { float-4-rep double-2-rep } }
1054     } available-reps ;
1055
1056 M: x86 %sub-vector ( dst src1 src2 rep -- )
1057     [ two-operand ] keep
1058     {
1059         { float-4-rep [ SUBPS ] }
1060         { double-2-rep [ SUBPD ] }
1061         { char-16-rep [ PSUBB ] }
1062         { uchar-16-rep [ PSUBB ] }
1063         { short-8-rep [ PSUBW ] }
1064         { ushort-8-rep [ PSUBW ] }
1065         { int-4-rep [ PSUBD ] }
1066         { uint-4-rep [ PSUBD ] }
1067         { longlong-2-rep [ PSUBQ ] }
1068         { ulonglong-2-rep [ PSUBQ ] }
1069     } case ;
1070
1071 M: x86 %sub-vector-reps
1072     {
1073         { sse? { float-4-rep } }
1074         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1075     } available-reps ;
1076
1077 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
1078     [ two-operand ] keep
1079     {
1080         { char-16-rep [ PSUBSB ] }
1081         { uchar-16-rep [ PSUBUSB ] }
1082         { short-8-rep [ PSUBSW ] }
1083         { ushort-8-rep [ PSUBUSW ] }
1084     } case ;
1085
1086 M: x86 %saturated-sub-vector-reps
1087     {
1088         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
1089     } available-reps ;
1090
1091 M: x86 %mul-vector ( dst src1 src2 rep -- )
1092     [ two-operand ] keep
1093     {
1094         { float-4-rep [ MULPS ] }
1095         { double-2-rep [ MULPD ] }
1096         { short-8-rep [ PMULLW ] }
1097         { ushort-8-rep [ PMULLW ] }
1098         { int-4-rep [ PMULLD ] }
1099         { uint-4-rep [ PMULLD ] }
1100     } case ;
1101
1102 M: x86 %mul-vector-reps
1103     {
1104         { sse? { float-4-rep } }
1105         { sse2? { double-2-rep short-8-rep ushort-8-rep } }
1106         { sse4.1? { int-4-rep uint-4-rep } }
1107     } available-reps ;
1108
1109 M: x86 %div-vector ( dst src1 src2 rep -- )
1110     [ two-operand ] keep
1111     {
1112         { float-4-rep [ DIVPS ] }
1113         { double-2-rep [ DIVPD ] }
1114     } case ;
1115
1116 M: x86 %div-vector-reps
1117     {
1118         { sse? { float-4-rep } }
1119         { sse2? { double-2-rep } }
1120     } available-reps ;
1121
1122 M: x86 %min-vector ( dst src1 src2 rep -- )
1123     [ two-operand ] keep
1124     {
1125         { char-16-rep [ PMINSB ] }
1126         { uchar-16-rep [ PMINUB ] }
1127         { short-8-rep [ PMINSW ] }
1128         { ushort-8-rep [ PMINUW ] }
1129         { int-4-rep [ PMINSD ] }
1130         { uint-4-rep [ PMINUD ] }
1131         { float-4-rep [ MINPS ] }
1132         { double-2-rep [ MINPD ] }
1133     } case ;
1134
1135 M: x86 %min-vector-reps
1136     {
1137         { sse? { float-4-rep } }
1138         { sse2? { uchar-16-rep short-8-rep double-2-rep } }
1139         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
1140     } available-reps ;
1141
1142 M: x86 %max-vector ( dst src1 src2 rep -- )
1143     [ two-operand ] keep
1144     {
1145         { char-16-rep [ PMAXSB ] }
1146         { uchar-16-rep [ PMAXUB ] }
1147         { short-8-rep [ PMAXSW ] }
1148         { ushort-8-rep [ PMAXUW ] }
1149         { int-4-rep [ PMAXSD ] }
1150         { uint-4-rep [ PMAXUD ] }
1151         { float-4-rep [ MAXPS ] }
1152         { double-2-rep [ MAXPD ] }
1153     } case ;
1154
1155 M: x86 %max-vector-reps
1156     {
1157         { sse? { float-4-rep } }
1158         { sse2? { uchar-16-rep short-8-rep double-2-rep } }
1159         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
1160     } available-reps ;
1161
1162 M: x86 %dot-vector
1163     [ two-operand ] keep
1164     {
1165         { float-4-rep [ HEX: ff DPPS ] }
1166         { double-2-rep [ HEX: ff DPPD ] }
1167     } case ;
1168
1169 M: x86 %dot-vector-reps
1170     {
1171         { sse4.1? { float-4-rep double-2-rep } }
1172     } available-reps ;
1173
1174 M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
1175     [ two-operand ] keep
1176     signed-rep {
1177         { float-4-rep  [ HADDPS ] }
1178         { double-2-rep [ HADDPD ] }
1179         { int-4-rep    [ PHADDD ] }
1180         { short-8-rep  [ PHADDW ] }
1181     } case ;
1182
1183 M: x86 %horizontal-add-vector-reps
1184     {
1185         { sse3? { float-4-rep double-2-rep } }
1186         { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } }
1187     } available-reps ;
1188
1189 M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- )
1190     two-operand PSLLDQ ;
1191
1192 M: x86 %horizontal-shl-vector-imm-reps
1193     {
1194         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
1195     } available-reps ;
1196
1197 M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- )
1198     two-operand PSRLDQ ;
1199
1200 M: x86 %horizontal-shr-vector-imm-reps
1201     {
1202         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } }
1203     } available-reps ;
1204
1205 M: x86 %abs-vector ( dst src rep -- )
1206     {
1207         { char-16-rep [ PABSB ] }
1208         { short-8-rep [ PABSW ] }
1209         { int-4-rep [ PABSD ] }
1210     } case ;
1211
1212 M: x86 %abs-vector-reps
1213     {
1214         { ssse3? { char-16-rep short-8-rep int-4-rep } }
1215     } available-reps ;
1216
1217 M: x86 %sqrt-vector ( dst src rep -- )
1218     {
1219         { float-4-rep [ SQRTPS ] }
1220         { double-2-rep [ SQRTPD ] }
1221     } case ;
1222
1223 M: x86 %sqrt-vector-reps
1224     {
1225         { sse? { float-4-rep } }
1226         { sse2? { double-2-rep } }
1227     } available-reps ;
1228
1229 M: x86 %and-vector ( dst src1 src2 rep -- )
1230     [ two-operand ] keep
1231     {
1232         { float-4-rep [ ANDPS ] }
1233         { double-2-rep [ ANDPS ] }
1234         [ drop PAND ]
1235     } case ;
1236
1237 M: x86 %and-vector-reps
1238     {
1239         { sse? { float-4-rep } }
1240         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1241     } available-reps ;
1242
1243 M: x86 %andn-vector ( dst src1 src2 rep -- )
1244     [ two-operand ] keep
1245     {
1246         { float-4-rep [ ANDNPS ] }
1247         { double-2-rep [ ANDNPS ] }
1248         [ drop PANDN ]
1249     } case ;
1250
1251 M: x86 %andn-vector-reps
1252     {
1253         { sse? { float-4-rep } }
1254         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1255     } available-reps ;
1256
1257 M: x86 %or-vector ( dst src1 src2 rep -- )
1258     [ two-operand ] keep
1259     {
1260         { float-4-rep [ ORPS ] }
1261         { double-2-rep [ ORPS ] }
1262         [ drop POR ]
1263     } case ;
1264
1265 M: x86 %or-vector-reps
1266     {
1267         { sse? { float-4-rep } }
1268         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1269     } available-reps ;
1270
1271 M: x86 %xor-vector ( dst src1 src2 rep -- )
1272     [ two-operand ] keep
1273     {
1274         { float-4-rep [ XORPS ] }
1275         { double-2-rep [ XORPS ] }
1276         [ drop PXOR ]
1277     } case ;
1278
1279 M: x86 %xor-vector-reps
1280     {
1281         { sse? { float-4-rep } }
1282         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1283     } available-reps ;
1284
1285 M: x86 %shl-vector ( dst src1 src2 rep -- )
1286     [ two-operand ] keep
1287     {
1288         { short-8-rep [ PSLLW ] }
1289         { ushort-8-rep [ PSLLW ] }
1290         { int-4-rep [ PSLLD ] }
1291         { uint-4-rep [ PSLLD ] }
1292         { longlong-2-rep [ PSLLQ ] }
1293         { ulonglong-2-rep [ PSLLQ ] }
1294     } case ;
1295
1296 M: x86 %shl-vector-reps
1297     {
1298         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
1299     } available-reps ;
1300
1301 M: x86 %shr-vector ( dst src1 src2 rep -- )
1302     [ two-operand ] keep
1303     {
1304         { short-8-rep [ PSRAW ] }
1305         { ushort-8-rep [ PSRLW ] }
1306         { int-4-rep [ PSRAD ] }
1307         { uint-4-rep [ PSRLD ] }
1308         { ulonglong-2-rep [ PSRLQ ] }
1309     } case ;
1310
1311 M: x86 %shr-vector-reps
1312     {
1313         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
1314     } available-reps ;
1315
1316 M: x86 %shl-vector-imm %shl-vector ;
1317 M: x86 %shl-vector-imm-reps %shl-vector-reps ;
1318 M: x86 %shr-vector-imm %shr-vector ;
1319 M: x86 %shr-vector-imm-reps %shr-vector-reps ;
1320
1321 : scalar-sized-reg ( reg rep -- reg' )
1322     rep-size 8 * n-bit-version-of ;
1323
1324 M: x86 %integer>scalar drop MOVD ;
1325
1326 ! XXX the longlong versions won't work on x86.32
1327 M:: x86 %scalar>integer ( dst src rep -- )
1328     rep {
1329         { longlong-scalar-rep [
1330             dst src MOVD
1331         ] }
1332         { ulonglong-scalar-rep [
1333             dst src MOVD
1334         ] }
1335         { int-scalar-rep [
1336             dst 32-bit-version-of src MOVD
1337             dst dst 32-bit-version-of
1338             2dup eq? [ 2drop ] [ MOVSX ] if
1339         ] }
1340         { uint-scalar-rep [
1341             dst 32-bit-version-of src MOVD
1342         ] }
1343         { short-scalar-rep [
1344             dst 32-bit-version-of src MOVD
1345             dst dst 16-bit-version-of MOVSX
1346         ] }
1347         { ushort-scalar-rep [
1348             dst 32-bit-version-of src MOVD
1349             dst dst 16-bit-version-of MOVZX
1350         ] }
1351         { char-scalar-rep [
1352             dst 32-bit-version-of src MOVD
1353             dst { } 8 [| tmp-dst |
1354                 tmp-dst dst int-rep %copy
1355                 tmp-dst tmp-dst 8-bit-version-of MOVSX
1356                 dst tmp-dst int-rep %copy
1357             ] with-small-register
1358         ] }
1359         { uchar-scalar-rep [
1360             dst 32-bit-version-of src MOVD
1361             dst { } 8 [| tmp-dst |
1362                 tmp-dst dst int-rep %copy
1363                 tmp-dst tmp-dst 8-bit-version-of MOVZX
1364                 dst tmp-dst int-rep %copy
1365             ] with-small-register
1366         ] }
1367     } case ;
1368
1369 M: x86 %vector>scalar %copy ;
1370 M: x86 %scalar>vector %copy ;
1371
1372 M:: x86 %spill ( src rep dst -- ) dst src rep %copy ;
1373 M:: x86 %reload ( dst rep src -- ) dst src rep %copy ;
1374
1375 M: x86 %loop-entry 16 code-alignment [ NOP ] times ;
1376
1377 M:: x86 %save-context ( temp1 temp2 callback-allowed? -- )
1378     #! Save Factor stack pointers in case the C code calls a
1379     #! callback which does a GC, which must reliably trace
1380     #! all roots.
1381     temp1 "stack_chain" %vm-field-ptr
1382     temp1 temp1 [] MOV
1383     temp2 stack-reg cell neg [+] LEA
1384     temp1 [] temp2 MOV
1385     callback-allowed? [
1386         temp1 2 cells [+] ds-reg MOV
1387         temp1 3 cells [+] rs-reg MOV
1388     ] when ;
1389
1390 M: x86 value-struct? drop t ;
1391
1392 M: x86 immediate-arithmetic? ( n -- ? )
1393     HEX: -80000000 HEX: 7fffffff between? ;
1394
1395 M: x86 immediate-bitwise? ( n -- ? )
1396     HEX: -80000000 HEX: 7fffffff between? ;
1397
1398 : next-stack@ ( n -- operand )
1399     #! nth parameter from the next stack frame. Used to box
1400     #! input values to callbacks; the callback has its own
1401     #! stack frame set up, and we want to read the frame
1402     #! set up by the caller.
1403     stack-frame get total-size>> + stack@ ;
1404
1405 enable-min/max
1406 enable-fixnum-log2
1407
1408 :: install-sse2-check ( -- )
1409     [
1410         sse-version 20 < [
1411             "This image was built to use SSE2 but your CPU does not support it." print
1412             "You will need to bootstrap Factor again." print
1413             flush
1414             1 exit
1415         ] when
1416     ] "cpu.x86" add-startup-hook ;
1417
1418 : enable-sse2 ( version -- )
1419     20 >= [
1420         enable-float-intrinsics
1421         enable-float-functions
1422         enable-float-min/max
1423         enable-fsqrt
1424         install-sse2-check
1425     ] when ;
1426
1427 : check-sse ( -- )
1428     [ { sse_version } compile ] with-optimizer
1429     "Checking for multimedia extensions: " write sse-version
1430     [ sse-string write " detected" print ] [ enable-sse2 ] bi ;