basis/cpu/x86/x86.factor

   1 ! Copyright (C) 2005, 2009 Slava Pestov.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors assocs alien alien.c-types arrays strings
   4 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
   5 cpu.x86.features cpu.x86.features.private cpu.architecture kernel
   6 kernel.private math memory namespaces make sequences words system
   7 layouts combinators math.order fry locals compiler.constants
   8 byte-arrays io macros quotations compiler compiler.units init vm
   9 compiler.cfg.registers
  10 compiler.cfg.instructions
  11 compiler.cfg.intrinsics
  12 compiler.cfg.comparisons
  13 compiler.cfg.stack-frame
  14 compiler.codegen.fixup ;
  15 FROM: layouts => cell ;
  16 FROM: math => float ;
  17 IN: cpu.x86
  18
  19 ! Add some methods to the assembler to be more useful to the backend
  20 M: label JMP 0 JMP rc-relative label-fixup ;
  21 M: label JUMPcc [ 0 ] dip JUMPcc rc-relative label-fixup ;
  22
  23 M: x86 vector-regs float-regs ;
  24
  25 HOOK: stack-reg cpu ( -- reg )
  26
  27 HOOK: reserved-area-size cpu ( -- n )
  28
  29 : stack@ ( n -- op ) stack-reg swap [+] ;
  30
  31 : param@ ( n -- op ) reserved-area-size + stack@ ;
  32
  33 : spill@ ( n -- op ) spill-offset param@ ;
  34
  35 : gc-root@ ( n -- op ) gc-root-offset param@ ;
  36
  37 : decr-stack-reg ( n -- )
  38     dup 0 = [ drop ] [ stack-reg swap SUB ] if ;
  39
  40 : incr-stack-reg ( n -- )
  41     dup 0 = [ drop ] [ stack-reg swap ADD ] if ;
  42
  43 : align-stack ( n -- n' )
  44     os macosx? cpu x86.64? or [ 16 align ] when ;
  45
  46 M: x86 stack-frame-size ( stack-frame -- i )
  47     (stack-frame-size) 3 cells reserved-area-size + + align-stack ;
  48
  49 ! Must be a volatile register not used for parameter passing, for safe
  50 ! use in calls in and out of C
  51 HOOK: temp-reg cpu ( -- reg )
  52
  53 HOOK: pic-tail-reg cpu ( -- reg )
  54
  55 M: x86 %load-immediate dup 0 = [ drop dup XOR ] [ MOV ] if ;
  56
  57 M: x86 %load-reference swap 0 MOV rc-absolute-cell rel-immediate ;
  58
  59 HOOK: ds-reg cpu ( -- reg )
  60 HOOK: rs-reg cpu ( -- reg )
  61
  62 : reg-stack ( n reg -- op ) swap cells neg [+] ;
  63
  64 GENERIC: loc>operand ( loc -- operand )
  65
  66 M: ds-loc loc>operand n>> ds-reg reg-stack ;
  67 M: rs-loc loc>operand n>> rs-reg reg-stack ;
  68
  69 M: x86 %peek loc>operand MOV ;
  70 M: x86 %replace loc>operand swap MOV ;
  71 : (%inc) ( n reg -- ) swap cells dup 0 > [ ADD ] [ neg SUB ] if ; inline
  72 M: x86 %inc-d ( n -- ) ds-reg (%inc) ;
  73 M: x86 %inc-r ( n -- ) rs-reg (%inc) ;
  74
  75 M: x86 %call ( word -- ) 0 CALL rc-relative rel-word-pic ;
  76
  77 : xt-tail-pic-offset ( -- n )
  78     #! See the comment in vm/cpu-x86.hpp
  79     cell 4 + 1 + ; inline
  80
  81 M: x86 %jump ( word -- )
  82     pic-tail-reg 0 MOV xt-tail-pic-offset rc-absolute-cell rel-here
  83     0 JMP rc-relative rel-word-pic-tail ;
  84
  85 M: x86 %jump-label ( label -- ) 0 JMP rc-relative label-fixup ;
  86
  87 M: x86 %return ( -- ) 0 RET ;
  88
  89 : code-alignment ( align -- n )
  90     [ building get length dup ] dip align swap - ;
  91
  92 : align-code ( n -- )
  93     0 <repetition> % ;
  94
  95 :: (%slot-imm) ( obj slot tag -- op )
  96     obj slot cells tag - [+] ; inline
  97
  98 M: x86 %slot ( dst obj slot -- ) [+] MOV ;
  99 M: x86 %slot-imm ( dst obj slot tag -- ) (%slot-imm) MOV ;
 100 M: x86 %set-slot ( src obj slot -- ) [+] swap MOV ;
 101 M: x86 %set-slot-imm ( src obj slot tag -- ) (%slot-imm) swap MOV ;
 102
 103 :: two-operand ( dst src1 src2 rep -- dst src )
 104     dst src2 eq? dst src1 eq? not and [ "Cannot handle this case" throw ] when
 105     dst src1 rep %copy
 106     dst src2 ; inline
 107
 108 :: one-operand ( dst src rep -- dst )
 109     dst src rep %copy
 110     dst ; inline
 111
 112 M: x86 %add     2over eq? [ nip ADD ] [ [+] LEA ] if ;
 113 M: x86 %add-imm 2over eq? [ nip ADD ] [ [+] LEA ] if ;
 114 M: x86 %sub     int-rep two-operand SUB ;
 115 M: x86 %sub-imm 2over eq? [ nip SUB ] [ neg [+] LEA ] if ;
 116 M: x86 %mul     int-rep two-operand swap IMUL2 ;
 117 M: x86 %mul-imm IMUL3 ;
 118 M: x86 %and     int-rep two-operand AND ;
 119 M: x86 %and-imm int-rep two-operand AND ;
 120 M: x86 %or      int-rep two-operand OR ;
 121 M: x86 %or-imm  int-rep two-operand OR ;
 122 M: x86 %xor     int-rep two-operand XOR ;
 123 M: x86 %xor-imm int-rep two-operand XOR ;
 124 M: x86 %shl-imm int-rep two-operand SHL ;
 125 M: x86 %shr-imm int-rep two-operand SHR ;
 126 M: x86 %sar-imm int-rep two-operand SAR ;
 127
 128 M: x86 %min     int-rep two-operand [ CMP ] [ CMOVG ] 2bi ;
 129 M: x86 %max     int-rep two-operand [ CMP ] [ CMOVL ] 2bi ;
 130
 131 M: x86 %not     int-rep one-operand NOT ;
 132 M: x86 %log2    BSR ;
 133
 134 GENERIC: copy-register* ( dst src rep -- )
 135
 136 M: int-rep copy-register* drop MOV ;
 137 M: tagged-rep copy-register* drop MOV ;
 138 M: float-rep copy-register* drop MOVSS ;
 139 M: double-rep copy-register* drop MOVSD ;
 140 M: float-4-rep copy-register* drop MOVUPS ;
 141 M: double-2-rep copy-register* drop MOVUPD ;
 142 M: vector-rep copy-register* drop MOVDQU ;
 143
 144 M: x86 %copy ( dst src rep -- )
 145     2over eq? [ 3drop ] [
 146         [ [ dup spill-slot? [ n>> spill@ ] when ] bi@ ] dip
 147         copy-register*
 148     ] if ;
 149
 150 M: x86 %fixnum-add ( label dst src1 src2 -- )
 151     int-rep two-operand ADD JO ;
 152
 153 M: x86 %fixnum-sub ( label dst src1 src2 -- )
 154     int-rep two-operand SUB JO ;
 155
 156 M: x86 %fixnum-mul ( label dst src1 src2 -- )
 157     int-rep two-operand swap IMUL2 JO ;
 158
 159 M: x86 %unbox-alien ( dst src -- )
 160     alien-offset [+] MOV ;
 161
 162 M:: x86 %unbox-any-c-ptr ( dst src temp -- )
 163     [
 164         { "is-byte-array" "end" "start" } [ define-label ] each
 165         dst 0 MOV
 166         temp src MOV
 167         ! We come back here with displaced aliens
 168         "start" resolve-label
 169         ! Is the object f?
 170         temp \ f tag-number CMP
 171         "end" get JE
 172         ! Is the object an alien?
 173         temp header-offset [+] alien type-number tag-fixnum CMP
 174         "is-byte-array" get JNE
 175         ! If so, load the offset and add it to the address
 176         dst temp alien-offset [+] ADD
 177         ! Now recurse on the underlying alien
 178         temp temp underlying-alien-offset [+] MOV
 179         "start" get JMP
 180         "is-byte-array" resolve-label
 181         ! Add byte array address to address being computed
 182         dst temp ADD
 183         ! Add an offset to start of byte array's data
 184         dst byte-array-offset ADD
 185         "end" resolve-label
 186     ] with-scope ;
 187
 188 : alien@ ( reg n -- op ) cells alien tag-number - [+] ;
 189
 190 :: %allot-alien ( dst displacement base temp -- )
 191     dst 4 cells alien temp %allot
 192     dst 1 alien@ base MOV ! alien
 193     dst 2 alien@ \ f tag-number MOV ! expired
 194     dst 3 alien@ displacement MOV ! displacement
 195     ;
 196
 197 M:: x86 %box-alien ( dst src temp -- )
 198     [
 199         "end" define-label
 200         dst \ f tag-number MOV
 201         src 0 CMP
 202         "end" get JE
 203         dst src \ f tag-number temp %allot-alien
 204         "end" resolve-label
 205     ] with-scope ;
 206
 207 M:: x86 %box-displaced-alien ( dst displacement base displacement' base' base-class -- )
 208     [
 209         "end" define-label
 210         "ok" define-label
 211         ! If displacement is zero, return the base
 212         dst base MOV
 213         displacement 0 CMP
 214         "end" get JE
 215         ! Quickly use displacement' before its needed for real, as allot temporary
 216         dst 4 cells alien displacement' %allot
 217         ! If base is already a displaced alien, unpack it
 218         base' base MOV
 219         displacement' displacement MOV
 220         base \ f tag-number CMP
 221         "ok" get JE
 222         base header-offset [+] alien type-number tag-fixnum CMP
 223         "ok" get JNE
 224         ! displacement += base.displacement
 225         displacement' base 3 alien@ ADD
 226         ! base = base.base
 227         base' base 1 alien@ MOV
 228         "ok" resolve-label
 229         dst 1 alien@ base' MOV ! alien
 230         dst 2 alien@ \ f tag-number MOV ! expired
 231         dst 3 alien@ displacement' MOV ! displacement
 232         "end" resolve-label
 233     ] with-scope ;
 234
 235 ! The 'small-reg' mess is pretty crappy, but its only used on x86-32.
 236 ! On x86-64, all registers have 8-bit versions. However, a similar
 237 ! problem arises for shifts, where the shift count must be in CL, and
 238 ! so one day I will fix this properly by adding precoloring to the
 239 ! register allocator.
 240
 241 HOOK: has-small-reg? cpu ( reg size -- ? )
 242
 243 CONSTANT: have-byte-regs { EAX ECX EDX EBX }
 244
 245 M: x86.32 has-small-reg?
 246     {
 247         { 8 [ have-byte-regs memq? ] }
 248         { 16 [ drop t ] }
 249         { 32 [ drop t ] }
 250     } case ;
 251
 252 M: x86.64 has-small-reg? 2drop t ;
 253
 254 : small-reg-that-isn't ( exclude -- reg' )
 255     [ have-byte-regs ] dip
 256     [ native-version-of ] map
 257     '[ _ memq? not ] find nip ;
 258
 259 : with-save/restore ( reg quot -- )
 260     [ drop PUSH ] [ call ] [ drop POP ] 2tri ; inline
 261
 262 :: with-small-register ( dst exclude size quot: ( new-dst -- ) -- )
 263     ! If the destination register overlaps a small register with
 264     ! 'size' bits, we call the quot with that. Otherwise, we find a
 265     ! small register that is not in exclude, and call quot, saving and
 266     ! restoring the small register.
 267     dst size has-small-reg? [ dst quot call ] [
 268         exclude small-reg-that-isn't
 269         [ quot call ] with-save/restore
 270     ] if ; inline
 271
 272 M:: x86 %string-nth ( dst src index temp -- )
 273     ! We request a small-reg of size 8 since those of size 16 are
 274     ! a superset.
 275     "end" define-label
 276     dst { src index temp } 8 [| new-dst |
 277         ! Load the least significant 7 bits into new-dst.
 278         ! 8th bit indicates whether we have to load from
 279         ! the aux vector or not.
 280         temp src index [+] LEA
 281         new-dst 8-bit-version-of temp string-offset [+] MOV
 282         new-dst new-dst 8-bit-version-of MOVZX
 283         ! Do we have to look at the aux vector?
 284         new-dst HEX: 80 CMP
 285         "end" get JL
 286         ! Yes, this is a non-ASCII character. Load aux vector
 287         temp src string-aux-offset [+] MOV
 288         new-dst temp XCHG
 289         ! Compute index
 290         new-dst index ADD
 291         new-dst index ADD
 292         ! Load high 16 bits
 293         new-dst 16-bit-version-of new-dst byte-array-offset [+] MOV
 294         new-dst new-dst 16-bit-version-of MOVZX
 295         new-dst 7 SHL
 296         ! Compute code point
 297         new-dst temp XOR
 298         "end" resolve-label
 299         dst new-dst int-rep %copy
 300     ] with-small-register ;
 301
 302 M:: x86 %set-string-nth-fast ( ch str index temp -- )
 303     ch { index str temp } 8 [| new-ch |
 304         new-ch ch int-rep %copy
 305         temp str index [+] LEA
 306         temp string-offset [+] new-ch 8-bit-version-of MOV
 307     ] with-small-register ;
 308
 309 :: %alien-integer-getter ( dst src size quot -- )
 310     dst { src } size [| new-dst |
 311         new-dst dup size n-bit-version-of dup src [] MOV
 312         quot call
 313         dst new-dst int-rep %copy
 314     ] with-small-register ; inline
 315
 316 : %alien-unsigned-getter ( dst src size -- )
 317     [ MOVZX ] %alien-integer-getter ; inline
 318
 319 M: x86 %alien-unsigned-1 8 %alien-unsigned-getter ;
 320 M: x86 %alien-unsigned-2 16 %alien-unsigned-getter ;
 321 M: x86 %alien-unsigned-4 32 [ 2drop ] %alien-integer-getter ;
 322
 323 : %alien-signed-getter ( dst src size -- )
 324     [ MOVSX ] %alien-integer-getter ; inline
 325
 326 M: x86 %alien-signed-1 8 %alien-signed-getter ;
 327 M: x86 %alien-signed-2 16 %alien-signed-getter ;
 328 M: x86 %alien-signed-4 32 %alien-signed-getter ;
 329
 330 M: x86 %alien-cell [] MOV ;
 331 M: x86 %alien-float [] MOVSS ;
 332 M: x86 %alien-double [] MOVSD ;
 333 M: x86 %alien-vector [ [] ] dip %copy ;
 334
 335 :: %alien-integer-setter ( ptr value size -- )
 336     value { ptr } size [| new-value |
 337         new-value value int-rep %copy
 338         ptr [] new-value size n-bit-version-of MOV
 339     ] with-small-register ; inline
 340
 341 M: x86 %set-alien-integer-1 8 %alien-integer-setter ;
 342 M: x86 %set-alien-integer-2 16 %alien-integer-setter ;
 343 M: x86 %set-alien-integer-4 32 %alien-integer-setter ;
 344 M: x86 %set-alien-cell [ [] ] dip MOV ;
 345 M: x86 %set-alien-float [ [] ] dip MOVSS ;
 346 M: x86 %set-alien-double [ [] ] dip MOVSD ;
 347 M: x86 %set-alien-vector [ [] ] 2dip %copy ;
 348
 349 : shift-count? ( reg -- ? ) { ECX RCX } memq? ;
 350
 351 :: emit-shift ( dst src quot -- )
 352     src shift-count? [
 353         dst CL quot call
 354     ] [
 355         dst shift-count? [
 356             dst src XCHG
 357             src CL quot call
 358             dst src XCHG
 359         ] [
 360             ECX native-version-of [
 361                 CL src MOV
 362                 drop dst CL quot call
 363             ] with-save/restore
 364         ] if
 365     ] if ; inline
 366
 367 M: x86 %shl int-rep two-operand [ SHL ] emit-shift ;
 368 M: x86 %shr int-rep two-operand [ SHR ] emit-shift ;
 369 M: x86 %sar int-rep two-operand [ SAR ] emit-shift ;
 370
 371 M: x86 %vm-field-ptr ( dst field -- )
 372     [ drop 0 MOV rc-absolute-cell rt-vm rel-fixup ]
 373     [ vm-field-offset ADD ] 2bi ;
 374
 375 : load-zone-ptr ( reg -- )
 376     #! Load pointer to start of zone array
 377     "nursery" %vm-field-ptr ;
 378
 379 : load-allot-ptr ( nursery-ptr allot-ptr -- )
 380     [ drop load-zone-ptr ] [ swap cell [+] MOV ] 2bi ;
 381
 382 : inc-allot-ptr ( nursery-ptr n -- )
 383     [ cell [+] ] dip 8 align ADD ;
 384
 385 : store-header ( temp class -- )
 386     [ [] ] [ type-number tag-fixnum ] bi* MOV ;
 387
 388 : store-tagged ( dst tag -- )
 389     tag-number OR ;
 390
 391 M:: x86 %allot ( dst size class nursery-ptr -- )
 392     nursery-ptr dst load-allot-ptr
 393     dst class store-header
 394     dst class store-tagged
 395     nursery-ptr size inc-allot-ptr ;
 396
 397 M:: x86 %write-barrier ( src card# table -- )
 398     #! Mark the card pointed to by vreg.
 399     ! Mark the card
 400     card# src MOV
 401     card# card-bits SHR
 402     table "cards_offset" %vm-field-ptr
 403     table table [] MOV
 404     table card# [+] card-mark <byte> MOV
 405
 406     ! Mark the card deck
 407     card# deck-bits card-bits - SHR
 408     table "decks_offset" %vm-field-ptr
 409     table table [] MOV
 410     table card# [+] card-mark <byte> MOV ;
 411
 412 M:: x86 %check-nursery ( label temp1 temp2 -- )
 413     temp1 load-zone-ptr
 414     temp2 temp1 cell [+] MOV
 415     temp2 1024 ADD
 416     temp1 temp1 3 cells [+] MOV
 417     temp2 temp1 CMP
 418     label JLE ;
 419
 420 M: x86 %save-gc-root ( gc-root register -- ) [ gc-root@ ] dip MOV ;
 421
 422 M: x86 %load-gc-root ( gc-root register -- ) swap gc-root@ MOV ;
 423
 424 M: x86 %alien-global ( dst symbol library -- )
 425     [ 0 MOV ] 2dip rc-absolute-cell rel-dlsym ;
 426
 427 M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
 428
 429 :: %boolean ( dst temp word -- )
 430     dst \ f tag-number MOV
 431     temp 0 MOV \ t rc-absolute-cell rel-immediate
 432     dst temp word execute ; inline
 433
 434 M:: x86 %compare ( dst src1 src2 cc temp -- )
 435     src1 src2 CMP
 436     cc order-cc {
 437         { cc<  [ dst temp \ CMOVL %boolean ] }
 438         { cc<= [ dst temp \ CMOVLE %boolean ] }
 439         { cc>  [ dst temp \ CMOVG %boolean ] }
 440         { cc>= [ dst temp \ CMOVGE %boolean ] }
 441         { cc=  [ dst temp \ CMOVE %boolean ] }
 442         { cc/= [ dst temp \ CMOVNE %boolean ] }
 443     } case ;
 444
 445 M: x86 %compare-imm ( dst src1 src2 cc temp -- )
 446     %compare ;
 447
 448 M:: x86 %compare-branch ( label src1 src2 cc -- )
 449     src1 src2 CMP
 450     cc order-cc {
 451         { cc<  [ label JL ] }
 452         { cc<= [ label JLE ] }
 453         { cc>  [ label JG ] }
 454         { cc>= [ label JGE ] }
 455         { cc=  [ label JE ] }
 456         { cc/= [ label JNE ] }
 457     } case ;
 458
 459 M: x86 %compare-imm-branch ( label src1 src2 cc -- )
 460     %compare-branch ;
 461
 462 M: x86 %add-float double-rep two-operand ADDSD ;
 463 M: x86 %sub-float double-rep two-operand SUBSD ;
 464 M: x86 %mul-float double-rep two-operand MULSD ;
 465 M: x86 %div-float double-rep two-operand DIVSD ;
 466 M: x86 %min-float double-rep two-operand MINSD ;
 467 M: x86 %max-float double-rep two-operand MAXSD ;
 468 M: x86 %sqrt SQRTSD ;
 469
 470 M: x86 %single>double-float CVTSS2SD ;
 471 M: x86 %double>single-float CVTSD2SS ;
 472
 473 M: x86 %integer>float CVTSI2SD ;
 474 M: x86 %float>integer CVTTSD2SI ;
 475
 476 M: x86 %unbox-float ( dst src -- )
 477     float-offset [+] MOVSD ;
 478
 479 M:: x86 %box-float ( dst src temp -- )
 480     dst 16 float temp %allot
 481     dst float-offset [+] src MOVSD ;
 482
 483 : %cmov-float= ( dst src -- )
 484     [
 485         "no-move" define-label
 486
 487         "no-move" get [ JNE ] [ JP ] bi
 488         MOV
 489         "no-move" resolve-label
 490     ] with-scope ;
 491
 492 : %cmov-float/= ( dst src -- )
 493     [
 494         "no-move" define-label
 495         "move" define-label
 496
 497         "move" get JP
 498         "no-move" get JE
 499         "move" resolve-label
 500         MOV
 501         "no-move" resolve-label
 502     ] with-scope ;
 503
 504 :: (%compare-float) ( dst src1 src2 cc temp compare -- )
 505     cc {
 506         { cc<    [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA  %boolean ] }
 507         { cc<=   [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
 508         { cc>    [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA  %boolean ] }
 509         { cc>=   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE %boolean ] }
 510         { cc=    [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= %boolean ] }
 511         { cc<>   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE %boolean ] }
 512         { cc<>=  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP %boolean ] }
 513         { cc/<   [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
 514         { cc/<=  [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB  %boolean ] }
 515         { cc/>   [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE %boolean ] }
 516         { cc/>=  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB  %boolean ] }
 517         { cc/=   [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= %boolean ] }
 518         { cc/<>  [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE  %boolean ] }
 519         { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP  %boolean ] }
 520     } case ; inline
 521
 522 M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
 523     \ COMISD (%compare-float) ;
 524
 525 M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
 526     \ UCOMISD (%compare-float) ;
 527
 528 : %jump-float= ( label -- )
 529     [
 530         "no-jump" define-label
 531         "no-jump" get JP
 532         JE
 533         "no-jump" resolve-label
 534     ] with-scope ;
 535
 536 : %jump-float/= ( label -- )
 537     [ JNE ] [ JP ] bi ;
 538
 539 :: (%compare-float-branch) ( label src1 src2 cc compare -- )
 540     cc {
 541         { cc<    [ src2 src1 \ compare execute( a b -- ) label JA  ] }
 542         { cc<=   [ src2 src1 \ compare execute( a b -- ) label JAE ] }
 543         { cc>    [ src1 src2 \ compare execute( a b -- ) label JA  ] }
 544         { cc>=   [ src1 src2 \ compare execute( a b -- ) label JAE ] }
 545         { cc=    [ src1 src2 \ compare execute( a b -- ) label %jump-float= ] }
 546         { cc<>   [ src1 src2 \ compare execute( a b -- ) label JNE ] }
 547         { cc<>=  [ src1 src2 \ compare execute( a b -- ) label JNP ] }
 548         { cc/<   [ src2 src1 \ compare execute( a b -- ) label JBE ] }
 549         { cc/<=  [ src2 src1 \ compare execute( a b -- ) label JB  ] }
 550         { cc/>   [ src1 src2 \ compare execute( a b -- ) label JBE ] }
 551         { cc/>=  [ src1 src2 \ compare execute( a b -- ) label JB  ] }
 552         { cc/=   [ src1 src2 \ compare execute( a b -- ) label %jump-float/= ] }
 553         { cc/<>  [ src1 src2 \ compare execute( a b -- ) label JE  ] }
 554         { cc/<>= [ src1 src2 \ compare execute( a b -- ) label JP  ] }
 555     } case ;
 556
 557 M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
 558     \ COMISD (%compare-float-branch) ;
 559
 560 M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
 561     \ UCOMISD (%compare-float-branch) ;
 562
 563 M:: x86 %box-vector ( dst src rep temp -- )
 564     dst rep rep-size 2 cells + byte-array temp %allot
 565     16 tag-fixnum dst 1 byte-array tag-number %set-slot-imm
 566     dst byte-array-offset [+]
 567     src rep %copy ;
 568
 569 M:: x86 %unbox-vector ( dst src rep -- )
 570     dst src byte-array-offset [+]
 571     rep %copy ;
 572
 573 MACRO: available-reps ( alist -- )
 574     ! Each SSE version adds new representations and supports
 575     ! all old ones
 576     unzip { } [ append ] accumulate rest swap suffix
 577     [ [ 1quotation ] map ] bi@ zip
 578     reverse [ { } ] suffix
 579     '[ _ cond ] ;
 580
 581 M: x86 %broadcast-vector ( dst src rep -- )
 582     {
 583         { float-4-rep [ [ float-4-rep %copy ] [ drop dup 0 SHUFPS ] 2bi ] }
 584         { double-2-rep [ [ double-2-rep %copy ] [ drop dup UNPCKLPD ] 2bi ] }
 585     } case ;
 586
 587 M: x86 %broadcast-vector-reps
 588     {
 589         ! Can't do this with sse1 since it will want to unbox
 590         ! a double-precision float and convert to single precision
 591         { sse2? { float-4-rep double-2-rep } }
 592     } available-reps ;
 593
 594 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
 595     {
 596         {
 597             [ rep float-4-rep eq? ]
 598             [
 599                 dst src1 float-4-rep %copy
 600                 dst src2 UNPCKLPS
 601                 src3 src4 UNPCKLPS
 602                 dst src3 MOVLHPS
 603             ]
 604         }
 605         {
 606             [ rep { int-4-rep uint-4-rep } memq? ]
 607             [
 608                 dst src1 int-4-rep %copy
 609                 dst src2 PUNPCKLDQ
 610                 src3 src4 PUNPCKLDQ
 611                 dst src3 PUNPCKLQDQ
 612             ]
 613         }
 614     } cond ;
 615
 616 M: x86 %gather-vector-4-reps
 617     {
 618         ! Can't do this with sse1 since it will want to unbox
 619         ! double-precision floats and convert to single precision
 620         { sse2? { float-4-rep int-4-rep uint-4-rep } }
 621     } available-reps ;
 622
 623 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
 624     rep {
 625         {
 626             double-2-rep
 627             [
 628                 dst src1 double-2-rep %copy
 629                 dst src2 UNPCKLPD
 630             ]
 631         }
 632     } case ;
 633
 634 M: x86 %gather-vector-2-reps
 635     {
 636         { sse2? { double-2-rep } }
 637     } available-reps ;
 638
 639 M: x86 %add-vector ( dst src1 src2 rep -- )
 640     [ two-operand ] keep
 641     {
 642         { float-4-rep [ ADDPS ] }
 643         { double-2-rep [ ADDPD ] }
 644         { char-16-rep [ PADDB ] }
 645         { uchar-16-rep [ PADDB ] }
 646         { short-8-rep [ PADDW ] }
 647         { ushort-8-rep [ PADDW ] }
 648         { int-4-rep [ PADDD ] }
 649         { uint-4-rep [ PADDD ] }
 650         { longlong-2-rep [ PADDQ ] }
 651         { ulonglong-2-rep [ PADDQ ] }
 652     } case ;
 653
 654 M: x86 %add-vector-reps
 655     {
 656         { sse? { float-4-rep } }
 657         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 658     } available-reps ;
 659
 660 M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
 661     [ two-operand ] keep
 662     {
 663         { char-16-rep [ PADDSB ] }
 664         { uchar-16-rep [ PADDUSB ] }
 665         { short-8-rep [ PADDSW ] }
 666         { ushort-8-rep [ PADDUSW ] }
 667     } case ;
 668
 669 M: x86 %saturated-add-vector-reps
 670     {
 671         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
 672     } available-reps ;
 673
 674 M: x86 %add-sub-vector ( dst src1 src2 rep -- )
 675     [ two-operand ] keep
 676     {
 677         { float-4-rep [ ADDSUBPS ] }
 678         { double-2-rep [ ADDSUBPD ] }
 679     } case ;
 680
 681 M: x86 %add-sub-vector-reps
 682     {
 683         { sse3? { float-4-rep double-2-rep } }
 684     } available-reps ;
 685
 686 M: x86 %sub-vector ( dst src1 src2 rep -- )
 687     [ two-operand ] keep
 688     {
 689         { float-4-rep [ SUBPS ] }
 690         { double-2-rep [ SUBPD ] }
 691         { char-16-rep [ PSUBB ] }
 692         { uchar-16-rep [ PSUBB ] }
 693         { short-8-rep [ PSUBW ] }
 694         { ushort-8-rep [ PSUBW ] }
 695         { int-4-rep [ PSUBD ] }
 696         { uint-4-rep [ PSUBD ] }
 697         { longlong-2-rep [ PSUBQ ] }
 698         { ulonglong-2-rep [ PSUBQ ] }
 699     } case ;
 700
 701 M: x86 %sub-vector-reps
 702     {
 703         { sse? { float-4-rep } }
 704         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 705     } available-reps ;
 706
 707 M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
 708     [ two-operand ] keep
 709     {
 710         { char-16-rep [ PSUBSB ] }
 711         { uchar-16-rep [ PSUBUSB ] }
 712         { short-8-rep [ PSUBSW ] }
 713         { ushort-8-rep [ PSUBUSW ] }
 714     } case ;
 715
 716 M: x86 %saturated-sub-vector-reps
 717     {
 718         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
 719     } available-reps ;
 720
 721 M: x86 %mul-vector ( dst src1 src2 rep -- )
 722     [ two-operand ] keep
 723     {
 724         { float-4-rep [ MULPS ] }
 725         { double-2-rep [ MULPD ] }
 726         { short-8-rep [ PMULLW ] }
 727         { ushort-8-rep [ PMULLW ] }
 728         { int-4-rep [ PMULLD ] }
 729         { uint-4-rep [ PMULLD ] }
 730     } case ;
 731
 732 M: x86 %mul-vector-reps
 733     {
 734         { sse? { float-4-rep } }
 735         { sse2? { double-2-rep short-8-rep ushort-8-rep } }
 736         { sse4.1? { int-4-rep uint-4-rep } }
 737     } available-reps ;
 738
 739 M: x86 %saturated-mul-vector-reps
 740     ! No multiplication with saturation on x86
 741     { } ;
 742
 743 M: x86 %div-vector ( dst src1 src2 rep -- )
 744     [ two-operand ] keep
 745     {
 746         { float-4-rep [ DIVPS ] }
 747         { double-2-rep [ DIVPD ] }
 748     } case ;
 749
 750 M: x86 %div-vector-reps
 751     {
 752         { sse? { float-4-rep } }
 753         { sse2? { double-2-rep } }
 754     } available-reps ;
 755
 756 M: x86 %min-vector ( dst src1 src2 rep -- )
 757     [ two-operand ] keep
 758     {
 759         { char-16-rep [ PMINSB ] }
 760         { uchar-16-rep [ PMINUB ] }
 761         { short-8-rep [ PMINSW ] }
 762         { ushort-8-rep [ PMINUW ] }
 763         { int-4-rep [ PMINSD ] }
 764         { uint-4-rep [ PMINUD ] }
 765         { float-4-rep [ MINPS ] }
 766         { double-2-rep [ MINPD ] }
 767     } case ;
 768
 769 M: x86 %min-vector-reps
 770     {
 771         { sse? { float-4-rep } }
 772         { sse2? { uchar-16-rep short-8-rep double-2-rep short-8-rep uchar-16-rep } }
 773         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
 774     } available-reps ;
 775
 776 M: x86 %max-vector ( dst src1 src2 rep -- )
 777     [ two-operand ] keep
 778     {
 779         { char-16-rep [ PMAXSB ] }
 780         { uchar-16-rep [ PMAXUB ] }
 781         { short-8-rep [ PMAXSW ] }
 782         { ushort-8-rep [ PMAXUW ] }
 783         { int-4-rep [ PMAXSD ] }
 784         { uint-4-rep [ PMAXUD ] }
 785         { float-4-rep [ MAXPS ] }
 786         { double-2-rep [ MAXPD ] }
 787     } case ;
 788
 789 M: x86 %max-vector-reps
 790     {
 791         { sse? { float-4-rep } }
 792         { sse2? { uchar-16-rep short-8-rep double-2-rep short-8-rep uchar-16-rep } }
 793         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
 794     } available-reps ;
 795
 796 M: x86 %horizontal-add-vector ( dst src rep -- )
 797     {
 798         { float-4-rep [ [ float-4-rep %copy ] [ HADDPS ] [ HADDPS ] 2tri ] }
 799         { double-2-rep [ [ double-2-rep %copy ] [ HADDPD ] 2bi ] }
 800     } case ;
 801
 802 M: x86 %horizontal-add-vector-reps
 803     {
 804         { sse3? { float-4-rep double-2-rep } }
 805     } available-reps ;
 806
 807 M: x86 %horizontal-shl-vector ( dst src1 src2 rep -- )
 808     two-operand PSLLDQ ;
 809
 810 M: x86 %horizontal-shl-vector-reps
 811     {
 812         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 813     } available-reps ;
 814
 815 M: x86 %horizontal-shr-vector ( dst src1 src2 rep -- )
 816     two-operand PSRLDQ ;
 817
 818 M: x86 %horizontal-shr-vector-reps
 819     {
 820         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 821     } available-reps ;
 822
 823 M: x86 %abs-vector ( dst src rep -- )
 824     {
 825         { char-16-rep [ PABSB ] }
 826         { short-8-rep [ PABSW ] }
 827         { int-4-rep [ PABSD ] }
 828     } case ;
 829
 830 M: x86 %abs-vector-reps
 831     {
 832         { ssse3? { char-16-rep short-8-rep int-4-rep } }
 833     } available-reps ;
 834
 835 M: x86 %sqrt-vector ( dst src rep -- )
 836     {
 837         { float-4-rep [ SQRTPS ] }
 838         { double-2-rep [ SQRTPD ] }
 839     } case ;
 840
 841 M: x86 %sqrt-vector-reps
 842     {
 843         { sse? { float-4-rep } }
 844         { sse2? { double-2-rep } }
 845     } available-reps ;
 846
 847 M: x86 %and-vector ( dst src1 src2 rep -- )
 848     [ two-operand ] keep
 849     {
 850         { float-4-rep [ ANDPS ] }
 851         { double-2-rep [ ANDPD ] }
 852         [ drop PAND ]
 853     } case ;
 854
 855 M: x86 %and-vector-reps
 856     {
 857         { sse? { float-4-rep } }
 858         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 859     } available-reps ;
 860
 861 M: x86 %andn-vector ( dst src1 src2 rep -- )
 862     [ two-operand ] keep
 863     {
 864         { float-4-rep [ ANDNPS ] }
 865         { double-2-rep [ ANDNPD ] }
 866         [ drop PANDN ]
 867     } case ;
 868
 869 M: x86 %andn-vector-reps
 870     {
 871         { sse? { float-4-rep } }
 872         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 873     } available-reps ;
 874
 875 M: x86 %or-vector ( dst src1 src2 rep -- )
 876     [ two-operand ] keep
 877     {
 878         { float-4-rep [ ORPS ] }
 879         { double-2-rep [ ORPD ] }
 880         [ drop POR ]
 881     } case ;
 882
 883 M: x86 %or-vector-reps
 884     {
 885         { sse? { float-4-rep } }
 886         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 887     } available-reps ;
 888
 889 M: x86 %xor-vector ( dst src1 src2 rep -- )
 890     [ two-operand ] keep
 891     {
 892         { float-4-rep [ XORPS ] }
 893         { double-2-rep [ XORPD ] }
 894         [ drop PXOR ]
 895     } case ;
 896
 897 M: x86 %xor-vector-reps
 898     {
 899         { sse? { float-4-rep } }
 900         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 901     } available-reps ;
 902
 903 M: x86 %shl-vector ( dst src1 src2 rep -- )
 904     [ two-operand ] keep
 905     {
 906         { short-8-rep [ PSLLW ] }
 907         { ushort-8-rep [ PSLLW ] }
 908         { int-4-rep [ PSLLD ] }
 909         { uint-4-rep [ PSLLD ] }
 910         { longlong-2-rep [ PSLLQ ] }
 911         { ulonglong-2-rep [ PSLLQ ] }
 912     } case ;
 913
 914 M: x86 %shl-vector-reps
 915     {
 916         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
 917     } available-reps ;
 918
 919 M: x86 %shr-vector ( dst src1 src2 rep -- )
 920     [ two-operand ] keep
 921     {
 922         { short-8-rep [ PSRAW ] }
 923         { ushort-8-rep [ PSRLW ] }
 924         { int-4-rep [ PSRAD ] }
 925         { uint-4-rep [ PSRLD ] }
 926         { ulonglong-2-rep [ PSRLQ ] }
 927     } case ;
 928
 929 M: x86 %shr-vector-reps
 930     {
 931         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } }
 932     } available-reps ;
 933
 934 M: x86 %integer>scalar drop MOVD ;
 935
 936 M: x86 %scalar>integer drop MOVD ;
 937
 938 M:: x86 %spill ( src rep dst -- ) dst src rep %copy ;
 939 M:: x86 %reload ( dst rep src -- ) dst src rep %copy ;
 940
 941 M: x86 %loop-entry 16 code-alignment [ NOP ] times ;
 942
 943 M:: x86 %save-context ( temp1 temp2 callback-allowed? -- )
 944     #! Save Factor stack pointers in case the C code calls a
 945     #! callback which does a GC, which must reliably trace
 946     #! all roots.
 947     temp1 0 MOV rc-absolute-cell rt-vm rel-fixup
 948     temp1 temp1 "stack_chain" vm-field-offset [+] MOV
 949     temp2 stack-reg cell neg [+] LEA
 950     temp1 [] temp2 MOV
 951     callback-allowed? [
 952         temp1 2 cells [+] ds-reg MOV
 953         temp1 3 cells [+] rs-reg MOV
 954     ] when ;
 955
 956 M: x86 value-struct? drop t ;
 957
 958 M: x86 small-enough? ( n -- ? )
 959     HEX: -80000000 HEX: 7fffffff between? ;
 960
 961 : next-stack@ ( n -- operand )
 962     #! nth parameter from the next stack frame. Used to box
 963     #! input values to callbacks; the callback has its own
 964     #! stack frame set up, and we want to read the frame
 965     #! set up by the caller.
 966     stack-frame get total-size>> + stack@ ;
 967
 968 enable-simd
 969 enable-min/max
 970 enable-fixnum-log2
 971
 972 :: install-sse2-check ( -- )
 973     [
 974         sse-version 20 < [
 975             "This image was built to use SSE2 but your CPU does not support it." print
 976             "You will need to bootstrap Factor again." print
 977             flush
 978             1 exit
 979         ] when
 980     ] "cpu.x86" add-init-hook ;
 981
 982 : enable-sse2 ( version -- )
 983     20 >= [
 984         enable-float-intrinsics
 985         enable-float-functions
 986         enable-float-min/max
 987         enable-fsqrt
 988         install-sse2-check
 989     ] when ;
 990
 991 : check-sse ( -- )
 992     [ { sse_version } compile ] with-optimizer
 993     "Checking for multimedia extensions: " write sse-version
 994     [ sse-string write " detected" print ] [ enable-sse2 ] bi ;