Add SSE variant of pow/powi to interpreter.

Use SSE pow/powi helper functions from compiled code.
Clean up use of helper functions.
Related cleanups of folding functions in x64 interpreter.
master
Mike Pall 2009-12-25 23:12:30 +01:00
parent 6ce0c90ed6
commit 690760aa38
4 changed files with 949 additions and 740 deletions


@ -96,10 +96,6 @@
|.type TRACE, Trace
|.type EXITINFO, ExitInfo
|
|// x86/x64 portability macros
|.macro push_eax; .if X64; push rax; .else; push eax; .endif; .endmacro
|.macro pop_eax; .if X64; pop rax; .else; pop eax; .endif; .endmacro
|
|// Stack layout while in interpreter. Must match with lj_frame.h.
|//-----------------------------------------------------------------------
|.if not X64 // x86 stack layout.
@ -2072,10 +2068,10 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
| fpop1
| jmp ->fff_resn
|
if (0 && sse) { // NYI
|.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0
if (sse) {
|.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0
} else {
|.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn
|.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn
}
|
|.macro math_minmax, name, cmovop, nocmovop, sseop
@ -2091,6 +2087,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
| add RB, 1
| jmp <1
||} else {
|.if not X64
|.ffunc_n name
| mov RB, 2
|1:
@ -2101,12 +2098,13 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
||if (cmov) {
| fucomi st1; cmovop st1; fpop1
||} else {
| push_eax
| push eax
| fucom st1; fnstsw ax; test ah, 1; nocmovop >2; fxch; 2: ; fpop
| pop_eax
| pop eax
||}
| add RB, 1
| jmp <1
|.endif
||}
|.endmacro
|
@ -2842,19 +2840,29 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
|->vm_exp:
| fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e))
|->vm_exp2:
| fst dword [esp+4] // Caveat: overwrites ARG1.
| cmp dword [esp+4], 0x7f800000; je >1 // Special case: e^+Inf = +Inf
| cmp dword [esp+4], 0xff800000; je >2 // Special case: e^-Inf = 0
| .if X64WIN
| .define expscratch, dword [rsp+8] // Use scratch area.
| .elif X64
| .define expscratch, dword [rsp-8] // Use red zone.
| .else
| .define expscratch, dword [esp+4] // Needs 4 byte scratch area.
| .endif
| fst expscratch // Caveat: overwrites ARG1.
| cmp expscratch, 0x7f800000; je >1 // Special case: e^+Inf = +Inf
| cmp expscratch, 0xff800000; je >2 // Special case: e^-Inf = 0
|->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check.
| fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
| fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
| f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
|1:
| ret
|2:
| fpop; fldz; ret
|
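The split above works around the x87 f2xm1 instruction, which only computes 2^z-1 for |z| <= 1. A rough C model of ->vm_exp2raw (not part of the patch; the function name is illustrative):

#include <math.h>

/* Illustrative model: 2^x = (2^frac(x)-1 + 1) * 2^int(x), where
** int(x) = rint(x) keeps |frac(x)| <= 0.5, within the range f2xm1 accepts. */
static double exp2raw_model(double x)
{
  double i = rint(x);              /* frndint */
  double f = x - i;                /* fsub st1, st0 (after fxch) */
  double t = exp2(f) - 1.0;        /* f2xm1 */
  return ldexp(t + 1.0, (int)i);   /* fld1; faddp st1; fscale; fpop1 */
}
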
|// Generic power function x^y. Called by BC_POW, math.pow fast function
|// and vm_arith. Args/ret on x87 stack (y on top). No int/xmm regs modified.
|// Generic power function x^y. Called by BC_POW, math.pow fast function,
|// and vm_arith.
if (!sse) {
|.if not X64
|// Args/ret on x87 stack (y on top). RC (eax) modified.
|// Caveat: needs 3 slots on x87 stack!
|->vm_pow:
| fist dword [esp+4] // Store/reload int before comparison.
@ -2862,18 +2870,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
||if (cmov) {
| fucomip st1
||} else {
| push_eax; fucomp st1; fnstsw ax; sahf; pop_eax
| fucomp st1; fnstsw ax; sahf
||}
| jnz >8 // Branch for FP exponents.
| jp >9 // Branch for NaN exponent.
| fpop // Pop y and fallthrough to vm_powi.
|
|// FP/int power function x^i. Called from JIT code. Arg1/ret on x87 stack.
|// Arg2 (int) on C stack. No int/xmm regs modified.
|// FP/int power function x^i. Arg1/ret on x87 stack.
|// Arg2 (int) on C stack. RC (eax) modified.
|// Caveat: needs 2 slots on x87 stack!
|->vm_powi:
| push_eax
| mov eax, [esp+8]
| mov eax, [esp+4]
| cmp eax, 1; jle >6 // i<=1?
| // Now 1 < (unsigned)i <= 0x80000000.
|1: // Handle leading zeros.
@ -2893,7 +2899,6 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
|4:
| fmulp st1
|5:
| pop_eax
| ret
|6:
| je <5 // x^1 ==> x
@ -2904,19 +2909,16 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
| jmp <1 // x^-i ==> (1/x)^i
|7:
| fpop; fld1 // x^0 ==> 1
| pop_eax
| ret
|
|8: // FP/FP power function x^y.
| push_eax
| fst dword [esp+8]
| fst dword [esp+4]
| fxch
| fst dword [esp+12]
| mov eax, [esp+8]; shl eax, 1
| fst dword [esp+8]
| mov eax, [esp+4]; shl eax, 1
| cmp eax, 0xff000000; je >2 // x^+-Inf?
| mov eax, [esp+12]; shl eax, 1; je >4 // +-0^y?
| mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
| cmp eax, 0xff000000; je >4 // +-Inf^y?
| pop_eax
| fyl2x
| jmp ->vm_exp2raw
|
@ -2925,7 +2927,7 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
||if (cmov) {
| fucomip st2
||} else {
| push_eax; fucomp st2; fnstsw ax; sahf; pop_eax
| fucomp st2; fnstsw ax; sahf
||}
| je >1 // 1^NaN ==> 1
| fxch // x^NaN ==> NaN
@ -2943,41 +2945,205 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
||}
| je >3 // +-1^+-Inf ==> 1
| fpop; fabs; fldz; mov eax, 0; setc al
| ror eax, 1; xor eax, [esp+8]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0
| ror eax, 1; xor eax, [esp+4]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0
| fxch
|3:
| fpop1; fabs; pop_eax
| fpop1; fabs
| ret
|
|4: // Handle +-0^y or +-Inf^y.
| cmp dword [esp+8], 0; jge <3 // y >= 0, x^y ==> |x|
| cmp dword [esp+4], 0; jge <3 // y >= 0, x^y ==> |x|
| fpop; fpop
| test eax, eax; pop_eax; jz >5 // y < 0, +-0^y ==> +Inf
| test eax, eax; jz >5 // y < 0, +-0^y ==> +Inf
| fldz // y < 0, +-Inf^y ==> 0
| ret
|5:
| mov dword [esp+8], 0x7f800000 // Return +Inf.
| fld dword [esp+8]
| mov dword [esp+4], 0x7f800000 // Return +Inf.
| fld dword [esp+4]
| ret
|.endif
} else {
|->vm_pow:
}
|
|// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
|// Needs 16 byte scratch area for x86. Also called from JIT code.
|->vm_pow_sse:
| cvtsd2si eax, xmm1
| cvtsi2sd xmm2, eax
| ucomisd xmm1, xmm2
| jnz >8 // Branch for FP exponents.
| jp >9 // Branch for NaN exponent.
| // Fallthrough to vm_powi_sse.
|
|// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
|->vm_powi_sse:
| cmp eax, 1; jle >6 // i<=1?
| // Now 1 < (unsigned)i <= 0x80000000.
|1: // Handle leading zeros.
| test eax, 1; jnz >2
| mulsd xmm0, xmm0
| shr eax, 1
| jmp <1
|2:
| shr eax, 1; jz >5
| movaps xmm1, xmm0
|3: // Handle trailing bits.
| mulsd xmm0, xmm0
| shr eax, 1; jz >4
| jnc <3
| mulsd xmm1, xmm0
| jmp <3
|4:
| mulsd xmm0, xmm1
|5:
| ret
|6:
| je <5 // x^1 ==> x
| jb >7
| push RDa
| sseconst_1 xmm1, RDa
| divsd xmm1, xmm0
| pop RDa
| movaps xmm0, xmm1
| neg eax
| cmp eax, 1; je <5 // x^-1 ==> 1/x
| jmp <1 // x^-i ==> (1/x)^i
|7:
| sseconst_1 xmm0, RDa
| ret
|
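->vm_powi_sse is entered directly with an integer exponent in eax, or by falling through from ->vm_pow_sse when the exponent survives the cvtsd2si/cvtsi2sd round trip above. A C model of the bit-scanning loop (not part of the patch; names are illustrative): low zero bits of the exponent only square the base, the remaining bits accumulate the result in a second register, and a negative exponent is first reduced to (1/x)^i.

#include <stdint.h>

static double powi_model(double x, int32_t i)
{
  uint32_t u;
  double r;
  if (i <= 1) {
    if (i == 1) return x;                /* x^1 ==> x */
    if (i == 0) return 1.0;              /* x^0 ==> 1 */
    x = 1.0 / x;                         /* x^-i ==> (1/x)^i */
    if (i == -1) return x;
    u = (uint32_t)0 - (uint32_t)i;       /* now 1 < u <= 0x80000000 */
  } else {
    u = (uint32_t)i;
  }
  while (!(u & 1)) { x *= x; u >>= 1; }  /* handle leading zeros */
  r = x;
  u >>= 1;
  while (u) {                            /* handle trailing bits */
    x *= x;
    if (u & 1) r *= x;
    u >>= 1;
  }
  return r;
}
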
|8: // FP/FP power function x^y.
|.if X64
| movd rax, xmm1; shl rax, 1
| ror rax, 32; cmp rax, 0xffe00000; je >2 // x^+-Inf?
| movd rax, xmm0; shl rax, 1; je >4 // +-0^y?
| ror rax, 32; cmp rax, 0xffe00000; je >5 // +-Inf^y?
| .if X64WIN
| movsd qword [rsp+16], xmm1 // Use scratch area.
| movsd qword [rsp+8], xmm0
| fld qword [rsp+16]
| fld qword [rsp+8]
| .else
| movsd qword [rsp-16], xmm1 // Use red zone.
| movsd qword [rsp-8], xmm0
| fld qword [rsp-16]
| fld qword [rsp-8]
| .endif
|.else
| movsd qword [esp+12], xmm1 // Needs 16 byte scratch area.
| movsd qword [esp+4], xmm0
| cmp dword [esp+12], 0; jne >1
| mov eax, [esp+16]; shl eax, 1
| cmp eax, 0xffe00000; je >2 // x^+-Inf?
|1:
| cmp dword [esp+4], 0; jne >1
| mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
| cmp eax, 0xffe00000; je >5 // +-Inf^y?
|1:
| fld qword [esp+12]
| fld qword [esp+4]
|.endif
| fyl2x // y*log2(x)
| fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
| f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
|.if X64WIN
| fstp qword [rsp+8] // Use scratch area.
| movsd xmm0, qword [rsp+8]
|.elif X64
| fstp qword [rsp-8] // Use red zone.
| movsd xmm0, qword [rsp-8]
|.else
| fstp qword [esp+4] // Needs 8 byte scratch area.
| movsd xmm0, qword [esp+4]
|.endif
| ret
|
|9: // Handle x^NaN.
| sseconst_1 xmm2, RDa
| ucomisd xmm0, xmm2; je >1 // 1^NaN ==> 1
| movaps xmm0, xmm1 // x^NaN ==> NaN
|1:
| ret
|
|2: // Handle x^+-Inf.
| sseconst_abs xmm2, RDa
| andpd xmm0, xmm2 // |x|
| sseconst_1 xmm2, RDa
| ucomisd xmm0, xmm2; je <1 // +-1^+-Inf ==> 1
| movmskpd eax, xmm1
| xorps xmm0, xmm0
| mov ah, al; setc al; xor al, ah; jne <1 // |x|<>1, x^+-Inf ==> +Inf/0
|3:
| sseconst_hi xmm0, RDa, 7ff00000 // +Inf
| ret
|
|4: // Handle +-0^y.
| movmskpd eax, xmm1; test eax, eax; jnz <3 // y < 0, +-0^y ==> +Inf
| xorps xmm0, xmm0 // y >= 0, +-0^y ==> 0
| ret
|
|5: // Handle +-Inf^y.
| movmskpd eax, xmm1; test eax, eax; jz <3 // y >= 0, +-Inf^y ==> +Inf
| xorps xmm0, xmm0 // y < 0, +-Inf^y ==> 0
| ret
|
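For reference, the special cases handled by labels 9, 2, 4 and 5 above, together with the general fyl2x path, in rough C form (an illustration only, not part of the patch):

#include <math.h>

static double pow_fp_model(double x, double y)
{
  if (isnan(y))                         /* label 9: x^NaN */
    return x == 1.0 ? 1.0 : y;          /* 1^NaN ==> 1, else NaN */
  if (isinf(y)) {                       /* label 2: x^+-Inf */
    if (fabs(x) == 1.0) return 1.0;     /* +-1^+-Inf ==> 1 */
    return ((fabs(x) > 1.0) == (y > 0.0)) ? INFINITY : 0.0;
  }
  if (x == 0.0)                         /* label 4: +-0^y */
    return y < 0.0 ? INFINITY : 0.0;
  if (isinf(x))                         /* label 5: +-Inf^y */
    return y < 0.0 ? 0.0 : INFINITY;
  return exp2(y * log2(x));             /* general path: fyl2x + vm_exp2raw */
}
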
|// Callable from C: double lj_vm_foldfpm(double x, int fpm)
|// Computes fpm(x) for extended math functions. ORDER FPM.
|->vm_foldfpm:
if (sse) {
|.if X64WIN
| .define fpmop, CARG2d
|.elif X64
| .define fpmop, CARG1d
|.else
| .define fpmop, eax
| mov fpmop, [esp+12]
| movsd xmm0, qword [esp+4]
|.endif
|.if X64
|
| .if X64WIN
| .define fpmop, CARG2d
| .else
| .define fpmop, CARG1d
| .endif
| cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
| cmp fpmop, 3; jb ->vm_trunc; ja >2
| sqrtsd xmm0, xmm0; ret
|.else
|2:
| .if X64WIN
| movsd qword [rsp+8], xmm0 // Use scratch area.
| fld qword [rsp+8]
| .else
| movsd qword [rsp-8], xmm0 // Use red zone.
| fld qword [rsp-8]
| .endif
| cmp fpmop, 5; ja >2
| .if X64WIN; pop rax; .endif
| je >1
| call ->vm_exp
| .if X64WIN; push rax; .endif
| jmp >7
|1:
| call ->vm_exp2
| .if X64WIN; push rax; .endif
| jmp >7
|2: ; cmp fpmop, 7; je >1; ja >2
| fldln2; fxch; fyl2x; jmp >7
|1: ; fld1; fxch; fyl2x; jmp >7
|2: ; cmp fpmop, 9; je >1; ja >2
| fldlg2; fxch; fyl2x; jmp >7
|1: ; fsin; jmp >7
|2: ; cmp fpmop, 11; je >1; ja >9
| fcos; jmp >7
|1: ; fptan; fpop
|7:
| .if X64WIN
| fstp qword [rsp+8] // Use scratch area.
| movsd xmm0, qword [rsp+8]
| .else
| fstp qword [rsp-8] // Use red zone.
| movsd xmm0, qword [rsp-8]
| .endif
| ret
|
|.else // x86 calling convention.
|
| .define fpmop, eax
| mov fpmop, [esp+12]
| movsd xmm0, qword [esp+4]
| cmp fpmop, 1; je >1; ja >2
| call ->vm_floor; jmp >7
|1: ; call ->vm_ceil; jmp >7
@ -2989,27 +3155,36 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
| movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
| fld qword [esp+4]
| ret
|2: ; fld qword [esp+4]
| cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
|2: ; cmp fpmop, 7; je >1; ja >2
| fldln2; fxch; fyl2x; ret
|1: ; fld1; fxch; fyl2x; ret
|2: ; cmp fpmop, 9; je >1; ja >2
| fldlg2; fxch; fyl2x; ret
|1: ; fsin; ret
|2: ; cmp fpmop, 11; je >1; ja >9
| fcos; ret
|1: ; fptan; fpop; ret
|
|.endif
|2:
| fld qword [esp+4]
} else {
| mov fpmop, [esp+12]
| fld qword [esp+4]
| cmp fpmop, 1; jb ->vm_floor; je ->vm_ceil
| cmp fpmop, 3; jb ->vm_trunc; ja >2
| fsqrt; ret
|2:
|2: ; cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
| cmp fpmop, 7; je >1; ja >2
| fldln2; fxch; fyl2x; ret
|1: ; fld1; fxch; fyl2x; ret
|2: ; cmp fpmop, 9; je >1; ja >2
| fldlg2; fxch; fyl2x; ret
|1: ; fsin; ret
|2: ; cmp fpmop, 11; je >1; ja >9
| fcos; ret
|1: ; fptan; fpop; ret
}
| cmp fpmop, 5; jb ->vm_exp; je ->vm_exp2
| cmp fpmop, 7; je >1; ja >2
| fldln2; fxch; fyl2x; ret
|1: ; fld1; fxch; fyl2x; ret
|2: ; cmp fpmop, 9; je >1; ja >2
| fldlg2; fxch; fyl2x; ret
|1: ; fsin; ret
|2: ; cmp fpmop, 11; je >1; ja >9
| fcos; ret
|1: ; fptan; fpop; ret
|9: ; int3 // Bad fpm.
|
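Both the x64 and x86 variants above dispatch on the same fpm numbering (ORDER FPM). A C sketch of what lj_vm_foldfpm computes, with the ordinals inferred from the comparison chains (the function name is illustrative):

#include <math.h>
#include <stdlib.h>

static double foldfpm_model(double x, int fpm)
{
  switch (fpm) {
  case 0:  return floor(x);
  case 1:  return ceil(x);
  case 2:  return trunc(x);
  case 3:  return sqrt(x);
  case 4:  return exp(x);
  case 5:  return exp2(x);
  case 6:  return log(x);    /* fldln2; fxch; fyl2x */
  case 7:  return log2(x);   /* fld1; fxch; fyl2x */
  case 8:  return log10(x);  /* fldlg2; fxch; fyl2x */
  case 9:  return sin(x);
  case 10: return cos(x);
  case 11: return tan(x);    /* fptan; fpop */
  default: abort();          /* bad fpm: int3 in the asm */
  }
}
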
|// Callable from C: double lj_vm_foldarith(double x, double y, int op)
@ -3017,72 +3192,87 @@ static void build_subroutines(BuildCtx *ctx, int cmov, int sse)
|// and basic math functions. ORDER ARITH
|->vm_foldarith:
if (sse) {
|.macro retxmm0; .if X64; ret; .else; jmp >7; .endif; .endmacro
|.macro retst0; .if X64; jmp >7; .else; ret; .endif; .endmacro
|.if X64
|
| .if X64WIN
| .define foldop, CARG3d
| .else
| .define foldop, CARG1d
| .endif
| cmp foldop, 1; je >1; ja >2
| addsd xmm0, xmm1; ret
|1: ; subsd xmm0, xmm1; ret
|2: ; cmp foldop, 3; je >1; ja >2
| mulsd xmm0, xmm1; ret
|1: ; divsd xmm0, xmm1; ret
|2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow
| cmp foldop, 7; je >1; ja >2
| sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret
|1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret
|2: ; cmp foldop, 9; ja >2
|.if X64WIN
| .define foldop, CARG3d
|.elif X64
| .define foldop, CARG1d
| movsd qword [rsp+8], xmm0 // Use scratch area.
| movsd qword [rsp+16], xmm1
| fld qword [rsp+8]
| fld qword [rsp+16]
|.else
| movsd qword [rsp-8], xmm0 // Use red zone.
| movsd qword [rsp-16], xmm1
| fld qword [rsp-8]
| fld qword [rsp-16]
|.endif
| je >1
| fpatan
|7:
|.if X64WIN
| fstp qword [rsp+8] // Use scratch area.
| movsd xmm0, qword [rsp+8]
|.else
| fstp qword [rsp-8] // Use red zone.
| movsd xmm0, qword [rsp-8]
|.endif
| ret
|1: ; fxch; fscale; fpop1; jmp <7
|2: ; cmp foldop, 11; je >1; ja >9
| minsd xmm0, xmm1; ret
|1: ; maxsd xmm0, xmm1; ret
|9: ; int3 // Bad op.
|
|.else // x86 calling convention.
|
| .define foldop, eax
| mov foldop, [esp+20]
| movsd xmm0, qword [esp+4]
| movsd xmm1, qword [esp+12]
|.endif
| cmp foldop, 1; je >1; ja >2
| addsd xmm0, xmm1; retxmm0
|1: ; subsd xmm0, xmm1; retxmm0
|2: ; cmp foldop, 3; je >1; ja >2
| mulsd xmm0, xmm1; retxmm0
|1: ; divsd xmm0, xmm1; retxmm0
|2: ; cmp foldop, 5
|.if X64
| jb ->vm_mod; je ->vm_pow // NYI: broken without SSE vm_pow.
|.else
| je >1; ja >2
| call ->vm_mod; retxmm0
|1: ; fld qword [esp+4]; fld qword [esp+12]; jmp ->vm_pow // NYI
|2:
|.endif
| cmp foldop, 7; je >1; ja >2
| sseconst_sign xmm1, RDa; xorps xmm0, xmm1; retxmm0
|1:
| sseconst_abs xmm1, RDa; andps xmm0, xmm1; retxmm0
|2: ; cmp foldop, 9; ja >2
|.if X64WIN
| movsd qword [esp+8], xmm0 // Use scratch area.
| movsd qword [esp+16], xmm1
| fld qword [esp+8]
| fld qword [esp+16]
|.elif X64
| movsd qword [esp-8], xmm0 // Use red zone.
| movsd qword [esp-16], xmm1
| fld qword [esp-8]
| fld qword [esp-16]
|.else
| fld qword [esp+4] // Reload from stack
| fld qword [esp+12]
|.endif
| je >1
| fpatan; retst0
|1: ; fxch; fscale; fpop1; retst0
|2: ; cmp foldop, 11; je >1; ja >9
| minsd xmm0, xmm1; retxmm0
|1: ; maxsd xmm0, xmm1; retxmm0
|9: ; int3 // Bad op.
|7: // Move return value depending on calling convention.
|.if X64WIN
| fstp qword [esp+8] // Use scratch area.
| movsd xmm0, qword [esp+8]
|.elif X64
| fstp qword [esp-8] // Use red zone.
| movsd xmm0, qword [esp-8]
|.else
| addsd xmm0, xmm1
|7:
| movsd qword [esp+4], xmm0 // Overwrite callee-owned args.
| fld qword [esp+4]
|.endif
| ret
|1: ; subsd xmm0, xmm1; jmp <7
|2: ; cmp foldop, 3; je >1; ja >2
| mulsd xmm0, xmm1; jmp <7
|1: ; divsd xmm0, xmm1; jmp <7
|2: ; cmp foldop, 5
| je >1; ja >2
| call ->vm_mod; jmp <7
|1: ; pop edx; call ->vm_pow; push edx; jmp <7 // Writes to scratch area.
|2: ; cmp foldop, 7; je >1; ja >2
| sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7
|1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7
|2: ; cmp foldop, 9; ja >2
| fld qword [esp+4] // Reload from stack
| fld qword [esp+12]
| je >1
| fpatan; ret
|1: ; fxch; fscale; fpop1; ret
|2: ; cmp foldop, 11; je >1; ja >9
| minsd xmm0, xmm1; jmp <7
|1: ; maxsd xmm0, xmm1; jmp <7
|9: ; int3 // Bad op.
|
|.endif
} else {
| mov eax, [esp+20]
| fld qword [esp+4]
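Likewise, the SSE and x87 variants of vm_foldarith share one op numbering (ORDER ARITH). A C sketch with the ordinals inferred from the comparisons above (illustrative, not part of the patch):

#include <math.h>
#include <stdlib.h>

static double foldarith_model(double x, double y, int op)
{
  switch (op) {
  case 0:  return x + y;                 /* addsd */
  case 1:  return x - y;                 /* subsd */
  case 2:  return x * y;                 /* mulsd */
  case 3:  return x / y;                 /* divsd */
  case 4:  return x - floor(x / y) * y;  /* ->vm_mod (Lua-style modulo) */
  case 5:  return pow(x, y);             /* ->vm_pow */
  case 6:  return -x;                    /* flip sign bit */
  case 7:  return fabs(x);               /* clear sign bit */
  case 8:  return atan2(x, y);           /* fpatan */
  case 9:  return ldexp(x, (int)y);      /* fscale */
  case 10: return x < y ? x : y;         /* minsd */
  case 11: return x > y ? x : y;         /* maxsd */
  default: abort();                      /* bad op: int3 in the asm */
  }
}
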
@ -3483,17 +3673,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop, int cmov, int sse)
| jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
break;
case BC_POW:
if (sse) {
sse = 0; /* NYI: temporary workaround. */
| ins_arithpre fld, movsd, xmm1
| call ->vm_pow
| ins_arithpost
sse = 1;
} else {
| ins_arithpre fld, movsd, xmm1
| call ->vm_pow
| ins_arithpost
}
| ins_arithpre fld, movsd, xmm1
| call ->vm_pow
| ins_arithpost
| ins_next
break;

File diff suppressed because it is too large.


@ -1991,9 +1991,19 @@ static int fpmjoin_pow(ASMState *as, IRIns *ir)
IRIns *irpp = IR(irp->op1);
if (irpp == ir-2 && irpp->o == IR_FPMATH &&
irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
emit_call(as, lj_vm_pow); /* st0 = lj_vm_pow(st1, st0) */
asm_x87load(as, irp->op2);
asm_x87load(as, irpp->op1);
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
IRIns *irx;
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, lj_vm_pow_sse);
irx = IR(irpp->op1);
if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
ra_left(as, RID_XMM0, irpp->op1);
ra_left(as, RID_XMM1, irp->op2);
return 1;
}
}
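The pattern being rejoined here: an FPMATH log2 feeding a MUL feeding an FPMATH exp2, with the intermediate results otherwise unused, computes x^y, so the three instructions can be emitted as a single call to lj_vm_pow_sse with x in xmm0 and y in xmm1. The underlying identity, as a one-line illustration (not part of the patch):

#include <math.h>

static double joined_pow(double x, double y)
{
  return exp2(y * log2(x));  /* == pow(x, y) for x > 0 */
}
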
@ -2007,30 +2017,35 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
Reg dest = ra_dest(as, ir, RSET_FPR);
Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
emit_mrm(as, XO_SQRTSD, dest, left);
} else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) {
Reg dest = ra_dest(as, ir, RSET_FPR);
Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
/* Round down/up/trunc == 1001/1010/1011. */
emit_i8(as, 0x09 + fpm);
/* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */
emit_mrm(as, XO_ROUNDSD, dest, left);
/* Let's pretend it's a 3-byte opcode, and compensate afterwards. */
/* This is atrocious, but the alternatives are much worse. */
if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */
}
*--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */
} else if (fpm <= IRFPM_TRUNC) {
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
ra_left(as, RID_XMM0, ir->op1);
} else {
if (as->flags & JIT_F_SSE4_1) { /* SSE4.1 has a rounding instruction. */
Reg dest = ra_dest(as, ir, RSET_FPR);
Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
/* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
** Let's pretend it's a 3-byte opcode, and compensate afterwards.
** This is atrocious, but the alternatives are much worse.
*/
/* Round down/up/trunc == 1001/1010/1011. */
emit_i8(as, 0x09 + fpm);
emit_mrm(as, XO_ROUNDSD, dest, left);
if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */
}
*--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */
} else { /* Call helper functions for SSE2 variant. */
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
ra_left(as, RID_XMM0, ir->op1);
}
} else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
/* Rejoined to pow(). */
} else { /* Handle x87 ops. */
int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
Reg dest = ir->r;
if (ra_hasreg(dest)) {
@ -2040,14 +2055,8 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
}
emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
switch (fpm) { /* st0 = lj_vm_*(st0) */
case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break;
case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break;
case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break;
case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
case IRFPM_EXP2:
if (fpmjoin_pow(as, ir)) return;
emit_call(as, lj_vm_exp2); /* st0 = lj_vm_exp2(st0) */
break;
case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
@ -2063,10 +2072,6 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
case IR_LDEXP:
emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
case IR_POWI:
emit_call(as, lj_vm_powi); /* st0 = lj_vm_powi(st0, [esp]) */
emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0);
break;
default: lua_assert(0); break;
}
break;
@ -2085,6 +2090,19 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
}
}
static void asm_powi(ASMState *as, IRIns *ir)
{
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, lj_vm_powi_sse);
ra_left(as, RID_XMM0, ir->op1);
ra_left(as, RID_EAX, ir->op2);
}
/* Find out whether swapping operands might be beneficial. */
static int swapops(ASMState *as, IRIns *ir)
{
@ -3132,9 +3150,10 @@ static void asm_ir(ASMState *as, IRIns *ir)
case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI:
case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
asm_fpmath(as, ir);
break;
case IR_POWI: asm_powi(as, ir); break;
/* Overflow-checking arithmetic ops. Note: don't use LEA here! */
case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
@ -3285,8 +3304,22 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
if (inloop)
as->modset = RSET_SCRATCH;
break;
case IR_POWI:
ir->prev = REGSP_HINT(RID_XMM0);
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
continue;
case IR_FPMATH:
if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
if (ir->op2 == IRFPM_EXP2) { /* May be joined to lj_vm_pow_sse. */
ir->prev = REGSP_HINT(RID_XMM0);
#if !LJ_64
if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */
as->evenspill = 4;
#endif
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
continue;
} else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
ir->prev = REGSP_HINT(RID_XMM0);
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);


@ -34,16 +34,13 @@ LJ_ASMF void lj_vm_exit_handler(void);
LJ_ASMF void lj_vm_exit_interp(void);
/* Handlers callable from compiled code. */
LJ_ASMF void lj_vm_floor(void);
LJ_ASMF void lj_vm_ceil(void);
LJ_ASMF void lj_vm_trunc(void);
LJ_ASMF void lj_vm_floor_sse(void);
LJ_ASMF void lj_vm_ceil_sse(void);
LJ_ASMF void lj_vm_trunc_sse(void);
LJ_ASMF void lj_vm_exp(void);
LJ_ASMF void lj_vm_exp2(void);
LJ_ASMF void lj_vm_pow(void);
LJ_ASMF void lj_vm_powi(void);
LJ_ASMF void lj_vm_pow_sse(void);
LJ_ASMF void lj_vm_powi_sse(void);
/* Call gates for functions. */
LJ_ASMF void lj_gate_lf(void);