Fix pow() optimization inconsistencies.

master
Mike Pall 2022-01-24 14:37:50 +01:00
parent c18acfe756
commit 9512d5c1ac
15 changed files with 114 additions and 205 deletions
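
The change routes the generic x^y paths (the math.pow fast function, the BC_POW fallback, constant folding via lj_vm_foldarith, and the JIT's non-narrowed case) through one new helper, lj_vm_pow(), while the narrowed integer case keeps calling lj_vm_powi() under a matching +-65536 range guard. Below is a minimal, self-contained C sketch of the rule that helper applies, based on the hunk further down that adds lj_vm_pow(); powi_simple() is illustrative only and does not reproduce the exact multiplication order of lj_vm_powui():

#include <math.h>
#include <stdint.h>

/* Simplified integer power; the real lj_vm_powui() (see the hunk that adds
** lj_vm_pow() below) orders its squarings differently but computes the same
** mathematical function. */
static double powi_simple(double x, int32_t k)
{
  double y = 1.0;
  uint32_t u = k < 0 ? (uint32_t)-k : (uint32_t)k;
  for (; u; u >>= 1, x *= x)
    if (u & 1) y *= x;
  return k < 0 ? 1.0 / y : y;
}

/* The unified dispatch rule of lj_vm_pow(): an exponent that is integral and
** within +-65536 takes the integer-power path, everything else falls back to
** the C library pow(). */
double vm_pow_sketch(double x, double y)
{
  if (y >= -65536.0 && y <= 65536.0 && y == (double)(int32_t)y)
    return powi_simple(x, (int32_t)y);
  return pow(x, y);
}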

View File

@@ -1670,7 +1670,6 @@ static void asm_loop(ASMState *as)
#if !LJ_SOFTFP32
#if !LJ_TARGET_X86ORX64
#define asm_ldexp(as, ir) asm_callid(as, ir, IRCALL_ldexp)
#define asm_fppowi(as, ir) asm_callid(as, ir, IRCALL_lj_vm_powi)
#endif
static void asm_pow(ASMState *as, IRIns *ir)
@@ -1681,10 +1680,8 @@ static void asm_pow(ASMState *as, IRIns *ir)
IRCALL_lj_carith_powu64);
else
#endif
if (irt_isnum(IR(ir->op2)->t))
asm_callid(as, ir, IRCALL_pow);
else
asm_fppowi(as, ir);
asm_callid(as, ir, irt_isnum(IR(ir->op2)->t) ? IRCALL_lj_vm_pow :
IRCALL_lj_vm_powi);
}
static void asm_div(ASMState *as, IRIns *ir)

View File

@@ -2017,19 +2017,6 @@ static void asm_ldexp(ASMState *as, IRIns *ir)
asm_x87load(as, ir->op2);
}
static void asm_fppowi(ASMState *as, IRIns *ir)
{
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, lj_vm_powi_sse);
ra_left(as, RID_XMM0, ir->op1);
ra_left(as, RID_EAX, ir->op2);
}
static int asm_swapops(ASMState *as, IRIns *ir)
{
IRIns *irl = IR(ir->op1);

View File

@@ -44,7 +44,7 @@ extern double __divdf3(double a, double b);
#define GOTDEF(_) \
_(floor) _(ceil) _(trunc) _(log) _(log10) _(exp) _(sin) _(cos) _(tan) \
_(asin) _(acos) _(atan) _(sinh) _(cosh) _(tanh) _(frexp) _(modf) _(atan2) \
_(pow) _(fmod) _(ldexp) _(lj_vm_modi) \
_(lj_vm_pow) _(fmod) _(ldexp) _(lj_vm_modi) \
_(lj_dispatch_call) _(lj_dispatch_ins) _(lj_dispatch_stitch) \
_(lj_dispatch_profile) _(lj_err_throw) \
_(lj_ffh_coroutine_wrap_err) _(lj_func_closeuv) _(lj_func_newL_gc) \

View File

@@ -218,7 +218,7 @@ typedef struct CCallInfo {
_(ANY, log, 1, N, NUM, XA_FP) \
_(ANY, lj_vm_log2, 1, N, NUM, XA_FP) \
_(ANY, lj_vm_powi, 2, N, NUM, XA_FP) \
_(ANY, pow, 2, N, NUM, XA2_FP) \
_(ANY, lj_vm_pow, 2, N, NUM, XA2_FP) \
_(ANY, atan2, 2, N, NUM, XA2_FP) \
_(ANY, ldexp, 2, N, NUM, XA_FP) \
_(SOFTFP, lj_vm_tobit, 1, N, INT, XA_FP32) \

View File

@@ -1143,33 +1143,6 @@ LJFOLDF(simplify_numpow_xkint)
return ref;
}
LJFOLD(POW any KNUM)
LJFOLDF(simplify_numpow_xknum)
{
if (knumright == 0.5) /* x ^ 0.5 ==> sqrt(x) */
return emitir(IRTN(IR_FPMATH), fins->op1, IRFPM_SQRT);
return NEXTFOLD;
}
LJFOLD(POW KNUM any)
LJFOLDF(simplify_numpow_kx)
{
lua_Number n = knumleft;
if (n == 2.0 && irt_isint(fright->t)) { /* 2.0 ^ i ==> ldexp(1.0, i) */
#if LJ_TARGET_X86ORX64
/* Different IR_LDEXP calling convention on x86/x64 requires conversion. */
fins->o = IR_CONV;
fins->op1 = fins->op2;
fins->op2 = IRCONV_NUM_INT;
fins->op2 = (IRRef1)lj_opt_fold(J);
#endif
fins->op1 = (IRRef1)lj_ir_knum_one(J);
fins->o = IR_LDEXP;
return RETRYFOLD;
}
return NEXTFOLD;
}
/* -- Simplify conversions ------------------------------------------------ */
LJFOLD(CONV CONV IRCONV_NUM_INT) /* _NUM */
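
The hunk above removes the x ^ 0.5 ==> sqrt(x) and 2.0 ^ i ==> ldexp(1.0, i) folds, so these shapes are now compiled like any other x^y. The commit message does not spell out the motivation, but the following standalone C program shows one way a sqrt() fold can disagree with pow() under C99/IEEE-754 special-case rules, which is the kind of divergence between traces and the interpreter such folds can introduce:

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* C99 Annex F: pow(+-0, 0.5) returns +0 and pow(-inf, 0.5) returns +inf,
  ** while sqrt(-0.0) returns -0.0 and sqrt() of negative infinity is NaN. */
  printf("pow(-0.0, 0.5) = %g   sqrt(-0.0) = %g\n",
         pow(-0.0, 0.5), sqrt(-0.0));
  printf("pow(-inf, 0.5) = %g   sqrt(-inf) = %g\n",
         pow(-INFINITY, 0.5), sqrt(-INFINITY));
  return 0;
}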

View File

@@ -590,20 +590,14 @@ TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc)
rb = conv_str_tonum(J, rb, vb);
rb = lj_ir_tonum(J, rb); /* Left arg is always treated as an FP number. */
rc = conv_str_tonum(J, rc, vc);
/* Narrowing must be unconditional to preserve (-x)^i semantics. */
if (tvisint(vc) || numisint(numV(vc))) {
int checkrange = 0;
/* pow() is faster for bigger exponents. But do this only for (+k)^i. */
if (tref_isk(rb) && (int32_t)ir_knum(IR(tref_ref(rb)))->u32.hi >= 0) {
int32_t k = numberVint(vc);
if (!(k >= -65536 && k <= 65536)) goto force_pow_num;
checkrange = 1;
}
int32_t k = numberVint(vc);
if (!(k >= -65536 && k <= 65536)) goto force_pow_num;
if (!tref_isinteger(rc)) {
/* Guarded conversion to integer! */
rc = emitir(IRTGI(IR_CONV), rc, IRCONV_INT_NUM|IRCONV_CHECK);
}
if (checkrange && !tref_isk(rc)) { /* Range guard: -65536 <= i <= 65536 */
if (!tref_isk(rc)) { /* Range guard: -65536 <= i <= 65536 */
TRef tmp = emitir(IRTI(IR_ADD), rc, lj_ir_kint(J, 65536));
emitir(IRTGI(IR_ULE), tmp, lj_ir_kint(J, 2*65536));
}
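
With the simplification above, lj_opt_narrow_pow() narrows x ^ y to the integer-power path for any integral exponent in [-65536, 65536] (not just for constant non-negative bases) and jumps to force_pow_num otherwise; when the exponent is not a compile-time constant, the emitted guards enforce that decision at runtime. A hedged C rendering of the condition those guards check follows; the function name is this sketch's own, not LuaJIT API. The trace encodes the range test as one unsigned compare on k + 65536 (the IR_ADD/IR_ULE pair above), while the sketch tests the range first so the double-to-int cast stays well defined in portable C:

#include <stdint.h>

/* Runtime condition behind the guards: the exponent must convert exactly to
** an int32 (the checked IR_CONV) and must lie within +-65536 (the range
** guard). If either part fails, execution leaves the trace and the generic
** pow path handles the operation. */
static int pow_exponent_narrows(double n)
{
  int32_t k;
  if (!(n >= -65536.0 && n <= 65536.0)) return 0;
  k = (int32_t)n;
  return (double)k == n;
}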

View File

@@ -83,10 +83,6 @@ LJ_ASMF int32_t LJ_FASTCALL lj_vm_modi(int32_t, int32_t);
LJ_ASMF void lj_vm_floor_sse(void);
LJ_ASMF void lj_vm_ceil_sse(void);
LJ_ASMF void lj_vm_trunc_sse(void);
LJ_ASMF void lj_vm_powi_sse(void);
#define lj_vm_powi NULL
#else
LJ_ASMF double lj_vm_powi(double, int32_t);
#endif
#if LJ_TARGET_PPC || LJ_TARGET_ARM64
#define lj_vm_trunc trunc
@@ -102,6 +98,9 @@ LJ_ASMF int lj_vm_errno(void);
LJ_ASMF TValue *lj_vm_next(GCtab *t, uint32_t idx);
#endif
LJ_ASMF double lj_vm_powi(double, int32_t);
LJ_ASMF double lj_vm_pow(double, double);
/* Continuations for metamethods. */
LJ_ASMF void lj_cont_cat(void); /* Continue with concatenation. */
LJ_ASMF void lj_cont_ra(void); /* Store result in RA from instruction. */

View File

@@ -30,57 +30,11 @@ LJ_FUNCA double lj_wrap_sinh(double x) { return sinh(x); }
LJ_FUNCA double lj_wrap_cosh(double x) { return cosh(x); }
LJ_FUNCA double lj_wrap_tanh(double x) { return tanh(x); }
LJ_FUNCA double lj_wrap_atan2(double x, double y) { return atan2(x, y); }
LJ_FUNCA double lj_wrap_pow(double x, double y) { return pow(x, y); }
LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
#endif
/* -- Helper functions for generated machine code ------------------------- */
/* -- Helper functions ---------------------------------------------------- */
double lj_vm_foldarith(double x, double y, int op)
{
switch (op) {
case IR_ADD - IR_ADD: return x+y; break;
case IR_SUB - IR_ADD: return x-y; break;
case IR_MUL - IR_ADD: return x*y; break;
case IR_DIV - IR_ADD: return x/y; break;
case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
case IR_POW - IR_ADD: return pow(x, y); break;
case IR_NEG - IR_ADD: return -x; break;
case IR_ABS - IR_ADD: return fabs(x); break;
#if LJ_HASJIT
case IR_LDEXP - IR_ADD: return ldexp(x, (int)y); break;
case IR_MIN - IR_ADD: return x < y ? x : y; break;
case IR_MAX - IR_ADD: return x > y ? x : y; break;
#endif
default: return x;
}
}
#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS
int32_t LJ_FASTCALL lj_vm_modi(int32_t a, int32_t b)
{
uint32_t y, ua, ub;
/* This must be checked before using this function. */
lj_assertX(b != 0, "modulo with zero divisor");
ua = a < 0 ? (uint32_t)-a : (uint32_t)a;
ub = b < 0 ? (uint32_t)-b : (uint32_t)b;
y = ua % ub;
if (y != 0 && (a^b) < 0) y = y - ub;
if (((int32_t)y^b) < 0) y = (uint32_t)-(int32_t)y;
return (int32_t)y;
}
#endif
#if LJ_HASJIT
#ifdef LUAJIT_NO_LOG2
double lj_vm_log2(double a)
{
return log(a) * 1.4426950408889634074;
}
#endif
#if !LJ_TARGET_X86ORX64
/* Unsigned x^k. */
static double lj_vm_powui(double x, uint32_t k)
{
@@ -112,6 +66,60 @@ double lj_vm_powi(double x, int32_t k)
else
return 1.0 / lj_vm_powui(x, (uint32_t)-k);
}
double lj_vm_pow(double x, double y)
{
int32_t k = lj_num2int(y);
if ((k >= -65536 && k <= 65536) && y == (double)k)
return lj_vm_powi(x, k);
else
return pow(x, y);
}
double lj_vm_foldarith(double x, double y, int op)
{
switch (op) {
case IR_ADD - IR_ADD: return x+y; break;
case IR_SUB - IR_ADD: return x-y; break;
case IR_MUL - IR_ADD: return x*y; break;
case IR_DIV - IR_ADD: return x/y; break;
case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
case IR_POW - IR_ADD: return lj_vm_pow(x, y); break;
case IR_NEG - IR_ADD: return -x; break;
case IR_ABS - IR_ADD: return fabs(x); break;
#if LJ_HASJIT
case IR_LDEXP - IR_ADD: return ldexp(x, (int)y); break;
case IR_MIN - IR_ADD: return x < y ? x : y; break;
case IR_MAX - IR_ADD: return x > y ? x : y; break;
#endif
default: return x;
}
}
/* -- Helper functions for generated machine code ------------------------- */
#if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS
int32_t LJ_FASTCALL lj_vm_modi(int32_t a, int32_t b)
{
uint32_t y, ua, ub;
/* This must be checked before using this function. */
lj_assertX(b != 0, "modulo with zero divisor");
ua = a < 0 ? (uint32_t)-a : (uint32_t)a;
ub = b < 0 ? (uint32_t)-b : (uint32_t)b;
y = ua % ub;
if (y != 0 && (a^b) < 0) y = y - ub;
if (((int32_t)y^b) < 0) y = (uint32_t)-(int32_t)y;
return (int32_t)y;
}
#endif
#if LJ_HASJIT
#ifdef LUAJIT_NO_LOG2
double lj_vm_log2(double a)
{
return log(a) * 1.4426950408889634074;
}
#endif
/* Computes fpm(x) for extended math functions. */

View File

@@ -1477,11 +1477,11 @@ static void build_subroutines(BuildCtx *ctx)
|.endif
|.endmacro
|
|.macro math_extern2, func
|.macro math_extern2, name, func
|.if HFABI
| .ffunc_dd math_ .. func
| .ffunc_dd math_ .. name
|.else
| .ffunc_nn math_ .. func
| .ffunc_nn math_ .. name
|.endif
| .IOS mov RA, BASE
| bl extern func
@@ -1492,6 +1492,9 @@ static void build_subroutines(BuildCtx *ctx)
| b ->fff_restv
|.endif
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
|.if FPU
| .ffunc_d math_sqrt
@@ -1537,7 +1540,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -3203,7 +3206,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
break;
case BC_POW:
| // NYI: (partial) integer arithmetic.
| ins_arithfp extern, extern pow
| ins_arithfp extern, extern lj_vm_pow
break;
case BC_CAT:

View File

@@ -1387,11 +1387,14 @@ static void build_subroutines(BuildCtx *ctx)
| b ->fff_resn
|.endmacro
|
|.macro math_extern2, func
| .ffunc_nn math_ .. func
|.macro math_extern2, name, func
| .ffunc_nn math_ .. name
| bl extern func
| b ->fff_resn
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
|.ffunc_n math_sqrt
| fsqrt d0, d0
@@ -1420,7 +1423,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -2674,7 +2677,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| ins_arithload FARG1, FARG2
| ins_arithfallback ins_arithcheck_num
|.if "fpins" == "fpow"
| bl extern pow
| bl extern lj_vm_pow
|.else
| fpins FARG1, FARG1, FARG2
|.endif

View File

@@ -1623,14 +1623,17 @@ static void build_subroutines(BuildCtx *ctx)
|. nop
|.endmacro
|
|.macro math_extern2, func
| .ffunc_nn math_ .. func
|.macro math_extern2, name, func
| .ffunc_nn math_ .. name
|. load_got func
| call_extern
|. nop
| b ->fff_resn
|. nop
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
|// TODO: Return integer type if result is integer (own sf implementation).
|.macro math_round, func
@@ -1684,7 +1687,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -3689,7 +3692,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| sltiu AT, SFARG1HI, LJ_TISNUM
| sltiu TMP0, SFARG2HI, LJ_TISNUM
| and AT, AT, TMP0
| load_got pow
| load_got lj_vm_pow
| beqz AT, ->vmeta_arith
|. addu RA, BASE, RA
|.if FPU

View File

@@ -1667,14 +1667,17 @@ static void build_subroutines(BuildCtx *ctx)
|. nop
|.endmacro
|
|.macro math_extern2, func
| .ffunc_nn math_ .. func
|.macro math_extern2, name, func
| .ffunc_nn math_ .. name
|. load_got func
| call_extern
|. nop
| b ->fff_resn
|. nop
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
|// TODO: Return integer type if result is integer (own sf implementation).
|.macro math_round, func
@@ -1728,7 +1731,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -3915,7 +3918,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| sltiu TMP0, TMP0, LJ_TISNUM
| sltiu TMP1, TMP1, LJ_TISNUM
| and AT, TMP0, TMP1
| load_got pow
| load_got lj_vm_pow
| beqz AT, ->vmeta_arith
|. daddu RA, BASE, RA
|.if FPU

View File

@@ -2012,11 +2012,14 @@ static void build_subroutines(BuildCtx *ctx)
| b ->fff_resn
|.endmacro
|
|.macro math_extern2, func
| .ffunc_nn math_ .. func
|.macro math_extern2, name, func
| .ffunc_nn math_ .. name
| blex func
| b ->fff_resn
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
|.macro math_round, func
| .ffunc_1 math_ .. func
@@ -2141,7 +2144,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -4139,7 +4142,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| checknum cr1, CARG3
| crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
| bge ->vmeta_arith_vv
| blex pow
| blex lj_vm_pow
| ins_next1
|.if FPU
| stfdx FARG1, BASE, RA

View File

@@ -1755,13 +1755,16 @@ static void build_subroutines(BuildCtx *ctx)
| jmp ->fff_resxmm0
|.endmacro
|
|.macro math_extern2, func
| .ffunc_nn math_ .. func
|.macro math_extern2, name, func
| .ffunc_nn math_ .. name
| mov RB, BASE
| call extern func
| mov BASE, RB
| jmp ->fff_resxmm0
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
| math_extern log10
| math_extern exp
@@ -1774,7 +1777,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -2579,41 +2582,6 @@ static void build_subroutines(BuildCtx *ctx)
| subsd xmm0, xmm1
| ret
|
|// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
|->vm_powi_sse:
| cmp eax, 1; jle >6 // i<=1?
| // Now 1 < (unsigned)i <= 0x80000000.
|1: // Handle leading zeros.
| test eax, 1; jnz >2
| mulsd xmm0, xmm0
| shr eax, 1
| jmp <1
|2:
| shr eax, 1; jz >5
| movaps xmm1, xmm0
|3: // Handle trailing bits.
| mulsd xmm0, xmm0
| shr eax, 1; jz >4
| jnc <3
| mulsd xmm1, xmm0
| jmp <3
|4:
| mulsd xmm0, xmm1
|5:
| ret
|6:
| je <5 // x^1 ==> x
| jb >7 // x^0 ==> 1
| neg eax
| call <1
| sseconst_1 xmm1, RD
| divsd xmm1, xmm0
| movaps xmm0, xmm1
| ret
|7:
| sseconst_1 xmm0, RD
| ret
|
|//-----------------------------------------------------------------------
|//-- Miscellaneous functions --------------------------------------------
|//-----------------------------------------------------------------------
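
The vm_powi_sse routine removed here (and again in the next file) is an SSE2 hand-coding of unsigned square-and-multiply; after this change the x86/x64 ports call the portable C lj_vm_powi()/lj_vm_powui() like the other architectures. For reference, a C sketch of the algorithm the deleted assembly implements, following the structure of lj_vm_powui() in the hunk that adds lj_vm_pow(); it assumes a nonzero exponent whose magnitude stays within the +-65536 bound the callers guarantee:

#include <stdint.h>

/* x^k for k != 0: square x while the low exponent bits are zero (the
** "leading zeros" loop in the asm), then square-and-multiply over the
** remaining bits (the "trailing bits" loop). */
static double powui_sketch(double x, uint32_t k)
{
  double y;
  for (; (k & 1) == 0; k >>= 1) x *= x;
  y = x;
  if ((k >>= 1) != 0) {
    for (;;) {
      x *= x;
      if (k == 1) break;
      if (k & 1) y *= x;
      k >>= 1;
    }
    y *= x;
  }
  return y;
}

double powi_sketch(double x, int32_t k)
{
  if (k == 1) return x;                               /* x^1 ==> x */
  else if (k >= 2) return powui_sketch(x, (uint32_t)k);
  else if (k == 0) return 1.0;                        /* x^0 ==> 1 */
  else return 1.0 / powui_sketch(x, (uint32_t)-k);    /* x^-k ==> 1/(x^k) */
}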

View File

@@ -2138,8 +2138,8 @@ static void build_subroutines(BuildCtx *ctx)
| jmp ->fff_resfp
|.endmacro
|
|.macro math_extern2, func
| .ffunc_nnsse math_ .. func
|.macro math_extern2, name, func
| .ffunc_nnsse math_ .. name
|.if not X64
| movsd FPARG1, xmm0
| movsd FPARG3, xmm1
@@ -2149,6 +2149,9 @@ static void build_subroutines(BuildCtx *ctx)
| mov BASE, RB
| jmp ->fff_resfp
|.endmacro
|.macro math_extern2, func
| math_extern2 func, func
|.endmacro
|
| math_extern log10
| math_extern exp
@@ -2161,7 +2164,7 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern sinh
| math_extern cosh
| math_extern tanh
| math_extern2 pow
| math_extern2 pow, lj_vm_pow
| math_extern2 atan2
| math_extern2 fmod
|
@@ -3038,41 +3041,6 @@ static void build_subroutines(BuildCtx *ctx)
| subsd xmm0, xmm1
| ret
|
|// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
|->vm_powi_sse:
| cmp eax, 1; jle >6 // i<=1?
| // Now 1 < (unsigned)i <= 0x80000000.
|1: // Handle leading zeros.
| test eax, 1; jnz >2
| mulsd xmm0, xmm0
| shr eax, 1
| jmp <1
|2:
| shr eax, 1; jz >5
| movaps xmm1, xmm0
|3: // Handle trailing bits.
| mulsd xmm0, xmm0
| shr eax, 1; jz >4
| jnc <3
| mulsd xmm1, xmm0
| jmp <3
|4:
| mulsd xmm0, xmm1
|5:
| ret
|6:
| je <5 // x^1 ==> x
| jb >7 // x^0 ==> 1
| neg eax
| call <1
| sseconst_1 xmm1, RDa
| divsd xmm1, xmm0
| movaps xmm0, xmm1
| ret
|7:
| sseconst_1 xmm0, RDa
| ret
|
|//-----------------------------------------------------------------------
|//-- Miscellaneous functions --------------------------------------------
|//-----------------------------------------------------------------------
@@ -3954,7 +3922,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| movsd FPARG1, xmm0
| movsd FPARG3, xmm1
|.endif
| call extern pow
| call extern lj_vm_pow
| movzx RA, PC_RA
| mov BASE, RB
|.if X64