LuaJIT/src/lj_opt_loop.c

/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
*/
#define lj_opt_loop_c
#define LUA_CORE
#include "lj_obj.h"
#if LJ_HASJIT
#include "lj_gc.h"
#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"
/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
** contains a mix of invariant and variant instructions and performs
** exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
** all remaining loop iterations.
**
** On first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
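/* A rough, hand-written sketch of the resulting layout (not actual IR dump
** output; refs and opcodes are only illustrative) for a loop such as
** 'local s=0; for i=1,100 do s=s+i end':
**
**   ....        pre-roll: SLOADs for s and i, guards, ADD s+i, ADD i+1,
**               loop-exit check -- one complete iteration, mixing
**               invariant and variant instructions
**   ....  LOOP
**   ....        loop body: only the variant copies (ADD s+i, ADD i+1,
**               loop-exit check), with operands rewritten via subst[]
**   ....  PHI   for s and i, emitted below the body by loop_emit_phi()
**
** The machine code for the body ends with a branch back to the point just
** after LOOP, so the pre-roll executes once and the body loops.
*/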
/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref) (&J->cur.ir[(ref)])
/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
/* -- PHI elimination ----------------------------------------------------- */
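/* A made-up example of the marking scheme used below (refs are invented):
** suppose copy-substitution collected candidates 0011 and 0015. If
** subst[0011] == 0011, the value turned out to be invariant, so pass #1
** marks it and pass #4 silently drops it. If subst[0015] is e.g. an ADD
** that takes 0015 itself as an operand (a simple recurrence), the quick
** check in pass #1 leaves it unmarked and pass #4 emits an IR_PHI with
** 0015 as left and subst[0015] as right operand below the loop body.
** Passes #2 and #3 handle the less common cases: clearing marks for values
** that are actually used in the variant part, and adding PHIs for variant
** slots that never got one.
*/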
/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi)
{
  int pass2 = 0;
  IRRef i, nslots;
  IRRef invar = J->chain[IR_LOOP];

  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_setmark(IR(lref)->t);
    } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
      /* Quick check for simple recurrences failed, need pass2. */
      irt_setmark(IR(lref)->t);
      pass2 = 1;
    }
  }

  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (pass2) {
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op1)) irt_clearmark(IR(ir->op1)->t);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
    }
  }

  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }

  /* Pass #4: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
/* -- Loop unrolling using copy-substitution ------------------------------ */
/* Unroll loop. */
static void loop_unroll(jit_State *J)
{
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapShot *osnap, *snap;
  IRRef2 *loopmap;
  BCReg loopslots;
  MSize nsnap, nsnapmap;
  IRRef ins, invar, osnapref;

  /* Use temp buffer for substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  ** Note: don't call into the VM or run the GC or the buffer may be gone.
  */
  invar = J->cur.nins;
  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
                                   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
  subst[REF_BASE] = REF_BASE;
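  /* Invented example of how the table is used (refs are made up): if
  ** pre-roll instruction 0x8005 is re-emitted as 0x8032 in the loop body,
  ** subst[0x8005] becomes 0x8032 and later copies referencing 0x8005 are
  ** rewritten to 0x8032.  A regular invariant instruction simply maps to
  ** itself (see the shortcut below).
  */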
  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Ensure size for copy-substituted snapshots (minus #0 and loop snapshot). */
  nsnap = J->cur.nsnap;
  if (LJ_UNLIKELY(2*nsnap-2 > J->sizesnap)) {
    MSize maxsnap = (MSize)J->param[JIT_P_maxsnap];
    if (2*nsnap-2 > maxsnap)
      lj_trace_err(J, LJ_TRERR_SNAPOV);
    lj_mem_growvec(J->L, J->snapbuf, J->sizesnap, maxsnap, SnapShot);
    J->cur.snap = J->snapbuf;
  }
  nsnapmap = J->cur.nsnapmap;  /* Use temp. copy to avoid undo. */
  if (LJ_UNLIKELY(nsnapmap*2 > J->sizesnapmap)) {
    J->snapmapbuf = (IRRef2 *)lj_mem_realloc(J->L, J->snapmapbuf,
                                             J->sizesnapmap*sizeof(IRRef2),
                                             2*J->sizesnapmap*sizeof(IRRef2));
    J->cur.snapmap = J->snapmapbuf;
    J->sizesnapmap *= 2;
  }

  /* The loop snapshot is used for fallback substitutions. */
  snap = &J->cur.snap[nsnap-1];
  loopmap = &J->cur.snapmap[snap->mapofs];
  loopslots = snap->nslots;
  /* The PC of snapshot #0 and the loop snapshot must match. */
  lua_assert(loopmap[loopslots] == J->cur.snapmap[J->cur.snap[0].nslots]);

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];
  osnapref = osnap->ref;

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    /* Copy-substitute snapshot. */
    if (ins >= osnapref) {
      IRRef2 *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
      BCReg s, nslots;
      uint32_t nmapofs, nframelinks;
      if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
        nmapofs = nsnapmap;
        snap++;  /* Add new snapshot. */
      } else {
        nmapofs = snap->mapofs;  /* Overwrite previous snapshot. */
      }
      J->guardemit.irt = 0;
      nslots = osnap->nslots;
      nframelinks = osnap->nframelinks;
      snap->mapofs = (uint16_t)nmapofs;
      snap->ref = (IRRef1)J->cur.nins;
      snap->nslots = (uint8_t)nslots;
      snap->nframelinks = (uint8_t)nframelinks;
      snap->count = 0;
      osnap++;
      osnapref = osnap->ref;
      nsnapmap = nmapofs + nslots + nframelinks;
      nmap = &J->cur.snapmap[nmapofs];
      /* Substitute snapshot slots. */
      for (s = 0; s < nslots; s++) {
        IRRef ref = snap_ref(omap[s]);
        if (ref) {
          if (!irref_isk(ref))
            ref = subst[ref];
        } else if (s < loopslots) {
          ref = loopmap[s];
        }
        nmap[s] = ref;
      }
      /* Copy frame links. */
      nmap += nslots;
      omap += nslots;
      for (s = 0; s < nframelinks; s++)
        nmap[s] = omap[s];
    }

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins && ref < invar) {  /* Loop-carried dependency? */
        IRIns *irr = IR(ref);
        /* Potential PHI? */
        if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
          irt_setphi(irr->t);
          if (nphi >= LJ_MAX_PHI)
            lj_trace_err(J, LJ_TRERR_PHIOV);
          phi[nphi++] = (IRRef1)ref;
        }
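        /* Hedged example of the instability handled next: the copied
        ** instruction was recorded with a number result, but the
        ** loop-carried value it now refers to is an integer.  That common
        ** int->num case is patched up with a conversion; other mismatches
        ** abort the optimization (LJ_TRERR_TYPEINS).
        */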
        /* Check all loop-carried dependencies for type instability. */
        if (!irt_sametype(t, irr->t)) {
          if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num case. */
            subst[ins] = tref_ref(emitir(IRTN(IR_TONUM), ref, 0));
          else if (!(irt_isinteger(t) && irt_isinteger(irr->t)))
            lj_trace_err(J, LJ_TRERR_TYPEINS);
        }
      }
    }
  }

  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
    J->cur.nsnapmap = (uint16_t)nsnapmap;
    snap++;
  } else {
    J->cur.nsnapmap = (uint16_t)snap->mapofs;  /* Last snapshot is redundant. */
  }
  J->cur.nsnap = (uint16_t)(snap - J->cur.snap);
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  loop_emit_phi(J, subst, phi, nphi);
}
/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins)
{
  lj_ir_rollback(J, ins);
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}
/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((jit_State *)ud);
  return NULL;
}
/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  int errcode = lj_vm_cpcall(J->L, cploop_opt, NULL, J);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnum(L->top-1)) {  /* Trace error? */
      int32_t e = lj_num2int(numV(L->top-1));
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        J->guardemit.irt = 0;
        loop_undo(J, nins);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}
#undef IR
#undef emitir
#undef emitir_raw
#endif