/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2022 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
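**
** A simplified sketch of the resulting layout, in the style of a -jdump
** IR listing, for the inner loop of 'for i = 1, 100 do end' (ref numbers
** and trailing annotations are illustrative, not actual trace output;
** '>' marks guards, '+' marks PHI refs):
**
**   0001    int SLOAD  #1    CI        pre-roll: load 'i'
**   0002  + int ADD    0001  +1        pre-roll: one full iteration
**   0003 >  int LE     0002  +100      pre-roll: loop-exit guard
**   0004 ------------ LOOP ------------
**   0005  + int ADD    0002  +1        loop body: variant copy only
**   0006 >  int LE     0005  +100
**   0007    int PHI    0002  0005      loop-carried 'i', below the body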
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)			(&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi,
                          IRRef nphi, SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
        /* Quick check for simple recurrences failed, need pass #2. */
        irt_setmark(IR(lref)->t);
        passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRIns *irr = IR(subst[lref]);
        if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
          irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
          passx = 1;  /* Retry. */
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar) irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}

/* -- Loop unrolling using copy-substitution ------------------------------ */
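
/* The snapshot substitution below merges the loop snapshot into each
** copied snapshot, keyed by slot number. A sketch with hypothetical slots
** and refs (not from an actual trace): given loop snapshot entries for
** slots {1,3} and an original snapshot covering slots {2,3}, the new
** snapshot takes slot 1 from the loop map, then slot 2 and the shadowing
** slot 3 from the substituted original map. Constant refs are copied
** through unchanged.
*/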

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint32_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->mcofs = 0;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];

  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint32_t)(nmap - J->cur.snapmap);
}

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lj_assertJ(*psentinel == J->cur.snapmap[J->cur.snap[0].nent],
             "mismatched PC for loop snapshot");
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
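    /* E.g. (hypothetical refs): if the pre-roll holds '0010 ADD 0008 0009'
    ** and the copies of 0008/0009 were re-emitted as 0020/0021, the copy
    ** of 0010 is emitted as 'ADD 0020 0021' and its resulting ref is
    ** stored in subst[0010] for all later uses of 0010.
    */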
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
        IRIns *irr = IR(ref);
        if (ref < invar) {  /* Loop-carried dependency? */
          /* Potential PHI? */
          if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
          /* Check all loop-carried dependencies for type instability. */
          if (!irt_sametype(t, irr->t)) {
            if (irt_isinteger(t) && irt_isinteger(irr->t))
              continue;
            else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
              ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
            else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
              ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                    IRCONV_INT_NUM|IRCONV_CHECK));
            else
              lj_trace_err(J, LJ_TRERR_TYPEINS);
            subst[ins] = (IRRef1)ref;
            irr = IR(ref);
            goto phiconv;
          }
        } else if (ref != REF_DROP && ref > invar &&
                   ((irr->o == IR_CONV && irr->op1 < invar) ||
                    (irr->o == IR_ALEN && irr->op2 < invar &&
                     irr->op2 != REF_NIL))) {
          /* May need an extra PHI for a CONV or ALEN hint. */
          ref = irr->o == IR_CONV ? irr->op1 : irr->op2;
          irr = IR(ref);
        phiconv:
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint32_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lj_assertJ(J->cur.nsnapmap <= J->sizesnapmap, "bad snapshot map index");
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint32_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
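      /* A sketch of the flipped-boolean case mentioned below (assumed,
      ** simplified): in 'local b = true; for i = 1, n do b = not b end'
      ** the loop-carried 'b' alternates between the IR types 'true' and
      ** 'false', so no CONV can fix the mismatch. Recording one more
      ** iteration unrolls the loop and makes it type-stable again.
      */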
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif