ocaml/asmcomp/arm64/emit.mlp

(**************************************************************************)
(*                                                                        *)
(*                                 OCaml                                  *)
(*                                                                        *)
(*             Xavier Leroy, projet Gallium, INRIA Rocquencourt           *)
(*                 Benedikt Meurer, University of Siegen                  *)
(*                                                                        *)
(*   Copyright 2013 Institut National de Recherche en Informatique et     *)
(*     en Automatique.                                                    *)
(*   Copyright 2012 Benedikt Meurer.                                      *)
(*                                                                        *)
(*   All rights reserved.  This file is distributed under the terms of    *)
(*   the GNU Lesser General Public License version 2.1, with the          *)
(*   special exception on linking described in the file LICENSE.          *)
(*                                                                        *)
(**************************************************************************)

(* Emission of ARM assembly code, 64-bit mode *)

open Misc
open Cmm
open Arch
open Proc
open Reg
open Mach
open Linearize
open Emitaux

(* Tradeoff between code size and code speed *)

let fastcode_flag = ref true

(* Names for special regs *)

let reg_trap_ptr = phys_reg 23
let reg_alloc_ptr = phys_reg 24
let reg_alloc_limit = phys_reg 25
let reg_tmp1 = phys_reg 26
let reg_tmp2 = phys_reg 27
let reg_x15 = phys_reg 15

(* Output a label *)

let emit_label lbl =
  emit_string ".L"; emit_int lbl

let emit_data_label lbl =
  emit_string ".Ld"; emit_int lbl

(* Symbols *)

let emit_symbol s =
  Emitaux.emit_symbol '$' s

(* Output a pseudo-register *)

let emit_reg = function
    {loc = Reg r} -> emit_string (register_name r)
  | _ -> fatal_error "Emit.emit_reg"

(* Likewise, but with the 32-bit name of the register *)

let int_reg_name_w =
  [| "w0";  "w1";  "w2";  "w3";  "w4";  "w5";  "w6";  "w7";
     "w8";  "w9";  "w10"; "w11"; "w12"; "w13"; "w14"; "w15";
     "w19"; "w20"; "w21"; "w22"; "w23"; "w24"; "w25";
     "w26"; "w27"; "w28"; "w16"; "w17" |]

let emit_wreg = function
    {loc = Reg r} -> emit_string int_reg_name_w.(r)
  | _ -> fatal_error "Emit.emit_wreg"

(* Layout of the stack frame *)

let stack_offset = ref 0

let frame_size () =
  let sz =
    !stack_offset +
    8 * num_stack_slots.(0) +
    8 * num_stack_slots.(1) +
    (if !contains_calls then 8 else 0)
  in Misc.align sz 16

let slot_offset loc cl =
  match loc with
    Incoming n ->
      assert (n >= 0);
      frame_size() + n
  | Local n ->
      !stack_offset +
      (if cl = 0
       then n * 8
       else num_stack_slots.(0) * 8 + n * 8)
  | Outgoing n ->
      assert (n >= 0);
      n

(* Output a stack reference *)

let emit_stack r =
  match r.loc with
  | Stack s ->
      let ofs = slot_offset s (register_class r) in `[sp, #{emit_int ofs}]`
  | _ -> fatal_error "Emit.emit_stack"

(* Output an addressing mode *)

let emit_symbol_offset s ofs =
  emit_symbol s;
  if ofs > 0 then `+{emit_int ofs}`
  else if ofs < 0 then `-{emit_int (-ofs)}`
  else ()

let emit_addressing addr r =
  match addr with
  | Iindexed ofs ->
      `[{emit_reg r}, #{emit_int ofs}]`
  | Ibased(s, ofs) ->
      `[{emit_reg r}, #:lo12:{emit_symbol_offset s ofs}]`

(* Record live pointers at call points *)

let record_frame_label live dbg =
  let lbl = new_label() in
  let live_offset = ref [] in
  Reg.Set.iter
    (function
      | {typ = Val; loc = Reg r} ->
          live_offset := ((r lsl 1) + 1) :: !live_offset
      | {typ = Val; loc = Stack s} as reg ->
          live_offset := slot_offset s (register_class reg) :: !live_offset
      | {typ = Addr} as r ->
          Misc.fatal_error ("bad GC root " ^ Reg.name r)
      | _ -> ())
    live;
  frame_descriptors :=
    { fd_lbl = lbl;
      fd_frame_size = frame_size();
      fd_live_offset = !live_offset;
      fd_debuginfo = dbg } :: !frame_descriptors;
  lbl

let record_frame live dbg =
  let lbl = record_frame_label live dbg in `{emit_label lbl}:`

(* Record calls to the GC -- we've moved them out of the way *)

type gc_call =
  { gc_lbl: label;                      (* Entry label *)
    gc_return_lbl: label;               (* Where to branch after GC *)
    gc_frame_lbl: label }               (* Label of frame descriptor *)

let call_gc_sites = ref ([] : gc_call list)

let emit_call_gc gc =
  `{emit_label gc.gc_lbl}:	bl	{emit_symbol "caml_call_gc"}\n`;
  `{emit_label gc.gc_frame_lbl}:	b	{emit_label gc.gc_return_lbl}\n`

(* Record calls to caml_ml_array_bound_error.
   In debug mode, we maintain one call to caml_ml_array_bound_error
   per bound check site. Otherwise, we can share a single call. *)

type bound_error_call =
  { bd_lbl: label;                    (* Entry label *)
    bd_frame_lbl: label }             (* Label of frame descriptor *)

let bound_error_sites = ref ([] : bound_error_call list)

let bound_error_label dbg =
  if !Clflags.debug || !bound_error_sites = [] then begin
    let lbl_bound_error = new_label() in
    let lbl_frame = record_frame_label Reg.Set.empty dbg in
    bound_error_sites :=
      { bd_lbl = lbl_bound_error;
        bd_frame_lbl = lbl_frame } :: !bound_error_sites;
    lbl_bound_error
  end else begin
    let bd = List.hd !bound_error_sites in bd.bd_lbl
  end

let emit_call_bound_error bd =
  `{emit_label bd.bd_lbl}:	bl	{emit_symbol "caml_ml_array_bound_error"}\n`;
  `{emit_label bd.bd_frame_lbl}:\n`

(* Names of various instructions *)

let name_for_comparison = function
  | Isigned Ceq -> "eq" | Isigned Cne -> "ne" | Isigned Cle -> "le"
  | Isigned Cge -> "ge" | Isigned Clt -> "lt" | Isigned Cgt -> "gt"
  | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "ls"
  | Iunsigned Cge -> "cs" | Iunsigned Clt -> "cc" | Iunsigned Cgt -> "hi"

let name_for_int_operation = function
  | Iadd -> "add"
  | Isub -> "sub"
  | Imul -> "mul"
  | Idiv -> "sdiv"
  | Iand -> "and"
  | Ior  -> "orr"
  | Ixor -> "eor"
  | Ilsl -> "lsl"
  | Ilsr -> "lsr"
  | Iasr -> "asr"
  | _ -> assert false

(* Load an integer constant into a register *)

let emit_intconst dst n =
  let rec emit_pos first shift =
    if shift < 0 then begin
      if first then `	mov	{emit_reg dst}, xzr\n`
    end else begin
      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
      if s = 0n then emit_pos first (shift - 16) else begin
        if first then
          `	movz	{emit_reg dst}, #{emit_nativeint s}, lsl #{emit_int shift}\n`
        else
           `	movk	{emit_reg dst}, #{emit_nativeint s}, lsl #{emit_int shift}\n`;
        emit_pos false (shift - 16)
      end
    end
  and emit_neg first shift =
    if shift < 0 then begin
      if first then `	movn	{emit_reg dst}, #0\n`
    end else begin
      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
      if s = 0xFFFFn then emit_neg first (shift - 16) else begin
        if first then
          `	movn	{emit_reg dst}, #{emit_nativeint (Nativeint.logxor s 0xFFFFn)}, lsl #{emit_int shift}\n`
        else
           `	movk	{emit_reg dst}, #{emit_nativeint s}, lsl #{emit_int shift}\n`;
        emit_neg false (shift - 16)
      end
    end
  in
    if n < 0n then emit_neg true 48 else emit_pos true 48

let num_instructions_for_intconst n =
  let num_instructions = ref 0 in
  let rec count_pos first shift =
    if shift < 0 then begin
      if first then incr num_instructions
    end else begin
      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
      if s = 0n then count_pos first (shift - 16) else begin
        incr num_instructions;
        count_pos false (shift - 16)
      end
    end
  and count_neg first shift =
    if shift < 0 then begin
      if first then incr num_instructions
    end else begin
      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
      if s = 0xFFFFn then count_neg first (shift - 16) else begin
        incr num_instructions;
        count_neg false (shift - 16)
      end
    end
  in
  if n < 0n then count_neg true 48 else count_pos true 48;
  !num_instructions

(* Recognize float constants appropriate for FMOV dst, #fpimm instruction:
   "a normalized binary floating point encoding with 1 sign bit, 4
    bits of fraction and a 3-bit exponent" *)

let is_immediate_float bits =
  let exp = (Int64.(to_int (shift_right_logical bits 52)) land 0x7FF) - 1023 in
  let mant = Int64.logand bits 0xF_FFFF_FFFF_FFFFL in
  exp >= -3 && exp <= 4 && Int64.logand mant 0xF_0000_0000_0000L = mant

(* Adjust sp (up or down) by the given byte amount *)

let emit_stack_adjustment n =
  let instr = if n < 0 then "sub" else "add" in
  let m = abs n in
  assert (m < 0x1_000_000);
  let ml = m land 0xFFF and mh = m land 0xFFF_000 in
  if mh <> 0 then `	{emit_string instr}	sp, sp, #{emit_int mh}\n`;
  if ml <> 0 then `	{emit_string instr}	sp, sp, #{emit_int ml}\n`;
  if n <> 0 then cfi_adjust_cfa_offset (-n)

(* Deallocate the stack frame and reload the return address
   before a return or tail call *)

let output_epilogue f =
  let n = frame_size() in
  if !contains_calls then
    `	ldr	x30, [sp, #{emit_int (n-8)}]\n`;
  if n > 0 then
    emit_stack_adjustment n;
  f();
  (* reset CFA back because function body may continue *)
  if n > 0 then cfi_adjust_cfa_offset n

(* Name of current function *)
let function_name = ref ""
(* Entry point for tail recursive calls *)
let tailrec_entry_point = ref 0
(* Pending floating-point literals *)
let float_literals = ref ([] : (int64 * label) list)

(* Label a floating-point literal *)
let float_literal f =
  try
    List.assoc f !float_literals
  with Not_found ->
    let lbl = new_label() in
    float_literals := (f, lbl) :: !float_literals;
    lbl

(* Emit all pending literals *)
let emit_literals() =
  if !float_literals <> [] then begin
    `	.align	3\n`;
    List.iter
      (fun (f, lbl) ->
        `{emit_label lbl}:`; emit_float64_directive ".quad" f)
      !float_literals;
    float_literals := []
  end

(* Emit code to load the address of a symbol *)

let emit_load_symbol_addr dst s =
  if (not !Clflags.dlcode) || Compilenv.symbol_in_current_unit s then begin
    `	adrp	{emit_reg dst}, {emit_symbol s}\n`;
    `	add	{emit_reg dst}, {emit_reg dst}, #:lo12:{emit_symbol s}\n`
  end else begin
    `	adrp	{emit_reg dst}, :got:{emit_symbol s}\n`;
    `	ldr	{emit_reg dst}, [{emit_reg dst}, #:got_lo12:{emit_symbol s}]\n`
  end

(* The following functions are used for calculating the sizes of the
   call GC and bounds check points emitted out-of-line from the function
   body.  See branch_relaxation.mli. *)

let num_call_gc_and_check_bound_points instr =
  let rec loop instr ((call_gc, check_bound) as totals) =
    match instr.desc with
    | Lend -> totals
    | Lop (Ialloc _) when !fastcode_flag ->
      loop instr.next (call_gc + 1, check_bound)
    | Lop (Iintop Icheckbound)
    | Lop (Iintop_imm (Icheckbound, _))
    | Lop (Ispecific (Ishiftcheckbound _)) ->
      let check_bound =
        (* When not in debug mode, there is at most one check-bound point. *)
        if not !Clflags.debug then 1
        else check_bound + 1
      in
      loop instr.next (call_gc, check_bound)
    (* The following four should never be seen, since this function is run
       before branch relaxation. *)
    | Lop (Ispecific (Ifar_alloc _))
    | Lop (Ispecific Ifar_intop_checkbound)
    | Lop (Ispecific (Ifar_intop_imm_checkbound _))
    | Lop (Ispecific (Ifar_shiftcheckbound _)) -> assert false
    | _ -> loop instr.next totals
  in
  loop instr (0, 0)

let max_out_of_line_code_offset instr ~num_call_gc ~num_check_bound =
  if num_call_gc < 1 && num_check_bound < 1 then 0
  else begin
    let size_of_call_gc = 2 in
    let size_of_check_bound = 1 in
    let size_of_last_thing =
      (* Call-GC points come before check-bound points. *)
      if num_check_bound >= 1 then size_of_check_bound else size_of_call_gc
    in
    let total_size =
      size_of_call_gc*num_call_gc + size_of_check_bound*num_check_bound
    in
    let max_offset = total_size - size_of_last_thing in
    assert (max_offset >= 0);
    max_offset
  end

module BR = Branch_relaxation.Make (struct
  (* CR-someday mshinwell: B and BL have +/- 128Mb ranges; for the moment we
     assume we will never exceed this.  It would seem to be most likely to
     occur for branches between functions; in this case, the linker should be
     able to insert veneers anyway.  (See section 4.6.7 of the document
     "ELF for the ARM 64-bit architecture (AArch64)".) *)

  type distance = int

  module Cond_branch = struct
    type t = TB | CB | Bcc

    let all = [TB; CB; Bcc]

    (* AArch64 instructions are 32 bits wide, so [distance] in this module
       means units of 32-bit words. *)
    let max_displacement = function
      | TB -> 32 * 1024 / 4  (* +/- 32Kb *)
      | CB | Bcc -> 1 * 1024 * 1024 / 4  (* +/- 1Mb *)

    let classify_instr = function
      | Lop (Ialloc _)
      | Lop (Iintop Icheckbound)
      | Lop (Iintop_imm (Icheckbound, _))
      | Lop (Ispecific (Ishiftcheckbound _)) -> Some Bcc
      (* The various "far" variants in [specific_operation] don't need to
         return [Some] here, since their code sequences never contain any
         conditional branches that might need relaxing. *)
      | Lcondbranch (Itruetest, _)
      | Lcondbranch (Ifalsetest, _) -> Some CB
      | Lcondbranch (Iinttest _, _)
      | Lcondbranch (Iinttest_imm _, _)
      | Lcondbranch (Ifloattest _, _) -> Some Bcc
      | Lcondbranch (Ioddtest, _)
      | Lcondbranch (Ieventest, _) -> Some TB
      | Lcondbranch3 _ -> Some Bcc
      | _ -> None
  end

  let offset_pc_at_branch = 0

  let epilogue_size () =
    if !contains_calls then 3 else 2

  let instr_size = function
    | Lend -> 0
    | Lop (Imove | Ispill | Ireload) -> 1
    | Lop (Iconst_int n | Iconst_blockheader n) ->
      num_instructions_for_intconst n
    | Lop (Iconst_float _) -> 2
    | Lop (Iconst_symbol _) -> 2
    | Lop (Icall_ind) -> 1
    | Lop (Icall_imm _) -> 1
    | Lop (Itailcall_ind) -> epilogue_size ()
    | Lop (Itailcall_imm s) ->
      if s = !function_name then 1 else epilogue_size ()
    | Lop (Iextcall (_, false)) -> 1
    | Lop (Iextcall (_, true)) -> 3
    | Lop (Istackoffset _) -> 2
    | Lop (Iload (size, addr)) | Lop (Istore (size, addr, _)) ->
      let based = match addr with Iindexed _ -> 0 | Ibased _ -> 1 in
      based + begin match size with Single -> 2 | _ -> 1 end
    | Lop (Ialloc _) when !fastcode_flag -> 4
    | Lop (Ispecific (Ifar_alloc _)) when !fastcode_flag -> 5
    | Lop (Ialloc num_words) | Lop (Ispecific (Ifar_alloc num_words)) ->
      begin match num_words with
      | 16 | 24 | 32 -> 1
      | _ -> 1 + num_instructions_for_intconst (Nativeint.of_int num_words)
      end
    | Lop (Iintop (Icomp _)) -> 2
    | Lop (Iintop_imm (Icomp _, _)) -> 2
    | Lop (Iintop Icheckbound) -> 2
    | Lop (Ispecific Ifar_intop_checkbound) -> 3
    | Lop (Iintop_imm (Icheckbound, _)) -> 2
    | Lop (Ispecific (Ifar_intop_imm_checkbound _)) -> 3
    | Lop (Ispecific (Ishiftcheckbound _)) -> 2
    | Lop (Ispecific (Ifar_shiftcheckbound _)) -> 3
    | Lop (Iintop Imod) -> 2
    | Lop (Iintop Imulh) -> 1
    | Lop (Iintop _) -> 1
    | Lop (Iintop_imm _) -> 1
    | Lop (Ifloatofint | Iintoffloat | Iabsf | Inegf | Ispecific Isqrtf) -> 1
    | Lop (Iaddf | Isubf | Imulf | Idivf | Ispecific Inegmulf) -> 1
    | Lop (Ispecific (Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf)) -> 1
    | Lop (Ispecific (Ishiftarith _)) -> 1
    | Lop (Ispecific (Imuladd | Imulsub)) -> 1
    | Lop (Ispecific (Ibswap 16)) -> 2
    | Lop (Ispecific (Ibswap _)) -> 1
    | Lreloadretaddr -> 0
    | Lreturn -> epilogue_size ()
    | Llabel _ -> 0
    | Lbranch _ -> 1
    | Lcondbranch (tst, _) ->
      begin match tst with
      | Itruetest -> 1
      | Ifalsetest -> 1
      | Iinttest _ -> 2
      | Iinttest_imm _ -> 2
      | Ifloattest _ -> 2
      | Ioddtest -> 1
      | Ieventest -> 1
      end
    | Lcondbranch3 (lbl0, lbl1, lbl2) ->
      1 + begin match lbl0 with None -> 0 | Some _ -> 1 end
        + begin match lbl1 with None -> 0 | Some _ -> 1 end
        + begin match lbl2 with None -> 0 | Some _ -> 1 end
    | Lswitch jumptbl -> 3 + Array.length jumptbl
    | Lsetuptrap _ -> 2
    | Lpushtrap -> 3
    | Lpoptrap -> 1
    | Lraise k ->
      begin match !Clflags.debug, k with
      | true, (Lambda.Raise_regular | Lambda.Raise_reraise) -> 1
      | false, _
      | true, Lambda.Raise_notrace -> 4
      end

  let relax_allocation ~num_words =
    Lop (Ispecific (Ifar_alloc num_words))

  let relax_intop_checkbound () =
    Lop (Ispecific Ifar_intop_checkbound)

  let relax_intop_imm_checkbound ~bound =
    Lop (Ispecific (Ifar_intop_imm_checkbound bound))

  let relax_specific_op = function
    | Ishiftcheckbound shift -> Lop (Ispecific (Ifar_shiftcheckbound shift))
    | _ -> assert false
end)

(* Output the assembly code for allocation. *)

let assembly_code_for_allocation i ~n ~far =
  let lbl_frame = record_frame_label i.live i.dbg in
  if !fastcode_flag then begin
    let lbl_redo = new_label() in
    let lbl_call_gc = new_label() in
    `{emit_label lbl_redo}:`;
    `	sub	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
    `	cmp	{emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_limit}\n`;
    `	add	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`;
    if not far then begin
      `	b.lo	{emit_label lbl_call_gc}\n`
    end else begin
      let lbl = new_label () in
      `	b.cs	{emit_label lbl}\n`;
      `	b	{emit_label lbl_call_gc}\n`;
      `{emit_label lbl}:\n`
    end;
    call_gc_sites :=
      { gc_lbl = lbl_call_gc;
        gc_return_lbl = lbl_redo;
        gc_frame_lbl = lbl_frame } :: !call_gc_sites
  end else begin
    begin match n with
    | 16 -> `	bl	{emit_symbol "caml_alloc1"}\n`
    | 24 -> `	bl	{emit_symbol "caml_alloc2"}\n`
    | 32 -> `	bl	{emit_symbol "caml_alloc3"}\n`
    | _  -> emit_intconst reg_x15 (Nativeint.of_int n);
            `	bl	{emit_symbol "caml_allocN"}\n`
    end;
    `{emit_label lbl_frame}:	add	{emit_reg i.res.(0)}, {emit_reg reg_alloc_ptr}, #8\n`
  end

(* Output the assembly code for an instruction *)

let emit_instr i =
    emit_debug_info i.dbg;
    match i.desc with
    | Lend -> ()
    | Lop(Imove | Ispill | Ireload) ->
        let src = i.arg.(0) and dst = i.res.(0) in
        if src.loc <> dst.loc then begin
          match (src, dst) with
          | {loc = Reg _; typ = Float}, {loc = Reg _} ->
              `	fmov	{emit_reg dst}, {emit_reg src}\n`
          | {loc = Reg _}, {loc = Reg _} ->
              `	mov	{emit_reg dst}, {emit_reg src}\n`
          | {loc = Reg _}, {loc = Stack _} ->
              `	str	{emit_reg src}, {emit_stack dst}\n`
          | {loc = Stack _}, {loc = Reg _} ->
              `	ldr	{emit_reg dst}, {emit_stack src}\n`
          | _ ->
              assert false
        end
    | Lop(Iconst_int n | Iconst_blockheader n) ->
        emit_intconst i.res.(0) n
    | Lop(Iconst_float f) ->
        if f = 0L then
          `	fmov	{emit_reg i.res.(0)}, xzr\n`
        else if is_immediate_float f then
          `	fmov	{emit_reg i.res.(0)}, #{emit_printf "0x%Lx" f}\n`
        else begin
          let lbl = float_literal f in
          `	adrp	{emit_reg reg_tmp1}, {emit_label lbl}\n`;
          `	ldr	{emit_reg i.res.(0)}, [{emit_reg reg_tmp1}, #:lo12:{emit_label lbl}]\n`
        end
    | Lop(Iconst_symbol s) ->
        emit_load_symbol_addr i.res.(0) s
    | Lop(Icall_ind) ->
        `	blr	{emit_reg i.arg.(0)}\n`;
        `{record_frame i.live i.dbg}\n`
    | Lop(Icall_imm s) ->
        `	bl	{emit_symbol s}\n`;
        `{record_frame i.live i.dbg}\n`
    | Lop(Itailcall_ind) ->
        output_epilogue (fun () -> `	br	{emit_reg i.arg.(0)}\n`)
    | Lop(Itailcall_imm s) ->
        if s = !function_name then
          `	b	{emit_label !tailrec_entry_point}\n`
        else
          output_epilogue (fun () -> `	b	{emit_symbol s}\n`)
    | Lop(Iextcall(s, false)) ->
        `	bl	{emit_symbol s}\n`
    | Lop(Iextcall(s, true)) ->
        emit_load_symbol_addr reg_x15 s;
        `	bl	{emit_symbol "caml_c_call"}\n`;
        `{record_frame i.live i.dbg}\n`
    | Lop(Istackoffset n) ->
        assert (n mod 16 = 0);
        emit_stack_adjustment (-n);
        stack_offset := !stack_offset + n
    | Lop(Iload(size, addr)) ->
        let dst = i.res.(0) in
        let base =
          match addr with
          | Iindexed ofs -> i.arg.(0)
          | Ibased(s, ofs) ->
              `	adrp	{emit_reg reg_tmp1}, {emit_symbol_offset s ofs}\n`;
              reg_tmp1 in
        begin match size with
        | Byte_unsigned ->
            `	ldrb	{emit_wreg dst}, {emit_addressing addr base}\n`
        | Byte_signed ->
            `	ldrsb	{emit_reg dst}, {emit_addressing addr base}\n`
        | Sixteen_unsigned ->
            `	ldrh	{emit_wreg dst}, {emit_addressing addr base}\n`
        | Sixteen_signed ->
            `	ldrsh	{emit_reg dst}, {emit_addressing addr base}\n`
        | Thirtytwo_unsigned ->
            `	ldr	{emit_wreg dst}, {emit_addressing addr base}\n`
        | Thirtytwo_signed ->
            `	ldrsw	{emit_reg dst}, {emit_addressing addr base}\n`
        | Single ->
            `	ldr	s7, {emit_addressing addr base}\n`;
            `	fcvt	{emit_reg dst}, s7\n`
        | Word_int | Word_val | Double | Double_u ->
            `	ldr	{emit_reg dst}, {emit_addressing addr base}\n`
        end
    | Lop(Istore(size, addr, _)) ->
        let src = i.arg.(0) in
        let base =
          match addr with
          | Iindexed ofs -> i.arg.(1)
          | Ibased(s, ofs) ->
              `	adrp	{emit_reg reg_tmp1}, {emit_symbol_offset s ofs}\n`;
              reg_tmp1 in
        begin match size with
        | Byte_unsigned | Byte_signed ->
            `	strb	{emit_wreg src}, {emit_addressing addr base}\n`
        | Sixteen_unsigned | Sixteen_signed ->
            `	strh	{emit_wreg src}, {emit_addressing addr base}\n`
        | Thirtytwo_unsigned | Thirtytwo_signed ->
            `	str	{emit_wreg src}, {emit_addressing addr base}\n`
        | Single ->
            `	fcvt	s7, {emit_reg src}\n`;
            `	str	s7, {emit_addressing addr base}\n`;
        | Word_int | Word_val | Double | Double_u ->
            `	str	{emit_reg src}, {emit_addressing addr base}\n`
        end
    | Lop(Ialloc n) ->
        assembly_code_for_allocation i ~n ~far:false
    | Lop(Ispecific (Ifar_alloc n)) ->
        assembly_code_for_allocation i ~n ~far:true
    | Lop(Iintop(Icomp cmp)) ->
        `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        `	cset	{emit_reg i.res.(0)}, {emit_string (name_for_comparison cmp)}\n`
    | Lop(Iintop_imm(Icomp cmp, n)) ->
        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
        `	cset	{emit_reg i.res.(0)}, {emit_string (name_for_comparison cmp)}\n`
    | Lop(Iintop Icheckbound) ->
        let lbl = bound_error_label i.dbg in
        `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        `	b.ls	{emit_label lbl}\n`
    | Lop(Ispecific Ifar_intop_checkbound) ->
        let lbl = bound_error_label i.dbg in
        let lbl2 = new_label () in
        `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        `	b.hi	{emit_label lbl2}\n`;
        `	b	{emit_label lbl}\n`;
        `{emit_label lbl2}:\n`;
    | Lop(Iintop_imm(Icheckbound, n)) ->
        let lbl = bound_error_label i.dbg in
        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
        `	b.ls	{emit_label lbl}\n`
    | Lop(Ispecific(Ifar_intop_imm_checkbound bound)) ->
        let lbl = bound_error_label i.dbg in
        let lbl2 = new_label () in
        `	cmp	{emit_reg i.arg.(0)}, #{emit_int bound}\n`;
        `	b.hi	{emit_label lbl2}\n`;
        `	b	{emit_label lbl}\n`;
        `{emit_label lbl2}:\n`;
    | Lop(Ispecific(Ishiftcheckbound shift)) ->
        let lbl = bound_error_label i.dbg in
        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`;
        `	b.cs	{emit_label lbl}\n`
    | Lop(Ispecific(Ifar_shiftcheckbound shift)) ->
        let lbl = bound_error_label i.dbg in
        let lbl2 = new_label () in
        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`;
        `	b.lo	{emit_label lbl2}\n`;
        `	b	{emit_label lbl}\n`;
        `{emit_label lbl2}:\n`;
    | Lop(Iintop Imod) ->
        `	sdiv	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        `	msub	{emit_reg i.res.(0)}, {emit_reg reg_tmp1}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}\n`
    | Lop(Iintop Imulh) ->
        `	smulh	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
    | Lop(Iintop op) ->
        let instr = name_for_int_operation op in
        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
    | Lop(Iintop_imm(op, n)) ->
        let instr = name_for_int_operation op in
        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`
    | Lop(Ifloatofint | Iintoffloat | Iabsf | Inegf | Ispecific Isqrtf as op) ->
        let instr = (match op with
                     | Ifloatofint      -> "scvtf"
                     | Iintoffloat      -> "fcvtzs"
                     | Iabsf            -> "fabs"
                     | Inegf            -> "fneg"
                     | Ispecific Isqrtf -> "fsqrt"
                     | _                -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`
    | Lop(Iaddf | Isubf | Imulf | Idivf | Ispecific Inegmulf as op) ->
        let instr = (match op with
                     | Iaddf              -> "fadd"
                     | Isubf              -> "fsub"
                     | Imulf              -> "fmul"
                     | Idivf              -> "fdiv"
                     | Ispecific Inegmulf -> "fnmul"
                     | _                  -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
    | Lop(Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf as op)) ->
        let instr = (match op with
                     | Imuladdf    -> "fmadd"
                     | Inegmuladdf -> "fnmadd"
                     | Imulsubf    -> "fmsub"
                     | Inegmulsubf -> "fnmsub"
                     | _ -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}, {emit_reg i.arg.(0)}\n`
    | Lop(Ispecific(Ishiftarith(op, shift))) ->
        let instr = (match op with
                       Ishiftadd    -> "add"
                     | Ishiftsub    -> "sub") in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`;
        if shift >= 0
        then `, lsl #{emit_int shift}\n`
        else `, asr #{emit_int (-shift)}\n`
    | Lop(Ispecific(Imuladd | Imulsub as op)) ->
        let instr = (match op with
                       Imuladd -> "madd"
                     | Imulsub -> "msub"
                     | _ -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`
    | Lop(Ispecific(Ibswap size)) ->
        begin match size with
        | 16 ->
            `	rev16	{emit_wreg i.res.(0)}, {emit_wreg i.arg.(0)}\n`;
            `	ubfm	{emit_reg i.res.(0)}, {emit_reg i.res.(0)}, #0, #15\n`
        | 32 ->
            `	rev	{emit_wreg i.res.(0)}, {emit_wreg i.arg.(0)}\n`
        | 64 ->
            `	rev	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`
        | _ ->
            assert false
        end
    | Lreloadretaddr ->
        ()
    | Lreturn ->
        output_epilogue (fun () -> `	ret\n`)
    | Llabel lbl ->
        `{emit_label lbl}:\n`
    | Lbranch lbl ->
        `	b	{emit_label lbl}\n`
    | Lcondbranch(tst, lbl) ->
        begin match tst with
        | Itruetest ->
            `	cbnz	{emit_reg i.arg.(0)}, {emit_label lbl}\n`
        | Ifalsetest ->
            `	cbz	{emit_reg i.arg.(0)}, {emit_label lbl}\n`
        | Iinttest cmp ->
            `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
            let comp = name_for_comparison cmp in
            `	b.{emit_string comp}	{emit_label lbl}\n`
        | Iinttest_imm(cmp, n) ->
            `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
            let comp = name_for_comparison cmp in
            `	b.{emit_string comp}	{emit_label lbl}\n`
        | Ifloattest(cmp, neg) ->
            let comp = (match (cmp, neg) with
                        | (Ceq, false) | (Cne, true) -> "eq"
                        | (Cne, false) | (Ceq, true) -> "ne"
                        | (Clt, false) -> "cc"
                        | (Clt, true)  -> "cs"
                        | (Cle, false) -> "ls"
                        | (Cle, true)  -> "hi"
                        | (Cgt, false) -> "gt"
                        | (Cgt, true)  -> "le"
                        | (Cge, false) -> "ge"
                        | (Cge, true)  -> "lt") in
            `	fcmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
            `	b.{emit_string comp}	{emit_label lbl}\n`
        | Ioddtest ->
            `	tbnz	{emit_reg i.arg.(0)}, #0, {emit_label lbl}\n`
        | Ieventest ->
            `	tbz	{emit_reg i.arg.(0)}, #0, {emit_label lbl}\n`
        end
    | Lcondbranch3(lbl0, lbl1, lbl2) ->
        `	cmp	{emit_reg i.arg.(0)}, #1\n`;
        begin match lbl0 with
          None -> ()
        | Some lbl -> `	b.lt	{emit_label lbl}\n`
        end;
        begin match lbl1 with
          None -> ()
        | Some lbl -> `	b.eq	{emit_label lbl}\n`
        end;
        begin match lbl2 with
          None -> ()
        | Some lbl -> `	b.gt	{emit_label lbl}\n`
        end
    | Lswitch jumptbl ->
        let lbltbl = new_label() in
        `	adr	{emit_reg reg_tmp1}, {emit_label lbltbl}\n`;
        `	add	{emit_reg reg_tmp1}, {emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, lsl #2\n`;
        `	br	{emit_reg reg_tmp1}\n`;
        `{emit_label lbltbl}:`;
        for j = 0 to Array.length jumptbl - 1 do
            `	b	{emit_label jumptbl.(j)}\n`
        done
(* Alternative:
        let lbltbl = new_label() in
        `	adr	{emit_reg reg_tmp1}, {emit_label lbltbl}\n`;
        `	ldr	{emit_wreg reg_tmp2}, [{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, lsl #2]\n`;
        `	add	{emit_reg reg_tmp1}, {emit_wreg reg_tmp2}, sxtb\n`;
        `	br	{emit_reg reg_tmp1}\n`;
        `{emit_label lbltbl}:\n`;
        for j = 0 to Array.length jumptbl - 1 do
            `	.word	{emit_label jumptbl.(j)} - {emit_label lbltbl}\n`
        done
*)
    | Lsetuptrap lbl ->
        let lblnext = new_label() in
        `	adr	{emit_reg reg_tmp1}, {emit_label lblnext}\n`;
        `	b	{emit_label lbl}\n`;
        `{emit_label lblnext}:\n`
    | Lpushtrap ->
        stack_offset := !stack_offset + 16;
        `	str	{emit_reg reg_trap_ptr}, [sp, -16]!\n`;
        `	str	{emit_reg reg_tmp1}, [sp, #8]\n`;
        cfi_adjust_cfa_offset 16;
        `	mov	{emit_reg reg_trap_ptr}, sp\n`
    | Lpoptrap ->
        `	ldr	{emit_reg reg_trap_ptr}, [sp], 16\n`;
        cfi_adjust_cfa_offset (-16);
        stack_offset := !stack_offset - 16
    | Lraise k ->
        begin match !Clflags.debug, k with
        | true, (Lambda.Raise_regular | Lambda.Raise_reraise) ->
          `	bl	{emit_symbol "caml_raise_exn"}\n`;
          `{record_frame Reg.Set.empty i.dbg}\n`
        | false, _
        | true, Lambda.Raise_notrace ->
          `	mov	sp, {emit_reg reg_trap_ptr}\n`;
          `	ldr	{emit_reg reg_tmp1}, [sp, #8]\n`;
          `	ldr	{emit_reg reg_trap_ptr}, [sp], 16\n`;
          `	br	{emit_reg reg_tmp1}\n`
        end

(* Emission of an instruction sequence *)

let rec emit_all i =
  if i.desc = Lend then () else (emit_instr i; emit_all i.next)

(* Emission of the profiling prelude *)

let emit_profile() = ()   (* TODO *)
(*
  match Config.system with
    "linux_eabi" | "linux_eabihf" | "netbsd" ->
      `	push	\{lr}\n`;
      `	{emit_call "__gnu_mcount_nc"}\n`
  | _ -> ()
*)

(* Emission of a function declaration *)

let fundecl fundecl =
  function_name := fundecl.fun_name;
  fastcode_flag := fundecl.fun_fast;
  tailrec_entry_point := new_label();
  float_literals := [];
  stack_offset := 0;
  call_gc_sites := [];
  bound_error_sites := [];
  `	.text\n`;
  `	.align	3\n`;
  `	.globl	{emit_symbol fundecl.fun_name}\n`;
  `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
  `{emit_symbol fundecl.fun_name}:\n`;
  emit_debug_info fundecl.fun_dbg;
  cfi_startproc();
  if !Clflags.gprofile then emit_profile();
  let n = frame_size() in
  if n > 0 then
    emit_stack_adjustment (-n);
  if !contains_calls then begin
    cfi_offset ~reg:30 (* return address *) ~offset:(-8);
    `	str	x30, [sp, #{emit_int (n-8)}]\n`
  end;
  `{emit_label !tailrec_entry_point}:\n`;
  let num_call_gc, num_check_bound =
    num_call_gc_and_check_bound_points fundecl.fun_body
  in
  let max_out_of_line_code_offset =
    max_out_of_line_code_offset fundecl.fun_body ~num_call_gc
      ~num_check_bound
  in
  BR.relax fundecl.fun_body ~max_out_of_line_code_offset;
  emit_all fundecl.fun_body;
  List.iter emit_call_gc !call_gc_sites;
  List.iter emit_call_bound_error !bound_error_sites;
  assert (List.length !call_gc_sites = num_call_gc);
  assert (List.length !bound_error_sites = num_check_bound);
  cfi_endproc();
  `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
  `	.size	{emit_symbol fundecl.fun_name}, .-{emit_symbol fundecl.fun_name}\n`;
  emit_literals()

(* Emission of data *)

let emit_item = function
  | Cglobal_symbol s -> `	.globl	{emit_symbol s}\n`;
  | Cdefine_symbol s -> `{emit_symbol s}:\n`
  | Cdefine_label lbl -> `{emit_data_label lbl}:\n`
  | Cint8 n -> `	.byte	{emit_int n}\n`
  | Cint16 n -> `	.short	{emit_int n}\n`
  | Cint32 n -> `	.long	{emit_nativeint n}\n`
  | Cint n -> `	.quad	{emit_nativeint n}\n`
  | Csingle f -> emit_float32_directive ".long" (Int32.bits_of_float f)
  | Cdouble f -> emit_float64_directive ".quad" (Int64.bits_of_float f)
  | Csymbol_address s -> `	.quad	{emit_symbol s}\n`
  | Clabel_address lbl -> `	.quad	{emit_data_label lbl}\n`
  | Cstring s -> emit_string_directive "	.ascii  " s
  | Cskip n -> if n > 0 then `	.space	{emit_int n}\n`
  | Calign n -> `	.align	{emit_int(Misc.log2 n)}\n`

let data l =
  `	.data\n`;
  `	.align 3\n`;
  List.iter emit_item l

(* Beginning / end of an assembly file *)

let begin_assembly() =
  reset_debug_info();
  `	.file	\"\"\n`;  (* PR#7037 *)
  let lbl_begin = Compilenv.make_symbol (Some "data_begin") in
  `	.data\n`;
  `	.globl	{emit_symbol lbl_begin}\n`;
  `{emit_symbol lbl_begin}:\n`;
  let lbl_begin = Compilenv.make_symbol (Some "code_begin") in
  `	.text\n`;
  `	.globl	{emit_symbol lbl_begin}\n`;
  `{emit_symbol lbl_begin}:\n`

let end_assembly () =
  let lbl_end = Compilenv.make_symbol (Some "code_end") in
  `	.text\n`;
  `	.globl	{emit_symbol lbl_end}\n`;
  `{emit_symbol lbl_end}:\n`;
  let lbl_end = Compilenv.make_symbol (Some "data_end") in
  `	.data\n`;
  `	.globl	{emit_symbol lbl_end}\n`;
  `{emit_symbol lbl_end}:\n`;
  `	.long	0\n`;
  let lbl = Compilenv.make_symbol (Some "frametable") in
  `	.globl	{emit_symbol lbl}\n`;
  `{emit_symbol lbl}:\n`;
  emit_frames
    { efa_label = (fun lbl ->
                       `	.type	{emit_label lbl}, %function\n`;
                       `	.quad	{emit_label lbl}\n`);
      efa_16 = (fun n -> `	.short	{emit_int n}\n`);
      efa_32 = (fun n -> `	.long	{emit_int32 n}\n`);
      efa_word = (fun n -> `	.quad	{emit_int n}\n`);
      efa_align = (fun n -> `	.align	{emit_int(Misc.log2 n)}\n`);
      efa_label_rel = (fun lbl ofs ->
                           `	.long	{emit_label lbl} - . + {emit_int32 ofs}\n`);
      efa_def_label = (fun lbl -> `{emit_label lbl}:\n`);
      efa_string = (fun s -> emit_string_directive "	.asciz	" s) };
  `	.type	{emit_symbol lbl}, %object\n`;
  `	.size	{emit_symbol lbl}, .-{emit_symbol lbl}\n`;
  begin match Config.system with
  | "linux" ->
      (* Mark stack as non-executable *)
      `	.section	.note.GNU-stack,\"\",%progbits\n`
  | _ -> ()
  end