ocaml/asmcomp/arm/emit.mlp

(**************************************************************************)
(*                                                                        *)
(*                                 OCaml                                  *)
(*                                                                        *)
(*                 Benedikt Meurer, University of Siegen                  *)
(*                                                                        *)
(*   Copyright 1998 Institut National de Recherche en Informatique et     *)
(*     en Automatique.                                                    *)
(*   Copyright 2012 Benedikt Meurer.                                      *)
(*                                                                        *)
(*   All rights reserved.  This file is distributed under the terms of    *)
(*   the GNU Lesser General Public License version 2.1, with the          *)
(*   special exception on linking described in the file LICENSE.          *)
(*                                                                        *)
(**************************************************************************)

(* Emission of ARM assembly code *)

open Misc
open Cmm
open Arch
open Proc
open Reg
open Mach
open Linearize
open Emitaux

(* Tradeoff between code size and code speed *)

let fastcode_flag = ref true

(* Output a label *)

let emit_label lbl =
  emit_string ".L"; emit_int lbl

let emit_data_label lbl =
  emit_string ".Ld"; emit_int lbl

(* Symbols *)

let emit_symbol s =
  Emitaux.emit_symbol '$' s

let emit_call s =
  if !Clflags.dlcode || !Clflags.pic_code
  then `bl	{emit_symbol s}(PLT)`
  else `bl	{emit_symbol s}`

let emit_jump s =
  if !Clflags.dlcode || !Clflags.pic_code
  then `b	{emit_symbol s}(PLT)`
  else `b	{emit_symbol s}`

(* Output a pseudo-register *)

let emit_reg = function
    {loc = Reg r} -> emit_string (register_name r)
  | _ -> fatal_error "Emit_arm.emit_reg"

(* Layout of the stack frame *)

let stack_offset = ref 0

let frame_size () =
  let sz =
    !stack_offset +
    4 * num_stack_slots.(0) +
    8 * num_stack_slots.(1) +
    8 * num_stack_slots.(2) +
    (if !contains_calls then 4 else 0)
  in Misc.align sz 8

let slot_offset loc cl =
  match loc with
    Incoming n ->
      assert (n >= 0);
      frame_size() + n
  | Local n ->
      if cl = 0
      then !stack_offset + n * 4
      else !stack_offset + num_stack_slots.(0) * 4 + n * 8
  | Outgoing n ->
      assert (n >= 0);
      n

(* Output a stack reference *)

let emit_stack r =
  match r.loc with
  | Stack s ->
      let ofs = slot_offset s (register_class r) in `[sp, #{emit_int ofs}]`
  | _ -> fatal_error "Emit_arm.emit_stack"

(* Output an addressing mode *)

let emit_addressing addr r n =
  match addr with
    Iindexed ofs ->
      `[{emit_reg r.(n)}, #{emit_int ofs}]`

(* Record live pointers at call points *)

let record_frame_label live dbg =
  let lbl = new_label() in
  let live_offset = ref [] in
  Reg.Set.iter
    (function
      | {typ = Val; loc = Reg r} ->
          live_offset := ((r lsl 1) + 1) :: !live_offset
      | {typ = Val; loc = Stack s} as reg ->
          live_offset := slot_offset s (register_class reg) :: !live_offset
      | {typ = Addr} as r ->
          Misc.fatal_error ("bad GC root " ^ Reg.name r)
      | _ -> ())
    live;
  frame_descriptors :=
    { fd_lbl = lbl;
      fd_frame_size = frame_size();
      fd_live_offset = !live_offset;
      fd_debuginfo = dbg } :: !frame_descriptors;
  lbl

let record_frame live dbg =
  let lbl = record_frame_label live dbg in `{emit_label lbl}:`

(* Record calls to the GC -- we've moved them out of the way *)

type gc_call =
  { gc_lbl: label;                      (* Entry label *)
    gc_return_lbl: label;               (* Where to branch after GC *)
    gc_frame_lbl: label }               (* Label of frame descriptor *)

let call_gc_sites = ref ([] : gc_call list)

let emit_call_gc gc =
  `{emit_label gc.gc_lbl}:	{emit_call "caml_call_gc"}\n`;
  `{emit_label gc.gc_frame_lbl}:	b	{emit_label gc.gc_return_lbl}\n`

(* Record calls to caml_ml_array_bound_error.
   In debug mode, we maintain one call to caml_ml_array_bound_error
   per bound check site. Otherwise, we can share a single call. *)

type bound_error_call =
  { bd_lbl: label;                    (* Entry label *)
    bd_frame_lbl: label }             (* Label of frame descriptor *)

let bound_error_sites = ref ([] : bound_error_call list)

let bound_error_label dbg =
  if !Clflags.debug || !bound_error_sites = [] then begin
    let lbl_bound_error = new_label() in
    let lbl_frame = record_frame_label Reg.Set.empty dbg in
    bound_error_sites :=
      { bd_lbl = lbl_bound_error;
        bd_frame_lbl = lbl_frame } :: !bound_error_sites;
    lbl_bound_error
  end else begin
    let bd = List.hd !bound_error_sites in bd.bd_lbl
  end

let emit_call_bound_error bd =
  `{emit_label bd.bd_lbl}:	{emit_call "caml_ml_array_bound_error"}\n`;
  `{emit_label bd.bd_frame_lbl}:\n`

(* Negate a comparison *)

let negate_integer_comparison = function
    Isigned cmp   -> Isigned(negate_comparison cmp)
  | Iunsigned cmp -> Iunsigned(negate_comparison cmp)

(* Names of various instructions *)

let name_for_comparison = function
    Isigned Ceq -> "eq" | Isigned Cne -> "ne" | Isigned Cle -> "le"
  | Isigned Cge -> "ge" | Isigned Clt -> "lt" | Isigned Cgt -> "gt"
  | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "ls"
  | Iunsigned Cge -> "cs" | Iunsigned Clt -> "cc" | Iunsigned Cgt -> "hi"

let name_for_int_operation = function
  (* Use adds,subs,... to enable 16-bit T1 encoding *)
    Iadd  -> "adds"
  | Isub  -> "subs"
  | Imul  -> "mul"
  | Imulh -> "smmul"
  | Iand  -> "ands"
  | Ior   -> "orrs"
  | Ixor  -> "eors"
  | Ilsl  -> "lsls"
  | Ilsr  -> "lsrs"
  | Iasr  -> "asrs"
  | _ -> assert false

let name_for_shift_operation = function
    Ishiftlogicalleft -> "lsl"
  | Ishiftlogicalright -> "lsr"
  | Ishiftarithmeticright -> "asr"

(* General functional to decompose a non-immediate integer constant
   into 8-bit chunks shifted left 0 ... 30 bits. *)

let decompose_intconst n fn =
  let i = ref n in
  let shift = ref 0 in
  let ninstr = ref 0 in
  while !i <> 0l do
    if Int32.logand (Int32.shift_right !i !shift) 3l = 0l then
      shift := !shift + 2
    else begin
      let bits = Int32.logand !i (Int32.shift_left 0xffl !shift) in
      i := Int32.sub !i bits;
      shift := !shift + 8;
      incr ninstr;
      fn bits
    end
  done;
  !ninstr

(* Load an integer constant into a register *)

let emit_intconst dst n =
  let nr = Int32.lognot n in
  if is_immediate n then begin
    (* Use movs here to enable 16-bit T1 encoding *)
    `	movs	{emit_reg dst}, #{emit_int32 n}\n`; 1
  end else if is_immediate nr then begin
    `	mvn	{emit_reg dst}, #{emit_int32 nr}\n`; 1
  end else if !arch > ARMv6 then begin
    let nl = Int32.logand 0xffffl n in
    let nh = Int32.logand 0xffffl (Int32.shift_right_logical n 16) in
    if nh = 0l then begin
      `	movw	{emit_reg dst}, #{emit_int32 nl}\n`; 1
    end else if Int32.logand nl 0xffl = nl then begin
      `	movs	{emit_reg dst}, #{emit_int32 nl}\n`;
      `	movt	{emit_reg dst}, #{emit_int32 nh}\n`; 2
    end else begin
      `	movw	{emit_reg dst}, #{emit_int32 nl}\n`;
      `	movt	{emit_reg dst}, #{emit_int32 nh}\n`; 2
    end
  end else begin
    let first = ref true in
    decompose_intconst n
      (fun bits ->
        if !first
        (* Use movs,adds here to enable 16-bit T1 encoding *)
        then `	movs	{emit_reg dst}, #{emit_int32 bits} @ {emit_int32 n}\n`
        else `	adds	{emit_reg dst}, {emit_reg dst}, #{emit_int32 bits}\n`;
        first := false)
  end

(* Adjust sp (up or down) by the given byte amount *)

let emit_stack_adjustment n =
  if n = 0 then 0 else begin
    let instr = if n < 0 then "sub" else "add" in
    let ninstr = decompose_intconst (Int32.of_int (abs n))
                   (fun bits ->
                     `	{emit_string instr}	sp, sp, #{emit_int32 bits}\n`) in
    cfi_adjust_cfa_offset (-n);
    ninstr
  end

(* Deallocate the stack frame before a return or tail call *)

let output_epilogue f =
  let n = frame_size() in
  if n > 0 then begin
    let ninstr = emit_stack_adjustment n in
    let ninstr = ninstr + f () in
    (* reset CFA back cause function body may continue *)
    cfi_adjust_cfa_offset n;
    ninstr
  end else
    f ()

(* Name of current function *)
let function_name = ref ""
(* Entry point for tail recursive calls *)
let tailrec_entry_point = ref 0
(* Pending floating-point literals *)
let float_literals = ref ([] : (int64 * label) list)
(* Pending relative references to the global offset table *)
let gotrel_literals = ref ([] : (label * label) list)
(* Pending symbol literals *)
let symbol_literals = ref ([] : (string * label) list)
(* Total space (in words) occupied by pending literals *)
let num_literals = ref 0

(* Label a floating-point literal *)
let float_literal f =
  try
    List.assoc f !float_literals
  with Not_found ->
    let lbl = new_label() in
    num_literals := !num_literals + 2;
    float_literals := (f, lbl) :: !float_literals;
    lbl

(* Label a GOTREL literal *)
let gotrel_literal l =
  let lbl = new_label() in
  num_literals := !num_literals + 1;
  gotrel_literals := (l, lbl) :: !gotrel_literals;
  lbl

(* Label a symbol literal *)
let symbol_literal s =
  try
    List.assoc s !symbol_literals
  with Not_found ->
    let lbl = new_label() in
    num_literals := !num_literals + 1;
    symbol_literals := (s, lbl) :: !symbol_literals;
    lbl

(* Emit all pending literals *)
let emit_literals() =
  if !float_literals <> [] then begin
    `	.align	3\n`;
    List.iter
      (fun (f, lbl) ->
        `{emit_label lbl}:`; emit_float64_split_directive ".long" f)
      !float_literals;
    float_literals := []
  end;
  if !symbol_literals <> [] then begin
    let offset = if !thumb then 4 else 8 in
    let suffix = if !Clflags.pic_code then "(GOT)" else "" in
    `	.align	2\n`;
    List.iter
      (fun (l, lbl) ->
        `{emit_label lbl}:	.word	_GLOBAL_OFFSET_TABLE_-({emit_label l}+{emit_int offset})\n`)
      !gotrel_literals;
    List.iter
      (fun (s, lbl) ->
        `{emit_label lbl}:	.word	{emit_symbol s}{emit_string suffix}\n`)
      !symbol_literals;
    gotrel_literals := [];
    symbol_literals := []
  end;
  num_literals := 0

(* Emit code to load the address of a symbol *)

let emit_load_symbol_addr dst s =
  if !Clflags.pic_code then begin
    let lbl_pic = new_label() in
    let lbl_got = gotrel_literal lbl_pic in
    let lbl_sym = symbol_literal s in
    (* Both r3 and r12 are marked as clobbered in PIC mode (cf. proc.ml),
       so use r12 as temporary scratch register unless the destination is
       r12, then we use r3 instead. *)
    let tmp = if dst.loc = Reg 8 (*r12*)
              then phys_reg 3 (*r3*)
              else phys_reg 8 (*r12*) in
    `	ldr	{emit_reg tmp}, {emit_label lbl_got}\n`;
    `	ldr	{emit_reg dst}, {emit_label lbl_sym}\n`;
    `{emit_label lbl_pic}:	add	{emit_reg tmp}, pc, {emit_reg tmp}\n`;
    `	ldr	{emit_reg dst}, [{emit_reg tmp}, {emit_reg dst}] @ {emit_symbol s}\n`;
    4
  end else if !arch > ARMv6 && not !Clflags.dlcode && !fastcode_flag then begin
    `	movw	{emit_reg dst}, #:lower16:{emit_symbol s}\n`;
    `	movt	{emit_reg dst}, #:upper16:{emit_symbol s}\n`;
    2
  end else begin
    let lbl = symbol_literal s in
    `	ldr	{emit_reg dst}, {emit_label lbl} @ {emit_symbol s}\n`;
    1
  end

(* Output the assembly code for an instruction *)

let emit_instr i =
    emit_debug_info i.dbg;
    match i.desc with
      Lend -> 0
    | Lop(Imove | Ispill | Ireload) ->
        let src = i.arg.(0) and dst = i.res.(0) in
        if src.loc = dst.loc then 0 else begin
          begin match (src, dst) with
            {loc = Reg _; typ = Float}, {loc = Reg _} ->
              `	fcpyd	{emit_reg dst}, {emit_reg src}\n`
          | {loc = Reg _}, {loc = Reg _} ->
              `	mov	{emit_reg dst}, {emit_reg src}\n`
          | {loc = Reg _; typ = Float}, _ ->
              `	fstd	{emit_reg src}, {emit_stack dst}\n`
          | {loc = Reg _}, _ ->
              `	str	{emit_reg src}, {emit_stack dst}\n`
          | {typ = Float}, _ ->
              `	fldd	{emit_reg dst}, {emit_stack src}\n`
          | _ ->
              `	ldr	{emit_reg dst}, {emit_stack src}\n`
          end; 1
        end
    | Lop(Iconst_int n | Iconst_blockheader n) ->
        emit_intconst i.res.(0) (Nativeint.to_int32 n)
    | Lop(Iconst_float f) when !fpu = Soft ->
        let high_bits = Int64.to_int32 (Int64.shift_right_logical f 32)
        and low_bits = Int64.to_int32 f in
        if is_immediate low_bits || is_immediate high_bits then begin
          let ninstr_low = emit_intconst i.res.(0) low_bits
          and ninstr_high = emit_intconst i.res.(1) high_bits in
          ninstr_low + ninstr_high
        end else begin
          let lbl = float_literal f in
          `	ldr	{emit_reg i.res.(0)}, {emit_label lbl}\n`;
          `	ldr	{emit_reg i.res.(1)}, {emit_label lbl} + 4\n`;
          2
        end
    | Lop(Iconst_float f) when !fpu = VFPv2 ->
        let lbl = float_literal f in
        `	fldd	{emit_reg i.res.(0)}, {emit_label lbl}\n`;
        1
    | Lop(Iconst_float f) ->
        let encode imm =
          let sg = Int64.to_int (Int64.shift_right_logical imm 63) in
          let ex = Int64.to_int (Int64.shift_right_logical imm 52) in
          let ex = (ex land 0x7ff) - 1023 in
          let mn = Int64.logand imm 0xfffffffffffffL in
          if Int64.logand mn 0xffffffffffffL <> 0L || ex < -3 || ex > 4
          then
            None
          else begin
            let mn = Int64.to_int (Int64.shift_right_logical mn 48) in
            if mn land 0x0f <> mn then
              None
            else
              let ex = ((ex + 3) land 0x07) lxor 0x04 in
              Some((sg lsl 7) lor (ex lsl 4) lor mn)
          end in
        begin match encode f with
          None ->
            let lbl = float_literal f in
            `	fldd	{emit_reg i.res.(0)}, {emit_label lbl}\n`
        | Some imm8 ->
            `	fconstd	{emit_reg i.res.(0)}, #{emit_int imm8}\n`
        end; 1
    | Lop(Iconst_symbol s) ->
        emit_load_symbol_addr i.res.(0) s
    | Lop(Icall_ind) ->
        if !arch >= ARMv5 then begin
          `	blx	{emit_reg i.arg.(0)}\n`;
          `{record_frame i.live i.dbg}\n`; 1
        end else begin
          `	mov	lr, pc\n`;
          `	bx	{emit_reg i.arg.(0)}\n`;
          `{record_frame i.live i.dbg}\n`; 2
        end
    | Lop(Icall_imm s) ->
        `	{emit_call s}\n`;
        `{record_frame i.live i.dbg}\n`; 1
    | Lop(Itailcall_ind) ->
        output_epilogue begin fun () ->
          if !contains_calls then
            `	ldr	lr, [sp, #{emit_int (-4)}]\n`;
          `	bx	{emit_reg i.arg.(0)}\n`; 2
        end
    | Lop(Itailcall_imm s) ->
        if s = !function_name then begin
          `	b	{emit_label !tailrec_entry_point}\n`; 1
        end else begin
          output_epilogue begin fun () ->
            if !contains_calls then
              `	ldr	lr, [sp, #{emit_int (-4)}]\n`;
            `	{emit_jump s}\n`; 2
          end
        end
    | Lop(Iextcall(s, false)) ->
        `	{emit_call s}\n`; 1
    | Lop(Iextcall(s, true)) ->
        let ninstr = emit_load_symbol_addr (phys_reg 7 (* r7 *)) s in
        `	{emit_call "caml_c_call"}\n`;
        `{record_frame i.live i.dbg}\n`;
        1 + ninstr
    | Lop(Istackoffset n) ->
        assert (n mod 8 = 0);
        let ninstr = emit_stack_adjustment (-n) in
        stack_offset := !stack_offset + n;
        ninstr
    | Lop(Iload(Single, addr)) when !fpu >= VFPv2 ->
        `	flds	s14, {emit_addressing addr i.arg 0}\n`;
        `	fcvtds	{emit_reg i.res.(0)}, s14\n`; 2
    | Lop(Iload((Double | Double_u), addr)) when !fpu = Soft ->
        (* Use LDM or LDRD if possible *)
        begin match i.res.(0), i.res.(1), addr with
          {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
          when rt < rt2 ->
            `	ldm	{emit_reg i.arg.(0)}, \{{emit_reg i.res.(0)}, {emit_reg i.res.(1)}}\n`; 1
        | {loc = Reg rt}, {loc = Reg rt2}, addr
          when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 ->
            `	ldrd	{emit_reg i.res.(0)}, {emit_reg i.res.(1)}, {emit_addressing addr i.arg 0}\n`; 1
        | _ ->
            let addr' = offset_addressing addr 4 in
            if i.res.(0).loc <> i.arg.(0).loc then begin
              `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`;
              `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`
            end else begin
              `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`;
              `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
            end; 2
        end
    | Lop(Iload(size, addr)) ->
        let r = i.res.(0) in
        let instr =
          match size with
            Byte_unsigned -> "ldrb"
          | Byte_signed -> "ldrsb"
          | Sixteen_unsigned -> "ldrh"
          | Sixteen_signed -> "ldrsh"
          | Double
          | Double_u -> "fldd"
          | _ (* 32-bit quantities *) -> "ldr" in
        `	{emit_string instr}	{emit_reg r}, {emit_addressing addr i.arg 0}\n`; 1
    | Lop(Istore(Single, addr, _)) when !fpu >= VFPv2 ->
        `	fcvtsd	s14, {emit_reg i.arg.(0)}\n`;
        `	fsts	s14, {emit_addressing addr i.arg 1}\n`; 2
    | Lop(Istore((Double | Double_u), addr, _)) when !fpu = Soft ->
        (* Use STM or STRD if possible *)
        begin match i.arg.(0), i.arg.(1), addr with
          {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
          when rt < rt2 ->
            `	stm	{emit_reg i.arg.(2)}, \{{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}}\n`; 1
        | {loc = Reg rt}, {loc = Reg rt2}, addr
          when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 ->
            `	strd	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_addressing addr i.arg 2}\n`; 1
        | _ ->
            let addr' = offset_addressing addr 4 in
            `	str	{emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`;
            `	str	{emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`; 2
        end
    | Lop(Istore(size, addr, _)) ->
        let r = i.arg.(0) in
        let instr =
          match size with
            Byte_unsigned
          | Byte_signed -> "strb"
          | Sixteen_unsigned
          | Sixteen_signed -> "strh"
          | Double
          | Double_u -> "fstd"
          | _ (* 32-bit quantities *) -> "str" in
        `	{emit_string instr}	{emit_reg r}, {emit_addressing addr i.arg 1}\n`; 1
    | Lop(Ialloc n) ->
        let lbl_frame = record_frame_label i.live i.dbg in
        if !fastcode_flag then begin
          let lbl_redo = new_label() in
          `{emit_label lbl_redo}:`;
          let ninstr = decompose_intconst
                         (Int32.of_int n)
                         (fun i ->
                           `	sub	alloc_ptr, alloc_ptr, #{emit_int32 i}\n`) in
          `	cmp	alloc_ptr, alloc_limit\n`;
          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
          let lbl_call_gc = new_label() in
          `	bcc	{emit_label lbl_call_gc}\n`;
          call_gc_sites :=
            { gc_lbl = lbl_call_gc;
              gc_return_lbl = lbl_redo;
              gc_frame_lbl = lbl_frame } :: !call_gc_sites;
          3 + ninstr
        end else begin
          let ninstr =
            begin match n with
               8 -> `	{emit_call "caml_alloc1"}\n`; 1
            | 12 -> `	{emit_call "caml_alloc2"}\n`; 1
            | 16 -> `	{emit_call "caml_alloc3"}\n`; 1
            |  _ -> let ninstr = emit_intconst (phys_reg 7) (Int32.of_int n) in
                    `	{emit_call "caml_allocN"}\n`; 1 + ninstr
            end in
          `{emit_label lbl_frame}:	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
          1 + ninstr
        end
    | Lop(Iintop(Icomp cmp)) ->
        let compthen = name_for_comparison cmp in
        let compelse = name_for_comparison (negate_integer_comparison cmp) in
        `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        `	ite	{emit_string compthen}\n`;
        `	mov{emit_string	compthen}	{emit_reg i.res.(0)}, #1\n`;
        `	mov{emit_string compelse}	{emit_reg i.res.(0)}, #0\n`; 4
    | Lop(Iintop_imm(Icomp cmp, n)) ->
        let compthen = name_for_comparison cmp in
        let compelse = name_for_comparison (negate_integer_comparison cmp) in
        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
        `	ite	{emit_string compthen}\n`;
        `	mov{emit_string	compthen}	{emit_reg i.res.(0)}, #1\n`;
        `	mov{emit_string compelse}	{emit_reg i.res.(0)}, #0\n`; 4
    | Lop(Iintop Icheckbound) ->
        let lbl = bound_error_label i.dbg in
        `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        `	bls	{emit_label lbl}\n`; 2
    | Lop(Iintop_imm(Icheckbound, n)) ->
        let lbl = bound_error_label i.dbg in
        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
        `	bls	{emit_label lbl}\n`; 2
    | Lop(Ispecific(Ishiftcheckbound(shiftop, n))) ->
        let lbl = bound_error_label i.dbg in
        let op = name_for_shift_operation shiftop in
        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, {emit_string op} #{emit_int n}\n`;
        `	bcs	{emit_label lbl}\n`; 2
    | Lop(Iintop Imulh) when !arch < ARMv6 ->
        `	smull	r12, {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
    | Lop(Ispecific Imulhadd) ->
        `	smmla	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; 1
    | Lop(Iintop op) ->
        let instr = name_for_int_operation op in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
    | Lop(Iintop_imm(op, n)) ->
        let instr = name_for_int_operation op in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
    | Lop(Iabsf | Inegf as op) when !fpu = Soft ->
        let instr = (match op with
                       Iabsf -> "bic"
                     | Inegf -> "eor"
                     | _     -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(1)}, {emit_reg i.arg.(1)}, #0x80000000\n`; 1
    | Lop(Iabsf | Inegf | Ispecific Isqrtf as op) ->
        let instr = (match op with
                       Iabsf            -> "fabsd"
                     | Inegf            -> "fnegd"
                     | Ispecific Isqrtf -> "fsqrtd"
                     | _                -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`; 1
    | Lop(Ifloatofint) ->
        `	fmsr	s14, {emit_reg i.arg.(0)}\n`;
        `	fsitod	{emit_reg i.res.(0)}, s14\n`; 2
    | Lop(Iintoffloat) ->
        `	ftosizd	s14, {emit_reg i.arg.(0)}\n`;
        `	fmrs	{emit_reg i.res.(0)}, s14\n`; 2
    | Lop(Iaddf | Isubf | Imulf | Idivf | Ispecific Inegmulf as op) ->
        let instr = (match op with
                       Iaddf              -> "faddd"
                     | Isubf              -> "fsubd"
                     | Imulf              -> "fmuld"
                     | Idivf              -> "fdivd"
                     | Ispecific Inegmulf -> "fnmuld"
                     | _                  -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
        1
    | Lop(Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf as op)) ->
        let instr = (match op with
                       Imuladdf    -> "fmacd"
                     | Inegmuladdf -> "fnmacd"
                     | Imulsubf    -> "fmscd"
                     | Inegmulsubf -> "fnmscd"
                     | _ -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`;
        1
    | Lop(Ispecific(Ishiftarith(op, shiftop, n))) ->
        let instr = (match op with
                       Ishiftadd    -> "add"
                     | Ishiftsub    -> "sub"
                     | Ishiftsubrev -> "rsb"
                     | Ishiftand    -> "and"
                     | Ishiftor     -> "orr"
                     | Ishiftxor    -> "eor") in
        let op = name_for_shift_operation shiftop in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_string op} #{emit_int n}\n`; 1
    | Lop(Ispecific(Irevsubimm n)) ->
        `	rsb	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
    | Lop(Ispecific(Imuladd | Imulsub as op)) ->
        let instr = (match op with
                       Imuladd -> "mla"
                     | Imulsub -> "mls"
                     | _ -> assert false) in
        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; 1
    | Lop(Ispecific(Ibswap size)) ->
        begin match size with
          16 ->
            `	rev16	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`;
            `	movt	{emit_reg i.res.(0)}, #0\n`; 2
        | 32 ->
            `	rev	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`; 1
        | _ ->
            assert false
        end
    | Lreloadretaddr ->
        let n = frame_size() in
        `	ldr	lr, [sp, #{emit_int(n-4)}]\n`; 1
    | Lreturn ->
        output_epilogue begin fun () ->
          `	bx	lr\n`; 1
        end
    | Llabel lbl ->
        `{emit_label lbl}:\n`; 0
    | Lbranch lbl ->
        `	b	{emit_label lbl}\n`; 1
    | Lcondbranch(tst, lbl) ->
        begin match tst with
          Itruetest ->
            `	cmp	{emit_reg i.arg.(0)}, #0\n`;
            `	bne	{emit_label lbl}\n`; 2
        | Ifalsetest ->
            `	cmp	{emit_reg i.arg.(0)}, #0\n`;
            `	beq	{emit_label lbl}\n`; 2
        | Iinttest cmp ->
            `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
            let comp = name_for_comparison cmp in
            `	b{emit_string comp}	{emit_label lbl}\n`; 2
        | Iinttest_imm(cmp, n) ->
            `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
            let comp = name_for_comparison cmp in
            `	b{emit_string comp}	{emit_label lbl}\n`; 2
        | Ifloattest(cmp, neg) ->
            let comp = (match (cmp, neg) with
                          (Ceq, false) | (Cne, true) -> "eq"
                        | (Cne, false) | (Ceq, true) -> "ne"
                        | (Clt, false) -> "cc"
                        | (Clt, true)  -> "cs"
                        | (Cle, false) -> "ls"
                        | (Cle, true)  -> "hi"
                        | (Cgt, false) -> "gt"
                        | (Cgt, true)  -> "le"
                        | (Cge, false) -> "ge"
                        | (Cge, true)  -> "lt") in
            `	fcmpd	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
            `	fmstat\n`;
            `	b{emit_string comp}	{emit_label lbl}\n`; 3
        | Ioddtest ->
            `	tst	{emit_reg i.arg.(0)}, #1\n`;
            `	bne	{emit_label lbl}\n`; 2
        | Ieventest ->
            `	tst	{emit_reg i.arg.(0)}, #1\n`;
            `	beq	{emit_label lbl}\n`; 2
        end
    | Lcondbranch3(lbl0, lbl1, lbl2) ->
        `	cmp	{emit_reg i.arg.(0)}, #1\n`;
        begin match lbl0 with
          None -> ()
        | Some lbl -> `	blt	{emit_label lbl}\n`
        end;
        begin match lbl1 with
          None -> ()
        | Some lbl -> `	beq	{emit_label lbl}\n`
        end;
        begin match lbl2 with
          None -> ()
        | Some lbl -> `	bgt	{emit_label lbl}\n`
        end;
        4
    | Lswitch jumptbl ->
        if !arch > ARMv6 && !thumb then begin
          (* The Thumb-2 TBH instruction supports only forward branches,
             so we need to generate appropriate trampolines for all labels
             that appear before this switch instruction (PR#5623) *)
          let tramtbl = Array.copy jumptbl in
          `	tbh	[pc, {emit_reg i.arg.(0)}, lsl #1]\n`;
          for j = 0 to Array.length tramtbl - 1 do
            let rec label i =
              match i.desc with
                Lend -> new_label()
              | Llabel lbl when lbl = tramtbl.(j) -> lbl
              | _ -> label i.next in
            tramtbl.(j) <- label i.next;
            `	.short	({emit_label tramtbl.(j)}-.)/2+{emit_int j}\n`
          done;
          (* Generate the necessary trampolines *)
          for j = 0 to Array.length tramtbl - 1 do
            if tramtbl.(j) <> jumptbl.(j) then
              `{emit_label tramtbl.(j)}:	b	{emit_label jumptbl.(j)}\n`
          done
        end else if not !Clflags.pic_code then begin
          `	ldr	pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`;
          `	nop\n`;
          for j = 0 to Array.length jumptbl - 1 do
            `	.word	{emit_label jumptbl.(j)}\n`
          done
        end else begin
          (* Slightly slower, but position-independent *)
          `	add	pc, pc, {emit_reg i.arg.(0)}, lsl #2\n`;
          `	nop\n`;
          for j = 0 to Array.length jumptbl - 1 do
            `	b	{emit_label jumptbl.(j)}\n`
          done
        end;
        2 + Array.length jumptbl
    | Lsetuptrap lbl ->
        `	bl	{emit_label lbl}\n`; 1
    | Lpushtrap ->
        stack_offset := !stack_offset + 8;
        `	push	\{trap_ptr, lr}\n`;
        cfi_adjust_cfa_offset 8;
        `	mov	trap_ptr, sp\n`; 2
    | Lpoptrap ->
        `	pop	\{trap_ptr, lr}\n`;
        cfi_adjust_cfa_offset (-8);
        stack_offset := !stack_offset - 8; 1
    | Lraise k ->
        begin match !Clflags.debug, k with
        | true, (Lambda.Raise_regular | Lambda.Raise_reraise) ->
          `	{emit_call "caml_raise_exn"}\n`;
          `{record_frame Reg.Set.empty i.dbg}\n`; 1
        | false, _
        | true, Lambda.Raise_notrace ->
          `	mov	sp, trap_ptr\n`;
          `	pop	\{trap_ptr, pc}\n`; 2
        end

(* Emission of an instruction sequence *)

let rec emit_all ninstr i =
  if i.desc = Lend then () else begin
    let n = emit_instr i in
    let ninstr' = ninstr + n in
    (* fldd can address up to +/-1KB, ldr can address up to +/-4KB *)
    let limit = (if !fpu >= VFPv2 && !float_literals <> []
                 then 127
                 else 511) in
    let limit = limit - !num_literals in
    if ninstr' >= limit - 64 && not(has_fallthrough i.desc) then begin
      emit_literals();
      emit_all 0 i.next
    end else if !num_literals != 0 && ninstr' >= limit then begin
      let lbl = new_label() in
      `	b	{emit_label lbl}\n`;
      emit_literals();
      `{emit_label lbl}:\n`;
      emit_all 0 i.next
    end else
      emit_all ninstr' i.next
  end

(* Emission of the profiling prelude *)

let emit_profile() =
  match Config.system with
    "linux_eabi" | "linux_eabihf" | "netbsd" ->
      `	push	\{lr}\n`;
      `	{emit_call "__gnu_mcount_nc"}\n`
  | _ -> ()

(* Emission of a function declaration *)

let fundecl fundecl =
  function_name := fundecl.fun_name;
  fastcode_flag := fundecl.fun_fast;
  tailrec_entry_point := new_label();
  float_literals := [];
  gotrel_literals := [];
  symbol_literals := [];
  stack_offset := 0;
  call_gc_sites := [];
  bound_error_sites := [];
  `	.text\n`;
  `	.align	2\n`;
  `	.globl	{emit_symbol fundecl.fun_name}\n`;
  if !arch > ARMv6 && !thumb then
    `	.thumb\n`
  else
    `	.arm\n`;
  `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
  `{emit_symbol fundecl.fun_name}:\n`;
  emit_debug_info fundecl.fun_dbg;
  cfi_startproc();
  if !Clflags.gprofile then emit_profile();
  let n = frame_size() in
  if n > 0 then begin
    ignore(emit_stack_adjustment (-n));
    if !contains_calls then begin
      cfi_offset ~reg:14 (* lr *) ~offset:(-4);
      `	str	lr, [sp, #{emit_int(n - 4)}]\n`
    end
  end;
  `{emit_label !tailrec_entry_point}:\n`;
  emit_all 0 fundecl.fun_body;
  emit_literals();
  List.iter emit_call_gc !call_gc_sites;
  List.iter emit_call_bound_error !bound_error_sites;
  cfi_endproc();
  `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
  `	.size	{emit_symbol fundecl.fun_name}, .-{emit_symbol fundecl.fun_name}\n`

(* Emission of data *)

let emit_item = function
    Cglobal_symbol s -> `	.globl	{emit_symbol s}\n`;
  | Cdefine_symbol s -> `{emit_symbol s}:\n`
  | Cdefine_label lbl -> `{emit_data_label lbl}:\n`
  | Cint8 n -> `	.byte	{emit_int n}\n`
  | Cint16 n -> `	.short	{emit_int n}\n`
  | Cint32 n -> `	.long	{emit_int32 (Nativeint.to_int32 n)}\n`
  | Cint n -> `	.long	{emit_int32 (Nativeint.to_int32 n)}\n`
  | Csingle f -> emit_float32_directive ".long" (Int32.bits_of_float f)
  | Cdouble f -> emit_float64_split_directive ".long" (Int64.bits_of_float f)
  | Csymbol_address s -> `	.word	{emit_symbol s}\n`
  | Clabel_address lbl -> `	.word	{emit_data_label lbl}\n`
  | Cstring s -> emit_string_directive "	.ascii  " s
  | Cskip n -> if n > 0 then `	.space	{emit_int n}\n`
  | Calign n -> `	.align	{emit_int(Misc.log2 n)}\n`

let data l =
  `	.data\n`;
  List.iter emit_item l

(* Beginning / end of an assembly file *)

let begin_assembly() =
  reset_debug_info();
  `	.file	\"\"\n`;  (* PR#7037 *)
  `	.syntax	unified\n`;
  begin match !arch with
  | ARMv4   -> `	.arch	armv4t\n`
  | ARMv5   -> `	.arch	armv5t\n`
  | ARMv5TE -> `	.arch	armv5te\n`
  | ARMv6   -> `	.arch	armv6\n`
  | ARMv6T2 -> `	.arch	armv6t2\n`
  | ARMv7   -> `	.arch	armv7-a\n`
  end;
  begin match !fpu with
    Soft      -> `	.fpu	softvfp\n`
  | VFPv2     -> `	.fpu	vfpv2\n`
  | VFPv3_D16 -> `	.fpu	vfpv3-d16\n`
  | VFPv3     -> `	.fpu	vfpv3\n`
  end;
  `trap_ptr	.req	r8\n`;
  `alloc_ptr	.req	r10\n`;
  `alloc_limit	.req	r11\n`;
  let lbl_begin = Compilenv.make_symbol (Some "data_begin") in
  `	.data\n`;
  `	.globl	{emit_symbol lbl_begin}\n`;
  `{emit_symbol lbl_begin}:\n`;
  let lbl_begin = Compilenv.make_symbol (Some "code_begin") in
  `	.text\n`;
  `	.globl	{emit_symbol lbl_begin}\n`;
  `{emit_symbol lbl_begin}:\n`

let end_assembly () =
  let lbl_end = Compilenv.make_symbol (Some "code_end") in
  `	.text\n`;
  `	.globl	{emit_symbol lbl_end}\n`;
  `{emit_symbol lbl_end}:\n`;
  let lbl_end = Compilenv.make_symbol (Some "data_end") in
  `	.data\n`;
  `	.globl	{emit_symbol lbl_end}\n`;
  `{emit_symbol lbl_end}:\n`;
  `	.long	0\n`;
  let lbl = Compilenv.make_symbol (Some "frametable") in
  `	.globl	{emit_symbol lbl}\n`;
  `{emit_symbol lbl}:\n`;
  emit_frames
    { efa_label = (fun lbl ->
                       `	.type	{emit_label lbl}, %function\n`;
                       `	.word	{emit_label lbl}\n`);
      efa_16 = (fun n -> `	.short	{emit_int n}\n`);
      efa_32 = (fun n -> `	.long	{emit_int32 n}\n`);
      efa_word = (fun n -> `	.word	{emit_int n}\n`);
      efa_align = (fun n -> `	.align	{emit_int(Misc.log2 n)}\n`);
      efa_label_rel = (fun lbl ofs ->
                           `	.word	{emit_label lbl} - . + {emit_int32 ofs}\n`);
      efa_def_label = (fun lbl -> `{emit_label lbl}:\n`);
      efa_string = (fun s -> emit_string_directive "	.asciz	" s) };
  `	.type	{emit_symbol lbl}, %object\n`;
  `	.size	{emit_symbol lbl}, .-{emit_symbol lbl}\n`;
  begin match Config.system with
    "linux_eabihf" | "linux_eabi" | "netbsd" ->
      (* Mark stack as non-executable *)
      `	.section	.note.GNU-stack,\"\",%progbits\n`
  | _ -> ()
  end