ocaml/asmcomp/ia64/emit.mlp

1320 lines
52 KiB
Plaintext

(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 2000 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed *)
(* under the terms of the Q Public License version 1.0. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* Emission of IA64 assembly code *)
open Printf
open Misc
open Cmm
open Arch
open Proc
open Reg
open Mach
open Linearize
open Emitaux
(************** Part 1: assembly-level scheduler *******************)
(* Representation of resources accessed or produced by instructions *)
type resource = string
(* A resource is either:
- a register name
- "stkN" for a stack location
- "heap" for the Caml heap
- "chkN" for the result of a checkbound instruction *)
let is_memory_resource rsrc =
String.length rsrc >= 4 &&
begin match String.sub rsrc 0 3 with
"stk" -> true
| "hea" -> true
| "chk" -> true
| _ -> false
end
let is_mutable_resource rsrc =
rsrc <> "r0" && rsrc <> "p0"
(* Description of instructions *)
type instruction_kind =
KA (* A type instruction (int or mem unit) *)
| KB (* B type instruction (branch unit) *)
| KI (* I type instruction (int unit *)
| KF (* F type instruction (FP unit) *)
| KM (* M type instruction (mem unit) *)
| KB_exc (* B type instruction, exceptional condition,
can be moved around *)
type instruction_format =
F_i (* op imm *)
| F_i_pred (* (pred) op imm *)
| F_ir_rr (* op p1,p2 = imm, r *)
| F_ir_r (* op r = imm, r *)
| F_ir_r_pred (* (pred) op r = imm, r *)
| F_ld (* op r = [r] *)
| F_ld_post (* op r = [r], imm *)
| F_r (* op r *)
| F_i_r (* op r = imm *)
| F_i_r_pred (* (pred) op r = imm *)
| F_ri_rr (* op p1,p2 = imm, r *)
| F_ri_r (* op r = imm, r *)
| F_r_r (* op r = r *)
| F_r_r_pred (* (pred) op r = r *)
| F_rr_rr (* op p1,p2 = r1, r2 *)
| F_r_rir (* op r = r1, imm, r2 *)
| F_rr_r (* op r = r1, r2 *)
| F_rr_r_pred (* (pred) op r = r1, r2 *)
| F_rri_r (* op r = r1, r2, imm *)
| F_rrr_r (* op r = r1, r2, r3 *)
| F_rrr_r_pred (* (pred) op r = r1, r2, r3 *)
| F_st (* op [r] = r *)
| F_st_post (* op [r] = r, imm *)
type instruction_descr =
{ opcode: string; (* actual opcode *)
latency: int; (* latency in cycles *)
kind: instruction_kind; (* kind of instruction *)
format: instruction_format } (* how to generate asm for it *)
let instruction_table = create_hashtable 73 [
"add", {opcode = "add"; latency = 1; kind = KA; format = F_rr_r};
"add1", {opcode = "add"; latency = 1; kind = KA; format = F_rri_r};
"addcond", {opcode = "add"; latency = 1; kind = KA; format = F_rr_r_pred};
"addi", {opcode = "add"; latency = 1; kind = KA; format = F_ir_r};
"addicond", {opcode = "add"; latency = 1; kind = KA; format = F_ir_r_pred};
"and", {opcode = "and"; latency = 1; kind = KA; format = F_rr_r};
"andi", {opcode = "and"; latency = 1; kind = KA; format = F_ir_r};
"br", {opcode = "br.sptk.many"; latency = 1; kind = KB; format = F_i};
"brret", {opcode = "br.ret.sptk"; latency = 1; kind = KB; format = F_r};
"brcall", {opcode = "br.call.sptk.many"; latency = 1; kind = KB; format = F_i_r};
"brcallcond", {opcode = "br.call.spnt.many"; latency = 1; kind = KB; format = F_i_r_pred};
"brcallcondexc", {opcode = "br.call.spnt.many"; latency = 1; kind = KB_exc; format = F_i_r_pred};
"brcallind", {opcode = "br.call.sptk.many"; latency = 1; kind = KB; format = F_r_r};
"brcond", {opcode = "br.dpnt.many"; latency = 1; kind = KB; format = F_i_pred};
"brind", {opcode = "br.sptk.many"; latency = 1; kind = KB; format = F_r};
"cmp.eq", {opcode = "cmp.eq"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.ge", {opcode = "cmp.ge"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.geu", {opcode = "cmp.geu"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.gt", {opcode = "cmp.gt"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.gtu", {opcode = "cmp.gtu"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.le", {opcode = "cmp.le"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.leu", {opcode = "cmp.leu"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.lt", {opcode = "cmp.lt"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.ltu", {opcode = "cmp.ltu"; latency = 0; kind = KA; format = F_rr_rr};
"cmp.ne", {opcode = "cmp.ne"; latency = 0; kind = KA; format = F_rr_rr};
"cmpi.eq", {opcode = "cmp.eq"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.ge", {opcode = "cmp.ge"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.geu", {opcode = "cmp.geu"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.gt", {opcode = "cmp.gt"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.gtu", {opcode = "cmp.gtu"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.le", {opcode = "cmp.le"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.leu", {opcode = "cmp.leu"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.lt", {opcode = "cmp.lt"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.ltu", {opcode = "cmp.ltu"; latency = 0; kind = KA; format = F_ir_rr};
"cmpi.ne", {opcode = "cmp.ne"; latency = 0; kind = KA; format = F_ir_rr};
"cmpp.eq", {opcode = "cmp.eq"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.ge", {opcode = "cmp.ge"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.geu", {opcode = "cmp.geu"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.gt", {opcode = "cmp.gt"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.gtu", {opcode = "cmp.gtu"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.le", {opcode = "cmp.le"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.leu", {opcode = "cmp.leu"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.lt", {opcode = "cmp.lt"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.ltu", {opcode = "cmp.ltu"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.ne", {opcode = "cmp.ne"; latency = 1; kind = KA; format = F_rr_rr};
"cmpp.ne.and", {opcode = "cmp.ne.and"; latency = 1; kind = KA; format = F_rr_rr};
"cmppi.eq", {opcode = "cmp.eq"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.ge", {opcode = "cmp.ge"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.geu", {opcode = "cmp.geu"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.gt", {opcode = "cmp.gt"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.gtu", {opcode = "cmp.gtu"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.le", {opcode = "cmp.le"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.leu", {opcode = "cmp.leu"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.lt", {opcode = "cmp.lt"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.ltu", {opcode = "cmp.ltu"; latency = 1; kind = KA; format = F_ir_rr};
"cmppi.ne", {opcode = "cmp.ne"; latency = 1; kind = KA; format = F_ir_rr};
"extr.u", {opcode = "extr.u"; latency = 1; kind = KI; format = F_ri_r};
"fabs", {opcode = "fabs"; latency = 1; kind = KF; format = F_r_r};
"fadd.d", {opcode = "fadd.d"; latency = 5; kind = KF; format = F_rr_r};
"fcmp.eq", {opcode = "fcmp.eq"; latency = 1; kind = KF; format = F_rr_rr};
"fcmp.ge", {opcode = "fcmp.ge"; latency = 1; kind = KF; format = F_rr_rr};
"fcmp.gt", {opcode = "fcmp.gt"; latency = 1; kind = KF; format = F_rr_rr};
"fcmp.le", {opcode = "fcmp.le"; latency = 1; kind = KF; format = F_rr_rr};
"fcmp.lt", {opcode = "fcmp.lt"; latency = 1; kind = KF; format = F_rr_rr};
"fcmp.neq", {opcode = "fcmp.neq"; latency = 1; kind = KF; format = F_rr_rr};
"fcvt.fx.trunc", {opcode = "fcvt.fx.trunc"; latency = 7; kind = KF; format = F_r_r};
"fcvt.xf", {opcode = "fcvt.xf"; latency = 5; kind = KF; format = F_r_r};
"fma.d", {opcode = "fma.d"; latency = 5; kind = KF; format = F_rrr_r};
"fmacond", {opcode = "fma.d.s0"; latency = 5; kind = KF; format = F_rrr_r_pred};
"fmas1cond", {opcode = "fma.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
"fmads1cond", {opcode = "fma.d.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
"fmpy.d", {opcode = "fmpy.d"; latency = 5; kind = KF; format = F_rr_r};
"fms.d", {opcode = "fms.d"; latency = 5; kind = KF; format = F_rrr_r};
"fneg", {opcode = "fneg"; latency = 1; kind = KF; format = F_r_r};
"fnma.d", {opcode = "fnma.d"; latency = 5; kind = KF; format = F_rrr_r};
"fnmas1cond", {opcode = "fnma.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
"fnmads1cond", {opcode = "fnma.d.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
"fnorm.d", {opcode = "fnorm.d"; latency = 5; kind = KF; format = F_r_r};
"frcpa", {opcode = "frcpa.s0"; latency = 5; kind = KF; format = F_rr_rr};
"fsub.d", {opcode = "fsub.d"; latency = 5; kind = KF; format = F_rr_r};
"getf.sig", {opcode = "getf.sig"; latency = 2; kind = KM; format = F_r_r};
"ld1", {opcode = "ld1"; latency = 2; kind = KM; format = F_ld};
"ld2", {opcode = "ld2"; latency = 2; kind = KM; format = F_ld};
"ld4", {opcode = "ld4"; latency = 2; kind = KM; format = F_ld};
"ld8", {opcode = "ld8"; latency = 2; kind = KM; format = F_ld};
"ld8+", {opcode = "ld8"; latency = 2; kind = KM; format = F_ld_post};
"ldfd", {opcode = "ldfd"; latency = 9; kind = KM; format = F_ld};
"ldfd+", {opcode = "ldfd"; latency = 9; kind = KM; format = F_ld_post};
"ldfs", {opcode = "ldfs"; latency = 9; kind = KM; format = F_ld};
"mov", {opcode = "mov"; latency = 1; kind = KA; format = F_r_r};
"movcond", {opcode = "mov"; latency = 1; kind = KA; format = F_r_r_pred};
"movtb", {opcode = "mov"; latency = 9; kind = KI; format = F_r_r};
"movfb", {opcode = "mov"; latency = 2; kind = KI; format = F_r_r};
"movi", {opcode = "mov"; latency = 1; kind = KA; format = F_i_r};
"movicond", {opcode = "mov"; latency = 1; kind = KA; format = F_i_r_pred};
"movil", {opcode = "movl"; latency = 1; kind = KI; format = F_i_r};
"movpr", {opcode = "mov"; latency = 1; kind = KI; format = F_ri_r};
"or", {opcode = "or"; latency = 1; kind = KA; format = F_rr_r};
"ori", {opcode = "or"; latency = 1; kind = KA; format = F_ir_r};
"setf.d", {opcode = "setf.d"; latency = 8; kind = KM; format = F_r_r};
"setf.sig", {opcode = "setf.sig"; latency = 8; kind = KM; format = F_r_r};
"shl", {opcode = "shl"; latency = 2; kind = KI; format = F_rr_r};
"shladd", {opcode = "shladd"; latency = 1; kind = KA; format = F_r_rir};
"shli", {opcode = "shl"; latency = 1; kind = KI; format = F_ri_r};
"shr", {opcode = "shr"; latency = 2; kind = KI; format = F_rr_r};
"shri", {opcode = "shr"; latency = 1; kind = KI; format = F_ri_r};
"shru", {opcode = "shr.u"; latency = 2; kind = KI; format = F_rr_r};
"shrui", {opcode = "shr.u"; latency = 1; kind = KI; format = F_ri_r};
"st1", {opcode = "st1"; latency = 0; kind = KM; format = F_st};
"st2", {opcode = "st2"; latency = 0; kind = KM; format = F_st};
"st4", {opcode = "st4"; latency = 0; kind = KM; format = F_st};
"st8", {opcode = "st8"; latency = 0; kind = KM; format = F_st};
"st8+", {opcode = "st8"; latency = 1; kind = KM; format = F_st_post};
"stfd", {opcode = "stfd"; latency = 0; kind = KM; format = F_st};
"stfd+", {opcode = "stfd"; latency = 1; kind = KM; format = F_st_post};
"stfs", {opcode = "stfs"; latency = 0; kind = KM; format = F_st};
"sub", {opcode = "sub"; latency = 1; kind = KA; format = F_rr_r};
"sub1", {opcode = "sub"; latency = 1; kind = KA; format = F_rri_r};
"subi", {opcode = "sub"; latency = 1; kind = KA; format = F_ir_r};
"sxt1", {opcode = "sxt1"; latency = 1; kind = KI; format = F_r_r};
"sxt2", {opcode = "sxt2"; latency = 1; kind = KI; format = F_r_r};
"sxt4", {opcode = "sxt4"; latency = 1; kind = KI; format = F_r_r};
"tbit.nz", {opcode = "tbit.nz"; latency = 0; kind = KI; format = F_ri_rr};
"tbit.z", {opcode = "tbit.z"; latency = 0; kind = KI; format = F_ri_rr};
"xmpy.l", {opcode = "xmpy.l"; latency = 7; kind = KF; format = F_rr_r};
"xor", {opcode = "xor"; latency = 1; kind = KA; format = F_rr_r};
"xori", {opcode = "xor"; latency = 1; kind = KA; format = F_ir_r};
"#initbarrier", {opcode = "# init barrier"; latency = 0; kind = KI; format = F_i};
]
(* Nodes of the code DAG. Each node represents one instruction to be
emitted. *)
type code_dag_node =
{ instr: instruction_descr; (* the instruction *)
imm: string; (* its immediate argument, if any *)
iarg: resource array; (* arguments *)
ires: resource array; (* results *)
delay: int; (* how many cycles before result is available *)
mutable sons: (code_dag_node * int) list;
(* nodes that depend on this node *)
mutable date: int; (* start date *)
mutable length: int; (* length of longest path to result *)
mutable ancestors: int; (* number of ancestors *)
mutable emitted_ancestors: int } (* number of emitted ancestors *)
(* The code dag itself is represented by two tables from resources to nodes:
- "results" maps resources to the instructions that produced them;
- "uses" maps resources to the instructions that use them. *)
let code_results = (Hashtbl.create 31 : (resource, code_dag_node) Hashtbl.t)
let code_uses = (Hashtbl.create 31 : (resource, code_dag_node) Hashtbl.t)
let clear_code_dag () =
Hashtbl.clear code_results;
Hashtbl.clear code_uses
(* The ready queue: a list of nodes that can be computed immediately
(all arguments are available), kept sorted by decreasing length to results.
The in progress queue: a list of nodes whose arguments are being computed,
and thus can be computed at a later date, kept sorted by increasing
availability date
The branch list: a list of all branch instructions (to be emitted last) *)
let ready_queue = ref ([] : code_dag_node list)
let in_progress_queue = ref ([] : code_dag_node list)
let branch_list = ref ([] : code_dag_node list) (* built in reverse order *)
let clear_queues () =
ready_queue := []; in_progress_queue := []; branch_list := []
let rec insert_queue prio node = function
[] -> [node]
| hd :: tl as queue ->
if prio node hd then node :: queue else hd :: insert_queue prio node tl
let length_prio n1 n2 = n1.length > n2.length
let date_prio n1 n2 = n1.date < n2.date
let add_ready node =
ready_queue := insert_queue length_prio node !ready_queue
let add_in_progress node =
in_progress_queue := insert_queue date_prio node !in_progress_queue
let add_branch node =
branch_list := node :: !branch_list
(* Add an edge to the code DAG *)
let add_edge ancestor son delay =
ancestor.sons <- (son, delay) :: ancestor.sons;
son.ancestors <- son.ancestors + 1
let add_edge_after son ancestor = add_edge ancestor son 0
(* Add an instruction to the code DAG *)
let insimm opc arg imm res =
let instr =
try
Hashtbl.find instruction_table opc
with Not_found ->
fatal_error ("Unknown instruction " ^ opc) in
let node =
{ instr = instr;
imm = imm;
iarg = arg;
ires = res;
delay = instr.latency;
sons = []; (* to be filled later *)
date = 0; (* to be adjusted later *)
length = -1; (* to be computed later *)
ancestors = 0; (* ditto *)
emitted_ancestors = 0 } in (* ditto *)
(* RAW dependencies: add edges from all instrs that define one of the
resources used *)
for i = 0 to Array.length arg - 1 do
try
let rsrc = arg.(i) in
if is_mutable_resource rsrc then begin
let anc = Hashtbl.find code_results rsrc in
let delay = if is_memory_resource rsrc then 0 else anc.delay in
(* Memory accesses are ordered by the hardware, so we can emit
a memop 1, then a dependent memop 2 in the same cycle *)
add_edge anc node delay
end
with Not_found ->
()
done;
(* WAR dependencies: add edges from all instrs that use one of the
resources defined by this instruction
WAW dependencies: add edges from all instrs that define one of the
resources defined by this instruction *)
for i = 0 to Array.length res - 1 do
let rsrc = res.(i) in
if is_mutable_resource rsrc then begin
(* WAR *)
let anc = Hashtbl.find_all code_uses res.(i) in
List.iter (add_edge_after node) anc;
(* WAW *)
try
let anc = Hashtbl.find code_results rsrc in
let delay = if is_memory_resource rsrc then 0 else 1 in
add_edge anc node delay
with Not_found ->
()
end
done;
(* Remember the results and uses of this instruction *)
for i = 0 to Array.length res - 1 do
Hashtbl.add code_results res.(i) node
done;
for i = 0 to Array.length arg - 1 do
Hashtbl.add code_uses arg.(i) node
done;
(* Insert in appropriate queue *)
if node.instr.kind = KB
then add_branch node
else if node.ancestors = 0 then add_ready node
let insert opc arg res =
insimm opc arg "" res
(* Compute length of longest path to a result. *)
let rec longest_path node =
if node.length < 0 then begin
node.length <-
List.fold_left
(fun len (son, delay) -> max len (longest_path son + delay))
0 node.sons
end;
node.length
(* Emit the assembly code for a node *)
let emit_r = emit_string
let emit_instr node =
let opc = node.instr.opcode
and a = node.iarg
and r = node.ires
and imm = node.imm in
match node.instr.format with
F_i ->
` {emit_string opc} {emit_string imm}\n`
| F_i_pred ->
` ({emit_r a.(0)}) {emit_string opc} {emit_string imm}\n`
| F_ir_rr ->
` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_string imm}, {emit_r a.(0)}\n`
| F_ir_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_string imm}, {emit_r a.(0)}\n`
| F_ir_r_pred ->
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_string imm}, {emit_r a.(1)}\n`
| F_ld ->
` {emit_string opc} {emit_r r.(0)} = [{emit_r a.(0)}]\n`
| F_ld_post ->
` {emit_string opc} {emit_r r.(0)} = [{emit_r a.(0)}], {emit_string imm}\n`
| F_r ->
` {emit_string opc} {emit_r a.(0)}\n`
| F_i_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_string imm}\n`
| F_i_r_pred ->
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_string imm}\n`
| F_ri_rr ->
` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_r a.(0)}, {emit_string imm}\n`
| F_ri_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_string imm}\n`
| F_r_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}\n`
| F_r_r_pred ->
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}\n`
| F_rr_rr ->
` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_r a.(0)}, {emit_r a.(1)}\n`
| F_r_rir ->
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_string imm}, {emit_r a.(1)}\n`
| F_rr_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}\n`
| F_rr_r_pred ->
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}, {emit_r a.(2)}\n`
| F_rri_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}, {emit_string imm}\n`
| F_rrr_r ->
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}, {emit_r a.(2)}\n`
| F_rrr_r_pred ->
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}, {emit_r a.(2)}, {emit_r a.(3)}\n`
| F_st ->
` {emit_string opc} [{emit_r a.(0)}] = {emit_r a.(1)}\n`
| F_st_post ->
` {emit_string opc} [{emit_r a.(0)}] = {emit_r a.(1)}, {emit_string imm}\n`
(* Little state machine reflecting how many instructions the chip can
issue in one cycle. We roughly follow the Itanium model:
2 int units, 2 mem units, 2 FP units, and 3 branch units,
with a maximum of 6 instructions dispatched per clock cycle. *)
let num_A = ref 0
let num_I = ref 0
let num_M = ref 0
let num_F = ref 0
let num_B = ref 0
let reset_issue () =
num_A := 0; num_I := 0; num_M := 0; num_F := 0; num_B := 0
let can_issue instr =
if !num_A + !num_I + !num_M + !num_F + !num_B >= 6 then false else begin
match instr.kind with
KA ->
if !num_A + !num_I + !num_M < 4
then (incr num_A; true)
else false
| KF ->
if !num_F < 2 then (incr num_F; true) else false
| KI ->
if !num_I < 2 && !num_A + !num_I + !num_M < 4
then (incr num_I; true) else false
| KM ->
if !num_M < 2 && !num_A + !num_I + !num_M < 4
then (incr num_M; true) else false
| _ (* KB | KB_exc *) ->
if !num_B < 3 then (incr num_B; true) else false
end
(* Emit one node, updating the completion date and number of ancestors
emitted for all nodes that depend on this node. Enter the nodes
that are no longer waiting on anything (all ancestors emitted)
in the ready queue or in the in_progress queue, depending on
latency. *)
let emit_node date node =
begin try
(*`# Date: {emit_int date}; distance: {emit_int node.length}\n`;*)
emit_instr node
with x ->
fatal_error ("Error while emitting " ^ node.instr.opcode)
end;
List.iter
(fun (son, delay) ->
let completion_date = date + delay in
if son.date < completion_date then son.date <- completion_date;
son.emitted_ancestors <- son.emitted_ancestors + 1;
if son.emitted_ancestors = son.ancestors && son.instr.kind <> KB then
begin
(*`# {emit_string son.instr.opcode} will be ready at {emit_int son.date}\n`;*)
if son.date = date then add_ready son else add_in_progress son
end)
node.sons
(* Emit all ready nodes that we can emit given the architectural
constraints. *)
let rec emit_ready_nodes filter date =
match !ready_queue with
[] -> []
| node :: rem ->
ready_queue := rem;
if filter node && can_issue node.instr then begin
emit_node date node;
emit_ready_nodes filter date
end else
node :: emit_ready_nodes filter date
let filter_MF node =
match node.instr.kind with KM -> true | KF -> true | _ -> false
let filter_non_MF node =
not(filter_MF node)
(* Add all instructions with date <= d to the ready queue, and remove them *)
let rec extract_ready d = function
[] -> []
| node :: rem as queue ->
if node.date <= d then (add_ready node; extract_ready d rem) else queue
(* Say if a branch is ready to be emitted now *)
let branch_is_ready date br =
br.emitted_ancestors = br.ancestors && br.date <= date
(* Schedule the basic block, emitting all of its instructions *)
let rec reschedule date =
match (!ready_queue, !in_progress_queue) with
([], []) ->
(* We're done with the regular instructions; finish with the branches *)
begin match !branch_list with
[] -> ()
| br -> List.iter emit_instr br; emit_string " ;;\n"
end
| ([], node :: _) ->
(* Advance to the time node.date, extracting from in_progress_queue
all instructions ready at that time and adding them to the
ready queue *)
in_progress_queue := extract_ready node.date !in_progress_queue;
(* Try again *)
reschedule node.date
| (_, _) ->
` # time {emit_int date}\n`;
(* Emit and remove as many ready instructions as we can *)
(* Give priority to M and F instructions *)
reset_issue();
ready_queue := emit_ready_nodes filter_MF date;
ready_queue := emit_ready_nodes filter_non_MF date;
(* Special hack: if the only remaining instructions are branches
and they are all ready now, emit them in the current
group of instructions *)
if !ready_queue = []
&& !in_progress_queue = []
&& List.for_all (branch_is_ready date) !branch_list
then begin
List.iter emit_instr !branch_list;
branch_list := []
end;
(* Emit a stop to pause the processor *)
emit_string " ;;\n";
(* Advance to the time date + 1, extracting from in_progress_queue
all instructions ready at that time and adding them to the
ready queue *)
in_progress_queue := extract_ready (date + 1) !in_progress_queue;
(* Try again *)
reschedule (date + 1)
(* Emit the code for the current basic block *)
let end_basic_block () =
(* Compute critical paths and rebuild ready queue sorted by
decreasing criticality *)
let r = !ready_queue in
ready_queue := [];
let max_length =
List.fold_left (fun len node -> max len (longest_path node)) 0 r in
List.iter add_ready r;
branch_list := List.rev !branch_list;
(* Emit the instructions by traversing the code DAG *)
reschedule 0;
if max_length > 0 then ` # basic block length {emit_int max_length}\n`;
clear_code_dag ();
clear_queues ()
(************** Part 2: the code emitter *******************)
(* Tradeoff between code size and code speed *)
let fastcode_flag = ref true
(* Translate or output a label *)
let label lbl = sprintf ".L%d" lbl
let emit_label lbl = emit_string ".L"; emit_int lbl
(* Translate or output a symbol *)
let symbol s =
let b = Buffer.create (String.length s + 1) in
for i = 0 to String.length s - 1 do
let c = s.[i] in
match c with
'A'..'Z' | 'a'..'z' | '0'..'9' | '_' ->
Buffer.add_char b c
| _ ->
Buffer.add_string b (sprintf "$%02x" (Char.code c))
done;
Buffer.add_char b '#';
Buffer.contents b
let emit_symbol s = Emitaux.emit_symbol '$' s
(* Translate a pseudo-register *)
let reg r =
match r.loc with Reg r -> register_name r | _ -> assert false
let regs r =
Array.map reg r
(* Output a pseudo-register *)
let emit_reg r =
match r.loc with
Reg r -> emit_string (register_name r)
| _ -> fatal_error "Emit_ia64.emit_reg"
(* Translate a float as a 64-bit integer *)
let float_bits f =
let b = Buffer.create 18 in
let bytes = (Obj.magic f : string) in
Buffer.add_string b "0x";
for i = 7 downto 0 do (* little-endian *)
Buffer.add_string b
(sprintf "%02x" (Char.code (String.unsafe_get bytes i)))
done;
Buffer.contents b
(* Translate an "ltoffset" reference to a global *)
let ltoffset s = sprintf "@ltoff(%s)" (symbol s)
let ltoffset_fptr s = sprintf "@ltoff(@fptr(%s))" (symbol s)
(* Layout of the stack frame.
All stack offsets are shifted by 16 to preserve the scratch area at
bottom of stack. *)
let stack_offset = ref 0
let frame_size () =
let size =
!stack_offset +
8 * (num_stack_slots.(0) + num_stack_slots.(1)) +
(if !contains_calls then 8 else 0) in
Misc.align size 16
let slot_offset loc cl =
match loc with
Incoming n -> frame_size() + n + 16
| Local n ->
if cl = 0
then !stack_offset + n * 8 + 16
else !stack_offset + (num_stack_slots.(0) + n) * 8 + 16
| Outgoing n -> n + 16
let slot_offset_reg r =
match r.loc with
Stack l -> slot_offset l (register_class r)
| _ -> assert false
(* Record live pointers at call points *)
type frame_descr =
{ fd_lbl: int; (* Return address *)
fd_frame_size: int; (* Size of stack frame *)
fd_live_offset: int list } (* Offsets/regs of live addresses *)
let frame_descriptors = ref([] : frame_descr list)
let record_frame_label live =
let lbl = new_label() in
let live_offset = ref [] in
Reg.Set.iter
(function
{typ = Addr; loc = Reg r} ->
live_offset := ((r lsl 1) + 1) :: !live_offset
| {typ = Addr; loc = Stack s} as reg ->
live_offset := slot_offset s (register_class reg) :: !live_offset
| _ -> ())
live;
frame_descriptors :=
{ fd_lbl = lbl;
fd_frame_size = frame_size();
fd_live_offset = !live_offset } :: !frame_descriptors;
lbl
let record_frame live =
let lbl = record_frame_label live in `{emit_label lbl}:`
let emit_frame fd =
` data8 {emit_label fd.fd_lbl}\n`;
` data2 {emit_int fd.fd_frame_size}\n`;
` data2 {emit_int (List.length fd.fd_live_offset)}\n`;
List.iter
(fun n ->
` data2 {emit_int n}\n`)
fd.fd_live_offset;
` .align 8\n`
(* Names of various instructions *)
let name_for_int_operation = function
Iadd -> "add"
| Isub -> "sub"
| Iand -> "and"
| Ior -> "or"
| Ixor -> "xor"
| Ilsl -> "shl"
| Ilsr -> "shru"
| Iasr -> "shr"
| _ -> Misc.fatal_error "Emit.name_for_int_operation"
let name_for_float_operation = function
Inegf -> "fneg"
| Iabsf -> "fabs"
| Iaddf -> "fadd.d"
| Isubf -> "fsub.d"
| Imulf -> "fmpy.d"
| _ -> Misc.fatal_error "Emit.name_for_float_operation"
let name_for_specific_operation = function
Imultaddf -> "fma.d"
| Imultsubf -> "fms.d"
| Isubmultf -> "fnma.d"
| _ -> Misc.fatal_error "Emit.name_for_specific_operation"
let name_for_int_comparison = function
Isigned Ceq -> "eq" | Isigned Cne -> "ne"
| Isigned Cle -> "le" | Isigned Cgt -> "gt"
| Isigned Clt -> "lt" | Isigned Cge -> "ge"
| Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne"
| Iunsigned Cle -> "leu" | Iunsigned Cgt -> "gtu"
| Iunsigned Clt -> "ltu" | Iunsigned Cge -> "geu"
let name_for_swapped_int_comparison = function
Isigned Ceq -> "eq" | Isigned Cne -> "ne"
| Isigned Cle -> "ge" | Isigned Cgt -> "lt"
| Isigned Clt -> "gt" | Isigned Cge -> "le"
| Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne"
| Iunsigned Cle -> "geu" | Iunsigned Cgt -> "ltu"
| Iunsigned Clt -> "gtu" | Iunsigned Cge -> "leu"
let name_for_float_comparison cmp =
match cmp with
Ceq -> "eq" | Cne -> "neq"
| Cle -> "le" | Cgt -> "gt"
| Clt -> "lt" | Cge -> "ge"
(* Immediate range for addl (move) and adds (general add) instructions *)
let is_immediate_addl n = n >= -0x200000 && n < 0x200000
let is_immediate_addl_nat n =
n >= Nativeint.of_int (-0x200000) && n < Nativeint.of_int 0x200000
let is_immediate_adds n = n >= -0x2000 && n < 0x2000
(* Return the positions of all "1" bits in the given integer,
most significant bits first *)
let ones_pos n =
let rec ones p accu =
if p >= 63
then accu
else ones (p+1) (if n land (1 lsl p) = 0 then accu else p :: accu) in
ones 0 []
(* Generate temporary registers *)
let temp_generator temporaries =
let counter = ref 0 in
fun () ->
let r = temporaries.(!counter) in
incr counter;
if !counter >= Array.length temporaries then counter := 0;
r
let new_temp_reg =
temp_generator [| "r2"; "r3"; "r14"; "r15" |]
let new_temp_float =
temp_generator [| "f64"; "f65"; "f66"; "f67";
"f68"; "f69"; "f70"; "f71" |]
let new_pred =
temp_generator [| "p2"; "p3"; "p4"; "p5" |]
(* Output the assembly code for an instruction *)
(* Name of current function *)
let function_name = ref ""
(* Entry point for tail recursive calls *)
let tailrec_entry_point = ref 0
let emit_instr i =
match i.desc with
Lend -> ()
| Lop(Imove | Ispill | Ireload) ->
let src = i.arg.(0) and dst = i.res.(0) in
if src.loc <> dst.loc then begin
match (src.loc, dst.loc) with
(Reg _, Reg _) ->
insert "mov" (regs i.arg) (regs i.res)
| (Reg _, Stack _) ->
let offset = string_of_int (slot_offset_reg dst) in
let r = new_temp_reg() in
insimm "addi" [| "sp" |] offset [| r |];
insert (if i.res.(0).typ = Float then "stfd" else "st8")
[| r; reg src |] [| "stk" ^ offset |]
| (Stack _, Reg _) ->
let offset = string_of_int (slot_offset_reg src) in
let r = new_temp_reg() in
insimm "addi" [| "sp" |] offset [| r |];
insert (if i.arg.(0).typ = Float then "ldfd" else "ld8")
[| r; "stk" ^ offset |] (regs i.res)
| (_, _) ->
assert false
end
| Lop(Iconst_int n) ->
let instr =
if is_immediate_addl_nat n then "movi" else "movil" in
insimm instr [||] (Nativeint.to_string n) (regs i.res)
| Lop(Iconst_float s) ->
let f = float_of_string s in
if f = 0.0 then
insert "mov" [| "f0" |] (regs i.res)
else if f = 1.0 then
insert "mov" [| "f1" |] (regs i.res)
else begin
let tmp = new_temp_reg() in
insimm "movil" [||] (float_bits f) [| tmp |];
insert "setf.d" [| tmp |] (regs i.res)
end
| Lop(Iconst_symbol s) ->
insimm "addi" [| "gp" |] (ltoffset s) (regs i.res);
insert "ld8" (regs i.res) (regs i.res)
| Lop(Icall_ind) ->
insert "movtb" (regs i.arg) [| "b0" |];
insert "brcallind" [| "b0" |] [| "b0" |];
end_basic_block();
`{record_frame i.live}\n`
| Lop(Icall_imm s) ->
insimm "brcall" [||] (symbol s) [| "b0" |];
end_basic_block();
`{record_frame i.live}\n`
| Lop(Itailcall_ind) ->
let n = frame_size() in
insert "movtb" (regs i.arg) [| "b6" |];
if !contains_calls then begin
let tmp = new_temp_reg() in
insimm "addi" [| "sp" |] (string_of_int (n + 8)) [| tmp |];
insert "ld8" [| tmp |] [| tmp |];
insert "mov" [| tmp |] [| "b0" |]
end;
if n > 0 then
insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |];
insert "brind" [| "b6" |] [||];
end_basic_block()
| Lop(Itailcall_imm s) ->
if s = !function_name then begin
insimm "br" [||] (label !tailrec_entry_point) [||]
end else begin
let n = frame_size() in
if !contains_calls then begin
let tmp = new_temp_reg() in
insimm "addi" [| "sp" |] (string_of_int (n + 8)) [| tmp |];
insert "ld8" [| tmp |] [| tmp |];
insert "mov" [| tmp |] [| "b0" |]
end;
if n > 0 then
insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |];
insimm "br" [||] (symbol s) [||]
end;
end_basic_block()
| Lop(Iextcall(s, alloc)) ->
if alloc then begin
let tmp = new_temp_reg() in
insimm "addi" [| "gp" |] (ltoffset_fptr s) [| tmp |];
insert "ld8" [| tmp |] [| "r2" |];
insimm "brcall" [||] "caml_c_call#" [| "b0" |];
end_basic_block();
`{record_frame i.live}\n`
end else begin
insert "mov" [| "gp" |] [| "r7" |];
insimm "brcall" [||] (symbol s) [| "b0" |];
end_basic_block();
insert "mov" [| "r7" |] [| "gp" |]
end
| Lop(Istackoffset n) ->
end_basic_block();
insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |];
stack_offset := !stack_offset + n
| Lop(Iload(chunk, addr)) ->
let load_instr =
match chunk with
| Byte_unsigned -> "ld1"
| Byte_signed -> "ld1"
| Sixteen_unsigned -> "ld2"
| Sixteen_signed -> "ld2"
| Thirtytwo_unsigned -> "ld4"
| Thirtytwo_signed -> "ld4"
| Word -> "ld8"
| Single -> "ldfs"
| Double -> "ldfd"
| Double_u -> "ldfd" in
insert load_instr [| reg i.arg.(0); "heap" |] (regs i.res);
let sext_instr =
match chunk with
Byte_signed -> "sxt1"
| Sixteen_signed -> "sxt2"
| Thirtytwo_signed -> "sxt4"
| _ -> "" in
if sext_instr <> "" then
insert sext_instr (regs i.res) (regs i.res)
| Lop(Istore(chunk, addr)) ->
let store_instr =
match chunk with
| Byte_unsigned -> "st1"
| Byte_signed -> "st1"
| Sixteen_unsigned -> "st2"
| Sixteen_signed -> "st2"
| Thirtytwo_unsigned -> "st4"
| Thirtytwo_signed -> "st4"
| Word -> "st8"
| Single -> "stfs"
| Double -> "stfd"
| Double_u -> "stfd" in
insert store_instr [| reg i.arg.(1); reg i.arg.(0) |] [| "heap" |]
| Lop(Ialloc n) ->
if !fastcode_flag then begin
insimm "addi" [| "r4" |] (string_of_int (-n)) [| "r4" |];
insert "cmp.ltu" [| "r4"; "r5" |] [| "p6"; "p0" |];
insimm "movi" [||] (string_of_int n) [| "r2" |];
insimm "brcallcond" [| "p6" |] "caml_call_gc#" [| "b0" |];
end_basic_block();
`{record_frame i.live}\n`;
insimm "addi" [| "r4" |] "8" (regs i.res)
end else begin
insimm "movi" [||] (string_of_int n) [| "r2" |];
insimm "brcall" [||] "caml_alloc#" [| "b0" |];
end_basic_block();
`{record_frame i.live}\n`;
insimm "addi" [| "r4" |] "8" (regs i.res)
end
| Lop(Iintop Imul) ->
let t1 = new_temp_float() and t2 = new_temp_float() in
insert "setf.sig" [|reg i.arg.(0)|] [| t1 |];
insert "setf.sig" [|reg i.arg.(1)|] [| t2 |];
insert "xmpy.l" [| t1; t2 |] [| t1 |];
insert "getf.sig" [| t1 |] (regs i.res)
| Lop(Iintop(Icomp cmp)) ->
let comp = "cmpp." ^ name_for_int_comparison cmp in
let p1 = new_pred() and p2 = new_pred() in
insert comp (regs i.arg) [| p1; p2 |];
insimm "movicond" [| p1 |] "1" (regs i.res);
insimm "movicond" [| p2 |] "0" (regs i.res)
| Lop(Iintop(Icheckbound)) ->
insert "cmp.leu" (regs i.arg) [| "p6"; "p0" |];
insimm "brcallcondexc" [| "p6" |] "caml_array_bound_error#"
[| "b0"; "heap" |]
| Lop(Iintop op) ->
let instr = name_for_int_operation op in
insert instr (regs i.arg) (regs i.res)
| Lop(Iintop_imm(Imul, n)) ->
let src = reg i.arg.(0) and dst = reg i.res.(0) in
begin match ones_pos n with
[] ->
insimm "movi" [||] "0" [|dst|]
| [n] ->
insimm "shli" [|src|] (string_of_int n) [|dst|]
| [n; 0] when n <= 4 ->
insimm "shladd" [|src; src|] (string_of_int n) [|dst|]
| n1::n2::lst ->
let acc1 = new_temp_reg() and acc2 = new_temp_reg()
and tmp1 = new_temp_reg() and tmp2 = new_temp_reg() in
insimm "shli" [|src|] (string_of_int n1) [|acc1|];
insimm "shli" [|src|] (string_of_int n2) [|acc2|];
let rec add_shifts a1 t1 a2 t2 = function
[] ->
insert "add" [|a1; a2|] [|dst|]
| n::rem ->
if n = 0 then
insert "add" [|src; a1|] [|a1|]
else if n <= 4 then
insimm "shladd" [|src; a1|] (string_of_int n) [|a1|]
else begin
insimm "shli" [|src|] (string_of_int n) [|t1|];
insert "add" [|t1; a1|] [|a1|]
end;
add_shifts a2 t2 a1 t1 rem in
add_shifts acc1 tmp1 acc2 tmp2 lst
end
| Lop(Iintop_imm(Idiv, n)) -> (* n must be a power of 2 *)
let src = regs i.arg and dst = regs i.res in
let p1 = new_pred() and p2 = new_pred() in
let l = Misc.log2 n in
insert "cmpp.lt" [| src.(0); "r0" |] [| p1; p2 |];
if is_immediate_adds (n-1) then
insimm "addicond" [| p1; src.(0) |] (string_of_int (n-1)) dst
else begin
let moveop = if is_immediate_addl (n-1) then "movi" else "movil" in
insimm moveop [||] (string_of_int (n-1)) [| "r2" |];
insert "addcond" [| p1; src.(0); "r2" |] dst
end;
insert "movcond" [| p2; src.(0) |] dst;
insimm "shri" dst (string_of_int l) dst
| Lop(Iintop_imm(Imod, n)) -> (* n must be a power of 2 *)
let src = regs i.arg and dst = regs i.res in
let p = new_pred() in
let l = Misc.log2 n in
insert "cmpp.lt" [| src.(0); "r0" |] [| p; "p0" |];
insimm "extr.u" src (sprintf "0, %d" l) dst;
insert "cmpp.ne.and" [| dst.(0); "r0"; p |] [| p; "p0" |];
if is_immediate_adds (-n) then
insimm "addicond" [| p; dst.(0) |] (string_of_int (-n)) dst
else begin
let moveop = if is_immediate_addl (-n) then "movi" else "movil" in
insimm moveop [||] (string_of_int (-n)) [| "r2" |];
insert "addcond" [| p; dst.(0); "r2" |] dst
end
| Lop(Iintop_imm(Icomp cmp, n)) ->
let comp = "cmppi." ^ name_for_swapped_int_comparison cmp in
let p1 = new_pred() and p2 = new_pred() in
insimm comp (regs i.arg) (string_of_int n) [| p1; p2 |];
insimm "movicond" [| p1 |] "1" (regs i.res);
insimm "movicond" [| p2 |] "0" (regs i.res)
| Lop(Iintop_imm(Icheckbound, n)) ->
insimm "cmpi.geu" (regs i.arg) (string_of_int n) [| "p6"; "p0" |];
insimm "brcallcondexc" [| "p6" |] "caml_array_bound_error#"
[| "b0"; "heap" |]
| Lop(Iintop_imm(op, n)) ->
let instr = name_for_int_operation op ^ "i" in
insimm instr (regs i.arg) (string_of_int n) (regs i.res)
| Lop(Inegf | Iabsf | Iaddf | Isubf | Imulf as op) ->
let instr = name_for_float_operation op in
insert instr (regs i.arg) (regs i.res)
| Lop(Idivf) ->
(* Straight from the IA64 application developer's architecture guide,
section 13.3.3.1. Modified so that the destination may be equal
to one of the operands *)
let a = reg i.arg.(0) and b = reg i.arg.(1) and r = reg i.res.(0)
and t1 = new_temp_float() and t2 = new_temp_float()
and t3 = new_temp_float() and t4 = new_temp_float()
and p = new_pred() in
insert "frcpa" [| a; b |] [| t1; p |];
insert "fmas1cond" [| p; a; t1; "f0" |] [| t2 |];
insert "fnmas1cond" [| p; b; t1; "f1" |] [| t3 |];
insert "fmas1cond" [| p; t3; t3; t2 |] [| t2 |];
insert "fmas1cond" [| p; t3; t3; "f0" |] [| t4 |];
insert "fmas1cond" [| p; t3; t1; t1 |] [| t1 |];
insert "fmas1cond" [| p; t4; t2; t2 |] [| t2 |];
insert "fmas1cond" [| p; t4; t4; "f0" |] [| t3 |];
insert "fmas1cond" [| p; t4; t1; t1 |] [| t1 |];
insert "fmads1cond" [| p; t3; t2; t2 |] [| t2 |];
insert "fmas1cond" [| p; t3; t1; t1 |] [| t1 |];
insert "fnmads1cond" [| p; b; t2; a |] [| t3 |];
insert "mov" [| t1 |] [| r |];
insert "fmacond" [| p; t3; t1; t2 |] [| r |]
| Lop(Ifloatofint) ->
let src = regs i.arg and dst = regs i.res in
insert "setf.sig" src dst;
insert "fcvt.xf" dst dst;
insert "fnorm.d" dst dst
| Lop(Iintoffloat) ->
let src = regs i.arg and dst = regs i.res and tmp = new_temp_float() in
insert "fcvt.fx.trunc" src [| tmp |];
insert "getf.sig" [| tmp |] dst
| Lop(Ispecific(Iadd1)) ->
let s = if Array.length i.arg >= 2 then 1 else 0 in
insimm "add1" [| reg i.arg.(0); reg i.arg.(s) |] "1" (regs i.res)
| Lop(Ispecific(Isub1)) ->
insimm "sub1" (regs i.arg) "1" (regs i.res)
| Lop(Ispecific(Ishladd n)) ->
insimm "shladd" (regs i.arg) (string_of_int n) (regs i.res)
| Lop(Ispecific(Isignextend n)) ->
let op = "sxt" ^ string_of_int n in
insert op (regs i.arg) (regs i.res)
| Lop(Ispecific (Imultaddf | Imultsubf | Isubmultf as sop)) ->
let name = name_for_specific_operation sop in
insert name (regs i.arg) (regs i.res)
| Lop(Ispecific (Istoreincr n)) ->
let op = if i.arg.(1).typ = Float then "stfd+" else "st8+" in
insimm op [| reg i.arg.(0); reg i.arg.(1) |]
(string_of_int n)
[| reg i.res.(0); "heapinit" |]
| Lop(Ispecific Iinitbarrier) ->
insert "#initbarrier" [| "heapinit" |] [| "heap" |]
| Lreloadretaddr ->
let n = frame_size() + 8 in
let tmp = new_temp_reg() in
insimm "addi" [| "sp" |] (string_of_int n) [| tmp |];
insert "ld8" [| tmp |] [| tmp |];
insert "movtb" [| tmp |] [| "b0" |]
| Lreturn ->
let n = frame_size() in
if n > 0 then
insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |];
insert "brret" [| "b0" |] [||];
end_basic_block()
| Llabel lbl ->
end_basic_block();
`{emit_label lbl}:\n`
| Lbranch lbl ->
insimm "br" [||] (label lbl) [||];
end_basic_block()
| Lcondbranch(tst, lbl) ->
begin match tst with
Itruetest ->
insimm "cmpi.ne" (regs i.arg) "0" [| "p6"; "p0" |]
| Ifalsetest ->
insimm "cmpi.eq" (regs i.arg) "0" [| "p6"; "p0" |]
| Iinttest cmp ->
let comp = "cmp." ^ name_for_int_comparison cmp in
insert comp (regs i.arg) [| "p6"; "p0" |]
| Iinttest_imm(cmp, n) ->
let comp = "cmpi." ^ name_for_swapped_int_comparison cmp in
insimm comp (regs i.arg) (string_of_int n) [| "p6"; "p0" |]
| Ifloattest(cmp, neg) ->
let comp = "fcmp." ^ name_for_float_comparison cmp in
insert comp (regs i.arg)
(if neg then [| "p0"; "p6" |]
else [| "p6"; "p0" |])
| Ioddtest ->
insimm "tbit.nz" (regs i.arg) "0" [| "p6"; "p0" |]
| Ieventest ->
insimm "tbit.z" (regs i.arg) "0" [| "p6"; "p0" |]
end;
insimm "brcond" [| "p6" |] (label lbl) [||];
end_basic_block()
| Lcondbranch3(lbl0, lbl1, lbl2) ->
end_basic_block();
let emit_compare n p = function
None -> ()
| Some lbl ->
` cmp.eq p{emit_int p}, p0 = {emit_int n}, {emit_reg i.arg.(0)}\n` in
let emit_branch p = function
None -> ()
| Some lbl ->
` (p{emit_int p}) br {emit_label lbl}\n` in
emit_compare 0 5 lbl0; emit_compare 1 6 lbl1; emit_compare 2 7 lbl2;
emit_branch 5 lbl0; emit_branch 6 lbl1; emit_branch 7 lbl2
| Lswitch jumptbl ->
end_basic_block();
let numcases = Array.length jumptbl in
if numcases <= 9 then begin
for j = 0 to numcases / 3 do
let n = j * 3 in
for k = 0 to 2 do
if n + k < numcases - 1 then
` cmp.eq p{emit_int(k+5)}, p0 = {emit_int (n+k)}, {emit_reg i.arg.(0)}\n`
done;
for k = 0 to 2 do
if n + k < numcases - 1 then
` (p{emit_int(k+5)}) br {emit_label jumptbl.(n+k)}\n`
else if n + k = numcases - 1 then
` br {emit_label jumptbl.(n+k)}\n`
done;
` ;;\n`
done
end else if numcases <= 47 then begin
` mov r2 = 1\n`;
` cmp.eq p6, p0 = 0, {emit_reg i.arg.(0)}\n`;
` (p6) br {emit_label jumptbl.(0)} ;;\n`;
` shl r2 = r2, {emit_reg i.arg.(0)}\n`;
` cmp.eq p7, p0 = 1, {emit_reg i.arg.(0)}\n`;
` (p7) br {emit_label jumptbl.(1)} ;;\n`;
` mov pr = r2, -1 ;;\n`;
for i = 2 to numcases - 1 do
` (p{emit_int i}) br {emit_label jumptbl.(i)}\n`
done;
` ;;\n`
end else begin
let lbl_jumptbl = new_label() in
let lbl_ip = new_label() in
`{emit_label lbl_ip}: mov r2 = ip ;;\n`;
` add r2 = {emit_label lbl_jumptbl} - {emit_label lbl_ip}, r2 ;;\n`;
` shladd r3 = {emit_reg i.arg.(0)}, 2, r2 ;;\n`;
` ld4 r3 = [r3] ;;\n`;
` sxt4 r3 = r3 ;;\n`;
` add r2 = r2, r3 ;;\n`;
` mov b6 = r2 ;;\n`;
` br b6 ;;\n`;
` .align 4\n`;
`{emit_label lbl_jumptbl}:\n`;
for i = 0 to numcases - 1 do
` data4 {emit_label jumptbl.(i)} - {emit_label lbl_jumptbl}\n`
done;
` .align 16\n`
end
| Lsetuptrap lbl ->
end_basic_block();
let lbl_ip = new_label() in
let lbl_next = new_label() in
`{emit_label lbl_ip}: mov r2 = ip ;;\n`;
` add r2 = {emit_label lbl_next} - {emit_label lbl_ip}, r2\n`;
` br.sptk {emit_label lbl} ;;\n`;
`{emit_label lbl_next}:\n`
| Lpushtrap ->
end_basic_block();
stack_offset := !stack_offset + 16;
(* Store trap pointer at sp, handler addr at sp+8,
and decrement sp by 16. Remember, the bottom 16 bytes
of the stack must be left free. *)
` add r3 = 8, sp\n`;
` st8 [sp] = r6, -16 ;;\n`;
` st8 [r3] = r2\n`;
` add r6 = 16, sp ;;\n`
| Lpoptrap ->
end_basic_block();
` add sp = 16, sp ;;\n`;
` ld8 r6 = [sp] ;;\n`;
stack_offset := !stack_offset - 16
| Lraise ->
end_basic_block();
` mov sp = r6\n`;
` add r2 = 8, r6\n`;
` ld8 r6 = [r6] ;;\n`;
` ld8 r2 = [r2] ;;\n`;
` mov b6 = r2 ;;\n`;
` br b6\n`
let rec emit_all i =
match i.desc with Lend -> () | _ -> emit_instr i; emit_all i.next
(* Check if a function contains a tail call to itself *)
let rec is_tailrec i =
match i.desc with
Lend -> false
| Lop(Itailcall_imm s) when s = !function_name -> true
| _ -> is_tailrec i.next
(* Emission of a function declaration *)
let fundecl f =
function_name := f.fun_name;
fastcode_flag := f.fun_fast;
stack_offset := 0;
` .text\n`;
` .align 4\n`;
` .global {emit_symbol f.fun_name}#\n`;
` .proc {emit_symbol f.fun_name}#\n`;
`{emit_symbol f.fun_name}:\n`;
let n = frame_size() in
if !contains_calls then begin
insert "movfb" [| "b0" |] [| "r2" |];
insimm "addi" [| "sp" |] "8" [| "r3" |];
insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |];
insert "st8" [| "r3"; "r2" |] [||]
end
else if n > 0 then
insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |];
if is_tailrec f.fun_body then begin
tailrec_entry_point := new_label();
end_basic_block();
`{emit_label !tailrec_entry_point}:\n`
end;
emit_all f.fun_body;
end_basic_block();
` .endp {emit_symbol f.fun_name}#\n`
(* Emission of data *)
let emit_define_symbol s =
` .global {emit_symbol s}#\n`;
` .type {emit_symbol s}#, @object\n`;
` .size {emit_symbol s}#, 8\n`;
`{emit_symbol s}:\n`
let emit_item = function
Cdefine_symbol s ->
emit_define_symbol s
| Cdefine_label lbl ->
`{emit_label (100000 + lbl)}:\n`
| Cint8 n ->
` data1 {emit_int n}\n`
| Cint16 n ->
` data2 {emit_int n}\n`
| Cint32 n ->
let n' = Nativeint.shift_right (Nativeint.shift_left n 32) 32 in
` data4 {emit_nativeint n'}\n`
| Cint n ->
` data8 {emit_nativeint n}\n`
| Csingle f ->
` real4 {emit_string f}\n`
| Cdouble f ->
` real8 {emit_string f}\n`
| Csymbol_address s ->
` data8 {emit_symbol s}#\n`
| Clabel_address lbl ->
` data8 {emit_label (100000 + lbl)}\n`
| Cstring s ->
emit_string_directive " string " s
| Cskip n ->
if n > 0 then ` .skip {emit_int n}\n`
| Calign n ->
` .align {emit_int n}\n`
let data l =
` .data\n`;
` .align 8\n`;
List.iter emit_item l
(* Beginning / end of an assembly file *)
let begin_assembly() =
` .data\n`;
emit_define_symbol (Compilenv.current_unit_name() ^ "__data_begin");
` .text\n`;
emit_define_symbol (Compilenv.current_unit_name() ^ "__code_begin")
let end_assembly () =
` .data\n`;
emit_define_symbol (Compilenv.current_unit_name() ^ "__data_end");
` .text\n`;
emit_define_symbol (Compilenv.current_unit_name() ^ "__code_end");
` .rodata\n`;
` .align 8\n`;
emit_define_symbol (Compilenv.current_unit_name() ^ "__frametable");
` data8 {emit_int (List.length !frame_descriptors)}\n`;
List.iter emit_frame !frame_descriptors;
frame_descriptors := []