1320 lines
52 KiB
Plaintext
1320 lines
52 KiB
Plaintext
(***********************************************************************)
|
|
(* *)
|
|
(* Objective Caml *)
|
|
(* *)
|
|
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
|
|
(* *)
|
|
(* Copyright 2000 Institut National de Recherche en Informatique et *)
|
|
(* en Automatique. All rights reserved. This file is distributed *)
|
|
(* under the terms of the Q Public License version 1.0. *)
|
|
(* *)
|
|
(***********************************************************************)
|
|
|
|
(* $Id$ *)
|
|
|
|
(* Emission of IA64 assembly code *)
|
|
|
|
open Printf
|
|
open Misc
|
|
open Cmm
|
|
open Arch
|
|
open Proc
|
|
open Reg
|
|
open Mach
|
|
open Linearize
|
|
open Emitaux
|
|
|
|
(************** Part 1: assembly-level scheduler *******************)
|
|
|
|
(* Representation of resources accessed or produced by instructions *)
|
|
|
|
type resource = string
|
|
(* A resource is either:
|
|
- a register name
|
|
- "stkN" for a stack location
|
|
- "heap" for the Caml heap
|
|
- "chkN" for the result of a checkbound instruction *)
|
|
|
|
let is_memory_resource rsrc =
|
|
String.length rsrc >= 4 &&
|
|
begin match String.sub rsrc 0 3 with
|
|
"stk" -> true
|
|
| "hea" -> true
|
|
| "chk" -> true
|
|
| _ -> false
|
|
end
|
|
|
|
let is_mutable_resource rsrc =
|
|
rsrc <> "r0" && rsrc <> "p0"
|
|
|
|
(* Description of instructions *)
|
|
|
|
type instruction_kind =
|
|
KA (* A type instruction (int or mem unit) *)
|
|
| KB (* B type instruction (branch unit) *)
|
|
| KI (* I type instruction (int unit *)
|
|
| KF (* F type instruction (FP unit) *)
|
|
| KM (* M type instruction (mem unit) *)
|
|
| KB_exc (* B type instruction, exceptional condition,
|
|
can be moved around *)
|
|
|
|
type instruction_format =
|
|
F_i (* op imm *)
|
|
| F_i_pred (* (pred) op imm *)
|
|
| F_ir_rr (* op p1,p2 = imm, r *)
|
|
| F_ir_r (* op r = imm, r *)
|
|
| F_ir_r_pred (* (pred) op r = imm, r *)
|
|
| F_ld (* op r = [r] *)
|
|
| F_ld_post (* op r = [r], imm *)
|
|
| F_r (* op r *)
|
|
| F_i_r (* op r = imm *)
|
|
| F_i_r_pred (* (pred) op r = imm *)
|
|
| F_ri_rr (* op p1,p2 = imm, r *)
|
|
| F_ri_r (* op r = imm, r *)
|
|
| F_r_r (* op r = r *)
|
|
| F_r_r_pred (* (pred) op r = r *)
|
|
| F_rr_rr (* op p1,p2 = r1, r2 *)
|
|
| F_r_rir (* op r = r1, imm, r2 *)
|
|
| F_rr_r (* op r = r1, r2 *)
|
|
| F_rr_r_pred (* (pred) op r = r1, r2 *)
|
|
| F_rri_r (* op r = r1, r2, imm *)
|
|
| F_rrr_r (* op r = r1, r2, r3 *)
|
|
| F_rrr_r_pred (* (pred) op r = r1, r2, r3 *)
|
|
| F_st (* op [r] = r *)
|
|
| F_st_post (* op [r] = r, imm *)
|
|
|
|
type instruction_descr =
|
|
{ opcode: string; (* actual opcode *)
|
|
latency: int; (* latency in cycles *)
|
|
kind: instruction_kind; (* kind of instruction *)
|
|
format: instruction_format } (* how to generate asm for it *)
|
|
|
|
let instruction_table = create_hashtable 73 [
|
|
"add", {opcode = "add"; latency = 1; kind = KA; format = F_rr_r};
|
|
"add1", {opcode = "add"; latency = 1; kind = KA; format = F_rri_r};
|
|
"addcond", {opcode = "add"; latency = 1; kind = KA; format = F_rr_r_pred};
|
|
"addi", {opcode = "add"; latency = 1; kind = KA; format = F_ir_r};
|
|
"addicond", {opcode = "add"; latency = 1; kind = KA; format = F_ir_r_pred};
|
|
"and", {opcode = "and"; latency = 1; kind = KA; format = F_rr_r};
|
|
"andi", {opcode = "and"; latency = 1; kind = KA; format = F_ir_r};
|
|
"br", {opcode = "br.sptk.many"; latency = 1; kind = KB; format = F_i};
|
|
"brret", {opcode = "br.ret.sptk"; latency = 1; kind = KB; format = F_r};
|
|
"brcall", {opcode = "br.call.sptk.many"; latency = 1; kind = KB; format = F_i_r};
|
|
"brcallcond", {opcode = "br.call.spnt.many"; latency = 1; kind = KB; format = F_i_r_pred};
|
|
"brcallcondexc", {opcode = "br.call.spnt.many"; latency = 1; kind = KB_exc; format = F_i_r_pred};
|
|
"brcallind", {opcode = "br.call.sptk.many"; latency = 1; kind = KB; format = F_r_r};
|
|
"brcond", {opcode = "br.dpnt.many"; latency = 1; kind = KB; format = F_i_pred};
|
|
"brind", {opcode = "br.sptk.many"; latency = 1; kind = KB; format = F_r};
|
|
"cmp.eq", {opcode = "cmp.eq"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.ge", {opcode = "cmp.ge"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.geu", {opcode = "cmp.geu"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.gt", {opcode = "cmp.gt"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.gtu", {opcode = "cmp.gtu"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.le", {opcode = "cmp.le"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.leu", {opcode = "cmp.leu"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.lt", {opcode = "cmp.lt"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.ltu", {opcode = "cmp.ltu"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmp.ne", {opcode = "cmp.ne"; latency = 0; kind = KA; format = F_rr_rr};
|
|
"cmpi.eq", {opcode = "cmp.eq"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.ge", {opcode = "cmp.ge"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.geu", {opcode = "cmp.geu"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.gt", {opcode = "cmp.gt"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.gtu", {opcode = "cmp.gtu"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.le", {opcode = "cmp.le"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.leu", {opcode = "cmp.leu"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.lt", {opcode = "cmp.lt"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.ltu", {opcode = "cmp.ltu"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpi.ne", {opcode = "cmp.ne"; latency = 0; kind = KA; format = F_ir_rr};
|
|
"cmpp.eq", {opcode = "cmp.eq"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.ge", {opcode = "cmp.ge"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.geu", {opcode = "cmp.geu"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.gt", {opcode = "cmp.gt"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.gtu", {opcode = "cmp.gtu"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.le", {opcode = "cmp.le"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.leu", {opcode = "cmp.leu"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.lt", {opcode = "cmp.lt"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.ltu", {opcode = "cmp.ltu"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.ne", {opcode = "cmp.ne"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmpp.ne.and", {opcode = "cmp.ne.and"; latency = 1; kind = KA; format = F_rr_rr};
|
|
"cmppi.eq", {opcode = "cmp.eq"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.ge", {opcode = "cmp.ge"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.geu", {opcode = "cmp.geu"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.gt", {opcode = "cmp.gt"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.gtu", {opcode = "cmp.gtu"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.le", {opcode = "cmp.le"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.leu", {opcode = "cmp.leu"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.lt", {opcode = "cmp.lt"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.ltu", {opcode = "cmp.ltu"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"cmppi.ne", {opcode = "cmp.ne"; latency = 1; kind = KA; format = F_ir_rr};
|
|
"extr.u", {opcode = "extr.u"; latency = 1; kind = KI; format = F_ri_r};
|
|
"fabs", {opcode = "fabs"; latency = 1; kind = KF; format = F_r_r};
|
|
"fadd.d", {opcode = "fadd.d"; latency = 5; kind = KF; format = F_rr_r};
|
|
"fcmp.eq", {opcode = "fcmp.eq"; latency = 1; kind = KF; format = F_rr_rr};
|
|
"fcmp.ge", {opcode = "fcmp.ge"; latency = 1; kind = KF; format = F_rr_rr};
|
|
"fcmp.gt", {opcode = "fcmp.gt"; latency = 1; kind = KF; format = F_rr_rr};
|
|
"fcmp.le", {opcode = "fcmp.le"; latency = 1; kind = KF; format = F_rr_rr};
|
|
"fcmp.lt", {opcode = "fcmp.lt"; latency = 1; kind = KF; format = F_rr_rr};
|
|
"fcmp.neq", {opcode = "fcmp.neq"; latency = 1; kind = KF; format = F_rr_rr};
|
|
"fcvt.fx.trunc", {opcode = "fcvt.fx.trunc"; latency = 7; kind = KF; format = F_r_r};
|
|
"fcvt.xf", {opcode = "fcvt.xf"; latency = 5; kind = KF; format = F_r_r};
|
|
"fma.d", {opcode = "fma.d"; latency = 5; kind = KF; format = F_rrr_r};
|
|
"fmacond", {opcode = "fma.d.s0"; latency = 5; kind = KF; format = F_rrr_r_pred};
|
|
"fmas1cond", {opcode = "fma.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
|
|
"fmads1cond", {opcode = "fma.d.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
|
|
"fmpy.d", {opcode = "fmpy.d"; latency = 5; kind = KF; format = F_rr_r};
|
|
"fms.d", {opcode = "fms.d"; latency = 5; kind = KF; format = F_rrr_r};
|
|
"fneg", {opcode = "fneg"; latency = 1; kind = KF; format = F_r_r};
|
|
"fnma.d", {opcode = "fnma.d"; latency = 5; kind = KF; format = F_rrr_r};
|
|
"fnmas1cond", {opcode = "fnma.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
|
|
"fnmads1cond", {opcode = "fnma.d.s1"; latency = 5; kind = KF; format = F_rrr_r_pred};
|
|
"fnorm.d", {opcode = "fnorm.d"; latency = 5; kind = KF; format = F_r_r};
|
|
"frcpa", {opcode = "frcpa.s0"; latency = 5; kind = KF; format = F_rr_rr};
|
|
"fsub.d", {opcode = "fsub.d"; latency = 5; kind = KF; format = F_rr_r};
|
|
"getf.sig", {opcode = "getf.sig"; latency = 2; kind = KM; format = F_r_r};
|
|
"ld1", {opcode = "ld1"; latency = 2; kind = KM; format = F_ld};
|
|
"ld2", {opcode = "ld2"; latency = 2; kind = KM; format = F_ld};
|
|
"ld4", {opcode = "ld4"; latency = 2; kind = KM; format = F_ld};
|
|
"ld8", {opcode = "ld8"; latency = 2; kind = KM; format = F_ld};
|
|
"ld8+", {opcode = "ld8"; latency = 2; kind = KM; format = F_ld_post};
|
|
"ldfd", {opcode = "ldfd"; latency = 9; kind = KM; format = F_ld};
|
|
"ldfd+", {opcode = "ldfd"; latency = 9; kind = KM; format = F_ld_post};
|
|
"ldfs", {opcode = "ldfs"; latency = 9; kind = KM; format = F_ld};
|
|
"mov", {opcode = "mov"; latency = 1; kind = KA; format = F_r_r};
|
|
"movcond", {opcode = "mov"; latency = 1; kind = KA; format = F_r_r_pred};
|
|
"movtb", {opcode = "mov"; latency = 9; kind = KI; format = F_r_r};
|
|
"movfb", {opcode = "mov"; latency = 2; kind = KI; format = F_r_r};
|
|
"movi", {opcode = "mov"; latency = 1; kind = KA; format = F_i_r};
|
|
"movicond", {opcode = "mov"; latency = 1; kind = KA; format = F_i_r_pred};
|
|
"movil", {opcode = "movl"; latency = 1; kind = KI; format = F_i_r};
|
|
"movpr", {opcode = "mov"; latency = 1; kind = KI; format = F_ri_r};
|
|
"or", {opcode = "or"; latency = 1; kind = KA; format = F_rr_r};
|
|
"ori", {opcode = "or"; latency = 1; kind = KA; format = F_ir_r};
|
|
"setf.d", {opcode = "setf.d"; latency = 8; kind = KM; format = F_r_r};
|
|
"setf.sig", {opcode = "setf.sig"; latency = 8; kind = KM; format = F_r_r};
|
|
"shl", {opcode = "shl"; latency = 2; kind = KI; format = F_rr_r};
|
|
"shladd", {opcode = "shladd"; latency = 1; kind = KA; format = F_r_rir};
|
|
"shli", {opcode = "shl"; latency = 1; kind = KI; format = F_ri_r};
|
|
"shr", {opcode = "shr"; latency = 2; kind = KI; format = F_rr_r};
|
|
"shri", {opcode = "shr"; latency = 1; kind = KI; format = F_ri_r};
|
|
"shru", {opcode = "shr.u"; latency = 2; kind = KI; format = F_rr_r};
|
|
"shrui", {opcode = "shr.u"; latency = 1; kind = KI; format = F_ri_r};
|
|
"st1", {opcode = "st1"; latency = 0; kind = KM; format = F_st};
|
|
"st2", {opcode = "st2"; latency = 0; kind = KM; format = F_st};
|
|
"st4", {opcode = "st4"; latency = 0; kind = KM; format = F_st};
|
|
"st8", {opcode = "st8"; latency = 0; kind = KM; format = F_st};
|
|
"st8+", {opcode = "st8"; latency = 1; kind = KM; format = F_st_post};
|
|
"stfd", {opcode = "stfd"; latency = 0; kind = KM; format = F_st};
|
|
"stfd+", {opcode = "stfd"; latency = 1; kind = KM; format = F_st_post};
|
|
"stfs", {opcode = "stfs"; latency = 0; kind = KM; format = F_st};
|
|
"sub", {opcode = "sub"; latency = 1; kind = KA; format = F_rr_r};
|
|
"sub1", {opcode = "sub"; latency = 1; kind = KA; format = F_rri_r};
|
|
"subi", {opcode = "sub"; latency = 1; kind = KA; format = F_ir_r};
|
|
"sxt1", {opcode = "sxt1"; latency = 1; kind = KI; format = F_r_r};
|
|
"sxt2", {opcode = "sxt2"; latency = 1; kind = KI; format = F_r_r};
|
|
"sxt4", {opcode = "sxt4"; latency = 1; kind = KI; format = F_r_r};
|
|
"tbit.nz", {opcode = "tbit.nz"; latency = 0; kind = KI; format = F_ri_rr};
|
|
"tbit.z", {opcode = "tbit.z"; latency = 0; kind = KI; format = F_ri_rr};
|
|
"xmpy.l", {opcode = "xmpy.l"; latency = 7; kind = KF; format = F_rr_r};
|
|
"xor", {opcode = "xor"; latency = 1; kind = KA; format = F_rr_r};
|
|
"xori", {opcode = "xor"; latency = 1; kind = KA; format = F_ir_r};
|
|
"#initbarrier", {opcode = "# init barrier"; latency = 0; kind = KI; format = F_i};
|
|
]
|
|
|
|
(* Nodes of the code DAG. Each node represents one instruction to be
|
|
emitted. *)
|
|
|
|
type code_dag_node =
|
|
{ instr: instruction_descr; (* the instruction *)
|
|
imm: string; (* its immediate argument, if any *)
|
|
iarg: resource array; (* arguments *)
|
|
ires: resource array; (* results *)
|
|
delay: int; (* how many cycles before result is available *)
|
|
mutable sons: (code_dag_node * int) list;
|
|
(* nodes that depend on this node *)
|
|
mutable date: int; (* start date *)
|
|
mutable length: int; (* length of longest path to result *)
|
|
mutable ancestors: int; (* number of ancestors *)
|
|
mutable emitted_ancestors: int } (* number of emitted ancestors *)
|
|
|
|
(* The code dag itself is represented by two tables from resources to nodes:
|
|
- "results" maps resources to the instructions that produced them;
|
|
- "uses" maps resources to the instructions that use them. *)
|
|
|
|
let code_results = (Hashtbl.create 31 : (resource, code_dag_node) Hashtbl.t)
|
|
let code_uses = (Hashtbl.create 31 : (resource, code_dag_node) Hashtbl.t)
|
|
|
|
let clear_code_dag () =
|
|
Hashtbl.clear code_results;
|
|
Hashtbl.clear code_uses
|
|
|
|
(* The ready queue: a list of nodes that can be computed immediately
|
|
(all arguments are available), kept sorted by decreasing length to results.
|
|
|
|
The in progress queue: a list of nodes whose arguments are being computed,
|
|
and thus can be computed at a later date, kept sorted by increasing
|
|
availability date
|
|
|
|
The branch list: a list of all branch instructions (to be emitted last) *)
|
|
|
|
let ready_queue = ref ([] : code_dag_node list)
|
|
let in_progress_queue = ref ([] : code_dag_node list)
|
|
let branch_list = ref ([] : code_dag_node list) (* built in reverse order *)
|
|
|
|
let clear_queues () =
|
|
ready_queue := []; in_progress_queue := []; branch_list := []
|
|
|
|
let rec insert_queue prio node = function
|
|
[] -> [node]
|
|
| hd :: tl as queue ->
|
|
if prio node hd then node :: queue else hd :: insert_queue prio node tl
|
|
|
|
let length_prio n1 n2 = n1.length > n2.length
|
|
let date_prio n1 n2 = n1.date < n2.date
|
|
|
|
let add_ready node =
|
|
ready_queue := insert_queue length_prio node !ready_queue
|
|
let add_in_progress node =
|
|
in_progress_queue := insert_queue date_prio node !in_progress_queue
|
|
let add_branch node =
|
|
branch_list := node :: !branch_list
|
|
|
|
(* Add an edge to the code DAG *)
|
|
|
|
let add_edge ancestor son delay =
|
|
ancestor.sons <- (son, delay) :: ancestor.sons;
|
|
son.ancestors <- son.ancestors + 1
|
|
|
|
let add_edge_after son ancestor = add_edge ancestor son 0
|
|
|
|
(* Add an instruction to the code DAG *)
|
|
|
|
let insimm opc arg imm res =
|
|
let instr =
|
|
try
|
|
Hashtbl.find instruction_table opc
|
|
with Not_found ->
|
|
fatal_error ("Unknown instruction " ^ opc) in
|
|
let node =
|
|
{ instr = instr;
|
|
imm = imm;
|
|
iarg = arg;
|
|
ires = res;
|
|
delay = instr.latency;
|
|
sons = []; (* to be filled later *)
|
|
date = 0; (* to be adjusted later *)
|
|
length = -1; (* to be computed later *)
|
|
ancestors = 0; (* ditto *)
|
|
emitted_ancestors = 0 } in (* ditto *)
|
|
(* RAW dependencies: add edges from all instrs that define one of the
|
|
resources used *)
|
|
for i = 0 to Array.length arg - 1 do
|
|
try
|
|
let rsrc = arg.(i) in
|
|
if is_mutable_resource rsrc then begin
|
|
let anc = Hashtbl.find code_results rsrc in
|
|
let delay = if is_memory_resource rsrc then 0 else anc.delay in
|
|
(* Memory accesses are ordered by the hardware, so we can emit
|
|
a memop 1, then a dependent memop 2 in the same cycle *)
|
|
add_edge anc node delay
|
|
end
|
|
with Not_found ->
|
|
()
|
|
done;
|
|
(* WAR dependencies: add edges from all instrs that use one of the
|
|
resources defined by this instruction
|
|
WAW dependencies: add edges from all instrs that define one of the
|
|
resources defined by this instruction *)
|
|
for i = 0 to Array.length res - 1 do
|
|
let rsrc = res.(i) in
|
|
if is_mutable_resource rsrc then begin
|
|
(* WAR *)
|
|
let anc = Hashtbl.find_all code_uses res.(i) in
|
|
List.iter (add_edge_after node) anc;
|
|
(* WAW *)
|
|
try
|
|
let anc = Hashtbl.find code_results rsrc in
|
|
let delay = if is_memory_resource rsrc then 0 else 1 in
|
|
add_edge anc node delay
|
|
with Not_found ->
|
|
()
|
|
end
|
|
done;
|
|
(* Remember the results and uses of this instruction *)
|
|
for i = 0 to Array.length res - 1 do
|
|
Hashtbl.add code_results res.(i) node
|
|
done;
|
|
for i = 0 to Array.length arg - 1 do
|
|
Hashtbl.add code_uses arg.(i) node
|
|
done;
|
|
(* Insert in appropriate queue *)
|
|
if node.instr.kind = KB
|
|
then add_branch node
|
|
else if node.ancestors = 0 then add_ready node
|
|
|
|
let insert opc arg res =
|
|
insimm opc arg "" res
|
|
|
|
(* Compute length of longest path to a result. *)
|
|
|
|
let rec longest_path node =
|
|
if node.length < 0 then begin
|
|
node.length <-
|
|
List.fold_left
|
|
(fun len (son, delay) -> max len (longest_path son + delay))
|
|
0 node.sons
|
|
end;
|
|
node.length
|
|
|
|
(* Emit the assembly code for a node *)
|
|
|
|
let emit_r = emit_string
|
|
|
|
let emit_instr node =
|
|
let opc = node.instr.opcode
|
|
and a = node.iarg
|
|
and r = node.ires
|
|
and imm = node.imm in
|
|
match node.instr.format with
|
|
F_i ->
|
|
` {emit_string opc} {emit_string imm}\n`
|
|
| F_i_pred ->
|
|
` ({emit_r a.(0)}) {emit_string opc} {emit_string imm}\n`
|
|
| F_ir_rr ->
|
|
` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_string imm}, {emit_r a.(0)}\n`
|
|
| F_ir_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_string imm}, {emit_r a.(0)}\n`
|
|
| F_ir_r_pred ->
|
|
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_string imm}, {emit_r a.(1)}\n`
|
|
| F_ld ->
|
|
` {emit_string opc} {emit_r r.(0)} = [{emit_r a.(0)}]\n`
|
|
| F_ld_post ->
|
|
` {emit_string opc} {emit_r r.(0)} = [{emit_r a.(0)}], {emit_string imm}\n`
|
|
| F_r ->
|
|
` {emit_string opc} {emit_r a.(0)}\n`
|
|
| F_i_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_string imm}\n`
|
|
| F_i_r_pred ->
|
|
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_string imm}\n`
|
|
| F_ri_rr ->
|
|
` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_r a.(0)}, {emit_string imm}\n`
|
|
| F_ri_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_string imm}\n`
|
|
| F_r_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}\n`
|
|
| F_r_r_pred ->
|
|
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}\n`
|
|
| F_rr_rr ->
|
|
` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_r a.(0)}, {emit_r a.(1)}\n`
|
|
| F_r_rir ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_string imm}, {emit_r a.(1)}\n`
|
|
| F_rr_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}\n`
|
|
| F_rr_r_pred ->
|
|
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}, {emit_r a.(2)}\n`
|
|
| F_rri_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}, {emit_string imm}\n`
|
|
| F_rrr_r ->
|
|
` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}, {emit_r a.(2)}\n`
|
|
| F_rrr_r_pred ->
|
|
` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}, {emit_r a.(2)}, {emit_r a.(3)}\n`
|
|
| F_st ->
|
|
` {emit_string opc} [{emit_r a.(0)}] = {emit_r a.(1)}\n`
|
|
| F_st_post ->
|
|
` {emit_string opc} [{emit_r a.(0)}] = {emit_r a.(1)}, {emit_string imm}\n`
|
|
|
|
(* Little state machine reflecting how many instructions the chip can
|
|
issue in one cycle. We roughly follow the Itanium model:
|
|
2 int units, 2 mem units, 2 FP units, and 3 branch units,
|
|
with a maximum of 6 instructions dispatched per clock cycle. *)
|
|
|
|
let num_A = ref 0
|
|
let num_I = ref 0
|
|
let num_M = ref 0
|
|
let num_F = ref 0
|
|
let num_B = ref 0
|
|
|
|
let reset_issue () =
|
|
num_A := 0; num_I := 0; num_M := 0; num_F := 0; num_B := 0
|
|
|
|
let can_issue instr =
|
|
if !num_A + !num_I + !num_M + !num_F + !num_B >= 6 then false else begin
|
|
match instr.kind with
|
|
KA ->
|
|
if !num_A + !num_I + !num_M < 4
|
|
then (incr num_A; true)
|
|
else false
|
|
| KF ->
|
|
if !num_F < 2 then (incr num_F; true) else false
|
|
| KI ->
|
|
if !num_I < 2 && !num_A + !num_I + !num_M < 4
|
|
then (incr num_I; true) else false
|
|
| KM ->
|
|
if !num_M < 2 && !num_A + !num_I + !num_M < 4
|
|
then (incr num_M; true) else false
|
|
| _ (* KB | KB_exc *) ->
|
|
if !num_B < 3 then (incr num_B; true) else false
|
|
end
|
|
|
|
(* Emit one node, updating the completion date and number of ancestors
|
|
emitted for all nodes that depend on this node. Enter the nodes
|
|
that are no longer waiting on anything (all ancestors emitted)
|
|
in the ready queue or in the in_progress queue, depending on
|
|
latency. *)
|
|
|
|
let emit_node date node =
|
|
begin try
|
|
(*`# Date: {emit_int date}; distance: {emit_int node.length}\n`;*)
|
|
emit_instr node
|
|
with x ->
|
|
fatal_error ("Error while emitting " ^ node.instr.opcode)
|
|
end;
|
|
List.iter
|
|
(fun (son, delay) ->
|
|
let completion_date = date + delay in
|
|
if son.date < completion_date then son.date <- completion_date;
|
|
son.emitted_ancestors <- son.emitted_ancestors + 1;
|
|
if son.emitted_ancestors = son.ancestors && son.instr.kind <> KB then
|
|
begin
|
|
(*`# {emit_string son.instr.opcode} will be ready at {emit_int son.date}\n`;*)
|
|
if son.date = date then add_ready son else add_in_progress son
|
|
end)
|
|
node.sons
|
|
|
|
(* Emit all ready nodes that we can emit given the architectural
|
|
constraints. *)
|
|
|
|
let rec emit_ready_nodes filter date =
|
|
match !ready_queue with
|
|
[] -> []
|
|
| node :: rem ->
|
|
ready_queue := rem;
|
|
if filter node && can_issue node.instr then begin
|
|
emit_node date node;
|
|
emit_ready_nodes filter date
|
|
end else
|
|
node :: emit_ready_nodes filter date
|
|
|
|
let filter_MF node =
|
|
match node.instr.kind with KM -> true | KF -> true | _ -> false
|
|
let filter_non_MF node =
|
|
not(filter_MF node)
|
|
|
|
(* Add all instructions with date <= d to the ready queue, and remove them *)
|
|
|
|
let rec extract_ready d = function
|
|
[] -> []
|
|
| node :: rem as queue ->
|
|
if node.date <= d then (add_ready node; extract_ready d rem) else queue
|
|
|
|
(* Say if a branch is ready to be emitted now *)
|
|
|
|
let branch_is_ready date br =
|
|
br.emitted_ancestors = br.ancestors && br.date <= date
|
|
|
|
(* Schedule the basic block, emitting all of its instructions *)
|
|
|
|
let rec reschedule date =
|
|
match (!ready_queue, !in_progress_queue) with
|
|
([], []) ->
|
|
(* We're done with the regular instructions; finish with the branches *)
|
|
begin match !branch_list with
|
|
[] -> ()
|
|
| br -> List.iter emit_instr br; emit_string " ;;\n"
|
|
end
|
|
| ([], node :: _) ->
|
|
(* Advance to the time node.date, extracting from in_progress_queue
|
|
all instructions ready at that time and adding them to the
|
|
ready queue *)
|
|
in_progress_queue := extract_ready node.date !in_progress_queue;
|
|
(* Try again *)
|
|
reschedule node.date
|
|
| (_, _) ->
|
|
` # time {emit_int date}\n`;
|
|
(* Emit and remove as many ready instructions as we can *)
|
|
(* Give priority to M and F instructions *)
|
|
reset_issue();
|
|
ready_queue := emit_ready_nodes filter_MF date;
|
|
ready_queue := emit_ready_nodes filter_non_MF date;
|
|
(* Special hack: if the only remaining instructions are branches
|
|
and they are all ready now, emit them in the current
|
|
group of instructions *)
|
|
if !ready_queue = []
|
|
&& !in_progress_queue = []
|
|
&& List.for_all (branch_is_ready date) !branch_list
|
|
then begin
|
|
List.iter emit_instr !branch_list;
|
|
branch_list := []
|
|
end;
|
|
(* Emit a stop to pause the processor *)
|
|
emit_string " ;;\n";
|
|
(* Advance to the time date + 1, extracting from in_progress_queue
|
|
all instructions ready at that time and adding them to the
|
|
ready queue *)
|
|
in_progress_queue := extract_ready (date + 1) !in_progress_queue;
|
|
(* Try again *)
|
|
reschedule (date + 1)
|
|
|
|
(* Emit the code for the current basic block *)
|
|
|
|
let end_basic_block () =
|
|
(* Compute critical paths and rebuild ready queue sorted by
|
|
decreasing criticality *)
|
|
let r = !ready_queue in
|
|
ready_queue := [];
|
|
let max_length =
|
|
List.fold_left (fun len node -> max len (longest_path node)) 0 r in
|
|
List.iter add_ready r;
|
|
branch_list := List.rev !branch_list;
|
|
(* Emit the instructions by traversing the code DAG *)
|
|
reschedule 0;
|
|
if max_length > 0 then ` # basic block length {emit_int max_length}\n`;
|
|
clear_code_dag ();
|
|
clear_queues ()
|
|
|
|
(************** Part 2: the code emitter *******************)
|
|
|
|
(* Tradeoff between code size and code speed *)
|
|
|
|
let fastcode_flag = ref true
|
|
|
|
(* Translate or output a label *)
|
|
|
|
let label lbl = sprintf ".L%d" lbl
|
|
|
|
let emit_label lbl = emit_string ".L"; emit_int lbl
|
|
|
|
(* Translate or output a symbol *)
|
|
|
|
let symbol s =
|
|
let b = Buffer.create (String.length s + 1) in
|
|
for i = 0 to String.length s - 1 do
|
|
let c = s.[i] in
|
|
match c with
|
|
'A'..'Z' | 'a'..'z' | '0'..'9' | '_' ->
|
|
Buffer.add_char b c
|
|
| _ ->
|
|
Buffer.add_string b (sprintf "$%02x" (Char.code c))
|
|
done;
|
|
Buffer.add_char b '#';
|
|
Buffer.contents b
|
|
|
|
let emit_symbol s = Emitaux.emit_symbol '$' s
|
|
|
|
(* Translate a pseudo-register *)
|
|
|
|
let reg r =
|
|
match r.loc with Reg r -> register_name r | _ -> assert false
|
|
|
|
let regs r =
|
|
Array.map reg r
|
|
|
|
(* Output a pseudo-register *)
|
|
|
|
let emit_reg r =
|
|
match r.loc with
|
|
Reg r -> emit_string (register_name r)
|
|
| _ -> fatal_error "Emit_ia64.emit_reg"
|
|
|
|
(* Translate a float as a 64-bit integer *)
|
|
|
|
let float_bits f =
|
|
let b = Buffer.create 18 in
|
|
let bytes = (Obj.magic f : string) in
|
|
Buffer.add_string b "0x";
|
|
for i = 7 downto 0 do (* little-endian *)
|
|
Buffer.add_string b
|
|
(sprintf "%02x" (Char.code (String.unsafe_get bytes i)))
|
|
done;
|
|
Buffer.contents b
|
|
|
|
(* Translate an "ltoffset" reference to a global *)
|
|
|
|
let ltoffset s = sprintf "@ltoff(%s)" (symbol s)
|
|
let ltoffset_fptr s = sprintf "@ltoff(@fptr(%s))" (symbol s)
|
|
|
|
(* Layout of the stack frame.
|
|
All stack offsets are shifted by 16 to preserve the scratch area at
|
|
bottom of stack. *)
|
|
|
|
let stack_offset = ref 0
|
|
|
|
let frame_size () =
|
|
let size =
|
|
!stack_offset +
|
|
8 * (num_stack_slots.(0) + num_stack_slots.(1)) +
|
|
(if !contains_calls then 8 else 0) in
|
|
Misc.align size 16
|
|
|
|
let slot_offset loc cl =
|
|
match loc with
|
|
Incoming n -> frame_size() + n + 16
|
|
| Local n ->
|
|
if cl = 0
|
|
then !stack_offset + n * 8 + 16
|
|
else !stack_offset + (num_stack_slots.(0) + n) * 8 + 16
|
|
| Outgoing n -> n + 16
|
|
|
|
let slot_offset_reg r =
|
|
match r.loc with
|
|
Stack l -> slot_offset l (register_class r)
|
|
| _ -> assert false
|
|
|
|
(* Record live pointers at call points *)
|
|
|
|
type frame_descr =
|
|
{ fd_lbl: int; (* Return address *)
|
|
fd_frame_size: int; (* Size of stack frame *)
|
|
fd_live_offset: int list } (* Offsets/regs of live addresses *)
|
|
|
|
let frame_descriptors = ref([] : frame_descr list)
|
|
|
|
let record_frame_label live =
|
|
let lbl = new_label() in
|
|
let live_offset = ref [] in
|
|
Reg.Set.iter
|
|
(function
|
|
{typ = Addr; loc = Reg r} ->
|
|
live_offset := ((r lsl 1) + 1) :: !live_offset
|
|
| {typ = Addr; loc = Stack s} as reg ->
|
|
live_offset := slot_offset s (register_class reg) :: !live_offset
|
|
| _ -> ())
|
|
live;
|
|
frame_descriptors :=
|
|
{ fd_lbl = lbl;
|
|
fd_frame_size = frame_size();
|
|
fd_live_offset = !live_offset } :: !frame_descriptors;
|
|
lbl
|
|
|
|
let record_frame live =
|
|
let lbl = record_frame_label live in `{emit_label lbl}:`
|
|
|
|
let emit_frame fd =
|
|
` data8 {emit_label fd.fd_lbl}\n`;
|
|
` data2 {emit_int fd.fd_frame_size}\n`;
|
|
` data2 {emit_int (List.length fd.fd_live_offset)}\n`;
|
|
List.iter
|
|
(fun n ->
|
|
` data2 {emit_int n}\n`)
|
|
fd.fd_live_offset;
|
|
` .align 8\n`
|
|
|
|
(* Names of various instructions *)
|
|
|
|
let name_for_int_operation = function
|
|
Iadd -> "add"
|
|
| Isub -> "sub"
|
|
| Iand -> "and"
|
|
| Ior -> "or"
|
|
| Ixor -> "xor"
|
|
| Ilsl -> "shl"
|
|
| Ilsr -> "shru"
|
|
| Iasr -> "shr"
|
|
| _ -> Misc.fatal_error "Emit.name_for_int_operation"
|
|
|
|
let name_for_float_operation = function
|
|
Inegf -> "fneg"
|
|
| Iabsf -> "fabs"
|
|
| Iaddf -> "fadd.d"
|
|
| Isubf -> "fsub.d"
|
|
| Imulf -> "fmpy.d"
|
|
| _ -> Misc.fatal_error "Emit.name_for_float_operation"
|
|
|
|
let name_for_specific_operation = function
|
|
Imultaddf -> "fma.d"
|
|
| Imultsubf -> "fms.d"
|
|
| Isubmultf -> "fnma.d"
|
|
| _ -> Misc.fatal_error "Emit.name_for_specific_operation"
|
|
|
|
let name_for_int_comparison = function
|
|
Isigned Ceq -> "eq" | Isigned Cne -> "ne"
|
|
| Isigned Cle -> "le" | Isigned Cgt -> "gt"
|
|
| Isigned Clt -> "lt" | Isigned Cge -> "ge"
|
|
| Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne"
|
|
| Iunsigned Cle -> "leu" | Iunsigned Cgt -> "gtu"
|
|
| Iunsigned Clt -> "ltu" | Iunsigned Cge -> "geu"
|
|
|
|
let name_for_swapped_int_comparison = function
|
|
Isigned Ceq -> "eq" | Isigned Cne -> "ne"
|
|
| Isigned Cle -> "ge" | Isigned Cgt -> "lt"
|
|
| Isigned Clt -> "gt" | Isigned Cge -> "le"
|
|
| Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne"
|
|
| Iunsigned Cle -> "geu" | Iunsigned Cgt -> "ltu"
|
|
| Iunsigned Clt -> "gtu" | Iunsigned Cge -> "leu"
|
|
|
|
let name_for_float_comparison cmp =
|
|
match cmp with
|
|
Ceq -> "eq" | Cne -> "neq"
|
|
| Cle -> "le" | Cgt -> "gt"
|
|
| Clt -> "lt" | Cge -> "ge"
|
|
|
|
(* Immediate range for addl (move) and adds (general add) instructions *)
|
|
|
|
let is_immediate_addl n = n >= -0x200000 && n < 0x200000
|
|
let is_immediate_addl_nat n =
|
|
n >= Nativeint.of_int (-0x200000) && n < Nativeint.of_int 0x200000
|
|
let is_immediate_adds n = n >= -0x2000 && n < 0x2000
|
|
|
|
(* Return the positions of all "1" bits in the given integer,
|
|
most significant bits first *)
|
|
|
|
let ones_pos n =
|
|
let rec ones p accu =
|
|
if p >= 63
|
|
then accu
|
|
else ones (p+1) (if n land (1 lsl p) = 0 then accu else p :: accu) in
|
|
ones 0 []
|
|
|
|
(* Generate temporary registers *)
|
|
|
|
let temp_generator temporaries =
|
|
let counter = ref 0 in
|
|
fun () ->
|
|
let r = temporaries.(!counter) in
|
|
incr counter;
|
|
if !counter >= Array.length temporaries then counter := 0;
|
|
r
|
|
|
|
let new_temp_reg =
|
|
temp_generator [| "r2"; "r3"; "r14"; "r15" |]
|
|
let new_temp_float =
|
|
temp_generator [| "f64"; "f65"; "f66"; "f67";
|
|
"f68"; "f69"; "f70"; "f71" |]
|
|
let new_pred =
|
|
temp_generator [| "p2"; "p3"; "p4"; "p5" |]
|
|
|
|
(* Output the assembly code for an instruction *)
|
|
|
|
(* Name of current function *)
|
|
let function_name = ref ""
|
|
(* Entry point for tail recursive calls *)
|
|
let tailrec_entry_point = ref 0
|
|
|
|
let emit_instr i =
|
|
match i.desc with
|
|
Lend -> ()
|
|
| Lop(Imove | Ispill | Ireload) ->
|
|
let src = i.arg.(0) and dst = i.res.(0) in
|
|
if src.loc <> dst.loc then begin
|
|
match (src.loc, dst.loc) with
|
|
(Reg _, Reg _) ->
|
|
insert "mov" (regs i.arg) (regs i.res)
|
|
| (Reg _, Stack _) ->
|
|
let offset = string_of_int (slot_offset_reg dst) in
|
|
let r = new_temp_reg() in
|
|
insimm "addi" [| "sp" |] offset [| r |];
|
|
insert (if i.res.(0).typ = Float then "stfd" else "st8")
|
|
[| r; reg src |] [| "stk" ^ offset |]
|
|
| (Stack _, Reg _) ->
|
|
let offset = string_of_int (slot_offset_reg src) in
|
|
let r = new_temp_reg() in
|
|
insimm "addi" [| "sp" |] offset [| r |];
|
|
insert (if i.arg.(0).typ = Float then "ldfd" else "ld8")
|
|
[| r; "stk" ^ offset |] (regs i.res)
|
|
| (_, _) ->
|
|
assert false
|
|
end
|
|
| Lop(Iconst_int n) ->
|
|
let instr =
|
|
if is_immediate_addl_nat n then "movi" else "movil" in
|
|
insimm instr [||] (Nativeint.to_string n) (regs i.res)
|
|
| Lop(Iconst_float s) ->
|
|
let f = float_of_string s in
|
|
if f = 0.0 then
|
|
insert "mov" [| "f0" |] (regs i.res)
|
|
else if f = 1.0 then
|
|
insert "mov" [| "f1" |] (regs i.res)
|
|
else begin
|
|
let tmp = new_temp_reg() in
|
|
insimm "movil" [||] (float_bits f) [| tmp |];
|
|
insert "setf.d" [| tmp |] (regs i.res)
|
|
end
|
|
| Lop(Iconst_symbol s) ->
|
|
insimm "addi" [| "gp" |] (ltoffset s) (regs i.res);
|
|
insert "ld8" (regs i.res) (regs i.res)
|
|
| Lop(Icall_ind) ->
|
|
insert "movtb" (regs i.arg) [| "b0" |];
|
|
insert "brcallind" [| "b0" |] [| "b0" |];
|
|
end_basic_block();
|
|
`{record_frame i.live}\n`
|
|
| Lop(Icall_imm s) ->
|
|
insimm "brcall" [||] (symbol s) [| "b0" |];
|
|
end_basic_block();
|
|
`{record_frame i.live}\n`
|
|
| Lop(Itailcall_ind) ->
|
|
let n = frame_size() in
|
|
insert "movtb" (regs i.arg) [| "b6" |];
|
|
if !contains_calls then begin
|
|
let tmp = new_temp_reg() in
|
|
insimm "addi" [| "sp" |] (string_of_int (n + 8)) [| tmp |];
|
|
insert "ld8" [| tmp |] [| tmp |];
|
|
insert "mov" [| tmp |] [| "b0" |]
|
|
end;
|
|
if n > 0 then
|
|
insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |];
|
|
insert "brind" [| "b6" |] [||];
|
|
end_basic_block()
|
|
| Lop(Itailcall_imm s) ->
|
|
if s = !function_name then begin
|
|
insimm "br" [||] (label !tailrec_entry_point) [||]
|
|
end else begin
|
|
let n = frame_size() in
|
|
if !contains_calls then begin
|
|
let tmp = new_temp_reg() in
|
|
insimm "addi" [| "sp" |] (string_of_int (n + 8)) [| tmp |];
|
|
insert "ld8" [| tmp |] [| tmp |];
|
|
insert "mov" [| tmp |] [| "b0" |]
|
|
end;
|
|
if n > 0 then
|
|
insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |];
|
|
insimm "br" [||] (symbol s) [||]
|
|
end;
|
|
end_basic_block()
|
|
| Lop(Iextcall(s, alloc)) ->
|
|
if alloc then begin
|
|
let tmp = new_temp_reg() in
|
|
insimm "addi" [| "gp" |] (ltoffset_fptr s) [| tmp |];
|
|
insert "ld8" [| tmp |] [| "r2" |];
|
|
insimm "brcall" [||] "caml_c_call#" [| "b0" |];
|
|
end_basic_block();
|
|
`{record_frame i.live}\n`
|
|
end else begin
|
|
insert "mov" [| "gp" |] [| "r7" |];
|
|
insimm "brcall" [||] (symbol s) [| "b0" |];
|
|
end_basic_block();
|
|
insert "mov" [| "r7" |] [| "gp" |]
|
|
end
|
|
| Lop(Istackoffset n) ->
|
|
end_basic_block();
|
|
insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |];
|
|
stack_offset := !stack_offset + n
|
|
| Lop(Iload(chunk, addr)) ->
|
|
let load_instr =
|
|
match chunk with
|
|
| Byte_unsigned -> "ld1"
|
|
| Byte_signed -> "ld1"
|
|
| Sixteen_unsigned -> "ld2"
|
|
| Sixteen_signed -> "ld2"
|
|
| Thirtytwo_unsigned -> "ld4"
|
|
| Thirtytwo_signed -> "ld4"
|
|
| Word -> "ld8"
|
|
| Single -> "ldfs"
|
|
| Double -> "ldfd"
|
|
| Double_u -> "ldfd" in
|
|
insert load_instr [| reg i.arg.(0); "heap" |] (regs i.res);
|
|
let sext_instr =
|
|
match chunk with
|
|
Byte_signed -> "sxt1"
|
|
| Sixteen_signed -> "sxt2"
|
|
| Thirtytwo_signed -> "sxt4"
|
|
| _ -> "" in
|
|
if sext_instr <> "" then
|
|
insert sext_instr (regs i.res) (regs i.res)
|
|
| Lop(Istore(chunk, addr)) ->
|
|
let store_instr =
|
|
match chunk with
|
|
| Byte_unsigned -> "st1"
|
|
| Byte_signed -> "st1"
|
|
| Sixteen_unsigned -> "st2"
|
|
| Sixteen_signed -> "st2"
|
|
| Thirtytwo_unsigned -> "st4"
|
|
| Thirtytwo_signed -> "st4"
|
|
| Word -> "st8"
|
|
| Single -> "stfs"
|
|
| Double -> "stfd"
|
|
| Double_u -> "stfd" in
|
|
insert store_instr [| reg i.arg.(1); reg i.arg.(0) |] [| "heap" |]
|
|
| Lop(Ialloc n) ->
|
|
if !fastcode_flag then begin
|
|
insimm "addi" [| "r4" |] (string_of_int (-n)) [| "r4" |];
|
|
insert "cmp.ltu" [| "r4"; "r5" |] [| "p6"; "p0" |];
|
|
insimm "movi" [||] (string_of_int n) [| "r2" |];
|
|
insimm "brcallcond" [| "p6" |] "caml_call_gc#" [| "b0" |];
|
|
end_basic_block();
|
|
`{record_frame i.live}\n`;
|
|
insimm "addi" [| "r4" |] "8" (regs i.res)
|
|
end else begin
|
|
insimm "movi" [||] (string_of_int n) [| "r2" |];
|
|
insimm "brcall" [||] "caml_alloc#" [| "b0" |];
|
|
end_basic_block();
|
|
`{record_frame i.live}\n`;
|
|
insimm "addi" [| "r4" |] "8" (regs i.res)
|
|
end
|
|
| Lop(Iintop Imul) ->
|
|
let t1 = new_temp_float() and t2 = new_temp_float() in
|
|
insert "setf.sig" [|reg i.arg.(0)|] [| t1 |];
|
|
insert "setf.sig" [|reg i.arg.(1)|] [| t2 |];
|
|
insert "xmpy.l" [| t1; t2 |] [| t1 |];
|
|
insert "getf.sig" [| t1 |] (regs i.res)
|
|
| Lop(Iintop(Icomp cmp)) ->
|
|
let comp = "cmpp." ^ name_for_int_comparison cmp in
|
|
let p1 = new_pred() and p2 = new_pred() in
|
|
insert comp (regs i.arg) [| p1; p2 |];
|
|
insimm "movicond" [| p1 |] "1" (regs i.res);
|
|
insimm "movicond" [| p2 |] "0" (regs i.res)
|
|
| Lop(Iintop(Icheckbound)) ->
|
|
insert "cmp.leu" (regs i.arg) [| "p6"; "p0" |];
|
|
insimm "brcallcondexc" [| "p6" |] "caml_array_bound_error#"
|
|
[| "b0"; "heap" |]
|
|
| Lop(Iintop op) ->
|
|
let instr = name_for_int_operation op in
|
|
insert instr (regs i.arg) (regs i.res)
|
|
| Lop(Iintop_imm(Imul, n)) ->
|
|
let src = reg i.arg.(0) and dst = reg i.res.(0) in
|
|
begin match ones_pos n with
|
|
[] ->
|
|
insimm "movi" [||] "0" [|dst|]
|
|
| [n] ->
|
|
insimm "shli" [|src|] (string_of_int n) [|dst|]
|
|
| [n; 0] when n <= 4 ->
|
|
insimm "shladd" [|src; src|] (string_of_int n) [|dst|]
|
|
| n1::n2::lst ->
|
|
let acc1 = new_temp_reg() and acc2 = new_temp_reg()
|
|
and tmp1 = new_temp_reg() and tmp2 = new_temp_reg() in
|
|
insimm "shli" [|src|] (string_of_int n1) [|acc1|];
|
|
insimm "shli" [|src|] (string_of_int n2) [|acc2|];
|
|
let rec add_shifts a1 t1 a2 t2 = function
|
|
[] ->
|
|
insert "add" [|a1; a2|] [|dst|]
|
|
| n::rem ->
|
|
if n = 0 then
|
|
insert "add" [|src; a1|] [|a1|]
|
|
else if n <= 4 then
|
|
insimm "shladd" [|src; a1|] (string_of_int n) [|a1|]
|
|
else begin
|
|
insimm "shli" [|src|] (string_of_int n) [|t1|];
|
|
insert "add" [|t1; a1|] [|a1|]
|
|
end;
|
|
add_shifts a2 t2 a1 t1 rem in
|
|
add_shifts acc1 tmp1 acc2 tmp2 lst
|
|
end
|
|
| Lop(Iintop_imm(Idiv, n)) -> (* n must be a power of 2 *)
|
|
let src = regs i.arg and dst = regs i.res in
|
|
let p1 = new_pred() and p2 = new_pred() in
|
|
let l = Misc.log2 n in
|
|
insert "cmpp.lt" [| src.(0); "r0" |] [| p1; p2 |];
|
|
if is_immediate_adds (n-1) then
|
|
insimm "addicond" [| p1; src.(0) |] (string_of_int (n-1)) dst
|
|
else begin
|
|
let moveop = if is_immediate_addl (n-1) then "movi" else "movil" in
|
|
insimm moveop [||] (string_of_int (n-1)) [| "r2" |];
|
|
insert "addcond" [| p1; src.(0); "r2" |] dst
|
|
end;
|
|
insert "movcond" [| p2; src.(0) |] dst;
|
|
insimm "shri" dst (string_of_int l) dst
|
|
| Lop(Iintop_imm(Imod, n)) -> (* n must be a power of 2 *)
|
|
let src = regs i.arg and dst = regs i.res in
|
|
let p = new_pred() in
|
|
let l = Misc.log2 n in
|
|
insert "cmpp.lt" [| src.(0); "r0" |] [| p; "p0" |];
|
|
insimm "extr.u" src (sprintf "0, %d" l) dst;
|
|
insert "cmpp.ne.and" [| dst.(0); "r0"; p |] [| p; "p0" |];
|
|
if is_immediate_adds (-n) then
|
|
insimm "addicond" [| p; dst.(0) |] (string_of_int (-n)) dst
|
|
else begin
|
|
let moveop = if is_immediate_addl (-n) then "movi" else "movil" in
|
|
insimm moveop [||] (string_of_int (-n)) [| "r2" |];
|
|
insert "addcond" [| p; dst.(0); "r2" |] dst
|
|
end
|
|
| Lop(Iintop_imm(Icomp cmp, n)) ->
|
|
let comp = "cmppi." ^ name_for_swapped_int_comparison cmp in
|
|
let p1 = new_pred() and p2 = new_pred() in
|
|
insimm comp (regs i.arg) (string_of_int n) [| p1; p2 |];
|
|
insimm "movicond" [| p1 |] "1" (regs i.res);
|
|
insimm "movicond" [| p2 |] "0" (regs i.res)
|
|
| Lop(Iintop_imm(Icheckbound, n)) ->
|
|
insimm "cmpi.geu" (regs i.arg) (string_of_int n) [| "p6"; "p0" |];
|
|
insimm "brcallcondexc" [| "p6" |] "caml_array_bound_error#"
|
|
[| "b0"; "heap" |]
|
|
| Lop(Iintop_imm(op, n)) ->
|
|
let instr = name_for_int_operation op ^ "i" in
|
|
insimm instr (regs i.arg) (string_of_int n) (regs i.res)
|
|
| Lop(Inegf | Iabsf | Iaddf | Isubf | Imulf as op) ->
|
|
let instr = name_for_float_operation op in
|
|
insert instr (regs i.arg) (regs i.res)
|
|
| Lop(Idivf) ->
|
|
(* Straight from the IA64 application developer's architecture guide,
|
|
section 13.3.3.1. Modified so that the destination may be equal
|
|
to one of the operands *)
|
|
let a = reg i.arg.(0) and b = reg i.arg.(1) and r = reg i.res.(0)
|
|
and t1 = new_temp_float() and t2 = new_temp_float()
|
|
and t3 = new_temp_float() and t4 = new_temp_float()
|
|
and p = new_pred() in
|
|
insert "frcpa" [| a; b |] [| t1; p |];
|
|
insert "fmas1cond" [| p; a; t1; "f0" |] [| t2 |];
|
|
insert "fnmas1cond" [| p; b; t1; "f1" |] [| t3 |];
|
|
insert "fmas1cond" [| p; t3; t3; t2 |] [| t2 |];
|
|
insert "fmas1cond" [| p; t3; t3; "f0" |] [| t4 |];
|
|
insert "fmas1cond" [| p; t3; t1; t1 |] [| t1 |];
|
|
insert "fmas1cond" [| p; t4; t2; t2 |] [| t2 |];
|
|
insert "fmas1cond" [| p; t4; t4; "f0" |] [| t3 |];
|
|
insert "fmas1cond" [| p; t4; t1; t1 |] [| t1 |];
|
|
insert "fmads1cond" [| p; t3; t2; t2 |] [| t2 |];
|
|
insert "fmas1cond" [| p; t3; t1; t1 |] [| t1 |];
|
|
insert "fnmads1cond" [| p; b; t2; a |] [| t3 |];
|
|
insert "mov" [| t1 |] [| r |];
|
|
insert "fmacond" [| p; t3; t1; t2 |] [| r |]
|
|
| Lop(Ifloatofint) ->
|
|
let src = regs i.arg and dst = regs i.res in
|
|
insert "setf.sig" src dst;
|
|
insert "fcvt.xf" dst dst;
|
|
insert "fnorm.d" dst dst
|
|
| Lop(Iintoffloat) ->
|
|
let src = regs i.arg and dst = regs i.res and tmp = new_temp_float() in
|
|
insert "fcvt.fx.trunc" src [| tmp |];
|
|
insert "getf.sig" [| tmp |] dst
|
|
| Lop(Ispecific(Iadd1)) ->
|
|
let s = if Array.length i.arg >= 2 then 1 else 0 in
|
|
insimm "add1" [| reg i.arg.(0); reg i.arg.(s) |] "1" (regs i.res)
|
|
| Lop(Ispecific(Isub1)) ->
|
|
insimm "sub1" (regs i.arg) "1" (regs i.res)
|
|
| Lop(Ispecific(Ishladd n)) ->
|
|
insimm "shladd" (regs i.arg) (string_of_int n) (regs i.res)
|
|
| Lop(Ispecific(Isignextend n)) ->
|
|
let op = "sxt" ^ string_of_int n in
|
|
insert op (regs i.arg) (regs i.res)
|
|
| Lop(Ispecific (Imultaddf | Imultsubf | Isubmultf as sop)) ->
|
|
let name = name_for_specific_operation sop in
|
|
insert name (regs i.arg) (regs i.res)
|
|
| Lop(Ispecific (Istoreincr n)) ->
|
|
let op = if i.arg.(1).typ = Float then "stfd+" else "st8+" in
|
|
insimm op [| reg i.arg.(0); reg i.arg.(1) |]
|
|
(string_of_int n)
|
|
[| reg i.res.(0); "heapinit" |]
|
|
| Lop(Ispecific Iinitbarrier) ->
|
|
insert "#initbarrier" [| "heapinit" |] [| "heap" |]
|
|
| Lreloadretaddr ->
|
|
let n = frame_size() + 8 in
|
|
let tmp = new_temp_reg() in
|
|
insimm "addi" [| "sp" |] (string_of_int n) [| tmp |];
|
|
insert "ld8" [| tmp |] [| tmp |];
|
|
insert "movtb" [| tmp |] [| "b0" |]
|
|
| Lreturn ->
|
|
let n = frame_size() in
|
|
if n > 0 then
|
|
insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |];
|
|
insert "brret" [| "b0" |] [||];
|
|
end_basic_block()
|
|
| Llabel lbl ->
|
|
end_basic_block();
|
|
`{emit_label lbl}:\n`
|
|
| Lbranch lbl ->
|
|
insimm "br" [||] (label lbl) [||];
|
|
end_basic_block()
|
|
| Lcondbranch(tst, lbl) ->
|
|
begin match tst with
|
|
Itruetest ->
|
|
insimm "cmpi.ne" (regs i.arg) "0" [| "p6"; "p0" |]
|
|
| Ifalsetest ->
|
|
insimm "cmpi.eq" (regs i.arg) "0" [| "p6"; "p0" |]
|
|
| Iinttest cmp ->
|
|
let comp = "cmp." ^ name_for_int_comparison cmp in
|
|
insert comp (regs i.arg) [| "p6"; "p0" |]
|
|
| Iinttest_imm(cmp, n) ->
|
|
let comp = "cmpi." ^ name_for_swapped_int_comparison cmp in
|
|
insimm comp (regs i.arg) (string_of_int n) [| "p6"; "p0" |]
|
|
| Ifloattest(cmp, neg) ->
|
|
let comp = "fcmp." ^ name_for_float_comparison cmp in
|
|
insert comp (regs i.arg)
|
|
(if neg then [| "p0"; "p6" |]
|
|
else [| "p6"; "p0" |])
|
|
| Ioddtest ->
|
|
insimm "tbit.nz" (regs i.arg) "0" [| "p6"; "p0" |]
|
|
| Ieventest ->
|
|
insimm "tbit.z" (regs i.arg) "0" [| "p6"; "p0" |]
|
|
end;
|
|
insimm "brcond" [| "p6" |] (label lbl) [||];
|
|
end_basic_block()
|
|
| Lcondbranch3(lbl0, lbl1, lbl2) ->
|
|
end_basic_block();
|
|
let emit_compare n p = function
|
|
None -> ()
|
|
| Some lbl ->
|
|
` cmp.eq p{emit_int p}, p0 = {emit_int n}, {emit_reg i.arg.(0)}\n` in
|
|
let emit_branch p = function
|
|
None -> ()
|
|
| Some lbl ->
|
|
` (p{emit_int p}) br {emit_label lbl}\n` in
|
|
emit_compare 0 5 lbl0; emit_compare 1 6 lbl1; emit_compare 2 7 lbl2;
|
|
emit_branch 5 lbl0; emit_branch 6 lbl1; emit_branch 7 lbl2
|
|
| Lswitch jumptbl ->
|
|
end_basic_block();
|
|
let numcases = Array.length jumptbl in
|
|
if numcases <= 9 then begin
|
|
for j = 0 to numcases / 3 do
|
|
let n = j * 3 in
|
|
for k = 0 to 2 do
|
|
if n + k < numcases - 1 then
|
|
` cmp.eq p{emit_int(k+5)}, p0 = {emit_int (n+k)}, {emit_reg i.arg.(0)}\n`
|
|
done;
|
|
for k = 0 to 2 do
|
|
if n + k < numcases - 1 then
|
|
` (p{emit_int(k+5)}) br {emit_label jumptbl.(n+k)}\n`
|
|
else if n + k = numcases - 1 then
|
|
` br {emit_label jumptbl.(n+k)}\n`
|
|
done;
|
|
` ;;\n`
|
|
done
|
|
end else if numcases <= 47 then begin
|
|
` mov r2 = 1\n`;
|
|
` cmp.eq p6, p0 = 0, {emit_reg i.arg.(0)}\n`;
|
|
` (p6) br {emit_label jumptbl.(0)} ;;\n`;
|
|
` shl r2 = r2, {emit_reg i.arg.(0)}\n`;
|
|
` cmp.eq p7, p0 = 1, {emit_reg i.arg.(0)}\n`;
|
|
` (p7) br {emit_label jumptbl.(1)} ;;\n`;
|
|
` mov pr = r2, -1 ;;\n`;
|
|
for i = 2 to numcases - 1 do
|
|
` (p{emit_int i}) br {emit_label jumptbl.(i)}\n`
|
|
done;
|
|
` ;;\n`
|
|
end else begin
|
|
let lbl_jumptbl = new_label() in
|
|
let lbl_ip = new_label() in
|
|
`{emit_label lbl_ip}: mov r2 = ip ;;\n`;
|
|
` add r2 = {emit_label lbl_jumptbl} - {emit_label lbl_ip}, r2 ;;\n`;
|
|
` shladd r3 = {emit_reg i.arg.(0)}, 2, r2 ;;\n`;
|
|
` ld4 r3 = [r3] ;;\n`;
|
|
` sxt4 r3 = r3 ;;\n`;
|
|
` add r2 = r2, r3 ;;\n`;
|
|
` mov b6 = r2 ;;\n`;
|
|
` br b6 ;;\n`;
|
|
` .align 4\n`;
|
|
`{emit_label lbl_jumptbl}:\n`;
|
|
for i = 0 to numcases - 1 do
|
|
` data4 {emit_label jumptbl.(i)} - {emit_label lbl_jumptbl}\n`
|
|
done;
|
|
` .align 16\n`
|
|
end
|
|
| Lsetuptrap lbl ->
|
|
end_basic_block();
|
|
let lbl_ip = new_label() in
|
|
let lbl_next = new_label() in
|
|
`{emit_label lbl_ip}: mov r2 = ip ;;\n`;
|
|
` add r2 = {emit_label lbl_next} - {emit_label lbl_ip}, r2\n`;
|
|
` br.sptk {emit_label lbl} ;;\n`;
|
|
`{emit_label lbl_next}:\n`
|
|
| Lpushtrap ->
|
|
end_basic_block();
|
|
stack_offset := !stack_offset + 16;
|
|
(* Store trap pointer at sp, handler addr at sp+8,
|
|
and decrement sp by 16. Remember, the bottom 16 bytes
|
|
of the stack must be left free. *)
|
|
` add r3 = 8, sp\n`;
|
|
` st8 [sp] = r6, -16 ;;\n`;
|
|
` st8 [r3] = r2\n`;
|
|
` add r6 = 16, sp ;;\n`
|
|
| Lpoptrap ->
|
|
end_basic_block();
|
|
` add sp = 16, sp ;;\n`;
|
|
` ld8 r6 = [sp] ;;\n`;
|
|
stack_offset := !stack_offset - 16
|
|
| Lraise ->
|
|
end_basic_block();
|
|
` mov sp = r6\n`;
|
|
` add r2 = 8, r6\n`;
|
|
` ld8 r6 = [r6] ;;\n`;
|
|
` ld8 r2 = [r2] ;;\n`;
|
|
` mov b6 = r2 ;;\n`;
|
|
` br b6\n`
|
|
|
|
let rec emit_all i =
|
|
match i.desc with Lend -> () | _ -> emit_instr i; emit_all i.next
|
|
|
|
(* Check if a function contains a tail call to itself *)
|
|
|
|
let rec is_tailrec i =
|
|
match i.desc with
|
|
Lend -> false
|
|
| Lop(Itailcall_imm s) when s = !function_name -> true
|
|
| _ -> is_tailrec i.next
|
|
|
|
(* Emission of a function declaration *)
|
|
|
|
let fundecl f =
|
|
function_name := f.fun_name;
|
|
fastcode_flag := f.fun_fast;
|
|
stack_offset := 0;
|
|
` .text\n`;
|
|
` .align 4\n`;
|
|
` .global {emit_symbol f.fun_name}#\n`;
|
|
` .proc {emit_symbol f.fun_name}#\n`;
|
|
`{emit_symbol f.fun_name}:\n`;
|
|
let n = frame_size() in
|
|
if !contains_calls then begin
|
|
insert "movfb" [| "b0" |] [| "r2" |];
|
|
insimm "addi" [| "sp" |] "8" [| "r3" |];
|
|
insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |];
|
|
insert "st8" [| "r3"; "r2" |] [||]
|
|
end
|
|
else if n > 0 then
|
|
insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |];
|
|
if is_tailrec f.fun_body then begin
|
|
tailrec_entry_point := new_label();
|
|
end_basic_block();
|
|
`{emit_label !tailrec_entry_point}:\n`
|
|
end;
|
|
emit_all f.fun_body;
|
|
end_basic_block();
|
|
` .endp {emit_symbol f.fun_name}#\n`
|
|
|
|
(* Emission of data *)
|
|
|
|
let emit_define_symbol s =
|
|
` .global {emit_symbol s}#\n`;
|
|
` .type {emit_symbol s}#, @object\n`;
|
|
` .size {emit_symbol s}#, 8\n`;
|
|
`{emit_symbol s}:\n`
|
|
|
|
let emit_item = function
|
|
Cdefine_symbol s ->
|
|
emit_define_symbol s
|
|
| Cdefine_label lbl ->
|
|
`{emit_label (100000 + lbl)}:\n`
|
|
| Cint8 n ->
|
|
` data1 {emit_int n}\n`
|
|
| Cint16 n ->
|
|
` data2 {emit_int n}\n`
|
|
| Cint32 n ->
|
|
let n' = Nativeint.shift_right (Nativeint.shift_left n 32) 32 in
|
|
` data4 {emit_nativeint n'}\n`
|
|
| Cint n ->
|
|
` data8 {emit_nativeint n}\n`
|
|
| Csingle f ->
|
|
` real4 {emit_string f}\n`
|
|
| Cdouble f ->
|
|
` real8 {emit_string f}\n`
|
|
| Csymbol_address s ->
|
|
` data8 {emit_symbol s}#\n`
|
|
| Clabel_address lbl ->
|
|
` data8 {emit_label (100000 + lbl)}\n`
|
|
| Cstring s ->
|
|
emit_string_directive " string " s
|
|
| Cskip n ->
|
|
if n > 0 then ` .skip {emit_int n}\n`
|
|
| Calign n ->
|
|
` .align {emit_int n}\n`
|
|
|
|
let data l =
|
|
` .data\n`;
|
|
` .align 8\n`;
|
|
List.iter emit_item l
|
|
|
|
(* Beginning / end of an assembly file *)
|
|
|
|
let begin_assembly() =
|
|
` .data\n`;
|
|
emit_define_symbol (Compilenv.current_unit_name() ^ "__data_begin");
|
|
` .text\n`;
|
|
emit_define_symbol (Compilenv.current_unit_name() ^ "__code_begin")
|
|
|
|
let end_assembly () =
|
|
` .data\n`;
|
|
emit_define_symbol (Compilenv.current_unit_name() ^ "__data_end");
|
|
` .text\n`;
|
|
emit_define_symbol (Compilenv.current_unit_name() ^ "__code_end");
|
|
` .rodata\n`;
|
|
` .align 8\n`;
|
|
emit_define_symbol (Compilenv.current_unit_name() ^ "__frametable");
|
|
` data8 {emit_int (List.length !frame_descriptors)}\n`;
|
|
List.iter emit_frame !frame_descriptors;
|
|
frame_descriptors := []
|