stage2: improved codegen

 * multiple returns jump to one canonical function exitlude. This is in
   preparation for the defer feature. (A sketch of the resulting code layout
   follows this list.)
 * simple elision of trivial jump relocs.
 * omit prelude/exitlude for naked calling convention functions.
 * fix not switching on arch for prelude/exitlude.
 * fix swapped registers when setting stack mem from a register.
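
For a non-naked x86_64 function the result looks roughly like this (illustration only, not part of the commit; the opcode bytes are the ones used in the diff below, and `??` marks the 4-byte immediates that are backpatched once the body has been generated):

    55                      push rbp
    48 89 e5                mov  rbp, rsp
    48 81 ec ?? ?? ?? ??    sub  rsp, <aligned frame size>   ; patched after genBody
        ... function body ...
    e9 ?? ?? ?? ??          jmp  <exitlude>                  ; one per `ret`, reloc recorded
        ... more body ...
    e9 ?? ?? ?? ??          jmp  <exitlude>                  ; elided when the distance is zero
    48 81 c4 xx xx xx xx    add  rsp, <aligned frame size>   ; canonical exitlude
    5d                      pop  rbp
    c3                      ret

(The `add rsp` uses the 4-byte `48 83 c4 xx` form when the aligned frame size fits in a signed byte.)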
master
Andrew Kelley 2020-07-29 00:08:43 -07:00
parent 64a1a280ef
commit 1bbfa36b76
1 changed file with 88 additions and 26 deletions

@@ -214,6 +214,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         src: usize,
         stack_align: u32,
 
+        /// The value is an offset into the `Function` `code` from the beginning.
+        /// To perform the reloc, write 32-bit signed little-endian integer
+        /// which is a relative jump, based on the address following the reloc.
+        exitlude_jump_relocs: std.ArrayListUnmanaged(usize) = .{},
+
         /// Whenever there is a runtime branch, we push a Branch onto this stack,
         /// and pop it off when the runtime branch joins. This provides an "overlay"
         /// of the table of mappings from instructions to `MCValue` from within the branch.
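
To make the doc comment above concrete: a hypothetical free-standing helper (not part of the commit) that applies one such reloc, written with the same 2020-era std calls and cast builtins as the surrounding diff:

    const std = @import("std");

    /// `reloc_offset` is an entry from `exitlude_jump_relocs`: the offset of the
    /// 4 displacement bytes that follow the 0xe9 opcode. `exitlude_offset` is
    /// where the canonical exitlude begins; it is always at or after the jump in
    /// this scheme, so the displacement is non-negative.
    fn applyExitludeReloc(code: []u8, reloc_offset: usize, exitlude_offset: usize) void {
        // rel32 is measured from the address following the 4-byte field.
        const amt = exitlude_offset - (reloc_offset + 4);
        std.mem.writeIntLittle(i32, code[reloc_offset..][0..4], @intCast(i32, amt));
    }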
@@ -376,6 +381,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             .src = src,
             .stack_align = undefined,
         };
+        defer function.exitlude_jump_relocs.deinit(bin_file.allocator);
 
         var call_info = function.resolveCallingConventionValues(src, fn_type) catch |err| switch (err) {
            error.CodegenFail => return Result{ .fail = function.err_msg.? },
@@ -401,29 +407,78 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         }
 
         fn gen(self: *Self) !void {
-            try self.code.ensureCapacity(self.code.items.len + 11);
-
-            // TODO omit this for naked functions
-            // push rbp
-            // mov rbp, rsp
-            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x55, 0x48, 0x89, 0xe5 });
-
-            // sub rsp, x
-            const stack_end = self.branch_stack.items[0].max_end_stack;
-            if (stack_end > math.maxInt(i32)) {
-                return self.fail(self.src, "too much stack used in call parameters", .{});
-            } else if (stack_end > math.maxInt(i8)) {
-                // 48 81 ec xx xx xx xx    sub rsp,0x80
-                self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xec });
-                const x = @intCast(u32, stack_end);
-                mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x);
-            } else if (stack_end != 0) {
-                // 48 83 ec xx             sub rsp,0x10
-                const x = @intCast(u8, stack_end);
-                self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xec, x });
-            }
-
-            try self.genBody(self.mod_fn.analysis.success);
+            switch (arch) {
+                .x86_64 => {
+                    try self.code.ensureCapacity(self.code.items.len + 11);
+                    const cc = self.fn_type.fnCallingConvention();
+                    if (cc != .Naked) {
+                        // We want to subtract the aligned stack frame size from rsp here, but we don't
+                        // yet know how big it will be, so we leave room for a 4-byte stack size.
+                        // TODO During semantic analysis, check if there are no function calls. If there
+                        // are none, here we can omit the part where we subtract and then add rsp.
+                        self.code.appendSliceAssumeCapacity(&[_]u8{
+                            // push rbp
+                            0x55,
+                            // mov rbp, rsp
+                            0x48,
+                            0x89,
+                            0xe5,
+                            // sub rsp, imm32 (with reloc)
+                            0x48,
+                            0x81,
+                            0xec,
+                        });
+                        const reloc_index = self.code.items.len;
+                        self.code.items.len += 4;
+
+                        try self.genBody(self.mod_fn.analysis.success);
+
+                        const stack_end = self.branch_stack.items[0].max_end_stack;
+                        if (stack_end > math.maxInt(i32))
+                            return self.fail(self.src, "too much stack used in call parameters", .{});
+                        const aligned_stack_end = mem.alignForward(stack_end, self.stack_align);
+                        mem.writeIntLittle(u32, self.code.items[reloc_index..][0..4], @intCast(u32, aligned_stack_end));
+
+                        if (self.code.items.len >= math.maxInt(i32)) {
+                            return self.fail(self.src, "unable to perform relocation: jump too far", .{});
+                        }
+                        for (self.exitlude_jump_relocs.items) |jmp_reloc| {
+                            const amt = self.code.items.len - (jmp_reloc + 4);
+                            // If it wouldn't jump at all, elide it.
+                            if (amt == 0) {
+                                self.code.items.len -= 5;
+                                continue;
+                            }
+                            const s32_amt = @intCast(i32, amt);
+                            mem.writeIntLittle(i32, self.code.items[jmp_reloc..][0..4], s32_amt);
+                        }
+
+                        try self.code.ensureCapacity(self.code.items.len + 9);
+                        // add rsp, x
+                        if (aligned_stack_end > math.maxInt(i8)) {
+                            // example: 48 81 c4 ff ff ff 7f    add rsp,0x7fffffff
+                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xc4 });
+                            const x = @intCast(u32, aligned_stack_end);
+                            mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x);
+                        } else if (aligned_stack_end != 0) {
+                            // example: 48 83 c4 7f             add rsp,0x7f
+                            const x = @intCast(u8, aligned_stack_end);
+                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xc4, x });
+                        }
+
+                        self.code.appendSliceAssumeCapacity(&[_]u8{
+                            0x5d, // pop rbp
+                            0xc3, // ret
+                        });
+                    } else {
+                        try self.genBody(self.mod_fn.analysis.success);
+                    }
+                },
+                else => {
+                    try self.genBody(self.mod_fn.analysis.success);
+                },
+            }
         }
 
         fn genBody(self: *Self, body: ir.Body) InnerError!void {
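
The prologue change above is a reserve-then-patch scheme: `sub rsp, imm32` is emitted with a placeholder immediate, the body is generated, and only afterwards is the aligned frame size written into the placeholder. A hypothetical stand-alone sketch of that pair (the function names and the ArrayList parameter are illustrative, not the compiler's API; same 2020-era std calls as the diff):

    const std = @import("std");

    /// Emit `sub rsp, imm32` with a zero immediate and return the offset of the
    /// 4 bytes to patch later.
    fn reserveFrameSub(code: *std.ArrayList(u8)) !usize {
        try code.appendSlice(&[_]u8{ 0x48, 0x81, 0xec, 0, 0, 0, 0 });
        return code.items.len - 4;
    }

    /// After the body is generated, align the stack high-water mark and write it
    /// into the reserved immediate. The value is returned so the matching
    /// `add rsp, imm` in the exitlude can reuse it.
    fn patchFrameSub(code: []u8, reloc_index: usize, max_end_stack: usize, stack_align: usize) u32 {
        const aligned = std.mem.alignForward(max_end_stack, stack_align);
        const imm = @intCast(u32, aligned);
        std.mem.writeIntLittle(u32, code[reloc_index..][0..4], imm);
        return imm;
    }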
@@ -987,10 +1042,12 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     try self.code.append(0xc3); // ret
                 },
                 .x86_64 => {
-                    try self.code.appendSlice(&[_]u8{
-                        0x5d, // pop rbp
-                        0xc3, // ret
-                    });
+                    // TODO when implementing defer, this will need to jump to the appropriate defer expression.
+                    // TODO optimization opportunity: figure out when we can emit this as a 2 byte instruction
+                    // which is available if the jump is 127 bytes or less forward.
+                    try self.code.resize(self.code.items.len + 5);
+                    self.code.items[self.code.items.len - 5] = 0xe9; // jmp rel32
+                    try self.exitlude_jump_relocs.append(self.gpa, self.code.items.len - 4);
                 },
                 else => return self.fail(src, "TODO implement return for {}", .{self.target.cpu.arch}),
             }
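
On the second TODO above: a forward jump of 127 bytes or less fits the 2-byte `jmp rel8` form (opcode 0xeb), but the distance to the exitlude is unknown until the whole body has been generated, so the commit always reserves the 5-byte `jmp rel32` (0xe9) and patches it later. A hypothetical encoder for the case where the distance is already known (illustration, not part of the commit):

    const std = @import("std");

    /// `distance` is the number of code bytes to skip, measured from the first
    /// byte after the jump instruction, so the same value serves as rel8 or rel32.
    fn emitForwardJmp(code: *std.ArrayList(u8), distance: u32) !void {
        if (distance <= std.math.maxInt(i8)) {
            // 2-byte form: eb xx
            try code.appendSlice(&[_]u8{ 0xeb, @intCast(u8, distance) });
        } else {
            // 5-byte form: e9 xx xx xx xx
            try code.append(0xe9);
            var buf: [4]u8 = undefined;
            std.mem.writeIntLittle(i32, &buf, @intCast(i32, distance));
            try code.appendSlice(&buf);
        }
    }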
@@ -1130,6 +1187,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             switch (reloc) {
                 .rel32 => |pos| {
                     const amt = self.code.items.len - (pos + 4);
+                    // If it wouldn't jump at all, elide it.
+                    if (amt == 0) {
+                        self.code.items.len -= 5;
+                        return;
+                    }
                     const s32_amt = math.cast(i32, amt) catch
                         return self.fail(src, "unable to perform relocation: jump too far", .{});
                     mem.writeIntLittle(i32, self.code.items[pos..][0..4], s32_amt);
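
The `-= 5` above is safe because `amt == 0` means the 4 displacement bytes ending at `pos + 4` are the last bytes of `code`, so the 5-byte `jmp rel32` (opcode at `pos - 1`) is the most recently emitted instruction and can be dropped by truncating the buffer. A hypothetical helper expressing the same check:

    const std = @import("std");

    /// Returns true (and shrinks `code`) when the jump whose displacement field
    /// starts at `pos` would land on the very next instruction, i.e. the jump is
    /// the trailing 5 bytes of `code` and does nothing.
    fn elideTrivialJmp(code: *std.ArrayList(u8), pos: usize) bool {
        if (code.items.len != pos + 4) return false;
        code.items.len -= 5; // drop the 0xe9 opcode plus 4 displacement bytes
        return true;
    }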
@@ -1296,13 +1358,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     const reg_id: u8 = @truncate(u3, reg.id());
                     if (stack_offset <= 128) {
                         // example: 48 89 55 7f           mov QWORD PTR [rbp+0x7f],rdx
-                        const RM = @as(u8, 0b01_101_000) | reg_id;
+                        const RM = @as(u8, 0b01_000_101) | (reg_id << 3);
                         const negative_offset = @intCast(i8, -@intCast(i32, stack_offset));
                         const twos_comp = @bitCast(u8, negative_offset);
                         self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM, twos_comp });
                     } else if (stack_offset <= 2147483648) {
                         // example: 48 89 95 80 00 00 00  mov QWORD PTR [rbp+0x80],rdx
-                        const RM = @as(u8, 0b10_101_000) | reg_id;
+                        const RM = @as(u8, 0b10_000_101) | (reg_id << 3);
                         const negative_offset = @intCast(i32, -@intCast(i33, stack_offset));
                         const twos_comp = @bitCast(u32, negative_offset);
                         self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM });
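
The two RM changes above are the "fix swapped registers when setting stack mem from a register" item: in a ModRM byte the source register of `mov [rbp+disp], reg` belongs in the reg field (bits 5..3), while rbp-with-displacement addressing is mod = 0b01 (disp8) or 0b10 (disp32) with r/m = 0b101; the old constants had the reg and r/m fields swapped. A small illustrative helper (hypothetical, not part of the commit):

    /// ModRM layout: mod (bits 7..6) | reg (bits 5..3) | r/m (bits 2..0).
    /// For `mov [rbp + disp8], reg64`, mod = 0b01 with r/m = 0b101 selects
    /// rbp-plus-disp8 addressing; the source register index goes in `reg`.
    fn modRmRbpDisp8(reg_id: u3) u8 {
        return 0b01_000_101 | (@as(u8, reg_id) << 3);
    }

For rdx (register id 2) this yields 0x55, matching the `48 89 55 7f` example in the comment; the disp32 form only changes mod to 0b10, giving the 0x95 seen in the `48 89 95 ...` example.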