stage2: improved codegen

 * multiple returns jump to one canonical function exitlude. This is in
   preparation for the defer feature. (A sketch of the resulting code layout
   follows this list.)
 * simple elision of trivial jump relocs.
 * omit prelude/exitlude for naked calling convention functions.
 * fix not switching on arch for prelude/exitlude.
 * fix swapped registers when setting stack mem from a register.
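
For a non-naked x86_64 function the result looks roughly like this (illustration only, not part of the commit; the opcode bytes are the ones used in the diff below, and `??` marks the 4-byte immediates that are backpatched once the body has been generated):

    55                      push rbp
    48 89 e5                mov  rbp, rsp
    48 81 ec ?? ?? ?? ??    sub  rsp, <aligned frame size>   ; patched after genBody
        ... function body ...
    e9 ?? ?? ?? ??          jmp  <exitlude>                  ; one per `ret`, reloc recorded
        ... more body ...
    e9 ?? ?? ?? ??          jmp  <exitlude>                  ; elided when the distance is zero
    48 81 c4 xx xx xx xx    add  rsp, <aligned frame size>   ; canonical exitlude
    5d                      pop  rbp
    c3                      ret

(The `add rsp` uses the 4-byte `48 83 c4 xx` form when the aligned frame size fits in a signed byte.)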
master
Andrew Kelley 2020-07-29 00:08:43 -07:00
parent 64a1a280ef
commit 1bbfa36b76
1 changed file with 88 additions and 26 deletions

@@ -214,6 +214,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         src: usize,
         stack_align: u32,
 
+        /// The value is an offset into the `Function` `code` from the beginning.
+        /// To perform the reloc, write 32-bit signed little-endian integer
+        /// which is a relative jump, based on the address following the reloc.
+        exitlude_jump_relocs: std.ArrayListUnmanaged(usize) = .{},
+
         /// Whenever there is a runtime branch, we push a Branch onto this stack,
         /// and pop it off when the runtime branch joins. This provides an "overlay"
         /// of the table of mappings from instructions to `MCValue` from within the branch.
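
To make the doc comment above concrete: a hypothetical free-standing helper (not part of the commit) that applies one such reloc, written with the same 2020-era std calls and cast builtins as the surrounding diff:

    const std = @import("std");

    /// `reloc_offset` is an entry from `exitlude_jump_relocs`: the offset of the
    /// 4 displacement bytes that follow the 0xe9 opcode. `exitlude_offset` is
    /// where the canonical exitlude begins; it is always at or after the jump in
    /// this scheme, so the displacement is non-negative.
    fn applyExitludeReloc(code: []u8, reloc_offset: usize, exitlude_offset: usize) void {
        // rel32 is measured from the address following the 4-byte field.
        const amt = exitlude_offset - (reloc_offset + 4);
        std.mem.writeIntLittle(i32, code[reloc_offset..][0..4], @intCast(i32, amt));
    }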
@@ -376,6 +381,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             .src = src,
             .stack_align = undefined,
         };
+        defer function.exitlude_jump_relocs.deinit(bin_file.allocator);
 
         var call_info = function.resolveCallingConventionValues(src, fn_type) catch |err| switch (err) {
            error.CodegenFail => return Result{ .fail = function.err_msg.? },
@@ -401,29 +407,78 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         }
 
         fn gen(self: *Self) !void {
-            try self.code.ensureCapacity(self.code.items.len + 11);
-
-            // TODO omit this for naked functions
-            // push rbp
-            // mov rbp, rsp
-            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x55, 0x48, 0x89, 0xe5 });
-
-            // sub rsp, x
-            const stack_end = self.branch_stack.items[0].max_end_stack;
-            if (stack_end > math.maxInt(i32)) {
-                return self.fail(self.src, "too much stack used in call parameters", .{});
-            } else if (stack_end > math.maxInt(i8)) {
-                // 48 81 ec xx xx xx xx    sub rsp,0x80
-                self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xec });
-                const x = @intCast(u32, stack_end);
-                mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x);
-            } else if (stack_end != 0) {
-                // 48 83 ec xx             sub rsp,0x10
-                const x = @intCast(u8, stack_end);
-                self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xec, x });
-            }
-
-            try self.genBody(self.mod_fn.analysis.success);
+            switch (arch) {
+                .x86_64 => {
+                    try self.code.ensureCapacity(self.code.items.len + 11);
+                    const cc = self.fn_type.fnCallingConvention();
+                    if (cc != .Naked) {
+                        // We want to subtract the aligned stack frame size from rsp here, but we don't
+                        // yet know how big it will be, so we leave room for a 4-byte stack size.
+                        // TODO During semantic analysis, check if there are no function calls. If there
+                        // are none, here we can omit the part where we subtract and then add rsp.
+                        self.code.appendSliceAssumeCapacity(&[_]u8{
+                            // push rbp
+                            0x55,
+                            // mov rbp, rsp
+                            0x48,
+                            0x89,
+                            0xe5,
+                            // sub rsp, imm32 (with reloc)
+                            0x48,
+                            0x81,
+                            0xec,
+                        });
+                        const reloc_index = self.code.items.len;
+                        self.code.items.len += 4;
+
+                        try self.genBody(self.mod_fn.analysis.success);
+
+                        const stack_end = self.branch_stack.items[0].max_end_stack;
+                        if (stack_end > math.maxInt(i32))
+                            return self.fail(self.src, "too much stack used in call parameters", .{});
+                        const aligned_stack_end = mem.alignForward(stack_end, self.stack_align);
+                        mem.writeIntLittle(u32, self.code.items[reloc_index..][0..4], @intCast(u32, aligned_stack_end));
+
+                        if (self.code.items.len >= math.maxInt(i32)) {
+                            return self.fail(self.src, "unable to perform relocation: jump too far", .{});
+                        }
+                        for (self.exitlude_jump_relocs.items) |jmp_reloc| {
+                            const amt = self.code.items.len - (jmp_reloc + 4);
+                            // If it wouldn't jump at all, elide it.
+                            if (amt == 0) {
+                                self.code.items.len -= 5;
+                                continue;
+                            }
+                            const s32_amt = @intCast(i32, amt);
+                            mem.writeIntLittle(i32, self.code.items[jmp_reloc..][0..4], s32_amt);
+                        }
+
+                        try self.code.ensureCapacity(self.code.items.len + 9);
+                        // add rsp, x
+                        if (aligned_stack_end > math.maxInt(i8)) {
+                            // example: 48 81 c4 ff ff ff 7f    add rsp,0x7fffffff
+                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xc4 });
+                            const x = @intCast(u32, aligned_stack_end);
+                            mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x);
+                        } else if (aligned_stack_end != 0) {
+                            // example: 48 83 c4 7f             add rsp,0x7f
+                            const x = @intCast(u8, aligned_stack_end);
+                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xc4, x });
+                        }
+
+                        self.code.appendSliceAssumeCapacity(&[_]u8{
+                            0x5d, // pop rbp
+                            0xc3, // ret
+                        });
+                    } else {
+                        try self.genBody(self.mod_fn.analysis.success);
+                    }
+                },
+                else => {
+                    try self.genBody(self.mod_fn.analysis.success);
+                },
+            }
         }
 
         fn genBody(self: *Self, body: ir.Body) InnerError!void {
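
The prologue change above is a reserve-then-patch scheme: `sub rsp, imm32` is emitted with a placeholder immediate, the body is generated, and only afterwards is the aligned frame size written into the placeholder. A hypothetical stand-alone sketch of that pair (the function names and the ArrayList parameter are illustrative, not the compiler's API; same 2020-era std calls as the diff):

    const std = @import("std");

    /// Emit `sub rsp, imm32` with a zero immediate and return the offset of the
    /// 4 bytes to patch later.
    fn reserveFrameSub(code: *std.ArrayList(u8)) !usize {
        try code.appendSlice(&[_]u8{ 0x48, 0x81, 0xec, 0, 0, 0, 0 });
        return code.items.len - 4;
    }

    /// After the body is generated, align the stack high-water mark and write it
    /// into the reserved immediate. The value is returned so the matching
    /// `add rsp, imm` in the exitlude can reuse it.
    fn patchFrameSub(code: []u8, reloc_index: usize, max_end_stack: usize, stack_align: usize) u32 {
        const aligned = std.mem.alignForward(max_end_stack, stack_align);
        const imm = @intCast(u32, aligned);
        std.mem.writeIntLittle(u32, code[reloc_index..][0..4], imm);
        return imm;
    }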
@@ -987,10 +1042,12 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     try self.code.append(0xc3); // ret
                 },
                 .x86_64 => {
-                    try self.code.appendSlice(&[_]u8{
-                        0x5d, // pop rbp
-                        0xc3, // ret
-                    });
+                    // TODO when implementing defer, this will need to jump to the appropriate defer expression.
+                    // TODO optimization opportunity: figure out when we can emit this as a 2 byte instruction
+                    // which is available if the jump is 127 bytes or less forward.
+                    try self.code.resize(self.code.items.len + 5);
+                    self.code.items[self.code.items.len - 5] = 0xe9; // jmp rel32
+                    try self.exitlude_jump_relocs.append(self.gpa, self.code.items.len - 4);
                 },
                 else => return self.fail(src, "TODO implement return for {}", .{self.target.cpu.arch}),
             }
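
On the second TODO above: a forward jump of 127 bytes or less fits the 2-byte `jmp rel8` form (opcode 0xeb), but the distance to the exitlude is unknown until the whole body has been generated, so the commit always reserves the 5-byte `jmp rel32` (0xe9) and patches it later. A hypothetical encoder for the case where the distance is already known (illustration, not part of the commit):

    const std = @import("std");

    /// `distance` is the number of code bytes to skip, measured from the first
    /// byte after the jump instruction, so the same value serves as rel8 or rel32.
    fn emitForwardJmp(code: *std.ArrayList(u8), distance: u32) !void {
        if (distance <= std.math.maxInt(i8)) {
            // 2-byte form: eb xx
            try code.appendSlice(&[_]u8{ 0xeb, @intCast(u8, distance) });
        } else {
            // 5-byte form: e9 xx xx xx xx
            try code.append(0xe9);
            var buf: [4]u8 = undefined;
            std.mem.writeIntLittle(i32, &buf, @intCast(i32, distance));
            try code.appendSlice(&buf);
        }
    }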
@@ -1130,6 +1187,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             switch (reloc) {
                 .rel32 => |pos| {
                     const amt = self.code.items.len - (pos + 4);
+                    // If it wouldn't jump at all, elide it.
+                    if (amt == 0) {
+                        self.code.items.len -= 5;
+                        return;
+                    }
                     const s32_amt = math.cast(i32, amt) catch
                         return self.fail(src, "unable to perform relocation: jump too far", .{});
                     mem.writeIntLittle(i32, self.code.items[pos..][0..4], s32_amt);
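
The `-= 5` above is safe because `amt == 0` means the 4 displacement bytes ending at `pos + 4` are the last bytes of `code`, so the 5-byte `jmp rel32` (opcode at `pos - 1`) is the most recently emitted instruction and can be dropped by truncating the buffer. A hypothetical helper expressing the same check:

    const std = @import("std");

    /// Returns true (and shrinks `code`) when the jump whose displacement field
    /// starts at `pos` would land on the very next instruction, i.e. the jump is
    /// the trailing 5 bytes of `code` and does nothing.
    fn elideTrivialJmp(code: *std.ArrayList(u8), pos: usize) bool {
        if (code.items.len != pos + 4) return false;
        code.items.len -= 5; // drop the 0xe9 opcode plus 4 displacement bytes
        return true;
    }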
@@ -1296,13 +1358,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     const reg_id: u8 = @truncate(u3, reg.id());
                     if (stack_offset <= 128) {
                         // example: 48 89 55 7f           mov QWORD PTR [rbp+0x7f],rdx
-                        const RM = @as(u8, 0b01_101_000) | reg_id;
+                        const RM = @as(u8, 0b01_000_101) | (reg_id << 3);
                         const negative_offset = @intCast(i8, -@intCast(i32, stack_offset));
                         const twos_comp = @bitCast(u8, negative_offset);
                         self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM, twos_comp });
                     } else if (stack_offset <= 2147483648) {
                         // example: 48 89 95 80 00 00 00  mov QWORD PTR [rbp+0x80],rdx
-                        const RM = @as(u8, 0b10_101_000) | reg_id;
+                        const RM = @as(u8, 0b10_000_101) | (reg_id << 3);
                         const negative_offset = @intCast(i32, -@intCast(i33, stack_offset));
                         const twos_comp = @bitCast(u32, negative_offset);
                         self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM });
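
The two RM changes above are the "fix swapped registers when setting stack mem from a register" item: in a ModRM byte the source register of `mov [rbp+disp], reg` belongs in the reg field (bits 5..3), while rbp-with-displacement addressing is mod = 0b01 (disp8) or 0b10 (disp32) with r/m = 0b101; the old constants had the reg and r/m fields swapped. A small illustrative helper (hypothetical, not part of the commit):

    /// ModRM layout: mod (bits 7..6) | reg (bits 5..3) | r/m (bits 2..0).
    /// For `mov [rbp + disp8], reg64`, mod = 0b01 with r/m = 0b101 selects
    /// rbp-plus-disp8 addressing; the source register index goes in `reg`.
    fn modRmRbpDisp8(reg_id: u3) u8 {
        return 0b01_000_101 | (@as(u8, reg_id) << 3);
    }

For rdx (register id 2) this yields 0x55, matching the `48 89 55 7f` example in the comment; the disp32 form only changes mod to 0b10, giving the 0x95 seen in the `48 89 95 ...` example.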