From a34f67aa66554a776ae9a8f42207c62fcbc5fc25 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Wed, 25 Mar 2020 12:08:50 +0100 Subject: [PATCH] std: Minor changes to TLS handling * Always allocate an info block per-thread so that libc can store important stuff there. * Respect ABI-mandated alignment in more places. * Nicer code, use slices/pointers instead of raw addresses whenever possible. --- lib/std/os/linux/tls.zig | 237 ++++++++++++++++++++------------------- lib/std/start.zig | 8 +- lib/std/thread.zig | 33 +++--- 3 files changed, 142 insertions(+), 136 deletions(-) diff --git a/lib/std/os/linux/tls.zig b/lib/std/os/linux/tls.zig index 20c51abdf..b6617e03f 100644 --- a/lib/std/os/linux/tls.zig +++ b/lib/std/os/linux/tls.zig @@ -1,8 +1,9 @@ const std = @import("std"); +const builtin = std.builtin; const os = std.os; const mem = std.mem; const elf = std.elf; -const builtin = @import("builtin"); +const math = std.math; const assert = std.debug.assert; // This file implements the two TLS variants [1] used by ELF-based systems. @@ -60,10 +61,11 @@ const tls_tcb_size = switch (builtin.arch) { else => @sizeOf(usize), }; -// Controls if the TCB should be aligned according to the TLS segment p_align +// Controls the minimum alignment of the TCB end address. 
The effective value +// used by the code is max(this_value, tls_segment.p_align) const tls_tcb_align_size = switch (builtin.arch) { - .arm, .armeb, .aarch64, .aarch64_be => true, - else => false, + .arm, .armeb, .aarch64, .aarch64_be => 16, + else => 1, }; // Controls if the TP points to the end of the TCB instead of its beginning @@ -72,13 +74,6 @@ const tls_tp_points_past_tcb = switch (builtin.arch) { else => false, }; -// Check if the architecture-specific parameters look correct -comptime { - if (tls_tcb_align_size and tls_variant != TLSVariant.VariantI) { - @compileError("tls_tcb_align_size is only meaningful for variant I TLS"); - } -} - // Some architectures add some offset to the tp and dtv addresses in order to // make the generated code more efficient @@ -94,17 +89,19 @@ const tls_dtv_offset = switch (builtin.arch) { }; // Per-thread storage for Zig's use -const CustomData = packed struct {}; +const CustomData = struct { + padding: [16]usize, +}; // Dynamic Thread Vector -const DTV = packed struct { +const DTV = extern struct { entries: usize, - tls_block: [1]usize, + tls_block: [1][*]u8, }; // Holds all the information about the process TLS image const TLSImage = struct { - data_src: []u8, + data_src: []const u8, alloc_size: usize, tcb_offset: usize, dtv_offset: usize, @@ -113,13 +110,13 @@ const TLSImage = struct { gdt_entry_number: usize, }; -pub var tls_image: ?TLSImage = null; +pub var tls_image: TLSImage = undefined; pub fn setThreadPointer(addr: usize) void { switch (builtin.arch) { .i386 => { var user_desc = std.os.linux.user_desc{ - .entry_number = tls_image.?.gdt_entry_number, + .entry_number = tls_image.gdt_entry_number, .base_addr = addr, .limit = 0xfffff, .seg_32bit = 1, @@ -134,7 +131,7 @@ pub fn setThreadPointer(addr: usize) void { const gdt_entry_number = user_desc.entry_number; // We have to keep track of our slot as it's also needed for clone() - tls_image.?.gdt_entry_number = gdt_entry_number; + tls_image.gdt_entry_number = 
gdt_entry_number; // Update the %gs selector asm volatile ("movl %[gs_val], %%gs" : @@ -171,7 +168,7 @@ pub fn setThreadPointer(addr: usize) void { } } -pub fn initTLS() ?*elf.Phdr { +fn initTLS() void { var tls_phdr: ?*elf.Phdr = null; var img_base: usize = 0; @@ -195,124 +192,138 @@ pub fn initTLS() ?*elf.Phdr { // Sanity check assert(at_phent == @sizeOf(elf.Phdr)); - // Search the TLS section + // Find the TLS section const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum]; - var gnu_stack: ?*elf.Phdr = null; - for (phdrs) |*phdr| { switch (phdr.p_type) { elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr, elf.PT_TLS => tls_phdr = phdr, - elf.PT_GNU_STACK => gnu_stack = phdr, - else => continue, + else => {}, } } + // If the cpu is ARM-based, check if it supports the TLS register + if (comptime builtin.arch.isARM() and at_hwcap & std.os.linux.HWCAP_TLS == 0) { + // If the CPU does not support TLS via a coprocessor register, + // a kernel helper function can be used instead on certain linux kernels. + // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c. + @panic("TODO: Implement ARM fallback TLS functionality"); + } + + var tls_align_factor: usize = undefined; + var tls_data: []const u8 = undefined; if (tls_phdr) |phdr| { - // If the cpu is arm-based, check if it supports the TLS register - if (builtin.arch == .arm and at_hwcap & std.os.linux.HWCAP_TLS == 0) { - // If the CPU does not support TLS via a coprocessor register, - // a kernel helper function can be used instead on certain linux kernels. - // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c. 
- @panic("TODO: Implement ARM fallback TLS functionality"); - } - - // Offsets into the allocated TLS area - var tcb_offset: usize = undefined; - var dtv_offset: usize = undefined; - var data_offset: usize = undefined; - var thread_data_offset: usize = undefined; - // Compute the total size of the ABI-specific data plus our own control - // structures - const alloc_size = switch (tls_variant) { - .VariantI => blk: { - var l: usize = 0; - dtv_offset = l; - l += @sizeOf(DTV); - thread_data_offset = l; - l += @sizeOf(CustomData); - l = mem.alignForward(l, phdr.p_align); - tcb_offset = l; - if (tls_tcb_align_size) { - l += mem.alignForward(tls_tcb_size, phdr.p_align); - } else { - l += tls_tcb_size; - } - data_offset = l; - l += phdr.p_memsz; - break :blk l; - }, - .VariantII => blk: { - var l: usize = 0; - data_offset = l; - l += phdr.p_memsz; - l = mem.alignForward(l, phdr.p_align); - tcb_offset = l; - l += tls_tcb_size; - thread_data_offset = l; - l += @sizeOf(CustomData); - dtv_offset = l; - l += @sizeOf(DTV); - break :blk l; - }, - }; - - tls_image = TLSImage{ - .data_src = @intToPtr([*]u8, phdr.p_vaddr + img_base)[0..phdr.p_filesz], - .alloc_size = alloc_size, - .tcb_offset = tcb_offset, - .dtv_offset = dtv_offset, - .data_offset = data_offset, - .gdt_entry_number = @bitCast(usize, @as(isize, -1)), - }; + tls_align_factor = phdr.p_align; + tls_data = @intToPtr([*]u8, img_base + phdr.p_vaddr)[0..phdr.p_memsz]; + } else { + tls_align_factor = @alignOf(*usize); + tls_data = &[_]u8{}; } - return gnu_stack; + // Offsets into the allocated TLS area + var tcb_offset: usize = undefined; + var dtv_offset: usize = undefined; + var data_offset: usize = undefined; + var thread_data_offset: usize = undefined; + // Compute the total size of the ABI-specific data plus our own control + // structures + const alloc_size = switch (tls_variant) { + .VariantI => blk: { + var l: usize = 0; + // Unneeded because l is zero + // l = mem.alignForward(l, @alignOf(DTV)); + dtv_offset = l; 
+ l += @sizeOf(DTV); + l = mem.alignForward(l, @alignOf(CustomData)); + thread_data_offset = l; + l += @sizeOf(CustomData); + // Make sure the TP is aligned + l = mem.alignForward(l, tls_align_factor); + tcb_offset = l; + // Ensure there are at least tls_tcb_align_size bytes of padding + const min_align = math.max(tls_tcb_align_size, tls_align_factor); + l += mem.alignForward(tls_tcb_size, min_align); + data_offset = l; + l += mem.alignForward(tls_data.len, tls_align_factor); + break :blk l; + }, + .VariantII => blk: { + var l: usize = 0; + data_offset = l; + l = mem.alignForward(tls_data.len, tls_align_factor); + // The TP is aligned to p_align + tcb_offset = l; + l += tls_tcb_size; + l = mem.alignForward(l, @alignOf(CustomData)); + thread_data_offset = l; + l += @sizeOf(CustomData); + l = mem.alignForward(l, @alignOf(DTV)); + dtv_offset = l; + l += @sizeOf(DTV); + break :blk l; + }, + }; + + tls_image = TLSImage{ + .data_src = tls_data, + .alloc_size = alloc_size, + .tcb_offset = tcb_offset, + .dtv_offset = dtv_offset, + .data_offset = data_offset, + .gdt_entry_number = @bitCast(usize, @as(isize, -1)), + }; } -pub fn copyTLS(addr: usize) usize { - const tls_img = tls_image.?; +inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T { + return @ptrCast(*T, @alignCast(@alignOf(*T), ptr)); +} - // Be paranoid, clear the area we're going to use - @memset(@intToPtr([*]u8, addr), 0, tls_img.alloc_size); +/// Initializes all the fields of the static TLS area and returns the computed +/// architecture-specific value of the thread-pointer register +pub fn prepareTLS(area: []u8) usize { + // Clear the area we're going to use, just to be safe + mem.set(u8, area, 0); // Prepare the DTV - const dtv = @intToPtr(*DTV, addr + tls_img.dtv_offset); + const dtv = alignPtrCast(DTV, area.ptr + tls_image.dtv_offset); dtv.entries = 1; - dtv.tls_block[0] = addr + tls_img.data_offset + tls_dtv_offset; - // Set-up the TCB - // Force the alignment to 1 byte as the TCB may start from a 
non-aligned - // address under the variant II model - const tcb_ptr = @intToPtr(*align(1) usize, addr + tls_img.tcb_offset); - if (tls_variant == TLSVariant.VariantI) { - tcb_ptr.* = addr + tls_img.dtv_offset; - } else { - tcb_ptr.* = addr + tls_img.tcb_offset; - } + dtv.tls_block[0] = area.ptr + tls_dtv_offset + tls_image.data_offset; + // Prepare the TCB + const tcb_ptr = alignPtrCast([*]u8, area.ptr + tls_image.tcb_offset); + tcb_ptr.* = switch (tls_variant) { + .VariantI => area.ptr + tls_image.dtv_offset, + .VariantII => area.ptr + tls_image.tcb_offset, + }; // Copy the data - @memcpy(@intToPtr([*]u8, addr + tls_img.data_offset), tls_img.data_src.ptr, tls_img.data_src.len); + mem.copy(u8, area[tls_image.data_offset..], tls_image.data_src); // Return the corrected (if needed) value for the tp register - return addr + tls_tp_offset + - if (tls_tp_points_past_tcb) tls_img.data_offset else tls_img.tcb_offset; + return @ptrToInt(area.ptr) + tls_tp_offset + + if (tls_tp_points_past_tcb) tls_image.data_offset else tls_image.tcb_offset; } var main_thread_tls_buffer: [256]u8 align(32) = undefined; -pub fn allocateTLS(size: usize) usize { - // Small TLS allocation, use our local buffer - if (size < main_thread_tls_buffer.len) { - return @ptrToInt(&main_thread_tls_buffer); - } +pub fn initStaticTLS() void { + initTLS(); - const slice = os.mmap( - null, - size, - os.PROT_READ | os.PROT_WRITE, - os.MAP_PRIVATE | os.MAP_ANONYMOUS, - -1, - 0, - ) catch @panic("out of memory"); + var tls_area = blk: { + // Fast path for the common case where the TLS data is really small, + // avoid an allocation and use our local buffer + if (tls_image.alloc_size < main_thread_tls_buffer.len) { + break :blk main_thread_tls_buffer[0..tls_image.alloc_size]; + } - return @ptrToInt(slice.ptr); + break :blk os.mmap( + null, + tls_image.alloc_size, + os.PROT_READ | os.PROT_WRITE, + os.MAP_PRIVATE | os.MAP_ANONYMOUS, + -1, + 0, + ) catch @panic("out of memory"); + }; + + const tp_value = 
prepareTLS(tls_area); + setThreadPointer(tp_value); } diff --git a/lib/std/start.zig b/lib/std/start.zig index 1a4997edb..fbb814066 100644 --- a/lib/std/start.zig +++ b/lib/std/start.zig @@ -152,13 +152,7 @@ fn posixCallMainAndExit() noreturn { const auxv = @ptrCast([*]std.elf.Auxv, @alignCast(@alignOf(usize), envp.ptr + envp_count + 1)); std.os.linux.elf_aux_maybe = auxv; // Initialize the TLS area - const gnu_stack_phdr = std.os.linux.tls.initTLS() orelse @panic("ELF missing stack size"); - - if (std.os.linux.tls.tls_image) |tls_img| { - const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size); - const tp = std.os.linux.tls.copyTLS(tls_addr); - std.os.linux.tls.setThreadPointer(tp); - } + std.os.linux.tls.initStaticTLS(); // TODO This is disabled because what should we do when linking libc and this code // does not execute? And also it's causing a test failure in stack traces in release modes. diff --git a/lib/std/thread.zig b/lib/std/thread.zig index 596a8f3cd..dc478b5a7 100644 --- a/lib/std/thread.zig +++ b/lib/std/thread.zig @@ -286,11 +286,10 @@ pub const Thread = struct { } // Finally, the Thread Local Storage, if any. if (!Thread.use_pthreads) { - if (os.linux.tls.tls_image) |tls_img| { - l = mem.alignForward(l, @alignOf(usize)); - tls_start_offset = l; - l += tls_img.alloc_size; - } + // XXX: Is this alignment enough? 
+ l = mem.alignForward(l, @alignOf(usize)); + tls_start_offset = l; + l += os.linux.tls.tls_image.alloc_size; } break :blk l; }; @@ -349,18 +348,21 @@ pub const Thread = struct { else => return os.unexpectedErrno(@intCast(usize, err)), } } else if (std.Target.current.os.tag == .linux) { - var flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES | os.CLONE_SIGHAND | - os.CLONE_THREAD | os.CLONE_SYSVSEM | os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID | - os.CLONE_DETACHED; - var newtls: usize = undefined; + const flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES | + os.CLONE_SIGHAND | os.CLONE_THREAD | os.CLONE_SYSVSEM | + os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID | + os.CLONE_DETACHED | os.CLONE_SETTLS; // This structure is only needed when targeting i386 var user_desc: if (std.Target.current.cpu.arch == .i386) os.linux.user_desc else void = undefined; - if (os.linux.tls.tls_image) |tls_img| { + const tls_area = mmap_slice[tls_start_offset..]; + const tp_value = os.linux.tls.prepareTLS(tls_area); + + var newtls = blk: { if (std.Target.current.cpu.arch == .i386) { user_desc = os.linux.user_desc{ - .entry_number = tls_img.gdt_entry_number, - .base_addr = os.linux.tls.copyTLS(mmap_addr + tls_start_offset), + .entry_number = os.linux.tls.tls_image.gdt_entry_number, + .base_addr = tp_value, .limit = 0xfffff, .seg_32bit = 1, .contents = 0, // Data @@ -369,12 +371,11 @@ pub const Thread = struct { .seg_not_present = 0, .useable = 1, }; - newtls = @ptrToInt(&user_desc); + break :blk @ptrToInt(&user_desc); } else { - newtls = os.linux.tls.copyTLS(mmap_addr + tls_start_offset); + break :blk tp_value; } - flags |= os.CLONE_SETTLS; - } + }; const rc = os.linux.clone( MainFuncs.linuxThreadMain,