std: Minor changes to TLS handling

commit a34f67aa66, parent cbaede7f55 (branch: master)

* Always allocate an info block per-thread so that libc can store important stuff there.
* Respect ABI-mandated alignment in more places.
* Nicer code, use slices/pointers instead of raw addresses whenever possible.
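For orientation, here is a minimal, self-contained sketch of the variant I offset computation this change moves to (running offsets rounded up with an align-forward helper instead of raw address arithmetic). It is not part of the commit: the concrete sizes, the local alignForward helper and the omission of the CustomData slot are simplifications made up for the example.

const std = @import("std");
const assert = std.debug.assert;

// Simplified stand-in for mem.alignForward; alignment must be a power of two.
fn alignForward(addr: usize, alignment: usize) usize {
    return (addr + alignment - 1) & ~(alignment - 1);
}

pub fn main() void {
    // Hypothetical parameters: a PT_TLS segment asking for 8-byte alignment,
    // a two-word TCB and a 16-byte ABI-mandated minimum as on arm/aarch64.
    const p_align: usize = 8;
    const tcb_size: usize = 2 * @sizeOf(usize);
    const tcb_min_align: usize = 16;
    const tls_data_len: usize = 40;

    var l: usize = 0;
    const dtv_offset = l; // the DTV comes first under variant I
    l += 2 * @sizeOf(usize);
    // Make sure the thread pointer honours the segment alignment
    l = alignForward(l, p_align);
    const tcb_offset = l;
    // Reserve at least max(tcb_min_align, p_align) bytes for the TCB and its padding
    const min_align = if (tcb_min_align > p_align) tcb_min_align else p_align;
    l += alignForward(tcb_size, min_align);
    const data_offset = l;
    l += alignForward(tls_data_len, p_align);

    assert(dtv_offset == 0);
    assert(tcb_offset % p_align == 0);
    assert(data_offset - tcb_offset >= tcb_min_align);
    assert(l >= data_offset + tls_data_len);
}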
@@ -1,8 +1,9 @@
 const std = @import("std");
-const builtin = std.builtin;
 const os = std.os;
 const mem = std.mem;
 const elf = std.elf;
+const builtin = @import("builtin");
+const math = std.math;
 const assert = std.debug.assert;
 
 // This file implements the two TLS variants [1] used by ELF-based systems.
@@ -60,10 +61,11 @@ const tls_tcb_size = switch (builtin.arch) {
     else => @sizeOf(usize),
 };
 
-// Controls if the TCB should be aligned according to the TLS segment p_align
+// Controls the minimum alignment of the TCB end address. The effective value
+// used by the code is min(this_value, tls_segment.p_align)
 const tls_tcb_align_size = switch (builtin.arch) {
-    .arm, .armeb, .aarch64, .aarch64_be => true,
-    else => false,
+    .arm, .armeb, .aarch64, .aarch64_be => 16,
+    else => 1,
 };
 
 // Controls if the TP points to the end of the TCB instead of its beginning
@@ -72,13 +74,6 @@ const tls_tp_points_past_tcb = switch (builtin.arch) {
     else => false,
 };
 
-// Check if the architecture-specific parameters look correct
-comptime {
-    if (tls_tcb_align_size and tls_variant != TLSVariant.VariantI) {
-        @compileError("tls_tcb_align_size is only meaningful for variant I TLS");
-    }
-}
-
 // Some architectures add some offset to the tp and dtv addresses in order to
 // make the generated code more efficient
 
@@ -94,17 +89,19 @@ const tls_dtv_offset = switch (builtin.arch) {
 };
 
 // Per-thread storage for Zig's use
-const CustomData = packed struct {};
+const CustomData = struct {
+    padding: [16]usize,
+};
 
 // Dynamic Thread Vector
-const DTV = packed struct {
+const DTV = extern struct {
     entries: usize,
-    tls_block: [1]usize,
+    tls_block: [1][*]u8,
 };
 
 // Holds all the information about the process TLS image
 const TLSImage = struct {
-    data_src: []u8,
+    data_src: []const u8,
     alloc_size: usize,
     tcb_offset: usize,
     dtv_offset: usize,
@@ -113,13 +110,13 @@ const TLSImage = struct {
     gdt_entry_number: usize,
 };
 
-pub var tls_image: ?TLSImage = null;
+pub var tls_image: TLSImage = undefined;
 
 pub fn setThreadPointer(addr: usize) void {
     switch (builtin.arch) {
         .i386 => {
             var user_desc = std.os.linux.user_desc{
-                .entry_number = tls_image.?.gdt_entry_number,
+                .entry_number = tls_image.gdt_entry_number,
                 .base_addr = addr,
                 .limit = 0xfffff,
                 .seg_32bit = 1,
@@ -134,7 +131,7 @@ pub fn setThreadPointer(addr: usize) void {
 
             const gdt_entry_number = user_desc.entry_number;
             // We have to keep track of our slot as it's also needed for clone()
-            tls_image.?.gdt_entry_number = gdt_entry_number;
+            tls_image.gdt_entry_number = gdt_entry_number;
             // Update the %gs selector
             asm volatile ("movl %[gs_val], %%gs"
                 :
@@ -171,7 +168,7 @@ pub fn setThreadPointer(addr: usize) void {
     }
 }
 
-pub fn initTLS() ?*elf.Phdr {
+fn initTLS() void {
     var tls_phdr: ?*elf.Phdr = null;
     var img_base: usize = 0;
 
@@ -195,29 +192,35 @@ pub fn initTLS() ?*elf.Phdr {
     // Sanity check
     assert(at_phent == @sizeOf(elf.Phdr));
 
-    // Search the TLS section
+    // Find the TLS section
     const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum];
 
-    var gnu_stack: ?*elf.Phdr = null;
-
     for (phdrs) |*phdr| {
         switch (phdr.p_type) {
             elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr,
             elf.PT_TLS => tls_phdr = phdr,
-            elf.PT_GNU_STACK => gnu_stack = phdr,
-            else => continue,
+            else => {},
         }
     }
 
-    if (tls_phdr) |phdr| {
-        // If the cpu is arm-based, check if it supports the TLS register
-        if (builtin.arch == .arm and at_hwcap & std.os.linux.HWCAP_TLS == 0) {
+    // If the cpu is ARM-based, check if it supports the TLS register
+    if (comptime builtin.arch.isARM() and at_hwcap & std.os.linux.HWCAP_TLS == 0) {
        // If the CPU does not support TLS via a coprocessor register,
        // a kernel helper function can be used instead on certain linux kernels.
        // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c.
        @panic("TODO: Implement ARM fallback TLS functionality");
     }
 
+    var tls_align_factor: usize = undefined;
+    var tls_data: []const u8 = undefined;
+    if (tls_phdr) |phdr| {
+        tls_align_factor = phdr.p_align;
+        tls_data = @intToPtr([*]u8, img_base + phdr.p_vaddr)[0..phdr.p_memsz];
+    } else {
+        tls_align_factor = @alignOf(*usize);
+        tls_data = &[_]u8{};
+    }
+
     // Offsets into the allocated TLS area
     var tcb_offset: usize = undefined;
     var dtv_offset: usize = undefined;
@@ -228,30 +231,34 @@ pub fn initTLS() ?*elf.Phdr {
     const alloc_size = switch (tls_variant) {
         .VariantI => blk: {
             var l: usize = 0;
             // Unneeded because l is zero
             // l = mem.alignForward(l, @alignOf(DTV));
             dtv_offset = l;
             l += @sizeOf(DTV);
-            l = mem.alignForward(l, phdr.p_align);
+            l = mem.alignForward(l, @alignOf(CustomData));
+            thread_data_offset = l;
+            l += @sizeOf(CustomData);
+            // Make sure the TP is aligned
+            l = mem.alignForward(l, tls_align_factor);
             tcb_offset = l;
-            if (tls_tcb_align_size) {
-                l += mem.alignForward(tls_tcb_size, phdr.p_align);
-            } else {
-                l += tls_tcb_size;
-            }
+            // Ensure there are at least tls_tcb_align_size bytes of padding
+            const min_align = math.max(tls_tcb_align_size, tls_align_factor);
+            l += mem.alignForward(tls_tcb_size, min_align);
             data_offset = l;
-            l += phdr.p_memsz;
+            l += mem.alignForward(tls_data.len, tls_align_factor);
             break :blk l;
         },
         .VariantII => blk: {
             var l: usize = 0;
             data_offset = l;
-            l += phdr.p_memsz;
-            l = mem.alignForward(l, phdr.p_align);
+            l = mem.alignForward(tls_data.len, tls_align_factor);
             // The TP is aligned to p_align
             tcb_offset = l;
             l += tls_tcb_size;
+            l = mem.alignForward(l, @alignOf(CustomData));
+            thread_data_offset = l;
+            l += @sizeOf(CustomData);
             l = mem.alignForward(l, @alignOf(DTV));
             dtv_offset = l;
             l += @sizeOf(DTV);
             break :blk l;
@@ -259,60 +266,64 @@ pub fn initTLS() ?*elf.Phdr {
     };
 
     tls_image = TLSImage{
-        .data_src = @intToPtr([*]u8, phdr.p_vaddr + img_base)[0..phdr.p_filesz],
+        .data_src = tls_data,
         .alloc_size = alloc_size,
         .tcb_offset = tcb_offset,
         .dtv_offset = dtv_offset,
         .data_offset = data_offset,
         .gdt_entry_number = @bitCast(usize, @as(isize, -1)),
     };
 }
-
-    return gnu_stack;
-}
 
-pub fn copyTLS(addr: usize) usize {
-    const tls_img = tls_image.?;
+inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
+    return @ptrCast(*T, @alignCast(@alignOf(*T), ptr));
+}
 
-    // Be paranoid, clear the area we're going to use
-    @memset(@intToPtr([*]u8, addr), 0, tls_img.alloc_size);
+/// Initializes all the fields of the static TLS area and returns the computed
+/// architecture-specific value of the thread-pointer register
+pub fn prepareTLS(area: []u8) usize {
+    // Clear the area we're going to use, just to be safe
+    mem.set(u8, area, 0);
     // Prepare the DTV
-    const dtv = @intToPtr(*DTV, addr + tls_img.dtv_offset);
+    const dtv = alignPtrCast(DTV, area.ptr + tls_image.dtv_offset);
     dtv.entries = 1;
-    dtv.tls_block[0] = addr + tls_img.data_offset + tls_dtv_offset;
-    // Set-up the TCB
-    // Force the alignment to 1 byte as the TCB may start from a non-aligned
-    // address under the variant II model
-    const tcb_ptr = @intToPtr(*align(1) usize, addr + tls_img.tcb_offset);
-    if (tls_variant == TLSVariant.VariantI) {
-        tcb_ptr.* = addr + tls_img.dtv_offset;
-    } else {
-        tcb_ptr.* = addr + tls_img.tcb_offset;
-    }
+    dtv.tls_block[0] = area.ptr + tls_dtv_offset + tls_image.data_offset;
+    // Prepare the TCB
+    const tcb_ptr = alignPtrCast([*]u8, area.ptr + tls_image.tcb_offset);
+    tcb_ptr.* = switch (tls_variant) {
+        .VariantI => area.ptr + tls_image.dtv_offset,
+        .VariantII => area.ptr + tls_image.tcb_offset,
+    };
     // Copy the data
-    @memcpy(@intToPtr([*]u8, addr + tls_img.data_offset), tls_img.data_src.ptr, tls_img.data_src.len);
+    mem.copy(u8, area[tls_image.data_offset..], tls_image.data_src);
 
     // Return the corrected (if needed) value for the tp register
-    return addr + tls_tp_offset +
-        if (tls_tp_points_past_tcb) tls_img.data_offset else tls_img.tcb_offset;
+    return @ptrToInt(area.ptr) + tls_tp_offset +
+        if (tls_tp_points_past_tcb) tls_image.data_offset else tls_image.tcb_offset;
 }
 
 var main_thread_tls_buffer: [256]u8 align(32) = undefined;
 
-pub fn allocateTLS(size: usize) usize {
-    // Small TLS allocation, use our local buffer
-    if (size < main_thread_tls_buffer.len) {
-        return @ptrToInt(&main_thread_tls_buffer);
+pub fn initStaticTLS() void {
+    initTLS();
+
+    var tls_area = blk: {
+        // Fast path for the common case where the TLS data is really small,
+        // avoid an allocation and use our local buffer
+        if (tls_image.alloc_size < main_thread_tls_buffer.len) {
+            break :blk main_thread_tls_buffer[0..tls_image.alloc_size];
        }
 
-    const slice = os.mmap(
+        break :blk os.mmap(
            null,
-            size,
+            tls_image.alloc_size,
            os.PROT_READ | os.PROT_WRITE,
            os.MAP_PRIVATE | os.MAP_ANONYMOUS,
            -1,
            0,
        ) catch @panic("out of memory");
+    };
 
-    return @ptrToInt(slice.ptr);
+    const tp_value = prepareTLS(tls_area);
+    setThreadPointer(tp_value);
 }
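Taken together, the reworked API is meant to be driven the same way by every caller: reserve tls_image.alloc_size bytes, hand them to prepareTLS() as a slice, and load the returned value into the thread-pointer register. A rough sketch of that call sequence follows; it is illustrative only (the buffer and the helper name are made up, and it assumes initStaticTLS() already ran so tls_image is populated). The startup and thread-spawn hunks below show the real call sites.

const std = @import("std");
const tls = std.os.linux.tls;

// Hypothetical static storage, large enough for a small TLS image.
var example_tls_buffer: [512]u8 align(32) = undefined;

fn setUpThreadTLS() void {
    // Carve out exactly the number of bytes initTLS() computed for the image
    const area = example_tls_buffer[0..tls.tls_image.alloc_size];
    // Fill in the DTV, the TCB and the per-module data copy...
    const tp_value = tls.prepareTLS(area);
    // ...and point the architecture's TLS register at it
    tls.setThreadPointer(tp_value);
}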
@@ -152,13 +152,7 @@ fn posixCallMainAndExit() noreturn {
     const auxv = @ptrCast([*]std.elf.Auxv, @alignCast(@alignOf(usize), envp.ptr + envp_count + 1));
     std.os.linux.elf_aux_maybe = auxv;
     // Initialize the TLS area
-    const gnu_stack_phdr = std.os.linux.tls.initTLS() orelse @panic("ELF missing stack size");
-
-    if (std.os.linux.tls.tls_image) |tls_img| {
-        const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size);
-        const tp = std.os.linux.tls.copyTLS(tls_addr);
-        std.os.linux.tls.setThreadPointer(tp);
-    }
+    std.os.linux.tls.initStaticTLS();
 
     // TODO This is disabled because what should we do when linking libc and this code
     // does not execute? And also it's causing a test failure in stack traces in release modes.
@@ -286,11 +286,10 @@ pub const Thread = struct {
             }
             // Finally, the Thread Local Storage, if any.
             if (!Thread.use_pthreads) {
-                if (os.linux.tls.tls_image) |tls_img| {
+                // XXX: Is this alignment enough?
                 l = mem.alignForward(l, @alignOf(usize));
                 tls_start_offset = l;
-                l += tls_img.alloc_size;
-                }
+                l += os.linux.tls.tls_image.alloc_size;
             }
             break :blk l;
         };
@@ -349,18 +348,21 @@ pub const Thread = struct {
                 else => return os.unexpectedErrno(@intCast(usize, err)),
             }
         } else if (std.Target.current.os.tag == .linux) {
-            var flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES | os.CLONE_SIGHAND |
-                os.CLONE_THREAD | os.CLONE_SYSVSEM | os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID |
-                os.CLONE_DETACHED;
-            var newtls: usize = undefined;
+            const flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES |
+                os.CLONE_SIGHAND | os.CLONE_THREAD | os.CLONE_SYSVSEM |
+                os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID |
+                os.CLONE_DETACHED | os.CLONE_SETTLS;
             // This structure is only needed when targeting i386
             var user_desc: if (std.Target.current.cpu.arch == .i386) os.linux.user_desc else void = undefined;
 
-            if (os.linux.tls.tls_image) |tls_img| {
+            const tls_area = mmap_slice[tls_start_offset..];
+            const tp_value = os.linux.tls.prepareTLS(tls_area);
+
+            var newtls = blk: {
                 if (std.Target.current.cpu.arch == .i386) {
                     user_desc = os.linux.user_desc{
-                        .entry_number = tls_img.gdt_entry_number,
-                        .base_addr = os.linux.tls.copyTLS(mmap_addr + tls_start_offset),
+                        .entry_number = os.linux.tls.tls_image.gdt_entry_number,
+                        .base_addr = tp_value,
                         .limit = 0xfffff,
                         .seg_32bit = 1,
                         .contents = 0, // Data
@@ -369,12 +371,11 @@ pub const Thread = struct {
                         .seg_not_present = 0,
                         .useable = 1,
                     };
-                    newtls = @ptrToInt(&user_desc);
+                    break :blk @ptrToInt(&user_desc);
                 } else {
-                    newtls = os.linux.tls.copyTLS(mmap_addr + tls_start_offset);
+                    break :blk tp_value;
                 }
-                flags |= os.CLONE_SETTLS;
-            }
+            };
 
             const rc = os.linux.clone(
                 MainFuncs.linuxThreadMain,