diff --git a/lib/std/cache_hash.zig b/lib/std/cache_hash.zig
new file mode 100644
index 000000000..ea85cb6db
--- /dev/null
+++ b/lib/std/cache_hash.zig
@@ -0,0 +1,418 @@
+const Blake3 = @import("crypto.zig").Blake3;
+const fs = @import("fs.zig");
+const File = fs.File;
+const base64 = @import("base64.zig");
+const ArrayList = @import("array_list.zig").ArrayList;
+const debug = @import("debug.zig");
+const testing = @import("testing.zig");
+const mem = @import("mem.zig");
+const fmt = @import("fmt.zig");
+const Allocator = mem.Allocator;
+const Buffer = @import("buffer.zig").Buffer;
+const os = @import("os.zig");
+
+const base64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+const base64_pad_char = '=';
+const encoder = base64.Base64Encoder.init(base64_alphabet, base64_pad_char);
+const decoder = base64.Base64Decoder.init(base64_alphabet, base64_pad_char);
+const BIN_DIGEST_LEN = 32;
+/// Upper bound on the number of manifest bytes that `CacheHash.hit` reads.
+/// TODO: Figure out a good max value?
+const MANIFEST_FILE_SIZE_MAX = 16 * 1024;
+
+/// A file tracked by a `CacheHash`, plus what the manifest records about it.
+pub const CacheHashFile = struct {
+    /// Path of the file. Freed by `deinit` when non-null.
+    path: ?[]const u8,
+    stat: fs.File.Stat,
+    /// Written to and parsed back from the manifest; never compared.
+    file_handle: os.fd_t,
+    /// Blake3 digest of the file contents.
+    bin_digest: [BIN_DIGEST_LEN]u8,
+    /// Freed by `deinit` when non-null.
+    contents: ?[]const u8,
+
+    /// Frees `path` and `contents` with `alloc` and resets both to null.
+    pub fn deinit(self: *@This(), alloc: *Allocator) void {
+        if (self.path) |owned_slice| {
+            alloc.free(owned_slice);
+            self.path = null;
+        }
+        if (self.contents) |owned_slice| {
+            alloc.free(owned_slice);
+            self.contents = null;
+        }
+    }
+};
+
+/// Hashes byte strings and input files, and tracks the files in a manifest
+/// stored under `manifest_dir`.
+///
+/// Call order: `init`, any number of `cache_buf`/`cache_file`, `hit`, `final`
+/// after a miss, and `release` at the end.
+pub const CacheHash = struct {
+    alloc: *Allocator,
+    blake3: Blake3,
+    manifest_dir: []const u8,
+    manifest_file_path: ?[]const u8,
+    manifest_file: ?File,
+    manifest_dirty: bool,
+    force_check_manifest: bool,
+    files: ArrayList(CacheHashFile),
+    b64_digest: ArrayList(u8),
+
+    /// Creates an empty cache hash. `manifest_dir_path` is stored as-is, not
+    /// copied, so it has to stay valid for as long as the result is used.
+    pub fn init(alloc: *Allocator, manifest_dir_path: []const u8) !@This() {
+        return CacheHash{
+            .alloc = alloc,
+            .blake3 = Blake3.init(),
+            .manifest_dir = manifest_dir_path,
+            .manifest_file_path = null,
+            .manifest_file = null,
+            .manifest_dirty = false,
+            .force_check_manifest = false,
+            .files = ArrayList(CacheHashFile).init(alloc),
+            .b64_digest = ArrayList(u8).init(alloc),
+        };
+    }
+
+    /// Adds `val` followed by a zero byte to the hash.
+    /// Asserts that `hit` has not set up a manifest yet.
+    pub fn cache_buf(self: *@This(), val: []const u8) !void {
+        debug.assert(self.manifest_file_path == null);
+
+        var temp_buffer = try self.alloc.alloc(u8, val.len + 1);
+        defer self.alloc.free(temp_buffer);
+
+        mem.copy(u8, temp_buffer, val);
+        temp_buffer[val.len] = 0;
+
+        self.blake3.update(temp_buffer);
+    }
+
+    /// Registers `file_path` as an input file. The resolved path is added to
+    /// the hash right away; the file itself is only opened by `hit`.
+    pub fn cache_file(self: *@This(), file_path: []const u8) !void {
+        debug.assert(self.manifest_file_path == null);
+
+        const resolved_path = try fs.path.resolve(self.alloc, &[_][]const u8{file_path});
+        errdefer self.alloc.free(resolved_path);
+
+        try self.cache_buf(resolved_path);
+
+        // Append a fully formed entry: `release` inspects `path` and `contents`
+        // of every entry, so neither may be left undefined.
+        try self.files.append(CacheHashFile{
+            .path = resolved_path,
+            .stat = undefined,
+            .file_handle = undefined,
+            .bin_digest = undefined,
+            .contents = null,
+        });
+    }
+
+    /// Finishes the hash of the inputs and checks it against the manifest.
+    ///
+    /// Returns true and stores the base64 digest in `out_digest` on a hit.
+    /// Returns false on a miss; the digest is then available from `final`.
+    /// If an error is returned, no manifest is left open and `release` will
+    /// not rewrite it.
+    pub fn hit(self: *@This(), out_digest: *ArrayList(u8)) !bool {
+        debug.assert(self.manifest_file_path == null);
+
+        var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
+        self.blake3.final(&bin_digest);
+
+        const OUT_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
+        try self.b64_digest.resize(OUT_DIGEST_LEN);
+        encoder.encode(self.b64_digest.toSlice(), &bin_digest);
+
+        // Without input files there is nothing a manifest could invalidate.
+        if (self.files.toSlice().len == 0 and !self.force_check_manifest) {
+            try out_digest.resize(OUT_DIGEST_LEN);
+            mem.copy(u8, out_digest.toSlice(), self.b64_digest.toSlice());
+            return true;
+        }
+
+        self.blake3 = Blake3.init();
+        self.blake3.update(&bin_digest);
+
+        {
+            const manifest_file_path_slice = try fs.path.join(self.alloc, &[_][]const u8{ self.manifest_dir, self.b64_digest.toSlice() });
+            var path_buf = ArrayList(u8).fromOwnedSlice(self.alloc, manifest_file_path_slice);
+            defer path_buf.deinit();
+            try path_buf.appendSlice(".txt");
+
+            self.manifest_file_path = path_buf.toOwnedSlice();
+        }
+
+        const cwd = fs.cwd();
+
+        try cwd.makePath(self.manifest_dir);
+
+        // TODO: Open file with a file lock
+        self.manifest_file = try cwd.createFile(self.manifest_file_path.?, .{ .read = true, .truncate = false });
+        // A failure past this point leaves `files` half filled in, so make sure
+        // that state never reaches the manifest on disk.
+        errdefer self.discard_manifest();
+
+        const file_contents = try self.manifest_file.?.inStream().stream.readAllAlloc(self.alloc, MANIFEST_FILE_SIZE_MAX);
+        defer self.alloc.free(file_contents);
+
+        const input_file_count = self.files.len;
+        var any_file_changed = false;
+        var line_iter = mem.tokenize(file_contents, "\n");
+        var idx: usize = 0;
+        while (line_iter.next()) |line| {
+            defer idx += 1;
+
+            var cache_hash_file: *CacheHashFile = undefined;
+            if (idx < input_file_count) {
+                cache_hash_file = self.files.ptrAt(idx);
+            } else {
+                cache_hash_file = try self.files.addOne();
+                // Clear both owned slices so `release` never frees garbage.
+                cache_hash_file.path = null;
+                cache_hash_file.contents = null;
+            }
+
+            // Line format: "<file_handle> <mtime> <base64 digest> <path>"
+            var iter = mem.tokenize(line, " ");
+            const file_handle_str = iter.next() orelse return error.InvalidFormat;
+            const mtime_nsec_str = iter.next() orelse return error.InvalidFormat;
+            const digest_str = iter.next() orelse return error.InvalidFormat;
+            const file_path = iter.rest();
+
+            cache_hash_file.file_handle = fmt.parseInt(os.fd_t, file_handle_str, 10) catch return error.InvalidFormat;
+            cache_hash_file.stat.mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
+            // `decode` asserts that the destination has exactly the decoded
+            // size, so a damaged digest has to be rejected before calling it.
+            const decoded_len = decoder.calcSize(digest_str) catch return error.InvalidFormat;
+            if (decoded_len != BIN_DIGEST_LEN) {
+                return error.InvalidFormat;
+            }
+            decoder.decode(&cache_hash_file.bin_digest, digest_str) catch return error.InvalidFormat;
+
+            if (file_path.len == 0) {
+                return error.InvalidFormat;
+            }
+            if (cache_hash_file.path) |input_path| {
+                if (!mem.eql(u8, file_path, input_path)) {
+                    return error.InvalidFormat;
+                }
+            } else {
+                // Only entries that came from the manifest need a copy; input
+                // files already own their path.
+                cache_hash_file.path = try mem.dupe(self.alloc, u8, file_path);
+            }
+
+            const this_file = cwd.openFile(cache_hash_file.path.?, .{ .read = true }) catch {
+                return error.CacheUnavailable;
+            };
+            defer this_file.close();
+            cache_hash_file.stat = try this_file.stat();
+
+            // TODO: check mtime; until then every file is rehashed and the
+            // manifest is always rewritten.
+            self.manifest_dirty = true;
+
+            // TODO: check for problematic timestamp
+
+            var actual_digest: [BIN_DIGEST_LEN]u8 = undefined;
+            try hash_file(self.alloc, &actual_digest, &this_file);
+
+            if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
+                mem.copy(u8, &cache_hash_file.bin_digest, &actual_digest);
+                // keep going until we have the input file digests
+                any_file_changed = true;
+            }
+
+            if (!any_file_changed) {
+                self.blake3.update(&cache_hash_file.bin_digest);
+            }
+        }
+
+        if (any_file_changed) {
+            // cache miss
+            // keep the manifest file open (TODO: with rw lock)
+
+            // Entries past the inputs came from the stale manifest; free their
+            // paths before dropping them.
+            for (self.files.toSlice()[input_file_count..]) |*stale_file| {
+                stale_file.deinit(self.alloc);
+            }
+            try self.files.resize(input_file_count);
+
+            // reset the hash
+            self.blake3 = Blake3.init();
+            self.blake3.update(&bin_digest);
+            var file_idx: usize = 0;
+            while (file_idx < input_file_count) : (file_idx += 1) {
+                const input_file = self.files.ptrAt(file_idx);
+                if (file_idx < idx) {
+                    self.blake3.update(&input_file.bin_digest);
+                } else {
+                    // No manifest line covered this input, so it has no digest yet.
+                    self.populate_file_hash(input_file) catch return error.CacheUnavailable;
+                }
+            }
+            return false;
+        }
+
+        if (idx < input_file_count or idx == 0) {
+            self.manifest_dirty = true;
+            while (idx < input_file_count) : (idx += 1) {
+                self.populate_file_hash(self.files.ptrAt(idx)) catch return error.CacheUnavailable;
+            }
+            return false;
+        }
+
+        try self.final(out_digest);
+        return true;
+    }
+
+    /// Closes the manifest without writing it and forgets pending changes.
+    fn discard_manifest(self: *@This()) void {
+        if (self.manifest_file) |manifest_file| {
+            manifest_file.close();
+        }
+        self.manifest_file = null;
+        self.manifest_dirty = false;
+    }
+
+    /// Opens `cache_hash_file.path`, records its stat and digest in
+    /// `cache_hash_file` and adds the digest to the hash.
+    pub fn populate_file_hash(self: *@This(), cache_hash_file: *CacheHashFile) !void {
+        debug.assert(cache_hash_file.path != null);
+
+        const this_file = try fs.cwd().openFile(cache_hash_file.path.?, .{});
+        defer this_file.close();
+
+        cache_hash_file.stat = try this_file.stat();
+        // Gives the manifest column a defined value; nothing compares it.
+        cache_hash_file.file_handle = this_file.handle;
+
+        // TODO: check for problematic timestamp
+
+        try hash_file(self.alloc, &cache_hash_file.bin_digest, &this_file);
+        self.blake3.update(&cache_hash_file.bin_digest);
+    }
+
+    /// Finishes the hash and stores its base64 encoding in `out_digest`.
+    /// Asserts that `hit` has set up a manifest.
+    pub fn final(self: *@This(), out_digest: *ArrayList(u8)) !void {
+        debug.assert(self.manifest_file_path != null);
+
+        var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
+        self.blake3.final(&bin_digest);
+
+        const OUT_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
+        try out_digest.resize(OUT_DIGEST_LEN);
+        encoder.encode(out_digest.toSlice(), &bin_digest);
+    }
+
+    /// Replaces the manifest contents with one line per tracked file.
+    /// Returns `error.CacheUnavailable` when no manifest file is open.
+    pub fn write_manifest(self: *@This()) !void {
+        debug.assert(self.manifest_file_path != null);
+        const manifest_file = self.manifest_file orelse return error.CacheUnavailable;
+
+        const OUT_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
+        var encoded_digest = try Buffer.initSize(self.alloc, OUT_DIGEST_LEN);
+        defer encoded_digest.deinit();
+        var contents = try Buffer.init(self.alloc, "");
+        defer contents.deinit();
+
+        for (self.files.toSlice()) |file| {
+            encoder.encode(encoded_digest.toSlice(), &file.bin_digest);
+            try contents.print("{} {} {} {}\n", .{ file.file_handle, file.stat.mtime, encoded_digest.toSlice(), file.path });
+        }
+
+        try manifest_file.seekTo(0);
+        try manifest_file.writeAll(contents.toSlice());
+        // Cut off whatever remains of a previous, longer manifest.
+        try manifest_file.setEndPos(contents.toSlice().len);
+    }
+
+    /// Writes the manifest if it is dirty, closes it and frees everything
+    /// owned by `self`. Safe to call whether or not `hit` opened a manifest.
+    pub fn release(self: *@This()) void {
+        if (self.manifest_file) |manifest_file| {
+            if (self.manifest_dirty) {
+                self.write_manifest() catch |err| {
+                    debug.warn("Unable to write cache file '{}': {}\n", .{ self.manifest_file_path, err });
+                };
+            }
+            manifest_file.close();
+            self.manifest_file = null;
+        }
+        if (self.manifest_file_path) |owned_slice| {
+            self.alloc.free(owned_slice);
+            self.manifest_file_path = null;
+        }
+        for (self.files.toSlice()) |*file| {
+            file.deinit(self.alloc);
+        }
+        self.files.deinit();
+        self.b64_digest.deinit();
+    }
+};
+
+/// Writes the Blake3 digest of everything readable from `handle`, starting at
+/// its current position, to `bin_digest`. The file is read in fixed-size
+/// chunks, so `alloc` is not used; it is kept so callers need no change.
+fn hash_file(alloc: *Allocator, bin_digest: []u8, handle: *const File) !void {
+    var blake3 = Blake3.init();
+
+    var chunk: [4096]u8 = undefined;
+    while (true) {
+        const bytes_read = try handle.read(&chunk);
+        if (bytes_read == 0) break;
+        blake3.update(chunk[0..bytes_read]);
+    }
+
+    blake3.final(bin_digest);
+}
+
+test "cache hit when the input file is unchanged" {
+    const cwd = fs.cwd();
+
+    const temp_file = "test.txt";
+    const temp_manifest_dir = "temp_manifest_dir";
+
+    try cwd.writeFile(temp_file, "Hello, world!\n");
+
+    var digest1 = try ArrayList(u8).initCapacity(testing.allocator, 32);
+    defer digest1.deinit();
+    var digest2 = try ArrayList(u8).initCapacity(testing.allocator, 32);
+    defer digest2.deinit();
+
+    {
+        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
+        defer ch.release();
+
+        try ch.cache_buf("1234");
+        try ch.cache_file(temp_file);
+
+        // There should be nothing in the cache
+        testing.expect((try ch.hit(&digest1)) == false);
+
+        try ch.final(&digest1);
+    }
+    {
+        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
+        defer ch.release();
+
+        try ch.cache_buf("1234");
+        try ch.cache_file(temp_file);
+
+        // Cache hit! We just "built" the same file
+        testing.expect((try ch.hit(&digest2)) == true);
+    }
+
+    testing.expect(mem.eql(u8, digest1.toSlice(), digest2.toSlice()));
+
+    try cwd.deleteTree(temp_manifest_dir);
+    try cwd.deleteFile(temp_file);
+}
diff --git a/lib/std/std.zig b/lib/std/std.zig
index 376c20020..9920ca337 100644
--- a/lib/std/std.zig
+++ b/lib/std/std.zig
@@ -31,6 +31,7 @@ pub const base64 = @import("base64.zig");
 pub const build = @import("build.zig");
 pub const builtin = @import("builtin.zig");
 pub const c = @import("c.zig");
+pub const cache_hash = @import("cache_hash.zig");
 pub const coff = @import("coff.zig");
 pub const crypto = @import("crypto.zig");
 pub const cstr = @import("cstr.zig");