diff --git a/build.zig b/build.zig
index 3f7f1a903..a6a2d8737 100644
--- a/build.zig
+++ b/build.zig
@@ -128,6 +128,7 @@ pub fn build(b: *Builder) !void {
                 "README.md",
                 ".z.0",
                 ".z.9",
+                ".gz",
                 "rfc1951.txt",
             },
         });
diff --git a/lib/std/compress.zig b/lib/std/compress.zig
index 5518f807d..95f496021 100644
--- a/lib/std/compress.zig
+++ b/lib/std/compress.zig
@@ -6,8 +6,10 @@
 const std = @import("std.zig");
 
 pub const deflate = @import("compress/deflate.zig");
+pub const gzip = @import("compress/gzip.zig");
 pub const zlib = @import("compress/zlib.zig");
 
 test "" {
+    _ = gzip;
     _ = zlib;
 }
diff --git a/lib/std/compress/gzip.zig b/lib/std/compress/gzip.zig
new file mode 100644
index 000000000..aad173139
--- /dev/null
+++ b/lib/std/compress/gzip.zig
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2015-2020 Zig Contributors
+// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
+// The MIT license requires this copyright notice to be included in all copies
+// and substantial portions of the software.
+//
+// Decompressor for GZIP data streams (RFC1952)
+
+const std = @import("std");
+const io = std.io;
+const fs = std.fs;
+const testing = std.testing;
+const mem = std.mem;
+const deflate = std.compress.deflate;
+
+// Flags for the FLG field in the header
+const FTEXT = 1 << 0;
+const FHCRC = 1 << 1;
+const FEXTRA = 1 << 2;
+const FNAME = 1 << 3;
+const FCOMMENT = 1 << 4;
+
+/// Returns a decompressor type that reads one gzip member (RFC1952) from a
+/// reader of type `ReaderType` and exposes the inflated bytes as an io.Reader.
+pub fn GzipStream(comptime ReaderType: type) type {
+    return struct {
+        const Self = @This();
+
+        pub const Error = ReaderType.Error ||
+            deflate.InflateStream(ReaderType).Error ||
+            error{ CorruptedData, WrongChecksum };
+        pub const Reader = io.Reader(*Self, Error, read);
+
+        allocator: *mem.Allocator,
+        inflater: deflate.InflateStream(ReaderType),
+        in_reader: ReaderType,
+        hasher: std.hash.Crc32,
+        window_slice: []u8,
+        // Count of decompressed bytes returned so far. It wraps on overflow;
+        // only its low 32 bits are compared against the ISIZE trailer field.
+        read_amt: usize,
+        // Set once the CRC32/ISIZE trailer has been read and verified.
+        trailer_verified: bool = false,
+
+        // Metadata taken from the gzip header. The strings are owned by the
+        // stream and freed by `deinit`.
+        info: struct {
+            filename: ?[]const u8,
+            comment: ?[]const u8,
+            modification_time: u32,
+        },
+
+        /// Parses the gzip header from `source` and sets up the inflater.
+        /// Allocates the DEFLATE window and copies of the optional file name
+        /// and comment with `allocator`; release them with `deinit`.
+        fn init(allocator: *mem.Allocator, source: ReaderType) !Self {
+            // gzip header format is specified in RFC1952
+            const header = try source.readBytesNoEof(10);
+
+            // Check the ID1/ID2 fields
+            if (header[0] != 0x1f or header[1] != 0x8b)
+                return error.BadHeader;
+
+            const CM = header[2];
+            // The CM field must be 8 to indicate the use of DEFLATE
+            if (CM != 8) return error.InvalidCompression;
+            // Flags
+            const FLG = header[3];
+            // Modification time, as a Unix timestamp.
+            // If zero there's no timestamp available.
+            const MTIME = mem.readIntLittle(u32, header[4..8]);
+            // header[8] is XFL (extra flags) and header[9] is OS (operating
+            // system where the compression took place); neither is needed to
+            // decompress the stream.
+
+            if (FLG & FEXTRA != 0) {
+                // Skip the extra data, we could read and expose it to the user
+                // if somebody needs it.
+                const len = try source.readIntLittle(u16);
+                try source.skipBytes(len, .{});
+            }
+
+            // NOTE(review): the name and comment lengths are unbounded here, so
+            // an untrusted stream can request an arbitrarily large allocation.
+            var filename: ?[]const u8 = null;
+            if (FLG & FNAME != 0) {
+                filename = try source.readUntilDelimiterAlloc(
+                    allocator,
+                    0,
+                    std.math.maxInt(usize),
+                );
+            }
+            errdefer if (filename) |p| allocator.free(p);
+
+            var comment: ?[]const u8 = null;
+            if (FLG & FCOMMENT != 0) {
+                comment = try source.readUntilDelimiterAlloc(
+                    allocator,
+                    0,
+                    std.math.maxInt(usize),
+                );
+            }
+            errdefer if (comment) |p| allocator.free(p);
+
+            if (FLG & FHCRC != 0) {
+                // TODO: Evaluate and check the header checksum. The stdlib has
+                // no CRC16 yet :(
+                _ = try source.readIntLittle(u16);
+            }
+
+            // The RFC doesn't say anything about the DEFLATE window size to be
+            // used, default to 32K.
+            const window_slice = try allocator.alloc(u8, 32 * 1024);
+
+            return Self{
+                .allocator = allocator,
+                .inflater = deflate.inflateStream(source, window_slice),
+                .in_reader = source,
+                .hasher = std.hash.Crc32.init(),
+                .window_slice = window_slice,
+                .info = .{
+                    .filename = filename,
+                    .comment = comment,
+                    .modification_time = MTIME,
+                },
+                .read_amt = 0,
+            };
+        }
+
+        /// Frees the DEFLATE window and the header strings owned by the stream.
+        pub fn deinit(self: *Self) void {
+            self.allocator.free(self.window_slice);
+            if (self.info.filename) |filename|
+                self.allocator.free(filename);
+            if (self.info.comment) |comment|
+                self.allocator.free(comment);
+        }
+
+        /// Implements the io.Reader interface. Decompresses into `buffer` and
+        /// returns the number of bytes written. Returns 0 at the end of the
+        /// stream, once the CRC32 and ISIZE trailer fields have been checked.
+        pub fn read(self: *Self, buffer: []u8) Error!usize {
+            if (buffer.len == 0)
+                return 0;
+
+            // The trailer is consumed only once: later calls keep reporting
+            // the end of the stream instead of reading past it.
+            if (self.trailer_verified)
+                return 0;
+
+            // Read from the compressed stream and update the computed checksum
+            const r = try self.inflater.read(buffer);
+            if (r != 0) {
+                self.hasher.update(buffer[0..r]);
+                // ISIZE is defined modulo 2^32, so let the counter wrap instead
+                // of trapping on overflow where usize is 32 bits wide.
+                self.read_amt +%= r;
+                return r;
+            }
+
+            // We've reached the end of stream, check if the checksum matches
+            const hash = try self.in_reader.readIntLittle(u32);
+            if (hash != self.hasher.final())
+                return error.WrongChecksum;
+
+            // The ISIZE field is the size of the uncompressed input modulo 2^32
+            const input_size = try self.in_reader.readIntLittle(u32);
+            if (self.read_amt & 0xffffffff != input_size)
+                return error.CorruptedData;
+
+            self.trailer_verified = true;
+            return 0;
+        }
+
+        /// Returns an io.Reader that yields the decompressed data.
+        pub fn reader(self: *Self) Reader {
+            return .{ .context = self };
+        }
+    };
+}
+
+/// Creates a GzipStream over `reader`, parsing the gzip header immediately.
+/// The caller must call `deinit` on the result to free memory from `allocator`.
+pub fn gzipStream(allocator: *mem.Allocator, reader: anytype) !GzipStream(@TypeOf(reader)) {
+    return GzipStream(@TypeOf(reader)).init(allocator, reader);
+}
+
+// Decompresses `data` and checks the SHA256 of the output against the
+// hex-encoded digest `expected`.
+fn testReader(data: []const u8, comptime expected: []const u8) !void {
+    var in_stream = io.fixedBufferStream(data);
+
+    var gzip_stream = try gzipStream(testing.allocator, in_stream.reader());
+    defer gzip_stream.deinit();
+
+    // Read and decompress the whole file
+    const buf = try gzip_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize));
+    defer testing.allocator.free(buf);
+    // Calculate its SHA256 hash and check it against the reference
+    var hash: [32]u8 = undefined;
+    std.crypto.hash.sha2.Sha256.hash(buf, hash[0..], .{});
+
+    assertEqual(expected, &hash);
+}
+
+// Assert `expected` == `input` where `input` is a bytestring.
+pub fn assertEqual(comptime expected: []const u8, input: []const u8) void {
+    var expected_bytes: [expected.len / 2]u8 = undefined;
+    for (expected_bytes) |*r, i| {
+        r.* = std.fmt.parseInt(u8, expected[2 * i .. 2 * i + 2], 16) catch unreachable;
+    }
+
+    testing.expectEqualSlices(u8, &expected_bytes, input);
+}
+
+// All the test cases are obtained by compressing the RFC1952 text
+//
+// https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes
+// SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67
+test "compressed data" {
+    try testReader(
+        @embedFile("rfc1952.txt.gz"),
+        "164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67",
+    );
+}
+
+test "sanity checks" {
+    // Truncated header
+    testing.expectError(
+        error.EndOfStream,
+        testReader(&[_]u8{ 0x1f, 0x8B }, ""),
+    );
+    // Wrong CM
+    testing.expectError(
+        error.InvalidCompression,
+        testReader(&[_]u8{
+            0x1f, 0x8b, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x03,
+        }, ""),
+    );
+    // Wrong checksum
+    testing.expectError(
+        error.WrongChecksum,
+        testReader(&[_]u8{
+            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x00,
+        }, ""),
+    );
+    // Truncated checksum
+    testing.expectError(
+        error.EndOfStream,
+        testReader(&[_]u8{
+            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00,
+        }, ""),
+    );
+    // Wrong initial size
+    testing.expectError(
+        error.CorruptedData,
+        testReader(&[_]u8{
+            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x01,
+        }, ""),
+    );
+    // Truncated initial size field
+    testing.expectError(
+        error.EndOfStream,
+        testReader(&[_]u8{
+            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00,
+        }, ""),
+    );
+}
+
+test "read after end of stream" {
+    // Empty DEFLATE stream followed by a valid CRC32 and ISIZE trailer
+    const data: []const u8 = &[_]u8{
+        0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00,
+    };
+    var in_stream = io.fixedBufferStream(data);
+
+    var gzip_stream = try gzipStream(testing.allocator, in_stream.reader());
+    defer gzip_stream.deinit();
+
+    // Every read past the end must report 0 without touching the source again
+    var buf: [16]u8 = undefined;
+    testing.expectEqual(@as(usize, 0), try gzip_stream.read(&buf));
+    testing.expectEqual(@as(usize, 0), try gzip_stream.read(&buf));
+}
diff --git a/lib/std/compress/rfc1952.txt.gz b/lib/std/compress/rfc1952.txt.gz
new file mode 100644
index 000000000..be43b90a7
Binary files /dev/null and b/lib/std/compress/rfc1952.txt.gz differ