const std = @import("./index.zig");

error Utf8InvalidStartByte;

/// Classifies the first byte of a UTF-8 encoded codepoint.
/// Returns the total number of bytes (1-4) in the sequence that this byte starts.
/// Returns error.Utf8InvalidStartByte when the byte matches none of the four
/// start-byte bit patterns.
pub fn utf8ByteSequenceLength(first_byte: u8) -> %u3 {
    // 0xxxxxxx: the top bit is clear.
    if ((first_byte >> 7) == 0) return u3(1);
    // 110xxxxx
    if ((first_byte >> 5) == 0b110) return u3(2);
    // 1110xxxx
    if ((first_byte >> 4) == 0b1110) return u3(3);
    // 11110xxx
    if ((first_byte >> 3) == 0b11110) return u3(4);
    // Whatever is left is 10xxxxxx or 11111xxx.
    return error.Utf8InvalidStartByte;
}

error Utf8OverlongEncoding;
error Utf8ExpectedContinuation;
error Utf8EncodesSurrogateHalf;
error Utf8CodepointTooLarge;

/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
/// If you already know the length at comptime, you can call one of
/// utf8Decode2, utf8Decode3, utf8Decode4 directly instead of this function.
pub fn utf8Decode(bytes: []const u8) -> %u32 {
    // Dispatch on the slice length; a single byte is its own codepoint value.
    if (bytes.len == 1) return u32(bytes[0]);
    if (bytes.len == 2) return utf8Decode2(bytes);
    if (bytes.len == 3) return utf8Decode3(bytes);
    if (bytes.len == 4) return utf8Decode4(bytes);
    // Any other length violates the caller contract.
    unreachable;
}

/// Decodes a 2-byte UTF-8 sequence.
/// Asserts that bytes.len == 2 and that bytes[0] has the form 110xxxxx.
/// Returns error.Utf8ExpectedContinuation if bytes[1] is not of the form 10xxxxxx,
/// and error.Utf8OverlongEncoding if the decoded value is below 0x80.
pub fn utf8Decode2(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 2);
    std.debug.assert(bytes[0] & 0b11100000 == 0b11000000);

    const lead = bytes[0];
    const trail = bytes[1];
    if (trail & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;

    // 5 payload bits from the lead byte, 6 from the continuation byte.
    const codepoint = (u32(lead & 0b00011111) << 6) | u32(trail & 0b00111111);

    if (codepoint < 0x80) return error.Utf8OverlongEncoding;
    return codepoint;
}

/// Decodes a 3-byte UTF-8 sequence.
/// Asserts that bytes.len == 3 and that bytes[0] has the form 1110xxxx.
/// Returns error.Utf8ExpectedContinuation if bytes[1] or bytes[2] is not of the
/// form 10xxxxxx, error.Utf8OverlongEncoding if the decoded value is below 0x800,
/// and error.Utf8EncodesSurrogateHalf if it lies in 0xd800...0xdfff.
pub fn utf8Decode3(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 3);
    std.debug.assert(bytes[0] & 0b11110000 == 0b11100000);

    const lead = bytes[0];
    const trail1 = bytes[1];
    const trail2 = bytes[2];
    // Continuation bytes are validated left to right.
    if (trail1 & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    if (trail2 & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;

    // 4 payload bits from the lead byte, 6 from each continuation byte.
    const codepoint = (u32(lead & 0b00001111) << 12) |
        (u32(trail1 & 0b00111111) << 6) |
        u32(trail2 & 0b00111111);

    if (codepoint < 0x800) return error.Utf8OverlongEncoding;
    if (codepoint >= 0xd800 and codepoint <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
    return codepoint;
}

/// Decodes a 4-byte UTF-8 sequence.
/// Asserts that bytes.len == 4 and that bytes[0] has the form 11110xxx.
/// Returns error.Utf8ExpectedContinuation if any of bytes[1..4] is not of the
/// form 10xxxxxx, error.Utf8OverlongEncoding if the decoded value is below 0x10000,
/// and error.Utf8CodepointTooLarge if it exceeds 0x10FFFF.
pub fn utf8Decode4(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 4);
    std.debug.assert(bytes[0] & 0b11111000 == 0b11110000);

    const lead = bytes[0];
    const trail1 = bytes[1];
    const trail2 = bytes[2];
    const trail3 = bytes[3];
    // Continuation bytes are validated left to right.
    if (trail1 & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    if (trail2 & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    if (trail3 & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;

    // 3 payload bits from the lead byte, 6 from each continuation byte.
    const codepoint = (u32(lead & 0b00000111) << 18) |
        (u32(trail1 & 0b00111111) << 12) |
        (u32(trail2 & 0b00111111) << 6) |
        u32(trail3 & 0b00111111);

    if (codepoint < 0x10000) return error.Utf8OverlongEncoding;
    if (codepoint > 0x10FFFF) return error.Utf8CodepointTooLarge;
    return codepoint;
}

error UnexpectedEof;

test "valid utf8" {
    // 1 byte sequences
    testValid("\x00", 0x0);
    testValid("\x20", 0x20);
    testValid("\x7f", 0x7f);
    // 2 byte sequences
    testValid("\xc2\x80", 0x80);
    testValid("\xdf\xbf", 0x7ff);
    // 3 byte sequences
    testValid("\xe0\xa0\x80", 0x800);
    testValid("\xe1\x80\x80", 0x1000);
    testValid("\xef\xbf\xbf", 0xffff);
    // 4 byte sequences
    testValid("\xf0\x90\x80\x80", 0x10000);
    testValid("\xf1\x80\x80\x80", 0x40000);
    testValid("\xf3\xbf\xbf\xbf", 0xfffff);
    testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
}

test "invalid utf8 continuation bytes" {
    // unexpected continuation
    testError("\x80", error.Utf8InvalidStartByte);
    testError("\xbf", error.Utf8InvalidStartByte);
    // too many leading 1's
    testError("\xf8", error.Utf8InvalidStartByte);
    testError("\xff", error.Utf8InvalidStartByte);
    // expected continuation for 2 byte sequences
    testError("\xc2", error.UnexpectedEof);
    testError("\xc2\x00", error.Utf8ExpectedContinuation);
    testError("\xc2\xc0", error.Utf8ExpectedContinuation);
    // expected continuation for 3 byte sequences
    testError("\xe0", error.UnexpectedEof);
    testError("\xe0\x00", error.UnexpectedEof);
    testError("\xe0\xc0", error.UnexpectedEof);
    testError("\xe0\xa0", error.UnexpectedEof);
    testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
    testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
    // expected continuation for 4 byte sequences
    testError("\xf0", error.UnexpectedEof);
    testError("\xf0\x00", error.UnexpectedEof);
    testError("\xf0\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x00", error.UnexpectedEof);
    testError("\xf0\x90\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
    testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
}

test "overlong utf8 codepoint" {
    testError("\xc0\x80", error.Utf8OverlongEncoding);
    testError("\xc1\xbf", error.Utf8OverlongEncoding);
    testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
    testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);
    testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
    testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
}

test "misc invalid utf8" {
    // codepoint out of bounds
    testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
    testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
    // surrogate halves
    testValid("\xed\x9f\xbf", 0xd7ff);
    testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
    testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
    testValid("\xee\x80\x80", 0xe000);
}

// Asserts that decoding `bytes` fails with exactly `expected_err`.
fn testError(bytes: []const u8, expected_err: error) {
    if (testDecode(bytes)) |_| {
        // Decoding succeeded even though an error was expected.
        unreachable;
    } else |actual_err| {
        std.debug.assert(expected_err == actual_err);
    }
}

// Asserts that decoding `bytes` succeeds and yields `expected_codepoint`.
fn testValid(bytes: []const u8, expected_codepoint: u32) {
    const actual_codepoint = testDecode(bytes) catch unreachable;
    std.debug.assert(actual_codepoint == expected_codepoint);
}

// Decodes the single codepoint that makes up all of `bytes`.
// Returns error.UnexpectedEof when the slice is shorter than its first byte announces;
// asserts that it is not longer.
fn testDecode(bytes: []const u8) -> %u32 {
    const expected_len = try utf8ByteSequenceLength(bytes[0]);
    if (expected_len > bytes.len) return error.UnexpectedEof;
    std.debug.assert(expected_len == bytes.len);
    return utf8Decode(bytes);
}