// zig/std/unicode.zig

const std = @import("./index.zig");

/// Returned by utf8ByteSequenceLength when the byte matches none of the
/// UTF-8 start byte patterns.
error Utf8InvalidStartByte;

/// Determines how many bytes a UTF-8 encoded codepoint occupies, judging
/// only by its first byte. The result is a number in the range 1-4.
/// Returns error.Utf8InvalidStartByte for bytes of the form 10xxxxxx
/// and for bytes of the form 11111xxx.
pub fn utf8ByteSequenceLength(first_byte: u8) %u3 {
    switch (first_byte) {
        // 0xxxxxxx
        0x00 ... 0x7f => return u3(1),
        // 110xxxxx
        0xc0 ... 0xdf => return u3(2),
        // 1110xxxx
        0xe0 ... 0xef => return u3(3),
        // 11110xxx
        0xf0 ... 0xf7 => return u3(4),
        // 10xxxxxx and 11111xxx
        else => return error.Utf8InvalidStartByte,
    }
}

/// Returned when the decoded value is small enough that a shorter
/// sequence could have encoded it.
error Utf8OverlongEncoding;
/// Returned when a byte after the first does not have the form 10xxxxxx.
error Utf8ExpectedContinuation;
/// Returned when a 3-byte sequence decodes to a value in 0xd800-0xdfff.
error Utf8EncodesSurrogateHalf;
/// Returned when a 4-byte sequence decodes to a value above 0x10FFFF.
error Utf8CodepointTooLarge;

/// Decodes a single UTF-8 encoded codepoint from `bytes`.
/// The caller must pass exactly one sequence: bytes.len has to equal
/// utf8ByteSequenceLength(bytes[0]) catch unreachable. Any length outside
/// 1-4 is treated as unreachable.
/// A 1-byte sequence is returned as-is; longer sequences are forwarded to
/// utf8Decode2, utf8Decode3 or utf8Decode4, which may also be called
/// directly when the length is already known at comptime.
pub fn utf8Decode(bytes: []const u8) %u32 {
    const len = bytes.len;
    if (len == 1) return u32(bytes[0]);
    if (len == 2) return utf8Decode2(bytes);
    if (len == 3) return utf8Decode3(bytes);
    if (len == 4) return utf8Decode4(bytes);
    unreachable;
}
/// Decodes a 2-byte UTF-8 sequence (110xxxxx 10xxxxxx).
/// Asserts that bytes.len is 2 and that bytes[0] has the 110xxxxx form.
/// Returns error.Utf8ExpectedContinuation when the second byte is not of
/// the form 10xxxxxx, and error.Utf8OverlongEncoding when the decoded value
/// is below 0x80.
pub fn utf8Decode2(bytes: []const u8) %u32 {
    std.debug.assert(bytes.len == 2);
    std.debug.assert(bytes[0] & 0b11100000 == 0b11000000);

    const lead = bytes[0];
    const trail = bytes[1];
    if (trail & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;

    // Five payload bits from the start byte, six from the continuation byte.
    const codepoint = (u32(lead & 0x1f) << 6) | u32(trail & 0x3f);
    if (codepoint < 0x80) return error.Utf8OverlongEncoding;

    return codepoint;
}
/// Decodes a 3-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx).
/// Asserts that bytes.len is 3 and that bytes[0] has the 1110xxxx form.
/// Returns error.Utf8ExpectedContinuation when a trailing byte is not of
/// the form 10xxxxxx, error.Utf8OverlongEncoding when the decoded value is
/// below 0x800, and error.Utf8EncodesSurrogateHalf when it lies in
/// 0xd800-0xdfff.
pub fn utf8Decode3(bytes: []const u8) %u32 {
    std.debug.assert(bytes.len == 3);
    std.debug.assert(bytes[0] & 0b11110000 == 0b11100000);

    // The start byte carries the top four payload bits.
    var codepoint: u32 = bytes[0] & 0x0f;

    // Every continuation byte appends six more bits.
    for (bytes[1..]) |continuation| {
        if (continuation & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;
        codepoint = (codepoint << 6) | (continuation & 0x3f);
    }

    if (codepoint < 0x800) return error.Utf8OverlongEncoding;
    if (codepoint >= 0xd800 and codepoint <= 0xdfff) return error.Utf8EncodesSurrogateHalf;

    return codepoint;
}
/// Decodes a 4-byte UTF-8 sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
/// Asserts that bytes.len is 4 and that bytes[0] has the 11110xxx form.
/// Returns error.Utf8ExpectedContinuation when a trailing byte is not of
/// the form 10xxxxxx, error.Utf8OverlongEncoding when the decoded value is
/// below 0x10000, and error.Utf8CodepointTooLarge when it exceeds 0x10FFFF.
pub fn utf8Decode4(bytes: []const u8) %u32 {
    std.debug.assert(bytes.len == 4);
    std.debug.assert(bytes[0] & 0b11111000 == 0b11110000);

    // The start byte carries the top three payload bits.
    var codepoint: u32 = bytes[0] & 0x07;

    // Every continuation byte appends six more bits.
    for (bytes[1..]) |continuation| {
        if (continuation & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;
        codepoint = (codepoint << 6) | (continuation & 0x3f);
    }

    if (codepoint < 0x10000) return error.Utf8OverlongEncoding;
    if (codepoint > 0x10FFFF) return error.Utf8CodepointTooLarge;

    return codepoint;
}

/// Returned by the testDecode helper when the input holds fewer bytes than
/// the length announced by its first byte.
error UnexpectedEof;
// Decodes well-formed sequences of every length, including the smallest
// and the largest codepoint that each length is allowed to carry.
test "valid utf8" {
    // 1-byte sequences: 0x0 through 0x7f
    testValid("\x00", 0x0);
    testValid("\x20", 0x20);
    testValid("\x7f", 0x7f);
    // 2-byte sequences: 0x80 through 0x7ff
    testValid("\xc2\x80", 0x80);
    testValid("\xdf\xbf", 0x7ff);
    // 3-byte sequences: 0x800 through 0xffff
    testValid("\xe0\xa0\x80", 0x800);
    testValid("\xe1\x80\x80", 0x1000);
    testValid("\xef\xbf\xbf", 0xffff);
    // 4-byte sequences: 0x10000 through 0x10ffff
    testValid("\xf0\x90\x80\x80", 0x10000);
    testValid("\xf1\x80\x80\x80", 0x40000);
    testValid("\xf3\xbf\xbf\xbf", 0xfffff);
    testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
}

// Covers malformed input: bytes that cannot start a sequence, truncated
// sequences, and trailing bytes that are not of the form 10xxxxxx.
// Truncation is detected by testDecode before any continuation byte is
// examined, so short inputs report UnexpectedEof regardless of their content.
test "invalid utf8 continuation bytes" {
    // unexpected continuation
    testError("\x80", error.Utf8InvalidStartByte);
    testError("\xbf", error.Utf8InvalidStartByte);
    // too many leading 1's
    testError("\xf8", error.Utf8InvalidStartByte);
    testError("\xff", error.Utf8InvalidStartByte);
    // expected continuation for 2 byte sequences
    testError("\xc2", error.UnexpectedEof);
    testError("\xc2\x00", error.Utf8ExpectedContinuation);
    testError("\xc2\xc0", error.Utf8ExpectedContinuation);
    // expected continuation for 3 byte sequences
    testError("\xe0", error.UnexpectedEof);
    testError("\xe0\x00", error.UnexpectedEof);
    testError("\xe0\xc0", error.UnexpectedEof);
    testError("\xe0\xa0", error.UnexpectedEof);
    testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
    testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
    // expected continuation for 4 byte sequences
    testError("\xf0", error.UnexpectedEof);
    testError("\xf0\x00", error.UnexpectedEof);
    testError("\xf0\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x00", error.UnexpectedEof);
    testError("\xf0\x90\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
    testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
}

// Each pair below encodes the smallest and the largest value that the
// given sequence length rejects as overlong.
test "overlong utf8 codepoint" {
    // 2-byte sequences carrying 0x0 and 0x7f
    testError("\xc0\x80", error.Utf8OverlongEncoding);
    testError("\xc1\xbf", error.Utf8OverlongEncoding);
    // 3-byte sequences carrying 0x0 and 0x7ff
    testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
    testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);
    // 4-byte sequences carrying 0x0 and 0xffff
    testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
    testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
}

// Rejects values that the bit layout can express but the decoders refuse:
// anything above 0x10ffff and the range 0xd800-0xdfff.
test "misc invalid utf8" {
    // codepoint out of bounds: 0x110000 (first value past the limit)
    // and 0x1fffff (all payload bits of a 4-byte sequence set)
    testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
    testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
    // surrogate halves: 0xd7ff and 0xe000 sit just outside the rejected
    // range and must still decode
    testValid("\xed\x9f\xbf", 0xd7ff);
    testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
    testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
    testValid("\xee\x80\x80", 0xe000);
}

/// Test helper: asserts that decoding `bytes` with testDecode fails with
/// exactly `expected_err`. A successful decode is treated as unreachable.
fn testError(bytes: []const u8, expected_err: error) void {
    const actual_err = if (testDecode(bytes)) |_| unreachable else |err| err;
    std.debug.assert(actual_err == expected_err);
}

/// Test helper: asserts that decoding `bytes` with testDecode succeeds and
/// yields `expected_codepoint`. A decode error is treated as unreachable.
fn testValid(bytes: []const u8, expected_codepoint: u32) void {
    const actual_codepoint = testDecode(bytes) catch unreachable;
    std.debug.assert(actual_codepoint == expected_codepoint);
}

/// Test helper: decodes `bytes` as exactly one UTF-8 encoded codepoint.
/// Returns error.UnexpectedEof when `bytes` is empty or shorter than the
/// length announced by its first byte, propagates
/// error.Utf8InvalidStartByte from utf8ByteSequenceLength, and otherwise
/// returns whatever utf8Decode produces.
/// Asserts that `bytes` holds no data beyond the single sequence.
fn testDecode(bytes: []const u8) %u32 {
    // An empty slice has no start byte to inspect; report it as truncated
    // input instead of indexing out of bounds.
    if (bytes.len == 0) return error.UnexpectedEof;
    const length = try utf8ByteSequenceLength(bytes[0]);
    if (bytes.len < length) return error.UnexpectedEof;
    std.debug.assert(bytes.len == length);
    return utf8Decode(bytes);
}
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`const std = @import("./index.zig");`

			`error Utf8InvalidStartByte;`

			`/// Given the first byte of a UTF-8 codepoint,`
			`/// returns a number 1-4 indicating the total length of the codepoint in bytes.`
			`/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.`
syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`pub fn utf8ByteSequenceLength(first_byte: u8) %u3 {`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`if (first_byte < 0b10000000) return u3(1);`
			`if (first_byte & 0b11100000 == 0b11000000) return u3(2);`
			`if (first_byte & 0b11110000 == 0b11100000) return u3(3);`
			`if (first_byte & 0b11111000 == 0b11110000) return u3(4);`
			`return error.Utf8InvalidStartByte;`
			`}`

			`error Utf8OverlongEncoding;`
			`error Utf8ExpectedContinuation;`
			`error Utf8EncodesSurrogateHalf;`
			`error Utf8CodepointTooLarge;`

			`/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.`
remove %% prefix operator See #632 closes #545 closes #510 this makes #651 higher priority 2018-01-08 21:07:01 -08:00			`/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`/// If you already know the length at comptime, you can call one of`
			`/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.`
syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`pub fn utf8Decode(bytes: []const u8) %u32 {`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`return switch (bytes.len) {`
			`1 => u32(bytes[0]),`
			`2 => utf8Decode2(bytes),`
			`3 => utf8Decode3(bytes),`
			`4 => utf8Decode4(bytes),`
			`else => unreachable,`
			`};`
			`}`
syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`pub fn utf8Decode2(bytes: []const u8) %u32 {`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`std.debug.assert(bytes.len == 2);`
			`std.debug.assert(bytes[0] & 0b11100000 == 0b11000000);`
			`var value: u32 = bytes[0] & 0b00011111;`

			`if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;`
			`value <<= 6;`
			`value \|= bytes[1] & 0b00111111;`

			`if (value < 0x80) return error.Utf8OverlongEncoding;`

			`return value;`
			`}`
syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`pub fn utf8Decode3(bytes: []const u8) %u32 {`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`std.debug.assert(bytes.len == 3);`
			`std.debug.assert(bytes[0] & 0b11110000 == 0b11100000);`
			`var value: u32 = bytes[0] & 0b00001111;`

			`if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;`
			`value <<= 6;`
			`value \|= bytes[1] & 0b00111111;`

			`if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;`
			`value <<= 6;`
			`value \|= bytes[2] & 0b00111111;`

			`if (value < 0x800) return error.Utf8OverlongEncoding;`
			`if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;`

			`return value;`
			`}`
syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`pub fn utf8Decode4(bytes: []const u8) %u32 {`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`std.debug.assert(bytes.len == 4);`
			`std.debug.assert(bytes[0] & 0b11111000 == 0b11110000);`
			`var value: u32 = bytes[0] & 0b00000111;`

			`if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;`
			`value <<= 6;`
			`value \|= bytes[1] & 0b00111111;`

			`if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;`
			`value <<= 6;`
			`value \|= bytes[2] & 0b00111111;`

			`if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;`
			`value <<= 6;`
			`value \|= bytes[3] & 0b00111111;`

			`if (value < 0x10000) return error.Utf8OverlongEncoding;`
			`if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;`

			`return value;`
			`}`

			`error UnexpectedEof;`
			`test "valid utf8" {`
			`testValid("\x00", 0x0);`
			`testValid("\x20", 0x20);`
			`testValid("\x7f", 0x7f);`
			`testValid("\xc2\x80", 0x80);`
			`testValid("\xdf\xbf", 0x7ff);`
			`testValid("\xe0\xa0\x80", 0x800);`
			`testValid("\xe1\x80\x80", 0x1000);`
			`testValid("\xef\xbf\xbf", 0xffff);`
			`testValid("\xf0\x90\x80\x80", 0x10000);`
			`testValid("\xf1\x80\x80\x80", 0x40000);`
			`testValid("\xf3\xbf\xbf\xbf", 0xfffff);`
			`testValid("\xf4\x8f\xbf\xbf", 0x10ffff);`
			`}`

			`test "invalid utf8 continuation bytes" {`
			`// unexpected continuation`
			`testError("\x80", error.Utf8InvalidStartByte);`
			`testError("\xbf", error.Utf8InvalidStartByte);`
			`// too many leading 1's`
			`testError("\xf8", error.Utf8InvalidStartByte);`
			`testError("\xff", error.Utf8InvalidStartByte);`
			`// expected continuation for 2 byte sequences`
			`testError("\xc2", error.UnexpectedEof);`
			`testError("\xc2\x00", error.Utf8ExpectedContinuation);`
			`testError("\xc2\xc0", error.Utf8ExpectedContinuation);`
			`// expected continuation for 3 byte sequences`
			`testError("\xe0", error.UnexpectedEof);`
			`testError("\xe0\x00", error.UnexpectedEof);`
			`testError("\xe0\xc0", error.UnexpectedEof);`
			`testError("\xe0\xa0", error.UnexpectedEof);`
			`testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);`
			`testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);`
			`// expected continuation for 4 byte sequences`
			`testError("\xf0", error.UnexpectedEof);`
			`testError("\xf0\x00", error.UnexpectedEof);`
			`testError("\xf0\xc0", error.UnexpectedEof);`
			`testError("\xf0\x90\x00", error.UnexpectedEof);`
			`testError("\xf0\x90\xc0", error.UnexpectedEof);`
			`testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);`
			`testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);`
			`}`

			`test "overlong utf8 codepoint" {`
			`testError("\xc0\x80", error.Utf8OverlongEncoding);`
			`testError("\xc1\xbf", error.Utf8OverlongEncoding);`
			`testError("\xe0\x80\x80", error.Utf8OverlongEncoding);`
			`testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);`
			`testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);`
			`testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);`
			`}`

			`test "misc invalid utf8" {`
			`// codepoint out of bounds`
			`testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);`
			`testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);`
			`// surrogate halves`
			`testValid("\xed\x9f\xbf", 0xd7ff);`
			`testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);`
			`testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);`
			`testValid("\xee\x80\x80", 0xe000);`
			`}`

syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`fn testError(bytes: []const u8, expected_err: error) void {`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`if (testDecode(bytes)) \|_\| {`
			`unreachable;`
			`} else \|err\| {`
			`std.debug.assert(err == expected_err);`
			`}`
			`}`

syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`fn testValid(bytes: []const u8, expected_codepoint: u32) void {`
remove %% prefix operator See #632 closes #545 closes #510 this makes #651 higher priority 2018-01-08 21:07:01 -08:00			`std.debug.assert((testDecode(bytes) catch unreachable) == expected_codepoint);`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`}`

syntax: functions require return type. remove `->` The purpose of this is: * Only one way to do things * Changing a function with void return type to return a possible error becomes a 1 character change, subtly encouraging people to use errors. See #632 Here are some imperfect sed commands for performing this update: remove arrow: ``` sed -i 's/\(\bfn\b.\)-> /\1/g' $(find . -name ".zig") ``` add void: ``` sed -i 's/\(\bfn\b.\))\s{/\1) void {/g' $(find ../ -name "*.zig") ``` Some cleanup may be necessary, but this should do the bulk of the work. 2018-01-25 01:10:11 -08:00			`fn testDecode(bytes: []const u8) %u32 {`
replace `%return` with `try` See #632 better fits the convention of using keywords for control flow 2018-01-07 13:51:46 -08:00			`const length = try utf8ByteSequenceLength(bytes[0]);`
move utf8 parsing to std source files no longer need to end with a newline 2017-12-26 22:17:33 -08:00			`if (bytes.len < length) return error.UnexpectedEof;`
			`std.debug.assert(bytes.len == length);`
			`return utf8Decode(bytes);`
			`}`