170 lines
6.1 KiB
Zig
170 lines
6.1 KiB
Zig
|
const std = @import("./index.zig");
|
||
|
|
||
|
/// Returned by utf8ByteSequenceLength when a byte does not have the form of a UTF-8 start byte.
error Utf8InvalidStartByte;
|
||
|
|
||
|
/// Classifies the first byte of a UTF-8 encoded codepoint.
/// Returns the total number of bytes (1-4) in the sequence that `first_byte` starts.
/// Returns error.Utf8InvalidStartByte when `first_byte` is a continuation byte
/// (10xxxxxx) or has five or more leading 1 bits (11111xxx).
pub fn utf8ByteSequenceLength(first_byte: u8) -> %u3 {
    // The start-byte bit patterns occupy consecutive ascending ranges,
    // so a single upper bound per class is enough.

    // 0xxxxxxx (0x00-0x7f): single byte.
    if (first_byte < 0x80) return u3(1);
    // 10xxxxxx (0x80-0xbf): continuation byte, never a start byte.
    if (first_byte < 0xc0) return error.Utf8InvalidStartByte;
    // 110xxxxx (0xc0-0xdf): two bytes.
    if (first_byte < 0xe0) return u3(2);
    // 1110xxxx (0xe0-0xef): three bytes.
    if (first_byte < 0xf0) return u3(3);
    // 11110xxx (0xf0-0xf7): four bytes.
    if (first_byte < 0xf8) return u3(4);
    // 11111xxx (0xf8-0xff): too many leading 1 bits.
    return error.Utf8InvalidStartByte;
}
|
||
|
|
||
|
/// Returned by utf8Decode2/3/4 when the decoded value is small enough to fit in a shorter sequence.
error Utf8OverlongEncoding;
|
||
|
/// Returned by utf8Decode2/3/4 when a byte after the first does not have the form 10xxxxxx.
error Utf8ExpectedContinuation;
|
||
|
/// Returned by utf8Decode3 when the decoded value lies in 0xd800-0xdfff.
error Utf8EncodesSurrogateHalf;
|
||
|
/// Returned by utf8Decode4 when the decoded value is greater than 0x10FFFF.
error Utf8CodepointTooLarge;
|
||
|
|
||
|
/// Decodes the single UTF-8 codepoint held in `bytes` and returns its value.
/// The caller must guarantee that bytes.len equals %%utf8ByteSequenceLength(bytes[0]);
/// any length outside 1-4 reaches `unreachable`.
/// A 1-byte input is returned as-is; longer inputs are forwarded to
/// utf8Decode2, utf8Decode3 or utf8Decode4, which report malformed sequences as errors.
/// When the length is known at comptime, those functions can be called directly.
pub fn utf8Decode(bytes: []const u8) -> %u32 {
    if (bytes.len == 1) return u32(bytes[0]);
    if (bytes.len == 2) return utf8Decode2(bytes);
    if (bytes.len == 3) return utf8Decode3(bytes);
    if (bytes.len == 4) return utf8Decode4(bytes);
    // The documented precondition rules out every other length.
    unreachable;
}
|
||
|
/// Decodes a 2-byte UTF-8 sequence (110xxxxx 10xxxxxx).
/// Asserts that bytes.len == 2 and that bytes[0] is a 2-byte start byte.
/// Returns error.Utf8ExpectedContinuation if bytes[1] is not of the form 10xxxxxx,
/// and error.Utf8OverlongEncoding if the decoded value is below 0x80.
pub fn utf8Decode2(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 2);
    const lead = bytes[0];
    std.debug.assert(lead & 0xe0 == 0xc0);

    const cont = bytes[1];
    if (cont & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;

    // 5 payload bits from the lead byte, then 6 from the continuation byte.
    const codepoint: u32 = (u32(lead & 0x1f) << 6) | u32(cont & 0x3f);

    // Values below 0x80 fit in a single byte.
    if (codepoint < 0x80) return error.Utf8OverlongEncoding;

    return codepoint;
}
|
||
|
/// Decodes a 3-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx).
/// Asserts that bytes.len == 3 and that bytes[0] is a 3-byte start byte.
/// Returns error.Utf8ExpectedContinuation if bytes[1] or bytes[2] is not of the form 10xxxxxx,
/// error.Utf8OverlongEncoding if the decoded value is below 0x800,
/// and error.Utf8EncodesSurrogateHalf if it lies in 0xd800-0xdfff.
pub fn utf8Decode3(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 3);
    const lead = bytes[0];
    std.debug.assert(lead & 0xf0 == 0xe0);

    // Continuation bytes are validated in order, so the first bad one decides the error.
    const cont1 = bytes[1];
    if (cont1 & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;
    const cont2 = bytes[2];
    if (cont2 & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;

    // 4 payload bits from the lead byte, then 6 from each continuation byte.
    const codepoint: u32 = (u32(lead & 0x0f) << 12) |
        (u32(cont1 & 0x3f) << 6) |
        u32(cont2 & 0x3f);

    // Values below 0x800 fit in two bytes or fewer.
    if (codepoint < 0x800) return error.Utf8OverlongEncoding;
    if (codepoint >= 0xd800 and codepoint <= 0xdfff) return error.Utf8EncodesSurrogateHalf;

    return codepoint;
}
|
||
|
/// Decodes a 4-byte UTF-8 sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
/// Asserts that bytes.len == 4 and that bytes[0] is a 4-byte start byte.
/// Returns error.Utf8ExpectedContinuation if any of bytes[1..3] is not of the form 10xxxxxx,
/// error.Utf8OverlongEncoding if the decoded value is below 0x10000,
/// and error.Utf8CodepointTooLarge if it is above 0x10FFFF.
pub fn utf8Decode4(bytes: []const u8) -> %u32 {
    std.debug.assert(bytes.len == 4);
    const lead = bytes[0];
    std.debug.assert(lead & 0xf8 == 0xf0);

    // Continuation bytes are validated in order, so the first bad one decides the error.
    const cont1 = bytes[1];
    if (cont1 & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;
    const cont2 = bytes[2];
    if (cont2 & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;
    const cont3 = bytes[3];
    if (cont3 & 0xc0 != 0x80) return error.Utf8ExpectedContinuation;

    // 3 payload bits from the lead byte, then 6 from each continuation byte.
    const codepoint: u32 = (u32(lead & 0x07) << 18) |
        (u32(cont1 & 0x3f) << 12) |
        (u32(cont2 & 0x3f) << 6) |
        u32(cont3 & 0x3f);

    // Values below 0x10000 fit in three bytes or fewer.
    if (codepoint < 0x10000) return error.Utf8OverlongEncoding;
    if (codepoint > 0x10FFFF) return error.Utf8CodepointTooLarge;

    return codepoint;
}
|
||
|
|
||
|
/// Returned by the test helper testDecode when the input is shorter than its start byte requires.
error UnexpectedEof;
|
||
|
test "valid utf8" {
    // 1-byte sequences, including the lowest (0x00) and highest (0x7f) values.
    testValid("\x00", 0x0000);
    testValid("\x20", 0x0020);
    testValid("\x7f", 0x007f);

    // 2-byte sequences: smallest non-overlong value and largest 11-bit value.
    testValid("\xc2\x80", 0x0080);
    testValid("\xdf\xbf", 0x07ff);

    // 3-byte sequences: smallest non-overlong value, a mid-range value, largest 16-bit value.
    testValid("\xe0\xa0\x80", 0x0800);
    testValid("\xe1\x80\x80", 0x1000);
    testValid("\xef\xbf\xbf", 0xffff);

    // 4-byte sequences: smallest non-overlong value up to the 0x10ffff ceiling.
    testValid("\xf0\x90\x80\x80", 0x010000);
    testValid("\xf1\x80\x80\x80", 0x040000);
    testValid("\xf3\xbf\xbf\xbf", 0x0fffff);
    testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
}
|
||
|
|
||
|
test "invalid utf8 continuation bytes" {
    // A continuation byte (10xxxxxx) cannot start a sequence.
    testError("\x80", error.Utf8InvalidStartByte);
    testError("\xbf", error.Utf8InvalidStartByte);

    // Five or more leading 1 bits is not a start byte either.
    testError("\xf8", error.Utf8InvalidStartByte);
    testError("\xff", error.Utf8InvalidStartByte);

    // 2-byte sequences: truncated input, then a second byte that is not 10xxxxxx.
    testError("\xc2", error.UnexpectedEof);
    testError("\xc2\x00", error.Utf8ExpectedContinuation);
    testError("\xc2\xc0", error.Utf8ExpectedContinuation);

    // 3-byte sequences: the length check runs before any continuation check,
    // so truncated input reports UnexpectedEof whatever the trailing bytes are.
    testError("\xe0", error.UnexpectedEof);
    testError("\xe0\x00", error.UnexpectedEof);
    testError("\xe0\xc0", error.UnexpectedEof);
    testError("\xe0\xa0", error.UnexpectedEof);
    // Full length, but the last byte is not 10xxxxxx.
    testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
    testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);

    // 4-byte sequences: same pattern as above.
    testError("\xf0", error.UnexpectedEof);
    testError("\xf0\x00", error.UnexpectedEof);
    testError("\xf0\xc0", error.UnexpectedEof);
    testError("\xf0\x90\x00", error.UnexpectedEof);
    testError("\xf0\x90\xc0", error.UnexpectedEof);
    // Full length, but the last byte is not 10xxxxxx.
    testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
    testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
}
|
||
|
|
||
|
test "overlong utf8 codepoint" {
    // 2 bytes carrying 0x00 and 0x7f, both of which fit in 1 byte.
    testError("\xc0\x80", error.Utf8OverlongEncoding);
    testError("\xc1\xbf", error.Utf8OverlongEncoding);

    // 3 bytes carrying 0x000 and 0x7ff, both of which fit in 2 bytes or fewer.
    testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
    testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);

    // 4 bytes carrying 0x0000 and 0xffff, both of which fit in 3 bytes or fewer.
    testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
    testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
}
|
||
|
|
||
|
test "misc invalid utf8" {
    // Above the 0x10ffff ceiling: 0x110000 (first value past it) and
    // 0x1fffff (largest value a 4-byte sequence can carry).
    testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
    testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);

    // Surrogate range 0xd800-0xdfff is rejected; its neighbours
    // 0xd7ff and 0xe000 on either side are accepted.
    testValid("\xed\x9f\xbf", 0xd7ff);
    testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
    testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
    testValid("\xee\x80\x80", 0xe000);
}
|
||
|
|
||
|
/// Test helper: asserts that decoding `bytes` fails with exactly `expected_err`.
/// A successful decode reaches `unreachable`.
fn testError(bytes: []const u8, expected_err: error) {
    if (testDecode(bytes)) |_| {
        // The input decoded cleanly although a failure was expected.
        unreachable;
    } else |actual_err| {
        std.debug.assert(actual_err == expected_err);
    }
}
|
||
|
|
||
|
/// Test helper: asserts that `bytes` decodes without error to `expected_codepoint`.
fn testValid(bytes: []const u8, expected_codepoint: u32) {
    // %% panics if decoding returned an error.
    const decoded = %%testDecode(bytes);
    std.debug.assert(decoded == expected_codepoint);
}
|
||
|
|
||
|
/// Test helper: decodes `bytes` as exactly one UTF-8 codepoint.
/// Returns error.UnexpectedEof when `bytes` is empty or shorter than the
/// sequence length announced by its first byte, and passes through any
/// error from utf8ByteSequenceLength or utf8Decode.
/// Asserts that `bytes` holds no extra bytes after the sequence.
fn testDecode(bytes: []const u8) -> %u32 {
    // Guard the bytes[0] access below: an empty slice has no start byte to inspect.
    if (bytes.len == 0) return error.UnexpectedEof;

    const length = %return utf8ByteSequenceLength(bytes[0]);
    if (bytes.len < length) return error.UnexpectedEof;
    // utf8Decode requires the slice length to match the sequence length exactly.
    std.debug.assert(bytes.len == length);
    return utf8Decode(bytes);
}
|