use explicit error sets for utf8Decode functions
and run unicode tests at comptime also
This commit is contained in:
parent
c03b9010db
commit
9543c0a7cc
@ -57,11 +57,12 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 {
|
||||
return length;
|
||||
}
|
||||
|
||||
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
|
||||
/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
|
||||
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
|
||||
/// If you already know the length at comptime, you can call one of
|
||||
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
|
||||
pub fn utf8Decode(bytes: []const u8) !u32 {
|
||||
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
|
||||
return switch (bytes.len) {
|
||||
1 => u32(bytes[0]),
|
||||
2 => utf8Decode2(bytes),
|
||||
@ -71,7 +72,11 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
|
||||
};
|
||||
}
|
||||
|
||||
pub fn utf8Decode2(bytes: []const u8) !u32 {
|
||||
const Utf8Decode2Error = error{
|
||||
Utf8ExpectedContinuation,
|
||||
Utf8OverlongEncoding,
|
||||
};
|
||||
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
|
||||
debug.assert(bytes.len == 2);
|
||||
debug.assert(bytes[0] & 0b11100000 == 0b11000000);
|
||||
var value: u32 = bytes[0] & 0b00011111;
|
||||
@ -85,7 +90,12 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
|
||||
return value;
|
||||
}
|
||||
|
||||
pub fn utf8Decode3(bytes: []const u8) !u32 {
|
||||
const Utf8Decode3Error = error{
|
||||
Utf8ExpectedContinuation,
|
||||
Utf8OverlongEncoding,
|
||||
Utf8EncodesSurrogateHalf,
|
||||
};
|
||||
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
|
||||
debug.assert(bytes.len == 3);
|
||||
debug.assert(bytes[0] & 0b11110000 == 0b11100000);
|
||||
var value: u32 = bytes[0] & 0b00001111;
|
||||
@ -104,7 +114,12 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
|
||||
return value;
|
||||
}
|
||||
|
||||
pub fn utf8Decode4(bytes: []const u8) !u32 {
|
||||
const Utf8Decode4Error = error{
|
||||
Utf8ExpectedContinuation,
|
||||
Utf8OverlongEncoding,
|
||||
Utf8CodepointTooLarge,
|
||||
};
|
||||
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
|
||||
debug.assert(bytes.len == 4);
|
||||
debug.assert(bytes[0] & 0b11111000 == 0b11110000);
|
||||
var value: u32 = bytes[0] & 0b00000111;
|
||||
@ -206,19 +221,21 @@ const Utf8Iterator = struct {
|
||||
pub fn nextCodepoint(it: &Utf8Iterator) ?u32 {
|
||||
const slice = it.nextCodepointSlice() ?? return null;
|
||||
|
||||
const r = switch (slice.len) {
|
||||
1 => u32(slice[0]),
|
||||
2 => utf8Decode2(slice),
|
||||
3 => utf8Decode3(slice),
|
||||
4 => utf8Decode4(slice),
|
||||
switch (slice.len) {
|
||||
1 => return u32(slice[0]),
|
||||
2 => return utf8Decode2(slice) catch unreachable,
|
||||
3 => return utf8Decode3(slice) catch unreachable,
|
||||
4 => return utf8Decode4(slice) catch unreachable,
|
||||
else => unreachable,
|
||||
};
|
||||
|
||||
return r catch unreachable;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
test "utf8 encode" {
|
||||
comptime testUtf8Encode() catch unreachable;
|
||||
try testUtf8Encode();
|
||||
}
|
||||
fn testUtf8Encode() !void {
|
||||
// A few taken from wikipedia a few taken elsewhere
|
||||
var array: [4]u8 = undefined;
|
||||
debug.assert((try utf8Encode(try utf8Decode("€"), array[0..])) == 3);
|
||||
@ -241,6 +258,10 @@ test "utf8 encode" {
|
||||
}
|
||||
|
||||
test "utf8 encode error" {
|
||||
comptime testUtf8EncodeError();
|
||||
testUtf8EncodeError();
|
||||
}
|
||||
fn testUtf8EncodeError() void {
|
||||
var array: [4]u8 = undefined;
|
||||
testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
|
||||
testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
|
||||
@ -257,6 +278,10 @@ fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
|
||||
}
|
||||
|
||||
test "utf8 iterator on ascii" {
|
||||
comptime testUtf8IteratorOnAscii();
|
||||
testUtf8IteratorOnAscii();
|
||||
}
|
||||
fn testUtf8IteratorOnAscii() void {
|
||||
const s = Utf8View.initComptime("abc");
|
||||
|
||||
var it1 = s.iterator();
|
||||
@ -273,6 +298,10 @@ test "utf8 iterator on ascii" {
|
||||
}
|
||||
|
||||
test "utf8 view bad" {
|
||||
comptime testUtf8ViewBad();
|
||||
testUtf8ViewBad();
|
||||
}
|
||||
fn testUtf8ViewBad() void {
|
||||
// Compile-time error.
|
||||
// const s3 = Utf8View.initComptime("\xfe\xf2");
|
||||
|
||||
@ -281,6 +310,10 @@ test "utf8 view bad" {
|
||||
}
|
||||
|
||||
test "utf8 view ok" {
|
||||
comptime testUtf8ViewOk();
|
||||
testUtf8ViewOk();
|
||||
}
|
||||
fn testUtf8ViewOk() void {
|
||||
const s = Utf8View.initComptime("東京市");
|
||||
|
||||
var it1 = s.iterator();
|
||||
@ -297,6 +330,10 @@ test "utf8 view ok" {
|
||||
}
|
||||
|
||||
test "bad utf8 slice" {
|
||||
comptime testBadUtf8Slice();
|
||||
testBadUtf8Slice();
|
||||
}
|
||||
fn testBadUtf8Slice() void {
|
||||
debug.assert(utf8ValidateSlice("abc"));
|
||||
debug.assert(!utf8ValidateSlice("abc\xc0"));
|
||||
debug.assert(!utf8ValidateSlice("abc\xc0abc"));
|
||||
@ -304,6 +341,10 @@ test "bad utf8 slice" {
|
||||
}
|
||||
|
||||
test "valid utf8" {
|
||||
comptime testValidUtf8();
|
||||
testValidUtf8();
|
||||
}
|
||||
fn testValidUtf8() void {
|
||||
testValid("\x00", 0x0);
|
||||
testValid("\x20", 0x20);
|
||||
testValid("\x7f", 0x7f);
|
||||
@ -319,6 +360,10 @@ test "valid utf8" {
|
||||
}
|
||||
|
||||
test "invalid utf8 continuation bytes" {
|
||||
comptime testInvalidUtf8ContinuationBytes();
|
||||
testInvalidUtf8ContinuationBytes();
|
||||
}
|
||||
fn testInvalidUtf8ContinuationBytes() void {
|
||||
// unexpected continuation
|
||||
testError("\x80", error.Utf8InvalidStartByte);
|
||||
testError("\xbf", error.Utf8InvalidStartByte);
|
||||
@ -347,6 +392,10 @@ test "invalid utf8 continuation bytes" {
|
||||
}
|
||||
|
||||
test "overlong utf8 codepoint" {
|
||||
comptime testOverlongUtf8Codepoint();
|
||||
testOverlongUtf8Codepoint();
|
||||
}
|
||||
fn testOverlongUtf8Codepoint() void {
|
||||
testError("\xc0\x80", error.Utf8OverlongEncoding);
|
||||
testError("\xc1\xbf", error.Utf8OverlongEncoding);
|
||||
testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
|
||||
@ -356,6 +405,10 @@ test "overlong utf8 codepoint" {
|
||||
}
|
||||
|
||||
test "misc invalid utf8" {
|
||||
comptime testMiscInvalidUtf8();
|
||||
testMiscInvalidUtf8();
|
||||
}
|
||||
fn testMiscInvalidUtf8() void {
|
||||
// codepoint out of bounds
|
||||
testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
|
||||
testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
|
||||
|
Loading…
x
Reference in New Issue
Block a user