2017-12-26 22:17:33 -08:00
|
|
|
const std = @import("./index.zig");
|
2018-07-18 07:07:22 -07:00
|
|
|
const builtin = @import("builtin");
|
2018-02-23 00:20:15 -08:00
|
|
|
const debug = std.debug;
|
2018-07-18 07:07:22 -07:00
|
|
|
const assert = std.debug.assert;
|
|
|
|
const mem = std.mem;
|
2017-12-26 22:17:33 -08:00
|
|
|
|
2018-04-29 14:28:11 -07:00
|
|
|
/// Computes the number of bytes needed to encode codepoint `c` as UTF-8.
/// Returns 1 to 4, or error.CodepointTooLarge when `c` is 0x110000 or above.
pub fn utf8CodepointSequenceLength(c: u32) !u3 {
    return switch (c) {
        0x00...0x7f => u3(1),
        0x80...0x7ff => u3(2),
        0x800...0xffff => u3(3),
        0x10000...0x10ffff => u3(4),
        else => return error.CodepointTooLarge,
    };
}
|
|
|
|
|
2017-12-26 22:17:33 -08:00
|
|
|
/// Inspects the first byte of a UTF-8 sequence and reports how many bytes
/// (1-4) the whole sequence occupies.
/// Returns error.Utf8InvalidStartByte when the byte is a continuation byte
/// (10xxxxxx) or has five or more leading one bits.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    return switch (first_byte) {
        // 0xxxxxxx
        0b00000000...0b01111111 => u3(1),
        // 110xxxxx
        0b11000000...0b11011111 => u3(2),
        // 1110xxxx
        0b11100000...0b11101111 => u3(3),
        // 11110xxx
        0b11110000...0b11110111 => u3(4),
        else => return error.Utf8InvalidStartByte,
    };
}
|
|
|
|
|
2018-04-29 14:28:11 -07:00
|
|
|
/// Writes the UTF-8 encoding of codepoint `c` to the start of `out`.
/// c: the codepoint to encode.
/// out: destination buffer; asserted to hold at least
///      utf8CodepointSequenceLength(c) bytes.
/// Errors: error.CodepointTooLarge (from utf8CodepointSequenceLength) or
///         error.Utf8CannotEncodeSurrogateHalf for 0xd800...0xdfff.
/// Returns: how many bytes of `out` were written.
pub fn utf8Encode(c: u32, out: []u8) !u3 {
    const length = try utf8CodepointSequenceLength(c);
    debug.assert(out.len >= length);

    // Every byte after the first is 10xxxxxx and carries six payload bits.
    const cont_marker: u32 = 0b10000000;
    const payload_mask: u32 = 0b111111;

    switch (length) {
        // Plain ASCII: the codepoint is the byte.
        1 => {
            out[0] = @intCast(u8, c);
        },
        2 => {
            out[0] = @intCast(u8, 0b11000000 | (c >> 6));
            out[1] = @intCast(u8, cont_marker | (c & payload_mask));
        },
        3 => {
            // Surrogate halves are reserved for UTF-16 and must not be encoded.
            if (c >= 0xd800 and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf;
            out[0] = @intCast(u8, 0b11100000 | (c >> 12));
            out[1] = @intCast(u8, cont_marker | ((c >> 6) & payload_mask));
            out[2] = @intCast(u8, cont_marker | (c & payload_mask));
        },
        4 => {
            out[0] = @intCast(u8, 0b11110000 | (c >> 18));
            out[1] = @intCast(u8, cont_marker | ((c >> 12) & payload_mask));
            out[2] = @intCast(u8, cont_marker | ((c >> 6) & payload_mask));
            out[3] = @intCast(u8, cont_marker | (c & payload_mask));
        },
        else => unreachable,
    }

    return length;
}
|
|
|
|
|
2018-04-29 14:38:41 -07:00
|
|
|
/// Every error the fixed-length decoders can produce.
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;

/// Decodes the single UTF-8 codepoint stored in `bytes`.
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable;
/// any length other than 1 to 4 is treated as unreachable.
/// When the length is known at comptime, utf8Decode2, utf8Decode3 or
/// utf8Decode4 can be called directly instead.
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
    switch (bytes.len) {
        1 => return u32(bytes[0]),
        2 => return utf8Decode2(bytes),
        3 => return utf8Decode3(bytes),
        4 => return utf8Decode4(bytes),
        else => unreachable,
    }
}
|
2018-04-24 21:59:03 -07:00
|
|
|
|
2018-04-29 14:38:41 -07:00
|
|
|
const Utf8Decode2Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};

/// Decodes a two-byte UTF-8 sequence.
/// Asserts that `bytes` has length 2 and starts with a 110xxxxx byte.
/// Errors: Utf8ExpectedContinuation when the second byte is not 10xxxxxx,
///         Utf8OverlongEncoding when the value would have fit in one byte.
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
    debug.assert(bytes.len == 2);
    debug.assert(bytes[0] & 0b11100000 == 0b11000000);

    const lead = bytes[0];
    const tail = bytes[1];
    if (tail & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;

    // Five payload bits from the lead byte, six from the continuation byte.
    const codepoint = (u32(lead & 0b00011111) << 6) | u32(tail & 0b00111111);
    if (codepoint < 0x80) return error.Utf8OverlongEncoding;

    return codepoint;
}
|
2018-04-24 21:59:03 -07:00
|
|
|
|
2018-04-29 14:38:41 -07:00
|
|
|
const Utf8Decode3Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8EncodesSurrogateHalf,
};

/// Decodes a three-byte UTF-8 sequence.
/// Asserts that `bytes` has length 3 and starts with a 1110xxxx byte.
/// Errors: Utf8ExpectedContinuation when either trailing byte is not 10xxxxxx,
///         Utf8OverlongEncoding when the value is below 0x800,
///         Utf8EncodesSurrogateHalf when the value is in 0xd800...0xdfff.
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
    debug.assert(bytes.len == 3);
    debug.assert(bytes[0] & 0b11110000 == 0b11100000);

    const second = bytes[1];
    const third = bytes[2];
    if (second & 0b11000000 != 0b10000000 or third & 0b11000000 != 0b10000000) {
        return error.Utf8ExpectedContinuation;
    }

    // Four payload bits from the lead byte, six from each continuation byte.
    const high = u32(bytes[0] & 0b00001111) << 12;
    const mid = u32(second & 0b00111111) << 6;
    const low = u32(third & 0b00111111);
    const codepoint = high | mid | low;

    if (codepoint < 0x800) return error.Utf8OverlongEncoding;
    if (codepoint >= 0xd800 and codepoint <= 0xdfff) return error.Utf8EncodesSurrogateHalf;

    return codepoint;
}
|
2018-04-24 21:59:03 -07:00
|
|
|
|
2018-04-29 14:38:41 -07:00
|
|
|
const Utf8Decode4Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8CodepointTooLarge,
};

/// Decodes a four-byte UTF-8 sequence.
/// Asserts that `bytes` has length 4 and starts with a 11110xxx byte.
/// Errors: Utf8ExpectedContinuation when a trailing byte is not 10xxxxxx,
///         Utf8OverlongEncoding when the value is below 0x10000,
///         Utf8CodepointTooLarge when the value exceeds 0x10FFFF.
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
    debug.assert(bytes.len == 4);
    debug.assert(bytes[0] & 0b11111000 == 0b11110000);

    // Three payload bits come from the lead byte.
    var codepoint: u32 = bytes[0] & 0b00000111;

    // Each continuation byte is checked in order, then contributes six bits.
    for (bytes[1..]) |continuation| {
        if (continuation & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
        codepoint = (codepoint << 6) | (continuation & 0b00111111);
    }

    if (codepoint < 0x10000) return error.Utf8OverlongEncoding;
    if (codepoint > 0x10FFFF) return error.Utf8CodepointTooLarge;

    return codepoint;
}
|
|
|
|
|
2018-02-23 00:20:15 -08:00
|
|
|
/// Reports whether `s` consists entirely of well-formed UTF-8 sequences.
/// Returns false for an invalid start byte, a sequence that runs past the
/// end of `s`, or any sequence rejected by utf8Decode. An empty slice is valid.
pub fn utf8ValidateSlice(s: []const u8) bool {
    var pos: usize = 0;
    while (pos < s.len) {
        const seq_len = utf8ByteSequenceLength(s[pos]) catch return false;

        // The sequence must fit inside the remaining input.
        const end = pos + seq_len;
        if (end > s.len) return false;

        if (utf8Decode(s[pos..end])) |_| {
            pos = end;
        } else |_| {
            return false;
        }
    }
    return true;
}
|
|
|
|
|
2018-03-06 07:41:07 -08:00
|
|
|
/// Utf8View wraps a UTF-8 encoded byte slice and hands out an iterator over
/// its code points.
///
/// ```
/// var utf8 = (try std.unicode.Utf8View.init("hi there")).iterator();
/// while (utf8.nextCodepointSlice()) |codepoint| {
///     std.debug.warn("got codepoint {}\n", codepoint);
/// }
/// ```
pub const Utf8View = struct {
    bytes: []const u8,

    /// Wraps `s` after checking it with utf8ValidateSlice.
    /// Returns error.InvalidUtf8 when the check fails.
    pub fn init(s: []const u8) !Utf8View {
        if (utf8ValidateSlice(s)) {
            return initUnchecked(s);
        }
        return error.InvalidUtf8;
    }

    /// Wraps `s` without validating it.
    pub fn initUnchecked(s: []const u8) Utf8View {
        return Utf8View{
            .bytes = s,
        };
    }

    /// Validates `s` at compile time; invalid input is a compile error.
    /// TODO: https://github.com/ziglang/zig/issues/425
    pub fn initComptime(comptime s: []const u8) Utf8View {
        if (comptime init(s)) |view| {
            return view;
        } else |err| switch (err) {
            error.InvalidUtf8 => {
                @compileError("invalid utf8");
                unreachable;
            },
        }
    }

    /// Returns an iterator positioned at the first byte of the view.
    pub fn iterator(s: Utf8View) Utf8Iterator {
        return Utf8Iterator{
            .i = 0,
            .bytes = s.bytes,
        };
    }
};
|
|
|
|
|
|
|
|
/// Walks `bytes` one code point at a time, starting at byte offset `i`.
/// Invalid input hits `unreachable`; Utf8View.init validates before handing one out.
const Utf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next code point and advances past them,
    /// or null once the end of the input has been reached.
    pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 {
        const start = it.i;
        if (start >= it.bytes.len) return null;

        const cp_len = utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        const end = start + cp_len;
        it.i = end;
        return it.bytes[start..end];
    }

    /// Returns the next decoded code point and advances past it,
    /// or null once the end of the input has been reached.
    pub fn nextCodepoint(it: *Utf8Iterator) ?u32 {
        if (it.nextCodepointSlice()) |slice| {
            return switch (slice.len) {
                1 => u32(slice[0]),
                2 => utf8Decode2(slice) catch unreachable,
                3 => utf8Decode3(slice) catch unreachable,
                4 => utf8Decode4(slice) catch unreachable,
                else => unreachable,
            };
        }
        return null;
    }
};
|
|
|
|
|
2018-08-20 21:46:42 -07:00
|
|
|
/// Iterates the code points of a UTF-16LE encoded sequence, reading the
/// code units as little-endian byte pairs.
pub const Utf16LeIterator = struct {
    bytes: []const u8,
    i: usize,

    /// Creates an iterator over the bytes of `s`, starting at the first code unit.
    pub fn init(s: []const u16) Utf16LeIterator {
        return Utf16LeIterator{
            .i = 0,
            .bytes = @sliceToBytes(s),
        };
    }

    /// Returns the next code point, or null at the end of the input.
    /// Errors:
    ///  - UnexpectedSecondSurrogateHalf: a low surrogate with no preceding
    ///    high surrogate (the position is not advanced).
    ///  - DanglingSurrogateHalf: a high surrogate at the very end of the input.
    ///  - ExpectedSecondSurrogateHalf: a high surrogate not followed by a low
    ///    surrogate (the position is left on the offending unit).
    pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;

        const first: u32 = mem.readIntLE(u16, it.bytes[it.i .. it.i + 2]);
        // The top six bits tell surrogates apart from ordinary code units.
        const first_kind = first & 0xfc00;

        if (first_kind == 0xdc00) return error.UnexpectedSecondSurrogateHalf;

        it.i += 2;
        if (first_kind != 0xd800) return first;

        // surrogate pair
        if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
        const second: u32 = mem.readIntLE(u16, it.bytes[it.i .. it.i + 2]);
        if (second & 0xfc00 != 0xdc00) return error.ExpectedSecondSurrogateHalf;
        it.i += 2;

        const high_bits = (first & 0x03ff) << 10;
        const low_bits = second & 0x03ff;
        return 0x10000 + (high_bits | low_bits);
    }
};
|
|
|
|
|
2018-04-24 21:59:03 -07:00
|
|
|
test "utf8 encode" {
    // Exercise the encoder both at compile time and at run time.
    comptime testUtf8Encode() catch unreachable;
    try testUtf8Encode();
}
fn testUtf8Encode() !void {
    // Samples taken from Wikipedia and elsewhere; each is decoded and then
    // re-encoded, and the resulting bytes are checked one by one.
    var array: [4]u8 = undefined;

    // three bytes
    const euro_len = try utf8Encode(try utf8Decode("€"), array[0..]);
    debug.assert(euro_len == 3);
    debug.assert(array[0] == 0xe2);
    debug.assert(array[1] == 0x82);
    debug.assert(array[2] == 0xac);

    // one byte
    const dollar_len = try utf8Encode(try utf8Decode("$"), array[0..]);
    debug.assert(dollar_len == 1);
    debug.assert(array[0] == 0x24);

    // two bytes
    const cent_len = try utf8Encode(try utf8Decode("¢"), array[0..]);
    debug.assert(cent_len == 2);
    debug.assert(array[0] == 0xc2);
    debug.assert(array[1] == 0xa2);

    // four bytes
    const hwair_len = try utf8Encode(try utf8Decode("𐍈"), array[0..]);
    debug.assert(hwair_len == 4);
    debug.assert(array[0] == 0xf0);
    debug.assert(array[1] == 0x90);
    debug.assert(array[2] == 0x8d);
    debug.assert(array[3] == 0x88);
}
|
|
|
|
|
|
|
|
test "utf8 encode error" {
    // Exercise the error paths both at compile time and at run time.
    comptime testUtf8EncodeError();
    testUtf8EncodeError();
}
fn testUtf8EncodeError() void {
    var array: [4]u8 = undefined;
    const out = array[0..];

    // Both ends of the surrogate range must be rejected.
    const surrogate_halves = []u32{ 0xd800, 0xdfff };
    for (surrogate_halves) |codepoint| {
        testErrorEncode(codepoint, out, error.Utf8CannotEncodeSurrogateHalf);
    }

    // Anything past 0x10ffff is not a codepoint.
    const too_large = []u32{ 0x110000, 0xffffffff };
    for (too_large) |codepoint| {
        testErrorEncode(codepoint, out, error.CodepointTooLarge);
    }
}
|
|
|
|
|
|
|
|
/// Asserts that encoding `codePoint` into `array` fails with exactly `expectedErr`.
fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: error) void {
    if (utf8Encode(codePoint, array)) |_| unreachable else |err| debug.assert(err == expectedErr);
}
|
|
|
|
|
2018-02-23 00:20:15 -08:00
|
|
|
test "utf8 iterator on ascii" {
    comptime testUtf8IteratorOnAscii();
    testUtf8IteratorOnAscii();
}
fn testUtf8IteratorOnAscii() void {
    const text = "abc";
    const s = Utf8View.initComptime(text);

    // Slice iteration yields one single-byte slice per character.
    var slices = s.iterator();
    for (text) |_, idx| {
        debug.assert(std.mem.eql(u8, text[idx .. idx + 1], slices.nextCodepointSlice().?));
    }
    debug.assert(slices.nextCodepointSlice() == null);

    // Codepoint iteration yields the byte values themselves.
    var codepoints = s.iterator();
    for (text) |byte| {
        debug.assert(codepoints.nextCodepoint().? == byte);
    }
    debug.assert(codepoints.nextCodepoint() == null);
}
|
|
|
|
|
|
|
|
test "utf8 view bad" {
    comptime testUtf8ViewBad();
    testUtf8ViewBad();
}
fn testUtf8ViewBad() void {
    // Compile-time error.
    // const s3 = Utf8View.initComptime("\xfe\xf2");

    // 0xad is a continuation byte with no start byte before it.
    if (Utf8View.init("hel\xadlo")) |_| unreachable else |err| debug.assert(err == error.InvalidUtf8);
}
|
|
|
|
|
|
|
|
test "utf8 view ok" {
    comptime testUtf8ViewOk();
    testUtf8ViewOk();
}
fn testUtf8ViewOk() void {
    const s = Utf8View.initComptime("東京市");

    // Each character is a three-byte sequence returned as its own slice.
    const expected_slices = [][]const u8{ "東", "京", "市" };
    var slices = s.iterator();
    for (expected_slices) |expected| {
        debug.assert(std.mem.eql(u8, expected, slices.nextCodepointSlice().?));
    }
    debug.assert(slices.nextCodepointSlice() == null);

    // The same characters as decoded code points.
    const expected_codepoints = []u32{ 0x6771, 0x4eac, 0x5e02 };
    var codepoints = s.iterator();
    for (expected_codepoints) |expected| {
        debug.assert(codepoints.nextCodepoint().? == expected);
    }
    debug.assert(codepoints.nextCodepoint() == null);
}
|
|
|
|
|
|
|
|
test "bad utf8 slice" {
    comptime testBadUtf8Slice();
    testBadUtf8Slice();
}
fn testBadUtf8Slice() void {
    const well_formed = [][]const u8{ "abc", "abc\xdf\xbf" };
    for (well_formed) |s| {
        debug.assert(utf8ValidateSlice(s));
    }

    // A two-byte start byte that is truncated or followed by ASCII.
    const malformed = [][]const u8{ "abc\xc0", "abc\xc0abc" };
    for (malformed) |s| {
        debug.assert(!utf8ValidateSlice(s));
    }
}
|
|
|
|
|
2017-12-26 22:17:33 -08:00
|
|
|
test "valid utf8" {
    comptime testValidUtf8();
    testValidUtf8();
}
fn testValidUtf8() void {
    // Boundary values of every sequence length; inputs[n] decodes to codepoints[n].
    const inputs = [][]const u8{
        "\x00",
        "\x20",
        "\x7f",
        "\xc2\x80",
        "\xdf\xbf",
        "\xe0\xa0\x80",
        "\xe1\x80\x80",
        "\xef\xbf\xbf",
        "\xf0\x90\x80\x80",
        "\xf1\x80\x80\x80",
        "\xf3\xbf\xbf\xbf",
        "\xf4\x8f\xbf\xbf",
    };
    const codepoints = []u32{
        0x0,
        0x20,
        0x7f,
        0x80,
        0x7ff,
        0x800,
        0x1000,
        0xffff,
        0x10000,
        0x40000,
        0xfffff,
        0x10ffff,
    };
    for (inputs) |bytes, n| {
        testValid(bytes, codepoints[n]);
    }
}
|
|
|
|
|
|
|
|
test "invalid utf8 continuation bytes" {
    comptime testInvalidUtf8ContinuationBytes();
    testInvalidUtf8ContinuationBytes();
}
fn testInvalidUtf8ContinuationBytes() void {
    // Unexpected continuation bytes, and bytes with too many leading 1's.
    const bad_start = [][]const u8{ "\x80", "\xbf", "\xf8", "\xff" };
    for (bad_start) |bytes| {
        testError(bytes, error.Utf8InvalidStartByte);
    }

    // Inputs shorter than the start byte of a 2, 3 or 4 byte sequence promises.
    const truncated = [][]const u8{
        "\xc2",
        "\xe0",
        "\xe0\x00",
        "\xe0\xc0",
        "\xe0\xa0",
        "\xf0",
        "\xf0\x00",
        "\xf0\xc0",
        "\xf0\x90\x00",
        "\xf0\x90\xc0",
    };
    for (truncated) |bytes| {
        testError(bytes, error.UnexpectedEof);
    }

    // Full-length 2, 3 and 4 byte sequences whose last byte is not a continuation.
    const missing_continuation = [][]const u8{
        "\xc2\x00",
        "\xc2\xc0",
        "\xe0\xa0\x00",
        "\xe0\xa0\xc0",
        "\xf0\x90\x80\x00",
        "\xf0\x90\x80\xc0",
    };
    for (missing_continuation) |bytes| {
        testError(bytes, error.Utf8ExpectedContinuation);
    }
}
|
|
|
|
|
|
|
|
test "overlong utf8 codepoint" {
    comptime testOverlongUtf8Codepoint();
    testOverlongUtf8Codepoint();
}
fn testOverlongUtf8Codepoint() void {
    // Smallest and largest overlong value for each multi-byte length.
    const overlong = [][]const u8{
        "\xc0\x80",
        "\xc1\xbf",
        "\xe0\x80\x80",
        "\xe0\x9f\xbf",
        "\xf0\x80\x80\x80",
        "\xf0\x8f\xbf\xbf",
    };
    for (overlong) |bytes| {
        testError(bytes, error.Utf8OverlongEncoding);
    }
}
|
|
|
|
|
|
|
|
test "misc invalid utf8" {
    comptime testMiscInvalidUtf8();
    testMiscInvalidUtf8();
}
fn testMiscInvalidUtf8() void {
    // codepoint out of bounds
    const out_of_bounds = [][]const u8{ "\xf4\x90\x80\x80", "\xf7\xbf\xbf\xbf" };
    for (out_of_bounds) |bytes| {
        testError(bytes, error.Utf8CodepointTooLarge);
    }

    // surrogate halves: both ends of the range are rejected...
    const surrogate_halves = [][]const u8{ "\xed\xa0\x80", "\xed\xbf\xbf" };
    for (surrogate_halves) |bytes| {
        testError(bytes, error.Utf8EncodesSurrogateHalf);
    }

    // ...while the values immediately outside it are accepted.
    testValid("\xed\x9f\xbf", 0xd7ff);
    testValid("\xee\x80\x80", 0xe000);
}
|
|
|
|
|
2018-01-25 01:10:11 -08:00
|
|
|
/// Asserts that decoding `bytes` fails with exactly `expected_err`.
fn testError(bytes: []const u8, expected_err: error) void {
    if (testDecode(bytes)) |_| unreachable else |err| debug.assert(err == expected_err);
}

/// Asserts that decoding `bytes` succeeds and yields `expected_codepoint`.
fn testValid(bytes: []const u8, expected_codepoint: u32) void {
    const decoded = testDecode(bytes) catch unreachable;
    debug.assert(decoded == expected_codepoint);
}

/// Decodes the single codepoint in `bytes`, reporting error.UnexpectedEof
/// when the input is shorter than its start byte announces. Asserts that the
/// input is not longer than that.
fn testDecode(bytes: []const u8) !u32 {
    const announced_len = try utf8ByteSequenceLength(bytes[0]);
    if (announced_len > bytes.len) return error.UnexpectedEof;
    debug.assert(announced_len == bytes.len);
    return utf8Decode(bytes);
}
|
2018-07-18 07:07:22 -07:00
|
|
|
|
2018-08-20 21:46:42 -07:00
|
|
|
/// Converts UTF-16LE code units into a newly allocated UTF-8 byte slice.
/// Caller must free returned memory.
/// Errors: allocation failure, or any error reported by
/// Utf16LeIterator.nextCodepoint for malformed surrogate sequences.
/// The partially built buffer is released when an error is returned.
pub fn utf16leToUtf8Alloc(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    // Without this, every `try` below would leak the buffer on failure.
    errdefer result.deinit();
    // optimistically guess that it will all be ascii.
    try result.ensureCapacity(utf16le.len);
    var out_index: usize = 0;
    var it = Utf16LeIterator.init(utf16le);
    while (try it.nextCodepoint()) |codepoint| {
        // The iterator only yields values up to 0x10ffff and never a lone
        // surrogate half, so neither call below can fail.
        const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
        try result.resize(result.len + utf8_len);
        assert((utf8Encode(codepoint, result.items[out_index..]) catch unreachable) == utf8_len);
        out_index += utf8_len;
    }

    return result.toOwnedSlice();
}
|
|
|
|
|
2018-08-21 13:07:28 -07:00
|
|
|
/// Converts UTF-16LE code units to UTF-8, writing into the caller's buffer.
/// Asserts that the output buffer is big enough.
/// Returns end index, i.e. the number of bytes written to `utf8`.
pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {
    var it = Utf16LeIterator.init(utf16le);
    var written: usize = 0;
    while (true) {
        const codepoint = (try it.nextCodepoint()) orelse break;
        const encoded_len = try utf8Encode(codepoint, utf8[written..]);
        written += encoded_len;
    }
    return written;
}
|
|
|
|
|
2018-07-18 07:07:22 -07:00
|
|
|
test "utf16leToUtf8" {
    // Two code units of scratch space, rewritten through a byte view so the
    // stored representation is little-endian.
    var utf16le: [2]u16 = undefined;
    const utf16le_as_bytes = @sliceToBytes(utf16le[0..]);

    {
        // plain ASCII
        mem.writeInt(utf16le_as_bytes[0..], u16('A'), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little);
        const expected = "Aa";
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, expected));
    }

    {
        // first two-byte value and last three-byte value
        mem.writeInt(utf16le_as_bytes[0..], u16(0x80), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xffff), builtin.Endian.Little);
        const expected = "\xc2\x80" ++ "\xef\xbf\xbf";
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, expected));
    }

    {
        // the values just outside the surrogate half range
        mem.writeInt(utf16le_as_bytes[0..], u16(0xd7ff), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xe000), builtin.Endian.Little);
        const expected = "\xed\x9f\xbf" ++ "\xee\x80\x80";
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, expected));
    }

    {
        // smallest surrogate pair
        mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little);
        const expected = "\xf0\x90\x80\x80";
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, expected));
    }

    {
        // largest surrogate pair
        mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xdfff), builtin.Endian.Little);
        const expected = "\xf4\x8f\xbf\xbf";
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, expected));
    }

    {
        // largest high half paired with the smallest low half
        mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little);
        mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little);
        const expected = "\xf4\x8f\xb0\x80";
        const utf8 = try utf16leToUtf8Alloc(std.debug.global_allocator, utf16le);
        assert(mem.eql(u8, utf8, expected));
    }
}
|
2018-08-09 13:48:44 -07:00
|
|
|
|
|
|
|
/// Converts UTF-8 to a newly allocated sequence of UTF-16 code units followed
/// by a terminating 0 code unit. Code points above 0xffff are emitted as a
/// surrogate pair (high half first).
/// Caller must free returned memory.
/// Errors: error.InvalidUtf8 when `utf8` is not well-formed, or allocation
/// failure. The partially built buffer is released when an error is returned.
/// TODO type for null terminated pointer
/// NOTE(review): code units are appended as native u16 values; on a
/// big-endian target the stored bytes would not be little-endian - confirm.
pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16 {
    var result = std.ArrayList(u16).init(allocator);
    // Without this, every `try` below would leak the buffer on failure.
    errdefer result.deinit();
    // optimistically guess that it will not require surrogate pairs
    try result.ensureCapacity(utf8.len + 1);

    const view = try Utf8View.init(utf8);
    var it = view.iterator();
    while (it.nextCodepoint()) |codepoint| {
        if (codepoint < 0x10000) {
            try result.append(@intCast(u16, codepoint));
        } else {
            // Split the 20 bits above 0x10000 into two 10-bit halves.
            const offset = codepoint - 0x10000;
            try result.append(@intCast(u16, 0xd800 + (offset >> 10)));
            try result.append(@intCast(u16, 0xdc00 + (offset & 0x03ff)));
        }
    }

    try result.append(0);
    return result.toOwnedSlice();
}
|
2018-08-21 13:07:28 -07:00
|
|
|
|
|
|
|
/// Converts UTF-8 to UTF-16LE code units written into `utf16le`.
/// Code points above 0xffff are written as a surrogate pair (high half
/// first) and therefore need room for two code units.
/// Returns index of next character. If exact fit, returned index equals output slice length.
/// If ran out of room, returned index equals output slice length + 1.
/// Errors: error.InvalidUtf8 when `utf8` is not well-formed.
pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
    const utf16le_as_bytes = @sliceToBytes(utf16le[0..]);
    var end_index: usize = 0;

    var it = (try Utf8View.init(utf8)).iterator();
    while (it.nextCodepoint()) |codepoint| {
        if (codepoint < 0x10000) {
            if (end_index + 2 > utf16le_as_bytes.len) return utf16le.len + 1;
            // Pass exactly one code unit's worth of bytes so nothing beyond it is touched.
            mem.writeInt(utf16le_as_bytes[end_index .. end_index + 2], @intCast(u16, codepoint), builtin.Endian.Little);
            end_index += 2;
        } else {
            if (end_index + 4 > utf16le_as_bytes.len) return utf16le.len + 1;
            // Split the 20 bits above 0x10000 into two 10-bit halves.
            const offset = codepoint - 0x10000;
            const high = @intCast(u16, 0xd800 + (offset >> 10));
            const low = @intCast(u16, 0xdc00 + (offset & 0x03ff));
            mem.writeInt(utf16le_as_bytes[end_index .. end_index + 2], high, builtin.Endian.Little);
            mem.writeInt(utf16le_as_bytes[end_index + 2 .. end_index + 4], low, builtin.Endian.Little);
            end_index += 4;
        }
    }
    return end_index / 2;
}
|