commit
473cb1fd74
@ -7,6 +7,7 @@ const std = @import("std.zig");
|
|||||||
const math = std.math;
|
const math = std.math;
|
||||||
const assert = std.debug.assert;
|
const assert = std.debug.assert;
|
||||||
const mem = std.mem;
|
const mem = std.mem;
|
||||||
|
const unicode = std.unicode;
|
||||||
const builtin = @import("builtin");
|
const builtin = @import("builtin");
|
||||||
const errol = @import("fmt/errol.zig");
|
const errol = @import("fmt/errol.zig");
|
||||||
const lossyCast = std.math.lossyCast;
|
const lossyCast = std.math.lossyCast;
|
||||||
@ -76,6 +77,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool {
|
|||||||
/// - `b`: output integer value in binary notation
|
/// - `b`: output integer value in binary notation
|
||||||
/// - `o`: output integer value in octal notation
|
/// - `o`: output integer value in octal notation
|
||||||
/// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max.
|
/// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max.
|
||||||
|
/// - `u`: output integer as a UTF-8 sequence. Integer type must have 21 bits at max.
|
||||||
/// - `*`: output the address of the value instead of the value itself.
|
/// - `*`: output the address of the value instead of the value itself.
|
||||||
///
|
///
|
||||||
/// If a formatted user type contains a function of the type
|
/// If a formatted user type contains a function of the type
|
||||||
@ -555,6 +557,12 @@ pub fn formatIntValue(
|
|||||||
} else {
|
} else {
|
||||||
@compileError("Cannot escape character with more than 8 bits");
|
@compileError("Cannot escape character with more than 8 bits");
|
||||||
}
|
}
|
||||||
|
} else if (comptime std.mem.eql(u8, fmt, "u")) {
|
||||||
|
if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) {
|
||||||
|
return formatUnicodeCodepoint(@as(u21, int_value), options, writer);
|
||||||
|
} else {
|
||||||
|
@compileError("Cannot print integer that is larger than 21 bits as an UTF-8 sequence");
|
||||||
|
}
|
||||||
} else if (comptime std.mem.eql(u8, fmt, "b")) {
|
} else if (comptime std.mem.eql(u8, fmt, "b")) {
|
||||||
radix = 2;
|
radix = 2;
|
||||||
uppercase = false;
|
uppercase = false;
|
||||||
@ -641,30 +649,54 @@ pub fn formatAsciiChar(
|
|||||||
return writer.writeAll(@as(*const [1]u8, &c));
|
return writer.writeAll(@as(*const [1]u8, &c));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Writes the codepoint `c` to `writer` as a UTF-8 byte sequence, delegating
/// padding and alignment to `formatBuf` via `options`.
///
/// If `utf8Encode` rejects `c` (surrogate half or codepoint too large), the
/// UTF-8 encoding of U+FFFD (the replacement character) is written instead,
/// so encoding failures never surface as errors to the caller.
pub fn formatUnicodeCodepoint(
    c: u21,
    options: FormatOptions,
    writer: anytype,
) !void {
    // U+FFFD REPLACEMENT CHARACTER, already encoded as UTF-8.
    const replacement = [_]u8{ 0xef, 0xbf, 0xbd };

    var encoded: [4]u8 = undefined;
    if (std.unicode.utf8Encode(c, &encoded)) |encoded_len| {
        return formatBuf(encoded[0..encoded_len], options, writer);
    } else |err| switch (err) {
        error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
            return formatBuf(&replacement, options, writer);
        },
    }
}
|
||||||
|
|
||||||
/// Writes `buf` to `writer`, padded with `options.fill` up to `options.width`
/// columns according to `options.alignment`.
///
/// The width of `buf` is measured in Unicode codepoints, not bytes. If `buf`
/// is not valid UTF-8 its byte length is used instead, i.e. the content is
/// assumed to be ASCII-encoded. When no width is requested, or the content is
/// already at least that wide, `buf` is written unchanged.
pub fn formatBuf(
    buf: []const u8,
    options: FormatOptions,
    writer: anytype,
) !void {
    // Fast path: without a minimum width there is no need to count codepoints.
    const min_width = options.width orelse return writer.writeAll(buf);

    // In case of error assume the buffer content is ASCII-encoded.
    const width = unicode.utf8CountCodepoints(buf) catch buf.len;
    if (width >= min_width) return writer.writeAll(buf);

    const padding = min_width - width;
    switch (options.alignment) {
        .Left => {
            try writer.writeAll(buf);
            try writer.writeByteNTimes(options.fill, padding);
        },
        .Center => {
            // An odd amount of padding puts the extra fill byte on the right.
            const left_padding = padding / 2;
            const right_padding = (padding + 1) / 2;
            try writer.writeByteNTimes(options.fill, left_padding);
            try writer.writeAll(buf);
            try writer.writeByteNTimes(options.fill, right_padding);
        },
        .Right => {
            try writer.writeByteNTimes(options.fill, padding);
            try writer.writeAll(buf);
        },
    }
}
|
||||||
|
|
||||||
@ -1385,6 +1417,22 @@ test "int.specifier" {
|
|||||||
const value: u16 = 0o1234;
|
const value: u16 = 0o1234;
|
||||||
try testFmt("u16: 0o1234\n", "u16: 0o{o}\n", .{value});
|
try testFmt("u16: 0o1234\n", "u16: 0o{o}\n", .{value});
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
const value: u8 = 'a';
|
||||||
|
try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value});
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const value: u21 = 0x1F310;
|
||||||
|
try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value});
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const value: u21 = 0xD800;
|
||||||
|
try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
|
||||||
|
}
|
||||||
|
{
|
||||||
|
const value: u21 = 0x110001;
|
||||||
|
try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
test "int.padded" {
|
test "int.padded" {
|
||||||
@ -1400,6 +1448,10 @@ test "int.padded" {
|
|||||||
try testFmt("i16: '-12345'", "i16: '{:4}'", .{@as(i16, -12345)});
|
try testFmt("i16: '-12345'", "i16: '{:4}'", .{@as(i16, -12345)});
|
||||||
try testFmt("i16: '+12345'", "i16: '{:4}'", .{@as(i16, 12345)});
|
try testFmt("i16: '+12345'", "i16: '{:4}'", .{@as(i16, 12345)});
|
||||||
try testFmt("u16: '12345'", "u16: '{:4}'", .{@as(u16, 12345)});
|
try testFmt("u16: '12345'", "u16: '{:4}'", .{@as(u16, 12345)});
|
||||||
|
|
||||||
|
try testFmt("UTF-8: 'ü '", "UTF-8: '{u:<4}'", .{'ü'});
|
||||||
|
try testFmt("UTF-8: ' ü'", "UTF-8: '{u:>4}'", .{'ü'});
|
||||||
|
try testFmt("UTF-8: ' ü '", "UTF-8: '{u:^4}'", .{'ü'});
|
||||||
}
|
}
|
||||||
|
|
||||||
test "buffer" {
|
test "buffer" {
|
||||||
@ -1929,6 +1981,9 @@ test "padding" {
|
|||||||
try testFmt("==================Filled", "{:=>24}", .{"Filled"});
|
try testFmt("==================Filled", "{:=>24}", .{"Filled"});
|
||||||
try testFmt(" Centered ", "{:^24}", .{"Centered"});
|
try testFmt(" Centered ", "{:^24}", .{"Centered"});
|
||||||
try testFmt("-", "{:-^1}", .{""});
|
try testFmt("-", "{:-^1}", .{""});
|
||||||
|
try testFmt("==crêpe===", "{:=^10}", .{"crêpe"});
|
||||||
|
try testFmt("=====crêpe", "{:=>10}", .{"crêpe"});
|
||||||
|
try testFmt("crêpe=====", "{:=<10}", .{"crêpe"});
|
||||||
}
|
}
|
||||||
|
|
||||||
test "decimal float padding" {
|
test "decimal float padding" {
|
||||||
|
@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
|
|||||||
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
|
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
|
||||||
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
|
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
|
||||||
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    // Deliberately a plain range switch: it is optimized much better than a
    // "smart" approach using @clz.
    return switch (first_byte) {
        0x00...0x7F => 1, // 0xxxxxxx
        0xC0...0xDF => 2, // 110xxxxx
        0xE0...0xEF => 3, // 1110xxxx
        0xF0...0xF7 => 4, // 11110xxx
        // Continuation bytes (10xxxxxx) and 0xF8..0xFF cannot start a sequence.
        else => error.Utf8InvalidStartByte,
    };
}
|
||||||
@ -153,6 +154,50 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
|
|||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true if the given unicode codepoint can be encoded in UTF-8,
/// i.e. it is neither in the surrogate range 0xD800..0xDFFF nor above the
/// maximum codepoint value 0x10FFFF.
pub fn utf8ValidCodepoint(value: u21) bool {
    const is_surrogate = value >= 0xD800 and value <= 0xDFFF;
    const above_maximum = value > 0x10FFFF;
    return !(is_surrogate or above_maximum);
}
|
||||||
|
|
||||||
|
/// Returns the number of unicode codepoints in the UTF-8 encoded slice `s`.
///
/// Multi-byte sequences are validated while counting, so invalid input is
/// reported as an error rather than asserted: an invalid start byte is
/// propagated from `utf8ByteSequenceLength`, a sequence that runs past the end
/// of `s` yields `error.TruncatedInput`, and any error from `utf8Decode` is
/// propagated as-is.
pub fn utf8CountCodepoints(s: []const u8) !usize {
    const word_size = @sizeOf(usize);
    // 0x80 replicated into every byte of a usize: a non-zero AND with this
    // mask means at least one byte of the word is not ASCII.
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var count: usize = 0;
    var pos: usize = 0;
    while (pos < s.len) {
        // Fast path for ASCII sequences: a word whose bytes all have the high
        // bit clear contributes one codepoint per byte.
        while (pos + word_size <= s.len) {
            const word = mem.readIntNative(usize, s[pos..][0..word_size]);
            if (word & high_bits != 0) break;
            count += word_size;
            pos += word_size;
        }

        // The fast path may have consumed the remainder of the input.
        if (pos >= s.len) break;

        // Slow path: consume exactly one codepoint.
        const seq_len = try utf8ByteSequenceLength(s[pos]);
        if (pos + seq_len > s.len) return error.TruncatedInput;

        // Single-byte (ASCII) sequences need no validation.
        if (seq_len != 1) _ = try utf8Decode(s[pos .. pos + seq_len]);

        pos += seq_len;
        count += 1;
    }

    return count;
}
|
||||||
|
|
||||||
pub fn utf8ValidateSlice(s: []const u8) bool {
|
pub fn utf8ValidateSlice(s: []const u8) bool {
|
||||||
var i: usize = 0;
|
var i: usize = 0;
|
||||||
while (i < s.len) {
|
while (i < s.len) {
|
||||||
@ -687,7 +732,6 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns length of a supplied UTF-8 string literal. Asserts that the data is valid UTF-8.
|
|
||||||
fn calcUtf16LeLen(utf8: []const u8) usize {
|
fn calcUtf16LeLen(utf8: []const u8) usize {
|
||||||
var src_i: usize = 0;
|
var src_i: usize = 0;
|
||||||
var dest_len: usize = 0;
|
var dest_len: usize = 0;
|
||||||
@ -757,3 +801,31 @@ test "utf8ToUtf16LeStringLiteral" {
|
|||||||
testing.expect(utf16[2] == 0);
|
testing.expect(utf16[2] == 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks `utf8CountCodepoints` against strings with a known codepoint count.
/// Kept as a plain function so it can run both at runtime and at comptime.
fn testUtf8CountCodepoints() !void {
    const Case = struct {
        text: []const u8,
        codepoints: usize,
    };
    const cases = [_]Case{
        .{ .text = "abcdefghij", .codepoints = 10 },
        .{ .text = "äåéëþüúíóö", .codepoints = 10 },
        .{ .text = "こんにちは", .codepoints = 5 },
    };

    for (cases) |case| {
        testing.expectEqual(case.codepoints, try utf8CountCodepoints(case.text));
    }

    // NOTE(review): this check was already disabled in the original; the
    // reason is not visible from here.
    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
}

test "utf8 count codepoints" {
    // Exercise the runtime path first, then the comptime path.
    try testUtf8CountCodepoints();
    comptime testUtf8CountCodepoints() catch unreachable;
}
|
||||||
|
|
||||||
|
/// Checks `utf8ValidCodepoint` on codepoints around the surrogate range and
/// the maximum codepoint value. Kept as a plain function so it can run both
/// at runtime and at comptime.
fn testUtf8ValidCodepoint() !void {
    const accepted = [_]u21{ 'e', 'ë', 'は', 0xe000, 0x10ffff };
    const rejected = [_]u21{ 0xd800, 0xdfff, 0x110000 };

    for (accepted) |codepoint| {
        testing.expect(utf8ValidCodepoint(codepoint));
    }
    for (rejected) |codepoint| {
        testing.expect(!utf8ValidCodepoint(codepoint));
    }
}

test "utf8 valid codepoint" {
    // Exercise the runtime path first, then the comptime path.
    try testUtf8ValidCodepoint();
    comptime testUtf8ValidCodepoint() catch unreachable;
}
|
||||||
|
@ -3,47 +3,79 @@
|
|||||||
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
|
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
|
||||||
// The MIT license requires this copyright notice to be included in all copies
|
// The MIT license requires this copyright notice to be included in all copies
|
||||||
// and substantial portions of the software.
|
// and substantial portions of the software.
|
||||||
const builtin = @import("builtin");
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
|
const builtin = std.builtin;
|
||||||
|
const time = std.time;
|
||||||
|
const unicode = std.unicode;
|
||||||
|
|
||||||
|
const Timer = time.Timer;
|
||||||
|
|
||||||
|
const N = 1_000_000;
|
||||||
|
|
||||||
|
const KiB = 1024;
|
||||||
|
const MiB = 1024 * KiB;
|
||||||
|
const GiB = 1024 * MiB;
|
||||||
|
|
||||||
|
/// Outcome of one `benchmarkCodepointCount` run.
const ResultCount = struct {
    /// Codepoint count returned by the last `utf8CountCodepoints` call.
    count: usize,
    /// Input bytes processed per second.
    throughput: u64,
};

/// Calls `std.unicode.utf8CountCodepoints` on `buf` `N` times and reports the
/// last returned count together with the measured throughput in bytes per
/// second. Propagates timer start-up errors and any error returned by
/// `utf8CountCodepoints`.
fn benchmarkCodepointCount(buf: []const u8) !ResultCount {
    var timer = try Timer.start();

    const total_bytes = N * buf.len;

    // Restart the clock right before the loop so that `read()` below is the
    // loop duration itself. (The previous `end - start` form subtracted the
    // value returned by `lap()`, which is the time elapsed *before* the loop.)
    timer.reset();

    var last_count: usize = undefined;
    var iteration: usize = 0;
    while (iteration < N) : (iteration += 1) {
        // never_inline keeps the call from being inlined into this loop.
        last_count = try @call(
            .{ .modifier = .never_inline },
            std.unicode.utf8CountCodepoints,
            .{buf},
        );
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @intToFloat(f64, elapsed_ns) / time.ns_per_s;
    const bytes_per_second = @intToFloat(f64, total_bytes) / elapsed_s;

    return ResultCount{
        .count = last_count,
        .throughput = @floatToInt(u64, bytes_per_second),
    };
}
|
||||||
|
|
||||||
/// Runs the codepoint-counting benchmark over a fixed set of inputs and
/// prints, for each one, a title line followed by the throughput in MiB/s and
/// the codepoint count returned for the input.
pub fn main() !void {
    const stdout = std.io.getStdOut().outStream();

    // The command-line arguments were allocated but never read or freed;
    // the benchmark inputs are fixed, so the allocation is dropped.

    const Case = struct {
        title: []const u8,
        text: []const u8,
    };
    const cases = [_]Case{
        .{ .title = "short ASCII strings\n", .text = "abc" },
        .{ .title = "short Unicode strings\n", .text = "ŌŌŌ" },
        .{ .title = "pure ASCII strings\n", .text = "hello" ** 16 },
        .{ .title = "pure Unicode strings\n", .text = "こんにちは" ** 16 },
        .{ .title = "mixed ASCII/Unicode strings\n", .text = "Hyvää huomenta" ** 16 },
    };

    for (cases) |case| {
        try stdout.writeAll(case.title);
        const result = try benchmarkCodepointCount(case.text);
        try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
    }
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user