zig/lib/std/json.zig
hryx 2933a8241a json: disallow overlong and out-of-range UTF-8
Fixes #2379

= Overlong (non-shortest) sequences

UTF-8's unique encoding scheme allows for some Unicode codepoints
to be represented in multiple ways. For any of these characters,
the spec forbids all but the shortest form. These disallowed longer
sequences are called "overlong". As an interesting side effect of
this rule, the bytes C0 and C1 never appear in valid UTF-8.

= Codepoint range

UTF-8 disallows representation of codepoints beyond U+10FFFF,
which is the highest character which can be encoded in UTF-16.
Because a 4-byte sequence is capable of resulting in such characters,
they must be explicitly rejected. This rule also has an interesting
side effect, which is that bytes F5 to FF never appear.

= References

Detecting an overlong version of a codepoint could get gnarly, but
luckily The Unicode Consortium did the hard work by creating this
handy table of valid byte sequences:

https://unicode.org/versions/corrigendum1.html

I thought this mapped nicely to the parser's state machine, so I
rearranged the relevant states to make use of it.
2020-01-07 12:07:44 -05:00

1654 lines
55 KiB
Zig

// JSON parser conforming to RFC8259.
//
// https://tools.ietf.org/html/rfc8259
const std = @import("std.zig");
const debug = std.debug;
const assert = debug.assert;
const testing = std.testing;
const mem = std.mem;
const maxInt = std.math.maxInt;
pub const WriteStream = @import("json/write_stream.zig").WriteStream;
/// Records whether a string token contained backslash escapes, and if so how the
/// length of the unescaped string differs from the raw token length.
const StringEscapes = union(enum) {
/// The string holds no escape sequence.
None,
/// The string holds at least one escape sequence.
Some: struct {
/// Signed adjustment that `Token.String.decodedLength` adds to the raw byte count.
size_diff: isize,
},
};
/// A single token slice into the parent string.
///
/// Use `token.slice()` on the input at the current position to get the current slice.
pub const Token = union(enum) {
    ObjectBegin,
    ObjectEnd,
    ArrayBegin,
    ArrayEnd,
    String: struct {
        /// Length of the token in bytes.
        count: usize,

        /// Tells whether the string has escape sequences, in which case it cannot be zero-copied.
        escapes: StringEscapes,

        /// Length in bytes of the string once its escape sequences are decoded.
        pub fn decodedLength(self: @This()) usize {
            const diff: isize = switch (self.escapes) {
                .None => 0,
                .Some => |some| some.size_diff,
            };
            // size_diff is signed; reinterpreting it and adding with wrap-around subtracts it.
            return self.count +% @bitCast(usize, diff);
        }

        /// View of the token inside `input`, where `i` is the index just past the token.
        pub fn slice(self: @This(), input: []const u8, i: usize) []const u8 {
            const start = i - self.count;
            return input[start..i];
        }
    },
    Number: struct {
        /// Length of the token in bytes.
        count: usize,

        /// True when the number has neither `.` nor `e` and so can be held by an integer.
        is_integer: bool,

        /// View of the token inside `input`, where `i` is the index just past the token.
        pub fn slice(self: @This(), input: []const u8, i: usize) []const u8 {
            const start = i - self.count;
            return input[start..i];
        }
    },
    True,
    False,
    Null,
};
/// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
/// they are encountered. No copies or allocations are performed during parsing and the entire
/// parsing state requires ~40-50 bytes of stack space.
///
/// Conforms strictly to RFC8259.
///
/// For a non-byte based wrapper, consider using TokenStream instead.
pub const StreamingParser = struct {
// Current state
state: State,
// How many bytes we have counted for the current token
// (incremented on every fed byte, zeroed when a string/number/literal token starts)
count: usize,
// What state to follow after parsing a string (either property or value string)
after_string_state: State,
// What state to follow after parsing a value (either top-level or value end)
after_value_state: State,
// If we stopped now, would the complete parsed string to now be a valid json string
complete: bool,
// Current token flags to pass through to the next generated, see Token.
string_escapes: StringEscapes,
// When in .String states, was the previous character a high surrogate?
string_last_was_high_surrogate: bool,
// Used inside of StringEscapeHexUnicode* states
string_unicode_codepoint: u21,
// The first byte needs to be stored to validate 3- and 4-byte sequences.
sequence_first_byte: u8 = undefined,
// When in .Number states, is the number a (still) valid integer?
number_is_integer: bool,
// Bit-stack for nested object/map literals (max 255 nestings).
// The lowest bit describes the innermost open container.
stack: u256,
// Number of containers currently open, i.e. how many bits of `stack` are in use.
stack_used: u8,
// Bit values stored in `stack` for each container kind.
const object_bit = 0;
const array_bit = 1;
// Nesting depth at which opening another container is refused.
const max_stack_size = maxInt(u8);
/// Create a parser that is ready to receive the first byte of a document.
pub fn init() StreamingParser {
    // Every field that is read before being written is assigned by reset().
    var parser: StreamingParser = undefined;
    parser.reset();
    return parser;
}
/// Return the parser to its initial state so that a new document can be fed.
pub fn reset(p: *StreamingParser) void {
    // State machine position.
    p.state = .TopLevelBegin;
    p.complete = false;
    p.count = 0;

    // Empty nesting stack.
    p.stack_used = 0;
    p.stack = 0;

    // handle end of values normally
    p.after_value_state = .ValueEnd;
    // Set before ever read in main transition function
    p.after_string_state = undefined;

    // Per-token scratch fields, assigned whenever the matching token starts.
    p.number_is_integer = undefined;
    p.string_unicode_codepoint = undefined;
    p.string_last_was_high_surrogate = undefined;
    p.string_escapes = undefined;
}
/// Every position the tokenizer state machine can be in.
pub const State = enum {
// These must be first with these explicit values as we rely on them for indexing the
// bit-stack directly and avoiding a branch.
ObjectSeparator = 0,
ValueEnd = 1,
// Document level: before the top-level value and after it.
TopLevelBegin,
TopLevelEnd,
// Start of a value; the NoClosing variant does not accept '}' or ']'.
ValueBegin,
ValueBeginNoClosing,
// Inside a string, including the continuation bytes of multi-byte UTF-8 sequences.
String,
StringUtf8Byte2Of2,
StringUtf8Byte2Of3,
StringUtf8Byte3Of3,
StringUtf8Byte2Of4,
StringUtf8Byte3Of4,
StringUtf8Byte4Of4,
// Inside a backslash escape; the number is how many hex digits are still expected.
StringEscapeCharacter,
StringEscapeHexUnicode4,
StringEscapeHexUnicode3,
StringEscapeHexUnicode2,
StringEscapeHexUnicode1,
// Inside a number.
Number,
NumberMaybeDotOrExponent,
NumberMaybeDigitOrDotOrExponent,
NumberFractionalRequired,
NumberFractional,
NumberMaybeExponent,
NumberExponent,
NumberExponentDigitsRequired,
NumberExponentDigits,
// Inside `true`, `false` or `null`; the number is how many letters were matched so far.
TrueLiteral1,
TrueLiteral2,
TrueLiteral3,
FalseLiteral1,
FalseLiteral2,
FalseLiteral3,
FalseLiteral4,
NullLiteral1,
NullLiteral2,
NullLiteral3,
// Only call this function to generate array/object final state.
// `x` must be 0 (giving ObjectSeparator) or 1 (giving ValueEnd).
pub fn fromInt(x: var) State {
debug.assert(x == 0 or x == 1);
const T = @TagType(State);
return @intToEnum(State, @intCast(T, x));
}
};
/// Everything `feed` can fail with.
pub const Error = error{
// The first significant byte of the document cannot start a value.
InvalidTopLevel,
// Opening another container would exceed `max_stack_size`.
TooManyNestedItems,
// A closer was seen while no container was open.
TooManyClosingItems,
// A byte that cannot start a value appeared where a value was expected.
InvalidValueBegin,
// A byte other than ',', a closer or whitespace followed a value.
InvalidValueEnd,
// ']' or '}' followed a value while no container was open.
UnbalancedBrackets,
UnbalancedBraces,
// A closer did not match the kind of the innermost open container.
UnexpectedClosingBracket,
UnexpectedClosingBrace,
// A number is malformed.
InvalidNumber,
// Something other than ':' or whitespace followed an object key.
InvalidSeparator,
// `true`, `false` or `null` is misspelled.
InvalidLiteral,
// The byte after a backslash is not a known escape.
InvalidEscapeCharacter,
// A \u escape contains a non-hexadecimal digit.
InvalidUnicodeHexSymbol,
// A string contains a byte sequence that is not valid UTF-8.
InvalidUtf8Byte,
// Non-whitespace bytes follow the top-level value.
InvalidTopLevelTrailing,
// A string contains an unescaped byte in the range 0x00-0x1F.
InvalidControlCharacter,
};
/// Push one more byte into the parser and collect the tokens it completes, if any.
/// At most two tokens are produced (two is rare), and `token2` is only ever set
/// when `token1` is set as well.
///
/// There is currently no error recovery on a bad stream.
pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
    token2.* = null;
    token1.* = null;
    p.count += 1;

    const byte_needs_second_pass = try p.transition(c, token1);
    if (!byte_needs_second_pass) {
        return;
    }
    // unlikely: the byte ended a token without being consumed, so process it again
    _ = try p.transition(c, token2);
}
// Perform a single transition on the state machine and return any possible token.
//
// Returns true when `c` ended a number token without being consumed itself; the caller must
// then run a second transition with the same byte. Returns an error as soon as `c` cannot
// continue a valid document from the current state.
fn transition(p: *StreamingParser, c: u8, token: *?Token) Error!bool {
    // Only a top-level value can complete the document; nested values never do.
    const value_is_top_level = p.after_value_state == .TopLevelEnd;

    switch (p.state) {
        .TopLevelBegin => switch (c) {
            '{' => try p.openContainer(true, token),
            '[' => try p.openContainer(false, token),
            0x09, 0x0A, 0x0D, 0x20 => {
                // whitespace
            },
            else => {
                if (!p.beginScalar(c)) {
                    return error.InvalidTopLevel;
                }
                p.after_value_state = .TopLevelEnd;
                // We don't actually need the following since after_value_state should override.
                p.after_string_state = .ValueEnd;
                // A single digit is already a complete document.
                switch (p.state) {
                    .NumberMaybeDotOrExponent, .NumberMaybeDigitOrDotOrExponent => p.complete = true,
                    else => {},
                }
            },
        },

        .TopLevelEnd => switch (c) {
            0x09, 0x0A, 0x0D, 0x20 => {
                // whitespace
            },
            else => {
                return error.InvalidTopLevelTrailing;
            },
        },

        // Directly after '{' or '[': a value may start, or the container may close while empty.
        .ValueBegin => switch (c) {
            '}' => {
                // unlikely
                if (p.stack & 1 != object_bit) {
                    return error.UnexpectedClosingBrace;
                }
                if (p.stack_used == 0) {
                    return error.TooManyClosingItems;
                }
                p.closeContainer();
                token.* = Token.ObjectEnd;
            },
            ']' => {
                if (p.stack & 1 != array_bit) {
                    return error.UnexpectedClosingBracket;
                }
                if (p.stack_used == 0) {
                    return error.TooManyClosingItems;
                }
                p.closeContainer();
                token.* = Token.ArrayEnd;
            },
            else => try p.beginNestedValue(c, token),
        },

        // After ',' or ':' a value is mandatory, so closers are rejected here.
        .ValueBeginNoClosing => try p.beginNestedValue(c, token),

        .ValueEnd => switch (c) {
            ',' => {
                p.after_string_state = State.fromInt(p.stack & 1);
                p.state = .ValueBeginNoClosing;
            },
            ']' => {
                if (p.stack_used == 0) {
                    return error.UnbalancedBrackets;
                }
                // The closer has to match the innermost container ("[1}" is not JSON).
                if (p.stack & 1 != array_bit) {
                    return error.UnexpectedClosingBracket;
                }
                p.closeContainer();
                token.* = Token.ArrayEnd;
            },
            '}' => {
                if (p.stack_used == 0) {
                    return error.UnbalancedBraces;
                }
                if (p.stack & 1 != object_bit) {
                    return error.UnexpectedClosingBrace;
                }
                p.closeContainer();
                token.* = Token.ObjectEnd;
            },
            0x09, 0x0A, 0x0D, 0x20 => {
                // whitespace
            },
            else => {
                return error.InvalidValueEnd;
            },
        },

        .ObjectSeparator => switch (c) {
            ':' => {
                // A value must follow the colon, "{"a":}" is not valid.
                p.state = .ValueBeginNoClosing;
                p.after_string_state = .ValueEnd;
            },
            0x09, 0x0A, 0x0D, 0x20 => {
                // whitespace
            },
            else => {
                return error.InvalidSeparator;
            },
        },

        .String => switch (c) {
            0x00...0x1F => {
                return error.InvalidControlCharacter;
            },
            '"' => {
                p.state = p.after_string_state;
                if (p.after_value_state == .TopLevelEnd) {
                    p.state = .TopLevelEnd;
                    p.complete = true;
                }
                token.* = .{
                    .String = .{
                        // The closing quote was counted but is not part of the token.
                        .count = p.count - 1,
                        .escapes = p.string_escapes,
                    },
                };
                p.string_escapes = undefined;
                p.string_last_was_high_surrogate = undefined;
            },
            '\\' => {
                p.state = .StringEscapeCharacter;
                switch (p.string_escapes) {
                    .None => {
                        p.string_escapes = .{ .Some = .{ .size_diff = 0 } };
                    },
                    .Some => {},
                }
            },
            0x20, 0x21, 0x23...0x5B, 0x5D...0x7F => {
                // non-control ascii
                p.string_last_was_high_surrogate = false;
            },
            // Lead bytes of multi-byte sequences. 0xC0, 0xC1 and 0xF5...0xFF never occur in
            // valid UTF-8 and fall through to the error below. Any raw character also ends a
            // pending escaped high surrogate.
            0xC2...0xDF => {
                p.state = .StringUtf8Byte2Of2;
                p.string_last_was_high_surrogate = false;
            },
            0xE0...0xEF => {
                p.state = .StringUtf8Byte2Of3;
                p.sequence_first_byte = c;
                p.string_last_was_high_surrogate = false;
            },
            0xF0...0xF4 => {
                p.state = .StringUtf8Byte2Of4;
                p.sequence_first_byte = c;
                p.string_last_was_high_surrogate = false;
            },
            else => {
                return error.InvalidUtf8Byte;
            },
        },

        .StringUtf8Byte2Of2 => switch (c >> 6) {
            0b10 => p.state = .String,
            else => return error.InvalidUtf8Byte,
        },

        .StringUtf8Byte2Of3 => {
            // The allowed range of the second byte depends on the first one; this is what
            // rejects overlong 3-byte forms.
            switch (p.sequence_first_byte) {
                0xE0 => switch (c) {
                    0xA0...0xBF => {},
                    else => return error.InvalidUtf8Byte,
                },
                0xE1...0xEF => switch (c) {
                    0x80...0xBF => {},
                    else => return error.InvalidUtf8Byte,
                },
                else => return error.InvalidUtf8Byte,
            }
            p.state = .StringUtf8Byte3Of3;
        },

        .StringUtf8Byte3Of3 => switch (c) {
            0x80...0xBF => p.state = .String,
            else => return error.InvalidUtf8Byte,
        },

        .StringUtf8Byte2Of4 => {
            // Rejects overlong 4-byte forms (after 0xF0) and codepoints above U+10FFFF
            // (after 0xF4).
            switch (p.sequence_first_byte) {
                0xF0 => switch (c) {
                    0x90...0xBF => {},
                    else => return error.InvalidUtf8Byte,
                },
                0xF1...0xF3 => switch (c) {
                    0x80...0xBF => {},
                    else => return error.InvalidUtf8Byte,
                },
                0xF4 => switch (c) {
                    0x80...0x8F => {},
                    else => return error.InvalidUtf8Byte,
                },
                else => return error.InvalidUtf8Byte,
            }
            p.state = .StringUtf8Byte3Of4;
        },

        .StringUtf8Byte3Of4 => switch (c) {
            0x80...0xBF => p.state = .StringUtf8Byte4Of4,
            else => return error.InvalidUtf8Byte,
        },

        .StringUtf8Byte4Of4 => switch (c) {
            0x80...0xBF => p.state = .String,
            else => return error.InvalidUtf8Byte,
        },

        .StringEscapeCharacter => switch (c) {
            // NOTE: '/' is allowed as an escaped character but it also is allowed
            // as unescaped according to the RFC. There is a reported errata which suggests
            // removing the non-escaped variant but it makes more sense to simply disallow
            // it as an escape code here.
            //
            // The current JSONTestSuite tests rely on both of this behaviour being present
            // however, so we default to the status quo where both are accepted until this
            // is further clarified.
            '"', '\\', '/', 'b', 'f', 'n', 'r', 't' => {
                // Two input bytes decode to a single output byte.
                p.string_escapes.Some.size_diff -= 1;
                p.state = .String;
                p.string_last_was_high_surrogate = false;
            },
            'u' => {
                p.state = .StringEscapeHexUnicode4;
            },
            else => {
                return error.InvalidEscapeCharacter;
            },
        },

        .StringEscapeHexUnicode4 => {
            p.string_unicode_codepoint = (try hexDigitValue(c)) << 12;
            p.state = .StringEscapeHexUnicode3;
        },

        .StringEscapeHexUnicode3 => {
            p.string_unicode_codepoint |= (try hexDigitValue(c)) << 8;
            p.state = .StringEscapeHexUnicode2;
        },

        .StringEscapeHexUnicode2 => {
            p.string_unicode_codepoint |= (try hexDigitValue(c)) << 4;
            p.state = .StringEscapeHexUnicode1;
        },

        .StringEscapeHexUnicode1 => {
            p.string_unicode_codepoint |= try hexDigitValue(c);
            p.state = .String;

            // The escape occupied 6 input bytes; account for how many bytes it decodes to.
            if (p.string_unicode_codepoint < 0xD800 or p.string_unicode_codepoint >= 0xE000) {
                // not part of surrogate pair
                p.string_escapes.Some.size_diff -= @as(isize, 6 - (std.unicode.utf8CodepointSequenceLength(p.string_unicode_codepoint) catch unreachable));
                p.string_last_was_high_surrogate = false;
            } else if (p.string_unicode_codepoint < 0xDC00) {
                // 'high' surrogate
                // takes 3 bytes to encode a half surrogate pair into wtf8
                p.string_escapes.Some.size_diff -= 6 - 3;
                p.string_last_was_high_surrogate = true;
            } else {
                // 'low' surrogate
                p.string_escapes.Some.size_diff -= 6;
                if (p.string_last_was_high_surrogate) {
                    // takes 4 bytes to encode a full surrogate pair into utf8
                    // 3 bytes are already reserved by high surrogate
                    p.string_escapes.Some.size_diff -= -1;
                } else {
                    // takes 3 bytes to encode a half surrogate pair into wtf8
                    p.string_escapes.Some.size_diff -= -3;
                }
                p.string_last_was_high_surrogate = false;
            }
            p.string_unicode_codepoint = undefined;
        },

        // In every number state `complete` is updated for the state that is entered, so that
        // it is false while more digits are required ("1." or "1e") and true otherwise.
        .Number => switch (c) {
            '0' => {
                p.complete = value_is_top_level;
                p.state = .NumberMaybeDotOrExponent;
            },
            '1'...'9' => {
                p.complete = value_is_top_level;
                p.state = .NumberMaybeDigitOrDotOrExponent;
            },
            else => {
                return error.InvalidNumber;
            },
        },

        .NumberMaybeDotOrExponent => switch (c) {
            '.' => {
                p.complete = false;
                p.number_is_integer = false;
                p.state = .NumberFractionalRequired;
            },
            'e', 'E' => {
                p.complete = false;
                p.number_is_integer = false;
                p.state = .NumberExponent;
            },
            else => {
                p.finishNumber(token);
                return true;
            },
        },

        .NumberMaybeDigitOrDotOrExponent => switch (c) {
            '.' => {
                p.complete = false;
                p.number_is_integer = false;
                p.state = .NumberFractionalRequired;
            },
            'e', 'E' => {
                p.complete = false;
                p.number_is_integer = false;
                p.state = .NumberExponent;
            },
            '0'...'9' => {
                // another digit
                p.complete = value_is_top_level;
            },
            else => {
                p.finishNumber(token);
                return true;
            },
        },

        .NumberFractionalRequired => switch (c) {
            '0'...'9' => {
                p.complete = value_is_top_level;
                p.state = .NumberFractional;
            },
            else => {
                return error.InvalidNumber;
            },
        },

        .NumberFractional => switch (c) {
            '0'...'9' => {
                // another digit
                p.complete = value_is_top_level;
            },
            'e', 'E' => {
                p.complete = false;
                p.number_is_integer = false;
                p.state = .NumberExponent;
            },
            else => {
                p.finishNumber(token);
                return true;
            },
        },

        .NumberMaybeExponent => switch (c) {
            'e', 'E' => {
                p.complete = false;
                p.number_is_integer = false;
                p.state = .NumberExponent;
            },
            else => {
                p.finishNumber(token);
                return true;
            },
        },

        .NumberExponent => switch (c) {
            '-', '+' => {
                p.complete = false;
                p.state = .NumberExponentDigitsRequired;
            },
            '0'...'9' => {
                p.complete = value_is_top_level;
                p.state = .NumberExponentDigits;
            },
            else => {
                return error.InvalidNumber;
            },
        },

        .NumberExponentDigitsRequired => switch (c) {
            '0'...'9' => {
                p.complete = value_is_top_level;
                p.state = .NumberExponentDigits;
            },
            else => {
                return error.InvalidNumber;
            },
        },

        .NumberExponentDigits => switch (c) {
            '0'...'9' => {
                // another digit
                p.complete = value_is_top_level;
            },
            else => {
                p.finishNumber(token);
                return true;
            },
        },

        .TrueLiteral1 => switch (c) {
            'r' => p.state = .TrueLiteral2,
            else => return error.InvalidLiteral,
        },

        .TrueLiteral2 => switch (c) {
            'u' => p.state = .TrueLiteral3,
            else => return error.InvalidLiteral,
        },

        .TrueLiteral3 => switch (c) {
            'e' => {
                p.state = p.after_value_state;
                p.complete = p.state == .TopLevelEnd;
                token.* = Token.True;
            },
            else => {
                return error.InvalidLiteral;
            },
        },

        .FalseLiteral1 => switch (c) {
            'a' => p.state = .FalseLiteral2,
            else => return error.InvalidLiteral,
        },

        .FalseLiteral2 => switch (c) {
            'l' => p.state = .FalseLiteral3,
            else => return error.InvalidLiteral,
        },

        .FalseLiteral3 => switch (c) {
            's' => p.state = .FalseLiteral4,
            else => return error.InvalidLiteral,
        },

        .FalseLiteral4 => switch (c) {
            'e' => {
                p.state = p.after_value_state;
                p.complete = p.state == .TopLevelEnd;
                token.* = Token.False;
            },
            else => {
                return error.InvalidLiteral;
            },
        },

        .NullLiteral1 => switch (c) {
            'u' => p.state = .NullLiteral2,
            else => return error.InvalidLiteral,
        },

        .NullLiteral2 => switch (c) {
            'l' => p.state = .NullLiteral3,
            else => return error.InvalidLiteral,
        },

        .NullLiteral3 => switch (c) {
            'l' => {
                p.state = p.after_value_state;
                p.complete = p.state == .TopLevelEnd;
                token.* = Token.Null;
            },
            else => {
                return error.InvalidLiteral;
            },
        },
    }

    return false;
}

// Push an object or array onto the nesting stack and emit its begin token.
fn openContainer(p: *StreamingParser, is_object: bool, token: *?Token) Error!void {
    if (p.stack_used == max_stack_size) {
        return error.TooManyNestedItems;
    }
    p.stack <<= 1;
    p.stack_used += 1;
    p.state = .ValueBegin;
    if (is_object) {
        p.stack |= object_bit;
        p.after_string_state = .ObjectSeparator;
        token.* = Token.ObjectBegin;
    } else {
        p.stack |= array_bit;
        p.after_string_state = .ValueEnd;
        token.* = Token.ArrayBegin;
    }
}

// Pop the innermost container and move to the state that follows it. The caller has
// already verified that a container of the right kind is open.
fn closeContainer(p: *StreamingParser) void {
    p.after_string_state = State.fromInt(p.stack & 1);
    p.stack >>= 1;
    p.stack_used -= 1;
    if (p.stack_used == 0) {
        p.complete = true;
        p.state = .TopLevelEnd;
    } else {
        p.state = .ValueEnd;
    }
}

// Start a number, string or literal token if `c` can begin one. Returns false and leaves
// the parser untouched otherwise.
fn beginScalar(p: *StreamingParser, c: u8) bool {
    switch (c) {
        '-' => {
            p.number_is_integer = true;
            p.state = .Number;
        },
        '0' => {
            p.number_is_integer = true;
            p.state = .NumberMaybeDotOrExponent;
        },
        '1'...'9' => {
            p.number_is_integer = true;
            p.state = .NumberMaybeDigitOrDotOrExponent;
        },
        '"' => {
            p.state = .String;
            p.string_escapes = .None;
            p.string_last_was_high_surrogate = false;
        },
        't' => p.state = .TrueLiteral1,
        'f' => p.state = .FalseLiteral1,
        'n' => p.state = .NullLiteral1,
        else => return false,
    }
    // The token length is counted from the byte after this one.
    p.count = 0;
    return true;
}

// Handle a byte at a position inside a container where a value has to start.
fn beginNestedValue(p: *StreamingParser, c: u8, token: *?Token) Error!void {
    switch (c) {
        '{' => try p.openContainer(true, token),
        '[' => try p.openContainer(false, token),
        0x09, 0x0A, 0x0D, 0x20 => {
            // whitespace
        },
        else => {
            if (!p.beginScalar(c)) {
                return error.InvalidValueBegin;
            }
        },
    }
}

// Emit the number token that the current byte has just terminated.
fn finishNumber(p: *StreamingParser, token: *?Token) void {
    p.complete = p.after_value_state == .TopLevelEnd;
    p.state = p.after_value_state;
    token.* = .{
        .Number = .{
            .count = p.count,
            .is_integer = p.number_is_integer,
        },
    };
}

// Value of one hexadecimal digit of a \u escape.
fn hexDigitValue(c: u8) Error!u21 {
    switch (c) {
        '0'...'9' => return @as(u21, c - '0'),
        'A'...'F' => return @as(u21, c - 'A' + 10),
        'a'...'f' => return @as(u21, c - 'a' + 10),
        else => return error.InvalidUnicodeHexSymbol,
    }
}
};
/// Drives a StreamingParser over a complete input slice and hands out its json Tokens one
/// at a time.
pub const TokenStream = struct {
    /// Index of the next byte of `slice` to feed to the parser.
    i: usize,
    slice: []const u8,
    parser: StreamingParser,
    /// Second token of the previous feed, handed out by the following call to `next`.
    token: ?Token,

    pub const Error = StreamingParser.Error || error{UnexpectedEndOfJson};

    pub fn init(slice: []const u8) TokenStream {
        return TokenStream{
            .parser = StreamingParser.init(),
            .slice = slice,
            .token = null,
            .i = 0,
        };
    }

    /// Return the next token, or null once the document has been consumed completely.
    pub fn next(self: *TokenStream) Error!?Token {
        if (self.token) |queued| {
            // TODO: Audit this pattern once #2915 is closed
            const copy = queued;
            self.token = null;
            return copy;
        }

        var first: ?Token = undefined;
        var second: ?Token = undefined;

        while (self.i < self.slice.len) {
            const byte = self.slice[self.i];
            try self.parser.feed(byte, &first, &second);
            self.i += 1;

            if (first) |token| {
                self.token = second;
                return token;
            }
        }

        // Without this a bare number fails, the streaming parser doesn't know the input ended
        try self.parser.feed(' ', &first, &second);
        self.i += 1;

        if (first) |token| {
            return token;
        }
        if (self.parser.complete) {
            return null;
        }
        return error.UnexpectedEndOfJson;
    }
};
// Test helper: pull one token from the stream and assert that it has the expected tag.
fn checkNext(p: *TokenStream, id: std.meta.TagType(Token)) void {
    const maybe_token = p.next() catch unreachable;
    const actual = std.meta.activeTag(maybe_token.?);
    debug.assert(actual == id);
}
test "json.token" {
    const s =
        \\{
        \\ "Image": {
        \\ "Width": 800,
        \\ "Height": 600,
        \\ "Title": "View from 15th Floor",
        \\ "Thumbnail": {
        \\ "Url": "http://www.example.com/image/481989943",
        \\ "Height": 125,
        \\ "Width": 100
        \\ },
        \\ "Animated" : false,
        \\ "IDs": [116, 943, 234, 38793]
        \\ }
        \\}
    ;

    // Tags of the tokens the document above must produce, in order.
    const expected = [_]std.meta.TagType(Token){
        .ObjectBegin,
        .String, // Image
        .ObjectBegin,
        .String, // Width
        .Number,
        .String, // Height
        .Number,
        .String, // Title
        .String,
        .String, // Thumbnail
        .ObjectBegin,
        .String, // Url
        .String,
        .String, // Height
        .Number,
        .String, // Width
        .Number,
        .ObjectEnd,
        .String, // Animated
        .False,
        .String, // IDs
        .ArrayBegin,
        .Number,
        .Number,
        .Number,
        .Number,
        .ArrayEnd,
        .ObjectEnd,
        .ObjectEnd,
    };

    var p = TokenStream.init(s);
    for (expected) |id| {
        checkNext(&p, id);
    }

    // Nothing may follow the final closing brace.
    testing.expect((try p.next()) == null);
}
/// Validate a JSON string. This does not limit number precision so a decoder may not necessarily
/// be able to decode the string even if this returns true.
///
/// Returns true only when `s` holds exactly one complete JSON value, optionally surrounded by
/// whitespace.
pub fn validate(s: []const u8) bool {
    var p = StreamingParser.init();
    var token1: ?Token = undefined;
    var token2: ?Token = undefined;

    for (s) |c| {
        p.feed(c, &token1, &token2) catch return false;
    }

    // The streaming parser cannot know that the input ended. A trailing whitespace byte is
    // valid after any complete document, terminates a bare top-level number such as "1", and
    // makes a truncated number such as "1." fail instead of being reported as complete.
    p.feed(' ', &token1, &token2) catch return false;

    return p.complete;
}
test "json.validate" {
    // The empty object is the smallest container document.
    const empty_object_is_valid = validate("{}");
    testing.expect(empty_object_is_valid);
}
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;
const StringHashMap = std.StringHashMap;
/// A parsed document together with the arena that holds the memory of all its values.
pub const ValueTree = struct {
    arena: ArenaAllocator,
    root: Value,

    /// Release the arena; `root` must not be used afterwards.
    pub fn deinit(tree: *ValueTree) void {
        tree.arena.deinit();
    }
};
/// Container used for `Value.Object`: a hash map from member name to member value.
pub const ObjectMap = StringHashMap(Value);
/// Container used for `Value.Array`: a growable list of values.
pub const Array = ArrayList(Value);
/// A single JSON value of any kind.
/// Numbers are limited to what fits into an i64 or an f64.
pub const Value = union(enum) {
    Null,
    Bool: bool,
    Integer: i64,
    Float: f64,
    String: []const u8,
    Array: Array,
    Object: ObjectMap,

    /// Write the value to stderr in compact form. Write errors are ignored.
    pub fn dump(self: Value) void {
        var stderr_lock = std.debug.getStderrMutex().acquire();
        defer stderr_lock.release();

        const out = std.debug.getStderrStream();
        self.dumpStream(out, 1024) catch return;
    }

    /// Write the value to stderr using `indent` spaces per nesting level; an indent of zero
    /// gives the compact form of `dump`. Write errors are ignored.
    pub fn dumpIndent(self: Value, comptime indent: usize) void {
        if (indent == 0) {
            self.dump();
            return;
        }

        var stderr_lock = std.debug.getStderrMutex().acquire();
        defer stderr_lock.release();

        const out = std.debug.getStderrStream();
        self.dumpStreamIndent(indent, out, 1024) catch return;
    }

    /// Write the value to `stream` without any newline, indentation or spacing.
    pub fn dumpStream(self: @This(), stream: var, comptime max_depth: usize) !void {
        const Writer = std.json.WriteStream(@TypeOf(stream).Child, max_depth);
        var writer = Writer.init(stream);
        writer.space = "";
        writer.one_indent = "";
        writer.newline = "";
        try writer.emitJson(self);
    }

    /// Write the value to `stream` using `indent` spaces per nesting level.
    pub fn dumpStreamIndent(self: @This(), comptime indent: usize, stream: var, comptime max_depth: usize) !void {
        const Writer = std.json.WriteStream(@TypeOf(stream).Child, max_depth);
        const indent_text = " " ** indent;
        var writer = Writer.init(stream);
        writer.one_indent = indent_text;
        try writer.emitJson(self);
    }
};
/// A non-stream JSON parser which constructs a tree of Value's.
pub const Parser = struct {
// Backs the parser's own `stack` and is the parent of the arena created by `parse`.
allocator: *Allocator,
// What kind of token is expected next.
state: State,
// Stored by `init`; NOTE(review): not read by any code visible in this file.
copy_strings: bool,
// Stores parent nodes and un-combined Values.
stack: Array,
const State = enum {
// Inside an object, expecting a member name or the end of the object.
ObjectKey,
// Inside an object, expecting the value that belongs to the name on top of the stack.
ObjectValue,
// Inside an array, expecting an element or the end of the array.
ArrayValue,
// Not inside any container.
Simple,
};
/// Create a parser whose working stack is allocated from `allocator`.
pub fn init(allocator: *Allocator, copy_strings: bool) Parser {
    const empty_stack = Array.init(allocator);
    return Parser{
        .allocator = allocator,
        .state = .Simple,
        .copy_strings = copy_strings,
        .stack = empty_stack,
    };
}
/// Free the parser's working stack. Trees returned by `parse` own a separate arena
/// and are released with `ValueTree.deinit`.
pub fn deinit(p: *Parser) void {
p.stack.deinit();
}
/// Forget everything about the previous document so that `parse` can be called again.
pub fn reset(p: *Parser) void {
    p.stack.shrink(0);
    p.state = .Simple;
}
/// Parse `input` into a tree of Values. All values are allocated from an arena that the
/// returned tree owns; it is released by `ValueTree.deinit`, or here if parsing fails.
pub fn parse(p: *Parser, input: []const u8) !ValueTree {
    var tokens = TokenStream.init(input);

    var arena = ArenaAllocator.init(p.allocator);
    errdefer arena.deinit();

    while (try tokens.next()) |token| {
        // The stream index has already moved one past the byte that ended the token.
        const token_end = tokens.i - 1;
        try p.transition(&arena.allocator, input, token_end, token);
    }

    // Only the finished root value is left on the stack.
    debug.assert(p.stack.len == 1);
    const root = p.stack.at(0);

    return ValueTree{
        .arena = arena,
        .root = root,
    };
}
// Consume one token and update the value stack accordingly. `i` is the index in `input`
// just past the token, as expected by the tokens' `slice` functions.
//
// Even though p.allocator exists, we take an explicit allocator so that allocation state
// can be cleaned up on error correctly during a `parse` on call.
//
// Tokens that make no sense in the current state are reported as errors rather than being
// treated as unreachable, because the token sequence is derived from untrusted input.
fn transition(p: *Parser, allocator: *Allocator, input: []const u8, i: usize, token: Token) !void {
    switch (p.state) {
        .ObjectKey => switch (token) {
            .ObjectEnd => {
                // The root object stays on the stack as the final result.
                if (p.stack.len == 1) {
                    return;
                }

                var value = p.stack.pop();
                try p.pushToParent(&value);
            },
            .String => |s| {
                try p.stack.append(try p.parseString(allocator, s, input, i));
                p.state = .ObjectValue;
            },
            else => {
                // The streaming parser would return an error eventually.
                // To prevent invalid state we return an error now.
                // TODO make the streaming parser return an error as soon as it encounters an invalid object key
                return error.InvalidLiteral;
            },
        },
        .ObjectValue => {
            // Stack layout here is [ ..., object, key ].
            var object = &p.stack.items[p.stack.len - 2].Object;
            var key = p.stack.items[p.stack.len - 1].String;

            switch (token) {
                .ObjectBegin => {
                    try p.stack.append(Value{ .Object = ObjectMap.init(allocator) });
                    p.state = .ObjectKey;
                },
                .ArrayBegin => {
                    try p.stack.append(Value{ .Array = Array.init(allocator) });
                    p.state = .ArrayValue;
                },
                .String => |s| {
                    _ = try object.put(key, try p.parseString(allocator, s, input, i));
                    _ = p.stack.pop();
                    p.state = .ObjectKey;
                },
                .Number => |n| {
                    _ = try object.put(key, try p.parseNumber(n, input, i));
                    _ = p.stack.pop();
                    p.state = .ObjectKey;
                },
                .True => {
                    _ = try object.put(key, Value{ .Bool = true });
                    _ = p.stack.pop();
                    p.state = .ObjectKey;
                },
                .False => {
                    _ = try object.put(key, Value{ .Bool = false });
                    _ = p.stack.pop();
                    p.state = .ObjectKey;
                },
                .Null => {
                    _ = try object.put(key, Value.Null);
                    _ = p.stack.pop();
                    p.state = .ObjectKey;
                },
                .ObjectEnd, .ArrayEnd => {
                    // A member name without a value, as in {"a":}.
                    return error.InvalidValueBegin;
                },
            }
        },
        .ArrayValue => {
            var array = &p.stack.items[p.stack.len - 1].Array;

            switch (token) {
                .ArrayEnd => {
                    // The root array stays on the stack as the final result.
                    if (p.stack.len == 1) {
                        return;
                    }

                    var value = p.stack.pop();
                    try p.pushToParent(&value);
                },
                .ObjectBegin => {
                    try p.stack.append(Value{ .Object = ObjectMap.init(allocator) });
                    p.state = .ObjectKey;
                },
                .ArrayBegin => {
                    try p.stack.append(Value{ .Array = Array.init(allocator) });
                    p.state = .ArrayValue;
                },
                .String => |s| {
                    try array.append(try p.parseString(allocator, s, input, i));
                },
                .Number => |n| {
                    try array.append(try p.parseNumber(n, input, i));
                },
                .True => {
                    try array.append(Value{ .Bool = true });
                },
                .False => {
                    try array.append(Value{ .Bool = false });
                },
                .Null => {
                    try array.append(Value.Null);
                },
                .ObjectEnd => {
                    // An array closed by a brace, as in [1}.
                    return error.UnexpectedClosingBrace;
                },
            }
        },
        .Simple => switch (token) {
            .ObjectBegin => {
                try p.stack.append(Value{ .Object = ObjectMap.init(allocator) });
                p.state = .ObjectKey;
            },
            .ArrayBegin => {
                try p.stack.append(Value{ .Array = Array.init(allocator) });
                p.state = .ArrayValue;
            },
            .String => |s| {
                try p.stack.append(try p.parseString(allocator, s, input, i));
            },
            .Number => |n| {
                try p.stack.append(try p.parseNumber(n, input, i));
            },
            .True => {
                try p.stack.append(Value{ .Bool = true });
            },
            .False => {
                try p.stack.append(Value{ .Bool = false });
            },
            .Null => {
                try p.stack.append(Value.Null);
            },
            .ObjectEnd, .ArrayEnd => {
                // The tokenizer never emits a closer before any container was opened.
                unreachable;
            },
        },
    }
}
// Attach a finished container `value` to whatever is now on top of the stack: either the
// member name it belongs to, or the array it is an element of.
fn pushToParent(p: *Parser, value: *const Value) !void {
    const top_index = p.stack.len - 1;
    switch (p.stack.toSlice()[top_index]) {
        // Array Parent -> [ ..., <array>, value ]
        Value.Array => |*parent_array| {
            try parent_array.append(value.*);
            p.state = .ArrayValue;
        },
        // Object Parent -> [ ..., object, <key>, value ]
        Value.String => |member_name| {
            _ = p.stack.pop();

            var parent_object = &p.stack.items[p.stack.len - 1].Object;
            _ = try parent_object.put(member_name, value.*);
            p.state = .ObjectKey;
        },
        else => unreachable,
    }
}
// Turn a string token into a Value whose bytes are allocated from `allocator`.
// `i` is the index in `input` just past the string contents.
fn parseString(p: *Parser, allocator: *Allocator, s: std.meta.TagPayloadType(Token, Token.String), input: []const u8, i: usize) !Value {
    const raw = s.slice(input, i);

    switch (s.escapes) {
        .None => {
            // TODO: We don't strictly have to copy values which do not contain any escape
            // characters if flagged with the option.
            const copy = try mem.dupe(allocator, u8, raw);
            return Value{ .String = copy };
        },
        .Some => {
            // The tokenizer already worked out how long the unescaped text is.
            const decoded = try allocator.alloc(u8, s.decodedLength());
            errdefer allocator.free(decoded);

            try unescapeString(decoded, raw);
            return Value{ .String = decoded };
        },
    }
}
// Turn a number token into an Integer value when the tokenizer flagged it as an integer,
// and into a Float value otherwise. `i` is the index in `input` just past the number.
fn parseNumber(p: *Parser, n: std.meta.TagPayloadType(Token, Token.Number), input: []const u8, i: usize) !Value {
    if (n.is_integer) {
        return Value{ .Integer = try std.fmt.parseInt(i64, n.slice(input, i), 10) };
    }
    return Value{ .Float = try std.fmt.parseFloat(f64, n.slice(input, i)) };
}
};
// Decode the escape sequences of the JSON string contents `input` into `output`, which
// must be exactly as long as the decoded text.
// Only to be used on strings already validated by the parser
// (note the unreachable statements and lack of bounds checking)
fn unescapeString(output: []u8, input: []const u8) !void {
    var in_index: usize = 0;
    var out_index: usize = 0;

    while (in_index < input.len) {
        const byte = input[in_index];
        if (byte != '\\') {
            // not an escape sequence
            output[out_index] = byte;
            in_index += 1;
            out_index += 1;
            continue;
        }

        const escape = input[in_index + 1];
        if (escape != 'u') {
            // a simple escape sequence
            output[out_index] = @as(u8, switch (escape) {
                '"' => '"',
                '\\' => '\\',
                '/' => '/',
                'b' => 8,
                'f' => 12,
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                else => unreachable,
            });
            in_index += 2;
            out_index += 1;
            continue;
        }

        // a unicode escape sequence
        const first_unit = std.fmt.parseInt(u16, input[in_index + 2 .. in_index + 6], 16) catch unreachable;

        // guess optimistically that it's not a surrogate pair
        if (std.unicode.utf8Encode(first_unit, output[out_index..])) |written| {
            out_index += written;
            in_index += 6;
            continue;
        } else |err| {
            // it might be a surrogate pair
            if (err != error.Utf8CannotEncodeSurrogateHalf) {
                return error.InvalidUnicodeHexSymbol;
            }
        }

        // check if a second code unit is present
        const has_second_escape = in_index + 7 < input.len and input[in_index + 6] == '\\' and input[in_index + 7] == 'u';
        if (!has_second_escape) {
            return error.InvalidUnicodeHexSymbol;
        }

        const second_unit = std.fmt.parseInt(u16, input[in_index + 8 .. in_index + 12], 16) catch unreachable;
        const pair = [2]u16{ first_unit, second_unit };
        if (std.unicode.utf16leToUtf8(output[out_index..], &pair)) |written| {
            out_index += written;
            in_index += 12;
        } else |_| {
            return error.InvalidUnicodeHexSymbol;
        }
    }

    assert(out_index == output.len);
}
// Parses a nested document and spot-checks one value of each kind it contains:
// integers, a string, a bool, an array of objects and a float.
test "json.parser.dynamic" {
    const source =
        \\{
        \\ "Image": {
        \\ "Width": 800,
        \\ "Height": 600,
        \\ "Title": "View from 15th Floor",
        \\ "Thumbnail": {
        \\ "Url": "http://www.example.com/image/481989943",
        \\ "Height": 125,
        \\ "Width": 100
        \\ },
        \\ "Animated" : false,
        \\ "IDs": [116, 943, 234, 38793],
        \\ "ArrayOfObject": [{"n": "m"}],
        \\ "double": 1.3412
        \\ }
        \\}
    ;

    var parser = Parser.init(debug.global_allocator, false);
    defer parser.deinit();

    var tree = try parser.parse(source);
    defer tree.deinit();

    const image = tree.root.Object.get("Image").?.value;

    // integer members
    const image_width = image.Object.get("Width").?.value;
    testing.expect(image_width.Integer == 800);
    const image_height = image.Object.get("Height").?.value;
    testing.expect(image_height.Integer == 600);

    // string member
    const image_title = image.Object.get("Title").?.value;
    testing.expect(mem.eql(u8, image_title.String, "View from 15th Floor"));

    // boolean member
    const is_animated = image.Object.get("Animated").?.value;
    testing.expect(is_animated.Bool == false);

    // array holding a single object
    const objects = image.Object.get("ArrayOfObject").?.value;
    testing.expect(objects.Array.len == 1);
    const first_n = objects.Array.at(0).Object.get("n").?.value;
    testing.expect(mem.eql(u8, first_n.String, "m"));

    // float member
    const double_value = image.Object.get("double").?.value;
    testing.expect(double_value.Float == 1.3412);
}
// References the sibling JSON source files from inside a test block; presumably
// this is what pulls their own `test` declarations into this file's test run
// (NOTE(review): confirm against the build's test setup).
test "import more json tests" {
_ = @import("json/test.zig");
_ = @import("json/write_stream.zig");
}
// Round trip: emit a small object with `WriteStream` into a fixed buffer, then
// parse the written bytes back and check every member.
test "write json then parse it" {
    var out_buffer: [1000]u8 = undefined;

    var slice_out_stream = std.io.SliceOutStream.init(&out_buffer);
    const out_stream = &slice_out_stream.stream;
    var jw = WriteStream(@TypeOf(out_stream).Child, 4).init(out_stream);

    try jw.beginObject();

    try jw.objectField("f");
    try jw.emitBool(false);

    try jw.objectField("t");
    try jw.emitBool(true);

    try jw.objectField("int");
    try jw.emitNumber(@as(i32, 1234));

    try jw.objectField("array");
    try jw.beginArray();

    try jw.arrayElem();
    try jw.emitNull();

    try jw.arrayElem();
    try jw.emitNumber(@as(f64, 12.34));

    try jw.endArray();

    try jw.objectField("str");
    try jw.emitString("hello");

    try jw.endObject();

    var mem_buffer: [1024 * 20]u8 = undefined;
    // Keep the allocator in a named variable so the pointer handed to the
    // parser refers to storage that lives for the whole test.
    var fixed_buffer_allocator = std.heap.FixedBufferAllocator.init(&mem_buffer);
    var parser = Parser.init(&fixed_buffer_allocator.allocator, false);
    defer parser.deinit();
    var tree = try parser.parse(slice_out_stream.getWritten());
    defer tree.deinit();

    testing.expect(tree.root.Object.get("f").?.value.Bool == false);
    testing.expect(tree.root.Object.get("t").?.value.Bool == true);
    testing.expect(tree.root.Object.get("int").?.value.Integer == 1234);
    testing.expect(tree.root.Object.get("array").?.value.Array.at(0).Null == {});
    testing.expect(tree.root.Object.get("array").?.value.Array.at(1).Float == 12.34);
    testing.expect(mem.eql(u8, tree.root.Object.get("str").?.value.String, "hello"));
}
// Test helper: parse `json_str` with a non-copying parser backed by
// `debug.global_allocator` and return the root value of the resulting tree.
//
// Only `.root` is returned, so the tree itself is never deinitialized by this
// helper; the parser, which is not needed once parsing is done, is released
// before returning.
fn test_parse(json_str: []const u8) !Value {
    var p = Parser.init(debug.global_allocator, false);
    defer p.deinit();
    return (try p.parse(json_str)).root;
}
// An empty document must be reported as error.UnexpectedEndOfJson.
test "parsing empty string gives appropriate error" {
    const result = test_parse("");
    testing.expectError(error.UnexpectedEndOfJson, result);
}
// An integer that is parsed after a float in the same document must still be
// tagged `.Integer`.
test "integer after float has proper type" {
    const root = try test_parse(
        \\{
        \\ "float": 3.14,
        \\ "ints": [1, 2, 3]
        \\}
    );
    const first_int = root.Object.getValue("ints").?.Array.at(0);
    testing.expect(first_int == .Integer);
}
// Every simple escape, a `\uXXXX` escape and a surrogate pair must decode to
// the expected UTF-8 bytes.
test "escaped characters" {
    const input =
        \\{
        \\ "backslash": "\\",
        \\ "forwardslash": "\/",
        \\ "newline": "\n",
        \\ "carriagereturn": "\r",
        \\ "tab": "\t",
        \\ "formfeed": "\f",
        \\ "backspace": "\b",
        \\ "doublequote": "\"",
        \\ "unicode": "\u0105",
        \\ "surrogatepair": "\ud83d\ude02"
        \\}
    ;

    var p = Parser.init(debug.global_allocator, false);
    defer p.deinit();
    var tree = try p.parse(input);
    defer tree.deinit();

    const obj = tree.root.Object;

    // expectEqualSlices takes the expected slice first, then the actual one.
    testing.expectEqualSlices(u8, "\\", obj.get("backslash").?.value.String);
    testing.expectEqualSlices(u8, "/", obj.get("forwardslash").?.value.String);
    testing.expectEqualSlices(u8, "\n", obj.get("newline").?.value.String);
    testing.expectEqualSlices(u8, "\r", obj.get("carriagereturn").?.value.String);
    testing.expectEqualSlices(u8, "\t", obj.get("tab").?.value.String);
    testing.expectEqualSlices(u8, "\x0C", obj.get("formfeed").?.value.String);
    testing.expectEqualSlices(u8, "\x08", obj.get("backspace").?.value.String);
    testing.expectEqualSlices(u8, "\"", obj.get("doublequote").?.value.String);
    testing.expectEqualSlices(u8, "ą", obj.get("unicode").?.value.String);
    testing.expectEqualSlices(u8, "😂", obj.get("surrogatepair").?.value.String);
}