zig/lib/std/json.zig

1693 lines
56 KiB
Zig

// JSON parser conforming to RFC8259.
//
// https://tools.ietf.org/html/rfc8259
const std = @import("std.zig");
const debug = std.debug;
const assert = debug.assert;
const testing = std.testing;
const mem = std.mem;
const maxInt = std.math.maxInt;
pub const WriteStream = @import("json/write_stream.zig").WriteStream;
// Records whether a string token contained escape sequences.
// `.None` means the raw input bytes are the decoded string. `.Some` carries
// `size_diff`, a signed byte count that `Token.String.decodedLength` adds to
// the raw token length.
const StringEscapes = union(enum) {
None,
Some: struct {
// Decoded length minus raw length, in bytes.
size_diff: isize,
},
};
/// One token produced by the tokenizer. Tokens do not own any bytes; the
/// `String` and `Number` variants only record a length.
///
/// Call `slice(input, i)` on those variants, where `i` is the index just past
/// the token's last byte in `input`, to obtain the token's text.
pub const Token = union(enum) {
    ObjectBegin,
    ObjectEnd,
    ArrayBegin,
    ArrayEnd,
    String: struct {
        /// Length of the raw (still escaped) string contents in bytes.
        count: usize,

        /// Whether string contains an escape sequence and cannot be zero-copied
        escapes: StringEscapes,

        /// Number of bytes the string occupies once escapes are decoded.
        /// `size_diff` is reinterpreted as `usize` and added with wrapping
        /// arithmetic, so a negative difference shrinks `count`.
        pub fn decodedLength(self: @This()) usize {
            const adjustment: usize = switch (self.escapes) {
                .None => 0,
                .Some => |s| @bitCast(usize, s.size_diff),
            };
            return self.count +% adjustment;
        }

        /// The `count` bytes of `input` that end at index `i`.
        pub fn slice(self: @This(), input: []const u8, i: usize) []const u8 {
            const start = i - self.count;
            return input[start..i];
        }
    },
    Number: struct {
        /// Length of the number's text in bytes.
        count: usize,

        /// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
        is_integer: bool,

        /// The `count` bytes of `input` that end at index `i`.
        pub fn slice(self: @This(), input: []const u8, i: usize) []const u8 {
            const start = i - self.count;
            return input[start..i];
        }
    },
    True,
    False,
    Null,
};
/// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
/// they are encountered. No copies or allocations are performed during parsing; the whole
/// parsing state lives in this fixed-size struct.
///
/// Follows the grammar of RFC8259. Structural rules enforced while streaming:
///  - a closing `}` / `]` must match the innermost open container,
///  - an object member name must be a string,
///  - `:` must be followed by a value (not by a closing bracket).
///
/// For a non-byte based wrapper, consider using TokenStream instead.
pub const StreamingParser = struct {
    // Current state
    state: State,
    // How many bytes we have counted for the current token
    count: usize,
    // What state to follow after parsing a string (either property or value string).
    // While a value is about to begin, `.ObjectSeparator` here also means "an object
    // member name is expected next".
    after_string_state: State,
    // What state to follow after parsing a value (either top-level or value end)
    after_value_state: State,
    // If we stopped now, would the complete parsed string to now be a valid json string.
    // Recomputed by `feed` after every successfully consumed byte.
    complete: bool,
    // Current token flags to pass through to the next generated, see Token.
    string_escapes: StringEscapes,
    // When in .String states, was the previous character a high surrogate?
    string_last_was_high_surrogate: bool,
    // Used inside of StringEscapeHexUnicode* states
    string_unicode_codepoint: u21,
    // The first byte needs to be stored to validate 3- and 4-byte sequences.
    sequence_first_byte: u8 = undefined,
    // When in .Number states, is the number a (still) valid integer?
    number_is_integer: bool,
    // Bit-stack for nested object/array literals (max 255 nestings).
    // The lowest bit describes the innermost open container.
    stack: u256,
    stack_used: u8,

    const object_bit = 0;
    const array_bit = 1;
    const max_stack_size = maxInt(u8);

    /// Create a parser positioned at the start of a document.
    pub fn init() StreamingParser {
        var p: StreamingParser = undefined;
        p.reset();
        return p;
    }

    /// Return the parser to the start-of-document state so it can be reused.
    pub fn reset(p: *StreamingParser) void {
        p.state = .TopLevelBegin;
        p.count = 0;
        // Set before ever read in main transition function
        p.after_string_state = undefined;
        p.after_value_state = .ValueEnd; // handle end of values normally
        p.stack = 0;
        p.stack_used = 0;
        p.complete = false;
        p.string_escapes = undefined;
        p.string_last_was_high_surrogate = undefined;
        p.string_unicode_codepoint = undefined;
        p.sequence_first_byte = undefined;
        p.number_is_integer = undefined;
    }

    pub const State = enum {
        // These must be first with these explicit values as we rely on them for indexing the
        // bit-stack directly and avoiding a branch.
        ObjectSeparator = 0,
        ValueEnd = 1,
        TopLevelBegin,
        TopLevelEnd,
        ValueBegin,
        ValueBeginNoClosing,
        String,
        StringUtf8Byte2Of2,
        StringUtf8Byte2Of3,
        StringUtf8Byte3Of3,
        StringUtf8Byte2Of4,
        StringUtf8Byte3Of4,
        StringUtf8Byte4Of4,
        StringEscapeCharacter,
        StringEscapeHexUnicode4,
        StringEscapeHexUnicode3,
        StringEscapeHexUnicode2,
        StringEscapeHexUnicode1,
        Number,
        NumberMaybeDotOrExponent,
        NumberMaybeDigitOrDotOrExponent,
        NumberFractionalRequired,
        NumberFractional,
        NumberMaybeExponent,
        NumberExponent,
        NumberExponentDigitsRequired,
        NumberExponentDigits,
        TrueLiteral1,
        TrueLiteral2,
        TrueLiteral3,
        FalseLiteral1,
        FalseLiteral2,
        FalseLiteral3,
        FalseLiteral4,
        NullLiteral1,
        NullLiteral2,
        NullLiteral3,

        // Only call this function to generate array/object final state.
        // `x` must be 0 (object -> ObjectSeparator) or 1 (array -> ValueEnd).
        pub fn fromInt(x: var) State {
            debug.assert(x == 0 or x == 1);
            const T = @TagType(State);
            return @intToEnum(State, @intCast(T, x));
        }
    };

    pub const Error = error{
        InvalidTopLevel,
        TooManyNestedItems,
        TooManyClosingItems,
        InvalidValueBegin,
        InvalidValueEnd,
        UnbalancedBrackets,
        UnbalancedBraces,
        UnexpectedClosingBracket,
        UnexpectedClosingBrace,
        InvalidNumber,
        InvalidSeparator,
        InvalidLiteral,
        InvalidEscapeCharacter,
        InvalidUnicodeHexSymbol,
        InvalidUtf8Byte,
        InvalidTopLevelTrailing,
        InvalidControlCharacter,
    };

    /// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
    /// tokens. token2 is always null if token1 is null.
    ///
    /// After a successful call `complete` tells whether the bytes fed so far form a whole
    /// JSON document. There is currently no error recovery on a bad stream: once an error
    /// is returned the parser must be `reset` before further use.
    pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
        token1.* = null;
        token2.* = null;
        p.count += 1;
        // unlikely: a number is only terminated by the byte that follows it, so that byte
        // has to be run through the state machine a second time.
        if (try p.transition(c, token1)) {
            _ = try p.transition(c, token2);
        }
        p.complete = p.documentMayEndHere();
    }

    // Whether the input consumed so far is a complete document: either the top-level value
    // has been closed, or we are inside a top-level number at a point where it may end.
    fn documentMayEndHere(p: *const StreamingParser) bool {
        return switch (p.state) {
            .TopLevelEnd => true,
            .NumberMaybeDotOrExponent,
            .NumberMaybeDigitOrDotOrExponent,
            .NumberFractional,
            .NumberMaybeExponent,
            .NumberExponentDigits,
            => p.after_value_state == .TopLevelEnd,
            else => false,
        };
    }

    // Open an object (`object_bit`) or array (`array_bit`): push it on the bit-stack and
    // emit the matching begin token.
    fn beginContainer(p: *StreamingParser, bit: u256, token: *?Token) Error!void {
        if (p.stack_used == max_stack_size) {
            return error.TooManyNestedItems;
        }
        p.stack <<= 1;
        p.stack |= bit;
        p.stack_used += 1;
        p.state = .ValueBegin;
        if (bit == object_bit) {
            // The first thing inside an object is a member name.
            p.after_string_state = .ObjectSeparator;
            token.* = Token.ObjectBegin;
        } else {
            p.after_string_state = .ValueEnd;
            token.* = Token.ArrayBegin;
        }
    }

    // Close the innermost container and emit its end token. The caller has already
    // verified that a container is open and that it matches the closing byte.
    fn endContainer(p: *StreamingParser, token: *?Token) void {
        const closing_object = (p.stack & 1) == object_bit;
        p.after_string_state = State.fromInt(p.stack & 1);
        p.stack >>= 1;
        p.stack_used -= 1;
        p.state = if (p.stack_used == 0) State.TopLevelEnd else State.ValueEnd;
        if (closing_object) {
            token.* = Token.ObjectEnd;
        } else {
            token.* = Token.ArrayEnd;
        }
    }

    // Enter the first state of a number, string or literal if `c` can start one.
    // Returns false (leaving the parser untouched) otherwise.
    fn beginScalar(p: *StreamingParser, c: u8) bool {
        switch (c) {
            '-' => {
                p.number_is_integer = true;
                p.state = .Number;
            },
            '0' => {
                p.number_is_integer = true;
                p.state = .NumberMaybeDotOrExponent;
            },
            '1'...'9' => {
                p.number_is_integer = true;
                p.state = .NumberMaybeDigitOrDotOrExponent;
            },
            '"' => {
                p.state = .String;
                p.string_escapes = .None;
                p.string_last_was_high_surrogate = false;
            },
            't' => p.state = .TrueLiteral1,
            'f' => p.state = .FalseLiteral1,
            'n' => p.state = .NullLiteral1,
            else => return false,
        }
        // `feed` already counted this byte; the token length starts after it.
        p.count = 0;
        return true;
    }

    // Handle a byte at a position where a nested value (or an object member name) must
    // start. Closing brackets are not handled here.
    fn beginValue(p: *StreamingParser, c: u8, token: *?Token) Error!void {
        switch (c) {
            0x09, 0x0A, 0x0D, 0x20 => {
                // whitespace
            },
            '"' => {
                _ = p.beginScalar(c);
            },
            else => {
                // `.ObjectSeparator` as the after-string state means we are directly inside
                // an object and have not seen `:` yet, so only a string (the member name)
                // is acceptable here.
                if (p.after_string_state == .ObjectSeparator) {
                    return error.InvalidValueBegin;
                }
                switch (c) {
                    '{' => try p.beginContainer(object_bit, token),
                    '[' => try p.beginContainer(array_bit, token),
                    else => {
                        if (!p.beginScalar(c)) {
                            return error.InvalidValueBegin;
                        }
                    },
                }
            },
        }
    }

    // Finish the number being scanned and emit its token. The terminating byte is not
    // part of the number; the caller asks for it to be processed again.
    fn endNumber(p: *StreamingParser, token: *?Token) void {
        p.state = p.after_value_state;
        token.* = .{
            .Number = .{
                .count = p.count,
                .is_integer = p.number_is_integer,
            },
        };
    }

    // Numeric value of one hexadecimal digit of a `\uXXXX` escape.
    fn hexDigit(c: u8) Error!u21 {
        switch (c) {
            '0'...'9' => return @as(u21, c - '0'),
            'A'...'F' => return @as(u21, c - 'A' + 10),
            'a'...'f' => return @as(u21, c - 'a' + 10),
            else => return error.InvalidUnicodeHexSymbol,
        }
    }

    // Perform a single transition on the state machine and return any possible token.
    // Returns true when `c` terminated a number and must be fed through once more.
    fn transition(p: *StreamingParser, c: u8, token: *?Token) Error!bool {
        switch (p.state) {
            .TopLevelBegin => switch (c) {
                '{' => try p.beginContainer(object_bit, token),
                '[' => try p.beginContainer(array_bit, token),
                0x09, 0x0A, 0x0D, 0x20 => {
                    // whitespace
                },
                else => {
                    if (!p.beginScalar(c)) {
                        return error.InvalidTopLevel;
                    }
                    p.after_value_state = .TopLevelEnd;
                    // We don't actually need the following since after_value_state should override.
                    p.after_string_state = .ValueEnd;
                },
            },

            .TopLevelEnd => switch (c) {
                0x09, 0x0A, 0x0D, 0x20 => {
                    // whitespace
                },
                else => {
                    return error.InvalidTopLevelTrailing;
                },
            },

            // Directly after `{` or `[`: the container may be closed immediately.
            .ValueBegin => switch (c) {
                '}' => {
                    if (p.stack_used == 0) {
                        return error.TooManyClosingItems;
                    }
                    if ((p.stack & 1) != object_bit) {
                        return error.UnexpectedClosingBrace;
                    }
                    p.endContainer(token);
                },
                ']' => {
                    if (p.stack_used == 0) {
                        return error.TooManyClosingItems;
                    }
                    if ((p.stack & 1) != array_bit) {
                        return error.UnexpectedClosingBracket;
                    }
                    p.endContainer(token);
                },
                else => try p.beginValue(c, token),
            },

            // After `,` or `:`: something must follow, closing is not allowed.
            .ValueBeginNoClosing => try p.beginValue(c, token),

            .ValueEnd => switch (c) {
                ',' => {
                    p.after_string_state = State.fromInt(p.stack & 1);
                    p.state = .ValueBeginNoClosing;
                },
                ']' => {
                    if (p.stack_used == 0) {
                        return error.UnbalancedBrackets;
                    }
                    // The closer has to match the container that is actually open.
                    if ((p.stack & 1) != array_bit) {
                        return error.UnexpectedClosingBracket;
                    }
                    p.endContainer(token);
                },
                '}' => {
                    if (p.stack_used == 0) {
                        return error.UnbalancedBraces;
                    }
                    if ((p.stack & 1) != object_bit) {
                        return error.UnexpectedClosingBrace;
                    }
                    p.endContainer(token);
                },
                0x09, 0x0A, 0x0D, 0x20 => {
                    // whitespace
                },
                else => {
                    return error.InvalidValueEnd;
                },
            },

            .ObjectSeparator => switch (c) {
                ':' => {
                    // A member value is mandatory, so `}` / `]` may not follow directly.
                    p.state = .ValueBeginNoClosing;
                    p.after_string_state = .ValueEnd;
                },
                0x09, 0x0A, 0x0D, 0x20 => {
                    // whitespace
                },
                else => {
                    return error.InvalidSeparator;
                },
            },

            .String => switch (c) {
                0x00...0x1F => {
                    return error.InvalidControlCharacter;
                },
                '"' => {
                    p.state = p.after_string_state;
                    if (p.after_value_state == .TopLevelEnd) {
                        p.state = .TopLevelEnd;
                    }
                    token.* = .{
                        .String = .{
                            // do not count the closing quote
                            .count = p.count - 1,
                            .escapes = p.string_escapes,
                        },
                    };
                    p.string_escapes = undefined;
                    p.string_last_was_high_surrogate = undefined;
                },
                '\\' => {
                    p.state = .StringEscapeCharacter;
                    switch (p.string_escapes) {
                        .None => {
                            p.string_escapes = .{ .Some = .{ .size_diff = 0 } };
                        },
                        .Some => {},
                    }
                },
                0x20, 0x21, 0x23...0x5B, 0x5D...0x7F => {
                    // non-control ascii
                    p.string_last_was_high_surrogate = false;
                },
                // Multi-byte UTF-8 lead bytes. A literal character between two \u escapes
                // means those escapes cannot form a surrogate pair.
                0xC2...0xDF => {
                    p.state = .StringUtf8Byte2Of2;
                    p.string_last_was_high_surrogate = false;
                },
                0xE0...0xEF => {
                    p.state = .StringUtf8Byte2Of3;
                    p.sequence_first_byte = c;
                    p.string_last_was_high_surrogate = false;
                },
                0xF0...0xF4 => {
                    p.state = .StringUtf8Byte2Of4;
                    p.sequence_first_byte = c;
                    p.string_last_was_high_surrogate = false;
                },
                else => {
                    return error.InvalidUtf8Byte;
                },
            },

            .StringUtf8Byte2Of2 => switch (c >> 6) {
                0b10 => p.state = .String,
                else => return error.InvalidUtf8Byte,
            },
            .StringUtf8Byte2Of3 => {
                // The valid range of the second byte depends on the lead byte
                // (0xE0 must not start an overlong encoding).
                switch (p.sequence_first_byte) {
                    0xE0 => switch (c) {
                        0xA0...0xBF => {},
                        else => return error.InvalidUtf8Byte,
                    },
                    0xE1...0xEF => switch (c) {
                        0x80...0xBF => {},
                        else => return error.InvalidUtf8Byte,
                    },
                    else => return error.InvalidUtf8Byte,
                }
                p.state = .StringUtf8Byte3Of3;
            },
            .StringUtf8Byte3Of3 => switch (c) {
                0x80...0xBF => p.state = .String,
                else => return error.InvalidUtf8Byte,
            },
            .StringUtf8Byte2Of4 => {
                // 0xF0 must not be overlong, 0xF4 must stay within U+10FFFF.
                switch (p.sequence_first_byte) {
                    0xF0 => switch (c) {
                        0x90...0xBF => {},
                        else => return error.InvalidUtf8Byte,
                    },
                    0xF1...0xF3 => switch (c) {
                        0x80...0xBF => {},
                        else => return error.InvalidUtf8Byte,
                    },
                    0xF4 => switch (c) {
                        0x80...0x8F => {},
                        else => return error.InvalidUtf8Byte,
                    },
                    else => return error.InvalidUtf8Byte,
                }
                p.state = .StringUtf8Byte3Of4;
            },
            .StringUtf8Byte3Of4 => switch (c) {
                0x80...0xBF => p.state = .StringUtf8Byte4Of4,
                else => return error.InvalidUtf8Byte,
            },
            .StringUtf8Byte4Of4 => switch (c) {
                0x80...0xBF => p.state = .String,
                else => return error.InvalidUtf8Byte,
            },

            .StringEscapeCharacter => switch (c) {
                // NOTE: '/' is allowed as an escaped character but it also is allowed
                // as unescaped according to the RFC. There is a reported errata which suggests
                // removing the non-escaped variant but it makes more sense to simply disallow
                // it as an escape code here.
                //
                // The current JSONTestSuite tests rely on both of these behaviours being present
                // however, so we default to the status quo where both are accepted until this
                // is further clarified.
                '"', '\\', '/', 'b', 'f', 'n', 'r', 't' => {
                    // two input bytes decode to one output byte
                    p.string_escapes.Some.size_diff -= 1;
                    p.state = .String;
                    p.string_last_was_high_surrogate = false;
                },
                'u' => {
                    p.state = .StringEscapeHexUnicode4;
                },
                else => {
                    return error.InvalidEscapeCharacter;
                },
            },

            // The four hex digits of \uXXXX, most significant first.
            .StringEscapeHexUnicode4 => {
                p.string_unicode_codepoint = (try hexDigit(c)) << 12;
                p.state = .StringEscapeHexUnicode3;
            },
            .StringEscapeHexUnicode3 => {
                p.string_unicode_codepoint |= (try hexDigit(c)) << 8;
                p.state = .StringEscapeHexUnicode2;
            },
            .StringEscapeHexUnicode2 => {
                p.string_unicode_codepoint |= (try hexDigit(c)) << 4;
                p.state = .StringEscapeHexUnicode1;
            },
            .StringEscapeHexUnicode1 => {
                p.string_unicode_codepoint |= try hexDigit(c);
                p.state = .String;
                if (p.string_unicode_codepoint < 0xD800 or p.string_unicode_codepoint >= 0xE000) {
                    // not part of surrogate pair
                    p.string_escapes.Some.size_diff -= @as(isize, 6 - (std.unicode.utf8CodepointSequenceLength(p.string_unicode_codepoint) catch unreachable));
                    p.string_last_was_high_surrogate = false;
                } else if (p.string_unicode_codepoint < 0xDC00) {
                    // 'high' surrogate
                    // takes 3 bytes to encode a half surrogate pair into wtf8
                    p.string_escapes.Some.size_diff -= 6 - 3;
                    p.string_last_was_high_surrogate = true;
                } else {
                    // 'low' surrogate
                    p.string_escapes.Some.size_diff -= 6;
                    if (p.string_last_was_high_surrogate) {
                        // takes 4 bytes to encode a full surrogate pair into utf8
                        // 3 bytes are already reserved by high surrogate
                        p.string_escapes.Some.size_diff -= -1;
                    } else {
                        // takes 3 bytes to encode a half surrogate pair into wtf8
                        p.string_escapes.Some.size_diff -= -3;
                    }
                    p.string_last_was_high_surrogate = false;
                }
                p.string_unicode_codepoint = undefined;
            },

            // After a leading '-': at least one digit is required.
            .Number => switch (c) {
                '0' => {
                    p.state = .NumberMaybeDotOrExponent;
                },
                '1'...'9' => {
                    p.state = .NumberMaybeDigitOrDotOrExponent;
                },
                else => {
                    return error.InvalidNumber;
                },
            },
            // After a leading zero: no further integer digits are allowed.
            .NumberMaybeDotOrExponent => switch (c) {
                '.' => {
                    p.number_is_integer = false;
                    p.state = .NumberFractionalRequired;
                },
                'e', 'E' => {
                    p.number_is_integer = false;
                    p.state = .NumberExponent;
                },
                else => {
                    p.endNumber(token);
                    return true;
                },
            },
            .NumberMaybeDigitOrDotOrExponent => switch (c) {
                '.' => {
                    p.number_is_integer = false;
                    p.state = .NumberFractionalRequired;
                },
                'e', 'E' => {
                    p.number_is_integer = false;
                    p.state = .NumberExponent;
                },
                '0'...'9' => {
                    // another digit
                },
                else => {
                    p.endNumber(token);
                    return true;
                },
            },
            .NumberFractionalRequired => switch (c) {
                '0'...'9' => {
                    p.state = .NumberFractional;
                },
                else => {
                    return error.InvalidNumber;
                },
            },
            .NumberFractional => switch (c) {
                '0'...'9' => {
                    // another digit
                },
                'e', 'E' => {
                    p.number_is_integer = false;
                    p.state = .NumberExponent;
                },
                else => {
                    p.endNumber(token);
                    return true;
                },
            },
            .NumberMaybeExponent => switch (c) {
                'e', 'E' => {
                    p.number_is_integer = false;
                    p.state = .NumberExponent;
                },
                else => {
                    p.endNumber(token);
                    return true;
                },
            },
            .NumberExponent => switch (c) {
                '-', '+' => {
                    p.state = .NumberExponentDigitsRequired;
                },
                '0'...'9' => {
                    p.state = .NumberExponentDigits;
                },
                else => {
                    return error.InvalidNumber;
                },
            },
            .NumberExponentDigitsRequired => switch (c) {
                '0'...'9' => {
                    p.state = .NumberExponentDigits;
                },
                else => {
                    return error.InvalidNumber;
                },
            },
            .NumberExponentDigits => switch (c) {
                '0'...'9' => {
                    // another digit
                },
                else => {
                    p.endNumber(token);
                    return true;
                },
            },

            .TrueLiteral1 => switch (c) {
                'r' => p.state = .TrueLiteral2,
                else => return error.InvalidLiteral,
            },
            .TrueLiteral2 => switch (c) {
                'u' => p.state = .TrueLiteral3,
                else => return error.InvalidLiteral,
            },
            .TrueLiteral3 => switch (c) {
                'e' => {
                    p.state = p.after_value_state;
                    token.* = Token.True;
                },
                else => {
                    return error.InvalidLiteral;
                },
            },

            .FalseLiteral1 => switch (c) {
                'a' => p.state = .FalseLiteral2,
                else => return error.InvalidLiteral,
            },
            .FalseLiteral2 => switch (c) {
                'l' => p.state = .FalseLiteral3,
                else => return error.InvalidLiteral,
            },
            .FalseLiteral3 => switch (c) {
                's' => p.state = .FalseLiteral4,
                else => return error.InvalidLiteral,
            },
            .FalseLiteral4 => switch (c) {
                'e' => {
                    p.state = p.after_value_state;
                    token.* = Token.False;
                },
                else => {
                    return error.InvalidLiteral;
                },
            },

            .NullLiteral1 => switch (c) {
                'u' => p.state = .NullLiteral2,
                else => return error.InvalidLiteral,
            },
            .NullLiteral2 => switch (c) {
                'l' => p.state = .NullLiteral3,
                else => return error.InvalidLiteral,
            },
            .NullLiteral3 => switch (c) {
                'l' => {
                    p.state = p.after_value_state;
                    token.* = Token.Null;
                },
                else => {
                    return error.InvalidLiteral;
                },
            },
        }
        return false;
    }
};
/// Tokenizes a complete in-memory document by driving a StreamingParser over `slice`
/// and handing back one Token per call to `next`.
pub const TokenStream = struct {
    /// Number of bytes fed to the parser so far. This also counts the virtual
    /// trailing space fed once `slice` is exhausted, so it can exceed `slice.len`.
    i: usize,
    slice: []const u8,
    parser: StreamingParser,
    /// Second token produced by a single byte, held back for the following `next` call.
    token: ?Token,

    pub const Error = StreamingParser.Error || error{UnexpectedEndOfJson};

    pub fn init(slice: []const u8) TokenStream {
        return TokenStream{
            .i = 0,
            .slice = slice,
            .parser = StreamingParser.init(),
            .token = null,
        };
    }

    /// Returns the next token, `null` once the whole document has been consumed, or
    /// `error.UnexpectedEndOfJson` when the input stops before the document is complete.
    pub fn next(self: *TokenStream) Error!?Token {
        if (self.token) |pending| {
            // TODO: Audit this pattern once #2915 is closed
            const held_back = pending;
            self.token = null;
            return held_back;
        }

        var first: ?Token = undefined;
        var second: ?Token = undefined;

        while (self.i < self.slice.len) {
            const byte = self.slice[self.i];
            try self.parser.feed(byte, &first, &second);
            self.i += 1;
            if (first) |produced| {
                self.token = second;
                return produced;
            }
        }

        // Without this a bare number fails, the streaming parser doesn't know the input ended
        try self.parser.feed(' ', &first, &second);
        self.i += 1;

        if (first) |produced| {
            return produced;
        }
        if (self.parser.complete) {
            return null;
        }
        return error.UnexpectedEndOfJson;
    }
};
// Test helper: pull one token from `p` and assert that it carries the tag `id`.
// A tokenizer error or the end of the stream is treated as unreachable.
fn checkNext(p: *TokenStream, id: std.meta.TagType(Token)) void {
    const maybe_token = p.next() catch unreachable;
    const actual = std.meta.activeTag(maybe_token.?);
    debug.assert(actual == id);
}
// Tokenizes a small nested document and checks the exact sequence of token kinds.
test "json.token" {
    const s =
        \\{
        \\ "Image": {
        \\ "Width": 800,
        \\ "Height": 600,
        \\ "Title": "View from 15th Floor",
        \\ "Thumbnail": {
        \\ "Url": "http://www.example.com/image/481989943",
        \\ "Height": 125,
        \\ "Width": 100
        \\ },
        \\ "Animated" : false,
        \\ "IDs": [116, 943, 234, 38793]
        \\ }
        \\}
    ;

    // Expected token kinds, in document order.
    const expected = [_]std.meta.TagType(Token){
        .ObjectBegin,
        .String, // Image
        .ObjectBegin,
        .String, // Width
        .Number,
        .String, // Height
        .Number,
        .String, // Title
        .String,
        .String, // Thumbnail
        .ObjectBegin,
        .String, // Url
        .String,
        .String, // Height
        .Number,
        .String, // Width
        .Number,
        .ObjectEnd,
        .String, // Animated
        .False,
        .String, // IDs
        .ArrayBegin,
        .Number,
        .Number,
        .Number,
        .Number,
        .ArrayEnd,
        .ObjectEnd,
        .ObjectEnd,
    };

    var p = TokenStream.init(s);
    for (expected) |id| {
        checkNext(&p, id);
    }

    // Nothing may follow the closing brace.
    testing.expect((try p.next()) == null);
}
/// Check whether `s` is a well-formed JSON document by running it through a
/// StreamingParser. Number precision is not limited, so a decoder may still be
/// unable to decode a string for which this returns true.
pub fn validate(s: []const u8) bool {
    var parser = StreamingParser.init();
    // The tokens themselves are not needed; only parser errors and the final state matter.
    var ignored1: ?Token = undefined;
    var ignored2: ?Token = undefined;
    for (s) |byte| {
        parser.feed(byte, &ignored1, &ignored2) catch return false;
    }
    return parser.complete;
}
// Smoke test: the empty object is a valid document.
test "json.validate" {
testing.expect(validate("{}"));
}
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;
const StringHashMap = std.StringHashMap;
/// Result of `Parser.parse`: the root Value together with the arena that
/// `parse` created for the tree.
pub const ValueTree = struct {
// Arena created by `Parser.parse`; released by `deinit`.
arena: ArenaAllocator,
// Top-level value of the parsed document.
root: Value,
/// Release the arena.
pub fn deinit(self: *ValueTree) void {
self.arena.deinit();
}
};
pub const ObjectMap = StringHashMap(Value);
pub const Array = ArrayList(Value);
/// A JSON value in tree form.
/// Currently only supports numbers that fit into i64 or f64.
pub const Value = union(enum) {
    Null,
    Bool: bool,
    Integer: i64,
    Float: f64,
    String: []const u8,
    Array: Array,
    Object: ObjectMap,

    /// Write the value to stderr as compact JSON while holding the stderr mutex.
    /// Write errors are ignored.
    pub fn dump(self: Value) void {
        var lock = std.debug.getStderrMutex().acquire();
        defer lock.release();

        self.dumpStream(std.debug.getStderrStream(), 1024) catch return;
    }

    /// Write the value to stderr using `indent` spaces per nesting level.
    /// An `indent` of 0 is the same as `dump`. Write errors are ignored.
    pub fn dumpIndent(self: Value, comptime indent: usize) void {
        if (indent != 0) {
            var lock = std.debug.getStderrMutex().acquire();
            defer lock.release();

            self.dumpStreamIndent(indent, std.debug.getStderrStream(), 1024) catch return;
        } else {
            self.dump();
        }
    }

    /// Emit the value to `stream` with newline, indentation and spacing all set to
    /// the empty string. `max_depth` is forwarded to the WriteStream.
    pub fn dumpStream(self: @This(), stream: var, comptime max_depth: usize) !void {
        const Writer = std.json.WriteStream(@TypeOf(stream).Child, max_depth);
        var w = Writer.init(stream);
        w.newline = "";
        w.one_indent = "";
        w.space = "";
        try w.emitJson(self);
    }

    /// Emit the value to `stream` using `indent` spaces as one indentation step.
    /// `max_depth` is forwarded to the WriteStream.
    pub fn dumpStreamIndent(self: @This(), comptime indent: usize, stream: var, comptime max_depth: usize) !void {
        const Writer = std.json.WriteStream(@TypeOf(stream).Child, max_depth);
        var w = Writer.init(stream);
        w.one_indent = " " ** indent;
        try w.emitJson(self);
    }
};
/// A non-stream JSON parser which constructs a tree of Value's.
///
/// `parse` never clears the internal stack itself and asserts that exactly one value is
/// on it when the document ends, so call `reset` before parsing another document with
/// the same Parser (and after a failed parse).
pub const Parser = struct {
    allocator: *Allocator,
    state: State,
    // When true, strings without escapes are duplicated instead of referencing the input.
    copy_strings: bool,
    // Stores parent nodes and un-combined Values.
    stack: Array,

    const State = enum {
        ObjectKey,
        ObjectValue,
        ArrayValue,
        Simple,
    };

    pub fn init(allocator: *Allocator, copy_strings: bool) Parser {
        return Parser{
            .allocator = allocator,
            .state = .Simple,
            .copy_strings = copy_strings,
            .stack = Array.init(allocator),
        };
    }

    pub fn deinit(p: *Parser) void {
        p.stack.deinit();
    }

    /// Forget any partially built tree so the parser can be used for another document.
    pub fn reset(p: *Parser) void {
        p.state = .Simple;
        p.stack.shrink(0);
    }

    /// Parse `input` into a ValueTree. The tree's values are allocated from an arena
    /// created on `p.allocator`; the arena is freed here if parsing fails and is
    /// otherwise handed to the returned ValueTree.
    pub fn parse(p: *Parser, input: []const u8) !ValueTree {
        var s = TokenStream.init(input);

        var arena = ArenaAllocator.init(p.allocator);
        errdefer arena.deinit();

        while (try s.next()) |token| {
            // `s.i - 1` is the index of the byte that completed the token.
            try p.transition(&arena.allocator, input, s.i - 1, token);
        }

        debug.assert(p.stack.len == 1);

        return ValueTree{
            .arena = arena,
            .root = p.stack.at(0),
        };
    }

    // Even though p.allocator exists, we take an explicit allocator so that allocation state
    // can be cleaned up on error correctly during a `parse` on call.
    //
    // Closing tokens that do not fit the current state are reported as errors rather than
    // treated as unreachable: they come from untrusted input and this function must not
    // depend on how strict the tokenizer is.
    fn transition(p: *Parser, allocator: *Allocator, input: []const u8, i: usize, token: Token) !void {
        switch (p.state) {
            .ObjectKey => switch (token) {
                .ObjectEnd => {
                    if (p.stack.len == 1) {
                        // the root object is complete
                        return;
                    }

                    var value = p.stack.pop();
                    try p.pushToParent(&value);
                },
                .String => |s| {
                    try p.stack.append(try p.parseString(allocator, s, input, i));
                    p.state = .ObjectValue;
                },
                else => {
                    // The streaming parser would return an error eventually.
                    // To prevent invalid state we return an error now.
                    // TODO make the streaming parser return an error as soon as it encounters an invalid object key
                    return error.InvalidLiteral;
                },
            },
            .ObjectValue => {
                // Stack layout here: [ ..., object, key ]
                var object = &p.stack.items[p.stack.len - 2].Object;
                var key = p.stack.items[p.stack.len - 1].String;

                switch (token) {
                    .ObjectBegin => {
                        try p.stack.append(Value{ .Object = ObjectMap.init(allocator) });
                        p.state = .ObjectKey;
                    },
                    .ArrayBegin => {
                        try p.stack.append(Value{ .Array = Array.init(allocator) });
                        p.state = .ArrayValue;
                    },
                    .String => |s| {
                        _ = try object.put(key, try p.parseString(allocator, s, input, i));
                        _ = p.stack.pop();
                        p.state = .ObjectKey;
                    },
                    .Number => |n| {
                        _ = try object.put(key, try p.parseNumber(n, input, i));
                        _ = p.stack.pop();
                        p.state = .ObjectKey;
                    },
                    .True => {
                        _ = try object.put(key, Value{ .Bool = true });
                        _ = p.stack.pop();
                        p.state = .ObjectKey;
                    },
                    .False => {
                        _ = try object.put(key, Value{ .Bool = false });
                        _ = p.stack.pop();
                        p.state = .ObjectKey;
                    },
                    .Null => {
                        _ = try object.put(key, Value.Null);
                        _ = p.stack.pop();
                        p.state = .ObjectKey;
                    },
                    .ObjectEnd, .ArrayEnd => {
                        // A member name without a value, e.g. `{"a":}`.
                        return error.InvalidValueBegin;
                    },
                }
            },
            .ArrayValue => {
                var array = &p.stack.items[p.stack.len - 1].Array;

                switch (token) {
                    .ArrayEnd => {
                        if (p.stack.len == 1) {
                            // the root array is complete
                            return;
                        }

                        var value = p.stack.pop();
                        try p.pushToParent(&value);
                    },
                    .ObjectBegin => {
                        try p.stack.append(Value{ .Object = ObjectMap.init(allocator) });
                        p.state = .ObjectKey;
                    },
                    .ArrayBegin => {
                        try p.stack.append(Value{ .Array = Array.init(allocator) });
                        p.state = .ArrayValue;
                    },
                    .String => |s| {
                        try array.append(try p.parseString(allocator, s, input, i));
                    },
                    .Number => |n| {
                        try array.append(try p.parseNumber(n, input, i));
                    },
                    .True => {
                        try array.append(Value{ .Bool = true });
                    },
                    .False => {
                        try array.append(Value{ .Bool = false });
                    },
                    .Null => {
                        try array.append(Value.Null);
                    },
                    .ObjectEnd => {
                        // An array closed with `}`, e.g. `[1}`.
                        return error.UnexpectedClosingBrace;
                    },
                }
            },
            .Simple => switch (token) {
                .ObjectBegin => {
                    try p.stack.append(Value{ .Object = ObjectMap.init(allocator) });
                    p.state = .ObjectKey;
                },
                .ArrayBegin => {
                    try p.stack.append(Value{ .Array = Array.init(allocator) });
                    p.state = .ArrayValue;
                },
                .String => |s| {
                    try p.stack.append(try p.parseString(allocator, s, input, i));
                },
                .Number => |n| {
                    try p.stack.append(try p.parseNumber(n, input, i));
                },
                .True => {
                    try p.stack.append(Value{ .Bool = true });
                },
                .False => {
                    try p.stack.append(Value{ .Bool = false });
                },
                .Null => {
                    try p.stack.append(Value.Null);
                },
                .ObjectEnd, .ArrayEnd => {
                    // A closer with no container open.
                    return error.InvalidTopLevel;
                },
            },
        }
    }

    // Attach a finished container `value` (already popped from the stack) to its parent,
    // which is identified by what is now on top of the stack.
    fn pushToParent(p: *Parser, value: *const Value) !void {
        switch (p.stack.toSlice()[p.stack.len - 1]) {
            // Object Parent -> [ ..., object, <key>, value ]
            Value.String => |key| {
                _ = p.stack.pop();

                var object = &p.stack.items[p.stack.len - 1].Object;
                _ = try object.put(key, value.*);
                p.state = .ObjectKey;
            },
            // Array Parent -> [ ..., <array>, value ]
            Value.Array => |*array| {
                try array.append(value.*);
                p.state = .ArrayValue;
            },
            else => {
                unreachable;
            },
        }
    }

    // Build a String value from a string token ending at `input[i]`. Strings with escapes
    // are always decoded into a fresh allocation; others are copied only if `copy_strings`.
    fn parseString(p: *Parser, allocator: *Allocator, s: std.meta.TagPayloadType(Token, Token.String), input: []const u8, i: usize) !Value {
        const slice = s.slice(input, i);
        switch (s.escapes) {
            .None => return Value{ .String = if (p.copy_strings) try mem.dupe(allocator, u8, slice) else slice },
            .Some => {
                const output = try allocator.alloc(u8, s.decodedLength());
                errdefer allocator.free(output);
                try unescapeString(output, slice);
                return Value{ .String = output };
            },
        }
    }

    // Build an Integer (i64) or Float (f64) value from a number token ending at `input[i]`.
    fn parseNumber(p: *Parser, n: std.meta.TagPayloadType(Token, Token.Number), input: []const u8, i: usize) !Value {
        return if (n.is_integer)
            Value{ .Integer = try std.fmt.parseInt(i64, n.slice(input, i), 10) }
        else
            Value{ .Float = try std.fmt.parseFloat(f64, n.slice(input, i)) };
    }
};
// Unescape a JSON string
// Only to be used on strings already validated by the parser
// (note the unreachable statements and lack of bounds checking)
//
// `input` is the raw text between the quotes; `output` must be exactly as long as the
// decoded string (asserted at the end). Returns error.InvalidUnicodeHexSymbol when a
// \u escape is a surrogate that is not part of a valid high/low pair.
fn unescapeString(output: []u8, input: []const u8) !void {
    var inIndex: usize = 0;
    var outIndex: usize = 0;

    while (inIndex < input.len) {
        if (input[inIndex] != '\\') {
            // not an escape sequence
            output[outIndex] = input[inIndex];
            inIndex += 1;
            outIndex += 1;
        } else if (input[inIndex + 1] != 'u') {
            // a simple escape sequence
            output[outIndex] = @as(u8, switch (input[inIndex + 1]) {
                '\\' => '\\',
                '/' => '/',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'f' => 12,
                'b' => 8,
                '"' => '"',
                else => unreachable,
            });
            inIndex += 2;
            outIndex += 1;
        } else {
            // a unicode escape sequence
            const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex + 2 .. inIndex + 6], 16) catch unreachable;

            // guess optimistically that it's not a surrogate pair
            if (std.unicode.utf8Encode(firstCodeUnit, output[outIndex..])) |byteCount| {
                outIndex += byteCount;
                inIndex += 6;
            } else |err| {
                // it might be a surrogate pair
                if (err != error.Utf8CannotEncodeSurrogateHalf) {
                    return error.InvalidUnicodeHexSymbol;
                }
                // check if a complete second `\uXXXX` escape (6 more bytes) is present
                if (inIndex + 12 > input.len or input[inIndex + 6] != '\\' or input[inIndex + 7] != 'u') {
                    return error.InvalidUnicodeHexSymbol;
                }

                const secondCodeUnit = std.fmt.parseInt(u16, input[inIndex + 8 .. inIndex + 12], 16) catch unreachable;

                // utf16leToUtf8 expects little-endian code units, so convert from the
                // host's byte order first (a no-op on little-endian targets).
                const utf16le_seq = [2]u16{
                    mem.nativeToLittle(u16, firstCodeUnit),
                    mem.nativeToLittle(u16, secondCodeUnit),
                };
                if (std.unicode.utf16leToUtf8(output[outIndex..], &utf16le_seq)) |byteCount| {
                    outIndex += byteCount;
                    inIndex += 12;
                } else |_| {
                    return error.InvalidUnicodeHexSymbol;
                }
            }
        }
    }
    assert(outIndex == output.len);
}
// Parses a nested document into a ValueTree and spot-checks values of every scalar kind.
test "json.parser.dynamic" {
    // All allocations come out of this fixed buffer.
    var backing: [1024 * 16]u8 = undefined;
    var fixed = std.heap.FixedBufferAllocator.init(&backing);

    var parser = Parser.init(&fixed.allocator, false);
    defer parser.deinit();

    const document =
        \\{
        \\ "Image": {
        \\ "Width": 800,
        \\ "Height": 600,
        \\ "Title": "View from 15th Floor",
        \\ "Thumbnail": {
        \\ "Url": "http://www.example.com/image/481989943",
        \\ "Height": 125,
        \\ "Width": 100
        \\ },
        \\ "Animated" : false,
        \\ "IDs": [116, 943, 234, 38793],
        \\ "ArrayOfObject": [{"n": "m"}],
        \\ "double": 1.3412
        \\ }
        \\}
    ;

    var tree = try parser.parse(document);
    defer tree.deinit();

    var root = tree.root;
    var image = root.Object.get("Image").?.value;

    // integers
    const image_width = image.Object.get("Width").?.value;
    const image_height = image.Object.get("Height").?.value;
    testing.expect(image_width.Integer == 800);
    testing.expect(image_height.Integer == 600);

    // string
    const image_title = image.Object.get("Title").?.value;
    testing.expect(mem.eql(u8, image_title.String, "View from 15th Floor"));

    // boolean
    const is_animated = image.Object.get("Animated").?.value;
    testing.expect(is_animated.Bool == false);

    // object nested inside an array
    const objects = image.Object.get("ArrayOfObject").?.value;
    testing.expect(objects.Array.len == 1);
    const first_n = objects.Array.at(0).Object.get("n").?.value;
    testing.expect(mem.eql(u8, first_n.String, "m"));

    // float
    const double_value = image.Object.get("double").?.value;
    testing.expect(double_value.Float == 1.3412);
}
// Reference the sibling JSON source files from a test block so that the
// `test` declarations they contain are compiled in together with this file's.
// The imported values themselves are not needed, hence the `_ =` discards.
test "import more json tests" {
_ = @import("json/test.zig");
_ = @import("json/write_stream.zig");
}
// Round-trip check: serialize a small document with WriteStream into a fixed
// buffer, parse the written bytes back with Parser, and verify every field.
test "write json then parse it" {
    var out_buffer: [1000]u8 = undefined;
    var slice_out_stream = std.io.SliceOutStream.init(&out_buffer);
    const out_stream = &slice_out_stream.stream;
    var jw = WriteStream(@TypeOf(out_stream).Child, 4).init(out_stream);

    // Emit: {"f": false, "t": true, "int": 1234, "array": [null, 12.34], "str": "hello"}
    try jw.beginObject();

    try jw.objectField("f");
    try jw.emitBool(false);

    try jw.objectField("t");
    try jw.emitBool(true);

    try jw.objectField("int");
    try jw.emitNumber(@as(i32, 1234));

    try jw.objectField("array");
    try jw.beginArray();
    try jw.arrayElem();
    try jw.emitNull();
    try jw.arrayElem();
    try jw.emitNumber(@as(f64, 12.34));
    try jw.endArray();

    try jw.objectField("str");
    try jw.emitString("hello");

    try jw.endObject();

    var mem_buffer: [1024 * 20]u8 = undefined;
    // Keep the allocator in a named variable: the parser is handed a pointer
    // to its `allocator` field, which must not point into a temporary.
    var fixed_buffer_allocator = std.heap.FixedBufferAllocator.init(&mem_buffer);
    var parser = Parser.init(&fixed_buffer_allocator.allocator, false);
    defer parser.deinit();

    // Only the bytes actually written are parsed, not the whole out_buffer.
    var tree = try parser.parse(slice_out_stream.getWritten());
    defer tree.deinit();

    const root = tree.root.Object;
    testing.expect(root.get("f").?.value.Bool == false);
    testing.expect(root.get("t").?.value.Bool == true);
    testing.expect(root.get("int").?.value.Integer == 1234);

    const array = root.get("array").?.value.Array;
    // Accessing `.Null` checks the active tag; the payload is `void`.
    testing.expect(array.at(0).Null == {});
    testing.expect(array.at(1).Float == 12.34);

    testing.expect(mem.eql(u8, root.get("str").?.value.String, "hello"));
}
// Test helper: parse `json_str` with a Parser whose allocations are served
// from the caller-provided `memory` buffer, and return the root Value.
// Parse errors are propagated to the caller.
fn test_parse(memory: []u8, json_str: []const u8) !Value {
    // The allocator wrapper itself lives only for the duration of this call;
    // that is fine because nothing allocates through it once parsing is done.
    var fixed_allocator = std.heap.FixedBufferAllocator.init(memory);
    var parser = Parser.init(&fixed_allocator.allocator, false);
    const tree = try parser.parse(json_str);
    return tree.root;
}
// An empty input must be rejected with UnexpectedEndOfJson.
test "parsing empty string gives appropriate error" {
    var scratch: [1024 * 4]u8 = undefined;
    const result = test_parse(&scratch, "");
    testing.expectError(error.UnexpectedEndOfJson, result);
}
// An integer that appears after a float in the same document must still be
// tagged as `.Integer`.
test "integer after float has proper type" {
    var scratch: [1024 * 8]u8 = undefined;
    const document =
        \\{
        \\ "float": 3.14,
        \\ "ints": [1, 2, 3]
        \\}
    ;
    const root = try test_parse(&scratch, document);
    const ints = root.Object.getValue("ints").?.Array;
    testing.expect(ints.at(0) == .Integer);
}
// Checks that every JSON escape form is decoded when parsing: the simple
// two-character escapes, a \uXXXX escape, and a \uXXXX\uXXXX surrogate pair.
test "escaped characters" {
    var memory: [1024 * 16]u8 = undefined;
    // Multiline string literals do not process escapes, so the backslashes
    // below reach the JSON parser verbatim.
    const input =
        \\{
        \\ "backslash": "\\",
        \\ "forwardslash": "\/",
        \\ "newline": "\n",
        \\ "carriagereturn": "\r",
        \\ "tab": "\t",
        \\ "formfeed": "\f",
        \\ "backspace": "\b",
        \\ "doublequote": "\"",
        \\ "unicode": "\u0105",
        \\ "surrogatepair": "\ud83d\ude02"
        \\}
    ;
    const obj = (try test_parse(&memory, input)).Object;

    // expectEqualSlices takes (expected, actual); keeping that order makes a
    // failure report label the two values correctly.
    testing.expectEqualSlices(u8, "\\", obj.get("backslash").?.value.String);
    testing.expectEqualSlices(u8, "/", obj.get("forwardslash").?.value.String);
    testing.expectEqualSlices(u8, "\n", obj.get("newline").?.value.String);
    testing.expectEqualSlices(u8, "\r", obj.get("carriagereturn").?.value.String);
    testing.expectEqualSlices(u8, "\t", obj.get("tab").?.value.String);
    testing.expectEqualSlices(u8, "\x0C", obj.get("formfeed").?.value.String);
    testing.expectEqualSlices(u8, "\x08", obj.get("backspace").?.value.String);
    testing.expectEqualSlices(u8, "\"", obj.get("doublequote").?.value.String);
    testing.expectEqualSlices(u8, "ą", obj.get("unicode").?.value.String);
    testing.expectEqualSlices(u8, "😂", obj.get("surrogatepair").?.value.String);
}
// Parses the same document twice, once with the second Parser.init argument
// set to `false` and once with `true`, and checks that:
//   * both trees hold equal string contents, and
//   * an escape-free string points into `input` only in the `false` case.
test "string copy option" {
    const input =
        \\{
        \\ "noescape": "aą😂",
        \\ "simple": "\\\/\n\r\t\f\b\"",
        \\ "unicode": "\u0105",
        \\ "surrogatepair": "\ud83d\ude02"
        \\}
    ;

    var mem_buffer: [1024 * 16]u8 = undefined;
    var fixed_allocator = std.heap.FixedBufferAllocator.init(&mem_buffer);

    var parser_nocopy = Parser.init(&fixed_allocator.allocator, false);
    const tree_nocopy = try parser_nocopy.parse(input);
    const obj_nocopy = tree_nocopy.root.Object;

    var parser_copy = Parser.init(&fixed_allocator.allocator, true);
    const tree_copy = try parser_copy.parse(input);
    const obj_copy = tree_copy.root.Object;

    // Contents must match regardless of the option.
    const field_names = [_][]const u8{ "noescape", "simple", "unicode", "surrogatepair" };
    for (field_names) |field_name| {
        const from_nocopy = obj_nocopy.getValue(field_name).?.String;
        const from_copy = obj_copy.getValue(field_name).?.String;
        testing.expectEqualSlices(u8, from_nocopy, from_copy);
    }

    // Compare the address of the first byte of "noescape" against every byte
    // address inside `input`.
    const nocopy_addr = &obj_nocopy.getValue("noescape").?.String[0];
    const copy_addr = &obj_copy.getValue("noescape").?.String[0];

    var found_nocopy = false;
    var offset: usize = 0;
    while (offset < input.len) : (offset += 1) {
        const input_addr = &input[offset];
        // The copied string must never alias the input.
        testing.expect(copy_addr != input_addr);
        if (nocopy_addr == input_addr) {
            found_nocopy = true;
        }
    }
    // The non-copied string must alias some byte of the input.
    testing.expect(found_nocopy);
}