Fix bugs in JSON parser

Make comments into documentation where appropriate
master
xackus 2019-11-11 23:25:54 +01:00
parent 371747d8fb
commit f9b7d6d75d
2 changed files with 69 additions and 60 deletions

View File

@ -10,18 +10,18 @@ const maxInt = std.math.maxInt;
pub const WriteStream = @import("json/write_stream.zig").WriteStream;
// A single token slice into the parent string.
//
// Use `token.slice()` on the input at the current position to get the current slice.
/// A single token slice into the parent string.
///
/// Use `token.slice()` on the input at the current position to get the current slice.
pub const Token = struct {
id: Id,
// How many bytes do we skip before counting
/// How many bytes do we skip before counting
offset: u1,
// Whether string contains a \uXXXX sequence and cannot be zero-copied
/// Whether string contains an escape sequence and cannot be zero-copied
string_has_escape: bool,
// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
/// Whether number is simple and can be represented by an integer (i.e. no `.` or `e`)
number_is_integer: bool,
// How many bytes from the current position behind the start of this token is.
/// How many bytes from the current position behind the start of this token is.
count: usize,
pub const Id = enum {
@ -66,7 +66,7 @@ pub const Token = struct {
};
}
// A marker token is a zero-length
/// A marker token is a zero-length
pub fn initMarker(id: Id) Token {
return Token{
.id = id,
@ -77,19 +77,19 @@ pub const Token = struct {
};
}
/// Slice into the underlying input string.
/// `i` is the current position in `input`; the token ends `offset` bytes
/// past `i` and spans `count` bytes ending there.
pub fn slice(self: Token, input: []const u8, i: usize) []const u8 {
    const end = i + self.offset;
    return input[end - self.count .. end];
}
};
// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
// they are encountered. No copies or allocations are performed during parsing and the entire
// parsing state requires ~40-50 bytes of stack space.
//
// Conforms strictly to RFC 8259.
//
// For a non-byte based wrapper, consider using TokenStream instead.
/// A small streaming JSON parser. This accepts input one byte at a time and returns tokens as
/// they are encountered. No copies or allocations are performed during parsing and the entire
/// parsing state requires ~40-50 bytes of stack space.
///
/// Conforms strictly to RFC 8259.
///
/// For a non-byte based wrapper, consider using TokenStream instead.
pub const StreamingParser = struct {
// Current state
state: State,
@ -205,10 +205,10 @@ pub const StreamingParser = struct {
InvalidControlCharacter,
};
// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
// tokens. token2 is always null if token1 is null.
//
// There is currently no error recovery on a bad stream.
/// Give another byte to the parser and obtain any new tokens. This may (rarely) return two
/// tokens. token2 is always null if token1 is null.
///
/// There is currently no error recovery on a bad stream.
pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
token1.* = null;
token2.* = null;
@ -860,7 +860,7 @@ pub const StreamingParser = struct {
}
};
// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
/// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
pub const TokenStream = struct {
i: usize,
slice: []const u8,
@ -898,7 +898,13 @@ pub const TokenStream = struct {
}
}
if (self.parser.complete) {
// Without this, a bare number fails, because the streaming parser doesn't know the input ended.
try self.parser.feed(' ', &t1, &t2);
self.i += 1;
if (t1) |token| {
return token;
} else if (self.parser.complete) {
return null;
} else {
return error.UnexpectedEndOfJson;
@ -1050,7 +1056,7 @@ pub const Value = union(enum) {
}
};
// A non-stream JSON parser which constructs a tree of Value's.
/// A non-stream JSON parser which constructs a tree of Value's.
pub const Parser = struct {
allocator: *Allocator,
state: State,
@ -1119,7 +1125,10 @@ pub const Parser = struct {
p.state = State.ObjectValue;
},
else => {
unreachable;
// The streaming parser would return an error eventually.
// To prevent invalid state we return an error now.
// TODO make the streaming parser return an error as soon as it encounters an invalid object key
return error.InvalidLiteral;
},
},
State.ObjectValue => {
@ -1276,6 +1285,10 @@ pub const Parser = struct {
// Only to be used on strings already validated by the parser
// (note the unreachable statements and lack of bounds checking)
// Optimized for arena allocators, uses Allocator.shrink
//
// Idea: count how many bytes we will need to allocate in the streaming parser and store it
// in the token to avoid allocating too much memory or iterating through the string again
// Downside: need to find how many bytes a unicode escape sequence will produce twice
fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 {
const output = try alloc.alloc(u8, input.len);
errdefer alloc.free(output);
@ -1290,22 +1303,22 @@ fn unescapeStringAlloc(alloc: *Allocator, input: []const u8) ![]u8 {
inIndex += 1;
outIndex += 1;
} else if(input[inIndex + 1] != 'u'){
// a simple escape sequence
output[outIndex] = @as(u8,
switch(input[inIndex + 1]){
'\\' => '\\',
'/' => '/',
'n' => '\n',
'r' => '\r',
't' => '\t',
'f' => 12,
'b' => 8,
'"' => '"',
else => unreachable
}
);
inIndex += 2;
outIndex += 1;
// a simple escape sequence
output[outIndex] = @as(u8,
switch(input[inIndex + 1]){
'\\' => '\\',
'/' => '/',
'n' => '\n',
'r' => '\r',
't' => '\t',
'f' => 12,
'b' => 8,
'"' => '"',
else => unreachable
}
);
inIndex += 2;
outIndex += 1;
} else {
// a unicode escape sequence
const firstCodeUnit = std.fmt.parseInt(u16, input[inIndex+2 .. inIndex+6], 16) catch unreachable;

View File

@ -559,17 +559,15 @@ test "y_structure_lonely_false" {
}
test "y_structure_lonely_int" {
    // A bare integer is a valid JSON document per RFC 8259.
    ok(
        \\42
    );
}
test "y_structure_lonely_negative_real" {
    // A bare number is a valid JSON document per RFC 8259.
    ok(
        \\-0.1
    );
}
test "y_structure_lonely_null" {
@ -1107,10 +1105,9 @@ test "n_object_bad_value" {
}
test "n_object_bracket_key" {
    // An object key must be a string; `[` here must be rejected.
    err(
        \\{[: "x"}
    );
}
test "n_object_comma_instead_of_colon" {
@ -1192,10 +1189,9 @@ test "n_object_non_string_key" {
}
test "n_object_repeated_null_null" {
    // Object keys must be strings; bare `null` keys must be rejected.
    err(
        \\{null:null,null:null}
    );
}
test "n_object_several_trailing_commas" {
@ -1618,10 +1614,9 @@ test "n_structure_open_object" {
}
test "n_structure_open_object_open_array" {
    // `[` is not a valid object key; the truncated input must be rejected.
    err(
        \\{[
    );
}
test "n_structure_open_object_open_string" {
@ -1734,6 +1729,7 @@ test "i_number_double_huge_neg_exp" {
test "i_number_huge_exp" {
return error.SkipZigTest;
// FIXME Integer overflow in parseFloat
// any(
// \\[0.4e00669999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999969999999006]
// );