Add json.TokenStream (#1062)

This hides some of the low-level parsing details from the
StreamingParser. These don't need to be known when parsing a complete
slice at once (which is what we can usually do).

Also, remove `Json` from Parser names. The namespace `json` is sufficient.
This commit is contained in:
Marc Tiehuis 2018-06-07 03:24:36 +12:00 committed by Andrew Kelley
parent f389e53735
commit e7f141b376

View File

@ -3,6 +3,7 @@
// https://tools.ietf.org/html/rfc8259 // https://tools.ietf.org/html/rfc8259
const std = @import("index.zig"); const std = @import("index.zig");
const debug = std.debug;
const mem = std.mem; const mem = std.mem;
const u1 = @IntType(false, 1); const u1 = @IntType(false, 1);
@ -86,7 +87,9 @@ pub const Token = struct {
// parsing state requires ~40-50 bytes of stack space. // parsing state requires ~40-50 bytes of stack space.
// //
// Conforms strictly to RFC8259. // Conforms strictly to RFC8259.
pub const StreamingJsonParser = struct { //
// For a non-byte based wrapper, consider using TokenStream instead.
pub const StreamingParser = struct {
// Current state // Current state
state: State, state: State,
// How many bytes we have counted for the current token // How many bytes we have counted for the current token
@ -109,13 +112,13 @@ pub const StreamingJsonParser = struct {
const array_bit = 1; const array_bit = 1;
const max_stack_size = @maxValue(u8); const max_stack_size = @maxValue(u8);
pub fn init() StreamingJsonParser { pub fn init() StreamingParser {
var p: StreamingJsonParser = undefined; var p: StreamingParser = undefined;
p.reset(); p.reset();
return p; return p;
} }
pub fn reset(p: *StreamingJsonParser) void { pub fn reset(p: *StreamingParser) void {
p.state = State.TopLevelBegin; p.state = State.TopLevelBegin;
p.count = 0; p.count = 0;
// Set before ever read in main transition function // Set before ever read in main transition function
@ -175,7 +178,7 @@ pub const StreamingJsonParser = struct {
// Only call this function to generate array/object final state. // Only call this function to generate array/object final state.
pub fn fromInt(x: var) State { pub fn fromInt(x: var) State {
std.debug.assert(x == 0 or x == 1); debug.assert(x == 0 or x == 1);
const T = @TagType(State); const T = @TagType(State);
return State(T(x)); return State(T(x));
} }
@ -205,7 +208,7 @@ pub const StreamingJsonParser = struct {
// tokens. token2 is always null if token1 is null. // tokens. token2 is always null if token1 is null.
// //
// There is currently no error recovery on a bad stream. // There is currently no error recovery on a bad stream.
pub fn feed(p: *StreamingJsonParser, c: u8, token1: *?Token, token2: *?Token) Error!void { pub fn feed(p: *StreamingParser, c: u8, token1: *?Token, token2: *?Token) Error!void {
token1.* = null; token1.* = null;
token2.* = null; token2.* = null;
p.count += 1; p.count += 1;
@ -217,7 +220,7 @@ pub const StreamingJsonParser = struct {
} }
// Perform a single transition on the state machine and return any possible token. // Perform a single transition on the state machine and return any possible token.
fn transition(p: *StreamingJsonParser, c: u8, token: *?Token) Error!bool { fn transition(p: *StreamingParser, c: u8, token: *?Token) Error!bool {
switch (p.state) { switch (p.state) {
State.TopLevelBegin => switch (c) { State.TopLevelBegin => switch (c) {
'{' => { '{' => {
@ -852,10 +855,116 @@ pub const StreamingJsonParser = struct {
} }
}; };
// A small wrapper over a StreamingParser for full slices. Returns a stream of json Tokens.
//
// Construct one with `init` over the complete input and call `next` repeatedly until it
// returns null.
pub const TokenStream = struct {
    // Index of the next byte of `slice` to feed to the parser. Becomes `slice.len + 1`
    // once the end-of-input whitespace byte has been fed.
    i: usize,
    // The complete input being tokenized.
    slice: []const u8,
    // Underlying byte-at-a-time state machine.
    parser: StreamingParser,
    // A single feed may yield two tokens; the second is parked here and handed out by
    // the following call to `next`.
    token: ?Token,

    // Creates a stream positioned at the start of `slice`.
    pub fn init(slice: []const u8) TokenStream {
        return TokenStream{
            .i = 0,
            .slice = slice,
            .parser = StreamingParser.init(),
            .token = null,
        };
    }

    // Returns the next token, or null once the input is exhausted. Any error reported
    // by `StreamingParser.feed` is propagated to the caller.
    pub fn next(self: *TokenStream) !?Token {
        // Hand out the token held back from the previous feed first.
        if (self.token) |token| {
            self.token = null;
            return token;
        }

        var t1: ?Token = undefined;
        var t2: ?Token = undefined;

        while (self.i < self.slice.len) {
            try self.parser.feed(self.slice[self.i], &t1, &t2);
            self.i += 1;

            if (t1) |token| {
                self.token = t2;
                return token;
            }
        }

        // The loop above leaves `i == slice.len`. Feed one whitespace byte exactly once
        // so that a top-level lonely number, which has no terminating byte of its own,
        // is flushed as a token. Bumping `i` past `slice.len` marks that this has
        // happened, so later calls fall through and return null.
        if (self.i == self.slice.len) {
            try self.parser.feed(' ', &t1, &t2);
            self.i += 1;

            if (t1) |token| {
                return token;
            }
        }

        return null;
    }
};
// Test helper: pulls the next token from `p` and asserts that its id equals `id`.
// A parse error or an exhausted stream is treated as unreachable.
fn checkNext(p: *TokenStream, id: Token.Id) void {
    const maybe_token = p.next() catch unreachable;
    const token = ??maybe_token;
    debug.assert(token.id == id);
}
// Tokenizes a nested sample document and checks the id of every token in order,
// then checks that the stream reports exhaustion with null.
test "token" {
    const s =
        \\{
        \\ "Image": {
        \\ "Width": 800,
        \\ "Height": 600,
        \\ "Title": "View from 15th Floor",
        \\ "Thumbnail": {
        \\ "Url": "http://www.example.com/image/481989943",
        \\ "Height": 125,
        \\ "Width": 100
        \\ },
        \\ "Animated" : false,
        \\ "IDs": [116, 943, 234, 38793]
        \\ }
        \\}
    ;

    // Expected token ids, in document order.
    const expected_ids = []Token.Id{
        Token.Id.ObjectBegin,
        Token.Id.String, // Image
        Token.Id.ObjectBegin,
        Token.Id.String, // Width
        Token.Id.Number,
        Token.Id.String, // Height
        Token.Id.Number,
        Token.Id.String, // Title
        Token.Id.String,
        Token.Id.String, // Thumbnail
        Token.Id.ObjectBegin,
        Token.Id.String, // Url
        Token.Id.String,
        Token.Id.String, // Height
        Token.Id.Number,
        Token.Id.String, // Width
        Token.Id.Number,
        Token.Id.ObjectEnd,
        Token.Id.String, // Animated
        Token.Id.False,
        Token.Id.String, // IDs
        Token.Id.ArrayBegin,
        Token.Id.Number,
        Token.Id.Number,
        Token.Id.Number,
        Token.Id.Number,
        Token.Id.ArrayEnd,
        Token.Id.ObjectEnd,
        Token.Id.ObjectEnd,
    };

    var p = TokenStream.init(s);
    for (expected_ids) |id| {
        checkNext(&p, id);
    }

    // Nothing may follow the closing brace.
    const trailing = try p.next();
    debug.assert(trailing == null);
}
// Validate a JSON string. This does not limit number precision so a decoder may not necessarily // Validate a JSON string. This does not limit number precision so a decoder may not necessarily
// be able to decode the string even if this returns true. // be able to decode the string even if this returns true.
pub fn validate(s: []const u8) bool { pub fn validate(s: []const u8) bool {
var p = StreamingJsonParser.init(); var p = StreamingParser.init();
for (s) |c, i| { for (s) |c, i| {
var token1: ?Token = undefined; var token1: ?Token = undefined;
@ -897,46 +1006,46 @@ pub const Value = union(enum) {
pub fn dump(self: *const Value) void { pub fn dump(self: *const Value) void {
switch (self.*) { switch (self.*) {
Value.Null => { Value.Null => {
std.debug.warn("null"); debug.warn("null");
}, },
Value.Bool => |inner| { Value.Bool => |inner| {
std.debug.warn("{}", inner); debug.warn("{}", inner);
}, },
Value.Integer => |inner| { Value.Integer => |inner| {
std.debug.warn("{}", inner); debug.warn("{}", inner);
}, },
Value.Float => |inner| { Value.Float => |inner| {
std.debug.warn("{.5}", inner); debug.warn("{.5}", inner);
}, },
Value.String => |inner| { Value.String => |inner| {
std.debug.warn("\"{}\"", inner); debug.warn("\"{}\"", inner);
}, },
Value.Array => |inner| { Value.Array => |inner| {
var not_first = false; var not_first = false;
std.debug.warn("["); debug.warn("[");
for (inner.toSliceConst()) |value| { for (inner.toSliceConst()) |value| {
if (not_first) { if (not_first) {
std.debug.warn(","); debug.warn(",");
} }
not_first = true; not_first = true;
value.dump(); value.dump();
} }
std.debug.warn("]"); debug.warn("]");
}, },
Value.Object => |inner| { Value.Object => |inner| {
var not_first = false; var not_first = false;
std.debug.warn("{{"); debug.warn("{{");
var it = inner.iterator(); var it = inner.iterator();
while (it.next()) |entry| { while (it.next()) |entry| {
if (not_first) { if (not_first) {
std.debug.warn(","); debug.warn(",");
} }
not_first = true; not_first = true;
std.debug.warn("\"{}\":", entry.key); debug.warn("\"{}\":", entry.key);
entry.value.dump(); entry.value.dump();
} }
std.debug.warn("}}"); debug.warn("}}");
}, },
} }
} }
@ -952,53 +1061,53 @@ pub const Value = union(enum) {
fn dumpIndentLevel(self: *const Value, indent: usize, level: usize) void { fn dumpIndentLevel(self: *const Value, indent: usize, level: usize) void {
switch (self.*) { switch (self.*) {
Value.Null => { Value.Null => {
std.debug.warn("null"); debug.warn("null");
}, },
Value.Bool => |inner| { Value.Bool => |inner| {
std.debug.warn("{}", inner); debug.warn("{}", inner);
}, },
Value.Integer => |inner| { Value.Integer => |inner| {
std.debug.warn("{}", inner); debug.warn("{}", inner);
}, },
Value.Float => |inner| { Value.Float => |inner| {
std.debug.warn("{.5}", inner); debug.warn("{.5}", inner);
}, },
Value.String => |inner| { Value.String => |inner| {
std.debug.warn("\"{}\"", inner); debug.warn("\"{}\"", inner);
}, },
Value.Array => |inner| { Value.Array => |inner| {
var not_first = false; var not_first = false;
std.debug.warn("[\n"); debug.warn("[\n");
for (inner.toSliceConst()) |value| { for (inner.toSliceConst()) |value| {
if (not_first) { if (not_first) {
std.debug.warn(",\n"); debug.warn(",\n");
} }
not_first = true; not_first = true;
padSpace(level + indent); padSpace(level + indent);
value.dumpIndentLevel(indent, level + indent); value.dumpIndentLevel(indent, level + indent);
} }
std.debug.warn("\n"); debug.warn("\n");
padSpace(level); padSpace(level);
std.debug.warn("]"); debug.warn("]");
}, },
Value.Object => |inner| { Value.Object => |inner| {
var not_first = false; var not_first = false;
std.debug.warn("{{\n"); debug.warn("{{\n");
var it = inner.iterator(); var it = inner.iterator();
while (it.next()) |entry| { while (it.next()) |entry| {
if (not_first) { if (not_first) {
std.debug.warn(",\n"); debug.warn(",\n");
} }
not_first = true; not_first = true;
padSpace(level + indent); padSpace(level + indent);
std.debug.warn("\"{}\": ", entry.key); debug.warn("\"{}\": ", entry.key);
entry.value.dumpIndentLevel(indent, level + indent); entry.value.dumpIndentLevel(indent, level + indent);
} }
std.debug.warn("\n"); debug.warn("\n");
padSpace(level); padSpace(level);
std.debug.warn("}}"); debug.warn("}}");
}, },
} }
} }
@ -1006,13 +1115,13 @@ pub const Value = union(enum) {
fn padSpace(indent: usize) void { fn padSpace(indent: usize) void {
var i: usize = 0; var i: usize = 0;
while (i < indent) : (i += 1) { while (i < indent) : (i += 1) {
std.debug.warn(" "); debug.warn(" ");
} }
} }
}; };
// A non-stream JSON parser which constructs a tree of Value's. // A non-stream JSON parser which constructs a tree of Value's.
pub const JsonParser = struct { pub const Parser = struct {
allocator: *Allocator, allocator: *Allocator,
state: State, state: State,
copy_strings: bool, copy_strings: bool,
@ -1026,8 +1135,8 @@ pub const JsonParser = struct {
Simple, Simple,
}; };
pub fn init(allocator: *Allocator, copy_strings: bool) JsonParser { pub fn init(allocator: *Allocator, copy_strings: bool) Parser {
return JsonParser{ return Parser{
.allocator = allocator, .allocator = allocator,
.state = State.Simple, .state = State.Simple,
.copy_strings = copy_strings, .copy_strings = copy_strings,
@ -1035,52 +1144,26 @@ pub const JsonParser = struct {
}; };
} }
pub fn deinit(p: *JsonParser) void { pub fn deinit(p: *Parser) void {
p.stack.deinit(); p.stack.deinit();
} }
pub fn reset(p: *JsonParser) void { pub fn reset(p: *Parser) void {
p.state = State.Simple; p.state = State.Simple;
p.stack.shrink(0); p.stack.shrink(0);
} }
pub fn parse(p: *JsonParser, input: []const u8) !ValueTree { pub fn parse(p: *Parser, input: []const u8) !ValueTree {
var mp = StreamingJsonParser.init(); var s = TokenStream.init(input);
var arena = ArenaAllocator.init(p.allocator); var arena = ArenaAllocator.init(p.allocator);
errdefer arena.deinit(); errdefer arena.deinit();
for (input) |c, i| { while (try s.next()) |token| {
var mt1: ?Token = undefined; try p.transition(&arena.allocator, input, s.i - 1, token);
var mt2: ?Token = undefined;
try mp.feed(c, &mt1, &mt2);
if (mt1) |t1| {
try p.transition(&arena.allocator, input, i, t1);
if (mt2) |t2| {
try p.transition(&arena.allocator, input, i, t2);
}
}
} }
// Handle top-level lonely number values. debug.assert(p.stack.len == 1);
{
const i = input.len;
var mt1: ?Token = undefined;
var mt2: ?Token = undefined;
try mp.feed(' ', &mt1, &mt2);
if (mt1) |t1| {
try p.transition(&arena.allocator, input, i, t1);
}
}
if (!mp.complete) {
return error.IncompleteJsonInput;
}
std.debug.assert(p.stack.len == 1);
return ValueTree{ return ValueTree{
.arena = arena, .arena = arena,
@ -1090,7 +1173,7 @@ pub const JsonParser = struct {
// Even though p.allocator exists, we take an explicit allocator so that allocation state // Even though p.allocator exists, we take an explicit allocator so that allocation state
// can be cleaned up on error correctly during a `parse` on call. // can be cleaned up on error correctly during a `parse` on call.
fn transition(p: *JsonParser, allocator: *Allocator, input: []const u8, i: usize, token: *const Token) !void { fn transition(p: *Parser, allocator: *Allocator, input: []const u8, i: usize, token: *const Token) !void {
switch (p.state) { switch (p.state) {
State.ObjectKey => switch (token.id) { State.ObjectKey => switch (token.id) {
Token.Id.ObjectEnd => { Token.Id.ObjectEnd => {
@ -1223,7 +1306,7 @@ pub const JsonParser = struct {
} }
} }
fn pushToParent(p: *JsonParser, value: *const Value) !void { fn pushToParent(p: *Parser, value: *const Value) !void {
switch (p.stack.at(p.stack.len - 1)) { switch (p.stack.at(p.stack.len - 1)) {
// Object Parent -> [ ..., object, <key>, value ] // Object Parent -> [ ..., object, <key>, value ]
Value.String => |key| { Value.String => |key| {
@ -1244,14 +1327,14 @@ pub const JsonParser = struct {
} }
} }
fn parseString(p: *JsonParser, allocator: *Allocator, token: *const Token, input: []const u8, i: usize) !Value { fn parseString(p: *Parser, allocator: *Allocator, token: *const Token, input: []const u8, i: usize) !Value {
// TODO: We don't strictly have to copy values which do not contain any escape // TODO: We don't strictly have to copy values which do not contain any escape
// characters if flagged with the option. // characters if flagged with the option.
const slice = token.slice(input, i); const slice = token.slice(input, i);
return Value{ .String = try mem.dupe(p.allocator, u8, slice) }; return Value{ .String = try mem.dupe(p.allocator, u8, slice) };
} }
fn parseNumber(p: *JsonParser, token: *const Token, input: []const u8, i: usize) !Value { fn parseNumber(p: *Parser, token: *const Token, input: []const u8, i: usize) !Value {
return if (token.number_is_integer) return if (token.number_is_integer)
Value{ .Integer = try std.fmt.parseInt(i64, token.slice(input, i), 10) } Value{ .Integer = try std.fmt.parseInt(i64, token.slice(input, i), 10) }
else else
@ -1259,10 +1342,8 @@ pub const JsonParser = struct {
} }
}; };
const debug = std.debug;
test "json parser dynamic" { test "json parser dynamic" {
var p = JsonParser.init(std.debug.global_allocator, false); var p = Parser.init(debug.global_allocator, false);
defer p.deinit(); defer p.deinit();
const s = const s =