const std = @import("std");
const mem = std.mem;

pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,

    const KeywordId = struct {
        bytes: []const u8,
        id: Id,
    };

    const keywords = []KeywordId {
        KeywordId{.bytes="align", .id = Id.Keyword_align},
        KeywordId{.bytes="and", .id = Id.Keyword_and},
        KeywordId{.bytes="asm", .id = Id.Keyword_asm},
        KeywordId{.bytes="break", .id = Id.Keyword_break},
        KeywordId{.bytes="coldcc", .id = Id.Keyword_coldcc},
        KeywordId{.bytes="comptime", .id = Id.Keyword_comptime},
        KeywordId{.bytes="const", .id = Id.Keyword_const},
        KeywordId{.bytes="continue", .id = Id.Keyword_continue},
        KeywordId{.bytes="defer", .id = Id.Keyword_defer},
        KeywordId{.bytes="else", .id = Id.Keyword_else},
        KeywordId{.bytes="enum", .id = Id.Keyword_enum},
        KeywordId{.bytes="error", .id = Id.Keyword_error},
        KeywordId{.bytes="export", .id = Id.Keyword_export},
        KeywordId{.bytes="extern", .id = Id.Keyword_extern},
        KeywordId{.bytes="false", .id = Id.Keyword_false},
        KeywordId{.bytes="fn", .id = Id.Keyword_fn},
        KeywordId{.bytes="for", .id = Id.Keyword_for},
        KeywordId{.bytes="goto", .id = Id.Keyword_goto},
        KeywordId{.bytes="if", .id = Id.Keyword_if},
        KeywordId{.bytes="inline", .id = Id.Keyword_inline},
        KeywordId{.bytes="nakedcc", .id = Id.Keyword_nakedcc},
        KeywordId{.bytes="noalias", .id = Id.Keyword_noalias},
        KeywordId{.bytes="null", .id = Id.Keyword_null},
        KeywordId{.bytes="or", .id = Id.Keyword_or},
        KeywordId{.bytes="packed", .id = Id.Keyword_packed},
        KeywordId{.bytes="pub", .id = Id.Keyword_pub},
        KeywordId{.bytes="return", .id = Id.Keyword_return},
        KeywordId{.bytes="stdcallcc", .id = Id.Keyword_stdcallcc},
        KeywordId{.bytes="struct", .id = Id.Keyword_struct},
        KeywordId{.bytes="switch", .id = Id.Keyword_switch},
        KeywordId{.bytes="test", .id = Id.Keyword_test},
        KeywordId{.bytes="this", .id = Id.Keyword_this},
        KeywordId{.bytes="true", .id = Id.Keyword_true},
        KeywordId{.bytes="undefined", .id = Id.Keyword_undefined},
        KeywordId{.bytes="union", .id = Id.Keyword_union},
        KeywordId{.bytes="unreachable", .id = Id.Keyword_unreachable},
        KeywordId{.bytes="use", .id = Id.Keyword_use},
        KeywordId{.bytes="var", .id = Id.Keyword_var},
        KeywordId{.bytes="volatile", .id = Id.Keyword_volatile},
        KeywordId{.bytes="while", .id = Id.Keyword_while},
    };

    fn getKeyword(bytes: []const u8) -> ?Id {
        for (keywords) |kw| {
            if (mem.eql(u8, kw.bytes, bytes)) {
                return kw.id;
            }
        }
        return null;
    }

    const StrLitKind = enum {Normal, C};

    pub const Id = union(enum) {
        Invalid,
        Identifier,
        StringLiteral: StrLitKind,
        Eof,
        Builtin,
        Bang,
        Equal,
        EqualEqual,
        BangEqual,
        LParen,
        RParen,
        Semicolon,
        Percent,
        LBrace,
        RBrace,
        Period,
        Ellipsis2,
        Ellipsis3,
        Minus,
        Arrow,
        Colon,
        Slash,
        Comma,
        Ampersand,
        AmpersandEqual,
        IntegerLiteral,
        FloatLiteral,
        Keyword_align,
        Keyword_and,
        Keyword_asm,
        Keyword_break,
        Keyword_coldcc,
        Keyword_comptime,
        Keyword_const,
        Keyword_continue,
        Keyword_defer,
        Keyword_else,
        Keyword_enum,
        Keyword_error,
        Keyword_export,
        Keyword_extern,
        Keyword_false,
        Keyword_fn,
        Keyword_for,
        Keyword_goto,
        Keyword_if,
        Keyword_inline,
        Keyword_nakedcc,
        Keyword_noalias,
        Keyword_null,
        Keyword_or,
        Keyword_packed,
        Keyword_pub,
        Keyword_return,
        Keyword_stdcallcc,
        Keyword_struct,
        Keyword_switch,
        Keyword_test,
        Keyword_this,
        Keyword_true,
        Keyword_undefined,
        Keyword_union,
        Keyword_unreachable,
        Keyword_use,
        Keyword_var,
        Keyword_volatile,
        Keyword_while,
    };
};

pub const Tokenizer = struct {
    buffer: []const u8,
    index: usize,
    pending_invalid_token: ?Token,

    pub const Location = struct {
        line: usize,
        column: usize,
        line_start: usize,
        line_end: usize,
    };
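    /// Returns the 0-based line and column of token.start, along with the
    /// byte offsets of the start and end of the line containing it.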
    pub fn getTokenLocation(self: &Tokenizer, token: &const Token) -> Location {
        var loc = Location {
            .line = 0,
            .column = 0,
            .line_start = 0,
            .line_end = 0,
        };
        for (self.buffer) |c, i| {
            if (i == token.start) {
                loc.line_end = i;
                while (loc.line_end < self.buffer.len and self.buffer[loc.line_end] != '\n') : (loc.line_end += 1) {}
                return loc;
            }
            if (c == '\n') {
                loc.line += 1;
                loc.column = 0;
                loc.line_start = i + 1;
            } else {
                loc.column += 1;
            }
        }
        return loc;
    }

    /// For debugging purposes
    pub fn dump(self: &Tokenizer, token: &const Token) {
        std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
    }

    /// buffer must end with "\n\n\n". This is so that attempting to decode
    /// the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
    pub fn init(buffer: []const u8) -> Tokenizer {
        std.debug.assert(buffer[buffer.len - 1] == '\n');
        std.debug.assert(buffer[buffer.len - 2] == '\n');
        std.debug.assert(buffer[buffer.len - 3] == '\n');
        return Tokenizer {
            .buffer = buffer,
            .index = 0,
            .pending_invalid_token = null,
        };
    }

    const State = enum {
        Start,
        Identifier,
        Builtin,
        C,
        StringLiteral,
        StringLiteralBackslash,
        Equal,
        Bang,
        Minus,
        Slash,
        LineComment,
        Zero,
        IntegerLiteral,
        IntegerLiteralWithRadix,
        NumberDot,
        FloatFraction,
        FloatExponentUnsigned,
        FloatExponentNumber,
        Ampersand,
        Period,
        Period2,
    };
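    /// Scans forward from self.index and returns the next token in the buffer.
    /// A pending Invalid token (queued by checkLiteralCharacter) is returned
    /// first if one exists; a token with id Eof is returned at end of buffer.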
    pub fn next(self: &Tokenizer) -> Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
            return token;
        }
        var state = State.Start;
        var result = Token {
            .id = Token.Id.Eof,
            .start = self.index,
            .end = undefined,
        };
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                State.Start => switch (c) {
                    ' ', '\n' => {
                        result.start = self.index + 1;
                    },
                    'c' => {
                        state = State.C;
                        result.id = Token.Id.Identifier;
                    },
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id { .StringLiteral = Token.StrLitKind.Normal };
                    },
                    'a'...'b', 'd'...'z', 'A'...'Z', '_' => {
                        state = State.Identifier;
                        result.id = Token.Id.Identifier;
                    },
                    '@' => {
                        state = State.Builtin;
                        result.id = Token.Id.Builtin;
                    },
                    '=' => {
                        state = State.Equal;
                    },
                    '!' => {
                        state = State.Bang;
                    },
                    '(' => {
                        result.id = Token.Id.LParen;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.id = Token.Id.RParen;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.id = Token.Id.Semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.id = Token.Id.Comma;
                        self.index += 1;
                        break;
                    },
                    ':' => {
                        result.id = Token.Id.Colon;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        result.id = Token.Id.Percent;
                        self.index += 1;
                        break;
                    },
                    '{' => {
                        result.id = Token.Id.LBrace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.id = Token.Id.RBrace;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = State.Period;
                    },
                    '-' => {
                        state = State.Minus;
                    },
                    '/' => {
                        state = State.Slash;
                    },
                    '&' => {
                        state = State.Ampersand;
                    },
                    '0' => {
                        state = State.Zero;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    '1'...'9' => {
                        state = State.IntegerLiteral;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        self.index += 1;
                        break;
                    },
                },
                State.Ampersand => switch (c) {
                    '=' => {
                        result.id = Token.Id.AmpersandEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ampersand;
                        break;
                    },
                },
                State.Identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => {
                        if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                            result.id = id;
                        }
                        break;
                    },
                },
                State.Builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => break,
                },
                State.C => switch (c) {
                    '\\' => @panic("TODO"),
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id { .StringLiteral = Token.StrLitKind.C };
                    },
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {
                        state = State.Identifier;
                    },
                    else => break,
                },
                State.StringLiteral => switch (c) {
                    '\\' => {
                        state = State.StringLiteralBackslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    '\n' => break, // Look for this error later.
                    else => self.checkLiteralCharacter(),
                },
                State.StringLiteralBackslash => switch (c) {
                    '\n' => break, // Look for this error later.
                    else => {
                        state = State.StringLiteral;
                    },
                },
                State.Bang => switch (c) {
                    '=' => {
                        result.id = Token.Id.BangEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Bang;
                        break;
                    },
                },
                State.Equal => switch (c) {
                    '=' => {
                        result.id = Token.Id.EqualEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Equal;
                        break;
                    },
                },
                State.Minus => switch (c) {
                    '>' => {
                        result.id = Token.Id.Arrow;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Minus;
                        break;
                    },
                },
                State.Period => switch (c) {
                    '.' => {
                        state = State.Period2;
                    },
                    else => {
                        result.id = Token.Id.Period;
                        break;
                    },
                },
                State.Period2 => switch (c) {
                    '.' => {
                        result.id = Token.Id.Ellipsis3;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ellipsis2;
                        break;
                    },
                },
                State.Slash => switch (c) {
                    '/' => {
                        result.id = undefined;
                        state = State.LineComment;
                    },
                    else => {
                        result.id = Token.Id.Slash;
                        break;
                    },
                },
                State.LineComment => switch (c) {
                    '\n' => {
                        state = State.Start;
                        result = Token {
                            .id = Token.Id.Eof,
                            .start = self.index + 1,
                            .end = undefined,
                        };
                    },
                    else => self.checkLiteralCharacter(),
                },
                State.Zero => switch (c) {
                    'b', 'o', 'x' => {
                        state = State.IntegerLiteralWithRadix;
                    },
                    else => {
                        // reinterpret as a normal number
                        self.index -= 1;
                        state = State.IntegerLiteral;
                    },
                },
                State.IntegerLiteral => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    'p', 'P', 'e', 'E' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.IntegerLiteralWithRadix => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    'p', 'P' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.NumberDot => switch (c) {
                    '.' => {
                        self.index -= 1;
                        state = State.Start;
                        break;
                    },
                    else => {
                        self.index -= 1;
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatFraction;
                    },
                },
                State.FloatFraction => switch (c) {
                    'p', 'P' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.FloatExponentUnsigned => switch (c) {
                    '+', '-' => {
                        state = State.FloatExponentNumber;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = State.FloatExponentNumber;
                    },
                },
                State.FloatExponentNumber => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
            }
        }
        result.end = self.index;
        if (result.id == Token.Id.Eof) {
            if (self.pending_invalid_token) |token| {
                self.pending_invalid_token = null;
                return token;
            }
        }
        return result;
    }

    pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) -> []const u8 {
        return self.buffer[token.start..token.end];
    }
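    /// If the byte at self.index is a disallowed control character or invalid
    /// UTF-8, queue a pending Invalid token covering it so that next() can
    /// report it. Called while scanning string literals and line comments.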
    fn checkLiteralCharacter(self: &Tokenizer) {
        if (self.pending_invalid_token != null) return;
        const invalid_length = self.getInvalidCharacterLength();
        if (invalid_length == 0) return;
        self.pending_invalid_token = Token {
            .id = Token.Id.Invalid,
            .start = self.index,
            .end = self.index + invalid_length,
        };
    }

    fn getInvalidCharacterLength(self: &Tokenizer) -> u3 {
        const c0 = self.buffer[self.index];
        if (c0 < 0x80) {
            if (c0 < 0x20 or c0 == 0x7f) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
            }
            // looks fine to me.
            return 0;
        } else {
            // check utf8-encoded character.
            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
            // the last 3 bytes in the buffer are guaranteed to be '\n',
            // which means we don't need to do any bounds checking here.
            const bytes = self.buffer[self.index..self.index + length];
            switch (length) {
                2 => {
                    const value = std.unicode.utf8Decode2(bytes) catch return length;
                    if (value == 0x85) return length; // U+0085 (NEL)
                },
                3 => {
                    const value = std.unicode.utf8Decode3(bytes) catch return length;
                    if (value == 0x2028) return length; // U+2028 (LS)
                    if (value == 0x2029) return length; // U+2029 (PS)
                },
                4 => {
                    _ = std.unicode.utf8Decode4(bytes) catch return length;
                },
                else => unreachable,
            }
            self.index += length - 1;
            return 0;
        }
    }
};

test "tokenizer" {
    testTokenize("test", []Token.Id {
        Token.Id.Keyword_test,
    });
}

test "tokenizer - invalid token characters" {
    testTokenize("#", []Token.Id{Token.Id.Invalid});
    testTokenize("`", []Token.Id{Token.Id.Invalid});
}

test "tokenizer - invalid literal/comment characters" {
    testTokenize("\"\x00\"", []Token.Id {
        Token.Id { .StringLiteral = Token.StrLitKind.Normal },
        Token.Id.Invalid,
    });
    testTokenize("//\x00", []Token.Id {
        Token.Id.Invalid,
    });
    testTokenize("//\x1f", []Token.Id {
        Token.Id.Invalid,
    });
    testTokenize("//\x7f", []Token.Id {
        Token.Id.Invalid,
    });
}

test "tokenizer - utf8" {
    testTokenize("//\xc2\x80", []Token.Id{});
    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
}

test "tokenizer - invalid utf8" {
    testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
}

test "tokenizer - illegal unicode codepoints" {
    // unicode newline characters: U+0085, U+2028, U+2029
    testTokenize("//\xc2\x84", []Token.Id{});
testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid}); testTokenize("//\xc2\x86", []Token.Id{}); testTokenize("//\xe2\x80\xa7", []Token.Id{}); testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid}); testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid}); testTokenize("//\xe2\x80\xaa", []Token.Id{}); } fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) { // (test authors, just make this bigger if you need it) var padded_source: [0x100]u8 = undefined; std.mem.copy(u8, padded_source[0..source.len], source); padded_source[source.len + 0] = '\n'; padded_source[source.len + 1] = '\n'; padded_source[source.len + 2] = '\n'; var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]); for (expected_tokens) |expected_token_id| { const token = tokenizer.next(); std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id)); switch (expected_token_id) { Token.Id.StringLiteral => |expected_kind| { std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable }); }, else => {}, } } std.debug.assert(tokenizer.next().id == Token.Id.Eof); }