const std = @import("../std.zig");
const mem = std.mem;

pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,

    pub const Keyword = struct {
        bytes: []const u8,
        id: Id,
        hash: u32,

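        // Evaluated at comptime while building the `keywords` table below;
        // the quota is raised because hashing every keyword string needs more
        // than the default comptime evaluation budget.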
        fn init(bytes: []const u8, id: Id) Keyword {
            @setEvalBranchQuota(2000);
            return .{
                .bytes = bytes,
                .id = id,
                .hash = std.hash_map.hashString(bytes),
            };
        }
    };

    pub const keywords = [_]Keyword{
        Keyword.init("align", .Keyword_align),
        Keyword.init("allowzero", .Keyword_allowzero),
        Keyword.init("and", .Keyword_and),
        Keyword.init("anyframe", .Keyword_anyframe),
        Keyword.init("asm", .Keyword_asm),
        Keyword.init("async", .Keyword_async),
        Keyword.init("await", .Keyword_await),
        Keyword.init("break", .Keyword_break),
        Keyword.init("callconv", .Keyword_callconv),
        Keyword.init("catch", .Keyword_catch),
        Keyword.init("comptime", .Keyword_comptime),
        Keyword.init("const", .Keyword_const),
        Keyword.init("continue", .Keyword_continue),
        Keyword.init("defer", .Keyword_defer),
        Keyword.init("else", .Keyword_else),
        Keyword.init("enum", .Keyword_enum),
        Keyword.init("errdefer", .Keyword_errdefer),
        Keyword.init("error", .Keyword_error),
        Keyword.init("export", .Keyword_export),
        Keyword.init("extern", .Keyword_extern),
        Keyword.init("false", .Keyword_false),
        Keyword.init("fn", .Keyword_fn),
        Keyword.init("for", .Keyword_for),
        Keyword.init("if", .Keyword_if),
        Keyword.init("inline", .Keyword_inline),
        Keyword.init("nakedcc", .Keyword_nakedcc),
        Keyword.init("noalias", .Keyword_noalias),
        Keyword.init("noasync", .Keyword_noasync),
        Keyword.init("noinline", .Keyword_noinline),
        Keyword.init("null", .Keyword_null),
        Keyword.init("or", .Keyword_or),
        Keyword.init("orelse", .Keyword_orelse),
        Keyword.init("packed", .Keyword_packed),
        Keyword.init("pub", .Keyword_pub),
        Keyword.init("resume", .Keyword_resume),
        Keyword.init("return", .Keyword_return),
        Keyword.init("linksection", .Keyword_linksection),
        Keyword.init("stdcallcc", .Keyword_stdcallcc),
        Keyword.init("struct", .Keyword_struct),
        Keyword.init("suspend", .Keyword_suspend),
        Keyword.init("switch", .Keyword_switch),
        Keyword.init("test", .Keyword_test),
        Keyword.init("threadlocal", .Keyword_threadlocal),
        Keyword.init("true", .Keyword_true),
        Keyword.init("try", .Keyword_try),
        Keyword.init("undefined", .Keyword_undefined),
        Keyword.init("union", .Keyword_union),
        Keyword.init("unreachable", .Keyword_unreachable),
        Keyword.init("usingnamespace", .Keyword_usingnamespace),
        Keyword.init("var", .Keyword_var),
        Keyword.init("volatile", .Keyword_volatile),
        Keyword.init("while", .Keyword_while),
    };

    // TODO perfect hash at comptime
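    // Linear scan; comparing the precomputed hash first makes the byte-wise
    // comparison rare for identifiers that are not keywords.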
    pub fn getKeyword(bytes: []const u8) ?Id {
        const hash = std.hash_map.hashString(bytes);
        for (keywords) |kw| {
            if (kw.hash == hash and mem.eql(u8, kw.bytes, bytes)) {
                return kw.id;
            }
        }
        return null;
    }

    pub const Id = enum {
        Invalid,
        Invalid_ampersands,
        Identifier,
        StringLiteral,
        MultilineStringLiteralLine,
        CharLiteral,
        Eof,
        Builtin,
        Bang,
        Pipe,
        PipePipe,
        PipeEqual,
        Equal,
        EqualEqual,
        EqualAngleBracketRight,
        BangEqual,
        LParen,
        RParen,
        Semicolon,
        Percent,
        PercentEqual,
        LBrace,
        RBrace,
        LBracket,
        RBracket,
        Period,
        PeriodAsterisk,
        Ellipsis2,
        Ellipsis3,
        Caret,
        CaretEqual,
        Plus,
        PlusPlus,
        PlusEqual,
        PlusPercent,
        PlusPercentEqual,
        Minus,
        MinusEqual,
        MinusPercent,
        MinusPercentEqual,
        Asterisk,
        AsteriskEqual,
        AsteriskAsterisk,
        AsteriskPercent,
        AsteriskPercentEqual,
        Arrow,
        Colon,
        Slash,
        SlashEqual,
        Comma,
        Ampersand,
        AmpersandEqual,
        QuestionMark,
        AngleBracketLeft,
        AngleBracketLeftEqual,
        AngleBracketAngleBracketLeft,
        AngleBracketAngleBracketLeftEqual,
        AngleBracketRight,
        AngleBracketRightEqual,
        AngleBracketAngleBracketRight,
        AngleBracketAngleBracketRightEqual,
        Tilde,
        IntegerLiteral,
        FloatLiteral,
        LineComment,
        DocComment,
        ContainerDocComment,
        ShebangLine,
        Keyword_align,
        Keyword_allowzero,
        Keyword_and,
        Keyword_asm,
        Keyword_async,
        Keyword_await,
        Keyword_break,
        Keyword_callconv,
        Keyword_catch,
        Keyword_comptime,
        Keyword_const,
        Keyword_continue,
        Keyword_defer,
        Keyword_else,
        Keyword_enum,
        Keyword_errdefer,
        Keyword_error,
        Keyword_export,
        Keyword_extern,
        Keyword_false,
        Keyword_fn,
        Keyword_for,
        Keyword_if,
        Keyword_inline,
        Keyword_nakedcc,
        Keyword_noalias,
        Keyword_noasync,
        Keyword_noinline,
        Keyword_null,
        Keyword_or,
        Keyword_orelse,
        Keyword_packed,
        Keyword_anyframe,
        Keyword_pub,
        Keyword_resume,
        Keyword_return,
        Keyword_linksection,
        Keyword_stdcallcc,
        Keyword_struct,
        Keyword_suspend,
        Keyword_switch,
        Keyword_test,
        Keyword_threadlocal,
        Keyword_true,
        Keyword_try,
        Keyword_undefined,
        Keyword_union,
        Keyword_unreachable,
        Keyword_usingnamespace,
        Keyword_var,
        Keyword_volatile,
        Keyword_while,

        pub fn symbol(id: Id) []const u8 {
            return switch (id) {
                .Invalid => "Invalid",
                .Invalid_ampersands => "&&",
                .Identifier => "Identifier",
                .StringLiteral => "StringLiteral",
                .MultilineStringLiteralLine => "MultilineStringLiteralLine",
                .CharLiteral => "CharLiteral",
                .Eof => "Eof",
                .Builtin => "Builtin",
                .IntegerLiteral => "IntegerLiteral",
                .FloatLiteral => "FloatLiteral",
                .LineComment => "LineComment",
                .DocComment => "DocComment",
                .ContainerDocComment => "ContainerDocComment",
                .ShebangLine => "ShebangLine",

                .Bang => "!",
                .Pipe => "|",
                .PipePipe => "||",
                .PipeEqual => "|=",
                .Equal => "=",
                .EqualEqual => "==",
                .EqualAngleBracketRight => "=>",
                .BangEqual => "!=",
                .LParen => "(",
                .RParen => ")",
                .Semicolon => ";",
                .Percent => "%",
                .PercentEqual => "%=",
                .LBrace => "{",
                .RBrace => "}",
                .LBracket => "[",
                .RBracket => "]",
                .Period => ".",
                .PeriodAsterisk => ".*",
                .Ellipsis2 => "..",
                .Ellipsis3 => "...",
                .Caret => "^",
                .CaretEqual => "^=",
                .Plus => "+",
                .PlusPlus => "++",
                .PlusEqual => "+=",
                .PlusPercent => "+%",
                .PlusPercentEqual => "+%=",
                .Minus => "-",
                .MinusEqual => "-=",
                .MinusPercent => "-%",
                .MinusPercentEqual => "-%=",
                .Asterisk => "*",
                .AsteriskEqual => "*=",
                .AsteriskAsterisk => "**",
                .AsteriskPercent => "*%",
                .AsteriskPercentEqual => "*%=",
                .Arrow => "->",
                .Colon => ":",
                .Slash => "/",
                .SlashEqual => "/=",
                .Comma => ",",
                .Ampersand => "&",
                .AmpersandEqual => "&=",
                .QuestionMark => "?",
                .AngleBracketLeft => "<",
                .AngleBracketLeftEqual => "<=",
                .AngleBracketAngleBracketLeft => "<<",
                .AngleBracketAngleBracketLeftEqual => "<<=",
                .AngleBracketRight => ">",
                .AngleBracketRightEqual => ">=",
                .AngleBracketAngleBracketRight => ">>",
                .AngleBracketAngleBracketRightEqual => ">>=",
                .Tilde => "~",
                .Keyword_align => "align",
                .Keyword_allowzero => "allowzero",
                .Keyword_and => "and",
                .Keyword_anyframe => "anyframe",
                .Keyword_asm => "asm",
                .Keyword_async => "async",
                .Keyword_await => "await",
                .Keyword_break => "break",
                .Keyword_callconv => "callconv",
                .Keyword_catch => "catch",
                .Keyword_comptime => "comptime",
                .Keyword_const => "const",
                .Keyword_continue => "continue",
                .Keyword_defer => "defer",
                .Keyword_else => "else",
                .Keyword_enum => "enum",
                .Keyword_errdefer => "errdefer",
                .Keyword_error => "error",
                .Keyword_export => "export",
                .Keyword_extern => "extern",
                .Keyword_false => "false",
                .Keyword_fn => "fn",
                .Keyword_for => "for",
                .Keyword_if => "if",
                .Keyword_inline => "inline",
                .Keyword_nakedcc => "nakedcc",
                .Keyword_noalias => "noalias",
                .Keyword_noasync => "noasync",
                .Keyword_noinline => "noinline",
                .Keyword_null => "null",
                .Keyword_or => "or",
                .Keyword_orelse => "orelse",
                .Keyword_packed => "packed",
                .Keyword_pub => "pub",
                .Keyword_resume => "resume",
                .Keyword_return => "return",
                .Keyword_linksection => "linksection",
                .Keyword_stdcallcc => "stdcallcc",
                .Keyword_struct => "struct",
                .Keyword_suspend => "suspend",
                .Keyword_switch => "switch",
                .Keyword_test => "test",
                .Keyword_threadlocal => "threadlocal",
                .Keyword_true => "true",
                .Keyword_try => "try",
                .Keyword_undefined => "undefined",
                .Keyword_union => "union",
                .Keyword_unreachable => "unreachable",
                .Keyword_usingnamespace => "usingnamespace",
                .Keyword_var => "var",
                .Keyword_volatile => "volatile",
                .Keyword_while => "while",
            };
        }
    };
};

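// Typical driver loop (a minimal sketch, not part of the original file;
// `source` is any []const u8):
//
//     var tokenizer = Tokenizer.init(source);
//     while (true) {
//         const token = tokenizer.next();
//         if (token.id == .Eof) break;
//         tokenizer.dump(&token);
//     }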
pub const Tokenizer = struct {
    buffer: []const u8,
    index: usize,
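    // Set by checkLiteralCharacter() when a string literal or comment
    // contains an illegal byte; next() returns it on the following call.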
    pending_invalid_token: ?Token,

    /// For debugging purposes
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        std.debug.warn("{} \"{}\"\n", .{ @tagName(token.id), self.buffer[token.start..token.end] });
    }

    pub fn init(buffer: []const u8) Tokenizer {
        // Skip the UTF-8 BOM if present
        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
        return Tokenizer{
            .buffer = buffer,
            .index = src_start,
            .pending_invalid_token = null,
        };
    }

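    // Tokenizer states: one tag per partially-scanned construct. `Start`
    // means "between tokens"; the rest remember what prefix has been seen.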
    const State = enum {
        Start,
        Identifier,
        Builtin,
        StringLiteral,
        StringLiteralBackslash,
        MultilineStringLiteralLine,
        CharLiteral,
        CharLiteralBackslash,
        CharLiteralHexEscape,
        CharLiteralUnicodeEscapeSawU,
        CharLiteralUnicodeEscape,
        CharLiteralUnicodeInvalid,
        CharLiteralUnicode,
        CharLiteralEnd,
        Backslash,
        Equal,
        Bang,
        Pipe,
        Minus,
        MinusPercent,
        Asterisk,
        AsteriskPercent,
        Slash,
        LineCommentStart,
        LineComment,
        DocCommentStart,
        DocComment,
        ContainerDocComment,
        Zero,
        IntegerLiteral,
        IntegerLiteralWithRadix,
        IntegerLiteralWithRadixHex,
        NumberDot,
        NumberDotHex,
        FloatFraction,
        FloatFractionHex,
        FloatExponentUnsigned,
        FloatExponentUnsignedHex,
        FloatExponentNumber,
        FloatExponentNumberHex,
        Ampersand,
        Caret,
        Percent,
        Plus,
        PlusPercent,
        AngleBracketLeft,
        AngleBracketAngleBracketLeft,
        AngleBracketRight,
        AngleBracketAngleBracketRight,
        Period,
        Period2,
        SawAtSign,
    };

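    // Scans and returns exactly one token per call. The loop below is a
    // hand-rolled DFA: each iteration looks at (state, current byte) and
    // either stays, switches state, or breaks with the token id decided.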
    pub fn next(self: *Tokenizer) Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
            return token;
        }
        const start_index = self.index;
        var state = State.Start;
        var result = Token{
            .id = Token.Id.Eof,
            .start = self.index,
            .end = undefined,
        };
        var seen_escape_digits: usize = undefined;
        var remaining_code_units: usize = undefined;
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                State.Start => switch (c) {
                    ' ', '\n', '\t', '\r' => {
                        result.start = self.index + 1;
                    },
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id.StringLiteral;
                    },
                    '\'' => {
                        state = State.CharLiteral;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = State.Identifier;
                        result.id = Token.Id.Identifier;
                    },
                    '@' => {
                        state = State.SawAtSign;
                    },
                    '=' => {
                        state = State.Equal;
                    },
                    '!' => {
                        state = State.Bang;
                    },
                    '|' => {
                        state = State.Pipe;
                    },
                    '(' => {
                        result.id = Token.Id.LParen;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.id = Token.Id.RParen;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        result.id = .LBracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.id = Token.Id.RBracket;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.id = Token.Id.Semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.id = Token.Id.Comma;
                        self.index += 1;
                        break;
                    },
                    '?' => {
                        result.id = Token.Id.QuestionMark;
                        self.index += 1;
                        break;
                    },
                    ':' => {
                        result.id = Token.Id.Colon;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.Percent;
                    },
                    '*' => {
                        state = State.Asterisk;
                    },
                    '+' => {
                        state = State.Plus;
                    },
                    '<' => {
                        state = State.AngleBracketLeft;
                    },
                    '>' => {
                        state = State.AngleBracketRight;
                    },
                    '^' => {
                        state = State.Caret;
                    },
                    '\\' => {
                        state = State.Backslash;
                        result.id = Token.Id.MultilineStringLiteralLine;
                    },
                    '{' => {
                        result.id = Token.Id.LBrace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.id = Token.Id.RBrace;
                        self.index += 1;
                        break;
                    },
                    '~' => {
                        result.id = Token.Id.Tilde;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = State.Period;
                    },
                    '-' => {
                        state = State.Minus;
                    },
                    '/' => {
                        state = State.Slash;
                    },
                    '&' => {
                        state = State.Ampersand;
                    },
                    '0' => {
                        state = State.Zero;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    '1'...'9' => {
                        state = State.IntegerLiteral;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        self.index += 1;
                        break;
                    },
                },

                State.SawAtSign => switch (c) {
                    '"' => {
                        result.id = Token.Id.Identifier;
                        state = State.StringLiteral;
                    },
                    else => {
                        // reinterpret as a builtin
                        self.index -= 1;
                        state = State.Builtin;
                        result.id = Token.Id.Builtin;
                    },
                },

                State.Ampersand => switch (c) {
                    '&' => {
                        result.id = Token.Id.Invalid_ampersands;
                        self.index += 1;
                        break;
                    },
                    '=' => {
                        result.id = Token.Id.AmpersandEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ampersand;
                        break;
                    },
                },

                State.Asterisk => switch (c) {
                    '=' => {
                        result.id = Token.Id.AsteriskEqual;
                        self.index += 1;
                        break;
                    },
                    '*' => {
                        result.id = Token.Id.AsteriskAsterisk;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.AsteriskPercent;
                    },
                    else => {
                        result.id = Token.Id.Asterisk;
                        break;
                    },
                },

                State.AsteriskPercent => switch (c) {
                    '=' => {
                        result.id = Token.Id.AsteriskPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AsteriskPercent;
                        break;
                    },
                },

                State.Percent => switch (c) {
                    '=' => {
                        result.id = Token.Id.PercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Percent;
                        break;
                    },
                },

                State.Plus => switch (c) {
                    '=' => {
                        result.id = Token.Id.PlusEqual;
                        self.index += 1;
                        break;
                    },
                    '+' => {
                        result.id = Token.Id.PlusPlus;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.PlusPercent;
                    },
                    else => {
                        result.id = Token.Id.Plus;
                        break;
                    },
                },

                State.PlusPercent => switch (c) {
                    '=' => {
                        result.id = Token.Id.PlusPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.PlusPercent;
                        break;
                    },
                },

                State.Caret => switch (c) {
                    '=' => {
                        result.id = Token.Id.CaretEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Caret;
                        break;
                    },
                },

                State.Identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => {
                        if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                            result.id = id;
                        }
                        break;
                    },
                },
                State.Builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => break,
                },
                State.Backslash => switch (c) {
                    '\\' => {
                        state = State.MultilineStringLiteralLine;
                    },
                    else => break,
                },
                State.StringLiteral => switch (c) {
                    '\\' => {
                        state = State.StringLiteralBackslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    '\n', '\r' => break, // Look for this error later.
                    else => self.checkLiteralCharacter(),
                },

                State.StringLiteralBackslash => switch (c) {
                    '\n', '\r' => break, // Look for this error later.
                    else => {
                        state = State.StringLiteral;
                    },
                },

                State.CharLiteral => switch (c) {
                    '\\' => {
                        state = State.CharLiteralBackslash;
                    },
                    '\'', 0x80...0xbf, 0xf8...0xff => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                    0xc0...0xdf => { // 110xxxxx
                        remaining_code_units = 1;
                        state = State.CharLiteralUnicode;
                    },
                    0xe0...0xef => { // 1110xxxx
                        remaining_code_units = 2;
                        state = State.CharLiteralUnicode;
                    },
                    0xf0...0xf7 => { // 11110xxx
                        remaining_code_units = 3;
                        state = State.CharLiteralUnicode;
                    },
                    else => {
                        state = State.CharLiteralEnd;
                    },
                },

                State.CharLiteralBackslash => switch (c) {
                    '\n' => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                    'x' => {
                        state = State.CharLiteralHexEscape;
                        seen_escape_digits = 0;
                    },
                    'u' => {
                        state = State.CharLiteralUnicodeEscapeSawU;
                    },
                    else => {
                        state = State.CharLiteralEnd;
                    },
                },

                State.CharLiteralHexEscape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
                        if (seen_escape_digits == 2) {
                            state = State.CharLiteralEnd;
                        }
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.CharLiteralUnicodeEscapeSawU => switch (c) {
                    '{' => {
                        state = State.CharLiteralUnicodeEscape;
                        seen_escape_digits = 0;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        state = State.CharLiteralUnicodeInvalid;
                    },
                },

                State.CharLiteralUnicodeEscape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
                    },
                    '}' => {
                        if (seen_escape_digits == 0) {
                            result.id = Token.Id.Invalid;
                            state = State.CharLiteralUnicodeInvalid;
                        } else {
                            state = State.CharLiteralEnd;
                        }
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        state = State.CharLiteralUnicodeInvalid;
                    },
                },

                State.CharLiteralUnicodeInvalid => switch (c) {
                    // Keep consuming characters until an obvious stopping point.
                    // This consolidates e.g. `u{0ab1Q}` into a single invalid token
                    // instead of creating the tokens `u{0ab1`, `Q`, `}`
                    '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
                    else => break,
                },

                State.CharLiteralEnd => switch (c) {
                    '\'' => {
                        result.id = Token.Id.CharLiteral;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.CharLiteralUnicode => switch (c) {
                    0x80...0xbf => {
                        remaining_code_units -= 1;
                        if (remaining_code_units == 0) {
                            state = State.CharLiteralEnd;
                        }
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.MultilineStringLiteralLine => switch (c) {
                    '\n' => {
                        self.index += 1;
                        break;
                    },
                    else => self.checkLiteralCharacter(),
                },

                State.Bang => switch (c) {
                    '=' => {
                        result.id = Token.Id.BangEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Bang;
                        break;
                    },
                },

                State.Pipe => switch (c) {
                    '=' => {
                        result.id = Token.Id.PipeEqual;
                        self.index += 1;
                        break;
                    },
                    '|' => {
                        result.id = Token.Id.PipePipe;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Pipe;
                        break;
                    },
                },

                State.Equal => switch (c) {
                    '=' => {
                        result.id = Token.Id.EqualEqual;
                        self.index += 1;
                        break;
                    },
                    '>' => {
                        result.id = Token.Id.EqualAngleBracketRight;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Equal;
                        break;
                    },
                },

                State.Minus => switch (c) {
                    '>' => {
                        result.id = Token.Id.Arrow;
                        self.index += 1;
                        break;
                    },
                    '=' => {
                        result.id = Token.Id.MinusEqual;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = State.MinusPercent;
                    },
                    else => {
                        result.id = Token.Id.Minus;
                        break;
                    },
                },

                State.MinusPercent => switch (c) {
                    '=' => {
                        result.id = Token.Id.MinusPercentEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.MinusPercent;
                        break;
                    },
                },

                State.AngleBracketLeft => switch (c) {
                    '<' => {
                        state = State.AngleBracketAngleBracketLeft;
                    },
                    '=' => {
                        result.id = Token.Id.AngleBracketLeftEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketLeft;
                        break;
                    },
                },

                State.AngleBracketAngleBracketLeft => switch (c) {
                    '=' => {
                        result.id = Token.Id.AngleBracketAngleBracketLeftEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketAngleBracketLeft;
                        break;
                    },
                },

                State.AngleBracketRight => switch (c) {
                    '>' => {
                        state = State.AngleBracketAngleBracketRight;
                    },
                    '=' => {
                        result.id = Token.Id.AngleBracketRightEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketRight;
                        break;
                    },
                },

                State.AngleBracketAngleBracketRight => switch (c) {
                    '=' => {
                        result.id = Token.Id.AngleBracketAngleBracketRightEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.AngleBracketAngleBracketRight;
                        break;
                    },
                },

                State.Period => switch (c) {
                    '.' => {
                        state = State.Period2;
                    },
                    '*' => {
                        result.id = Token.Id.PeriodAsterisk;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Period;
                        break;
                    },
                },

                State.Period2 => switch (c) {
                    '.' => {
                        result.id = Token.Id.Ellipsis3;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ellipsis2;
                        break;
                    },
                },

                State.Slash => switch (c) {
                    '/' => {
                        state = State.LineCommentStart;
                        result.id = Token.Id.LineComment;
                    },
                    '=' => {
                        result.id = Token.Id.SlashEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Slash;
                        break;
                    },
                },
                State.LineCommentStart => switch (c) {
                    '/' => {
                        state = State.DocCommentStart;
                    },
                    '!' => {
                        result.id = Token.Id.ContainerDocComment;
                        state = State.ContainerDocComment;
                    },
                    '\n' => break,
                    else => {
                        state = State.LineComment;
                        self.checkLiteralCharacter();
                    },
                },
                State.DocCommentStart => switch (c) {
                    '/' => {
                        state = State.LineComment;
                    },
                    '\n' => {
                        result.id = Token.Id.DocComment;
                        break;
                    },
                    else => {
                        state = State.DocComment;
                        result.id = Token.Id.DocComment;
                        self.checkLiteralCharacter();
                    },
                },
                State.LineComment, State.DocComment, State.ContainerDocComment => switch (c) {
                    '\n' => break,
                    else => self.checkLiteralCharacter(),
                },
                State.Zero => switch (c) {
                    'b', 'o' => {
                        state = State.IntegerLiteralWithRadix;
                    },
                    'x' => {
                        state = State.IntegerLiteralWithRadixHex;
                    },
                    else => {
                        // reinterpret as a normal number
                        self.index -= 1;
                        state = State.IntegerLiteral;
                    },
                },
                State.IntegerLiteral => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    'p', 'P', 'e', 'E' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.IntegerLiteralWithRadix => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.IntegerLiteralWithRadixHex => switch (c) {
                    '.' => {
                        state = State.NumberDotHex;
                    },
                    'p', 'P' => {
                        state = State.FloatExponentUnsignedHex;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.NumberDot => switch (c) {
                    '.' => {
                        self.index -= 1;
                        state = State.Start;
                        break;
                    },
                    else => {
                        self.index -= 1;
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatFraction;
                    },
                },
                State.NumberDotHex => switch (c) {
                    '.' => {
                        self.index -= 1;
                        state = State.Start;
                        break;
                    },
                    else => {
                        self.index -= 1;
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatFractionHex;
                    },
                },
                State.FloatFraction => switch (c) {
                    'e', 'E' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9' => {},
                    else => break,
                },
                State.FloatFractionHex => switch (c) {
                    'p', 'P' => {
                        state = State.FloatExponentUnsignedHex;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.FloatExponentUnsigned => switch (c) {
                    '+', '-' => {
                        state = State.FloatExponentNumber;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = State.FloatExponentNumber;
                    },
                },
                State.FloatExponentUnsignedHex => switch (c) {
                    '+', '-' => {
                        state = State.FloatExponentNumberHex;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = State.FloatExponentNumberHex;
                    },
                },
                State.FloatExponentNumber => switch (c) {
                    '0'...'9' => {},
                    else => break,
                },
                State.FloatExponentNumberHex => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
            }
        } else if (self.index == self.buffer.len) {
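            // The input ended mid-token: map whatever state the loop was left
            // in onto a final token id (Invalid for constructs that may not
            // end at EOF, such as an unterminated char literal).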
            switch (state) {
                State.Start,
                State.IntegerLiteral,
                State.IntegerLiteralWithRadix,
                State.IntegerLiteralWithRadixHex,
                State.FloatFraction,
                State.FloatFractionHex,
                State.FloatExponentNumber,
                State.FloatExponentNumberHex,
                State.StringLiteral, // find this error later
                State.MultilineStringLiteralLine,
                State.Builtin,
                => {},

                State.Identifier => {
                    if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                        result.id = id;
                    }
                },
                State.LineCommentStart, State.LineComment => {
                    result.id = Token.Id.LineComment;
                },
                State.DocComment, State.DocCommentStart => {
                    result.id = Token.Id.DocComment;
                },
                State.ContainerDocComment => {
                    result.id = Token.Id.ContainerDocComment;
                },

                State.NumberDot,
                State.NumberDotHex,
                State.FloatExponentUnsigned,
                State.FloatExponentUnsignedHex,
                State.SawAtSign,
                State.Backslash,
                State.CharLiteral,
                State.CharLiteralBackslash,
                State.CharLiteralHexEscape,
                State.CharLiteralUnicodeEscapeSawU,
                State.CharLiteralUnicodeEscape,
                State.CharLiteralUnicodeInvalid,
                State.CharLiteralEnd,
                State.CharLiteralUnicode,
                State.StringLiteralBackslash,
                => {
                    result.id = Token.Id.Invalid;
                },

                State.Equal => {
                    result.id = Token.Id.Equal;
                },
                State.Bang => {
                    result.id = Token.Id.Bang;
                },
                State.Minus => {
                    result.id = Token.Id.Minus;
                },
                State.Slash => {
                    result.id = Token.Id.Slash;
                },
                State.Zero => {
                    result.id = Token.Id.IntegerLiteral;
                },
                State.Ampersand => {
                    result.id = Token.Id.Ampersand;
                },
                State.Period => {
                    result.id = Token.Id.Period;
                },
                State.Period2 => {
                    result.id = Token.Id.Ellipsis2;
                },
                State.Pipe => {
                    result.id = Token.Id.Pipe;
                },
                State.AngleBracketAngleBracketRight => {
                    result.id = Token.Id.AngleBracketAngleBracketRight;
                },
                State.AngleBracketRight => {
                    result.id = Token.Id.AngleBracketRight;
                },
                State.AngleBracketAngleBracketLeft => {
                    result.id = Token.Id.AngleBracketAngleBracketLeft;
                },
                State.AngleBracketLeft => {
                    result.id = Token.Id.AngleBracketLeft;
                },
                State.PlusPercent => {
                    result.id = Token.Id.PlusPercent;
                },
                State.Plus => {
                    result.id = Token.Id.Plus;
                },
                State.Percent => {
                    result.id = Token.Id.Percent;
                },
                State.Caret => {
                    result.id = Token.Id.Caret;
                },
                State.AsteriskPercent => {
                    result.id = Token.Id.AsteriskPercent;
                },
                State.Asterisk => {
                    result.id = Token.Id.Asterisk;
                },
                State.MinusPercent => {
                    result.id = Token.Id.MinusPercent;
                },
            }
        }

        if (result.id == Token.Id.Eof) {
            if (self.pending_invalid_token) |token| {
                self.pending_invalid_token = null;
                return token;
            }
        }

        result.end = self.index;
        return result;
    }

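    // Called on each byte of string literals and comments. If the byte starts
    // an illegal character (control code or banned/malformed UTF-8), queue an
    // Invalid token covering it; next() hands it out on the following call.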
    fn checkLiteralCharacter(self: *Tokenizer) void {
        if (self.pending_invalid_token != null) return;
        const invalid_length = self.getInvalidCharacterLength();
        if (invalid_length == 0) return;
        self.pending_invalid_token = Token{
            .id = Token.Id.Invalid,
            .start = self.index,
            .end = self.index + invalid_length,
        };
    }

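    // Returns the byte length of the invalid character at self.index, or 0 if
    // it is acceptable. Well-formed multi-byte UTF-8 is accepted (and its
    // continuation bytes skipped here) except for the Unicode newline-like
    // code points rejected below (NEL, LS, PS).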
    fn getInvalidCharacterLength(self: *Tokenizer) u3 {
        const c0 = self.buffer[self.index];
        if (c0 < 0x80) {
            if (c0 < 0x20 or c0 == 0x7f) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
            }
            // looks fine to me.
            return 0;
        } else {
            // check utf8-encoded character.
            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
            if (self.index + length > self.buffer.len) {
                return @intCast(u3, self.buffer.len - self.index);
            }
            const bytes = self.buffer[self.index .. self.index + length];
            switch (length) {
                2 => {
                    const value = std.unicode.utf8Decode2(bytes) catch return length;
                    if (value == 0x85) return length; // U+0085 (NEL)
                },
                3 => {
                    const value = std.unicode.utf8Decode3(bytes) catch return length;
                    if (value == 0x2028) return length; // U+2028 (LS)
                    if (value == 0x2029) return length; // U+2029 (PS)
                },
                4 => {
                    _ = std.unicode.utf8Decode4(bytes) catch return length;
                },
                else => unreachable,
            }
            self.index += length - 1;
            return 0;
        }
    }
};

test "tokenizer" {
|
|
testTokenize("test", &[_]Token.Id{Token.Id.Keyword_test});
|
|
}
|
|
|
|
test "tokenizer - unknown length pointer and then c pointer" {
|
|
testTokenize(
|
|
\\[*]u8
|
|
\\[*c]u8
|
|
, &[_]Token.Id{
|
|
Token.Id.LBracket,
|
|
Token.Id.Asterisk,
|
|
Token.Id.RBracket,
|
|
Token.Id.Identifier,
|
|
Token.Id.LBracket,
|
|
Token.Id.Asterisk,
|
|
Token.Id.Identifier,
|
|
Token.Id.RBracket,
|
|
Token.Id.Identifier,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - char literal with hex escape" {
|
|
testTokenize(
|
|
\\'\x1b'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
testTokenize(
|
|
\\'\x1'
|
|
, &[_]Token.Id{ .Invalid, .Invalid });
|
|
}
|
|
|
|
test "tokenizer - char literal with unicode escapes" {
|
|
// Valid unicode escapes
|
|
testTokenize(
|
|
\\'\u{3}'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
testTokenize(
|
|
\\'\u{01}'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
testTokenize(
|
|
\\'\u{2a}'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
testTokenize(
|
|
\\'\u{3f9}'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
testTokenize(
|
|
\\'\u{6E09aBc1523}'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
testTokenize(
|
|
\\"\u{440}"
|
|
, &[_]Token.Id{.StringLiteral});
|
|
|
|
// Invalid unicode escapes
|
|
testTokenize(
|
|
\\'\u'
|
|
, &[_]Token.Id{.Invalid});
|
|
testTokenize(
|
|
\\'\u{{'
|
|
, &[_]Token.Id{ .Invalid, .Invalid });
|
|
testTokenize(
|
|
\\'\u{}'
|
|
, &[_]Token.Id{ .Invalid, .Invalid });
|
|
testTokenize(
|
|
\\'\u{s}'
|
|
, &[_]Token.Id{ .Invalid, .Invalid });
|
|
testTokenize(
|
|
\\'\u{2z}'
|
|
, &[_]Token.Id{ .Invalid, .Invalid });
|
|
testTokenize(
|
|
\\'\u{4a'
|
|
, &[_]Token.Id{.Invalid});
|
|
|
|
// Test old-style unicode literals
|
|
testTokenize(
|
|
\\'\u0333'
|
|
, &[_]Token.Id{ .Invalid, .Invalid });
|
|
testTokenize(
|
|
\\'\U0333'
|
|
, &[_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
|
|
}
|
|
|
|
test "tokenizer - char literal with unicode code point" {
|
|
testTokenize(
|
|
\\'💩'
|
|
, &[_]Token.Id{.CharLiteral});
|
|
}
|
|
|
|
test "tokenizer - float literal e exponent" {
|
|
testTokenize("a = 4.94065645841246544177e-324;\n", &[_]Token.Id{
|
|
Token.Id.Identifier,
|
|
Token.Id.Equal,
|
|
Token.Id.FloatLiteral,
|
|
Token.Id.Semicolon,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - float literal p exponent" {
|
|
testTokenize("a = 0x1.a827999fcef32p+1022;\n", &[_]Token.Id{
|
|
Token.Id.Identifier,
|
|
Token.Id.Equal,
|
|
Token.Id.FloatLiteral,
|
|
Token.Id.Semicolon,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - chars" {
|
|
testTokenize("'c'", &[_]Token.Id{Token.Id.CharLiteral});
|
|
}
|
|
|
|
test "tokenizer - invalid token characters" {
|
|
testTokenize("#", &[_]Token.Id{Token.Id.Invalid});
|
|
testTokenize("`", &[_]Token.Id{Token.Id.Invalid});
|
|
testTokenize("'c", &[_]Token.Id{Token.Id.Invalid});
|
|
testTokenize("'", &[_]Token.Id{Token.Id.Invalid});
|
|
testTokenize("''", &[_]Token.Id{ Token.Id.Invalid, Token.Id.Invalid });
|
|
}
|
|
|
|
test "tokenizer - invalid literal/comment characters" {
|
|
testTokenize("\"\x00\"", &[_]Token.Id{
|
|
Token.Id.StringLiteral,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\x00", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\x1f", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\x7f", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - utf8" {
|
|
testTokenize("//\xc2\x80", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("//\xf4\x8f\xbf\xbf", &[_]Token.Id{Token.Id.LineComment});
|
|
}
|
|
|
|
test "tokenizer - invalid utf8" {
|
|
testTokenize("//\x80", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xbf", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xf8", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xff", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xc2\xc0", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xe0", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xf0", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xf0\x90\x80\xc0", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - illegal unicode codepoints" {
|
|
// unicode newline characters.U+0085, U+2028, U+2029
|
|
testTokenize("//\xc2\x84", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("//\xc2\x85", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xc2\x86", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("//\xe2\x80\xa7", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("//\xe2\x80\xa8", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xe2\x80\xa9", &[_]Token.Id{
|
|
Token.Id.LineComment,
|
|
Token.Id.Invalid,
|
|
});
|
|
testTokenize("//\xe2\x80\xaa", &[_]Token.Id{Token.Id.LineComment});
|
|
}
|
|
|
|
test "tokenizer - string identifier and builtin fns" {
|
|
testTokenize(
|
|
\\const @"if" = @import("std");
|
|
, &[_]Token.Id{
|
|
Token.Id.Keyword_const,
|
|
Token.Id.Identifier,
|
|
Token.Id.Equal,
|
|
Token.Id.Builtin,
|
|
Token.Id.LParen,
|
|
Token.Id.StringLiteral,
|
|
Token.Id.RParen,
|
|
Token.Id.Semicolon,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - pipe and then invalid" {
|
|
testTokenize("||=", &[_]Token.Id{
|
|
Token.Id.PipePipe,
|
|
Token.Id.Equal,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - line comment and doc comment" {
|
|
testTokenize("//", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("// a / b", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("// /", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("/// a", &[_]Token.Id{Token.Id.DocComment});
|
|
testTokenize("///", &[_]Token.Id{Token.Id.DocComment});
|
|
testTokenize("////", &[_]Token.Id{Token.Id.LineComment});
|
|
testTokenize("//!", &[_]Token.Id{Token.Id.ContainerDocComment});
|
|
testTokenize("//!!", &[_]Token.Id{Token.Id.ContainerDocComment});
|
|
}
|
|
|
|
test "tokenizer - line comment followed by identifier" {
|
|
testTokenize(
|
|
\\ Unexpected,
|
|
\\ // another
|
|
\\ Another,
|
|
, &[_]Token.Id{
|
|
Token.Id.Identifier,
|
|
Token.Id.Comma,
|
|
Token.Id.LineComment,
|
|
Token.Id.Identifier,
|
|
Token.Id.Comma,
|
|
});
|
|
}
|
|
|
|
test "tokenizer - UTF-8 BOM is recognized and skipped" {
|
|
testTokenize("\xEF\xBB\xBFa;\n", &[_]Token.Id{
|
|
Token.Id.Identifier,
|
|
Token.Id.Semicolon,
|
|
});
|
|
}
|
|
|
|
test "correctly parse pointer assignment" {
|
|
testTokenize("b.*=3;\n", &[_]Token.Id{
|
|
Token.Id.Identifier,
|
|
Token.Id.PeriodAsterisk,
|
|
Token.Id.Equal,
|
|
Token.Id.IntegerLiteral,
|
|
Token.Id.Semicolon,
|
|
});
|
|
}
|
|
|
|
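// Shared test harness: feeds `source` to the tokenizer and asserts the exact
// sequence of token ids, followed by Eof.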
fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
    var tokenizer = Tokenizer.init(source);
    for (expected_tokens) |expected_token_id| {
        const token = tokenizer.next();
        if (token.id != expected_token_id) {
            std.debug.panic("expected {}, found {}\n", .{ @tagName(expected_token_id), @tagName(token.id) });
        }
    }
    const last_token = tokenizer.next();
    std.testing.expect(last_token.id == Token.Id.Eof);
}