// zig/src-self-hosted/tokenizer.zig

const std = @import("std");
const mem = std.mem;
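
/// A single token: its id plus the [start, end) byte offsets into the source buffer.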
pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,

    const KeywordId = struct {
        bytes: []const u8,
        id: Id,
    };
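
    // Keyword table; getKeyword resolves identifiers against it with a linear scan.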
    const keywords = []KeywordId {
        KeywordId{.bytes="align", .id = Id.Keyword_align},
        KeywordId{.bytes="and", .id = Id.Keyword_and},
        KeywordId{.bytes="asm", .id = Id.Keyword_asm},
        KeywordId{.bytes="break", .id = Id.Keyword_break},
        KeywordId{.bytes="coldcc", .id = Id.Keyword_coldcc},
        KeywordId{.bytes="comptime", .id = Id.Keyword_comptime},
        KeywordId{.bytes="const", .id = Id.Keyword_const},
        KeywordId{.bytes="continue", .id = Id.Keyword_continue},
        KeywordId{.bytes="defer", .id = Id.Keyword_defer},
        KeywordId{.bytes="else", .id = Id.Keyword_else},
        KeywordId{.bytes="enum", .id = Id.Keyword_enum},
        KeywordId{.bytes="error", .id = Id.Keyword_error},
        KeywordId{.bytes="export", .id = Id.Keyword_export},
        KeywordId{.bytes="extern", .id = Id.Keyword_extern},
        KeywordId{.bytes="false", .id = Id.Keyword_false},
        KeywordId{.bytes="fn", .id = Id.Keyword_fn},
        KeywordId{.bytes="for", .id = Id.Keyword_for},
        KeywordId{.bytes="goto", .id = Id.Keyword_goto},
        KeywordId{.bytes="if", .id = Id.Keyword_if},
        KeywordId{.bytes="inline", .id = Id.Keyword_inline},
        KeywordId{.bytes="nakedcc", .id = Id.Keyword_nakedcc},
        KeywordId{.bytes="noalias", .id = Id.Keyword_noalias},
        KeywordId{.bytes="null", .id = Id.Keyword_null},
        KeywordId{.bytes="or", .id = Id.Keyword_or},
        KeywordId{.bytes="packed", .id = Id.Keyword_packed},
        KeywordId{.bytes="pub", .id = Id.Keyword_pub},
        KeywordId{.bytes="return", .id = Id.Keyword_return},
        KeywordId{.bytes="stdcallcc", .id = Id.Keyword_stdcallcc},
        KeywordId{.bytes="struct", .id = Id.Keyword_struct},
        KeywordId{.bytes="switch", .id = Id.Keyword_switch},
        KeywordId{.bytes="test", .id = Id.Keyword_test},
        KeywordId{.bytes="this", .id = Id.Keyword_this},
        KeywordId{.bytes="true", .id = Id.Keyword_true},
        KeywordId{.bytes="undefined", .id = Id.Keyword_undefined},
        KeywordId{.bytes="union", .id = Id.Keyword_union},
        KeywordId{.bytes="unreachable", .id = Id.Keyword_unreachable},
        KeywordId{.bytes="use", .id = Id.Keyword_use},
        KeywordId{.bytes="var", .id = Id.Keyword_var},
        KeywordId{.bytes="volatile", .id = Id.Keyword_volatile},
        KeywordId{.bytes="while", .id = Id.Keyword_while},
    };
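
    /// Returns the keyword id when `bytes` matches an entry in `keywords`, else null.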
    fn getKeyword(bytes: []const u8) -> ?Id {
        for (keywords) |kw| {
            if (mem.eql(u8, kw.bytes, bytes)) {
                return kw.id;
            }
        }
        return null;
    }

    const StrLitKind = enum {Normal, C};
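
    // Token ids; StringLiteral carries its kind (normal "..." or c"...") as payload.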
    pub const Id = union(enum) {
        Invalid,
        Identifier,
        StringLiteral: StrLitKind,
        Eof,
        NoEolAtEof,
        Builtin,
        Bang,
        Equal,
        EqualEqual,
        BangEqual,
        LParen,
        RParen,
        Semicolon,
        Percent,
        LBrace,
        RBrace,
        Period,
        Ellipsis2,
        Ellipsis3,
        Minus,
        Arrow,
        Colon,
        Slash,
        Comma,
        Ampersand,
        AmpersandEqual,
        IntegerLiteral,
        FloatLiteral,
        Keyword_align,
        Keyword_and,
        Keyword_asm,
        Keyword_break,
        Keyword_coldcc,
        Keyword_comptime,
        Keyword_const,
        Keyword_continue,
        Keyword_defer,
        Keyword_else,
        Keyword_enum,
        Keyword_error,
        Keyword_export,
        Keyword_extern,
        Keyword_false,
        Keyword_fn,
        Keyword_for,
        Keyword_goto,
        Keyword_if,
        Keyword_inline,
        Keyword_nakedcc,
        Keyword_noalias,
        Keyword_null,
        Keyword_or,
        Keyword_packed,
        Keyword_pub,
        Keyword_return,
        Keyword_stdcallcc,
        Keyword_struct,
        Keyword_switch,
        Keyword_test,
        Keyword_this,
        Keyword_true,
        Keyword_undefined,
        Keyword_union,
        Keyword_unreachable,
        Keyword_use,
        Keyword_var,
        Keyword_volatile,
        Keyword_while,
    };
};
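
// `buffer` is the source truncated at the last '\n' (see init); `actual_file_end`
// remembers the untruncated length so next() can report a missing trailing newline.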
pub const Tokenizer = struct {
    buffer: []const u8,
    index: usize,
    actual_file_end: usize,
    pending_invalid_token: ?Token,

    pub const Location = struct {
        line: usize,
        column: usize,
        line_start: usize,
        line_end: usize,
    };
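
    /// Recomputes the token's line and column by scanning the buffer from the
    /// start, so it is O(file size); presumably meant for error reporting
    /// rather than hot paths.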
    pub fn getTokenLocation(self: &Tokenizer, token: &const Token) -> Location {
        var loc = Location {
            .line = 0,
            .column = 0,
            .line_start = 0,
            .line_end = 0,
        };
        for (self.buffer) |c, i| {
            if (i == token.start) {
                loc.line_end = i;
                while (loc.line_end < self.buffer.len and self.buffer[loc.line_end] != '\n') : (loc.line_end += 1) {}
                return loc;
            }
            if (c == '\n') {
                loc.line += 1;
                loc.column = 0;
                loc.line_start = i + 1;
            } else {
                loc.column += 1;
            }
        }
        return loc;
    }

    /// For debugging purposes
    pub fn dump(self: &Tokenizer, token: &const Token) {
        std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
    }
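
    /// Trims any incomplete last line (one with no trailing '\n') off the buffer;
    /// next() reports Id.NoEolAtEof when tokenizing reaches the truncated end.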
    pub fn init(buffer: []const u8) -> Tokenizer {
        var source_len = buffer.len;
        while (source_len > 0) : (source_len -= 1) {
            if (buffer[source_len - 1] == '\n') break;
            // last line is incomplete, so skip it, and give an error when we get there.
        }
        return Tokenizer {
            .buffer = buffer[0..source_len],
            .index = 0,
            .actual_file_end = buffer.len,
            .pending_invalid_token = null,
        };
    }
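
    // States of the hand-written state machine in next().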
    const State = enum {
        Start,
        Identifier,
        Builtin,
        C,
        StringLiteral,
        StringLiteralBackslash,
        Equal,
        Bang,
        Minus,
        Slash,
        LineComment,
        Zero,
        IntegerLiteral,
        NumberDot,
        FloatFraction,
        FloatExponentUnsigned,
        FloatExponentNumber,
        Ampersand,
        Period,
        Period2,
    };
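
    /// Scans and returns the next token. One byte is consumed per loop
    /// iteration; each state either extends the current token, switches
    /// state, or breaks with a finished token.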
    pub fn next(self: &Tokenizer) -> Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
            return token;
        }
        var state = State.Start;
        var result = Token {
            .id = Token.Id.Eof,
            .start = self.index,
            .end = undefined,
        };
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                State.Start => switch (c) {
                    ' ', '\n' => {
                        result.start = self.index + 1;
                    },
                    'c' => {
                        state = State.C;
                        result.id = Token.Id.Identifier;
                    },
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id { .StringLiteral = Token.StrLitKind.Normal };
                    },
                    'a'...'b', 'd'...'z', 'A'...'Z', '_' => {
                        state = State.Identifier;
                        result.id = Token.Id.Identifier;
                    },
                    '@' => {
                        state = State.Builtin;
                        result.id = Token.Id.Builtin;
                    },
                    '=' => {
                        state = State.Equal;
                    },
                    '!' => {
                        state = State.Bang;
                    },
                    '(' => {
                        result.id = Token.Id.LParen;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.id = Token.Id.RParen;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.id = Token.Id.Semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.id = Token.Id.Comma;
                        self.index += 1;
                        break;
                    },
                    ':' => {
                        result.id = Token.Id.Colon;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        result.id = Token.Id.Percent;
                        self.index += 1;
                        break;
                    },
                    '{' => {
                        result.id = Token.Id.LBrace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.id = Token.Id.RBrace;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = State.Period;
                    },
                    '-' => {
                        state = State.Minus;
                    },
                    '/' => {
                        state = State.Slash;
                    },
                    '&' => {
                        state = State.Ampersand;
                    },
                    '0' => {
                        state = State.Zero;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    '1'...'9' => {
                        state = State.IntegerLiteral;
                        result.id = Token.Id.IntegerLiteral;
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        self.index += 1;
                        break;
                    },
                },
                State.Ampersand => switch (c) {
                    '=' => {
                        result.id = Token.Id.AmpersandEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ampersand;
                        break;
                    },
                },
                State.Identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => {
                        if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                            result.id = id;
                        }
                        break;
                    },
                },
                State.Builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => break,
                },
                State.C => switch (c) {
                    '\\' => @panic("TODO"),
                    '"' => {
                        state = State.StringLiteral;
                        result.id = Token.Id { .StringLiteral = Token.StrLitKind.C };
                    },
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {
                        state = State.Identifier;
                    },
                    else => break,
                },
                State.StringLiteral => switch (c) {
                    '\\' => {
                        state = State.StringLiteralBackslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    '\n' => break, // Look for this error later.
                    else => self.checkLiteralCharacter(),
                },
                State.StringLiteralBackslash => switch (c) {
                    '\n' => break, // Look for this error later.
                    else => {
                        state = State.StringLiteral;
                    },
                },
                State.Bang => switch (c) {
                    '=' => {
                        result.id = Token.Id.BangEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Bang;
                        break;
                    },
                },
                State.Equal => switch (c) {
                    '=' => {
                        result.id = Token.Id.EqualEqual;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Equal;
                        break;
                    },
                },
                State.Minus => switch (c) {
                    '>' => {
                        result.id = Token.Id.Arrow;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Minus;
                        break;
                    },
                },
                State.Period => switch (c) {
                    '.' => {
                        state = State.Period2;
                    },
                    else => {
                        result.id = Token.Id.Period;
                        break;
                    },
                },
                State.Period2 => switch (c) {
                    '.' => {
                        result.id = Token.Id.Ellipsis3;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.id = Token.Id.Ellipsis2;
                        break;
                    },
                },
                State.Slash => switch (c) {
                    '/' => {
                        result.id = undefined;
                        state = State.LineComment;
                    },
                    else => {
                        result.id = Token.Id.Slash;
                        break;
                    },
                },
                State.LineComment => switch (c) {
                    '\n' => {
                        state = State.Start;
                        result = Token {
                            .id = Token.Id.Eof,
                            .start = self.index + 1,
                            .end = undefined,
                        };
                    },
                    else => self.checkLiteralCharacter(),
                },
                State.Zero => switch (c) {
                    'b', 'o', 'x' => {
                        state = State.IntegerLiteral;
                    },
                    else => {
                        // reinterpret as a normal number
                        self.index -= 1;
                        state = State.IntegerLiteral;
                    },
                },
                State.IntegerLiteral => switch (c) {
                    '.' => {
                        state = State.NumberDot;
                    },
                    'p', 'P', 'e', 'E' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.NumberDot => switch (c) {
                    '.' => {
                        self.index -= 1;
                        state = State.Start;
                        break;
                    },
                    else => {
                        self.index -= 1;
                        result.id = Token.Id.FloatLiteral;
                        state = State.FloatFraction;
                    },
                },
                State.FloatFraction => switch (c) {
                    'p', 'P', 'e', 'E' => {
                        state = State.FloatExponentUnsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
                State.FloatExponentUnsigned => switch (c) {
                    '+', '-' => {
                        state = State.FloatExponentNumber;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = State.FloatExponentNumber;
                    },
                },
                State.FloatExponentNumber => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    else => break,
                },
            }
        }
        result.end = self.index;
        if (result.id == Token.Id.Eof) {
            if (self.pending_invalid_token) |token| {
                self.pending_invalid_token = null;
                return token;
            }
            if (self.actual_file_end != self.buffer.len) {
                // instead of an Eof, give an error token
                result.id = Token.Id.NoEolAtEof;
                result.end = self.actual_file_end;
            }
        }
        return result;
    }

    pub fn getTokenSlice(self: &const Tokenizer, token: &const Token) -> []const u8 {
        return self.buffer[token.start..token.end];
    }
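
    // Called for each byte inside string literals and line comments; records at
    // most one pending Invalid token covering a disallowed character, which
    // next() hands out on the following call.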
    fn checkLiteralCharacter(self: &Tokenizer) {
        if (self.pending_invalid_token != null) return;
        const invalid_length = self.getInvalidCharacterLength();
        if (invalid_length == 0) return;
        self.pending_invalid_token = Token {
            .id = Token.Id.Invalid,
            .start = self.index,
            .end = self.index + invalid_length,
        };
    }
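
    // Validates the character at self.index. Returns 0 for a valid character
    // (advancing past the extra bytes of a multi-byte UTF-8 sequence), or the
    // number of bytes to flag as invalid. Rejects ASCII control codes, overlong
    // encodings, surrogate halves, codepoints beyond U+10FFFF, and the unicode
    // line separators U+0085, U+2028, and U+2029.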
    fn getInvalidCharacterLength(self: &Tokenizer) -> u3 {
        const c0 = self.buffer[self.index];
        if (c0 < 0x80) {
            if (c0 < 0x20 or c0 == 0x7f) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
            }
            // looks fine to me.
            return 0;
        } else {
            // check utf8-encoded character.
            // remember that the last byte in the buffer is guaranteed to be '\n',
            // which means we really don't need to do bounds checks here,
            // as long as we check one byte at a time for being a continuation byte.
            var value: u32 = undefined;
            var length: u3 = undefined;
            if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
            else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
            else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
            else return 1; // unexpected continuation or too many leading 1's
            const c1 = self.buffer[self.index + 1];
            if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
            value <<= 6;
            value |= c1 & 0b00111111;
            if (length == 2) {
                if (value < 0x80) return length; // overlong
                if (value == 0x85) return length; // U+0085 (NEL)
                self.index += length - 1;
                return 0;
            }
            const c2 = self.buffer[self.index + 2];
            if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
            value <<= 6;
            value |= c2 & 0b00111111;
            if (length == 3) {
                if (value < 0x800) return length; // overlong
                if (value == 0x2028) return length; // U+2028 (LS)
                if (value == 0x2029) return length; // U+2029 (PS)
                if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
                self.index += length - 1;
                return 0;
            }
            const c3 = self.buffer[self.index + 3];
            if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
            value <<= 6;
            value |= c3 & 0b00111111;
            if (length == 4) {
                if (value < 0x10000) return length; // overlong
                if (value > 0x10FFFF) return length; // out of bounds
                self.index += length - 1;
                return 0;
            }
            unreachable;
        }
    }
};
test "tokenizer - source must end with eol" {
testTokenizeWithEol("", []Token.Id {
}, true);
testTokenizeWithEol("no newline", []Token.Id {
}, false);
testTokenizeWithEol("test\n", []Token.Id {
Token.Id.Keyword_test,
}, true);
testTokenizeWithEol("test\nno newline", []Token.Id {
Token.Id.Keyword_test,
}, false);
}
test "tokenizer - invalid token characters" {
testTokenize("#\n", []Token.Id{Token.Id.Invalid});
testTokenize("`\n", []Token.Id{Token.Id.Invalid});
}
test "tokenizer - invalid literal/comment characters" {
testTokenize("\"\x00\"\n", []Token.Id {
Token.Id { .StringLiteral = Token.StrLitKind.Normal },
Token.Id.Invalid,
});
testTokenize("//\x00\n", []Token.Id {
Token.Id.Invalid,
});
testTokenize("//\x1f\n", []Token.Id {
Token.Id.Invalid,
});
testTokenize("//\x7f\n", []Token.Id {
Token.Id.Invalid,
});
}
test "tokenizer - valid unicode" {
testTokenize("//\xc2\x80\n", []Token.Id{});
testTokenize("//\xdf\xbf\n", []Token.Id{});
testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
testTokenize("//\xe1\x80\x80\n", []Token.Id{});
testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
}
test "tokenizer - invalid unicode continuation bytes" {
// unexpected continuation
testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
// too many leading 1's
testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
// expected continuation for 2 byte sequences
testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
// expected continuation for 3 byte sequences
testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
// expected continuation for 4 byte sequences
testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
}
test "tokenizer - overlong utf8 codepoint" {
testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
}
test "tokenizer - misc invalid utf8" {
// codepoint out of bounds
testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
// unicode newline characters.U+0085, U+2028, U+2029
testTokenize("//\xc2\x84\n", []Token.Id{});
testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xc2\x86\n", []Token.Id{});
testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
// surrogate halves
testTokenize("//\xed\x9f\x80\n", []Token.Id{});
testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
testTokenize("//\xee\x80\x80\n", []Token.Id{});
// surrogate halves are invalid, even in surrogate pairs
testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
}
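
// Test helpers: tokenize `source`, assert the sequence of token ids (and string
// literal kinds), then expect a final Eof, or NoEolAtEof when the source lacks
// a trailing newline.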
fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
    testTokenizeWithEol(source, expected_tokens, true);
}

fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
    var tokenizer = Tokenizer.init(source);
    for (expected_tokens) |expected_token_id| {
        const token = tokenizer.next();
        std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
        switch (expected_token_id) {
            Token.Id.StringLiteral => |expected_kind| {
                std.debug.assert(expected_kind == switch (token.id) {
                    Token.Id.StringLiteral => |kind| kind,
                    else => unreachable,
                });
            },
            else => {},
        }
    }
    std.debug.assert(tokenizer.next().id == if (expected_eol_at_eof) Token.Id.Eof else Token.Id.NoEolAtEof);
}
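
// A minimal usage sketch added for illustration (not in the original file):
// tokenizes one declaration and checks the expected token sequence, using the
// same helper as the tests above.
test "tokenizer - basic usage example" {
    testTokenize("const x = 1;\n", []Token.Id {
        Token.Id.Keyword_const,
        Token.Id.Identifier,
        Token.Id.Equal,
        Token.Id.IntegerLiteral,
        Token.Id.Semicolon,
    });
}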