different strategy for tokenizing keywords

throughput: 279 MiB/s => 347 MiB/s
master
Andrew Kelley 2020-05-26 14:07:47 -04:00
parent b6e1670d2b
commit ba41a9d5d7
1 changed file with 40 additions and 9 deletions
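The diff below replaces a hash-and-compare keyword lookup with a length-bucketed one: the keyword table is sorted by length at comptime, a candidate identifier is first bounds-checked against the shortest and longest keyword, and then compared byte-for-byte only against keywords of exactly its length. A minimal standalone sketch of that idea, with hypothetical names and a trimmed keyword list (not the committed code, which additionally precomputes a per-length start index into the sorted table):

const std = @import("std");

// Hypothetical miniature of the idea: reject most identifiers by length
// alone, then compare bytes only against keywords of exactly that length.
const keyword_strings = [_][]const u8{ "fn", "if", "for", "var", "while", "return" };

fn isKeyword(bytes: []const u8) bool {
    for (keyword_strings) |kw| {
        if (kw.len != bytes.len) continue; // cheap length filter
        if (std.mem.eql(u8, kw, bytes)) return true;
    }
    return false;
}

pub fn main() void {
    std.debug.assert(isKeyword("while"));
    std.debug.assert(!isKeyword("whale")); // same length, different bytes
    std.debug.assert(!isKeyword("very_long_identifier")); // rejected by length
}

Most identifiers get rejected by the length check or by the first mismatching byte, without hashing the whole string, which is plausibly where the throughput gain comes from.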

@@ -13,14 +13,11 @@ pub const Token = struct {
     pub const Keyword = struct {
         bytes: []const u8,
         id: Id,
-        hash: u32,
 
         fn init(bytes: []const u8, id: Id) Keyword {
-            @setEvalBranchQuota(2000);
             return .{
                 .bytes = bytes,
                 .id = id,
-                .hash = std.hash_map.hashString(bytes),
             };
         }
     };
@@ -79,15 +76,49 @@ pub const Token = struct {
         Keyword.init("while", .Keyword_while),
     };
 
-    // TODO perfect hash at comptime
     pub fn getKeyword(bytes: []const u8) ?Id {
-        var hash = std.hash_map.hashString(bytes);
-        for (keywords) |kw| {
-            if (kw.hash == hash and mem.eql(u8, kw.bytes, bytes)) {
-                return kw.id;
-            }
+        const precomputed = comptime blk: {
+            @setEvalBranchQuota(2000);
+            var sorted_keywords = keywords;
+            const lenAsc = (struct {
+                fn lenAsc(a: Keyword, b: Keyword) bool {
+                    return a.bytes.len < b.bytes.len;
+                }
+            }).lenAsc;
+            std.sort.sort(Keyword, &sorted_keywords, lenAsc);
+            const min_len = sorted_keywords[0].bytes.len;
+            const max_len = sorted_keywords[sorted_keywords.len - 1].bytes.len;
+            var len_indexes: [max_len + 1]usize = undefined;
+            var len: usize = 0;
+            var kw_i: usize = 0;
+            while (len <= max_len) : (len += 1) {
+                // find the first keyword len == len
+                while (len > sorted_keywords[kw_i].bytes.len) {
+                    kw_i += 1;
+                }
+                len_indexes[len] = kw_i;
+            }
+            break :blk .{
+                .min_len = min_len,
+                .max_len = max_len,
+                .sorted_keywords = sorted_keywords,
+                .len_indexes = len_indexes,
+            };
+        };
+        if (bytes.len < precomputed.min_len or bytes.len > precomputed.max_len)
+            return null;
+        var i = precomputed.len_indexes[bytes.len];
+        while (true) {
+            const kw = precomputed.sorted_keywords[i];
+            if (kw.bytes.len != bytes.len)
+                return null;
+            if (mem.eql(u8, kw.bytes, bytes))
+                return kw.id;
+            i += 1;
+            if (i >= precomputed.sorted_keywords.len)
+                return null;
         }
         return null;
     }
 
     pub const Id = enum {
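For reference, a small usage sketch against the getKeyword API shown in the diff; the std.zig.Token import path is an assumption about where this tokenizer lives in the standard library of that era:

const std = @import("std");
const Token = std.zig.Token;

pub fn main() void {
    // Keywords resolve to their dedicated ids; anything else yields null.
    std.debug.assert(Token.getKeyword("while").? == .Keyword_while);
    std.debug.assert(Token.getKeyword("not_a_keyword") == null);
}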