const std = @import("std");
const expect = std.testing.expect;

pub const TokenList = std.SegmentedList(CToken, 32);

pub const CToken = struct {
    id: Id,
    bytes: []const u8,
    num_lit_suffix: NumLitSuffix = .None,

    pub const Id = enum {
        CharLit,
        StrLit,
        NumLitInt,
        NumLitFloat,
        Identifier,
        Minus,
        Slash,
        LParen,
        RParen,
        Eof,
        Dot,
        Asterisk,
        Bang,
        Tilde,
        Shl,
        Lt,
        Comma,
        Fn,
    };

    pub const NumLitSuffix = enum {
        None,
        F,
        L,
        U,
        LU,
        LL,
        LLU,
    };
};

/// Tokenize a single C macro definition (name plus replacement text) into `tl`,
/// ending with an `.Eof` token. A `.Fn` marker token is pushed right after the
/// macro name when the name is immediately followed by `(` (a function-like macro).
pub fn tokenizeCMacro(tl: *TokenList, chars: [*:0]const u8) !void {
    var index: usize = 0;
    var first = true;
    while (true) {
        const tok = try next(chars, &index);
        if (tok.id == .StrLit or tok.id == .CharLit)
            try tl.push(try zigifyEscapeSequences(tl.allocator, tok))
        else
            try tl.push(tok);
        if (tok.id == .Eof)
            return;
        if (first) {
            // distinguish NAME (EXPR) from NAME(ARGS)
            first = false;
            if (chars[index] == '(') {
                try tl.push(.{
                    .id = .Fn,
                    .bytes = "",
                });
            }
        }
    }
}

/// Rewrite C escape sequences in a string or char literal token into their Zig
/// spellings (e.g. `\a` -> `\x07`). Returns the token unchanged when it
/// contains no backslash.
fn zigifyEscapeSequences(allocator: *std.mem.Allocator, tok: CToken) !CToken {
    // Fast path: literals without a backslash need no rewriting.
    for (tok.bytes) |c| {
        if (c == '\\') {
            break;
        }
    } else return tok;
    // Worst case each escape doubles in size (e.g. `\a` -> `\x07`).
    var bytes = try allocator.alloc(u8, tok.bytes.len * 2);
    var escape = false;
    var i: usize = 0;
    for (tok.bytes) |c| {
        if (escape) {
            switch (c) {
                'n', 'r', 't', '\\', '\'', '\"', 'x' => {
                    bytes[i] = c;
                },
                'a' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = '7';
                },
                'b' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = '8';
                },
                'f' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = 'C';
                },
                'v' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = 'B';
                },
                '?' => {
                    // `\?` has no Zig equivalent; drop the backslash.
                    i -= 1;
                    bytes[i] = '?';
                },
                'u', 'U' => {
                    // TODO unicode escape sequences
                    return error.TokenizingFailed;
                },
                '0'...'7' => {
                    // TODO octal escape sequences
                    return error.TokenizingFailed;
                },
                else => {
                    // unknown escape sequence
                    return error.TokenizingFailed;
                },
            }
            i += 1;
            escape = false;
        } else {
            if (c == '\\') {
                escape = true;
            }
            bytes[i] = c;
            i += 1;
        }
    }
    return CToken{
        .id = tok.id,
        .bytes = bytes[0..i],
    };
}

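// A small sketch check (not from the original test below): C's `\a` escape
// (BEL, 0x07) should be rewritten to Zig's `\x07` inside the literal. Uses the
// same page_allocator setup as the "tokenize macro" test at the end of the file.
test "zigify escape sequences" {
    const tok = try zigifyEscapeSequences(std.heap.page_allocator, .{
        .id = .CharLit,
        .bytes = "'\\a'",
    });
    expect(tok.id == .CharLit);
    expect(std.mem.eql(u8, tok.bytes, "'\\x07'"));
}
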
/// Scan the next token starting at `chars[i.*]`, advancing `i` past it.
fn next(chars: [*:0]const u8, i: *usize) !CToken {
    var state: enum {
        Start,
        GotLt,
        CharLit,
        OpenComment,
        Comment,
        CommentStar,
        Backslash,
        String,
        Identifier,
        Decimal,
        Octal,
        GotZero,
        Hex,
        Bin,
        Float,
        ExpSign,
        FloatExp,
        FloatExpFirst,
        NumLitIntSuffixU,
        NumLitIntSuffixL,
        NumLitIntSuffixLL,
        NumLitIntSuffixUL,
    } = .Start;

    var result = CToken{
        .bytes = "",
        .id = .Eof,
    };
    var begin_index: usize = 0;

    while (true) {
        const c = chars[i.*];
        if (c == 0) {
            switch (state) {
                .Start => {
                    return result;
                },
                .Identifier,
                .Decimal,
                .Hex,
                .Bin,
                .Octal,
                .GotZero,
                .Float,
                .FloatExp,
                => {
                    result.bytes = chars[begin_index..i.*];
                    return result;
                },
                .NumLitIntSuffixU,
                .NumLitIntSuffixL,
                .NumLitIntSuffixUL,
                .NumLitIntSuffixLL,
                .GotLt,
                => {
                    return result;
                },
                .CharLit,
                .OpenComment,
                .Comment,
                .CommentStar,
                .Backslash,
                .String,
                .ExpSign,
                .FloatExpFirst,
                => return error.TokenizingFailed,
            }
        }
        i.* += 1;
        switch (state) {
            .Start => {
                switch (c) {
                    ' ', '\t', '\x0B', '\x0C' => {},
                    '\'' => {
                        state = .CharLit;
                        result.id = .CharLit;
                        begin_index = i.* - 1;
                    },
                    '\"' => {
                        state = .String;
                        result.id = .StrLit;
                        begin_index = i.* - 1;
                    },
                    '/' => {
                        state = .OpenComment;
                    },
                    '\\' => {
                        state = .Backslash;
                    },
                    '\n', '\r' => {
                        return result;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .Identifier;
                        result.id = .Identifier;
                        begin_index = i.* - 1;
                    },
                    '1'...'9' => {
                        state = .Decimal;
                        result.id = .NumLitInt;
                        begin_index = i.* - 1;
                    },
                    '0' => {
                        state = .GotZero;
                        result.id = .NumLitInt;
                        begin_index = i.* - 1;
                    },
                    '.' => {
                        result.id = .Dot;
                        return result;
                    },
                    '<' => {
                        result.id = .Lt;
                        state = .GotLt;
                    },
                    '(' => {
                        result.id = .LParen;
                        return result;
                    },
                    ')' => {
                        result.id = .RParen;
                        return result;
                    },
                    '*' => {
                        result.id = .Asterisk;
                        return result;
                    },
                    '-' => {
                        result.id = .Minus;
                        return result;
                    },
                    '!' => {
                        result.id = .Bang;
                        return result;
                    },
                    '~' => {
                        result.id = .Tilde;
                        return result;
                    },
                    ',' => {
                        result.id = .Comma;
                        return result;
                    },
                    else => return error.TokenizingFailed,
                }
            },
            .GotLt => {
                switch (c) {
                    '<' => {
                        result.id = .Shl;
                        return result;
                    },
                    else => {
                        // not `<<`; un-consume this character and return `<`
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .Float => {
                switch (c) {
                    '.', '0'...'9' => {},
                    'e', 'E' => {
                        state = .ExpSign;
                    },
                    'f', 'F' => {
                        result.num_lit_suffix = .F;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    'l', 'L' => {
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .ExpSign => {
                switch (c) {
                    '+', '-' => {
                        state = .FloatExpFirst;
                    },
                    '0'...'9' => {
                        state = .FloatExp;
                    },
                    else => return error.TokenizingFailed,
                }
            },
            .FloatExpFirst => {
                switch (c) {
                    '0'...'9' => {
                        state = .FloatExp;
                    },
                    else => return error.TokenizingFailed,
                }
            },
            .FloatExp => {
                switch (c) {
                    '0'...'9' => {},
                    'f', 'F' => {
                        result.num_lit_suffix = .F;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    'l', 'L' => {
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .Decimal => {
                switch (c) {
                    '0'...'9' => {},
                    '\'' => {},
                    'u', 'U' => {
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    '.' => {
                        result.id = .NumLitFloat;
                        state = .Float;
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .GotZero => {
                switch (c) {
                    'x', 'X' => {
                        state = .Hex;
                    },
                    'b', 'B' => {
                        state = .Bin;
                    },
                    '.' => {
                        state = .Float;
                        result.id = .NumLitFloat;
                    },
                    'u', 'U' => {
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    else => {
                        i.* -= 1;
                        state = .Octal;
                    },
                }
            },
            .Octal => {
                switch (c) {
                    '0'...'7' => {},
                    '8', '9' => return error.TokenizingFailed,
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .Hex => {
                switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    'u', 'U' => {
                        // marks the number literal as unsigned
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        // marks the number literal as long
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .Bin => {
                switch (c) {
                    '0'...'1' => {},
                    '2'...'9' => return error.TokenizingFailed,
                    'u', 'U' => {
                        // marks the number literal as unsigned
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        // marks the number literal as long
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .NumLitIntSuffixU => {
                switch (c) {
                    'l', 'L' => {
                        result.num_lit_suffix = .LU;
                        state = .NumLitIntSuffixUL;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .NumLitIntSuffixL => {
                switch (c) {
                    'l', 'L' => {
                        result.num_lit_suffix = .LL;
                        state = .NumLitIntSuffixLL;
                    },
                    'u', 'U' => {
                        result.num_lit_suffix = .LU;
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .NumLitIntSuffixLL => {
                switch (c) {
                    'u', 'U' => {
                        result.num_lit_suffix = .LLU;
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .NumLitIntSuffixUL => {
                switch (c) {
                    'l', 'L' => {
                        result.num_lit_suffix = .LLU;
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .Identifier => {
                switch (c) {
                    '_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .String => { // TODO char escapes
                switch (c) {
                    '\"' => {
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                    else => {},
                }
            },
            .CharLit => {
                switch (c) {
                    '\'' => {
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                    else => {},
                }
            },
            .OpenComment => {
                switch (c) {
                    '/' => {
                        return result;
                    },
                    '*' => {
                        state = .Comment;
                    },
                    else => {
                        result.id = .Slash;
                        return result;
                    },
                }
            },
            .Comment => {
                switch (c) {
                    '*' => {
                        state = .CommentStar;
                    },
                    else => {},
                }
            },
            .CommentStar => {
                switch (c) {
                    '/' => {
                        state = .Start;
                    },
                    else => {
                        state = .Comment;
                    },
                }
            },
            .Backslash => {
                switch (c) {
                    ' ', '\t', '\x0B', '\x0C' => {},
                    '\n', '\r' => {
                        state = .Start;
                    },
                    else => return error.TokenizingFailed,
                }
            },
        }
    }
    unreachable;
}

test "tokenize macro" {
|
|
var tl = TokenList.init(std.heap.page_allocator);
|
|
defer tl.deinit();
|
|
|
|
const src = "TEST(0\n";
|
|
try tokenizeCMacro(&tl, src);
|
|
var it = tl.iterator(0);
|
|
expect(it.next().?.id == .Identifier);
|
|
expect(it.next().?.id == .Fn);
|
|
expect(it.next().?.id == .LParen);
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "0"));
|
|
expect(it.next().?.id == .Eof);
|
|
expect(it.next() == null);
|
|
tl.shrink(0);
|
|
|
|
const src2 = "__FLT_MIN_10_EXP__ -37\n";
|
|
try tokenizeCMacro(&tl, src2);
|
|
it = tl.iterator(0);
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "__FLT_MIN_10_EXP__"));
|
|
expect(it.next().?.id == .Minus);
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "37"));
|
|
expect(it.next().?.id == .Eof);
|
|
expect(it.next() == null);
|
|
tl.shrink(0);
|
|
|
|
const src3 = "__llvm__ 1\n#define";
|
|
try tokenizeCMacro(&tl, src3);
|
|
it = tl.iterator(0);
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "__llvm__"));
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "1"));
|
|
expect(it.next().?.id == .Eof);
|
|
expect(it.next() == null);
|
|
tl.shrink(0);
|
|
|
|
const src4 = "TEST 2";
|
|
try tokenizeCMacro(&tl, src4);
|
|
it = tl.iterator(0);
|
|
expect(it.next().?.id == .Identifier);
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "2"));
|
|
expect(it.next().?.id == .Eof);
|
|
expect(it.next() == null);
|
|
tl.shrink(0);
|
|
|
|
const src5 = "FOO 0l";
|
|
try tokenizeCMacro(&tl, src5);
|
|
it = tl.iterator(0);
|
|
expect(it.next().?.id == .Identifier);
|
|
expect(std.mem.eql(u8, it.next().?.bytes, "0"));
|
|
expect(it.next().?.id == .Eof);
|
|
expect(it.next() == null);
|
|
tl.shrink(0);
|
|
}
|
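
// Additional sketch checks (not part of the original test above): they exercise
// the float-suffix path and the `<` / `<<` distinction in `next`, mirroring the
// TokenList/page_allocator setup of "tokenize macro".
test "tokenize macro float suffix and shift" {
    var tl = TokenList.init(std.heap.page_allocator);
    defer tl.deinit();

    try tokenizeCMacro(&tl, "PI 3.14f");
    var it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    const flt = it.next().?;
    expect(flt.id == .NumLitFloat);
    expect(std.mem.eql(u8, flt.bytes, "3.14"));
    expect(flt.num_lit_suffix == .F);
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    try tokenizeCMacro(&tl, "SHIFT (1<<2)");
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(it.next().?.id == .LParen);
    expect(std.mem.eql(u8, it.next().?.bytes, "1"));
    expect(it.next().?.id == .Shl);
    expect(std.mem.eql(u8, it.next().?.bytes, "2"));
    expect(it.next().?.id == .RParen);
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
}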