zig/src-self-hosted/c_tokenizer.zig

978 lines
32 KiB
Zig

const std = @import("std");
const expect = std.testing.expect;
const ZigClangSourceLocation = @import("clang.zig").ZigClangSourceLocation;
const Context = @import("translate_c.zig").Context;
const failDecl = @import("translate_c.zig").failDecl;
pub const TokenList = std.SegmentedList(CToken, 32);
pub const CToken = struct {
id: Id,
bytes: []const u8 = "",
num_lit_suffix: NumLitSuffix = .None,
pub const Id = enum {
CharLit,
StrLit,
NumLitInt,
NumLitFloat,
Identifier,
Plus,
Minus,
Slash,
LParen,
RParen,
Eof,
Dot,
Asterisk, // *
Ampersand, // &
And, // &&
Assign, // =
Or, // ||
Bang, // !
Tilde, // ~
Shl, // <<
Shr, // >>
Lt, // <
Lte, // <=
Gt, // >
Gte, // >=
Eq, // ==
Ne, // !=
Increment, // ++
Decrement, // --
Comma,
Fn,
Arrow, // ->
LBrace,
RBrace,
Pipe,
QuestionMark,
Colon,
};
pub const NumLitSuffix = enum {
None,
F,
L,
U,
LU,
LL,
LLU,
};
};
pub fn tokenizeCMacro(ctx: *Context, loc: ZigClangSourceLocation, name: []const u8, tl: *TokenList, chars: [*:0]const u8) !void {
var index: usize = 0;
var first = true;
while (true) {
const tok = try next(ctx, loc, name, chars, &index);
if (tok.id == .StrLit or tok.id == .CharLit)
try tl.push(try zigifyEscapeSequences(ctx, loc, name, tl.allocator, tok))
else
try tl.push(tok);
if (tok.id == .Eof)
return;
if (first) {
// distinguish NAME (EXPR) from NAME(ARGS)
first = false;
if (chars[index] == '(') {
try tl.push(.{
.id = .Fn,
.bytes = "",
});
}
}
}
}
fn zigifyEscapeSequences(ctx: *Context, loc: ZigClangSourceLocation, name: []const u8, allocator: *std.mem.Allocator, tok: CToken) !CToken {
for (tok.bytes) |c| {
if (c == '\\') {
break;
}
} else return tok;
var bytes = try allocator.alloc(u8, tok.bytes.len * 2);
var state: enum {
Start,
Escape,
Hex,
Octal,
} = .Start;
var i: usize = 0;
var count: u8 = 0;
var num: u8 = 0;
for (tok.bytes) |c| {
switch (state) {
.Escape => {
switch (c) {
'n', 'r', 't', '\\', '\'', '\"' => {
bytes[i] = c;
},
'0'...'7' => {
count += 1;
num += c - '0';
state = .Octal;
bytes[i] = 'x';
},
'x' => {
state = .Hex;
bytes[i] = 'x';
},
'a' => {
bytes[i] = 'x';
i += 1;
bytes[i] = '0';
i += 1;
bytes[i] = '7';
},
'b' => {
bytes[i] = 'x';
i += 1;
bytes[i] = '0';
i += 1;
bytes[i] = '8';
},
'f' => {
bytes[i] = 'x';
i += 1;
bytes[i] = '0';
i += 1;
bytes[i] = 'C';
},
'v' => {
bytes[i] = 'x';
i += 1;
bytes[i] = '0';
i += 1;
bytes[i] = 'B';
},
'?' => {
i -= 1;
bytes[i] = '?';
},
'u', 'U' => {
try failDecl(ctx, loc, name, "macro tokenizing failed: TODO unicode escape sequences", .{});
return error.TokenizingFailed;
},
else => {
try failDecl(ctx, loc, name, "macro tokenizing failed: unknown escape sequence", .{});
return error.TokenizingFailed;
},
}
i += 1;
if (state == .Escape)
state = .Start;
},
.Start => {
if (c == '\\') {
state = .Escape;
}
bytes[i] = c;
i += 1;
},
.Hex => {
switch (c) {
'0'...'9' => {
num = std.math.mul(u8, num, 16) catch {
try failDecl(ctx, loc, name, "macro tokenizing failed: hex literal overflowed", .{});
return error.TokenizingFailed;
};
num += c - '0';
},
'a'...'f' => {
num = std.math.mul(u8, num, 16) catch {
try failDecl(ctx, loc, name, "macro tokenizing failed: hex literal overflowed", .{});
return error.TokenizingFailed;
};
num += c - 'a' + 10;
},
'A'...'F' => {
num = std.math.mul(u8, num, 16) catch {
try failDecl(ctx, loc, name, "macro tokenizing failed: hex literal overflowed", .{});
return error.TokenizingFailed;
};
num += c - 'A' + 10;
},
else => {
i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{ .fill = '0', .width = 2 });
num = 0;
if (c == '\\')
state = .Escape
else
state = .Start;
bytes[i] = c;
i += 1;
},
}
},
.Octal => {
const accept_digit = switch (c) {
// The maximum length of a octal literal is 3 digits
'0'...'7' => count < 3,
else => false,
};
if (accept_digit) {
count += 1;
num = std.math.mul(u8, num, 8) catch {
try failDecl(ctx, loc, name, "macro tokenizing failed: octal literal overflowed", .{});
return error.TokenizingFailed;
};
num += c - '0';
} else {
i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{ .fill = '0', .width = 2 });
num = 0;
count = 0;
if (c == '\\')
state = .Escape
else
state = .Start;
bytes[i] = c;
i += 1;
}
},
}
}
if (state == .Hex or state == .Octal)
i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{ .fill = '0', .width = 2 });
return CToken{
.id = tok.id,
.bytes = bytes[0..i],
};
}
fn next(ctx: *Context, loc: ZigClangSourceLocation, name: []const u8, chars: [*:0]const u8, i: *usize) !CToken {
var state: enum {
Start,
SawLt,
SawGt,
SawPlus,
SawMinus,
SawAmpersand,
SawPipe,
SawBang,
SawEq,
CharLit,
OpenComment,
Comment,
CommentStar,
Backslash,
String,
Identifier,
Decimal,
Octal,
SawZero,
Hex,
Bin,
Float,
ExpSign,
FloatExp,
FloatExpFirst,
NumLitIntSuffixU,
NumLitIntSuffixL,
NumLitIntSuffixLL,
NumLitIntSuffixUL,
Done,
} = .Start;
var result = CToken{
.bytes = "",
.id = .Eof,
};
var begin_index: usize = 0;
var digits: u8 = 0;
var pre_escape = state;
while (true) {
const c = chars[i.*];
if (c == 0) {
switch (state) {
.Identifier,
.Decimal,
.Hex,
.Bin,
.Octal,
.SawZero,
.Float,
.FloatExp,
=> {
result.bytes = chars[begin_index..i.*];
return result;
},
.Start,
.SawMinus,
.Done,
.NumLitIntSuffixU,
.NumLitIntSuffixL,
.NumLitIntSuffixUL,
.NumLitIntSuffixLL,
.SawLt,
.SawGt,
.SawPlus,
.SawAmpersand,
.SawPipe,
.SawBang,
.SawEq,
=> {
return result;
},
.CharLit,
.OpenComment,
.Comment,
.CommentStar,
.Backslash,
.String,
.ExpSign,
.FloatExpFirst,
=> {
try failDecl(ctx, loc, name, "macro tokenizing failed: unexpected EOF", .{});
return error.TokenizingFailed;
},
}
}
switch (state) {
.Start => {
switch (c) {
' ', '\t', '\x0B', '\x0C' => {},
'\'' => {
state = .CharLit;
result.id = .CharLit;
begin_index = i.*;
},
'\"' => {
state = .String;
result.id = .StrLit;
begin_index = i.*;
},
'/' => {
state = .OpenComment;
},
'\\' => {
state = .Backslash;
},
'\n', '\r' => {
return result;
},
'a'...'z', 'A'...'Z', '_' => {
state = .Identifier;
result.id = .Identifier;
begin_index = i.*;
},
'1'...'9' => {
state = .Decimal;
result.id = .NumLitInt;
begin_index = i.*;
},
'0' => {
state = .SawZero;
result.id = .NumLitInt;
begin_index = i.*;
},
'.' => {
result.id = .Dot;
state = .Done;
},
'<' => {
result.id = .Lt;
state = .SawLt;
},
'>' => {
result.id = .Gt;
state = .SawGt;
},
'(' => {
result.id = .LParen;
state = .Done;
},
')' => {
result.id = .RParen;
state = .Done;
},
'*' => {
result.id = .Asterisk;
state = .Done;
},
'+' => {
result.id = .Plus;
state = .SawPlus;
},
'-' => {
result.id = .Minus;
state = .SawMinus;
},
'!' => {
result.id = .Bang;
state = .SawBang;
},
'~' => {
result.id = .Tilde;
state = .Done;
},
'=' => {
result.id = .Assign;
state = .SawEq;
},
',' => {
result.id = .Comma;
state = .Done;
},
'[' => {
result.id = .LBrace;
state = .Done;
},
']' => {
result.id = .RBrace;
state = .Done;
},
'|' => {
result.id = .Pipe;
state = .SawPipe;
},
'&' => {
result.id = .Ampersand;
state = .SawAmpersand;
},
'?' => {
result.id = .QuestionMark;
state = .Done;
},
':' => {
result.id = .Colon;
state = .Done;
},
else => {
try failDecl(ctx, loc, name, "macro tokenizing failed: unexpected character '{c}'", .{c});
return error.TokenizingFailed;
},
}
},
.Done => return result,
.SawMinus => {
switch (c) {
'>' => {
result.id = .Arrow;
state = .Done;
},
'-' => {
result.id = .Decrement;
state = .Done;
},
else => return result,
}
},
.SawPlus => {
switch (c) {
'+' => {
result.id = .Increment;
state = .Done;
},
else => return result,
}
},
.SawLt => {
switch (c) {
'<' => {
result.id = .Shl;
state = .Done;
},
'=' => {
result.id = .Lte;
state = .Done;
},
else => return result,
}
},
.SawGt => {
switch (c) {
'>' => {
result.id = .Shr;
state = .Done;
},
'=' => {
result.id = .Gte;
state = .Done;
},
else => return result,
}
},
.SawPipe => {
switch (c) {
'|' => {
result.id = .Or;
state = .Done;
},
else => return result,
}
},
.SawAmpersand => {
switch (c) {
'&' => {
result.id = .And;
state = .Done;
},
else => return result,
}
},
.SawBang => {
switch (c) {
'=' => {
result.id = .Ne;
state = .Done;
},
else => return result,
}
},
.SawEq => {
switch (c) {
'=' => {
result.id = .Eq;
state = .Done;
},
else => return result,
}
},
.Float => {
switch (c) {
'.', '0'...'9' => {},
'e', 'E' => {
state = .ExpSign;
},
'f',
'F',
=> {
result.num_lit_suffix = .F;
result.bytes = chars[begin_index..i.*];
state = .Done;
},
'l', 'L' => {
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
state = .Done;
},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.ExpSign => {
switch (c) {
'+', '-' => {
state = .FloatExpFirst;
},
'0'...'9' => {
state = .FloatExp;
},
else => {
try failDecl(ctx, loc, name, "macro tokenizing failed: expected a digit or '+' or '-'", .{});
return error.TokenizingFailed;
},
}
},
.FloatExpFirst => {
switch (c) {
'0'...'9' => {
state = .FloatExp;
},
else => {
try failDecl(ctx, loc, name, "macro tokenizing failed: expected a digit", .{});
return error.TokenizingFailed;
},
}
},
.FloatExp => {
switch (c) {
'0'...'9' => {},
'f', 'F' => {
result.num_lit_suffix = .F;
result.bytes = chars[begin_index..i.*];
state = .Done;
},
'l', 'L' => {
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
state = .Done;
},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.Decimal => {
switch (c) {
'0'...'9' => {},
'\'' => {},
'u', 'U' => {
state = .NumLitIntSuffixU;
result.num_lit_suffix = .U;
result.bytes = chars[begin_index..i.*];
},
'l', 'L' => {
state = .NumLitIntSuffixL;
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
},
'.' => {
result.id = .NumLitFloat;
state = .Float;
},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.SawZero => {
switch (c) {
'x', 'X' => {
state = .Hex;
},
'b', 'B' => {
state = .Bin;
},
'.' => {
state = .Float;
result.id = .NumLitFloat;
},
'u', 'U' => {
state = .NumLitIntSuffixU;
result.num_lit_suffix = .U;
result.bytes = chars[begin_index..i.*];
},
'l', 'L' => {
state = .NumLitIntSuffixL;
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
},
else => {
i.* -= 1;
state = .Octal;
},
}
},
.Octal => {
switch (c) {
'0'...'7' => {},
'8', '9' => {
try failDecl(ctx, loc, name, "macro tokenizing failed: invalid digit '{c}' in octal number", .{c});
return error.TokenizingFailed;
},
'u', 'U' => {
state = .NumLitIntSuffixU;
result.num_lit_suffix = .U;
result.bytes = chars[begin_index..i.*];
},
'l', 'L' => {
state = .NumLitIntSuffixL;
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.Hex => {
switch (c) {
'0'...'9', 'a'...'f', 'A'...'F' => {},
'u', 'U' => {
// marks the number literal as unsigned
state = .NumLitIntSuffixU;
result.num_lit_suffix = .U;
result.bytes = chars[begin_index..i.*];
},
'l', 'L' => {
// marks the number literal as long
state = .NumLitIntSuffixL;
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.Bin => {
switch (c) {
'0'...'1' => {},
'2'...'9' => {
try failDecl(ctx, loc, name, "macro tokenizing failed: invalid digit '{c}' in binary number", .{c});
return error.TokenizingFailed;
},
'u', 'U' => {
// marks the number literal as unsigned
state = .NumLitIntSuffixU;
result.num_lit_suffix = .U;
result.bytes = chars[begin_index..i.*];
},
'l', 'L' => {
// marks the number literal as long
state = .NumLitIntSuffixL;
result.num_lit_suffix = .L;
result.bytes = chars[begin_index..i.*];
},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.NumLitIntSuffixU => {
switch (c) {
'l', 'L' => {
result.num_lit_suffix = .LU;
state = .NumLitIntSuffixUL;
},
else => {
return result;
},
}
},
.NumLitIntSuffixL => {
switch (c) {
'l', 'L' => {
result.num_lit_suffix = .LL;
state = .NumLitIntSuffixLL;
},
'u', 'U' => {
result.num_lit_suffix = .LU;
state = .Done;
},
else => {
return result;
},
}
},
.NumLitIntSuffixLL => {
switch (c) {
'u', 'U' => {
result.num_lit_suffix = .LLU;
state = .Done;
},
else => {
return result;
},
}
},
.NumLitIntSuffixUL => {
switch (c) {
'l', 'L' => {
result.num_lit_suffix = .LLU;
state = .Done;
},
else => {
return result;
},
}
},
.Identifier => {
switch (c) {
'_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
else => {
result.bytes = chars[begin_index..i.*];
return result;
},
}
},
.String => {
switch (c) {
'\"' => {
result.bytes = chars[begin_index .. i.* + 1];
state = .Done;
},
else => {},
}
},
.CharLit => {
switch (c) {
'\'' => {
result.bytes = chars[begin_index .. i.* + 1];
state = .Done;
},
else => {},
}
},
.OpenComment => {
switch (c) {
'/' => {
return result;
},
'*' => {
state = .Comment;
},
else => {
result.id = .Slash;
state = .Done;
},
}
},
.Comment => {
switch (c) {
'*' => {
state = .CommentStar;
},
else => {},
}
},
.CommentStar => {
switch (c) {
'/' => {
state = .Start;
},
else => {
state = .Comment;
},
}
},
.Backslash => {
switch (c) {
' ', '\t', '\x0B', '\x0C' => {},
'\n', '\r' => {
state = .Start;
},
else => {
try failDecl(ctx, loc, name, "macro tokenizing failed: expected whitespace", .{});
return error.TokenizingFailed;
},
}
},
}
i.* += 1;
}
unreachable;
}
fn expectTokens(tl: *TokenList, src: [*:0]const u8, expected: []CToken) void {
// these can be undefined since they are only used for error reporting
tokenizeCMacro(undefined, undefined, undefined, tl, src) catch unreachable;
var it = tl.iterator(0);
for (expected) |t| {
var tok = it.next().?;
std.testing.expectEqual(t.id, tok.id);
if (t.bytes.len > 0) {
//std.debug.warn(" {} = {}\n", .{tok.bytes, t.bytes});
std.testing.expectEqualSlices(u8, tok.bytes, t.bytes);
}
if (t.num_lit_suffix != .None) {
std.testing.expectEqual(t.num_lit_suffix, tok.num_lit_suffix);
}
}
std.testing.expect(it.next() == null);
tl.shrink(0);
}
test "tokenize macro" {
var tl = TokenList.init(std.heap.page_allocator);
defer tl.deinit();
expectTokens(&tl, "TEST(0\n", &[_]CToken{
.{ .id = .Identifier, .bytes = "TEST" },
.{ .id = .Fn },
.{ .id = .LParen },
.{ .id = .NumLitInt, .bytes = "0" },
.{ .id = .Eof },
});
expectTokens(&tl, "__FLT_MIN_10_EXP__ -37\n", &[_]CToken{
.{ .id = .Identifier, .bytes = "__FLT_MIN_10_EXP__" },
.{ .id = .Minus },
.{ .id = .NumLitInt, .bytes = "37" },
.{ .id = .Eof },
});
expectTokens(&tl, "__llvm__ 1\n#define", &[_]CToken{
.{ .id = .Identifier, .bytes = "__llvm__" },
.{ .id = .NumLitInt, .bytes = "1" },
.{ .id = .Eof },
});
expectTokens(&tl, "TEST 2", &[_]CToken{
.{ .id = .Identifier, .bytes = "TEST" },
.{ .id = .NumLitInt, .bytes = "2" },
.{ .id = .Eof },
});
expectTokens(&tl, "FOO 0ull", &[_]CToken{
.{ .id = .Identifier, .bytes = "FOO" },
.{ .id = .NumLitInt, .bytes = "0", .num_lit_suffix = .LLU },
.{ .id = .Eof },
});
}
test "tokenize macro ops" {
var tl = TokenList.init(std.heap.page_allocator);
defer tl.deinit();
expectTokens(&tl, "ADD A + B", &[_]CToken{
.{ .id = .Identifier, .bytes = "ADD" },
.{ .id = .Identifier, .bytes = "A" },
.{ .id = .Plus },
.{ .id = .Identifier, .bytes = "B" },
.{ .id = .Eof },
});
expectTokens(&tl, "ADD (A) + B", &[_]CToken{
.{ .id = .Identifier, .bytes = "ADD" },
.{ .id = .LParen },
.{ .id = .Identifier, .bytes = "A" },
.{ .id = .RParen },
.{ .id = .Plus },
.{ .id = .Identifier, .bytes = "B" },
.{ .id = .Eof },
});
expectTokens(&tl, "ADD (A) + B", &[_]CToken{
.{ .id = .Identifier, .bytes = "ADD" },
.{ .id = .LParen },
.{ .id = .Identifier, .bytes = "A" },
.{ .id = .RParen },
.{ .id = .Plus },
.{ .id = .Identifier, .bytes = "B" },
.{ .id = .Eof },
});
}
test "escape sequences" {
var buf: [1024]u8 = undefined;
var alloc = std.heap.FixedBufferAllocator.init(buf[0..]);
const a = &alloc.allocator;
// these can be undefined since they are only used for error reporting
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .StrLit,
.bytes = "\\x0077",
})).bytes, "\\x77"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .StrLit,
.bytes = "\\24500",
})).bytes, "\\xa500"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .StrLit,
.bytes = "\\x0077 abc",
})).bytes, "\\x77 abc"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .StrLit,
.bytes = "\\045abc",
})).bytes, "\\x25abc"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .CharLit,
.bytes = "\\0",
})).bytes, "\\x00"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .CharLit,
.bytes = "\\00",
})).bytes, "\\x00"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .CharLit,
.bytes = "\\000\\001",
})).bytes, "\\x00\\x01"));
expect(std.mem.eql(u8, (try zigifyEscapeSequences(undefined, undefined, undefined, a, .{
.id = .CharLit,
.bytes = "\\000abc",
})).bytes, "\\x00abc"));
}