tokenizer parses all number literal types

master
Josh Wolfe 2015-12-14 23:13:53 -07:00
parent de7c5ea102
commit cf88fcb2ad
2 changed files with 151 additions and 7 deletions

View File

@ -16,9 +16,8 @@
' ': \
case '\n'
#define DIGIT \
'0': \
case '1': \
#define DIGIT_NON_ZERO \
'1': \
case '2': \
case '3': \
case '4': \
@ -27,10 +26,14 @@
case '7': \
case '8': \
case '9'
#define DIGIT \
'0': \
case DIGIT_NON_ZERO
#define ALPHA_EXCEPT_C \
'a': \
case 'b': \
/*case 'c':*/ \
case 'd': \
case 'e': \
case 'f': \
@ -94,7 +97,11 @@ enum TokenizeState {
TokenizeStateStart,
TokenizeStateSymbol,
TokenizeStateSymbolFirst,
TokenizeStateNumber,
TokenizeStateZero, // "0", which might lead to "0x"
TokenizeStateNumber, // "123", "0x123"
TokenizeStateFloatFraction, // "123.456", "0x123.456"
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
TokenizeStateSawStar,
TokenizeStateSawSlash,
@ -172,6 +179,16 @@ static void end_token(Tokenize *t) {
assert(t->cur_tok);
t->cur_tok->end_pos = t->pos + 1;
// normalize number literal parsing stuff
if (t->cur_tok->id == TokenIdNumberLiteral) {
if (t->cur_tok->exponent_marker_pos == 0) {
t->cur_tok->exponent_marker_pos = t->cur_tok->end_pos;
}
if (t->cur_tok->decimal_point_pos == 0) {
t->cur_tok->decimal_point_pos = t->cur_tok->exponent_marker_pos;
}
}
char *token_mem = buf_ptr(t->buf) + t->cur_tok->start_pos;
int token_len = t->cur_tok->end_pos - t->cur_tok->start_pos;
@ -218,6 +235,26 @@ static void end_token(Tokenize *t) {
t->cur_tok = nullptr;
}
static bool is_exponent_signifier(uint8_t c, int radix) {
if (radix == 16) {
return c == 'p' || c == 'P';
} else {
return c == 'e' || c == 'E';
}
}
static int get_digit_value(uint8_t c) {
if ('0' <= c && c <= '9') {
return c - '0';
}
if ('A' <= c && c <= 'Z') {
return c - 'A' + 10;
}
if ('a' <= c && c <= 'z') {
return c - 'a' + 10;
}
return -1;
}
void tokenize(Buf *buf, Tokenization *out) {
Tokenize t = {0};
t.out = out;
@ -245,9 +282,15 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateSymbol;
begin_token(&t, TokenIdSymbol);
break;
case DIGIT:
case '0':
t.state = TokenizeStateZero;
begin_token(&t, TokenIdNumberLiteral);
t.cur_tok->radix = 10;
break;
case DIGIT_NON_ZERO:
t.state = TokenizeStateNumber;
begin_token(&t, TokenIdNumberLiteral);
t.cur_tok->radix = 10;
break;
case '"':
begin_token(&t, TokenIdStringLiteral);
@ -701,10 +744,102 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
}
break;
case TokenizeStateZero:
switch (c) {
case 'b':
t.cur_tok->radix = 2;
break;
case 'o':
t.cur_tok->radix = 8;
break;
case 'x':
t.cur_tok->radix = 16;
break;
default:
// reinterpret as normal number
t.pos -= 1;
t.state = TokenizeStateNumber;
continue;
}
break;
case TokenizeStateNumber:
{
if (c == '.') {
t.cur_tok->decimal_point_pos = t.pos;
t.state = TokenizeStateFloatFraction;
break;
}
if (is_exponent_signifier(c, t.cur_tok->radix)) {
t.cur_tok->exponent_marker_pos = t.pos;
t.state = TokenizeStateFloatExponentUnsigned;
break;
}
if (c == '_') {
tokenize_error(&t, "invalid character: '%c'", c);
break;
}
int digit_value = get_digit_value(c);
if (digit_value >= 0) {
if (digit_value >= t.cur_tok->radix) {
tokenize_error(&t, "invalid character: '%c'", c);
}
// normal digit
} else {
// not my char
t.pos -= 1;
end_token(&t);
t.state = TokenizeStateStart;
continue;
}
break;
}
case TokenizeStateFloatFraction:
{
if (is_exponent_signifier(c, t.cur_tok->radix)) {
t.cur_tok->exponent_marker_pos = t.pos;
t.state = TokenizeStateFloatExponentUnsigned;
break;
}
if (c == '_') {
tokenize_error(&t, "invalid character: '%c'", c);
break;
}
int digit_value = get_digit_value(c);
if (digit_value >= 0) {
if (digit_value >= t.cur_tok->radix) {
tokenize_error(&t, "invalid character: '%c'", c);
}
// normal digit
} else {
// not my char
t.pos -= 1;
end_token(&t);
t.state = TokenizeStateStart;
continue;
}
break;
}
case TokenizeStateFloatExponentUnsigned:
switch (c) {
case '+':
case '-':
t.state = TokenizeStateFloatExponentNumber;
break;
default:
// reinterpret as normal exponent number
t.pos -= 1;
t.state = TokenizeStateFloatExponentNumber;
continue;
}
break;
case TokenizeStateFloatExponentNumber:
switch (c) {
case DIGIT:
break;
case ALPHA:
case '_':
tokenize_error(&t, "invalid character: '%c'", c);
break;
default:
t.pos -= 1;
end_token(&t);
@ -750,7 +885,11 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateSymbol:
case TokenizeStateSymbolFirst:
case TokenizeStateZero:
case TokenizeStateNumber:
case TokenizeStateFloatFraction:
case TokenizeStateFloatExponentUnsigned:
case TokenizeStateFloatExponentNumber:
case TokenizeStateSawStar:
case TokenizeStateSawSlash:
case TokenizeStateSawPercent:

View File

@ -88,6 +88,11 @@ struct Token {
int end_pos;
int start_line;
int start_column;
// for id == TokenIdNumberLiteral
int radix; // if != 10, then skip the first 2 characters
int decimal_point_pos; // either exponent_marker_pos or the position of the '.'
int exponent_marker_pos; // either end_pos or the position of the 'e'/'p'
};
struct Tokenization {