implement string escapes

master
Andrew Kelley 2016-05-01 14:53:48 -07:00
parent 037283c3b3
commit 9ccd0ba961
4 changed files with 180 additions and 34 deletions

View File

@ -272,10 +272,26 @@ Literal Example Characters Escapes Null Term Type
Byte 'H' All ASCII Byte No u8
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8
UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8
UTF-8 Raw String r"X(hello)X" All Unicode None No [5]u8
UTF-8 Raw C String rc"X(hello)X" All Unicode None Yes &const u8
```
### Escapes
Escape | Name
----------|-------------------------------------------------------------------
\n | Newline
\r | Carriage Return
\t | Tab
\\ | Backslash
\' | Single Quote
\" | Double Quote
\xNN | hexadecimal 8-bit character code (2 digits)
\uNNNN | hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)
\UNNNNNN | hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)
Note that the maximum valid Unicode code point is 0x10ffff.
##### Raw Strings
Raw string literals have no escapes and can span across multiple lines. To
@ -283,25 +299,6 @@ start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('.
To end a raw string, use ')' followed by the same unique bytes, followed by '"'.
```
Escape Name
\xNN hexadecimal 8-bit character code (exactly 2 digits)
\n Newline
\r Carriage return
\t Tab
\\ Backslash
\0 Null
\' Single quote
\" Double quote
```
### Unicode Escapes
Escape | Name
------------|-----------------------------------------------
\u{NNNNNN} | hexadecimal 24-bit Unicode character code (up to 6 digits)
#### Numeric Literals
```

View File

@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
return return_value;
}
static int get_hex_digit(uint8_t c) {
static uint32_t get_hex_digit(uint8_t c) {
switch (c) {
case '0': return 0;
case '1': return 1;
@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) {
case 'F':
return 15;
default:
return -1;
return UINT32_MAX;
}
}
@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
StateEscape,
StateHex1,
StateHex2,
StateUnicode,
};
buf_resize(buf, 0);
int unicode_index;
int unicode_end;
State state = StatePre;
SrcPos pos = {token->start_line, token->start_column};
int hex_value = 0;
uint32_t hex_value = 0;
for (int i = token->start_pos; i < token->end_pos - 1; i += 1) {
uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i);
@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
if (offset_map) offset_map->append(pos);
state = StateStart;
break;
case '\'':
buf_append_char(buf, '\'');
if (offset_map) offset_map->append(pos);
state = StateStart;
break;
case 'x':
state = StateHex1;
break;
case 'u':
state = StateUnicode;
unicode_index = 0;
unicode_end = 4;
hex_value = 0;
break;
case 'U':
state = StateUnicode;
unicode_index = 0;
unicode_end = 6;
hex_value = 0;
break;
default:
ast_error(pc, token, "invalid escape character");
}
break;
case StateHex1:
{
int hex_digit = get_hex_digit(c);
if (hex_digit == -1) {
uint32_t hex_digit = get_hex_digit(c);
if (hex_digit == UINT32_MAX) {
ast_error(pc, token, "invalid hex digit: '%c'", c);
}
hex_value = hex_digit * 16;
@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
}
case StateHex2:
{
int hex_digit = get_hex_digit(c);
if (hex_digit == -1) {
uint32_t hex_digit = get_hex_digit(c);
if (hex_digit == UINT32_MAX) {
ast_error(pc, token, "invalid hex digit: '%c'", c);
}
hex_value += hex_digit;
@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
state = StateStart;
break;
}
case StateUnicode:
{
uint32_t hex_digit = get_hex_digit(c);
if (hex_digit == UINT32_MAX) {
ast_error(pc, token, "invalid hex digit: '%c'", c);
}
hex_value *= 16;
hex_value += hex_digit;
unicode_index += 1;
if (unicode_index >= unicode_end) {
if (hex_value <= 0x7f) {
// 00000000 00000000 00000000 0xxxxxxx
buf_append_char(buf, hex_value);
} else if (hex_value <= 0x7ff) {
// 00000000 00000000 00000xxx xx000000
buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6)));
// 00000000 00000000 00000000 00xxxxxx
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
} else if (hex_value <= 0xffff) {
// 00000000 00000000 xxxx0000 00000000
buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12)));
// 00000000 00000000 0000xxxx xx000000
buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
} else if (hex_value <= 0x10ffff) {
// 00000000 000xxx00 00000000 00000000
buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18)));
// 00000000 000000xx xxxx0000 00000000
buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f)));
// 00000000 00000000 0000xxxx xx000000
buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
} else {
ast_error(pc, token, "unicode value out of range: %x", hex_value);
}
state = StateStart;
}
break;
}
}
if (c == '\n') {
pos.line += 1;

View File

@ -103,6 +103,21 @@
ALPHA: \
case '_'
// Case-label list for hexadecimal digits: the letters a-f and A-F, followed by
// whatever DIGIT expands to (DIGIT is defined elsewhere; presumably '0'..'9').
// The leading `case` and trailing `:` are deliberately omitted so the macro is
// written at the use site as `case HEX_DIGIT:`.
#define HEX_DIGIT \
'a': \
case 'b': \
case 'c': \
case 'd': \
case 'e': \
case 'f': \
case 'A': \
case 'B': \
case 'C': \
case 'D': \
case 'E': \
case 'F': \
case DIGIT
const char * zig_keywords[] = {
"true", "false", "null", "fn", "return", "var", "const", "extern",
"pub", "export", "use", "if", "else", "goto", "asm",
@ -132,11 +147,11 @@ enum TokenizeState {
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
TokenizeStateStringEscape,
TokenizeStateRawString,
TokenizeStateRawStringContents,
TokenizeStateRawStringMaybeEnd,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEscape,
TokenizeStateCharLiteralEnd,
TokenizeStateSawStar,
TokenizeStateSawSlash,
@ -162,6 +177,7 @@ enum TokenizeState {
TokenizeStateSawDotDot,
TokenizeStateSawQuestionMark,
TokenizeStateSawAtSign,
TokenizeStateHex,
TokenizeStateError,
};
@ -179,6 +195,7 @@ struct Tokenize {
int raw_string_id_start;
int raw_string_id_end;
int raw_string_id_cmp_pos;
int hex_chars_left;
};
__attribute__ ((format (printf, 2, 3)))
@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) {
case '\n':
tokenize_error(&t, "use raw string for multiline string literal");
break;
case '\\':
t.state = TokenizeStateStringEscape;
break;
default:
break;
}
break;
case TokenizeStateStringEscape:
switch (c) {
case 'x':
t.state = TokenizeStateHex;
t.hex_chars_left = 2;
break;
case 'u':
t.state = TokenizeStateHex;
t.hex_chars_left = 4;
break;
case 'U':
t.state = TokenizeStateHex;
t.hex_chars_left = 6;
break;
case 'n':
case 'r':
case '\\':
case 't':
case '\'':
case '"':
if (t.cur_tok->id == TokenIdCharLiteral) {
t.state = TokenizeStateCharLiteralEnd;
} else if (t.cur_tok->id == TokenIdStringLiteral) {
t.state = TokenizeStateString;
} else {
zig_unreachable();
}
break;
default:
tokenize_error(&t, "invalid character: '%c'", c);
}
break;
case TokenizeStateHex:
switch (c) {
case HEX_DIGIT:
t.hex_chars_left -= 1;
if (t.hex_chars_left == 0) {
if (t.cur_tok->id == TokenIdCharLiteral) {
t.state = TokenizeStateCharLiteralEnd;
} else if (t.cur_tok->id == TokenIdStringLiteral) {
t.state = TokenizeStateString;
} else {
zig_unreachable();
}
}
break;
default:
tokenize_error(&t, "invalid character: '%c'", c);
}
break;
case TokenizeStateRawString:
if (c == '(') {
t.raw_string_id_end = t.pos;
@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateStart;
break;
case '\\':
t.state = TokenizeStateCharLiteralEscape;
t.state = TokenizeStateStringEscape;
break;
default:
t.state = TokenizeStateCharLiteralEnd;
break;
}
break;
case TokenizeStateCharLiteralEscape:
t.state = TokenizeStateCharLiteralEnd;
break;
case TokenizeStateCharLiteralEnd:
switch (c) {
case '\'':
@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) {
case TokenizeStateString:
tokenize_error(&t, "unterminated string");
break;
case TokenizeStateStringEscape:
case TokenizeStateHex:
if (t.cur_tok->id == TokenIdStringLiteral) {
tokenize_error(&t, "unterminated string");
} else if (t.cur_tok->id == TokenIdCharLiteral) {
tokenize_error(&t, "unterminated character literal");
} else {
zig_unreachable();
}
break;
case TokenizeStateRawString:
case TokenizeStateRawStringContents:
case TokenizeStateRawStringMaybeEnd:
tokenize_error(&t, "unterminated raw string");
break;
case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEscape:
case TokenizeStateCharLiteralEnd:
tokenize_error(&t, "unterminated character literal");
break;

View File

@ -1398,3 +1398,14 @@ fn test_take_address_of_parameter_noeval(f: f32) {
fn array_mult_operator() {
assert(str.eql("ab" ** 5, "ababababab"));
}
#attribute("test")
fn string_escapes() {
    // Single-character escapes, each checked against its \xNN byte value.
    assert(str.eql("\"", "\x22"));
    assert(str.eql("\'", "\x27"));
    assert(str.eql("\n", "\x0a"));
    assert(str.eql("\r", "\x0d"));
    assert(str.eql("\t", "\x09"));
    assert(str.eql("\\", "\x5c"));
    // \uNNNN (4 hex digits) is emitted as UTF-8:
    // U+1234 takes the 3-byte form, U+0069 the 1-byte form.
    assert(str.eql("\u1234\u0069", "\xe1\x88\xb4\x69"));
    // U+00E9 exercises the 2-byte UTF-8 form.
    assert(str.eql("\u00e9", "\xc3\xa9"));
    // \UNNNNNN (6 hex digits): U+0F1234 exercises the 4-byte UTF-8 form.
    assert(str.eql("\U0f1234\U0f1234", "\xf3\xb1\x88\xb4\xf3\xb1\x88\xb4"));
}