implement string escapes
parent
037283c3b3
commit
9ccd0ba961
|
@ -272,10 +272,26 @@ Literal Example Characters Escapes Null Term Type
|
|||
Byte 'H' All ASCII Byte No u8
|
||||
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
|
||||
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
|
||||
UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8
|
||||
UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8
|
||||
UTF-8 Raw String r"X(hello)X" All Unicode None No [5]u8
|
||||
UTF-8 Raw C String rc"X(hello)X" All Unicode None Yes &const u8
|
||||
```
|
||||
|
||||
### Escapes
|
||||
|
||||
Escape | Name
|
||||
----------|-------------------------------------------------------------------
|
||||
\n | Newline
|
||||
\r | Carriage Return
|
||||
\t | Tab
|
||||
\\ | Backslash
|
||||
\' | Single Quote
|
||||
\" | Double Quote
|
||||
\xNN | hexadecimal 8-bit character code (2 digits)
|
||||
\uNNNN | hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)
|
||||
\UNNNNNN | hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)
|
||||
|
||||
Note that the maximum valid Unicode code point is 0x10ffff.
|
||||
|
||||
##### Raw Strings
|
||||
|
||||
Raw string literals have no escapes and can span across multiple lines. To
|
||||
|
@ -283,25 +299,6 @@ start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('.
|
|||
To end a raw string, use ')' followed by the same unique bytes, followed by '"'.
|
||||
|
||||
|
||||
```
|
||||
Escape Name
|
||||
|
||||
\xNN hexadecimal 8-bit character code (exactly 2 digits)
|
||||
\n Newline
|
||||
\r Carriage return
|
||||
\t Tab
|
||||
\\ Backslash
|
||||
\0 Null
|
||||
\' Single quote
|
||||
\" Double quote
|
||||
```
|
||||
|
||||
### Unicode Escapes
|
||||
|
||||
Escape | Name
|
||||
------------|-----------------------------------------------
|
||||
\u{NNNNNN} | hexadecimal 24-bit Unicode character code (up to 6 digits)
|
||||
|
||||
#### Numeric Literals
|
||||
|
||||
```
|
||||
|
|
|
@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
|
|||
return return_value;
|
||||
}
|
||||
|
||||
static int get_hex_digit(uint8_t c) {
|
||||
static uint32_t get_hex_digit(uint8_t c) {
|
||||
switch (c) {
|
||||
case '0': return 0;
|
||||
case '1': return 1;
|
||||
|
@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) {
|
|||
case 'F':
|
||||
return 15;
|
||||
default:
|
||||
return -1;
|
||||
return UINT32_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
|
|||
StateEscape,
|
||||
StateHex1,
|
||||
StateHex2,
|
||||
StateUnicode,
|
||||
};
|
||||
|
||||
buf_resize(buf, 0);
|
||||
|
||||
int unicode_index;
|
||||
int unicode_end;
|
||||
|
||||
State state = StatePre;
|
||||
SrcPos pos = {token->start_line, token->start_column};
|
||||
int hex_value = 0;
|
||||
uint32_t hex_value = 0;
|
||||
for (int i = token->start_pos; i < token->end_pos - 1; i += 1) {
|
||||
uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i);
|
||||
|
||||
|
@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
|
|||
if (offset_map) offset_map->append(pos);
|
||||
state = StateStart;
|
||||
break;
|
||||
case '\'':
|
||||
buf_append_char(buf, '\'');
|
||||
if (offset_map) offset_map->append(pos);
|
||||
state = StateStart;
|
||||
break;
|
||||
case 'x':
|
||||
state = StateHex1;
|
||||
break;
|
||||
case 'u':
|
||||
state = StateUnicode;
|
||||
unicode_index = 0;
|
||||
unicode_end = 4;
|
||||
hex_value = 0;
|
||||
break;
|
||||
case 'U':
|
||||
state = StateUnicode;
|
||||
unicode_index = 0;
|
||||
unicode_end = 6;
|
||||
hex_value = 0;
|
||||
break;
|
||||
default:
|
||||
ast_error(pc, token, "invalid escape character");
|
||||
}
|
||||
break;
|
||||
case StateHex1:
|
||||
{
|
||||
int hex_digit = get_hex_digit(c);
|
||||
if (hex_digit == -1) {
|
||||
uint32_t hex_digit = get_hex_digit(c);
|
||||
if (hex_digit == UINT32_MAX) {
|
||||
ast_error(pc, token, "invalid hex digit: '%c'", c);
|
||||
}
|
||||
hex_value = hex_digit * 16;
|
||||
|
@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
|
|||
}
|
||||
case StateHex2:
|
||||
{
|
||||
int hex_digit = get_hex_digit(c);
|
||||
if (hex_digit == -1) {
|
||||
uint32_t hex_digit = get_hex_digit(c);
|
||||
if (hex_digit == UINT32_MAX) {
|
||||
ast_error(pc, token, "invalid hex digit: '%c'", c);
|
||||
}
|
||||
hex_value += hex_digit;
|
||||
|
@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
|
|||
state = StateStart;
|
||||
break;
|
||||
}
|
||||
case StateUnicode:
|
||||
{
|
||||
uint32_t hex_digit = get_hex_digit(c);
|
||||
if (hex_digit == UINT32_MAX) {
|
||||
ast_error(pc, token, "invalid hex digit: '%c'", c);
|
||||
}
|
||||
hex_value *= 16;
|
||||
hex_value += hex_digit;
|
||||
unicode_index += 1;
|
||||
if (unicode_index >= unicode_end) {
|
||||
if (hex_value <= 0x7f) {
|
||||
// 00000000 00000000 00000000 0xxxxxxx
|
||||
buf_append_char(buf, hex_value);
|
||||
} else if (hex_value <= 0x7ff) {
|
||||
// 00000000 00000000 00000xxx xx000000
|
||||
buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
|
||||
} else if (hex_value <= 0xffff) {
|
||||
// 00000000 00000000 xxxx0000 00000000
|
||||
buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12)));
|
||||
// 00000000 00000000 0000xxxx xx000000
|
||||
buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
|
||||
} else if (hex_value <= 0x10ffff) {
|
||||
// 00000000 000xxx00 00000000 00000000
|
||||
buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18)));
|
||||
// 00000000 000000xx xxxx0000 00000000
|
||||
buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f)));
|
||||
// 00000000 00000000 0000xxxx xx000000
|
||||
buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
|
||||
} else {
|
||||
ast_error(pc, token, "unicode value out of range: %x", hex_value);
|
||||
}
|
||||
state = StateStart;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (c == '\n') {
|
||||
pos.line += 1;
|
||||
|
|
|
@ -103,6 +103,21 @@
|
|||
ALPHA: \
|
||||
case '_'
|
||||
|
||||
#define HEX_DIGIT \
|
||||
'a': \
|
||||
case 'b': \
|
||||
case 'c': \
|
||||
case 'd': \
|
||||
case 'e': \
|
||||
case 'f': \
|
||||
case 'A': \
|
||||
case 'B': \
|
||||
case 'C': \
|
||||
case 'D': \
|
||||
case 'E': \
|
||||
case 'F': \
|
||||
case DIGIT
|
||||
|
||||
const char * zig_keywords[] = {
|
||||
"true", "false", "null", "fn", "return", "var", "const", "extern",
|
||||
"pub", "export", "use", "if", "else", "goto", "asm",
|
||||
|
@ -132,11 +147,11 @@ enum TokenizeState {
|
|||
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
|
||||
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
|
||||
TokenizeStateString,
|
||||
TokenizeStateStringEscape,
|
||||
TokenizeStateRawString,
|
||||
TokenizeStateRawStringContents,
|
||||
TokenizeStateRawStringMaybeEnd,
|
||||
TokenizeStateCharLiteral,
|
||||
TokenizeStateCharLiteralEscape,
|
||||
TokenizeStateCharLiteralEnd,
|
||||
TokenizeStateSawStar,
|
||||
TokenizeStateSawSlash,
|
||||
|
@ -162,6 +177,7 @@ enum TokenizeState {
|
|||
TokenizeStateSawDotDot,
|
||||
TokenizeStateSawQuestionMark,
|
||||
TokenizeStateSawAtSign,
|
||||
TokenizeStateHex,
|
||||
TokenizeStateError,
|
||||
};
|
||||
|
||||
|
@ -179,6 +195,7 @@ struct Tokenize {
|
|||
int raw_string_id_start;
|
||||
int raw_string_id_end;
|
||||
int raw_string_id_cmp_pos;
|
||||
int hex_chars_left;
|
||||
};
|
||||
|
||||
__attribute__ ((format (printf, 2, 3)))
|
||||
|
@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
case '\n':
|
||||
tokenize_error(&t, "use raw string for multiline string literal");
|
||||
break;
|
||||
case '\\':
|
||||
t.state = TokenizeStateStringEscape;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateStringEscape:
|
||||
switch (c) {
|
||||
case 'x':
|
||||
t.state = TokenizeStateHex;
|
||||
t.hex_chars_left = 2;
|
||||
break;
|
||||
case 'u':
|
||||
t.state = TokenizeStateHex;
|
||||
t.hex_chars_left = 4;
|
||||
break;
|
||||
case 'U':
|
||||
t.state = TokenizeStateHex;
|
||||
t.hex_chars_left = 6;
|
||||
break;
|
||||
case 'n':
|
||||
case 'r':
|
||||
case '\\':
|
||||
case 't':
|
||||
case '\'':
|
||||
case '"':
|
||||
if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
} else if (t.cur_tok->id == TokenIdStringLiteral) {
|
||||
t.state = TokenizeStateString;
|
||||
} else {
|
||||
zig_unreachable();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
tokenize_error(&t, "invalid character: '%c'", c);
|
||||
}
|
||||
break;
|
||||
case TokenizeStateHex:
|
||||
switch (c) {
|
||||
case HEX_DIGIT:
|
||||
t.hex_chars_left -= 1;
|
||||
if (t.hex_chars_left == 0) {
|
||||
if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
} else if (t.cur_tok->id == TokenIdStringLiteral) {
|
||||
t.state = TokenizeStateString;
|
||||
} else {
|
||||
zig_unreachable();
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
tokenize_error(&t, "invalid character: '%c'", c);
|
||||
}
|
||||
break;
|
||||
case TokenizeStateRawString:
|
||||
if (c == '(') {
|
||||
t.raw_string_id_end = t.pos;
|
||||
|
@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
t.state = TokenizeStateStart;
|
||||
break;
|
||||
case '\\':
|
||||
t.state = TokenizeStateCharLiteralEscape;
|
||||
t.state = TokenizeStateStringEscape;
|
||||
break;
|
||||
default:
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateCharLiteralEscape:
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
break;
|
||||
case TokenizeStateCharLiteralEnd:
|
||||
switch (c) {
|
||||
case '\'':
|
||||
|
@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
case TokenizeStateString:
|
||||
tokenize_error(&t, "unterminated string");
|
||||
break;
|
||||
case TokenizeStateStringEscape:
|
||||
case TokenizeStateHex:
|
||||
if (t.cur_tok->id == TokenIdStringLiteral) {
|
||||
tokenize_error(&t, "unterminated string");
|
||||
} else if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
tokenize_error(&t, "unterminated character literal");
|
||||
} else {
|
||||
zig_unreachable();
|
||||
}
|
||||
break;
|
||||
case TokenizeStateRawString:
|
||||
case TokenizeStateRawStringContents:
|
||||
case TokenizeStateRawStringMaybeEnd:
|
||||
tokenize_error(&t, "unterminated raw string");
|
||||
break;
|
||||
case TokenizeStateCharLiteral:
|
||||
case TokenizeStateCharLiteralEscape:
|
||||
case TokenizeStateCharLiteralEnd:
|
||||
tokenize_error(&t, "unterminated character literal");
|
||||
break;
|
||||
|
|
|
@ -1398,3 +1398,14 @@ fn test_take_address_of_parameter_noeval(f: f32) {
|
|||
fn array_mult_operator() {
|
||||
assert(str.eql("ab" ** 5, "ababababab"));
|
||||
}
|
||||
|
||||
#attribute("test")
|
||||
fn string_escapes() {
|
||||
assert(str.eql("\"", "\x22"));
|
||||
assert(str.eql("\'", "\x27"));
|
||||
assert(str.eql("\n", "\x0a"));
|
||||
assert(str.eql("\r", "\x0d"));
|
||||
assert(str.eql("\t", "\x09"));
|
||||
assert(str.eql("\\", "\x5c"));
|
||||
assert(str.eql("\u1234\u0069", "\xe1\x88\xb4\x69"));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue