add multiline string literal
and make multiple lines in normal string literals an error
master
parent
5bae9ba086
commit
e144ddab24
|
@ -267,13 +267,22 @@ from codegen.
|
|||
|
||||
#### Character and String Literals
|
||||
```
|
||||
Literal Example Characters Escapes Null Term Type
|
||||
Literal Example Characters Escapes Null Term Type
|
||||
|
||||
Byte 'H' All ASCII Byte No u8
|
||||
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
|
||||
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
|
||||
Byte 'H' All ASCII Byte No u8
|
||||
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
|
||||
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
|
||||
UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8
|
||||
UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8
|
||||
```
|
||||
|
||||
##### Raw Strings
|
||||
|
||||
Raw string literals have no escapes and can span across multiple lines. To
|
||||
start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('.
|
||||
To end a raw string, use ')' followed by the same unique bytes, followed by '"'.
|
||||
|
||||
|
||||
```
|
||||
Escape Name
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ syn match zigEscape display contained /\\\([nrt0\\'"]\|x\x\{2}\)/
|
|||
syn match zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/
|
||||
syn match zigEscapeUnicode display contained /\\u{\x\{1,6}}/
|
||||
syn match zigStringContinuation display contained /\\\n\s*/
|
||||
syn region zigString start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
|
||||
syn region zigString start=+r\?c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
|
||||
syn region zigString start='b\?r\z(#*\)"' end='"\z1' contains=@Spell
|
||||
|
||||
let b:current_syntax = "zig"
|
||||
|
|
|
@ -226,6 +226,16 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
|
|||
static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str,
|
||||
ZigList<SrcPos> *offset_map)
|
||||
{
|
||||
if (token->raw_string_start > 0) {
|
||||
uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos);
|
||||
uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1);
|
||||
assert(c1 == 'r');
|
||||
*out_c_str = (c2 == 'c');
|
||||
const char *str = buf_ptr(pc->buf) + token->raw_string_start;
|
||||
buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start);
|
||||
return;
|
||||
}
|
||||
|
||||
// skip the double quotes at beginning and end
|
||||
// convert escape sequences
|
||||
// detect c string literal
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
'0': \
|
||||
case DIGIT_NON_ZERO
|
||||
|
||||
#define ALPHA_EXCEPT_C \
|
||||
#define ALPHA_EXCEPT_CR \
|
||||
'a': \
|
||||
case 'b': \
|
||||
/*case 'c':*/ \
|
||||
|
@ -48,7 +48,7 @@
|
|||
case 'o': \
|
||||
case 'p': \
|
||||
case 'q': \
|
||||
case 'r': \
|
||||
/*case 'r':*/ \
|
||||
case 's': \
|
||||
case 't': \
|
||||
case 'u': \
|
||||
|
@ -85,11 +85,17 @@
|
|||
case 'Z'
|
||||
|
||||
#define ALPHA \
|
||||
ALPHA_EXCEPT_C: \
|
||||
case 'c'
|
||||
ALPHA_EXCEPT_CR: \
|
||||
case 'c': \
|
||||
case 'r'
|
||||
|
||||
#define SYMBOL_CHAR \
|
||||
ALPHA: \
|
||||
SYMBOL_CHAR_EXCEPT_C: \
|
||||
case 'c'
|
||||
|
||||
#define SYMBOL_CHAR_EXCEPT_C \
|
||||
ALPHA_EXCEPT_CR: \
|
||||
case 'r': \
|
||||
case DIGIT: \
|
||||
case '_'
|
||||
|
||||
|
@ -118,12 +124,17 @@ enum TokenizeState {
|
|||
TokenizeStateStart,
|
||||
TokenizeStateSymbol,
|
||||
TokenizeStateSymbolFirst,
|
||||
TokenizeStateSymbolFirstRaw,
|
||||
TokenizeStateFirstR,
|
||||
TokenizeStateZero, // "0", which might lead to "0x"
|
||||
TokenizeStateNumber, // "123", "0x123"
|
||||
TokenizeStateFloatFraction, // "123.456", "0x123.456"
|
||||
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
|
||||
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
|
||||
TokenizeStateString,
|
||||
TokenizeStateRawString,
|
||||
TokenizeStateRawStringContents,
|
||||
TokenizeStateRawStringMaybeEnd,
|
||||
TokenizeStateCharLiteral,
|
||||
TokenizeStateSawStar,
|
||||
TokenizeStateSawSlash,
|
||||
|
@ -162,6 +173,9 @@ struct Tokenize {
|
|||
Token *cur_tok;
|
||||
int multi_line_comment_count;
|
||||
Tokenization *out;
|
||||
int raw_string_id_start;
|
||||
int raw_string_id_end;
|
||||
int raw_string_id_cmp_pos;
|
||||
};
|
||||
|
||||
__attribute__ ((format (printf, 2, 3)))
|
||||
|
@ -193,6 +207,8 @@ static void begin_token(Tokenize *t, TokenId id) {
|
|||
token->radix = 0;
|
||||
token->decimal_point_pos = 0;
|
||||
token->exponent_marker_pos = 0;
|
||||
token->raw_string_start = 0;
|
||||
token->raw_string_end = 0;
|
||||
t->cur_tok = token;
|
||||
}
|
||||
|
||||
|
@ -324,7 +340,11 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
t.state = TokenizeStateSymbolFirst;
|
||||
begin_token(&t, TokenIdSymbol);
|
||||
break;
|
||||
case ALPHA_EXCEPT_C:
|
||||
case 'r':
|
||||
t.state = TokenizeStateFirstR;
|
||||
begin_token(&t, TokenIdSymbol);
|
||||
break;
|
||||
case ALPHA_EXCEPT_CR:
|
||||
case '_':
|
||||
t.state = TokenizeStateSymbol;
|
||||
begin_token(&t, TokenIdSymbol);
|
||||
|
@ -821,6 +841,43 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
continue;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateSymbolFirstRaw:
|
||||
switch (c) {
|
||||
case '"':
|
||||
t.cur_tok->id = TokenIdStringLiteral;
|
||||
t.state = TokenizeStateRawString;
|
||||
t.raw_string_id_start = t.pos + 1;
|
||||
break;
|
||||
case SYMBOL_CHAR:
|
||||
t.state = TokenizeStateSymbol;
|
||||
break;
|
||||
default:
|
||||
t.pos -= 1;
|
||||
end_token(&t);
|
||||
t.state = TokenizeStateStart;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateFirstR:
|
||||
switch (c) {
|
||||
case '"':
|
||||
t.cur_tok->id = TokenIdStringLiteral;
|
||||
t.state = TokenizeStateRawString;
|
||||
t.raw_string_id_start = t.pos + 1;
|
||||
break;
|
||||
case 'c':
|
||||
t.state = TokenizeStateSymbolFirstRaw;
|
||||
break;
|
||||
case SYMBOL_CHAR_EXCEPT_C:
|
||||
t.state = TokenizeStateSymbol;
|
||||
break;
|
||||
default:
|
||||
t.pos -= 1;
|
||||
end_token(&t);
|
||||
t.state = TokenizeStateStart;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateSymbol:
|
||||
switch (c) {
|
||||
case SYMBOL_CHAR:
|
||||
|
@ -838,10 +895,44 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
end_token(&t);
|
||||
t.state = TokenizeStateStart;
|
||||
break;
|
||||
case '\n':
|
||||
tokenize_error(&t, "use raw string for multiline string literal");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateRawString:
|
||||
if (c == '(') {
|
||||
t.raw_string_id_end = t.pos;
|
||||
t.cur_tok->raw_string_start = t.pos + 1;
|
||||
t.state = TokenizeStateRawStringContents;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateRawStringContents:
|
||||
if (c == ')') {
|
||||
t.state = TokenizeStateRawStringMaybeEnd;
|
||||
t.raw_string_id_cmp_pos = t.raw_string_id_start;
|
||||
t.cur_tok->raw_string_end = t.pos;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateRawStringMaybeEnd:
|
||||
if (t.raw_string_id_cmp_pos >= t.raw_string_id_end &&
|
||||
c == '"')
|
||||
{
|
||||
end_token(&t);
|
||||
t.state = TokenizeStateStart;
|
||||
} else if (c != buf_ptr(t.buf)[t.raw_string_id_cmp_pos]) {
|
||||
if (c == ')') {
|
||||
t.raw_string_id_cmp_pos = t.raw_string_id_start;
|
||||
t.cur_tok->raw_string_end = t.pos;
|
||||
} else {
|
||||
t.state = TokenizeStateRawStringContents;
|
||||
}
|
||||
} else {
|
||||
t.raw_string_id_cmp_pos += 1;
|
||||
}
|
||||
break;
|
||||
case TokenizeStateCharLiteral:
|
||||
switch (c) {
|
||||
case '\'':
|
||||
|
@ -1002,11 +1093,18 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||
case TokenizeStateString:
|
||||
tokenize_error(&t, "unterminated string");
|
||||
break;
|
||||
case TokenizeStateRawString:
|
||||
case TokenizeStateRawStringContents:
|
||||
case TokenizeStateRawStringMaybeEnd:
|
||||
tokenize_error(&t, "unterminated raw string");
|
||||
break;
|
||||
case TokenizeStateCharLiteral:
|
||||
tokenize_error(&t, "unterminated character literal");
|
||||
break;
|
||||
case TokenizeStateSymbol:
|
||||
case TokenizeStateSymbolFirst:
|
||||
case TokenizeStateSymbolFirstRaw:
|
||||
case TokenizeStateFirstR:
|
||||
case TokenizeStateZero:
|
||||
case TokenizeStateNumber:
|
||||
case TokenizeStateFloatFraction:
|
||||
|
|
|
@ -112,6 +112,10 @@ struct Token {
|
|||
int radix; // if != 10, then skip the first 2 characters
|
||||
int decimal_point_pos; // either exponent_marker_pos or the position of the '.'
|
||||
int exponent_marker_pos; // either end_pos or the position of the 'e'/'p'
|
||||
|
||||
// for id == TokenIdStringLiteral
|
||||
int raw_string_start;
|
||||
int raw_string_end;
|
||||
};
|
||||
|
||||
struct Tokenization {
|
||||
|
|
|
@ -1770,6 +1770,12 @@ fn f() {
|
|||
const std = @import("std");
|
||||
}
|
||||
)SOURCE", 1, ".tmp_source.zig:3:17: error: @import invalid inside function bodies");
|
||||
|
||||
|
||||
add_compile_fail_case("normal string with newline", R"SOURCE(
|
||||
const foo = "a
|
||||
b";
|
||||
)SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal");
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -495,7 +495,31 @@ fn count_trailing_zeroes() {
|
|||
}
|
||||
|
||||
|
||||
#attribute("test")
|
||||
fn multiline_string() {
|
||||
const s1 = r"AOEU(
|
||||
one
|
||||
two)
|
||||
three)AOEU";
|
||||
const s2 = "\none\ntwo)\nthree";
|
||||
const s3 = r"(
|
||||
one
|
||||
two)
|
||||
three)";
|
||||
assert(str_eql(s1, s2));
|
||||
assert(str_eql(s3, s2));
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn assert(b: bool) {
|
||||
if (!b) unreachable{}
|
||||
}
|
||||
|
||||
fn str_eql(s1: []u8, s2: []u8) -> bool {
|
||||
if (s1.len != s2.len) return false;
|
||||
for (s1) |c, i| {
|
||||
if (s2[i] != c) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue