diff --git a/doc/langref.md b/doc/langref.md index 46f8f7f8d..e5090db2c 100644 --- a/doc/langref.md +++ b/doc/langref.md @@ -267,13 +267,22 @@ from codegen. #### Character and String Literals ``` -Literal Example Characters Escapes Null Term Type +Literal Example Characters Escapes Null Term Type -Byte 'H' All ASCII Byte No u8 -UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8 -UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8 +Byte 'H' All ASCII Byte No u8 +UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8 +UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8 +UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8 +UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8 ``` +##### Raw Strings + +Raw string literals have no escapes and can span across multiple lines. To +start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('. +To end a raw string, use ')' followed by the same unique bytes, followed by '"'. + + ``` Escape Name diff --git a/doc/vim/syntax/zig.vim b/doc/vim/syntax/zig.vim index 0877eb56a..9041a3437 100644 --- a/doc/vim/syntax/zig.vim +++ b/doc/vim/syntax/zig.vim @@ -51,7 +51,7 @@ syn match zigEscape display contained /\\\([nrt0\\'"]\|x\x\{2}\)/ syn match zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/ syn match zigEscapeUnicode display contained /\\u{\x\{1,6}}/ syn match zigStringContinuation display contained /\\\n\s*/ -syn region zigString start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell +syn region zigString start=+r\?c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell syn region zigString start='b\?r\z(#*\)"' end='"\z1' contains=@Spell let b:current_syntax = "zig" diff --git a/src/parser.cpp b/src/parser.cpp index f9115181d..14ac0fb91 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -226,6 +226,16 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) { static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str, ZigList *offset_map) { + if (token->raw_string_start > 0) { + uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos); + uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1); + assert(c1 == 'r'); + *out_c_str = (c2 == 'c'); + const char *str = buf_ptr(pc->buf) + token->raw_string_start; + buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start); + return; + } + // skip the double quotes at beginning and end // convert escape sequences // detect c string literal diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 17e31b0a1..516e666cd 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -30,7 +30,7 @@ '0': \ case DIGIT_NON_ZERO -#define ALPHA_EXCEPT_C \ +#define ALPHA_EXCEPT_CR \ 'a': \ case 'b': \ /*case 'c':*/ \ @@ -48,7 +48,7 @@ case 'o': \ case 'p': \ case 'q': \ - case 'r': \ + /*case 'r':*/ \ case 's': \ case 't': \ case 'u': \ @@ -85,11 +85,17 @@ case 'Z' #define ALPHA \ - ALPHA_EXCEPT_C: \ - case 'c' + ALPHA_EXCEPT_CR: \ + case 'c': \ + case 'r' #define SYMBOL_CHAR \ - ALPHA: \ + SYMBOL_CHAR_EXCEPT_C: \ + case 'c' + +#define SYMBOL_CHAR_EXCEPT_C \ + ALPHA_EXCEPT_CR: \ + case 'r': \ case DIGIT: \ case '_' @@ -118,12 +124,17 @@ enum TokenizeState { TokenizeStateStart, TokenizeStateSymbol, TokenizeStateSymbolFirst, + TokenizeStateSymbolFirstRaw, + TokenizeStateFirstR, TokenizeStateZero, // "0", which might lead to "0x" TokenizeStateNumber, // "123", "0x123" TokenizeStateFloatFraction, // "123.456", "0x123.456" TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p" TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5" TokenizeStateString, + TokenizeStateRawString, + TokenizeStateRawStringContents, + TokenizeStateRawStringMaybeEnd, TokenizeStateCharLiteral, TokenizeStateSawStar, TokenizeStateSawSlash, @@ -162,6 +173,9 @@ struct Tokenize { Token *cur_tok; int multi_line_comment_count; Tokenization *out; + int raw_string_id_start; + int raw_string_id_end; + int raw_string_id_cmp_pos; }; __attribute__ ((format (printf, 2, 3))) @@ -193,6 +207,8 @@ static void begin_token(Tokenize *t, TokenId id) { token->radix = 0; token->decimal_point_pos = 0; token->exponent_marker_pos = 0; + token->raw_string_start = 0; + token->raw_string_end = 0; t->cur_tok = token; } @@ -324,7 +340,11 @@ void tokenize(Buf *buf, Tokenization *out) { t.state = TokenizeStateSymbolFirst; begin_token(&t, TokenIdSymbol); break; - case ALPHA_EXCEPT_C: + case 'r': + t.state = TokenizeStateFirstR; + begin_token(&t, TokenIdSymbol); + break; + case ALPHA_EXCEPT_CR: case '_': t.state = TokenizeStateSymbol; begin_token(&t, TokenIdSymbol); @@ -821,6 +841,43 @@ void tokenize(Buf *buf, Tokenization *out) { continue; } break; + case TokenizeStateSymbolFirstRaw: + switch (c) { + case '"': + t.cur_tok->id = TokenIdStringLiteral; + t.state = TokenizeStateRawString; + t.raw_string_id_start = t.pos + 1; + break; + case SYMBOL_CHAR: + t.state = TokenizeStateSymbol; + break; + default: + t.pos -= 1; + end_token(&t); + t.state = TokenizeStateStart; + continue; + } + break; + case TokenizeStateFirstR: + switch (c) { + case '"': + t.cur_tok->id = TokenIdStringLiteral; + t.state = TokenizeStateRawString; + t.raw_string_id_start = t.pos + 1; + break; + case 'c': + t.state = TokenizeStateSymbolFirstRaw; + break; + case SYMBOL_CHAR_EXCEPT_C: + t.state = TokenizeStateSymbol; + break; + default: + t.pos -= 1; + end_token(&t); + t.state = TokenizeStateStart; + continue; + } + break; case TokenizeStateSymbol: switch (c) { case SYMBOL_CHAR: @@ -838,10 +895,44 @@ void tokenize(Buf *buf, Tokenization *out) { end_token(&t); t.state = TokenizeStateStart; break; + case '\n': + tokenize_error(&t, "use raw string for multiline string literal"); + break; default: break; } break; + case TokenizeStateRawString: + if (c == '(') { + t.raw_string_id_end = t.pos; + t.cur_tok->raw_string_start = t.pos + 1; + t.state = TokenizeStateRawStringContents; + } + break; + case TokenizeStateRawStringContents: + if (c == ')') { + t.state = TokenizeStateRawStringMaybeEnd; + t.raw_string_id_cmp_pos = t.raw_string_id_start; + t.cur_tok->raw_string_end = t.pos; + } + break; + case TokenizeStateRawStringMaybeEnd: + if (t.raw_string_id_cmp_pos >= t.raw_string_id_end && + c == '"') + { + end_token(&t); + t.state = TokenizeStateStart; + } else if (c != buf_ptr(t.buf)[t.raw_string_id_cmp_pos]) { + if (c == ')') { + t.raw_string_id_cmp_pos = t.raw_string_id_start; + t.cur_tok->raw_string_end = t.pos; + } else { + t.state = TokenizeStateRawStringContents; + } + } else { + t.raw_string_id_cmp_pos += 1; + } + break; case TokenizeStateCharLiteral: switch (c) { case '\'': @@ -1002,11 +1093,18 @@ void tokenize(Buf *buf, Tokenization *out) { case TokenizeStateString: tokenize_error(&t, "unterminated string"); break; + case TokenizeStateRawString: + case TokenizeStateRawStringContents: + case TokenizeStateRawStringMaybeEnd: + tokenize_error(&t, "unterminated raw string"); + break; case TokenizeStateCharLiteral: tokenize_error(&t, "unterminated character literal"); break; case TokenizeStateSymbol: case TokenizeStateSymbolFirst: + case TokenizeStateSymbolFirstRaw: + case TokenizeStateFirstR: case TokenizeStateZero: case TokenizeStateNumber: case TokenizeStateFloatFraction: diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 0eeda70ef..22767adbb 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -112,6 +112,10 @@ struct Token { int radix; // if != 10, then skip the first 2 characters int decimal_point_pos; // either exponent_marker_pos or the position of the '.' int exponent_marker_pos; // either end_pos or the position of the 'e'/'p' + + // for id == TokenIdStringLiteral + int raw_string_start; + int raw_string_end; }; struct Tokenization { diff --git a/test/run_tests.cpp b/test/run_tests.cpp index 92622b702..fcec983db 100644 --- a/test/run_tests.cpp +++ b/test/run_tests.cpp @@ -1770,6 +1770,12 @@ fn f() { const std = @import("std"); } )SOURCE", 1, ".tmp_source.zig:3:17: error: @import invalid inside function bodies"); + + + add_compile_fail_case("normal string with newline", R"SOURCE( +const foo = "a +b"; + )SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal"); } ////////////////////////////////////////////////////////////////////////////// diff --git a/test/self_hosted.zig b/test/self_hosted.zig index a475dad7e..ef8dcdef0 100644 --- a/test/self_hosted.zig +++ b/test/self_hosted.zig @@ -495,7 +495,31 @@ fn count_trailing_zeroes() { } +#attribute("test") +fn multiline_string() { + const s1 = r"AOEU( +one +two) +three)AOEU"; + const s2 = "\none\ntwo)\nthree"; + const s3 = r"( +one +two) +three)"; + assert(str_eql(s1, s2)); + assert(str_eql(s3, s2)); +} + + fn assert(b: bool) { if (!b) unreachable{} } + +fn str_eql(s1: []u8, s2: []u8) -> bool { + if (s1.len != s2.len) return false; + for (s1) |c, i| { + if (s2[i] != c) return false; + } + return true; +}