add multiline string literal

and make multiple lines in normal string literals an error
2016-04-03 18:44:17 -07:00 · 2016-04-03 18:44:17 -07:00 · e144ddab24
parent 5bae9ba086
commit e144ddab24
7 changed files with 162 additions and 11 deletions
--- a/doc/langref.md
+++ b/doc/langref.md
@ -267,13 +267,22 @@ from codegen.

 #### Character and String Literals
 ```
-Literal         Example   Characters   Escapes         Null Term  Type
+Literal            Example       Characters   Escapes         Null Term  Type

-Byte            'H'       All ASCII    Byte            No         u8
-UTF-8 Bytes     "hello"   All Unicode  Byte & Unicode  No         [5]u8
-UTF-8 C string  c"hello"  All Unicode  Byte & Unicode  Yes        &const u8
+Byte               'H'           All ASCII    Byte            No         u8
+UTF-8 Bytes        "hello"       All Unicode  Byte & Unicode  No         [5]u8
+UTF-8 C string     c"hello"      All Unicode  Byte & Unicode  Yes        &const u8
+UTF-8 Raw String   r"A(hello)A"  All Unicode  None            No         [5]u8
+UTF-8 Raw C String rc"A(hello)A" All Unicode  None            Yes        &const u8
 ```

+##### Raw Strings
+
+Raw string literals have no escapes and can span multiple lines. To start a
+raw string, use 'r"' or 'rc"' followed by a delimiter (zero or more bytes, none
+of which is '('), followed by '('. To end a raw string, use ')' followed by
+the same delimiter, followed by '"'.
+
+
 ```
 Escape  Name

--- a/doc/vim/syntax/zig.vim
+++ b/doc/vim/syntax/zig.vim
@ -51,7 +51,7 @@ syn match     zigEscape        display contained /\\\([nrt0\\'"]\|x\x\{2}\)/
 syn match     zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/
 syn match     zigEscapeUnicode display contained /\\u{\x\{1,6}}/
 syn match     zigStringContinuation display contained /\\\n\s*/
-syn region    zigString      start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
+syn region    zigString      start=+r\?c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
 syn region    zigString      start='b\?r\z(#*\)"' end='"\z1' contains=@Spell

 let b:current_syntax = "zig"
--- a/src/parser.cpp
+++ b/src/parser.cpp
@ -226,6 +226,16 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
 static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str,
        ZigList<SrcPos> *offset_map)
 {
+    if (token->raw_string_start > 0) {
+        uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos);
+        uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1);
+        assert(c1 == 'r');
+        *out_c_str = (c2 == 'c');
+        const char *str = buf_ptr(pc->buf) + token->raw_string_start;
+        buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start);
+        return;
+    }
+
    // skip the double quotes at beginning and end
    // convert escape sequences
    // detect c string literal
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -30,7 +30,7 @@
         '0': \
    case DIGIT_NON_ZERO

-#define ALPHA_EXCEPT_C \
+#define ALPHA_EXCEPT_CR \
         'a': \
    case 'b': \
  /*case 'c':*/ \
@ -48,7 +48,7 @@
    case 'o': \
    case 'p': \
    case 'q': \
-    case 'r': \
+  /*case 'r':*/ \
    case 's': \
    case 't': \
    case 'u': \
@ -85,11 +85,17 @@
    case 'Z'

 #define ALPHA \
-    ALPHA_EXCEPT_C: \
-    case 'c'
+    ALPHA_EXCEPT_CR: \
+    case 'c': \
+    case 'r'

 #define SYMBOL_CHAR \
-    ALPHA: \
+    SYMBOL_CHAR_EXCEPT_C: \
+    case 'c'
+
+#define SYMBOL_CHAR_EXCEPT_C \
+    ALPHA_EXCEPT_CR: \
+    case 'r': \
    case DIGIT: \
    case '_'

@ -118,12 +124,17 @@ enum TokenizeState {
    TokenizeStateStart,
    TokenizeStateSymbol,
    TokenizeStateSymbolFirst,
+    TokenizeStateSymbolFirstRaw,
+    TokenizeStateFirstR,
    TokenizeStateZero, // "0", which might lead to "0x"
    TokenizeStateNumber, // "123", "0x123"
    TokenizeStateFloatFraction, // "123.456", "0x123.456"
    TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
    TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
    TokenizeStateString,
+    TokenizeStateRawString,
+    TokenizeStateRawStringContents,
+    TokenizeStateRawStringMaybeEnd,
    TokenizeStateCharLiteral,
    TokenizeStateSawStar,
    TokenizeStateSawSlash,
@ -162,6 +173,9 @@ struct Tokenize {
    Token *cur_tok;
    int multi_line_comment_count;
    Tokenization *out;
+    int raw_string_id_start;
+    int raw_string_id_end;
+    int raw_string_id_cmp_pos;
 };

 __attribute__ ((format (printf, 2, 3)))
@ -193,6 +207,8 @@ static void begin_token(Tokenize *t, TokenId id) {
    token->radix = 0;
    token->decimal_point_pos = 0;
    token->exponent_marker_pos = 0;
+    token->raw_string_start = 0;
+    token->raw_string_end = 0;
    t->cur_tok = token;
 }

@ -324,7 +340,11 @@ void tokenize(Buf *buf, Tokenization *out) {
                        t.state = TokenizeStateSymbolFirst;
                        begin_token(&t, TokenIdSymbol);
                        break;
-                    case ALPHA_EXCEPT_C:
+                    case 'r':
+                        t.state = TokenizeStateFirstR;
+                        begin_token(&t, TokenIdSymbol);
+                        break;
+                    case ALPHA_EXCEPT_CR:
                    case '_':
                        t.state = TokenizeStateSymbol;
                        begin_token(&t, TokenIdSymbol);
@ -821,6 +841,43 @@ void tokenize(Buf *buf, Tokenization *out) {
                        continue;
                }
                break;
+            case TokenizeStateSymbolFirstRaw:
+                switch (c) {
+                    case '"':
+                        t.cur_tok->id = TokenIdStringLiteral;
+                        t.state = TokenizeStateRawString;
+                        t.raw_string_id_start = t.pos + 1;
+                        break;
+                    case SYMBOL_CHAR:
+                        t.state = TokenizeStateSymbol;
+                        break;
+                    default:
+                        t.pos -= 1;
+                        end_token(&t);
+                        t.state = TokenizeStateStart;
+                        continue;
+                }
+                break;
+            case TokenizeStateFirstR:
+                switch (c) {
+                    case '"':
+                        t.cur_tok->id = TokenIdStringLiteral;
+                        t.state = TokenizeStateRawString;
+                        t.raw_string_id_start = t.pos + 1;
+                        break;
+                    case 'c':
+                        t.state = TokenizeStateSymbolFirstRaw;
+                        break;
+                    case SYMBOL_CHAR_EXCEPT_C:
+                        t.state = TokenizeStateSymbol;
+                        break;
+                    default:
+                        t.pos -= 1;
+                        end_token(&t);
+                        t.state = TokenizeStateStart;
+                        continue;
+                }
+                break;
            case TokenizeStateSymbol:
                switch (c) {
                    case SYMBOL_CHAR:
@ -838,10 +895,44 @@ void tokenize(Buf *buf, Tokenization *out) {
                        end_token(&t);
                        t.state = TokenizeStateStart;
                        break;
+                    case '\n':
+                        tokenize_error(&t, "use raw string for multiline string literal");
+                        break;
                    default:
                        break;
                }
                break;
+            case TokenizeStateRawString:
+                if (c == '(') {
+                    t.raw_string_id_end = t.pos;
+                    t.cur_tok->raw_string_start = t.pos + 1;
+                    t.state = TokenizeStateRawStringContents;
+                }
+                break;
+            case TokenizeStateRawStringContents:
+                if (c == ')') {
+                    t.state = TokenizeStateRawStringMaybeEnd;
+                    t.raw_string_id_cmp_pos = t.raw_string_id_start;
+                    t.cur_tok->raw_string_end = t.pos;
+                }
+                break;
+            case TokenizeStateRawStringMaybeEnd:
+                // A ')' was seen; the bytes after it must repeat the delimiter
+                // stored at [raw_string_id_start, raw_string_id_end) and then
+                // be followed by '"'. Never compare past raw_string_id_end:
+                // the bytes there are the opening '(' and the string contents,
+                // not part of the delimiter.
+                if (t.raw_string_id_cmp_pos < t.raw_string_id_end &&
+                    c == buf_ptr(t.buf)[t.raw_string_id_cmp_pos])
+                {
+                    // still matching the delimiter
+                    t.raw_string_id_cmp_pos += 1;
+                } else if (t.raw_string_id_cmp_pos >= t.raw_string_id_end && c == '"') {
+                    // whole delimiter matched and the closing quote follows
+                    end_token(&t);
+                    t.state = TokenizeStateStart;
+                } else if (c == ')') {
+                    // mismatch, but this ')' may itself begin the terminator
+                    t.raw_string_id_cmp_pos = t.raw_string_id_start;
+                    t.cur_tok->raw_string_end = t.pos;
+                } else {
+                    // mismatch: the earlier ')' was ordinary contents
+                    t.state = TokenizeStateRawStringContents;
+                }
+                break;
            case TokenizeStateCharLiteral:
                switch (c) {
                    case '\'':
@ -1002,11 +1093,18 @@ void tokenize(Buf *buf, Tokenization *out) {
        case TokenizeStateString:
            tokenize_error(&t, "unterminated string");
            break;
+        case TokenizeStateRawString:
+        case TokenizeStateRawStringContents:
+        case TokenizeStateRawStringMaybeEnd:
+            tokenize_error(&t, "unterminated raw string");
+            break;
        case TokenizeStateCharLiteral:
            tokenize_error(&t, "unterminated character literal");
            break;
        case TokenizeStateSymbol:
        case TokenizeStateSymbolFirst:
+        case TokenizeStateSymbolFirstRaw:
+        case TokenizeStateFirstR:
        case TokenizeStateZero:
        case TokenizeStateNumber:
        case TokenizeStateFloatFraction:
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@ -112,6 +112,10 @@ struct Token {
    int radix; // if != 10, then skip the first 2 characters
    int decimal_point_pos; // either exponent_marker_pos or the position of the '.'
    int exponent_marker_pos; // either end_pos or the position of the 'e'/'p'
+
+    // for id == TokenIdStringLiteral
+    int raw_string_start;
+    int raw_string_end;
 };

 struct Tokenization {
--- a/test/run_tests.cpp
+++ b/test/run_tests.cpp
@ -1770,6 +1770,12 @@ fn f() {
    const std = @import("std");
 }
    )SOURCE", 1, ".tmp_source.zig:3:17: error: @import invalid inside function bodies");
+
+
+    add_compile_fail_case("normal string with newline", R"SOURCE(
+const foo = "a
+b";
+    )SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal");
 }

 //////////////////////////////////////////////////////////////////////////////
--- a/test/self_hosted.zig
+++ b/test/self_hosted.zig
@ -495,7 +495,31 @@ fn count_trailing_zeroes() {
 }


+#attribute("test")
+fn multiline_string() { // raw strings must keep newlines and an inner ')' verbatim
+    const s1 = r"AOEU(
+one
+two)
+three)AOEU";
+    const s2 = "\none\ntwo)\nthree"; // the same bytes spelled with escapes
+    const s3 = r"(
+one
+two)
+three)";
+    assert(str_eql(s1, s2)); // raw string using the delimiter AOEU
+    assert(str_eql(s3, s2)); // raw string using an empty delimiter
+}
+
+

 fn assert(b: bool) { // test helper: reaches `unreachable` when b is false
     if (!b) unreachable{}
 }
+
+fn str_eql(s1: []u8, s2: []u8) -> bool { // true when s1 and s2 hold the same bytes
+    if (s1.len != s2.len) return false; // different lengths can never be equal
+    for (s1) |c, i| { // c is used as the element of s1 and i as its index
+        if (s2[i] != c) return false;
+    }
+    return true;
+}