implement string escapes

2016-05-01 14:53:48 -07:00 · 2016-05-01 14:53:48 -07:00 · 9ccd0ba961
parent 037283c3b3
commit 9ccd0ba961
4 changed files with 180 additions and 34 deletions
--- a/doc/langref.md
+++ b/doc/langref.md
@ -272,10 +272,26 @@ Literal            Example       Characters   Escapes         Null Term  Type
 Byte               'H'           All ASCII    Byte            No         u8
 UTF-8 Bytes        "hello"       All Unicode  Byte & Unicode  No         [5]u8
 UTF-8 C string     c"hello"      All Unicode  Byte & Unicode  Yes        &const u8
-UTF-8 Raw String   r"A(hello)A"  All Unicode  None            No         [5]u8
-UTF-8 Raw C String rc"A(hello)A" All Unicode  None            Yes        &const u8
+UTF-8 Raw String   r"X(hello)X"  All Unicode  None            No         [5]u8
+UTF-8 Raw C String rc"X(hello)X" All Unicode  None            Yes        &const u8
 ```

+### Escapes
+
+ Escape   | Name
+----------|-------------------------------------------------------------------
+ \n       | Newline
+ \r       | Carriage Return
+ \t       | Tab
+ \\       | Backslash
+ \'       | Single Quote
+ \"       | Double Quote
+ \xNN     | hexadecimal 8-bit character code (2 digits)
+ \uNNNN   | hexadecimal 16-bit Unicode character code, UTF-8 encoded (4 digits)
+ \UNNNNNN | hexadecimal 24-bit Unicode character code, UTF-8 encoded (6 digits)
+
+Note that the maximum valid Unicode code point is 0x10ffff.
+
 ##### Raw Strings

 Raw string literals have no escapes and can span across multiple lines. To
@ -283,25 +299,6 @@ start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('.
 To end a raw string, use ')' followed by the same unique bytes, followed by '"'.


-```
-Escape  Name
-
-\xNN    hexadecimal 8-bit character code (exactly 2 digits)
-\n      Newline
-\r      Carriage return
-\t      Tab
-\\      Backslash
-\0      Null
-\'      Single quote
-\"      Double quote
-```
-
-### Unicode Escapes
-
- Escape     | Name
------------|-----------------------------------------------
- \u{NNNNNN} | hexadecimal 24-bit Unicode character code (up to 6 digits)
-
 #### Numeric Literals

 ```
--- a/src/parser.cpp
+++ b/src/parser.cpp
@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
    return return_value;
 }

-static int get_hex_digit(uint8_t c) {
+static uint32_t get_hex_digit(uint8_t c) {
    switch (c) {
        case '0': return 0;
        case '1': return 1;
@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) {
        case 'F':
            return 15;
        default:
-            return -1;
+            return UINT32_MAX;
    }
 }

@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
        StateEscape,
        StateHex1,
        StateHex2,
+        StateUnicode,
    };

    buf_resize(buf, 0);

+    int unicode_index;
+    int unicode_end;
+
    State state = StatePre;
    SrcPos pos = {token->start_line, token->start_column};
-    int hex_value = 0;
+    uint32_t hex_value = 0;
    for (int i = token->start_pos; i < token->end_pos - 1; i += 1) {
        uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i);

@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
                        if (offset_map) offset_map->append(pos);
                        state = StateStart;
                        break;
+                    case '\'':
+                        buf_append_char(buf, '\'');
+                        if (offset_map) offset_map->append(pos);
+                        state = StateStart;
+                        break;
                    case 'x':
                        state = StateHex1;
                        break;
+                    case 'u':
+                        state = StateUnicode;
+                        unicode_index = 0;
+                        unicode_end = 4;
+                        hex_value = 0;
+                        break;
+                    case 'U':
+                        state = StateUnicode;
+                        unicode_index = 0;
+                        unicode_end = 6;
+                        hex_value = 0;
+                        break;
                    default:
                        ast_error(pc, token, "invalid escape character");
                }
                break;
            case StateHex1:
                {
-                    int hex_digit = get_hex_digit(c);
-                    if (hex_digit == -1) {
+                    uint32_t hex_digit = get_hex_digit(c);
+                    if (hex_digit == UINT32_MAX) {
                        ast_error(pc, token, "invalid hex digit: '%c'", c);
                    }
                    hex_value = hex_digit * 16;
@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
                }
            case StateHex2:
                {
-                    int hex_digit = get_hex_digit(c);
-                    if (hex_digit == -1) {
+                    uint32_t hex_digit = get_hex_digit(c);
+                    if (hex_digit == UINT32_MAX) {
                        ast_error(pc, token, "invalid hex digit: '%c'", c);
                    }
                    hex_value += hex_digit;
@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
                    state = StateStart;
                    break;
                }
+            case StateUnicode:
+                {
+                    uint32_t hex_digit = get_hex_digit(c);
+                    if (hex_digit == UINT32_MAX) {
+                        ast_error(pc, token, "invalid hex digit: '%c'", c);
+                    }
+                    hex_value *= 16;
+                    hex_value += hex_digit;
+                    unicode_index += 1;
+                    if (unicode_index >= unicode_end) {
+                        if (hex_value <= 0x7f) {
+                            // 00000000 00000000 00000000 0xxxxxxx
+                            buf_append_char(buf, hex_value);
+                        } else if (hex_value <= 0x7ff) {
+                            // 00000000 00000000 00000xxx xx000000
+                            buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+                        } else if (hex_value <= 0xffff) {
+                            // 00000000 00000000 xxxx0000 00000000
+                            buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+                        } else if (hex_value <= 0x10ffff) {
+                            // 00000000 000xxx00 00000000 00000000
+                            buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18)));
+                            // 00000000 000000xx xxxx0000 00000000
+                            buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+                        } else {
+                            ast_error(pc, token, "unicode value out of range: %x", hex_value);
+                        }
+                        state = StateStart;
+                    }
+                    break;
+                }
        }
        if (c == '\n') {
            pos.line += 1;
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -103,6 +103,21 @@
    ALPHA: \
    case '_'

+// Case-label list for hexadecimal digits: the letters a-f and A-F, plus the
+// labels supplied by DIGIT (defined outside this hunk; presumably '0'-'9').
+// The leading `case` keyword is intentionally omitted so the macro is written
+// at the use site as `case HEX_DIGIT:`.
+#define HEX_DIGIT \
+         'a': \
+    case 'b': \
+    case 'c': \
+    case 'd': \
+    case 'e': \
+    case 'f': \
+    case 'A': \
+    case 'B': \
+    case 'C': \
+    case 'D': \
+    case 'E': \
+    case 'F': \
+    case DIGIT
+
 const char * zig_keywords[] = {
    "true", "false", "null", "fn", "return", "var", "const", "extern",
    "pub", "export", "use", "if", "else", "goto", "asm",
@ -132,11 +147,11 @@ enum TokenizeState {
    TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
    TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
    TokenizeStateString,
+    TokenizeStateStringEscape,
    TokenizeStateRawString,
    TokenizeStateRawStringContents,
    TokenizeStateRawStringMaybeEnd,
    TokenizeStateCharLiteral,
-    TokenizeStateCharLiteralEscape,
    TokenizeStateCharLiteralEnd,
    TokenizeStateSawStar,
    TokenizeStateSawSlash,
@ -162,6 +177,7 @@ enum TokenizeState {
    TokenizeStateSawDotDot,
    TokenizeStateSawQuestionMark,
    TokenizeStateSawAtSign,
+    TokenizeStateHex,
    TokenizeStateError,
 };

@ -179,6 +195,7 @@ struct Tokenize {
    int raw_string_id_start;
    int raw_string_id_end;
    int raw_string_id_cmp_pos;
+    int hex_chars_left;
 };

 __attribute__ ((format (printf, 2, 3)))
@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) {
                    case '\n':
                        tokenize_error(&t, "use raw string for multiline string literal");
                        break;
+                    case '\\':
+                        t.state = TokenizeStateStringEscape;
+                        break;
                    default:
                        break;
                }
                break;
+            case TokenizeStateStringEscape:
+                switch (c) {
+                    case 'x':
+                        t.state = TokenizeStateHex;
+                        t.hex_chars_left = 2;
+                        break;
+                    case 'u':
+                        t.state = TokenizeStateHex;
+                        t.hex_chars_left = 4;
+                        break;
+                    case 'U':
+                        t.state = TokenizeStateHex;
+                        t.hex_chars_left = 6;
+                        break;
+                    case 'n':
+                    case 'r':
+                    case '\\':
+                    case 't':
+                    case '\'':
+                    case '"':
+                        if (t.cur_tok->id == TokenIdCharLiteral) {
+                            t.state = TokenizeStateCharLiteralEnd;
+                        } else if (t.cur_tok->id == TokenIdStringLiteral) {
+                            t.state = TokenizeStateString;
+                        } else {
+                            zig_unreachable();
+                        }
+                        break;
+                    default:
+                        tokenize_error(&t, "invalid character: '%c'", c);
+                }
+                break;
+            case TokenizeStateHex:
+                switch (c) {
+                    case HEX_DIGIT:
+                        t.hex_chars_left -= 1;
+                        if (t.hex_chars_left == 0) {
+                            if (t.cur_tok->id == TokenIdCharLiteral) {
+                                t.state = TokenizeStateCharLiteralEnd;
+                            } else if (t.cur_tok->id == TokenIdStringLiteral) {
+                                t.state = TokenizeStateString;
+                            } else {
+                                zig_unreachable();
+                            }
+                        }
+                        break;
+                    default:
+                        tokenize_error(&t, "invalid character: '%c'", c);
+                }
+                break;
            case TokenizeStateRawString:
                if (c == '(') {
                    t.raw_string_id_end = t.pos;
@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) {
                        t.state = TokenizeStateStart;
                        break;
                    case '\\':
-                        t.state = TokenizeStateCharLiteralEscape;
+                        t.state = TokenizeStateStringEscape;
                        break;
                    default:
                        t.state = TokenizeStateCharLiteralEnd;
                        break;
                }
                break;
-            case TokenizeStateCharLiteralEscape:
-                t.state = TokenizeStateCharLiteralEnd;
-                break;
            case TokenizeStateCharLiteralEnd:
                switch (c) {
                    case '\'':
@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) {
        case TokenizeStateString:
            tokenize_error(&t, "unterminated string");
            break;
+        case TokenizeStateStringEscape:
+        case TokenizeStateHex:
+            if (t.cur_tok->id == TokenIdStringLiteral) {
+                tokenize_error(&t, "unterminated string");
+            } else if (t.cur_tok->id == TokenIdCharLiteral) {
+                tokenize_error(&t, "unterminated character literal");
+            } else {
+                zig_unreachable();
+            }
+            break;
        case TokenizeStateRawString:
        case TokenizeStateRawStringContents:
        case TokenizeStateRawStringMaybeEnd:
            tokenize_error(&t, "unterminated raw string");
            break;
        case TokenizeStateCharLiteral:
-        case TokenizeStateCharLiteralEscape:
        case TokenizeStateCharLiteralEnd:
            tokenize_error(&t, "unterminated character literal");
            break;
--- a/test/self_hosted.zig
+++ b/test/self_hosted.zig
@ -1398,3 +1398,14 @@ fn test_take_address_of_parameter_noeval(f: f32) {
 fn array_mult_operator() {
    assert(str.eql("ab" ** 5, "ababababab"));
 }
+
+// Checks that each named string escape produces the same bytes as the
+// equivalent \xNN spelling, and that \uNNNN escapes are emitted as UTF-8.
+#attribute("test")
+fn string_escapes() {
+    assert(str.eql("\"", "\x22"));
+    assert(str.eql("\'", "\x27"));
+    assert(str.eql("\n", "\x0a"));
+    assert(str.eql("\r", "\x0d"));
+    assert(str.eql("\t", "\x09"));
+    assert(str.eql("\\", "\x5c"));
+    // \u1234 is expected as the three bytes e1 88 b4; \u0069 as the single byte 69.
+    assert(str.eql("\u1234\u0069", "\xe1\x88\xb4\x69"));
+}