add multiline string literal

and make multiple lines in normal string literals an error
master
Andrew Kelley 2016-04-03 18:44:17 -07:00
parent 5bae9ba086
commit e144ddab24
7 changed files with 162 additions and 11 deletions

View File

@ -267,13 +267,22 @@ from codegen.
#### Character and String Literals
```
Literal Example Characters Escapes Null Term Type
Literal Example Characters Escapes Null Term Type
Byte 'H' All ASCII Byte No u8
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
Byte 'H' All ASCII Byte No u8
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8
UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8
```
##### Raw Strings
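Raw string literals have no escapes and can span multiple lines. To
start a raw string, use 'r"' or 'rc"' followed by a delimiter (any bytes of
your choosing, possibly none) followed by '('.
To end a raw string, use ')' followed by the same delimiter, followed by '"'.
The first occurrence of that closing sequence ends the literal, so pick a
delimiter that makes the closing sequence absent from the string contents.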
```
Escape Name

View File

@ -51,7 +51,7 @@ syn match zigEscape display contained /\\\([nrt0\\'"]\|x\x\{2}\)/
syn match zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/
syn match zigEscapeUnicode display contained /\\u{\x\{1,6}}/
syn match zigStringContinuation display contained /\\\n\s*/
syn region zigString start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
syn region zigString start=+r\?c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
" Raw strings: r"ID( ... )ID" or rc"ID( ... )ID". No escapes are recognized,
" the literal may span lines, and the closing ID must repeat the opening one
" (captured with \z(...\) and matched again with \z1).
syn region zigString start='rc\?"\z([^(]*\)(' end=')\z1"' contains=@Spell
let b:current_syntax = "zig"

View File

@ -226,6 +226,16 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str,
ZigList<SrcPos> *offset_map)
{
if (token->raw_string_start > 0) {
uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos);
uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1);
assert(c1 == 'r');
*out_c_str = (c2 == 'c');
const char *str = buf_ptr(pc->buf) + token->raw_string_start;
buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start);
return;
}
// skip the double quotes at beginning and end
// convert escape sequences
// detect c string literal

View File

@ -30,7 +30,7 @@
'0': \
case DIGIT_NON_ZERO
#define ALPHA_EXCEPT_C \
#define ALPHA_EXCEPT_CR \
'a': \
case 'b': \
/*case 'c':*/ \
@ -48,7 +48,7 @@
case 'o': \
case 'p': \
case 'q': \
case 'r': \
/*case 'r':*/ \
case 's': \
case 't': \
case 'u': \
@ -85,11 +85,17 @@
case 'Z'
#define ALPHA \
ALPHA_EXCEPT_C: \
case 'c'
ALPHA_EXCEPT_CR: \
case 'c': \
case 'r'
#define SYMBOL_CHAR \
ALPHA: \
SYMBOL_CHAR_EXCEPT_C: \
case 'c'
#define SYMBOL_CHAR_EXCEPT_C \
ALPHA_EXCEPT_CR: \
case 'r': \
case DIGIT: \
case '_'
@ -118,12 +124,17 @@ enum TokenizeState {
TokenizeStateStart,
TokenizeStateSymbol,
TokenizeStateSymbolFirst,
TokenizeStateSymbolFirstRaw,
TokenizeStateFirstR,
TokenizeStateZero, // "0", which might lead to "0x"
TokenizeStateNumber, // "123", "0x123"
TokenizeStateFloatFraction, // "123.456", "0x123.456"
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
TokenizeStateRawString,
TokenizeStateRawStringContents,
TokenizeStateRawStringMaybeEnd,
TokenizeStateCharLiteral,
TokenizeStateSawStar,
TokenizeStateSawSlash,
@ -162,6 +173,9 @@ struct Tokenize {
Token *cur_tok;
int multi_line_comment_count;
Tokenization *out;
int raw_string_id_start;
int raw_string_id_end;
int raw_string_id_cmp_pos;
};
__attribute__ ((format (printf, 2, 3)))
@ -193,6 +207,8 @@ static void begin_token(Tokenize *t, TokenId id) {
token->radix = 0;
token->decimal_point_pos = 0;
token->exponent_marker_pos = 0;
token->raw_string_start = 0;
token->raw_string_end = 0;
t->cur_tok = token;
}
@ -324,7 +340,11 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateSymbolFirst;
begin_token(&t, TokenIdSymbol);
break;
case ALPHA_EXCEPT_C:
case 'r':
t.state = TokenizeStateFirstR;
begin_token(&t, TokenIdSymbol);
break;
case ALPHA_EXCEPT_CR:
case '_':
t.state = TokenizeStateSymbol;
begin_token(&t, TokenIdSymbol);
@ -821,6 +841,43 @@ void tokenize(Buf *buf, Tokenization *out) {
continue;
}
break;
case TokenizeStateSymbolFirstRaw:
switch (c) {
case '"':
t.cur_tok->id = TokenIdStringLiteral;
t.state = TokenizeStateRawString;
t.raw_string_id_start = t.pos + 1;
break;
case SYMBOL_CHAR:
t.state = TokenizeStateSymbol;
break;
default:
t.pos -= 1;
end_token(&t);
t.state = TokenizeStateStart;
continue;
}
break;
case TokenizeStateFirstR:
switch (c) {
case '"':
t.cur_tok->id = TokenIdStringLiteral;
t.state = TokenizeStateRawString;
t.raw_string_id_start = t.pos + 1;
break;
case 'c':
t.state = TokenizeStateSymbolFirstRaw;
break;
case SYMBOL_CHAR_EXCEPT_C:
t.state = TokenizeStateSymbol;
break;
default:
t.pos -= 1;
end_token(&t);
t.state = TokenizeStateStart;
continue;
}
break;
case TokenizeStateSymbol:
switch (c) {
case SYMBOL_CHAR:
@ -838,10 +895,44 @@ void tokenize(Buf *buf, Tokenization *out) {
end_token(&t);
t.state = TokenizeStateStart;
break;
case '\n':
tokenize_error(&t, "use raw string for multiline string literal");
break;
default:
break;
}
break;
case TokenizeStateRawString:
if (c == '(') {
t.raw_string_id_end = t.pos;
t.cur_tok->raw_string_start = t.pos + 1;
t.state = TokenizeStateRawStringContents;
}
break;
case TokenizeStateRawStringContents:
if (c == ')') {
t.state = TokenizeStateRawStringMaybeEnd;
t.raw_string_id_cmp_pos = t.raw_string_id_start;
t.cur_tok->raw_string_end = t.pos;
}
break;
case TokenizeStateRawStringMaybeEnd:
if (t.raw_string_id_cmp_pos >= t.raw_string_id_end &&
c == '"')
{
end_token(&t);
t.state = TokenizeStateStart;
} else if (c != buf_ptr(t.buf)[t.raw_string_id_cmp_pos]) {
if (c == ')') {
t.raw_string_id_cmp_pos = t.raw_string_id_start;
t.cur_tok->raw_string_end = t.pos;
} else {
t.state = TokenizeStateRawStringContents;
}
} else {
t.raw_string_id_cmp_pos += 1;
}
break;
case TokenizeStateCharLiteral:
switch (c) {
case '\'':
@ -1002,11 +1093,18 @@ void tokenize(Buf *buf, Tokenization *out) {
case TokenizeStateString:
tokenize_error(&t, "unterminated string");
break;
case TokenizeStateRawString:
case TokenizeStateRawStringContents:
case TokenizeStateRawStringMaybeEnd:
tokenize_error(&t, "unterminated raw string");
break;
case TokenizeStateCharLiteral:
tokenize_error(&t, "unterminated character literal");
break;
case TokenizeStateSymbol:
case TokenizeStateSymbolFirst:
case TokenizeStateSymbolFirstRaw:
case TokenizeStateFirstR:
case TokenizeStateZero:
case TokenizeStateNumber:
case TokenizeStateFloatFraction:

View File

@ -112,6 +112,10 @@ struct Token {
int radix; // if != 10, then skip the first 2 characters
int decimal_point_pos; // either exponent_marker_pos or the position of the '.'
int exponent_marker_pos; // either end_pos or the position of the 'e'/'p'
// for id == TokenIdStringLiteral
int raw_string_start;
int raw_string_end;
};
struct Tokenization {

View File

@ -1770,6 +1770,12 @@ fn f() {
const std = @import("std");
}
)SOURCE", 1, ".tmp_source.zig:3:17: error: @import invalid inside function bodies");
add_compile_fail_case("normal string with newline", R"SOURCE(
const foo = "a
b";
)SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal");
}
//////////////////////////////////////////////////////////////////////////////

View File

@ -495,7 +495,31 @@ fn count_trailing_zeroes() {
}
// Test: raw string literals that span several lines are expected to hold the
// same bytes as an ordinary string literal written with "\n" escapes.
#attribute("test")
fn multiline_string() {
// Raw string delimited by "AOEU"; the ')' after "two" is not followed by the
// delimiter, so the test expects it to be part of the contents.
const s1 = r"AOEU(
one
two)
three)AOEU";
// Reference value spelled with escapes in a normal string literal.
const s2 = "\none\ntwo)\nthree";
// Raw string with an empty delimiter; the literal is closed by ')' directly
// followed by '"'.
const s3 = r"(
one
two)
three)";
assert(str_eql(s1, s2));
assert(str_eql(s3, s2));
}
// Test helper: evaluates `unreachable{}` when `b` is false; does nothing
// when `b` is true.
fn assert(b: bool) {
if (!b) unreachable{}
}
// Returns true when `s1` and `s2` have the same length and the same byte at
// every index; returns false otherwise.
fn str_eql(s1: []u8, s2: []u8) -> bool {
// Different lengths can never be equal; this also keeps `s2[i]` in range below.
if (s1.len != s2.len) return false;
for (s1) |c, i| {
// Stop at the first differing byte.
if (s2[i] != c) return false;
}
return true;
}