From 0bf7ebcfea7934cb972aef84b25494c92f0dcf6f Mon Sep 17 00:00:00 2001
From: Andrew Kelley <superjoe30@gmail.com>
Date: Mon, 30 Apr 2018 00:52:09 -0400
Subject: [PATCH] std.zig.tokenizer: fix handling of line comment / doc comment

---
 std/zig/parser_test.zig |   1 +
 std/zig/tokenizer.zig   | 115 +++++++++++++++++++++++++++++++---------
 2 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig
index b4cbef33b..74a49a70e 100644
--- a/std/zig/parser_test.zig
+++ b/std/zig/parser_test.zig
@@ -124,6 +124,7 @@ test "zig fmt: comments before statements" {
         \\    _ = @import("foo/bar.zig");
         \\
         \\    // middle
+        \\    // middle2
         \\
         \\    // end
         \\}
diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig
index 1d24a88d2..9f5712fa9 100644
--- a/std/zig/tokenizer.zig
+++ b/std/zig/tokenizer.zig
@@ -260,6 +260,7 @@ pub const Tokenizer = struct {
         Slash,
         LineCommentStart,
         LineComment,
+        DocComment,
         Zero,
         IntegerLiteral,
         IntegerLiteralWithRadix,
@@ -825,6 +826,7 @@ pub const Tokenizer = struct {
                 State.Slash => switch (c) {
                     '/' => {
                         state = State.LineCommentStart;
+                        result.id = Token.Id.LineComment;
                     },
                     '=' => {
                         result.id = Token.Id.SlashEqual;
@@ -839,15 +841,15 @@ pub const Tokenizer = struct {
                 State.LineCommentStart => switch (c) {
                     '/' => {
                         result.id = Token.Id.DocComment;
+                        state = State.DocComment;
+                    },
+                    '\n' => break,
+                    else => {
                         state = State.LineComment;
+                        self.checkLiteralCharacter();
                     },
-                    '\n' => {
-                        result.id = Token.Id.LineComment;
-                        break;
-                    },
-                    else => self.checkLiteralCharacter(),
                 },
-                State.LineComment => switch (c) {
+                State.LineComment, State.DocComment => switch (c) {
                     '\n' => break,
                     else => self.checkLiteralCharacter(),
                 },
@@ -934,7 +936,10 @@ pub const Tokenizer = struct {
                 },
                 State.LineCommentStart,
                 State.LineComment => {
-                    result.id = Token.Id.Eof;
+                    result.id = Token.Id.LineComment;
+                },
+                State.DocComment => {
+                    result.id = Token.Id.DocComment;
                 },
 
                 State.NumberDot,
@@ -1105,41 +1110,77 @@ test "tokenizer - invalid literal/comment characters" {
         Token.Id.Invalid,
     });
     testTokenize("//\x00", []Token.Id {
+        Token.Id.LineComment,
         Token.Id.Invalid,
     });
     testTokenize("//\x1f", []Token.Id {
+        Token.Id.LineComment,
         Token.Id.Invalid,
     });
     testTokenize("//\x7f", []Token.Id {
+        Token.Id.LineComment,
         Token.Id.Invalid,
     });
 }
 
 test "tokenizer - utf8" {
-    testTokenize("//\xc2\x80", []Token.Id{});
-    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
+    testTokenize("//\xc2\x80", []Token.Id{Token.Id.LineComment});
+    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{Token.Id.LineComment});
 }
 
 test "tokenizer - invalid utf8" {
-    testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\x80", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xbf", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xf8", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xff", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xc2\xc0", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xe0", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xf0", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
 }
 
 test "tokenizer - illegal unicode codepoints" {
     // unicode newline characters.U+0085, U+2028, U+2029
-    testTokenize("//\xc2\x84", []Token.Id{});
-    testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xc2\x86", []Token.Id{});
-    testTokenize("//\xe2\x80\xa7", []Token.Id{});
-    testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
-    testTokenize("//\xe2\x80\xaa", []Token.Id{});
+    testTokenize("//\xc2\x84", []Token.Id{Token.Id.LineComment});
+    testTokenize("//\xc2\x85", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xc2\x86", []Token.Id{Token.Id.LineComment});
+    testTokenize("//\xe2\x80\xa7", []Token.Id{Token.Id.LineComment});
+    testTokenize("//\xe2\x80\xa8", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xe2\x80\xa9", []Token.Id{
+        Token.Id.LineComment,
+        Token.Id.Invalid,
+    });
+    testTokenize("//\xe2\x80\xaa", []Token.Id{Token.Id.LineComment});
 }
 
 test "tokenizer - string identifier and builtin fns" {
@@ -1166,11 +1207,35 @@ test "tokenizer - pipe and then invalid" {
     });
 }
 
+test "tokenizer - line comment and doc comment" {
+    testTokenize("//", []Token.Id{Token.Id.LineComment});
+    testTokenize("// a / b", []Token.Id{Token.Id.LineComment});
+    testTokenize("// /", []Token.Id{Token.Id.LineComment});
+    testTokenize("/// a", []Token.Id{Token.Id.DocComment});
+    testTokenize("///", []Token.Id{Token.Id.DocComment});
+}
+
+test "tokenizer - line comment followed by identifier" {
+    testTokenize(
+        \\    Unexpected,
+        \\    // another
+        \\    Another,
+    , []Token.Id{
+        Token.Id.Identifier,
+        Token.Id.Comma,
+        Token.Id.LineComment,
+        Token.Id.Identifier,
+        Token.Id.Comma,
+    });
+}
+
 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
     var tokenizer = Tokenizer.init(source);
     for (expected_tokens) |expected_token_id| {
         const token = tokenizer.next();
-        std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
+        if (@TagType(Token.Id)(token.id) != @TagType(Token.Id)(expected_token_id)) {
+            std.debug.panic("expected {}, found {}\n", @tagName(@TagType(Token.Id)(expected_token_id)), @tagName(@TagType(Token.Id)(token.id)));
+        }
         switch (expected_token_id) {
             Token.Id.StringLiteral => |expected_kind| {
                 std.debug.assert(expected_kind == switch (token.id) { Token.Id.StringLiteral => |kind| kind, else => unreachable });