Add utf8 rule to tools/check-typo

The utf8 rule allows UTF-8 sequences anywhere in a file, and line-length calculations count each multi-byte UTF-8 sequence as a single column.
2018-06-07 13:32:09 +01:00 · 2018-06-07 13:32:09 +01:00 · 7a7c156d3f
parent e1270c41be
commit 7a7c156d3f
2 changed files with 37 additions and 11 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -44,7 +44,7 @@ README*                  ocaml-typo=missing-header

 /.mailmap                ocaml-typo=long-line,missing-header,non-ascii
 /.merlin                 ocaml-typo=missing-header
-/Changes                 ocaml-typo=non-ascii,missing-header
+/Changes                 ocaml-typo=utf8,missing-header
 /INSTALL                 ocaml-typo=missing-header
 /LICENSE                 ocaml-typo=long-line,very-long-line,missing-header
 # tools/ci/appveyor/appveyor_build.cmd only has missing-header because
@ -87,8 +87,8 @@ testsuite/tests/**                            ocaml-typo=missing-header
 testsuite/tests/lib-bigarray-2/bigarrf.f      ocaml-typo=missing-header,tab
 testsuite/tests/lib-unix/win-stat/fakeclock.c ocaml-typo=
 testsuite/tests/misc-unsafe/almabench.ml      ocaml-typo=missing-header,long-line
-testsuite/tests/tool-toplevel/strings.ml      ocaml-typo=non-ascii,missing-header
-testsuite/tests/win-unicode/*.ml              ocaml-typo=non-ascii,missing-header
+testsuite/tests/tool-toplevel/strings.ml      ocaml-typo=utf8,missing-header
+testsuite/tests/win-unicode/*.ml              ocaml-typo=utf8,missing-header
 testsuite/typing                              ocaml-typo=missing-header

 tools/magic         ocaml-typo=missing-header
--- a/tools/check-typo
+++ b/tools/check-typo
@ -81,6 +81,10 @@
 # You can ignore a rule by giving the option -<rule> on the command
 # line (before any file names).

+# For files whose ocaml-typo attribute includes utf8, line-length computations
+# count each UTF-8 sequence as a single column. As a special case, UTF-8
+# sequences are always allowed in the copyright headers.
+
 # First prevent i18n from messing up everything.
 export LC_ALL=C

@ -191,16 +195,31 @@ IGNORE_DIRS="
          return c > limit;
        }

+        function utf8_decode(str) {
+          if (is_err("utf8")) {
+            return str;
+          } else {
+            # This script assumes that the UTF-8 has been externally validated
+            t = str;
+            gsub(/[\300-\367][\200-\277]+/, "?", t);
+            if (t != str) {
+              ++ counts["utf8"];
+            }
+            return t;
+          }
+        }
+
        BEGIN { state = "(first line)"; }

        match($0, /\t/) {
          err("tab", "TAB character(s)");
-          if (more_columns($0, 80)){
+          t = utf8_decode($0);
+          if (more_columns(t, 80)){
            RSTART=81;
            RLENGTH = 0;
            err("long-line", "line is over 80 columns");
          }
-          if (more_columns($0, 132)){
+          if (more_columns(t, 132)){
            RSTART=133;
            RLENGTH = 0;
            err("very-long-line", "line is over 132 columns");
@ -209,10 +228,14 @@ IGNORE_DIRS="

        match($0, /[\200-\377]/) \
        && state != "authors" && state != "copyright" {
-          err("non-ascii", "non-ASCII character(s)");
-          if (header_utf8 && !is_err("non-ascii")) {
-            err("non-ascii-utf8", \
-                "non-ASCII character(s) AND UTF-8 encountered");
+          if (is_err("utf8")) {
+            err("non-ascii", "non-ASCII character(s)");
+            if (header_utf8 && !is_err("non-ascii")) {
+              err("non-ascii-utf8", \
+                  "non-ASCII character(s) AND UTF-8 encountered");
+            }
+          } else {
+            ++ counts["utf8"];
          }
        }

@ -229,7 +252,7 @@ IGNORE_DIRS="
        }

        $0 !~ /\t/ && length($0) > 80 {
-          t = $0;
+          t = utf8_decode($0);
          sub(/https?:[A-Za-z0-9._~:/?#\[\]@!$&\047()*+,;=%-]{73,}$/, "", t);
          if (length(t) > 80) {
            RSTART = 81;
@ -241,7 +264,10 @@ IGNORE_DIRS="
        $0 !~ /\t/ && length($0) > 132 {
          RSTART = 133;
          RLENGTH = 0;
-          err("very-long-line", "line is over 132 columns");
+          t = utf8_decode($0);
+          if (length(t) > 132) {
+            err("very-long-line", "line is over 132 columns");
+          }
        }

        # Record that the header contained UTF-8 sequences