Add utf8 rule to tools/check-typo

The utf8 rule allows UTF-8 sequences anywhere in a file and line-length
calculations take this into account.
master
David Allsopp 2018-06-07 13:32:09 +01:00
parent e1270c41be
commit 7a7c156d3f
2 changed files with 37 additions and 11 deletions

6
.gitattributes vendored
View File

@ -44,7 +44,7 @@ README* ocaml-typo=missing-header
/.mailmap ocaml-typo=long-line,missing-header,non-ascii
/.merlin ocaml-typo=missing-header
/Changes ocaml-typo=non-ascii,missing-header
/Changes ocaml-typo=utf8,missing-header
/INSTALL ocaml-typo=missing-header
/LICENSE ocaml-typo=long-line,very-long-line,missing-header
# tools/ci/appveyor/appveyor_build.cmd only has missing-header because
@ -87,8 +87,8 @@ testsuite/tests/** ocaml-typo=missing-header
testsuite/tests/lib-bigarray-2/bigarrf.f ocaml-typo=missing-header,tab
testsuite/tests/lib-unix/win-stat/fakeclock.c ocaml-typo=
testsuite/tests/misc-unsafe/almabench.ml ocaml-typo=missing-header,long-line
testsuite/tests/tool-toplevel/strings.ml ocaml-typo=non-ascii,missing-header
testsuite/tests/win-unicode/*.ml ocaml-typo=non-ascii,missing-header
testsuite/tests/tool-toplevel/strings.ml ocaml-typo=utf8,missing-header
testsuite/tests/win-unicode/*.ml ocaml-typo=utf8,missing-header
testsuite/typing ocaml-typo=missing-header
tools/magic ocaml-typo=missing-header

View File

@ -81,6 +81,10 @@
# You can ignore a rule by giving the option -<rule> on the command
# line (before any file names).
# Files which include the utf8 rule will have line-length computations take
# UTF-8 sequences into account. As a special case, UTF-8 sequences are always
# allowed in the copyright headers.
# First prevent i18n from messing up everything.
export LC_ALL=C
@ -191,16 +195,31 @@ IGNORE_DIRS="
return c > limit;
}
# utf8_decode(str)
#   Return a copy of str in which every multi-byte sequence (one byte in
#   \300-\367 followed by one or more bytes in \200-\277) is replaced by a
#   single "?", so that callers measuring the result count each sequence as
#   one character.
#
#   When is_err("utf8") is true, str is returned unchanged and nothing is
#   counted. Otherwise counts["utf8"] is incremented once per call in which
#   at least one sequence was replaced.
#
#   The second parameter t is NOT an argument: it is the usual awk idiom for
#   declaring a function-local variable, so the scratch copy no longer
#   overwrites a global t belonging to the calling rule. Callers keep
#   passing exactly one argument.
function utf8_decode(str,    t) {
  if (is_err("utf8")) {
    return str;
  }
  # This script assumes that the UTF-8 has been externally validated
  t = str;
  gsub(/[\300-\367][\200-\277]+/, "?", t);
  # A replacement always shortens the string, so inequality means that at
  # least one sequence was found.
  if (t != str) {
    ++ counts["utf8"];
  }
  return t;
}
BEGIN { state = "(first line)"; }
match($0, /\t/) {
err("tab", "TAB character(s)");
if (more_columns($0, 80)){
t = utf8_decode($0);
if (more_columns(t, 80)){
RSTART=81;
RLENGTH = 0;
err("long-line", "line is over 80 columns");
}
if (more_columns($0, 132)){
if (more_columns(t, 132)){
RSTART=133;
RLENGTH = 0;
err("very-long-line", "line is over 132 columns");
@ -209,10 +228,14 @@ IGNORE_DIRS="
match($0, /[\200-\377]/) \
&& state != "authors" && state != "copyright" {
err("non-ascii", "non-ASCII character(s)");
if (header_utf8 && !is_err("non-ascii")) {
err("non-ascii-utf8", \
"non-ASCII character(s) AND UTF-8 encountered");
if (is_err("utf8")) {
err("non-ascii", "non-ASCII character(s)");
if (header_utf8 && !is_err("non-ascii")) {
err("non-ascii-utf8", \
"non-ASCII character(s) AND UTF-8 encountered");
}
} else {
++ counts["utf8"];
}
}
@ -229,7 +252,7 @@ IGNORE_DIRS="
}
$0 !~ /\t/ && length($0) > 80 {
t = $0;
t = utf8_decode($0);
sub(/https?:[A-Za-z0-9._~:/?#\[\]@!$&\047()*+,;=%-]{73,}$/, "", t);
if (length(t) > 80) {
RSTART = 81;
@ -241,7 +264,10 @@ IGNORE_DIRS="
$0 !~ /\t/ && length($0) > 132 {
RSTART = 133;
RLENGTH = 0;
err("very-long-line", "line is over 132 columns");
t = utf8_decode($0);
if (length(t) > 132) {
err("very-long-line", "line is over 132 columns");
}
}
# Record that the header contained UTF-8 sequences