Warn if a file contains UTF-8 and Latin-1

Add a new warning non-ascii-utf8 displayed only if the non-ascii
attribute is specified and UTF-8 characters were ignored in the
copyright or authors lines in the header.
master
David Allsopp 2017-10-11 11:23:15 +01:00
parent 49723e5fd4
commit bfff8f9251
1 changed files with 19 additions and 2 deletions

View File

@ -162,10 +162,13 @@ IGNORE_DIRS="
(cat "$f" | tr -d '\r'; echo) \
| awk -v rules="$rules" -v svnrules="$svnrules" -v file="$f" \
'
# is_err(name): tell whether the rule called name is still active.
# The rule lists come from the rules and svnrules variables given to awk
# with -v.  A name counts as listed when it appears in their concatenation
# delimited on both sides by a comma or a space.
# Returns 0 when name is listed, 1 otherwise.
function is_err(name) {
    if (("," rules svnrules ",") ~ ("[, ]" name "[, ]")) {
        return 0;
    }
    return 1;
}
function err(name, msg) {
++ counts[name];
if (("," rules svnrules ",") !~ ("[, ]" name "[, ]") \
&& counts[name] <= 10){
if (is_err(name) && counts[name] <= 10){
printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
printf (" [%s] %s\n", name, msg);
got_errors = 1;
@ -207,6 +210,10 @@ IGNORE_DIRS="
# Bytes with the high bit set anywhere except the authors and copyright
# header lines.  match() is evaluated first so that RSTART and RLENGTH
# describe the offending byte when err() prints the position.
match($0, /[\200-\377]/) \
&& !(state == "authors" || state == "copyright") {
    err("non-ascii", "non-ASCII character(s)");
    # The header already contained UTF-8 sequences: if the non-ascii rule
    # is disabled for this file, report the mix of encodings separately.
    if (header_utf8) {
        if (!is_err("non-ascii")) {
            err("non-ascii-utf8", \
                "non-ASCII character(s) AND UTF-8 encountered");
        }
    }
}
match($0, /[^\t\200-\377 -~]/) {
@ -237,6 +244,16 @@ IGNORE_DIRS="
err("very-long-line", "line is over 132 columns");
}
# Record that the header contained UTF-8 sequences.
# The pattern looks for a byte in \300-\367 followed by one or more bytes
# in \200-\277, and only applies on authors and copyright lines.
# If non-ASCII bytes were already counted earlier in the file and the
# non-ascii rule is disabled for it, report the mix of encodings.  The
# test on is_err is negated so that it agrees with the non-ascii rule,
# which raises non-ascii-utf8 only when is_err("non-ascii") is false.
match($0, /[\300-\367][\200-\277]+/) \
&& (state == "authors" || state == "copyright") {
    header_utf8 = 1;
    if (counts["non-ascii"] > 0 && !is_err("non-ascii")) {
        err("non-ascii-utf8", \
            "non-ASCII character(s) AND UTF-8 encountered");
    }
}
# Header-recognition automaton. Read this from bottom to top.
# Valid UTF-8 chars are recognised in copyright and authors
# TODO: ensure all files are valid UTF-8 before awking them.