457 lines
16 KiB
Bash
Executable File
457 lines
16 KiB
Bash
Executable File
#!/bin/sh
|
|
|
|
#**************************************************************************
|
|
#* *
|
|
#* OCaml *
|
|
#* *
|
|
#* Damien Doligez, projet Gallium, INRIA Rocquencourt *
|
|
#* *
|
|
#* Copyright 2012 Institut National de Recherche en Informatique et *
|
|
#* en Automatique. *
|
|
#* *
|
|
#* All rights reserved. This file is distributed under the terms of *
|
|
#* the GNU Lesser General Public License version 2.1, with the *
|
|
#* special exception on linking described in the file LICENSE. *
|
|
#* *
|
|
#**************************************************************************
|
|
|
|
# check-typo - Check typographic conventions on OCaml sources.
|
|
|
|
# This program will check files for the following rules:
|
|
|
|
# - absence of TAB characters (tab)
|
|
# - absence of non-ASCII characters (non-ascii)
|
|
# - absence of non-printing ASCII characters (non-printing)
|
|
# - absence of white space at end of line (white-at-eol)
|
|
# - absence of empty lines at end of file (white-at-eof)
|
|
# - presence of a LF character at the end of the file (missing-lf)
|
|
# - maximum line length of 80 characters (long-line)
|
|
# - maximum line length of 132 characters (very-long-line)
|
|
# - presence of a copyright header (missing-header)
|
|
# - absence of a leftover "$Id" string (svn-keyword)
|
|
|
|
# Exceptions are handled with git attributes: "typo.*".
|
|
# Its value for a given file is a comma-separated list of rule names,
|
|
# which lists the rules that should be disabled for this file.
|
|
# The rule names are the ones shown above in parentheses.
|
|
|
|
# Built-in exception:
|
|
# - Any file git identifies as binary
|
|
# is automatically exempt from all the rules.
|
|
|
|
# ASCII characters are bytes from 0 to 127. Any other byte is
|
|
# flagged as a non-ASCII character.
|
|
|
|
# For the purpose of this tool, printing ASCII characters are:
|
|
# - the non-white printable ASCII characters (33 to 126)
|
|
# - TAB (09)
|
|
# - LF (10)
|
|
# - SPC (32)
|
|
# Anything else is flagged as a non-printing ASCII character.
|
|
|
|
# This program will recursively explore the files and directories given
|
|
# on the command line (or by default the current directory), and check
|
|
# every file therein for compliance to the rules.
|
|
|
|
# Directories named .git (and their contents) are always ignored.
|
|
# This program ignores any file that is not under git control, unless
|
|
# explicitly given on the command line.
|
|
|
|
# If a directory has the git attribute "typo.prune" then it and its contents are
|
|
# ignored.
|
|
|
|
# You can ignore a rule by giving the option -<rule> on the command
|
|
# line (before any file names).
|
|
|
|
# Files which include the utf8 rule will be validated using grep and line-length
|
|
# computations will take UTF-8 sequences into account. As a special case, UTF-8
|
|
# sequences are always allowed in the copyright headers.
|
|
|
|
# First prevent i18n from messing up everything.
|
|
export LC_ALL=C
|
|
|
|
OCAML_CT_CAT=${OCAML_CT_CAT:-cat}
|
|
OCAML_CT_LS_FILES=${OCAML_CT_LS_FILES:-git ls-files}
|
|
OCAML_CT_HEAD=${OCAML_CT_HEAD:-HEAD}
|
|
OCAML_CT_AWK=${OCAML_CT_AWK:-awk}
|
|
if [ -z "${OCAML_CT_GIT_INDEX+x}" ] ; then
|
|
OCAML_CT_GIT_INDEX=
|
|
else
|
|
OCAML_CT_GIT_INDEX="GIT_INDEX_FILE=$OCAML_CT_GIT_INDEX"
|
|
fi
|
|
|
|
# The output of processing the attributes should be whitespace-separated with
|
|
# - the "typo." prefix dropped
|
|
# - unset/false keys not present
|
|
# - set/true keys present
|
|
# - "may" keys present, suffixed by a question mark
|
|
#
|
|
# for example,
|
|
# typo.long-line: set
|
|
# typo.missing-header: may
|
|
# typo.very-long-line: false
|
|
# should result in "long-line missing-header?"
|
|
get_attrs() {
|
|
env $OCAML_CT_GIT_INDEX git check-attr --all $OCAML_CT_CA_FLAG "$1" \
|
|
| grep -o " typo\\..*$" | sed "s/ typo\\.//g" \
|
|
| grep -v ": unset" | grep -v ": false" \
|
|
| sed "s/: set//g" | sed "s/: true//g" | sed "s/: may/?/g"
|
|
}
|
|
|
|
# empty if the path is *not* pruned
|
|
check_prune() {
|
|
env $OCAML_CT_GIT_INDEX git check-attr typo.prune $OCAML_CT_CA_FLAG "$1" \
|
|
| grep -v ': unspecified$' | grep -v ': false$'
|
|
}
|
|
|
|
# Special case for recursive call from the find command (see IGNORE_DIRS).
|
|
case "$1" in
|
|
--check-prune)
|
|
case $2 in
|
|
.git|.git/*)
|
|
echo "INFO: pruned path $2 (.git)" >&2
|
|
exit 0;;
|
|
esac
|
|
if test -n "$(check_prune "$2")"; then
|
|
echo "INFO: pruned path $2 (typo.prune)" >&2
|
|
exit 0
|
|
fi
|
|
exit 3;;
|
|
esac
|
|
|
|
case "$1" in
|
|
--get-attrs)
|
|
get_attrs "$2"
|
|
exit 0;;
|
|
esac
|
|
|
|
usage () {
|
|
echo "usage: check-typo {-<rule>} [--] {<file-or-dir>}" >&2
|
|
exit 2
|
|
}
|
|
|
|
check_script () {
|
|
if [ "$($OCAML_CT_CAT "$OCAML_CT_PREFIX$1" \
|
|
| sed -ne '1s/^#!.*/#!/p')" != '#!' ] ; then
|
|
# These files are listed manually, rather than via gitattributes,
|
|
# because the list should never expand, and it should not be trivial to
|
|
# expand (the unix-execvpe test is an ultra-special-case!)
|
|
f=${1#./}
|
|
if [ "$f" != "boot/ocamlc" ] && [ "$f" != "boot/ocamllex" ] && \
|
|
[ "$f" != "testsuite/tests/lib-unix/unix-execvpe/subdir/script2" ] ; then
|
|
echo "$1 shouldn't be executable; either:"
|
|
echo " - Add a #! line"
|
|
echo " - Run chmod -x $1 (on Unix)"
|
|
echo " - Run git update-index --chmod=-x $1 (on Windows)"
|
|
echo "You may wish to check your core.fileMode setting"
|
|
EXIT_CODE=1
|
|
fi
|
|
fi
|
|
}
|
|
|
|
userrules=''
|
|
|
|
while : ; do
|
|
case "$1" in
|
|
-help|--help) usage;;
|
|
-*) userrules="${1#-} $userrules"; shift;;
|
|
--) shift; break;;
|
|
*) break;;
|
|
esac
|
|
done
|
|
|
|
IGNORE_DIRS="
|
|
-name .git -prune -o
|
|
-type d -exec $0 --check-prune {} ; -prune -o
|
|
"
|
|
# `-type d`: simple files (not directories) are not pruned during the
|
|
# "find" invocation but below (look for "check_prune") for performance
|
|
# reasons: most files outside pruned directories are not pruned, so it
|
|
# is faster to optimistically run check-typo on them (and maybe get
|
|
# out in the middle) than to first check then run.
|
|
|
|
TEST_AWK='BEGIN {if ("a{1}" ~ /a{1}$/) exit 1}'
|
|
if ! $OCAML_CT_AWK "$TEST_AWK" ; then
|
|
if $OCAML_CT_AWK --re-interval "$TEST_AWK" 2>/dev/null ; then
|
|
OCAML_CT_AWK="$OCAML_CT_AWK --re-interval"
|
|
else
|
|
echo "This script requires interval support in regexes ({m} notation)">&2
|
|
echo "Please install a version of awk (e.g. gawk) which supports this">&2
|
|
exit 2
|
|
fi
|
|
fi
|
|
|
|
EXIT_CODE=0
|
|
( case $# in
|
|
0) find . $IGNORE_DIRS -type f -print;;
|
|
*) for i in "$@"; do find "$i" $IGNORE_DIRS -type f -print; done;;
|
|
esac
|
|
) | (
|
|
while read f; do
|
|
if test -n "$(check_prune "$f")"; then continue; fi
|
|
if $(git check-ignore -q "$f"); then continue; fi
|
|
case `$OCAML_CT_LS_FILES "$f" 2>&1` in
|
|
"") path_in_index=false;;
|
|
*) path_in_index=true;;
|
|
esac
|
|
case "$*" in
|
|
*$f*) is_cmd_line=true;;
|
|
*) is_cmd_line=false;;
|
|
esac
|
|
if [ -z "$OCAML_CT_PREFIX" ] ; then
|
|
if [ -x "$f" ] ; then
|
|
check_script "$f"
|
|
fi
|
|
else
|
|
if git ls-files -s "$f" | grep -q "^100755" ; then
|
|
check_script "$f"
|
|
fi
|
|
fi
|
|
if $path_in_index || $is_cmd_line; then :; else continue; fi
|
|
attr_rules=''
|
|
if $path_in_index; then
|
|
# Below is a git plumbing command to detect whether git regards a
|
|
# particular file as binary. This takes into account .gitattributes, but
|
|
# also works if the file has been automatically detected as binary by git.
|
|
# EMPTY is the hash of the empty tree (which is specially known to git -
|
|
# it is automatically included in every repository) as a way to get
|
|
# `diff-tree` to print the whole tree state; its `--numstat` output then
|
|
# prints a summary where two dashes in the first two columns indicates a
|
|
# binary file.
|
|
# (See https://git-scm.com/docs/git-diff-tree#_other_diff_formats and
|
|
# the documentation for the --numstat option. Commands designated as
|
|
# "plumbing" commands in git have stable output intended for parsing)
|
|
EMPTY=`git hash-object -t tree /dev/null`
|
|
git diff-tree --numstat $EMPTY $OCAML_CT_HEAD -- "$f" \
|
|
| grep -q "^-[[:blank:]]-" && continue
|
|
attr_rules=$(get_attrs "$f")
|
|
fi
|
|
rules="$userrules"
|
|
|
|
# remove newlines, ensure spaces at boundary
|
|
rules=" $(echo $rules) "
|
|
attr_rules=" $(echo $attr_rules) "
|
|
|
|
if test -n "$(echo "$rules $attr_rules" | grep " utf8 ")"
|
|
then
|
|
# grep -a is used to force the file to be considered as text and -x
|
|
# requires the entire line to match. This specifically detects the
|
|
# presence of lines containing malformed UTF-8. It may be tested using
|
|
# https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
|
if $OCAML_CT_CAT "$OCAML_CT_PREFIX$f" \
|
|
| LC_ALL=en_US.UTF8 grep -qaxv '.*' ; then
|
|
echo "File \"$f\" is not correctly encoded in UTF-8"
|
|
exit 2
|
|
fi
|
|
fi
|
|
if ! \
|
|
($OCAML_CT_CAT "$OCAML_CT_PREFIX$f" | tr -d '\r'; echo) \
|
|
| $OCAML_CT_AWK -v rules="$rules" -v attr_rules="$attr_rules" -v file="$f" \
|
|
'
|
|
function is_err(name) {
|
|
return ((rules attr_rules) !~ (" " name "[\\? ]"));
|
|
}
|
|
|
|
function report_err(name, msg) {
|
|
printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
|
|
printf (" [%s] %s\n", name, msg);
|
|
got_errors = 1;
|
|
}
|
|
|
|
function err(name, msg) {
|
|
++ counts[name];
|
|
if (is_err(name) && counts[name] <= 10) {
|
|
report_err(name, msg);
|
|
if (counts[name] == 10){
|
|
printf ("WARNING: too many [%s] in this file.", name);
|
|
printf (" Others will not be reported.\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
function err_if(guard, name, msg) {
|
|
if (is_err(guard)) {
|
|
err(name, msg);
|
|
} else {
|
|
++ counts[name];
|
|
}
|
|
}
|
|
|
|
function more_columns(str, limit, c){
|
|
c = 0;
|
|
for (i = 1; i <= length(str); i++){
|
|
if (substr(str, i, 1) == "\t"){
|
|
c = int((c + 8) / 8) * 8;
|
|
}else{
|
|
++ c;
|
|
}
|
|
}
|
|
return c > limit;
|
|
}
|
|
|
|
function utf8_decode(str) {
|
|
if (is_err("utf8")) {
|
|
return str;
|
|
} else {
|
|
# This script assumes that the UTF-8 has been externally validated
|
|
t = str;
|
|
gsub(/[\300-\367][\200-\277]+/, "?", t);
|
|
if (t != str) {
|
|
++ counts["utf8"];
|
|
}
|
|
return t;
|
|
}
|
|
}
|
|
|
|
BEGIN { state = "(first line)"; }
|
|
|
|
match($0, /\t/) {
|
|
err("tab", "TAB character(s)");
|
|
t = utf8_decode($0);
|
|
if (more_columns(t, 80)){
|
|
RSTART=81;
|
|
RLENGTH = 0;
|
|
err_if("very-long-line", "long-line", "line is over 80 columns");
|
|
}
|
|
if (more_columns(t, 132)){
|
|
RSTART=133;
|
|
RLENGTH = 0;
|
|
err("very-long-line", "line is over 132 columns");
|
|
}
|
|
}
|
|
|
|
match($0, /[\200-\377]/) \
|
|
&& state != "authors" && state != "copyright" {
|
|
if (is_err("utf8")) {
|
|
err("non-ascii", "non-ASCII character(s)");
|
|
if (header_utf8 && !is_err("non-ascii")) {
|
|
err("non-ascii-utf8", \
|
|
"non-ASCII character(s) AND UTF-8 encountered");
|
|
}
|
|
} else {
|
|
++ counts["utf8"];
|
|
}
|
|
}
|
|
|
|
match($0, /[^\t\200-\377 -~]/) {
|
|
err("non-printing", "non-printing ASCII character(s)");
|
|
}
|
|
|
|
match($0, /[ \t]+$/) {
|
|
err("white-at-eol", "whitespace at end of line");
|
|
}
|
|
|
|
match($0, /\$Id(: .*)?\$/) {
|
|
err("svn-keyword", "SVN keyword marker");
|
|
}
|
|
|
|
$0 !~ /\t/ && length($0) > 80 {
|
|
t = utf8_decode($0);
|
|
sub(/https?:[A-Za-z0-9._~:\/?#\[\]@!$&\047()*+,;=%-]{73,}$/, "", t);
|
|
if (length(t) > 80) {
|
|
RSTART = 81;
|
|
RLENGTH = 0;
|
|
err_if("very-long-line", "long-line", "line is over 80 columns");
|
|
}
|
|
}
|
|
|
|
$0 !~ /\t/ && length($0) > 132 {
|
|
RSTART = 133;
|
|
RLENGTH = 0;
|
|
t = utf8_decode($0);
|
|
if (length(t) > 132) {
|
|
err("very-long-line", "line is over 132 columns");
|
|
}
|
|
}
|
|
|
|
# Record that the header contained UTF-8 sequences
|
|
match($0, /[\300-\367][\200-\277]+/) \
|
|
&& (state == "authors" || state == "copyright") {
|
|
header_utf8 = 1;
|
|
if (counts["non-ascii"] > 0 && is_err("non-ascii")) {
|
|
err("non-ascii-utf8", \
|
|
"non-ASCII character(s) AND UTF-8 encountered");
|
|
}
|
|
}
|
|
|
|
# Header-recognition automaton. Read this from bottom to top.
|
|
# Valid UTF-8 chars are recognised in copyright and authors
|
|
# TODO: ensure all files are valid UTF-8 before awking them.
|
|
# Note that this code also assumes that combining characters are NOT
|
|
# used (i.e. that every Unicode code-point corresponds to exactly
|
|
# one displayed character, i.e. no Camels and no including
|
|
# weird-and-wonderful ways of encoded accented letters).
|
|
|
|
state == "close" && $0 ~ /\*{74}/ { state = "OK"; }
|
|
state == "close" { state = "(last line)"; }
|
|
state == "blurb" && $0 ~ /\* {72}\*/ { state = "close"; }
|
|
state == "blurb" && $0 ~ /\/LICENSE/ { state = "(license path)" }
|
|
state == "blurb1" && $0 ~ /\* All rights reserved. .{47} \*/ \
|
|
{ state = "blurb"; }
|
|
state == "blurb1" { state = "(blurb line 1)"; }
|
|
state == "copyright" && $0 ~ /\* {72}\*/ { state = "blurb1"; }
|
|
state == "copyright" \
|
|
&& $0 !~ /\* Copyright [0-9]{4}([\300-\367][\200-\277]+|.){54} \*/ \
|
|
&& $0 !~ /\* ([\300-\367][\200-\277]+|.){66} \*/ \
|
|
{ state = "(copyright lines)"; }
|
|
state == "authors" && $0 ~ /\* {72}\*/ { state = "copyright"; }
|
|
state == "authors" \
|
|
&& $0 !~ /\* ([\300-\367][\200-\277]+|.){70} \*/ \
|
|
{ state = "(authors)"; }
|
|
state == "blank2" && $0 ~ /\* {72}\*/ { state = "authors"; }
|
|
state == "blank2" { state = "(blank line 2)"; }
|
|
state == "title" && $0 ~ /\* {33}OCaml {34}\*/ { state = "blank2"; }
|
|
state == "title" { state = "(title line)"; }
|
|
state == "blank1" && $0 ~ /\* {72}\*/ { state = "title"; }
|
|
state == "blank1" { state = "(blank line 1)"; }
|
|
state == "(first line)" && NR < 4 && $0 ~ /\*{74}/ { state = "blank1"; }
|
|
|
|
{
|
|
prev_line = last_line;
|
|
last_line = $0;
|
|
}
|
|
|
|
END {
|
|
if (match(last_line, /.+/)){
|
|
err("missing-lf", "missing linefeed at EOF");
|
|
prev_line = last_line;
|
|
++ NR;
|
|
empty_file = 0;
|
|
}else{
|
|
empty_file = NR == 1;
|
|
}
|
|
if (!empty_file && match(prev_line, /^$/)){
|
|
err("white-at-eof", "empty line(s) at EOF");
|
|
}
|
|
if (state != "OK"){
|
|
if (NR >= 10){
|
|
NR = 1;
|
|
RSTART = 1;
|
|
RLENGTH = 0;
|
|
err("missing-header", sprintf("bad copyright header %s", state));
|
|
}else{
|
|
counts["missing-header"] = 1;
|
|
}
|
|
}
|
|
split(attr_rules, r, "[? ]");
|
|
for (i in r){
|
|
name = r[i];
|
|
if (name != "" && !counts[name]){
|
|
NR = 1;
|
|
RSTART = 1;
|
|
RLENGTH = 0;
|
|
if (attr_rules !~ (" " name "\\? ")) {
|
|
report_err(name,
|
|
sprintf("attribute is unused", name));
|
|
}
|
|
}
|
|
}
|
|
exit got_errors;
|
|
}
|
|
' ; then
|
|
EXIT_CODE=1
|
|
fi
|
|
done
|
|
exit $EXIT_CODE
|
|
)
|