#!/bin/sh
#**************************************************************************
#*                                                                        *
#*                                 OCaml                                  *
#*                                                                        *
#*           Damien Doligez, projet Gallium, INRIA Rocquencourt           *
#*                                                                        *
#*   Copyright 2012 Institut National de Recherche en Informatique et     *
#*     en Automatique.                                                    *
#*                                                                        *
#*   All rights reserved.  This file is distributed under the terms of    *
#*   the GNU Lesser General Public License version 2.1, with the          *
#*   special exception on linking described in the file LICENSE.          *
#*                                                                        *
#**************************************************************************

# check-typo - Check typographic conventions on OCaml sources.

# This program will check files for the following rules:

# - absence of TAB characters (tab)
# - absence of non-ASCII characters (non-ascii)
# - absence of non-printing ASCII characters (non-printing)
# - absence of white space at end of line (white-at-eol)
# - absence of empty lines at end of file (white-at-eof)
# - presence of a LF character at the end of the file (missing-lf)
# - maximum line length of 80 characters (long-line)
# - maximum line length of 132 characters (very-long-line)
# - presence of a copyright header (missing-header)
# - absence of a leftover "$Id" string (svn-keyword)

# Exceptions are handled with git attributes: "typo.*".
# Its value for a given file is a comma-separated list of rule names,
# which lists the rules that should be disabled for this file.
# The rule names are the ones shown above in parentheses.

# Built-in exception:
# - Any file git identifies as binary
#   is automatically exempt from all the rules.

# ASCII characters are bytes from 0 to 127.  Any other byte is
# flagged as a non-ASCII character.

# For the purpose of this tool, printing ASCII characters are:
# - the non-white printable ASCII characters (33 to 126)
# - TAB (09)
# - LF (10)
# - SPC (32)
# Anything else is flagged as a non-printing ASCII character.
# This program will recursively explore the files and directories given
# on the command line (or by default the current directory), and check
# every file therein for compliance to the rules.

# Directories named .git (and their contents) are always ignored.
# This program ignores any file that is not under git control, unless
# explicitly given on the command line.

# If a directory has the git attribute "typo.prune" then it and its
# contents are ignored.

# You can ignore a rule by giving the option -<rule> on the command
# line (before any file names).

# Files which include the utf8 rule will be validated using grep and
# line-length computations will take UTF-8 sequences into account. As a
# special case, UTF-8 sequences are always allowed in the copyright
# headers.

# First prevent i18n from messing up everything.
export LC_ALL=C

# Overridable tools and revisions; each keeps the caller's value when it
# is already set to a non-empty string.
OCAML_CT_CAT=${OCAML_CT_CAT:-cat}
OCAML_CT_LS_FILES=${OCAML_CT_LS_FILES:-git ls-files}
OCAML_CT_HEAD=${OCAML_CT_HEAD:-HEAD}
OCAML_CT_AWK=${OCAML_CT_AWK:-awk}

# When the caller provides OCAML_CT_GIT_INDEX (even empty), turn it into a
# NAME=VALUE assignment that is handed to env(1) in front of the git
# commands below; otherwise leave it empty so nothing is passed.
if [ -z "${OCAML_CT_GIT_INDEX+x}" ] ; then
  OCAML_CT_GIT_INDEX=
else
  OCAML_CT_GIT_INDEX="GIT_INDEX_FILE=$OCAML_CT_GIT_INDEX"
fi

# get_attrs PATH
#
# Print the typo.* git attributes of PATH, one per line.
#
# The output of processing the attributes should be whitespace-separated
# with
# - the "typo." prefix dropped
# - unset/false keys not present
# - set/true keys present
# - "may" keys present, suffixed by a question mark
#
# for example,
#   typo.long-line: set
#   typo.missing-header: may
#   typo.very-long-line: false
# should result in "long-line missing-header?"
get_attrs() {
  # ${VAR:+"$VAR"} yields no word at all when the variable is empty and
  # exactly one word otherwise, so an index path containing blanks or glob
  # characters reaches env(1) intact.
  # OCAML_CT_CA_FLAG is deliberately unquoted: it is an optional flag.
  env ${OCAML_CT_GIT_INDEX:+"$OCAML_CT_GIT_INDEX"} \
      git check-attr --all $OCAML_CT_CA_FLAG "$1" \
    | grep -o " typo\\..*$" \
    | sed -e "s/ typo\\.//g" \
    | grep -v -e ": unset" -e ": false" \
    | sed -e "s/: set//g" -e "s/: true//g" -e "s/: may/?/g"
}

# check_prune PATH
#
# Print the "typo.prune" attribute line of PATH unless the attribute is
# unspecified or false; the output is empty if the path is *not* pruned.
check_prune() {
  env ${OCAML_CT_GIT_INDEX:+"$OCAML_CT_GIT_INDEX"} \
      git check-attr typo.prune $OCAML_CT_CA_FLAG "$1" \
    | grep -v -e ': unspecified$' -e ': false$'
}

# Special case for recursive call from the find command (see IGNORE_DIRS).
# Special case for the recursive call made by the find command (see
# IGNORE_DIRS): exit status 0 makes the -exec test succeed so that find
# prunes the path; status 3 makes it fail so that the path is kept.
case "$1" in
  --check-prune)
    case $2 in
      .git|.git/*)
        echo "INFO: pruned path $2 (.git)" >&2
        exit 0;;
    esac
    if test -n "$(check_prune "$2")"; then
      echo "INFO: pruned path $2 (typo.prune)" >&2
      exit 0
    fi
    exit 3;;
esac

# Helper mode: print the processed typo.* attributes of one path and stop.
case "$1" in
  --get-attrs)
    get_attrs "$2"
    exit 0;;
esac

# Print the command-line synopsis on stderr and exit with status 2.
usage () {
  echo "usage: check-typo {-<rule>} [--] {<file or dir>}" >&2
  exit 2
}

# check_script PATH
#
# Called for files that carry the executable bit. Reads the file through
# $OCAML_CT_CAT (with $OCAML_CT_PREFIX prepended to PATH) and, unless its
# first line starts with "#!" or the file is one of the hard-wired
# exceptions, prints advice on stdout and sets EXIT_CODE to 1.
check_script () {
  if [ "$($OCAML_CT_CAT "$OCAML_CT_PREFIX$1" \
            | sed -ne '1s/^#!.*/#!/p')" != '#!' ] ; then
    # These files are listed manually, rather than via gitattributes,
    # because the list should never expand, and it should not be trivial to
    # expand (the unix-execvpe test is an ultra-special-case!)
    # A dedicated variable is used here: POSIX sh has no local variables,
    # and the caller's loop variable is named f.
    script_path=${1#./}
    if [ "$script_path" != "boot/ocamlc" ] \
       && [ "$script_path" != "boot/ocamllex" ] \
       && [ "$script_path" != \
            "testsuite/tests/lib-unix/unix-execvpe/subdir/script2" ] ; then
      echo "$1 shouldn't be executable; either:"
      echo " - Add a #! line"
      echo " - Run chmod -x $1 (on Unix)"
      echo " - Run git update-index --chmod=-x $1 (on Windows)"
      echo "You may wish to check your core.fileMode setting"
      EXIT_CODE=1
    fi
  fi
}

# Rules disabled from the command line, space-separated.
userrules=''

# Option parsing. The "--" arm must come before "-*": the latter also
# matches "--" and would otherwise record a rule named "-" and keep
# consuming arguments as options.
while : ; do
  case "$1" in
    -help|--help) usage;;
    --) shift; break;;
    -*) userrules="${1#-} $userrules"; shift;;
    *) break;;
  esac
done

# find(1) expression fragment; expanded unquoted on purpose so that it is
# split into separate arguments.
IGNORE_DIRS="
  -name .git -prune -o
  -type d -exec $0 --check-prune {} ; -prune -o
"
# `-type d`: simple files (not directories) are not pruned during the
# "find" invocation but below (look for "check_prune") for performance
# reasons: most files outside pruned directories are not pruned, so it
# is faster to optimistically run check-typo on them (and maybe get
# out in the middle) than to first check then run.

# Probe awk for interval expressions: without support, /a{1}$/ matches the
# literal text "a{1}" and the probe exits 1.
TEST_AWK='BEGIN {if ("a{1}" ~ /a{1}$/) exit 1}'
if ! $OCAML_CT_AWK "$TEST_AWK" ; then
  if $OCAML_CT_AWK --re-interval "$TEST_AWK" 2>/dev/null ; then
    OCAML_CT_AWK="$OCAML_CT_AWK --re-interval"
  else
    echo "This script requires interval support in regexes ({m} notation)">&2
    echo "Please install a version of awk (e.g. gawk) which supports this">&2
    exit 2
  fi
fi

EXIT_CODE=0
(
  # Producer: one candidate file name per line.
  case $# in
    0) find . $IGNORE_DIRS -type f -print;;
    *) for i in "$@"; do find "$i" $IGNORE_DIRS -type f -print; done;;
  esac
) | (
  # Consumer: check each file; the exit status of this subshell is the exit
  # status of the whole script.
  EMPTY=''
  # IFS= and -r keep leading/trailing blanks and backslashes in file names.
  while IFS= read -r f; do
    if test -n "$(check_prune "$f")"; then continue; fi
    if git check-ignore -q "$f"; then continue; fi
    case "$($OCAML_CT_LS_FILES "$f" 2>&1)" in
      "") path_in_index=false;;
      *) path_in_index=true;;
    esac
    # The file name is quoted inside the pattern so that it is compared
    # literally rather than as a glob.
    case "$*" in
      *"$f"*) is_cmd_line=true;;
      *) is_cmd_line=false;;
    esac
    if [ -z "$OCAML_CT_PREFIX" ] ; then
      if [ -x "$f" ] ; then
        check_script "$f"
      fi
    else
      if git ls-files -s "$f" | grep -q "^100755" ; then
        check_script "$f"
      fi
    fi
    if $path_in_index || $is_cmd_line; then :; else continue; fi
    attr_rules=''
    if $path_in_index; then
      # Below is a git plumbing command to detect whether git regards a
      # particular file as binary. This takes into account .gitattributes,
      # but also works if the file has been automatically detected as binary
      # by git.
      # EMPTY is the hash of the empty tree (which is specially known to
      # git - it is automatically included in every repository) as a way to
      # get `diff-tree` to print the whole tree state; its `--numstat`
      # output then prints a summary where two dashes in the first two
      # columns indicates a binary file.
      # (See https://git-scm.com/docs/git-diff-tree#_other_diff_formats and
      # the documentation for the --numstat option. Commands designated as
      # "plumbing" commands in git have stable output intended for parsing)
      # The hash does not depend on the file, so it is computed only once.
      if [ -z "$EMPTY" ] ; then
        EMPTY=$(git hash-object -t tree /dev/null)
      fi
      git diff-tree --numstat $EMPTY $OCAML_CT_HEAD -- "$f" \
        | grep -q "^-[[:blank:]]-" && continue
      attr_rules=$(get_attrs "$f")
    fi
    rules="$userrules"
    # remove newlines, ensure spaces at boundary
    rules=" $(echo $rules) "
    attr_rules=" $(echo $attr_rules) "

    case "$rules $attr_rules" in
      *" utf8 "*)
        # grep -a is used to force the file to be considered as text and -x
        # requires the entire line to match. This specifically detects the
        # presence of lines containing malformed UTF-8. It may be tested
        # using
        # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
        if $OCAML_CT_CAT "$OCAML_CT_PREFIX$f" \
             | LC_ALL=en_US.UTF8 grep -qaxv '.*' ; then
          echo "File \"$f\" is not correctly encoded in UTF-8"
          exit 2
        fi;;
    esac

    # CR characters are dropped and one extra newline is appended, so the
    # last awk record is empty exactly when the file ends with a linefeed.
    if ! \
      ($OCAML_CT_CAT "$OCAML_CT_PREFIX$f" | tr -d '\r'; echo) \
      | $OCAML_CT_AWK -v rules="$rules" \
                      -v attr_rules="$attr_rules" \
                      -v file="$f" \
      '
        # True when rule NAME is disabled neither on the command line nor
        # by an attribute (plain or with the "?" suffix).
        function is_err(name) {
          return ((rules attr_rules) !~ (" " name "[\\? ]"));
        }

        # Print one diagnostic located at NR and RSTART + RLENGTH.
        function report_err(name, msg) {
          printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
          printf (" [%s] %s\n", name, msg);
          got_errors = 1;
        }

        # Count an occurrence of NAME and report it if the rule is
        # enabled; at most 10 reports per rule and per file.
        function err(name, msg) {
          ++ counts[name];
          if (is_err(name) && counts[name] <= 10) {
            report_err(name, msg);
            if (counts[name] == 10){
              printf ("WARNING: too many [%s] in this file.", name);
              printf (" Others will not be reported.\n");
            }
          }
        }

        # Like err, but only report when rule GUARD is enabled; the
        # occurrence is counted in both cases.
        function err_if(guard, name, msg) {
          if (is_err(guard)) {
            err(name, msg);
          } else {
            ++ counts[name];
          }
        }

        # True when STR is wider than LIMIT columns, with TAB stops every
        # 8 columns. c and i are local variables.
        function more_columns(str, limit,    c, i){
          c = 0;
          for (i = 1; i <= length(str); i++){
            if (substr(str, i, 1) == "\t"){
              c = int((c + 8) / 8) * 8;
            }else{
              ++ c;
            }
          }
          return c > limit;
        }

        # When the utf8 rule is listed, replace every multi-byte sequence
        # by a single "?" so that length() counts characters, and record
        # that the utf8 attribute was useful.
        function utf8_decode(str) {
          if (is_err("utf8")) {
            return str;
          } else {
            # This script assumes that the UTF-8 has been externally
            # validated
            t = str;
            gsub(/[\300-\367][\200-\277]+/, "?", t);
            if (t != str) {
              ++ counts["utf8"];
            }
            return t;
          }
        }

        BEGIN { state = "(first line)"; }

        match($0, /\t/) {
          err("tab", "TAB character(s)");
          t = utf8_decode($0);
          if (more_columns(t, 80)){
            RSTART=81;
            RLENGTH = 0;
            err_if("very-long-line", "long-line",
                   "line is over 80 columns");
          }
          if (more_columns(t, 132)){
            RSTART=133;
            RLENGTH = 0;
            err("very-long-line", "line is over 132 columns");
          }
        }

        match($0, /[\200-\377]/) \
        && state != "authors" && state != "copyright" {
          if (is_err("utf8")) {
            err("non-ascii", "non-ASCII character(s)");
            if (header_utf8 && !is_err("non-ascii")) {
              err("non-ascii-utf8", \
                  "non-ASCII character(s) AND UTF-8 encountered");
            }
          } else {
            ++ counts["utf8"];
          }
        }

        match($0, /[^\t\200-\377 -~]/) {
          err("non-printing", "non-printing ASCII character(s)");
        }

        match($0, /[ \t]+$/) {
          err("white-at-eol", "whitespace at end of line");
        }

        match($0, /\$Id(: .*)?\$/) {
          err("svn-keyword", "SVN keyword marker");
        }

        # A trailing URL of 73 or more characters is not counted towards
        # the 80-column limit.
        $0 !~ /\t/ && length($0) > 80 {
          t = utf8_decode($0);
          sub(/https?:[A-Za-z0-9._~:\/?#\[\]@!$&\047()*+,;=%-]{73,}$/, "", t);
          if (length(t) > 80) {
            RSTART = 81;
            RLENGTH = 0;
            err_if("very-long-line", "long-line",
                   "line is over 80 columns");
          }
        }

        $0 !~ /\t/ && length($0) > 132 {
          RSTART = 133;
          RLENGTH = 0;
          t = utf8_decode($0);
          if (length(t) > 132) {
            err("very-long-line", "line is over 132 columns");
          }
        }

        # Record that the header contained UTF-8 sequences
        match($0, /[\300-\367][\200-\277]+/) \
        && (state == "authors" || state == "copyright") {
          header_utf8 = 1;
          if (counts["non-ascii"] > 0 && is_err("non-ascii")) {
            err("non-ascii-utf8", \
                "non-ASCII character(s) AND UTF-8 encountered");
          }
        }

        # Header-recognition automaton. Read this from bottom to top.
        # Valid UTF-8 chars are recognised in copyright and authors
        # TODO: ensure all files are valid UTF-8 before awking them.
        # Note that this code also assumes that combining characters are
        # NOT used (i.e. that every Unicode code-point corresponds to
        # exactly one displayed character, i.e. no Camels and no including
        # weird-and-wonderful ways of encoded accented letters).
        #
        # Every header line holds exactly 72 characters between its two
        # stars: 72 blanks, 33 + 5 + 34 for the title, 1 + 70 + 1 for
        # authors, 3 + 10 + 4 + 54 + 1 and 5 + 66 + 1 for copyright lines,
        # 3 + 20 + 1 + 47 + 1 for the first blurb line. The leading blanks
        # in the copyright and blurb patterns are therefore significant.

        state == "close" && $0 ~ /\*{74}/ { state = "OK"; }
        state == "close" { state = "(last line)"; }
        state == "blurb" && $0 ~ /\* {72}\*/ { state = "close"; }
        state == "blurb" && $0 ~ /\/LICENSE/ { state = "(license path)" }
        state == "blurb1" && $0 ~ /\*   All rights reserved. .{47} \*/ \
          { state = "blurb"; }
        state == "blurb1" { state = "(blurb line 1)"; }
        state == "copyright" && $0 ~ /\* {72}\*/ { state = "blurb1"; }
        state == "copyright" \
        && $0 !~ /\*   Copyright [0-9]{4}([\300-\367][\200-\277]+|.){54} \*/ \
        && $0 !~ /\*     ([\300-\367][\200-\277]+|.){66} \*/ \
          { state = "(copyright lines)"; }
        state == "authors" && $0 ~ /\* {72}\*/ { state = "copyright"; }
        state == "authors" \
        && $0 !~ /\* ([\300-\367][\200-\277]+|.){70} \*/ \
          { state = "(authors)"; }
        state == "blank2" && $0 ~ /\* {72}\*/ { state = "authors"; }
        state == "blank2" { state = "(blank line 2)"; }
        state == "title" && $0 ~ /\* {33}OCaml {34}\*/ { state = "blank2"; }
        state == "title" { state = "(title line)"; }
        state == "blank1" && $0 ~ /\* {72}\*/ { state = "title"; }
        state == "blank1" { state = "(blank line 1)"; }
        state == "(first line)" && NR < 4 && $0 ~ /\*{74}/ {
          state = "blank1";
        }

        {
          prev_line = last_line;
          last_line = $0;
        }

        END {
          # The shell side appended one newline: a non-empty last record
          # means that the file itself did not end with a linefeed.
          if (match(last_line, /.+/)){
            err("missing-lf", "missing linefeed at EOF");
            prev_line = last_line;
            ++ NR;
            empty_file = 0;
          }else{
            empty_file = NR == 1;
          }
          if (!empty_file && match(prev_line, /^$/)){
            err("white-at-eof", "empty line(s) at EOF");
          }
          if (state != "OK"){
            if (NR >= 10){
              NR = 1;
              RSTART = 1;
              RLENGTH = 0;
              err("missing-header",
                  sprintf("bad copyright header %s", state));
            }else{
              # Short files need no header; mark the attribute as used.
              counts["missing-header"] = 1;
            }
          }
          # Report attributes that disabled a rule which never fired,
          # except those carrying the "?" suffix.
          split(attr_rules, r, "[? ]");
          for (i in r){
            name = r[i];
            if (name != "" && !counts[name]){
              NR = 1;
              RSTART = 1;
              RLENGTH = 0;
              if (attr_rules !~ (" " name "\\? ")) {
                report_err(name, "attribute is unused");
              }
            }
          }
          exit got_errors;
        }
      ' ; then
      EXIT_CODE=1
    fi
  done
  exit $EXIT_CODE
)