#!/usr/bin/env python
# vim: set ts=8 sts=4 sw=4 expandtab autoindent fileencoding=utf-8:

import sys
import os
from optparse import OptionParser
import re
import fnmatch
import glob

# splice lines (std: lex.phase -- Phases of translation: 2.1.1.2)
def splice_lines(lines):
    accum = []
    for ln in lines:
        if not ln.endswith('\n'):
            if ln.endswith('\\'):
                err = ' ends with a backslash'
            else:
                err = ' does not end with a newline'
            try:
                f = lines.filename
            except AttributeError:
                sys.stderr.write('Warning: input file' + err + '\n')
            else:
                sys.stderr.write('Warning: ' + f() + err + '\n')

        if ln.endswith('\\\n'):
            accum.append(ln[:-2])
        else:
            accum.append(ln)
            yield ''.join(accum)
            accum = []
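
# Illustrative example (not part of the original script): a backslash-newline
# splices physical lines into one logical line, e.g.
#   list(splice_lines(['#define FOO \\\n', '    1\n']))
#   -> ['#define FOO     1\n']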

#RX_STRING = re.compile()
RX_LINE_COMMENT = re.compile(r'//(.*)')
RX_BLOCK_COMMENT_BEGIN = re.compile(r'/\*')
RX_BLOCK_COMMENT_END = re.compile(r'\*/\s*')  # skip trailing whitespace
# std: lex.operators (2.12.1)
RX_PUNCTUATION = re.compile(
    r"""  # 3-character punctuation
    ( \.\.\.                    # varargs ellipsis
    | ->\*                      # dereference member function pointer
    | <<= | >>=                 # bit-shift-assignment
    # 2-character punctuation
    | \#\#                      # preprocessor token concatenation
    | ::                        # scope resolution operator
    | \.\*                      # dereference member function pointer
    | \+= | -= | \*= | /= | %=  # arithmetic-assignment
    | &= | \|= | \^=            # bitwise op-assignment
    | ->                        # dereference member access
    | >> | <<                   # bit-shift
    | \+\+ | --                 # increment and decrement
    | == | != | <= | >=         # comparison
    | && | \|\|                 # logical and, logical or
    # 1-character punctuation
    | [{}#();:?+*/%^&|~!=<>,\[\]\.\-]
    ) \s*                       # skip trailing whitespace
    """, re.VERBOSE)
RX_IDENTIFIER = re.compile(r'([a-zA-Z_][a-zA-Z0-9_]*)\s*')
RX_PP_HEADERNAME = re.compile(r'((?:<[^>]+>)|(?:"[^"]+"))\s*')
RX_LITERAL = re.compile(
    r"""(?: (L?)                        # wide-char prefix
        (?: "((?: [^"\\] | \\. )*)"     # string literal
        |   '((?: [^'\\] | \\. )+)'     # character literal
        ) \s* )
    | (\.?\d (?: [a-zA-Z0-9_\.] | [eE][+-] )* ) \s*  # pp-number literal
    """, re.VERBOSE)

KEYWORDS = set([
    # alternative representations (std: lex.key, 2.11.2)
    'and', 'and_eq', 'bitand', 'bitor', 'compl', 'not', 'not_eq', 'or', 'or_eq', 'xor', 'xor_eq',

    # keywords
    'asm', 'auto', 'bool', 'break', 'case', 'catch',
    'char', 'class', 'const', 'const_cast', 'continue',
    'default', 'delete', 'do', 'double', 'dynamic_cast',
    'else', 'enum', 'explicit', 'export', 'extern',
    'false', 'float', 'for', 'friend', 'goto', 'if',
    'inline', 'int', 'long', 'mutable', 'namespace',
    'new', 'operator', 'private', 'protected', 'public',
    'register', 'reinterpret_cast', 'return', 'short',
    'signed', 'sizeof', 'static', 'static_cast', 'struct',
    'switch', 'template', 'this', 'throw', 'true', 'try',
    'typedef', 'typeid', 'typename', 'union', 'unsigned',
    'using', 'virtual', 'void', 'volatile', 'wchar_t', 'while',
])
assert (len(KEYWORDS) == 74)

class CppParseError(Exception):
    def __init__(s, value):
        s.value = value

    def __str__(s):
        return repr(s.value)

def match_pp_token(ln, lines):
    tok = RX_LINE_COMMENT.match(ln)
    if tok is not None:
        return '', 'comment', tok.group(1)
    tok = RX_BLOCK_COMMENT_BEGIN.match(ln)
    if tok is not None:
        end = RX_BLOCK_COMMENT_END.search(ln, 2)
        if end is None:
            accum = [ln[2:]]
            for xln in lines:
                end = RX_BLOCK_COMMENT_END.search(xln)
                if end is not None:
                    accum.append(xln[:end.start()])
                    return xln[end.end():], 'comment', ''.join(accum)
                else:
                    accum.append(xln)
            raise CppParseError('Unclosed block comment')
        else:
            return ln[end.end():], 'comment', ln[2:end.start()]
    tok = RX_LITERAL.match(ln)
    if tok is not None:
        ln = ln[tok.end():]
        if tok.group(2) is not None:
            return ln, 'string', tok.group(2)
        elif tok.group(3) is not None:
            return ln, 'character', tok.group(3)
        elif tok.group(4) is not None:
            return ln, 'number', tok.group(4)
        else:
            assert False, "One of the above should have matched"
    # has to come after string & character, because they can have an 'L' prefix
    tok = RX_IDENTIFIER.match(ln)
    if tok is not None:
        ln = ln[tok.end():]
        if tok.group(1) in KEYWORDS:
            return ln, 'keyword', tok.group(1)
        else:
            return ln, 'identifier', tok.group(1)
    # has to come after block comment, because block comments start with two valid punctuation characters
    # also has to come after pp-number, because pp-number can start with a '.'
    tok = RX_PUNCTUATION.match(ln)
    if tok is not None:
        return ln[tok.end():], 'punctuation', tok.group(1)
    # one of the above should have matched...
    raise CppParseError('Unmatched token: ' + repr(ln))
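
# Illustrative example (not part of the original script): match_pp_token
# consumes one token from the front of a line and returns the remainder
# alongside the token's type and text, e.g.
#   match_pp_token('foo(bar);\n', iter([]))
#   -> ('(bar);\n', 'identifier', 'foo')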

def match_pp_headername_or_token(ln, lines):
    tok = RX_PP_HEADERNAME.match(ln)
    if tok is not None:
        return ln[tok.end():], 'headername', tok.group(1)
    return match_pp_token(ln, lines)

def lex(lines):
    ln = ''
    while True:
        # grab the next line if this one is blank
        if ln == '':
            try:
                ln = next(lines).lstrip()
            except StopIteration:
                break

        # skip blank lines
        if ln == '':
            continue

        ln, toktype, toktext = match_pp_token(ln, lines)
        # preprocessor include directives are a little special
        # (the source path is a different token type that only matches in this context)
        # this if predicate also copes with null directives (C99 6.10.7)
        if toktype == 'punctuation' and toktext == '#' and ln != '':
            yield toktype, toktext
            ln, toktype, toktext = match_pp_token(ln, lines)
            while toktype == 'comment':
                yield toktype, toktext
                ln, toktype, toktext = match_pp_token(ln, lines)
            if toktype == 'identifier' and toktext == 'include':
                yield toktype, toktext
                ln, toktype, toktext = match_pp_headername_or_token(ln, lines)
                while toktype == 'comment':
                    yield toktype, toktext
                    ln, toktype, toktext = match_pp_headername_or_token(ln, lines)
                # if it didn't match a comment, it should have matched a headername,
                # or some other pp-token; either way, it can't match another
                # headername (though it might match more pp-tokens)
        yield toktype, toktext
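
# Illustrative example (not part of the original script): lexing one line of
# C++ yields (type, text) pairs:
#   list(lex(iter(['int x = 42; // done\n'])))
#   -> [('keyword', 'int'), ('identifier', 'x'), ('punctuation', '='),
#       ('number', '42'), ('punctuation', ';'), ('comment', ' done')]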

def collect_comments(tokens, comments):
    # consume tokens, appending comment text to 'comments', and return the
    # first non-comment token (or None if the stream is exhausted)
    for toktype, toktext in tokens:
        if toktype != 'comment':
            return toktype, toktext
        else:
            comments.append(toktext)

def skip_comments(tokens):
    # consume and discard comments, returning the first non-comment token
    for toktype, toktext in tokens:
        if toktype != 'comment':
            return toktype, toktext

class EnumItem:
    def __init__(s, identifier):
        s.identifier = identifier
        s.name = None
        s.skip = False

    def __str__(s):
        x = ['EnumItem(', repr(s.identifier)]
        if s.name is not None:
            x += [', name=', repr(s.name)]
        if s.skip:
            x += [', skip=True']
        x.append(')')
        return ''.join(x)

    def read_attrs(s, attrs):
        if 'skip' in attrs:
            s.skip = attrs['skip']
        if 'name' in attrs:
            s.name = attrs['name']

class EnumData:
    def __init__(s, identifier):
        s.identifier = identifier
        s.name = None
        s.prefix = None
        s.scope = None
        s.public = None
        s.items = []

    def __str__(s):
        x = [ 'EnumData:'
            , ' identifier: ' + repr(s.identifier)
            , ' name: ' + repr(s.name)
            , ' prefix: ' + repr(s.prefix)
            , ' scope: ' + repr(s.scope)
            , ' public: ' + repr(s.public)
            , ' items:']
        if s.items:
            for item in s.items:
                x.append('  ' + str(item))
        else:
            x.append('  pass')
        return '\n'.join(x)

    def read_attrs(s, attrs):
        if 'name' in attrs:
            s.name = attrs['name']
        if 'prefix' in attrs:
            s.prefix = attrs['prefix']
        if 'scope' in attrs:
            s.scope = attrs['scope']
        if 'public' in attrs:
            s.public = attrs['public']

    def ident(s):
        if s.name is not None:
            id = s.name
        else:
            if s.identifier != '':
                id = s.identifier
            else:
                raise Exception('Enum with no identifier')
        return id

    def write_c_table(s, fl):
        id = s.ident()
        scope_prefix = '' if s.scope is None else s.scope + '::'
        fl.write('const struct EnumItem ENUM_' + id + '[] = {\n')
        for item in s.items:
            if item.skip:
                continue
            id = item.name if item.name is not None else item.identifier
            fl.write('\t{ "' + id + '", int(' + scope_prefix + item.identifier + ') },\n')
        fl.write('\t{ 0, 0 },\n')
        fl.write('};\n')

    def write_c_table_table_row(s, fl, allTables=True):
        id = s.ident()
        if not allTables and not s.public:
            return
        fl.write('\t{ "' + id + '", ENUM_' + id + ' },\n')

    def write_c_header(s, fl):
        id = s.ident()
        fl.write('extern const struct EnumItem ENUM_' + id + '[];\n')

RX_ENUM_TAG = re.compile(r'<\s*enum((?:\s+[a-zA-Z_]+(?:=(\w+|\'[^\']*\'|"[^"]*"))?)*)\s*>')
RX_ENUM_ATTR = re.compile(r'([a-zA-Z_]+)(?:=(\w+|\'[^\']*\'|"[^"]*"))?')
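
# Illustrative example (not part of the original script; the enum and its
# attributes are hypothetical): tags are written inside comments on or near
# the enum declaration, e.g.
#   enum Thing { // <enum name=Thing prefix=THING_ public>
#       THING_A,
#       THING_B, // <enum skip>
#   };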

def extract_attributes(text):
    attr = {}
    for m in RX_ENUM_ATTR.finditer(text):
        if m.group(2) is not None:
            value = m.group(2)
            if (value[0] == '"' and value[-1] == '"') or (value[0] == "'" and value[-1] == "'"):
                value = value[1:-1]
            attr[m.group(1)] = value
        else:
            attr[m.group(1)] = True
    return attr
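
# Illustrative example (not part of the original script): bare attributes
# become boolean flags, and quoted values have their quotes stripped:
#   extract_attributes(' name="Thing" prefix=THING_ public')
#   -> {'name': 'Thing', 'prefix': 'THING_', 'public': True}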

def parse_enum(toktype, toktext, tokens, preceding_comment=None):
    assert toktype == 'keyword'
    assert toktext == 'enum'
    tag = []
    if preceding_comment is not None:
        tag.append(preceding_comment)
    toktype, toktext = collect_comments(tokens, tag)

    if toktype == 'keyword' and toktext in ['struct', 'class']:
        # C++11 'enum class' (strongly typed enum) declaration
        toktype, toktext = collect_comments(tokens, tag)

    if toktype == 'identifier':
        identifier = toktext
        toktype, toktext = collect_comments(tokens, tag)
    else:
        identifier = ''

    if toktype == 'punctuation' and toktext == ':':
        # C++11 enum with fixed underlying type
        # FIXME: look up correct grammar for enum underlying type
        # currently this should match any valid underlying type, but also matches various invalid things
        # that's ok, because exact syntactic and semantic checking is the compiler's job, not the job of scan_enums
        toktype, toktext = collect_comments(tokens, tag)
        while toktype == 'keyword' and toktext in ['signed', 'unsigned', 'long', 'short', 'int', 'char']:
            toktype, toktext = collect_comments(tokens, tag)
        if toktype == 'identifier':
            toktype, toktext = collect_comments(tokens, tag)

    if toktype == 'punctuation' and toktext == '{':
        # comments become part of the enum tag right up until
        # the identifier of the first element
        toktype, toktext = collect_comments(tokens, tag)

        tag = RX_ENUM_TAG.search(' '.join(tag))
        if tag is None:
            return None

        e = EnumData(identifier)
        e.read_attrs(extract_attributes(tag.group(1)))

        while toktype == 'identifier':
            item = EnumItem(toktext)
            tag = []
            toktype, toktext = collect_comments(tokens, tag)
            while toktype != 'punctuation' or toktext not in [',', '}']:
                toktype, toktext = collect_comments(tokens, tag)
            if toktype == 'punctuation' and toktext == ',':
                toktype, toktext = collect_comments(tokens, tag)

            tag = RX_ENUM_TAG.search(' '.join(tag))
            if tag is not None:
                item.read_attrs(extract_attributes(tag.group(1)))

            if item.name is None and e.prefix is not None and item.identifier.startswith(e.prefix):
                item.name = item.identifier[len(e.prefix):]
            e.items.append(item)

        if toktype != 'punctuation' or toktext != '}':
            raise Exception('Bad enum')

        return e
    else:
        # might be a forward declaration or a variable declaration
        return None

def write_license_header(fl):
    fl.write('/* Copyright © 2008-2020 Pioneer Developers. See AUTHORS.txt for details */\n')
    fl.write('/* Licensed under the terms of the GPL v3. See licenses/GPL-3.txt */\n')
    fl.write('\n')

def write_generation_header(fl):
    fl.write('/* THIS FILE IS AUTO-GENERATED, CHANGES WILL BE OVERWRITTEN */\n')
    fl.write('/* enum table generated by scan_enums.py */\n\n')

def write_header(enums, fl):
    write_license_header(fl)
    fl.write('#ifndef HX_GEN_ENUM_TABLES\n')
    fl.write('#define HX_GEN_ENUM_TABLES\n\n')
    write_generation_header(fl)
    fl.write('struct EnumItem { const char *name; int value; };\n')
    fl.write('struct EnumTable { const char *name; const EnumItem *first; };\n\n')
    for e in enums:
        e.write_c_header(fl)
    fl.write('\n')
    fl.write('extern const struct EnumTable ENUM_TABLES[];\n')
    fl.write('extern const struct EnumTable ENUM_TABLES_PUBLIC[];\n\n')
    fl.write('#endif\n')

def write_tables(enums, headers, hpath, fl):
    write_license_header(fl)
    write_generation_header(fl)
    if hpath is not None:
        fl.write('#include "' + hpath + '"\n')
    for h in headers:
        fl.write('#include "' + h + '"\n')
    fl.write('\n')
    for e in enums:
        e.write_c_table(fl)
        fl.write('\n')
    fl.write('const struct EnumTable ENUM_TABLES[] = {\n')
    for e in enums:
        e.write_c_table_table_row(fl)
    fl.write('\t{ 0, 0 },\n')
    fl.write('};\n\n')
    fl.write('const struct EnumTable ENUM_TABLES_PUBLIC[] = {\n')
    for e in enums:
        e.write_c_table_table_row(fl, False)
    fl.write('\t{ 0, 0 },\n')
    fl.write('};\n')
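
# Illustrative output (not part of the original script; continues the
# hypothetical 'Thing' enum above): write_c_table would emit
#   const struct EnumItem ENUM_Thing[] = {
#       { "A", int(THING_A) },
#       { 0, 0 },
#   };
# THING_B is omitted because it carries the '<enum skip>' tag, and THING_A is
# exported as "A" because the 'prefix' attribute strips the THING_ prefix.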

def extract_enums(lines):
    lines = splice_lines(lines)
    tokens = lex(lines)
    lastcomment = ''
    for toktype, toktext in tokens:
        if toktype == 'comment':
            lastcomment = toktext
        elif toktype == 'keyword' and toktext == 'enum':
            e = parse_enum(toktype, toktext, tokens, lastcomment)
            if e is not None:
                yield e
        else:
            # comments that don't immediately precede the 'enum' keyword
            # are discarded
            lastcomment = ''

def recursive_glob(basedir, pattern):
    for root, dirnames, filenames in os.walk(basedir):
        for name in fnmatch.filter(filenames, pattern):
            yield os.path.join(root, name)

def expand_dirs(args, pattern, recursive):
    for path in args:
        if path != '-' and os.path.isdir(path):
            if not pattern:
                sys.stderr.write("Warning: skipping directory input '" + path + "'\n")
                continue

            if recursive:
                for name in recursive_glob(path, pattern):
                    yield name
            else:
                for name in glob.iglob(os.path.join(path, pattern)):
                    yield name
        else:
            yield path

def main():
    oparse = OptionParser(usage='%prog [options] headers-to-scan')
    oparse.add_option('-o', '--output', type="string", dest="outfile", default='-',
        help="Specify the output file to write to (typically with a .c or .cpp extension). " +
             "Specify '-' to write to stdout (this is the default).")
    oparse.add_option('--header', type="string", dest="headerfile",
        help="Specify the header file to write to. If the main output file is not stdout " +
             "then this defaults to a file of the same name with the extension changed to .h; " +
             "otherwise, no header content is written.")
    oparse.add_option('--pattern', type='string', dest='pattern',
        help="Specify a file pattern to match for the input files (e.g., *.h). " +
             "This pattern is used to scan any directory inputs.")
    oparse.add_option('-r', '--recursive', dest='recursive', action='store_true', default=False,
        help="Scan directory inputs recursively (used with the --pattern argument).")
    (options, args) = oparse.parse_args()

    if options.headerfile is not None and options.outfile is None:
        oparse.error('if you specify --header you must also specify --output')

    # if no input files are specified, default to reading from stdin
    if not args:
        args = ['-']

    # scan input files and record list of headers that have enums
    enums = []
    headers = []
    allinputs = list(expand_dirs(args, options.pattern, options.recursive))
    allinputs.sort()
    for path in allinputs:
        try:
            if path == '-':
                es = list(extract_enums(sys.stdin))
            else:
                # the 'U' (universal newlines) open mode was removed in
                # Python 3.11; plain 'r' behaves the same on Python 3
                mode = 'rU' if sys.version_info[0] < 3 else 'r'
                with open(path, mode) as fl:
                    # skip an optional UTF-8 Byte Order Mark
                    if sys.version_info[0] >= 3:
                        hasbom = (fl.read(1) == '\uFEFF')
                    else:
                        hasbom = (fl.read(3) == '\xef\xbb\xbf')
                    if hasbom:
                        sys.stderr.write("Warning: file '" + path + "' uses a UTF-8 Byte Order Mark\n")
                    else:
                        fl.seek(0)

                    es = list(extract_enums(fl))
            if es:
                if options.outfile == '-':
                    hpath = os.path.basename(path)
                else:
                    hpath = os.path.relpath(path, os.path.dirname(options.outfile))
                headers.append(hpath)
                enums += es
        except CppParseError as e:
            if path == '-':
                prettypath = 'input'
            else:
                prettypath = "'" + path + "'"
            sys.stderr.write("Warning: C++ parse error in " + prettypath + ":\n")
            sys.stderr.write('  ' + e.value + '\n')
        except UnicodeDecodeError as e:
            if path == '-':
                prettypath = 'input'
            else:
                prettypath = "'" + path + "'"
            sys.stderr.write("Warning: UTF-8 decode error in " + prettypath + ":\n")
            sys.stderr.write('  ' + str(e) + '\n')

    if options.outfile == '-':
        # write to stdout (no header)
        write_tables(enums, headers, None, sys.stdout)
    else:
        # write to the specified file(s)
        assert options.outfile is not None
        cpath = options.outfile
        if options.headerfile is None:
            fname, ext = os.path.splitext(options.outfile)
            hpath = fname + '.h'
        else:
            hpath = options.headerfile

        with open(hpath, 'w') as fl:
            write_header(enums, fl)
        with open(cpath, 'w') as fl:
            write_tables(enums, headers, os.path.basename(hpath), fl)
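
# Illustrative usage (the paths are hypothetical): scan a source tree
# recursively and write the table source plus its companion header:
#   python scan_enums.py -o EnumTables.cpp --pattern '*.h' -r src/
# Since --header is omitted, the header defaults to EnumTables.h.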

if __name__ == '__main__' and not sys.flags.interactive:
    main()