LIBS: updated simplecpp

2021-08-06 15:11:57 +02:00 · 2021-08-06 15:11:57 +02:00 · 8940d77443
parent 7d6cadcfa9
commit 8940d77443
2 changed files with 289 additions and 25 deletions
--- a/contrib/libs/simplecpp/simplecpp.cpp
+++ b/contrib/libs/simplecpp/simplecpp.cpp
@ -23,6 +23,7 @@
 #include "simplecpp.h"

 #include <algorithm>
+#include <climits>
 #include <cstdlib>
 #include <cstring>
 #include <exception>
@ -1503,8 +1504,11 @@ namespace simplecpp {
                                expanded = true;
                            }
                        }
-                        if (!expanded)
+                        if (!expanded) {
                            tokens->push_back(new Token(*tok));
+                            if (tok->macro.empty() && (par > 0 || tok->str() != "("))
+                                tokens->back()->macro = name();
+                        }
                    }

                    if (tok->op == '(')
@ -1608,7 +1612,14 @@ namespace simplecpp {
                    if (sameline(tok, tok->next) && tok->next && tok->next->op == '#' && tok->next->next && tok->next->next->op == '#') {
                        if (!sameline(tok, tok->next->next->next))
                            throw invalidHashHash(tok->location, name());
-                        output->push_back(newMacroToken(expandArgStr(tok, parametertokens2), loc, isReplaced(expandedmacros)));
+                        TokenList new_output(files);
+                        if (!expandArg(&new_output, tok, parametertokens2))
+                            output->push_back(newMacroToken(tok->str(), loc, isReplaced(expandedmacros)));
+                        else if (new_output.empty()) // placemarker token
+                            output->push_back(newMacroToken("", loc, isReplaced(expandedmacros)));
+                        else
+                            for (const Token *tok2 = new_output.cfront(); tok2; tok2 = tok2->next)
+                                output->push_back(newMacroToken(tok2->str(), loc, isReplaced(expandedmacros)));
                        tok = tok->next;
                    } else {
                        tok = expandToken(output, loc, tok, macros, expandedmacros, parametertokens2);
@ -1804,29 +1815,13 @@ namespace simplecpp {
                    partok = it->second.expand(output, loc, partok, macros, expandedmacros);
                else {
                    output->push_back(newMacroToken(partok->str(), loc, isReplaced(expandedmacros)));
+                    output->back()->macro = partok->macro;
                    partok = partok->next;
                }
            }
            return true;
        }

-        /**
-         * Get string for token. If token is argument, the expanded string is returned.
-         * @param tok              The token
-         * @param parametertokens  parameters given when expanding this macro
-         * @return string
-         */
-        std::string expandArgStr(const Token *tok, const std::vector<const Token *> &parametertokens) const {
-            TokenList tokens(files);
-            if (expandArg(&tokens, tok, parametertokens)) {
-                std::string s;
-                for (const Token *tok2 = tokens.cfront(); tok2; tok2 = tok2->next)
-                    s += tok2->str();
-                return s;
-            }
-            return tok->str();
-        }
-
        /**
         * Expand #X => "X"
         * @param output  destination tokenlist
@ -2304,6 +2299,253 @@ static void simplifyName(simplecpp::TokenList &expr)
    }
 }

+/*
+ * Consumes at least minlen and at most maxlen characters (digits, inc. prefix) of s,
+ * starting at position pos, and converts them with std::strtoull in base base to an
+ * unsigned long long value. pos is advanced to the first unused element of s; this
+ * happens before the minlen check, so pos has also moved when the function throws.
+ * Returns ULLONG_MAX if the result is not representable and throws std::runtime_error
+ * ("expected digit") if fewer than minlen characters could be consumed.
+ */
+static unsigned long long stringToULLbounded(
+    const std::string& s,
+    std::size_t& pos,
+    int base = 0,
+    std::ptrdiff_t minlen = 1,
+    std::size_t maxlen = std::string::npos
+)
+{
+    std::string sub = s.substr(pos, maxlen); // private copy, so the conversion cannot look beyond maxlen characters
+    const char* start = sub.c_str();
+    char* end;
+    unsigned long long value = std::strtoull(start, &end, base); // NOTE(review): strtoull also accepts leading whitespace, a sign and (base 16) "0x" - confirm callers tolerate that
+    pos += end - start; // number of characters strtoull consumed
+    if (end - start < minlen)
+        throw std::runtime_error("expected digit");
+    return value;
+}
+
+/* Converts character literal (including prefix, but not ud-suffix)
+ * to long long value. Throws std::runtime_error for malformed or unsupported literals.
+ *
+ * Assumes ASCII-compatible single-byte encoded str for narrow literals
+ * and UTF-8 otherwise.
+ *
+ * For target assumes
+ * - execution character set encoding matching str
+ * - UTF-32 execution wide-character set encoding
+ * - requirements for __STDC_UTF_16__, __STDC_UTF_32__ and __STDC_ISO_10646__ satisfied
+ * - char16_t is 16bit wide
+ * - char32_t is 32bit wide
+ * - wchar_t is 32bit wide and unsigned
+ * - matching char signedness to host
+ * - matching sizeof(int) to host
+ *
+ * For host assumes
+ * - ASCII-compatible execution character set
+ *
+ * For host and target assumes
+ * - CHAR_BIT == 8
+ * - two's complement
+ *
+ * Implements multi-character narrow literals according to GCC's behavior,
+ * except multi code unit universal character names are not supported.
+ * Multi-character wide literals are not supported.
+ * Limited support of universal character names for non-UTF-8 execution character set encodings.
+ */
+long long simplecpp::characterLiteralToLL(const std::string& str)
+{
+    // default (no flag set) is wide/utf32, i.e. the L'...' and U'...' prefixes
+    bool narrow = false;
+    bool utf8 = false;
+    bool utf16 = false;
+
+    std::size_t pos;
+
+    if (str.size() >= 1 && str[0] == '\'') {
+        narrow = true;
+        pos = 1;
+    } else if (str.size() >= 2 && str[0] == 'u' && str[1] == '\'') {
+        utf16 = true;
+        pos = 2;
+    } else if (str.size() >= 3 && str[0] == 'u' && str[1] == '8' && str[2] == '\'') {
+        utf8 = true;
+        pos = 3;
+    } else if (str.size() >= 2 && (str[0] == 'L' || str[0] == 'U') && str[1] == '\'') {
+        pos = 2;
+    } else
+        throw std::runtime_error("expected a character literal");
+
+    unsigned long long multivalue = 0; // all characters read so far, each shifted in CHAR_BIT bits after the previous one
+
+    std::size_t nbytes = 0; // number of characters accumulated into multivalue
+
+    while (pos + 1 < str.size()) { // the last element is left for the closing-quote check after the loop
+        if (str[pos] == '\'' || str[pos] == '\n')
+            throw std::runtime_error("raw single quotes and newlines not allowed in character literals");
+
+        if (nbytes >= 1 && !narrow)
+            throw std::runtime_error("multiple characters only supported in narrow character literals");
+
+        unsigned long long value;
+
+        if (str[pos] == '\\') {
+            pos++;
+            char escape = str[pos++];
+
+            if (pos >= str.size())
+                throw std::runtime_error("unexpected end of character literal");
+
+            switch (escape) {
+            // obscure GCC extensions
+            case '%':
+            case '(':
+            case '[':
+            case '{':
+            // standard escape sequences
+            case '\'':
+            case '"':
+            case '?':
+            case '\\':
+                value = static_cast<unsigned char>(escape);
+                break;
+
+            case 'a':
+                value = static_cast<unsigned char>('\a');
+                break;
+            case 'b':
+                value = static_cast<unsigned char>('\b');
+                break;
+            case 'f':
+                value = static_cast<unsigned char>('\f');
+                break;
+            case 'n':
+                value = static_cast<unsigned char>('\n');
+                break;
+            case 'r':
+                value = static_cast<unsigned char>('\r');
+                break;
+            case 't':
+                value = static_cast<unsigned char>('\t');
+                break;
+            case 'v':
+                value = static_cast<unsigned char>('\v');
+                break;
+
+            // GCC extension for ESC character
+            case 'e':
+            case 'E':
+                value = static_cast<unsigned char>('\x1b');
+                break;
+
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+                // octal escape sequences consist of 1 to 3 digits; --pos steps back onto the digit already read as escape
+                value = stringToULLbounded(str, --pos, 8, 1, 3);
+                break;
+
+            case 'x':
+                // hexadecimal escape sequences consist of at least 1 digit (no upper bound on the count is passed)
+                value = stringToULLbounded(str, pos, 16);
+                break;
+
+            case 'u':
+            case 'U': {
+                // universal character names have exactly 4 or 8 digits
+                std::size_t ndigits = (escape == 'u' ? 4 : 8);
+                value = stringToULLbounded(str, pos, 16, ndigits, ndigits);
+
+                // UTF-8 encodes code points above 0x7f in multiple code units
+                // code points above 0x10ffff are not allowed
+                if (((narrow || utf8) && value > 0x7f) || (utf16 && value > 0xffff) || value > 0x10ffff)
+                    throw std::runtime_error("code point too large");
+
+                if (value >= 0xd800 && value <= 0xdfff)
+                    throw std::runtime_error("surrogate code points not allowed in universal character names");
+
+                break;
+            }
+
+            default:
+                throw std::runtime_error("invalid escape sequence");
+            }
+        } else {
+            value = static_cast<unsigned char>(str[pos++]);
+
+            if (!narrow && value >= 0x80) {
+                // Assuming this is a UTF-8 encoded code point.
+                // This decoder may not completely validate the input.
+                // Noncharacters are neither rejected nor replaced.
+
+                int additional_bytes;
+                if (value >= 0xf5)  // higher values would result in code points above 0x10ffff
+                    throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+                else if (value >= 0xf0)
+                    additional_bytes = 3;
+                else if (value >= 0xe0)
+                    additional_bytes = 2;
+                else if (value >= 0xc2) // 0xc0 and 0xc1 are always overlong 2-bytes encodings
+                    additional_bytes = 1;
+                else
+                    throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+
+                value &= (1 << (6 - additional_bytes)) - 1; // keep only the low (6 - additional_bytes) bits of the lead byte
+
+                while (additional_bytes--) {
+                    if (pos + 1 >= str.size())
+                        throw std::runtime_error("assumed UTF-8 encoded source, but character literal ends unexpectedly");
+
+                    unsigned char c = str[pos++];
+
+                    if (((c >> 6) != 2)    // ensure c has form 0b10xxxxxx
+                        || (!value && additional_bytes == 1 && c < 0xa0)    // overlong 3-bytes encoding
+                        || (!value && additional_bytes == 2 && c < 0x90))   // overlong 4-bytes encoding
+                        throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+
+                    value = (value << 6) | (c & ((1 << 7) - 1)); // the mask clears bit 7; bit 6 is already known to be 0 from the check above
+                }
+
+                if (value >= 0xd800 && value <= 0xdfff)
+                    throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+
+                if ((utf8 && value > 0x7f) || (utf16 && value > 0xffff) || value > 0x10ffff)
+                    throw std::runtime_error("code point too large");
+            }
+        }
+
+        if (((narrow || utf8) && value > std::numeric_limits<unsigned char>::max()) || (utf16 && value >> 16) || value >> 32)
+            throw std::runtime_error("numeric escape sequence too large");
+
+        multivalue <<= CHAR_BIT;
+        multivalue |= value;
+        nbytes++;
+    }
+
+    if (pos + 1 != str.size() || str[pos] != '\'')
+        throw std::runtime_error("missing closing quote in character literal");
+
+    if (!nbytes)
+        throw std::runtime_error("empty character literal");
+
+    // ordinary narrow character literal's value is determined by (possibly signed) char
+    if (narrow && nbytes == 1)
+        return static_cast<char>(multivalue);
+
+    // while multi-character literal's value is determined by (signed) int
+    if (narrow)
+        return static_cast<int>(multivalue);
+
+    // All other cases are unsigned. Since long long is at least 64bit wide,
+    // while the literals at most 32bit wide, the conversion preserves all values.
+    return multivalue;
+}
+
 static void simplifyNumbers(simplecpp::TokenList &expr)
 {
    for (simplecpp::Token *tok = expr.front(); tok; tok = tok->next) {
@ -2311,8 +2553,8 @@ static void simplifyNumbers(simplecpp::TokenList &expr)
            continue;
        if (tok->str().compare(0,2,"0x") == 0)
            tok->setstr(toString(stringToULL(tok->str())));
-        else if (tok->str()[0] == '\'')
-            tok->setstr(toString(tok->str()[1] & 0xffU));
+        else if (!tok->number && tok->str().find('\'') != tok->str().npos)
+            tok->setstr(toString(simplecpp::characterLiteralToLL(tok->str())));
    }
 }

@ -2562,7 +2804,7 @@ static bool preprocessToken(simplecpp::TokenList &output, const simplecpp::Token
    return true;
 }

-void simplecpp::preprocess(simplecpp::TokenList &output, const simplecpp::TokenList &rawtokens, std::vector<std::string> &files, std::map<std::string, simplecpp::TokenList *> &filedata, const simplecpp::DUI &dui, simplecpp::OutputList *outputList, std::list<simplecpp::MacroUsage> *macroUsage)
+void simplecpp::preprocess(simplecpp::TokenList &output, const simplecpp::TokenList &rawtokens, std::vector<std::string> &files, std::map<std::string, simplecpp::TokenList *> &filedata, const simplecpp::DUI &dui, simplecpp::OutputList *outputList, std::list<simplecpp::MacroUsage> *macroUsage, std::list<simplecpp::IfCond> *ifCond)
 {
    std::map<std::string, std::size_t> sizeOfType(rawtokens.sizeOfType);
    sizeOfType.insert(std::make_pair("char", sizeof(char)));
@ -2877,7 +3119,17 @@ void simplecpp::preprocess(simplecpp::TokenList &output, const simplecpp::TokenL
                        tok = tmp->previous;
                    }
                    try {
-                        conditionIsTrue = (evaluate(expr, sizeOfType) != 0);
+                        if (ifCond) {
+                            std::string E;
+                            for (const simplecpp::Token *tok = expr.cfront(); tok; tok = tok->next)
+                                E += (E.empty() ? "" : " ") + tok->str();
+                            const long long result = evaluate(expr, sizeOfType);
+                            conditionIsTrue = (result != 0);
+                            ifCond->push_back(IfCond(rawtok->location, E, result));
+                        } else {
+                            const long long result = evaluate(expr, sizeOfType);
+                            conditionIsTrue = (result != 0);
+                        }
                    } catch (const std::exception &e) {
                        if (outputList) {
                            Output out(rawtok->location.files);
--- a/contrib/libs/simplecpp/simplecpp.h
+++ b/contrib/libs/simplecpp/simplecpp.h
@ -108,7 +108,8 @@ namespace simplecpp {
        }

        void flags() {
-            name = (std::isalpha((unsigned char)string[0]) || string[0] == '_' || string[0] == '$');
+            name = (std::isalpha((unsigned char)string[0]) || string[0] == '_' || string[0] == '$')
+                   && (string.find('\'') == string.npos);
            comment = string.size() > 1U && string[0] == '/' && (string[1] == '/' || string[1] == '*');
            number = std::isdigit((unsigned char)string[0]) || (string.size() > 1U && string[0] == '-' && std::isdigit((unsigned char)string[1]));
            op = (string.size() == 1U) ? string[0] : '\0';
@ -287,6 +288,14 @@ namespace simplecpp {
        bool        macroValueKnown;
    };

+    /** One evaluated #if/#elif expression; preprocess() appends an entry after each successful evaluation when an ifCond list is passed */
+    struct SIMPLECPP_LIB IfCond {
+        explicit IfCond(const Location& location, const std::string &E, long long result) : location(location), E(E), result(result) {}
+        Location location; // location of the #if/#elif directive
+        std::string E; // preprocessed condition: the expression's token strings joined by single spaces
+        long long result; // value the condition evaluated to; non-zero means the condition is true
+    };
+
    /**
     * Command line preprocessor settings.
     * On the command line these are configured by -D, -U, -I, --include, -std
@ -300,6 +309,8 @@ namespace simplecpp {
        std::string std;
    };

+    SIMPLECPP_LIB long long characterLiteralToLL(const std::string& str);
+
    SIMPLECPP_LIB std::map<std::string, TokenList*> load(const TokenList &rawtokens, std::vector<std::string> &filenames, const DUI &dui, OutputList *outputList = NULL);

    /**
@ -312,8 +323,9 @@ namespace simplecpp {
     * @param dui defines, undefs, and include paths
     * @param outputList output: list that will receive output messages
     * @param macroUsage output: macro usage
+     * @param ifCond output: #if/#elif expressions
     */
-    SIMPLECPP_LIB void preprocess(TokenList &output, const TokenList &rawtokens, std::vector<std::string> &files, std::map<std::string, TokenList*> &filedata, const DUI &dui, OutputList *outputList = NULL, std::list<MacroUsage> *macroUsage = NULL);
+    SIMPLECPP_LIB void preprocess(TokenList &output, const TokenList &rawtokens, std::vector<std::string> &files, std::map<std::string, TokenList*> &filedata, const DUI &dui, OutputList *outputList = NULL, std::list<MacroUsage> *macroUsage = NULL, std::list<IfCond> *ifCond = NULL);

    /**
     * Deallocate data