// Scintilla source code edit control /** @file LexPerl.cxx ** Lexer for subset of Perl. **/ // Copyright 1998-2008 by Neil Hodgson // Lexical analysis fixes by Kein-Hong Man // The License.txt file describes the conditions under which this software may be distributed. #include #include #include #include #include #include "Platform.h" #include "PropSet.h" #include "Accessor.h" #include "KeyWords.h" #include "Scintilla.h" #include "SciLexer.h" #ifdef SCI_NAMESPACE using namespace Scintilla; #endif #define PERLNUM_BINARY 1 // order is significant: 1-4 cannot have a dot #define PERLNUM_HEX 2 #define PERLNUM_OCTAL 3 #define PERLNUM_FLOAT 4 // actually exponent part #define PERLNUM_DECIMAL 5 // 1-5 are numbers; 6-7 are strings #define PERLNUM_VECTOR 6 #define PERLNUM_V_VECTOR 7 #define PERLNUM_BAD 8 #define BACK_NONE 0 // lookback state for bareword disambiguation: #define BACK_OPERATOR 1 // whitespace/comments are insignificant #define BACK_KEYWORD 2 // operators/keywords are needed for disambiguation #define HERE_DELIM_MAX 256 static inline bool isEOLChar(char ch) { return (ch == '\r') || (ch == '\n'); } static bool isSingleCharOp(char ch) { char strCharSet[2]; strCharSet[0] = ch; strCharSet[1] = '\0'; return (NULL != strstr("rwxoRWXOezsfdlpSbctugkTBMAC", strCharSet)); } static inline bool isPerlOperator(char ch) { if (ch == '^' || ch == '&' || ch == '\\' || ch == '(' || ch == ')' || ch == '-' || ch == '+' || ch == '=' || ch == '|' || ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == ':' || ch == ';' || ch == '>' || ch == ',' || ch == '?' || ch == '!' || ch == '.' || ch == '~') return true; // these chars are already tested before this call // ch == '%' || ch == '*' || ch == '<' || ch == '/' || return false; } static bool isPerlKeyword(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler) { char s[100]; unsigned int i, len = end - start; if (len > 30) { len = 30; } for (i = 0; i < len; i++, start++) s[i] = styler[start]; s[i] = '\0'; return keywords.InList(s); } // Note: as lexer uses chars, UTF-8 bytes are considered as <0 values // Note: iswordchar() was used in only one place in LexPerl, it is // unnecessary as '.' is processed as the concatenation operator, so // only isWordStart() is used in LexPerl static inline bool isWordStart(char ch) { return !isascii(ch) || isalnum(ch) || ch == '_'; } static inline bool isEndVar(char ch) { return isascii(ch) && !isalnum(ch) && ch != '#' && ch != '$' && ch != '_' && ch != '\''; } static inline bool isNonQuote(char ch) { return !isascii(ch) || isalnum(ch) || ch == '_'; } static inline char actualNumStyle(int numberStyle) { if (numberStyle == PERLNUM_VECTOR || numberStyle == PERLNUM_V_VECTOR) { return SCE_PL_STRING; } else if (numberStyle == PERLNUM_BAD) { return SCE_PL_ERROR; } return SCE_PL_NUMBER; } static bool isMatch(Accessor &styler, int lengthDoc, int pos, const char *val) { if ((pos + static_cast(strlen(val))) >= lengthDoc) { return false; } while (*val) { if (*val != styler[pos++]) { return false; } val++; } return true; } static char opposite(char ch) { if (ch == '(') return ')'; if (ch == '[') return ']'; if (ch == '{') return '}'; if (ch == '<') return '>'; return ch; } static void ColourisePerlDoc(unsigned int startPos, int length, int initStyle, WordList *keywordlists[], Accessor &styler) { // Lexer for perl often has to backtrack to start of current style to determine // which characters are being used as quotes, how deeply nested is the // start position and what the termination string is for here documents WordList &keywords = *keywordlists[0]; // keywords that forces /PATTERN/ at all times WordList reWords; reWords.Set("elsif if split while"); class HereDocCls { public: int State; // 0: '<<' encountered // 1: collect the delimiter // 2: here doc text (lines after the delimiter) char Quote; // the char after '<<' bool Quoted; // true if Quote in ('\'','"','`') int DelimiterLength; // strlen(Delimiter) char *Delimiter; // the Delimiter, 256: sizeof PL_tokenbuf HereDocCls() { State = 0; Quote = 0; Quoted = false; DelimiterLength = 0; Delimiter = new char[HERE_DELIM_MAX]; Delimiter[0] = '\0'; } ~HereDocCls() { delete []Delimiter; } }; HereDocCls HereDoc; // TODO: FIFO for stacked here-docs class QuoteCls { public: int Rep; int Count; char Up; char Down; QuoteCls() { this->New(1); } void New(int r) { Rep = r; Count = 0; Up = '\0'; Down = '\0'; } void Open(char u) { Count++; Up = u; Down = opposite(Up); } }; QuoteCls Quote; int state = initStyle; char numState = PERLNUM_DECIMAL; int dotCount = 0; unsigned int lengthDoc = startPos + length; //int sookedpos = 0; // these have no apparent use, see POD state //char sooked[100]; //sooked[sookedpos] = '\0'; styler.StartAt(startPos, static_cast(STYLE_MAX)); // If in a long distance lexical state, seek to the beginning to find quote characters // Perl strings can be multi-line with embedded newlines, so backtrack. // Perl numbers have additional state during lexing, so backtrack too. if (state == SCE_PL_HERE_Q || state == SCE_PL_HERE_QQ || state == SCE_PL_HERE_QX) { while ((startPos > 1) && (styler.StyleAt(startPos) != SCE_PL_HERE_DELIM)) { startPos--; } startPos = styler.LineStart(styler.GetLine(startPos)); state = styler.StyleAt(startPos - 1); } // Backtrack for format body. if (state == SCE_PL_FORMAT) { while ((startPos > 1) && (styler.StyleAt(startPos) != SCE_PL_FORMAT_IDENT)) { startPos--; } startPos = styler.LineStart(styler.GetLine(startPos)); state = styler.StyleAt(startPos - 1); } if ( state == SCE_PL_STRING_Q || state == SCE_PL_STRING_QQ || state == SCE_PL_STRING_QX || state == SCE_PL_STRING_QR || state == SCE_PL_STRING_QW || state == SCE_PL_REGEX || state == SCE_PL_REGSUBST || state == SCE_PL_STRING || state == SCE_PL_BACKTICKS || state == SCE_PL_CHARACTER || state == SCE_PL_NUMBER || state == SCE_PL_IDENTIFIER || state == SCE_PL_ERROR || state == SCE_PL_SUB_PROTOTYPE ) { while ((startPos > 1) && (styler.StyleAt(startPos - 1) == state)) { startPos--; } state = SCE_PL_DEFAULT; } // lookback at start of lexing to set proper state for backflag // after this, they are updated when elements are lexed int backflag = BACK_NONE; unsigned int backPos = startPos; if (backPos > 0) { backPos--; int sty = SCE_PL_DEFAULT; while ((backPos > 0) && (sty = styler.StyleAt(backPos), sty == SCE_PL_DEFAULT || sty == SCE_PL_COMMENTLINE)) backPos--; if (sty == SCE_PL_OPERATOR) backflag = BACK_OPERATOR; else if (sty == SCE_PL_WORD) backflag = BACK_KEYWORD; } styler.StartAt(startPos, static_cast(STYLE_MAX)); char chPrev = styler.SafeGetCharAt(startPos - 1); if (startPos == 0) chPrev = '\n'; char chNext = styler[startPos]; styler.StartSegment(startPos); for (unsigned int i = startPos; i < lengthDoc; i++) { char ch = chNext; // if the current character is not consumed due to the completion of an // earlier style, lexing can be restarted via a simple goto restartLexer: chNext = styler.SafeGetCharAt(i + 1); char chNext2 = styler.SafeGetCharAt(i + 2); if (styler.IsLeadByte(ch)) { chNext = styler.SafeGetCharAt(i + 2); chPrev = ' '; i += 1; continue; } if ((chPrev == '\r' && ch == '\n')) { // skip on DOS/Windows styler.ColourTo(i, state); chPrev = ch; continue; } if (HereDoc.State == 1 && isEOLChar(ch)) { // Begin of here-doc (the line after the here-doc delimiter): // Lexically, the here-doc starts from the next line after the >>, but the // first line of here-doc seem to follow the style of the last EOL sequence HereDoc.State = 2; if (HereDoc.Quoted) { if (state == SCE_PL_HERE_DELIM) { // Missing quote at end of string! We are stricter than perl. // Colour here-doc anyway while marking this bit as an error. state = SCE_PL_ERROR; } styler.ColourTo(i - 1, state); switch (HereDoc.Quote) { case '\'': state = SCE_PL_HERE_Q ; break; case '"': state = SCE_PL_HERE_QQ; break; case '`': state = SCE_PL_HERE_QX; break; } } else { styler.ColourTo(i - 1, state); switch (HereDoc.Quote) { case '\\': state = SCE_PL_HERE_Q ; break; default : state = SCE_PL_HERE_QQ; } } } if (HereDoc.State == 4 && isEOLChar(ch)) { // Start of format body. HereDoc.State = 0; styler.ColourTo(i - 1, state); state = SCE_PL_FORMAT; } if (state == SCE_PL_DEFAULT) { if ((isascii(ch) && isdigit(ch)) || (isascii(chNext) && isdigit(chNext) && (ch == '.' || ch == 'v'))) { state = SCE_PL_NUMBER; backflag = BACK_NONE; numState = PERLNUM_DECIMAL; dotCount = 0; if (ch == '0') { // hex,bin,octal if (chNext == 'x') { numState = PERLNUM_HEX; } else if (chNext == 'b') { numState = PERLNUM_BINARY; } else if (isascii(chNext) && isdigit(chNext)) { numState = PERLNUM_OCTAL; } if (numState != PERLNUM_DECIMAL) { i++; ch = chNext; chNext = chNext2; } } else if (ch == 'v') { // vector numState = PERLNUM_V_VECTOR; } } else if (isWordStart(ch)) { // if immediately prefixed by '::', always a bareword state = SCE_PL_WORD; if (chPrev == ':' && styler.SafeGetCharAt(i - 2) == ':') { state = SCE_PL_IDENTIFIER; } unsigned int kw = i + 1; // first check for possible quote-like delimiter if (ch == 's' && !isNonQuote(chNext)) { state = SCE_PL_REGSUBST; Quote.New(2); } else if (ch == 'm' && !isNonQuote(chNext)) { state = SCE_PL_REGEX; Quote.New(1); } else if (ch == 'q' && !isNonQuote(chNext)) { state = SCE_PL_STRING_Q; Quote.New(1); } else if (ch == 'y' && !isNonQuote(chNext)) { state = SCE_PL_REGSUBST; Quote.New(2); } else if (ch == 't' && chNext == 'r' && !isNonQuote(chNext2)) { state = SCE_PL_REGSUBST; Quote.New(2); kw++; } else if (ch == 'q' && (chNext == 'q' || chNext == 'r' || chNext == 'w' || chNext == 'x') && !isNonQuote(chNext2)) { if (chNext == 'q') state = SCE_PL_STRING_QQ; else if (chNext == 'x') state = SCE_PL_STRING_QX; else if (chNext == 'r') state = SCE_PL_STRING_QR; else if (chNext == 'w') state = SCE_PL_STRING_QW; Quote.New(1); kw++; } else if (ch == 'x' && (chNext == '=' || // repetition !isWordStart(chNext) || (isdigit(chPrev) && isdigit(chNext)))) { state = SCE_PL_OPERATOR; } // if potentially a keyword, scan forward and grab word, then check // if it's really one; if yes, disambiguation test is performed // otherwise it is always a bareword and we skip a lot of scanning // note: keywords assumed to be limited to [_a-zA-Z] only if (state == SCE_PL_WORD) { while (isWordStart(styler.SafeGetCharAt(kw))) kw++; if (!isPerlKeyword(styler.GetStartSegment(), kw, keywords, styler)) { state = SCE_PL_IDENTIFIER; } } // if already SCE_PL_IDENTIFIER, then no ambiguity, skip this // for quote-like delimiters/keywords, attempt to disambiguate // to select for bareword, change state -> SCE_PL_IDENTIFIER if (state != SCE_PL_IDENTIFIER && i > 0) { unsigned int j = i; bool moreback = false; // true if passed newline/comments bool brace = false; // true if opening brace found char ch2; // first look backwards past whitespace/comments for EOLs // if BACK_NONE, neither operator nor keyword, so skip test if (backflag != BACK_NONE) { while (--j > backPos) { if (isEOLChar(styler.SafeGetCharAt(j))) moreback = true; } ch2 = styler.SafeGetCharAt(j); if (ch2 == '{' && !moreback) { // {bareword: possible variable spec brace = true; } else if ((ch2 == '&' && styler.SafeGetCharAt(j - 1) != '&') // &bareword: subroutine call || (ch2 == '>' && styler.SafeGetCharAt(j - 1) == '-') // ->bareword: part of variable spec || (ch2 == 'b' && styler.Match(j - 2, "su"))) { // sub bareword: subroutine declaration // (implied BACK_KEYWORD, no keywords end in 'sub'!) state = SCE_PL_IDENTIFIER; } // if status still ambiguous, look forward after word past // tabs/spaces only; if ch2 isn't one of '[{(,' it can never // match anything, so skip the whole thing j = kw; if (state != SCE_PL_IDENTIFIER && (ch2 == '{' || ch2 == '(' || ch2 == '['|| ch2 == ',') && kw < lengthDoc) { while (ch2 = styler.SafeGetCharAt(j), (ch2 == ' ' || ch2 == '\t') && j < lengthDoc) { j++; } if ((ch2 == '}' && brace) // {bareword}: variable spec || (ch2 == '=' && styler.SafeGetCharAt(j + 1) == '>')) { // [{(, bareword=>: hash literal state = SCE_PL_IDENTIFIER; } } } } backflag = BACK_NONE; // an identifier or bareword if (state == SCE_PL_IDENTIFIER) { if ((!isWordStart(chNext) && chNext != '\'') || (chNext == '.' && chNext2 == '.')) { // We need that if length of word == 1! // This test is copied from the SCE_PL_WORD handler. styler.ColourTo(i, SCE_PL_IDENTIFIER); state = SCE_PL_DEFAULT; } // a keyword } else if (state == SCE_PL_WORD) { i = kw - 1; if (ch == '_' && chNext == '_' && (isMatch(styler, lengthDoc, styler.GetStartSegment(), "__DATA__") || isMatch(styler, lengthDoc, styler.GetStartSegment(), "__END__"))) { styler.ColourTo(i, SCE_PL_DATASECTION); state = SCE_PL_DATASECTION; } else { if (isMatch(styler, lengthDoc, styler.GetStartSegment(), "format")) { state = SCE_PL_FORMAT_IDENT; HereDoc.State = 0; } else { state = SCE_PL_DEFAULT; } styler.ColourTo(i, SCE_PL_WORD); backflag = BACK_KEYWORD; backPos = i; } ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); // a repetition operator 'x' } else if (state == SCE_PL_OPERATOR) { state = SCE_PL_DEFAULT; goto handleOperator; // quote-like delimiter, skip one char if double-char delimiter } else { i = kw - 1; chNext = styler.SafeGetCharAt(i + 1); } } else if (ch == '#') { state = SCE_PL_COMMENTLINE; } else if (ch == '\"') { state = SCE_PL_STRING; Quote.New(1); Quote.Open(ch); backflag = BACK_NONE; } else if (ch == '\'') { if (chPrev == '&') { // Archaic call styler.ColourTo(i, state); } else { state = SCE_PL_CHARACTER; Quote.New(1); Quote.Open(ch); } backflag = BACK_NONE; } else if (ch == '`') { state = SCE_PL_BACKTICKS; Quote.New(1); Quote.Open(ch); backflag = BACK_NONE; } else if (ch == '$') { if ((chNext == '{') || isspacechar(chNext)) { styler.ColourTo(i, SCE_PL_SCALAR); } else { state = SCE_PL_SCALAR; if ((chNext == '`' && chNext2 == '`') || (chNext == ':' && chNext2 == ':')) { i += 2; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); } else { i++; ch = chNext; chNext = chNext2; } } backflag = BACK_NONE; } else if (ch == '@') { if (!isascii(chNext) || isalpha(chNext) || chNext == '#' || chNext == '$' || chNext == '_' || chNext == '+' || chNext == '-') { state = SCE_PL_ARRAY; } else if (chNext == ':' && chNext2 == ':') { state = SCE_PL_ARRAY; i += 2; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); } else if (chNext != '{' && chNext != '[') { styler.ColourTo(i, SCE_PL_ARRAY); } else { styler.ColourTo(i, SCE_PL_ARRAY); } backflag = BACK_NONE; } else if (ch == '%') { backflag = BACK_NONE; if (!isascii(chNext) || isalpha(chNext) || chNext == '#' || chNext == '$' || chNext == '_' || chNext == '!' || chNext == '^') { state = SCE_PL_HASH; i++; ch = chNext; chNext = chNext2; } else if (chNext == ':' && chNext2 == ':') { state = SCE_PL_HASH; i += 2; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); } else if (chNext == '{') { styler.ColourTo(i, SCE_PL_HASH); } else { goto handleOperator; } } else if (ch == '*') { backflag = BACK_NONE; char strch[2]; strch[0] = chNext; strch[1] = '\0'; if (chNext == ':' && chNext2 == ':') { state = SCE_PL_SYMBOLTABLE; i += 2; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); } else if (!isascii(chNext) || isalpha(chNext) || chNext == '_' || NULL != strstr("^/|,\\\";#%^:?<>)[]", strch)) { state = SCE_PL_SYMBOLTABLE; i++; ch = chNext; chNext = chNext2; } else if (chNext == '{') { styler.ColourTo(i, SCE_PL_SYMBOLTABLE); } else { if (chNext == '*') { // exponentiation i++; ch = chNext; chNext = chNext2; } goto handleOperator; } } else if (ch == '/' || (ch == '<' && chNext == '<')) { // Explicit backward peeking to set a consistent preferRE for // any slash found, so no longer need to track preferRE state. // Find first previous significant lexed element and interpret. // Test for HERE doc start '<<' shares this code, helps to // determine if it should be an operator. bool preferRE = false; bool isHereDoc = (ch == '<'); bool hereDocSpace = false; // these are for corner case: bool hereDocScalar = false; // SCALAR [whitespace] '<<' unsigned int bk = (i > 0)? i - 1: 0; unsigned int bkend; char bkch; styler.Flush(); if (styler.StyleAt(bk) == SCE_PL_DEFAULT) hereDocSpace = true; while ((bk > 0) && (styler.StyleAt(bk) == SCE_PL_DEFAULT || styler.StyleAt(bk) == SCE_PL_COMMENTLINE)) { bk--; } if (bk == 0) { // position 0 won't really be checked; rarely happens // hard to fix due to an unsigned index i preferRE = true; } else { int bkstyle = styler.StyleAt(bk); bkch = styler.SafeGetCharAt(bk); switch(bkstyle) { case SCE_PL_OPERATOR: preferRE = true; if (bkch == ')' || bkch == ']') { preferRE = false; } else if (bkch == '}') { // backtrack further, count balanced brace pairs // if a brace pair found, see if it's a variable int braceCount = 1; while (--bk > 0) { bkstyle = styler.StyleAt(bk); if (bkstyle == SCE_PL_OPERATOR) { bkch = styler.SafeGetCharAt(bk); if (bkch == ';') { // early out break; } else if (bkch == '}') { braceCount++; } else if (bkch == '{') { if (--braceCount == 0) break; } } } if (bk == 0) { // at beginning, true } else if (braceCount == 0) { // balanced { found, bk>0, skip more whitespace if (styler.StyleAt(--bk) == SCE_PL_DEFAULT) { while (bk > 0) { bkstyle = styler.StyleAt(--bk); if (bkstyle != SCE_PL_DEFAULT) break; } } bkstyle = styler.StyleAt(bk); if (bkstyle == SCE_PL_SCALAR || bkstyle == SCE_PL_ARRAY || bkstyle == SCE_PL_HASH || bkstyle == SCE_PL_SYMBOLTABLE || bkstyle == SCE_PL_OPERATOR) { preferRE = false; } } } break; case SCE_PL_IDENTIFIER: preferRE = true; if (bkch == '>') { // inputsymbol preferRE = false; break; } // backtrack to find "->" or "::" before identifier while (bk > 0 && styler.StyleAt(bk) == SCE_PL_IDENTIFIER) { bk--; } while (bk > 0) { bkstyle = styler.StyleAt(bk); if (bkstyle == SCE_PL_DEFAULT || bkstyle == SCE_PL_COMMENTLINE) { } else if (bkstyle == SCE_PL_OPERATOR) { bkch = styler.SafeGetCharAt(bk); // test for "->" and "::" if ((bkch == '>' && styler.SafeGetCharAt(bk - 1) == '-') || (bkch == ':' && styler.SafeGetCharAt(bk - 1) == ':')) { preferRE = false; break; } } else { // bare identifier, if '/', /PATTERN/ unless digit/space immediately after '/' // if '//', always expect defined-or operator to follow identifier if (!isHereDoc && (isspacechar(chNext) || isdigit(chNext) || chNext == '/')) preferRE = false; // HERE docs cannot have a space after the >> if (isspacechar(chNext)) preferRE = false; break; } bk--; } break; case SCE_PL_SCALAR: // for $var<< case hereDocScalar = true; break; // for HERE docs, always true for preferRE case SCE_PL_WORD: preferRE = true; if (isHereDoc) break; // adopt heuristics similar to vim-style rules: // keywords always forced as /PATTERN/: split, if, elsif, while // everything else /PATTERN/ unless digit/space immediately after '/' // for '//', defined-or favoured unless special keywords bkend = bk + 1; while (bk > 0 && styler.StyleAt(bk-1) == SCE_PL_WORD) { bk--; } if (isPerlKeyword(bk, bkend, reWords, styler)) break; if (isspacechar(chNext) || isdigit(chNext) || chNext == '/') preferRE = false; break; // other styles uses the default, preferRE=false case SCE_PL_POD: case SCE_PL_POD_VERB: case SCE_PL_HERE_Q: case SCE_PL_HERE_QQ: case SCE_PL_HERE_QX: preferRE = true; break; } } backflag = BACK_NONE; if (isHereDoc) { // handle HERE doc // if SCALAR whitespace '<<', *always* a HERE doc if (preferRE || (hereDocSpace && hereDocScalar)) { state = SCE_PL_HERE_DELIM; HereDoc.State = 0; } else { // << operator i++; ch = chNext; chNext = chNext2; goto handleOperator; } } else { // handle regexp if (preferRE) { state = SCE_PL_REGEX; Quote.New(1); Quote.Open(ch); } else { // / and // operators if (chNext == '/') { i++; ch = chNext; chNext = chNext2; } goto handleOperator; } } } else if (ch == '<') { // looks forward for matching > on same line unsigned int fw = i + 1; while (fw < lengthDoc) { char fwch = styler.SafeGetCharAt(fw); if (fwch == ' ') { if (styler.SafeGetCharAt(fw-1) != '\\' || styler.SafeGetCharAt(fw-2) != '\\') goto handleOperator; } else if (isEOLChar(fwch) || isspacechar(fwch)) { goto handleOperator; } else if (fwch == '>') { if ((fw - i) == 2 && // '<=>' case styler.SafeGetCharAt(fw-1) == '=') { goto handleOperator; } styler.ColourTo(fw, SCE_PL_IDENTIFIER); i = fw; ch = fwch; chNext = styler.SafeGetCharAt(i+1); } fw++; } if (fw == lengthDoc) goto handleOperator; } else if (ch == '=' // POD && isalpha(chNext) && (isEOLChar(chPrev))) { state = SCE_PL_POD; backflag = BACK_NONE; //sookedpos = 0; //sooked[sookedpos] = '\0'; } else if (ch == '-' // file test operators && isSingleCharOp(chNext) && !isalnum((chNext2 = styler.SafeGetCharAt(i+2)))) { styler.ColourTo(i + 1, SCE_PL_WORD); state = SCE_PL_DEFAULT; i++; ch = chNext; chNext = chNext2; backflag = BACK_NONE; } else if (ch == '-' // bareword promotion (-FOO cases) && ((isascii(chNext) && isalpha(chNext)) || chNext == '_') && backflag != BACK_NONE) { state = SCE_PL_IDENTIFIER; backflag = BACK_NONE; } else if (ch == '(' && i > 0) { // backtrack to identify if we're starting a sub prototype // for generality, we need to ignore whitespace/comments unsigned int bk = i - 1; // i > 0 tested above styler.Flush(); while (bk > 0 && (styler.StyleAt(bk) == SCE_PL_DEFAULT || styler.StyleAt(bk) == SCE_PL_COMMENTLINE)) { bk--; } if (bk == 0 || styler.StyleAt(bk) != SCE_PL_IDENTIFIER) // check identifier goto handleOperator; while (bk > 0 && (styler.StyleAt(bk) == SCE_PL_IDENTIFIER)) { bk--; } while (bk > 0 && (styler.StyleAt(bk) == SCE_PL_DEFAULT || styler.StyleAt(bk) == SCE_PL_COMMENTLINE)) { bk--; } if (bk < 2 || styler.StyleAt(bk) != SCE_PL_WORD // check "sub" keyword || !styler.Match(bk - 2, "sub")) // assume suffix is unique! goto handleOperator; state = SCE_PL_SUB_PROTOTYPE; backflag = BACK_NONE; backPos = i; // needed for restart } else if (isPerlOperator(ch)) { if (ch == '.' && chNext == '.') { // .. and ... i++; if (chNext2 == '.') { i++; } state = SCE_PL_DEFAULT; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); } handleOperator: styler.ColourTo(i, SCE_PL_OPERATOR); backflag = BACK_OPERATOR; backPos = i; } else if (ch == 4 || ch == 26) { // ^D and ^Z ends valid perl source styler.ColourTo(i, SCE_PL_DATASECTION); state = SCE_PL_DATASECTION; } else { // keep colouring defaults to make restart easier styler.ColourTo(i, SCE_PL_DEFAULT); } } else if (state == SCE_PL_NUMBER) { if (ch == '.') { if (chNext == '.') { // double dot is always an operator goto numAtEnd; } else if (numState <= PERLNUM_FLOAT) { // non-decimal number or float exponent, consume next dot styler.ColourTo(i - 1, SCE_PL_NUMBER); state = SCE_PL_DEFAULT; goto handleOperator; } else { // decimal or vectors allows dots dotCount++; if (numState == PERLNUM_DECIMAL) { if (dotCount > 1) { if (isdigit(chNext)) { // really a vector numState = PERLNUM_VECTOR; } else // number then dot goto numAtEnd; } } else { // vectors if (!isdigit(chNext)) // vector then dot goto numAtEnd; } } } else if (ch == '_') { // permissive underscoring for number and vector literals } else if (!isascii(ch) || isalnum(ch)) { if (numState == PERLNUM_VECTOR || numState == PERLNUM_V_VECTOR) { if (!isascii(ch) || isalpha(ch)) { if (dotCount == 0) { // change to word state = SCE_PL_IDENTIFIER; } else { // vector then word goto numAtEnd; } } } else if (numState == PERLNUM_DECIMAL) { if (ch == 'E' || ch == 'e') { // exponent numState = PERLNUM_FLOAT; if (chNext == '+' || chNext == '-') { i++; ch = chNext; chNext = chNext2; } } else if (!isascii(ch) || !isdigit(ch)) { // number then word goto numAtEnd; } } else if (numState == PERLNUM_FLOAT) { if (!isdigit(ch)) { // float then word goto numAtEnd; } } else if (numState == PERLNUM_OCTAL) { if (!isdigit(ch)) goto numAtEnd; else if (ch > '7') numState = PERLNUM_BAD; } else if (numState == PERLNUM_BINARY) { if (!isdigit(ch)) goto numAtEnd; else if (ch > '1') numState = PERLNUM_BAD; } else if (numState == PERLNUM_HEX) { int ch2 = toupper(ch); if (!isdigit(ch) && !(ch2 >= 'A' && ch2 <= 'F')) goto numAtEnd; } else {//(numState == PERLNUM_BAD) { if (!isdigit(ch)) goto numAtEnd; } } else { // complete current number or vector numAtEnd: styler.ColourTo(i - 1, actualNumStyle(numState)); state = SCE_PL_DEFAULT; goto restartLexer; } } else if (state == SCE_PL_IDENTIFIER) { if (!isWordStart(chNext) && chNext != '\'') { styler.ColourTo(i, SCE_PL_IDENTIFIER); state = SCE_PL_DEFAULT; ch = ' '; } } else { if (state == SCE_PL_COMMENTLINE) { if (isEOLChar(ch)) { styler.ColourTo(i - 1, state); state = SCE_PL_DEFAULT; goto restartLexer; } else if (isEOLChar(chNext)) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; } } else if (state == SCE_PL_HERE_DELIM) { // // From perldata.pod: // ------------------ // A line-oriented form of quoting is based on the shell ``here-doc'' // syntax. // Following a << you specify a string to terminate the quoted material, // and all lines following the current line down to the terminating // string are the value of the item. // The terminating string may be either an identifier (a word), // or some quoted text. // If quoted, the type of quotes you use determines the treatment of // the text, just as in regular quoting. // An unquoted identifier works like double quotes. // There must be no space between the << and the identifier. // (If you put a space it will be treated as a null identifier, // which is valid, and matches the first empty line.) // (This is deprecated, -w warns of this syntax) // The terminating string must appear by itself (unquoted and with no // surrounding whitespace) on the terminating line. // // From Bash info: // --------------- // Specifier format is: <<[-]WORD // Optional '-' is for removal of leading tabs from here-doc. // Whitespace acceptable after <<[-] operator. // if (HereDoc.State == 0) { // '<<' encountered bool gotspace = false; unsigned int oldi = i; if (chNext == ' ' || chNext == '\t') { // skip whitespace; legal for quoted delimiters gotspace = true; do { i++; chNext = styler.SafeGetCharAt(i + 1); } while ((i + 1 < lengthDoc) && (chNext == ' ' || chNext == '\t')); chNext2 = styler.SafeGetCharAt(i + 2); } HereDoc.State = 1; HereDoc.Quote = chNext; HereDoc.Quoted = false; HereDoc.DelimiterLength = 0; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; if (chNext == '\'' || chNext == '"' || chNext == '`') { // a quoted here-doc delimiter i++; ch = chNext; chNext = chNext2; HereDoc.Quoted = true; } else if (isspacechar(chNext) || isdigit(chNext) || chNext == '\\' || chNext == '=' || chNext == '$' || chNext == '@' || ((isalpha(chNext) || chNext == '_') && gotspace)) { // left shift << or <<= operator cases // restore position if operator i = oldi; styler.ColourTo(i, SCE_PL_OPERATOR); state = SCE_PL_DEFAULT; backflag = BACK_OPERATOR; backPos = i; HereDoc.State = 0; goto restartLexer; } else { // an unquoted here-doc delimiter, no special handling // (cannot be prefixed by spaces/tabs), or // symbols terminates; deprecated zero-length delimiter } } else if (HereDoc.State == 1) { // collect the delimiter backflag = BACK_NONE; if (HereDoc.Quoted) { // a quoted here-doc delimiter if (ch == HereDoc.Quote) { // closing quote => end of delimiter styler.ColourTo(i, state); state = SCE_PL_DEFAULT; } else { if (ch == '\\' && chNext == HereDoc.Quote) { // escaped quote i++; ch = chNext; chNext = chNext2; } HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; } } else { // an unquoted here-doc delimiter if (isalnum(ch) || ch == '_') { HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch; HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0'; } else { styler.ColourTo(i - 1, state); state = SCE_PL_DEFAULT; goto restartLexer; } } if (HereDoc.DelimiterLength >= HERE_DELIM_MAX - 1) { styler.ColourTo(i - 1, state); state = SCE_PL_ERROR; goto restartLexer; } } } else if (HereDoc.State == 2) { // state == SCE_PL_HERE_Q || state == SCE_PL_HERE_QQ || state == SCE_PL_HERE_QX if (isEOLChar(chPrev) && isMatch(styler, lengthDoc, i, HereDoc.Delimiter)) { i += HereDoc.DelimiterLength; chPrev = styler.SafeGetCharAt(i - 1); ch = styler.SafeGetCharAt(i); if (isEOLChar(ch)) { styler.ColourTo(i - 1, state); state = SCE_PL_DEFAULT; backflag = BACK_NONE; HereDoc.State = 0; goto restartLexer; } chNext = styler.SafeGetCharAt(i + 1); } } else if (state == SCE_PL_POD || state == SCE_PL_POD_VERB) { if (isEOLChar(chPrev)) { if (ch == ' ' || ch == '\t') { styler.ColourTo(i - 1, state); state = SCE_PL_POD_VERB; } else { styler.ColourTo(i - 1, state); state = SCE_PL_POD; if (ch == '=') { if (isMatch(styler, lengthDoc, i, "=cut")) { styler.ColourTo(i - 1 + 4, state); i += 4; state = SCE_PL_DEFAULT; ch = styler.SafeGetCharAt(i); //chNext = styler.SafeGetCharAt(i + 1); goto restartLexer; } } } } } else if (state == SCE_PL_SCALAR // variable names || state == SCE_PL_ARRAY || state == SCE_PL_HASH || state == SCE_PL_SYMBOLTABLE) { if (ch == ':' && chNext == ':') { // skip :: i++; ch = chNext; chNext = chNext2; } else if (isEndVar(ch)) { if (i == (styler.GetStartSegment() + 1)) { // Special variable: $(, $_ etc. styler.ColourTo(i, state); state = SCE_PL_DEFAULT; } else { styler.ColourTo(i - 1, state); state = SCE_PL_DEFAULT; goto restartLexer; } } } else if (state == SCE_PL_REGEX || state == SCE_PL_STRING_QR ) { if (!Quote.Up && !isspacechar(ch)) { Quote.Open(ch); } else if (ch == '\\' && Quote.Up != '\\') { // SG: Is it save to skip *every* escaped char? i++; ch = chNext; chNext = styler.SafeGetCharAt(i + 1); } else { if (ch == Quote.Down /*&& chPrev != '\\'*/) { Quote.Count--; if (Quote.Count == 0) { Quote.Rep--; if (Quote.Up == Quote.Down) { Quote.Count++; } } if (!isalpha(chNext)) { if (Quote.Rep <= 0) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; ch = ' '; } } } else if (ch == Quote.Up /*&& chPrev != '\\'*/) { Quote.Count++; } else if (!isascii(chNext) || !isalpha(chNext)) { if (Quote.Rep <= 0) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; ch = ' '; } } } } else if (state == SCE_PL_REGSUBST) { if (!Quote.Up && !isspacechar(ch)) { Quote.Open(ch); } else if (ch == '\\' && Quote.Up != '\\') { // SG: Is it save to skip *every* escaped char? i++; ch = chNext; chNext = styler.SafeGetCharAt(i + 1); } else { if (Quote.Count == 0 && Quote.Rep == 1) { /* We matched something like s(...) or tr{...} * and are looking for the next matcher characters, * which could be either bracketed ({...}) or non-bracketed * (/.../). * * Number-signs are problematic. If they occur after * the close of the first part, treat them like * a Quote.Up char, even if they actually start comments. * * If we find an alnum, we end the regsubst, and punt. * * Eric Promislow ericp@activestate.com Aug 9,2000 */ if (isspacechar(ch)) { // Keep going } else if (!isascii(ch) || isalnum(ch)) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; ch = ' '; } else { Quote.Open(ch); } } else if (ch == Quote.Down /*&& chPrev != '\\'*/) { Quote.Count--; if (Quote.Count == 0) { Quote.Rep--; } if (!isascii(chNext) || !isalpha(chNext)) { if (Quote.Rep <= 0) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; ch = ' '; } } if (Quote.Up == Quote.Down) { Quote.Count++; } } else if (ch == Quote.Up /*&& chPrev != '\\'*/) { Quote.Count++; } else if (!isascii(chNext) || !isalpha(chNext)) { if (Quote.Rep <= 0) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; ch = ' '; } } } } else if (state == SCE_PL_STRING_Q || state == SCE_PL_STRING_QQ || state == SCE_PL_STRING_QX || state == SCE_PL_STRING_QW || state == SCE_PL_STRING || state == SCE_PL_CHARACTER || state == SCE_PL_BACKTICKS ) { if (!Quote.Down && !isspacechar(ch)) { Quote.Open(ch); } else if (ch == '\\' && Quote.Up != '\\') { i++; ch = chNext; chNext = styler.SafeGetCharAt(i + 1); } else if (ch == Quote.Down) { Quote.Count--; if (Quote.Count == 0) { Quote.Rep--; if (Quote.Rep <= 0) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; ch = ' '; } if (Quote.Up == Quote.Down) { Quote.Count++; } } } else if (ch == Quote.Up) { Quote.Count++; } } else if (state == SCE_PL_SUB_PROTOTYPE) { char strch[2]; strch[0] = ch; strch[1] = '\0'; if (NULL != strstr("\\[$@%&*];", strch)) { // keep going } else if (ch == ')') { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; } else { // abandon prototype, restart from '(' i = backPos; styler.ColourTo(i, SCE_PL_OPERATOR); ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); state = SCE_PL_DEFAULT; } } else if (state == SCE_PL_FORMAT_IDENT) { // occupies different HereDoc states to avoid clashing with HERE docs if (HereDoc.State == 0) { if ((isascii(ch) && isalpha(ch)) || ch == '_' // probable identifier || ch == '=') { // no identifier HereDoc.State = 3; HereDoc.Quoted = false; // whitespace flag } else if (ch == ' ' || ch == '\t') { styler.ColourTo(i, SCE_PL_DEFAULT); } else { state = SCE_PL_DEFAULT; HereDoc.State = 0; goto restartLexer; } } if (HereDoc.State == 3) { // with just a '=', state goes 0->3->4 if (ch == '=') { styler.ColourTo(i, SCE_PL_FORMAT_IDENT); state = SCE_PL_DEFAULT; HereDoc.State = 4; } else if (ch == ' ' || ch == '\t') { HereDoc.Quoted = true; } else if (isEOLChar(ch) || (HereDoc.Quoted && ch != '=')) { // abandon format, restart from after 'format' i = backPos + 1; ch = styler.SafeGetCharAt(i); chNext = styler.SafeGetCharAt(i + 1); state = SCE_PL_DEFAULT; HereDoc.State = 0; } } } else if (state == SCE_PL_FORMAT) { if (isEOLChar(chPrev)) { styler.ColourTo(i - 1, state); if (ch == '.' && isEOLChar(chNext)) { styler.ColourTo(i, state); state = SCE_PL_DEFAULT; } } } } if (state == SCE_PL_ERROR) { break; } chPrev = ch; } styler.ColourTo(lengthDoc - 1, state); } static bool IsCommentLine(int line, Accessor &styler) { int pos = styler.LineStart(line); int eol_pos = styler.LineStart(line + 1) - 1; for (int i = pos; i < eol_pos; i++) { char ch = styler[i]; int style = styler.StyleAt(i); if (ch == '#' && style == SCE_PL_COMMENTLINE) return true; else if (ch != ' ' && ch != '\t') return false; } return false; } static void FoldPerlDoc(unsigned int startPos, int length, int, WordList *[], Accessor &styler) { bool foldComment = styler.GetPropertyInt("fold.comment") != 0; bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0; // Custom folding of POD and packages bool foldPOD = styler.GetPropertyInt("fold.perl.pod", 1) != 0; bool foldPackage = styler.GetPropertyInt("fold.perl.package", 1) != 0; unsigned int endPos = startPos + length; int visibleChars = 0; int lineCurrent = styler.GetLine(startPos); int levelPrev = SC_FOLDLEVELBASE; if (lineCurrent > 0) levelPrev = styler.LevelAt(lineCurrent - 1) >> 16; int levelCurrent = levelPrev; char chNext = styler[startPos]; char chPrev = styler.SafeGetCharAt(startPos - 1); int styleNext = styler.StyleAt(startPos); // Used at end of line to determine if the line was a package definition bool isPackageLine = false; bool isPodHeading = false; for (unsigned int i = startPos; i < endPos; i++) { char ch = chNext; chNext = styler.SafeGetCharAt(i + 1); int style = styleNext; styleNext = styler.StyleAt(i + 1); bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n'); bool atLineStart = isEOLChar(chPrev) || i == 0; // Comment folding if (foldComment && atEOL && IsCommentLine(lineCurrent, styler)) { if (!IsCommentLine(lineCurrent - 1, styler) && IsCommentLine(lineCurrent + 1, styler)) levelCurrent++; else if (IsCommentLine(lineCurrent - 1, styler) && !IsCommentLine(lineCurrent+1, styler)) levelCurrent--; } if (style == SCE_C_OPERATOR) { if (ch == '{') { levelCurrent++; } else if (ch == '}') { levelCurrent--; } } // Custom POD folding if (foldPOD && atLineStart) { int stylePrevCh = (i) ? styler.StyleAt(i - 1):SCE_PL_DEFAULT; if (style == SCE_PL_POD) { if (stylePrevCh != SCE_PL_POD && stylePrevCh != SCE_PL_POD_VERB) levelCurrent++; else if (styler.Match(i, "=cut")) levelCurrent--; else if (styler.Match(i, "=head")) isPodHeading = true; } else if (style == SCE_PL_DATASECTION) { if (ch == '=' && isalpha(chNext) && levelCurrent == SC_FOLDLEVELBASE) levelCurrent++; else if (styler.Match(i, "=cut") && levelCurrent > SC_FOLDLEVELBASE) levelCurrent--; else if (styler.Match(i, "=head")) isPodHeading = true; // if package used or unclosed brace, level > SC_FOLDLEVELBASE! // reset needed as level test is vs. SC_FOLDLEVELBASE else if (styler.Match(i, "__END__")) levelCurrent = SC_FOLDLEVELBASE; } } // Custom package folding if (foldPackage && atLineStart) { if (style == SCE_PL_WORD && styler.Match(i, "package")) { isPackageLine = true; } } if (atEOL) { int lev = levelPrev; if (isPodHeading) { lev = levelPrev - 1; lev |= SC_FOLDLEVELHEADERFLAG; isPodHeading = false; } // Check if line was a package declaration // because packages need "special" treatment if (isPackageLine) { lev = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG; levelCurrent = SC_FOLDLEVELBASE + 1; isPackageLine = false; } lev |= levelCurrent << 16; if (visibleChars == 0 && foldCompact) lev |= SC_FOLDLEVELWHITEFLAG; if ((levelCurrent > levelPrev) && (visibleChars > 0)) lev |= SC_FOLDLEVELHEADERFLAG; if (lev != styler.LevelAt(lineCurrent)) { styler.SetLevel(lineCurrent, lev); } lineCurrent++; levelPrev = levelCurrent; visibleChars = 0; } if (!isspacechar(ch)) visibleChars++; chPrev = ch; } // Fill in the real level of the next line, keeping the current flags as they will be filled in later int flagsNext = styler.LevelAt(lineCurrent) & ~SC_FOLDLEVELNUMBERMASK; styler.SetLevel(lineCurrent, levelPrev | flagsNext); } static const char * const perlWordListDesc[] = { "Keywords", 0 }; LexerModule lmPerl(SCLEX_PERL, ColourisePerlDoc, "perl", FoldPerlDoc, perlWordListDesc, 8);