geany/tagmanager/lregex.c
Nick Treleaven b079b19da0 Use GRegex for CTags instead of POSIX regex - GRegex is more
powerful. This also fixes a (HTML) performance issue on Windows. 
Geany will now print a debug warning when using the "b" CTags 
regex flag option for non-extended syntax. This is not currently 
used by Geany's parsers.
Note: GNU regex can't be removed yet as it's still used elsewhere 
by Geany.



git-svn-id: https://geany.svn.sourceforge.net/svnroot/geany/branches/unstable@5976 ea778897-0a13-0410-b9d1-a72fbfd435f5
2011-09-30 14:43:18 +00:00

701 lines
16 KiB
C

/*
* $Id$
*
* Copyright (c) 2000-2003, Darren Hiebert
*
* This source code is released for free distribution under the terms of the
* GNU General Public License.
*
* This module contains functions for applying regular expression matching.
*
* The code for utlizing the Gnu regex package with regards to processing the
* regex option and checking for regex matches was adapted from routines in
* Gnu etags.
*/
/*
* INCLUDE FILES
*/
#include "general.h" /* must always come first */
#include <string.h>
#include <glib.h>
#include <mio/mio.h>
#ifdef HAVE_REGCOMP
# include <ctype.h>
# include <stddef.h>
# ifdef HAVE_SYS_TYPES_H
# include <sys/types.h> /* declare off_t (not known to regex.h on FreeBSD) */
# endif
#endif
#include "main.h"
#include "entry.h"
#include "parse.h"
#include "read.h"
#ifdef HAVE_REGEX
/*
* MACROS
*/
/* Back-references \0 through \9 */
#define BACK_REFERENCE_COUNT 10
#if defined (HAVE_REGCOMP) && !defined (REGCOMP_BROKEN)
# define POSIX_REGEX
#endif
#define REGEX_NAME "Regex"
/*
* DATA DECLARATIONS
*/
#if defined (POSIX_REGEX)
struct sKind {
boolean enabled;
char letter;
char* name;
char* description;
};
enum pType { PTRN_TAG, PTRN_CALLBACK };
typedef struct {
GRegex *pattern;
enum pType type;
union {
struct {
char *name_pattern;
struct sKind kind;
} tag;
struct {
regexCallback function;
} callback;
} u;
} regexPattern;
#endif
typedef struct {
regexPattern *patterns;
unsigned int count;
} patternSet;
/*
* DATA DEFINITIONS
*/
static boolean regexBroken = FALSE;
/* Array of pattern sets, indexed by language */
static patternSet* Sets = NULL;
static int SetUpper = -1; /* upper language index in list */
/*
* FUNCTION DEFINITIONS
*/
static void clearPatternSet (const langType language)
{
if (language <= SetUpper)
{
patternSet* const set = Sets + language;
unsigned int i;
for (i = 0 ; i < set->count ; ++i)
{
regexPattern *p = &set->patterns [i];
g_regex_unref(p->pattern);
p->pattern = NULL;
if (p->type == PTRN_TAG)
{
eFree (p->u.tag.name_pattern);
p->u.tag.name_pattern = NULL;
eFree (p->u.tag.kind.name);
p->u.tag.kind.name = NULL;
if (p->u.tag.kind.description != NULL)
{
eFree (p->u.tag.kind.description);
p->u.tag.kind.description = NULL;
}
}
}
if (set->patterns != NULL)
eFree (set->patterns);
set->patterns = NULL;
set->count = 0;
}
}
/*
* Regex psuedo-parser
*/
static void makeRegexTag (
const vString* const name, const struct sKind* const kind)
{
Assert (kind != NULL);
if (kind->enabled)
{
tagEntryInfo e;
Assert (name != NULL && vStringLength (name) > 0);
initTagEntry (&e, vStringValue (name));
e.kind = kind->letter;
e.kindName = kind->name;
makeTagEntry (&e);
}
}
/*
* Regex pattern definition
*/
/* Take a string like "/blah/" and turn it into "blah", making sure
* that the first and last characters are the same, and handling
* quoted separator characters. Actually, stops on the occurrence of
* an unquoted separator. Also turns "\t" into a Tab character.
* Returns pointer to terminating separator. Works in place. Null
* terminates name string.
*/
static char* scanSeparators (char* name)
{
char sep = name [0];
char *copyto = name;
boolean quoted = FALSE;
for (++name ; *name != '\0' ; ++name)
{
if (quoted)
{
if (*name == sep)
*copyto++ = sep;
else if (*name == 't')
*copyto++ = '\t';
else
{
/* Something else is quoted, so preserve the quote. */
*copyto++ = '\\';
*copyto++ = *name;
}
quoted = FALSE;
}
else if (*name == '\\')
quoted = TRUE;
else if (*name == sep)
{
break;
}
else
*copyto++ = *name;
}
*copyto = '\0';
return name;
}
/* Parse `regexp', in form "/regex/name/[k,Kind/]flags" (where the separator
* character is whatever the first character of `regexp' is), by breaking it
* up into null terminated strings, removing the separators, and expanding
* '\t' into tabs. When complete, `regexp' points to the line matching
* pattern, a pointer to the name matching pattern is written to `name', a
* pointer to the kinds is written to `kinds' (possibly NULL), and a pointer
* to the trailing flags is written to `flags'. If the pattern is not in the
* correct format, a false value is returned.
*/
static boolean parseTagRegex (
char* const regexp, char** const name,
char** const kinds, char** const flags)
{
boolean result = FALSE;
const int separator = (unsigned char) regexp [0];
*name = scanSeparators (regexp);
if (*regexp == '\0')
printf ("regex: empty regexp\n");
else if (**name != separator)
printf ("regex: %s: incomplete regexp\n", regexp);
else
{
char* const third = scanSeparators (*name);
if (**name == '\0')
printf ("regex: %s: regexp missing name pattern\n", regexp);
if ((*name) [strlen (*name) - 1] == '\\')
printf ("regex: error in name pattern: \"%s\"\n", *name);
if (*third != separator)
printf ("regex: %s: regexp missing final separator\n", regexp);
else
{
char* const fourth = scanSeparators (third);
if (*fourth == separator)
{
*kinds = third;
scanSeparators (fourth);
*flags = fourth;
}
else
{
*flags = third;
*kinds = NULL;
}
result = TRUE;
}
}
return result;
}
static void addCompiledTagPattern (
const langType language, GRegex* const pattern,
char* const name, const char kind, char* const kindName,
char *const description)
{
patternSet* set;
regexPattern *ptrn;
if (language > SetUpper)
{
int i;
Sets = xRealloc (Sets, (language + 1), patternSet);
for (i = SetUpper + 1 ; i <= language ; ++i)
{
Sets [i].patterns = NULL;
Sets [i].count = 0;
}
SetUpper = language;
}
set = Sets + language;
set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern);
ptrn = &set->patterns [set->count];
set->count += 1;
ptrn->pattern = pattern;
ptrn->type = PTRN_TAG;
ptrn->u.tag.name_pattern = name;
ptrn->u.tag.kind.enabled = TRUE;
ptrn->u.tag.kind.letter = kind;
ptrn->u.tag.kind.name = kindName;
ptrn->u.tag.kind.description = description;
}
static void addCompiledCallbackPattern (
const langType language, GRegex* const pattern,
const regexCallback callback)
{
patternSet* set;
regexPattern *ptrn;
if (language > SetUpper)
{
int i;
Sets = xRealloc (Sets, (language + 1), patternSet);
for (i = SetUpper + 1 ; i <= language ; ++i)
{
Sets [i].patterns = NULL;
Sets [i].count = 0;
}
SetUpper = language;
}
set = Sets + language;
set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern);
ptrn = &set->patterns [set->count];
set->count += 1;
ptrn->pattern = pattern;
ptrn->type = PTRN_CALLBACK;
ptrn->u.callback.function = callback;
}
#if defined (POSIX_REGEX)
static GRegex* compileRegex (const char* const regexp, const char* const flags)
{
int cflags = G_REGEX_MULTILINE;
GRegex *result = NULL;
GError *error = NULL;
int i;
for (i = 0 ; flags != NULL && flags [i] != '\0' ; ++i)
{
switch ((int) flags [i])
{
case 'b': g_warning("CTags 'b' flag not supported by Geany!"); break;
case 'e': break;
case 'i': cflags |= G_REGEX_CASELESS; break;
default: printf ("regex: unknown regex flag: '%c'\n", *flags); break;
}
}
result = g_regex_new(regexp, cflags, 0, &error);
if (error)
{
printf ("regex: regcomp %s: %s\n", regexp, error->message);
g_error_free(error);
}
return result;
}
#endif
static void parseKinds (
const char* const kinds, char* const kind, char** const kindName,
char **description)
{
*kind = '\0';
*kindName = NULL;
*description = NULL;
if (kinds == NULL || kinds [0] == '\0')
{
*kind = 'r';
*kindName = eStrdup ("regex");
}
else if (kinds [0] != '\0')
{
const char* k = kinds;
if (k [0] != ',' && (k [1] == ',' || k [1] == '\0'))
*kind = *k++;
else
*kind = 'r';
if (*k == ',')
++k;
if (k [0] == '\0')
*kindName = eStrdup ("regex");
else
{
const char *const comma = strchr (k, ',');
if (comma == NULL)
*kindName = eStrdup (k);
else
{
*kindName = (char*) eMalloc (comma - k + 1);
strncpy (*kindName, k, comma - k);
(*kindName) [comma - k] = '\0';
k = comma + 1;
if (k [0] != '\0')
*description = eStrdup (k);
}
}
}
}
static void printRegexKind (const regexPattern *pat, unsigned int i, boolean indent)
{
const struct sKind *const kind = &pat [i].u.tag.kind;
const char *const indentation = indent ? " " : "";
Assert (pat [i].type == PTRN_TAG);
printf ("%s%c %s %s\n", indentation,
kind->letter != '\0' ? kind->letter : '?',
kind->description != NULL ? kind->description : kind->name,
kind->enabled ? "" : " [off]");
}
static void processLanguageRegex (const langType language,
const char* const parameter)
{
if (parameter == NULL || parameter [0] == '\0')
clearPatternSet (language);
else if (parameter [0] != '@')
addLanguageRegex (language, parameter);
else if (! doesFileExist (parameter + 1))
printf ("regex: cannot open regex file\n");
else
{
const char* regexfile = parameter + 1;
MIO* const mio = mio_new_file (regexfile, "r");
if (mio == NULL)
printf ("regex: %s\n", regexfile);
else
{
vString* const regex = vStringNew ();
while (readLine (regex, mio))
addLanguageRegex (language, vStringValue (regex));
mio_free (mio);
vStringDelete (regex);
}
}
}
/*
* Regex pattern matching
*/
#if defined (POSIX_REGEX)
static vString* substitute (
const char* const in, const char* out,
const int nmatch, const GMatchInfo* const minfo)
{
vString* result = vStringNew ();
const char* p;
for (p = out ; *p != '\0' ; p++)
{
if (*p == '\\' && isdigit ((int) *++p))
{
const int dig = *p - '0';
int so, eo;
if (0 < dig && dig < nmatch &&
g_match_info_fetch_pos(minfo, dig, &so, &eo) && so != -1)
{
const int diglen = eo - so;
vStringNCatS (result, in + so, diglen);
}
}
else if (*p != '\n' && *p != '\r')
vStringPut (result, *p);
}
vStringTerminate (result);
return result;
}
static void matchTagPattern (const vString* const line,
const regexPattern* const patbuf,
const GMatchInfo* const minfo)
{
vString *const name = substitute (vStringValue (line),
patbuf->u.tag.name_pattern, BACK_REFERENCE_COUNT, minfo);
vStringStripLeading (name);
vStringStripTrailing (name);
if (vStringLength (name) > 0)
makeRegexTag (name, &patbuf->u.tag.kind);
else
error (WARNING, "%s:%ld: null expansion of name pattern \"%s\"",
getInputFileName (), getInputLineNumber (),
patbuf->u.tag.name_pattern);
vStringDelete (name);
}
static void matchCallbackPattern (
const vString* const line, const regexPattern* const patbuf,
const GMatchInfo* const minfo)
{
regexMatch matches [BACK_REFERENCE_COUNT];
unsigned int count = 0;
int i;
for (i = 0 ; i < BACK_REFERENCE_COUNT ; ++i)
{
int so, eo;
if (!g_match_info_fetch_pos(minfo, i, &so, &eo) || so == -1)
break;
matches [i].start = so;
matches [i].length = eo - so;
++count;
}
patbuf->u.callback.function (vStringValue (line), matches, count);
}
static boolean matchRegexPattern (const vString* const line,
const regexPattern* const patbuf)
{
boolean result = FALSE;
GMatchInfo *minfo;
if (g_regex_match(patbuf->pattern, vStringValue(line), 0, &minfo))
{
result = TRUE;
if (patbuf->type == PTRN_TAG)
matchTagPattern (line, patbuf, minfo);
else if (patbuf->type == PTRN_CALLBACK)
matchCallbackPattern (line, patbuf, minfo);
else
{
Assert ("invalid pattern type" == NULL);
result = FALSE;
}
}
g_match_info_free(minfo);
return result;
}
#endif
/* PUBLIC INTERFACE */
/* Match against all patterns for specified language. Returns true if at least
* on pattern matched.
*/
extern boolean matchRegex (const vString* const line, const langType language)
{
boolean result = FALSE;
if (language != LANG_IGNORE && language <= SetUpper &&
Sets [language].count > 0)
{
const patternSet* const set = Sets + language;
unsigned int i;
for (i = 0 ; i < set->count ; ++i)
if (matchRegexPattern (line, set->patterns + i))
result = TRUE;
}
return result;
}
extern void findRegexTags (void)
{
/* merely read all lines of the file */
while (fileReadLine () != NULL)
;
}
#endif /* HAVE_REGEX */
extern void addTagRegex (
const langType language __unused__,
const char* const regex __unused__,
const char* const name __unused__,
const char* const kinds __unused__,
const char* const flags __unused__)
{
#ifdef HAVE_REGEX
Assert (regex != NULL);
Assert (name != NULL);
if (! regexBroken)
{
GRegex* const cp = compileRegex (regex, flags);
if (cp != NULL)
{
char kind;
char* kindName;
char* description;
parseKinds (kinds, &kind, &kindName, &description);
addCompiledTagPattern (language, cp, eStrdup (name),
kind, kindName, description);
}
}
#endif
}
extern void addCallbackRegex (
const langType language __unused__,
const char* const regex __unused__,
const char* const flags __unused__,
const regexCallback callback __unused__)
{
#ifdef HAVE_REGEX
Assert (regex != NULL);
if (! regexBroken)
{
GRegex* const cp = compileRegex (regex, flags);
if (cp != NULL)
addCompiledCallbackPattern (language, cp, callback);
}
#endif
}
extern void addLanguageRegex (
const langType language __unused__, const char* const regex __unused__)
{
#ifdef HAVE_REGEX
if (! regexBroken)
{
char *const regex_pat = eStrdup (regex);
char *name, *kinds, *flags;
if (parseTagRegex (regex_pat, &name, &kinds, &flags))
{
addTagRegex (language, regex_pat, name, kinds, flags);
eFree (regex_pat);
}
}
#endif
}
/*
* Regex option parsing
*/
extern boolean processRegexOption (const char *const option,
const char *const parameter __unused__)
{
boolean handled = FALSE;
const char* const dash = strchr (option, '-');
if (dash != NULL && strncmp (option, "regex", dash - option) == 0)
{
#ifdef HAVE_REGEX
langType language;
language = getNamedLanguage (dash + 1);
if (language == LANG_IGNORE)
printf ("regex: unknown language \"%s\" in --%s option\n", (dash + 1), option);
else
processLanguageRegex (language, parameter);
#else
printf ("regex: regex support not available; required for --%s option\n",
option);
#endif
handled = TRUE;
}
return handled;
}
extern void disableRegexKinds (const langType language __unused__)
{
#ifdef HAVE_REGEX
if (language <= SetUpper && Sets [language].count > 0)
{
patternSet* const set = Sets + language;
unsigned int i;
for (i = 0 ; i < set->count ; ++i)
if (set->patterns [i].type == PTRN_TAG)
set->patterns [i].u.tag.kind.enabled = FALSE;
}
#endif
}
extern boolean enableRegexKind (
const langType language __unused__,
const int kind __unused__, const boolean mode __unused__)
{
boolean result = FALSE;
#ifdef HAVE_REGEX
if (language <= SetUpper && Sets [language].count > 0)
{
patternSet* const set = Sets + language;
unsigned int i;
for (i = 0 ; i < set->count ; ++i)
if (set->patterns [i].type == PTRN_TAG &&
set->patterns [i].u.tag.kind.letter == kind)
{
set->patterns [i].u.tag.kind.enabled = mode;
result = TRUE;
}
}
#endif
return result;
}
extern void printRegexKinds (const langType language __unused__, boolean indent __unused__)
{
#ifdef HAVE_REGEX
if (language <= SetUpper && Sets [language].count > 0)
{
patternSet* const set = Sets + language;
unsigned int i;
for (i = 0 ; i < set->count ; ++i)
if (set->patterns [i].type == PTRN_TAG)
printRegexKind (set->patterns, i, indent);
}
#endif
}
extern void freeRegexResources (void)
{
#ifdef HAVE_REGEX
int i;
for (i = 0 ; i <= SetUpper ; ++i)
clearPatternSet (i);
if (Sets != NULL)
eFree (Sets);
Sets = NULL;
SetUpper = -1;
#endif
}
/* Check for broken regcomp() on Cygwin */
extern void checkRegex (void)
{
#if 0 && defined (HAVE_REGEX) && defined (CHECK_REGCOMP)
regex_t patbuf;
int errcode;
if (regcomp (&patbuf, "/hello/", 0) != 0)
{
error (WARNING, "Disabling broken regex");
regexBroken = TRUE;
}
#endif
}
/* vi:set tabstop=4 shiftwidth=4: */