570 lines
9.5 KiB
C
570 lines
9.5 KiB
C
#include <u.h>
|
|
#include <libc.h>
|
|
#include <regexp.h>
|
|
#include "regimpl.h"
|
|
|
|
int instrcnt[] = {
|
|
[TANY] 2,
|
|
[TBOL] 1,
|
|
[TCAT] 0,
|
|
[TCLASS] 1,
|
|
[TEOL] 1,
|
|
[TNOTNL] 1,
|
|
[TOR] 2,
|
|
[TPLUS] 1,
|
|
[TQUES] 1,
|
|
[TRUNE] 1,
|
|
[TSTAR] 2,
|
|
[TSUB] 2
|
|
};
|
|
|
|
static Renode*
|
|
node(Parselex *plex, int op, Renode *l, Renode *r)
|
|
{
|
|
Renode *n;
|
|
|
|
plex->instrs += instrcnt[op];
|
|
n = plex->next++;
|
|
n->op = op;
|
|
n->left = l;
|
|
n->right = r;
|
|
return n;
|
|
}
|
|
|
|
static Renode*
|
|
e3(Parselex *plex)
|
|
{
|
|
char error[128];
|
|
Renode *n;
|
|
|
|
switch(lex(plex)) {
|
|
case LANY:
|
|
return node(plex, TANY, nil, nil);
|
|
case LBOL:
|
|
return node(plex, TBOL, nil, nil);
|
|
case LEOL:
|
|
return node(plex, TEOL, nil, nil);
|
|
case LRUNE:
|
|
n = node(plex, TRUNE, nil, nil);
|
|
n->r = plex->rune;
|
|
return n;
|
|
case LCLASS:
|
|
if(plex->nc)
|
|
return buildclassn(plex);
|
|
return buildclass(plex);
|
|
case LLPAR:
|
|
n = e0(plex);
|
|
n = node(plex, TSUB, n, nil);
|
|
if(lex(plex) != LRPAR) {
|
|
snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
|
|
regerror(error);
|
|
longjmp(plex->exitenv, 1);
|
|
}
|
|
return n;
|
|
default:
|
|
if(plex->rune)
|
|
snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
|
|
else
|
|
snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
|
|
regerror(error);
|
|
longjmp(plex->exitenv, 1);
|
|
}
|
|
return nil;
|
|
}
|
|
|
|
static Renode*
|
|
e2(Parselex *plex)
|
|
{
|
|
Renode *n;
|
|
|
|
n = e3(plex);
|
|
while(lex(plex) == LREP) {
|
|
switch(plex->rune) {
|
|
case L'*':
|
|
n = node(plex, TSTAR, n, nil);
|
|
break;
|
|
case L'+':
|
|
n = node(plex, TPLUS, n, nil);
|
|
break;
|
|
case L'?':
|
|
n = node(plex, TQUES, n, nil);
|
|
break;
|
|
}
|
|
}
|
|
plex->peek = 1;
|
|
return n;
|
|
}
|
|
|
|
static Renode*
|
|
invert(Renode *n)
|
|
{
|
|
Renode *n1;
|
|
|
|
if(n->op != TCAT)
|
|
return n;
|
|
while(n->left->op == TCAT) {
|
|
n1 = n->left;
|
|
n->left = n1->right;
|
|
n1->right = n;
|
|
n = n1;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
static Renode*
|
|
e1(Parselex *plex)
|
|
{
|
|
Renode *n;
|
|
int sym;
|
|
|
|
n = e2(plex);
|
|
for(;;) {
|
|
sym = lex(plex);
|
|
if(sym == LEND || sym == LOR || sym == LRPAR)
|
|
break;
|
|
plex->peek = 1;
|
|
n = node(plex, TCAT, n, e2(plex));
|
|
}
|
|
plex->peek = 1;
|
|
return invert(n);
|
|
}
|
|
|
|
static Renode*
|
|
e0(Parselex *plex)
|
|
{
|
|
Renode *n;
|
|
|
|
n = e1(plex);
|
|
for(;;) {
|
|
if(lex(plex) != LOR)
|
|
break;
|
|
n = node(plex, TOR, n, e1(plex));
|
|
}
|
|
plex->peek = 1;
|
|
return n;
|
|
}
|
|
|
|
static Parselex*
|
|
initplex(Parselex *plex, char *regstr, int lit)
|
|
{
|
|
plex->getnextr = lit ? getnextrlit : getnextr;
|
|
plex->rawexp = plex->orig = regstr;
|
|
plex->sub = 0;
|
|
plex->instrs = 0;
|
|
plex->peek = 0;
|
|
plex->done = 0;
|
|
return plex;
|
|
}
|
|
|
|
static Reprog*
|
|
regcomp1(char *regstr, int nl, int lit)
|
|
{
|
|
Reprog *reprog;
|
|
Parselex plex;
|
|
Renode *parsetr;
|
|
int regstrlen, maxthr;
|
|
|
|
regstrlen = utflen(regstr);
|
|
initplex(&plex, regstr, lit);
|
|
plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
|
|
if(plex.nodes == nil)
|
|
return nil;
|
|
plex.next = plex.nodes;
|
|
|
|
if(setjmp(plex.exitenv) != 0) {
|
|
free(plex.nodes);
|
|
return nil;
|
|
}
|
|
|
|
maxthr = regstrlen + 1;
|
|
parsetr = node(&plex, TSUB, e0(&plex), nil);
|
|
|
|
// prtree(parsetr, 0, 1);
|
|
reprog = malloc(sizeof(Reprog) +
|
|
sizeof(Reinst) * plex.instrs +
|
|
sizeof(Rethread) * maxthr);
|
|
reprog->len = plex.instrs;
|
|
reprog->nthr = maxthr;
|
|
reprog->startinst = compile(parsetr, reprog, nl);
|
|
reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
|
|
reprog->regstr = regstr;
|
|
|
|
free(plex.nodes);
|
|
return reprog;
|
|
}
|
|
|
|
Reprog*
|
|
regcomp(char *str)
|
|
{
|
|
return regcomp1(str, 0, 0);
|
|
}
|
|
|
|
Reprog*
|
|
regcomplit(char *str)
|
|
{
|
|
return regcomp1(str, 0, 1);
|
|
}
|
|
|
|
Reprog*
|
|
regcompnl(char *str)
|
|
{
|
|
return regcomp1(str, 1, 0);
|
|
}
|
|
|
|
static Reinst*
|
|
compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
|
|
{
|
|
Reinst *i;
|
|
int s;
|
|
|
|
Tailcall:
|
|
if(renode == nil)
|
|
return reinst;
|
|
switch(renode->op) {
|
|
case TCLASS:
|
|
reinst->op = OCLASS;
|
|
reinst->r = renode->r;
|
|
reinst->r1 = renode->r1;
|
|
reinst->a = reinst + 1 + renode->nclass;
|
|
renode = renode->left;
|
|
reinst++;
|
|
goto Tailcall;
|
|
case TCAT:
|
|
reinst = compile1(renode->left, reinst, sub, nl);
|
|
renode = renode->right;
|
|
goto Tailcall;
|
|
case TOR:
|
|
reinst->op = OSPLIT;
|
|
reinst->a = reinst + 1;
|
|
i = compile1(renode->left, reinst->a, sub, nl);
|
|
reinst->b = i + 1;
|
|
i->op = OJMP;
|
|
i->a = compile1(renode->right, reinst->b, sub, nl);
|
|
return i->a;
|
|
case TSTAR:
|
|
reinst->op = OSPLIT;
|
|
reinst->a = reinst + 1;
|
|
i = compile1(renode->left, reinst->a, sub, nl);
|
|
reinst->b = i + 1;
|
|
i->op = OJMP;
|
|
i->a = reinst;
|
|
return reinst->b;
|
|
case TPLUS:
|
|
i = reinst;
|
|
reinst = compile1(renode->left, reinst, sub, nl);
|
|
reinst->op = OSPLIT;
|
|
reinst->a = i;
|
|
reinst->b = reinst + 1;
|
|
return reinst->b;
|
|
case TQUES:
|
|
reinst->op = OSPLIT;
|
|
reinst->a = reinst + 1;
|
|
reinst->b = compile1(renode->left, reinst->a, sub, nl);
|
|
return reinst->b;
|
|
case TSUB:
|
|
reinst->op = OSAVE;
|
|
reinst->sub = s = (*sub)++;
|
|
reinst = compile1(renode->left, reinst+1, sub, nl);
|
|
reinst->op = OUNSAVE;
|
|
reinst->sub = s;
|
|
return reinst + 1;
|
|
case TANY:
|
|
if(nl == 0)
|
|
reinst++->op = ONOTNL;
|
|
reinst->op = OANY;
|
|
return reinst + 1;
|
|
case TRUNE:
|
|
reinst->op = ORUNE;
|
|
reinst->r = renode->r;
|
|
return reinst + 1;
|
|
case TNOTNL:
|
|
reinst->op = ONOTNL;
|
|
return reinst + 1;
|
|
case TEOL:
|
|
reinst->op = OEOL;
|
|
return reinst + 1;
|
|
case TBOL:
|
|
reinst->op = OBOL;
|
|
return reinst + 1;
|
|
}
|
|
return nil;
|
|
}
|
|
|
|
static Reinst*
|
|
compile(Renode *parsetr, Reprog *reprog, int nl)
|
|
{
|
|
Reinst *reinst, *end;
|
|
int sub;
|
|
|
|
sub = 0;
|
|
reinst = (Reinst*)(reprog+1);
|
|
end = compile1(parsetr, reinst, &sub, nl);
|
|
assert(end <= reinst + reprog->len);
|
|
return reinst;
|
|
}
|
|
|
|
static void
|
|
getnextr(Parselex *l)
|
|
{
|
|
l->literal = 0;
|
|
if(l->done) {
|
|
l->rune = L'\0';
|
|
return;
|
|
}
|
|
l->rawexp += chartorune(&l->rune, l->rawexp);
|
|
if(*l->rawexp == 0)
|
|
l->done = 1;
|
|
if(l->rune == L'\\')
|
|
getnextrlit(l);
|
|
}
|
|
|
|
static void
|
|
getnextrlit(Parselex *l)
|
|
{
|
|
l->literal = 1;
|
|
if(l->done) {
|
|
l->literal = 0;
|
|
l->rune = L'\0';
|
|
return;
|
|
}
|
|
l->rawexp += chartorune(&l->rune, l->rawexp);
|
|
if(*l->rawexp == 0)
|
|
l->done = 1;
|
|
}
|
|
|
|
static int
|
|
lex(Parselex *l)
|
|
{
|
|
if(l->peek) {
|
|
l->peek = 0;
|
|
return l->peeklex;
|
|
}
|
|
l->getnextr(l);
|
|
if(l->literal)
|
|
return l->peeklex = LRUNE;
|
|
switch(l->rune){
|
|
case L'\0':
|
|
return l->peeklex = LEND;
|
|
case L'*':
|
|
case L'?':
|
|
case L'+':
|
|
return l->peeklex = LREP;
|
|
case L'|':
|
|
return l->peeklex = LOR;
|
|
case L'.':
|
|
return l->peeklex = LANY;
|
|
case L'(':
|
|
return l->peeklex = LLPAR;
|
|
case L')':
|
|
return l->peeklex = LRPAR;
|
|
case L'^':
|
|
return l->peeklex = LBOL;
|
|
case L'$':
|
|
return l->peeklex = LEOL;
|
|
case L'[':
|
|
getclass(l);
|
|
return l->peeklex = LCLASS;
|
|
}
|
|
return l->peeklex = LRUNE;
|
|
}
|
|
|
|
static int
|
|
pcmp(void *va, void *vb)
|
|
{
|
|
Rune *a, *b;
|
|
|
|
a = va;
|
|
b = vb;
|
|
|
|
if(a[0] < b[0])
|
|
return 1;
|
|
if(a[0] > b[0])
|
|
return -1;
|
|
if(a[1] < b[1])
|
|
return 1;
|
|
if(a[1] > b[1])
|
|
return -1;
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
getclass(Parselex *l)
|
|
{
|
|
Rune *p, *q, t;
|
|
|
|
l->nc = 0;
|
|
getnextrlit(l);
|
|
if(l->rune == L'^') {
|
|
l->nc = 1;
|
|
getnextrlit(l);
|
|
}
|
|
p = l->cpairs;
|
|
for(;;) {
|
|
*p = l->rune;
|
|
if(l->rune == L']')
|
|
break;
|
|
if(l->rune == L'-') {
|
|
regerror("Malformed class");
|
|
longjmp(l->exitenv, 1);
|
|
}
|
|
if(l->rune == '\\') {
|
|
getnextrlit(l);
|
|
*p = l->rune;
|
|
}
|
|
if(l->rune == 0) {
|
|
regerror("No closing ] for class");
|
|
longjmp(l->exitenv, 1);
|
|
}
|
|
getnextrlit(l);
|
|
if(l->rune == L'-') {
|
|
getnextrlit(l);
|
|
if(l->rune == L']') {
|
|
regerror("Malformed class");
|
|
longjmp(l->exitenv, 1);
|
|
}
|
|
if(l->rune == L'-') {
|
|
regerror("Malformed class");
|
|
longjmp(l->exitenv, 1);
|
|
}
|
|
if(l->rune == L'\\')
|
|
getnextrlit(l);
|
|
p[1] = l->rune;
|
|
if(p[0] > p[1]) {
|
|
t = p[0];
|
|
p[0] = p[1];
|
|
p[1] = t;
|
|
}
|
|
getnextrlit(l);
|
|
} else
|
|
p[1] = p[0];
|
|
if(p >= l->cpairs + nelem(l->cpairs) - 2) {
|
|
regerror("Class too big\n");
|
|
longjmp(l->exitenv, 1);
|
|
}
|
|
p += 2;
|
|
}
|
|
*p = L'\0';
|
|
qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
|
|
q = l->cpairs;
|
|
for(p = l->cpairs+2; *p != 0; p += 2) {
|
|
if(p[1] < q[0] - 1) {
|
|
q[2] = p[0];
|
|
q[3] = p[1];
|
|
q += 2;
|
|
continue;
|
|
}
|
|
q[0] = p[0];
|
|
if(p[1] > q[1])
|
|
q[1] = p[1];
|
|
}
|
|
q[2] = 0;
|
|
}
|
|
|
|
/* classes are in descending order see pcmp */
|
|
static Renode*
|
|
buildclassn(Parselex *l)
|
|
{
|
|
Renode *n;
|
|
Rune *p;
|
|
int i;
|
|
|
|
i = 0;
|
|
p = l->cpairs;
|
|
n = node(l, TCLASS, nil, nil);
|
|
n->r = p[1] + 1;
|
|
n->r1 = Runemax;
|
|
n->nclass = i++;
|
|
|
|
for(; *p != 0; p += 2) {
|
|
n = node(l, TCLASS, n, nil);
|
|
n->r = p[3] + 1;
|
|
n->r1 = p[0] - 1;
|
|
n->nclass = i++;
|
|
}
|
|
n->r = 0;
|
|
return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
|
|
}
|
|
|
|
static Renode*
|
|
buildclass(Parselex *l)
|
|
{
|
|
Renode *n;
|
|
Rune *p;
|
|
int i;
|
|
|
|
i = 0;
|
|
n = node(l, TCLASS, nil, nil);
|
|
n->r = Runemax + 1;
|
|
n->nclass = i++;
|
|
|
|
for(p = l->cpairs; *p != 0; p += 2) {
|
|
n = node(l, TCLASS, n, nil);
|
|
n->r = p[0];
|
|
n->r1 = p[1];
|
|
n->nclass = i++;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
static void
|
|
prtree(Renode *tree, int d, int f)
|
|
{
|
|
int i;
|
|
|
|
if(tree == nil)
|
|
return;
|
|
if(f)
|
|
for(i = 0; i < d; i++)
|
|
print("\t");
|
|
switch(tree->op) {
|
|
case TCAT:
|
|
prtree(tree->left, d, 0);
|
|
prtree(tree->right, d, 1);
|
|
break;
|
|
case TOR:
|
|
print("TOR\n");
|
|
prtree(tree->left, d+1, 1);
|
|
for(i = 0; i < d; i++)
|
|
print("\t");
|
|
print("|\n");
|
|
prtree(tree->right, d+1, 1);
|
|
break;
|
|
case TSTAR:
|
|
print("*\n");
|
|
prtree(tree->left, d+1, 1);
|
|
break;
|
|
case TPLUS:
|
|
print("+\n");
|
|
prtree(tree->left, d+1, 1);
|
|
break;
|
|
case TQUES:
|
|
print("?\n");
|
|
prtree(tree->left, d+1, 1);
|
|
break;
|
|
case TANY:
|
|
print(".\n");
|
|
prtree(tree->left, d+1, 1);
|
|
break;
|
|
case TBOL:
|
|
print("^\n");
|
|
break;
|
|
case TEOL:
|
|
print("$\n");
|
|
break;
|
|
case TSUB:
|
|
print("TSUB\n");
|
|
prtree(tree->left, d+1, 1);
|
|
break;
|
|
case TRUNE:
|
|
print("TRUNE: %C\n", tree->r);
|
|
break;
|
|
case TNOTNL:
|
|
print("TNOTNL: !\\n\n");
|
|
break;
|
|
case TCLASS:
|
|
print("CLASS: %C-%C\n", tree->r, tree->r1);
|
|
prtree(tree->left, d, 1);
|
|
break;
|
|
}
|
|
}
|