531 lines
16 KiB
C
531 lines
16 KiB
C
/***********************************************************************/
|
|
/* */
|
|
/* OCaml */
|
|
/* */
|
|
/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
|
|
/* */
|
|
/* Copyright 1996 Institut National de Recherche en Informatique et */
|
|
/* en Automatique. All rights reserved. This file is distributed */
|
|
/* under the terms of the GNU Library General Public License, with */
|
|
/* the special exception on linking described in file ../../LICENSE. */
|
|
/* */
|
|
/***********************************************************************/
|
|
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <caml/mlvalues.h>
|
|
#include <caml/alloc.h>
|
|
#include <caml/memory.h>
|
|
#include <caml/fail.h>
|
|
|
|
/* The backtracking NFA interpreter */
|
|
|
|
union backtrack_point {
|
|
struct {
|
|
value * pc; /* with low bit set */
|
|
unsigned char * txt;
|
|
} pos;
|
|
struct {
|
|
unsigned char ** loc; /* with low bit clear */
|
|
unsigned char * val;
|
|
} undo;
|
|
};
|
|
|
|
#define Set_tag(p) ((value *) ((intnat)(p) | 1))
|
|
#define Clear_tag(p) ((value *) ((intnat)(p) & ~1))
|
|
#define Tag_is_set(p) ((intnat)(p) & 1)
|
|
|
|
#define BACKTRACK_STACK_BLOCK_SIZE 500
|
|
|
|
struct backtrack_stack {
|
|
struct backtrack_stack * previous;
|
|
union backtrack_point point[BACKTRACK_STACK_BLOCK_SIZE];
|
|
};
|
|
|
|
#define Opcode(x) ((x) & 0xFF)
|
|
#define Arg(x) ((uintnat)(x) >> 8)
|
|
#define SignedArg(x) ((intnat)(x) >> 8)
|
|
|
|
enum {
|
|
CHAR, /* match a single character */
|
|
CHARNORM, /* match a single character, after normalization */
|
|
STRING, /* match a character string */
|
|
STRINGNORM, /* match a character string, after normalization */
|
|
CHARCLASS, /* match a character class */
|
|
BOL, /* match at beginning of line */
|
|
EOL, /* match at end of line */
|
|
WORDBOUNDARY, /* match on a word boundary */
|
|
BEGGROUP, /* record the beginning of a group */
|
|
ENDGROUP, /* record the end of a group */
|
|
REFGROUP, /* match a previously matched group */
|
|
ACCEPT, /* report success */
|
|
SIMPLEOPT, /* match a character class 0 or 1 times */
|
|
SIMPLESTAR, /* match a character class 0, 1 or several times */
|
|
SIMPLEPLUS, /* match a character class 1 or several times */
|
|
GOTO, /* unconditional branch */
|
|
PUSHBACK, /* record a backtrack point --
|
|
where to jump in case of failure */
|
|
SETMARK, /* remember current position in given register # */
|
|
CHECKPROGRESS /* backtrack if no progress was made w.r.t. reg # */
|
|
};
|
|
|
|
/* Accessors in a compiled regexp */
|
|
#define Prog(re) Field(re, 0)
|
|
#define Cpool(re) Field(re, 1)
|
|
#define Normtable(re) Field(re, 2)
|
|
#define Numgroups(re) Int_val(Field(re, 3))
|
|
#define Numregisters(re) Int_val(Field(re, 4))
|
|
#define Startchars(re) Int_val(Field(re, 5))
|
|
|
|
/* Record positions of matched groups */
|
|
#define NUM_GROUPS 32
|
|
struct re_group {
|
|
unsigned char * start;
|
|
unsigned char * end;
|
|
};
|
|
static struct re_group re_group[NUM_GROUPS];
|
|
|
|
/* Record positions reached during matching; used to check progress
|
|
in repeated matching of a regexp. */
|
|
#define NUM_REGISTERS 64
|
|
static unsigned char * re_register[NUM_REGISTERS];
|
|
|
|
/* The initial backtracking stack */
|
|
static struct backtrack_stack initial_stack = { NULL, };
|
|
|
|
/* Free a chained list of backtracking stacks */
|
|
static void free_backtrack_stack(struct backtrack_stack * stack)
|
|
{
|
|
struct backtrack_stack * prevstack;
|
|
while ((prevstack = stack->previous) != NULL) {
|
|
stat_free(stack);
|
|
stack = prevstack;
|
|
}
|
|
}
|
|
|
|
/* Membership in a bit vector representing a set of booleans */
|
|
#define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
|
|
|
|
/* Determine if a character is a word constituent */
|
|
/* PR#4874: word constituent = letter, digit, underscore. */
|
|
|
|
static unsigned char re_word_letters[32] = {
|
|
0x00, 0x00, 0x00, 0x00, /* 0x00-0x1F: none */
|
|
0x00, 0x00, 0xFF, 0x03, /* 0x20-0x3F: digits 0-9 */
|
|
0xFE, 0xFF, 0xFF, 0x87, /* 0x40-0x5F: A to Z, _ */
|
|
0xFE, 0xFF, 0xFF, 0x07, /* 0x60-0x7F: a to z */
|
|
0x00, 0x00, 0x00, 0x00, /* 0x80-0x9F: none */
|
|
0x00, 0x00, 0x00, 0x00, /* 0xA0-0xBF: none */
|
|
0xFF, 0xFF, 0x7F, 0xFF, /* 0xC0-0xDF: Latin-1 accented uppercase */
|
|
0xFF, 0xFF, 0x7F, 0xFF /* 0xE0-0xFF: Latin-1 accented lowercase */
|
|
};
|
|
|
|
#define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
|
|
|
|
/* The bytecode interpreter for the NFA */
|
|
static int re_match(value re,
|
|
unsigned char * starttxt,
|
|
register unsigned char * txt,
|
|
register unsigned char * endtxt,
|
|
int accept_partial_match)
|
|
{
|
|
register value * pc;
|
|
intnat instr;
|
|
struct backtrack_stack * stack;
|
|
union backtrack_point * sp;
|
|
value cpool;
|
|
value normtable;
|
|
unsigned char c;
|
|
union backtrack_point back;
|
|
|
|
{ int i;
|
|
struct re_group * p;
|
|
unsigned char ** q;
|
|
for (p = &re_group[1], i = Numgroups(re); i > 1; i--, p++)
|
|
p->start = p->end = NULL;
|
|
for (q = &re_register[0], i = Numregisters(re); i > 0; i--, q++)
|
|
*q = NULL;
|
|
}
|
|
|
|
pc = &Field(Prog(re), 0);
|
|
stack = &initial_stack;
|
|
sp = stack->point;
|
|
cpool = Cpool(re);
|
|
normtable = Normtable(re);
|
|
re_group[0].start = txt;
|
|
|
|
while (1) {
|
|
instr = Long_val(*pc++);
|
|
switch (Opcode(instr)) {
|
|
case CHAR:
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (*txt != Arg(instr)) goto backtrack;
|
|
txt++;
|
|
break;
|
|
case CHARNORM:
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (Byte_u(normtable, *txt) != Arg(instr)) goto backtrack;
|
|
txt++;
|
|
break;
|
|
case STRING: {
|
|
unsigned char * s =
|
|
(unsigned char *) String_val(Field(cpool, Arg(instr)));
|
|
while ((c = *s++) != 0) {
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (c != *txt) goto backtrack;
|
|
txt++;
|
|
}
|
|
break;
|
|
}
|
|
case STRINGNORM: {
|
|
unsigned char * s =
|
|
(unsigned char *) String_val(Field(cpool, Arg(instr)));
|
|
while ((c = *s++) != 0) {
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (c != Byte_u(normtable, *txt)) goto backtrack;
|
|
txt++;
|
|
}
|
|
break;
|
|
}
|
|
case CHARCLASS:
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (! In_bitset(String_val(Field(cpool, Arg(instr))), *txt, c))
|
|
goto backtrack;
|
|
txt++;
|
|
break;
|
|
case BOL:
|
|
if (txt > starttxt && txt[-1] != '\n') goto backtrack;
|
|
break;
|
|
case EOL:
|
|
if (txt < endtxt && *txt != '\n') goto backtrack;
|
|
break;
|
|
case WORDBOUNDARY:
|
|
/* At beginning and end of text: no
|
|
At beginning of text: OK if current char is a letter
|
|
At end of text: OK if previous char is a letter
|
|
Otherwise:
|
|
OK if previous char is a letter and current char not a letter
|
|
or previous char is not a letter and current char is a letter */
|
|
if (txt == starttxt) {
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (Is_word_letter(txt[0])) break;
|
|
goto backtrack;
|
|
} else if (txt == endtxt) {
|
|
if (Is_word_letter(txt[-1])) break;
|
|
goto backtrack;
|
|
} else {
|
|
if (Is_word_letter(txt[-1]) != Is_word_letter(txt[0])) break;
|
|
goto backtrack;
|
|
}
|
|
case BEGGROUP: {
|
|
int group_no = Arg(instr);
|
|
struct re_group * group = &(re_group[group_no]);
|
|
back.undo.loc = &(group->start);
|
|
back.undo.val = group->start;
|
|
group->start = txt;
|
|
goto push;
|
|
}
|
|
case ENDGROUP: {
|
|
int group_no = Arg(instr);
|
|
struct re_group * group = &(re_group[group_no]);
|
|
back.undo.loc = &(group->end);
|
|
back.undo.val = group->end;
|
|
group->end = txt;
|
|
goto push;
|
|
}
|
|
case REFGROUP: {
|
|
int group_no = Arg(instr);
|
|
struct re_group * group = &(re_group[group_no]);
|
|
unsigned char * s;
|
|
if (group->start == NULL || group->end == NULL) goto backtrack;
|
|
for (s = group->start; s < group->end; s++) {
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (*s != *txt) goto backtrack;
|
|
txt++;
|
|
}
|
|
break;
|
|
}
|
|
case ACCEPT:
|
|
goto accept;
|
|
case SIMPLEOPT: {
|
|
char * set = String_val(Field(cpool, Arg(instr)));
|
|
if (txt < endtxt && In_bitset(set, *txt, c)) txt++;
|
|
break;
|
|
}
|
|
case SIMPLESTAR: {
|
|
char * set = String_val(Field(cpool, Arg(instr)));
|
|
while (txt < endtxt && In_bitset(set, *txt, c))
|
|
txt++;
|
|
break;
|
|
}
|
|
case SIMPLEPLUS: {
|
|
char * set = String_val(Field(cpool, Arg(instr)));
|
|
if (txt == endtxt) goto prefix_match;
|
|
if (! In_bitset(set, *txt, c)) goto backtrack;
|
|
txt++;
|
|
while (txt < endtxt && In_bitset(set, *txt, c))
|
|
txt++;
|
|
break;
|
|
}
|
|
case GOTO:
|
|
pc = pc + SignedArg(instr);
|
|
break;
|
|
case PUSHBACK:
|
|
back.pos.pc = Set_tag(pc + SignedArg(instr));
|
|
back.pos.txt = txt;
|
|
goto push;
|
|
case SETMARK: {
|
|
int reg_no = Arg(instr);
|
|
unsigned char ** reg = &(re_register[reg_no]);
|
|
back.undo.loc = reg;
|
|
back.undo.val = *reg;
|
|
*reg = txt;
|
|
goto push;
|
|
}
|
|
case CHECKPROGRESS: {
|
|
int reg_no = Arg(instr);
|
|
if (re_register[reg_no] == txt)
|
|
goto backtrack;
|
|
break;
|
|
}
|
|
default:
|
|
caml_fatal_error ("impossible case in re_match");
|
|
}
|
|
/* Continue with next instruction */
|
|
continue;
|
|
|
|
push:
|
|
/* Push an item on the backtrack stack and continue with next instr */
|
|
if (sp == stack->point + BACKTRACK_STACK_BLOCK_SIZE) {
|
|
struct backtrack_stack * newstack =
|
|
caml_stat_alloc(sizeof(struct backtrack_stack));
|
|
newstack->previous = stack;
|
|
stack = newstack;
|
|
sp = stack->point;
|
|
}
|
|
*sp = back;
|
|
sp++;
|
|
continue;
|
|
|
|
prefix_match:
|
|
/* We get here when matching failed because the end of text
|
|
was encountered. */
|
|
if (accept_partial_match) goto accept;
|
|
|
|
backtrack:
|
|
/* We get here when matching fails. Backtrack to most recent saved
|
|
program point, undoing variable assignments on the way. */
|
|
while (1) {
|
|
if (sp == stack->point) {
|
|
struct backtrack_stack * prevstack = stack->previous;
|
|
if (prevstack == NULL) return 0;
|
|
stat_free(stack);
|
|
stack = prevstack;
|
|
sp = stack->point + BACKTRACK_STACK_BLOCK_SIZE;
|
|
}
|
|
sp--;
|
|
if (Tag_is_set(sp->pos.pc)) {
|
|
pc = Clear_tag(sp->pos.pc);
|
|
txt = sp->pos.txt;
|
|
break;
|
|
} else {
|
|
*(sp->undo.loc) = sp->undo.val;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
|
|
accept:
|
|
/* We get here when the regexp was successfully matched */
|
|
free_backtrack_stack(stack);
|
|
re_group[0].end = txt;
|
|
return 1;
|
|
}
|
|
|
|
/* Allocate an integer array containing the positions of the matched groups.
|
|
Beginning of group #N is at 2N, end is at 2N+1.
|
|
Take position = -1 when group wasn't matched. */
|
|
|
|
static value re_alloc_groups(value re, value str)
|
|
{
|
|
CAMLparam1(str);
|
|
CAMLlocal1(res);
|
|
unsigned char * starttxt = (unsigned char *) String_val(str);
|
|
int n = Numgroups(re);
|
|
int i;
|
|
struct re_group * group;
|
|
|
|
res = alloc(n * 2, 0);
|
|
for (i = 0; i < n; i++) {
|
|
group = &(re_group[i]);
|
|
if (group->start == NULL || group->end == NULL) {
|
|
Field(res, i * 2) = Val_int(-1);
|
|
Field(res, i * 2 + 1) = Val_int(-1);
|
|
} else {
|
|
Field(res, i * 2) = Val_long(group->start - starttxt);
|
|
Field(res, i * 2 + 1) = Val_long(group->end - starttxt);
|
|
}
|
|
}
|
|
CAMLreturn(res);
|
|
}
|
|
|
|
/* String matching and searching. All functions return the empty array
|
|
on failure, and an array of positions on success. */
|
|
|
|
CAMLprim value re_string_match(value re, value str, value pos)
|
|
{
|
|
unsigned char * starttxt = &Byte_u(str, 0);
|
|
unsigned char * txt = &Byte_u(str, Long_val(pos));
|
|
unsigned char * endtxt = &Byte_u(str, string_length(str));
|
|
|
|
if (txt < starttxt || txt > endtxt)
|
|
invalid_argument("Str.string_match");
|
|
if (re_match(re, starttxt, txt, endtxt, 0)) {
|
|
return re_alloc_groups(re, str);
|
|
} else {
|
|
return Atom(0);
|
|
}
|
|
}
|
|
|
|
CAMLprim value re_partial_match(value re, value str, value pos)
|
|
{
|
|
unsigned char * starttxt = &Byte_u(str, 0);
|
|
unsigned char * txt = &Byte_u(str, Long_val(pos));
|
|
unsigned char * endtxt = &Byte_u(str, string_length(str));
|
|
|
|
if (txt < starttxt || txt > endtxt)
|
|
invalid_argument("Str.string_partial_match");
|
|
if (re_match(re, starttxt, txt, endtxt, 1)) {
|
|
return re_alloc_groups(re, str);
|
|
} else {
|
|
return Atom(0);
|
|
}
|
|
}
|
|
|
|
CAMLprim value re_search_forward(value re, value str, value startpos)
|
|
{
|
|
unsigned char * starttxt = &Byte_u(str, 0);
|
|
unsigned char * txt = &Byte_u(str, Long_val(startpos));
|
|
unsigned char * endtxt = &Byte_u(str, string_length(str));
|
|
unsigned char * startchars;
|
|
|
|
if (txt < starttxt || txt > endtxt)
|
|
invalid_argument("Str.search_forward");
|
|
if (Startchars(re) == -1) {
|
|
do {
|
|
if (re_match(re, starttxt, txt, endtxt, 0))
|
|
return re_alloc_groups(re, str);
|
|
txt++;
|
|
} while (txt <= endtxt);
|
|
return Atom(0);
|
|
} else {
|
|
startchars =
|
|
(unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
|
|
do {
|
|
while (txt < endtxt && startchars[*txt] == 0) txt++;
|
|
if (re_match(re, starttxt, txt, endtxt, 0))
|
|
return re_alloc_groups(re, str);
|
|
txt++;
|
|
} while (txt <= endtxt);
|
|
return Atom(0);
|
|
}
|
|
}
|
|
|
|
CAMLprim value re_search_backward(value re, value str, value startpos)
|
|
{
|
|
unsigned char * starttxt = &Byte_u(str, 0);
|
|
unsigned char * txt = &Byte_u(str, Long_val(startpos));
|
|
unsigned char * endtxt = &Byte_u(str, string_length(str));
|
|
unsigned char * startchars;
|
|
|
|
if (txt < starttxt || txt > endtxt)
|
|
invalid_argument("Str.search_backward");
|
|
if (Startchars(re) == -1) {
|
|
do {
|
|
if (re_match(re, starttxt, txt, endtxt, 0))
|
|
return re_alloc_groups(re, str);
|
|
txt--;
|
|
} while (txt >= starttxt);
|
|
return Atom(0);
|
|
} else {
|
|
startchars =
|
|
(unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
|
|
do {
|
|
while (txt > starttxt && startchars[*txt] == 0) txt--;
|
|
if (re_match(re, starttxt, txt, endtxt, 0))
|
|
return re_alloc_groups(re, str);
|
|
txt--;
|
|
} while (txt >= starttxt);
|
|
return Atom(0);
|
|
}
|
|
}
|
|
|
|
/* Replacement */
|
|
|
|
CAMLprim value re_replacement_text(value repl, value groups, value orig)
|
|
{
|
|
CAMLparam3(repl, groups, orig);
|
|
CAMLlocal1(res);
|
|
mlsize_t start, end, len, n;
|
|
char * p, * q;
|
|
int c;
|
|
|
|
len = 0;
|
|
p = String_val(repl);
|
|
n = string_length(repl);
|
|
while (n > 0) {
|
|
c = *p++; n--;
|
|
if(c != '\\')
|
|
len++;
|
|
else {
|
|
if (n == 0) failwith("Str.replace: illegal backslash sequence");
|
|
c = *p++; n--;
|
|
switch (c) {
|
|
case '\\':
|
|
len++; break;
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
c -= '0';
|
|
if (c*2 >= Wosize_val(groups))
|
|
failwith("Str.replace: reference to unmatched group");
|
|
start = Long_val(Field(groups, c*2));
|
|
end = Long_val(Field(groups, c*2 + 1));
|
|
if (start == (mlsize_t) -1)
|
|
failwith("Str.replace: reference to unmatched group");
|
|
len += end - start;
|
|
break;
|
|
default:
|
|
len += 2; break;
|
|
}
|
|
}
|
|
}
|
|
res = alloc_string(len);
|
|
p = String_val(repl);
|
|
q = String_val(res);
|
|
n = string_length(repl);
|
|
while (n > 0) {
|
|
c = *p++; n--;
|
|
if(c != '\\')
|
|
*q++ = c;
|
|
else {
|
|
c = *p++; n--;
|
|
switch (c) {
|
|
case '\\':
|
|
*q++ = '\\'; break;
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
c -= '0';
|
|
start = Long_val(Field(groups, c*2));
|
|
end = Long_val(Field(groups, c*2 + 1));
|
|
len = end - start;
|
|
memmove (q, &Byte(orig, start), len);
|
|
q += len;
|
|
break;
|
|
default:
|
|
*q++ = '\\'; *q++ = c; break;
|
|
}
|
|
}
|
|
}
|
|
CAMLreturn(res);
|
|
}
|