/***********************************************************************/ /* */ /* Objective Caml */ /* */ /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ /* */ /* Copyright 1996 Institut National de Recherche en Informatique et */ /* en Automatique. All rights reserved. This file is distributed */ /* under the terms of the GNU Library General Public License, with */ /* the special exception on linking described in file ../../LICENSE. */ /* */ /***********************************************************************/ /* $Id$ */ #include #include #include #include #include #include /* The backtracking NFA interpreter */ union backtrack_point { struct { value * pc; /* with low bit set */ unsigned char * txt; } pos; struct { unsigned char ** loc; /* with low bit clear */ unsigned char * val; } undo; }; #define Set_tag(p) ((value *) ((intnat)(p) | 1)) #define Clear_tag(p) ((value *) ((intnat)(p) & ~1)) #define Tag_is_set(p) ((intnat)(p) & 1) #define BACKTRACK_STACK_BLOCK_SIZE 500 struct backtrack_stack { struct backtrack_stack * previous; union backtrack_point point[BACKTRACK_STACK_BLOCK_SIZE]; }; #define Opcode(x) ((x) & 0xFF) #define Arg(x) ((uintnat)(x) >> 8) #define SignedArg(x) ((intnat)(x) >> 8) enum { CHAR, /* match a single character */ CHARNORM, /* match a single character, after normalization */ STRING, /* match a character string */ STRINGNORM, /* match a character string, after normalization */ CHARCLASS, /* match a character class */ BOL, /* match at beginning of line */ EOL, /* match at end of line */ WORDBOUNDARY, /* match on a word boundary */ BEGGROUP, /* record the beginning of a group */ ENDGROUP, /* record the end of a group */ REFGROUP, /* match a previously matched group */ ACCEPT, /* report success */ SIMPLEOPT, /* match a character class 0 or 1 times */ SIMPLESTAR, /* match a character class 0, 1 or several times */ SIMPLEPLUS, /* match a character class 1 or several times */ GOTO, /* unconditional branch */ PUSHBACK, /* record a backtrack point -- where to jump in case of failure */ SETMARK, /* remember current position in given register # */ CHECKPROGRESS /* backtrack if no progress was made w.r.t. reg # */ }; /* Accessors in a compiled regexp */ #define Prog(re) Field(re, 0) #define Cpool(re) Field(re, 1) #define Normtable(re) Field(re, 2) #define Numgroups(re) Int_val(Field(re, 3)) #define Numregisters(re) Int_val(Field(re, 4)) #define Startchars(re) Int_val(Field(re, 5)) /* Record positions of matched groups */ #define NUM_GROUPS 32 struct re_group { unsigned char * start; unsigned char * end; }; static struct re_group re_group[NUM_GROUPS]; /* Record positions reached during matching; used to check progress in repeated matching of a regexp. */ #define NUM_REGISTERS 64 static unsigned char * re_register[NUM_REGISTERS]; /* The initial backtracking stack */ static struct backtrack_stack initial_stack = { NULL, }; /* Free a chained list of backtracking stacks */ static void free_backtrack_stack(struct backtrack_stack * stack) { struct backtrack_stack * prevstack; while ((prevstack = stack->previous) != NULL) { stat_free(stack); stack = prevstack; } } /* Membership in a bit vector representing a set of booleans */ #define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1) /* Determine if a character is a word constituent */ /* PR#4874: word constituent = letter, digit, underscore. */ static unsigned char re_word_letters[32] = { 0x00, 0x00, 0x00, 0x00, /* 0x00-0x1F: none */ 0x00, 0x00, 0xFF, 0x03, /* 0x20-0x3F: digits 0-9 */ 0xFE, 0xFF, 0xFF, 0x87, /* 0x40-0x5F: A to Z, _ */ 0xFE, 0xFF, 0xFF, 0x07, /* 0x60-0x7F: a to z */ 0x00, 0x00, 0x00, 0x00, /* 0x80-0x9F: none */ 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xBF: none */ 0xFF, 0xFF, 0x7F, 0xFF, /* 0xC0-0xDF: Latin-1 accented uppercase */ 0xFF, 0xFF, 0x7F, 0xFF /* 0xE0-0xFF: Latin-1 accented lowercase */ }; #define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1) /* The bytecode interpreter for the NFA */ static int re_match(value re, unsigned char * starttxt, register unsigned char * txt, register unsigned char * endtxt, int accept_partial_match) { register value * pc; intnat instr; struct backtrack_stack * stack; union backtrack_point * sp; value cpool; value normtable; unsigned char c; union backtrack_point back; { int i; struct re_group * p; unsigned char ** q; for (p = &re_group[1], i = Numgroups(re); i > 1; i--, p++) p->start = p->end = NULL; for (q = &re_register[0], i = Numregisters(re); i > 0; i--, q++) *q = NULL; } pc = &Field(Prog(re), 0); stack = &initial_stack; sp = stack->point; cpool = Cpool(re); normtable = Normtable(re); re_group[0].start = txt; while (1) { instr = Long_val(*pc++); switch (Opcode(instr)) { case CHAR: if (txt == endtxt) goto prefix_match; if (*txt != Arg(instr)) goto backtrack; txt++; break; case CHARNORM: if (txt == endtxt) goto prefix_match; if (Byte_u(normtable, *txt) != Arg(instr)) goto backtrack; txt++; break; case STRING: { unsigned char * s = (unsigned char *) String_val(Field(cpool, Arg(instr))); while ((c = *s++) != 0) { if (txt == endtxt) goto prefix_match; if (c != *txt) goto backtrack; txt++; } break; } case STRINGNORM: { unsigned char * s = (unsigned char *) String_val(Field(cpool, Arg(instr))); while ((c = *s++) != 0) { if (txt == endtxt) goto prefix_match; if (c != Byte_u(normtable, *txt)) goto backtrack; txt++; } break; } case CHARCLASS: if (txt == endtxt) goto prefix_match; if (! In_bitset(String_val(Field(cpool, Arg(instr))), *txt, c)) goto backtrack; txt++; break; case BOL: if (txt > starttxt && txt[-1] != '\n') goto backtrack; break; case EOL: if (txt < endtxt && *txt != '\n') goto backtrack; break; case WORDBOUNDARY: /* At beginning and end of text: no At beginning of text: OK if current char is a letter At end of text: OK if previous char is a letter Otherwise: OK if previous char is a letter and current char not a letter or previous char is not a letter and current char is a letter */ if (txt == starttxt) { if (txt == endtxt) goto prefix_match; if (Is_word_letter(txt[0])) break; goto backtrack; } else if (txt == endtxt) { if (Is_word_letter(txt[-1])) break; goto backtrack; } else { if (Is_word_letter(txt[-1]) != Is_word_letter(txt[0])) break; goto backtrack; } case BEGGROUP: { int group_no = Arg(instr); struct re_group * group = &(re_group[group_no]); back.undo.loc = &(group->start); back.undo.val = group->start; group->start = txt; goto push; } case ENDGROUP: { int group_no = Arg(instr); struct re_group * group = &(re_group[group_no]); back.undo.loc = &(group->end); back.undo.val = group->end; group->end = txt; goto push; } case REFGROUP: { int group_no = Arg(instr); struct re_group * group = &(re_group[group_no]); unsigned char * s; if (group->start == NULL || group->end == NULL) goto backtrack; for (s = group->start; s < group->end; s++) { if (txt == endtxt) goto prefix_match; if (*s != *txt) goto backtrack; txt++; } break; } case ACCEPT: goto accept; case SIMPLEOPT: { char * set = String_val(Field(cpool, Arg(instr))); if (txt < endtxt && In_bitset(set, *txt, c)) txt++; break; } case SIMPLESTAR: { char * set = String_val(Field(cpool, Arg(instr))); while (txt < endtxt && In_bitset(set, *txt, c)) txt++; break; } case SIMPLEPLUS: { char * set = String_val(Field(cpool, Arg(instr))); if (txt == endtxt) goto prefix_match; if (! In_bitset(set, *txt, c)) goto backtrack; txt++; while (txt < endtxt && In_bitset(set, *txt, c)) txt++; break; } case GOTO: pc = pc + SignedArg(instr); break; case PUSHBACK: back.pos.pc = Set_tag(pc + SignedArg(instr)); back.pos.txt = txt; goto push; case SETMARK: { int reg_no = Arg(instr); unsigned char ** reg = &(re_register[reg_no]); back.undo.loc = reg; back.undo.val = *reg; *reg = txt; goto push; } case CHECKPROGRESS: { int reg_no = Arg(instr); if (re_register[reg_no] == txt) goto backtrack; break; } default: caml_fatal_error ("impossible case in re_match"); } /* Continue with next instruction */ continue; push: /* Push an item on the backtrack stack and continue with next instr */ if (sp == stack->point + BACKTRACK_STACK_BLOCK_SIZE) { struct backtrack_stack * newstack = stat_alloc(sizeof(struct backtrack_stack)); newstack->previous = stack; stack = newstack; sp = stack->point; } *sp = back; sp++; continue; prefix_match: /* We get here when matching failed because the end of text was encountered. */ if (accept_partial_match) goto accept; backtrack: /* We get here when matching fails. Backtrack to most recent saved program point, undoing variable assignments on the way. */ while (1) { if (sp == stack->point) { struct backtrack_stack * prevstack = stack->previous; if (prevstack == NULL) return 0; stat_free(stack); stack = prevstack; sp = stack->point + BACKTRACK_STACK_BLOCK_SIZE; } sp--; if (Tag_is_set(sp->pos.pc)) { pc = Clear_tag(sp->pos.pc); txt = sp->pos.txt; break; } else { *(sp->undo.loc) = sp->undo.val; } } continue; } accept: /* We get here when the regexp was successfully matched */ free_backtrack_stack(stack); re_group[0].end = txt; return 1; } /* Allocate an integer array containing the positions of the matched groups. Beginning of group #N is at 2N, end is at 2N+1. Take position = -1 when group wasn't matched. */ static value re_alloc_groups(value re, value str) { CAMLparam1(str); CAMLlocal1(res); unsigned char * starttxt = (unsigned char *) String_val(str); int n = Numgroups(re); int i; struct re_group * group; res = alloc(n * 2, 0); for (i = 0; i < n; i++) { group = &(re_group[i]); if (group->start == NULL || group->end == NULL) { Field(res, i * 2) = Val_int(-1); Field(res, i * 2 + 1) = Val_int(-1); } else { Field(res, i * 2) = Val_long(group->start - starttxt); Field(res, i * 2 + 1) = Val_long(group->end - starttxt); } } CAMLreturn(res); } /* String matching and searching. All functions return the empty array on failure, and an array of positions on success. */ CAMLprim value re_string_match(value re, value str, value pos) { unsigned char * starttxt = &Byte_u(str, 0); unsigned char * txt = &Byte_u(str, Long_val(pos)); unsigned char * endtxt = &Byte_u(str, string_length(str)); if (txt < starttxt || txt > endtxt) invalid_argument("Str.string_match"); if (re_match(re, starttxt, txt, endtxt, 0)) { return re_alloc_groups(re, str); } else { return Atom(0); } } CAMLprim value re_partial_match(value re, value str, value pos) { unsigned char * starttxt = &Byte_u(str, 0); unsigned char * txt = &Byte_u(str, Long_val(pos)); unsigned char * endtxt = &Byte_u(str, string_length(str)); if (txt < starttxt || txt > endtxt) invalid_argument("Str.string_partial_match"); if (re_match(re, starttxt, txt, endtxt, 1)) { return re_alloc_groups(re, str); } else { return Atom(0); } } CAMLprim value re_search_forward(value re, value str, value startpos) { unsigned char * starttxt = &Byte_u(str, 0); unsigned char * txt = &Byte_u(str, Long_val(startpos)); unsigned char * endtxt = &Byte_u(str, string_length(str)); unsigned char * startchars; if (txt < starttxt || txt > endtxt) invalid_argument("Str.search_forward"); if (Startchars(re) == -1) { do { if (re_match(re, starttxt, txt, endtxt, 0)) return re_alloc_groups(re, str); txt++; } while (txt <= endtxt); return Atom(0); } else { startchars = (unsigned char *) String_val(Field(Cpool(re), Startchars(re))); do { while (txt < endtxt && startchars[*txt] == 0) txt++; if (re_match(re, starttxt, txt, endtxt, 0)) return re_alloc_groups(re, str); txt++; } while (txt <= endtxt); return Atom(0); } } CAMLprim value re_search_backward(value re, value str, value startpos) { unsigned char * starttxt = &Byte_u(str, 0); unsigned char * txt = &Byte_u(str, Long_val(startpos)); unsigned char * endtxt = &Byte_u(str, string_length(str)); unsigned char * startchars; if (txt < starttxt || txt > endtxt) invalid_argument("Str.search_backward"); if (Startchars(re) == -1) { do { if (re_match(re, starttxt, txt, endtxt, 0)) return re_alloc_groups(re, str); txt--; } while (txt >= starttxt); return Atom(0); } else { startchars = (unsigned char *) String_val(Field(Cpool(re), Startchars(re))); do { while (txt > starttxt && startchars[*txt] == 0) txt--; if (re_match(re, starttxt, txt, endtxt, 0)) return re_alloc_groups(re, str); txt--; } while (txt >= starttxt); return Atom(0); } } /* Replacement */ CAMLprim value re_replacement_text(value repl, value groups, value orig) { CAMLparam3(repl, groups, orig); CAMLlocal1(res); mlsize_t start, end, len, n; char * p, * q; int c; len = 0; p = String_val(repl); n = string_length(repl); while (n > 0) { c = *p++; n--; if(c != '\\') len++; else { if (n == 0) failwith("Str.replace: illegal backslash sequence"); c = *p++; n--; switch (c) { case '\\': len++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c -= '0'; if (c*2 >= Wosize_val(groups)) failwith("Str.replace: reference to unmatched group"); start = Long_val(Field(groups, c*2)); end = Long_val(Field(groups, c*2 + 1)); if (start == (mlsize_t) -1) failwith("Str.replace: reference to unmatched group"); len += end - start; break; default: len += 2; break; } } } res = alloc_string(len); p = String_val(repl); q = String_val(res); n = string_length(repl); while (n > 0) { c = *p++; n--; if(c != '\\') *q++ = c; else { c = *p++; n--; switch (c) { case '\\': *q++ = '\\'; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c -= '0'; start = Long_val(Field(groups, c*2)); end = Long_val(Field(groups, c*2 + 1)); len = end - start; memmove (q, &Byte(orig, start), len); q += len; break; default: *q++ = '\\'; *q++ = c; break; } } } CAMLreturn(res); }