ocaml/byterun/lexing.c

231 lines
7.2 KiB
C

/***********************************************************************/
/* */
/* OCaml */
/* */
/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
/* */
/* Copyright 1996 Institut National de Recherche en Informatique et */
/* en Automatique. All rights reserved. This file is distributed */
/* under the terms of the GNU Library General Public License, with */
/* the special exception on linking described in file ../LICENSE. */
/* */
/***********************************************************************/
/* $Id$ */
/* The table-driven automaton for lexers generated by camllex. */
#include "fail.h"
#include "mlvalues.h"
#include "stacks.h"
struct lexer_buffer {
value refill_buff;
value lex_buffer;
value lex_buffer_len;
value lex_abs_pos;
value lex_start_pos;
value lex_curr_pos;
value lex_last_pos;
value lex_last_action;
value lex_eof_reached;
value lex_mem;
value lex_start_p;
value lex_curr_p;
};
struct lexing_table {
value lex_base;
value lex_backtrk;
value lex_default;
value lex_trans;
value lex_check;
value lex_base_code;
value lex_backtrk_code;
value lex_default_code;
value lex_trans_code;
value lex_check_code;
value lex_code;
};
#if defined(ARCH_BIG_ENDIAN) || SIZEOF_SHORT != 2
#define Short(tbl,n) \
(*((unsigned char *)((tbl) + (n) * 2)) + \
(*((schar *)((tbl) + (n) * 2 + 1)) << 8))
#else
#define Short(tbl,n) (((short *)(tbl))[(n)])
#endif
CAMLprim value caml_lex_engine(struct lexing_table *tbl, value start_state,
struct lexer_buffer *lexbuf)
{
int state, base, backtrk, c;
state = Int_val(start_state);
if (state >= 0) {
/* First entry */
lexbuf->lex_last_pos = lexbuf->lex_start_pos = lexbuf->lex_curr_pos;
lexbuf->lex_last_action = Val_int(-1);
} else {
/* Reentry after refill */
state = -state - 1;
}
while(1) {
/* Lookup base address or action number for current state */
base = Short(tbl->lex_base, state);
if (base < 0) return Val_int(-base-1);
/* See if it's a backtrack point */
backtrk = Short(tbl->lex_backtrk, state);
if (backtrk >= 0) {
lexbuf->lex_last_pos = lexbuf->lex_curr_pos;
lexbuf->lex_last_action = Val_int(backtrk);
}
/* See if we need a refill */
if (lexbuf->lex_curr_pos >= lexbuf->lex_buffer_len){
if (lexbuf->lex_eof_reached == Val_bool (0)){
return Val_int(-state - 1);
}else{
c = 256;
}
}else{
/* Read next input char */
c = Byte_u(lexbuf->lex_buffer, Long_val(lexbuf->lex_curr_pos));
lexbuf->lex_curr_pos += 2;
}
/* Determine next state */
if (Short(tbl->lex_check, base + c) == state)
state = Short(tbl->lex_trans, base + c);
else
state = Short(tbl->lex_default, state);
/* If no transition on this char, return to last backtrack point */
if (state < 0) {
lexbuf->lex_curr_pos = lexbuf->lex_last_pos;
if (lexbuf->lex_last_action == Val_int(-1)) {
caml_failwith("lexing: empty token");
} else {
return lexbuf->lex_last_action;
}
}else{
/* Erase the EOF condition only if the EOF pseudo-character was
consumed by the automaton (i.e. there was no backtrack above)
*/
if (c == 256) lexbuf->lex_eof_reached = Val_bool (0);
}
}
}
/***********************************************/
/* New lexer engine, with memory of positions */
/***********************************************/
static void run_mem(char *pc, value mem, value curr_pos) {
for (;;) {
unsigned char dst, src ;
dst = *pc++ ;
if (dst == 0xff)
return ;
src = *pc++ ;
if (src == 0xff) {
/* fprintf(stderr,"[%hhu] <- %d\n",dst,Int_val(curr_pos)) ;*/
Field(mem,dst) = curr_pos ;
} else {
/* fprintf(stderr,"[%hhu] <- [%hhu]\n",dst,src) ; */
Field(mem,dst) = Field(mem,src) ;
}
}
}
static void run_tag(char *pc, value mem) {
for (;;) {
unsigned char dst, src ;
dst = *pc++ ;
if (dst == 0xff)
return ;
src = *pc++ ;
if (src == 0xff) {
/* fprintf(stderr,"[%hhu] <- -1\n",dst) ; */
Field(mem,dst) = Val_int(-1) ;
} else {
/* fprintf(stderr,"[%hhu] <- [%hhu]\n",dst,src) ; */
Field(mem,dst) = Field(mem,src) ;
}
}
}
CAMLprim value caml_new_lex_engine(struct lexing_table *tbl, value start_state,
struct lexer_buffer *lexbuf)
{
int state, base, backtrk, c, pstate ;
state = Int_val(start_state);
if (state >= 0) {
/* First entry */
lexbuf->lex_last_pos = lexbuf->lex_start_pos = lexbuf->lex_curr_pos;
lexbuf->lex_last_action = Val_int(-1);
} else {
/* Reentry after refill */
state = -state - 1;
}
while(1) {
/* Lookup base address or action number for current state */
base = Short(tbl->lex_base, state);
if (base < 0) {
int pc_off = Short(tbl->lex_base_code, state) ;
run_tag(Bp_val(tbl->lex_code) + pc_off, lexbuf->lex_mem);
/* fprintf(stderr,"Perform: %d\n",-base-1) ; */
return Val_int(-base-1);
}
/* See if it's a backtrack point */
backtrk = Short(tbl->lex_backtrk, state);
if (backtrk >= 0) {
int pc_off = Short(tbl->lex_backtrk_code, state);
run_tag(Bp_val(tbl->lex_code) + pc_off, lexbuf->lex_mem);
lexbuf->lex_last_pos = lexbuf->lex_curr_pos;
lexbuf->lex_last_action = Val_int(backtrk);
}
/* See if we need a refill */
if (lexbuf->lex_curr_pos >= lexbuf->lex_buffer_len){
if (lexbuf->lex_eof_reached == Val_bool (0)){
return Val_int(-state - 1);
}else{
c = 256;
}
}else{
/* Read next input char */
c = Byte_u(lexbuf->lex_buffer, Long_val(lexbuf->lex_curr_pos));
lexbuf->lex_curr_pos += 2;
}
/* Determine next state */
pstate=state ;
if (Short(tbl->lex_check, base + c) == state)
state = Short(tbl->lex_trans, base + c);
else
state = Short(tbl->lex_default, state);
/* If no transition on this char, return to last backtrack point */
if (state < 0) {
lexbuf->lex_curr_pos = lexbuf->lex_last_pos;
if (lexbuf->lex_last_action == Val_int(-1)) {
caml_failwith("lexing: empty token");
} else {
return lexbuf->lex_last_action;
}
}else{
/* If some transition, get and perform memory moves */
int base_code = Short(tbl->lex_base_code, pstate) ;
int pc_off ;
if (Short(tbl->lex_check_code, base_code + c) == pstate)
pc_off = Short(tbl->lex_trans_code, base_code + c) ;
else
pc_off = Short(tbl->lex_default_code, pstate) ;
if (pc_off > 0)
run_mem(Bp_val(tbl->lex_code) + pc_off, lexbuf->lex_mem, lexbuf->lex_curr_pos) ;
/* Erase the EOF condition only if the EOF pseudo-character was
consumed by the automaton (i.e. there was no backtrack above)
*/
if (c == 256) lexbuf->lex_eof_reached = Val_bool (0);
}
}
}