242 lines
6.8 KiB
OCaml
242 lines
6.8 KiB
OCaml
(***********************************************************************)
|
|
(* *)
|
|
(* Objective Caml *)
|
|
(* *)
|
|
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
|
|
(* *)
|
|
(* Copyright 1996 Institut National de Recherche en Informatique et *)
|
|
(* Automatique. Distributed only by permission. *)
|
|
(* *)
|
|
(***********************************************************************)
|
|
|
|
(* $Id$ *)
|
|
|
|
(* Compiling a lexer definition *)
|
|
|
|
open Syntax
|
|
|
|
(* Deep abstract syntax for regular expressions *)
|
|
|
|
type regexp =
|
|
Empty
|
|
| Chars of int
|
|
| Action of int
|
|
| Seq of regexp * regexp
|
|
| Alt of regexp * regexp
|
|
| Star of regexp
|
|
|
|
type lexer_entry =
|
|
{ lex_name: string;
|
|
lex_regexp: regexp;
|
|
lex_actions: (int * location) list }
|
|
|
|
(* Representation of automata *)
|
|
|
|
type automata =
|
|
Perform of int
|
|
| Shift of automata_trans * automata_move array
|
|
and automata_trans =
|
|
No_remember
|
|
| Remember of int
|
|
and automata_move =
|
|
Backtrack
|
|
| Goto of int
|
|
|
|
(* Representation of entry points *)
|
|
|
|
type automata_entry =
|
|
{ auto_name: string;
|
|
auto_initial_state: int;
|
|
auto_actions: (int * location) list }
|
|
|
|
(* From shallow to deep syntax *)
|
|
|
|
let chars = ref ([] : char list list)
|
|
let chars_count = ref 0
|
|
let actions = ref ([] : (int * location) list)
|
|
let actions_count = ref 0
|
|
|
|
let rec encode_regexp = function
|
|
Epsilon -> Empty
|
|
| Characters cl ->
|
|
let n = !chars_count in
|
|
chars := cl :: !chars;
|
|
incr chars_count;
|
|
Chars(n)
|
|
| Sequence(r1,r2) ->
|
|
Seq(encode_regexp r1, encode_regexp r2)
|
|
| Alternative(r1,r2) ->
|
|
Alt(encode_regexp r1, encode_regexp r2)
|
|
| Repetition r ->
|
|
Star (encode_regexp r)
|
|
|
|
let encode_casedef casedef =
|
|
List.fold_left
|
|
(fun reg (expr, act) ->
|
|
let act_num = !actions_count in
|
|
incr actions_count;
|
|
actions := (act_num, act) :: !actions;
|
|
Alt(reg, Seq(encode_regexp expr, Action act_num)))
|
|
Empty
|
|
casedef
|
|
|
|
let encode_lexdef def =
|
|
chars := [];
|
|
chars_count := 0;
|
|
let entry_list =
|
|
List.map
|
|
(fun (entry_name, casedef) ->
|
|
actions := [];
|
|
actions_count := 0;
|
|
let re = encode_casedef casedef in
|
|
{ lex_name = entry_name;
|
|
lex_regexp = re;
|
|
lex_actions = List.rev !actions })
|
|
def.entrypoints in
|
|
let chr = Array.of_list (List.rev !chars) in
|
|
chars := [];
|
|
actions := [];
|
|
(chr, entry_list)
|
|
|
|
(* To generate directly a NFA from a regular expression.
|
|
Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)
|
|
|
|
type transition =
|
|
OnChars of int
|
|
| ToAction of int
|
|
|
|
module TransSet =
|
|
Set.Make(struct type t = transition let compare = compare end)
|
|
|
|
let rec nullable = function
|
|
Empty -> true
|
|
| Chars _ -> false
|
|
| Action _ -> false
|
|
| Seq(r1,r2) -> nullable r1 & nullable r2
|
|
| Alt(r1,r2) -> nullable r1 or nullable r2
|
|
| Star r -> true
|
|
|
|
let rec firstpos = function
|
|
Empty -> TransSet.empty
|
|
| Chars pos -> TransSet.add (OnChars pos) TransSet.empty
|
|
| Action act -> TransSet.add (ToAction act) TransSet.empty
|
|
| Seq(r1,r2) -> if nullable r1
|
|
then TransSet.union (firstpos r1) (firstpos r2)
|
|
else firstpos r1
|
|
| Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
|
|
| Star r -> firstpos r
|
|
|
|
let rec lastpos = function
|
|
Empty -> TransSet.empty
|
|
| Chars pos -> TransSet.add (OnChars pos) TransSet.empty
|
|
| Action act -> TransSet.add (ToAction act) TransSet.empty
|
|
| Seq(r1,r2) -> if nullable r2
|
|
then TransSet.union (lastpos r1) (lastpos r2)
|
|
else lastpos r2
|
|
| Alt(r1,r2) -> TransSet.union (lastpos r1) (lastpos r2)
|
|
| Star r -> lastpos r
|
|
|
|
let followpos size entry_list =
|
|
let v = Array.create size TransSet.empty in
|
|
let fill_pos first = function
|
|
OnChars pos -> v.(pos) <- TransSet.union first v.(pos)
|
|
| ToAction _ -> () in
|
|
let rec fill = function
|
|
Seq(r1,r2) ->
|
|
fill r1; fill r2;
|
|
TransSet.iter (fill_pos (firstpos r2)) (lastpos r1)
|
|
| Alt(r1,r2) ->
|
|
fill r1; fill r2
|
|
| Star r ->
|
|
fill r;
|
|
TransSet.iter (fill_pos (firstpos r)) (lastpos r)
|
|
| _ -> () in
|
|
List.iter (fun entry -> fill entry.lex_regexp) entry_list;
|
|
v
|
|
|
|
let no_action = max_int
|
|
|
|
let split_trans_set trans_set =
|
|
TransSet.fold
|
|
(fun trans (act, pos_set as act_pos_set) ->
|
|
match trans with
|
|
OnChars pos -> (act, pos :: pos_set)
|
|
| ToAction act1 -> if act1 < act then (act1, pos_set) else act_pos_set)
|
|
trans_set
|
|
(no_action, [])
|
|
|
|
module StateMap =
|
|
Map.Make(struct type t = TransSet.t let compare = TransSet.compare end)
|
|
|
|
let state_map = ref (StateMap.empty: int StateMap.t)
|
|
let todo = (Stack.create() : (TransSet.t * int) Stack.t)
|
|
let next_state_num = ref 0
|
|
|
|
let reset_state_mem () =
|
|
state_map := StateMap.empty;
|
|
Stack.clear todo;
|
|
next_state_num := 0
|
|
|
|
let get_state st =
|
|
try
|
|
StateMap.find st !state_map
|
|
with Not_found ->
|
|
let num = !next_state_num in
|
|
incr next_state_num;
|
|
state_map := StateMap.add st num !state_map;
|
|
Stack.push (st, num) todo;
|
|
num
|
|
|
|
let map_on_all_states f =
|
|
let res = ref [] in
|
|
begin try
|
|
while true do
|
|
let (st, i) = Stack.pop todo in
|
|
let r = f st in
|
|
res := (r, i) :: !res
|
|
done
|
|
with Stack.Empty -> ()
|
|
end;
|
|
!res
|
|
|
|
let goto_state st =
|
|
if TransSet.is_empty st then Backtrack else Goto (get_state st)
|
|
|
|
let transition_from chars follow pos_set =
|
|
let tr = Array.create 256 TransSet.empty in
|
|
let shift = Array.create 256 Backtrack in
|
|
List.iter
|
|
(fun pos ->
|
|
List.iter
|
|
(fun c ->
|
|
tr.(Char.code c) <- TransSet.union tr.(Char.code c) follow.(pos))
|
|
chars.(pos))
|
|
pos_set;
|
|
for i = 0 to 255 do
|
|
shift.(i) <- goto_state tr.(i)
|
|
done;
|
|
shift
|
|
|
|
let translate_state chars follow state =
|
|
match split_trans_set state with
|
|
(n, []) -> Perform n
|
|
| (n, ps) -> Shift((if n = no_action then No_remember else Remember n),
|
|
transition_from chars follow ps)
|
|
|
|
let make_dfa lexdef =
|
|
let (chars, entry_list) = encode_lexdef lexdef in
|
|
let follow = followpos (Array.length chars) entry_list in
|
|
reset_state_mem();
|
|
let initial_states =
|
|
List.map
|
|
(fun le ->
|
|
{ auto_name = le.lex_name;
|
|
auto_initial_state = get_state(firstpos le.lex_regexp);
|
|
auto_actions = le.lex_actions })
|
|
entry_list in
|
|
let states = map_on_all_states (translate_state chars follow) in
|
|
let actions = Array.create !next_state_num (Perform 0) in
|
|
List.iter (fun (act, i) -> actions.(i) <- act) states;
|
|
reset_state_mem();
|
|
(initial_states, actions)
|