ocaml/lex/lexgen.ml

242 lines
6.8 KiB
OCaml

(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 1996 Institut National de Recherche en Informatique et *)
(* Automatique. Distributed only by permission. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* Compiling a lexer definition *)
open Syntax
(* Deep abstract syntax for regular expressions *)
type regexp =
Empty
| Chars of int
| Action of int
| Seq of regexp * regexp
| Alt of regexp * regexp
| Star of regexp
type lexer_entry =
{ lex_name: string;
lex_regexp: regexp;
lex_actions: (int * location) list }
(* Representation of automata *)
type automata =
Perform of int
| Shift of automata_trans * automata_move array
and automata_trans =
No_remember
| Remember of int
and automata_move =
Backtrack
| Goto of int
(* Representation of entry points *)
type automata_entry =
{ auto_name: string;
auto_initial_state: int;
auto_actions: (int * location) list }
(* From shallow to deep syntax *)
let chars = ref ([] : char list list)
let chars_count = ref 0
let actions = ref ([] : (int * location) list)
let actions_count = ref 0
let rec encode_regexp = function
Epsilon -> Empty
| Characters cl ->
let n = !chars_count in
chars := cl :: !chars;
incr chars_count;
Chars(n)
| Sequence(r1,r2) ->
Seq(encode_regexp r1, encode_regexp r2)
| Alternative(r1,r2) ->
Alt(encode_regexp r1, encode_regexp r2)
| Repetition r ->
Star (encode_regexp r)
let encode_casedef casedef =
List.fold_left
(fun reg (expr, act) ->
let act_num = !actions_count in
incr actions_count;
actions := (act_num, act) :: !actions;
Alt(reg, Seq(encode_regexp expr, Action act_num)))
Empty
casedef
let encode_lexdef def =
chars := [];
chars_count := 0;
let entry_list =
List.map
(fun (entry_name, casedef) ->
actions := [];
actions_count := 0;
let re = encode_casedef casedef in
{ lex_name = entry_name;
lex_regexp = re;
lex_actions = List.rev !actions })
def.entrypoints in
let chr = Array.of_list (List.rev !chars) in
chars := [];
actions := [];
(chr, entry_list)
(* To generate directly a NFA from a regular expression.
Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)
type transition =
OnChars of int
| ToAction of int
module TransSet =
Set.Make(struct type t = transition let compare = compare end)
let rec nullable = function
Empty -> true
| Chars _ -> false
| Action _ -> false
| Seq(r1,r2) -> nullable r1 & nullable r2
| Alt(r1,r2) -> nullable r1 or nullable r2
| Star r -> true
let rec firstpos = function
Empty -> TransSet.empty
| Chars pos -> TransSet.add (OnChars pos) TransSet.empty
| Action act -> TransSet.add (ToAction act) TransSet.empty
| Seq(r1,r2) -> if nullable r1
then TransSet.union (firstpos r1) (firstpos r2)
else firstpos r1
| Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
| Star r -> firstpos r
let rec lastpos = function
Empty -> TransSet.empty
| Chars pos -> TransSet.add (OnChars pos) TransSet.empty
| Action act -> TransSet.add (ToAction act) TransSet.empty
| Seq(r1,r2) -> if nullable r2
then TransSet.union (lastpos r1) (lastpos r2)
else lastpos r2
| Alt(r1,r2) -> TransSet.union (lastpos r1) (lastpos r2)
| Star r -> lastpos r
let followpos size entry_list =
let v = Array.create size TransSet.empty in
let fill_pos first = function
OnChars pos -> v.(pos) <- TransSet.union first v.(pos)
| ToAction _ -> () in
let rec fill = function
Seq(r1,r2) ->
fill r1; fill r2;
TransSet.iter (fill_pos (firstpos r2)) (lastpos r1)
| Alt(r1,r2) ->
fill r1; fill r2
| Star r ->
fill r;
TransSet.iter (fill_pos (firstpos r)) (lastpos r)
| _ -> () in
List.iter (fun entry -> fill entry.lex_regexp) entry_list;
v
let no_action = max_int
let split_trans_set trans_set =
TransSet.fold
(fun trans (act, pos_set as act_pos_set) ->
match trans with
OnChars pos -> (act, pos :: pos_set)
| ToAction act1 -> if act1 < act then (act1, pos_set) else act_pos_set)
trans_set
(no_action, [])
module StateMap =
Map.Make(struct type t = TransSet.t let compare = TransSet.compare end)
let state_map = ref (StateMap.empty: int StateMap.t)
let todo = (Stack.create() : (TransSet.t * int) Stack.t)
let next_state_num = ref 0
let reset_state_mem () =
state_map := StateMap.empty;
Stack.clear todo;
next_state_num := 0
let get_state st =
try
StateMap.find st !state_map
with Not_found ->
let num = !next_state_num in
incr next_state_num;
state_map := StateMap.add st num !state_map;
Stack.push (st, num) todo;
num
let map_on_all_states f =
let res = ref [] in
begin try
while true do
let (st, i) = Stack.pop todo in
let r = f st in
res := (r, i) :: !res
done
with Stack.Empty -> ()
end;
!res
let goto_state st =
if TransSet.is_empty st then Backtrack else Goto (get_state st)
let transition_from chars follow pos_set =
let tr = Array.create 256 TransSet.empty in
let shift = Array.create 256 Backtrack in
List.iter
(fun pos ->
List.iter
(fun c ->
tr.(Char.code c) <- TransSet.union tr.(Char.code c) follow.(pos))
chars.(pos))
pos_set;
for i = 0 to 255 do
shift.(i) <- goto_state tr.(i)
done;
shift
let translate_state chars follow state =
match split_trans_set state with
(n, []) -> Perform n
| (n, ps) -> Shift((if n = no_action then No_remember else Remember n),
transition_from chars follow ps)
let make_dfa lexdef =
let (chars, entry_list) = encode_lexdef lexdef in
let follow = followpos (Array.length chars) entry_list in
reset_state_mem();
let initial_states =
List.map
(fun le ->
{ auto_name = le.lex_name;
auto_initial_state = get_state(firstpos le.lex_regexp);
auto_actions = le.lex_actions })
entry_list in
let states = map_on_all_states (translate_state chars follow) in
let actions = Array.create !next_state_num (Perform 0) in
List.iter (fun (act, i) -> actions.(i) <- act) states;
reset_state_mem();
(initial_states, actions)