(* Compiling a lexer definition *) open Syntax (* Deep abstract syntax for regular expressions *) type regexp = Empty | Chars of int | Action of int | Seq of regexp * regexp | Alt of regexp * regexp | Star of regexp (* From shallow to deep syntax *) (*** let print_char_class c = let print_interval low high = prerr_int low; if high - 1 > low then begin prerr_char '-'; prerr_int (high-1) end; prerr_char ' ' in let rec print_class first next = function [] -> print_interval first next | c::l -> if char.code c = next then print_class first (next+1) l else begin print_interval first next; print_class (char.code c) (char.code c + 1) l end in match c with [] -> prerr_newline() | c::l -> print_class (char.code c) (char.code c + 1) l; prerr_newline() let rec print_regexp = function Empty -> prerr_string "Empty" | Chars n -> prerr_string "Chars "; prerr_int n | Action n -> prerr_string "Action "; prerr_int n | Seq(r1,r2) -> print_regexp r1; prerr_string "; "; print_regexp r2 | Alt(r1,r2) -> prerr_string "("; print_regexp r1; prerr_string " | "; print_regexp r2; prerr_string ")" | Star r -> prerr_string "("; print_regexp r; prerr_string ")*" ***) let chars = ref ([] : char list list) let chars_count = ref 0 let actions = ref ([] : (int * location) list) let actions_count = ref 0 let rec encode_regexp = function Epsilon -> Empty | Characters cl -> let n = !chars_count in (*** prerr_int n; prerr_char ' '; print_char_class cl; ***) chars := cl :: !chars; chars_count := !chars_count + 1; Chars(n) | Sequence(r1,r2) -> Seq(encode_regexp r1, encode_regexp r2) | Alternative(r1,r2) -> Alt(encode_regexp r1, encode_regexp r2) | Repetition r -> Star (encode_regexp r) let encode_casedef = List.fold_left (fun reg (expr,act) -> let act_num = !actions_count in actions_count := !actions_count + 1; actions := (act_num, act) :: !actions; Alt(reg, Seq(encode_regexp expr, Action act_num))) Empty let encode_lexdef (Lexdef(_, ld)) = chars := []; chars_count := 0; actions := []; actions_count := 0; let name_regexp_list = List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in (* List.iter print_char_class chars; *) let chr = Array.of_list (List.rev !chars) and act = !actions in chars := []; actions := []; (chr, name_regexp_list, act) (* To generate directly a NFA from a regular expression. Confer Aho-Sethi-Ullman, dragon book, chap. 3 *) type transition = OnChars of int | ToAction of int let rec merge_trans l1 l2 = match (l1, l2) with ([], s2) -> s2 | (s1, []) -> s1 | ((OnChars n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) -> if n1 = n2 then t1 :: merge_trans r1 r2 else if n1 < n2 then t1 :: merge_trans r1 s2 else t2 :: merge_trans s1 r2 | ((ToAction n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) -> if n1 = n2 then t1 :: merge_trans r1 r2 else if n1 < n2 then t1 :: merge_trans r1 s2 else t2 :: merge_trans s1 r2 | ((OnChars n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) -> t1 :: merge_trans r1 s2 | ((ToAction n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) -> t2 :: merge_trans s1 r2 let rec nullable = function Empty -> true | Chars _ -> false | Action _ -> false | Seq(r1,r2) -> nullable r1 & nullable r2 | Alt(r1,r2) -> nullable r1 or nullable r2 | Star r -> true let rec firstpos = function Empty -> [] | Chars pos -> [OnChars pos] | Action act -> [ToAction act] | Seq(r1,r2) -> if nullable r1 then merge_trans (firstpos r1) (firstpos r2) else firstpos r1 | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2) | Star r -> firstpos r let rec lastpos = function Empty -> [] | Chars pos -> [OnChars pos] | Action act -> [ToAction act] | Seq(r1,r2) -> if nullable r2 then merge_trans (lastpos r1) (lastpos r2) else lastpos r2 | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2) | Star r -> lastpos r let followpos size name_regexp_list = let v = Array.new size [] in let fill_pos first = function OnChars pos -> v.(pos) <- merge_trans first v.(pos); () | ToAction _ -> () in let rec fill = function Seq(r1,r2) -> fill r1; fill r2; List.iter (fill_pos (firstpos r2)) (lastpos r1) | Alt(r1,r2) -> fill r1; fill r2 | Star r -> fill r; List.iter (fill_pos (firstpos r)) (lastpos r) | _ -> () in List.iter (fun (name, regexp) -> fill regexp) name_regexp_list; v let no_action = 0x3FFFFFFF let split_trans_set = List.fold_left (fun (act, pos_set as act_pos_set) trans -> match trans with OnChars pos -> (act, pos :: pos_set) | ToAction act1 -> if act1 < act then (act1, pos_set) else act_pos_set) (no_action, []) let memory = (Hashtbl.new 131 : (transition list, int) Hashtbl.t) let todo = ref ([] : (transition list * int) list) let next = ref 0 let get_state st = try Hashtbl.find memory st with Not_found -> let nbr = !next in next := !next + 1; Hashtbl.add memory st nbr; todo := (st, nbr) :: !todo; nbr let rec map_on_states f = match !todo with [] -> [] | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f let number_of_states () = !next let goto_state = function [] -> Backtrack | ps -> Goto (get_state ps) let transition_from chars follow pos_set = let tr = Array.new 256 [] and shift = Array.new 256 Backtrack in List.iter (fun pos -> List.iter (fun c -> tr.(Char.code c) <- merge_trans tr.(Char.code c) follow.(pos)) chars.(pos)) pos_set; for i = 0 to 255 do shift.(i) <- goto_state tr.(i) done; shift let translate_state chars follow state = match split_trans_set state with n, [] -> Perform n | n, ps -> Shift( (if n = no_action then No_remember else Remember n), transition_from chars follow ps) let make_dfa lexdef = let (chars, name_regexp_list, actions) = encode_lexdef lexdef in (** List.iter (fun (name, regexp) -> prerr_string name; prerr_string " = "; print_regexp regexp; prerr_newline()) name_regexp_list; **) let follow = followpos (Array.length chars) name_regexp_list in let initial_states = List.map (fun (name, regexp) -> (name, get_state(firstpos regexp))) name_regexp_list in let states = map_on_states (translate_state chars follow) in let v = Array.new (number_of_states()) (Perform 0) in List.iter (fun (auto, i) -> v.(i) <- auto) states; (initial_states, v, actions)