ocaml/camlp4/etc/pa_ocamllex.ml

261 lines
7.3 KiB
OCaml

(* camlp4 ./pa_o.cmo q_MLast.cmo pa_extend.cmo pr_dump.cmo *)
(* $Id$ *)
(* Alain Frisch's contribution *)
open Syntax
open Lexgen
open Compact
(* Adapted from output.ml *)
(**************************)
(* Output the DFA tables and its entry points *)
(* To output an array of short ints, encoded as a string *)
let output_byte buf b =
Buffer.add_char buf '\\';
Buffer.add_char buf (Char.chr(48 + b / 100));
Buffer.add_char buf (Char.chr(48 + (b / 10) mod 10));
Buffer.add_char buf (Char.chr(48 + b mod 10))
let loc = (-1,-1)
let output_array v =
let b = Buffer.create (Array.length v * 3) in
for i = 0 to Array.length v - 1 do
output_byte b (v.(i) land 0xFF);
output_byte b ((v.(i) asr 8) land 0xFF);
if i land 7 = 7 then Buffer.add_string b "\\\n "
done;
let s = Buffer.contents b in
<:expr< $str:s$ >>
(* Output the tables *)
let output_tables tbl =
<:str_item< value lex_tables = {
Lexing.lex_base = $output_array tbl.tbl_base$;
Lexing.lex_backtrk = $output_array tbl.tbl_backtrk$;
Lexing.lex_default = $output_array tbl.tbl_default$;
Lexing.lex_trans = $output_array tbl.tbl_trans$;
Lexing.lex_check = $output_array tbl.tbl_check$
} >>
(* Output the entries *)
let rec make_alias n = function
| [] -> []
| h::t ->
(h, "__ocaml_lex_arg_" ^ (string_of_int n)) :: (make_alias (succ n) t)
let abstraction =
List.fold_right (fun (p,a) e -> <:expr< fun ($p$ as $lid:a$) -> $e$ >>)
let application =
List.fold_left (fun f (_,a) -> <:expr< $f$ $lid:a$ >>)
let output_entry e =
let args = make_alias 0 (<:patt< lexbuf >> :: e.auto_args) in
let f = "__ocaml_lex_rec_" ^ e.auto_name ^ "_rec" in
let call_f = application <:expr< $lid:f$ >> args in
let inistate = <:expr< $int:string_of_int e.auto_initial_state$ >> in
let cases =
List.map
(fun (num, (loc,e)) ->
<:patt< $int:string_of_int num$ >>,
None, (* when ... *)
e
) e.auto_actions @
[ <:patt< __ocaml_lex_n >>,
None,
<:expr< do
{ lexbuf.Lexing.refill_buff lexbuf; $call_f$ __ocaml_lex_n }>> ]
in
[
<:patt< $lid:e.auto_name$ >>,
(abstraction args <:expr< $call_f$ $inistate$ >>);
<:patt< $lid:f$ >>,
(abstraction args <:expr<
fun state ->
match Lexing.engine lex_tables state lexbuf with
[ $list:cases$ ] >>)
]
(* Main output function *)
exception Table_overflow
let output_lexdef tables entry_points =
Printf.eprintf
"pa_ocamllex: found lexer; %d states, %d transitions, table size %d bytes\n"
(Array.length tables.tbl_base)
(Array.length tables.tbl_trans)
(2 * (Array.length tables.tbl_base + Array.length tables.tbl_backtrk +
Array.length tables.tbl_default + Array.length tables.tbl_trans +
Array.length tables.tbl_check));
flush stderr;
if Array.length tables.tbl_trans > 0x8000 then raise Table_overflow;
let entries = List.map output_entry entry_points in
[output_tables tables; <:str_item< value rec $list:List.flatten entries$ >> ]
(* Adapted from parser.mly and main.ml *)
(***************************************)
(* Auxiliaries for the parser. *)
let char s = Char.code (Token.eval_char s)
let named_regexps =
(Hashtbl.create 13 : (string, regular_expression) Hashtbl.t)
let regexp_for_string s =
let rec re_string n =
if n >= String.length s then Epsilon
else if succ n = String.length s then Characters([Char.code (s.[n])])
else Sequence(Characters([Char.code (s.[n])]), re_string (succ n))
in re_string 0
let char_class c1 c2 =
let rec cl n =
if n > c2 then [] else n :: cl(succ n)
in cl c1
let all_chars = char_class 0 255
let rec subtract l1 l2 =
match l1 with
[] -> []
| a::r -> if List.mem a l2 then subtract r l2 else a :: subtract r l2
let () =
Hashtbl.add named_regexps "eof" (Characters [256])
(* The parser *)
let let_regexp = Grammar.Entry.create Pcaml.gram "pa_ocamllex let"
let header = Grammar.Entry.create Pcaml.gram "pa_ocamllex header"
let lexer_def = Grammar.Entry.create Pcaml.gram "pa_ocaml lexerdef"
EXTEND
GLOBAL: Pcaml.str_item let_regexp header lexer_def;
let_regexp: [
[ x = LIDENT; "="; r = regexp ->
if Hashtbl.mem named_regexps x then
Printf.eprintf
"pa_ocamllex (warning): multiple definition of named regexp '%s'\n"
x;
Hashtbl.add named_regexps x r;
]
];
lexer_def: [
[ def = LIST0 definition SEP "and" ->
(try
let (entries, transitions) = make_dfa def in
let tables = compact_tables transitions in
let output = output_lexdef tables entries in
<:str_item< declare $list: output$ end >>
with Table_overflow ->
failwith "Transition table overflow in lexer, automaton is too big")
]
];
Pcaml.str_item: [
[ "pa_ocamllex"; LIDENT "rule"; d = lexer_def -> d
| "pa_ocamllex"; "let"; let_regexp ->
<:str_item< declare $list: []$ end >>
]
];
definition: [
[ x=LIDENT; pl = LIST0 Pcaml.patt; "="; LIDENT "parse";
OPT "|"; l = LIST0 [ r=regexp; a=action -> (r,a) ] SEP "|" -> ((x,pl),l) ]
];
action: [
[ "{"; e = OPT Pcaml.expr; "}" ->
let e = match e with
| Some e -> e
| None -> <:expr< () >>
in
(loc,e)
]
];
header: [
[ "{"; e = LIST0 [ si = Pcaml.str_item; OPT ";;" -> si ]; "}" ->
[<:str_item< declare $list:e$ end>>, loc] ]
| [ -> [] ]
];
regexp: [
[ r1 = regexp; "|"; r2 = regexp -> Alternative(r1,r2) ]
| [ r1 = regexp; r2 = regexp -> Sequence(r1,r2) ]
| [ r = regexp; "*" -> Repetition r
| r = regexp; "+" -> Sequence(r, Repetition r)
| r = regexp; "?" -> Alternative(r, Epsilon)
| "("; r = regexp; ")" -> r
| "_" -> Characters all_chars
| c = CHAR -> Characters [char c]
| s = STRING -> regexp_for_string (Token.eval_string s)
| "["; cc = ch_class; "]" -> Characters cc
| x = LIDENT ->
try Hashtbl.find named_regexps x
with Not_found ->
failwith
("pa_ocamllex (error): reference to unbound regexp name `"^x^"'")
]
];
ch_class: [
[ "^"; cc = ch_class -> subtract all_chars cc]
| [ c1 = CHAR; "-"; c2 = CHAR -> char_class (char c1) (char c2)
| c = CHAR -> [char c]
| cc1 = ch_class; cc2 = ch_class -> cc1 @ cc2
]
];
END
(* We have to be careful about "rule"; in standalone mode,
it is used as a keyword (otherwise, there is a conflict
with named regexp); in normal mode, it is used as LIDENT
(we do not want to reserve such an useful identifier).
Plexer does not like identifiers used as keyword _and_
as LIDENT ...
*)
let standalone =
let already = ref false in
fun () ->
if not (!already) then
begin
already := true;
Printf.eprintf "pa_ocamllex: stand-alone mode\n";
DELETE_RULE Pcaml.str_item: "pa_ocamllex"; LIDENT "rule";lexer_def END;
DELETE_RULE Pcaml.str_item: "pa_ocamllex"; "let"; let_regexp END;
let ocamllex = Grammar.Entry.create Pcaml.gram "pa_ocamllex" in
EXTEND GLOBAL: ocamllex let_regexp header lexer_def;
ocamllex: [
[ h = header;
l = [LIST0 ["let"; let_regexp]; "rule"; d = lexer_def -> (d,loc)];
t = header; EOI -> h @ (l :: t) ,false
]
];
END;
Pcaml.parse_implem := Grammar.Entry.parse ocamllex
end
let () =
Pcaml.add_option "-ocamllex" (Arg.Unit standalone)
"Activate (standalone) ocamllex emulation mode."