ocaml/camlp4/etc/pa_ocamllex.ml

(* camlp4 ./pa_o.cmo q_MLast.cmo pa_extend.cmo pr_dump.cmo *)
(* $Id$ *)
(* Alain Frisch's contribution *)

open Syntax
open Lexgen
open Compact

(* Adapted from output.ml *)
(**************************)

(* Output the DFA tables and their entry points *)

(* To output an array of short ints, encoded as a string *)

(* Append the decimal escape sequence for byte [b] to [buf]: a backslash
   followed by the hundreds, tens and units digits of [b].  Callers are
   expected to pass a value in the range 0..255. *)
let output_byte buf b =
  let emit_digit d = Buffer.add_char buf (Char.chr (48 + d)) in
  Buffer.add_char buf '\\';
  List.iter emit_digit [b / 100; (b / 10) mod 10; b mod 10]

(* Dummy location (-1,-1) for generated code.
   NOTE(review): the <:expr< >> / <:patt< >> / <:str_item< >> quotations in
   this file presumably pick up whatever [loc] is in scope -- confirm against
   the q_MLast documentation before renaming or removing this binding. *)
let loc = (-1,-1)

(* Encode the int array [v] as a string-literal expression.  Every element
   is written as two escaped bytes, low byte first.  After each group of
   eight elements a backslash-newline followed by four spaces is inserted
   into the literal text. *)
let output_array v =
  let buf = Buffer.create (Array.length v * 3) in
  Array.iteri
    (fun idx word ->
       output_byte buf (word land 0xFF);
       output_byte buf ((word asr 8) land 0xFF);
       if idx land 7 = 7 then Buffer.add_string buf "\\\n    ")
    v;
  let encoded = Buffer.contents buf in
  <:expr< $str:encoded$ >>

(* Output the tables *)

(* Build the structure item that defines [lex_tables]: a record with the
   five Lexing fields, each filled with the string encoding of the
   corresponding array of [tbl]. *)
let output_tables tbl =
  let base = output_array tbl.tbl_base
  and backtrk = output_array tbl.tbl_backtrk
  and default = output_array tbl.tbl_default
  and trans = output_array tbl.tbl_trans
  and check = output_array tbl.tbl_check in
  <:str_item< value lex_tables = {
    Lexing.lex_base = $base$;
    Lexing.lex_backtrk = $backtrk$;
    Lexing.lex_default = $default$;
    Lexing.lex_trans = $trans$;
    Lexing.lex_check = $check$
  } >>

(* Output the entries *)

(* Pair every element of the list with a generated identifier
   "__ocaml_lex_arg_<k>", where k counts up from [n] in list order. *)
let make_alias n items =
  let (_, reversed) =
    List.fold_left
      (fun (idx, acc) item ->
         (succ idx, (item, "__ocaml_lex_arg_" ^ string_of_int idx) :: acc))
      (n, []) items
  in
  List.rev reversed

(* [abstraction params body] wraps [body] in one [fun] per element of
   [params], outermost first.  Each parameter is the original pattern
   aliased (with [as]) to its generated identifier. *)
let abstraction params body =
  List.fold_right
    (fun (pat, alias) inner -> <:expr< fun ($pat$ as $lid:alias$) -> $inner$ >>)
    params body


(* [application head params] applies the expression [head] to the generated
   identifier of every element of [params], left to right.  The patterns
   themselves are ignored. *)
let application head params =
  List.fold_left
    (fun applied (_, alias) -> <:expr< $applied$ $lid:alias$ >>)
    head params

(* Generate the two bindings that implement one lexer entry point [e].

   The result is a list of two (pattern, expression) pairs:
   - the entry point itself, named [e.auto_name], which takes [lexbuf] and
     the user arguments [e.auto_args] and calls the auxiliary function with
     the initial state [e.auto_initial_state];
   - the auxiliary function "__ocaml_lex_rec_<name>_rec", which takes the
     same arguments plus a [state], runs [Lexing.engine] on [lex_tables]
     and dispatches on the integer it returns. *)
let output_entry e =
  (* Every argument pattern (lexbuf first) gets a generated alias so that
     the arguments can be passed on by name in the generated calls. *)
  let args = make_alias 0 (<:patt< lexbuf >> :: e.auto_args) in
  let f = "__ocaml_lex_rec_" ^ e.auto_name ^ "_rec" in
  (* The auxiliary function applied to all aliased arguments; the state
     argument is still missing. *)
  let call_f = application <:expr< $lid:f$ >> args in
  let inistate = <:expr< $int:string_of_int e.auto_initial_state$ >> in
  (* One match case per numbered action, followed by a catch-all case that
     refills the buffer and re-enters the auxiliary function with the value
     returned by the engine.
     NOTE(review): the [loc] bound by the lambda below shadows the global
     dummy location and is presumably what the <:patt< >> quotation picks
     up -- confirm before renaming it.  The inner [e] shadows the entry. *)
  let cases = 
    List.map
      (fun (num, (loc,e)) ->
         <:patt< $int:string_of_int num$ >>,
         None, (* when ... *)
         e
      ) e.auto_actions @
    [ <:patt< __ocaml_lex_n >>,
      None,
      <:expr< do 
        { lexbuf.Lexing.refill_buff lexbuf; $call_f$ __ocaml_lex_n  }>> ]
  in
  [
    <:patt< $lid:e.auto_name$ >>,
    (abstraction args <:expr< $call_f$ $inistate$ >>);

    <:patt< $lid:f$ >>,
    (abstraction args <:expr< 
       fun state -> 
       match Lexing.engine lex_tables state lexbuf with 
       [ $list:cases$ ] >>)
  ]

(* Main output function *)

(* Raised by [output_lexdef] when the transition table has more than
   0x8000 entries. *)
exception Table_overflow

(* Report the size of the automaton on stderr, then produce the generated
   structure items: the [lex_tables] definition followed by a single
   recursive definition holding the bindings of every entry point.
   Raises [Table_overflow] if the transition table has more than 0x8000
   entries; the statistics are printed before that check. *)
let output_lexdef tables entry_points =
  let n_states = Array.length tables.tbl_base
  and n_trans = Array.length tables.tbl_trans in
  (* Total number of table cells; each one is encoded on two bytes. *)
  let n_cells =
    n_states + Array.length tables.tbl_backtrk +
    Array.length tables.tbl_default + n_trans +
    Array.length tables.tbl_check
  in
  Printf.eprintf
    "pa_ocamllex: found lexer; %d states, %d transitions, table size %d bytes\n"
    n_states n_trans (2 * n_cells);
  flush stderr;
  if n_trans > 0x8000 then raise Table_overflow
  else
    let bindings = List.flatten (List.map output_entry entry_points) in
    [output_tables tables; <:str_item< value rec $list:bindings$ >> ]


(* Adapted from parser.mly and main.ml *)
(***************************************)

(* Auxiliaries for the parser. *)

(* Integer code of the character obtained by passing the token text [s]
   through [Token.eval_char]. *)
let char s =
  let decoded = Token.eval_char s in
  Char.code decoded

(* Table of the regular expressions that have been given a name, keyed by
   that name.  It is filled by the [let_regexp] grammar rule and read when
   an identifier occurs inside a regexp. *)
let named_regexps : (string, regular_expression) Hashtbl.t =
  Hashtbl.create 13

(* Regular expression matching exactly the string [s]: [Epsilon] for the
   empty string, a single [Characters] node for a one-character string,
   otherwise a right-nested chain of [Sequence] nodes with one
   [Characters] node per character. *)
let regexp_for_string s =
  let len = String.length s in
  if len = 0 then Epsilon
  else begin
    let single i = Characters [Char.code s.[i]] in
    (* Start from the last character and prepend the others one by one. *)
    let acc = ref (single (len - 1)) in
    for i = len - 2 downto 0 do
      acc := Sequence (single i, !acc)
    done;
    !acc
  end

(* List of the integers from [c1] to [c2] inclusive, in increasing order;
   empty when [c1 > c2]. *)
let char_class c1 c2 =
  (* Build the list from the upper bound downwards so that it comes out
     already in increasing order. *)
  let rec build hi acc =
    if hi < c1 then acc else build (pred hi) (hi :: acc)
  in
  build c2 []

(* Every character code from 0 to 255.  Used for the [_] wildcard and as
   the universe that a [^] character class is subtracted from. *)
let all_chars = char_class 0 255

(* Elements of [l1] that do not occur in [l2], in their original order.
   Duplicates in [l1] are kept. *)
let subtract l1 l2 =
  List.filter (fun elt -> not (List.mem elt l2)) l1

(* Predefine the named regexp "eof" as the single code 256, which lies
   outside the 0..255 range covered by [all_chars].
   NOTE(review): presumably 256 is the pseudo-character the lexer engine
   uses for end of input -- confirm against Lexgen. *)
let () =
  Hashtbl.add named_regexps "eof" (Characters [256])

(* The parser *)

(* Grammar entries defined by this extension, all created in the main
   camlp4 grammar [Pcaml.gram]:
   - [let_regexp]: a named regexp definition;
   - [header]: an optional header or trailer section;
   - [lexer_def]: a complete set of lexer rules.
   The string is the label of the entry; all three share the
   "pa_ocamllex" prefix (the label of [lexer_def] used to read
   "pa_ocaml lexerdef", inconsistent with the other two). *)
let let_regexp = Grammar.Entry.create Pcaml.gram "pa_ocamllex let"
let header = Grammar.Entry.create Pcaml.gram "pa_ocamllex header"
let lexer_def = Grammar.Entry.create Pcaml.gram "pa_ocamllex lexerdef"

(* Grammar extension.  It fills the three global entries declared above,
   adds two rules to [Pcaml.str_item], and defines the local entries
   [definition], [action], [regexp] and [ch_class]. *)
EXTEND
 GLOBAL: Pcaml.str_item let_regexp header lexer_def;

 (* Named regexp definition: [name = regexp].  A warning is printed on
    stderr when the name is already bound, then the new binding is added
    to [named_regexps].  The action returns unit. *)
 let_regexp: [
   [ x = LIDENT; "="; r = regexp ->
       if Hashtbl.mem named_regexps x then
         Printf.eprintf 
           "pa_ocamllex (warning): multiple definition of named regexp '%s'\n"
           x;
       Hashtbl.add named_regexps x r;
   ]
 ];

 (* A set of entry point definitions separated by [and].  The automaton is
    built with [make_dfa], compacted with [compact_tables] and turned into
    structure items by [output_lexdef]; the items are wrapped in a single
    [declare ... end] item.  [Table_overflow] is turned into a [Failure]. *)
 lexer_def: [
   [ def = LIST0 definition SEP "and" ->
       (try
          let (entries, transitions) = make_dfa def in
          let tables = compact_tables transitions in
          let output = output_lexdef tables entries in
          <:str_item< declare $list: output$ end >> 
        with Table_overflow ->
          failwith "Transition table overflow in lexer, automaton is too big")
   ]
 ];


 (* Two new structure items, both introduced by the keyword [pa_ocamllex]:
    followed by the identifier [rule] and a lexer definition, it yields
    the generated code; followed by [let] and a named regexp, it yields an
    empty [declare] item (the definition is only recorded). *)
 Pcaml.str_item: [
   [ "pa_ocamllex"; LIDENT "rule"; d = lexer_def -> d
   | "pa_ocamllex"; "let"; let_regexp -> 
       <:str_item< declare $list: []$ end >>
   ]
 ];
 
 (* One entry point: its name, its argument patterns, [=], the identifier
    [parse], an optional leading bar, then the (regexp, action) cases
    separated by bars.  Result: ((name, patterns), cases). *)
 definition: [
   [ x=LIDENT; pl = LIST0 Pcaml.patt; "="; LIDENT "parse"; 
     OPT "|"; l = LIST0 [ r=regexp; a=action -> (r,a) ] SEP "|" -> ((x,pl),l) ]
 ];

 (* An action between braces.  An empty action stands for the unit
    expression.  The result is paired with [loc]. *)
 action: [
   [ "{"; e = OPT Pcaml.expr; "}" -> 
       let e = match e with
         | Some e -> e
         | None -> <:expr< () >>
       in
       (loc,e)
   ]
 ];

 (* Header or trailer: either structure items between braces, each
    optionally followed by a double semicolon, returned as a one-element
    list holding a [declare] item and [loc]; or nothing, returned as the
    empty list. *)
 header:  [
   [ "{"; e = LIST0 [ si = Pcaml.str_item; OPT ";;" -> si ]; "}" -> 
       [<:str_item< declare $list:e$ end>>, loc] ]
   | [ -> [] ]
 ];

 (* Regular expressions, in three levels: alternation, then sequence by
    juxtaposition, then postfix operators and atoms.
    NOTE(review): presumably listed from loosest to tightest binding, as is
    usual for camlp4 levels -- confirm.
    [r+] is expanded to [r] followed by [r*], [r?] to [r] or [Epsilon].
    The underscore stands for [all_chars].  An identifier is looked up in
    [named_regexps]; an unbound name raises [Failure]. *)
 regexp: [
   [ r1 = regexp; "|"; r2 = regexp -> Alternative(r1,r2) ]
 | [ r1 = regexp; r2 = regexp -> Sequence(r1,r2) ]
 | [ r = regexp; "*" -> Repetition r
   | r = regexp; "+" -> Sequence(r, Repetition r)
   | r = regexp; "?" -> Alternative(r, Epsilon)
   | "("; r = regexp; ")" -> r
   | "_" -> Characters all_chars
   | c = CHAR -> Characters [char c]
   | s = STRING -> regexp_for_string (Token.eval_string s)
   | "["; cc = ch_class; "]" ->  Characters cc
   | x = LIDENT ->
       try  Hashtbl.find named_regexps x
       with Not_found ->
         failwith 
           ("pa_ocamllex (error): reference to unbound regexp name `"^x^"'")
   ]
 ];

 (* Contents of a bracketed character class, as a list of character codes.
    A leading caret takes the complement with respect to [all_chars];
    [c1 - c2] is an inclusive range; juxtaposed classes are concatenated
    (duplicates are not removed). *)
 ch_class: [
   [ "^"; cc = ch_class -> subtract all_chars cc]
 | [ c1 = CHAR; "-"; c2 = CHAR -> char_class (char c1) (char c2)
   | c = CHAR -> [char c]
   | cc1 = ch_class; cc2 = ch_class -> cc1 @ cc2
   ]
 ];
END

(* We have to be careful about "rule": in standalone mode,
   it is used as a keyword (otherwise, there is a conflict
   with named regexps); in normal mode, it is used as a LIDENT
   (we do not want to reserve such a useful identifier).

   Plexer does not like identifiers used both as a keyword _and_
   as a LIDENT ...
*)

(* Switch the preprocessor to stand-alone ocamllex emulation.

   The first call prints a notice on stderr, removes the two
   [pa_ocamllex ...] rules from [Pcaml.str_item], and replaces the
   implementation parser [Pcaml.parse_implem] by a parser for a whole
   lexer file: a header, any number of [let] regexp definitions, the
   keyword [rule], the lexer definition, a trailer and the end of input.
   Later calls do nothing, thanks to the [already] flag. *)
let standalone =
  (* Set on the first call so that the grammar is only modified once. *)
  let already = ref false in
  fun () ->
    if not (!already) then
    begin
      already := true;
      Printf.eprintf "pa_ocamllex: stand-alone mode\n";

      (* Undo the str_item extension made at load time; in this mode
         "rule" is used as a keyword instead of a LIDENT. *)
      DELETE_RULE Pcaml.str_item: "pa_ocamllex"; LIDENT "rule";lexer_def END;
      DELETE_RULE Pcaml.str_item: "pa_ocamllex"; "let"; let_regexp END;
      (* Entry for a complete file.  Its result is the list made of the
         header items, the generated lexer and the trailer items, paired
         with [false].
         NOTE(review): the meaning of the [false] flag is whatever
         Pcaml.parse_implem expects -- confirm. *)
      let ocamllex = Grammar.Entry.create Pcaml.gram "pa_ocamllex" in
      EXTEND GLOBAL: ocamllex let_regexp header lexer_def;
      ocamllex: [
        [ h = header;
          l  = [LIST0 ["let"; let_regexp]; "rule"; d = lexer_def -> (d,loc)];
          t = header; EOI -> h @ (l :: t) ,false
        ]
      ];
      END;
      Pcaml.parse_implem := Grammar.Entry.parse ocamllex
    end

(* Register the "-ocamllex" command line option; when given, it runs
   [standalone]. *)
let () =
  Pcaml.add_option "-ocamllex" (Arg.Unit standalone)
    "Activate (standalone) ocamllex emulation mode."
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`(* camlp4 ./pa_o.cmo q_MLast.cmo pa_extend.cmo pr_dump.cmo *)`
			`(* $Id$ *)`
			`(* Alain Frisch's contribution *)`

			`open Syntax`
			`open Lexgen`
			`open Compact`

			`(* Adapted from output.ml *)`
			`(**************************)`

			`(* Output the DFA tables and its entry points *)`

			`(* To output an array of short ints, encoded as a string *)`

			`let output_byte buf b =`
			`Buffer.add_char buf '\\';`
			`Buffer.add_char buf (Char.chr(48 + b / 100));`
			`Buffer.add_char buf (Char.chr(48 + (b / 10) mod 10));`
			`Buffer.add_char buf (Char.chr(48 + b mod 10))`

			`let loc = (-1,-1)`

			`let output_array v =`
			`let b = Buffer.create (Array.length v * 3) in`
			`for i = 0 to Array.length v - 1 do`
			`output_byte b (v.(i) land 0xFF);`
			`output_byte b ((v.(i) asr 8) land 0xFF);`
			`if i land 7 = 7 then Buffer.add_string b "\\\n "`
			`done;`
			`let s = Buffer.contents b in`
			`<:expr< $str:s$ >>`

			`(* Output the tables *)`

			`let output_tables tbl =`
			`<:str_item< value lex_tables = {`
			`Lexing.lex_base = $output_array tbl.tbl_base$;`
			`Lexing.lex_backtrk = $output_array tbl.tbl_backtrk$;`
			`Lexing.lex_default = $output_array tbl.tbl_default$;`
			`Lexing.lex_trans = $output_array tbl.tbl_trans$;`
			`Lexing.lex_check = $output_array tbl.tbl_check$`
			`} >>`

			`(* Output the entries *)`

			`let rec make_alias n = function`
			`\| [] -> []`
			`\| h::t ->`
			`(h, "__ocaml_lex_arg_" ^ (string_of_int n)) :: (make_alias (succ n) t)`

			`let abstraction =`
			`List.fold_right (fun (p,a) e -> <:expr< fun ($p$ as $lid:a$) -> $e$ >>)`


			`let application =`
			`List.fold_left (fun f (_,a) -> <:expr< $f$ $lid:a$ >>)`

			`let output_entry e =`
			`let args = make_alias 0 (<:patt< lexbuf >> :: e.auto_args) in`
			`let f = "__ocaml_lex_rec_" ^ e.auto_name ^ "_rec" in`
			`let call_f = application <:expr< $lid:f$ >> args in`
			`let inistate = <:expr< $int:string_of_int e.auto_initial_state$ >> in`
			`let cases =`
			`List.map`
			`(fun (num, (loc,e)) ->`
detabisation git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4281 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-20 07:12:21 -08:00			`<:patt< $int:string_of_int num$ >>,`
			`None, (* when ... *)`
			`e`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`) e.auto_actions @`
			`[ <:patt< __ocaml_lex_n >>,`
			`None,`
			`<:expr< do`
detabisation git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4281 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-20 07:12:21 -08:00			`{ lexbuf.Lexing.refill_buff lexbuf; $call_f$ __ocaml_lex_n }>> ]`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`in`
			`[`
			`<:patt< $lid:e.auto_name$ >>,`
			`(abstraction args <:expr< $call_f$ $inistate$ >>);`

			`<:patt< $lid:f$ >>,`
			`(abstraction args <:expr<`
			`fun state ->`
			`match Lexing.engine lex_tables state lexbuf with`
			`[ $list:cases$ ] >>)`
			`]`

			`(* Main output function *)`

			`exception Table_overflow`

			`let output_lexdef tables entry_points =`
			`Printf.eprintf`
			`"pa_ocamllex: found lexer; %d states, %d transitions, table size %d bytes\n"`
			`(Array.length tables.tbl_base)`
			`(Array.length tables.tbl_trans)`
			`(2 * (Array.length tables.tbl_base + Array.length tables.tbl_backtrk +`
			`Array.length tables.tbl_default + Array.length tables.tbl_trans +`
			`Array.length tables.tbl_check));`
			`flush stderr;`
			`if Array.length tables.tbl_trans > 0x8000 then raise Table_overflow;`

			`let entries = List.map output_entry entry_points in`
			`[output_tables tables; <:str_item< value rec $list:List.flatten entries$ >> ]`


			`(* Adapted from parser.mly and main.ml *)`
			`(***************************************)`

			`(* Auxiliaries for the parser. *)`

			`let char s = Char.code (Token.eval_char s)`

			`let named_regexps =`
			`(Hashtbl.create 13 : (string, regular_expression) Hashtbl.t)`

			`let regexp_for_string s =`
			`let rec re_string n =`
			`if n >= String.length s then Epsilon`
			`else if succ n = String.length s then Characters([Char.code (s.[n])])`
			`else Sequence(Characters([Char.code (s.[n])]), re_string (succ n))`
			`in re_string 0`

			`let char_class c1 c2 =`
			`let rec cl n =`
			`if n > c2 then [] else n :: cl(succ n)`
			`in cl c1`

			`let all_chars = char_class 0 255`

			`let rec subtract l1 l2 =`
			`match l1 with`
			`[] -> []`
			`\| a::r -> if List.mem a l2 then subtract r l2 else a :: subtract r l2`

- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4249 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-12 05:22:10 -08:00			`let () =`
			`Hashtbl.add named_regexps "eof" (Characters [256])`

- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`(* The parser *)`

- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4249 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-12 05:22:10 -08:00			`let let_regexp = Grammar.Entry.create Pcaml.gram "pa_ocamllex let"`
			`let header = Grammar.Entry.create Pcaml.gram "pa_ocamllex header"`
			`let lexer_def = Grammar.Entry.create Pcaml.gram "pa_ocaml lexerdef"`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00
			`EXTEND`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4249 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-12 05:22:10 -08:00			`GLOBAL: Pcaml.str_item let_regexp header lexer_def;`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00
			`let_regexp: [`
			`[ x = LIDENT; "="; r = regexp ->`
			`if Hashtbl.mem named_regexps x then`
detabisation git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4281 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-20 07:12:21 -08:00			`Printf.eprintf`
			`"pa_ocamllex (warning): multiple definition of named regexp '%s'\n"`
			`x;`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`Hashtbl.add named_regexps x r;`
			`]`
			`];`

			`lexer_def: [`
			`[ def = LIST0 definition SEP "and" ->`
			`(try`
detabisation git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4281 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-20 07:12:21 -08:00			`let (entries, transitions) = make_dfa def in`
			`let tables = compact_tables transitions in`
			`let output = output_lexdef tables entries in`
			`<:str_item< declare $list: output$ end >>`
			`with Table_overflow ->`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`failwith "Transition table overflow in lexer, automaton is too big")`
			`]`
			`];`


			`Pcaml.str_item: [`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4249 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-12 05:22:10 -08:00			`[ "pa_ocamllex"; LIDENT "rule"; d = lexer_def -> d`
			`\| "pa_ocamllex"; "let"; let_regexp ->`
			`<:str_item< declare $list: []$ end >>`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`]`
			`];`

			`definition: [`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4249 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-12 05:22:10 -08:00			`[ x=LIDENT; pl = LIST0 Pcaml.patt; "="; LIDENT "parse";`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`OPT "\|"; l = LIST0 [ r=regexp; a=action -> (r,a) ] SEP "\|" -> ((x,pl),l) ]`
			`];`

			`action: [`
			`[ "{"; e = OPT Pcaml.expr; "}" ->`
			`let e = match e with`
detabisation git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4281 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-20 07:12:21 -08:00			`\| Some e -> e`
			`\| None -> <:expr< () >>`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`in`
			`(loc,e)`
			`]`
			`];`

			`header: [`
			`[ "{"; e = LIST0 [ si = Pcaml.str_item; OPT ";;" -> si ]; "}" ->`
			`[<:str_item< declare $list:e$ end>>, loc] ]`
			`\| [ -> [] ]`
			`];`

			`regexp: [`
			`[ r1 = regexp; "\|"; r2 = regexp -> Alternative(r1,r2) ]`
			`\| [ r1 = regexp; r2 = regexp -> Sequence(r1,r2) ]`
			`\| [ r = regexp; "*" -> Repetition r`
			`\| r = regexp; "+" -> Sequence(r, Repetition r)`
			`\| r = regexp; "?" -> Alternative(r, Epsilon)`
			`\| "("; r = regexp; ")" -> r`
			`\| "_" -> Characters all_chars`
			`\| c = CHAR -> Characters [char c]`
			`\| s = STRING -> regexp_for_string (Token.eval_string s)`
			`\| "["; cc = ch_class; "]" -> Characters cc`
			`\| x = LIDENT ->`
			`try Hashtbl.find named_regexps x`
			`with Not_found ->`
detabisation git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4281 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-20 07:12:21 -08:00			`failwith`
			("pa_ocamllex (error): reference to unbound regexp name `"^x^"'")
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00			`]`
			`];`

			`ch_class: [`
			`[ "^"; cc = ch_class -> subtract all_chars cc]`
			`\| [ c1 = CHAR; "-"; c2 = CHAR -> char_class (char c1) (char c2)`
			`\| c = CHAR -> [char c]`
			`\| cc1 = ch_class; cc2 = ch_class -> cc1 @ cc2`
			`]`
			`];`
			`END`

- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4249 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-12 05:22:10 -08:00			`(* We have to be careful about "rule"; in standalone mode,`
			`it is used as a keyword (otherwise, there is a conflict`
			`with named regexp); in normal mode, it is used as LIDENT`
			`(we do not want to reserve such an useful identifier).`

			`Plexer does not like identifiers used as keyword _and_`
			`as LIDENT ...`
			`*)`

			`let standalone =`
			`let already = ref false in`
			`fun () ->`
			`if not (!already) then`
			`begin`
			`already := true;`
			`Printf.eprintf "pa_ocamllex: stand-alone mode\n";`

			`DELETE_RULE Pcaml.str_item: "pa_ocamllex"; LIDENT "rule";lexer_def END;`
			`DELETE_RULE Pcaml.str_item: "pa_ocamllex"; "let"; let_regexp END;`
			`let ocamllex = Grammar.Entry.create Pcaml.gram "pa_ocamllex" in`
			`EXTEND GLOBAL: ocamllex let_regexp header lexer_def;`
			`ocamllex: [`
			`[ h = header;`
			`l = [LIST0 ["let"; let_regexp]; "rule"; d = lexer_def -> (d,loc)];`
			`t = header; EOI -> h @ (l :: t) ,false`
			`]`
			`];`
			`END;`
			`Pcaml.parse_implem := Grammar.Entry.parse ocamllex`
			`end`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00
			`let () =`
			`Pcaml.add_option "-ocamllex" (Arg.Unit standalone)`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4355 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-02-05 07:47:33 -08:00			`"Activate (standalone) ocamllex emulation mode."`
- git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@4222 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 2002-01-04 06:33:11 -08:00