/***********************************************************************/ /* */ /* OCaml */ /* */ /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ /* */ /* Copyright 1996 Institut National de Recherche en Informatique et */ /* en Automatique. All rights reserved. This file is distributed */ /* under the terms of the Q Public License version 1.0. */ /* */ /***********************************************************************/ /* The grammar for lexer definitions */ %{ open Syntax (* Auxiliaries for the parser. *) let named_regexps = (Hashtbl.create 13 : (string, regular_expression) Hashtbl.t) let regexp_for_string s = let rec re_string n = if n >= String.length s then Epsilon else if succ n = String.length s then Characters (Cset.singleton (Char.code s.[n])) else Sequence (Characters(Cset.singleton (Char.code s.[n])), re_string (succ n)) in re_string 0 let rec remove_as = function | Bind (e,_) -> remove_as e | Epsilon|Eof|Characters _ as e -> e | Sequence (e1, e2) -> Sequence (remove_as e1, remove_as e2) | Alternative (e1, e2) -> Alternative (remove_as e1, remove_as e2) | Repetition e -> Repetition (remove_as e) let as_cset = function | Characters s -> s | _ -> raise Cset.Bad %} %token Tident %token Tchar %token Tstring %token Taction %token Trule Tparse Tparse_shortest Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket %token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash Tlet Tas Tsharp %right Tas %left Tor %nonassoc CONCAT %nonassoc Tmaybe Tstar Tplus %left Tsharp %nonassoc Tident Tchar Tstring Tunderscore Teof Tlbracket Tlparen %start lexer_definition %type lexer_definition %% lexer_definition: header named_regexps Trule definition other_definitions header Tend { {header = $1; entrypoints = $4 :: List.rev $5; trailer = $6} } ; header: Taction { $1 } | /*epsilon*/ { { start_pos = 0; end_pos = 0; start_line = 1; start_col = 0 } } ; named_regexps: named_regexps Tlet Tident Tequal regexp { Hashtbl.add named_regexps $3 $5 } | /*epsilon*/ { () } ; other_definitions: other_definitions Tand definition { $3::$1 } | /*epsilon*/ { [] } ; definition: Tident arguments Tequal Tparse entry { {name=$1 ; shortest=false ; args=$2 ; clauses=$5} } | Tident arguments Tequal Tparse_shortest entry { {name=$1 ; shortest=true ; args=$2 ; clauses=$5} } ; arguments: Tident arguments { $1::$2 } | /*epsilon*/ { [] } ; entry: case rest_of_entry { $1::List.rev $2 } | Tor case rest_of_entry { $2::List.rev $3 } ; rest_of_entry: rest_of_entry Tor case { $3::$1 } | { [] } ; case: regexp Taction { ($1,$2) } ; regexp: Tunderscore { Characters Cset.all_chars } | Teof { Eof } | Tchar { Characters (Cset.singleton $1) } | Tstring { regexp_for_string $1 } | Tlbracket char_class Trbracket { Characters $2 } | regexp Tstar { Repetition $1 } | regexp Tmaybe { Alternative(Epsilon, $1) } | regexp Tplus { Sequence(Repetition (remove_as $1), $1) } | regexp Tsharp regexp { let s1 = as_cset $1 and s2 = as_cset $3 in Characters (Cset.diff s1 s2) } | regexp Tor regexp { Alternative($1,$3) } | regexp regexp %prec CONCAT { Sequence($1,$2) } | Tlparen regexp Trparen { $2 } | Tident { try Hashtbl.find named_regexps $1 with Not_found -> let p = Parsing.symbol_start_pos () in Printf.eprintf "File \"%s\", line %d, character %d:\n\ Reference to unbound regexp name `%s'.\n" p.Lexing.pos_fname p.Lexing.pos_lnum (p.Lexing.pos_cnum - p.Lexing.pos_bol) $1; exit 2 } | regexp Tas ident {let p1 = Parsing.rhs_start_pos 3 and p2 = Parsing.rhs_end_pos 3 in let p = { start_pos = p1.Lexing.pos_cnum ; end_pos = p2.Lexing.pos_cnum ; start_line = p1.Lexing.pos_lnum ; start_col = p1.Lexing.pos_cnum - p1.Lexing.pos_bol ; } in Bind ($1, ($3, p))} ; ident: Tident {$1} ; char_class: Tcaret char_class1 { Cset.complement $2 } | char_class1 { $1 } ; char_class1: Tchar Tdash Tchar { Cset.interval $1 $3 } | Tchar { Cset.singleton $1 } | char_class1 char_class1 %prec CONCAT { Cset.union $1 $2 } ; %%