(***********************************************************************) (* *) (* OCaml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 1996 Institut National de Recherche en Informatique et *) (* en Automatique. All rights reserved. This file is distributed *) (* under the terms of the Q Public License version 1.0. *) (* *) (***********************************************************************) (* $Id$ *) (* The lexical analyzer for lexer definitions. Bootstrapped! *) { open Syntax open Parser (* Auxiliaries for the lexical analyzer *) let brace_depth = ref 0 and comment_depth = ref 0 let in_pattern () = !brace_depth = 0 && !comment_depth = 0 exception Lexical_error of string * string * int * int let string_buff = Buffer.create 256 let reset_string_buffer () = Buffer.clear string_buff let store_string_char c = Buffer.add_char string_buff c let get_stored_string () = Buffer.contents string_buff let char_for_backslash = function 'n' -> '\n' | 't' -> '\t' | 'b' -> '\b' | 'r' -> '\r' | c -> c let raise_lexical_error lexbuf msg = let p = Lexing.lexeme_start_p lexbuf in raise (Lexical_error (msg, p.Lexing.pos_fname, p.Lexing.pos_lnum, p.Lexing.pos_cnum - p.Lexing.pos_bol + 1)) ;; let handle_lexical_error fn lexbuf = let p = Lexing.lexeme_start_p lexbuf in let line = p.Lexing.pos_lnum and column = p.Lexing.pos_cnum - p.Lexing.pos_bol + 1 and file = p.Lexing.pos_fname in try fn lexbuf with Lexical_error (msg, "", 0, 0) -> raise(Lexical_error(msg, file, line, column)) let get_input_name () = Sys.argv.(Array.length Sys.argv - 1) let warning lexbuf msg = let p = Lexing.lexeme_start_p lexbuf in Printf.eprintf "ocamllex warning:\nFile \"%s\", line %d, character %d: %s.\n" p.Lexing.pos_fname p.Lexing.pos_lnum (p.Lexing.pos_cnum - p.Lexing.pos_bol + 1) msg; flush stderr let decimal_code c d u = 100 * (Char.code c - 48) + 10 * (Char.code d - 48) + (Char.code u - 48) let char_for_hexadecimal_code d u = let d1 = Char.code d in let val1 = if d1 >= 97 then d1 - 87 else if d1 >= 65 then d1 - 55 else d1 - 48 in let d2 = Char.code u in let val2 = if d2 >= 97 then d2 - 87 else if d2 >= 65 then d2 - 55 else d2 - 48 in Char.chr (val1 * 16 + val2) let incr_loc lexbuf delta = let pos = lexbuf.Lexing.lex_curr_p in lexbuf.Lexing.lex_curr_p <- { pos with Lexing.pos_lnum = pos.Lexing.pos_lnum + 1; Lexing.pos_bol = pos.Lexing.pos_cnum - delta; } ;; let update_loc lexbuf opt_file line = let pos = lexbuf.Lexing.lex_curr_p in let new_file = match opt_file with | None -> pos.Lexing.pos_fname | Some f -> f in lexbuf.Lexing.lex_curr_p <- { pos with Lexing.pos_fname = new_file; Lexing.pos_lnum = line; Lexing.pos_bol = pos.Lexing.pos_cnum; } ;; } let identstart = ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'] let identbody = ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] let backslash_escapes = ['\\' '"' '\'' 'n' 't' 'b' 'r'] rule main = parse [' ' '\013' '\009' '\012' ] + { main lexbuf } | '\010' { incr_loc lexbuf 0; main lexbuf } | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']* ('\"' ([^ '\010' '\013' '\"']* as name) '\"')? [^ '\010' '\013']* '\010' { update_loc lexbuf name (int_of_string num); main lexbuf } | "(*" { comment_depth := 1; handle_lexical_error comment lexbuf; main lexbuf } | '_' { Tunderscore } | identstart identbody * { match Lexing.lexeme lexbuf with "rule" -> Trule | "parse" -> Tparse | "shortest" -> Tparse_shortest | "and" -> Tand | "eof" -> Teof | "let" -> Tlet | "as" -> Tas | s -> Tident s } | '"' { reset_string_buffer(); handle_lexical_error string lexbuf; Tstring(get_stored_string()) } (* note: ''' is a valid character literal (by contrast with the compiler) *) | "'" [^ '\\'] "'" { Tchar(Char.code(Lexing.lexeme_char lexbuf 1)) } | "'" '\\' backslash_escapes "'" { Tchar(Char.code(char_for_backslash (Lexing.lexeme_char lexbuf 2))) } | "'" '\\' (['0'-'9'] as c) (['0'-'9'] as d) (['0'-'9'] as u)"'" { let v = decimal_code c d u in if v > 255 then raise_lexical_error lexbuf (Printf.sprintf "illegal escape sequence \\%c%c%c" c d u) else Tchar v } | "'" '\\' 'x' (['0'-'9' 'a'-'f' 'A'-'F'] as d) (['0'-'9' 'a'-'f' 'A'-'F'] as u) "'" { Tchar(Char.code(char_for_hexadecimal_code d u)) } | "'" '\\' (_ as c) { raise_lexical_error lexbuf (Printf.sprintf "illegal escape sequence \\%c" c) } | '{' { let p = Lexing.lexeme_end_p lexbuf in let n1 = p.Lexing.pos_cnum and l1 = p.Lexing.pos_lnum and s1 = p.Lexing.pos_bol in brace_depth := 1; let n2 = handle_lexical_error action lexbuf in Taction({start_pos = n1; end_pos = n2; start_line = l1; start_col = n1 - s1}) } | '=' { Tequal } | '|' { Tor } | '[' { Tlbracket } | ']' { Trbracket } | '*' { Tstar } | '?' { Tmaybe } | '+' { Tplus } | '(' { Tlparen } | ')' { Trparen } | '^' { Tcaret } | '-' { Tdash } | '#' { Tsharp } | eof { Tend } | _ { raise_lexical_error lexbuf ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf)) } (* String parsing comes from the compiler lexer *) and string = parse '"' { () } | '\\' ("\010" | "\013" | "\013\010") ([' ' '\009'] * as spaces) { incr_loc lexbuf (String.length spaces); string lexbuf } | '\\' (backslash_escapes as c) { store_string_char(char_for_backslash c); string lexbuf } | '\\' (['0'-'9'] as c) (['0'-'9'] as d) (['0'-'9'] as u) { let v = decimal_code c d u in if in_pattern () && v > 255 then warning lexbuf (Printf.sprintf "illegal backslash escape in string: `\\%c%c%c'" c d u) ; store_string_char (Char.chr v); string lexbuf } | '\\' 'x' (['0'-'9' 'a'-'f' 'A'-'F'] as d) (['0'-'9' 'a'-'f' 'A'-'F'] as u) { store_string_char (char_for_hexadecimal_code d u) ; string lexbuf } | '\\' (_ as c) {if in_pattern () then warning lexbuf (Printf.sprintf "illegal backslash escape in string: `\\%c'" c) ; store_string_char '\\' ; store_string_char c ; string lexbuf } | eof { raise(Lexical_error("unterminated string", "", 0, 0)) } | '\010' { store_string_char '\010'; incr_loc lexbuf 0; string lexbuf } | _ as c { store_string_char c; string lexbuf } (* Lexers comment and action are quite similar, they should lex both strings and characters, in order not to be confused by what is inside then *) and comment = parse "(*" { incr comment_depth; comment lexbuf } | "*)" { decr comment_depth; if !comment_depth = 0 then () else comment lexbuf } | '"' { reset_string_buffer(); string lexbuf; reset_string_buffer(); comment lexbuf } | "'" { skip_char lexbuf ; comment lexbuf } | eof { raise(Lexical_error("unterminated comment", "", 0, 0)) } | '\010' { incr_loc lexbuf 0; comment lexbuf } | _ { comment lexbuf } and action = parse '{' { incr brace_depth; action lexbuf } | '}' { decr brace_depth; if !brace_depth = 0 then Lexing.lexeme_start lexbuf else action lexbuf } | '"' { reset_string_buffer(); handle_lexical_error string lexbuf; reset_string_buffer(); action lexbuf } | "'" { skip_char lexbuf ; action lexbuf } | "(*" { comment_depth := 1; comment lexbuf; action lexbuf } | eof { raise (Lexical_error("unterminated action", "", 0, 0)) } | '\010' { incr_loc lexbuf 0; action lexbuf } | _ { action lexbuf } and skip_char = parse | '\\'? '\010' "'" { incr_loc lexbuf 1; } | [^ '\\' '\''] "'" (* regular character *) (* one character and numeric escape sequences *) | '\\' _ "'" | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'" {()} (* A dieu va ! *) | "" {()}