(***********************************************************************) (* *) (* OCaml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 1996 Institut National de Recherche en Informatique et *) (* en Automatique. All rights reserved. This file is distributed *) (* under the terms of the Q Public License version 1.0. *) (* *) (***********************************************************************) (* The lexer definition *) { open Lexing open Misc open Parser type error = | Illegal_character of char | Illegal_escape of string | Unterminated_comment of Location.t | Unterminated_string | Unterminated_string_in_comment of Location.t * Location.t | Keyword_as_label of string | Invalid_literal of string ;; exception Error of error * Location.t;; (* The table of keywords *) let keyword_table = create_hashtable 149 [ "and", AND; "as", AS; "assert", ASSERT; "begin", BEGIN; "class", CLASS; "constraint", CONSTRAINT; "do", DO; "done", DONE; "downto", DOWNTO; "else", ELSE; "end", END; "exception", EXCEPTION; "external", EXTERNAL; "false", FALSE; "for", FOR; "fun", FUN; "function", FUNCTION; "functor", FUNCTOR; "if", IF; "in", IN; "include", INCLUDE; "inherit", INHERIT; "initializer", INITIALIZER; "lazy", LAZY; "let", LET; "match", MATCH; "method", METHOD; "module", MODULE; "mutable", MUTABLE; "new", NEW; "nonrec", NONREC; "object", OBJECT; "of", OF; "open", OPEN; "or", OR; (* "parser", PARSER; *) "private", PRIVATE; "rec", REC; "sig", SIG; "struct", STRUCT; "then", THEN; "to", TO; "true", TRUE; "try", TRY; "type", TYPE; "val", VAL; "virtual", VIRTUAL; "when", WHEN; "while", WHILE; "with", WITH; "lor", INFIXOP3("lor"); (* Should be INFIXOP2 *) "lxor", INFIXOP3("lxor"); (* Should be INFIXOP2 *) "mod", INFIXOP3("mod"); "land", INFIXOP3("land"); "lsl", INFIXOP4("lsl"); "lsr", INFIXOP4("lsr"); "asr", INFIXOP4("asr") ] (* To buffer string literals *) let initial_string_buffer = Bytes.create 256 let string_buff = ref initial_string_buffer let string_index = ref 0 let reset_string_buffer () = string_buff := initial_string_buffer; string_index := 0 let store_string_char c = if !string_index >= Bytes.length !string_buff then begin let new_buff = Bytes.create (Bytes.length (!string_buff) * 2) in Bytes.blit !string_buff 0 new_buff 0 (Bytes.length !string_buff); string_buff := new_buff end; Bytes.unsafe_set !string_buff !string_index c; incr string_index let store_string s = for i = 0 to String.length s - 1 do store_string_char s.[i]; done let store_lexeme lexbuf = store_string (Lexing.lexeme lexbuf) let get_stored_string () = let s = Bytes.sub_string !string_buff 0 !string_index in string_buff := initial_string_buffer; s (* To store the position of the beginning of a string and comment *) let string_start_loc = ref Location.none;; let comment_start_loc = ref [];; let in_comment () = !comment_start_loc <> [];; let is_in_string = ref false let in_string () = !is_in_string let print_warnings = ref true let with_comment_buffer comment lexbuf = let start_loc = Location.curr lexbuf in comment_start_loc := [start_loc]; reset_string_buffer (); let end_loc = comment lexbuf in let s = get_stored_string () in reset_string_buffer (); let loc = { start_loc with Location.loc_end = end_loc.Location.loc_end } in s, loc (* To translate escape sequences *) let char_for_backslash = function | 'n' -> '\010' | 'r' -> '\013' | 'b' -> '\008' | 't' -> '\009' | c -> c let char_for_decimal_code lexbuf i = let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in if (c < 0 || c > 255) then if in_comment () then 'x' else raise (Error(Illegal_escape (Lexing.lexeme lexbuf), Location.curr lexbuf)) else Char.chr c let char_for_hexadecimal_code lexbuf i = let d1 = Char.code (Lexing.lexeme_char lexbuf i) in let val1 = if d1 >= 97 then d1 - 87 else if d1 >= 65 then d1 - 55 else d1 - 48 in let d2 = Char.code (Lexing.lexeme_char lexbuf (i+1)) in let val2 = if d2 >= 97 then d2 - 87 else if d2 >= 65 then d2 - 55 else d2 - 48 in Char.chr (val1 * 16 + val2) (* recover the name from a LABEL or OPTLABEL token *) let get_label_name lexbuf = let s = Lexing.lexeme lexbuf in let name = String.sub s 1 (String.length s - 2) in if Hashtbl.mem keyword_table name then raise (Error(Keyword_as_label name, Location.curr lexbuf)); name ;; (* Update the current location with file name and line number. *) let update_loc lexbuf file line absolute chars = let pos = lexbuf.lex_curr_p in let new_file = match file with | None -> pos.pos_fname | Some s -> s in lexbuf.lex_curr_p <- { pos with pos_fname = new_file; pos_lnum = if absolute then line else pos.pos_lnum + line; pos_bol = pos.pos_cnum - chars; } ;; let preprocessor = ref None let escaped_newlines = ref false (* Warn about Latin-1 characters used in idents *) let warn_latin1 lexbuf = Location.prerr_warning (Location.curr lexbuf) (Warnings.Deprecated "ISO-Latin1 characters in identifiers") ;; let comment_list = ref [] let add_comment com = comment_list := com :: !comment_list let add_docstring_comment ds = let com = ("*" ^ Docstrings.docstring_body ds, Docstrings.docstring_loc ds) in add_comment com let comments () = List.rev !comment_list (* Error report *) open Format let report_error ppf = function | Illegal_character c -> fprintf ppf "Illegal character (%s)" (Char.escaped c) | Illegal_escape s -> fprintf ppf "Illegal backslash escape in string or character (%s)" s | Unterminated_comment _ -> fprintf ppf "Comment not terminated" | Unterminated_string -> fprintf ppf "String literal not terminated" | Unterminated_string_in_comment (_, loc) -> fprintf ppf "This comment contains an unterminated string literal@.\ %aString literal begins here" Location.print_error loc | Keyword_as_label kwd -> fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd | Invalid_literal s -> fprintf ppf "Invalid literal %s" s let () = Location.register_error_of_exn (function | Error (err, loc) -> Some (Location.error_of_printer loc report_error err) | _ -> None ) } let newline = ('\013'* '\010') let blank = [' ' '\009' '\012'] let lowercase = ['a'-'z' '_'] let uppercase = ['A'-'Z'] let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222'] let identchar_latin1 = ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] let symbolchar = ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] let decimal_literal = ['0'-'9'] ['0'-'9' '_']* let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']* let oct_literal = '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']* let bin_literal = '0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']* let int_literal = decimal_literal | hex_literal | oct_literal | bin_literal let float_literal = ['0'-'9'] ['0'-'9' '_']* ('.' ['0'-'9' '_']* )? (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)? let hex_float_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']* ('.' ['0'-'9' 'A'-'F' 'a'-'f' '_']*)? (['p' 'P'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)? let literal_modifier = ['G'-'Z' 'g'-'z'] rule token = parse | "\\" newline { if not !escaped_newlines then raise (Error(Illegal_character (Lexing.lexeme_char lexbuf 0), Location.curr lexbuf)); update_loc lexbuf None 1 false 0; token lexbuf } | newline { update_loc lexbuf None 1 false 0; EOL } | blank + { token lexbuf } | "_" { UNDERSCORE } | "~" { TILDE } | "~" lowercase identchar * ':' { LABEL (get_label_name lexbuf) } | "~" lowercase_latin1 identchar_latin1 * ':' { warn_latin1 lexbuf; LABEL (get_label_name lexbuf) } | "?" { QUESTION } | "?" lowercase identchar * ':' { OPTLABEL (get_label_name lexbuf) } | "?" lowercase_latin1 identchar_latin1 * ':' { warn_latin1 lexbuf; OPTLABEL (get_label_name lexbuf) } | lowercase identchar * { let s = Lexing.lexeme lexbuf in try Hashtbl.find keyword_table s with Not_found -> LIDENT s } | lowercase_latin1 identchar_latin1 * { warn_latin1 lexbuf; LIDENT (Lexing.lexeme lexbuf) } | uppercase identchar * { UIDENT(Lexing.lexeme lexbuf) } (* No capitalized keywords *) | uppercase_latin1 identchar_latin1 * { warn_latin1 lexbuf; UIDENT(Lexing.lexeme lexbuf) } | int_literal { INT (Lexing.lexeme lexbuf, None) } | (int_literal as lit) (literal_modifier as modif) { INT (lit, Some modif) } | float_literal | hex_float_literal { FLOAT (Lexing.lexeme lexbuf, None) } | ((float_literal | hex_float_literal) as lit) (literal_modifier as modif) { FLOAT (lit, Some modif) } | (float_literal | hex_float_literal | int_literal) identchar+ { raise (Error(Invalid_literal (Lexing.lexeme lexbuf), Location.curr lexbuf)) } | "\"" { reset_string_buffer(); is_in_string := true; let string_start = lexbuf.lex_start_p in string_start_loc := Location.curr lexbuf; string lexbuf; is_in_string := false; lexbuf.lex_start_p <- string_start; STRING (get_stored_string(), None) } | "{" lowercase* "|" { reset_string_buffer(); let delim = Lexing.lexeme lexbuf in let delim = String.sub delim 1 (String.length delim - 2) in is_in_string := true; let string_start = lexbuf.lex_start_p in string_start_loc := Location.curr lexbuf; quoted_string delim lexbuf; is_in_string := false; lexbuf.lex_start_p <- string_start; STRING (get_stored_string(), Some delim) } | "\'" newline "\'" { update_loc lexbuf None 1 false 1; CHAR (Lexing.lexeme_char lexbuf 1) } | "\'" [^ '\\' '\'' '\010' '\013'] "\'" { CHAR(Lexing.lexeme_char lexbuf 1) } | "\'\\" ['\\' '\'' '\"' 'n' 't' 'b' 'r' ' '] "\'" { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) } | "\'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "\'" { CHAR(char_for_decimal_code lexbuf 2) } | "\'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "\'" { CHAR(char_for_hexadecimal_code lexbuf 3) } | "\'\\" _ { let l = Lexing.lexeme lexbuf in let esc = String.sub l 1 (String.length l - 1) in raise (Error(Illegal_escape esc, Location.curr lexbuf)) } | "(*" { let s, loc = with_comment_buffer comment lexbuf in COMMENT (s, loc) } | "(**" { let s, loc = with_comment_buffer comment lexbuf in DOCSTRING (Docstrings.docstring s loc) } | "(**" ('*'+) as stars { let s, loc = with_comment_buffer (fun lexbuf -> store_string ("*" ^ stars); comment lexbuf) lexbuf in COMMENT (s, loc) } | "(*)" { if !print_warnings then Location.prerr_warning (Location.curr lexbuf) Warnings.Comment_start; let s, loc = with_comment_buffer comment lexbuf in COMMENT (s, loc) } | "(*" ('*'*) as stars "*)" { COMMENT (stars, Location.curr lexbuf) } | "*)" { let loc = Location.curr lexbuf in Location.prerr_warning loc Warnings.Comment_not_end; lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1; let curpos = lexbuf.lex_curr_p in lexbuf.lex_curr_p <- { curpos with pos_cnum = curpos.pos_cnum - 1 }; STAR } | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']* ("\"" ([^ '\010' '\013' '\"' ] * as name) "\"")? [^ '\010' '\013'] * newline { update_loc lexbuf name (int_of_string num) true 0; token lexbuf } | "#" { SHARP } | "&" { AMPERSAND } | "&&" { AMPERAMPER } | "`" { BACKQUOTE } | "\'" { QUOTE } | "(" { LPAREN } | ")" { RPAREN } | "*" { STAR } | "," { COMMA } | "->" { MINUSGREATER } | "." { DOT } | ".." { DOTDOT } | ":" { COLON } | "::" { COLONCOLON } | ":=" { COLONEQUAL } | ":>" { COLONGREATER } | ";" { SEMI } | ";;" { SEMISEMI } | "<" { LESS } | "<-" { LESSMINUS } | "=" { EQUAL } | "[" { LBRACKET } | "[|" { LBRACKETBAR } | "[<" { LBRACKETLESS } | "[>" { LBRACKETGREATER } | "]" { RBRACKET } | "{" { LBRACE } | "{<" { LBRACELESS } | "|" { BAR } | "||" { BARBAR } | "|]" { BARRBRACKET } | ">" { GREATER } | ">]" { GREATERRBRACKET } | "}" { RBRACE } | ">}" { GREATERRBRACE } | "[@" { LBRACKETAT } | "[%" { LBRACKETPERCENT } | "[%%" { LBRACKETPERCENTPERCENT } | "[@@" { LBRACKETATAT } | "[@@@" { LBRACKETATATAT } | "!" { BANG } | "!=" { INFIXOP0 "!=" } | "+" { PLUS } | "+." { PLUSDOT } | "+=" { PLUSEQ } | "-" { MINUS } | "-." { MINUSDOT } | "!" symbolchar + { PREFIXOP(Lexing.lexeme lexbuf) } | ['~' '?'] symbolchar + { PREFIXOP(Lexing.lexeme lexbuf) } | ['=' '<' '>' '|' '&' '$'] symbolchar * { INFIXOP0(Lexing.lexeme lexbuf) } | ['@' '^'] symbolchar * { INFIXOP1(Lexing.lexeme lexbuf) } | ['+' '-'] symbolchar * { INFIXOP2(Lexing.lexeme lexbuf) } | "**" symbolchar * { INFIXOP4(Lexing.lexeme lexbuf) } | '%' { PERCENT } | ['*' '/' '%'] symbolchar * { INFIXOP3(Lexing.lexeme lexbuf) } | '#' (symbolchar | '#') + { SHARPOP(Lexing.lexeme lexbuf) } | eof { EOF } | _ { raise (Error(Illegal_character (Lexing.lexeme_char lexbuf 0), Location.curr lexbuf)) } and comment = parse "(*" { comment_start_loc := (Location.curr lexbuf) :: !comment_start_loc; store_lexeme lexbuf; comment lexbuf; } | "*)" { match !comment_start_loc with | [] -> assert false | [_] -> comment_start_loc := []; Location.curr lexbuf | _ :: l -> comment_start_loc := l; store_lexeme lexbuf; comment lexbuf; } | "\"" { string_start_loc := Location.curr lexbuf; store_string_char '\"'; is_in_string := true; begin try string lexbuf with Error (Unterminated_string, str_start) -> match !comment_start_loc with | [] -> assert false | loc :: _ -> let start = List.hd (List.rev !comment_start_loc) in comment_start_loc := []; raise (Error (Unterminated_string_in_comment (start, str_start), loc)) end; is_in_string := false; store_string_char '\"'; comment lexbuf } | "{" lowercase* "|" { let delim = Lexing.lexeme lexbuf in let delim = String.sub delim 1 (String.length delim - 2) in string_start_loc := Location.curr lexbuf; store_lexeme lexbuf; is_in_string := true; begin try quoted_string delim lexbuf with Error (Unterminated_string, str_start) -> match !comment_start_loc with | [] -> assert false | loc :: _ -> let start = List.hd (List.rev !comment_start_loc) in comment_start_loc := []; raise (Error (Unterminated_string_in_comment (start, str_start), loc)) end; is_in_string := false; store_string_char '|'; store_string delim; store_string_char '}'; comment lexbuf } | "\'\'" { store_lexeme lexbuf; comment lexbuf } | "\'" newline "\'" { update_loc lexbuf None 1 false 1; store_lexeme lexbuf; comment lexbuf } | "\'" [^ '\\' '\'' '\010' '\013' ] "\'" { store_lexeme lexbuf; comment lexbuf } | "\'\\" ['\\' '\"' '\'' 'n' 't' 'b' 'r' ' '] "\'" { store_lexeme lexbuf; comment lexbuf } | "\'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "\'" { store_lexeme lexbuf; comment lexbuf } | "\'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "\'" { store_lexeme lexbuf; comment lexbuf } | eof { match !comment_start_loc with | [] -> assert false | loc :: _ -> let start = List.hd (List.rev !comment_start_loc) in comment_start_loc := []; raise (Error (Unterminated_comment start, loc)) } | newline { update_loc lexbuf None 1 false 0; store_lexeme lexbuf; comment lexbuf } | _ { store_lexeme lexbuf; comment lexbuf } and string = parse '\"' { () } | '\\' newline ([' ' '\t'] * as space) { update_loc lexbuf None 1 false (String.length space); string lexbuf } | '\\' ['\\' '\'' '\"' 'n' 't' 'b' 'r' ' '] { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); string lexbuf } | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] { store_string_char(char_for_decimal_code lexbuf 1); string lexbuf } | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] { store_string_char(char_for_hexadecimal_code lexbuf 2); string lexbuf } | '\\' _ { if in_comment () then string lexbuf else begin (* Should be an error, but we are very lax. raise (Error (Illegal_escape (Lexing.lexeme lexbuf), Location.curr lexbuf)) *) let loc = Location.curr lexbuf in Location.prerr_warning loc Warnings.Illegal_backslash; store_string_char (Lexing.lexeme_char lexbuf 0); store_string_char (Lexing.lexeme_char lexbuf 1); string lexbuf end } | newline { if not (in_comment ()) then Location.prerr_warning (Location.curr lexbuf) Warnings.Eol_in_string; update_loc lexbuf None 1 false 0; store_lexeme lexbuf; string lexbuf } | eof { is_in_string := false; raise (Error (Unterminated_string, !string_start_loc)) } | _ { store_string_char(Lexing.lexeme_char lexbuf 0); string lexbuf } and quoted_string delim = parse | newline { update_loc lexbuf None 1 false 0; store_lexeme lexbuf; quoted_string delim lexbuf } | eof { is_in_string := false; raise (Error (Unterminated_string, !string_start_loc)) } | "|" lowercase* "}" { let edelim = Lexing.lexeme lexbuf in let edelim = String.sub edelim 1 (String.length edelim - 2) in if delim = edelim then () else (store_lexeme lexbuf; quoted_string delim lexbuf) } | _ { store_string_char(Lexing.lexeme_char lexbuf 0); quoted_string delim lexbuf } and skip_sharp_bang = parse | "#!" [^ '\n']* '\n' [^ '\n']* "\n!#\n" { update_loc lexbuf None 3 false 0 } | "#!" [^ '\n']* '\n' { update_loc lexbuf None 1 false 0 } | "" { () } { let token_with_comments lexbuf = match !preprocessor with | None -> token lexbuf | Some (_init, preprocess) -> preprocess token lexbuf type newline_state = | NoLine (* There have been no blank lines yet. *) | NewLine (* There have been no blank lines, and the previous token was a newline. *) | BlankLine (* There have been blank lines. *) type doc_state = | Initial (* There have been no docstrings yet *) | After of docstring list (* There have been docstrings, none of which were preceeded by a blank line *) | Before of docstring list * docstring list * docstring list (* There have been docstrings, some of which were preceeded by a blank line *) and docstring = Docstrings.docstring let token lexbuf = let post_pos = lexeme_end_p lexbuf in let attach lines docs pre_pos = let open Docstrings in match docs, lines with | Initial, _ -> () | After a, (NoLine | NewLine) -> set_post_docstrings post_pos (List.rev a); set_pre_docstrings pre_pos a; | After a, BlankLine -> set_post_docstrings post_pos (List.rev a); set_pre_extra_docstrings pre_pos (List.rev a) | Before(a, f, b), (NoLine | NewLine) -> set_post_docstrings post_pos (List.rev a); set_post_extra_docstrings post_pos (List.rev_append f (List.rev b)); set_floating_docstrings pre_pos (List.rev f); set_pre_extra_docstrings pre_pos (List.rev a); set_pre_docstrings pre_pos b | Before(a, f, b), BlankLine -> set_post_docstrings post_pos (List.rev a); set_post_extra_docstrings post_pos (List.rev_append f (List.rev b)); set_floating_docstrings pre_pos (List.rev_append f (List.rev b)); set_pre_extra_docstrings pre_pos (List.rev a) in let rec loop lines docs lexbuf = match token_with_comments lexbuf with | COMMENT (s, loc) -> add_comment (s, loc); let lines' = match lines with | NoLine -> NoLine | NewLine -> NoLine | BlankLine -> BlankLine in loop lines' docs lexbuf | EOL -> let lines' = match lines with | NoLine -> NewLine | NewLine -> BlankLine | BlankLine -> BlankLine in loop lines' docs lexbuf | DOCSTRING doc -> add_docstring_comment doc; let docs' = match docs, lines with | Initial, (NoLine | NewLine) -> After [doc] | Initial, BlankLine -> Before([], [], [doc]) | After a, (NoLine | NewLine) -> After (doc :: a) | After a, BlankLine -> Before (a, [], [doc]) | Before(a, f, b), (NoLine | NewLine) -> Before(a, f, doc :: b) | Before(a, f, b), BlankLine -> Before(a, b @ f, [doc]) in loop NoLine docs' lexbuf | tok -> attach lines docs (lexeme_start_p lexbuf); tok in loop NoLine Initial lexbuf let init () = is_in_string := false; comment_start_loc := []; comment_list := []; match !preprocessor with | None -> () | Some (init, _preprocess) -> init () let set_preprocessor init preprocess = escaped_newlines := true; preprocessor := Some (init, preprocess) }