(***********************************************************************)
(*                                                                     *)
(*                           Objective Caml                            *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 1996 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  All rights reserved.  This file is distributed    *)
(*  under the terms of the Q Public License version 1.0.               *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(* The lexer definition *)

{
open Misc

(* The tokens produced by the lexer.  The order of the constructors is
   kept exactly as it was: do not reorder them. *)
type token =
    AMPERAMPER
  | AMPERSAND
  | AND
  | AS
  | ASSERT
  | BACKQUOTE
  | BAR
  | BARBAR
  | BARRBRACKET
  | BEGIN
  | CHAR of (char)
  | CLASS
  | COLON
  | COLONCOLON
  | COLONEQUAL
  | COLONGREATER
  | COMMA
  | CONSTRAINT
  | DO
  | DONE
  | DOT
  | DOTDOT
  | DOWNTO
  | ELSE
  | END
  | EOF
  | EQUAL
  | EXCEPTION
  | EXTERNAL
  | FALSE
  | FLOAT of (string)
  | FOR
  | FUN
  | FUNCTION
  | FUNCTOR
  | GREATER
  | GREATERRBRACE
  | GREATERRBRACKET
  | IF
  | IN
  | INCLUDE
  | INFIXOP0 of (string)
  | INFIXOP1 of (string)
  | INFIXOP2 of (string)
  | INFIXOP3 of (string)
  | INFIXOP4 of (string)
  | INHERIT
  | INITIALIZER
  | INT of (int)
  | LABEL of (string)
  | LAZY
  | LBRACE
  | LBRACELESS
  | LBRACKET
  | LBRACKETBAR
  | LBRACKETLESS
  | LESS
  | LESSMINUS
  | LET
  | LIDENT of (string)
  | LPAREN
  | MATCH
  | METHOD
  | MINUS
  | MINUSDOT
  | MINUSGREATER
  | MODULE
  | MUTABLE
  | NEW
  | OBJECT
  | OF
  | OPEN
  | OPTLABEL of (string)
  | OR
  | PARSER
  | PLUS
  | PREFIXOP of (string)
  | PRIVATE
  | QUESTION
  | QUESTION2
  | QUOTE
  | RBRACE
  | RBRACKET
  | REC
  | RPAREN
  | SEMI
  | SEMISEMI
  | SHARP
  | SIG
  | STAR
  | STRING of (string)
  | STRUCT
  | THEN
  | TILDE
  | TO
  | TRUE
  | TRY
  | TYPE
  | UIDENT of (string)
  | UNDERSCORE
  | VAL
  | VIRTUAL
  | WHEN
  | WHILE
  | WITH

(* The lexical errors that can be reported. *)
type error =
  | Illegal_character of char
  | Unterminated_comment
  | Unterminated_string
  | Unterminated_string_in_comment
  | Keyword_as_label of string
;;

(* A lexical error together with two character offsets.  Every raise in
   this file passes a start offset followed by an end offset. *)
exception Error of error * int * int

(* The table of keywords *)

(* create_hashtable is not defined in this file; presumably it comes
   from Misc, which is opened above.  The alphabetic operators mod,
   land, lor, lxor, lsl, lsr and asr are entered here as infix
   operator tokens rather than as dedicated keywords. *)
let keyword_table =
  create_hashtable 149 [
    "and", AND;
    "as", AS;
    "assert", ASSERT;
    "begin", BEGIN;
    "class", CLASS;
    "constraint", CONSTRAINT;
    "do", DO;
    "done", DONE;
    "downto", DOWNTO;
    "else", ELSE;
    "end", END;
    "exception", EXCEPTION;
    "external", EXTERNAL;
    "false", FALSE;
    "for", FOR;
    "fun", FUN;
    "function", FUNCTION;
    "functor", FUNCTOR;
    "if", IF;
    "in", IN;
    "include", INCLUDE;
    "inherit", INHERIT;
    "initializer", INITIALIZER;
    "lazy", LAZY;
    "let", LET;
    "match", MATCH;
    "method", METHOD;
    "module", MODULE;
    "mutable", MUTABLE;
    "new", NEW;
    "object", OBJECT;
    "of", OF;
    "open", OPEN;
    "or", OR;
    "parser", PARSER;
    "private", PRIVATE;
    "rec", REC;
    "sig", SIG;
    "struct", STRUCT;
    "then", THEN;
    "to", TO;
    "true", TRUE;
    "try", TRY;
    "type", TYPE;
    "val", VAL;
    "virtual", VIRTUAL;
    "when", WHEN;
    "while", WHILE;
    "with", WITH;

    "mod", INFIXOP3("mod");
    "land", INFIXOP3("land");
    "lor", INFIXOP3("lor");
    "lxor", INFIXOP3("lxor");
    "lsl", INFIXOP4("lsl");
    "lsr", INFIXOP4("lsr");
    "asr", INFIXOP4("asr")
  ]

(* To buffer string literals *)

(* A 256-character buffer is allocated once.  string_buff points either
   to it or to a larger buffer allocated by store_string_char. *)
let initial_string_buffer = String.create 256
let string_buff = ref initial_string_buffer
let string_index = ref 0

(* Start a new literal: go back to the initial buffer and rewind the
   write index. *)
let reset_string_buffer () =
  string_buff := initial_string_buffer;
  string_index := 0

(* Append character c to the literal being built.  When the buffer is
   full, a buffer of twice the size is allocated and the old contents
   are copied into it first. *)
let store_string_char c =
  if !string_index >= String.length (!string_buff) then begin
    let new_buff = String.create (String.length (!string_buff) * 2) in
    String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
    string_buff := new_buff
  end;
  (* The bounds check above makes the unchecked store safe. *)
  String.unsafe_set (!string_buff) (!string_index) c;
  incr string_index

(* Return a fresh copy of the characters stored so far, and point
   string_buff back at the initial buffer so that a grown buffer is no
   longer referenced from here. *)
let get_stored_string () =
  let s = String.sub (!string_buff) 0 (!string_index) in
  string_buff := initial_string_buffer;
  s

(* To translate escape sequences *)

(* The function is selected once, when this module is initialised,
   from Sys.os_type.  The MacOS variant swaps the codes used for the
   n and r escapes.  Any character other than n, r, b and t is
   returned unchanged.  An unrecognised system type goes through
   fatal_error, which is not defined in this file (presumably Misc). *)
let char_for_backslash =
  match Sys.os_type with
  | "Unix" | "Win32" | "Cygwin" ->
      begin function
      | 'n' -> '\010'
      | 'r' -> '\013'
      | 'b' -> '\008'
      | 't' -> '\009'
      | c   -> c
      end
  | "MacOS" ->
      begin function
      | 'n' -> '\013'
      | 'r' -> '\010'
      | 'b' -> '\008'
      | 't' -> '\009'
      | c   -> c
      end
  | _ -> fatal_error "Lexer: unknown system type"

(* Decode the three decimal digits found at offsets i, i+1 and i+2 of
   the current lexeme.  48 is the character code of the digit zero.
   The result is masked with 0xFF, so a code above 255 wraps around
   instead of being rejected. *)
let char_for_decimal_code lexbuf i =
  let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
           10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
                (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in
  Char.chr(c land 0xFF)

(* To store the position of the beginning of a string and comment *)

(* string_start_pos is the offset of the opening quote of the string
   being read.  comment_start_pos is a stack of the start offsets of
   the comments currently open, innermost first; it is empty when the
   lexer is not inside a comment. *)
let string_start_pos = ref 0;;
let comment_start_pos = ref [];;
let in_comment () = !comment_start_pos <> [];;

(* Error report *)

open Format

(* Print a human-readable message for a lexical error on formatter ppf. *)
let report_error ppf = function
  | Illegal_character c ->
      fprintf ppf "Illegal character (%s)" (Char.escaped c)
  | Unterminated_comment ->
      fprintf ppf "Comment not terminated"
  | Unterminated_string ->
      fprintf ppf "String literal not terminated"
  | Unterminated_string_in_comment ->
      fprintf ppf "This comment contains an unterminated string literal"
  | Keyword_as_label kwd ->
      fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd
;;

}

(* Character classes.  The numeric ranges extend the ASCII letters with
   character codes 192 to 255, leaving out 215 and 247. *)
let blank = [' ' '\010' '\013' '\009' '\012']
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let identchar =
  ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
let symbolchar =
  ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']

(* Numeric literals.  float_literal also matches a plain run of digits;
   such input yields INT because the integer rule is listed first and
   ocamllex breaks ties between equally long matches by rule order. *)
let decimal_literal = ['0'-'9']+
let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
let oct_literal = '0' ['o' 'O'] ['0'-'7']+
let bin_literal = '0' ['b' 'B'] ['0'-'1']+
let float_literal =
  ['0'-'9']+ ('.' ['0'-'9']* )? (['e' 'E'] ['+' '-']? ['0'-'9']+)?

(* Main entry point: return the next token.  The order of the cases
   matters, since it decides between matches of equal length. *)
rule token = parse
    blank +
      { token lexbuf }
  | "_"
      { UNDERSCORE }
  | "~"
      { TILDE }
  (* Label: strip the leading tilde and the trailing colon, and refuse
     keywords. *)
  | "~" lowercase identchar * ':'
      { let s = Lexing.lexeme lexbuf in
        let name = String.sub s 1 (String.length s - 2) in
        if Hashtbl.mem keyword_table name then
          raise (Error(Keyword_as_label name,
                       Lexing.lexeme_start lexbuf,
                       Lexing.lexeme_end lexbuf));
        LABEL name }
  | "?"
      { QUESTION }
  (* Optional label: same treatment as for plain labels. *)
  | "?" lowercase identchar * ':'
      { let s = Lexing.lexeme lexbuf in
        let name = String.sub s 1 (String.length s - 2) in
        if Hashtbl.mem keyword_table name then
          raise (Error(Keyword_as_label name,
                       Lexing.lexeme_start lexbuf,
                       Lexing.lexeme_end lexbuf));
        OPTLABEL name }
  (* Lowercase identifier, unless it is listed in the keyword table. *)
  | lowercase identchar *
      { let s = Lexing.lexeme lexbuf in
        try
          Hashtbl.find keyword_table s
        with Not_found ->
          LIDENT s }
  | uppercase identchar *
      { UIDENT(Lexing.lexeme lexbuf) }       (* No capitalized keywords *)
  | decimal_literal | hex_literal | oct_literal | bin_literal
      { INT (int_of_string(Lexing.lexeme lexbuf)) }
  | float_literal
      { FLOAT (Lexing.lexeme lexbuf) }
  (* String literal: the body is read by the string rule below.  The
     start position of the lexeme is then put back on the opening
     quote, since the sub-lexer has moved it. *)
  | "\""
      { reset_string_buffer();
        let string_start = Lexing.lexeme_start lexbuf in
        string_start_pos := string_start;
        string lexbuf;
        lexbuf.Lexing.lex_start_pos <-
          string_start - lexbuf.Lexing.lex_abs_pos;
        STRING (get_stored_string()) }
  (* Character literals: plain, backslash escape, decimal escape. *)
  | "'" [^ '\\' '\''] "'"
      { CHAR(Lexing.lexeme_char lexbuf 1) }
  | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
  | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { CHAR(char_for_decimal_code lexbuf 2) }
  (* Start of a comment: skip it, then resume lexing. *)
  | "(*"
      { comment_start_pos := [Lexing.lexeme_start lexbuf];
        comment lexbuf;
        token lexbuf }
  (* A comment opener immediately followed by a closing parenthesis is
     taken as the start of a comment, with a warning. *)
  | "(*)"
      { let loc = Location.curr lexbuf
        and warn = Warnings.Comment "the start of a comment"
        in
        Location.prerr_warning loc warn;
        comment_start_pos := [Lexing.lexeme_start lexbuf];
        comment lexbuf;
        token lexbuf }
  (* A stray comment terminator outside any comment: warn, move the
     current position back by one character so that only the star is
     consumed, and return STAR. *)
  | "*)"
      { let loc = Location.curr lexbuf
        and warn = Warnings.Comment "not the end of a comment"
        in
        Location.prerr_warning loc warn;
        lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1;
        STAR }
  | "#" [' ' '\t']* ['0'-'9']+ [^ '\n' '\r'] * ('\n' | '\r' | "\r\n")
      (* # linenum ...  *)
      { token lexbuf }
  | "#"  { SHARP }
  | "&"  { AMPERSAND }
  | "&&" { AMPERAMPER }
  | "`"  { BACKQUOTE }
  | "'"  { QUOTE }
  | "("  { LPAREN }
  | ")"  { RPAREN }
  | "*"  { STAR }
  | ","  { COMMA }
  | "??" { QUESTION2 }
  | "->" { MINUSGREATER }
  | "."  { DOT }
  | ".." { DOTDOT }
  | ":"  { COLON }
  | "::" { COLONCOLON }
  | ":=" { COLONEQUAL }
  | ":>" { COLONGREATER }
  | ";"  { SEMI }
  | ";;" { SEMISEMI }
  | "<"  { LESS }
  | "<-" { LESSMINUS }
  | "="  { EQUAL }
  | "["  { LBRACKET }
  | "[|" { LBRACKETBAR }
  | "[<" { LBRACKETLESS }
  | "]"  { RBRACKET }
  | "{"  { LBRACE }
  | "{<" { LBRACELESS }
  | "|"  { BAR }
  | "||" { BARBAR }
  | "|]" { BARRBRACKET }
  | ">"  { GREATER }
  | ">]" { GREATERRBRACKET }
  | "}"  { RBRACE }
  | ">}" { GREATERRBRACE }

  | "!=" { INFIXOP0 "!=" }
  | "+"  { PLUS }
  | "-"  { MINUS }
  | "-." { MINUSDOT }

  (* Generic operators, classified by their leading characters. *)
  | "!" symbolchar *
      { PREFIXOP(Lexing.lexeme lexbuf) }
  | ['~' '?'] symbolchar +
      { PREFIXOP(Lexing.lexeme lexbuf) }
  | ['=' '<' '>' '|' '&' '$'] symbolchar *
      { INFIXOP0(Lexing.lexeme lexbuf) }
  | ['@' '^'] symbolchar *
      { INFIXOP1(Lexing.lexeme lexbuf) }
  | ['+' '-'] symbolchar *
      { INFIXOP2(Lexing.lexeme lexbuf) }
  | "**" symbolchar *
      { INFIXOP4(Lexing.lexeme lexbuf) }
  | ['*' '/' '%'] symbolchar *
      { INFIXOP3(Lexing.lexeme lexbuf) }
  | eof { EOF }
  | _
      { raise (Error(Illegal_character ((Lexing.lexeme lexbuf).[0]),
                     Lexing.lexeme_start lexbuf,
                     Lexing.lexeme_end lexbuf)) }

(* Skip the body of a comment, nested comments included.  On entry
   comment_start_pos must be non-empty.  The rule returns unit once the
   outermost comment is closed, and leaves comment_start_pos empty.
   Both error exits also empty comment_start_pos before raising, so
   that in_comment does not keep answering true after a failed
   comment. *)
and comment = parse
    "(*"
      { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
        comment lexbuf }
  | "*)"
      { match !comment_start_pos with
        | [] -> assert false
        (* Closing the outermost comment: done. *)
        | [_] -> comment_start_pos := []
        (* Closing a nested comment: pop it and carry on. *)
        | _ :: l -> comment_start_pos := l;
                    comment lexbuf }
  (* String literals inside comments are lexed as strings, so that a
     comment terminator inside such a string does not close the
     comment.  The characters collected are discarded. *)
  | "\""
      { reset_string_buffer();
        string_start_pos := Lexing.lexeme_start lexbuf;
        begin try string lexbuf
        with Error (Unterminated_string, _, _) ->
          (* Report the start of the innermost open comment. *)
          let st = List.hd !comment_start_pos in
          comment_start_pos := [];
          raise (Error (Unterminated_string_in_comment, st, st + 2))
        end;
        string_buff := initial_string_buffer;
        comment lexbuf }
  (* Character literals are skipped as a whole, so that a quote
     character written as a character literal does not start a
     string. *)
  | "''"
      { comment lexbuf }
  | "'" [^ '\\' '\''] "'"
      { comment lexbuf }
  | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { comment lexbuf }
  | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { comment lexbuf }
  | eof
      { (* Report the start of the innermost open comment. *)
        let st = List.hd !comment_start_pos in
        comment_start_pos := [];
        raise (Error (Unterminated_comment, st, st + 2)) }
  | _
      { comment lexbuf }

(* Read the body of a string literal, up to and including the closing
   quote, accumulating the decoded characters with store_string_char.
   The caller is expected to have called reset_string_buffer and set
   string_start_pos beforehand. *)
and string = parse
    '"'
      { () }
  (* Backslash followed by a line break: skip the line break and the
     blanks that follow it. *)
  | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] *
      { string lexbuf }
  | '\\' ['\\' '"' 'n' 't' 'b' 'r']
      { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
        string lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      { store_string_char(char_for_decimal_code lexbuf 1);
        string lexbuf }
  | eof
      { raise (Error (Unterminated_string,
                      !string_start_pos, !string_start_pos+1)) }
  | _
      { store_string_char(Lexing.lexeme_char lexbuf 0);
        string lexbuf }