(***********************************************************************) (* *) (* Objective Caml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 1996 Institut National de Recherche en Informatique et *) (* Automatique. Distributed only by permission. *) (* *) (***********************************************************************) (* $Id$ *) (* The lexer definition *) { open Misc open Parser type error = Illegal_character | Unterminated_comment | Unterminated_string exception Error of error * int * int (* For nested comments *) let comment_depth = ref 0 (* The table of keywords *) let keyword_table = create_hashtable 149 [ "and", AND; "as", AS; "begin", BEGIN; "class", CLASS; "closed", CLOSED; "constraint", CONSTRAINT; "do", DO; "done", DONE; "downto", DOWNTO; "else", ELSE; "end", END; "exception", EXCEPTION; "external", EXTERNAL; "false", FALSE; "for", FOR; "fun", FUN; "function", FUNCTION; "functor", FUNCTOR; "if", IF; "in", IN; "include", INCLUDE; "inherit", INHERIT; "let", LET; "match", MATCH; "method", METHOD; "module", MODULE; "mutable", MUTABLE; "new", NEW; "of", OF; "open", OPEN; "or", OR; "parser", PARSER; "private", PRIVATE; "rec", REC; "sig", SIG; "struct", STRUCT; "then", THEN; "to", TO; "true", TRUE; "try", TRY; "type", TYPE; "val", VAL; "virtual", VIRTUAL; "when", WHEN; "while", WHILE; "with", WITH; "mod", INFIXOP3("mod"); "land", INFIXOP3("land"); "lor", INFIXOP3("lor"); "lxor", INFIXOP3("lxor"); "lsl", INFIXOP4("lsl"); "lsr", INFIXOP4("lsr"); "asr", INFIXOP4("asr") ] (* To buffer string literals *) let initial_string_buffer = String.create 256 let string_buff = ref initial_string_buffer let string_index = ref 0 let reset_string_buffer () = string_buff := initial_string_buffer; string_index := 0 let store_string_char c = if !string_index >= String.length (!string_buff) then begin let new_buff = String.create (String.length (!string_buff) * 2) in String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff)); string_buff := new_buff end; String.unsafe_set (!string_buff) (!string_index) c; incr string_index let get_stored_string () = let s = String.sub (!string_buff) 0 (!string_index) in string_buff := initial_string_buffer; s (* To translate escape sequences *) let char_for_backslash = match Sys.os_type with | "Unix" | "Win32" -> begin function | 'n' -> '\010' | 'r' -> '\013' | 'b' -> '\008' | 't' -> '\009' | c -> c end | "MacOS" -> begin function | 'n' -> '\013' | 'r' -> '\010' | 'b' -> '\008' | 't' -> '\009' | c -> c end | x -> fatal_error "Lexer: unknown system type" ;; let char_for_decimal_code lexbuf i = let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in Char.chr(c land 0xFF) (* To store the position of the beginning of a string or comment *) let start_pos = ref 0 (* Error report *) open Format let report_error = function Illegal_character -> print_string "Illegal character" | Unterminated_comment -> print_string "Comment not terminated" | Unterminated_string -> print_string "String literal not terminated" } rule token = parse [' ' '\010' '\013' '\009' '\012'] + { token lexbuf } | ['a'-'z' '\223'-'\246' '\248'-'\255' ] (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9' ]) * { let s = Lexing.lexeme lexbuf in try Hashtbl.find keyword_table s with Not_found -> LIDENT s } | ['A'-'Z' '\192'-'\214' '\216'-'\222' ] (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9' ]) * { UIDENT(Lexing.lexeme lexbuf) } (* No capitalized keywords *) | ['0'-'9']+ | '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+ | '0' ['o' 'O'] ['0'-'7']+ | '0' ['b' 'B'] ['0'-'1']+ { INT (int_of_string(Lexing.lexeme lexbuf)) } | ['0'-'9']+ ('.' ['0'-'9']*)? (['e' 'E'] ['+' '-']? ['0'-'9']+)? { FLOAT (Lexing.lexeme lexbuf) } | "\"" { reset_string_buffer(); let string_start = Lexing.lexeme_start lexbuf in start_pos := string_start; string lexbuf; lexbuf.Lexing.lex_start_pos <- string_start - lexbuf.Lexing.lex_abs_pos; STRING (get_stored_string()) } | "'" [^ '\\' '\''] "'" { CHAR(Lexing.lexeme_char lexbuf 1) } | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) } | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" { CHAR(char_for_decimal_code lexbuf 2) } | "(*" { comment_depth := 1; start_pos := Lexing.lexeme_start lexbuf; comment lexbuf; token lexbuf } | "#" { SHARP } | "&" { AMPERSAND } | "&&" { AMPERAMPER } | "'" { QUOTE } | "(" { LPAREN } | ")" { RPAREN } | "*" { STAR } | "," { COMMA } | "?" { QUESTION } | "->" { MINUSGREATER } | "." { DOT } | ".." { DOTDOT } | ":" { COLON } | "::" { COLONCOLON } | ":=" { COLONEQUAL } | ":>" { COLONGREATER } | ";" { SEMI } | ";;" { SEMISEMI } | "<" { LESS } | "<-" { LESSMINUS } | "=" { EQUAL } | "[" { LBRACKET } | "[|" { LBRACKETBAR } | "[<" { LBRACKETLESS } | "]" { RBRACKET } | "_" { UNDERSCORE } | "{" { LBRACE } | "{<" { LBRACELESS } | "|" { BAR } | "||" { BARBAR } | "|]" { BARRBRACKET } | ">" { GREATER } | ">]" { GREATERRBRACKET } | "}" { RBRACE } | ">}" { GREATERRBRACE } | "!=" { INFIXOP0 "!=" } | "-" { SUBTRACTIVE "-" } | "-." { SUBTRACTIVE "-." } | ['!' '?' '~'] ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] * { PREFIXOP(Lexing.lexeme lexbuf) } | ['=' '<' '>' '|' '&' '$'] ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] * { INFIXOP0(Lexing.lexeme lexbuf) } | ['@' '^'] ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] * { INFIXOP1(Lexing.lexeme lexbuf) } | ['+' '-'] ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] * { INFIXOP2(Lexing.lexeme lexbuf) } | "**" ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] * { INFIXOP4(Lexing.lexeme lexbuf) } | ['*' '/' '%'] ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] * { INFIXOP3(Lexing.lexeme lexbuf) } | eof { EOF } | _ { raise (Error(Illegal_character, Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) } and comment = parse "(*" { comment_depth := succ !comment_depth; comment lexbuf } | "*)" { comment_depth := pred !comment_depth; if !comment_depth > 0 then comment lexbuf } | "\"" { reset_string_buffer(); start_pos := Lexing.lexeme_start lexbuf; string lexbuf; string_buff := initial_string_buffer; comment lexbuf } | "''" { comment lexbuf } | "'" [^ '\\' '\''] "'" { comment lexbuf } | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'" { comment lexbuf } | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" { comment lexbuf } | eof { raise (Error(Unterminated_comment, !start_pos, !start_pos+2)) } | _ { comment lexbuf } and string = parse '"' { () } | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] * { string lexbuf } | '\\' ['\\' '"' 'n' 't' 'b' 'r'] { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); string lexbuf } | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] { store_string_char(char_for_decimal_code lexbuf 1); string lexbuf } | eof { raise (Error(Unterminated_string, !start_pos, !start_pos+1)) } | _ { store_string_char(Lexing.lexeme_char lexbuf 0); string lexbuf }