ocaml/parsing/lexer.mll

515 lines
15 KiB
OCaml

(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 1996 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed *)
(* under the terms of the Q Public License version 1.0. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* The lexer definition *)
{
open Lexing
open Misc
open Parser
type error =
| Illegal_character of char
| Illegal_escape of string
| Unterminated_comment
| Unterminated_string
| Unterminated_string_in_comment
| Keyword_as_label of string
| Literal_overflow of string
;;
exception Error of error * Location.t;;
(* The table of keywords *)
let keyword_table =
create_hashtable 149 [
"and", AND;
"as", AS;
"assert", ASSERT;
"begin", BEGIN;
"class", CLASS;
"constraint", CONSTRAINT;
"do", DO;
"done", DONE;
"downto", DOWNTO;
"else", ELSE;
"end", END;
"exception", EXCEPTION;
"external", EXTERNAL;
"false", FALSE;
"for", FOR;
"fun", FUN;
"function", FUNCTION;
"functor", FUNCTOR;
"if", IF;
"in", IN;
"include", INCLUDE;
"inherit", INHERIT;
"initializer", INITIALIZER;
"lazy", LAZY;
"let", LET;
"match", MATCH;
"method", METHOD;
"module", MODULE;
"mutable", MUTABLE;
"new", NEW;
"object", OBJECT;
"of", OF;
"open", OPEN;
"or", OR;
(* "parser", PARSER; *)
"private", PRIVATE;
"rec", REC;
"sig", SIG;
"struct", STRUCT;
"then", THEN;
"to", TO;
"true", TRUE;
"try", TRY;
"type", TYPE;
"val", VAL;
"virtual", VIRTUAL;
"when", WHEN;
"while", WHILE;
"with", WITH;
"mod", INFIXOP3("mod");
"land", INFIXOP3("land");
"lor", INFIXOP3("lor");
"lxor", INFIXOP3("lxor");
"lsl", INFIXOP4("lsl");
"lsr", INFIXOP4("lsr");
"asr", INFIXOP4("asr")
]
(* To buffer string literals *)
let initial_string_buffer = String.create 256
let string_buff = ref initial_string_buffer
let string_index = ref 0
let reset_string_buffer () =
string_buff := initial_string_buffer;
string_index := 0
let store_string_char c =
if !string_index >= String.length (!string_buff) then begin
let new_buff = String.create (String.length (!string_buff) * 2) in
String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
string_buff := new_buff
end;
String.unsafe_set (!string_buff) (!string_index) c;
incr string_index
let get_stored_string () =
let s = String.sub (!string_buff) 0 (!string_index) in
string_buff := initial_string_buffer;
s
(* To store the position of the beginning of a string and comment *)
let string_start_loc = ref Location.none;;
let comment_start_loc = ref [];;
let in_comment () = !comment_start_loc <> [];;
(* To translate escape sequences *)
let char_for_backslash = function
| 'n' -> '\010'
| 'r' -> '\013'
| 'b' -> '\008'
| 't' -> '\009'
| c -> c
let char_for_decimal_code lexbuf i =
let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
(Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in
if (c < 0 || c > 255) then
if in_comment ()
then 'x'
else raise (Error(Illegal_escape (Lexing.lexeme lexbuf),
Location.curr lexbuf))
else Char.chr c
let char_for_hexadecimal_code lexbuf i =
let d1 = Char.code (Lexing.lexeme_char lexbuf i) in
let val1 = if d1 >= 97 then d1 - 87
else if d1 >= 65 then d1 - 55
else d1 - 48
in
let d2 = Char.code (Lexing.lexeme_char lexbuf (i+1)) in
let val2 = if d2 >= 97 then d2 - 87
else if d2 >= 65 then d2 - 55
else d2 - 48
in
Char.chr (val1 * 16 + val2)
(* To convert integer literals, allowing max_int + 1 (PR#4210) *)
let cvt_int_literal s =
- int_of_string ("-" ^ s)
let cvt_int32_literal s =
Int32.neg (Int32.of_string ("-" ^ String.sub s 0 (String.length s - 1)))
let cvt_int64_literal s =
Int64.neg (Int64.of_string ("-" ^ String.sub s 0 (String.length s - 1)))
let cvt_nativeint_literal s =
Nativeint.neg (Nativeint.of_string ("-" ^ String.sub s 0 (String.length s - 1)))
(* Remove underscores from float literals *)
let remove_underscores s =
let l = String.length s in
let rec remove src dst =
if src >= l then
if dst >= l then s else String.sub s 0 dst
else
match s.[src] with
'_' -> remove (src + 1) dst
| c -> s.[dst] <- c; remove (src + 1) (dst + 1)
in remove 0 0
(* Update the current location with file name and line number. *)
let update_loc lexbuf file line absolute chars =
let pos = lexbuf.lex_curr_p in
let new_file = match file with
| None -> pos.pos_fname
| Some s -> s
in
lexbuf.lex_curr_p <- { pos with
pos_fname = new_file;
pos_lnum = if absolute then line else pos.pos_lnum + line;
pos_bol = pos.pos_cnum - chars;
}
;;
(* Error report *)
open Format
let report_error ppf = function
| Illegal_character c ->
fprintf ppf "Illegal character (%s)" (Char.escaped c)
| Illegal_escape s ->
fprintf ppf "Illegal backslash escape in string or character (%s)" s
| Unterminated_comment ->
fprintf ppf "Comment not terminated"
| Unterminated_string ->
fprintf ppf "String literal not terminated"
| Unterminated_string_in_comment ->
fprintf ppf "This comment contains an unterminated string literal"
| Keyword_as_label kwd ->
fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd
| Literal_overflow ty ->
fprintf ppf "Integer literal exceeds the range of representable integers of type %s" ty
;;
}
let newline = ('\010' | '\013' | "\013\010")
let blank = [' ' '\009' '\012']
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let identchar =
['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
let symbolchar =
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
let decimal_literal =
['0'-'9'] ['0'-'9' '_']*
let hex_literal =
'0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
'0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal =
'0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
decimal_literal | hex_literal | oct_literal | bin_literal
let float_literal =
['0'-'9'] ['0'-'9' '_']*
('.' ['0'-'9' '_']* )?
(['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?
rule token = parse
| newline
{ update_loc lexbuf None 1 false 0;
token lexbuf
}
| blank +
{ token lexbuf }
| "_"
{ UNDERSCORE }
| "~"
{ TILDE }
| "~" lowercase identchar * ':'
{ let s = Lexing.lexeme lexbuf in
let name = String.sub s 1 (String.length s - 2) in
if Hashtbl.mem keyword_table name then
raise (Error(Keyword_as_label name, Location.curr lexbuf));
LABEL name }
| "?" { QUESTION }
| "??" { QUESTIONQUESTION }
| "?" lowercase identchar * ':'
{ let s = Lexing.lexeme lexbuf in
let name = String.sub s 1 (String.length s - 2) in
if Hashtbl.mem keyword_table name then
raise (Error(Keyword_as_label name, Location.curr lexbuf));
OPTLABEL name }
| lowercase identchar *
{ let s = Lexing.lexeme lexbuf in
try
Hashtbl.find keyword_table s
with Not_found ->
LIDENT s }
| uppercase identchar *
{ UIDENT(Lexing.lexeme lexbuf) } (* No capitalized keywords *)
| int_literal
{ try
INT (cvt_int_literal (Lexing.lexeme lexbuf))
with Failure _ ->
raise (Error(Literal_overflow "int", Location.curr lexbuf))
}
| float_literal
{ FLOAT (remove_underscores(Lexing.lexeme lexbuf)) }
| int_literal "l"
{ try
INT32 (cvt_int32_literal (Lexing.lexeme lexbuf))
with Failure _ ->
raise (Error(Literal_overflow "int32", Location.curr lexbuf)) }
| int_literal "L"
{ try
INT64 (cvt_int64_literal (Lexing.lexeme lexbuf))
with Failure _ ->
raise (Error(Literal_overflow "int64", Location.curr lexbuf)) }
| int_literal "n"
{ try
NATIVEINT (cvt_nativeint_literal (Lexing.lexeme lexbuf))
with Failure _ ->
raise (Error(Literal_overflow "nativeint", Location.curr lexbuf)) }
| "\""
{ reset_string_buffer();
let string_start = lexbuf.lex_start_p in
string_start_loc := Location.curr lexbuf;
string lexbuf;
lexbuf.lex_start_p <- string_start;
STRING (get_stored_string()) }
| "'" newline "'"
{ update_loc lexbuf None 1 false 1;
CHAR (Lexing.lexeme_char lexbuf 1) }
| "'" [^ '\\' '\'' '\010' '\013'] "'"
{ CHAR(Lexing.lexeme_char lexbuf 1) }
| "'\\" ['\\' '\'' '"' 'n' 't' 'b' 'r' ' '] "'"
{ CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
| "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
{ CHAR(char_for_decimal_code lexbuf 2) }
| "'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"
{ CHAR(char_for_hexadecimal_code lexbuf 3) }
| "'\\" _
{ let l = Lexing.lexeme lexbuf in
let esc = String.sub l 1 (String.length l - 1) in
raise (Error(Illegal_escape esc, Location.curr lexbuf))
}
| "(*"
{ comment_start_loc := [Location.curr lexbuf];
comment lexbuf;
token lexbuf }
| "(*)"
{ let loc = Location.curr lexbuf in
Location.prerr_warning loc Warnings.Comment_start;
comment_start_loc := [Location.curr lexbuf];
comment lexbuf;
token lexbuf
}
| "*)"
{ let loc = Location.curr lexbuf in
Location.prerr_warning loc Warnings.Comment_not_end;
lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1;
let curpos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- { curpos with pos_cnum = curpos.pos_cnum - 1 };
STAR
}
| "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
("\"" ([^ '\010' '\013' '"' ] * as name) "\"")?
[^ '\010' '\013'] * newline
{ update_loc lexbuf name (int_of_string num) true 0;
token lexbuf
}
| "#" { SHARP }
| "&" { AMPERSAND }
| "&&" { AMPERAMPER }
| "`" { BACKQUOTE }
| "'" { QUOTE }
| "(" { LPAREN }
| ")" { RPAREN }
| "*" { STAR }
| "," { COMMA }
| "->" { MINUSGREATER }
| "." { DOT }
| ".." { DOTDOT }
| ":" { COLON }
| "::" { COLONCOLON }
| ":=" { COLONEQUAL }
| ":>" { COLONGREATER }
| ";" { SEMI }
| ";;" { SEMISEMI }
| "<" { LESS }
| "<-" { LESSMINUS }
| "=" { EQUAL }
| "[" { LBRACKET }
| "[|" { LBRACKETBAR }
| "[<" { LBRACKETLESS }
| "[>" { LBRACKETGREATER }
| "]" { RBRACKET }
| "{" { LBRACE }
| "{<" { LBRACELESS }
| "|" { BAR }
| "||" { BARBAR }
| "|]" { BARRBRACKET }
| ">" { GREATER }
| ">]" { GREATERRBRACKET }
| "}" { RBRACE }
| ">}" { GREATERRBRACE }
| "!" { BANG }
| "!=" { INFIXOP0 "!=" }
| "+" { PLUS }
| "+." { PLUSDOT }
| "-" { MINUS }
| "-." { MINUSDOT }
| "!" symbolchar +
{ PREFIXOP(Lexing.lexeme lexbuf) }
| ['~' '?'] symbolchar +
{ PREFIXOP(Lexing.lexeme lexbuf) }
| ['=' '<' '>' '|' '&' '$'] symbolchar *
{ INFIXOP0(Lexing.lexeme lexbuf) }
| ['@' '^'] symbolchar *
{ INFIXOP1(Lexing.lexeme lexbuf) }
| ['+' '-'] symbolchar *
{ INFIXOP2(Lexing.lexeme lexbuf) }
| "**" symbolchar *
{ INFIXOP4(Lexing.lexeme lexbuf) }
| ['*' '/' '%'] symbolchar *
{ INFIXOP3(Lexing.lexeme lexbuf) }
| eof { EOF }
| _
{ raise (Error(Illegal_character (Lexing.lexeme_char lexbuf 0),
Location.curr lexbuf))
}
and comment = parse
"(*"
{ comment_start_loc := (Location.curr lexbuf) :: !comment_start_loc;
comment lexbuf;
}
| "*)"
{ match !comment_start_loc with
| [] -> assert false
| [x] -> comment_start_loc := [];
| _ :: l -> comment_start_loc := l;
comment lexbuf;
}
| "\""
{ reset_string_buffer();
string_start_loc := Location.curr lexbuf;
begin try string lexbuf
with Error (Unterminated_string, _) ->
match !comment_start_loc with
| [] -> assert false
| loc :: _ -> comment_start_loc := [];
raise (Error (Unterminated_string_in_comment, loc))
end;
reset_string_buffer ();
comment lexbuf }
| "''"
{ comment lexbuf }
| "'" newline "'"
{ update_loc lexbuf None 1 false 1;
comment lexbuf
}
| "'" [^ '\\' '\'' '\010' '\013' ] "'"
{ comment lexbuf }
| "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r' ' '] "'"
{ comment lexbuf }
| "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
{ comment lexbuf }
| "'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"
{ comment lexbuf }
| eof
{ match !comment_start_loc with
| [] -> assert false
| loc :: _ -> comment_start_loc := [];
raise (Error (Unterminated_comment, loc))
}
| newline
{ update_loc lexbuf None 1 false 0;
comment lexbuf
}
| _
{ comment lexbuf }
and string = parse
'"'
{ () }
| '\\' newline ([' ' '\t'] * as space)
{ update_loc lexbuf None 1 false (String.length space);
string lexbuf
}
| '\\' ['\\' '\'' '"' 'n' 't' 'b' 'r' ' ']
{ store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
string lexbuf }
| '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
{ store_string_char(char_for_decimal_code lexbuf 1);
string lexbuf }
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
{ store_string_char(char_for_hexadecimal_code lexbuf 2);
string lexbuf }
| '\\' _
{ if in_comment ()
then string lexbuf
else begin
(* Should be an error, but we are very lax.
raise (Error (Illegal_escape (Lexing.lexeme lexbuf),
Location.curr lexbuf))
*)
let loc = Location.curr lexbuf in
Location.prerr_warning loc Warnings.Illegal_backslash;
store_string_char (Lexing.lexeme_char lexbuf 0);
store_string_char (Lexing.lexeme_char lexbuf 1);
string lexbuf
end
}
| newline
{ if not (in_comment ()) then
Location.prerr_warning (Location.curr lexbuf) Warnings.Eol_in_string;
update_loc lexbuf None 1 false 0;
let s = Lexing.lexeme lexbuf in
for i = 0 to String.length s - 1 do
store_string_char s.[i];
done;
string lexbuf
}
| eof
{ raise (Error (Unterminated_string, !string_start_loc)) }
| _
{ store_string_char(Lexing.lexeme_char lexbuf 0);
string lexbuf }
and skip_sharp_bang = parse
| "#!" [^ '\n']* '\n' [^ '\n']* "\n!#\n"
{ update_loc lexbuf None 3 false 0 }
| "#!" [^ '\n']* '\n'
{ update_loc lexbuf None 1 false 0 }
| "" { () }