ocaml/parsing/lexer.mll

312 lines
8.6 KiB
OCaml

(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 1996 Institut National de Recherche en Informatique et *)
(* Automatique. Distributed only by permission. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* The lexer definition *)
{
open Misc
open Parser
type error =
Illegal_character
| Unterminated_comment
| Unterminated_string
exception Error of error * int * int
(* For nested comments *)
let comment_depth = ref 0
(* The table of keywords *)
let keyword_table =
create_hashtable 149 [
"and", AND;
"as", AS;
"assert", ASSERT;
"begin", BEGIN;
"class", CLASS;
"closed", CLOSED;
"constraint", CONSTRAINT;
"do", DO;
"done", DONE;
"downto", DOWNTO;
"else", ELSE;
"end", END;
"exception", EXCEPTION;
"external", EXTERNAL;
"false", FALSE;
"for", FOR;
"fun", FUN;
"function", FUNCTION;
"functor", FUNCTOR;
"if", IF;
"in", IN;
"include", INCLUDE;
"inherit", INHERIT;
"let", LET;
"match", MATCH;
"method", METHOD;
"module", MODULE;
"mutable", MUTABLE;
"new", NEW;
"of", OF;
"open", OPEN;
"or", OR;
"parser", PARSER;
"private", PRIVATE;
"rec", REC;
"sig", SIG;
"struct", STRUCT;
"then", THEN;
"to", TO;
"true", TRUE;
"try", TRY;
"type", TYPE;
"val", VAL;
"virtual", VIRTUAL;
"when", WHEN;
"while", WHILE;
"with", WITH;
"mod", INFIXOP3("mod");
"land", INFIXOP3("land");
"lor", INFIXOP3("lor");
"lxor", INFIXOP3("lxor");
"lsl", INFIXOP4("lsl");
"lsr", INFIXOP4("lsr");
"asr", INFIXOP4("asr")
]
(* To buffer string literals *)
let initial_string_buffer = String.create 256
let string_buff = ref initial_string_buffer
let string_index = ref 0
let reset_string_buffer () =
string_buff := initial_string_buffer;
string_index := 0
let store_string_char c =
if !string_index >= String.length (!string_buff) then begin
let new_buff = String.create (String.length (!string_buff) * 2) in
String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
string_buff := new_buff
end;
String.unsafe_set (!string_buff) (!string_index) c;
incr string_index
let get_stored_string () =
let s = String.sub (!string_buff) 0 (!string_index) in
string_buff := initial_string_buffer;
s
(* To translate escape sequences *)
let char_for_backslash =
match Sys.os_type with
| "Unix" | "Win32" ->
begin function
| 'n' -> '\010'
| 'r' -> '\013'
| 'b' -> '\008'
| 't' -> '\009'
| c -> c
end
| "MacOS" ->
begin function
| 'n' -> '\013'
| 'r' -> '\010'
| 'b' -> '\008'
| 't' -> '\009'
| c -> c
end
| x -> fatal_error "Lexer: unknown system type"
let char_for_decimal_code lexbuf i =
let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
(Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in
Char.chr(c land 0xFF)
(* To store the position of the beginning of a string or comment *)
let start_pos = ref 0
(* Error report *)
open Format
let report_error = function
Illegal_character ->
print_string "Illegal character"
| Unterminated_comment ->
print_string "Comment not terminated"
| Unterminated_string ->
print_string "String literal not terminated"
}
rule token = parse
[' ' '\010' '\013' '\009' '\012'] +
{ token lexbuf }
| ['a'-'z' '\223'-'\246' '\248'-'\255' ]
(['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'
'\'' '0'-'9' ]) *
{ let s = Lexing.lexeme lexbuf in
try
Hashtbl.find keyword_table s
with Not_found ->
LIDENT s }
| ['A'-'Z' '\192'-'\214' '\216'-'\222' ]
(['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'
'\'' '0'-'9' ]) *
{ UIDENT(Lexing.lexeme lexbuf) } (* No capitalized keywords *)
| ['0'-'9']+
| '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
| '0' ['o' 'O'] ['0'-'7']+
| '0' ['b' 'B'] ['0'-'1']+
{ INT (int_of_string(Lexing.lexeme lexbuf)) }
| ['0'-'9']+ ('.' ['0'-'9']*)? (['e' 'E'] ['+' '-']? ['0'-'9']+)?
{ FLOAT (Lexing.lexeme lexbuf) }
| "\""
{ reset_string_buffer();
let string_start = Lexing.lexeme_start lexbuf in
start_pos := string_start;
string lexbuf;
lexbuf.Lexing.lex_start_pos <-
string_start - lexbuf.Lexing.lex_abs_pos;
STRING (get_stored_string()) }
| "'" [^ '\\' '\''] "'"
{ CHAR(Lexing.lexeme_char lexbuf 1) }
| "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
{ CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
| "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
{ CHAR(char_for_decimal_code lexbuf 2) }
| "(*"
{ comment_depth := 1;
start_pos := Lexing.lexeme_start lexbuf;
comment lexbuf;
token lexbuf }
| "#" [' ' '\t']* ['0'-'9']+ [' ' '\t']* "\"" [^ '\n' '\r'] *
('\n' | '\r' | "\r\n")
(* # linenum "filename" flags \n *)
{ token lexbuf }
| "#" { SHARP }
| "&" { AMPERSAND }
| "&&" { AMPERAMPER }
| "'" { QUOTE }
| "(" { LPAREN }
| ")" { RPAREN }
| "*" { STAR }
| "," { COMMA }
| "?" { QUESTION }
| "->" { MINUSGREATER }
| "." { DOT }
| ".." { DOTDOT }
| ":" { COLON }
| "::" { COLONCOLON }
| ":=" { COLONEQUAL }
| ":>" { COLONGREATER }
| ";" { SEMI }
| ";;" { SEMISEMI }
| "<" { LESS }
| "<-" { LESSMINUS }
| "=" { EQUAL }
| "[" { LBRACKET }
| "[|" { LBRACKETBAR }
| "[<" { LBRACKETLESS }
| "]" { RBRACKET }
| "_" { UNDERSCORE }
| "{" { LBRACE }
| "{<" { LBRACELESS }
| "|" { BAR }
| "||" { BARBAR }
| "|]" { BARRBRACKET }
| ">" { GREATER }
| ">]" { GREATERRBRACKET }
| "}" { RBRACE }
| ">}" { GREATERRBRACE }
| "!=" { INFIXOP0 "!=" }
| "-" { SUBTRACTIVE "-" }
| "-." { SUBTRACTIVE "-." }
| ['!' '?' '~']
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] *
{ PREFIXOP(Lexing.lexeme lexbuf) }
| ['=' '<' '>' '|' '&' '$']
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] *
{ INFIXOP0(Lexing.lexeme lexbuf) }
| ['@' '^']
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] *
{ INFIXOP1(Lexing.lexeme lexbuf) }
| ['+' '-']
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] *
{ INFIXOP2(Lexing.lexeme lexbuf) }
| "**"
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] *
{ INFIXOP4(Lexing.lexeme lexbuf) }
| ['*' '/' '%']
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] *
{ INFIXOP3(Lexing.lexeme lexbuf) }
| eof { EOF }
| _
{ raise (Error(Illegal_character,
Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) }
and comment = parse
"(*"
{ comment_depth := succ !comment_depth; comment lexbuf }
| "*)"
{ comment_depth := pred !comment_depth;
if !comment_depth > 0 then comment lexbuf }
| "\""
{ reset_string_buffer();
start_pos := Lexing.lexeme_start lexbuf;
string lexbuf;
string_buff := initial_string_buffer;
comment lexbuf }
| "''"
{ comment lexbuf }
| "'" [^ '\\' '\''] "'"
{ comment lexbuf }
| "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'"
{ comment lexbuf }
| "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
{ comment lexbuf }
| eof
{ raise (Error(Unterminated_comment, !start_pos, !start_pos+2)) }
| _
{ comment lexbuf }
and string = parse
'"'
{ () }
| '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] *
{ string lexbuf }
| '\\' ['\\' '"' 'n' 't' 'b' 'r']
{ store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
string lexbuf }
| '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
{ store_string_char(char_for_decimal_code lexbuf 1);
string lexbuf }
| eof
{ raise (Error(Unterminated_string, !start_pos, !start_pos+1)) }
| _
{ store_string_char(Lexing.lexeme_char lexbuf 0);
string lexbuf }