ocaml/parsing/lexer.mll

(* The lexer definition *)

{
open Misc
open Parser

type error =
    Illegal_character
  | Unterminated_comment
  | Unterminated_string

exception Error of error * int * int

(* For nested comments *)

let comment_depth = ref 0

(* The table of keywords *)

let keyword_table =
  create_hashtable 149 [
    "and", AND;
    "as", AS;
    "begin", BEGIN;
    "do", DO;
    "done", DONE;
    "downto", DOWNTO;
    "else", ELSE;
    "end", END;
    "exception", EXCEPTION;
    "external", EXTERNAL;
    "false", FALSE;
    "for", FOR;
    "fun", FUN;
    "function", FUNCTION;
    "functor", FUNCTOR;
    "if", IF;
    "in", IN;
    "include", INCLUDE;
    "let", LET;
    "match", MATCH;
    "module", MODULE;
    "mutable", MUTABLE;
    "of", OF;
    "open", OPEN;
    "or", OR;
    "rec", REC;
    "sig", SIG;
    "struct", STRUCT;
    "then", THEN;
    "to", TO;
    "true", TRUE;
    "try", TRY;
    "type", TYPE;
    "val", VAL;
    "when", WHEN;
    "while", WHILE;
    "with", WITH;

    "mod", INFIXOP3("mod");
    "land", INFIXOP3("land");
    "lor", INFIXOP3("lor");
    "lxor", INFIXOP3("lxor");
    "lsl", INFIXOP4("lsl");
    "lsr", INFIXOP4("lsr");
    "asr", INFIXOP4("asr")
]

(* To buffer string literals *)

let initial_string_buffer = String.create 256
let string_buff = ref initial_string_buffer
let string_index = ref 0

let reset_string_buffer () =
  string_buff := initial_string_buffer;
  string_index := 0

let store_string_char c =
  if !string_index >= String.length (!string_buff) then begin
    let new_buff = String.create (String.length (!string_buff) * 2) in
      String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
      string_buff := new_buff
  end;
  String.unsafe_set (!string_buff) (!string_index) c;
  incr string_index

let get_stored_string () =
  let s = String.sub (!string_buff) 0 (!string_index) in
  string_buff := initial_string_buffer;
  s

(* To translate escape sequences *)

let char_for_backslash = function
    'n' -> '\010'
  | 'r' -> '\013'
  | 'b' -> '\008'
  | 't' -> '\009'
  | c   -> c

let char_for_decimal_code lexbuf i =
  Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
               10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
                    (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48))

(* To store the position of the beginning of a string or comment *)

let start_pos = ref 0

(* Error report *)

open Format

let report_error = function
    Illegal_character ->
      print_string "Illegal character"
  | Unterminated_comment ->
      print_string "Comment not terminated"
  | Unterminated_string ->
      print_string "String literal not terminated"

}

rule token = parse
    [' ' '\010' '\013' '\009' '\012'] +
      { token lexbuf }
  | ['a'-'z' '\223'-'\246' '\248'-'\255' ]
    (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'
      '\'' '0'-'9' ]) *
      { let s = Lexing.lexeme lexbuf in
          try
            Hashtbl.find keyword_table s
          with Not_found ->
            LIDENT s }
  | ['A'-'Z' '\192'-'\214' '\216'-'\222' ]
    (['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255'
      '\'' '0'-'9' ]) *
      { UIDENT(Lexing.lexeme lexbuf) }       (* No capitalized keywords *)
  | ['0'-'9']+
    | '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
    | '0' ['o' 'O'] ['0'-'7']+
    | '0' ['b' 'B'] ['0'-'1']+
      { INT (int_of_string(Lexing.lexeme lexbuf)) }
  | ['0'-'9']+ ('.' ['0'-'9']*)? (['e' 'E'] ['+' '-']? ['0'-'9']+)?
      { FLOAT (Lexing.lexeme lexbuf) }
  | "\""
      { reset_string_buffer();
        let string_start = Lexing.lexeme_start lexbuf in
        start_pos := string_start;
        string lexbuf;
        lexbuf.lex_start_pos <- string_start - lexbuf.lex_abs_pos;
        STRING (get_stored_string()) }
  | "'" [^ '\\' '\''] "'"
      { CHAR(Lexing.lexeme_char lexbuf 1) }
  | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
  | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { CHAR(char_for_decimal_code lexbuf 2) }
  | "(*"
      { comment_depth := 1;
        start_pos := Lexing.lexeme_start lexbuf;
        comment lexbuf;
        token lexbuf }
  | "#" { SHARP }
  | "&" { AMPERSAND }
  | "'" { QUOTE }
  | "(" { LPAREN }
  | ")" { RPAREN }
  | "*" { STAR }
  | "," { COMMA }
  | "->" { MINUSGREATER }
  | "." { DOT }
  | ".." { DOTDOT }
  | ".(" { DOTLPAREN }
  | ".[" { DOTLBRACKET }
  | ":" { COLON }
  | "::" { COLONCOLON }
  | ":=" { COLONEQUAL }
  | ";" { SEMI }
  | ";;" { SEMISEMI }
  | "<-" { LESSMINUS }
  | "=" { EQUAL }
  | "[" { LBRACKET }
  | "[|" { LBRACKETBAR }
  | "]" { RBRACKET }
  | "_" { UNDERSCORE }
  | "{" { LBRACE }
  | "|" { BAR }
  | "|]" { BARRBRACKET }
  | "}" { RBRACE }

  | "!="    { INFIXOP1 "!=" }
  | "-"     { SUBTRACTIVE "-" }
  | "-."    { SUBTRACTIVE "-." }

  | [ '!' '?' ]
    [ '!' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' ] *
            { PREFIXOP(Lexing.lexeme lexbuf) }
  | [ '=' '<' '>' '@' '^' '|' '&' '~' ]
    [ '!' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' ] *
            { INFIXOP1(Lexing.lexeme lexbuf) }
  | [ '+' '-' ]
    [ '!' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' ] *
            { INFIXOP2(Lexing.lexeme lexbuf) }
  | "**"
    [ '!' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' ] *
            { INFIXOP4(Lexing.lexeme lexbuf) }
  | [ '*' '/' '%' ]
    [ '!' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' ] *
            { INFIXOP3(Lexing.lexeme lexbuf) }
  | eof { EOF }
  | _
      { raise (Error(Illegal_character,
                     Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) }

and comment = parse
    "(*"
      { comment_depth := succ !comment_depth; comment lexbuf }
  | "*)"
      { comment_depth := pred !comment_depth;
        if !comment_depth > 0 then comment lexbuf }
  | eof
      { raise (Error(Unterminated_comment, !start_pos, !start_pos+2)) }
  | _
      { comment lexbuf }

and string = parse
    '"'
      { () }
  | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] +
      { string lexbuf }
  | '\\' ['\\' '"' 'n' 't' 'b' 'r']
      { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
        string lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      { store_string_char(char_for_decimal_code lexbuf 1);
         string lexbuf }
  | eof
      { raise (Error(Unterminated_string, !start_pos, !start_pos+1)) }
  | _
      { store_string_char(Lexing.lexeme_char lexbuf 0);
        string lexbuf }