ocaml/camlp4/Camlp4/Struct/Lexer.mll

(****************************************************************************)
(*                                                                          *)
(*                              Objective Caml                              *)
(*                                                                          *)
(*                            INRIA Rocquencourt                            *)
(*                                                                          *)
(*  Copyright 2006-2006 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  All rights reserved.  This file is distributed under   *)
(*  the terms of the GNU Library General Public License, with the special   *)
(*  exception on linking described in LICENSE at the top of the Objective   *)
(*  Caml source tree.                                                       *)
(*                                                                          *)
(****************************************************************************)

(* Authors:
 * - Daniel de Rauglaudre: initial version
 * - Nicolas Pouillard: refactoring
 *)


(* $Id$ *)

(* The lexer definition *)


{

(** A lexical analyzer. *)

(* FIXME interface module Make (Token : Token) |+ Note that this Token sig is not in Sig +| *)
(* : Sig.Lexer. S with module Loc = Token.Loc and module Token = Token; *)

(* type context =
{ loc        : Loc.t    ;
  in_comment : bool     ;
   |+* FIXME When True, all lexers built by [Plexer.make ()] do not lex the
       quotation syntax any more. Default is False (quotations are
       lexed). +|
  quotations : bool     };

value default_context : context;

value mk : Loc.t -> Stream.t char -> Stream.t (Token.t * Loc.t);

value mk' : context -> Stream.t char -> Stream.t (Token.t * Loc.t);              *)
(* FIXME Beware the context argument must be given like that:
 * mk' { (default_context) with ... = ... } strm
 *)

module TokenEval = Token.Eval
module Make (Token : Sig.Camlp4Token)
= struct
  module Loc = Token.Loc
  module Token = Token

  open Lexing
  open Sig

  (* Error report *)
  module Error = struct

    type t =
      | Illegal_character of char
      | Illegal_escape    of string
      | Unterminated_comment
      | Unterminated_string
      | Unterminated_quotation
      | Unterminated_antiquot
      | Unterminated_string_in_comment
      | Comment_start
      | Comment_not_end
      | Literal_overflow of string

    exception E of t

    open Format

    let print ppf =
      function
      | Illegal_character c ->
          fprintf ppf "Illegal character (%s)" (Char.escaped c)
      | Illegal_escape s ->
          fprintf ppf "Illegal backslash escape in string or character (%s)" s
      | Unterminated_comment ->
          fprintf ppf "Comment not terminated"
      | Unterminated_string ->
          fprintf ppf "String literal not terminated"
      | Unterminated_string_in_comment ->
          fprintf ppf "This comment contains an unterminated string literal"
      | Unterminated_quotation ->
          fprintf ppf "Quotation not terminated"
      | Unterminated_antiquot ->
          fprintf ppf "Antiquotation not terminated"
      | Literal_overflow ty ->
          fprintf ppf "Integer literal exceeds the range of representable integers of type %s" ty
      | Comment_start ->
          fprintf ppf "this is the start of a comment"
      | Comment_not_end ->
          fprintf ppf "this is not the end of a comment"

    let to_string x =
      let b = Buffer.create 50 in
      let () = bprintf b "%a" print x in Buffer.contents b
  end;;

  let module M = ErrorHandler.Register(Error) in ()

  open Error

  (* To store some context information:
  *   loc       : position of the beginning of a string, quotation and comment
  *   in_comment: are we in a comment?
  *   quotations: shall we lex quotation?
  *               If quotations is false it's a SYMBOL token.
  *   antiquots : shall we lex antiquotations.
  *)

  type context =
  { loc        : Loc.t    ;
    in_comment : bool     ;
    quotations : bool     ;
    antiquots  : bool     ;
    lexbuf     : lexbuf   ;
    buffer     : Buffer.t }

  let default_context lb =
  { loc        = Loc.ghost ;
    in_comment = false     ;
    quotations = true      ;
    antiquots  = false     ;
    lexbuf     = lb        ;
    buffer     = Buffer.create 256 }

  (* To buffer string literals, quotations and antiquotations *)

  let store c = Buffer.add_string c.buffer (Lexing.lexeme c.lexbuf)
  let istore_char c i = Buffer.add_char c.buffer (Lexing.lexeme_char c.lexbuf i)
  let buff_contents c =
    let contents = Buffer.contents c.buffer in
    Buffer.reset c.buffer; contents

  let loc c = Loc.merge c.loc (Loc.of_lexbuf c.lexbuf)
  let quotations c = c.quotations
  let antiquots c = c.antiquots
  let is_in_comment c = c.in_comment
  let in_comment c = { (c) with in_comment = true }
  let set_start_p c = c.lexbuf.lex_start_p <- Loc.start_pos c.loc
  let move_start_p shift c =
    let p = c.lexbuf.lex_start_p in
    c.lexbuf.lex_start_p <- { (p) with pos_cnum = p.pos_cnum + shift }

  let update_loc c = { (c) with loc = Loc.of_lexbuf c.lexbuf }
  let with_curr_loc f c = f (update_loc c) c.lexbuf
  let parse_nested f c =
    with_curr_loc f c;
    set_start_p c;
    buff_contents c
  let shift n c = { (c) with loc = Loc.move `both n c.loc }
  let store_parse f c = store c ; f c c.lexbuf
  let parse f c = f c c.lexbuf
  let mk_quotation quotation c name loc shift =
    let s = parse_nested quotation (update_loc c) in
    let contents = String.sub s 0 (String.length s - 2) in
    QUOTATION { q_name     = name     ;
                q_loc      = loc      ;
                q_shift    = shift    ;
                q_contents = contents }


  (* Update the current location with file name and line number. *)

  let update_loc c file line absolute chars =
    let lexbuf = c.lexbuf in
    let pos = lexbuf.lex_curr_p in
    let new_file = match file with
                  | None -> pos.pos_fname
                  | Some s -> s
    in
    lexbuf.lex_curr_p <- { pos with
      pos_fname = new_file;
      pos_lnum = if absolute then line else pos.pos_lnum + line;
      pos_bol = pos.pos_cnum - chars;
    }

  let err error loc =
    raise(Loc.Exc_located(loc, Error.E error))

  let warn error loc =
    Format.eprintf "Warning: %a: %a@." Loc.print loc Error.print error

  }

  let newline = ('\010' | '\013' | "\013\010")
  let blank = [' ' '\009' '\012']
  let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
  let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
  let identchar =
    ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
  let ident = (lowercase|uppercase) identchar*
  let locname = ident
  let not_star_symbolchar =
    ['$' '!' '%' '&' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' '\\']
  let symbolchar = '*' | not_star_symbolchar
  let quotchar =
    ['!' '%' '&' '+' '-' '.' '/' ':' '=' '?' '@' '^' '|' '~' '\\' '*']
  let hexa_char = ['0'-'9' 'A'-'F' 'a'-'f']
  let decimal_literal =
    ['0'-'9'] ['0'-'9' '_']*
  let hex_literal =
    '0' ['x' 'X'] hexa_char ['0'-'9' 'A'-'F' 'a'-'f' '_']*
  let oct_literal =
    '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
  let bin_literal =
    '0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
  let int_literal =
    decimal_literal | hex_literal | oct_literal | bin_literal
  let float_literal =
    ['0'-'9'] ['0'-'9' '_']*
    ('.' ['0'-'9' '_']* )?
    (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?

  (* Delimitors are extended (from 3.09) in a conservative way *)

  (* These chars that can't start an expression or a pattern: *)
  let safe_delimchars = ['%' '&' '/' '@' '^']

  (* These symbols are unsafe since "[<", "[|", etc. exsist. *)
  let delimchars = safe_delimchars | ['|' '<' '>' ':' '=' '.']

  let left_delims  = ['(' '[' '{']
  let right_delims = [')' ']' '}']

  let left_delimitor =
    (* At least a safe_delimchars *)
    left_delims delimchars* safe_delimchars (delimchars|left_delims)*

  (* A '(' or a new super '(' without "(<" *)
  | '(' (['|' ':'] delimchars*)?
  (* Old brackets, no new brackets starting with "[|" or "[:" *)
  | '[' ['|' ':']?
  (* Old "[<","{<" and new ones *)
  | ['[' '{'] delimchars* '<'
  (* Old brace and new ones *)
  | '{' (['|' ':'] delimchars*)?

  let right_delimitor =
    (* At least a safe_delimchars *)
    (delimchars|right_delims)* safe_delimchars (delimchars|right_delims)* right_delims
  (* A ')' or a new super ')' without ">)" *)
  | (delimchars* ['|' ':'])? ')'
  (* Old brackets, no new brackets ending with "|]" or ":]" *)
  | ['|' ':']? ']'
  (* Old ">]",">}" and new ones *)
  | '>' delimchars* [']' '}']
  (* Old brace and new ones *)
  | (delimchars* ['|' ':'])? '}'


  rule token c = parse
    | newline                            { update_loc c None 1 false 0; NEWLINE }
    | blank + as x                                                   { BLANKS x }
    | "~" (lowercase identchar * as x) ':'                            { LABEL x }
    | "?" (lowercase identchar * as x) ':'                         { OPTLABEL x }
    | lowercase identchar * as x                                     { LIDENT x }
    | uppercase identchar * as x                                     { UIDENT x }
    | int_literal as i
        { try  INT(int_of_string i, i)
          with Failure _ -> err (Literal_overflow "int") (Loc.of_lexbuf lexbuf) }
    | float_literal as f
        { try  FLOAT(float_of_string f, f)
          with Failure _ -> err (Literal_overflow "float") (Loc.of_lexbuf lexbuf) }
    | (int_literal as i) "l"
        { try  INT32(Int32.of_string i, i)
          with Failure _ -> err (Literal_overflow "int32") (Loc.of_lexbuf lexbuf) }
    | (int_literal as i) "L"
        { try  INT64(Int64.of_string i, i)
          with Failure _ -> err (Literal_overflow "int64") (Loc.of_lexbuf lexbuf) }
    | (int_literal as i) "n"
        { try NATIVEINT(Nativeint.of_string i, i)
          with Failure _ -> err (Literal_overflow "nativeint") (Loc.of_lexbuf lexbuf) }
    | '"'
        { with_curr_loc string c;
          let s = buff_contents c in STRING (TokenEval.string s, s)             }
    | "'" (newline as x) "'"
        { update_loc c None 1 false 1; CHAR (TokenEval.char x, x)               }
    | "'" ( [^ '\\' '\010' '\013']
          | '\\' (['\\' '"' 'n' 't' 'b' 'r' ' ' '\'']
                |['0'-'9'] ['0'-'9'] ['0'-'9']
                |'x' hexa_char hexa_char)
          as x) "'"                                { CHAR (TokenEval.char x, x) }
    | "'\\" (_ as c)
        { err (Illegal_escape (String.make 1 c)) (Loc.of_lexbuf lexbuf)         }
    | "(*"
        { store c; COMMENT(parse_nested comment (in_comment c))                 }
    | "(*)"
        { warn Comment_start (Loc.of_lexbuf lexbuf)                             ;
          parse comment (in_comment c); COMMENT (buff_contents c)               }
    | "*)"
        { warn Comment_not_end (Loc.of_lexbuf lexbuf)                           ;
          move_start_p (-1) c; SYMBOL "*"                                       }
    | "<<" (quotchar* as beginning)
      { if quotations c
        then (move_start_p (-String.length beginning);
              mk_quotation quotation c "" "" 2)
        else parse (symbolchar_star ("<<" ^ beginning)) c                       }
    | "<<>>"
      { if quotations c
        then QUOTATION { q_name = ""; q_loc = ""; q_shift = 2; q_contents = "" }
        else parse (symbolchar_star "<<>>") c                                   }
    | "<@"
      { if quotations c then with_curr_loc maybe_quotation_at c
        else parse (symbolchar_star "<@") c                                     }
    | "<:"
      { if quotations c then with_curr_loc maybe_quotation_colon c
        else parse (symbolchar_star "<:") c                                     }
    | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
          ("\"" ([^ '\010' '\013' '"' ] * as name) "\"")?
          [^ '\010' '\013'] * newline
      { let inum = int_of_string num
        in update_loc c name inum true 0; LINE_DIRECTIVE(inum, name)            }
    | '(' (not_star_symbolchar as op) ')'
                                             { ESCAPED_IDENT (String.make 1 op) }
    | '(' (not_star_symbolchar symbolchar* not_star_symbolchar as op) ')'
                                                             { ESCAPED_IDENT op }
    | '(' (not_star_symbolchar symbolchar* as op) blank+ ')'
                                                             { ESCAPED_IDENT op }
    | '(' blank+ (symbolchar* not_star_symbolchar as op) ')'
                                                             { ESCAPED_IDENT op }
    | '(' blank+ (symbolchar+ as op) blank+ ')'
                                                             { ESCAPED_IDENT op }
    | ( "#"  | "`"  | "'"  | ","  | "."  | ".." | ":"  | "::"
      | ":=" | ":>" | ";"  | ";;" | "_"
      | left_delimitor | right_delimitor ) as x  { SYMBOL x }
    | '$' { if antiquots c
            then with_curr_loc dollar (shift 1 c)
            else parse (symbolchar_star "$") c }
    | ['~' '?' '!' '=' '<' '>' '|' '&' '@' '^' '+' '-' '*' '/' '%' '\\'] symbolchar *
                                                                as x { SYMBOL x }
    | eof
      { let pos = lexbuf.lex_curr_p in
        lexbuf.lex_curr_p <- { pos with pos_bol  = pos.pos_bol  + 1 ;
                                        pos_cnum = pos.pos_cnum + 1 }; EOI      }
    | _ as c                 { err (Illegal_character c) (Loc.of_lexbuf lexbuf) }

  and comment c = parse
      "(*"
        { store c; with_curr_loc comment c; parse comment c                     }
    | "*)"                                                            { store c }
    | '<' (':' ident)? ('@' locname)? '<'
        { store c;
          if quotations c then with_curr_loc quotation c; parse comment c       }
    | ident                                             { store_parse comment c }
    | "\""
        { store c;
          begin try with_curr_loc string c
          with Loc.Exc_located(_, Error.E Unterminated_string) ->
            err Unterminated_string_in_comment (loc c)
          end;
          Buffer.add_char c.buffer '"';
          parse comment c }
    | "''"                                              { store_parse comment c }
    | "'''"                                             { store_parse comment c }
    | "'" newline "'"
      { update_loc c None 1 false 1; store_parse comment c                      }
    | "'" [^ '\\' '\'' '\010' '\013' ] "'"              { store_parse comment c }
    | "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r' ' '] "'"     { store_parse comment c }
    | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"           { store_parse comment c }
    | "'\\" 'x' hexa_char hexa_char "'"                 { store_parse comment c }
    | eof
      { err Unterminated_comment (loc c)                                        }
    | newline
      { update_loc c None 1 false 0; store_parse comment c                      }
    | _                                                 { store_parse comment c }

  and string c = parse
      '"'                                                       { set_start_p c }
    | '\\' newline ([' ' '\t'] * as space)
        { update_loc c None 1 false (String.length space);
          store_parse string c                                                  }
    | '\\' ['\\' '"' 'n' 't' 'b' 'r' ' ' '\'']           { store_parse string c }
    | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']                 { store_parse string c }
    | '\\' 'x' hexa_char hexa_char                       { store_parse string c }
    | '\\' (_ as x)
        { if is_in_comment c
          then store_parse string c
          else begin
            warn (Illegal_escape (String.make 1 x)) (Loc.of_lexbuf lexbuf);
            store_parse string c
          end }
    | newline
      { update_loc c None 1 false 0; store_parse string c                       }
    | eof                                     { err Unterminated_string (loc c) }
    | _                                                  { store_parse string c }

  and symbolchar_star beginning c = parse
    | symbolchar* as tok            { move_start_p (-String.length beginning) c ;
                                                        SYMBOL(beginning ^ tok) }

  and maybe_quotation_at c = parse
    | (ident as loc) '<'      
      { mk_quotation quotation c "" loc (3 + String.length loc)                 }
    | symbolchar* as tok                                   { SYMBOL("<@" ^ tok) }

  and maybe_quotation_colon c = parse
    | (ident as name) '<'
      { mk_quotation quotation c name "" (3 + String.length name)               }
    | (ident as name) '@' (locname as loc) '<'
      { mk_quotation quotation c name loc
                     (4 + String.length loc + String.length name)               }
    | symbolchar* as tok                                   { SYMBOL("<:" ^ tok) }

  and quotation c = parse
    | '<' (':' ident)? ('@' locname)? '<'    {                          store c ;
                                                      with_curr_loc quotation c ;
                                                              parse quotation c }
    | ">>"                                                            { store c }
    | eof                                  { err Unterminated_quotation (loc c) }
    | newline                                     { update_loc c None 1 false 0 ;
                                                        store_parse quotation c }
    | _                                               { store_parse quotation c }

  and dollar c = parse
    | '$'                                     { set_start_p c; ANTIQUOT("", "") }
    | ('`'? (identchar*|'.'+) as name) ':'
      { with_curr_loc (antiquot name) (shift (1 + String.length name) c)        }
    | _                                           { store_parse (antiquot "") c }

  and antiquot name c = parse
    | '$'                      { set_start_p c; ANTIQUOT(name, buff_contents c) }
    | eof                                   { err Unterminated_antiquot (loc c) }
    | newline
      { update_loc c None 1 false 0; store_parse (antiquot name) c              }
    | '<' (':' ident)? ('@' locname)? '<'
      { store c; with_curr_loc quotation c; parse (antiquot name) c             }
    | _                                         { store_parse (antiquot name) c }

  {
    
  let lexing_store s buff max =
    let rec self n s =
      if n >= max then n
      else
        match Stream.peek s with
        | Some x ->
            Stream.junk s;
            buff.[n] <- x;
            succ n
        | _ -> n
    in
    self 0 s

  let from_context c =
    let next _ =
      let tok = with_curr_loc token c in
      let loc = Loc.of_lexbuf c.lexbuf in
      Some ((tok, loc))
    in Stream.from next

  let from_lexbuf ?(quotations = true) lb =
    let c = { (default_context lb) with
              loc        = Loc.of_lexbuf lb;
              antiquots  = !Camlp4_config.antiquotations;
              quotations = quotations      }
    in from_context c

  let setup_loc lb loc =
    let start_pos = Loc.start_pos loc in
    lb.lex_abs_pos <- start_pos.pos_cnum;
    lb.lex_curr_p  <- start_pos

  let from_string ?quotations loc str =
    let lb = Lexing.from_string str in
    setup_loc lb loc;
    from_lexbuf ?quotations lb

  let from_stream ?quotations loc strm =
    let lb = Lexing.from_function (lexing_store strm) in
    setup_loc lb loc;
    from_lexbuf ?quotations lb

  let mk () loc strm =
    from_stream ~quotations:!Camlp4_config.quotations loc strm
end
}