ocaml/camlp4/Camlp4/Struct/Lexer.mll

485 lines
19 KiB
OCaml
Raw Normal View History

(****************************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* INRIA Rocquencourt *)
(* *)
(* Copyright 2006-2006 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed under *)
(* the terms of the GNU Library General Public License, with the special *)
(* exception on linking described in LICENSE at the top of the Objective *)
(* Caml source tree. *)
(* *)
(****************************************************************************)
(* Authors:
* - Daniel de Rauglaudre: initial version
* - Nicolas Pouillard: refactoring
*)
(* $Id$ *)
(* The lexer definition *)
{
(** A lexical analyzer. *)
(* FIXME interface module Make (Token : Token) |+ Note that this Token sig is not in Sig +| *)
(* : Sig.Lexer. S with module Loc = Token.Loc and module Token = Token; *)
(* type context =
{ loc : Loc.t ;
in_comment : bool ;
|+* FIXME When True, all lexers built by [Plexer.make ()] do not lex the
quotation syntax any more. Default is False (quotations are
lexed). +|
quotations : bool };
value default_context : context;
value mk : Loc.t -> Stream.t char -> Stream.t (Token.t * Loc.t);
value mk' : context -> Stream.t char -> Stream.t (Token.t * Loc.t); *)
(* FIXME Beware the context argument must be given like that:
* mk' { (default_context) with ... = ... } strm
*)
module TokenEval = Token.Eval
module Make (Token : Sig.Camlp4Token)
= struct
module Loc = Token.Loc
module Token = Token
open Lexing
open Sig
(* Error report *)
module Error = struct
type t =
| Illegal_character of char
| Illegal_escape of string
| Unterminated_comment
| Unterminated_string
| Unterminated_quotation
| Unterminated_antiquot
| Unterminated_string_in_comment
| Comment_start
| Comment_not_end
| Literal_overflow of string
exception E of t
open Format
let print ppf =
function
| Illegal_character c ->
fprintf ppf "Illegal character (%s)" (Char.escaped c)
| Illegal_escape s ->
fprintf ppf "Illegal backslash escape in string or character (%s)" s
| Unterminated_comment ->
fprintf ppf "Comment not terminated"
| Unterminated_string ->
fprintf ppf "String literal not terminated"
| Unterminated_string_in_comment ->
fprintf ppf "This comment contains an unterminated string literal"
| Unterminated_quotation ->
fprintf ppf "Quotation not terminated"
| Unterminated_antiquot ->
fprintf ppf "Antiquotation not terminated"
| Literal_overflow ty ->
fprintf ppf "Integer literal exceeds the range of representable integers of type %s" ty
| Comment_start ->
fprintf ppf "this is the start of a comment"
| Comment_not_end ->
fprintf ppf "this is not the end of a comment"
let to_string x =
let b = Buffer.create 50 in
let () = bprintf b "%a" print x in Buffer.contents b
end;;
let module M = ErrorHandler.Register(Error) in ()
open Error
(* To store some context information:
* loc : position of the beginning of a string, quotation and comment
* in_comment: are we in a comment?
* quotations: shall we lex quotation?
* If quotations is false it's a SYMBOL token.
* antiquots : shall we lex antiquotations.
*)
type context =
{ loc : Loc.t ;
in_comment : bool ;
quotations : bool ;
antiquots : bool ;
lexbuf : lexbuf ;
buffer : Buffer.t }
let default_context lb =
{ loc = Loc.ghost ;
in_comment = false ;
quotations = true ;
antiquots = false ;
lexbuf = lb ;
buffer = Buffer.create 256 }
(* To buffer string literals, quotations and antiquotations *)
let store c = Buffer.add_string c.buffer (Lexing.lexeme c.lexbuf)
let istore_char c i = Buffer.add_char c.buffer (Lexing.lexeme_char c.lexbuf i)
let buff_contents c =
let contents = Buffer.contents c.buffer in
Buffer.reset c.buffer; contents
let loc c = Loc.merge c.loc (Loc.of_lexbuf c.lexbuf)
let quotations c = c.quotations
let antiquots c = c.antiquots
let is_in_comment c = c.in_comment
let in_comment c = { (c) with in_comment = true }
let set_start_p c = c.lexbuf.lex_start_p <- Loc.start_pos c.loc
let move_start_p shift c =
let p = c.lexbuf.lex_start_p in
c.lexbuf.lex_start_p <- { (p) with pos_cnum = p.pos_cnum + shift }
let update_loc c = { (c) with loc = Loc.of_lexbuf c.lexbuf }
let with_curr_loc f c = f (update_loc c) c.lexbuf
let parse_nested f c =
with_curr_loc f c;
set_start_p c;
buff_contents c
let shift n c = { (c) with loc = Loc.move `both n c.loc }
let store_parse f c = store c ; f c c.lexbuf
let parse f c = f c c.lexbuf
let mk_quotation quotation c name loc shift =
let s = parse_nested quotation (update_loc c) in
let contents = String.sub s 0 (String.length s - 2) in
QUOTATION { q_name = name ;
q_loc = loc ;
q_shift = shift ;
q_contents = contents }
(* Update the current location with file name and line number. *)
let update_loc c file line absolute chars =
let lexbuf = c.lexbuf in
let pos = lexbuf.lex_curr_p in
let new_file = match file with
| None -> pos.pos_fname
| Some s -> s
in
lexbuf.lex_curr_p <- { pos with
pos_fname = new_file;
pos_lnum = if absolute then line else pos.pos_lnum + line;
pos_bol = pos.pos_cnum - chars;
}
let err error loc =
raise(Loc.Exc_located(loc, Error.E error))
let warn error loc =
Format.eprintf "Warning: %a: %a@." Loc.print loc Error.print error
}
let newline = ('\010' | '\013' | "\013\010")
let blank = [' ' '\009' '\012']
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let identchar =
['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
let ident = (lowercase|uppercase) identchar*
let locname = ident
let not_star_symbolchar =
['$' '!' '%' '&' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' '\\']
let symbolchar = '*' | not_star_symbolchar
let quotchar =
['!' '%' '&' '+' '-' '.' '/' ':' '=' '?' '@' '^' '|' '~' '\\' '*']
let hexa_char = ['0'-'9' 'A'-'F' 'a'-'f']
let decimal_literal =
['0'-'9'] ['0'-'9' '_']*
let hex_literal =
'0' ['x' 'X'] hexa_char ['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
'0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal =
'0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
decimal_literal | hex_literal | oct_literal | bin_literal
let float_literal =
['0'-'9'] ['0'-'9' '_']*
('.' ['0'-'9' '_']* )?
(['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?
(* Delimitors are extended (from 3.09) in a conservative way *)
(* These chars that can't start an expression or a pattern: *)
let safe_delimchars = ['%' '&' '/' '@' '^']
(* These symbols are unsafe since "[<", "[|", etc. exsist. *)
let delimchars = safe_delimchars | ['|' '<' '>' ':' '=' '.']
let left_delims = ['(' '[' '{']
let right_delims = [')' ']' '}']
let left_delimitor =
(* At least a safe_delimchars *)
left_delims delimchars* safe_delimchars (delimchars|left_delims)*
(* A '(' or a new super '(' without "(<" *)
| '(' (['|' ':'] delimchars*)?
(* Old brackets, no new brackets starting with "[|" or "[:" *)
| '[' ['|' ':']?
(* Old "[<","{<" and new ones *)
| ['[' '{'] delimchars* '<'
(* Old brace and new ones *)
| '{' (['|' ':'] delimchars*)?
let right_delimitor =
(* At least a safe_delimchars *)
(delimchars|right_delims)* safe_delimchars (delimchars|right_delims)* right_delims
(* A ')' or a new super ')' without ">)" *)
| (delimchars* ['|' ':'])? ')'
(* Old brackets, no new brackets ending with "|]" or ":]" *)
| ['|' ':']? ']'
(* Old ">]",">}" and new ones *)
| '>' delimchars* [']' '}']
(* Old brace and new ones *)
| (delimchars* ['|' ':'])? '}'
rule token c = parse
| newline { update_loc c None 1 false 0; NEWLINE }
| blank + as x { BLANKS x }
| "~" (lowercase identchar * as x) ':' { LABEL x }
| "?" (lowercase identchar * as x) ':' { OPTLABEL x }
| lowercase identchar * as x { LIDENT x }
| uppercase identchar * as x { UIDENT x }
| int_literal as i
{ try INT(int_of_string i, i)
with Failure _ -> err (Literal_overflow "int") (Loc.of_lexbuf lexbuf) }
| float_literal as f
{ try FLOAT(float_of_string f, f)
with Failure _ -> err (Literal_overflow "float") (Loc.of_lexbuf lexbuf) }
| (int_literal as i) "l"
{ try INT32(Int32.of_string i, i)
with Failure _ -> err (Literal_overflow "int32") (Loc.of_lexbuf lexbuf) }
| (int_literal as i) "L"
{ try INT64(Int64.of_string i, i)
with Failure _ -> err (Literal_overflow "int64") (Loc.of_lexbuf lexbuf) }
| (int_literal as i) "n"
{ try NATIVEINT(Nativeint.of_string i, i)
with Failure _ -> err (Literal_overflow "nativeint") (Loc.of_lexbuf lexbuf) }
| '"'
{ with_curr_loc string c;
let s = buff_contents c in STRING (TokenEval.string s, s) }
| "'" (newline as x) "'"
{ update_loc c None 1 false 1; CHAR (TokenEval.char x, x) }
| "'" ( [^ '\\' '\010' '\013']
| '\\' (['\\' '"' 'n' 't' 'b' 'r' ' ' '\'']
|['0'-'9'] ['0'-'9'] ['0'-'9']
|'x' hexa_char hexa_char)
as x) "'" { CHAR (TokenEval.char x, x) }
| "'\\" (_ as c)
{ err (Illegal_escape (String.make 1 c)) (Loc.of_lexbuf lexbuf) }
| "(*"
{ store c; COMMENT(parse_nested comment (in_comment c)) }
| "(*)"
{ warn Comment_start (Loc.of_lexbuf lexbuf) ;
parse comment (in_comment c); COMMENT (buff_contents c) }
| "*)"
{ warn Comment_not_end (Loc.of_lexbuf lexbuf) ;
move_start_p (-1) c; SYMBOL "*" }
| "<<" (quotchar* as beginning)
{ if quotations c
then (move_start_p (-String.length beginning);
mk_quotation quotation c "" "" 2)
else parse (symbolchar_star ("<<" ^ beginning)) c }
| "<<>>"
{ if quotations c
then QUOTATION { q_name = ""; q_loc = ""; q_shift = 2; q_contents = "" }
else parse (symbolchar_star "<<>>") c }
| "<@"
{ if quotations c then with_curr_loc maybe_quotation_at c
else parse (symbolchar_star "<@") c }
| "<:"
{ if quotations c then with_curr_loc maybe_quotation_colon c
else parse (symbolchar_star "<:") c }
| "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
("\"" ([^ '\010' '\013' '"' ] * as name) "\"")?
[^ '\010' '\013'] * newline
{ let inum = int_of_string num
in update_loc c name inum true 0; LINE_DIRECTIVE(inum, name) }
| '(' (not_star_symbolchar as op) ')'
{ ESCAPED_IDENT (String.make 1 op) }
| '(' (not_star_symbolchar symbolchar* not_star_symbolchar as op) ')'
{ ESCAPED_IDENT op }
| '(' (not_star_symbolchar symbolchar* as op) blank+ ')'
{ ESCAPED_IDENT op }
| '(' blank+ (symbolchar* not_star_symbolchar as op) ')'
{ ESCAPED_IDENT op }
| '(' blank+ (symbolchar+ as op) blank+ ')'
{ ESCAPED_IDENT op }
| ( "#" | "`" | "'" | "," | "." | ".." | ":" | "::"
| ":=" | ":>" | ";" | ";;" | "_"
| left_delimitor | right_delimitor ) as x { SYMBOL x }
| '$' { if antiquots c
then with_curr_loc dollar (shift 1 c)
else parse (symbolchar_star "$") c }
| ['~' '?' '!' '=' '<' '>' '|' '&' '@' '^' '+' '-' '*' '/' '%' '\\'] symbolchar *
as x { SYMBOL x }
| eof
{ let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- { pos with pos_bol = pos.pos_bol + 1 ;
pos_cnum = pos.pos_cnum + 1 }; EOI }
| _ as c { err (Illegal_character c) (Loc.of_lexbuf lexbuf) }
and comment c = parse
"(*"
{ store c; with_curr_loc comment c; parse comment c }
| "*)" { store c }
| '<' (':' ident)? ('@' locname)? '<'
{ store c;
if quotations c then with_curr_loc quotation c; parse comment c }
| ident { store_parse comment c }
| "\""
{ store c;
begin try with_curr_loc string c
with Loc.Exc_located(_, Error.E Unterminated_string) ->
err Unterminated_string_in_comment (loc c)
end;
Buffer.add_char c.buffer '"';
parse comment c }
| "''" { store_parse comment c }
| "'''" { store_parse comment c }
| "'" newline "'"
{ update_loc c None 1 false 1; store_parse comment c }
| "'" [^ '\\' '\'' '\010' '\013' ] "'" { store_parse comment c }
| "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r' ' '] "'" { store_parse comment c }
| "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" { store_parse comment c }
| "'\\" 'x' hexa_char hexa_char "'" { store_parse comment c }
| eof
{ err Unterminated_comment (loc c) }
| newline
{ update_loc c None 1 false 0; store_parse comment c }
| _ { store_parse comment c }
and string c = parse
'"' { set_start_p c }
| '\\' newline ([' ' '\t'] * as space)
{ update_loc c None 1 false (String.length space);
store_parse string c }
| '\\' ['\\' '"' 'n' 't' 'b' 'r' ' ' '\''] { store_parse string c }
| '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] { store_parse string c }
| '\\' 'x' hexa_char hexa_char { store_parse string c }
| '\\' (_ as x)
{ if is_in_comment c
then store_parse string c
else begin
warn (Illegal_escape (String.make 1 x)) (Loc.of_lexbuf lexbuf);
store_parse string c
end }
| newline
{ update_loc c None 1 false 0; store_parse string c }
| eof { err Unterminated_string (loc c) }
| _ { store_parse string c }
and symbolchar_star beginning c = parse
| symbolchar* as tok { move_start_p (-String.length beginning) c ;
SYMBOL(beginning ^ tok) }
and maybe_quotation_at c = parse
| (ident as loc) '<'
{ mk_quotation quotation c "" loc (3 + String.length loc) }
| symbolchar* as tok { SYMBOL("<@" ^ tok) }
and maybe_quotation_colon c = parse
| (ident as name) '<'
{ mk_quotation quotation c name "" (3 + String.length name) }
| (ident as name) '@' (locname as loc) '<'
{ mk_quotation quotation c name loc
(4 + String.length loc + String.length name) }
| symbolchar* as tok { SYMBOL("<:" ^ tok) }
and quotation c = parse
| '<' (':' ident)? ('@' locname)? '<' { store c ;
with_curr_loc quotation c ;
parse quotation c }
| ">>" { store c }
| eof { err Unterminated_quotation (loc c) }
| newline { update_loc c None 1 false 0 ;
store_parse quotation c }
| _ { store_parse quotation c }
and dollar c = parse
| '$' { set_start_p c; ANTIQUOT("", "") }
| ('`'? (identchar*|'.'+) as name) ':'
{ with_curr_loc (antiquot name) (shift (1 + String.length name) c) }
| _ { store_parse (antiquot "") c }
and antiquot name c = parse
| '$' { set_start_p c; ANTIQUOT(name, buff_contents c) }
| eof { err Unterminated_antiquot (loc c) }
| newline
{ update_loc c None 1 false 0; store_parse (antiquot name) c }
| '<' (':' ident)? ('@' locname)? '<'
{ store c; with_curr_loc quotation c; parse (antiquot name) c }
| _ { store_parse (antiquot name) c }
{
let lexing_store s buff max =
let rec self n s =
if n >= max then n
else
match Stream.peek s with
| Some x ->
Stream.junk s;
buff.[n] <- x;
succ n
| _ -> n
in
self 0 s
let from_context c =
let next _ =
let tok = with_curr_loc token c in
let loc = Loc.of_lexbuf c.lexbuf in
Some ((tok, loc))
in Stream.from next
let from_lexbuf ?(quotations = true) lb =
let c = { (default_context lb) with
loc = Loc.of_lexbuf lb;
antiquots = !Camlp4_config.antiquotations;
quotations = quotations }
in from_context c
let setup_loc lb loc =
let start_pos = Loc.start_pos loc in
lb.lex_abs_pos <- start_pos.pos_cnum;
lb.lex_curr_p <- start_pos
let from_string ?quotations loc str =
let lb = Lexing.from_string str in
setup_loc lb loc;
from_lexbuf ?quotations lb
let from_stream ?quotations loc strm =
let lb = Lexing.from_function (lexing_store strm) in
setup_loc lb loc;
from_lexbuf ?quotations lb
let mk () loc strm =
from_stream ~quotations:!Camlp4_config.quotations loc strm
end
}