(* ocaml/lex/lexer.mll -- 214 lines, 6.2 KiB, OCaml *)

(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 1996 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed *)
(* under the terms of the Q Public License version 1.0. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* The lexical analyzer for lexer definitions. Bootstrapped! *)
{
open Syntax
open Parser
(* Auxiliaries for the lexical analyzer *)
(* Nesting level of braces while a semantic action is being scanned: the
   main rule sets it to 1 on the opening brace and the action rule stops
   when it comes back to 0. *)
let brace_depth = ref 0

(* Nesting level of comments: set to 1 when a comment opener is seen; the
   comment rule returns once it drops back to 0. *)
let comment_depth = ref 0

(* Raised on a lexical error.  Carries a message followed by two integers
   that the raise sites fill with a line number and a column. *)
exception Lexical_error of string * int * int
(* Storage used to accumulate the contents of string literals.
   [initial_string_buffer] is the 256-character buffer every scan starts
   from, [string_buff] is the buffer currently in use, and [string_index]
   is the number of characters stored in it so far. *)
let initial_string_buffer = String.create 256
let string_buff = ref initial_string_buffer
let string_index = ref 0

(* Forget any stored characters and go back to the initial buffer. *)
let reset_string_buffer () =
  string_index := 0;
  string_buff := initial_string_buffer
(* [store_string_char c] appends [c] to the string buffer.  When the buffer
   is full, a buffer of twice the size is allocated, the stored characters
   are copied into it, and it becomes the current buffer. *)
let store_string_char c =
  let pos = !string_index in
  let current = !string_buff in
  let capacity = String.length current in
  let target =
    if pos < capacity then current
    else begin
      let grown = String.create (capacity * 2) in
      String.blit current 0 grown 0 capacity;
      string_buff := grown;
      grown
    end in
  target.[pos] <- c;
  string_index := pos + 1
(* Return a fresh copy of the characters stored so far in the string
   buffer.  The buffer itself is left untouched. *)
let get_stored_string () =
  let stored = !string_index in
  String.sub !string_buff 0 stored
(* [char_for_backslash c] is the character denoted by a backslash followed
   by [c]: the letters n, t, b and r map to the corresponding control
   characters, and any other character stands for itself. *)
let char_for_backslash c =
  match c with
    'b' -> '\b'
  | 'n' -> '\n'
  | 'r' -> '\r'
  | 't' -> '\t'
  | other -> other
(* Current line number in the input, starting at 1.  Defined ahead of
   char_for_decimal_code, which needs it to report an error position. *)
let line_num = ref 1

(* Offset in the input of the first character of the current line. *)
let line_start_pos = ref 0

(* [char_for_decimal_code lexbuf i] decodes the three decimal digits found
   at offsets i, i+1 and i+2 of the current lexeme and returns the character
   with that code.
   Codes above 255 do not denote a character.  They are reported as a
   Lexical_error located at the current lexeme, instead of letting Char.chr
   fail with Invalid_argument, so that the failure goes through the same
   exception as every other lexical error in this file. *)
let char_for_decimal_code lexbuf i =
  let digit k = Char.code (Lexing.lexeme_char lexbuf (i + k)) - 48 in
  let code = 100 * digit 0 + 10 * digit 1 + digit 2 in
  if code > 255 then
    raise (Lexical_error
             ("illegal escape sequence \\" ^
              String.sub (Lexing.lexeme lexbuf) i 3,
              !line_num, Lexing.lexeme_start lexbuf - !line_start_pos))
  else
    Char.chr code
(* [handle_lexical_error fn lexbuf] runs [fn lexbuf] and returns its result.
   The line and column of the current lexeme are recorded before the call;
   if [fn] raises Lexical_error, the error is raised again with the same
   message but with that recorded position in place of the one it carried. *)
let handle_lexical_error fn lexbuf =
  let start_line = !line_num in
  let start_column = Lexing.lexeme_start lexbuf - !line_start_pos in
  try
    fn lexbuf
  with Lexical_error (msg, _, _) ->
    raise (Lexical_error (msg, start_line, start_column))
}
(* Characters that may start an identifier: ASCII letters, underscore, and
   the character codes 192-214, 216-246 and 248-255. *)
let identstart =
  ['a'-'z' 'A'-'Z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255']

(* Characters that may follow the first one: everything in identstart, plus
   decimal digits and the single quote. *)
let identbody =
  identstart | ['0'-'9' '\'']
(* Main entry point: skips blanks and comments, keeps line_num and
   line_start_pos up to date, and returns the next token of a lexer
   definition. *)
rule main = parse
    [' ' '\009' '\012' '\013'] +
      { main lexbuf }
  | '\010'
      { (* line feed: the next line starts right after it *)
        incr line_num;
        line_start_pos := Lexing.lexeme_end lexbuf;
        main lexbuf }
  | "(*"
      { (* comment: skipped, errors are reported at the comment opener *)
        comment_depth := 1;
        let () = handle_lexical_error comment lexbuf in
        main lexbuf }
  | '_'
      { Tunderscore }
  | identstart identbody *
      { (* keywords are recognized here; anything else is an identifier *)
        let word = Lexing.lexeme lexbuf in
        match word with
          "and" -> Tand
        | "eof" -> Teof
        | "let" -> Tlet
        | "parse" -> Tparse
        | "rule" -> Trule
        | _ -> Tident word }
  | '"'
      { (* string literal: the string rule fills the string buffer *)
        reset_string_buffer ();
        let () = handle_lexical_error string lexbuf in
        Tstring(get_stored_string ()) }
  | "'" [^ '\\'] "'"
      { let c = Lexing.lexeme_char lexbuf 1 in
        Tchar(Char.code c) }
  | "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r'] "'"
      { let escaped = Lexing.lexeme_char lexbuf 2 in
        Tchar(Char.code (char_for_backslash escaped)) }
  | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { Tchar(Char.code (char_for_decimal_code lexbuf 2)) }
  | '{'
      { (* semantic action: record where it starts (just after the opening
           brace), then let the action rule find the matching brace *)
        let action_start = Lexing.lexeme_end lexbuf in
        let line = !line_num in
        let column = action_start - !line_start_pos in
        brace_depth := 1;
        let action_end = handle_lexical_error action lexbuf in
        Taction({start_pos = action_start; end_pos = action_end;
                 start_line = line; start_col = column}) }
  | '='  { Tequal }
  | '|'  { Tor }
  | '['  { Tlbracket }
  | ']'  { Trbracket }
  | '*'  { Tstar }
  | '?'  { Tmaybe }
  | '+'  { Tplus }
  | '('  { Tlparen }
  | ')'  { Trparen }
  | '^'  { Tcaret }
  | '-'  { Tdash }
  | eof  { Tend }
  | _
      { let shown = String.escaped (Lexing.lexeme lexbuf) in
        let column = Lexing.lexeme_start lexbuf - !line_start_pos in
        raise (Lexical_error ("illegal character " ^ shown,
                              !line_num, column)) }
(* Scans the body of a semantic action, after its opening brace, and returns
   the offset of the matching closing brace.  Strings, character literals
   and comments are skipped as units, so braces inside them are not counted.
   The position carried by the error raised on end of input is a (0, 0)
   placeholder; the main rule calls this rule through handle_lexical_error,
   which substitutes the position of the opening brace. *)
and action = parse
    '{'
      { brace_depth := !brace_depth + 1;
        action lexbuf }
  | '}'
      { brace_depth := !brace_depth - 1;
        if !brace_depth <> 0 then action lexbuf
        else Lexing.lexeme_start lexbuf }
  | '"'
      { (* skip the string; whatever it stored is discarded *)
        reset_string_buffer ();
        string lexbuf;
        reset_string_buffer ();
        action lexbuf }
  | "'" ( [^ '\\' '\'']
        | '\\' ['\\' '"' '\'' 'n' 't' 'b' 'r']
        | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] ) "'"
      { (* character literal, plain or escaped *)
        action lexbuf }
  | "(*"
      { comment_depth := 1;
        comment lexbuf;
        action lexbuf }
  | eof
      { raise (Lexical_error ("unterminated action", 0, 0)) }
  | '\010'
      { incr line_num;
        line_start_pos := Lexing.lexeme_end lexbuf;
        action lexbuf }
  | _
      { action lexbuf }
(* Scans the inside of a string literal, after its opening double quote.
   Decoded characters are accumulated through store_string_char; the rule
   returns unit once the closing double quote has been consumed.
   The position carried by the error raised on end of input is a (0, 0)
   placeholder, meant to be replaced by handle_lexical_error. *)
and string = parse
    '"'
      { () }
  | '\\' [' ' '\013' '\009' '\012'] * '\010' [' ' '\013' '\009' '\012'] *
      { (* Backslash at end of line: the line feed and the blanks around it
           are not part of the string.  The new line starts right after the
           line feed, not after the blanks that follow it, so that columns
           on the continuation line are measured the same way as after a
           plain line feed.  The blank classes exclude line feed, hence the
           lexeme holds exactly one. *)
        let lexeme = Lexing.lexeme lexbuf in
        line_start_pos :=
          Lexing.lexeme_start lexbuf + String.index lexeme '\010' + 1;
        incr line_num;
        string lexbuf }
  | '\\' ['\\' '"' '\'' 'n' 't' 'b' 'r']
      { store_string_char (char_for_backslash (Lexing.lexeme_char lexbuf 1));
        string lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      { store_string_char (char_for_decimal_code lexbuf 1);
        string lexbuf }
  | eof
      { raise (Lexical_error ("unterminated string", 0, 0)) }
  | '\010'
      { (* a literal line feed is kept in the string and counted as a line *)
        store_string_char '\010';
        line_start_pos := Lexing.lexeme_end lexbuf;
        incr line_num;
        string lexbuf }
  | _
      { store_string_char (Lexing.lexeme_char lexbuf 0);
        string lexbuf }
(* Skips a comment, after its opener, up to the matching closer; comments
   nest, and comment_depth tracks the nesting level.  Strings and character
   literals inside the comment are skipped as units, so a comment closer or
   a double quote inside them is not taken at face value.
   The position carried by the error raised on end of input is a (0, 0)
   placeholder, meant to be replaced by handle_lexical_error. *)
and comment = parse
    "(*"
      { incr comment_depth;
        comment lexbuf }
  | "*)"
      { decr comment_depth;
        if !comment_depth = 0 then () else comment lexbuf }
  | '"'
      { (* skip the string; whatever it stored is discarded *)
        reset_string_buffer ();
        string lexbuf;
        reset_string_buffer ();
        comment lexbuf }
  | "''"
      { comment lexbuf }
  | "'" [^ '\\' '\''] "'"
      { comment lexbuf }
  | "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r'] "'"
      { (* The double quote belongs to this class, as in the main and action
           rules: otherwise an escaped double-quote character literal would
           not be skipped here and its quote would open a bogus string. *)
        comment lexbuf }
  | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { comment lexbuf }
  | eof
      { raise (Lexical_error ("unterminated comment", 0, 0)) }
  | '\010'
      { line_start_pos := Lexing.lexeme_end lexbuf;
        incr line_num;
        comment lexbuf }
  | _
      { comment lexbuf }