ocaml/stdlib/genlex.ml

(***********************************************************************)
(*                                                                     *)
(*                           Objective Caml                            *)
(*                                                                     *)
(*              Xavier Leroy, projet Cristal, INRIA Rocquencourt       *)
(*                                                                     *)
(*  Copyright 1996 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  All rights reserved.  This file is distributed    *)
(*  under the terms of the GNU Library General Public License.         *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

type token =
    Kwd of string
  | Ident of string
  | Int of int
  | Float of float
  | String of string
  | Char of char


(* The string buffering machinery *)

let initial_buffer = String.create 32

let buffer = ref initial_buffer
let bufpos = ref 0

let reset_buffer () =
  buffer := initial_buffer;
  bufpos := 0

let store c =
  if !bufpos >= String.length !buffer then begin
    let newbuffer = String.create (2 * !bufpos) in
    String.blit !buffer 0 newbuffer 0 !bufpos;
    buffer := newbuffer
  end;
  String.set !buffer !bufpos c;
  incr bufpos

let get_string () =
  let s = String.sub !buffer 0 !bufpos in buffer := initial_buffer; s

(* The lexer *)

let make_lexer keywords =

  let kwd_table = Hashtbl.create 17 in
  List.iter (fun s -> Hashtbl.add kwd_table s (Kwd s)) keywords;

  let ident_or_keyword id =
    try Hashtbl.find kwd_table id with Not_found -> Ident id

  and keyword_or_error c =
    let s = String.make 1 c in
      try Hashtbl.find kwd_table s
      with Not_found -> raise(Stream.Error("Illegal character " ^ s)) in

  let rec next_token = parser
    [< '  ' '|'\010'|'\013'|'\009'|'\026'|'\012'; s >] ->
      next_token s
  | [< '  'A'..'Z'|'a'..'z'|'\192'..'\255' as c; s>] ->
      reset_buffer(); store c; ident s
  | [< '  '!'|'%'|'&'|'$'|'#'|'+'|'/'|':'|'<'|'='|'>'|'?'|'@'|'\\'|
              '~'|'^'|'|'|'*' as c; s >] ->
      reset_buffer(); store c; ident2 s
  | [< '  '0'..'9' as c; s>] ->
      reset_buffer(); store c; number s
  | [< '  '\''; c = char; '  '\'' >] ->
      Some(Char c)
  | [< '  '"' (* '"' *); s >] ->
      reset_buffer(); Some(String(string s))
  | [< '  '-'; s >] ->
      neg_number s
  | [< '  '('; s >] ->
      maybe_comment s
  | [< ' c >] ->
      Some(keyword_or_error c)
  | [< >] ->
      None

  and ident = parser
    [< '  'A'..'Z'|'a'..'z'|'\192'..'\255'|'0'..'9'|'_'|'\'' as c; s>] ->
      store c; ident s
  | [< >] ->
      Some(ident_or_keyword(get_string()))

  and ident2 = parser
    [< '  '!'|'%'|'&'|'$'|'#'|'+'|'-'|'/'|':'|'<'|'='|'>'|'?'|'@'|'\\'|
              '~'|'^'|'|'|'*' as c; s >] ->
      store c; ident2 s
  | [< >] ->
      Some(ident_or_keyword(get_string()))

  and neg_number = parser
    [< '  '0'..'9' as c; s >] ->
      reset_buffer(); store '-'; store c; number s
  | [< s >] ->
      reset_buffer(); store '-'; ident2 s

  and number = parser
    [< '  '0'..'9' as c; s >] ->
      store c; number s
  | [< '  '.'; s >] ->
      store '.'; decimal_part s
  | [< '  'e'|'E'; s >] ->
      store 'E'; exponent_part s
  | [< >] ->
      Some(Int(int_of_string(get_string())))

  and decimal_part = parser
    [< '  '0'..'9' as c; s >] ->
      store c; decimal_part s
  | [< '  'e'|'E'; s >] ->
      store 'E'; exponent_part s
  | [< >] ->
      Some(Float(float_of_string(get_string())))

  and exponent_part = parser
    [< '  '+'|'-' as c; s >] ->
      store c; end_exponent_part s
  | [< s >] ->
      end_exponent_part s

  and end_exponent_part = parser
    [< '  '0'..'9' as c; s >] ->
      store c; end_exponent_part s
  | [< >] ->
      Some(Float(float_of_string(get_string())))

  and string = parser
    [< '  '"' (* '"' *)  >] -> get_string()
  | [< '  '\\'; c = escape; s >] -> store c; string s
  | [< ' c; s >] -> store c; string s

  and char = parser
    [< '  '\\'; c = escape >] -> c
  | [< ' c >] -> c

  and escape = parser
    [< '  'n' >] -> '\n'
  | [< '  'r' >] -> '\r'
  | [< '  't' >] -> '\t'
  | [< '  '0'..'9' as c1; '  '0'..'9' as c2; '  '0'..'9' as c3 >] ->
      Char.chr((Char.code c1 - 48) * 100 +
               (Char.code c2 - 48) * 10 +
               (Char.code c3 - 48))
  | [< ' c >] -> c

  and maybe_comment = parser
    [< '  '*'; s >] -> comment s; next_token s
  | [< >] -> Some(keyword_or_error '(')

  and comment = parser
    [< '  '('; s >] -> maybe_nested_comment s
  | [< '  '*'; s >] -> maybe_end_comment s
  | [< ' c; s >] -> comment s

  and maybe_nested_comment = parser
    [< '  '*'; s >] -> comment s; comment s
  | [< ' c; s >] -> comment s

  and maybe_end_comment = parser
    [< '  ')' >] -> ()
  | [< '  '*'; s >] -> maybe_end_comment s
  | [< ' c; s >] -> comment s

  in fun input -> Stream.from (fun count -> next_token input)