ocaml/stdlib/genlex.ml

(***********************************************************************)
(*                                                                     *)
(*                           Objective Caml                            *)
(*                                                                     *)
(*              Xavier Leroy, projet Cristal, INRIA Rocquencourt       *)
(*                                                                     *)
(*  Copyright 1996 Institut National de Recherche en Informatique et   *)
(*  Automatique.  Distributed only by permission.                      *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

type token =
    Kwd of string
  | Ident of string
  | Int of int
  | Float of float
  | String of string
  | Char of char


(* The string buffering machinery *)

let initial_buffer = String.create 32

let buffer = ref initial_buffer
let bufpos = ref 0

let reset_buffer () =
  buffer := initial_buffer;
  bufpos := 0

let store c =
  if !bufpos >= String.length !buffer then begin
    let newbuffer = String.create (2 * !bufpos) in
    String.blit !buffer 0 newbuffer 0 !bufpos;
    buffer := newbuffer
  end;
  String.set !buffer !bufpos c;
  incr bufpos

let get_string () =
  let s = String.sub !buffer 0 !bufpos in buffer := initial_buffer; s

(* The lexer *)

let make_lexer keywords =

  let kwd_table = Hashtbl.create 17 in
  List.iter (fun s -> Hashtbl.add kwd_table s (Kwd s)) keywords;

  let ident_or_keyword id =
    try Hashtbl.find kwd_table id with Not_found -> Ident id

  and keyword_or_error c =
    let s = String.make 1 c in
      try Hashtbl.find kwd_table s
      with Not_found -> raise(Stream.Parse_error("Illegal character " ^ s)) in

  let rec next_token = parser
    [< '  ' '|'\010'|'\013'|'\009'|'\026'|'\012'; s >] ->
      next_token s
  | [< '  'A'..'Z'|'a'..'z'|'\192'..'\255' as c; s>] ->
      reset_buffer(); store c; ident s
  | [< '  '!'|'%'|'&'|'$'|'#'|'+'|'/'|':'|'<'|'='|'>'|'?'|'@'|'\\'|
              '~'|'^'|'|'|'*' as c; s >] ->
      reset_buffer(); store c; ident2 s
  | [< '  '0'..'9' as c; s>] ->
      reset_buffer(); store c; number s
  | [< '  '\''; c = char; '  '\'' >] ->
      Some(Char c)
  | [< '  '"' (* '"' *); s >] ->
      reset_buffer(); Some(String(string s))
  | [< '  '-'; s >] ->
      neg_number s
  | [< '  '('; s >] ->
      maybe_comment s
  | [< ' c >] ->
      Some(keyword_or_error c)
  | [< >] ->
      None

  and ident = parser
    [< '  'A'..'Z'|'a'..'z'|'\192'..'\255'|'0'..'9'|'_'|'\'' as c; s>] ->
      store c; ident s
  | [< >] ->
      Some(ident_or_keyword(get_string()))

  and ident2 = parser
    [< '  '!'|'%'|'&'|'$'|'#'|'+'|'-'|'/'|':'|'<'|'='|'>'|'?'|'@'|'\\'|
              '~'|'^'|'|'|'*' as c; s >] ->
      store c; ident2 s
  | [< >] ->
      Some(ident_or_keyword(get_string()))

  and neg_number = parser
    [< '  '0'..'9' as c; s >] ->
      reset_buffer(); store '-'; store c; number s
  | [< s >] ->
      reset_buffer(); store '-'; ident2 s
    
  and number = parser
    [< '  '0'..'9' as c; s >] ->
      store c; number s
  | [< '  '.'; s >] ->
      store '.'; decimal_part s
  | [< '  'e'|'E'; s >] ->
      store 'E'; exponent_part s
  | [< >] ->
      Some(Int(int_of_string(get_string())))

  and decimal_part = parser
    [< '  '0'..'9' as c; s >] ->
      store c; decimal_part s
  | [< '  'e'|'E'; s >] ->
      store 'E'; exponent_part s
  | [< >] ->
      Some(Float(float_of_string(get_string())))

  and exponent_part = parser
    [< '  '+'|'-' as c; s >] ->
      store c; end_exponent_part s
  | [< s >] ->
      end_exponent_part s

  and end_exponent_part = parser
    [< '  '0'..'9' as c; s >] ->
      store c; end_exponent_part s
  | [< >] ->
      Some(Float(float_of_string(get_string())))

  and string = parser
    [< '  '"' (* '"' *)  >] -> get_string()
  | [< '  '\\'; c = escape; s >] -> store c; string s
  | [< ' c; s >] -> store c; string s

  and char = parser
    [< '  '\\'; c = escape >] -> c
  | [< ' c >] -> c

  and escape = parser
    [< '  'n' >] -> '\n'
  | [< '  'r' >] -> '\r'
  | [< '  't' >] -> '\t'
  | [< '  '0'..'9' as c1; '  '0'..'9' as c2; '  '0'..'9' as c3 >] ->
      Char.chr((Char.code c1 - 48) * 100 +
               (Char.code c2 - 48) * 10 + (Char.code c3))
  | [< ' c >] -> c

  and maybe_comment = parser
    [< '  '*'; s >] -> comment s; next_token s
  | [< >] -> Some(keyword_or_error '(')

  and comment = parser
    [< '  '('; s >] -> maybe_nested_comment s
  | [< '  '*'; s >] -> maybe_end_comment s
  | [< ' c; s >] -> comment s

  and maybe_nested_comment = parser
    [< '  '*'; s >] -> comment s; comment s
  | [< ' c; s >] -> comment s

  and maybe_end_comment = parser
    [< '  ')' >] -> ()
  | [< ' c; s >] -> comment s

  in fun input -> Stream.from (fun count -> next_token input)
Pervasives: ajout option Open_nonblock Stream: commentaire. Makefile: ajout de Genlex. git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@769 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 1996-04-29 06:23:25 -07:00			`(***********************************************************************)`
			`(* *)`
Renommage en Objective Caml git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@782 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 1996-04-30 07:53:58 -07:00			`(* Objective Caml *)`
Pervasives: ajout option Open_nonblock Stream: commentaire. Makefile: ajout de Genlex. git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@769 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 1996-04-29 06:23:25 -07:00			`(* *)`
			`(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)`
			`(* *)`
Renommage en Objective Caml git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@782 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 1996-04-30 07:53:58 -07:00			`(* Copyright 1996 Institut National de Recherche en Informatique et *)`
Pervasives: ajout option Open_nonblock Stream: commentaire. Makefile: ajout de Genlex. git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@769 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02 1996-04-29 06:23:25 -07:00			`(* Automatique. Distributed only by permission. *)`
			`(* *)`
			`(***********************************************************************)`

			`(* $Id$ *)`

			`type token =`
			`Kwd of string`
			`\| Ident of string`
			`\| Int of int`
			`\| Float of float`
			`\| String of string`
			`\| Char of char`


			`(* The string buffering machinery *)`

			`let initial_buffer = String.create 32`

			`let buffer = ref initial_buffer`
			`let bufpos = ref 0`

			`let reset_buffer () =`
			`buffer := initial_buffer;`
			`bufpos := 0`

			`let store c =`
			`if !bufpos >= String.length !buffer then begin`
			`let newbuffer = String.create (2 * !bufpos) in`
			`String.blit !buffer 0 newbuffer 0 !bufpos;`
			`buffer := newbuffer`
			`end;`
			`String.set !buffer !bufpos c;`
			`incr bufpos`

			`let get_string () =`
			`let s = String.sub !buffer 0 !bufpos in buffer := initial_buffer; s`

			`(* The lexer *)`

			`let make_lexer keywords =`

			`let kwd_table = Hashtbl.create 17 in`
			`List.iter (fun s -> Hashtbl.add kwd_table s (Kwd s)) keywords;`

			`let ident_or_keyword id =`
			`try Hashtbl.find kwd_table id with Not_found -> Ident id`

			`and keyword_or_error c =`
			`let s = String.make 1 c in`
			`try Hashtbl.find kwd_table s`
			`with Not_found -> raise(Stream.Parse_error("Illegal character " ^ s)) in`

			`let rec next_token = parser`
			`[< ' ' '\|'\010'\|'\013'\|'\009'\|'\026'\|'\012'; s >] ->`
			`next_token s`
			`\| [< ' 'A'..'Z'\|'a'..'z'\|'\192'..'\255' as c; s>] ->`
			`reset_buffer(); store c; ident s`
			`\| [< ' '!'\|'%'\|'&'\|'$'\|'#'\|'+'\|'/'\|':'\|'<'\|'='\|'>'\|'?'\|'@'\|'\\'\|`
			`'~'\|'^'\|'\|'\|'*' as c; s >] ->`
			`reset_buffer(); store c; ident2 s`
			`\| [< ' '0'..'9' as c; s>] ->`
			`reset_buffer(); store c; number s`
			`\| [< ' '\''; c = char; ' '\'' >] ->`
			`Some(Char c)`
			`\| [< ' '"' (* '"' *); s >] ->`
			`reset_buffer(); Some(String(string s))`
			`\| [< ' '-'; s >] ->`
			`neg_number s`
			`\| [< ' '('; s >] ->`
			`maybe_comment s`
			`\| [< ' c >] ->`
			`Some(keyword_or_error c)`
			`\| [< >] ->`
			`None`

			`and ident = parser`
			`[< ' 'A'..'Z'\|'a'..'z'\|'\192'..'\255'\|'0'..'9'\|'_'\|'\'' as c; s>] ->`
			`store c; ident s`
			`\| [< >] ->`
			`Some(ident_or_keyword(get_string()))`

			`and ident2 = parser`
			`[< ' '!'\|'%'\|'&'\|'$'\|'#'\|'+'\|'-'\|'/'\|':'\|'<'\|'='\|'>'\|'?'\|'@'\|'\\'\|`
			`'~'\|'^'\|'\|'\|'*' as c; s >] ->`
			`store c; ident2 s`
			`\| [< >] ->`
			`Some(ident_or_keyword(get_string()))`

			`and neg_number = parser`
			`[< ' '0'..'9' as c; s >] ->`
			`reset_buffer(); store '-'; store c; number s`
			`\| [< s >] ->`
			`reset_buffer(); store '-'; ident2 s`

			`and number = parser`
			`[< ' '0'..'9' as c; s >] ->`
			`store c; number s`
			`\| [< ' '.'; s >] ->`
			`store '.'; decimal_part s`
			`\| [< ' 'e'\|'E'; s >] ->`
			`store 'E'; exponent_part s`
			`\| [< >] ->`
			`Some(Int(int_of_string(get_string())))`

			`and decimal_part = parser`
			`[< ' '0'..'9' as c; s >] ->`
			`store c; decimal_part s`
			`\| [< ' 'e'\|'E'; s >] ->`
			`store 'E'; exponent_part s`
			`\| [< >] ->`
			`Some(Float(float_of_string(get_string())))`

			`and exponent_part = parser`
			`[< ' '+'\|'-' as c; s >] ->`
			`store c; end_exponent_part s`
			`\| [< s >] ->`
			`end_exponent_part s`

			`and end_exponent_part = parser`
			`[< ' '0'..'9' as c; s >] ->`
			`store c; end_exponent_part s`
			`\| [< >] ->`
			`Some(Float(float_of_string(get_string())))`

			`and string = parser`
			`[< ' '"' (* '"' *) >] -> get_string()`
			`\| [< ' '\\'; c = escape; s >] -> store c; string s`
			`\| [< ' c; s >] -> store c; string s`

			`and char = parser`
			`[< ' '\\'; c = escape >] -> c`
			`\| [< ' c >] -> c`

			`and escape = parser`
			`[< ' 'n' >] -> '\n'`
			`\| [< ' 'r' >] -> '\r'`
			`\| [< ' 't' >] -> '\t'`
			`\| [< ' '0'..'9' as c1; ' '0'..'9' as c2; ' '0'..'9' as c3 >] ->`
			`Char.chr((Char.code c1 - 48) * 100 +`
			`(Char.code c2 - 48) * 10 + (Char.code c3))`
			`\| [< ' c >] -> c`

			`and maybe_comment = parser`
			`[< ' '*'; s >] -> comment s; next_token s`
			`\| [< >] -> Some(keyword_or_error '(')`

			`and comment = parser`
			`[< ' '('; s >] -> maybe_nested_comment s`
			`\| [< ' '*'; s >] -> maybe_end_comment s`
			`\| [< ' c; s >] -> comment s`

			`and maybe_nested_comment = parser`
			`[< ' '*'; s >] -> comment s; comment s`
			`\| [< ' c; s >] -> comment s`

			`and maybe_end_comment = parser`
			`[< ' ')' >] -> ()`
			`\| [< ' c; s >] -> comment s`

			`in fun input -> Stream.from (fun count -> next_token input)`