(***********************************************************************)
(*                                                                     *)
(*                                OCaml                                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 1996  Institut National de Recherche en Informatique et  *)
(*  en Automatique.  All rights reserved.  This file is distributed    *)
(*  under the terms of the GNU Library General Public License, with    *)
(*  the special exception on linking described in file ../LICENSE.     *)
(*                                                                     *)
(***********************************************************************)

(** A generic lexical analyzer.

   This module implements a simple ``standard'' lexical analyzer, presented
   as a function from character streams to token streams. It implements
   roughly the lexical conventions of OCaml, but is parameterized by the
   set of keywords of your language.

   Example: a lexer suitable for a desk calculator is obtained by
   {[     let lexer = make_lexer ["+";"-";"*";"/";"let";"="; "("; ")"]  ]}

   The associated parser would be a function from [token stream]
   to, for instance, [int], and would have rules such as:

   {[
     let parse_expr = parser
       [< 'Int n >] -> n
     | [< 'Kwd "("; n = parse_expr; 'Kwd ")" >] -> n
     | [< n1 = parse_expr; n2 = parse_remainder n1 >] -> n2
     and parse_remainder n1 = parser
       [< 'Kwd "+"; n2 = parse_expr >] -> n1 + n2
     | ...
   ]}

   One should notice that the use of the [parser] keyword and associated
   notation for streams are only available through camlp4 extensions. This
   means that one has to preprocess one's sources {i e.g.} by using the
   ["-pp"] command-line switch of the compilers.
*)

(** The type of tokens. The lexical classes are: [Int] and [Float]
   for integer and floating-point numbers; [String] for
   string literals, enclosed in double quotes; [Char] for
   character literals, enclosed in single quotes; [Ident] for
   identifiers (either sequences of letters, digits, underscores
   and quotes, or sequences of ``operator characters'' such as
   [+], [*], etc); and [Kwd] for keywords (either identifiers or
   single ``special characters'' such as [(], [}], etc). *)
(* One constructor per lexical class produced by the analyzer; the payload
   carries the lexeme's value (keyword/identifier text, or the decoded
   numeric, string, or character literal). *)
type token =
    Kwd of string
  | Ident of string
  | Int of int
  | Float of float
  | String of string
  | Char of char

val make_lexer : string list -> char Stream.t -> token Stream.t
(** Construct the lexer function. The first argument is the list of
   keywords. An identifier [s] is returned as [Kwd s] if [s]
   belongs to this list, and as [Ident s] otherwise.
   A special character [s] is returned as [Kwd s] if [s]
   belongs to this list, and causes a lexical error (exception
   [Stream.Error] with the offending lexeme as its parameter) otherwise.
   Blanks and newlines are skipped. Comments delimited by [(*] and [*)]
   are skipped as well, and can be nested. A [Stream.Failure] exception
   is raised if end of stream is unexpectedly reached. *)