ocaml/stdlib/scanf.ml

598 lines
19 KiB
OCaml

(***********************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* Pierre Weis, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 2002 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed *)
(* under the terms of the GNU Library General Public License, with *)
(* the special exception on linking described in file ../LICENSE. *)
(* *)
(***********************************************************************)
(* $Id$ *)
(* The run-time library for scanners. *)
(* {6 Scanning buffers} *)
module type SCANNING = sig
type scanbuf;;
val next_char : scanbuf -> unit;;
(** [Scanning.next_char scanbuf] advance the scanning buffer for
one character. *)
val peek_char : scanbuf -> char;;
(** [Scanning.peek_char scanbuf] returns the current char available in
the input. *)
val store_char : scanbuf -> char -> int -> int;;
(** [Scanning.store_char scanbuf c lim] adds [c] to the token buffer
of the scanning buffer. It also advances the scanning buffer for one
character and returns [lim - 1], indicating that there
is one less character to read. *)
val char_count : scanbuf -> int;;
(** [Scanning.char_count scanbuf] returns the number of characters read
from the given buffer. *)
val token : scanbuf -> string;;
(** [Scanning.token scanbuf] returns the string stored into the token
buffer of the scanning buffer: it returns the token matched by the
format. *)
val reset_token : scanbuf -> unit;;
(** [Scanning.reset_token scanbuf] resets the token buffer of
the given scanning buffer. *)
val token_count : scanbuf -> int;;
(** [Scanning.token_count scanbuf] returns the number of tokens read
so far from [scanbuf]. *)
val end_of_input : scanbuf -> bool;;
(** [Scanning.end_of_input scanbuf] tests the end of input condition
of the given buffer. *)
val from_string : string -> scanbuf;;
val from_channel : in_channel -> scanbuf;;
val from_function : (unit -> char) -> scanbuf;;
end;;
module Scanning : SCANNING = struct
(* The run-time library for scanf. *)
type scanbuf = {
mutable eof : bool;
mutable cur_char : char;
mutable char_count : int;
mutable token_count : int;
mutable get_next_char : unit -> char;
tokbuf : Buffer.t;
};;
let next_char ib =
try
ib.cur_char <- ib.get_next_char ();
ib.char_count <- ib.char_count + 1
with End_of_file ->
ib.cur_char <- '\000';
ib.eof <- true;;
let peek_char ib = ib.cur_char;;
let end_of_input ib = ib.eof && (ib.eof <- false; next_char ib; ib.eof);;
let char_count ib = ib.char_count;;
let reset_token ib = Buffer.reset ib.tokbuf;;
let token ib =
let tokbuf = ib.tokbuf in
let tok = Buffer.contents tokbuf in
Buffer.clear tokbuf;
ib.token_count <- 1 + ib.token_count;
tok;;
let token_count ib = ib.token_count;;
let store_char ib c max =
Buffer.add_char ib.tokbuf c;
next_char ib;
max - 1;;
let create next =
let ib = {
eof = true;
cur_char = '\000';
char_count = 0;
get_next_char = next;
tokbuf = Buffer.create 10;
token_count = 0;
} in
ib;;
let from_string s =
let i = ref 0 in
let len = String.length s in
let next () =
if !i >= len then raise End_of_file else
let c = s.[!i] in
incr i;
c in
create next;;
let from_channel ic =
let next () = input_char ic in
create next;;
let from_function f = create f;;
end;;
(** Formatted input functions. *)
let bad_input ib s =
let i = Scanning.char_count ib in
failwith
(Printf.sprintf "scanf: bad input at char number %i%s"
i (if s = "" then s else Printf.sprintf ", while scanning %s" s));;
let bad_input_buff ib = bad_input ib (Scanning.token ib);;
let bad_format fmt i fc =
invalid_arg
(Printf.sprintf
"scanf: bad format %c, at char number %i of format %s" fc i fmt);;
(* Extracting tokens from ouput token buffer. *)
let token_int ib =
let s = Scanning.token ib in
try Pervasives.int_of_string s
with Failure "int_of_string" -> bad_input ib s;;
let token_bool ib =
match Scanning.token ib with
| "true" -> true
| "false" -> false
| s -> bad_input ib ("a boolean, found " ^ s);;
let token_char ib =
(Scanning.token ib).[0];;
let token_float ib =
let s = Scanning.token ib in
float_of_string s;;
let token_string = Scanning.token;;
(* To scan native ints, int32 and int64 integers.
We cannot access to convertion to/from strings for those types,
Nativeint.of_string, Int32.of_string, and Int64.of_string,
since those modules are not available to scanf.
However, we can bind and use the corresponding primitives that are
available in the runtime. *)
external nativeint_of_string: string -> nativeint = "nativeint_of_string";;
external int32_of_string : string -> int32 = "int32_of_string";;
external int64_of_string : string -> int64 = "int64_of_string";;
let token_nativeint ib =
let s = Scanning.token ib in
nativeint_of_string s;;
let token_int32 ib =
let s = Scanning.token ib in
int32_of_string s;;
let token_int64 ib =
let s = Scanning.token ib in
int64_of_string s;;
(* Scanning numbers. *)
let scan_sign max ib =
let c = Scanning.peek_char ib in
match c with
| '+' -> Scanning.store_char ib c max
| '-' -> Scanning.store_char ib c max
| c -> max;;
(* Decimal case is optimized. *)
let rec scan_decimal_digits max ib =
if max = 0 || Scanning.end_of_input ib then max else
match Scanning.peek_char ib with
| '0' .. '9' as c ->
let max = Scanning.store_char ib c max in
scan_decimal_digits max ib
| c -> max;;
(* Other cases uses a predicate argument to scan_digits. *)
let rec scan_digits digitp max ib =
if max = 0 || Scanning.end_of_input ib then max else
match Scanning.peek_char ib with
| c when digitp c ->
let max = Scanning.store_char ib c max in
scan_digits digitp max ib
| _ -> max;;
let scan_binary_digits =
let is_binary = function
| '0' .. '1' -> true
| _ -> false in
scan_digits is_binary;;
let scan_octal_digits =
let is_octal = function
| '0' .. '8' -> true
| _ -> false in
scan_digits is_octal;;
let scan_hexadecimal_digits =
let is_hexa = function
| '0' .. '9' | 'a' .. 'f' -> true
| _ -> false in
scan_digits is_hexa;;
let scan_Hexadecimal_digits =
let is_Hexa = function
| '0' .. '9' | 'A' .. 'F' -> true
| _ -> false in
scan_digits is_Hexa;;
(* Decimal integers. *)
let scan_unsigned_decimal_int max ib =
if max = 0 || Scanning.end_of_input ib then bad_input ib "an int" else
scan_decimal_digits max ib;;
let scan_optionally_signed_decimal_int max ib =
let max = scan_sign max ib in
scan_unsigned_decimal_int max ib;;
(* Scan an unsigned integer that could be given in any (common) basis.
If digits are prefixed by 0b for one of x, X, o, b the number is
assumed to be written respectively in hexadecimal, hexadecimal,
octal, or binary. *)
let scan_unsigned_int max ib =
match Scanning.peek_char ib with
| '0' as c ->
let max = Scanning.store_char ib c max in
if max = 0 || Scanning.end_of_input ib then max else
let c = Scanning.peek_char ib in
begin match c with
| 'x' -> scan_hexadecimal_digits (Scanning.store_char ib c max) ib
| 'X' -> scan_Hexadecimal_digits (Scanning.store_char ib c max) ib
| 'o' -> scan_octal_digits (Scanning.store_char ib c max) ib
| 'b' -> scan_binary_digits (Scanning.store_char ib c max) ib
| c -> scan_decimal_digits max ib end
| c -> scan_decimal_digits max ib;;
let scan_optionally_signed_int max ib =
let max = scan_sign max ib in
if max = 0 || Scanning.end_of_input ib then bad_input ib "an int" else
scan_unsigned_int max ib;;
let scan_int conv max ib =
match conv with
| 'd' -> scan_optionally_signed_decimal_int max ib
| 'i' -> scan_optionally_signed_int max ib
| 'o' -> scan_octal_digits max ib
| 'u' -> scan_unsigned_decimal_int max ib
| 'x' -> scan_hexadecimal_digits max ib
| 'X' -> scan_Hexadecimal_digits max ib
| c -> assert false;;
(* Scanning floating point numbers. *)
let scan_frac_part max ib = scan_unsigned_decimal_int max ib;;
let scan_exp_part max ib =
if max = 0 || Scanning.end_of_input ib then max else
let c = Scanning.peek_char ib in
match c with
| 'e' | 'E' as c ->
scan_optionally_signed_int (Scanning.store_char ib c max) ib
| _ -> max;;
let scan_float max ib =
let max = scan_optionally_signed_decimal_int max ib in
if max = 0 || Scanning.end_of_input ib then max else
let c = Scanning.peek_char ib in
match c with
| '.' ->
let max = Scanning.store_char ib c max in
let max = scan_frac_part max ib in
scan_exp_part max ib
| c -> scan_exp_part max ib;;
(* Scan a regular string: it stops with a space or one of the
characters in stp. *)
let scan_string stp max ib =
let rec loop max =
if max = 0 || Scanning.end_of_input ib then max else
let c = Scanning.peek_char ib in
if stp = [] then
match c with
| ' ' | '\t' | '\n' | '\r' -> max
| c -> loop (Scanning.store_char ib c max) else
if List.mem c stp then max else loop (Scanning.store_char ib c max) in
loop max;;
(* Scan a char: peek strictly one character in the input, whatsoever. *)
let scan_char max ib =
if max = 0 || Scanning.end_of_input ib then bad_input ib "a char" else
Scanning.store_char ib (Scanning.peek_char ib) max;;
let char_for_backslash =
match Sys.os_type with
| "Unix" | "Win32" | "Cygwin" ->
begin function
| 'n' -> '\010'
| 'r' -> '\013'
| 'b' -> '\008'
| 't' -> '\009'
| c -> c
end
| "MacOS" ->
begin function
| 'n' -> '\013'
| 'r' -> '\010'
| 'b' -> '\008'
| 't' -> '\009'
| c -> c
end
| x -> assert false;;
let char_for_decimal_code ib c0 c1 c2 =
let c =
100 * (int_of_char c0 - 48) + 10 * (int_of_char c1 - 48) +
(int_of_char c2 - 48) in
if c < 0 || c > 255
then bad_input ib (Printf.sprintf "\\ %c%c%c" c0 c1 c2)
else char_of_int c;;
let bad_escape c = failwith ("illegal escape character " ^ String.make 1 c);;
(* Called when encountering '\\' as starter of a char.
Stops before the corresponding '\''. *)
let scan_backslash_char max ib =
if max = 0 || Scanning.end_of_input ib then bad_input ib "a char" else
let c = Scanning.peek_char ib in
match c with
| '\\' | '\'' | '"' | 'n' | 't' | 'b' | 'r' (* '"' helping Emacs *) ->
Scanning.store_char ib (char_for_backslash c) max
| '0' .. '9' as c ->
let get_digit () =
Scanning.next_char ib;
let c = Scanning.peek_char ib in
match c with
| '0' .. '9' as c -> c
| c -> bad_escape c in
let c0 = c in
let c1 = get_digit () in
let c2 = get_digit () in
Scanning.store_char ib (char_for_decimal_code ib c0 c1 c2) (max - 2)
| c -> bad_escape c;;
let scan_Char max ib =
let rec loop s max =
if max = 0 || Scanning.end_of_input ib then bad_input ib "a char" else
let c = Scanning.peek_char ib in
match c, s with
| '\'', 3 -> Scanning.next_char ib; loop 2 (max - 1)
| '\'', 1 -> Scanning.next_char ib; max - 1
| '\\', 2 -> Scanning.next_char ib; loop 1 (scan_backslash_char (max - 1) ib)
| c, 2 -> loop 1 (Scanning.store_char ib c max)
| c, _ -> bad_escape c in
loop 3 max;;
let scan_String stp max ib =
let rec loop s max =
if max = 0 || Scanning.end_of_input ib then bad_input ib "a string" else
let c = Scanning.peek_char ib in
if stp = [] then
match c, s with
| '"', true (* '"' helping Emacs *) ->
Scanning.next_char ib; loop false (max - 1)
| '"', false (* '"' helping Emacs *) ->
Scanning.next_char ib; max - 1
| '\\', false ->
Scanning.next_char ib; loop false (scan_backslash_char (max - 1) ib)
| c, false -> loop false (Scanning.store_char ib c max)
| c, _ -> bad_input ib (String.make 1 c) else
if List.mem c stp then max else loop s (Scanning.store_char ib c max) in
loop true max;;
let scan_bool max ib =
let m =
match Scanning.peek_char ib with
| 't' -> 4
| 'f' -> 5
| _ -> 0 in
scan_string [] (min max m) ib;;
type char_set =
| Pos_set of string
| Neg_set of string;;
let read_char_set fmt i =
let lim = String.length fmt - 1 in
let rec find_in_set i j =
if j > lim then bad_format fmt j fmt.[lim - 1] else
match fmt.[j] with
| ']' -> String.sub fmt i (j - i), j
| c -> find_in_set i (j + 1)
and find_set_sign i =
if i > lim then bad_format fmt i fmt.[lim - 1] else
match fmt.[i] with
| '^' -> let set, i = find_set (i + 1) in i, Neg_set set
| _ -> let set, i = find_set i in i, Pos_set set
and find_set i =
if i > lim then bad_format fmt i fmt.[lim - 1] else
match fmt.[i] with
| ']' -> find_in_set i (i + 1)
| c -> find_in_set i i in
find_set_sign i;;
let make_setp stp char_set =
let make_predv set =
let v = Array.make 256 false in
let lim = String.length set - 1 in
let rec loop b i =
if i <= lim then
match set.[i] with
| '-' when b ->
(* if i = 0 then b is false (since the initial call is loop false 0)
hence i >= 1 and the following is safe. *)
let c1 = set.[i - 1] in
let i = i + 1 in
if i > lim then loop false (i - 1) else
let c2 = set.[i] in
for j = int_of_char c1 to int_of_char c2 do v.(j) <- true done;
loop false (i + 1)
| c -> v.(int_of_char set.[i]) <- true; loop true (i + 1) in
loop false 0;
v in
match char_set with
| Pos_set set ->
let v = make_predv set in
List.iter (fun c -> v.(int_of_char c) <- false) stp;
(fun c -> v.(int_of_char c))
| Neg_set set ->
let v = make_predv set in
List.iter (fun c -> v.(int_of_char c) <- true) stp;
(fun c -> not (v.(int_of_char c)));;
let scan_chars_in_char_set stp char_set max ib =
let setp = make_setp stp char_set in
let rec loop max ib =
if max = 0 || Scanning.end_of_input ib then max else
let c = Scanning.peek_char ib in
if setp c then loop (Scanning.store_char ib c max) ib else max in
loop max ib;;
let rec skip_whites ib =
if not (Scanning.end_of_input ib) then
match Scanning.peek_char ib with
| ' ' | '\r' | '\t' | '\n' -> Scanning.next_char ib; skip_whites ib
| _ -> ();;
external string_of_format : ('a, 'b, 'c) format -> string = "%identity";;
(* Main scanning function:
it takes an input buffer, a format and a function.
Then it scans the format and the buffer in parallel to find out
values as specified by the format. When it founds some it applies it
to the function f and continue. *)
let bscanf ib (fmt : ('a, Scanning.scanbuf, 'c) format) f =
let fmt = string_of_format fmt in
let lim = String.length fmt - 1 in
let return v = Obj.magic v () in
let delay f x () = f x in
let stack f = delay (return f) in
let rec scan f i =
if i > lim then return f else
match fmt.[i] with
| '%' -> scan_width f (i + 1)
| '@' as t ->
let i = i + 1 in
if i > lim then bad_format fmt (i - 1) t else begin
match fmt.[i] with
| fc when Scanning.end_of_input ib -> bad_input_buff ib
| '@' as fc when Scanning.peek_char ib = fc ->
Scanning.next_char ib; scan f (i + 1)
| fc when Scanning.peek_char ib = fc ->
Scanning.next_char ib; scan f (i + 1)
| fc -> bad_input_buff ib end
| ' ' | '\r' | '\t' | '\n' -> skip_whites ib; scan f (i + 1)
| fc when Scanning.end_of_input ib -> bad_input_buff ib
| fc when Scanning.peek_char ib = fc ->
Scanning.next_char ib; scan f (i + 1)
| fc -> bad_input_buff ib
and scan_width f i =
if i > lim then bad_format fmt i '%' else
match fmt.[i] with
| '0' .. '9' as c ->
let rec read_width accu i =
if i > lim then accu, i else
match fmt.[i] with
| '0' .. '9' as c ->
let accu = 10 * accu + (int_of_char c - int_of_char '0') in
read_width accu (i + 1)
| _ -> accu, i in
let max, j = read_width 0 i in
scan_conversion max f j
| _ -> scan_conversion max_int f i
and scan_conversion max f i =
if i > lim then bad_format fmt i fmt.[lim - 1] else
match fmt.[i] with
| 'c' | 'C' as conv ->
let x = if conv = 'c' then scan_char max ib else scan_Char max ib in
scan (stack f (token_char ib)) (i + 1)
| fc when Scanning.end_of_input ib -> bad_input_buff ib
| '%' as fc when Scanning.peek_char ib = fc ->
Scanning.next_char ib; scan f (i + 1)
| '%' as fc -> bad_input_buff ib
| 'd' | 'i' | 'o' | 'u' | 'x' | 'X' as conv ->
let x = scan_int conv max ib in
scan (stack f (token_int ib)) (i + 1)
| 'f' | 'g' | 'G' | 'e' | 'E' ->
let x = scan_float max ib in
scan (stack f (token_float ib)) (i + 1)
| 's' | 'S' as conv ->
let i, stp = scan_stoppers (i + 1) in
let x =
if conv = 's'
then scan_string stp max ib
else scan_String stp max ib in
scan (stack f (token_string ib)) (i + 1)
| 'b' ->
let x = scan_bool max ib in
scan (stack f (token_bool ib)) (i + 1)
| '[' ->
let i, char_set = read_char_set fmt (i + 1) in
let i, stp = scan_stoppers (i + 1) in
let x = scan_chars_in_char_set stp char_set max ib in
scan (stack f (token_string ib)) (i + 1)
| 'l' | 'n' | 'L' as t ->
let i = i + 1 in
if i > lim then bad_format fmt (i - 1) t else begin
match fmt.[i] with
| 'd' | 'i' | 'o' | 'u' | 'x' | 'X' as c ->
let x = scan_int c max ib in
begin match t with
| 'l' -> scan (stack f (token_int32 ib)) (i + 1)
| 'L' -> scan (stack f (token_int64 ib)) (i + 1)
| _ -> scan (stack f (token_nativeint ib)) (i + 1) end
| fc -> bad_format fmt i fc end
| 'N' ->
let x = Scanning.char_count ib in
scan (stack f x) (i + 1)
| 'r' ->
Obj.magic (fun reader arg ->
let x = reader ib arg in
scan (stack f x) (succ i))
| c -> bad_format fmt i c
and scan_stoppers i =
if i > lim then i - 1, [] else
match fmt.[i] with
| '@' when i < lim -> let i = i + 1 in i, [fmt.[i]]
| _ -> i - 1, [] in
Scanning.reset_token ib;
scan (fun () -> f) 0;;
let fscanf ic = bscanf (Scanning.from_channel ic);;
let scanf fmt = fscanf stdin fmt;;
let sscanf s = bscanf (Scanning.from_string s);;