ocaml/otherlibs/str/str.ml

(***********************************************************************)
(*                                                                     *)
(*                           Objective Caml                            *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 1996 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  All rights reserved.  This file is distributed    *)
(*  under the terms of the GNU Library General Public License, with    *)
(*  the special exception on linking described in file ../../LICENSE.  *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(** String utilities *)

let string_before s n = String.sub s 0 n

let string_after s n = String.sub s n (String.length s - n)

let first_chars s n = String.sub s 0 n

let last_chars s n = String.sub s (String.length s - n) n

(** Representation of character sets **)

module Charset =
  struct
    type t = string (* of length 32 *)

    let empty = String.make 32 '\000'
    let full = String.make 32 '\255'

    let make_empty () = String.make 32 '\000'

    let add s c =
      let i = Char.code c in
      s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7)))

    let add_range s c1 c2 =
      for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done

    let singleton c =
      let s = make_empty () in add s c; s

    let range c1 c2 =
      let s = make_empty () in add_range s c1 c2; s

    let complement s =
      let r = String.create 32 in
      for i = 0 to 31 do
        r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF)
      done;
      r

    let union s1 s2 =
      let r = String.create 32 in
      for i = 0 to 31 do
        r.[i] <- Char.chr(Char.code s1.[i] lor Char.code s2.[i])
      done;
      r

    let disjoint s1 s2 =
      try
        for i = 0 to 31 do
          if Char.code s1.[i] land Char.code s2.[i] <> 0 then raise Exit
        done;
        true
      with Exit ->
        false

    let iter fn s =
      for i = 0 to 31 do
        let c = Char.code s.[i] in
        if c <> 0 then
          for j = 0 to 7 do
            if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) + j))
          done
      done

    let expand s =
      let r = String.make 256 '\000' in
      iter (fun c -> r.[Char.code c] <- '\001') s;
      r

    let fold_case s =
      let r = make_empty() in
      iter (fun c -> add r (Char.lowercase c); add r (Char.uppercase c)) s;
      r

  end

(** Abstract syntax tree for regular expressions *)

type re_syntax =
    Char of char
  | String of string
  | CharClass of Charset.t
  | Seq of re_syntax list
  | Alt of re_syntax * re_syntax
  | Star of re_syntax
  | Plus of re_syntax
  | Option of re_syntax
  | Group of int * re_syntax
  | Refgroup of int
  | Bol
  | Eol
  | Wordboundary

(** Representation of compiled regular expressions *)

type regexp = {
  prog: int array;         (* bytecode instructions *)
  cpool: string array;     (* constant pool (string literals) *)
  normtable: string;       (* case folding table (if any) *)
  numgroups: int;          (* number of \(...\) groups *)
  numregisters: int;       (* number of nullable Star or Plus *)
  startchars: int          (* index of set of starting chars, or -1 if none *)
}

(** Opcodes for bytecode instructions; see strstubs.c for description *)

let op_CHAR = 0
let op_CHARNORM = 1
let op_STRING = 2
let op_STRINGNORM = 3
let op_CHARCLASS = 4
let op_BOL = 5
let op_EOL = 6
let op_WORDBOUNDARY = 7
let op_BEGGROUP = 8
let op_ENDGROUP = 9
let op_REFGROUP = 10
let op_ACCEPT = 11
let op_SIMPLEOPT = 12
let op_SIMPLESTAR = 13
let op_SIMPLEPLUS = 14
let op_GOTO = 15
let op_PUSHBACK = 16
let op_SETMARK = 17
let op_CHECKPROGRESS = 18

(* Encoding of bytecode instructions *)

let instr opc arg = opc lor (arg lsl 8)

(* Computing relative displacements for GOTO and PUSHBACK instructions *)

let displ dest from = dest - from - 1

(** Compilation of a regular expression *)

(* Determine if a regexp can match the empty string *)

let rec is_nullable = function
    Char c -> false
  | String s -> s = ""
  | CharClass cl -> false
  | Seq rl -> List.for_all is_nullable rl
  | Alt (r1, r2) -> is_nullable r1 || is_nullable r2
  | Star r -> true
  | Plus r -> is_nullable r
  | Option r -> true
  | Group(n, r) -> is_nullable r
  | Refgroup n -> true
  | Bol -> true
  | Eol -> true
  | Wordboundary -> true

(* first r returns a set of characters C such that:
     for all string s, s matches r => the first character of s is in C.
   For convenience, return Charset.full if r is nullable. *)

let rec first = function
    Char c -> Charset.singleton c
  | String s -> if s = "" then Charset.full else Charset.singleton s.[0]
  | CharClass cl -> cl
  | Seq rl -> first_seq rl
  | Alt (r1, r2) -> Charset.union (first r1) (first r2)
  | Star r -> Charset.full
  | Plus r -> first r
  | Option r -> Charset.full
  | Group(n, r) -> first r
  | Refgroup n -> Charset.full
  | Bol -> Charset.full
  | Eol -> Charset.full
  | Wordboundary -> Charset.full

and first_seq = function
    [] -> Charset.full
  | (Bol | Eol | Wordboundary) :: rl -> first_seq rl
  | Star r :: rl -> Charset.union (first r) (first_seq rl)
  | Option r :: rl -> Charset.union (first r) (first_seq rl)
  | r :: rl -> first r

(* Transform a Char or CharClass regexp into a character class *)

let charclass_of_regexp fold_case re =
  let cl =
    match re with
      Char c -> Charset.singleton c
    | CharClass cl -> cl
    | _ -> assert false in
  if fold_case then Charset.fold_case cl else cl

(* The case fold table: maps characters to their lowercase equivalent *)

let fold_case_table =
  let t = String.create 256 in
  for i = 0 to 255 do t.[i] <- Char.lowercase(Char.chr i) done;
  t

module StringMap = Map.Make(struct type t = string let compare = compare end)

(* Compilation of a regular expression *)

let compile fold_case re =

  (* Instruction buffering *)
  let prog = ref (Array.make 32 0)
  and progpos = ref 0
  and cpool = ref StringMap.empty
  and cpoolpos = ref 0
  and numgroups = ref 1
  and numregs = ref 0 in
  (* Add a new instruction *)
  let emit_instr opc arg =
    if !progpos >= Array.length !prog then begin
      let newlen = ref (Array.length !prog) in
      while !progpos >= !newlen do newlen := !newlen * 2 done;
      let nprog = Array.make !newlen 0 in
      Array.blit !prog 0 nprog 0 (Array.length !prog);
      prog := nprog
    end;
    (!prog).(!progpos) <- (instr opc arg);
    incr progpos in
  (* Reserve an instruction slot and return its position *)
  let emit_hole () =
    let p = !progpos in incr progpos; p in
  (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *)
  let patch_instr pos opc dest =
    (!prog).(pos) <- (instr opc (displ dest pos)) in
  (* Return the cpool index for the given string, adding it if not
     already there *)
  let cpool_index s =
    try
      StringMap.find s !cpool
    with Not_found ->
      let p = !cpoolpos in
      cpool := StringMap.add s p !cpool;
      incr cpoolpos;
      p in
  (* Allocate fresh register if regexp is nullable *)
  let allocate_register_if_nullable r =
    if is_nullable r then begin
      let n = !numregs in
      if n >= 64 then failwith "too many r* or r+ where r is nullable";
      incr numregs;
      n
    end else
      -1 in
  (* Main recursive compilation function *)
  let rec emit_code = function
    Char c ->
      if fold_case then
        emit_instr op_CHARNORM (Char.code (Char.lowercase c))
      else
        emit_instr op_CHAR (Char.code c)
  | String s ->
      begin match String.length s with
        0 -> ()
      | 1 ->
        if fold_case then
          emit_instr op_CHARNORM (Char.code (Char.lowercase s.[0]))
        else
          emit_instr op_CHAR (Char.code s.[0])
      | _ ->
        try
          (* null characters are not accepted by the STRING* instructions;
             if one is found, split string at null character *)
          let i = String.index s '\000' in
          emit_code (String (string_before s i));
          emit_instr op_CHAR 0;
          emit_code (String (string_after s (i+1)))
        with Not_found ->
          if fold_case then
            emit_instr op_STRINGNORM (cpool_index (String.lowercase s))
          else
            emit_instr op_STRING (cpool_index s)
      end
  | CharClass cl ->
      let cl' = if fold_case then Charset.fold_case cl else cl in
      emit_instr op_CHARCLASS (cpool_index cl')
  | Seq rl ->
      emit_seq_code rl
  | Alt(r1, r2) ->
      (*      PUSHBACK lbl1
              <match r1>
              GOTO lbl2
        lbl1: <match r2>
        lbl2: ... *)
      let pos_pushback = emit_hole() in
      emit_code r1;
      let pos_goto_end = emit_hole() in
      let lbl1 = !progpos in
      emit_code r2;
      let lbl2 = !progpos in
      patch_instr pos_pushback op_PUSHBACK lbl1;
      patch_instr pos_goto_end op_GOTO lbl2
  | Star r ->
      (* Implement longest match semantics for compatibility with old Str *)
      (* General translation:
           lbl1: PUSHBACK lbl2
                 SETMARK regno
                 <match r>
                 CHECKPROGRESS regno
                 GOTO lbl1
           lbl2:
         If r cannot match the empty string, code can be simplified:
           lbl1: PUSHBACK lbl2
                 <match r>
                 GOTO lbl1
           lbl2:
        *)
      let regno = allocate_register_if_nullable r in
      let lbl1 = emit_hole() in
      if regno >= 0 then emit_instr op_SETMARK regno;
      emit_code r;
      if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
      emit_instr op_GOTO (displ lbl1 !progpos);
      let lbl2 = !progpos in
      patch_instr lbl1 op_PUSHBACK lbl2
  | Plus r ->
      (* Implement longest match semantics for compatibility with old Str *)
      (* General translation:
           lbl1: <match r>
                 CHECKPROGRESS regno
                 PUSHBACK lbl2
                 SETMARK regno
                 GOTO lbl1
           lbl2:
         If r cannot match the empty string, code can be simplified:
           lbl1: <match r>
                 PUSHBACK lbl2
                 GOTO_PLUS lbl1
           lbl2:
      *)
      let regno = allocate_register_if_nullable r in
      let lbl1 = !progpos in
      emit_code r;
      if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
      let pos_pushback = emit_hole() in
      if regno >= 0 then emit_instr op_SETMARK regno;
      emit_instr op_GOTO (displ lbl1 !progpos);
      let lbl2 = !progpos in
      patch_instr pos_pushback op_PUSHBACK lbl2
  | Option r ->
      (* Implement longest match semantics for compatibility with old Str *)
      (*      PUSHBACK lbl
              <match r>
         lbl:
      *)
      let pos_pushback = emit_hole() in
      emit_code r;
      let lbl = !progpos in
      patch_instr pos_pushback op_PUSHBACK lbl
  | Group(n, r) ->
      if n >= 32 then failwith "too many \\(...\\) groups";
      emit_instr op_BEGGROUP n;
      emit_code r;
      emit_instr op_ENDGROUP n;
      numgroups := max !numgroups (n+1)
  | Refgroup n ->
      emit_instr op_REFGROUP n
  | Bol ->
      emit_instr op_BOL 0
  | Eol ->
      emit_instr op_EOL 0
  | Wordboundary ->
      emit_instr op_WORDBOUNDARY 0

  and emit_seq_code = function
    [] -> ()
  | Star(Char _ | CharClass _ as r) :: rl
    when disjoint_modulo_case (first r) (first_seq rl) ->
      emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r));
      emit_seq_code rl
  | Plus(Char _ | CharClass _ as r) :: rl
    when disjoint_modulo_case (first r) (first_seq rl) ->
      emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r));
      emit_seq_code rl
  | Option(Char _ | CharClass _ as r) :: rl
    when disjoint_modulo_case (first r) (first_seq rl) ->
      emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r));
      emit_seq_code rl
  | r :: rl ->
      emit_code r;
      emit_seq_code rl

  and disjoint_modulo_case c1 c2 =
    if fold_case
    then Charset.disjoint (Charset.fold_case c1) (Charset.fold_case c2)
    else Charset.disjoint c1 c2
  in

  emit_code re;
  emit_instr op_ACCEPT 0;
  let start = first re in
  let start' = if fold_case then Charset.fold_case start else start in
  let start_pos =
    if start = Charset.full
    then -1
    else cpool_index (Charset.expand start') in
  let constantpool = Array.make !cpoolpos "" in
  StringMap.iter (fun str idx -> constantpool.(idx) <- str) !cpool;
  { prog = Array.sub !prog 0 !progpos;
    cpool = constantpool;
    normtable = if fold_case then fold_case_table else "";
    numgroups = !numgroups;
    numregisters = !numregs;
    startchars = start_pos }

(** Parsing of a regular expression *)

(* Efficient buffering of sequences *)

module SeqBuffer = struct

  type t = { sb_chars: Buffer.t; mutable sb_next: re_syntax list }

  let create() = { sb_chars = Buffer.create 16; sb_next = [] }

  let flush buf =
    let s = Buffer.contents buf.sb_chars in
    Buffer.clear buf.sb_chars;
    match String.length s with
      0 -> ()
    | 1 -> buf.sb_next <- Char s.[0] :: buf.sb_next
    | _ -> buf.sb_next <- String s :: buf.sb_next

  let add buf re =
    match re with
      Char c -> Buffer.add_char buf.sb_chars c
    | _ -> flush buf; buf.sb_next <- re :: buf.sb_next

  let extract buf =
    flush buf; Seq(List.rev buf.sb_next)

end

(* The character class corresponding to `.' *)

let dotclass = Charset.complement (Charset.singleton '\n')

(* Parse a regular expression *)

let parse s =
  let len = String.length s in
  let group_counter = ref 1 in

  let rec regexp0 i =
    let (r, j) = regexp1 i in
    regexp0cont r j
  and regexp0cont r1 i =
    if i + 2 <= len && s.[i] = '\\' && s.[i+1] = '|' then
      let (r2, j) = regexp1 (i+2) in
      regexp0cont (Alt(r1, r2)) j
    else
      (r1, i)
  and regexp1 i =
    regexp1cont (SeqBuffer.create()) i
  and regexp1cont sb i =
    if i >= len
    || i + 2 <= len && s.[i] = '\\' && (let c = s.[i+1] in c = '|' || c = ')')
    then
      (SeqBuffer.extract sb, i)
    else
      let (r, j) = regexp2 i in
      SeqBuffer.add sb r;
      regexp1cont sb j
  and regexp2 i =
    let (r, j) = regexp3 i in
    regexp2cont r j
  and regexp2cont r i =
    if i >= len then (r, i) else
      match s.[i] with
        '?' -> regexp2cont (Option r) (i+1)
      | '*' -> regexp2cont (Star r) (i+1)
      | '+' -> regexp2cont (Plus r) (i+1)
      |  _  -> (r, i)
  and regexp3 i =
    match s.[i] with
      '\\' -> regexpbackslash (i+1)
    | '['  -> let (c, j) = regexpclass0 (i+1) in (CharClass c, j)
    | '^'  -> (Bol, i+1)
    | '$'  -> (Eol, i+1)
    | '.'  -> (CharClass dotclass, i+1)
    | c    -> (Char c, i+1)
  and regexpbackslash i =
    if i >= len then (Char '\\', i) else
      match s.[i] with
        '|' | ')' ->
          assert false
      | '(' ->
          let group_no = !group_counter in
          if group_no < 32 then incr group_counter;
          let (r, j) = regexp0 (i+1) in
          if j + 1 < len && s.[j] = '\\' && s.[j+1] = ')' then
            if group_no < 32
            then (Group(group_no, r), j + 2)
            else (r, j + 2)
          else
            failwith "\\( group not closed by \\)"
      | '1' .. '9' as c ->
          (Refgroup(Char.code c - 48), i + 1)
      | 'b' ->
          (Wordboundary, i + 1)
      | c ->
          (Char c, i + 1)
  and regexpclass0 i =
    if i < len && s.[i] = '^'
    then let (c, j) = regexpclass1 (i+1) in (Charset.complement c, j)
    else regexpclass1 i
  and regexpclass1 i =
    let c = Charset.make_empty() in
    let j = regexpclass2 c i i in
    (c, j)
  and regexpclass2 c start i =
    if i >= len then failwith "[ class not closed by ]";
    if s.[i] = ']' && i > start then i+1 else begin
      let c1 = s.[i] in
      if i+2 < len && s.[i+1] = '-' && s.[i+2] <> ']' then begin
        let c2 = s.[i+2] in
        Charset.add_range c c1 c2;
        regexpclass2 c start (i+3)
      end else begin
        Charset.add c c1;
        regexpclass2 c start (i+1)
      end
    end in

  let (r, j) = regexp0 0 in
  if j = len then r else failwith "spurious \\) in regular expression"

(** Parsing and compilation *)

let regexp e = compile false (parse e)

let regexp_case_fold e = compile true (parse e)

let quote s =
  let len = String.length s in
  let buf = String.create (2 * len) in
  let pos = ref 0 in
  for i = 0 to len - 1 do
    match s.[i] with
      '[' | ']' | '*' | '.' | '\\' | '?' | '+' | '^' | '$' as c ->
        buf.[!pos] <- '\\'; buf.[!pos + 1] <- c; pos := !pos + 2
    | c ->
        buf.[!pos] <- c; pos := !pos + 1
  done;
  String.sub buf 0 !pos

let regexp_string s = compile false (String s)

let regexp_string_case_fold s = compile true (String s)

(** Matching functions **)

external re_string_match: regexp -> string -> int -> int array
     = "re_string_match"
external re_partial_match: regexp -> string -> int -> int array
     = "re_partial_match"
external re_search_forward: regexp -> string -> int -> int array
     = "re_search_forward"
external re_search_backward: regexp -> string -> int -> int array
     = "re_search_backward"

let last_search_result = ref [||]

let string_match re s pos =
  let res = re_string_match re s pos in
  last_search_result := res;
  Array.length res > 0

let string_partial_match re s pos =
  let res = re_partial_match re s pos in
  last_search_result := res;
  Array.length res > 0

let search_forward re s pos =
  let res = re_search_forward re s pos in
  last_search_result := res;
  if Array.length res = 0 then raise Not_found else res.(0)

let search_backward re s pos =
  let res = re_search_backward re s pos in
  last_search_result := res;
  if Array.length res = 0 then raise Not_found else res.(0)

let group_beginning n =
  let n2 = n + n in
  if n < 0 || n2 >= Array.length !last_search_result then
    invalid_arg "Str.group_beginning"
  else
    let pos = !last_search_result.(n2) in
    if pos = -1 then raise Not_found else pos

let group_end n =
  let n2 = n + n in
  if n < 0 || n2 >= Array.length !last_search_result then
    invalid_arg "Str.group_end"
  else
    let pos = !last_search_result.(n2 + 1) in
    if pos = -1 then raise Not_found else pos

let matched_group n txt =
  let n2 = n + n in
  if n < 0 || n2 >= Array.length !last_search_result then
    invalid_arg "Str.matched_group"
  else
    let b = !last_search_result.(n2)
    and e = !last_search_result.(n2 + 1) in
    if b = -1 then raise Not_found else String.sub txt b (e - b)

let match_beginning () = group_beginning 0
and match_end () = group_end 0
and matched_string txt = matched_group 0 txt

(** Replacement **)

external re_replacement_text: string -> int array -> string -> string
    = "re_replacement_text"

let replace_matched repl matched =
  re_replacement_text repl !last_search_result matched

let substitute_first expr repl_fun text =
  try
    let pos = search_forward expr text 0 in
    String.concat "" [string_before text pos;
                      repl_fun text;
                      string_after text (match_end())]
  with Not_found ->
    text

let global_substitute expr repl_fun text =
  let rec replace start last_was_empty =
    try
      let startpos = if last_was_empty then start + 1 else start in
      if startpos > String.length text then raise Not_found;
      let pos = search_forward expr text startpos in
      let end_pos = match_end() in
      let repl_text = repl_fun text in
      String.sub text start (pos-start) ::
      repl_text ::
      replace end_pos (end_pos = pos)
    with Not_found ->
      [string_after text start] in
  String.concat "" (replace 0 false)

let global_replace expr repl text =
  global_substitute expr (replace_matched repl) text
and replace_first expr repl text =
  substitute_first expr (replace_matched repl) text

(** Splitting *)

let search_forward_progress expr text start =
  let pos = search_forward expr text start in
  if match_end() = start && start < String.length text
  then search_forward expr text (start + 1)
  else pos

let bounded_split expr text num =
  let start =
    if string_match expr text 0 then match_end() else 0 in
  let rec split start n =
    if start >= String.length text then [] else
    if n = 1 then [string_after text start] else
      try
        let pos = search_forward_progress expr text start in
        String.sub text start (pos-start) :: split (match_end()) (n-1)
      with Not_found ->
        [string_after text start] in
  split start num

let split expr text = bounded_split expr text 0

let bounded_split_delim expr text num =
  let rec split start n =
    if start > String.length text then [] else
    if n = 1 then [string_after text start] else
      try
        let pos = search_forward_progress expr text start in
        String.sub text start (pos-start) :: split (match_end()) (n-1)
      with Not_found ->
        [string_after text start] in
  if text = "" then [] else split 0 num

let split_delim expr text = bounded_split_delim expr text 0

type split_result = Text of string | Delim of string

let bounded_full_split expr text num =
  let rec split start n =
    if start >= String.length text then [] else
    if n = 1 then [Text(string_after text start)] else
      try
        let pos = search_forward_progress expr text start in
        let s = matched_string text in
        if pos > start then
          Text(String.sub text start (pos-start)) ::
          Delim(s) ::
          split (match_end()) (n-1)
        else
          Delim(s) ::
          split (match_end()) (n-1)
      with Not_found ->
        [Text(string_after text start)] in
  split 0 num

let full_split expr text = bounded_full_split expr text 0