ocaml/ocamldoc/odoc_ocamlhtml.mll

{
(**************************************************************************)
(*                                                                        *)
(*                                 OCaml                                  *)
(*                                                                        *)
(*             Maxence Guesdon, projet Cristal, INRIA Rocquencourt        *)
(*                                                                        *)
(*   Copyright 2001 Institut National de Recherche en Informatique et     *)
(*     en Automatique.                                                    *)
(*                                                                        *)
(*   All rights reserved.  This file is distributed under the terms of    *)
(*   the GNU Lesser General Public License version 2.1, with the          *)
(*   special exception on linking described in the file LICENSE.          *)
(*                                                                        *)
(**************************************************************************)

(** Generation of html code to display OCaml code. *)
open Lexing

exception Fatal_error

let fatal_error msg =
  prerr_string ">> Fatal error: "; prerr_endline msg; raise Fatal_error

type error =
  | Illegal_character of char
  | Unterminated_comment
  | Unterminated_string
  | Unterminated_string_in_comment
  | Keyword_as_label of string
;;

exception Error of error * int * int

let base_escape_strings = [
    ("&", "&amp;") ;
    ("<", "&lt;") ;
    (">", "&gt;") ;
]


let prelike_escape_strings = [
  (" ", "&nbsp;") ;
  ("\t", "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;") ;
  ("\n", "<br>\n")
]


let pre = ref false
let fmt = ref Format.str_formatter

(** Escape the strings which would clash with html syntax,
   and some other strings if we want to get a PRE style outside of
   <pre> </pre>.*)
let escape s =
  let escape_strings =
    if !pre then
      base_escape_strings
    else
      base_escape_strings @ prelike_escape_strings in
  List.fold_left
    (fun acc -> fun (s, s2) -> Str.global_replace (Str.regexp s) s2 acc)
    s
    escape_strings

(** Escape the strings which would clash with html syntax. *)
let escape_base s =
  List.fold_left
    (fun acc -> fun (s, s2) -> Str.global_replace (Str.regexp s) s2 acc)
    s
    base_escape_strings

(** The output functions *)

let print ?(esc=true) s =
  Format.pp_print_string !fmt (if esc then escape s else s)
;;

let print_class ?(esc=true) cl s =
  print ~esc: false ("<span class=\""^cl^"\">"^
                     (if esc then escape s else s)^
                     "</span>")
;;

(** The table of keywords with colors *)
let create_hashtable size init =
  let tbl = Hashtbl.create size in
  List.iter (fun (key, data) -> Hashtbl.add tbl key data) init;
  tbl

(** The function used to return html code for the given comment body. *)
let html_of_comment = ref
    (fun (_ : string) -> "<b>Odoc_ocamlhtml.html_of_comment not initialized</b>")

let keyword_table =
  create_hashtable 149 [
    "and", "keyword" ;
    "as", "keyword" ;
    "assert", "keyword" ;
    "begin", "keyword" ;
    "class", "keyword" ;
    "constraint", "keyword" ;
    "do", "keyword" ;
    "done", "keyword" ;
    "downto", "keyword" ;
    "else", "keyword" ;
    "end", "keyword" ;
    "exception", "keyword" ;
    "external", "keyword" ;
    "false", "keyword" ;
    "for", "keyword" ;
    "fun", "keyword" ;
    "function", "keyword" ;
    "functor", "keyword" ;
    "if", "keyword" ;
    "in", "keyword" ;
    "include", "keyword" ;
    "inherit", "keyword" ;
    "initializer", "keyword" ;
    "lazy", "keyword" ;
    "let", "keyword" ;
    "match", "keyword" ;
    "method", "keyword" ;
    "module", "keyword" ;
    "mutable", "keyword" ;
    "new", "keyword" ;
    "object", "keyword" ;
    "of", "keyword" ;
    "open", "keyword" ;
    "or", "keyword" ;
    "parser", "keyword" ;
    "private", "keyword" ;
    "rec", "keyword" ;
    "sig", "keyword" ;
    "struct", "keyword" ;
    "then", "keyword" ;
    "to", "keyword" ;
    "true", "keyword" ;
    "try", "keyword" ;
    "type", "keyword" ;
    "val", "keyword" ;
    "virtual", "keyword" ;
    "when", "keyword" ;
    "while", "keyword" ;
    "with", "keyword" ;

    "mod", "keyword" ;
    "land", "keyword" ;
    "lor", "keyword" ;
    "lxor", "keyword" ;
    "lsl", "keyword" ;
    "lsr", "keyword" ;
    "asr", "keyword" ;
]

let kwsign_class = "keywordsign"
let constructor_class = "constructor"
let comment_class = "comment"
let string_class = "string"
let code_class = "code"


(** To buffer and print comments *)


let margin = ref 0

let comment_buffer = Buffer.create 32
let reset_comment_buffer () = Buffer.reset comment_buffer
let store_comment_char = Buffer.add_char comment_buffer
let add_comment_string = Buffer.add_string comment_buffer

let make_margin () =
  let rec iter n =
    if n <= 0 then ""
    else "&nbsp;"^(iter (n-1))
  in
  iter !margin

let print_comment () =
  let s = Buffer.contents comment_buffer in
  let len = String.length s in
  let code =
    if len < 1 then
      "<span class=\""^comment_class^"\">(*"^(escape s)^"*)</span>"
    else
      match s.[0] with
        '*' ->
          (
           try
             let html = !html_of_comment (String.sub s 1 (len-1)) in
             "</code><table><tr><td>"^(make_margin ())^"</td><td>"^
             "<span class=\""^comment_class^"\">"^
             "(**"^html^"*)"^
             "</span></td></tr></table><code class=\""^code_class^"\">"
           with
             e ->
               prerr_endline (Printexc.to_string e);
               "<span class=\""^comment_class^"\">(*"^(escape s)^"*)</span>"
          )
      | _ ->
          "<span class=\""^comment_class^"\">(*"^(escape s)^"*)</span>"
  in
  print ~esc: false code

(** To buffer string literals *)

let string_buffer = Buffer.create 32
let reset_string_buffer () = Buffer.reset string_buffer
let store_string_char = Buffer.add_char string_buffer
let get_stored_string () =
  Buffer.contents string_buffer

(** To translate escape sequences *)

let char_for_backslash = function
  | 'n' -> '\010'
  | 'r' -> '\013'
  | 'b' -> '\008'
  | 't' -> '\009'
  | c   -> c

let char_for_decimal_code lexbuf i =
  let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
           10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
                (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in
  Char.chr(c land 0xFF)

let char_for_hexa_code lexbuf i =
  let c = 16 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
                (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) in
  Char.chr(c land 0xFF)

(** To store the position of the beginning of a string and comment *)
let string_start_pos = ref 0;;
let comment_start_pos = ref [];;
let in_comment () = !comment_start_pos <> [];;

(** Error report *)

open Format

let report_error ppf = function
  | Illegal_character c ->
      fprintf ppf "Illegal character (%s)" (Char.escaped c)
  | Unterminated_comment ->
      fprintf ppf "Comment not terminated"
  | Unterminated_string ->
      fprintf ppf "String literal not terminated"
  | Unterminated_string_in_comment ->
      fprintf ppf "This comment contains an unterminated string literal"
  | Keyword_as_label kwd ->
      fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd
;;

}

let blank = [' ' '\010' '\013' '\009' '\012']
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let identchar =
  ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
let symbolchar =
  ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
let decimal_literal = ['0'-'9']+
let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
let oct_literal = '0' ['o' 'O'] ['0'-'7']+
let bin_literal = '0' ['b' 'B'] ['0'-'1']+
let float_literal =
  ['0'-'9']+ ('.' ['0'-'9']* )? (['e' 'E'] ['+' '-']? ['0'-'9']+)?

rule token = parse
    blank
      {
        let s = Lexing.lexeme lexbuf in
        (
         match s with
           " " -> incr margin
         | "\t" -> margin := !margin + 8
         | "\n" -> margin := 0
         | _ -> ()
        );
        print s;
        token lexbuf
      }
  | "_"
      { print "_" ; token lexbuf }
  | "~"  { print "~" ; token lexbuf }
  | "~" lowercase identchar * ':'
      { let s = Lexing.lexeme lexbuf in
        let name = String.sub s 1 (String.length s - 2) in
        if Hashtbl.mem keyword_table name then
          raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf,
                       Lexing.lexeme_end lexbuf));
        print s ; token lexbuf }
  | "?"  { print "?" ; token lexbuf }
  | "?" lowercase identchar * ':'
      { let s = Lexing.lexeme lexbuf in
        let name = String.sub s 1 (String.length s - 2) in
        if Hashtbl.mem keyword_table name then
          raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf,
                       Lexing.lexeme_end lexbuf));
        print s ; token lexbuf }
  | lowercase identchar *
      { let s = Lexing.lexeme lexbuf in
          try
            let cl = Hashtbl.find keyword_table s in
            (print_class cl s ; token lexbuf )
          with Not_found ->
            (print s ; token lexbuf )}
  | uppercase identchar *
      { print_class constructor_class (Lexing.lexeme lexbuf) ; token lexbuf }       (* No capitalized keywords *)
  | decimal_literal | hex_literal | oct_literal | bin_literal
      { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | float_literal
      { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "\""
      { reset_string_buffer();
        let string_start = Lexing.lexeme_start lexbuf in
        string_start_pos := string_start;
        string lexbuf;
        lexbuf.Lexing.lex_start_pos <-
          string_start - lexbuf.Lexing.lex_abs_pos;
        print_class string_class ("\""^(get_stored_string())^"\"") ;
        token lexbuf }
  | "'" [^ '\\' '\''] "'"
      { print_class string_class (Lexing.lexeme lexbuf) ;
        token lexbuf }
  | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { print_class string_class (Lexing.lexeme lexbuf ) ;
        token lexbuf }
  | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { print_class string_class (Lexing.lexeme lexbuf ) ;
        token lexbuf }
  | "(*"
      {
        reset_comment_buffer ();
        comment_start_pos := [Lexing.lexeme_start lexbuf];
        comment lexbuf ;
        print_comment ();
        token lexbuf }
  | "(*)"
      { reset_comment_buffer ();
        comment_start_pos := [Lexing.lexeme_start lexbuf];
        comment lexbuf ;
        print_comment ();
        token lexbuf
      }
  | "*)"
      { lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1;
        lexbuf.Lexing.lex_curr_p <-
          { lexbuf.Lexing.lex_curr_p with
            pos_cnum = lexbuf.Lexing.lex_curr_p.pos_cnum - 1
          } ;
        print (Lexing.lexeme lexbuf) ;
        token lexbuf
      }
  | "#" [' ' '\t']* ['0'-'9']+ [^ '\n' '\r'] * ('\n' | '\r' | "\r\n")
      (* # linenum ...  *)
      {
        print (Lexing.lexeme lexbuf);
        token lexbuf
      }
  | "#"  { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "&"  { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "&&" { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "`"  { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "'"  { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "("  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ")"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "*"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ","  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "??" { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "->" { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "."  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ".." { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ":"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "::" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ":=" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ":>" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ";"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ";;" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "<"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "<-" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "="  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "["  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "[|" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "[<" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "]"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "{"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "{<" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "|"  { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "||" { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | "|]" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ">"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ">]" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "}"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ">}" { print (Lexing.lexeme lexbuf) ; token lexbuf }

  | "!=" { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "+"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "-"  { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "-." { print (Lexing.lexeme lexbuf) ; token lexbuf }

  | "!" symbolchar *
            { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ['~' '?'] symbolchar +
            { print_class kwsign_class (Lexing.lexeme lexbuf) ; token lexbuf }
  | ['=' '<' '>' '|' '&' '$'] symbolchar *
            { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ['@' '^'] symbolchar *
            { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ['+' '-'] symbolchar *
            { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | "**" symbolchar *
            { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | ['*' '/' '%'] symbolchar *
            { print (Lexing.lexeme lexbuf) ; token lexbuf }
  | eof { () }
  | _
      { raise (Error(Illegal_character ((Lexing.lexeme lexbuf).[0]),
                     Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) }

and comment = parse
    "(*"
      { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
        store_comment_char '(';
        store_comment_char '*';
        comment lexbuf;
      }
  | "*)"
      { match !comment_start_pos with
        | [] -> assert false
        | [_] -> comment_start_pos := []
        | _ :: l ->
            store_comment_char '*';
            store_comment_char ')';
            comment_start_pos := l;
            comment lexbuf;
       }
(* These filters are useless
  | "\""
      { reset_string_buffer();
        string_start_pos := Lexing.lexeme_start lexbuf;
        store_comment_char '"';
        begin
          try string lexbuf; add_comment_string ((get_stored_string()^"\""))
          with Error (Unterminated_string, _, _) ->
          let st = List.hd !comment_start_pos in
          raise (Error (Unterminated_string_in_comment, st, st + 2))
        end;
        comment lexbuf }
  | "'" [^ '\\' '\''] "'"
      {
        store_comment_char '\'';
        store_comment_char (Lexing.lexeme_char lexbuf 1);
        store_comment_char '\'';
        comment lexbuf }
  | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      {
        store_comment_char '\'';
        store_comment_char '\\';
        store_comment_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)) ;
        store_comment_char '\'';
        comment lexbuf }
  | "\\" ['0'-'9'] ['0'-'9'] ['0'-'9']
      {
        store_comment_char(char_for_decimal_code lexbuf 1);
        comment lexbuf }
  | "\\x" ['0'-'9' 'A'-'Z' 'a'-'z' ] ['0'-'9' 'A'-'Z' 'a'-'z']
      {
        store_comment_char(char_for_hexa_code lexbuf 2);
        string lexbuf }
  | "''"
      {
        store_comment_char '\'';
        store_comment_char '\'';
        comment lexbuf }
*)
  | eof
      { let st = List.hd !comment_start_pos in
        raise (Error (Unterminated_comment, st, st + 2));
      }
  | _
      { store_comment_char(Lexing.lexeme_char lexbuf 0);
        comment lexbuf }

and string = parse
    '"'
      { () }
  | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] *
      { string lexbuf }
  | '\\' ['\\' '"' 'n' 't' 'b' 'r' ]
      { Buffer.add_string string_buffer (Lexing.lexeme lexbuf) ;
        string lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      {
        Buffer.add_string string_buffer (Lexing.lexeme lexbuf) ;
        string lexbuf
      }
  | '\\' 'x' ['0'-'9' 'A'-'Z' 'a'-'z' ] ['0'-'9' 'A'-'Z' 'a'-'z']
      {  Buffer.add_string string_buffer (Lexing.lexeme lexbuf) ;
         string lexbuf }
  | eof
      { raise (Error (Unterminated_string,
                      !string_start_pos, !string_start_pos+1)) }
  | _
      { store_string_char(Lexing.lexeme_char lexbuf 0);
        string lexbuf }
{

let html_of_code b ?(with_pre=true) code =
  let old_pre = !pre in
  let old_margin = !margin in
  let old_comment_buffer = Buffer.contents comment_buffer in
  let old_string_buffer = Buffer.contents string_buffer in
  let buf = Buffer.create 256 in
  let old_fmt = !fmt in
  fmt := Format.formatter_of_buffer buf ;
  pre := with_pre;
  margin := 0;

  let start = "<code class=\""^code_class^"\">" in
  let ending = "</code>" in
  let html =
    (
     try
       print ~esc: false start ;
       let lexbuf = Lexing.from_string code in
       token lexbuf;
       print ~esc: false ending ;
       Format.pp_print_flush !fmt () ;
       Buffer.contents buf
     with
       _ ->
         (* flush str_formatter because we already output
            something in it *)
         Format.pp_print_flush !fmt () ;
         start^code^ending
    )
  in
  pre := old_pre;
  margin := old_margin ;
  Buffer.reset comment_buffer;
  Buffer.add_string comment_buffer old_comment_buffer ;
  Buffer.reset string_buffer;
  Buffer.add_string string_buffer old_string_buffer ;
  fmt := old_fmt ;

  Buffer.add_string b html

}