(* $Id: camlrb.ml,v 1.16 2004/04/06 22:12:40 yori Exp $ *)
(* Copyright 2002, 2003 Yamagata Yoriyuki *)

open UCharInfo
open AbsCe

let (enc, readfile, dir) =
  let enc = ref CharEncoding.utf8 in
  let readfile = ref stdin in
  let dir = ref Filename.current_dir_name in
  Arg.parse
    ["--enc", Arg.String (fun encname -> enc := CharEncoding.of_name encname),
     "Encoding name";
     "--file", Arg.String (fun filename -> readfile := open_in_bin filename),
     "Reading file"]
    (fun dirname -> dir := dirname) "ocamllocaledef --enc ENCNAME --file INPUTFILE DIRECTORY:\n\
    Read the localedef INPUTFILE using the encoding ENCNAME \
and put the compiled data into DIRECTORY. \
    DATADIR specifies where camomile put the datafiles.\
    If ENCNAME is ommited, UTF-8 is used.  \
    If INPUTFILE is ommited, reading from stdin. \
    If DIRECTORY is ommited, the current directory is used.";
  !enc, !readfile, !dir

module Utf8Buffer = UTF8.Buf
module Utf8NF = UNF.Make (UTF8)

let ff = 0x000c				(*form feed*)
let cr = Char.code '\r'
let lf = Char.code '\n'
let nel = 0x0085
let tab = Char.code '\t'

let backslash = Char.code '\\'
let sq = Char.code '\\'
let dq = Char.code '\"'

let backslash = Str.regexp "\\\\\\\\"
let literal_1 =
  Str.regexp "\\\\[u]\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)"

let literal_2 =
  Str.regexp "\\\\[v]\\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]\\)"

let unescape s =
  let s =
    Str.global_substitute literal_1
      (fun _ ->
         let n = int_of_string (Str.replace_matched "0x\\1" s) in
         UTF8.init 1 (fun _ -> UChar.chr_of_uint n))
      s
  in
  let s =
    Str.global_substitute literal_2
      (fun _ ->
         let n = int_of_string (Str.replace_matched "0x\\1" s) in
         UTF8.init 1 (fun _ -> UChar.chr_of_uint n))
      s
  in
  Str.global_replace backslash "\\\\" s

let rec stream_to_list_aux a s =
  (fun (strm__ : _ Stream.t) ->
     match Stream.peek strm__ with
       Some e -> Stream.junk strm__; stream_to_list_aux (e :: a) strm__
     | _ -> List.rev a)
    s

let stream_to_list s = stream_to_list_aux [] s

type token =
    Text of string
  | Brace_r
  | Brace_l
  | Colon
  | Comma

let rec prep (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some u ->
      Stream.junk strm__;
      let rest = strm__ in
      let c =
        try Some (UChar.char_of u) with
          _ -> None
      in
      begin match general_category u with
        `Cc | `Cf when c <> Some '\n' -> prep rest
      | ct ->
          Stream.lcons (fun _ -> c, ct, u) (Stream.slazy (fun _ -> prep rest))
      end
  | _ -> Stream.sempty

let rec remove_comment (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some (Some '/', _, _ as data) ->
      Stream.junk strm__;
      let rest = strm__ in
      (fun (strm__ : _ Stream.t) ->
         match Stream.peek strm__ with
           Some (Some '/', _, _) -> Stream.junk strm__; comment strm__
         | Some (Some '*', _, _) -> Stream.junk strm__; comment2 strm__
         | _ ->
             let rest = strm__ in
             Stream.icons data (Stream.slazy (fun _ -> remove_comment rest)))
        rest
  | Some (Some '\"', _, _ as data) ->
      Stream.junk strm__;
      let rest = strm__ in
      Stream.icons data (Stream.slazy (fun _ -> in_quote rest))
  | Some data ->
      Stream.junk strm__;
      let rest = strm__ in
      Stream.icons data (Stream.slazy (fun _ -> remove_comment rest))
  | _ -> Stream.sempty
and comment (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some (Some ('\r' | '\n' | '\133'), _, _ | _, (`Zl | `Zp), _) ->
      Stream.junk strm__; remove_comment strm__
  | Some data -> Stream.junk strm__; comment strm__
  | _ -> Stream.sempty
and comment2 (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some (Some '*', _, _ as data) ->
      Stream.junk strm__;
      let rest = strm__ in
      (fun (strm__ : _ Stream.t) ->
         match Stream.peek strm__ with
           Some (Some '/', _, _) -> Stream.junk strm__; remove_comment strm__
         | _ -> comment2 strm__)
        rest
  | Some data -> Stream.junk strm__; comment2 strm__
  | _ -> Stream.sempty
and in_quote (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some (Some '\\', _, _ as data1) ->
      Stream.junk strm__;
      begin match Stream.peek strm__ with
        Some data2 ->
          Stream.junk strm__;
          let rest = strm__ in
          Stream.icons data1
            (Stream.icons data2 (Stream.slazy (fun _ -> in_quote rest)))
      | _ -> raise (Stream.Error "")
      end
  | Some (Some '\"', _, _ as data) ->
      Stream.junk strm__;
      let rest = strm__ in
      Stream.icons data (Stream.slazy (fun _ -> remove_comment rest))
  | Some data ->
      Stream.junk strm__;
      let rest = strm__ in
      Stream.icons data (Stream.slazy (fun _ -> in_quote rest))
  | _ -> Stream.sempty

let rec merge_text (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some (Text s) -> Stream.junk strm__; do_merge s strm__
  | Some e ->
      Stream.junk strm__;
      let rest = strm__ in
      Stream.icons e (Stream.slazy (fun _ -> merge_text rest))
  | _ -> Stream.sempty
and do_merge s (strm__ : _ Stream.t) =
  match Stream.peek strm__ with
    Some (Text s') -> Stream.junk strm__; do_merge (s ^ s') strm__
  | Some e ->
      Stream.junk strm__;
      let rest = strm__ in
      Stream.icons (Text s)
        (Stream.icons e (Stream.slazy (fun _ -> merge_text rest)))
  | _ -> Stream.sempty

let lexer s =
  let rec parse (strm__ : _ Stream.t) =
    match Stream.peek strm__ with
      Some (Some '{', _, _) ->
        Stream.junk strm__;
        let rest = strm__ in
        Stream.icons Brace_l (Stream.slazy (fun _ -> parse rest))
    | Some (Some '}', _, _) ->
        Stream.junk strm__;
        let rest = strm__ in
        Stream.icons Brace_r (Stream.slazy (fun _ -> parse rest))
    | Some (Some ':', _, _) ->
        Stream.junk strm__;
        let rest = strm__ in
        Stream.icons Colon (Stream.slazy (fun _ -> parse rest))
    | Some (Some ',', _, _) ->
        Stream.junk strm__;
        let rest = strm__ in
        Stream.icons Comma (Stream.slazy (fun _ -> parse rest))
    | Some (Some '\"', _, _) -> Stream.junk strm__; quote strm__
    | Some
        (Some ('\r' | '\n' | '\133' | '\t'), _, _ |
         _, (`Zs | `Zl | `Zp), _) ->
        Stream.junk strm__; parse strm__
    | Some e -> Stream.junk strm__; text (Stream.icons e strm__)
    | _ -> Stream.sempty
  and quote s =
    let buf = Utf8Buffer.create 16 in
    let rec loop (strm__ : _ Stream.t) =
      match Stream.peek strm__ with
        Some (Some '\\', _, u1) ->
          Stream.junk strm__;
          begin match Stream.peek strm__ with
            Some (_, _, u2) ->
              Stream.junk strm__;
              let rest = strm__ in
              Utf8Buffer.add_char buf u1;
              Utf8Buffer.add_char buf u2;
              loop rest
          | _ -> raise (Stream.Error "")
          end
      | Some (Some '\"', _, _) ->
          Stream.junk strm__;
          let rest = strm__ in
          let s = Utf8Buffer.contents buf in
          let s' = unescape s in
          Stream.icons (Text s') (Stream.slazy (fun _ -> parse rest))
      | Some (_, _, u) ->
          Stream.junk strm__;
          let rest = strm__ in Utf8Buffer.add_char buf u; loop rest
      | _ -> failwith "A quote is not enclosed."
    in
    loop s
  and text s =
    let buf = Utf8Buffer.create 16 in
    let rec loop (strm__ : _ Stream.t) =
      match Stream.peek strm__ with
        Some
          (Some ('\r' | '\n' | '\133' | '\t'), _, _ |
           _, (`Zs | `Zl | `Zp), _) ->
          Stream.junk strm__;
          let rest = strm__ in
          let s = Utf8Buffer.contents buf in
          let s' = unescape s in
          Stream.icons (Text s') (Stream.slazy (fun _ -> parse rest))
      | Some (Some ('{' | '}' | ':' | ',' | '\"'), _, _ as e) ->
          Stream.junk strm__;
          let rest = strm__ in
          let s = Utf8Buffer.contents buf in
          let s' = unescape s in
          Stream.icons (Text s')
            (Stream.slazy (fun _ -> parse (Stream.icons e rest)))
      | Some (_, _, u) ->
          Stream.junk strm__;
          let rest = strm__ in Utf8Buffer.add_char buf u; loop rest
      | _ ->
          let s = Utf8Buffer.contents buf in
          let s' = unescape s in Stream.ising (Text s')
    in
    loop s
  in
  let p = prep s in
  let p1 = remove_comment p in
  let tokens = parse p1 in
  let tokens1 = merge_text tokens in let l = stream_to_list tokens1 in l

let string_to_binary s =
  let n = String.length s / 2 in
  let b = String.create n in
  for i = 0 to n - 1 do
    let d = int_of_string ("0x" ^ String.sub s (i * 2) 2) in
    b.[i] <- Char.chr d
  done;
  b

let root = ref ""

let load_file filename =
  let file =
    if Filename.is_implicit filename then Filename.concat !root filename
    else filename
  in
  let c = open_in_bin file in
  let buf = Buffer.create 16 in
  try while true do Buffer.add_channel buf c 1 done; assert false with
    End_of_file -> Buffer.contents buf

type data =
    Table of (string, data) Hashtbl.t
  | Array_data of data array
  | String_data of string
  | Binary of string
  | Int of int
  | Intvect of int array
  | Tagged of string * data

let rec parse_intvect l a =
  match l with
    Text num :: Comma :: rest -> parse_intvect rest (int_of_string num :: a)
  | Text num :: rest ->
      Intvect (Array.of_list (List.rev (int_of_string num :: a))), rest
  | _ -> Intvect (Array.of_list (List.rev a)), l

let rec parse_table l a =
  match parse l with
    Some d, rest -> parse_table rest (d :: a)
  | None, rest ->
      let tbl = Hashtbl.create (List.length a) in
      let proc ent =
        match ent with
          Tagged (name, data) -> Hashtbl.add tbl name data
        | _ -> failwith "A broken table entry."
      in
      List.iter proc a; Table tbl, rest
and parse_array l a =
  match l with
    Brace_l :: rest ->
      let (data, rest) = parse_unknown rest in
      begin match rest with
        Brace_r :: Comma :: rest -> parse_array rest (data :: a)
      | Brace_r :: rest -> parse_array rest (data :: a)
      | _ -> failwith "A brace is not enclosed."
      end
  | Text text :: Comma :: rest -> parse_array rest (String_data text :: a)
  | Text text :: rest ->
      Array_data (Array.of_list (List.rev (String_data text :: a))), rest
  | _ -> Array_data (Array.of_list (List.rev a)), l
and parse_unknown l =
  match l with
    Text text :: Brace_r :: rest -> String_data text, Brace_r :: rest
  | Text text :: Comma :: rest -> parse_array l []
  | Text text :: rest -> parse_table l []
  | _ -> parse_array l []
and parse l =
  match l with
    Text tname :: Colon :: Text "table" :: Brace_l :: rest ->
      let (data, rest) = parse_table rest [] in
      begin match rest with
        Brace_r :: rest -> Some (Tagged (tname, data)), rest
      | _ -> failwith "A brace is not enclosed."
      end
  | Text tname :: Colon :: Text "array" :: Brace_l :: rest ->
      let (data, rest) = parse_array rest [] in
      begin match rest with
        Brace_r :: rest -> Some (Tagged (tname, data)), rest
      | _ -> failwith "A brace is not enclosed."
      end
  | Text tname :: Colon :: Text "string" :: Brace_l :: Text data :: Brace_r ::
    rest ->
      Some (Tagged (tname, String_data data)), rest
  | Text tname :: Colon :: Text "bin" :: Brace_l :: Text data :: Brace_r ::
    rest ->
      let b = string_to_binary data in Some (Tagged (tname, Binary b)), rest
  | Text tname :: Colon :: Text "import" :: Brace_l :: Text filename ::
    Brace_r :: rest ->
      prerr_endline "Warning : file loading is not supported.";
      Some (Tagged (tname, Binary "")), rest
  | Text tname :: Colon :: Text "int" :: Brace_l :: Text num :: Brace_r ::
    rest ->
      let n = int_of_string num in Some (Tagged (tname, Int n)), rest
  | Text tname :: Colon :: Text "intvector" :: Brace_l :: rest ->
      let (data, rest) = parse_intvect rest [] in
      begin match rest with
        Brace_r :: rest -> Some (Tagged (tname, data)), rest
      | _ -> failwith "A brace is not enclosed."
      end
  | Text name :: Brace_l :: rest ->
      let (data, rest) = parse_unknown rest in
      begin match rest with
        Brace_r :: rest -> Some (Tagged (name, data)), rest
      | _ -> failwith "A brace is not enclosed."
      end
  | _ -> None, l

let col_parse s =
  let s = Utf8NF.nfd s in
  let lexbuf = Lexing.from_string s in
  let ace_info = ColParser.main ColLexer.token lexbuf in cetbl_of ace_info

let localedef =
  function
    Table tbl ->
      let col_info =
        try
          Some
            (match Hashtbl.find tbl "CollationElements" with
               Table tbl ->
                 begin match Hashtbl.find tbl "Sequence" with
                   String_data s -> col_parse s
                 | _ -> assert false
                 end
             | _ -> assert false)
        with
          Not_found -> None
      in
      {Unidata.col_info = col_info}
  | _ -> assert false

let main () =
  let cs = Stream.of_channel readfile in
  let stream = CharEncoding.ustream_of enc cs in
  let lexed = lexer stream in
  let (data, rest) = parse_table lexed [] in
  if rest <> [] then failwith "Strange trailing data.";
  let proc key entry =
    let locale_info = localedef entry in
    let file = Filename.concat dir (key ^ ".mar") in
    let c = open_out_bin file in output_value c locale_info
  in
  match data with
    Table tbl -> Hashtbl.iter proc tbl
  | _ -> failwith "Broken data."
    
let _ = main ()
