(* open MyStream *) (* A symbol is an integer. (We could parameterize the entire module over an arbitrary type of symbols, which must be equipped with equality and hash functions. It does not seem to be worth the trouble.) *) type symbol = int (* A Huffman tree is either a leaf, which carries a symbol, or a binary node, which carries two sub-trees. Here, we choose NOT to store frequency information inside every leaf and node. Instead, we accompany a tree with a frequency when we insert into the priority queue. *) type tree = | Leaf of int | Node of tree * tree (* We represent a path in a tree as a MyStream.t of Booleans, where [false] means "go left" and [true] means "go right". *) type path = bool MyStream.t (* By following (a prefix of) a path in the tree, we end up at a leaf that carries a symbol. This is used while decoding. The function returns a pair of the symbol that was found and the remainder of the path. *) let rec decode_symbol (tree : tree) (path : path) : symbol * path = match tree with | Leaf symbol -> (* We have reached a leaf. We are done. *) symbol, path | Node (tree0, tree1) -> (* We are at a node. The path must be non-empty. *) ( match Lazy.force path with | MyStream.Cons (direction, path) -> decode_symbol (if direction then tree1 else tree0) path | MyStream.Nil -> assert false ) (* We use priority queues whose elements are pairs of a tree and a frequency. *) module Q = BinomialQueue.Make(struct (* The priority queue contains pairs of a tree and a frequency. *) type t = tree * int (* Elements are compared based on their frequency. In other words, drawing an element out of the queue yields an element with least frequency. *) let compare (_, freq1) (_, freq2) = freq1 - freq2 end) (* An alphabet maps symbols to integer frequencies. *) type alphabet = (symbol, int) Hashtbl.t (* Out of an alphabet, we build a tree. *) let build_tree (alphabet : alphabet) : tree = (* Assumption: the alphabet is nonempty. *) assert (Hashtbl.length alphabet > 0); (* Initialize a priority queue. *) let queue : Q.t = Hashtbl.fold (fun symbol freq queue -> Q.insert (Leaf symbol, freq) queue ) alphabet Q.empty in (* Process the priority queue. *) let rec process (queue : Q.t) : tree = (* Assumption: [queue] is nonempty. *) assert (not (Q.is_empty queue)); let (tree0, freq0), queue = Q.extract queue in (* If the queue is now empty, we are done. *) if Q.is_empty queue then tree0 else (* Otherwise, extract another tree. *) let (tree1, freq1), queue = Q.extract queue in (* Construct a new node, compute its cumulated frequency, insert it back into the queue, and continue. *) let tree = Node (tree0, tree1) in let freq = freq0 + freq1 in let queue = Q.insert (tree, freq) queue in process queue in process queue (* By traversing a tree, one can build a mapping of symbols to their encodings, which are (finite) lists of Booleans. As we go down, we keep track of the (reversed) path that we have followed into the tree. *) type cipher_symbol = bool list type cipher_text = bool MyStream.t type encoding_dictionary = (symbol, cipher_text) Hashtbl.t let build_dictionary (tree : tree) : encoding_dictionary = let dictionary = Hashtbl.create 256 in let rec traverse (path : bool list) (tree : tree) : unit = match tree with | Leaf symbol -> Hashtbl.add dictionary symbol (MyStream.from_list (List.rev path)) | Node (tree0, tree1) -> traverse (false :: path) tree0; traverse (true :: path) tree1 in traverse [] tree; dictionary (* Encoding. *) let encode_symbol (dictionary : encoding_dictionary) (symbol : symbol) : cipher_text = try Hashtbl.find dictionary symbol with Not_found -> assert false (* unknown character *) type plain_text = symbol MyStream.t (* Pick new names for the end user. *) type decoding_dictionary = tree let build_dictionaries alphabet = let tree = build_tree alphabet in let dictionary = build_dictionary tree in dictionary, tree (* Dumping a tree. *) (* For simplicity, we encode a tree as a MyStream.t of integers, where the labels "Leaf" and "Node" are encoded as 0 and 1 respectively, and where a symbol is encoded as itself. This is slightly inefficient, since this means that we are encoding a label using [k] bits, where [k] is the bit width of a symbol. In principle, we could use just one bit for this purpose. We would have to encode a tree directly as a MyStream.t of bits. This could be done by exploiting the functions in the module [Pack]. *) let rec push_decoding_dictionary (tree : tree) (accu : int MyStream.t) : int MyStream.t = match tree with | Leaf symbol -> lazy(MyStream.Cons (0, (lazy(MyStream.Cons (symbol, accu))))) | Node (tree0, tree1) -> lazy (MyStream.Cons (1, (push_decoding_dictionary tree0 (push_decoding_dictionary tree1 accu)))) (* Reading back a tree. *) let rec pop_decoding_dictionary (is : int MyStream.t) : tree * int MyStream.t = let label, is = match Lazy.force is with | MyStream.Cons (label, is) -> label, is | MyStream.Nil -> assert false in match label with | 0 -> let symbol, is = match Lazy.force is with | MyStream.Cons (symbol, is) -> symbol, is | MyStream.Nil -> assert false in Leaf symbol, is | 1 -> let tree0, is = pop_decoding_dictionary is in let tree1, is = pop_decoding_dictionary is in Node (tree0, tree1), is | _ -> assert false (* Out of a string, one can build an alphabet. *) let build_alphabet (text : plain_text) : int * alphabet = let table = Hashtbl.create 1023 in let total = ref 0 in MyStream.iter (fun symbol -> let freq = try Hashtbl.find table symbol with Not_found -> 0 in Hashtbl.replace table symbol (freq + 1); incr total ) text; !total, table (* ........ *) let dictionaries_from_text (text : plain_text) : int * encoding_dictionary * decoding_dictionary = let (size, alphabet) = build_alphabet text in let (encoding_dictionary, decoding_dictionary) = build_dictionaries alphabet in (size, encoding_dictionary, decoding_dictionary)