Optimize Huffman code generation

author Eric Biggers <ebiggers3@gmail.com>

Thu, 29 May 2014 05:42:54 +0000 (00:42 -0500)

committer Eric Biggers <ebiggers3@gmail.com>

Sat, 31 May 2014 14:52:53 +0000 (09:52 -0500)
author Eric Biggers <ebiggers3@gmail.com>
Thu, 29 May 2014 05:42:54 +0000 (00:42 -0500)
committer Eric Biggers <ebiggers3@gmail.com>
Sat, 31 May 2014 14:52:53 +0000 (09:52 -0500)
diff --git a/include/wimlib/compress_common.h b/include/wimlib/compress_common.h

index 666f10dcc43a031b5a9af818c1a00edbac35364b..f1f788e29470e561b1b259df96278daf84ec0afa 100644 (file)
--- a/include/wimlib/compress_common.h
+++ b/include/wimlib/compress_common.h
@@ -69,6 +69,6 @@ make_canonical_huffman_code(unsigned num_syms,
                             unsigned max_codeword_len,
                             const input_idx_t freq_tab[restrict],
                             u8 lens[restrict],
-                           u16 codewords[restrict]);
+                           u32 codewords[restrict]);
  
  #endif /* _WIMLIB_COMPRESS_H */
diff --git a/src/compress_common.c b/src/compress_common.c

index 9659d07b00d930f22badaaac6f5c9897c2f926b0..e8ed00d3c3096e810b46d67ef4617d5196618058 100644 (file)
--- a/src/compress_common.c
+++ b/src/compress_common.c
@@ -5,7 +5,7 @@
   */
  
  /*
- * Copyright (C) 2012, 2013 Eric Biggers
+ * Copyright (C) 2012, 2013, 2014 Eric Biggers
   *
   * This file is part of wimlib, a library for working with WIM files.
   *
@@ -126,346 +126,627 @@ init_output_bitstream(struct output_bitstream *ostream,
         ostream->overrun             = false;
  }
  
-typedef struct {
-       input_idx_t freq;
-       u16 sym;
-       union {
-               u16 path_len;
-               u16 height;
-       };
-} HuffmanNode;
-
-typedef struct HuffmanIntermediateNode {
-       HuffmanNode node_base;
-       HuffmanNode *left_child;
-       HuffmanNode *right_child;
-} HuffmanIntermediateNode;
-
-
-/* Comparator function for HuffmanNodes.  Sorts primarily by symbol
- * frequency and secondarily by symbol value. */
-static int
-cmp_nodes_by_freq(const void *_leaf1, const void *_leaf2)
+/* Given the binary tree node A[subtree_idx] whose children already
+ * satisfy the maxheap property, swap the node with its greater child
+ * until it is greater than both its children, so that the maxheap
+ * property is satisfied in the subtree rooted at A[subtree_idx].  */
+static void
+heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
  {
-       const HuffmanNode *leaf1 = _leaf1;
-       const HuffmanNode *leaf2 = _leaf2;
-
-       if (leaf1->freq > leaf2->freq)
-               return 1;
-       else if (leaf1->freq < leaf2->freq)
-               return -1;
-       else
-               return (int)leaf1->sym - (int)leaf2->sym;
+       unsigned parent_idx;
+       unsigned child_idx;
+       u32 v;
+
+       v = A[subtree_idx];
+       parent_idx = subtree_idx;
+       while ((child_idx = parent_idx * 2) <= length) {
+               if (child_idx < length && A[child_idx + 1] > A[child_idx])
+                       child_idx++;
+               if (v >= A[child_idx])
+                       break;
+               A[parent_idx] = A[child_idx];
+               parent_idx = child_idx;
+       }
+       A[parent_idx] = v;
  }
  
-/* Comparator function for HuffmanNodes.  Sorts primarily by code length and
- * secondarily by symbol value. */
-static int
-cmp_nodes_by_code_len(const void *_leaf1, const void *_leaf2)
+/* Rearrange the array 'A' so that it satisfies the maxheap property.
+ * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
+ */
+static void
+heapify_array(u32 A[], unsigned length)
  {
-       const HuffmanNode *leaf1 = _leaf1;
-       const HuffmanNode *leaf2 = _leaf2;
-
-       int code_len_diff = (int)leaf1->path_len - (int)leaf2->path_len;
-
-       if (code_len_diff == 0)
-               return (int)leaf1->sym - (int)leaf2->sym;
-       else
-               return code_len_diff;
+       for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+               heapify_subtree(A, length, subtree_idx);
  }
  
-#define INVALID_SYMBOL 0xffff
-
-/* Recursive function to calculate the depth of the leaves in a Huffman tree.
- * */
+/* Sort the array 'A', which contains 'length' unsigned 32-bit integers.  */
  static void
-huffman_tree_compute_path_lengths(HuffmanNode *base_node, u16 cur_len)
+heapsort(u32 A[], unsigned length)
  {
-       if (base_node->sym == INVALID_SYMBOL) {
-               /* Intermediate node. */
-               HuffmanIntermediateNode *node = (HuffmanIntermediateNode*)base_node;
-               huffman_tree_compute_path_lengths(node->left_child, cur_len + 1);
-               huffman_tree_compute_path_lengths(node->right_child, cur_len + 1);
-       } else {
-               /* Leaf node. */
-               base_node->path_len = cur_len;
+       A--; /* Use 1-based indices  */
+
+       heapify_array(A, length);
+
+       while (length >= 2) {
+               swap(A[1], A[length]);
+               length--;
+               heapify_subtree(A, length, 1);
         }
  }
  
-/* make_canonical_huffman_code: - Creates a canonical Huffman code from an array
- *                               of symbol frequencies.
- *
- * The algorithm used is similar to the well-known algorithm that builds a
- * Huffman tree using a minheap.  In that algorithm, the leaf nodes are
- * initialized and inserted into the minheap with the frequency as the key.
- * Repeatedly, the top two nodes (nodes with the lowest frequency) are taken out
- * of the heap and made the children of a new node that has a frequency equal to
- * the sum of the two frequencies of its children.  This new node is inserted
- * into the heap.  When all the nodes have been removed from the heap, what
- * remains is the Huffman tree. The Huffman code for a symbol is given by the
- * path to it in the tree, where each left pointer is mapped to a 0 bit and each
- * right pointer is mapped to a 1 bit.
- *
- * The algorithm used here uses an optimization that removes the need to
- * actually use a heap.  The leaf nodes are first sorted by frequency, as
- * opposed to being made into a heap.  Note that this sorting step takes O(n log
- * n) time vs.  O(n) time for heapifying the array, where n is the number of
- * symbols.  However, the heapless method is probably faster overall, due to the
- * time saved later.  In the heapless method, whenever an intermediate node is
- * created, it is not inserted into the sorted array.  Instead, the intermediate
- * nodes are kept in a separate array, which is easily kept sorted because every
- * time an intermediate node is initialized, it will have a frequency at least
- * as high as that of the previous intermediate node that was initialized.  So
- * whenever we want the 2 nodes, leaf or intermediate, that have the lowest
- * frequency, we check the low-frequency ends of both arrays, which is an O(1)
- * operation.
- *
- * The function builds a canonical Huffman code, not just any Huffman code.  A
- * Huffman code is canonical if the codeword for each symbol numerically
- * precedes the codeword for all other symbols of the same length that are
- * numbered higher than the symbol, and additionally, all shorter codewords,
- * 0-extended, numerically precede longer codewords.  A canonical Huffman code
- * is useful because it can be reconstructed by only knowing the path lengths in
- * the tree.  See the make_huffman_decode_table() function to see how to
- * reconstruct a canonical Huffman code from only the lengths of the codes.
- *
- * @num_syms:  The number of symbols in the alphabet.
- *
- * @max_codeword_len:  The maximum allowed length of a codeword in the code.
- *                     Note that if the code being created runs up against
- *                     this restriction, the code ultimately created will be
- *                     suboptimal, although there are some advantages for
- *                     limiting the length of the codewords.
- *
- * @freq_tab:  An array of length @num_syms that contains the frequencies
- *                     of each symbol in the uncompressed data.
- *
- * @lens:         An array of length @num_syms into which the lengths of the
- *                     codewords for each symbol will be written.
- *
- * @codewords:    An array of @num_syms short integers into which the
- *                     codewords for each symbol will be written.  The first
- *                     lens[i] bits of codewords[i] will contain the codeword
- *                     for symbol i.
+#define NUM_SYMBOL_BITS 10
+#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+
+/*
+ * Sort the symbols primarily by frequency and secondarily by symbol
+ * value.  Discard symbols with zero frequency and fill in an array with
+ * the remaining symbols, along with their frequencies.  The low
+ * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
+ * value, and the remaining bits will contain the frequency.
+ *
+ * @num_syms
+ *     Number of symbols in the alphabet.
+ *     Can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @freqs[num_syms]
+ *     The frequency of each symbol.
+ *
+ * @lens[num_syms]
+ *     An array that eventually will hold the length of each codeword.
+ *     This function only fills in the codeword lengths for symbols that
+ *     have zero frequency, which are not well defined per se but will
+ *     be set to 0.
+ *
+ * @symout[num_syms]
+ *     The output array, described above.
+ *
+ * Returns the number of entries in 'symout' that were filled.  This is
+ * the number of symbols that have nonzero frequency.
   */
-void
-make_canonical_huffman_code(unsigned num_syms,
-                           unsigned max_codeword_len,
-                           const input_idx_t freq_tab[restrict],
-                           u8 lens[restrict],
-                           u16 codewords[restrict])
+static unsigned
+sort_symbols(unsigned num_syms, const u32 freqs[restrict],
+            u8 lens[restrict], u32 symout[restrict])
  {
-       /* We require at least 2 possible symbols in the alphabet to produce a
-        * valid Huffman decoding table. It is allowed that fewer than 2 symbols
-        * are actually used, though. */
-       wimlib_assert(num_syms >= 2 && num_syms < INVALID_SYMBOL);
-
-       /* Initialize the lengths and codewords to 0 */
-       memset(lens, 0, num_syms * sizeof(lens[0]));
-       memset(codewords, 0, num_syms * sizeof(codewords[0]));
-
-       /* Calculate how many symbols have non-zero frequency.  These are the
-        * symbols that actually appeared in the input. */
-       unsigned num_used_symbols = 0;
-       for (unsigned i = 0; i < num_syms; i++)
-               if (freq_tab[i] != 0)
-                       num_used_symbols++;
-
-
-       /* It is impossible to make a code for num_used_symbols symbols if there
-        * aren't enough code bits to uniquely represent all of them. */
-       wimlib_assert((1 << max_codeword_len) > num_used_symbols);
-
-       /* Initialize the array of leaf nodes with the symbols and their
-        * frequencies. */
-       HuffmanNode leaves[num_used_symbols];
-       unsigned leaf_idx = 0;
-       for (unsigned i = 0; i < num_syms; i++) {
-               if (freq_tab[i] != 0) {
-                       leaves[leaf_idx].freq = freq_tab[i];
-                       leaves[leaf_idx].sym  = i;
-                       leaves[leaf_idx].height = 0;
-                       leaf_idx++;
-               }
+       unsigned num_used_syms;
+       unsigned num_counters;
+
+       /* We rely on heapsort, but with an added optimization.  Since
+        * it's common for most symbol frequencies to be low, we first do
+        * a count sort using a limited number of counters.  High
+        * frequencies will be counted in the last counter, and only they
+        * will be sorted with heapsort.
+        *
+        * Note: with more symbols, it is generally beneficial to have more
+        * counters.  About 1 counter per 4 symbols seems fast.
+        *
+        * Note: I also tested radix sort, but even for large symbol
+        * counts (> 255) and frequencies bounded at 16 bits (enabling
+        * radix sort by just two base-256 digits), it didn't seem any
+        * faster than the method implemented here.
+        *
+        * Note: I tested the optimized quicksort implementation from
+        * glibc (with indirection overhead removed), but it was only
+        * marginally faster than the simple heapsort implemented here.
+        *
+        * Tests were done with building the codes for LZX.  Results may
+        * vary for different compression algorithms...!  */
+
+       num_counters = (DIV_ROUND_UP(num_syms, 4) + 3) & ~3;
+
+       unsigned counters[num_counters];
+
+       memset(counters, 0, sizeof(counters));
+
+       /* Count the frequencies.  */
+       for (unsigned sym = 0; sym < num_syms; sym++)
+               counters[min(freqs[sym], num_counters - 1)]++;
+
+       /* Make the counters cumulative, ignoring the zero-th, which
+        * counted symbols with zero frequency.  As a side effect, this
+        * calculates the number of symbols with nonzero frequency.  */
+       num_used_syms = 0;
+       for (unsigned i = 1; i < num_counters; i++) {
+               unsigned count = counters[i];
+               counters[i] = num_used_syms;
+               num_used_syms += count;
         }
  
-       /* Deal with the special cases where num_used_symbols < 2. */
-       if (num_used_symbols < 2) {
-               if (num_used_symbols == 0) {
-                       /* If num_used_symbols is 0, there are no symbols in the
-                        * input, so it must be empty.  This should be an error,
-                        * but the LZX format expects this case to succeed.  All
-                        * the codeword lengths are simply marked as 0 (which
-                        * was already done.) */
+       /* Sort nonzero-frequency symbols using the counters.  At the
+        * same time, set the codeword lengths of zero-frequency symbols
+        * to 0.  */
+       for (unsigned sym = 0; sym < num_syms; sym++) {
+               u32 freq = freqs[sym];
+               if (freq != 0) {
+                       symout[counters[min(freq, num_counters - 1)]++] =
+                               sym | (freq << NUM_SYMBOL_BITS);
                 } else {
-                       /* If only one symbol is present, the LZX format
-                        * requires that the Huffman code include two codewords.
-                        * One is not used.  Note that this doesn't make the
-                        * encoded data take up more room anyway, since binary
-                        * data itself has 2 symbols. */
-
-                       unsigned sym = leaves[0].sym;
-
-                       codewords[0] = 0;
-                       lens[0]      = 1;
-                       if (sym == 0) {
-                               /* dummy symbol is 1, real symbol is 0 */
-                               codewords[1] = 1;
-                               lens[1]      = 1;
-                       } else {
-                               /* dummy symbol is 0, real symbol is sym */
-                               codewords[sym] = 1;
-                               lens[sym]      = 1;
-                       }
+                       lens[sym] = 0;
                 }
-               return;
         }
  
-       /* Otherwise, there are at least 2 symbols in the input, so we need to
-        * find a real Huffman code. */
+       /* Sort the symbols counted in the last counter.  */
+       heapsort(symout + counters[num_counters - 2],
+                counters[num_counters - 1] - counters[num_counters - 2]);
+
+       return num_used_syms;
+}
  
+/*
+ * Build the Huffman tree.
+ *
+ * This is an optimized implementation that
+ *     (a) takes advantage of the frequencies being already sorted;
+ *     (b) only generates non-leaf nodes, since the non-leaf nodes of a
+ *         Huffman tree are sufficient to generate a canonical code;
+ *     (c) Only stores parent pointers, not child pointers;
+ *     (d) Produces the nodes in the same memory used for input
+ *         frequency information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input
+ * and output.  For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols,
+ * sorted in increasing order.  Specifically, each entry must contain a
+ * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
+ * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
+ * Although these bits will, in fact, contain the symbols that correspond
+ * to the frequencies, this function is concerned with frequencies only
+ * and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the
+ * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
+ * entries of the array.  Entry A[sym_count - 2] will represent the root
+ * node.  Each other node will contain the zero-based index of its parent
+ * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
+ * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
+ * note that although these low bits will, in fact, contain a symbol
+ * value, this symbol will have *no relationship* with the Huffman tree
+ * node that happens to occupy the same slot.  This is because this
+ * implementation only generates the non-leaf nodes of the tree.
+ */
+static void
+build_tree(u32 A[], unsigned sym_count)
+{
+       /* Index, in 'A', of next lowest frequency symbol that has not
+        * yet been processed.  */
+       unsigned i = 0;
+
+       /* Index, in 'A', of next lowest frequency parentless non-leaf
+        * node; or, if equal to 'e', then no such node exists yet.  */
+       unsigned b = 0;
+
+       /* Index, in 'A', of next node to allocate as a non-leaf.  */
+       unsigned e = 0;
+
+       do {
+               unsigned m, n;
+               u32 freq_shifted;
+
+               /* Choose the two next lowest frequency entries.  */
+
+               if (i != sym_count &&
+                   (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+                       m = i++;
+               else
+                       m = b++;
+
+               if (i != sym_count &&
+                   (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+                       n = i++;
+               else
+                       n = b++;
+
+               /* Allocate a non-leaf node and link the entries to it.
+                *
+                * If we link an entry that we're visiting for the first
+                * time (via index 'i'), then we're actually linking a
+                * leaf node and it will have no effect, since the leaf
+                * will be overwritten with a non-leaf when index 'e'
+                * catches up to it.  But it's not any slower to
+                * unconditionally set the parent index.
+                *
+                * We also compute the frequency of the non-leaf node as
+                * the sum of its two children's frequencies.  */
+
+               freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
+
+               A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+               A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+               A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
+               e++;
+       } while (sym_count - e > 1);
+               /* When just one entry remains, it is a "leaf" that was
+                * linked to some other node.  We ignore it, since the
+                * rest of the array contains the non-leaves which we
+                * need.  (Note that we're assuming the cases with 0 or 1
+                * symbols were handled separately.) */
+}
  
-       /* Declare the array of intermediate nodes.  An intermediate node is not
-        * associated with a symbol. Instead, it represents some binary code
-        * prefix that is shared between at least 2 codewords.  There can be at
-        * most num_used_symbols - 1 intermediate nodes when creating a Huffman
-        * code.  This is because if there were at least num_used_symbols nodes,
-        * the code would be suboptimal because there would be at least one
-        * unnecessary intermediate node.
+/*
+ * Given the stripped-down Huffman tree constructed by build_tree(),
+ * determine the number of codewords that should be assigned each
+ * possible length, taking into account the length-limited constraint.
+ *
+ * @A
+ *     The array produced by build_tree(), containing parent index
+ *     information for the non-leaf nodes of the Huffman tree.  Each
+ *     entry in this array is a node; a node's parent always has a
+ *     greater index than that node itself.  This function will
+ *     overwrite the parent index information in this array, so
+ *     essentially it will destroy the tree.  However, the data in the
+ *     low NUM_SYMBOL_BITS of each entry will be preserved.
+ *
+ * @root_idx
+ *     The 0-based index of the root node in 'A', and consequently one
+ *     less than the number of tree node entries in 'A'.  (Or, really 2
+ *     less than the actual length of 'A'.)
+ *
+ * @len_counts
+ *     An array of length ('max_codeword_len' + 1) in which the number of
+ *     codewords having each length <= max_codeword_len will be
+ *     returned.
+ *
+ * @max_codeword_len
+ *     The maximum permissible codeword length.
+ */
+static void
+compute_length_counts(u32 A[restrict], unsigned root_idx,
+                     unsigned len_counts[restrict], unsigned max_codeword_len)
+{
+       /* The key observations are:
          *
-        * The worst case (greatest number of intermediate nodes) would be if
-        * all the intermediate nodes were chained together.  This results in
-        * num_used_symbols - 1 intermediate nodes.  If num_used_symbols is at
-        * least 17, this configuration would not be allowed because the LZX
-        * format constrains codes to 16 bits or less each.  However, it is
-        * still possible for there to be more than 16 intermediate nodes, as
-        * long as no leaf has a depth of more than 16.  */
-       HuffmanIntermediateNode inodes[num_used_symbols - 1];
-
-
-       /* Pointer to the leaf node of lowest frequency that hasn't already been
-        * added as the child of some intermediate note. */
-       HuffmanNode *cur_leaf;
-
-       /* Pointer past the end of the array of leaves. */
-       HuffmanNode *end_leaf = &leaves[num_used_symbols];
-
-       /* Pointer to the intermediate node of lowest frequency. */
-       HuffmanIntermediateNode     *cur_inode;
-
-       /* Pointer to the next unallocated intermediate node. */
-       HuffmanIntermediateNode     *next_inode;
-
-       /* Only jump back to here if the maximum length of the codewords allowed
-        * by the LZX format (16 bits) is exceeded. */
-try_building_tree_again:
-
-       /* Sort the leaves from those that correspond to the least frequent
-        * symbol, to those that correspond to the most frequent symbol.  If two
-        * leaves have the same frequency, they are sorted by symbol. */
-       qsort(leaves, num_used_symbols, sizeof(leaves[0]), cmp_nodes_by_freq);
-
-       cur_leaf   = &leaves[0];
-       cur_inode  = &inodes[0];
-       next_inode = &inodes[0];
-
-       /* The following loop takes the two lowest frequency nodes of those
-        * remaining and makes them the children of the next available
-        * intermediate node.  It continues until all the leaf nodes and
-        * intermediate nodes have been used up, or the maximum allowed length
-        * for the codewords is exceeded.  For the latter case, we must adjust
-        * the frequencies to be more equal and then execute this loop again. */
-       while (1) {
-
-               /* Lowest frequency node. */
-               HuffmanNode *f1;
-
-               /* Second lowest frequency node. */
-               HuffmanNode *f2;
-
-               /* Get the lowest and second lowest frequency nodes from the
-                * remaining leaves or from the intermediate nodes. */
-
-               if (cur_leaf != end_leaf && (cur_inode == next_inode ||
-                                       cur_leaf->freq <= cur_inode->node_base.freq)) {
-                       f1 = cur_leaf++;
-               } else if (cur_inode != next_inode) {
-                       f1 = (HuffmanNode*)cur_inode++;
-               }
+        * (1) We can traverse the non-leaf nodes of the tree, always
+        * visiting a parent before its children, by simply iterating
+        * through the array in reverse order.  Consequently, we can
+        * compute the depth of each node in one pass, overwriting the
+        * parent indices with depths.
+        *
+        * (2) We can initially assume that in the real Huffman tree,
+        * both children of the root are leaves.  This corresponds to two
+        * codewords of length 1.  Then, whenever we visit a (non-leaf)
+        * node during the traversal, we modify this assumption to
+        * account for the current node *not* being a leaf, but rather
+        * its two children being leaves.  This causes the loss of one
+        * codeword for the current depth and the addition of two
+        * codewords for the current depth plus one.
+        *
+        * (3) We can handle the length-limited constraint fairly easily
+        * by simply using the largest length available when a depth
+        * exceeds max_codeword_len.
+        */
  
-               if (cur_leaf != end_leaf && (cur_inode == next_inode ||
-                                       cur_leaf->freq <= cur_inode->node_base.freq)) {
-                       f2 = cur_leaf++;
-               } else if (cur_inode != next_inode) {
-                       f2 = (HuffmanNode*)cur_inode++;
-               } else {
-                       /* All nodes used up! */
-                       break;
-               }
+       for (unsigned len = 0; len <= max_codeword_len; len++)
+               len_counts[len] = 0;
+       len_counts[1] = 2;
  
-               /* next_inode becomes the parent of f1 and f2. */
-
-               next_inode->node_base.freq = f1->freq + f2->freq;
-               next_inode->node_base.sym  = INVALID_SYMBOL;
-               next_inode->left_child     = f1;
-               next_inode->right_child    = f2;
-
-               /* We need to keep track of the height so that we can detect if
-                * the length of a codeword has execeed max_codeword_len.   The
-                * parent node has a height one higher than the maximum height
-                * of its children. */
-               next_inode->node_base.height = max(f1->height, f2->height) + 1;
-
-               /* Check to see if the code length of the leaf farthest away
-                * from next_inode has exceeded the maximum code length. */
-               if (next_inode->node_base.height > max_codeword_len) {
-                       /* The code lengths can be made more uniform by making
-                        * the frequencies more uniform.  Divide all the
-                        * frequencies by 2, leaving 1 as the minimum frequency.
-                        * If this keeps happening, the symbol frequencies will
-                        * approach equality, which makes their Huffman
-                        * codewords approach the length
-                        * log_2(num_used_symbols).
-                        * */
-                       for (unsigned i = 0; i < num_used_symbols; i++)
-                               leaves[i].freq = (leaves[i].freq + 1) >> 1;
-
-                       goto try_building_tree_again;
+       /* Set the root node's depth to 0.  */
+       A[root_idx] &= SYMBOL_MASK;
+
+       for (int node = root_idx - 1; node >= 0; node--) {
+
+               /* Calculate the depth of this node.  */
+
+               unsigned parent = A[node] >> NUM_SYMBOL_BITS;
+               unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
+               unsigned depth = parent_depth + 1;
+               unsigned len = depth;
+
+               /* Set the depth of this node so that it is available
+                * when its children (if any) are processed.  */
+
+               A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
+
+               /* If needed, decrease the length to meet the
+                * length-limited constraint.  This is not the optimal
+                * method for generating length-limited Huffman codes!
+                * But it should be good enough.  */
+               if (len >= max_codeword_len) {
+                       len = max_codeword_len;
+                       do {
+                               len--;
+                       } while (len_counts[len] == 0);
                 }
-               next_inode++;
+
+               /* Account for the fact that we have a non-leaf node at
+                * the current depth.  */
+               len_counts[len]--;
+               len_counts[len + 1] += 2;
         }
+}
  
-       /* The Huffman tree is now complete, and its height is no more than
-        * max_codeword_len.  */
+/*
+ * Generate the codewords for a canonical Huffman code.
+ *
+ * @A
+ *     The output array for codewords.  In addition, initially this
+ *     array must contain the symbols, sorted primarily by frequency and
+ *     secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
+ *     each entry.
+ *
+ * @len
+ *     Output array for codeword lengths.
+ *
+ * @len_counts
+ *     An array that provides the number of codewords that will have
+ *     each possible length <= max_codeword_len.
+ *
+ * @max_codeword_len
+ *     Maximum length, in bits, of each codeword.
+ *
+ * @num_syms
+ *     Number of symbols in the alphabet, including symbols with zero
+ *     frequency.  This is the length of the 'A' and 'len' arrays.
+ */
+static void
+gen_codewords(u32 A[restrict], u8 lens[restrict],
+             const unsigned len_counts[restrict],
+             unsigned max_codeword_len, unsigned num_syms)
+{
+       u32 next_codewords[max_codeword_len + 1];
+
+       /* Given the number of codewords that will have each length,
+        * assign codeword lengths to symbols.  We do this by assigning
+        * the lengths in decreasing order to the symbols sorted
+        * primarily by increasing frequency and secondarily by
+        * increasing symbol value.  */
+       for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
+               unsigned count = len_counts[len];
+               while (count--)
+                       lens[A[i++] & SYMBOL_MASK] = len;
+       }
  
-       HuffmanIntermediateNode *root = next_inode - 1;
-       wimlib_assert(root->node_base.height <= max_codeword_len);
+       /* Generate the codewords themselves.  We initialize the
+        * 'next_codewords' array to provide the lexicographically first
+        * codeword of each length, then assign codewords in symbol
+        * order.  This produces a canonical code.  */
+       next_codewords[0] = 0;
+       next_codewords[1] = 0;
+       for (unsigned len = 2; len <= max_codeword_len; len++)
+               next_codewords[len] =
+                       (next_codewords[len - 1] + len_counts[len - 1]) << 1;
+
+       for (unsigned sym = 0; sym < num_syms; sym++)
+               A[sym] = next_codewords[lens[sym]]++;
+}
  
-       /* Compute the path lengths for the leaf nodes. */
-       huffman_tree_compute_path_lengths(&root->node_base, 0);
+/*
+ * ---------------------------------------------------------------------
+ *                     make_canonical_huffman_code()
+ * ---------------------------------------------------------------------
+ *
+ * Given an alphabet and the frequency of each symbol in it, construct a
+ * length-limited canonical Huffman code.
+ *
+ * @num_syms
+ *     The number of symbols in the alphabet.  The symbols are the
+ *     integers in the range [0, num_syms - 1].  This parameter must be
+ *     at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @max_codeword_len
+ *     The maximum permissible codeword length.
+ *
+ * @freqs
+ *     An array of @num_syms entries, each of which specifies the
+ *     frequency of the corresponding symbol.  It is valid for some,
+ *     none, or all of the frequencies to be 0.
+ *
+ * @lens
+ *     An array of @num_syms entries in which this function will return
+ *     the length, in bits, of the codeword assigned to each symbol.
+ *     Symbols with 0 frequency will not have codewords per se, but
+ *     their entries in this array will be set to 0.  No lengths greater
+ *     than @max_codeword_len will be assigned.
+ *
+ * @codewords
+ *     An array of @num_syms entries in which this function will return
+ *     the codeword for each symbol, right-justified and padded on the
+ *     left with zeroes.  Codewords for symbols with 0 frequency will be
+ *     undefined.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * This function builds a length-limited canonical Huffman code.
+ *
+ * A length-limited Huffman code contains no codewords longer than some
+ * specified length, and has exactly (with some algorithms) or
+ * approximately (with the algorithm used here) the minimum weighted path
+ * length from the root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a codeword
+ * never lexicographically precedes a shorter codeword, and the
+ * lexicographic ordering of codewords of the same length is the same as
+ * the lexicographic ordering of the corresponding symbols.  A canonical
+ * Huffman code, or more generally a canonical prefix code, can be
+ * reconstructed from only a list containing the codeword length of each
+ * symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for
+ * each symbol, then inserts these nodes into a min-heap keyed by symbol
+ * frequency.  Then, repeatedly, the two lowest-frequency nodes are
+ * removed from the min-heap and added as the children of a new node
+ * having frequency equal to the sum of its two children, which is then
+ * inserted into the min-heap.  When only a single node remains in the
+ * min-heap, it is the root of the Huffman tree.  The codeword for each
+ * symbol is determined by the path needed to reach the corresponding
+ * node from the root.  Descending to the left child appends a 0 bit,
+ * whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is
+ * subject to a number of inefficiencies.  In practice, it is fastest to
+ * first sort the symbols by frequency.  (This itself can be subject to
+ * an optimization based on the fact that most frequencies tend to be
+ * low.)  At the same time, we sort secondarily by symbol value, which
+ * aids the process of generating a canonical code.  Then, during tree
+ * construction, no heap is necessary because both the leaf nodes and the
+ * unparented non-leaf nodes can be easily maintained in sorted order.
+ * Consequently, there can never be more than two possibilities for the
+ * next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually
+ * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
+ * This is because for canonical code generation we don't need to know
+ * where the symbols are in the tree.  Rather, we only need to know how
+ * many leaf nodes have each depth (codeword length).  And this
+ * information can, in fact, be quickly generated from the tree of
+ * non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in
+ * the array in which the codewords are to be generated, provided that
+ * these array slots are large enough to hold a symbol and frequency
+ * value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child
+ * pointers.  We only need the parent pointers, and even those can be
+ * overwritten in-place with depth information as part of the process of
+ * extracting codeword lengths from the tree.  So in summary, we do NOT
+ * need a big structure like:
+ *
+ *     struct huffman_tree_node {
+ *             unsigned int symbol;
+ *             unsigned int frequency;
+ *             unsigned int depth;
+ *             struct huffman_tree_node *left_child;
+ *             struct huffman_tree_node *right_child;
+ *     };
+ *
+ *
+ *   ... which often gets used in "naive" implementations of Huffman code
+ *   generation.
+ *
+ * Most of these optimizations are based on the implementation in 7-Zip
+ * (source file: C/HuffEnc.c), which has been placed in the public domain
+ * by Igor Pavlov.  But I've rewritten the code with extensive comments,
+ * as it took me a while to figure out what it was doing...!
+ *
+ * ---------------------------------------------------------------------
+ *
+ * NOTE: in general, the same frequencies can be used to generate
+ * different length-limited canonical Huffman codes.  One choice we have
+ * is during tree construction, when we must decide whether to prefer a
+ * leaf or non-leaf when there is a tie in frequency.  Another choice we
+ * have is how to deal with codewords that would exceed @max_codeword_len
+ * bits in length.  Both of these choices affect the resulting codeword
+ * lengths, which otherwise can be mapped uniquely onto the resulting
+ * canonical Huffman code.
+ *
+ * Normally, there is no problem with choosing one valid code over
+ * another, provided that they produce similar compression ratios.
+ * However, the LZMS compression format uses adaptive Huffman coding.  It
+ * requires that both the decompressor and compressor build a canonical
+ * code equivalent to that which can be generated by using the classic
+ * Huffman tree construction algorithm and always processing leaves
+ * before non-leaves when there is a frequency tie.  Therefore, we make
+ * sure to do this.  This method also has the advantage of sometimes
+ * shortening the longest codeword that is generated.
+ *
+ * There also is the issue of how codewords longer than @max_codeword_len
+ * are dealt with.  Fortunately, for LZMS this is irrelevant because
+ * because for the LZMS alphabets no codeword can ever exceed
+ * LZMS_MAX_CODEWORD_LEN (= 15).  Since the LZMS algorithm regularly
+ * halves all frequencies, the frequencies cannot become high enough for
+ * a length 16 codeword to be generated.  Specifically, I think that if
+ * ties are broken in favor of non-leaves (as we do), the lowest total
+ * frequency that would give a length-16 codeword would be the sum of the
+ * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which
+ * is 3570.  And in LZMS we can't get a frequency that high based on the
+ * alphabet sizes, rebuild frequencies, and scaling factors.  This
+ * worst-case scenario is based on the following degenerate case (only
+ * the bottom of the tree shown):
+ *
+ *                          ...
+ *                        17
+ *                       /  \
+ *                      10   7
+ *                     / \
+ *                    6   4
+ *                   / \
+ *                  3   3
+ *                 / \
+ *                2   1
+ *               / \
+ *              1   1
+ *
+ * Excluding the first leaves (those with value 1), each leaf value must
+ * be greater than the non-leaf up 1 and down 2 from it; otherwise that
+ * leaf would have taken precedence over that non-leaf and been combined
+ * with the leaf below, thereby decreasing the height compared to that
+ * shown.
+ *
+ * Interesting fact: if we were to instead prioritize non-leaves over
+ * leaves, then the worst case frequencies would be the Fibonacci
+ * sequence, plus an extra frequency of 1.  In this hypothetical
+ * scenario, it would be slightly easier for longer codewords to be
+ * generated.
+ */
+void
+make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
+                           const u32 freqs[restrict],
+                           u8 lens[restrict], u32 codewords[restrict])
+{
+       u32 *A = codewords;
+       unsigned num_used_syms;
+
+       /* Assumptions  */
+       wimlib_assert2(num_syms >= 2);
+       wimlib_assert2(num_syms <= (1 << NUM_SYMBOL_BITS));
+       wimlib_assert2(max_codeword_len > 0);
+       wimlib_assert2(max_codeword_len <= 32);
+
+       /* We begin by sorting the symbols primarily by frequency and
+        * secondarily by symbol value.  As an optimization, the array
+        * used for this purpose ('A') shares storage with the space in
+        * which we will eventually return the codewords.  */
+
+       num_used_syms = sort_symbols(num_syms, freqs, lens, A);
+
+       /* 'num_used_syms' is the number of symbols with nonzero
+        * frequency.  This may be less than @num_syms.  'num_used_syms'
+        * is also the number of entries in 'A' that are valid.  Each
+        * entry consists of a distinct symbol and a nonzero frequency
+        * packed into a 32-bit integer.  */
+
+       /* Handle special cases where only 0 or 1 symbols were used (had
+        * nonzero frequency).  */
+
+       if (unlikely(num_used_syms == 0)) {
+               /* Code is empty.  sort_symbols() already set all lengths
+                * to 0, so there is nothing more to do.  */
+               return;
+       }
  
-       /* Sort the leaf nodes primarily by code length and secondarily by
-        * symbol.  */
-       qsort(leaves, num_used_symbols, sizeof(leaves[0]), cmp_nodes_by_code_len);
+       if (unlikely(num_used_syms == 1)) {
+               /* Only one symbol was used, so we only need one
+                * codeword.  But two codewords are needed to form the
+                * smallest complete Huffman code, which uses codewords 0
+                * and 1.  Therefore, we choose another symbol to which
+                * to assign a codeword.  We use 0 (if the used symbol is
+                * not 0) or 1 (if the used symbol is 0).  In either
+                * case, the lesser-valued symbol must be assigned
+                * codeword 0 so that the resulting code is canonical.  */
+
+               unsigned sym = A[0] & SYMBOL_MASK;
+               unsigned nonzero_idx = sym ? sym : 1;
+
+               codewords[0] = 0;
+               lens[0] = 1;
+               codewords[nonzero_idx] = 1;
+               lens[nonzero_idx] = 1;
+               return;
+       }
  
-       u16 cur_codeword = 0;
-       unsigned cur_codeword_len = 0;
-       for (unsigned i = 0; i < num_used_symbols; i++) {
+       /* Build a stripped-down version of the Huffman tree, sharing the
+        * array 'A' with the symbol values.  Then extract length counts
+        * from the tree and use them to generate the final codewords.  */
  
-               /* Each time a codeword becomes one longer, the current codeword
-                * is left shifted by one place.  This is part of the procedure
-                * for enumerating the canonical Huffman code.  Additionally,
-                * whenever a codeword is used, 1 is added to the current
-                * codeword.  */
+       build_tree(A, num_used_syms);
  
-               unsigned len_diff = leaves[i].path_len - cur_codeword_len;
-               cur_codeword <<= len_diff;
-               cur_codeword_len += len_diff;
+       {
+               unsigned len_counts[max_codeword_len + 1];
  
-               u16 sym = leaves[i].sym;
-               codewords[sym] = cur_codeword;
-               lens[sym] = cur_codeword_len;
+               compute_length_counts(A, num_used_syms - 2,
+                                     len_counts, max_codeword_len);
  
-               cur_codeword++;
+               gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
         }
  }
diff --git a/src/lzms-compress.c b/src/lzms-compress.c

index 64a674214b0766b17a6758650ed6558f38c4e2f6..3b0caab4a963e75cd60161d2fb5ddf5a05b8ffe2 100644 (file)
--- a/src/lzms-compress.c
+++ b/src/lzms-compress.c
@@ -163,7 +163,7 @@ struct lzms_huffman_encoder {
         u8 lens[LZMS_MAX_NUM_SYMS];
  
         /* The codeword of each symbol in the Huffman code.  */
-       u16 codewords[LZMS_MAX_NUM_SYMS];
+       u32 codewords[LZMS_MAX_NUM_SYMS];
  };
  
  /* State of the LZMS compressor.  */
diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c

index 6cc6dab326e795484ed678a3716cf06bff77efe0..2bc1c754cd69e49b86c17cce9e934ed888392cd4 100644 (file)
--- a/src/lzms-decompress.c
+++ b/src/lzms-decompress.c
@@ -304,7 +304,7 @@ struct lzms_huffman_decoder {
         u8 lens[LZMS_MAX_NUM_SYMS];
  
         /* The codeword of each symbol in the Huffman code.  */
-       u16 codewords[LZMS_MAX_NUM_SYMS];
+       u32 codewords[LZMS_MAX_NUM_SYMS];
  
         /* A table for quickly decoding symbols encoded using the Huffman code.
          */
diff --git a/src/lzx-compress.c b/src/lzx-compress.c

index 360ee9c4e49a528da6b90fe6b26a4249f2ef3998..edda90aefc1a59cc96bc37779b32df762a1b4521 100644 (file)
--- a/src/lzx-compress.c
+++ b/src/lzx-compress.c
@@ -179,9 +179,9 @@ typedef u32 block_cost_t;
  
  /* Codewords for the LZX main, length, and aligned offset Huffman codes  */
  struct lzx_codewords {
-       u16 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
-       u16 len[LZX_LENCODE_NUM_SYMBOLS];
-       u16 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
+       u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
+       u32 len[LZX_LENCODE_NUM_SYMBOLS];
+       u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
  };
  
  /* Codeword lengths (in bits) for the LZX main, length, and aligned offset
@@ -519,7 +519,7 @@ lzx_build_precode(const u8 lens[restrict],
                   input_idx_t precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS],
                   u8 output_syms[restrict num_syms],
                   u8 precode_lens[restrict LZX_PRECODE_NUM_SYMBOLS],
-                 u16 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS],
+                 u32 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS],
                   unsigned *num_additional_bits_ret)
  {
         memset(precode_freqs, 0,
@@ -688,7 +688,7 @@ lzx_write_compressed_code(struct output_bitstream *out,
         input_idx_t precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
         u8 output_syms[num_syms];
         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
-       u16 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
+       u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
         unsigned i;
         unsigned num_output_syms;
         u8 precode_sym;
diff --git a/src/xpress-compress.c b/src/xpress-compress.c

index 6fb7e1c4846a92db78d9969875b1aa41e35d6b36..228ac51e623b166aeeb58d653b746d881c9d6e3c 100644 (file)
--- a/src/xpress-compress.c
+++ b/src/xpress-compress.c
@@ -50,7 +50,7 @@ struct xpress_compressor {
         u32 max_window_size;
         struct xpress_match *matches;
         input_idx_t *prev_tab;
-       u16 codewords[XPRESS_NUM_SYMBOLS];
+       u32 codewords[XPRESS_NUM_SYMBOLS];
         u8 lens[XPRESS_NUM_SYMBOLS];
         struct xpress_record_ctx record_ctx;
  };
@@ -71,7 +71,7 @@ struct xpress_match {
  static void
  xpress_write_match(struct xpress_match match,
                    struct output_bitstream *restrict ostream,
-                  const u16 codewords[restrict],
+                  const u32 codewords[restrict],
                    const u8 lens[restrict])
  {
         u8 len_hdr = min(match.adjusted_len, 0xf);
@@ -95,7 +95,7 @@ static void
  xpress_write_matches_and_literals(struct output_bitstream *ostream,
                                   const struct xpress_match matches[restrict],
                                   input_idx_t num_matches,
-                                 const u16 codewords[restrict],
+                                 const u32 codewords[restrict],
                                   const u8 lens[restrict])
  {
         for (input_idx_t i = 0; i < num_matches; i++) {
author	Eric Biggers <ebiggers3@gmail.com>
	Thu, 29 May 2014 05:42:54 +0000 (00:42 -0500)
committer	Eric Biggers <ebiggers3@gmail.com>
	Sat, 31 May 2014 14:52:53 +0000 (09:52 -0500)
include/wimlib/compress_common.h		patch \| blob \| history
src/compress_common.c		patch \| blob \| history
src/lzms-compress.c		patch \| blob \| history
src/lzms-decompress.c		patch \| blob \| history
src/lzx-compress.c		patch \| blob \| history
src/xpress-compress.c		patch \| blob \| history