v1.14.4

[wimlib] / src / compress_common.c
diff --git a/src/compress_common.c b/src/compress_common.c

index c6d1133f901c909181a170f9bab0b1870358b7ab..0e2ba05a3a067d70d0d58470c50325b1e74c4d11 100644 (file)
--- a/src/compress_common.c
+++ b/src/compress_common.c
@@ -3,21 +3,28 @@
   *
   * Code for compression shared among multiple compression formats.
   *
- * The following copying information applies to this specific source code file:
- *
- * Written in 2012-2014 by Eric Biggers <ebiggers3@gmail.com>
- *
- * To the extent possible under law, the author(s) have dedicated all copyright
- * and related and neighboring rights to this software to the public domain
- * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
- * Dedication (the "CC0").
- *
- * This software is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
- *
- * You should have received a copy of the CC0 along with this software; if not
- * see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
   */
  
  #ifdef HAVE_CONFIG_H
@@ -26,13 +33,16 @@
  
  #include <string.h>
  
+#include "wimlib/assert.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/util.h"
  
-/* Given the binary tree node A[subtree_idx] whose children already
- * satisfy the maxheap property, swap the node with its greater child
- * until it is greater than both its children, so that the maxheap
- * property is satisfied in the subtree rooted at A[subtree_idx].  */
+/*
+ * Given the binary tree node A[subtree_idx] whose children already satisfy the
+ * maxheap property, swap the node with its greater child until it is greater
+ * than or equal to both of its children, so that the maxheap property is
+ * satisfied in the subtree rooted at A[subtree_idx].  'A' uses 1-based indices.
+ */
  static void
  heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
  {
@@ -53,297 +63,306 @@ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
         A[parent_idx] = v;
  }
  
-/* Rearrange the array 'A' so that it satisfies the maxheap property.
+/*
+ * Rearrange the array 'A' so that it satisfies the maxheap property.
   * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
   */
  static void
  heapify_array(u32 A[], unsigned length)
  {
-       for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+       unsigned subtree_idx;
+
+       for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
                 heapify_subtree(A, length, subtree_idx);
  }
  
-/* Sort the array 'A', which contains 'length' unsigned 32-bit integers.  */
+/*
+ * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
+ *
+ * Note: name this function heap_sort() instead of heapsort() to avoid colliding
+ * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
+ * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
+ */
  static void
-heapsort(u32 A[], unsigned length)
+heap_sort(u32 A[], unsigned length)
  {
         A--; /* Use 1-based indices  */
  
         heapify_array(A, length);
  
         while (length >= 2) {
-               swap(A[1], A[length]);
+               u32 tmp = A[length];
+
+               A[length] = A[1];
+               A[1] = tmp;
                 length--;
                 heapify_subtree(A, length, 1);
         }
  }
  
  #define NUM_SYMBOL_BITS 10
-#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+#define NUM_FREQ_BITS  (32 - NUM_SYMBOL_BITS)
+#define SYMBOL_MASK    ((1 << NUM_SYMBOL_BITS) - 1)
+#define FREQ_MASK      (~SYMBOL_MASK)
+
+#define GET_NUM_COUNTERS(num_syms)     (num_syms)
  
  /*
- * Sort the symbols primarily by frequency and secondarily by symbol
- * value.  Discard symbols with zero frequency and fill in an array with
- * the remaining symbols, along with their frequencies.  The low
- * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
- * value, and the remaining bits will contain the frequency.
+ * Sort the symbols primarily by frequency and secondarily by symbol value.
+ * Discard symbols with zero frequency and fill in an array with the remaining
+ * symbols, along with their frequencies.  The low NUM_SYMBOL_BITS bits of each
+ * array entry will contain the symbol value, and the remaining bits will
+ * contain the frequency.
   *
   * @num_syms
- *     Number of symbols in the alphabet.
- *     Can't be greater than (1 << NUM_SYMBOL_BITS).
+ *     Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
   *
   * @freqs[num_syms]
- *     The frequency of each symbol.
+ *     Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
   *
   * @lens[num_syms]
- *     An array that eventually will hold the length of each codeword.
- *     This function only fills in the codeword lengths for symbols that
- *     have zero frequency, which are not well defined per se but will
- *     be set to 0.
+ *     An array that eventually will hold the length of each codeword.  This
+ *     function only fills in the codeword lengths for symbols that have zero
+ *     frequency, which are not well defined per se but will be set to 0.
   *
   * @symout[num_syms]
   *     The output array, described above.
   *
- * Returns the number of entries in 'symout' that were filled.  This is
- * the number of symbols that have nonzero frequency.
+ * Returns the number of entries in 'symout' that were filled.  This is the
+ * number of symbols that have nonzero frequency.
   */
  static unsigned
-sort_symbols(unsigned num_syms, const u32 freqs[restrict],
-            u8 lens[restrict], u32 symout[restrict])
+sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
  {
+       unsigned sym;
+       unsigned i;
         unsigned num_used_syms;
         unsigned num_counters;
+       unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)];
  
-       /* We rely on heapsort, but with an added optimization.  Since
-        * it's common for most symbol frequencies to be low, we first do
-        * a count sort using a limited number of counters.  High
-        * frequencies will be counted in the last counter, and only they
-        * will be sorted with heapsort.
+       /*
+        * We use heapsort, but with an added optimization.  Since often most
+        * symbol frequencies are low, we first do a count sort using a limited
+        * number of counters.  High frequencies are counted in the last
+        * counter, and only they will be sorted with heapsort.
          *
          * Note: with more symbols, it is generally beneficial to have more
-        * counters.  About 1 counter per 4 symbols seems fast.
-        *
-        * Note: I also tested radix sort, but even for large symbol
-        * counts (> 255) and frequencies bounded at 16 bits (enabling
-        * radix sort by just two base-256 digits), it didn't seem any
-        * faster than the method implemented here.
-        *
-        * Note: I tested the optimized quicksort implementation from
-        * glibc (with indirection overhead removed), but it was only
-        * marginally faster than the simple heapsort implemented here.
-        *
-        * Tests were done with building the codes for LZX.  Results may
-        * vary for different compression algorithms...!  */
-
-       num_counters = ALIGN(DIV_ROUND_UP(num_syms, 4), 4);
+        * counters.  About 1 counter per symbol seems fastest.
+        */
  
-       unsigned counters[num_counters];
+       num_counters = GET_NUM_COUNTERS(num_syms);
  
-       memset(counters, 0, sizeof(counters));
+       memset(counters, 0, num_counters * sizeof(counters[0]));
  
-       /* Count the frequencies.  */
-       for (unsigned sym = 0; sym < num_syms; sym++)
-               counters[min(freqs[sym], num_counters - 1)]++;
+       /* Count the frequencies. */
+       for (sym = 0; sym < num_syms; sym++)
+               counters[MIN(freqs[sym], num_counters - 1)]++;
  
-       /* Make the counters cumulative, ignoring the zero-th, which
-        * counted symbols with zero frequency.  As a side effect, this
-        * calculates the number of symbols with nonzero frequency.  */
+       /*
+        * Make the counters cumulative, ignoring the zero-th, which counted
+        * symbols with zero frequency.  As a side effect, this calculates the
+        * number of symbols with nonzero frequency.
+        */
         num_used_syms = 0;
-       for (unsigned i = 1; i < num_counters; i++) {
+       for (i = 1; i < num_counters; i++) {
                 unsigned count = counters[i];
+
                 counters[i] = num_used_syms;
                 num_used_syms += count;
         }
  
-       /* Sort nonzero-frequency symbols using the counters.  At the
-        * same time, set the codeword lengths of zero-frequency symbols
-        * to 0.  */
-       for (unsigned sym = 0; sym < num_syms; sym++) {
+       /*
+        * Sort nonzero-frequency symbols using the counters.  At the same time,
+        * set the codeword lengths of zero-frequency symbols to 0.
+        */
+       for (sym = 0; sym < num_syms; sym++) {
                 u32 freq = freqs[sym];
+
                 if (freq != 0) {
-                       symout[counters[min(freq, num_counters - 1)]++] =
+                       symout[counters[MIN(freq, num_counters - 1)]++] =
                                 sym | (freq << NUM_SYMBOL_BITS);
                 } else {
                         lens[sym] = 0;
                 }
         }
  
-       /* Sort the symbols counted in the last counter.  */
-       heapsort(symout + counters[num_counters - 2],
-                counters[num_counters - 1] - counters[num_counters - 2]);
+       /* Sort the symbols counted in the last counter. */
+       heap_sort(symout + counters[num_counters - 2],
+                 counters[num_counters - 1] - counters[num_counters - 2]);
  
         return num_used_syms;
  }
  
  /*
- * Build the Huffman tree.
+ * Build a Huffman tree.
   *
   * This is an optimized implementation that
   *     (a) takes advantage of the frequencies being already sorted;
- *     (b) only generates non-leaf nodes, since the non-leaf nodes of a
- *         Huffman tree are sufficient to generate a canonical code;
+ *     (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
+ *         tree are sufficient to generate a canonical code;
   *     (c) Only stores parent pointers, not child pointers;
- *     (d) Produces the nodes in the same memory used for input
- *         frequency information.
- *
- * Array 'A', which contains 'sym_count' entries, is used for both input
- * and output.  For this function, 'sym_count' must be at least 2.
- *
- * For input, the array must contain the frequencies of the symbols,
- * sorted in increasing order.  Specifically, each entry must contain a
- * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
- * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
- * Although these bits will, in fact, contain the symbols that correspond
- * to the frequencies, this function is concerned with frequencies only
- * and keeps the symbols as-is.
- *
- * For output, this function will produce the non-leaf nodes of the
- * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
- * entries of the array.  Entry A[sym_count - 2] will represent the root
- * node.  Each other node will contain the zero-based index of its parent
- * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
- * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
- * note that although these low bits will, in fact, contain a symbol
- * value, this symbol will have *no relationship* with the Huffman tree
- * node that happens to occupy the same slot.  This is because this
+ *     (d) Produces the nodes in the same memory used for input frequency
+ *         information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input and
+ * output.  For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols, sorted in
+ * increasing order.  Specifically, each entry must contain a frequency left
+ * shifted by NUM_SYMBOL_BITS bits.  Any data in the low NUM_SYMBOL_BITS bits of
+ * the entries will be ignored by this function.  Although these bits will, in
+ * fact, contain the symbols that correspond to the frequencies, this function
+ * is concerned with frequencies only and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the Huffman
+ * tree.  These nodes will be stored in the first (sym_count - 1) entries of the
+ * array.  Entry A[sym_count - 2] will represent the root node.  Each other node
+ * will contain the zero-based index of its parent node in 'A', left shifted by
+ * NUM_SYMBOL_BITS bits.  The low NUM_SYMBOL_BITS bits of each entry in A will
+ * be kept as-is.  Again, note that although these low bits will, in fact,
+ * contain a symbol value, this symbol will have *no relationship* with the
+ * Huffman tree node that happens to occupy the same slot.  This is because this
   * implementation only generates the non-leaf nodes of the tree.
   */
  static void
  build_tree(u32 A[], unsigned sym_count)
  {
-       /* Index, in 'A', of next lowest frequency symbol that has not
-        * yet been processed.  */
+       const unsigned last_idx = sym_count - 1;
+
+       /* Index of the next lowest frequency leaf that still needs a parent */
         unsigned i = 0;
  
-       /* Index, in 'A', of next lowest frequency parentless non-leaf
-        * node; or, if equal to 'e', then no such node exists yet.  */
+       /*
+        * Index of the next lowest frequency non-leaf that still needs a
+        * parent, or 'e' if there is currently no such node
+        */
         unsigned b = 0;
  
-       /* Index, in 'A', of next node to allocate as a non-leaf.  */
+       /* Index of the next spot for a non-leaf (will overwrite a leaf) */
         unsigned e = 0;
  
         do {
-               unsigned m, n;
-               u32 freq_shifted;
-
-               /* Choose the two next lowest frequency entries.  */
+               u32 new_freq;
  
-               if (i != sym_count &&
-                   (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
-                       m = i++;
-               else
-                       m = b++;
-
-               if (i != sym_count &&
-                   (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
-                       n = i++;
-               else
-                       n = b++;
-
-               /* Allocate a non-leaf node and link the entries to it.
-                *
-                * If we link an entry that we're visiting for the first
-                * time (via index 'i'), then we're actually linking a
-                * leaf node and it will have no effect, since the leaf
-                * will be overwritten with a non-leaf when index 'e'
-                * catches up to it.  But it's not any slower to
-                * unconditionally set the parent index.
+               /*
+                * Select the next two lowest frequency nodes among the leaves
+                * A[i] and non-leaves A[b], and create a new node A[e] to be
+                * their parent.  Set the new node's frequency to the sum of the
+                * frequencies of its two children.
                  *
-                * We also compute the frequency of the non-leaf node as
-                * the sum of its two children's frequencies.  */
-
-               freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
-
-               A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
-               A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
-               A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
-               e++;
-       } while (sym_count - e > 1);
-               /* When just one entry remains, it is a "leaf" that was
-                * linked to some other node.  We ignore it, since the
-                * rest of the array contains the non-leaves which we
-                * need.  (Note that we're assuming the cases with 0 or 1
-                * symbols were handled separately.) */
+                * Usually the next two lowest frequency nodes are of the same
+                * type (leaf or non-leaf), so check those cases first.
+                */
+               if (i + 1 <= last_idx &&
+                   (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
+                       /* Two leaves */
+                       new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
+                       i += 2;
+               } else if (b + 2 <= e &&
+                          (i > last_idx ||
+                           (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
+                       /* Two non-leaves */
+                       new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
+                       A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+                       A[b + 1] = (e << NUM_SYMBOL_BITS) |
+                                  (A[b + 1] & SYMBOL_MASK);
+                       b += 2;
+               } else {
+                       /* One leaf and one non-leaf */
+                       new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
+                       A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+                       i++;
+                       b++;
+               }
+               A[e] = new_freq | (A[e] & SYMBOL_MASK);
+               /*
+                * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
+                * tree is complete once we've created 'n - 1' non-leaves.
+                */
+       } while (++e < last_idx);
  }
  
  /*
- * Given the stripped-down Huffman tree constructed by build_tree(),
- * determine the number of codewords that should be assigned each
- * possible length, taking into account the length-limited constraint.
+ * Given the stripped-down Huffman tree constructed by build_tree(), determine
+ * the number of codewords that should be assigned each possible length, taking
+ * into account the length-limited constraint.
   *
   * @A
- *     The array produced by build_tree(), containing parent index
- *     information for the non-leaf nodes of the Huffman tree.  Each
- *     entry in this array is a node; a node's parent always has a
- *     greater index than that node itself.  This function will
- *     overwrite the parent index information in this array, so
- *     essentially it will destroy the tree.  However, the data in the
- *     low NUM_SYMBOL_BITS of each entry will be preserved.
+ *     The array produced by build_tree(), containing parent index information
+ *     for the non-leaf nodes of the Huffman tree.  Each entry in this array is
+ *     a node; a node's parent always has a greater index than that node
+ *     itself.  This function will overwrite the parent index information in
+ *     this array, so essentially it will destroy the tree.  However, the data
+ *     in the low NUM_SYMBOL_BITS of each entry will be preserved.
   *
   * @root_idx
- *     The 0-based index of the root node in 'A', and consequently one
- *     less than the number of tree node entries in 'A'.  (Or, really 2
- *     less than the actual length of 'A'.)
+ *     The 0-based index of the root node in 'A', and consequently one less
+ *     than the number of tree node entries in 'A'.  (Or, really 2 less than
+ *     the actual length of 'A'.)
   *
   * @len_counts
   *     An array of length ('max_codeword_len' + 1) in which the number of
- *     codewords having each length <= max_codeword_len will be
- *     returned.
+ *     codewords having each length <= max_codeword_len will be returned.
   *
   * @max_codeword_len
   *     The maximum permissible codeword length.
   */
  static void
-compute_length_counts(u32 A[restrict], unsigned root_idx,
-                     unsigned len_counts[restrict], unsigned max_codeword_len)
+compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
+                     unsigned max_codeword_len)
  {
-       /* The key observations are:
+       unsigned len;
+       int node;
+
+       /*
+        * The key observations are:
          *
-        * (1) We can traverse the non-leaf nodes of the tree, always
-        * visiting a parent before its children, by simply iterating
-        * through the array in reverse order.  Consequently, we can
-        * compute the depth of each node in one pass, overwriting the
-        * parent indices with depths.
+        * (1) We can traverse the non-leaf nodes of the tree, always visiting a
+        *     parent before its children, by simply iterating through the array
+        *     in reverse order.  Consequently, we can compute the depth of each
+        *     node in one pass, overwriting the parent indices with depths.
          *
-        * (2) We can initially assume that in the real Huffman tree,
-        * both children of the root are leaves.  This corresponds to two
-        * codewords of length 1.  Then, whenever we visit a (non-leaf)
-        * node during the traversal, we modify this assumption to
-        * account for the current node *not* being a leaf, but rather
-        * its two children being leaves.  This causes the loss of one
-        * codeword for the current depth and the addition of two
-        * codewords for the current depth plus one.
+        * (2) We can initially assume that in the real Huffman tree, both
+        *     children of the root are leaves.  This corresponds to two
+        *     codewords of length 1.  Then, whenever we visit a (non-leaf) node
+        *     during the traversal, we modify this assumption to account for
+        *     the current node *not* being a leaf, but rather its two children
+        *     being leaves.  This causes the loss of one codeword for the
+        *     current depth and the addition of two codewords for the current
+        *     depth plus one.
          *
-        * (3) We can handle the length-limited constraint fairly easily
-        * by simply using the largest length available when a depth
-        * exceeds max_codeword_len.
+        * (3) We can handle the length-limited constraint fairly easily by
+        *     simply using the largest length available when a depth exceeds
+        *     max_codeword_len.
          */
  
-       for (unsigned len = 0; len <= max_codeword_len; len++)
+       for (len = 0; len <= max_codeword_len; len++)
                 len_counts[len] = 0;
         len_counts[1] = 2;
  
-       /* Set the root node's depth to 0.  */
+       /* Set the root node's depth to 0. */
         A[root_idx] &= SYMBOL_MASK;
  
-       for (int node = root_idx - 1; node >= 0; node--) {
+       for (node = root_idx - 1; node >= 0; node--) {
  
-               /* Calculate the depth of this node.  */
+               /* Calculate the depth of this node. */
  
                 unsigned parent = A[node] >> NUM_SYMBOL_BITS;
                 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
                 unsigned depth = parent_depth + 1;
                 unsigned len = depth;
  
-               /* Set the depth of this node so that it is available
-                * when its children (if any) are processed.  */
-
+               /*
+                * Set the depth of this node so that it is available when its
+                * children (if any) are processed.
+                */
                 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
  
-               /* If needed, decrease the length to meet the
-                * length-limited constraint.  This is not the optimal
-                * method for generating length-limited Huffman codes!
-                * But it should be good enough.  */
+               /*
+                * If needed, decrease the length to meet the length-limited
+                * constraint.  This is not the optimal method for generating
+                * length-limited Huffman codes!  But it should be good enough.
+                */
                 if (len >= max_codeword_len) {
                         len = max_codeword_len;
                         do {
@@ -351,8 +370,10 @@ compute_length_counts(u32 A[restrict], unsigned root_idx,
                         } while (len_counts[len] == 0);
                 }
  
-               /* Account for the fact that we have a non-leaf node at
-                * the current depth.  */
+               /*
+                * Account for the fact that we have a non-leaf node at the
+                * current depth.
+                */
                 len_counts[len]--;
                 len_counts[len + 1] += 2;
         }
@@ -382,34 +403,40 @@ compute_length_counts(u32 A[restrict], unsigned root_idx,
   *     frequency.  This is the length of the 'A' and 'len' arrays.
   */
  static void
-gen_codewords(u32 A[restrict], u8 lens[restrict],
-             const unsigned len_counts[restrict],
+gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
               unsigned max_codeword_len, unsigned num_syms)
  {
-       u32 next_codewords[max_codeword_len + 1];
-
-       /* Given the number of codewords that will have each length,
-        * assign codeword lengths to symbols.  We do this by assigning
-        * the lengths in decreasing order to the symbols sorted
-        * primarily by increasing frequency and secondarily by
-        * increasing symbol value.  */
-       for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
+       u32 next_codewords[MAX_CODEWORD_LEN + 1];
+       unsigned i;
+       unsigned len;
+       unsigned sym;
+
+       /*
+        * Given the number of codewords that will have each length, assign
+        * codeword lengths to symbols.  We do this by assigning the lengths in
+        * decreasing order to the symbols sorted primarily by increasing
+        * frequency and secondarily by increasing symbol value.
+        */
+       for (i = 0, len = max_codeword_len; len >= 1; len--) {
                 unsigned count = len_counts[len];
+
                 while (count--)
                         lens[A[i++] & SYMBOL_MASK] = len;
         }
  
-       /* Generate the codewords themselves.  We initialize the
+       /*
+        * Generate the codewords themselves.  We initialize the
          * 'next_codewords' array to provide the lexicographically first
-        * codeword of each length, then assign codewords in symbol
-        * order.  This produces a canonical code.  */
+        * codeword of each length, then assign codewords in symbol order.  This
+        * produces a canonical code.
+        */
         next_codewords[0] = 0;
         next_codewords[1] = 0;
-       for (unsigned len = 2; len <= max_codeword_len; len++)
+       for (len = 2; len <= max_codeword_len; len++)
                 next_codewords[len] =
                         (next_codewords[len - 1] + len_counts[len - 1]) << 1;
  
-       for (unsigned sym = 0; sym < num_syms; sym++)
+       for (sym = 0; sym < num_syms; sym++)
                 A[sym] = next_codewords[lens[sym]]++;
  }
  
@@ -422,88 +449,81 @@ gen_codewords(u32 A[restrict], u8 lens[restrict],
   * length-limited canonical Huffman code.
   *
   * @num_syms
- *     The number of symbols in the alphabet.  The symbols are the
- *     integers in the range [0, num_syms - 1].  This parameter must be
- *     at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
+ *     The number of symbols in the alphabet.  The symbols are the integers in
+ *     the range [0, num_syms - 1].  This parameter must be at least 2 and
+ *     must not exceed (1 << NUM_SYMBOL_BITS).
   *
   * @max_codeword_len
   *     The maximum permissible codeword length.
   *
   * @freqs
- *     An array of @num_syms entries, each of which specifies the
- *     frequency of the corresponding symbol.  It is valid for some,
- *     none, or all of the frequencies to be 0.
+ *     An array of length @num_syms that gives the frequency of each symbol.
+ *     It is valid for some, none, or all of the frequencies to be 0.  The sum
+ *     of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
   *
   * @lens
- *     An array of @num_syms entries in which this function will return
- *     the length, in bits, of the codeword assigned to each symbol.
- *     Symbols with 0 frequency will not have codewords per se, but
- *     their entries in this array will be set to 0.  No lengths greater
- *     than @max_codeword_len will be assigned.
+ *     An array of @num_syms entries in which this function will return the
+ *     length, in bits, of the codeword assigned to each symbol.  Symbols with
+ *     0 frequency will not have codewords per se, but their entries in this
+ *     array will be set to 0.  No lengths greater than @max_codeword_len will
+ *     be assigned.
   *
   * @codewords
- *     An array of @num_syms entries in which this function will return
- *     the codeword for each symbol, right-justified and padded on the
- *     left with zeroes.  Codewords for symbols with 0 frequency will be
- *     undefined.
+ *     An array of @num_syms entries in which this function will return the
+ *     codeword for each symbol, right-justified and padded on the left with
+ *     zeroes.  Codewords for symbols with 0 frequency will be undefined.
   *
   * ---------------------------------------------------------------------
   *
   * This function builds a length-limited canonical Huffman code.
   *
   * A length-limited Huffman code contains no codewords longer than some
- * specified length, and has exactly (with some algorithms) or
- * approximately (with the algorithm used here) the minimum weighted path
- * length from the root, given this constraint.
- *
- * A canonical Huffman code satisfies the properties that a longer
- * codeword never lexicographically precedes a shorter codeword, and the
- * lexicographic ordering of codewords of the same length is the same as
- * the lexicographic ordering of the corresponding symbols.  A canonical
- * Huffman code, or more generally a canonical prefix code, can be
- * reconstructed from only a list containing the codeword length of each
- * symbol.
- *
- * The classic algorithm to generate a Huffman code creates a node for
- * each symbol, then inserts these nodes into a min-heap keyed by symbol
- * frequency.  Then, repeatedly, the two lowest-frequency nodes are
- * removed from the min-heap and added as the children of a new node
- * having frequency equal to the sum of its two children, which is then
- * inserted into the min-heap.  When only a single node remains in the
- * min-heap, it is the root of the Huffman tree.  The codeword for each
- * symbol is determined by the path needed to reach the corresponding
- * node from the root.  Descending to the left child appends a 0 bit,
- * whereas descending to the right child appends a 1 bit.
- *
- * The classic algorithm is relatively easy to understand, but it is
- * subject to a number of inefficiencies.  In practice, it is fastest to
- * first sort the symbols by frequency.  (This itself can be subject to
- * an optimization based on the fact that most frequencies tend to be
- * low.)  At the same time, we sort secondarily by symbol value, which
- * aids the process of generating a canonical code.  Then, during tree
- * construction, no heap is necessary because both the leaf nodes and the
- * unparented non-leaf nodes can be easily maintained in sorted order.
- * Consequently, there can never be more than two possibilities for the
- * next-lowest-frequency node.
- *
- * In addition, because we're generating a canonical code, we actually
- * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
- * This is because for canonical code generation we don't need to know
- * where the symbols are in the tree.  Rather, we only need to know how
- * many leaf nodes have each depth (codeword length).  And this
- * information can, in fact, be quickly generated from the tree of
- * non-leaves only.
- *
- * Furthermore, we can build this stripped-down Huffman tree directly in
- * the array in which the codewords are to be generated, provided that
- * these array slots are large enough to hold a symbol and frequency
- * value.
- *
- * Still furthermore, we don't even need to maintain explicit child
- * pointers.  We only need the parent pointers, and even those can be
- * overwritten in-place with depth information as part of the process of
- * extracting codeword lengths from the tree.  So in summary, we do NOT
- * need a big structure like:
+ * specified length, and has exactly (with some algorithms) or approximately
+ * (with the algorithm used here) the minimum weighted path length from the
+ * root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer codeword
+ * never lexicographically precedes a shorter codeword, and the lexicographic
+ * ordering of codewords of the same length is the same as the lexicographic
+ * ordering of the corresponding symbols.  A canonical Huffman code, or more
+ * generally a canonical prefix code, can be reconstructed from only a list
+ * containing the codeword length of each symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for each
+ * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
+ * Then, repeatedly, the two lowest-frequency nodes are removed from the
+ * min-heap and added as the children of a new node having frequency equal to
+ * the sum of its two children, which is then inserted into the min-heap.  When
+ * only a single node remains in the min-heap, it is the root of the Huffman
+ * tree.  The codeword for each symbol is determined by the path needed to reach
+ * the corresponding node from the root.  Descending to the left child appends a
+ * 0 bit, whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is subject to
+ * a number of inefficiencies.  In practice, it is fastest to first sort the
+ * symbols by frequency.  (This itself can be subject to an optimization based
+ * on the fact that most frequencies tend to be low.)  At the same time, we sort
+ * secondarily by symbol value, which aids the process of generating a canonical
+ * code.  Then, during tree construction, no heap is necessary because both the
+ * leaf nodes and the unparented non-leaf nodes can be easily maintained in
+ * sorted order.  Consequently, there can never be more than two possibilities
+ * for the next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually don't
+ * need the leaf nodes of the tree at all, only the non-leaf nodes.  This is
+ * because for canonical code generation we don't need to know where the symbols
+ * are in the tree.  Rather, we only need to know how many leaf nodes have each
+ * depth (codeword length).  And this information can, in fact, be quickly
+ * generated from the tree of non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in the
+ * array in which the codewords are to be generated, provided that these array
+ * slots are large enough to hold a symbol and frequency value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child pointers.
+ * We only need the parent pointers, and even those can be overwritten in-place
+ * with depth information as part of the process of extracting codeword lengths
+ * from the tree.  So in summary, we do NOT need a big structure like:
   *
   *     struct huffman_tree_node {
   *             unsigned int symbol;
@@ -514,48 +534,40 @@ gen_codewords(u32 A[restrict], u8 lens[restrict],
   *     };
   *
   *
- *   ... which often gets used in "naive" implementations of Huffman code
- *   generation.
- *
- * Most of these optimizations are based on the implementation in 7-Zip
- * (source file: C/HuffEnc.c), which has been placed in the public domain
- * by Igor Pavlov.  But I've rewritten the code with extensive comments,
- * as it took me a while to figure out what it was doing...!
- *
- * ---------------------------------------------------------------------
- *
- * NOTE: in general, the same frequencies can be used to generate
- * different length-limited canonical Huffman codes.  One choice we have
- * is during tree construction, when we must decide whether to prefer a
- * leaf or non-leaf when there is a tie in frequency.  Another choice we
- * have is how to deal with codewords that would exceed @max_codeword_len
- * bits in length.  Both of these choices affect the resulting codeword
- * lengths, which otherwise can be mapped uniquely onto the resulting
- * canonical Huffman code.
- *
- * Normally, there is no problem with choosing one valid code over
- * another, provided that they produce similar compression ratios.
- * However, the LZMS compression format uses adaptive Huffman coding.  It
- * requires that both the decompressor and compressor build a canonical
- * code equivalent to that which can be generated by using the classic
- * Huffman tree construction algorithm and always processing leaves
- * before non-leaves when there is a frequency tie.  Therefore, we make
- * sure to do this.  This method also has the advantage of sometimes
- * shortening the longest codeword that is generated.
- *
- * There also is the issue of how codewords longer than @max_codeword_len
- * are dealt with.  Fortunately, for LZMS this is irrelevant because
- * because for the LZMS alphabets no codeword can ever exceed
- * LZMS_MAX_CODEWORD_LEN (= 15).  Since the LZMS algorithm regularly
- * halves all frequencies, the frequencies cannot become high enough for
- * a length 16 codeword to be generated.  Specifically, I think that if
- * ties are broken in favor of non-leaves (as we do), the lowest total
- * frequency that would give a length-16 codeword would be the sum of the
- * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which
- * is 3570.  And in LZMS we can't get a frequency that high based on the
- * alphabet sizes, rebuild frequencies, and scaling factors.  This
- * worst-case scenario is based on the following degenerate case (only
- * the bottom of the tree shown):
+ * ... which often gets used in "naive" implementations of Huffman code
+ * generation.
+ *
+ * Many of these optimizations are based on the implementation in 7-Zip (source
+ * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
+ *
+ * NOTE: in general, the same frequencies can be used to generate different
+ * length-limited canonical Huffman codes.  One choice we have is during tree
+ * construction, when we must decide whether to prefer a leaf or non-leaf when
+ * there is a tie in frequency.  Another choice we have is how to deal with
+ * codewords that would exceed @max_codeword_len bits in length.  Both of these
+ * choices affect the resulting codeword lengths, which otherwise can be mapped
+ * uniquely onto the resulting canonical Huffman code.
+ *
+ * Normally, there is no problem with choosing one valid code over another,
+ * provided that they produce similar compression ratios.  However, the LZMS
+ * compression format uses adaptive Huffman coding.  It requires that both the
+ * decompressor and compressor build a canonical code equivalent to that which
+ * can be generated by using the classic Huffman tree construction algorithm and
+ * always processing leaves before non-leaves when there is a frequency tie.
+ * Therefore, we make sure to do this.  This method also has the advantage of
+ * sometimes shortening the longest codeword that is generated.
+ *
+ * There also is the issue of how codewords longer than @max_codeword_len are
+ * dealt with.  Fortunately, for LZMS this is irrelevant because for the LZMS
+ * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15).  Since
+ * the LZMS algorithm regularly halves all frequencies, the frequencies cannot
+ * become high enough for a length 16 codeword to be generated.  Specifically, I
+ * think that if ties are broken in favor of non-leaves (as we do), the lowest
+ * total frequency that would give a length-16 codeword would be the sum of the
+ * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is
+ * 3570.  And in LZMS we can't get a frequency that high based on the alphabet
+ * sizes, rebuild frequencies, and scaling factors.  This worst-case scenario is
+ * based on the following degenerate case (only the bottom of the tree shown):
   *
   *                          ...
   *                        17
@@ -570,57 +582,65 @@ gen_codewords(u32 A[restrict], u8 lens[restrict],
   *               / \
   *              1   1
   *
- * Excluding the first leaves (those with value 1), each leaf value must
- * be greater than the non-leaf up 1 and down 2 from it; otherwise that
- * leaf would have taken precedence over that non-leaf and been combined
- * with the leaf below, thereby decreasing the height compared to that
- * shown.
- *
- * Interesting fact: if we were to instead prioritize non-leaves over
- * leaves, then the worst case frequencies would be the Fibonacci
- * sequence, plus an extra frequency of 1.  In this hypothetical
- * scenario, it would be slightly easier for longer codewords to be
- * generated.
+ * Excluding the first leaves (those with value 1), each leaf value must be
+ * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would
+ * have taken precedence over that non-leaf and been combined with the leaf
+ * below, thereby decreasing the height compared to that shown.
+ *
+ * Interesting fact: if we were to instead prioritize non-leaves over leaves,
+ * then the worst case frequencies would be the Fibonacci sequence, plus an
+ * extra frequency of 1.  In this hypothetical scenario, it would be slightly
+ * easier for longer codewords to be generated.
   */
  void
  make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
-                           const u32 freqs[restrict],
-                           u8 lens[restrict], u32 codewords[restrict])
+                           const u32 freqs[], u8 lens[], u32 codewords[])
  {
         u32 *A = codewords;
         unsigned num_used_syms;
  
-       /* We begin by sorting the symbols primarily by frequency and
-        * secondarily by symbol value.  As an optimization, the array
-        * used for this purpose ('A') shares storage with the space in
-        * which we will eventually return the codewords.  */
+       wimlib_assert(num_syms <= MAX_NUM_SYMS);
+       STATIC_ASSERT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
+       wimlib_assert(max_codeword_len <= MAX_CODEWORD_LEN);
  
+       /*
+        * We begin by sorting the symbols primarily by frequency and
+        * secondarily by symbol value.  As an optimization, the array used for
+        * this purpose ('A') shares storage with the space in which we will
+        * eventually return the codewords.
+        */
         num_used_syms = sort_symbols(num_syms, freqs, lens, A);
  
-       /* 'num_used_syms' is the number of symbols with nonzero
-        * frequency.  This may be less than @num_syms.  'num_used_syms'
-        * is also the number of entries in 'A' that are valid.  Each
-        * entry consists of a distinct symbol and a nonzero frequency
-        * packed into a 32-bit integer.  */
+       /*
+        * 'num_used_syms' is the number of symbols with nonzero frequency.
+        * This may be less than @num_syms.  'num_used_syms' is also the number
+        * of entries in 'A' that are valid.  Each entry consists of a distinct
+        * symbol and a nonzero frequency packed into a 32-bit integer.
+        */
  
-       /* Handle special cases where only 0 or 1 symbols were used (had
-        * nonzero frequency).  */
+       /*
+        * Handle special cases where only 0 or 1 symbols were used (had nonzero
+        * frequency).
+        */
  
         if (unlikely(num_used_syms == 0)) {
-               /* Code is empty.  sort_symbols() already set all lengths
-                * to 0, so there is nothing more to do.  */
+               /*
+                * Code is empty.  sort_symbols() already set all lengths to 0,
+                * so there is nothing more to do.
+                */
                 return;
         }
  
         if (unlikely(num_used_syms == 1)) {
-               /* Only one symbol was used, so we only need one
-                * codeword.  But two codewords are needed to form the
-                * smallest complete Huffman code, which uses codewords 0
-                * and 1.  Therefore, we choose another symbol to which
-                * to assign a codeword.  We use 0 (if the used symbol is
-                * not 0) or 1 (if the used symbol is 0).  In either
-                * case, the lesser-valued symbol must be assigned
-                * codeword 0 so that the resulting code is canonical.  */
+               /*
+                * Only one symbol was used, so we only need one codeword.  But
+                * two codewords are needed to form the smallest complete
+                * Huffman code, which uses codewords 0 and 1.  Therefore, we
+                * choose another symbol to which to assign a codeword.  We use
+                * 0 (if the used symbol is not 0) or 1 (if the used symbol is
+                * 0).  In either case, the lesser-valued symbol must be
+                * assigned codeword 0 so that the resulting code is canonical.
+                */
  
                 unsigned sym = A[0] & SYMBOL_MASK;
                 unsigned nonzero_idx = sym ? sym : 1;
@@ -632,14 +652,16 @@ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
                 return;
         }
  
-       /* Build a stripped-down version of the Huffman tree, sharing the
-        * array 'A' with the symbol values.  Then extract length counts
-        * from the tree and use them to generate the final codewords.  */
+       /*
+        * Build a stripped-down version of the Huffman tree, sharing the array
+        * 'A' with the symbol values.  Then extract length counts from the tree
+        * and use them to generate the final codewords.
+        */
  
         build_tree(A, num_used_syms);
  
         {
-               unsigned len_counts[max_codeword_len + 1];
+               unsigned len_counts[MAX_CODEWORD_LEN + 1];
  
                 compute_length_counts(A, num_used_syms - 2,
                                       len_counts, max_codeword_len);