From f2aeb875aa7e73a486c6b4f8bdf4952e49ec8b8e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 30 Jul 2022 19:03:42 -0700
Subject: [PATCH] compress_common: sync with libdeflate

---
 include/wimlib/compiler.h        |   3 +
 include/wimlib/compress_common.h |  12 +-
 src/compress_common.c            | 698 ++++++++++++++++---------------
 src/divsufsort.c                 |   3 -
 4 files changed, 366 insertions(+), 350 deletions(-)

diff --git a/include/wimlib/compiler.h b/include/wimlib/compiler.h
index 0eacffd8..294108c3 100644
--- a/include/wimlib/compiler.h
+++ b/include/wimlib/compiler.h
@@ -149,17 +149,20 @@
 #  define min(a, b)  ({ typeof(a) _a = (a); typeof(b) _b = (b); \
 			(_a < _b) ? _a : _b; })
 #endif
+#define MIN(a, b)	min((a), (b))
 
 /* Get the maximum of two variables, without multiple evaluation.  */
 #ifndef max
 #  define max(a, b)  ({ typeof(a) _a = (a); typeof(b) _b = (b); \
 			(_a > _b) ? _a : _b; })
 #endif
+#define MAX(a, b)	max((a), (b))
 
 /* Swap the values of two variables, without multiple evaluation.  */
 #ifndef swap
 #  define swap(a, b) ({ typeof(a) _a = (a); (a) = (b); (b) = _a; })
 #endif
+#define SWAP(a, b)	swap((a), (b))
 
 /* (Optional) Efficiently swap the bytes of a 16-bit integer.  */
 #if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
diff --git a/include/wimlib/compress_common.h b/include/wimlib/compress_common.h
index 7913a138..f2ed4109 100644
--- a/include/wimlib/compress_common.h
+++ b/include/wimlib/compress_common.h
@@ -9,11 +9,11 @@
 
 #include "wimlib/types.h"
 
-extern void
-make_canonical_huffman_code(unsigned num_syms,
-			    unsigned max_codeword_len,
-			    const u32 freq_tab[restrict],
-			    u8 lens[restrict],
-			    u32 codewords[restrict]);
+#define MAX_NUM_SYMS		799	/* LZMS_MAX_NUM_SYMS */
+#define MAX_CODEWORD_LEN	16
+
+void
+make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
+			    const u32 freqs[], u8 lens[], u32 codewords[]);
 
 #endif /* _WIMLIB_COMPRESS_COMMON_H */
diff --git a/src/compress_common.c b/src/compress_common.c
index ac02e0cf..0e2ba05a 100644
--- a/src/compress_common.c
+++ b/src/compress_common.c
@@ -33,13 +33,16 @@
 
 #include <string.h>
 
+#include "wimlib/assert.h"
 #include "wimlib/compress_common.h"
 #include "wimlib/util.h"
 
-/* Given the binary tree node A[subtree_idx] whose children already
- * satisfy the maxheap property, swap the node with its greater child
- * until it is greater than both its children, so that the maxheap
- * property is satisfied in the subtree rooted at A[subtree_idx].  */
+/*
+ * Given the binary tree node A[subtree_idx] whose children already satisfy the
+ * maxheap property, swap the node with its greater child until it is greater
+ * than or equal to both of its children, so that the maxheap property is
+ * satisfied in the subtree rooted at A[subtree_idx].  'A' uses 1-based indices.
+ */
 static void
 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
 {
@@ -60,297 +63,306 @@ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
 	A[parent_idx] = v;
 }
 
-/* Rearrange the array 'A' so that it satisfies the maxheap property.
+/*
+ * Rearrange the array 'A' so that it satisfies the maxheap property.
  * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
  */
 static void
 heapify_array(u32 A[], unsigned length)
 {
-	for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+	unsigned subtree_idx;
+
+	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
 		heapify_subtree(A, length, subtree_idx);
 }
 
-/* Sort the array 'A', which contains 'length' unsigned 32-bit integers.  */
+/*
+ * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
+ *
+ * Note: name this function heap_sort() instead of heapsort() to avoid colliding
+ * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
+ * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
+ */
 static void
-heapsort(u32 A[], unsigned length)
+heap_sort(u32 A[], unsigned length)
 {
 	A--; /* Use 1-based indices  */
 
 	heapify_array(A, length);
 
 	while (length >= 2) {
-		swap(A[1], A[length]);
+		u32 tmp = A[length];
+
+		A[length] = A[1];
+		A[1] = tmp;
 		length--;
 		heapify_subtree(A, length, 1);
 	}
 }
 
 #define NUM_SYMBOL_BITS 10
-#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+#define NUM_FREQ_BITS	(32 - NUM_SYMBOL_BITS)
+#define SYMBOL_MASK	((1 << NUM_SYMBOL_BITS) - 1)
+#define FREQ_MASK	(~SYMBOL_MASK)
+
+#define GET_NUM_COUNTERS(num_syms)	(num_syms)
 
 /*
- * Sort the symbols primarily by frequency and secondarily by symbol
- * value.  Discard symbols with zero frequency and fill in an array with
- * the remaining symbols, along with their frequencies.  The low
- * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
- * value, and the remaining bits will contain the frequency.
+ * Sort the symbols primarily by frequency and secondarily by symbol value.
+ * Discard symbols with zero frequency and fill in an array with the remaining
+ * symbols, along with their frequencies.  The low NUM_SYMBOL_BITS bits of each
+ * array entry will contain the symbol value, and the remaining bits will
+ * contain the frequency.
  *
  * @num_syms
- *	Number of symbols in the alphabet.
- *	Can't be greater than (1 << NUM_SYMBOL_BITS).
+ *	Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
  *
  * @freqs[num_syms]
- *	The frequency of each symbol.
+ *	Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
  *
  * @lens[num_syms]
- *	An array that eventually will hold the length of each codeword.
- *	This function only fills in the codeword lengths for symbols that
- *	have zero frequency, which are not well defined per se but will
- *	be set to 0.
+ *	An array that eventually will hold the length of each codeword.  This
+ *	function only fills in the codeword lengths for symbols that have zero
+ *	frequency, which are not well defined per se but will be set to 0.
  *
  * @symout[num_syms]
  *	The output array, described above.
  *
- * Returns the number of entries in 'symout' that were filled.  This is
- * the number of symbols that have nonzero frequency.
+ * Returns the number of entries in 'symout' that were filled.  This is the
+ * number of symbols that have nonzero frequency.
  */
 static unsigned
-sort_symbols(unsigned num_syms, const u32 freqs[restrict],
-	     u8 lens[restrict], u32 symout[restrict])
+sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
 {
+	unsigned sym;
+	unsigned i;
 	unsigned num_used_syms;
 	unsigned num_counters;
+	unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)];
 
-	/* We rely on heapsort, but with an added optimization.  Since
-	 * it's common for most symbol frequencies to be low, we first do
-	 * a count sort using a limited number of counters.  High
-	 * frequencies will be counted in the last counter, and only they
-	 * will be sorted with heapsort.
+	/*
+	 * We use heapsort, but with an added optimization.  Since often most
+	 * symbol frequencies are low, we first do a count sort using a limited
+	 * number of counters.  High frequencies are counted in the last
+	 * counter, and only they will be sorted with heapsort.
 	 *
 	 * Note: with more symbols, it is generally beneficial to have more
-	 * counters.  About 1 counter per 4 symbols seems fast.
-	 *
-	 * Note: I also tested radix sort, but even for large symbol
-	 * counts (> 255) and frequencies bounded at 16 bits (enabling
-	 * radix sort by just two base-256 digits), it didn't seem any
-	 * faster than the method implemented here.
-	 *
-	 * Note: I tested the optimized quicksort implementation from
-	 * glibc (with indirection overhead removed), but it was only
-	 * marginally faster than the simple heapsort implemented here.
-	 *
-	 * Tests were done with building the codes for LZX.  Results may
-	 * vary for different compression algorithms...!  */
-
-	num_counters = ALIGN(DIV_ROUND_UP(num_syms, 4), 4);
+	 * counters.  About 1 counter per symbol seems fastest.
+	 */
 
-	unsigned counters[num_counters];
+	num_counters = GET_NUM_COUNTERS(num_syms);
 
-	memset(counters, 0, sizeof(counters));
+	memset(counters, 0, num_counters * sizeof(counters[0]));
 
-	/* Count the frequencies.  */
-	for (unsigned sym = 0; sym < num_syms; sym++)
-		counters[min(freqs[sym], num_counters - 1)]++;
+	/* Count the frequencies. */
+	for (sym = 0; sym < num_syms; sym++)
+		counters[MIN(freqs[sym], num_counters - 1)]++;
 
-	/* Make the counters cumulative, ignoring the zero-th, which
-	 * counted symbols with zero frequency.  As a side effect, this
-	 * calculates the number of symbols with nonzero frequency.  */
+	/*
+	 * Make the counters cumulative, ignoring the zero-th, which counted
+	 * symbols with zero frequency.  As a side effect, this calculates the
+	 * number of symbols with nonzero frequency.
+	 */
 	num_used_syms = 0;
-	for (unsigned i = 1; i < num_counters; i++) {
+	for (i = 1; i < num_counters; i++) {
 		unsigned count = counters[i];
+
 		counters[i] = num_used_syms;
 		num_used_syms += count;
 	}
 
-	/* Sort nonzero-frequency symbols using the counters.  At the
-	 * same time, set the codeword lengths of zero-frequency symbols
-	 * to 0.  */
-	for (unsigned sym = 0; sym < num_syms; sym++) {
+	/*
+	 * Sort nonzero-frequency symbols using the counters.  At the same time,
+	 * set the codeword lengths of zero-frequency symbols to 0.
+	 */
+	for (sym = 0; sym < num_syms; sym++) {
 		u32 freq = freqs[sym];
+
 		if (freq != 0) {
-			symout[counters[min(freq, num_counters - 1)]++] =
+			symout[counters[MIN(freq, num_counters - 1)]++] =
 				sym | (freq << NUM_SYMBOL_BITS);
 		} else {
 			lens[sym] = 0;
 		}
 	}
 
-	/* Sort the symbols counted in the last counter.  */
-	heapsort(symout + counters[num_counters - 2],
-		 counters[num_counters - 1] - counters[num_counters - 2]);
+	/* Sort the symbols counted in the last counter. */
+	heap_sort(symout + counters[num_counters - 2],
+		  counters[num_counters - 1] - counters[num_counters - 2]);
 
 	return num_used_syms;
 }
 
 /*
- * Build the Huffman tree.
+ * Build a Huffman tree.
  *
  * This is an optimized implementation that
  *	(a) takes advantage of the frequencies being already sorted;
- *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
- *	    Huffman tree are sufficient to generate a canonical code;
+ *	(b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
+ *	    tree are sufficient to generate a canonical code;
  *	(c) Only stores parent pointers, not child pointers;
- *	(d) Produces the nodes in the same memory used for input
- *	    frequency information.
- *
- * Array 'A', which contains 'sym_count' entries, is used for both input
- * and output.  For this function, 'sym_count' must be at least 2.
- *
- * For input, the array must contain the frequencies of the symbols,
- * sorted in increasing order.  Specifically, each entry must contain a
- * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
- * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
- * Although these bits will, in fact, contain the symbols that correspond
- * to the frequencies, this function is concerned with frequencies only
- * and keeps the symbols as-is.
- *
- * For output, this function will produce the non-leaf nodes of the
- * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
- * entries of the array.  Entry A[sym_count - 2] will represent the root
- * node.  Each other node will contain the zero-based index of its parent
- * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
- * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
- * note that although these low bits will, in fact, contain a symbol
- * value, this symbol will have *no relationship* with the Huffman tree
- * node that happens to occupy the same slot.  This is because this
+ *	(d) Produces the nodes in the same memory used for input frequency
+ *	    information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input and
+ * output.  For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols, sorted in
+ * increasing order.  Specifically, each entry must contain a frequency left
+ * shifted by NUM_SYMBOL_BITS bits.  Any data in the low NUM_SYMBOL_BITS bits of
+ * the entries will be ignored by this function.  Although these bits will, in
+ * fact, contain the symbols that correspond to the frequencies, this function
+ * is concerned with frequencies only and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the Huffman
+ * tree.  These nodes will be stored in the first (sym_count - 1) entries of the
+ * array.  Entry A[sym_count - 2] will represent the root node.  Each other node
+ * will contain the zero-based index of its parent node in 'A', left shifted by
+ * NUM_SYMBOL_BITS bits.  The low NUM_SYMBOL_BITS bits of each entry in A will
+ * be kept as-is.  Again, note that although these low bits will, in fact,
+ * contain a symbol value, this symbol will have *no relationship* with the
+ * Huffman tree node that happens to occupy the same slot.  This is because this
  * implementation only generates the non-leaf nodes of the tree.
  */
 static void
 build_tree(u32 A[], unsigned sym_count)
 {
-	/* Index, in 'A', of next lowest frequency symbol that has not
-	 * yet been processed.  */
+	const unsigned last_idx = sym_count - 1;
+
+	/* Index of the next lowest frequency leaf that still needs a parent */
 	unsigned i = 0;
 
-	/* Index, in 'A', of next lowest frequency parentless non-leaf
-	 * node; or, if equal to 'e', then no such node exists yet.  */
+	/*
+	 * Index of the next lowest frequency non-leaf that still needs a
+	 * parent, or 'e' if there is currently no such node
+	 */
 	unsigned b = 0;
 
-	/* Index, in 'A', of next node to allocate as a non-leaf.  */
+	/* Index of the next spot for a non-leaf (will overwrite a leaf) */
 	unsigned e = 0;
 
 	do {
-		unsigned m, n;
-		u32 freq_shifted;
+		u32 new_freq;
 
-		/* Choose the two next lowest frequency entries.  */
-
-		if (i != sym_count &&
-		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
-			m = i++;
-		else
-			m = b++;
-
-		if (i != sym_count &&
-		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
-			n = i++;
-		else
-			n = b++;
-
-		/* Allocate a non-leaf node and link the entries to it.
+		/*
+		 * Select the next two lowest frequency nodes among the leaves
+		 * A[i] and non-leaves A[b], and create a new node A[e] to be
+		 * their parent.  Set the new node's frequency to the sum of the
+		 * frequencies of its two children.
 		 *
-		 * If we link an entry that we're visiting for the first
-		 * time (via index 'i'), then we're actually linking a
-		 * leaf node and it will have no effect, since the leaf
-		 * will be overwritten with a non-leaf when index 'e'
-		 * catches up to it.  But it's not any slower to
-		 * unconditionally set the parent index.
-		 *
-		 * We also compute the frequency of the non-leaf node as
-		 * the sum of its two children's frequencies.  */
-
-		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
-
-		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
-		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
-		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
-		e++;
-	} while (sym_count - e > 1);
-		/* When just one entry remains, it is a "leaf" that was
-		 * linked to some other node.  We ignore it, since the
-		 * rest of the array contains the non-leaves which we
-		 * need.  (Note that we're assuming the cases with 0 or 1
-		 * symbols were handled separately.) */
+		 * Usually the next two lowest frequency nodes are of the same
+		 * type (leaf or non-leaf), so check those cases first.
+		 */
+		if (i + 1 <= last_idx &&
+		    (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
+			/* Two leaves */
+			new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
+			i += 2;
+		} else if (b + 2 <= e &&
+			   (i > last_idx ||
+			    (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
+			/* Two non-leaves */
+			new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
+			A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+			A[b + 1] = (e << NUM_SYMBOL_BITS) |
+				   (A[b + 1] & SYMBOL_MASK);
+			b += 2;
+		} else {
+			/* One leaf and one non-leaf */
+			new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
+			A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+			i++;
+			b++;
+		}
+		A[e] = new_freq | (A[e] & SYMBOL_MASK);
+		/*
+		 * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
+		 * tree is complete once we've created 'n - 1' non-leaves.
+		 */
+	} while (++e < last_idx);
 }
 
 /*
- * Given the stripped-down Huffman tree constructed by build_tree(),
- * determine the number of codewords that should be assigned each
- * possible length, taking into account the length-limited constraint.
+ * Given the stripped-down Huffman tree constructed by build_tree(), determine
+ * the number of codewords that should be assigned each possible length, taking
+ * into account the length-limited constraint.
  *
  * @A
- *	The array produced by build_tree(), containing parent index
- *	information for the non-leaf nodes of the Huffman tree.  Each
- *	entry in this array is a node; a node's parent always has a
- *	greater index than that node itself.  This function will
- *	overwrite the parent index information in this array, so
- *	essentially it will destroy the tree.  However, the data in the
- *	low NUM_SYMBOL_BITS of each entry will be preserved.
+ *	The array produced by build_tree(), containing parent index information
+ *	for the non-leaf nodes of the Huffman tree.  Each entry in this array is
+ *	a node; a node's parent always has a greater index than that node
+ *	itself.  This function will overwrite the parent index information in
+ *	this array, so essentially it will destroy the tree.  However, the data
+ *	in the low NUM_SYMBOL_BITS of each entry will be preserved.
  *
  * @root_idx
- *	The 0-based index of the root node in 'A', and consequently one
- *	less than the number of tree node entries in 'A'.  (Or, really 2
- *	less than the actual length of 'A'.)
+ *	The 0-based index of the root node in 'A', and consequently one less
+ *	than the number of tree node entries in 'A'.  (Or, really 2 less than
+ *	the actual length of 'A'.)
  *
  * @len_counts
  *	An array of length ('max_codeword_len' + 1) in which the number of
- *	codewords having each length <= max_codeword_len will be
- *	returned.
+ *	codewords having each length <= max_codeword_len will be returned.
  *
  * @max_codeword_len
  *	The maximum permissible codeword length.
  */
 static void
-compute_length_counts(u32 A[restrict], unsigned root_idx,
-		      unsigned len_counts[restrict], unsigned max_codeword_len)
+compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
+		      unsigned max_codeword_len)
 {
-	/* The key observations are:
+	unsigned len;
+	int node;
+
+	/*
+	 * The key observations are:
 	 *
-	 * (1) We can traverse the non-leaf nodes of the tree, always
-	 * visiting a parent before its children, by simply iterating
-	 * through the array in reverse order.  Consequently, we can
-	 * compute the depth of each node in one pass, overwriting the
-	 * parent indices with depths.
+	 * (1) We can traverse the non-leaf nodes of the tree, always visiting a
+	 *     parent before its children, by simply iterating through the array
+	 *     in reverse order.  Consequently, we can compute the depth of each
+	 *     node in one pass, overwriting the parent indices with depths.
 	 *
-	 * (2) We can initially assume that in the real Huffman tree,
-	 * both children of the root are leaves.  This corresponds to two
-	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
-	 * node during the traversal, we modify this assumption to
-	 * account for the current node *not* being a leaf, but rather
-	 * its two children being leaves.  This causes the loss of one
-	 * codeword for the current depth and the addition of two
-	 * codewords for the current depth plus one.
+	 * (2) We can initially assume that in the real Huffman tree, both
+	 *     children of the root are leaves.  This corresponds to two
+	 *     codewords of length 1.  Then, whenever we visit a (non-leaf) node
+	 *     during the traversal, we modify this assumption to account for
+	 *     the current node *not* being a leaf, but rather its two children
+	 *     being leaves.  This causes the loss of one codeword for the
+	 *     current depth and the addition of two codewords for the current
+	 *     depth plus one.
 	 *
-	 * (3) We can handle the length-limited constraint fairly easily
-	 * by simply using the largest length available when a depth
-	 * exceeds max_codeword_len.
+	 * (3) We can handle the length-limited constraint fairly easily by
+	 *     simply using the largest length available when a depth exceeds
+	 *     max_codeword_len.
 	 */
 
-	for (unsigned len = 0; len <= max_codeword_len; len++)
+	for (len = 0; len <= max_codeword_len; len++)
 		len_counts[len] = 0;
 	len_counts[1] = 2;
 
-	/* Set the root node's depth to 0.  */
+	/* Set the root node's depth to 0. */
 	A[root_idx] &= SYMBOL_MASK;
 
-	for (int node = root_idx - 1; node >= 0; node--) {
+	for (node = root_idx - 1; node >= 0; node--) {
 
-		/* Calculate the depth of this node.  */
+		/* Calculate the depth of this node. */
 
 		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
 		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
 		unsigned depth = parent_depth + 1;
 		unsigned len = depth;
 
-		/* Set the depth of this node so that it is available
-		 * when its children (if any) are processed.  */
-
+		/*
+		 * Set the depth of this node so that it is available when its
+		 * children (if any) are processed.
+		 */
 		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
 
-		/* If needed, decrease the length to meet the
-		 * length-limited constraint.  This is not the optimal
-		 * method for generating length-limited Huffman codes!
-		 * But it should be good enough.  */
+		/*
+		 * If needed, decrease the length to meet the length-limited
+		 * constraint.  This is not the optimal method for generating
+		 * length-limited Huffman codes!  But it should be good enough.
+		 */
 		if (len >= max_codeword_len) {
 			len = max_codeword_len;
 			do {
@@ -358,8 +370,10 @@ compute_length_counts(u32 A[restrict], unsigned root_idx,
 			} while (len_counts[len] == 0);
 		}
 
-		/* Account for the fact that we have a non-leaf node at
-		 * the current depth.  */
+		/*
+		 * Account for the fact that we have a non-leaf node at the
+		 * current depth.
+		 */
 		len_counts[len]--;
 		len_counts[len + 1] += 2;
 	}
@@ -389,34 +403,40 @@ compute_length_counts(u32 A[restrict], unsigned root_idx,
  *	frequency.  This is the length of the 'A' and 'len' arrays.
  */
 static void
-gen_codewords(u32 A[restrict], u8 lens[restrict],
-	      const unsigned len_counts[restrict],
+gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
 	      unsigned max_codeword_len, unsigned num_syms)
 {
-	u32 next_codewords[max_codeword_len + 1];
-
-	/* Given the number of codewords that will have each length,
-	 * assign codeword lengths to symbols.  We do this by assigning
-	 * the lengths in decreasing order to the symbols sorted
-	 * primarily by increasing frequency and secondarily by
-	 * increasing symbol value.  */
-	for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
+	u32 next_codewords[MAX_CODEWORD_LEN + 1];
+	unsigned i;
+	unsigned len;
+	unsigned sym;
+
+	/*
+	 * Given the number of codewords that will have each length, assign
+	 * codeword lengths to symbols.  We do this by assigning the lengths in
+	 * decreasing order to the symbols sorted primarily by increasing
+	 * frequency and secondarily by increasing symbol value.
+	 */
+	for (i = 0, len = max_codeword_len; len >= 1; len--) {
 		unsigned count = len_counts[len];
+
 		while (count--)
 			lens[A[i++] & SYMBOL_MASK] = len;
 	}
 
-	/* Generate the codewords themselves.  We initialize the
+	/*
+	 * Generate the codewords themselves.  We initialize the
 	 * 'next_codewords' array to provide the lexicographically first
-	 * codeword of each length, then assign codewords in symbol
-	 * order.  This produces a canonical code.  */
+	 * codeword of each length, then assign codewords in symbol order.  This
+	 * produces a canonical code.
+	 */
 	next_codewords[0] = 0;
 	next_codewords[1] = 0;
-	for (unsigned len = 2; len <= max_codeword_len; len++)
+	for (len = 2; len <= max_codeword_len; len++)
 		next_codewords[len] =
 			(next_codewords[len - 1] + len_counts[len - 1]) << 1;
 
-	for (unsigned sym = 0; sym < num_syms; sym++)
+	for (sym = 0; sym < num_syms; sym++)
 		A[sym] = next_codewords[lens[sym]]++;
 }
 
@@ -429,88 +449,81 @@ gen_codewords(u32 A[restrict], u8 lens[restrict],
  * length-limited canonical Huffman code.
  *
  * @num_syms
- *	The number of symbols in the alphabet.  The symbols are the
- *	integers in the range [0, num_syms - 1].  This parameter must be
- *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
+ *	The number of symbols in the alphabet.  The symbols are the integers in
+ *	the range [0, num_syms - 1].  This parameter must be at least 2 and
+ *	must not exceed (1 << NUM_SYMBOL_BITS).
  *
  * @max_codeword_len
  *	The maximum permissible codeword length.
  *
  * @freqs
- *	An array of @num_syms entries, each of which specifies the
- *	frequency of the corresponding symbol.  It is valid for some,
- *	none, or all of the frequencies to be 0.
+ *	An array of length @num_syms that gives the frequency of each symbol.
+ *	It is valid for some, none, or all of the frequencies to be 0.  The sum
+ *	of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
  *
  * @lens
- *	An array of @num_syms entries in which this function will return
- *	the length, in bits, of the codeword assigned to each symbol.
- *	Symbols with 0 frequency will not have codewords per se, but
- *	their entries in this array will be set to 0.  No lengths greater
- *	than @max_codeword_len will be assigned.
+ *	An array of @num_syms entries in which this function will return the
+ *	length, in bits, of the codeword assigned to each symbol.  Symbols with
+ *	0 frequency will not have codewords per se, but their entries in this
+ *	array will be set to 0.  No lengths greater than @max_codeword_len will
+ *	be assigned.
  *
  * @codewords
- *	An array of @num_syms entries in which this function will return
- *	the codeword for each symbol, right-justified and padded on the
- *	left with zeroes.  Codewords for symbols with 0 frequency will be
- *	undefined.
+ *	An array of @num_syms entries in which this function will return the
+ *	codeword for each symbol, right-justified and padded on the left with
+ *	zeroes.  Codewords for symbols with 0 frequency will be undefined.
  *
  * ---------------------------------------------------------------------
  *
  * This function builds a length-limited canonical Huffman code.
  *
  * A length-limited Huffman code contains no codewords longer than some
- * specified length, and has exactly (with some algorithms) or
- * approximately (with the algorithm used here) the minimum weighted path
- * length from the root, given this constraint.
- *
- * A canonical Huffman code satisfies the properties that a longer
- * codeword never lexicographically precedes a shorter codeword, and the
- * lexicographic ordering of codewords of the same length is the same as
- * the lexicographic ordering of the corresponding symbols.  A canonical
- * Huffman code, or more generally a canonical prefix code, can be
- * reconstructed from only a list containing the codeword length of each
- * symbol.
- *
- * The classic algorithm to generate a Huffman code creates a node for
- * each symbol, then inserts these nodes into a min-heap keyed by symbol
- * frequency.  Then, repeatedly, the two lowest-frequency nodes are
- * removed from the min-heap and added as the children of a new node
- * having frequency equal to the sum of its two children, which is then
- * inserted into the min-heap.  When only a single node remains in the
- * min-heap, it is the root of the Huffman tree.  The codeword for each
- * symbol is determined by the path needed to reach the corresponding
- * node from the root.  Descending to the left child appends a 0 bit,
- * whereas descending to the right child appends a 1 bit.
- *
- * The classic algorithm is relatively easy to understand, but it is
- * subject to a number of inefficiencies.  In practice, it is fastest to
- * first sort the symbols by frequency.  (This itself can be subject to
- * an optimization based on the fact that most frequencies tend to be
- * low.)  At the same time, we sort secondarily by symbol value, which
- * aids the process of generating a canonical code.  Then, during tree
- * construction, no heap is necessary because both the leaf nodes and the
- * unparented non-leaf nodes can be easily maintained in sorted order.
- * Consequently, there can never be more than two possibilities for the
- * next-lowest-frequency node.
- *
- * In addition, because we're generating a canonical code, we actually
- * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
- * This is because for canonical code generation we don't need to know
- * where the symbols are in the tree.  Rather, we only need to know how
- * many leaf nodes have each depth (codeword length).  And this
- * information can, in fact, be quickly generated from the tree of
- * non-leaves only.
- *
- * Furthermore, we can build this stripped-down Huffman tree directly in
- * the array in which the codewords are to be generated, provided that
- * these array slots are large enough to hold a symbol and frequency
- * value.
- *
- * Still furthermore, we don't even need to maintain explicit child
- * pointers.  We only need the parent pointers, and even those can be
- * overwritten in-place with depth information as part of the process of
- * extracting codeword lengths from the tree.  So in summary, we do NOT
- * need a big structure like:
+ * specified length, and has exactly (with some algorithms) or approximately
+ * (with the algorithm used here) the minimum weighted path length from the
+ * root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer codeword
+ * never lexicographically precedes a shorter codeword, and the lexicographic
+ * ordering of codewords of the same length is the same as the lexicographic
+ * ordering of the corresponding symbols.  A canonical Huffman code, or more
+ * generally a canonical prefix code, can be reconstructed from only a list
+ * containing the codeword length of each symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for each
+ * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
+ * Then, repeatedly, the two lowest-frequency nodes are removed from the
+ * min-heap and added as the children of a new node having frequency equal to
+ * the sum of its two children, which is then inserted into the min-heap.  When
+ * only a single node remains in the min-heap, it is the root of the Huffman
+ * tree.  The codeword for each symbol is determined by the path needed to reach
+ * the corresponding node from the root.  Descending to the left child appends a
+ * 0 bit, whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is subject to
+ * a number of inefficiencies.  In practice, it is fastest to first sort the
+ * symbols by frequency.  (This itself can be subject to an optimization based
+ * on the fact that most frequencies tend to be low.)  At the same time, we sort
+ * secondarily by symbol value, which aids the process of generating a canonical
+ * code.  Then, during tree construction, no heap is necessary because both the
+ * leaf nodes and the unparented non-leaf nodes can be easily maintained in
+ * sorted order.  Consequently, there can never be more than two possibilities
+ * for the next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually don't
+ * need the leaf nodes of the tree at all, only the non-leaf nodes.  This is
+ * because for canonical code generation we don't need to know where the symbols
+ * are in the tree.  Rather, we only need to know how many leaf nodes have each
+ * depth (codeword length).  And this information can, in fact, be quickly
+ * generated from the tree of non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in the
+ * array in which the codewords are to be generated, provided that these array
+ * slots are large enough to hold a symbol and frequency value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child pointers.
+ * We only need the parent pointers, and even those can be overwritten in-place
+ * with depth information as part of the process of extracting codeword lengths
+ * from the tree.  So in summary, we do NOT need a big structure like:
  *
  *	struct huffman_tree_node {
  *		unsigned int symbol;
@@ -521,47 +534,40 @@ gen_codewords(u32 A[restrict], u8 lens[restrict],
  *	};
  *
  *
- *   ... which often gets used in "naive" implementations of Huffman code
- *   generation.
- *
- * Most of these optimizations are based on the implementation in 7-Zip
- * (source file: C/HuffEnc.c), which has been placed in the public domain
- * by Igor Pavlov.  But I've rewritten the code with extensive comments,
- * as it took me a while to figure out what it was doing...!
- *
- * ---------------------------------------------------------------------
- *
- * NOTE: in general, the same frequencies can be used to generate
- * different length-limited canonical Huffman codes.  One choice we have
- * is during tree construction, when we must decide whether to prefer a
- * leaf or non-leaf when there is a tie in frequency.  Another choice we
- * have is how to deal with codewords that would exceed @max_codeword_len
- * bits in length.  Both of these choices affect the resulting codeword
- * lengths, which otherwise can be mapped uniquely onto the resulting
- * canonical Huffman code.
- *
- * Normally, there is no problem with choosing one valid code over
- * another, provided that they produce similar compression ratios.
- * However, the LZMS compression format uses adaptive Huffman coding.  It
- * requires that both the decompressor and compressor build a canonical
- * code equivalent to that which can be generated by using the classic
- * Huffman tree construction algorithm and always processing leaves
- * before non-leaves when there is a frequency tie.  Therefore, we make
- * sure to do this.  This method also has the advantage of sometimes
- * shortening the longest codeword that is generated.
- *
- * There also is the issue of how codewords longer than @max_codeword_len
- * are dealt with.  Fortunately, for LZMS this is irrelevant because for
- * the LZMS alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN
- * (= 15).  Since the LZMS algorithm regularly halves all frequencies,
- * the frequencies cannot become high enough for a length 16 codeword to
- * be generated.  Specifically, I think that if ties are broken in favor
- * of non-leaves (as we do), the lowest total frequency that would give a
- * length-16 codeword would be the sum of the frequencies 1 1 1 3 4 7 11
- * 18 29 47 76 123 199 322 521 843 1364, which is 3570.  And in LZMS we
- * can't get a frequency that high based on the alphabet sizes, rebuild
- * frequencies, and scaling factors.  This worst-case scenario is based
- * on the following degenerate case (only the bottom of the tree shown):
+ * ... which often gets used in "naive" implementations of Huffman code
+ * generation.
+ *
+ * Many of these optimizations are based on the implementation in 7-Zip (source
+ * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
+ *
+ * NOTE: in general, the same frequencies can be used to generate different
+ * length-limited canonical Huffman codes.  One choice we have is during tree
+ * construction, when we must decide whether to prefer a leaf or non-leaf when
+ * there is a tie in frequency.  Another choice we have is how to deal with
+ * codewords that would exceed @max_codeword_len bits in length.  Both of these
+ * choices affect the resulting codeword lengths, which otherwise can be mapped
+ * uniquely onto the resulting canonical Huffman code.
+ *
+ * Normally, there is no problem with choosing one valid code over another,
+ * provided that they produce similar compression ratios.  However, the LZMS
+ * compression format uses adaptive Huffman coding.  It requires that both the
+ * decompressor and compressor build a canonical code equivalent to that which
+ * can be generated by using the classic Huffman tree construction algorithm and
+ * always processing leaves before non-leaves when there is a frequency tie.
+ * Therefore, we make sure to do this.  This method also has the advantage of
+ * sometimes shortening the longest codeword that is generated.
+ *
+ * There also is the issue of how codewords longer than @max_codeword_len are
+ * dealt with.  Fortunately, for LZMS this is irrelevant because for the LZMS
+ * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15).  Since
+ * the LZMS algorithm regularly halves all frequencies, the frequencies cannot
+ * become high enough for a length 16 codeword to be generated.  Specifically, I
+ * think that if ties are broken in favor of non-leaves (as we do), the lowest
+ * total frequency that would give a length-16 codeword would be the sum of the
+ * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is
+ * 3570.  And in LZMS we can't get a frequency that high based on the alphabet
+ * sizes, rebuild frequencies, and scaling factors.  This worst-case scenario is
+ * based on the following degenerate case (only the bottom of the tree shown):
  *
  *                          ...
  *                        17
@@ -576,57 +582,65 @@ gen_codewords(u32 A[restrict], u8 lens[restrict],
  *               / \
  *              1   1
  *
- * Excluding the first leaves (those with value 1), each leaf value must
- * be greater than the non-leaf up 1 and down 2 from it; otherwise that
- * leaf would have taken precedence over that non-leaf and been combined
- * with the leaf below, thereby decreasing the height compared to that
- * shown.
- *
- * Interesting fact: if we were to instead prioritize non-leaves over
- * leaves, then the worst case frequencies would be the Fibonacci
- * sequence, plus an extra frequency of 1.  In this hypothetical
- * scenario, it would be slightly easier for longer codewords to be
- * generated.
+ * Excluding the first leaves (those with value 1), each leaf value must be
+ * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would
+ * have taken precedence over that non-leaf and been combined with the leaf
+ * below, thereby decreasing the height compared to that shown.
+ *
+ * Interesting fact: if we were to instead prioritize non-leaves over leaves,
+ * then the worst case frequencies would be the Fibonacci sequence, plus an
+ * extra frequency of 1.  In this hypothetical scenario, it would be slightly
+ * easier for longer codewords to be generated.
  */
 void
 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
-			    const u32 freqs[restrict],
-			    u8 lens[restrict], u32 codewords[restrict])
+			    const u32 freqs[], u8 lens[], u32 codewords[])
 {
 	u32 *A = codewords;
 	unsigned num_used_syms;
 
-	/* We begin by sorting the symbols primarily by frequency and
-	 * secondarily by symbol value.  As an optimization, the array
-	 * used for this purpose ('A') shares storage with the space in
-	 * which we will eventually return the codewords.  */
+	wimlib_assert(num_syms <= MAX_NUM_SYMS);
+	STATIC_ASSERT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
+	wimlib_assert(max_codeword_len <= MAX_CODEWORD_LEN);
 
+	/*
+	 * We begin by sorting the symbols primarily by frequency and
+	 * secondarily by symbol value.  As an optimization, the array used for
+	 * this purpose ('A') shares storage with the space in which we will
+	 * eventually return the codewords.
+	 */
 	num_used_syms = sort_symbols(num_syms, freqs, lens, A);
 
-	/* 'num_used_syms' is the number of symbols with nonzero
-	 * frequency.  This may be less than @num_syms.  'num_used_syms'
-	 * is also the number of entries in 'A' that are valid.  Each
-	 * entry consists of a distinct symbol and a nonzero frequency
-	 * packed into a 32-bit integer.  */
+	/*
+	 * 'num_used_syms' is the number of symbols with nonzero frequency.
+	 * This may be less than @num_syms.  'num_used_syms' is also the number
+	 * of entries in 'A' that are valid.  Each entry consists of a distinct
+	 * symbol and a nonzero frequency packed into a 32-bit integer.
+	 */
 
-	/* Handle special cases where only 0 or 1 symbols were used (had
-	 * nonzero frequency).  */
+	/*
+	 * Handle special cases where only 0 or 1 symbols were used (had nonzero
+	 * frequency).
+	 */
 
 	if (unlikely(num_used_syms == 0)) {
-		/* Code is empty.  sort_symbols() already set all lengths
-		 * to 0, so there is nothing more to do.  */
+		/*
+		 * Code is empty.  sort_symbols() already set all lengths to 0,
+		 * so there is nothing more to do.
+		 */
 		return;
 	}
 
 	if (unlikely(num_used_syms == 1)) {
-		/* Only one symbol was used, so we only need one
-		 * codeword.  But two codewords are needed to form the
-		 * smallest complete Huffman code, which uses codewords 0
-		 * and 1.  Therefore, we choose another symbol to which
-		 * to assign a codeword.  We use 0 (if the used symbol is
-		 * not 0) or 1 (if the used symbol is 0).  In either
-		 * case, the lesser-valued symbol must be assigned
-		 * codeword 0 so that the resulting code is canonical.  */
+		/*
+		 * Only one symbol was used, so we only need one codeword.  But
+		 * two codewords are needed to form the smallest complete
+		 * Huffman code, which uses codewords 0 and 1.  Therefore, we
+		 * choose another symbol to which to assign a codeword.  We use
+		 * 0 (if the used symbol is not 0) or 1 (if the used symbol is
+		 * 0).  In either case, the lesser-valued symbol must be
+		 * assigned codeword 0 so that the resulting code is canonical.
+		 */
 
 		unsigned sym = A[0] & SYMBOL_MASK;
 		unsigned nonzero_idx = sym ? sym : 1;
@@ -638,14 +652,16 @@ make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
 		return;
 	}
 
-	/* Build a stripped-down version of the Huffman tree, sharing the
-	 * array 'A' with the symbol values.  Then extract length counts
-	 * from the tree and use them to generate the final codewords.  */
+	/*
+	 * Build a stripped-down version of the Huffman tree, sharing the array
+	 * 'A' with the symbol values.  Then extract length counts from the tree
+	 * and use them to generate the final codewords.
+	 */
 
 	build_tree(A, num_used_syms);
 
 	{
-		unsigned len_counts[max_codeword_len + 1];
+		unsigned len_counts[MAX_CODEWORD_LEN + 1];
 
 		compute_length_counts(A, num_used_syms - 2,
 				      len_counts, max_codeword_len);
diff --git a/src/divsufsort.c b/src/divsufsort.c
index c80412f5..81a13f97 100644
--- a/src/divsufsort.c
+++ b/src/divsufsort.c
@@ -56,9 +56,6 @@
 
 
 /*- Macros -*/
-#define SWAP swap
-#define MIN min
-#define MAX max
 
 #define STACK_PUSH(_a, _b, _c, _d)\
   do {\
-- 
2.43.0