X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fcompress_common.c;h=270b8cfe3b4ffd410a2cc02e0247e3327846f4a0;hp=9659d07b00d930f22badaaac6f5c9897c2f926b0;hb=fe548d263d477a745dfa5057f540cc5c35ecce89;hpb=883833a4b3dabec325edf1ca938000f91d587c00

diff --git a/src/compress_common.c b/src/compress_common.c
index 9659d07b..270b8cfe 100644
--- a/src/compress_common.c
+++ b/src/compress_common.c
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (C) 2012, 2013 Eric Biggers
+ * Copyright (C) 2012, 2013, 2014 Eric Biggers
  *
  * This file is part of wimlib, a library for working with WIM files.
  *
@@ -33,7 +33,6 @@
 #include "wimlib/compress_common.h"
 #include "wimlib/util.h"
 
-#include <stdlib.h>
 #include <string.h>
 
 /* Writes @num_bits bits, given by the @num_bits least significant bits of
@@ -126,346 +125,627 @@ init_output_bitstream(struct output_bitstream *ostream,
 	ostream->overrun             = false;
 }
 
-typedef struct {
-	input_idx_t freq;
-	u16 sym;
-	union {
-		u16 path_len;
-		u16 height;
-	};
-} HuffmanNode;
-
-typedef struct HuffmanIntermediateNode {
-	HuffmanNode node_base;
-	HuffmanNode *left_child;
-	HuffmanNode *right_child;
-} HuffmanIntermediateNode;
-
-
-/* Comparator function for HuffmanNodes.  Sorts primarily by symbol
- * frequency and secondarily by symbol value. */
-static int
-cmp_nodes_by_freq(const void *_leaf1, const void *_leaf2)
+/* Given the binary tree node A[subtree_idx] whose children already
+ * satisfy the maxheap property, swap the node with its greater child
+ * until it is greater than both its children, so that the maxheap
+ * property is satisfied in the subtree rooted at A[subtree_idx].  */
+static void
+heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
 {
-	const HuffmanNode *leaf1 = _leaf1;
-	const HuffmanNode *leaf2 = _leaf2;
-
-	if (leaf1->freq > leaf2->freq)
-		return 1;
-	else if (leaf1->freq < leaf2->freq)
-		return -1;
-	else
-		return (int)leaf1->sym - (int)leaf2->sym;
+	unsigned parent_idx;
+	unsigned child_idx;
+	u32 v;
+
+	v = A[subtree_idx];
+	parent_idx = subtree_idx;
+	while ((child_idx = parent_idx * 2) <= length) {
+		if (child_idx < length && A[child_idx + 1] > A[child_idx])
+			child_idx++;
+		if (v >= A[child_idx])
+			break;
+		A[parent_idx] = A[child_idx];
+		parent_idx = child_idx;
+	}
+	A[parent_idx] = v;
 }
 
-/* Comparator function for HuffmanNodes.  Sorts primarily by code length and
- * secondarily by symbol value. */
-static int
-cmp_nodes_by_code_len(const void *_leaf1, const void *_leaf2)
+/* Rearrange the array 'A' so that it satisfies the maxheap property.
+ * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
+ */
+static void
+heapify_array(u32 A[], unsigned length)
 {
-	const HuffmanNode *leaf1 = _leaf1;
-	const HuffmanNode *leaf2 = _leaf2;
-
-	int code_len_diff = (int)leaf1->path_len - (int)leaf2->path_len;
-
-	if (code_len_diff == 0)
-		return (int)leaf1->sym - (int)leaf2->sym;
-	else
-		return code_len_diff;
+	for (unsigned subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
+		heapify_subtree(A, length, subtree_idx);
 }
 
-#define INVALID_SYMBOL 0xffff
-
-/* Recursive function to calculate the depth of the leaves in a Huffman tree.
- * */
+/* Sort the array 'A', which contains 'length' unsigned 32-bit integers.  */
 static void
-huffman_tree_compute_path_lengths(HuffmanNode *base_node, u16 cur_len)
+heapsort(u32 A[], unsigned length)
 {
-	if (base_node->sym == INVALID_SYMBOL) {
-		/* Intermediate node. */
-		HuffmanIntermediateNode *node = (HuffmanIntermediateNode*)base_node;
-		huffman_tree_compute_path_lengths(node->left_child, cur_len + 1);
-		huffman_tree_compute_path_lengths(node->right_child, cur_len + 1);
-	} else {
-		/* Leaf node. */
-		base_node->path_len = cur_len;
+	A--; /* Use 1-based indices  */
+
+	heapify_array(A, length);
+
+	while (length >= 2) {
+		swap(A[1], A[length]);
+		length--;
+		heapify_subtree(A, length, 1);
 	}
 }
 
-/* make_canonical_huffman_code: - Creates a canonical Huffman code from an array
- *				  of symbol frequencies.
- *
- * The algorithm used is similar to the well-known algorithm that builds a
- * Huffman tree using a minheap.  In that algorithm, the leaf nodes are
- * initialized and inserted into the minheap with the frequency as the key.
- * Repeatedly, the top two nodes (nodes with the lowest frequency) are taken out
- * of the heap and made the children of a new node that has a frequency equal to
- * the sum of the two frequencies of its children.  This new node is inserted
- * into the heap.  When all the nodes have been removed from the heap, what
- * remains is the Huffman tree. The Huffman code for a symbol is given by the
- * path to it in the tree, where each left pointer is mapped to a 0 bit and each
- * right pointer is mapped to a 1 bit.
- *
- * The algorithm used here uses an optimization that removes the need to
- * actually use a heap.  The leaf nodes are first sorted by frequency, as
- * opposed to being made into a heap.  Note that this sorting step takes O(n log
- * n) time vs.  O(n) time for heapifying the array, where n is the number of
- * symbols.  However, the heapless method is probably faster overall, due to the
- * time saved later.  In the heapless method, whenever an intermediate node is
- * created, it is not inserted into the sorted array.  Instead, the intermediate
- * nodes are kept in a separate array, which is easily kept sorted because every
- * time an intermediate node is initialized, it will have a frequency at least
- * as high as that of the previous intermediate node that was initialized.  So
- * whenever we want the 2 nodes, leaf or intermediate, that have the lowest
- * frequency, we check the low-frequency ends of both arrays, which is an O(1)
- * operation.
- *
- * The function builds a canonical Huffman code, not just any Huffman code.  A
- * Huffman code is canonical if the codeword for each symbol numerically
- * precedes the codeword for all other symbols of the same length that are
- * numbered higher than the symbol, and additionally, all shorter codewords,
- * 0-extended, numerically precede longer codewords.  A canonical Huffman code
- * is useful because it can be reconstructed by only knowing the path lengths in
- * the tree.  See the make_huffman_decode_table() function to see how to
- * reconstruct a canonical Huffman code from only the lengths of the codes.
- *
- * @num_syms:  The number of symbols in the alphabet.
- *
- * @max_codeword_len:  The maximum allowed length of a codeword in the code.
- *			Note that if the code being created runs up against
- *			this restriction, the code ultimately created will be
- *			suboptimal, although there are some advantages for
- *			limiting the length of the codewords.
- *
- * @freq_tab:  An array of length @num_syms that contains the frequencies
- *			of each symbol in the uncompressed data.
- *
- * @lens:	   An array of length @num_syms into which the lengths of the
- *			codewords for each symbol will be written.
- *
- * @codewords:	   An array of @num_syms short integers into which the
- *			codewords for each symbol will be written.  The first
- *			lens[i] bits of codewords[i] will contain the codeword
- *			for symbol i.
+#define NUM_SYMBOL_BITS 10
+#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+
+/*
+ * Sort the symbols primarily by frequency and secondarily by symbol
+ * value.  Discard symbols with zero frequency and fill in an array with
+ * the remaining symbols, along with their frequencies.  The low
+ * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
+ * value, and the remaining bits will contain the frequency.
+ *
+ * @num_syms
+ *	Number of symbols in the alphabet.
+ *	Can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @freqs[num_syms]
+ *	The frequency of each symbol.
+ *
+ * @lens[num_syms]
+ *	An array that eventually will hold the length of each codeword.
+ *	This function only fills in the codeword lengths for symbols that
+ *	have zero frequency, which are not well defined per se but will
+ *	be set to 0.
+ *
+ * @symout[num_syms]
+ *	The output array, described above.
+ *
+ * Returns the number of entries in 'symout' that were filled.  This is
+ * the number of symbols that have nonzero frequency.
  */
-void
-make_canonical_huffman_code(unsigned num_syms,
-			    unsigned max_codeword_len,
-			    const input_idx_t freq_tab[restrict],
-			    u8 lens[restrict],
-			    u16 codewords[restrict])
+static unsigned
+sort_symbols(unsigned num_syms, const u32 freqs[restrict],
+	     u8 lens[restrict], u32 symout[restrict])
 {
-	/* We require at least 2 possible symbols in the alphabet to produce a
-	 * valid Huffman decoding table. It is allowed that fewer than 2 symbols
-	 * are actually used, though. */
-	wimlib_assert(num_syms >= 2 && num_syms < INVALID_SYMBOL);
-
-	/* Initialize the lengths and codewords to 0 */
-	memset(lens, 0, num_syms * sizeof(lens[0]));
-	memset(codewords, 0, num_syms * sizeof(codewords[0]));
-
-	/* Calculate how many symbols have non-zero frequency.  These are the
-	 * symbols that actually appeared in the input. */
-	unsigned num_used_symbols = 0;
-	for (unsigned i = 0; i < num_syms; i++)
-		if (freq_tab[i] != 0)
-			num_used_symbols++;
-
-
-	/* It is impossible to make a code for num_used_symbols symbols if there
-	 * aren't enough code bits to uniquely represent all of them. */
-	wimlib_assert((1 << max_codeword_len) > num_used_symbols);
-
-	/* Initialize the array of leaf nodes with the symbols and their
-	 * frequencies. */
-	HuffmanNode leaves[num_used_symbols];
-	unsigned leaf_idx = 0;
-	for (unsigned i = 0; i < num_syms; i++) {
-		if (freq_tab[i] != 0) {
-			leaves[leaf_idx].freq = freq_tab[i];
-			leaves[leaf_idx].sym  = i;
-			leaves[leaf_idx].height = 0;
-			leaf_idx++;
-		}
+	unsigned num_used_syms;
+	unsigned num_counters;
+
+	/* We rely on heapsort, but with an added optimization.  Since
+	 * it's common for most symbol frequencies to be low, we first do
+	 * a count sort using a limited number of counters.  High
+	 * frequencies will be counted in the last counter, and only they
+	 * will be sorted with heapsort.
+	 *
+	 * Note: with more symbols, it is generally beneficial to have more
+	 * counters.  About 1 counter per 4 symbols seems fast.
+	 *
+	 * Note: I also tested radix sort, but even for large symbol
+	 * counts (> 255) and frequencies bounded at 16 bits (enabling
+	 * radix sort by just two base-256 digits), it didn't seem any
+	 * faster than the method implemented here.
+	 *
+	 * Note: I tested the optimized quicksort implementation from
+	 * glibc (with indirection overhead removed), but it was only
+	 * marginally faster than the simple heapsort implemented here.
+	 *
+	 * Tests were done with building the codes for LZX.  Results may
+	 * vary for different compression algorithms...!  */
+
+	num_counters = (DIV_ROUND_UP(num_syms, 4) + 3) & ~3;
+
+	unsigned counters[num_counters];
+
+	memset(counters, 0, sizeof(counters));
+
+	/* Count the frequencies.  */
+	for (unsigned sym = 0; sym < num_syms; sym++)
+		counters[min(freqs[sym], num_counters - 1)]++;
+
+	/* Make the counters cumulative, ignoring the zero-th, which
+	 * counted symbols with zero frequency.  As a side effect, this
+	 * calculates the number of symbols with nonzero frequency.  */
+	num_used_syms = 0;
+	for (unsigned i = 1; i < num_counters; i++) {
+		unsigned count = counters[i];
+		counters[i] = num_used_syms;
+		num_used_syms += count;
 	}
 
-	/* Deal with the special cases where num_used_symbols < 2. */
-	if (num_used_symbols < 2) {
-		if (num_used_symbols == 0) {
-			/* If num_used_symbols is 0, there are no symbols in the
-			 * input, so it must be empty.  This should be an error,
-			 * but the LZX format expects this case to succeed.  All
-			 * the codeword lengths are simply marked as 0 (which
-			 * was already done.) */
+	/* Sort nonzero-frequency symbols using the counters.  At the
+	 * same time, set the codeword lengths of zero-frequency symbols
+	 * to 0.  */
+	for (unsigned sym = 0; sym < num_syms; sym++) {
+		u32 freq = freqs[sym];
+		if (freq != 0) {
+			symout[counters[min(freq, num_counters - 1)]++] =
+				sym | (freq << NUM_SYMBOL_BITS);
 		} else {
-			/* If only one symbol is present, the LZX format
-			 * requires that the Huffman code include two codewords.
-			 * One is not used.  Note that this doesn't make the
-			 * encoded data take up more room anyway, since binary
-			 * data itself has 2 symbols. */
-
-			unsigned sym = leaves[0].sym;
-
-			codewords[0] = 0;
-			lens[0]      = 1;
-			if (sym == 0) {
-				/* dummy symbol is 1, real symbol is 0 */
-				codewords[1] = 1;
-				lens[1]      = 1;
-			} else {
-				/* dummy symbol is 0, real symbol is sym */
-				codewords[sym] = 1;
-				lens[sym]      = 1;
-			}
+			lens[sym] = 0;
 		}
-		return;
 	}
 
-	/* Otherwise, there are at least 2 symbols in the input, so we need to
-	 * find a real Huffman code. */
+	/* Sort the symbols counted in the last counter.  */
+	heapsort(symout + counters[num_counters - 2],
+		 counters[num_counters - 1] - counters[num_counters - 2]);
+
+	return num_used_syms;
+}
 
+/*
+ * Build the Huffman tree.
+ *
+ * This is an optimized implementation that
+ *	(a) takes advantage of the frequencies being already sorted;
+ *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
+ *	    Huffman tree are sufficient to generate a canonical code;
+ *	(c) Only stores parent pointers, not child pointers;
+ *	(d) Produces the nodes in the same memory used for input
+ *	    frequency information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input
+ * and output.  For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols,
+ * sorted in increasing order.  Specifically, each entry must contain a
+ * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
+ * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
+ * Although these bits will, in fact, contain the symbols that correspond
+ * to the frequencies, this function is concerned with frequencies only
+ * and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the
+ * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
+ * entries of the array.  Entry A[sym_count - 2] will represent the root
+ * node.  Each other node will contain the zero-based index of its parent
+ * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
+ * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
+ * note that although these low bits will, in fact, contain a symbol
+ * value, this symbol will have *no relationship* with the Huffman tree
+ * node that happens to occupy the same slot.  This is because this
+ * implementation only generates the non-leaf nodes of the tree.
+ */
+static void
+build_tree(u32 A[], unsigned sym_count)
+{
+	/* Index, in 'A', of next lowest frequency symbol that has not
+	 * yet been processed.  */
+	unsigned i = 0;
+
+	/* Index, in 'A', of next lowest frequency parentless non-leaf
+	 * node; or, if equal to 'e', then no such node exists yet.  */
+	unsigned b = 0;
+
+	/* Index, in 'A', of next node to allocate as a non-leaf.  */
+	unsigned e = 0;
+
+	do {
+		unsigned m, n;
+		u32 freq_shifted;
+
+		/* Choose the two next lowest frequency entries.  */
+
+		if (i != sym_count &&
+		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+			m = i++;
+		else
+			m = b++;
+
+		if (i != sym_count &&
+		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
+			n = i++;
+		else
+			n = b++;
+
+		/* Allocate a non-leaf node and link the entries to it.
+		 *
+		 * If we link an entry that we're visiting for the first
+		 * time (via index 'i'), then we're actually linking a
+		 * leaf node and it will have no effect, since the leaf
+		 * will be overwritten with a non-leaf when index 'e'
+		 * catches up to it.  But it's not any slower to
+		 * unconditionally set the parent index.
+		 *
+		 * We also compute the frequency of the non-leaf node as
+		 * the sum of its two children's frequencies.  */
+
+		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
+
+		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
+		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
+		e++;
+	} while (sym_count - e > 1);
+		/* When just one entry remains, it is a "leaf" that was
+		 * linked to some other node.  We ignore it, since the
+		 * rest of the array contains the non-leaves which we
+		 * need.  (Note that we're assuming the cases with 0 or 1
+		 * symbols were handled separately.) */
+}
 
-	/* Declare the array of intermediate nodes.  An intermediate node is not
-	 * associated with a symbol. Instead, it represents some binary code
-	 * prefix that is shared between at least 2 codewords.  There can be at
-	 * most num_used_symbols - 1 intermediate nodes when creating a Huffman
-	 * code.  This is because if there were at least num_used_symbols nodes,
-	 * the code would be suboptimal because there would be at least one
-	 * unnecessary intermediate node.
+/*
+ * Given the stripped-down Huffman tree constructed by build_tree(),
+ * determine the number of codewords that should be assigned each
+ * possible length, taking into account the length-limited constraint.
+ *
+ * @A
+ *	The array produced by build_tree(), containing parent index
+ *	information for the non-leaf nodes of the Huffman tree.  Each
+ *	entry in this array is a node; a node's parent always has a
+ *	greater index than that node itself.  This function will
+ *	overwrite the parent index information in this array, so
+ *	essentially it will destroy the tree.  However, the data in the
+ *	low NUM_SYMBOL_BITS of each entry will be preserved.
+ *
+ * @root_idx
+ *	The 0-based index of the root node in 'A', and consequently one
+ *	less than the number of tree node entries in 'A'.  (Or, really 2
+ *	less than the actual length of 'A'.)
+ *
+ * @len_counts
+ *	An array of length ('max_codeword_len' + 1) in which the number of
+ *	codewords having each length <= max_codeword_len will be
+ *	returned.
+ *
+ * @max_codeword_len
+ *	The maximum permissible codeword length.
+ */
+static void
+compute_length_counts(u32 A[restrict], unsigned root_idx,
+		      unsigned len_counts[restrict], unsigned max_codeword_len)
+{
+	/* The key observations are:
 	 *
-	 * The worst case (greatest number of intermediate nodes) would be if
-	 * all the intermediate nodes were chained together.  This results in
-	 * num_used_symbols - 1 intermediate nodes.  If num_used_symbols is at
-	 * least 17, this configuration would not be allowed because the LZX
-	 * format constrains codes to 16 bits or less each.  However, it is
-	 * still possible for there to be more than 16 intermediate nodes, as
-	 * long as no leaf has a depth of more than 16.  */
-	HuffmanIntermediateNode inodes[num_used_symbols - 1];
-
-
-	/* Pointer to the leaf node of lowest frequency that hasn't already been
-	 * added as the child of some intermediate note. */
-	HuffmanNode *cur_leaf;
-
-	/* Pointer past the end of the array of leaves. */
-	HuffmanNode *end_leaf = &leaves[num_used_symbols];
-
-	/* Pointer to the intermediate node of lowest frequency. */
-	HuffmanIntermediateNode     *cur_inode;
-
-	/* Pointer to the next unallocated intermediate node. */
-	HuffmanIntermediateNode     *next_inode;
-
-	/* Only jump back to here if the maximum length of the codewords allowed
-	 * by the LZX format (16 bits) is exceeded. */
-try_building_tree_again:
-
-	/* Sort the leaves from those that correspond to the least frequent
-	 * symbol, to those that correspond to the most frequent symbol.  If two
-	 * leaves have the same frequency, they are sorted by symbol. */
-	qsort(leaves, num_used_symbols, sizeof(leaves[0]), cmp_nodes_by_freq);
-
-	cur_leaf   = &leaves[0];
-	cur_inode  = &inodes[0];
-	next_inode = &inodes[0];
-
-	/* The following loop takes the two lowest frequency nodes of those
-	 * remaining and makes them the children of the next available
-	 * intermediate node.  It continues until all the leaf nodes and
-	 * intermediate nodes have been used up, or the maximum allowed length
-	 * for the codewords is exceeded.  For the latter case, we must adjust
-	 * the frequencies to be more equal and then execute this loop again. */
-	while (1) {
-
-		/* Lowest frequency node. */
-		HuffmanNode *f1;
-
-		/* Second lowest frequency node. */
-		HuffmanNode *f2;
-
-		/* Get the lowest and second lowest frequency nodes from the
-		 * remaining leaves or from the intermediate nodes. */
-
-		if (cur_leaf != end_leaf && (cur_inode == next_inode ||
-					cur_leaf->freq <= cur_inode->node_base.freq)) {
-			f1 = cur_leaf++;
-		} else if (cur_inode != next_inode) {
-			f1 = (HuffmanNode*)cur_inode++;
-		}
+	 * (1) We can traverse the non-leaf nodes of the tree, always
+	 * visiting a parent before its children, by simply iterating
+	 * through the array in reverse order.  Consequently, we can
+	 * compute the depth of each node in one pass, overwriting the
+	 * parent indices with depths.
+	 *
+	 * (2) We can initially assume that in the real Huffman tree,
+	 * both children of the root are leaves.  This corresponds to two
+	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
+	 * node during the traversal, we modify this assumption to
+	 * account for the current node *not* being a leaf, but rather
+	 * its two children being leaves.  This causes the loss of one
+	 * codeword for the current depth and the addition of two
+	 * codewords for the current depth plus one.
+	 *
+	 * (3) We can handle the length-limited constraint fairly easily
+	 * by simply using the largest length available when a depth
+	 * exceeds max_codeword_len.
+	 */
 
-		if (cur_leaf != end_leaf && (cur_inode == next_inode ||
-					cur_leaf->freq <= cur_inode->node_base.freq)) {
-			f2 = cur_leaf++;
-		} else if (cur_inode != next_inode) {
-			f2 = (HuffmanNode*)cur_inode++;
-		} else {
-			/* All nodes used up! */
-			break;
-		}
+	for (unsigned len = 0; len <= max_codeword_len; len++)
+		len_counts[len] = 0;
+	len_counts[1] = 2;
 
-		/* next_inode becomes the parent of f1 and f2. */
-
-		next_inode->node_base.freq = f1->freq + f2->freq;
-		next_inode->node_base.sym  = INVALID_SYMBOL;
-		next_inode->left_child     = f1;
-		next_inode->right_child    = f2;
-
-		/* We need to keep track of the height so that we can detect if
-		 * the length of a codeword has execeed max_codeword_len.   The
-		 * parent node has a height one higher than the maximum height
-		 * of its children. */
-		next_inode->node_base.height = max(f1->height, f2->height) + 1;
-
-		/* Check to see if the code length of the leaf farthest away
-		 * from next_inode has exceeded the maximum code length. */
-		if (next_inode->node_base.height > max_codeword_len) {
-			/* The code lengths can be made more uniform by making
-			 * the frequencies more uniform.  Divide all the
-			 * frequencies by 2, leaving 1 as the minimum frequency.
-			 * If this keeps happening, the symbol frequencies will
-			 * approach equality, which makes their Huffman
-			 * codewords approach the length
-			 * log_2(num_used_symbols).
-			 * */
-			for (unsigned i = 0; i < num_used_symbols; i++)
-				leaves[i].freq = (leaves[i].freq + 1) >> 1;
-
-			goto try_building_tree_again;
+	/* Set the root node's depth to 0.  */
+	A[root_idx] &= SYMBOL_MASK;
+
+	for (int node = root_idx - 1; node >= 0; node--) {
+
+		/* Calculate the depth of this node.  */
+
+		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
+		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
+		unsigned depth = parent_depth + 1;
+		unsigned len = depth;
+
+		/* Set the depth of this node so that it is available
+		 * when its children (if any) are processed.  */
+
+		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
+
+		/* If needed, decrease the length to meet the
+		 * length-limited constraint.  This is not the optimal
+		 * method for generating length-limited Huffman codes!
+		 * But it should be good enough.  */
+		if (len >= max_codeword_len) {
+			len = max_codeword_len;
+			do {
+				len--;
+			} while (len_counts[len] == 0);
 		}
-		next_inode++;
+
+		/* Account for the fact that we have a non-leaf node at
+		 * the current depth.  */
+		len_counts[len]--;
+		len_counts[len + 1] += 2;
 	}
+}
 
-	/* The Huffman tree is now complete, and its height is no more than
-	 * max_codeword_len.  */
+/*
+ * Generate the codewords for a canonical Huffman code.
+ *
+ * @A
+ *	The output array for codewords.  In addition, initially this
+ *	array must contain the symbols, sorted primarily by frequency and
+ *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
+ *	each entry.
+ *
+ * @len
+ *	Output array for codeword lengths.
+ *
+ * @len_counts
+ *	An array that provides the number of codewords that will have
+ *	each possible length <= max_codeword_len.
+ *
+ * @max_codeword_len
+ *	Maximum length, in bits, of each codeword.
+ *
+ * @num_syms
+ *	Number of symbols in the alphabet, including symbols with zero
+ *	frequency.  This is the length of the 'A' and 'len' arrays.
+ */
+static void
+gen_codewords(u32 A[restrict], u8 lens[restrict],
+	      const unsigned len_counts[restrict],
+	      unsigned max_codeword_len, unsigned num_syms)
+{
+	u32 next_codewords[max_codeword_len + 1];
+
+	/* Given the number of codewords that will have each length,
+	 * assign codeword lengths to symbols.  We do this by assigning
+	 * the lengths in decreasing order to the symbols sorted
+	 * primarily by increasing frequency and secondarily by
+	 * increasing symbol value.  */
+	for (unsigned i = 0, len = max_codeword_len; len >= 1; len--) {
+		unsigned count = len_counts[len];
+		while (count--)
+			lens[A[i++] & SYMBOL_MASK] = len;
+	}
 
-	HuffmanIntermediateNode *root = next_inode - 1;
-	wimlib_assert(root->node_base.height <= max_codeword_len);
+	/* Generate the codewords themselves.  We initialize the
+	 * 'next_codewords' array to provide the lexicographically first
+	 * codeword of each length, then assign codewords in symbol
+	 * order.  This produces a canonical code.  */
+	next_codewords[0] = 0;
+	next_codewords[1] = 0;
+	for (unsigned len = 2; len <= max_codeword_len; len++)
+		next_codewords[len] =
+			(next_codewords[len - 1] + len_counts[len - 1]) << 1;
+
+	for (unsigned sym = 0; sym < num_syms; sym++)
+		A[sym] = next_codewords[lens[sym]]++;
+}
 
-	/* Compute the path lengths for the leaf nodes. */
-	huffman_tree_compute_path_lengths(&root->node_base, 0);
+/*
+ * ---------------------------------------------------------------------
+ *			make_canonical_huffman_code()
+ * ---------------------------------------------------------------------
+ *
+ * Given an alphabet and the frequency of each symbol in it, construct a
+ * length-limited canonical Huffman code.
+ *
+ * @num_syms
+ *	The number of symbols in the alphabet.  The symbols are the
+ *	integers in the range [0, num_syms - 1].  This parameter must be
+ *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
+ *
+ * @max_codeword_len
+ *	The maximum permissible codeword length.
+ *
+ * @freqs
+ *	An array of @num_syms entries, each of which specifies the
+ *	frequency of the corresponding symbol.  It is valid for some,
+ *	none, or all of the frequencies to be 0.
+ *
+ * @lens
+ *	An array of @num_syms entries in which this function will return
+ *	the length, in bits, of the codeword assigned to each symbol.
+ *	Symbols with 0 frequency will not have codewords per se, but
+ *	their entries in this array will be set to 0.  No lengths greater
+ *	than @max_codeword_len will be assigned.
+ *
+ * @codewords
+ *	An array of @num_syms entries in which this function will return
+ *	the codeword for each symbol, right-justified and padded on the
+ *	left with zeroes.  Codewords for symbols with 0 frequency will be
+ *	undefined.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * This function builds a length-limited canonical Huffman code.
+ *
+ * A length-limited Huffman code contains no codewords longer than some
+ * specified length, and has exactly (with some algorithms) or
+ * approximately (with the algorithm used here) the minimum weighted path
+ * length from the root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer
+ * codeword never lexicographically precedes a shorter codeword, and the
+ * lexicographic ordering of codewords of the same length is the same as
+ * the lexicographic ordering of the corresponding symbols.  A canonical
+ * Huffman code, or more generally a canonical prefix code, can be
+ * reconstructed from only a list containing the codeword length of each
+ * symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for
+ * each symbol, then inserts these nodes into a min-heap keyed by symbol
+ * frequency.  Then, repeatedly, the two lowest-frequency nodes are
+ * removed from the min-heap and added as the children of a new node
+ * having frequency equal to the sum of its two children, which is then
+ * inserted into the min-heap.  When only a single node remains in the
+ * min-heap, it is the root of the Huffman tree.  The codeword for each
+ * symbol is determined by the path needed to reach the corresponding
+ * node from the root.  Descending to the left child appends a 0 bit,
+ * whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is
+ * subject to a number of inefficiencies.  In practice, it is fastest to
+ * first sort the symbols by frequency.  (This itself can be subject to
+ * an optimization based on the fact that most frequencies tend to be
+ * low.)  At the same time, we sort secondarily by symbol value, which
+ * aids the process of generating a canonical code.  Then, during tree
+ * construction, no heap is necessary because both the leaf nodes and the
+ * unparented non-leaf nodes can be easily maintained in sorted order.
+ * Consequently, there can never be more than two possibilities for the
+ * next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually
+ * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
+ * This is because for canonical code generation we don't need to know
+ * where the symbols are in the tree.  Rather, we only need to know how
+ * many leaf nodes have each depth (codeword length).  And this
+ * information can, in fact, be quickly generated from the tree of
+ * non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in
+ * the array in which the codewords are to be generated, provided that
+ * these array slots are large enough to hold a symbol and frequency
+ * value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child
+ * pointers.  We only need the parent pointers, and even those can be
+ * overwritten in-place with depth information as part of the process of
+ * extracting codeword lengths from the tree.  So in summary, we do NOT
+ * need a big structure like:
+ *
+ *	struct huffman_tree_node {
+ *		unsigned int symbol;
+ *		unsigned int frequency;
+ *		unsigned int depth;
+ *		struct huffman_tree_node *left_child;
+ *		struct huffman_tree_node *right_child;
+ *	};
+ *
+ *
+ *   ... which often gets used in "naive" implementations of Huffman code
+ *   generation.
+ *
+ * Most of these optimizations are based on the implementation in 7-Zip
+ * (source file: C/HuffEnc.c), which has been placed in the public domain
+ * by Igor Pavlov.  But I've rewritten the code with extensive comments,
+ * as it took me a while to figure out what it was doing...!
+ *
+ * ---------------------------------------------------------------------
+ *
+ * NOTE: in general, the same frequencies can be used to generate
+ * different length-limited canonical Huffman codes.  One choice we have
+ * is during tree construction, when we must decide whether to prefer a
+ * leaf or non-leaf when there is a tie in frequency.  Another choice we
+ * have is how to deal with codewords that would exceed @max_codeword_len
+ * bits in length.  Both of these choices affect the resulting codeword
+ * lengths, which otherwise can be mapped uniquely onto the resulting
+ * canonical Huffman code.
+ *
+ * Normally, there is no problem with choosing one valid code over
+ * another, provided that they produce similar compression ratios.
+ * However, the LZMS compression format uses adaptive Huffman coding.  It
+ * requires that both the decompressor and compressor build a canonical
+ * code equivalent to that which can be generated by using the classic
+ * Huffman tree construction algorithm and always processing leaves
+ * before non-leaves when there is a frequency tie.  Therefore, we make
+ * sure to do this.  This method also has the advantage of sometimes
+ * shortening the longest codeword that is generated.
+ *
+ * There also is the issue of how codewords longer than @max_codeword_len
+ * are dealt with.  Fortunately, for LZMS this is irrelevant because
+ * because for the LZMS alphabets no codeword can ever exceed
+ * LZMS_MAX_CODEWORD_LEN (= 15).  Since the LZMS algorithm regularly
+ * halves all frequencies, the frequencies cannot become high enough for
+ * a length 16 codeword to be generated.  Specifically, I think that if
+ * ties are broken in favor of non-leaves (as we do), the lowest total
+ * frequency that would give a length-16 codeword would be the sum of the
+ * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which
+ * is 3570.  And in LZMS we can't get a frequency that high based on the
+ * alphabet sizes, rebuild frequencies, and scaling factors.  This
+ * worst-case scenario is based on the following degenerate case (only
+ * the bottom of the tree shown):
+ *
+ *                          ...
+ *                        17
+ *                       /  \
+ *                      10   7
+ *                     / \
+ *                    6   4
+ *                   / \
+ *                  3   3
+ *                 / \
+ *                2   1
+ *               / \
+ *              1   1
+ *
+ * Excluding the first leaves (those with value 1), each leaf value must
+ * be greater than the non-leaf up 1 and down 2 from it; otherwise that
+ * leaf would have taken precedence over that non-leaf and been combined
+ * with the leaf below, thereby decreasing the height compared to that
+ * shown.
+ *
+ * Interesting fact: if we were to instead prioritize non-leaves over
+ * leaves, then the worst case frequencies would be the Fibonacci
+ * sequence, plus an extra frequency of 1.  In this hypothetical
+ * scenario, it would be slightly easier for longer codewords to be
+ * generated.
+ */
+void
+make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
+			    const u32 freqs[restrict],
+			    u8 lens[restrict], u32 codewords[restrict])
+{
+	u32 *A = codewords;
+	unsigned num_used_syms;
+
+	/* Assumptions  */
+	wimlib_assert2(num_syms >= 2);
+	wimlib_assert2(num_syms <= (1 << NUM_SYMBOL_BITS));
+	wimlib_assert2(max_codeword_len > 0);
+	wimlib_assert2(max_codeword_len <= 32);
+
+	/* We begin by sorting the symbols primarily by frequency and
+	 * secondarily by symbol value.  As an optimization, the array
+	 * used for this purpose ('A') shares storage with the space in
+	 * which we will eventually return the codewords.  */
+
+	num_used_syms = sort_symbols(num_syms, freqs, lens, A);
+
+	/* 'num_used_syms' is the number of symbols with nonzero
+	 * frequency.  This may be less than @num_syms.  'num_used_syms'
+	 * is also the number of entries in 'A' that are valid.  Each
+	 * entry consists of a distinct symbol and a nonzero frequency
+	 * packed into a 32-bit integer.  */
+
+	/* Handle special cases where only 0 or 1 symbols were used (had
+	 * nonzero frequency).  */
+
+	if (unlikely(num_used_syms == 0)) {
+		/* Code is empty.  sort_symbols() already set all lengths
+		 * to 0, so there is nothing more to do.  */
+		return;
+	}
 
-	/* Sort the leaf nodes primarily by code length and secondarily by
-	 * symbol.  */
-	qsort(leaves, num_used_symbols, sizeof(leaves[0]), cmp_nodes_by_code_len);
+	if (unlikely(num_used_syms == 1)) {
+		/* Only one symbol was used, so we only need one
+		 * codeword.  But two codewords are needed to form the
+		 * smallest complete Huffman code, which uses codewords 0
+		 * and 1.  Therefore, we choose another symbol to which
+		 * to assign a codeword.  We use 0 (if the used symbol is
+		 * not 0) or 1 (if the used symbol is 0).  In either
+		 * case, the lesser-valued symbol must be assigned
+		 * codeword 0 so that the resulting code is canonical.  */
+
+		unsigned sym = A[0] & SYMBOL_MASK;
+		unsigned nonzero_idx = sym ? sym : 1;
+
+		codewords[0] = 0;
+		lens[0] = 1;
+		codewords[nonzero_idx] = 1;
+		lens[nonzero_idx] = 1;
+		return;
+	}
 
-	u16 cur_codeword = 0;
-	unsigned cur_codeword_len = 0;
-	for (unsigned i = 0; i < num_used_symbols; i++) {
+	/* Build a stripped-down version of the Huffman tree, sharing the
+	 * array 'A' with the symbol values.  Then extract length counts
+	 * from the tree and use them to generate the final codewords.  */
 
-		/* Each time a codeword becomes one longer, the current codeword
-		 * is left shifted by one place.  This is part of the procedure
-		 * for enumerating the canonical Huffman code.  Additionally,
-		 * whenever a codeword is used, 1 is added to the current
-		 * codeword.  */
+	build_tree(A, num_used_syms);
 
-		unsigned len_diff = leaves[i].path_len - cur_codeword_len;
-		cur_codeword <<= len_diff;
-		cur_codeword_len += len_diff;
+	{
+		unsigned len_counts[max_codeword_len + 1];
 
-		u16 sym = leaves[i].sym;
-		codewords[sym] = cur_codeword;
-		lens[sym] = cur_codeword_len;
+		compute_length_counts(A, num_used_syms - 2,
+				      len_counts, max_codeword_len);
 
-		cur_codeword++;
+		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
 	}
 }