From 67c5f2c5954d07c25f57b6d9aedb73fb3720ec0b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Tue, 27 May 2014 16:51:24 -0500
Subject: [PATCH] Faster Huffman symbol decoding

When decoding a codeword short enough for a direct mapping, we can read
the codeword length at the same time we read the symbol itself.  This
speeds up Huffman decoding slightly.

This commit also updates and improves the comments for
make_huffman_decode_table().
---
 include/wimlib/decompress_common.h |  81 ++++--
 src/decompress_common.c            | 417 ++++++++++++++++-------------
 src/lzms-decompress.c              |  41 +--
 src/lzx-decompress.c               |  16 +-
 src/xpress-decompress.c            |   2 +-
 5 files changed, 319 insertions(+), 238 deletions(-)

diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h
index 3b20f4e0..40fa49ab 100644
--- a/include/wimlib/decompress_common.h
+++ b/include/wimlib/decompress_common.h
@@ -129,48 +129,75 @@ bitstream_read_byte(struct input_bitstream *istream)
 	return *istream->data++;
 }
 
+
+/* Needed alignment of decode_table parameter to make_huffman_decode_table().
+ *
+ * Reason: We may fill the entries with SSE instructions without worrying
+ * about dealing with the unaligned case.  */
+#define DECODE_TABLE_ALIGNMENT 16
+
+/* Maximum supported symbol count for make_huffman_decode_table().
+ *
+ * Reason: In direct mapping entries, we store the symbol in 11 bits.  */
+#define DECODE_TABLE_MAX_SYMBOLS 2048
+
+/* Maximum supported table bits for make_huffman_decode_table().
+ *
+ * Reason: In internal binary tree nodes, offsets are encoded in 14 bits.
+ * But the real limit is 13, because we allocate entries past the end of
+ * the direct lookup part of the table for binary tree nodes.  (Note: if
+ * needed this limit could be removed by encoding the offsets relative to
+ * &decode_table[1 << table_bits].)  */
+#define DECODE_TABLE_MAX_TABLE_BITS 13
+
+/* Maximum supported codeword length for make_huffman_decode_table().
+ *
+ * Reason: In direct mapping entries, we encode the codeword length in 5
+ * bits, and the top 2 bits can't both be set because that has special
+ * meaning.  */
+#define DECODE_TABLE_MAX_CODEWORD_LEN 23
+
 /* Reads and returns the next Huffman-encoded symbol from a bitstream.  If the
  * input data is exhausted, the Huffman symbol is decoded as if the missing bits
- * are all zeroes.  */
+ * are all zeroes.
+ *
+ * XXX: This is mostly duplicated in lzms_huffman_decode_symbol() in
+ * lzms-decompress.c.  */
 static inline u16
-read_huffsym(struct input_bitstream * restrict istream,
-	     const u16 decode_table[restrict],
-	     const u8 lens[restrict],
-	     unsigned num_syms,
-	     unsigned table_bits,
-	     unsigned max_codeword_len)
+read_huffsym(struct input_bitstream *istream, const u16 decode_table[],
+	     unsigned num_syms, unsigned table_bits, unsigned max_codeword_len)
 {
+	u16 entry;
+	u16 key_bits;
 
 	bitstream_ensure_bits(istream, max_codeword_len);
 
-	/* Use the next table_bits of the input as an index into the
-	 * decode_table.  */
-	u16 key_bits = bitstream_peek_bits(istream, table_bits);
-
-	u16 sym = decode_table[key_bits];
-
-	if (likely(sym < num_syms)) {
-		/* Fast case: The decode table directly provided the symbol.  */
-		bitstream_remove_bits(istream, lens[sym]);
+	/* Index the decode table by the next table_bits bits of the input.  */
+	key_bits = bitstream_peek_bits(istream, table_bits);
+	entry = decode_table[key_bits];
+	if (likely(entry < 0xC000)) {
+		/* Fast case: The decode table directly provided the
+		 * symbol and codeword length.  The low 11 bits are the
+		 * symbol, and the high 5 bits are the codeword length.  */
+		bitstream_remove_bits(istream, entry >> 11);
+		return entry & 0x7FF;
 	} else {
-		/* Slow case: The symbol took too many bits to include directly
-		 * in the decode table, so search for it in a binary tree at the
-		 * end of the decode table.  */
+		/* Slow case: The codeword for the symbol is longer than
+		 * table_bits, so the symbol does not have an entry
+		 * directly in the first (1 << table_bits) entries of the
+		 * decode table.  Traverse the appropriate binary tree
+		 * bit-by-bit to decode the symbol.  */
 		bitstream_remove_bits(istream, table_bits);
 		do {
-			key_bits = sym + bitstream_pop_bits(istream, 1);
-		} while ((sym = decode_table[key_bits]) >= num_syms);
+			key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1);
+		} while ((entry = decode_table[key_bits]) >= 0xC000);
+		return entry;
 	}
-	return sym;
 }
 
 extern int
 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
-			  unsigned num_bits, const u8 lengths[],
+			  unsigned num_bits, const u8 lens[],
 			  unsigned max_codeword_len);
 
-/* Minimum alignment for the decode_table parameter to
- * make_huffman_decode_table().  */
-#define DECODE_TABLE_ALIGNMENT 16
-
 #endif /* _WIMLIB_DECOMPRESS_H */
diff --git a/src/decompress_common.c b/src/decompress_common.c
index ee85bc96..9743c7d0 100644
--- a/src/decompress_common.c
+++ b/src/decompress_common.c
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (C) 2012, 2013 Eric Biggers
+ * Copyright (C) 2012, 2013, 2014 Eric Biggers
  *
  * This file is part of wimlib, a library for working with WIM files.
  *
@@ -42,87 +42,119 @@
 #  endif
 #endif
 
+/* Construct a direct mapping entry in the lookup table.  */
+#define MAKE_DIRECT_ENTRY(symbol, length) ((symbol) | ((length) << 11))
+
 /*
- * make_huffman_decode_table: - Builds a fast huffman decoding table from an
- * array that gives the length of the codeword for each symbol in the alphabet.
- * Originally based on code written by David Tritscher (taken the original LZX
- * decompression code); also heavily modified to add some optimizations used in
- * the zlib code, as well as more comments; also added some optimizations to
- * make filling in the decode table entries faster (may not help significantly
- * though).
+ * make_huffman_decode_table() -
+ *
+ * Build a decoding table for a canonical prefix code, or "Huffman code".
+ *
+ * This takes as input the length of the codeword for each symbol in the
+ * alphabet and produces as output a table that can be used for fast
+ * decoding of prefix-encoded symbols using read_huffsym().
+ *
+ * Strictly speaking, a canonical prefix code might not be a Huffman
+ * code.  But this algorithm will work either way; and in fact, since
+ * Huffman codes are defined in terms of symbol frequencies, there is no
+ * way for the decompressor to know whether the code is a true Huffman
+ * code or not until all symbols have been decoded.
  *
- * @decode_table:	The array in which to create the fast huffman decoding
- *			table.  It must have a length of at least
- *			(2**table_bits) + 2 * num_syms to guarantee
- *			that there is enough space.  Also must be 16-byte
- *			aligned (at least when USE_SSE2_FILL gets defined).
+ * Because the prefix code is assumed to be "canonical", it can be
+ * reconstructed directly from the codeword lengths.  A prefix code is
+ * canonical if and only if a longer codeword never lexicographically
+ * precedes a shorter codeword, and the lexicographic ordering of
+ * codewords of the same length is the same as the lexicographic ordering
+ * of the corresponding symbols.  Consequently, we can sort the symbols
+ * primarily by codeword length and secondarily by symbol value, then
+ * reconstruct the prefix code by generating codewords lexicographically
+ * in that order.
  *
- * @num_syms:		Number of symbols in the alphabet, including symbols
- *			that do not appear in this particular input chunk.
+ * This function does not, however, generate the prefix code explicitly.
+ * Instead, it directly builds a table for decoding symbols using the
+ * code.  The basic idea is this: given the next 'max_codeword_len' bits
+ * in the input, we can look up the decoded symbol by indexing a table
+ * containing 2**max_codeword_len entries.  A codeword with length
+ * 'max_codeword_len' will have exactly one entry in this table, whereas
+ * a codeword shorter than 'max_codeword_len' will have multiple entries
+ * in this table.  Precisely, a codeword of length n will be represented
+ * by 2**(max_codeword_len - n) entries in this table.  The 0-based index
+ * of each such entry will contain the corresponding codeword as a prefix
+ * when zero-padded on the left to 'max_codeword_len' binary digits.
  *
- * @table_bits:		Any symbols with a code length of table_bits or less can
- *			be decoded in one lookup of the table.  2**table_bits
- *			must be greater than or equal to @num_syms if there are
- *			any Huffman codes longer than @table_bits.
+ * That's the basic idea, but we implement two optimizations regarding
+ * the format of the decode table itself:
  *
- * @lens:		An array of length @num_syms, indexable by symbol, that
- *			gives the length of the Huffman codeword for that
- *			symbol.  Because the Huffman tree is in canonical form,
- *			it can be reconstructed by only knowing the length of
- *			the codeword for each symbol.  It is assumed, but not
- *			checked, that every length is less than
- *			@max_codeword_len.
+ * - For many compression formats, the maximum codeword length is too
+ *   long for it to be efficient to build the full decoding table
+ *   whenever a new prefix code is used.  Instead, we can build the table
+ *   using only 2**table_bits entries, where 'table_bits' is some number
+ *   less than or equal to 'max_codeword_len'.  Then, only codewords of
+ *   length 'table_bits' and shorter can be directly looked up.  For
+ *   longer codewords, the direct lookup instead produces the root of a
+ *   binary tree.  Using this tree, the decoder can do traditional
+ *   bit-by-bit decoding of the remainder of the codeword.  Child nodes
+ *   are allocated in extra entries at the end of the table; leaf nodes
+ *   contain symbols.  Note that the long-codeword case is, in general,
+ *   not performance critical, since in Huffman codes the most frequently
+ *   used symbols are assigned the shortest codeword lengths.
  *
- * @max_codeword_len:	The longest codeword length allowed in the compression
- *			format.
+ * - When we decode a symbol using a direct lookup of the table, we still
+ *   need to know its length so that the bitstream can be advanced by the
+ *   appropriate number of bits.  The simple solution is to simply retain
+ *   the 'lens' array and use the decoded symbol as an index into it.
+ *   However, this requires two separate array accesses in the fast path.
+ *   The optimization is to store the length directly in the decode
+ *   table.  We use the bottom 11 bits for the symbol and the top 5 bits
+ *   for the length.  In addition, to combine this optimization with the
+ *   previous one, we introduce a special case where the top 2 bits of
+ *   the length are both set if the entry is actually the root of a
+ *   binary tree.
  *
- * Returns 0 on success; returns -1 if the length values do not correspond to a
- * valid Huffman tree.
+ * @decode_table:
+ *	The array in which to create the decoding table.
+ *	This must be 16-byte aligned and must have a length of at least
+ *	((2**table_bits) + 2 * num_syms) entries.
  *
- * The format of the Huffamn decoding table is as follows.  The first (1 <<
- * table_bits) entries of the table are indexed by chunks of the input of size
- * @table_bits.  If the next Huffman codeword in the input happens to have a
- * length of exactly @table_bits, the symbol is simply read directly from the
- * decoding table.  Alternatively, if the next Huffman codeword has length _less
- * than_ @table_bits, the symbol is also read directly from the decode table;
- * this is possible because every entry in the table that is indexed by an
- * integer that has the shorter codeword as a binary prefix is filled in with
- * the appropriate symbol.  If a codeword has length n <= table_bits, it will
- * have 2**(table_bits - n) possible suffixes, and thus that many entries in the
- * decoding table.
+ * @num_syms:
+ *	The number of symbols in the alphabet; also, the length of the
+ *	'lens' array.  Must be less than or equal to
+ *	DECODE_TABLE_MAX_SYMBOLS.
  *
- * It's a bit more complicated if the next Huffman codeword has length of more
- * than @table_bits.  The table entry indexed by the first @table_bits of that
- * codeword cannot give the appropriate symbol directly, because that entry is
- * guaranteed to be referenced by the Huffman codewords of multiple symbols.
- * And while the LZX compression format does not allow codes longer than 16
- * bits, a table of size (2 ** 16) = 65536 entries would be too slow to create.
+ * @table_bits:
+ *	The order of the decode table size, as explained above.  Must be
+ *	less than or equal to DECODE_TABLE_MAX_TABLE_BITS.
  *
- * There are several different ways to make it possible to look up the symbols
- * for codewords longer than @table_bits.  One way is to make the entries for
- * the prefixes of length @table_bits of those entries be pointers to additional
- * decoding tables that are indexed by some number of additional bits of the
- * codeword.  The technique used here is a bit simpler, however: just store the
- * needed subtrees of the Huffman tree in the decoding table after the lookup
- * entries, beginning at index (2**table_bits).  Real pointers are replaced by
- * indices into the decoding table, and symbol entries are distinguished from
- * pointers by the fact that values less than @num_syms must be symbol values.
+ * @lens:
+ *	An array of length @num_syms, indexable by symbol, that gives the
+ *	length of the codeword, in bits, for that symbol.  The length can
+ *	be 0, which means that the symbol does not have a codeword
+ *	assigned.
+ *
+ * @max_codeword_len:
+ *	The longest codeword length allowed in the compression format.
+ *	All entries in 'lens' must be less than or equal to this value.
+ *	This must be less than or equal to DECODE_TABLE_MAX_CODEWORD_LEN.
+ *
+ * Returns 0 on success, or -1 if the lengths do not form a valid prefix
+ * code.
  */
 int
-make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
-			  unsigned table_bits, const u8 *lens,
-			  unsigned max_codeword_len)
+make_huffman_decode_table(u16 decode_table[const restrict],
+			  const unsigned num_syms,
+			  const unsigned table_bits,
+			  const u8 lens[const restrict],
+			  const unsigned max_codeword_len)
 {
+	const unsigned table_num_entries = 1 << table_bits;
 	unsigned len_counts[max_codeword_len + 1];
 	u16 sorted_syms[num_syms];
-	unsigned offsets[max_codeword_len + 1];
-	const unsigned table_num_entries = 1 << table_bits;
 	int left;
-	unsigned decode_table_pos;
 	void *decode_table_ptr;
 	unsigned sym_idx;
 	unsigned codeword_len;
 	unsigned stores_per_loop;
+	unsigned decode_table_pos;
 
 #ifdef USE_LONG_FILL
 	const unsigned entries_per_long = sizeof(unsigned long) / sizeof(decode_table[0]);
@@ -132,64 +164,96 @@ make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
 	const unsigned entries_per_xmm = sizeof(__m128i) / sizeof(decode_table[0]);
 #endif
 
+	/* Check parameters if assertions are enabled.  */
 	wimlib_assert2((uintptr_t)decode_table % DECODE_TABLE_ALIGNMENT == 0);
-
-	/* accumulate lengths for codes */
-	for (unsigned i = 0; i <= max_codeword_len; i++)
-		len_counts[i] = 0;
-
-	for (unsigned sym = 0; sym < num_syms; sym++) {
+	wimlib_assert2(num_syms <= DECODE_TABLE_MAX_SYMBOLS);
+	wimlib_assert2(table_bits <= DECODE_TABLE_MAX_TABLE_BITS);
+	wimlib_assert2(max_codeword_len <= DECODE_TABLE_MAX_CODEWORD_LEN);
+	for (unsigned sym = 0; sym < num_syms; sym++)
 		wimlib_assert2(lens[sym] <= max_codeword_len);
+
+	/* Count how many symbols have each possible codeword length.
+	 * Note that a length of 0 indicates the corresponding symbol is not
+	 * used in the code and therefore does not have a codeword.  */
+	for (unsigned len = 0; len <= max_codeword_len; len++)
+		len_counts[len] = 0;
+	for (unsigned sym = 0; sym < num_syms; sym++)
 		len_counts[lens[sym]]++;
-	}
 
-	/* check for an over-subscribed or incomplete set of lengths */
+	/* We can assume all lengths are <= max_codeword_len, but we
+	 * cannot assume they form a valid prefix code.  A codeword of
+	 * length n should require a proportion of the codespace equaling
+	 * (1/2)^n.  The code is valid if and only if the codespace is
+	 * exactly filled by the lengths, by this measure.  */
 	left = 1;
 	for (unsigned len = 1; len <= max_codeword_len; len++) {
 		left <<= 1;
 		left -= len_counts[len];
-		if (unlikely(left < 0)) { /* over-subscribed */
-			DEBUG("Invalid Huffman code (over-subscribed)");
+		if (unlikely(left < 0)) {
+			/* The lengths overflow the codespace; that is, the code
+			 * is over-subscribed.  */
+			DEBUG("Invalid prefix code (over-subscribed)");
 			return -1;
 		}
 	}
 
-	if (unlikely(left != 0)) /* incomplete set */{
-		if (left == 1 << max_codeword_len) {
-			/* Empty code--- okay in XPRESS and LZX */
+	if (unlikely(left != 0)) {
+		/* The lengths do not fill the codespace; that is, they form an
+		 * incomplete set.  */
+		if (left == (1 << max_codeword_len)) {
+			/* The code is completely empty.  This is arguably
+			 * invalid, but in fact it is valid in LZX and XPRESS,
+			 * so we must allow it.  By definition, no symbols can
+			 * be decoded with an empty code.  Consequently, we
+			 * technically don't even need to fill in the decode
+			 * table.  However, to avoid accessing uninitialized
+			 * memory if the algorithm nevertheless attempts to
+			 * decode symbols using such a code, we zero out the
+			 * decode table.  */
 			memset(decode_table, 0,
 			       table_num_entries * sizeof(decode_table[0]));
 			return 0;
-		} else {
-			DEBUG("Invalid Huffman code (incomplete set)");
-			return -1;
 		}
+		DEBUG("Invalid prefix code (incomplete set)");
+		return -1;
 	}
 
-	/* Generate offsets into symbol table for each length for sorting */
-	offsets[1] = 0;
-	for (unsigned len = 1; len < max_codeword_len; len++)
-		offsets[len + 1] = offsets[len] + len_counts[len];
+	/* Sort the symbols primarily by length and secondarily by symbol order.
+	 */
+	{
+		unsigned offsets[max_codeword_len + 1];
+
+		/* Initialize 'offsets' so that offsets[len] for 1 <= len <=
+		 * max_codeword_len is the number of codewords shorter than
+		 * 'len' bits.  */
+		offsets[1] = 0;
+		for (unsigned len = 1; len < max_codeword_len; len++)
+			offsets[len + 1] = offsets[len] + len_counts[len];
+
+		/* Use the 'offsets' array to sort the symbols.
+		 * Note that we do not include symbols that are not used in the
+		 * code.  Consequently, fewer than 'num_syms' entries in
+		 * 'sorted_syms' may be filled.  */
+		for (unsigned sym = 0; sym < num_syms; sym++)
+			if (lens[sym] != 0)
+				sorted_syms[offsets[lens[sym]]++] = sym;
+	}
 
-	/* Sort symbols primarily by length and secondarily by symbol order.
-	 * This is basically a count-sort over the codeword lengths. */
-	for (unsigned sym = 0; sym < num_syms; sym++)
-		if (lens[sym] != 0)
-			sorted_syms[offsets[lens[sym]]++] = sym;
-
-	/* Fill entries for codewords short enough for a direct mapping.  We can
-	 * take advantage of the ordering of the codewords, since the Huffman
-	 * code is canonical.  It must be the case that all the codewords of
-	 * some length L numerically precede all the codewords of length L + 1.
-	 * Furthermore, if we have 2 symbols A and B with the same codeword
-	 * length but symbol A is sorted before symbol B, then then we know that
-	 * the codeword for A numerically precedes the codeword for B. */
+	/* Fill entries for codewords with length <= table_bits
+	 * --- that is, those short enough for a direct mapping.
+	 *
+	 * The table will start with entries for the shortest codeword(s), which
+	 * have the most entries.  From there, the number of entries per
+	 * codeword will decrease.  As an optimization, we may begin filling
+	 * entries with SSE2 vector accesses (8 entries/store), then change to
+	 * 'unsigned long' accesses (2 or 4 entries/store), then change to
+	 * 16-bit accesses (1 entry/store).  */
 	decode_table_ptr = decode_table;
 	sym_idx = 0;
 	codeword_len = 1;
 #ifdef USE_SSE2_FILL
-	/* Fill in the Huffman decode table entries one 128-bit vector at a
-	 * time.  This is 8 entries per store. */
+	/* Fill the entries one 128-bit vector at a time.
+	 * This is 8 entries per store.  */
 	stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_xmm;
 	for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
 		unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
@@ -197,15 +261,16 @@ make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
 			/* Note: unlike in the 'long' version below, the __m128i
 			 * type already has __attribute__((may_alias)), so using
 			 * it to access the decode table, which is an array of
-			 * unsigned shorts, will not violate strict aliasing. */
-			u16 sym;
+			 * unsigned shorts, will not violate strict aliasing.
+			 */
+			u16 entry;
 			__m128i v;
 			__m128i *p;
 			unsigned n;
 
-			sym = sorted_syms[sym_idx];
+			entry = MAKE_DIRECT_ENTRY(sorted_syms[sym_idx], codeword_len);
 
-			v = _mm_set1_epi16(sym);
+			v = _mm_set1_epi16(entry);
 			p = (__m128i*)decode_table_ptr;
 			n = stores_per_loop;
 			do {
@@ -217,9 +282,9 @@ make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
 #endif /* USE_SSE2_FILL */
 
 #ifdef USE_LONG_FILL
-	/* Fill in the Huffman decode table entries one 'unsigned long' at a
-	 * time.  On 32-bit systems this is 2 entries per store, while on 64-bit
-	 * systems this is 4 entries per store. */
+	/* Fill the entries one 'unsigned long' at a time.
+	 * On 32-bit systems this is 2 entries per store, while on 64-bit
+	 * systems this is 4 entries per store.  */
 	stores_per_loop = (1 << (table_bits - codeword_len)) / entries_per_long;
 	for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
 		unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
@@ -234,24 +299,20 @@ make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
 			 * variable.  */
 			typedef unsigned long __attribute__((may_alias)) aliased_long_t;
 
-			u16 sym;
+			unsigned long v;
 			aliased_long_t *p;
-			aliased_long_t v;
 			unsigned n;
 
-			sym = sorted_syms[sym_idx];
-
-			BUILD_BUG_ON(sizeof(aliased_long_t) != 4 &&
-				     sizeof(aliased_long_t) != 8);
+			BUILD_BUG_ON(sizeof(unsigned long) != 4 &&
+				     sizeof(unsigned long) != 8);
 
-			v = sym;
-			if (sizeof(aliased_long_t) >= 4)
-				v |= v << 16;
-			if (sizeof(aliased_long_t) >= 8) {
+			v = MAKE_DIRECT_ENTRY(sorted_syms[sym_idx], codeword_len);
+			v |= v << 16;
+			if (sizeof(unsigned long) == 8) {
 				/* This may produce a compiler warning if an
-				 * aliased_long_t is 32 bits, but this won't be
-				 * executed unless an aliased_long_t is at least
-				 * 64 bits anyway. */
+				 * 'unsigned long' is 32 bits, but this won't be
+				 * executed unless an 'unsigned long' is at
+				 * least 64 bits anyway.  */
 				v |= v << 32;
 			}
 
@@ -266,33 +327,31 @@ make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
 	}
 #endif /* USE_LONG_FILL */
 
-	/* Fill in the Huffman decode table entries one 16-bit integer at a
-	 * time. */
+	/* Fill the entries one 16-bit integer at a time.  */
 	stores_per_loop = (1 << (table_bits - codeword_len));
 	for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
 		unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
 		for (; sym_idx < end_sym_idx; sym_idx++) {
-			u16 sym;
+			u16 entry;
 			u16 *p;
 			unsigned n;
 
-			sym = sorted_syms[sym_idx];
+			entry = MAKE_DIRECT_ENTRY(sorted_syms[sym_idx], codeword_len);
 
 			p = (u16*)decode_table_ptr;
 			n = stores_per_loop;
 
 			do {
-				*p++ = sym;
+				*p++ = entry;
 			} while (--n);
 
 			decode_table_ptr = p;
 		}
 	}
 
-	/* If we've filled in the entire table, we are done.  Otherwise, there
-	 * are codes longer than table bits that we need to store in the
-	 * tree-like structure at the end of the table rather than directly in
-	 * the main decode table itself. */
+	/* If we've filled in the entire table, we are done.  Otherwise,
+	 * there are codes longer than table_bits for which we must
+	 * generate binary trees.  */
 
 	decode_table_pos = (u16*)decode_table_ptr - decode_table;
 	if (decode_table_pos != table_num_entries) {
@@ -300,88 +359,82 @@ make_huffman_decode_table(u16 *decode_table,  unsigned num_syms,
 		unsigned next_free_tree_slot;
 		unsigned cur_codeword;
 
-		wimlib_assert2(decode_table_pos < table_num_entries);
-
-		/* Fill in the remaining entries, which correspond to codes
-		 * longer than @table_bits.
-		 *
-		 * First, zero out the rest of the entries.  This is necessary
-		 * so that the entries appear as "unallocated" in the next part.
-		 * */
+		/* First, zero out the rest of the entries.  This is
+		 * necessary so that the entries appear as "unallocated"
+		 * in the next part.  Each of these entries will
+		 * eventually be filled the representation of the root
+		 * node of a binary tree.  */
 		j = decode_table_pos;
 		do {
 			decode_table[j] = 0;
 		} while (++j != table_num_entries);
 
-		/* Assert that 2**table_bits is at least num_syms.  If this
-		 * wasn't the case, we wouldn't be able to distinguish pointer
-		 * entries from symbol entries. */
-		wimlib_assert2(table_num_entries >= num_syms);
-
-
-		/* The tree nodes are allocated starting at decode_table[1 <<
-		 * table_bits].  Remember that the full size of the table,
-		 * including the extra space for the tree nodes, is actually
-		 * 2**table_bits + 2 * num_syms slots, while table_num_entries
-		 * is only 2**table_bits. */
+		/* We allocate child nodes starting at the end of the
+		 * direct lookup table.  Note that there should be
+		 * 2*num_syms extra entries for this purpose.  */
 		next_free_tree_slot = table_num_entries;
 
-		/* The current Huffman codeword  */
-		cur_codeword = decode_table_pos << 1;
-
-		/* Go through every codeword of length greater than @table_bits,
-		 * primarily in order of codeword length and secondarily in
-		 * order of symbol. */
-		wimlib_assert2(codeword_len == table_bits + 1);
-		for (; codeword_len <= max_codeword_len; codeword_len++, cur_codeword <<= 1)
+		/* Iterate through each codeword with length greater than
+		 * 'table_bits', primarily in order of codeword length
+		 * and secondarily in order of symbol.  */
+		for (cur_codeword = decode_table_pos << 1;
+		     codeword_len <= max_codeword_len;
+		     codeword_len++, cur_codeword <<= 1)
 		{
 			unsigned end_sym_idx = sym_idx + len_counts[codeword_len];
-			for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) {
+			for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++)
+			{
+				/* 'sym' is the symbol represented by the
+				 * codeword.  */
 				unsigned sym = sorted_syms[sym_idx];
+
 				unsigned extra_bits = codeword_len - table_bits;
 
-				/* index of the current node; find it from the
-				 * prefix of the current Huffman codeword. */
 				unsigned node_idx = cur_codeword >> extra_bits;
-				wimlib_assert2(node_idx < table_num_entries);
 
-				/* Go through each bit of the current Huffman
-				 * codeword beyond the prefix of length
-				 * @table_bits and walk the tree, allocating any
-				 * slots that have not yet been allocated. */
+				/* Go through each bit of the current codeword
+				 * beyond the prefix of length @table_bits and
+				 * walk the appropriate binary tree, allocating
+				 * any slots that have not yet been allocated.
+				 *
+				 * Note that the 'pointer' entry to the binary
+				 * tree, which is stored in the direct lookup
+				 * portion of the table, is represented
+				 * identically to other internal (non-leaf)
+				 * nodes of the binary tree; it can be thought
+				 * of as simply the root of the tree.  The
+				 * representation of these internal nodes is
+				 * simply the index of the left child combined
+				 * with the special bits 0xC000 to distingush
+				 * the entry from direct mapping and leaf node
+				 * entries.  */
 				do {
 
-					/* If the current tree node points to
-					 * nowhere but we need to follow it,
-					 * allocate a new node for it to point
-					 * to. */
+					/* At least one bit remains in the
+					 * codeword, but the current node is an
+					 * unallocated leaf.  Change it to an
+					 * internal node.  */
 					if (decode_table[node_idx] == 0) {
-						decode_table[node_idx] = next_free_tree_slot;
+						decode_table[node_idx] =
+							next_free_tree_slot | 0xC000;
 						decode_table[next_free_tree_slot++] = 0;
 						decode_table[next_free_tree_slot++] = 0;
-						wimlib_assert2(next_free_tree_slot <=
-							       table_num_entries + 2 * num_syms);
 					}
 
-					/* Set node_idx to left child */
-					node_idx = decode_table[node_idx];
-
-					/* Is the next bit 0 or 1? If 0, go left
-					 * (already done).  If 1, go right by
-					 * incrementing node_idx. */
+					/* Go to the left child if the next bit
+					 * in the codeword is 0; otherwise go to
+					 * the right child.  */
+					node_idx = decode_table[node_idx] & 0x3FFF;
 					--extra_bits;
 					node_idx += (cur_codeword >> extra_bits) & 1;
 				} while (extra_bits != 0);
 
-				/* node_idx is now the index of the leaf entry
-				 * into which the actual symbol will go. */
+				/* We've traversed the tree using the entire
+				 * codeword, and we're now at the entry where
+				 * the actual symbol will be stored.  This is
+				 * distinguished from internal nodes by not
+				 * having its high two bits set.  */
 				decode_table[node_idx] = sym;
-
-				/* Note: cur_codeword is always incremented at
-				 * the end of this loop because this is how
-				 * canonical Huffman codes are generated (add 1
-				 * for each code, then left shift whenever the
-				 * code length increases) */
 			}
 		}
 	}
diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c
index 1516e364..6cc6dab3 100644
--- a/src/lzms-decompress.c
+++ b/src/lzms-decompress.c
@@ -568,9 +568,11 @@ lzms_rebuild_adaptive_huffman_code(struct lzms_huffman_decoder *dec)
 static u32
 lzms_huffman_decode_symbol(struct lzms_huffman_decoder *dec)
 {
-	const u8 *lens = dec->lens;
 	const u16 *decode_table = dec->decode_table;
 	struct lzms_input_bitstream *is = dec->is;
+	u16 entry;
+	u16 key_bits;
+	u16 sym;
 
 	/* The Huffman codes used in LZMS are adaptive and must be rebuilt
 	 * whenever a certain number of symbols have been read.  Each such
@@ -588,28 +590,31 @@ lzms_huffman_decode_symbol(struct lzms_huffman_decoder *dec)
 		dec->num_syms_read = 0;
 	}
 
-	/* In the following Huffman decoding implementation, the first
-	 * LZMS_DECODE_TABLE_BITS of the input are used as an offset into a
-	 * decode table.  The entry will either provide the decoded symbol
-	 * directly, or else a "real" Huffman binary tree will be searched to
-	 * decode the symbol.  */
-
+	/* XXX: Copied from read_huffsym() (decompress_common.h), since this
+	 * uses a different input bitstream type.  Should unify the
+	 * implementations.  */
 	lzms_input_bitstream_ensure_bits(is, LZMS_MAX_CODEWORD_LEN);
 
-	u16 key_bits = lzms_input_bitstream_peek_bits(is, LZMS_DECODE_TABLE_BITS);
-	u16 sym = decode_table[key_bits];
-
-	if (sym < dec->num_syms) {
-		/* Fast case: The decode table directly provided the symbol.  */
-		lzms_input_bitstream_remove_bits(is, lens[sym]);
+	/* Index the decode table by the next table_bits bits of the input.  */
+	key_bits = lzms_input_bitstream_peek_bits(is, LZMS_DECODE_TABLE_BITS);
+	entry = decode_table[key_bits];
+	if (likely(entry < 0xC000)) {
+		/* Fast case: The decode table directly provided the symbol and
+		 * codeword length.  The low 11 bits are the symbol, and the
+		 * high 5 bits are the codeword length.  */
+		lzms_input_bitstream_remove_bits(is, entry >> 11);
+		sym = entry & 0x7FF;
 	} else {
-		/* Slow case: The symbol took too many bits to include directly
-		 * in the decode table, so search for it in a binary tree at the
-		 * end of the decode table.  */
+		/* Slow case: The codeword for the symbol is longer than
+		 * table_bits, so the symbol does not have an entry directly in
+		 * the first (1 << table_bits) entries of the decode table.
+		 * Traverse the appropriate binary tree bit-by-bit in order to
+		 * decode the symbol.  */
 		lzms_input_bitstream_remove_bits(is, LZMS_DECODE_TABLE_BITS);
 		do {
-			key_bits = sym + lzms_input_bitstream_pop_bits(is, 1);
-		} while ((sym = decode_table[key_bits]) >= dec->num_syms);
+			key_bits = (entry & 0x3FFF) + lzms_input_bitstream_pop_bits(is, 1);
+		} while ((entry = decode_table[key_bits]) >= 0xC000);
+		sym = entry;
 	}
 
 	/* Tally and return the decoded symbol.  */
diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c
index da8950a2..a2a60889 100644
--- a/src/lzx-decompress.c
+++ b/src/lzx-decompress.c
@@ -151,10 +151,9 @@ struct lzx_decompressor {
  */
 static inline u16
 read_huffsym_using_pretree(struct input_bitstream *istream,
-			   const u16 pretree_decode_table[],
-			   const u8 pretree_lens[])
+			   const u16 pretree_decode_table[])
 {
-	return read_huffsym(istream, pretree_decode_table, pretree_lens,
+	return read_huffsym(istream, pretree_decode_table,
 			    LZX_PRECODE_NUM_SYMBOLS, LZX_PRECODE_TABLEBITS,
 			    LZX_MAX_PRE_CODEWORD_LEN);
 }
@@ -166,7 +165,7 @@ read_huffsym_using_maintree(struct input_bitstream *istream,
 			    unsigned num_main_syms)
 {
 	return read_huffsym(istream, tables->maintree_decode_table,
-			    tables->maintree_lens, num_main_syms,
+			    num_main_syms,
 			    LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
 }
 
@@ -176,7 +175,7 @@ read_huffsym_using_lentree(struct input_bitstream *istream,
 			   const struct lzx_tables *tables)
 {
 	return read_huffsym(istream, tables->lentree_decode_table,
-			    tables->lentree_lens, LZX_LENCODE_NUM_SYMBOLS,
+			    LZX_LENCODE_NUM_SYMBOLS,
 			    LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
 }
 
@@ -186,7 +185,6 @@ read_huffsym_using_alignedtree(struct input_bitstream *istream,
 			       const struct lzx_tables *tables)
 {
 	return read_huffsym(istream, tables->alignedtree_decode_table,
-			    tables->alignedtree_lens,
 			    LZX_ALIGNEDCODE_NUM_SYMBOLS,
 			    LZX_ALIGNEDCODE_TABLEBITS,
 			    LZX_MAX_ALIGNED_CODEWORD_LEN);
@@ -250,8 +248,7 @@ lzx_read_code_lens(struct input_bitstream *istream, u8 lens[],
 		signed char value;
 
 		tree_code = read_huffsym_using_pretree(istream,
-						       pretree_decode_table,
-						       pretree_lens);
+						       pretree_decode_table);
 		switch (tree_code) {
 		case 17: /* Run of 0's */
 			num_zeroes = bitstream_read_bits(istream, 4);
@@ -275,8 +272,7 @@ lzx_read_code_lens(struct input_bitstream *istream, u8 lens[],
 			num_same = bitstream_read_bits(istream, 1);
 			num_same += 4;
 			code = read_huffsym_using_pretree(istream,
-							  pretree_decode_table,
-							  pretree_lens);
+							  pretree_decode_table);
 			value = (signed char)*lens - (signed char)code;
 			if (value < 0)
 				value += 17;
diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c
index 98ad03d4..1ab81e31 100644
--- a/src/xpress-decompress.c
+++ b/src/xpress-decompress.c
@@ -156,7 +156,7 @@ xpress_lz_decode(struct input_bitstream * restrict istream,
 
 		bitstream_ensure_bits(istream, 16);
 
-		sym = read_huffsym(istream, decode_table, lens,
+		sym = read_huffsym(istream, decode_table,
 				   XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS,
 				   XPRESS_MAX_CODEWORD_LEN);
 		if (sym < XPRESS_NUM_CHARS) {
-- 
2.43.0