From 4757f17833c96b8c83a7e17cbc6f374c449d60db Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sun, 8 Dec 2013 21:06:57 -0600
Subject: [PATCH] Clean up other compression/decompression code

---
 include/wimlib/compress.h   |  88 ++++++-----
 include/wimlib/decompress.h | 226 ++++++++++++----------------
 include/wimlib/lzx.h        |   4 +
 include/wimlib/xpress.h     |   9 +-
 src/compress.c              | 102 +++++++------
 src/decompress.c            |   5 +-
 src/lz77.c                  | 119 +++++----------
 src/lzx-compress.c          |  93 ++++++------
 src/lzx-decompress.c        |  14 +-
 src/xpress-compress.c       | 286 +++++++++++++++---------------------
 src/xpress-decompress.c     | 198 +++++++++++--------------
 11 files changed, 510 insertions(+), 634 deletions(-)

diff --git a/include/wimlib/compress.h b/include/wimlib/compress.h
index b6485a0d..f3ce6e2d 100644
--- a/include/wimlib/compress.h
+++ b/include/wimlib/compress.h
@@ -1,7 +1,7 @@
 /*
  * compress.h
  *
- * Functions useful for compression, mainly bitstreams.
+ * Header for compression code shared by multiple compression formats.
  */
 
 #ifndef _WIMLIB_COMPRESS_H
@@ -12,9 +12,12 @@
 
 typedef u16 output_bitbuf_t;
 
-/* Assuming that WIM chunks are at most 32768 bytes, 16 bits is enough for any
- * symbol frequency. */
-typedef u16 freq_t;
+/* Variable type that can represent all possible window positions.  */
+typedef u32 freq_t;
+#ifndef INPUT_IDX_T_DEFINED
+#define INPUT_IDX_T_DEFINED
+typedef u32 input_idx_t;
+#endif
 
 /* Structure to keep track of the current position in the compressed output. */
 struct output_bitstream {
@@ -26,38 +29,42 @@ struct output_bitstream {
 	/* Number of free bits in @bitbuf */
 	unsigned free_bits;
 
+	/* Pointer to the start of the output buffer.  */
+	u8 *output_start;
+
+	/* Position at which to write the next 16 bits.  */
 	u8 *bit_output;
+
+	/* Next position to write 16 bits, after they are written to bit_output.
+	 * This is after @next_bit_output and may be separated from @bit_output
+	 * by literal bytes.  */
 	u8 *next_bit_output;
 
-	/* Pointer to the next byte in the compressed output. */
+	/* Next position to write literal bytes.  This is after @bit_output and
+	 * @next_bit_output, and may be separated from them by literal bytes.
+	 */
 	u8 *output;
 
+	/* Bytes remaining in @output buffer.  */
+	input_idx_t bytes_remaining;
 
-	/* Number of bytes left in the memory pointed to by @output. */
-	int num_bytes_remaining;
+	/* Set to true if the buffer has been exhausted.  */
+	bool overrun;
 };
 
-static inline int
-bitstream_put_byte(struct output_bitstream *ostream, u8 n)
-{
-	if (ostream->num_bytes_remaining < 1)
-		return 1;
-	*ostream->output = n;
-	ostream->output++;
-	ostream->num_bytes_remaining--;
-	return 0;
-}
-
-static inline int
-bitstream_put_two_bytes(struct output_bitstream *ostream, u16 n)
-{
-	if (ostream->num_bytes_remaining < 2)
-		return 1;
-	*(u16*)ostream->output = cpu_to_le16(n);
-	ostream->output += 2;
-	ostream->num_bytes_remaining -= 2;
-	return 0;
-}
+extern void
+init_output_bitstream(struct output_bitstream *ostream,
+		      void *data, unsigned num_bytes);
+
+extern input_idx_t
+flush_output_bitstream(struct output_bitstream *ostream);
+
+extern void
+bitstream_put_bits(struct output_bitstream *ostream,
+		   output_bitbuf_t bits, unsigned num_bits);
+
+extern void
+bitstream_put_byte(struct output_bitstream *ostream, u8 n);
 
 struct lz_params {
 	unsigned min_match;
@@ -69,30 +76,17 @@ struct lz_params {
 	unsigned too_far;
 };
 
-typedef u32 (*lz_record_match_t)(unsigned, unsigned, void *, void *);
-typedef u32 (*lz_record_literal_t)(u8, void *);
+typedef void (*lz_record_match_t)(unsigned len, unsigned offset, void *ctx);
+typedef void (*lz_record_literal_t)(u8 lit, void *ctx);
 
-extern unsigned
-lz_analyze_block(const u8 uncompressed_data[],
-		 unsigned uncompressed_len,
-		 u32 match_tab[],
+extern void
+lz_analyze_block(const u8 window[],
+		 input_idx_t window_size,
 		 lz_record_match_t record_match,
 		 lz_record_literal_t record_literal,
-		 void *record_match_arg1,
-		 void *record_match_arg2,
-		 void *record_literal_arg,
+		 void *record_ctx,
 		 const struct lz_params *params);
 
-extern int bitstream_put_bits(struct output_bitstream *ostream,
-			      output_bitbuf_t bits, unsigned num_bits);
-
-extern void
-init_output_bitstream(struct output_bitstream *ostream,
-		      void *data, unsigned num_bytes);
-
-extern int
-flush_output_bitstream(struct output_bitstream *ostream);
-
 extern void
 make_canonical_huffman_code(unsigned num_syms,
 			    unsigned max_codeword_len,
diff --git a/include/wimlib/decompress.h b/include/wimlib/decompress.h
index 7d0d3852..b1f534fb 100644
--- a/include/wimlib/decompress.h
+++ b/include/wimlib/decompress.h
@@ -1,30 +1,37 @@
 /*
  * decompress.h
  *
- * Functions useful for decompression, mainly bitstreams.
+ * Header for decompression code shared by multiple compression formats.
  */
 
 #ifndef _WIMLIB_DECOMPRESS_H
 #define _WIMLIB_DECOMPRESS_H
 
 #include "wimlib/assert.h"
-#include "wimlib/endianness.h"
+#include "wimlib/compiler.h"
 #include "wimlib/error.h"
+#include "wimlib/endianness.h"
 #include "wimlib/types.h"
 
 /* Must be at least 32 bits. */
-typedef unsigned long input_bitbuf_t;
+typedef u32 input_bitbuf_t;
+#define INPUT_BITBUF_BITS (sizeof(input_bitbuf_t) * 8)
+
+#ifndef INPUT_IDX_T_DEFINED
+#define INPUT_IDX_T_DEFINED
+typedef u32 input_idx_t;
+#endif
 
 /* Structure to encapsulate a block of in-memory data that is being interpreted
  * as a stream of bits.
  *
  * This is geared specifically towards the XPRESS and LZX compression formats
- * with regards to the actual ordering the bits within the byte sequence. */
+ * with regards to the actual ordering the bits within the byte sequence.  */
 struct input_bitstream {
 
 	/* A variable of length at least 32 bits that is used to hold bits that
 	 * have been read from the stream.  The bits are ordered from high-order
-	 * to low-order, and the next bit is always the high-order bit. */
+	 * to low-order, and the next bit is always the high-order bit.  */
 	input_bitbuf_t  bitbuf;
 
 	/* Pointer to the next byte to be retrieved from the input. */
@@ -33,14 +40,14 @@ struct input_bitstream {
 	/* Number of bits in @bitbuf that are valid. */
 	unsigned bitsleft;
 
-	/* Number of words of data that are left. */
-	unsigned data_bytes_left;
+	/* Number of words of data that are left.  */
+	input_idx_t data_bytes_left;
 };
 
 /* Initializes a bitstream to receive its input from @data. */
 static inline void
 init_input_bitstream(struct input_bitstream *istream,
-		     const void *data, unsigned num_data_bytes)
+		     const void *data, input_idx_t num_data_bytes)
 {
 	istream->bitbuf          = 0;
 	istream->bitsleft        = 0;
@@ -49,7 +56,7 @@ init_input_bitstream(struct input_bitstream *istream,
 }
 
 /* Ensures that the bit buffer variable for the bitstream contains @num_bits
- * bits.
+ * bits, which must be 16 or fewer.
  *
  * If there are at least @num_bits bits remaining in the bitstream, 0 is
  * returned.  Otherwise, -1 is returned.  */
@@ -58,103 +65,83 @@ bitstream_ensure_bits(struct input_bitstream *istream, unsigned num_bits)
 {
 	wimlib_assert2(num_bits <= 16);
 
-	int ret = 0;
-
-	/* Unfortunately this needs to be different for the different
-	 * compression types.  LZX requires reading no more than the number of
-	 * bits needed, otherwise the end of the compressed data may be overrun.
-	 * XPRESS, on the other hand, requires that we always return with at
-	 * least 16 bits in the buffer, even if fewer are requested.  This is
-	 * important because this may change the location of a literal byte
-	 * read with bitstream_read_byte(). */
-#ifdef XPRESS_DECOMP
-	if (istream->bitsleft < 16) {
-#else
-	if (istream->bitsleft < num_bits) {
-#endif
-		if (likely(istream->data_bytes_left >= 2)) {
-			unsigned shift = sizeof(input_bitbuf_t) * 8 - 16 -
-					 istream->bitsleft;
-			istream->bitbuf |= (input_bitbuf_t)le16_to_cpu(
-						*(le16*)istream->data) << shift;
-			istream->data += 2;
-			istream->bitsleft += 16;
-			istream->data_bytes_left -= 2;
-		} else {
-			ret = -1;
-		}
-	}
-	wimlib_assert2(ret != 0 || istream->bitsleft >= num_bits);
-	return ret;
+	if (istream->bitsleft >= num_bits)
+		return 0;
+
+	if (unlikely(istream->data_bytes_left < 2))
+		return -1;
+
+	istream->bitbuf |= le16_to_cpu(*(le16*)istream->data) <<
+			   (INPUT_BITBUF_BITS - 16 - istream->bitsleft);
+	istream->data += 2;
+	istream->bitsleft += 16;
+	istream->data_bytes_left -= 2;
+	return 0;
 }
 
 /* Returns the next @num_bits bits in the buffer variable, which must contain at
- * least @num_bits bits, for the bitstream. */
+ * least @num_bits bits, for the bitstream.  */
 static inline unsigned
 bitstream_peek_bits(const struct input_bitstream *istream, unsigned num_bits)
 {
 	wimlib_assert2(istream->bitsleft >= num_bits);
-	int ret;
+
 	if (unlikely(num_bits == 0))
-		ret = 0;
-	else
-		ret = istream->bitbuf >> (sizeof(input_bitbuf_t) * 8 - num_bits);
-	return ret;
+		return 0;
+
+	return istream->bitbuf >> (INPUT_BITBUF_BITS - num_bits);
 }
 
 /* Removes @num_bits bits from the buffer variable, which must contain at least
- * @num_bits bits, for the bitstream. */
+ * @num_bits bits, for the bitstream.  */
 static inline void
 bitstream_remove_bits(struct input_bitstream *istream, unsigned num_bits)
 {
 	wimlib_assert2(istream->bitsleft >= num_bits);
+
 	istream->bitbuf <<= num_bits;
 	istream->bitsleft -= num_bits;
 }
 
-/* Reads @num_bits bits from the input bitstream.  @num_bits must be 16 or fewer.
- * On success, returns 0 and returns the requested bits in @n.  If there are
- * fewer than @num_bits remaining in the bitstream, -1 is returned. */
+/* Gets and removes @num_bits bits from the buffer variable, which must contain
+ * at least @num_bits bits, for the bitstream.  */
+static inline unsigned
+bitstream_pop_bits(struct input_bitstream *istream,
+		   unsigned num_bits)
+{
+	unsigned n = bitstream_peek_bits(istream, num_bits);
+	bitstream_remove_bits(istream, num_bits);
+	return n;
+}
+
+/* Reads @num_bits bits from the input bitstream.  @num_bits must be 16 or
+ * fewer.  On success, returns 0 and returns the requested bits in @n.  If there
+ * are fewer than @num_bits remaining in the bitstream, -1 is returned. */
 static inline int
 bitstream_read_bits(struct input_bitstream *istream,
 		    unsigned num_bits, unsigned *n)
 {
-	wimlib_assert2(num_bits <= 16);
-	int ret = bitstream_ensure_bits(istream, num_bits);
-	if (likely(ret == 0)) {
-		*n = bitstream_peek_bits(istream, num_bits);
-		bitstream_remove_bits(istream, num_bits);
-	} else {
-		ERROR("bitstream_read_bits(): Input buffer exhausted");
-	}
-	return ret;
+	if (unlikely(bitstream_ensure_bits(istream, num_bits)))
+		return -1;
+
+	*n = bitstream_pop_bits(istream, num_bits);
+	return 0;
 }
 
-/* In the XPRESS format there can be literal bytes embedded in the bitstream.
- * These bytes are basically separate from the bitstream, as they come AFTER the
- * bits that are currently in the buffer variable (based on reading 16 bits at a
- * time), even though the buffer variable may not be empty.
- *
- * This function returns the next such literal byte, or -1 if there are no more.
- */
+/* Return the next literal byte embedded in the bitstream, or -1 if the input
+ * was exhausted.  */
 static inline int
 bitstream_read_byte(struct input_bitstream *istream)
 {
-	wimlib_assert2(istream->bitsleft < 32);
-	int ret;
+	if (unlikely(istream->data_bytes_left < 1))
+		return -1;
 
-	if (unlikely(istream->data_bytes_left == 0)) {
-		ERROR("bitstream_read_byte(): Input buffer exhausted");
-		ret = -1;
-	} else {
-		istream->data_bytes_left--;
-		ret = *istream->data++;
-	}
-	return ret;
+	istream->data_bytes_left--;
+	return *istream->data++;
 }
 
 /* Reads @num_bits bits from the buffer variable for a bistream without checking
- * to see if that many bits are in the buffer or not. */
+ * to see if that many bits are in the buffer or not.  */
 static inline unsigned
 bitstream_read_bits_nocheck(struct input_bitstream *istream, unsigned num_bits)
 {
@@ -171,70 +158,45 @@ read_huffsym_near_end_of_input(struct input_bitstream *istream,
 			       unsigned table_bits,
 			       unsigned *n);
 
-/*
- * Reads a Huffman-encoded symbol from a bitstream.
- *
- * This function may be called hundreds of millions of times when extracting a
- * large WIM file.  I'm not sure it could be made much faster that it is,
- * especially since there isn't enough time to make a big table that allows
- * decoding multiple symbols per lookup.  But if extracting files to a hard
- * disk, the I/O will be the bottleneck anyway.
- *
- * @buf:	The input buffer from which the symbol will be read.
- * @decode_table:	The fast Huffman decoding table for the Huffman tree.
- * @lengths:		The table that gives the length of the code for each
- * 				symbol.
- * @num_symbols:	The number of symbols in the Huffman code.
- * @table_bits:		Huffman codes this length or less can be looked up
- * 				directory in the decode_table, as the
- * 				decode_table contains 2**table_bits entries.
- */
+/* Read a Huffman-encoded symbol from a bitstream.  */
 static inline int
-read_huffsym(struct input_bitstream *istream,
-	     const u16 decode_table[],
-	     const u8 lens[],
+read_huffsym(struct input_bitstream * restrict istream,
+	     const u16 decode_table[restrict],
+	     const u8 lens[restrict],
 	     unsigned num_syms,
 	     unsigned table_bits,
-	     unsigned *n,
+	     unsigned *restrict n,
 	     unsigned max_codeword_len)
 {
-	int ret;
-
-	/* In the most common case, there are at least max_codeword_len bits
-	 * remaining in the stream. */
-	if (likely(bitstream_ensure_bits(istream, max_codeword_len) == 0)) {
-
-		/* Use the next table_bits of the input as an index into the
-		 * decode_table. */
-		u16 key_bits = bitstream_peek_bits(istream, table_bits);
-
-		u16 sym = decode_table[key_bits];
-
-		/* If the entry in the decode table is not a valid symbol, it is
-		 * the offset of the root of its Huffman subtree. */
-		if (likely(sym < num_syms)) {
-			wimlib_assert2(lens[sym] <= table_bits);
-			bitstream_remove_bits(istream, lens[sym]);
-		} else {
-			bitstream_remove_bits(istream, table_bits);
-			do {
-				key_bits = sym + bitstream_peek_bits(istream, 1);
-				bitstream_remove_bits(istream, 1);
-
-				wimlib_assert2(key_bits < num_syms * 2 +
-					       (1 << table_bits));
-			} while ((sym = decode_table[key_bits]) >= num_syms);
-		}
-		*n = sym;
-		ret = 0;
+	/* If there are fewer bits remaining in the input than the maximum
+	 * codeword length, use the slow path that has extra checks.  */
+	if (unlikely(bitstream_ensure_bits(istream, max_codeword_len))) {
+		return read_huffsym_near_end_of_input(istream, decode_table,
+						      lens, num_syms,
+						      table_bits, n);
+	}
+
+	/* Use the next table_bits of the input as an index into the
+	 * decode_table.  */
+	u16 key_bits = bitstream_peek_bits(istream, table_bits);
+
+	u16 sym = decode_table[key_bits];
+
+	if (likely(sym < num_syms)) {
+		/* Fast case: The decode table directly provided the symbol.  */
+		bitstream_remove_bits(istream, lens[sym]);
 	} else {
-		/* Otherwise, we must be careful to use only the bits that are
-		 * actually remaining.  */
-		ret = read_huffsym_near_end_of_input(istream, decode_table,
-						     lens, num_syms,
-						     table_bits, n);
+		/* Slow case: The symbol took too many bits to include directly
+		 * in the decode table, so search for it in a binary tree at the
+		 * end of the decode table.  */
+		bitstream_remove_bits(istream, table_bits);
+		do {
+			key_bits = sym + bitstream_peek_bits(istream, 1);
+			bitstream_remove_bits(istream, 1);
+		} while ((sym = decode_table[key_bits]) >= num_syms);
 	}
-	return ret;
+	*n = sym;
+	return 0;
 }
 
 extern int
@@ -243,7 +205,7 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 			  unsigned max_codeword_len);
 
 /* Minimum alignment for the decode_table parameter to
- * make_huffman_decode_table(). */
+ * make_huffman_decode_table().  */
 #define DECODE_TABLE_ALIGNMENT 16
 
 #endif /* _WIMLIB_DECOMPRESS_H */
diff --git a/include/wimlib/lzx.h b/include/wimlib/lzx.h
index b6d1ec23..cb17fbea 100644
--- a/include/wimlib/lzx.h
+++ b/include/wimlib/lzx.h
@@ -1,6 +1,10 @@
 #ifndef _WIMLIB_LZX_H
 #define _WIMLIB_LZX_H
 
+/* Constants for the LZX data compression format.  See the comments in
+ * lzx-compress.c and lzx-decompress.c for more information about this format.
+ * */
+
 #include "wimlib/assert.h"
 #include "wimlib/types.h"
 
diff --git a/include/wimlib/xpress.h b/include/wimlib/xpress.h
index c07dbae8..2b57bae7 100644
--- a/include/wimlib/xpress.h
+++ b/include/wimlib/xpress.h
@@ -1,13 +1,16 @@
 #ifndef _WIMLIB_XPRESS_H
 #define _WIMLIB_XPRESS_H
 
-/* See the comments in xpress-decompress.c about the XPRESS format. */
+/* Constants for the XPRESS data compression format.  See the comments in
+ * xpress-decompress.c for more information about this format.  */
 
 //#define ENABLE_XPRESS_DEBUG
 #ifdef ENABLE_XPRESS_DEBUG
 #	define XPRESS_DEBUG DEBUG
+#       define XPRESS_ASSERT wimlib_assert
 #else
 #	define XPRESS_DEBUG(format, ...)
+#	define XPRESS_ASSERT(...)
 #endif
 
 #define XPRESS_NUM_CHARS	256
@@ -20,7 +23,7 @@
 #define XPRESS_MIN_OFFSET	1
 #define XPRESS_MAX_OFFSET	65535
 
-#define XPRESS_MIN_MATCH	3
-#define XPRESS_MAX_MATCH    	65538
+#define XPRESS_MIN_MATCH_LEN	3
+#define XPRESS_MAX_MATCH_LEN	65538
 
 #endif /* _WIMLIB_XPRESS_H */
diff --git a/src/compress.c b/src/compress.c
index dfaabe6b..f98a5a52 100644
--- a/src/compress.c
+++ b/src/compress.c
@@ -27,73 +27,85 @@
 #  include "config.h"
 #endif
 
-
 #include "wimlib/assert.h"
+#include "wimlib/compiler.h"
 #include "wimlib/compress.h"
 #include "wimlib/util.h"
 
 #include <stdlib.h>
 #include <string.h>
 
-static inline void
-flush_bits(struct output_bitstream *ostream)
-{
-	*(le16*)ostream->bit_output = cpu_to_le16(ostream->bitbuf);
-	ostream->bit_output = ostream->next_bit_output;
-	ostream->next_bit_output = ostream->output;
-	ostream->output += 2;
-	ostream->num_bytes_remaining -= 2;
-}
-
 /* Writes @num_bits bits, given by the @num_bits least significant bits of
  * @bits, to the output @ostream. */
-int
+void
 bitstream_put_bits(struct output_bitstream *ostream, output_bitbuf_t bits,
 		   unsigned num_bits)
 {
 	unsigned rem_bits;
 
-	wimlib_assert(num_bits <= 16);
 	if (num_bits <= ostream->free_bits) {
+		/* Buffer variable had space for the new bits.  */
 		ostream->bitbuf = (ostream->bitbuf << num_bits) | bits;
 		ostream->free_bits -= num_bits;
-	} else {
+		return;
+	}
 
-		if (ostream->num_bytes_remaining + (ostream->output -
-						ostream->bit_output) < 2)
-			return 1;
-
-		/* It is tricky to output the bits correctly.  The correct way
-		 * is to output little-endian 2-byte words, such that the bits
-		 * in the SECOND byte logically precede those in the FIRST byte.
-		 * While the byte order is little-endian, the bit order is
-		 * big-endian; the first bit in a byte is the high-order one.
-		 * Any multi-bit numbers are in bit-big-endian form, so the
-		 * low-order bit of a multi-bit number is the LAST bit to be
-		 * output. */
-		rem_bits = num_bits - ostream->free_bits;
-		ostream->bitbuf <<= ostream->free_bits;
-		ostream->bitbuf |= bits >> rem_bits;
-		flush_bits(ostream);
-		ostream->free_bits = 16 - rem_bits;
-		ostream->bitbuf = bits;
+	/* Buffer variable does not have space for the new bits.  It needs to be
+	 * flushed as a 16-bit integer.  Bits in the second byte logically
+	 * precede those in the first byte (little-endian), but within each byte
+	 * the bits are ordered from high to low.  This is true for both XPRESS
+	 * and LZX compression.  */
 
+	wimlib_assert(num_bits <= 16);
+
+	/* There must be at least 2 bytes of space remaining.  */
+	if (unlikely(ostream->bytes_remaining < 2)) {
+		ostream->overrun = true;
+		return;
 	}
-	return 0;
+
+	/* Fill the buffer with as many bits that fit.  */
+	rem_bits = num_bits - ostream->free_bits;
+	ostream->bitbuf <<= ostream->free_bits;
+	ostream->bitbuf |= bits >> rem_bits;
+
+	*(le16*)ostream->bit_output = cpu_to_le16(ostream->bitbuf);
+	ostream->bit_output = ostream->next_bit_output;
+	ostream->next_bit_output = ostream->output;
+	ostream->output += 2;
+	ostream->bytes_remaining -= 2;
+
+	ostream->free_bits = 16 - rem_bits;
+	ostream->bitbuf = bits;
 }
 
-/* Flushes any remaining bits in the output buffer to the output byte stream. */
-int
-flush_output_bitstream(struct output_bitstream *ostream)
+void
+bitstream_put_byte(struct output_bitstream *ostream, u8 n)
 {
-	if (ostream->num_bytes_remaining + (ostream->output -
-					ostream->bit_output) < 2)
-		return 1;
-	if (ostream->free_bits != 16) {
-		ostream->bitbuf <<= ostream->free_bits;
-		flush_bits(ostream);
+	if (unlikely(ostream->bytes_remaining < 1)) {
+		ostream->overrun = true;
+		return;
 	}
-	return 0;
+	*ostream->output++ = n;
+	ostream->bytes_remaining--;
+}
+
+/* Flushes any remaining bits to the output bitstream.
+ *
+ * Returns -1 if the stream has overrun; otherwise returns the total number of
+ * bytes in the output.  */
+input_idx_t
+flush_output_bitstream(struct output_bitstream *ostream)
+{
+	if (unlikely(ostream->overrun))
+		return ~(input_idx_t)0;
+
+	*(le16*)ostream->bit_output =
+		cpu_to_le16((u16)((u32)ostream->bitbuf << ostream->free_bits));
+	*(le16*)ostream->next_bit_output =
+		cpu_to_le16(0);
+
+	return ostream->output - ostream->output_start;
 }
 
 /* Initializes an output bit buffer to write its output to the memory location
@@ -106,10 +118,12 @@ init_output_bitstream(struct output_bitstream *ostream,
 
 	ostream->bitbuf              = 0;
 	ostream->free_bits           = 16;
+	ostream->output_start        = data;
 	ostream->bit_output          = data;
 	ostream->next_bit_output     = data + 2;
 	ostream->output              = data + 4;
-	ostream->num_bytes_remaining = num_bytes - 4;
+	ostream->bytes_remaining     = num_bytes;
+	ostream->overrun             = false;
 }
 
 typedef struct {
diff --git a/src/decompress.c b/src/decompress.c
index 227bfb06..502ca47d 100644
--- a/src/decompress.c
+++ b/src/decompress.c
@@ -28,6 +28,7 @@
 #endif
 
 #include "wimlib/decompress.h"
+#include "wimlib/error.h"
 #include "wimlib/util.h"
 
 #include <string.h>
@@ -417,10 +418,8 @@ read_huffsym_near_end_of_input(struct input_bitstream *istream,
 	if (sym >= num_syms) {
 		bitstream_remove_bits(istream, key_size);
 		do {
-			if (bitsleft == 0) {
-				DEBUG("Input stream exhausted");
+			if (bitsleft == 0)
 				return -1;
-			}
 			key_bits = sym + bitstream_peek_bits(istream, 1);
 			bitstream_remove_bits(istream, 1);
 			bitsleft--;
diff --git a/src/lz77.c b/src/lz77.c
index 5ce8d89f..b9a40174 100644
--- a/src/lz77.c
+++ b/src/lz77.c
@@ -27,7 +27,7 @@
  */
 
 #ifdef HAVE_CONFIG_H
-#  include "config.h"
+#  include <config.h>
 #endif
 
 #include "wimlib/compress.h"
@@ -78,7 +78,7 @@ update_hash(unsigned hash, u8 c)
  * indicating the end of the hash chain.
  */
 static inline unsigned
-insert_string(u16 hash_tab[], u16 prev_tab[],
+insert_string(input_idx_t hash_tab[], input_idx_t prev_tab[],
 	      const u8 window[], unsigned str_pos,
 	      unsigned hash)
 {
@@ -112,7 +112,7 @@ insert_string(u16 hash_tab[], u16 prev_tab[],
  */
 static unsigned
 longest_match(const u8 window[], unsigned bytes_remaining,
-	      unsigned strstart, const u16 prev_tab[],
+	      unsigned strstart, const input_idx_t prev_tab[],
 	      unsigned cur_match, unsigned prev_len,
 	      unsigned *match_start_ret,
 	      const struct lz_params *params)
@@ -193,44 +193,23 @@ longest_match(const u8 window[], unsigned bytes_remaining,
  * Determines the sequence of matches and literals that a block of data will be
  * compressed to.
  *
- * @uncompressed_data:	The data that is to be compressed.
- * @uncompressed_len:	The length of @uncompressed_data, in bytes.
- * @match_tab:		An array for the intermediate representation of matches.
- * @record_match:	A function that will be called to produce the
- * 				intermediate representation of a match, given
- * 				the offset and length.  This function should also
- * 				update the appropriate symbol frequency counts
- * 				so that any needed Huffman codes can be made
- * 				later.
- * @record_literal:	A function that will be called to produce the
- * 				intermediate representation of a literal, given
- * 				the character of the literal.  This function
- * 				should also update the appropriate symbol
- * 				frequency counts so that any needed Huffman
- * 				codes can be made later.
- * @record_match_arg_1:
- * @record_match_arg_2:	Extra arguments to be passed to @record_match.
- * @record_literal_arg:	Extra arguments to be passed to @record_literal.
+ * @window:		The data that is to be compressed.
+ * @window_size:	The length of @window, in bytes.
+ * @record_match:	Consumer for matches.
+ * @record_literal:	Consumer for literals.
+ * @record_ctx:		Context passed to @record_match and @record_literal.
  * @params:		Structure that contains parameters that affect how the
  * 				analysis proceeds (mainly how good the matches
  * 				have to be).
- *
- * Returns the total number of matches and literal bytes that were found; this
- * is the number of slots in @match_tab that have been filled with the
- * intermediate representation of a match or literal byte.
  */
-unsigned
-lz_analyze_block(const u8 uncompressed_data[],
-		 unsigned uncompressed_len,
-		 u32 match_tab[],
+void
+lz_analyze_block(const u8 window[],
+		 input_idx_t window_size,
 		 lz_record_match_t record_match,
 		 lz_record_literal_t record_literal,
-		 void *record_match_arg1,
-		 void *record_match_arg2,
-		 void *record_literal_arg,
+		 void *record_ctx,
 		 const struct lz_params *params)
 {
-	unsigned cur_match_pos = 0;
 	unsigned cur_input_pos = 0;
 	unsigned hash          = 0;
 	unsigned hash_head     = 0;
@@ -239,23 +218,21 @@ lz_analyze_block(const u8 uncompressed_data[],
 	unsigned match_len     = params->min_match - 1;
 	unsigned match_start   = 0;
 	bool match_available = false;
-	u16 hash_tab[HASH_SIZE];
-	u32 match;
-	u16 prev_tab[uncompressed_len];
+	input_idx_t hash_tab[HASH_SIZE];
+	input_idx_t prev_tab[window_size];
 
 	ZERO_ARRAY(hash_tab);
-	ZERO_ARRAY(prev_tab);
 
 	do {
 		/* If there are at least 3 characters remaining in the input,
 		 * insert the 3-character string beginning at
-		 * uncompressed_data[cur_input_pos] into the hash table.
+		 * window[cur_input_pos] into the hash table.
 		 *
 		 * hash_head is set to the index of the previous string in the
 		 * hash bucket, or 0 if there is no such string */
-		if (uncompressed_len - cur_input_pos >= params->min_match) {
+		if (window_size - cur_input_pos >= params->min_match) {
 			hash = insert_string(hash_tab, prev_tab,
-					     uncompressed_data,
+					     window,
 					     cur_input_pos, hash);
 			hash_head = prev_tab[cur_input_pos];
 		} else {
@@ -273,8 +250,8 @@ lz_analyze_block(const u8 uncompressed_data[],
 			 * string of window index 0 (in particular we have to
 			 * avoid a match of the string with itself at the start
 			 * of the input file).  */
-			match_len = longest_match(uncompressed_data,
-						  uncompressed_len - cur_input_pos,
+			match_len = longest_match(window,
+						  window_size - cur_input_pos,
 						  cur_input_pos, prev_tab,
 						  hash_head, prev_len,
 						  &match_start, params);
@@ -289,26 +266,18 @@ lz_analyze_block(const u8 uncompressed_data[],
 		 */
 		if (prev_len >= params->min_match && match_len <= prev_len) {
 
-			/* Do not insert strings in hash table beyond this. */
-			unsigned max_insert = uncompressed_len - params->min_match;
-
-			/*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
-					/*cur_input_pos - 1, */
-					/*cur_input_pos - 1 - prev_start,*/
-					/*prev_len);*/
 
-			match = (*record_match)(cur_input_pos - 1 - prev_start,
-						prev_len,
-						record_match_arg1,
-						record_match_arg2);
+			(*record_match)(prev_len,
+					cur_input_pos - 1 - prev_start,
+					record_ctx);
 
-			match_tab[cur_match_pos++] = match;
+			/* Insert in hash table all strings up to the end of the
+			 * match.  strstart-1 and strstart are already inserted.
+			 * If there is not enough lookahead, the last two
+			 * strings are not inserted in the hash table.  */
 
-			/* Insert in hash table all strings up to the end of the match.
-			 * strstart-1 and strstart are already inserted. If there is not
-			 * enough lookahead, the last two strings are not inserted in
-			 * the hash table.
-			 */
+			/* Do not insert strings in hash table beyond this. */
+			unsigned max_insert = window_size - params->min_match;
 #if LZ_MIN_MATCH == 2
 			if (prev_len >= 3)
 #endif
@@ -318,7 +287,7 @@ lz_analyze_block(const u8 uncompressed_data[],
 				do {
 					if (++cur_input_pos <= max_insert) {
 						hash = insert_string(hash_tab, prev_tab,
-								     uncompressed_data,
+								     window,
 								     cur_input_pos,
 								     hash);
 					}
@@ -327,30 +296,18 @@ lz_analyze_block(const u8 uncompressed_data[],
 			match_available = false;
 			match_len = params->min_match - 1;
 		} else if (match_available) {
-			/* If there was no match at the previous position, output a
-			 * single literal. If there was a match but the current match
-			 * is longer, truncate the previous match to a single literal.
-			 */
-
-			/*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
-					/*cur_input_pos - 1, */
-					/*uncompressed_data[cur_input_pos - 1]);*/
-
-			match = (*record_literal)(
-					uncompressed_data[cur_input_pos - 1],
-							record_literal_arg);
-			match_tab[cur_match_pos++] = match;
+			/* If there was no match at the previous position,
+			 * output a single literal. If there was a match but the
+			 * current match is longer, truncate the previous match
+			 * to a single literal.  */
+			(*record_literal)(window[cur_input_pos - 1], record_ctx);
 		} else {
 			/* There is no previous match to compare with, wait for
 			 * the next step to decide.  */
 			match_available = true;
 		}
-	} while (++cur_input_pos < uncompressed_len);
-
-	if (match_available) {
-		match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
-						record_literal_arg);
-		match_tab[cur_match_pos++] = match;
-	}
-	return cur_match_pos;
+	} while (++cur_input_pos < window_size);
+
+	if (match_available)
+		(*record_literal)(window[cur_input_pos - 1], record_ctx);
 }
diff --git a/src/lzx-compress.c b/src/lzx-compress.c
index 4c0e6ce1..9874a7b9 100644
--- a/src/lzx-compress.c
+++ b/src/lzx-compress.c
@@ -195,12 +195,11 @@
 #include <string.h>
 
 #ifdef ENABLE_LZX_DEBUG
-#  include <wimlib/decompress.h>
+#  include "wimlib/decompress.h"
 #endif
 
 #include "divsufsort/divsufsort.h"
 
-typedef freq_t input_idx_t;
 typedef u32 block_cost_t;
 #define INFINITE_BLOCK_COST	((block_cost_t)~0U)
 
@@ -1074,13 +1073,10 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea
 /* Constructs an LZX match from a literal byte and updates the main code symbol
  * frequencies.  */
 static u32
-lzx_record_literal(u8 literal, void *_freqs)
+lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
 {
-	struct lzx_freqs *freqs = _freqs;
-
-	freqs->main[literal]++;
-
-	return (u32)literal;
+	freqs->main[lit]++;
+	return (u32)lit;
 }
 
 /* Constructs an LZX match from an offset and a length, and updates the LRU
@@ -1088,11 +1084,9 @@ lzx_record_literal(u8 literal, void *_freqs)
  * alphabets.  The return value is a 32-bit number that provides the match in an
  * intermediate representation documented below.  */
 static u32
-lzx_record_match(unsigned match_offset, unsigned match_len,
-		 void *_freqs, void *_queue)
+lzx_tally_match(unsigned match_len, unsigned match_offset,
+		struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
 {
-	struct lzx_freqs *freqs = _freqs;
-	struct lzx_lru_queue *queue = _queue;
 	unsigned position_slot;
 	unsigned position_footer;
 	u32 len_header;
@@ -1166,6 +1160,28 @@ lzx_record_match(unsigned match_offset, unsigned match_len,
 		(adjusted_match_len);
 }
 
+struct lzx_record_ctx {
+	struct lzx_freqs freqs;
+	struct lzx_lru_queue queue;
+	struct lzx_match *matches;
+};
+
+static void
+lzx_record_match(unsigned len, unsigned offset, void *_ctx)
+{
+	struct lzx_record_ctx *ctx = _ctx;
+
+	(ctx->matches++)->data = lzx_tally_match(len, offset, &ctx->freqs, &ctx->queue);
+}
+
+static void
+lzx_record_literal(u8 lit, void *_ctx)
+{
+	struct lzx_record_ctx *ctx = _ctx;
+
+	(ctx->matches++)->data = lzx_tally_literal(lit, &ctx->freqs);
+}
+
 /* Returns the cost, in bits, to output a literal byte using the specified cost
  * model.  */
 static unsigned
@@ -1921,11 +1937,11 @@ lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec,
 
 			raw_match = lzx_lz_get_near_optimal_match(ctx);
 			if (raw_match.len >= LZX_MIN_MATCH_LEN) {
-				lzx_match.data = lzx_record_match(raw_match.offset, raw_match.len,
-								  &freqs, &ctx->queue);
+				lzx_match.data = lzx_tally_match(raw_match.len, raw_match.offset,
+								 &freqs, &ctx->queue);
 				i += raw_match.len;
 			} else {
-				lzx_match.data = lzx_record_literal(ctx->window[i], &freqs);
+				lzx_match.data = lzx_tally_literal(ctx->window[i], &freqs);
 				i += 1;
 			}
 			ctx->chosen_matches[spec->chosen_matches_start_pos +
@@ -2139,9 +2155,6 @@ lzx_prepare_blocks(struct lzx_compressor * ctx)
  *	ctx->window[]
  *	ctx->window_size
  *
- * Working space:
- *	ctx->queue
- *
  * Output --- the block specification and the corresponding match/literal data:
  *
  *	ctx->block_specs[]
@@ -2151,8 +2164,7 @@ lzx_prepare_blocks(struct lzx_compressor * ctx)
 static void
 lzx_prepare_block_fast(struct lzx_compressor * ctx)
 {
-	unsigned num_matches;
-	struct lzx_freqs freqs;
+	struct lzx_record_ctx record_ctx;
 	struct lzx_block_spec *spec;
 
 	/* Parameters to hash chain LZ match finder
@@ -2170,19 +2182,17 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx)
 	};
 
 	/* Initialize symbol frequencies and match offset LRU queue.  */
-	memset(&freqs, 0, sizeof(struct lzx_freqs));
-	lzx_lru_queue_init(&ctx->queue);
+	memset(&record_ctx.freqs, 0, sizeof(struct lzx_freqs));
+	lzx_lru_queue_init(&record_ctx.queue);
+	record_ctx.matches = ctx->chosen_matches;
 
 	/* Determine series of matches/literals to output.  */
-	num_matches = lz_analyze_block(ctx->window,
-				       ctx->window_size,
-				       (u32*)ctx->chosen_matches,
-				       lzx_record_match,
-				       lzx_record_literal,
-				       &freqs,
-				       &ctx->queue,
-				       &freqs,
-				       &lzx_lz_params);
+	lz_analyze_block(ctx->window,
+			 ctx->window_size,
+			 lzx_record_match,
+			 lzx_record_literal,
+			 &record_ctx,
+			 &lzx_lz_params);
 
 
 	/* Set up block specification.  */
@@ -2190,9 +2200,9 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx)
 	spec->block_type = LZX_BLOCKTYPE_ALIGNED;
 	spec->window_pos = 0;
 	spec->block_size = ctx->window_size;
-	spec->num_chosen_matches = num_matches;
+	spec->num_chosen_matches = (record_ctx.matches - ctx->chosen_matches);
 	spec->chosen_matches_start_pos = 0;
-	lzx_make_huffman_codes(&freqs, &spec->codes);
+	lzx_make_huffman_codes(&record_ctx.freqs, &spec->codes);
 	ctx->num_blocks = 1;
 }
 
@@ -2239,7 +2249,7 @@ wimlib_lzx_compress2(const void			* const restrict uncompressed_data,
 {
 	struct lzx_compressor *ctx = (struct lzx_compressor*)lzx_ctx;
 	struct output_bitstream ostream;
-	unsigned compressed_len;
+	input_idx_t compressed_len;
 
 	if (uncompressed_len < 100) {
 		LZX_DEBUG("Too small to bother compressing.");
@@ -2286,16 +2296,12 @@ wimlib_lzx_compress2(const void			* const restrict uncompressed_data,
 	lzx_write_all_blocks(ctx, &ostream);
 
 	LZX_DEBUG("Flushing bitstream...");
-	if (flush_output_bitstream(&ostream)) {
-		/* If the bitstream cannot be flushed, then the output space was
-		 * exhausted.  */
+	compressed_len = flush_output_bitstream(&ostream);
+	if (compressed_len == ~(input_idx_t)0) {
 		LZX_DEBUG("Data did not compress to less than original length!");
 		return 0;
 	}
 
-	/* Compute the length of the compressed data.  */
-	compressed_len = ostream.bit_output - (u8*)compressed_data;
-
 	LZX_DEBUG("Done: compressed %u => %u bytes.",
 		  uncompressed_len, compressed_len);
 
@@ -2310,11 +2316,10 @@ wimlib_lzx_compress2(const void			* const restrict uncompressed_data,
 	    )
 	{
 		u8 buf[uncompressed_len];
-		int ret;
 
-		ret = wimlib_lzx_decompress(compressed_data, compressed_len,
-					    buf, uncompressed_len);
-		if (ret) {
+		if (wimlib_lzx_decompress(compressed_data, compressed_len,
+					  buf, uncompressed_len))
+		{
 			ERROR("Failed to decompress data we "
 			      "compressed using LZX algorithm");
 			wimlib_assert(0);
diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c
index 935c6367..0a456297 100644
--- a/src/lzx-decompress.c
+++ b/src/lzx-decompress.c
@@ -38,13 +38,13 @@
  *
  * A compressed WIM resource consists of a table of chunk offsets followed by
  * the compressed chunks themselves.  All compressed chunks except possibly the
- * last decompress to WIM_CHUNK_SIZE (= 32768) bytes.  This is quite similar to
- * the cabinet (.cab) file format, but they are not the same.  According to the
- * cabinet format documentation, the LZX block size is independent from the
- * CFDATA blocks, and a LZX block may span several CFDATA blocks.  However, in
- * WIMs, LZX blocks do not appear to ever span multiple WIM chunks.  Note that
- * this means any WIM chunk may be decompressed or compressed independently from
- * any other chunk, which is convenient.
+ * last decompress to 32768 bytes.  This is quite similar to the cabinet (.cab)
+ * file format, but they are not the same.  According to the cabinet format
+ * documentation, the LZX block size is independent from the CFDATA blocks, and
+ * a LZX block may span several CFDATA blocks.  However, in WIMs, LZX blocks do
+ * not appear to ever span multiple WIM chunks.  Note that this means any WIM
+ * chunk may be decompressed or compressed independently from any other chunk,
+ * which is convenient.
  *
  * A LZX compressed WIM chunk contains one or more LZX blocks of the aligned,
  * verbatim, or uncompressed block types.  For aligned and verbatim blocks, the
diff --git a/src/xpress-compress.c b/src/xpress-compress.c
index 6b90ba9e..5a314b69 100644
--- a/src/xpress-compress.c
+++ b/src/xpress-compress.c
@@ -36,113 +36,102 @@
 #include "wimlib/util.h"
 #include "wimlib/xpress.h"
 
-#include <stdlib.h>
 #include <string.h>
 
+/* Intermediate XPRESS match/literal representation.  */
+struct xpress_match {
+	u16 adjusted_len;  /* Match length minus XPRESS_MIN_MATCH_LEN */
+	u16 offset;        /* Match offset */
+	/* For literals, offset == 0 and adjusted_len is the literal.  */
+};
+
 /*
  * Writes @match, which is a match given in the intermediate representation for
  * XPRESS matches, to the output stream @ostream.
  *
  * @codewords and @lens provide the Huffman code that is being used.
  */
-static int
-xpress_write_match(struct output_bitstream *restrict ostream,
-		   u32 match,
+static void
+xpress_write_match(struct xpress_match match,
+		   struct output_bitstream *restrict ostream,
 		   const u16 codewords[restrict],
 		   const u8 lens[restrict])
 {
-	u32 adjusted_match_len = match & 0xffff;
-	u32 match_offset = match >> 16;
-	u32 len_hdr = min(adjusted_match_len, 0xf);
-	u32 offset_bsr = bsr32(match_offset);
-	u32 sym = len_hdr | (offset_bsr << 4) | XPRESS_NUM_CHARS;
-	int ret;
+	u8 len_hdr = min(match.adjusted_len, 0xf);
+	u8 offset_bsr = bsr32(match.offset);
+	unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
 
-	ret = bitstream_put_bits(ostream, codewords[sym], lens[sym]);
-	if (ret)
-		return ret;
+	bitstream_put_bits(ostream, codewords[sym], lens[sym]);
 
-	if (adjusted_match_len >= 0xf) {
-		u8 byte1 = min(adjusted_match_len - 0xf, 0xff);
-		ret = bitstream_put_byte(ostream, byte1);
-		if (ret)
-			return ret;
+	if (match.adjusted_len >= 0xf) {
+		u8 byte1 = min(match.adjusted_len - 0xf, 0xff);
+		bitstream_put_byte(ostream, byte1);
 		if (byte1 == 0xff) {
-			ret = bitstream_put_two_bytes(ostream, adjusted_match_len);
-			if (ret)
-				return ret;
+			bitstream_put_byte(ostream, match.adjusted_len & 0xff);
+			bitstream_put_byte(ostream, match.adjusted_len >> 8);
 		}
 	}
-	return bitstream_put_bits(ostream,
-				  match_offset ^ (1 << offset_bsr), offset_bsr);
+	bitstream_put_bits(ostream, match.offset ^ (1U << offset_bsr), offset_bsr);
 }
 
-static int
-xpress_write_compressed_literals(struct output_bitstream *ostream,
-				 const u32 match_tab[restrict],
-				 unsigned num_matches,
-				 const u16 codewords[restrict],
-				 const u8 lens[restrict])
+static void
+xpress_write_matches_and_literals(struct output_bitstream *ostream,
+				  const struct xpress_match matches[restrict],
+				  input_idx_t num_matches,
+				  const u16 codewords[restrict],
+				  const u8 lens[restrict])
 {
-	for (unsigned i = 0; i < num_matches; i++) {
-		int ret;
-		u32 match = match_tab[i];
-
-		if (match >= XPRESS_NUM_CHARS) /* match */
-			ret = xpress_write_match(ostream, match, codewords,
-						 lens);
-		else /* literal byte */
-			ret = bitstream_put_bits(ostream, codewords[match],
-						 lens[match]);
-		if (ret)
-			return ret;
+	for (input_idx_t i = 0; i < num_matches; i++) {
+		if (matches[i].offset) {
+			/* Real match  */
+			xpress_write_match(matches[i], ostream, codewords, lens);
+		} else {
+			/* Literal byte  */
+			u8 lit = matches[i].adjusted_len;
+			bitstream_put_bits(ostream, codewords[lit], lens[lit]);
+		}
 	}
-	return bitstream_put_bits(ostream, codewords[XPRESS_END_OF_DATA],
-				  lens[XPRESS_END_OF_DATA]);
+	bitstream_put_bits(ostream, codewords[XPRESS_END_OF_DATA], lens[XPRESS_END_OF_DATA]);
 }
 
-static u32
-xpress_record_literal(u8 literal, void *_freq_tab)
+struct xpress_record_ctx {
+	freq_t freqs[XPRESS_NUM_SYMBOLS];
+	struct xpress_match *matches;
+};
+
+static void
+xpress_record_literal(u8 lit, void *_ctx)
 {
-	freq_t *freq_tab = _freq_tab;
-	freq_tab[literal]++;
-	return literal;
+	struct xpress_record_ctx *ctx = _ctx;
+	ctx->freqs[lit]++;
+	*(ctx->matches++) = (struct xpress_match) { .offset = 0, .adjusted_len = lit };
 }
 
-static u32
-xpress_record_match(unsigned match_offset, unsigned match_len,
-		    void *freq_tab, void *_ignore)
+static void
+xpress_record_match(unsigned len, unsigned offset, void *_ctx)
 {
-	wimlib_assert(match_len >= XPRESS_MIN_MATCH &&
-		      match_len <= XPRESS_MAX_MATCH);
-	wimlib_assert(match_offset >= XPRESS_MIN_OFFSET &&
-		      match_offset <= XPRESS_MAX_OFFSET);
+	struct xpress_record_ctx *ctx = _ctx;
 
-	/*
-	 * The intermediate representation of XPRESS matches is as follows:
-	 *
-	 * bits    description
-	 * ----    -----------------------------------------------------------
-	 *
-	 * 16-31   match offset (XPRESS_MIN_OFFSET < x < XPRESS_MAX_OFFSET)
-	 *
-	 * 0-15    adjusted match length (0 <= x <= XPRESS_MAX_MATCH - XPRESS_MIN_MATCH)
-	 *
-	 * Literals are simply represented as themselves and can be
-	 * distinguished from matches by the fact that only literals will have
-	 * the upper three bytes completely clear. */
+	XPRESS_ASSERT(len >= XPRESS_MIN_MATCH_LEN);
+	XPRESS_ASSERT(len <= XPRESS_MAX_MATCH_LEN);
+	XPRESS_ASSERT(offset >= XPRESS_MIN_OFFSET);
+	XPRESS_ASSERT(offset <= XPRESS_MAX_OFFSET);
+
+	unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN;
+	unsigned len_hdr = min(adjusted_len, 0xf);
+	unsigned sym = XPRESS_NUM_CHARS + ((bsr32(offset) << 4) | len_hdr);
 
-	u32 adjusted_match_len = match_len - XPRESS_MIN_MATCH;
-	u32 len_hdr = min(adjusted_match_len, 0xf);
-	u32 offset_bsr = bsr32(match_offset);
-	u32 sym = len_hdr | (offset_bsr << 4) | XPRESS_NUM_CHARS;
-	((freq_t*)freq_tab)[sym]++;
-	return adjusted_match_len | (match_offset << 16);
+	XPRESS_ASSERT(sym >= XPRESS_NUM_CHARS);
+	XPRESS_ASSERT(sym < XPRESS_NUM_SYMBOLS);
+
+	ctx->freqs[sym]++;
+	*(ctx->matches++) = (struct xpress_match) { .offset = offset,
+						    .adjusted_len = adjusted_len };
 }
 
 static const struct lz_params xpress_lz_params = {
-	.min_match      = XPRESS_MIN_MATCH,
-	.max_match      = XPRESS_MAX_MATCH,
+	.min_match      = XPRESS_MIN_MATCH_LEN,
+	.max_match      = XPRESS_MAX_MATCH_LEN,
 	.good_match	= 16,
 	.nice_match     = 32,
 	.max_chain_len  = 16,
@@ -152,122 +141,91 @@ static const struct lz_params xpress_lz_params = {
 
 /* API function documented in wimlib.h  */
 WIMLIBAPI unsigned
-wimlib_xpress_compress(const void * restrict _uncompressed_data,
+wimlib_xpress_compress(const void * restrict uncompressed_data,
 		       unsigned uncompressed_len,
-		       void * restrict _compressed_data)
+		       void * restrict compressed_data)
 {
-	u8 *compressed_data = _compressed_data;
+	u8 *cptr = compressed_data;
 	struct output_bitstream ostream;
-	u32 match_tab[uncompressed_len];
-	freq_t freq_tab[XPRESS_NUM_SYMBOLS];
+
+	struct xpress_record_ctx record_ctx;
+	struct xpress_match matches[uncompressed_len];
+	u8 udata[uncompressed_len + 8];
 	u16 codewords[XPRESS_NUM_SYMBOLS];
 	u8 lens[XPRESS_NUM_SYMBOLS];
-	unsigned num_matches;
-	unsigned compressed_len;
-	unsigned i;
-	int ret;
-	u8 uncompressed_data[uncompressed_len + 8];
-
-	memcpy(uncompressed_data, _uncompressed_data, uncompressed_len);
-	memset(uncompressed_data + uncompressed_len, 0, 8);
+	input_idx_t num_matches;
+	input_idx_t compressed_len;
+	input_idx_t i;
 
-	wimlib_assert(uncompressed_len <= 32768);
-
-	/* XPRESS requires 256 bytes of overhead for the Huffman tables, so it's
-	 * impossible cannot compress 256 bytes or less of data to less than the
+	/* XPRESS requires 256 bytes of overhead for the Huffman code, so it's
+	 * impossible to compress 256 bytes or less of data to less than the
 	 * input size.
 	 *
 	 * +1 to take into account that the buffer for compressed data is 1 byte
 	 * smaller than the buffer for uncompressed data.
 	 *
 	 * +4 to take into account that init_output_bitstream() requires at
-	 * least 4 bytes of data. */
+	 * least 4 bytes of data.  */
 	if (uncompressed_len < XPRESS_NUM_SYMBOLS / 2 + 1 + 4)
 		return 0;
 
-	ZERO_ARRAY(freq_tab);
-	num_matches = lz_analyze_block(uncompressed_data, uncompressed_len,
-				       match_tab, xpress_record_match,
-				       xpress_record_literal, freq_tab,
-				       NULL, freq_tab,
-				       &xpress_lz_params);
+	/* Copy the data to a temporary buffer, but only to avoid
+	 * inconsequential accesses of uninitialized memory in
+	 * lz_analyze_block().  */
+	memcpy(udata, uncompressed_data, uncompressed_len);
+	memset(udata + uncompressed_len, 0, 8);
+
+	/* Determine match/literal sequence to divide the data into.  */
+	ZERO_ARRAY(record_ctx.freqs);
+	record_ctx.matches = matches;
+	lz_analyze_block(udata,
+			 uncompressed_len,
+			 xpress_record_match,
+			 xpress_record_literal,
+			 &record_ctx,
+			 &xpress_lz_params);
 
-	freq_tab[XPRESS_END_OF_DATA]++;
+	num_matches = (record_ctx.matches - matches);
 
+	/* Account for end of data symbol.  */
+	record_ctx.freqs[XPRESS_END_OF_DATA]++;
+
+	/* Build the Huffman code.  */
 	make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN,
-				    freq_tab, lens, codewords);
+				    record_ctx.freqs, lens, codewords);
 
-	/* IMPORTANT NOTE:
-	 *
-	 * It's tempting to output the 512 Huffman codeword lengths using the
-	 * bitstream_put_bits() function.  However, this is NOT correct because
-	 * bitstream_put_bits() will output 2 bytes at a time in little-endian
-	 * order, which is the order that is needed for the compressed literals.
-	 * However, the bytes in the lengths table are in order, so they need to
-	 * be written one at a time without using bitstream_put_bits().
-	 *
-	 * Because of this, init_output_bitstream() is not called until after
-	 * the lengths table is output.
-	 */
+	/* Output the Huffman code as a series of 512 4-bit lengths.  */
 	for (i = 0; i < XPRESS_NUM_SYMBOLS; i += 2)
-		*compressed_data++ = (lens[i] & 0xf) | (lens[i + 1] << 4);
+		*cptr++ = (lens[i] & 0xf) | (lens[i + 1] << 4);
 
-	init_output_bitstream(&ostream, compressed_data,
+	/* Output the encoded matches/literals.  */
+	init_output_bitstream(&ostream, cptr,
 			      uncompressed_len - XPRESS_NUM_SYMBOLS / 2 - 1);
+	xpress_write_matches_and_literals(&ostream, matches,
+					  num_matches, codewords, lens);
 
-	ret = xpress_write_compressed_literals(&ostream, match_tab,
-					       num_matches, codewords, lens);
-	if (ret)
+	/* Flush any pending data and get the length of the compressed data.  */
+	compressed_len = flush_output_bitstream(&ostream);
+	if (compressed_len == ~(input_idx_t)0)
 		return 0;
+	compressed_len += XPRESS_NUM_SYMBOLS / 2;
 
-	/* Flush any bits that are buffered. */
-	ret = flush_output_bitstream(&ostream);
-	if (ret)
+#if defined(ENABLE_XPRESS_DEBUG) || defined(ENABLE_VERIFY_COMPRESSION) || 1
+	/* Verify that we really get the same thing back when decompressing.  */
+	if (wimlib_xpress_decompress(compressed_data, compressed_len,
+				     udata, uncompressed_len))
+	{
+		ERROR("Failed to decompress data we "
+		      "compressed using XPRESS algorithm");
+		wimlib_assert(0);
 		return 0;
+	}
 
-	/* Assert that there are no output bytes between the ostream.output
-	 * pointer and the ostream.next_bit_output pointer.  This can only
-	 * happen if bytes had been written at the ostream.output pointer before
-	 * the last bit word was written to the stream.  But, this does not
-	 * occur since xpress_write_match() always finishes by writing some bits
-	 * (a Huffman symbol), and the bitstream was just flushed. */
-	wimlib_assert(ostream.output - ostream.next_bit_output == 2);
-
-	/* The length of the compressed data is supposed to be the value of the
-	 * ostream.output pointer before flushing, which is now the
-	 * output.next_bit_output pointer after flushing.
-	 *
-	 * There will be an extra 2 bytes at the ostream.bit_output pointer,
-	 * which is zeroed out.  (These 2 bytes may be either the last bytes in
-	 * the compressed data, in which case they are actually unnecessary, or
-	 * they may precede a number of bytes embedded into the bitstream.) */
-	if (ostream.bit_output >
-	    (const u8*)_compressed_data + uncompressed_len - 3)
+	if (memcmp(uncompressed_data, udata, uncompressed_len)) {
+		ERROR("Data we compressed using XPRESS algorithm "
+		      "didn't decompress to original");
+		wimlib_assert(0);
 		return 0;
-	*(u16*)ostream.bit_output = cpu_to_le16(0);
-	compressed_len = ostream.next_bit_output - (const u8*)_compressed_data;
-
-	wimlib_assert(compressed_len <= uncompressed_len - 1);
-
-#ifdef ENABLE_VERIFY_COMPRESSION
-	/* Verify that we really get the same thing back when decompressing. */
-	{
-		u8 buf[uncompressed_len];
-		ret = wimlib_xpress_decompress(_compressed_data, compressed_len,
-					       buf, uncompressed_len);
-		if (ret) {
-			ERROR("xpress_compress(): Failed to decompress data we "
-			      "compressed");
-			abort();
-		}
-		for (i = 0; i < uncompressed_len; i++) {
-			if (buf[i] != uncompressed_data[i]) {
-				ERROR("xpress_compress(): Data we compressed didn't "
-				      "decompress to the original data (difference at "
-				      "byte %u of %u)", i + 1, uncompressed_len);
-				abort();
-			}
-		}
 	}
 #endif
 	return compressed_len;
diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c
index cf08a836..b71f6be2 100644
--- a/src/xpress-decompress.c
+++ b/src/xpress-decompress.c
@@ -25,11 +25,10 @@
  */
 
 
-
 /*
- * The XPRESS compression format is a LZ77 and Huffman-code based algorithm.
- * That means it is quite similar to LZX compression, but XPRESS is slightly
- * simpler, so it is a little faster to compress and decompress.
+ * The XPRESS compression format is an LZ77 and Huffman-code based algorithm.
+ * That means it is fairly similar to LZX compression, but XPRESS is simpler, so
+ * it is a little faster to compress and decompress.
  *
  * The XPRESS compression format is mostly documented in a file called "[MS-XCA]
  * Xpress Compression Algorithm".  In the MSDN library, it can currently be
@@ -41,20 +40,18 @@
  * If you are already familiar with the LZ77 algorithm and Huffman coding, the
  * XPRESS format is fairly simple.  The compressed data begins with 256 bytes
  * that contain 512 4-bit integers that are the lengths of the symbols in the
- * Huffman tree used for decoding compressed literals.  This is the only Huffman
- * tree that is used for the entirety of the compressed data, and the codeword
+ * Huffman code used for match/literal headers.  In contrast with more
+ * complicated formats such as DEFLATE and LZX, this is the only Huffman code
+ * that is used for the entirety of the XPRESS compressed data, and the codeword
  * lengths are not encoded with a pretree.
  *
  * The rest of the compressed data is Huffman-encoded symbols.  Values 0 through
- * 255 are literal bytes.  Values 256 through 511 are matches and may require
- * extra bits or bytes to be read to get the match offset and match length.
- *
- * There is no notion of a "compressed block" in the XPRESS format, so in the
- * XPRESS format, each WIM chunk (32768 bytes) will always use only one Huffman
- * tree.
+ * 255 represent the corresponding literal bytes.  Values 256 through 511
+ * represent matches and may require extra bits or bytes to be read to get the
+ * match offset and match length.
  *
- * The trickiest part is probably the fact that literal bytes for match lengths
- * are encoded "separately" from the bitstream.
+ * The trickiest part is probably the way in which literal bytes for match
+ * lengths are interleaved in the bitstream.
  *
  * Also, a caveat--- according to Microsoft's documentation for XPRESS,
  *
@@ -63,12 +60,9 @@
  * 	fail during decompression if the Huffman symbol 256 is not found after
  * 	the actual data."
  *
- * This is the case for WIM files--- in we must write this extra symbol "256" at
- * the end.  Otherwise Microsoft's software will fail to decompress the
- * XPRESS-compressed data.
- *
- * However, wimlib's decompressor in this file currently does not care if this
- * extra symbol is there or not.
+ * This is the case for the implementation in WIMGAPI.  However, wimlib's
+ * decompressor in this file currently does not care if this extra symbol is
+ * there or not.
  */
 
 #ifdef HAVE_CONFIG_H
@@ -76,51 +70,49 @@
 #endif
 
 #include "wimlib.h"
-#include "wimlib/assert.h"
-#define XPRESS_DECOMP
 #include "wimlib/decompress.h"
-#include "wimlib/util.h"
 #include "wimlib/xpress.h"
 
 /*
- * Decodes a symbol @huffsym that begins an XPRESS match.
+ * Decodes a symbol @sym that begins an XPRESS match.
  *
  * The low 8 bits of the symbol are divided into:
  *
  * bits 0-3:  length header
  * bits 4-7:  index of high-order bit of match offset
  *
- * Note: taking the low 8 bits of the symbol is the same as subtracting 256, the
- * number of symbols reserved for literals.
- *
- * Returns the match length, or -1 on error.
+ * Returns the match length, or -1 if the data is invalid.
  */
 static int
-xpress_decode_match(unsigned huffsym, unsigned window_pos,
-		    unsigned window_len, u8 window[restrict],
+xpress_decode_match(unsigned sym, input_idx_t window_pos,
+		    input_idx_t window_len, u8 window[restrict],
 		    struct input_bitstream * restrict istream)
 {
-	unsigned match_len;
-	unsigned match_offset;
-	u8 match_sym = (u8)huffsym;
-	u8 len_hdr = match_sym & 0xf;
-	u8 offset_bsr = match_sym >> 4;
+
+	u8 len_hdr;
+	u8 offset_bsr;
 	int ret;
 	u8 *match_dest;
 	u8 *match_src;
 	unsigned i;
+	unsigned match_len;
+	unsigned match_offset;
+
+	sym -= XPRESS_NUM_CHARS;
+	len_hdr = sym & 0xf;
+	offset_bsr = sym >> 4;
+
+	if (bitstream_ensure_bits(istream, 16))
+		return -1;
 
-	ret = bitstream_read_bits(istream, offset_bsr, &match_offset);
-	if (ret)
-		return ret;
-	match_offset |= (1 << offset_bsr);
+	match_offset = (1U << offset_bsr) | bitstream_pop_bits(istream, offset_bsr);
 
 	if (len_hdr == 0xf) {
 		ret = bitstream_read_byte(istream);
 		if (ret < 0)
 			return ret;
 		match_len = ret;
-		if (match_len == 0xff) {
+		if (unlikely(match_len == 0xff)) {
 			ret = bitstream_read_byte(istream);
 			if (ret < 0)
 				return ret;
@@ -137,24 +129,17 @@ xpress_decode_match(unsigned huffsym, unsigned window_pos,
 	} else {
 		match_len = len_hdr;
 	}
-	match_len += XPRESS_MIN_MATCH;
+	match_len += XPRESS_MIN_MATCH_LEN;
 
-	/* Verify that the match is in the bounds of the part of the window
-	 * currently in use, then copy the source of the match to the current
-	 * position. */
 
-	if (window_pos + match_len > window_len) {
-		DEBUG("XPRESS decompression error: match of length %u "
-		      "bytes overflows window", match_len);
+	/* Verify the match is in bounds, then copy its data to the the current
+	 * position.  */
+
+	if (window_pos + match_len > window_len)
 		return -1;
-	}
 
-	if (match_offset > window_pos) {
-		DEBUG("XPRESS decompression error: match of length %u bytes "
-		      "references data before window (match_offset = %u, "
-		      "window_pos = %u)", match_len, match_offset, window_pos);
+	if (match_offset > window_pos)
 		return -1;
-	}
 
 	match_dest = window + window_pos;
 	match_src = match_dest - match_offset;
@@ -165,39 +150,44 @@ xpress_decode_match(unsigned huffsym, unsigned window_pos,
 	return match_len;
 }
 
-/* Decodes the Huffman-encoded matches and literal bytes in a block of
- * XPRESS-encoded data. */
+/* Decodes the Huffman-encoded matches and literal bytes in a region of
+ * XPRESS-encoded data.  */
 static int
-xpress_decompress_block(struct input_bitstream * restrict istream,
-			u8 uncompressed_data[restrict],
-			unsigned uncompressed_len,
-			const u8 lens[restrict],
-			const u16 decode_table[restrict])
+xpress_lz_decode(struct input_bitstream * restrict istream,
+		 u8 uncompressed_data[restrict],
+		 unsigned uncompressed_len,
+		 const u8 lens[restrict],
+		 const u16 decode_table[restrict])
 {
-	unsigned curpos;
-	unsigned huffsym;
-	int ret;
-	int match_len;
-
-	curpos = 0;
-	while (curpos < uncompressed_len) {
-		ret = read_huffsym(istream, decode_table, lens,
-				   XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS,
-				   &huffsym, XPRESS_MAX_CODEWORD_LEN);
-		if (ret)
-			return ret;
+	input_idx_t curpos;
+	unsigned match_len;
+
+	for (curpos = 0; curpos < uncompressed_len; curpos += match_len) {
+		unsigned sym;
+		int ret;
+
+		if (unlikely(bitstream_ensure_bits(istream, 16)))
+			return -1;
 
-		if (huffsym < XPRESS_NUM_CHARS) {
-			uncompressed_data[curpos++] = huffsym;
+		if (unlikely(read_huffsym(istream, decode_table, lens,
+					  XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS,
+					  &sym, XPRESS_MAX_CODEWORD_LEN)))
+			return -1;
+
+		if (sym < XPRESS_NUM_CHARS) {
+			/* Literal  */
+			uncompressed_data[curpos] = sym;
+			match_len = 1;
 		} else {
-			match_len = xpress_decode_match(huffsym,
-							curpos,
-							uncompressed_len,
-							uncompressed_data,
-							istream);
-			if (match_len < 0)
-				return match_len;
-			curpos += match_len;
+			/* Match  */
+			ret = xpress_decode_match(sym,
+						  curpos,
+						  uncompressed_len,
+						  uncompressed_data,
+						  istream);
+			if (unlikely(ret < 0))
+				return -1;
+			match_len = ret;
 		}
 	}
 	return 0;
@@ -206,48 +196,38 @@ xpress_decompress_block(struct input_bitstream * restrict istream,
 
 /* API function documented in wimlib.h  */
 WIMLIBAPI int
-wimlib_xpress_decompress(const void * restrict _compressed_data, unsigned compressed_len,
-			 void * restrict uncompressed_data, unsigned uncompressed_len)
+wimlib_xpress_decompress(const void * const restrict _compressed_data,
+			 const unsigned compressed_len,
+			 void * const restrict uncompressed_data,
+			 const unsigned uncompressed_len)
 {
+	const u8 *compressed_data = _compressed_data;
 	u8 lens[XPRESS_NUM_SYMBOLS];
+	u8 *lens_p;
 	u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]
 			_aligned_attribute(DECODE_TABLE_ALIGNMENT);
 	struct input_bitstream istream;
-	u8 *lens_p;
-	const u8 *compressed_data;
-	unsigned i;
-	int ret;
-
-	compressed_data = _compressed_data;
-	lens_p = lens;
 
-	DEBUG2("compressed_len = %d, uncompressed_len = %d",
-	       compressed_len, uncompressed_len);
-
-	/* XPRESS uses only one Huffman tree.  It contains 512 symbols, and the
+	/* XPRESS uses only one Huffman code.  It contains 512 symbols, and the
 	 * code lengths of these symbols are given literally as 4-bit integers
-	 * in the first 256 bytes of the compressed data.
-	 */
-	if (compressed_len < XPRESS_NUM_SYMBOLS / 2) {
-		DEBUG("xpress_decompress(): Compressed length too short!");
+	 * in the first 256 bytes of the compressed data.  */
+	if (compressed_len < XPRESS_NUM_SYMBOLS / 2)
 		return -1;
-	}
 
-	for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
+	lens_p = lens;
+	for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
 		*lens_p++ = compressed_data[i] & 0xf;
 		*lens_p++ = compressed_data[i] >> 4;
 	}
 
-	ret = make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS,
-					XPRESS_TABLEBITS, lens,
-					XPRESS_MAX_CODEWORD_LEN);
-	if (ret)
-		return ret;
+	if (make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS,
+				      XPRESS_TABLEBITS, lens,
+				      XPRESS_MAX_CODEWORD_LEN))
+		return -1;
 
 	init_input_bitstream(&istream, compressed_data + XPRESS_NUM_SYMBOLS / 2,
 			     compressed_len - XPRESS_NUM_SYMBOLS / 2);
 
-	return xpress_decompress_block(&istream, uncompressed_data,
-				       uncompressed_len, lens,
-				       decode_table);
+	return xpress_lz_decode(&istream, uncompressed_data,
+				uncompressed_len, lens, decode_table);
 }
-- 
2.43.0