X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzx-compress.c;h=a51cfb1b15f7d82c3d5bac490beb204ccb3cb13f;hp=c8f61099f0f92c98b9ec9ea47790fc3523eabc58;hb=41c221c509deed7dc9c2bd8eb8c7e93563b21199;hpb=60b890543d514a9857bc18758e1de956d2516160 diff --git a/src/lzx-compress.c b/src/lzx-compress.c index c8f61099..a51cfb1b 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -254,7 +254,7 @@ #define LZX_CACHE_PER_POS 8 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1)) -#define LZX_CACHE_SIZE (LZX_CACHE_LEN * sizeof(struct raw_match)) +#define LZX_CACHE_SIZE (LZX_CACHE_LEN * sizeof(struct lz_match)) #define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) /* Codewords for the LZX main, length, and aligned offset Huffman codes */ @@ -279,7 +279,7 @@ struct lzx_lens { * * If a codeword has zero frequency, it must still be assigned some nonzero cost * --- generally a high cost, since even if it gets used in the next iteration, - * it probably will not be used very times. */ + * it probably will not be used very many times. */ struct lzx_costs { u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; u8 len[LZX_LENCODE_NUM_SYMBOLS]; @@ -294,13 +294,13 @@ struct lzx_codes { /* Tables for tallying symbol frequencies in the three LZX alphabets */ struct lzx_freqs { - input_idx_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; - input_idx_t len[LZX_LENCODE_NUM_SYMBOLS]; - input_idx_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; + u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; + u32 len[LZX_LENCODE_NUM_SYMBOLS]; + u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; }; /* LZX intermediate match/literal format */ -struct lzx_match { +struct lzx_item { /* Bit Description * * 31 1 if a match, 0 if a literal. @@ -325,16 +325,16 @@ struct lzx_block_spec { int block_type; /* 0-based position in the window at which this block starts. */ - input_idx_t window_pos; + u32 window_pos; /* The number of bytes of uncompressed data this block represents. */ - input_idx_t block_size; + u32 block_size; /* The match/literal sequence for this block. */ - struct lzx_match *chosen_matches; + struct lzx_item *chosen_items; - /* The length of the @chosen_matches sequence. */ - input_idx_t num_chosen_matches; + /* The length of the @chosen_items sequence. */ + u32 num_chosen_items; /* Huffman codes for this block. */ struct lzx_codes codes; @@ -363,10 +363,10 @@ struct lzx_compressor { /* Number of bytes of data to be compressed, which is the number of * bytes of data in @window that are actually valid. */ - input_idx_t window_size; + u32 window_size; /* Allocated size of the @window. */ - input_idx_t max_window_size; + u32 max_window_size; /* Number of symbols in the main alphabet (depends on the * @max_window_size since it determines the maximum allowed offset). */ @@ -377,7 +377,7 @@ struct lzx_compressor { /* Space for the sequences of matches/literals that were chosen for each * block. */ - struct lzx_match *chosen_matches; + struct lzx_item *chosen_items; /* Information about the LZX blocks the preprocessed input was divided * into. */ @@ -397,27 +397,27 @@ struct lzx_compressor { struct lzx_costs costs; /* Fast algorithm only: Array of hash table links. */ - input_idx_t *prev_tab; + u32 *prev_tab; /* Slow algorithm only: Binary tree match-finder. */ struct lz_bt mf; /* Position in window of next match to return. */ - input_idx_t match_window_pos; + u32 match_window_pos; /* The end-of-block position. We can't allow any matches to span this * position. 
*/ - input_idx_t match_window_end; + u32 match_window_end; /* Matches found by the match-finder are cached in the following array * to achieve a slight speedup when the same matches are needed on * subsequent passes. This is suboptimal because different matches may * be preferred with different cost models, but seems to be a worthwhile * speedup. */ - struct raw_match *cached_matches; - struct raw_match *cache_ptr; + struct lz_match *cached_matches; + struct lz_match *cache_ptr; bool matches_cached; - struct raw_match *cache_limit; + struct lz_match *cache_limit; /* Match-chooser state. * When matches have been chosen, optimum_cur_idx is set to the position @@ -453,22 +453,22 @@ struct lzx_mc_pos_data { /* Position of the start of the match or literal that * was taken to get to this position in the approximate * minimum-cost parse. */ - input_idx_t link; + u32 link; /* Offset (as in an LZ (length, offset) pair) of the * match or literal that was taken to get to this * position in the approximate minimum-cost parse. */ - input_idx_t match_offset; + u32 match_offset; } prev; struct { /* Position at which the match or literal starting at * this position ends in the minimum-cost parse. */ - input_idx_t link; + u32 link; /* Offset (as in an LZ (length, offset) pair) of the * match or literal starting at this position in the * approximate minimum-cost parse. */ - input_idx_t match_offset; + u32 match_offset; } next; }; @@ -550,14 +550,14 @@ lzx_make_huffman_codes(const struct lzx_freqs *freqs, * The type of the LZX block (LZX_BLOCKTYPE_ALIGNED or * LZX_BLOCKTYPE_VERBATIM) * @match: - * The match, as a (length, offset) pair. + * The match data. * @codes: * Pointer to a structure that contains the codewords for the main, length, * and aligned offset Huffman codes for the current LZX compressed block. */ static void lzx_write_match(struct output_bitstream *out, int block_type, - struct lzx_match match, const struct lzx_codes *codes) + struct lzx_item match, const struct lzx_codes *codes) { /* low 8 bits are the match length minus 2 */ unsigned match_len_minus_2 = match.data & 0xff; @@ -643,7 +643,7 @@ static unsigned lzx_build_precode(const u8 lens[restrict], const u8 prev_lens[restrict], const unsigned num_syms, - input_idx_t precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS], + u32 precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS], u8 output_syms[restrict num_syms], u8 precode_lens[restrict LZX_PRECODE_NUM_SYMBOLS], u32 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS], @@ -812,7 +812,7 @@ lzx_write_compressed_code(struct output_bitstream *out, const u8 prev_lens[restrict], unsigned num_syms) { - input_idx_t precode_freqs[LZX_PRECODE_NUM_SYMBOLS]; + u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS]; u8 output_syms[num_syms]; u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS]; u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS]; @@ -872,31 +872,27 @@ lzx_write_compressed_code(struct output_bitstream *out, * @block_type * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or * LZX_BLOCKTYPE_VERBATIM). - * @match_tab + * @items * The array of matches/literals to output. - * @match_count - * Number of matches/literals to output (length of @match_tab). + * @num_items + * Number of matches/literals to output (length of @items). * @codes * The main, length, and aligned offset Huffman codes for the current * LZX compressed block. 
*/ static void -lzx_write_matches_and_literals(struct output_bitstream *ostream, - int block_type, - const struct lzx_match match_tab[], - unsigned match_count, - const struct lzx_codes *codes) +lzx_write_items(struct output_bitstream *ostream, int block_type, + const struct lzx_item items[], u32 num_items, + const struct lzx_codes *codes) { - for (unsigned i = 0; i < match_count; i++) { - struct lzx_match match = match_tab[i]; - + for (u32 i = 0; i < num_items; i++) { /* The high bit of the 32-bit intermediate representation * indicates whether the item is an actual LZ-style match (1) or * a literal byte (0). */ - if (match.data & 0x80000000) - lzx_write_match(ostream, block_type, match, codes); + if (items[i].data & 0x80000000) + lzx_write_match(ostream, block_type, items[i], codes); else - lzx_write_literal(ostream, match.data, codes); + lzx_write_literal(ostream, items[i].data, codes); } } @@ -943,8 +939,8 @@ lzx_write_compressed_block(int block_type, unsigned block_size, unsigned max_window_size, unsigned num_main_syms, - struct lzx_match * chosen_matches, - unsigned num_chosen_matches, + struct lzx_item * chosen_items, + unsigned num_chosen_items, const struct lzx_codes * codes, const struct lzx_codes * prev_codes, struct output_bitstream * ostream) @@ -1021,9 +1017,8 @@ lzx_write_compressed_block(int block_type, LZX_DEBUG("Writing matches and literals..."); /* Write the actual matches and literals. */ - lzx_write_matches_and_literals(ostream, block_type, - chosen_matches, num_chosen_matches, - codes); + lzx_write_items(ostream, block_type, + chosen_items, num_chosen_items, codes); LZX_DEBUG("Done writing block."); } @@ -1037,17 +1032,17 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea for (unsigned i = 0; i < ctx->num_blocks; i++) { const struct lzx_block_spec *spec = &ctx->block_specs[i]; - LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_matches=%u)...", + LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_items=%u)...", i + 1, ctx->num_blocks, spec->block_type, spec->block_size, - spec->num_chosen_matches); + spec->num_chosen_items); lzx_write_compressed_block(spec->block_type, spec->block_size, ctx->max_window_size, ctx->num_main_syms, - spec->chosen_matches, - spec->num_chosen_matches, + spec->chosen_items, + spec->num_chosen_items, &spec->codes, prev_codes, ostream); @@ -1058,7 +1053,7 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea /* Constructs an LZX match from a literal byte and updates the main code symbol * frequencies. */ -static u32 +static inline u32 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs) { freqs->main[lit]++; @@ -1069,7 +1064,7 @@ lzx_tally_literal(u8 lit, struct lzx_freqs *freqs) * queue and the frequency of symbols in the main, length, and aligned offset * alphabets. The return value is a 32-bit number that provides the match in an * intermediate representation documented below. */ -static u32 +static inline u32 lzx_tally_match(unsigned match_len, u32 match_offset, struct lzx_freqs *freqs, struct lzx_lru_queue *queue) { @@ -1121,7 +1116,7 @@ lzx_tally_match(unsigned match_len, u32 match_offset, freqs->aligned[position_footer & 7]++; /* Pack the position slot, position footer, and match length into an - * intermediate representation. See `struct lzx_match' for details. + * intermediate representation. See `struct lzx_item' for details. 
*/
 	LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
 	LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
@@ -1139,7 +1134,7 @@ struct lzx_record_ctx {
 	struct lzx_freqs freqs;
 	struct lzx_lru_queue queue;
-	struct lzx_match *matches;
+	struct lzx_item *items;
 };
 
 static void
@@ -1147,7 +1142,7 @@ lzx_record_match(unsigned len, unsigned offset, void *_ctx)
 {
 	struct lzx_record_ctx *ctx = _ctx;
 
-	(ctx->matches++)->data = lzx_tally_match(len, offset, &ctx->freqs, &ctx->queue);
+	(ctx->items++)->data = lzx_tally_match(len, offset, &ctx->freqs, &ctx->queue);
 }
 
 static void
@@ -1155,7 +1150,7 @@ lzx_record_literal(u8 lit, void *_ctx)
 {
 	struct lzx_record_ctx *ctx = _ctx;
 
-	(ctx->matches++)->data = lzx_tally_literal(lit, &ctx->freqs);
+	(ctx->items++)->data = lzx_tally_literal(lit, &ctx->freqs);
 }
 
 /* Returns the cost, in bits, to output a literal byte using the specified cost
@@ -1170,7 +1165,7 @@ lzx_literal_cost(u8 c, const struct lzx_costs * costs)
  * well as costs for the codewords in the main, length, and aligned Huffman
  * codes, return the approximate number of bits it will take to represent this
  * match in the compressed output. Take into account the match offset LRU
- * queue and optionally update it. */
+ * queue and also update it. */
 static u32
 lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs,
 	       struct lzx_lru_queue *queue)
@@ -1248,10 +1243,10 @@ lzx_set_costs(struct lzx_compressor * ctx, const struct lzx_lens * lens)
  * value is the number of matches found. */
 static unsigned
 lzx_get_matches(struct lzx_compressor *ctx,
-		const struct raw_match **matches_ret)
+		const struct lz_match **matches_ret)
 {
-	struct raw_match *cache_ptr;
-	struct raw_match *matches;
+	struct lz_match *cache_ptr;
+	struct lz_match *matches;
 	unsigned num_matches;
 
 	LZX_ASSERT(ctx->match_window_pos < ctx->match_window_end);
@@ -1324,7 +1319,7 @@ lzx_get_matches(struct lzx_compressor *ctx,
 static void
 lzx_skip_bytes(struct lzx_compressor *ctx, unsigned n)
 {
-	struct raw_match *cache_ptr;
+	struct lz_match *cache_ptr;
 
 	LZX_ASSERT(n <= ctx->match_window_end - ctx->match_window_pos);
 
@@ -1349,7 +1344,7 @@ lzx_skip_bytes(struct lzx_compressor *ctx, unsigned n)
 *
 * Returns the first match in the list.
 */
-static struct raw_match
+static struct lz_match
 lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos)
 {
 	unsigned prev_link, saved_prev_link;
@@ -1375,7 +1370,7 @@ lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos)
 
 	ctx->optimum_cur_idx = ctx->optimum[0].next.link;
 
-	return (struct raw_match)
+	return (struct lz_match)
 		{ .len = ctx->optimum_cur_idx,
 		  .offset = ctx->optimum[0].next.match_offset,
 		};
@@ -1443,13 +1438,12 @@ lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos)
 * The return value is a (length, offset) pair specifying the match or literal
 * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
 */
-static struct raw_match
+static struct lz_match
 lzx_get_near_optimal_match(struct lzx_compressor *ctx)
 {
 	unsigned num_matches;
-	const struct raw_match *matches;
-	const struct raw_match *matchptr;
-	struct raw_match match;
+	const struct lz_match *matches;
+	struct lz_match match;
 	unsigned longest_len;
 	unsigned longest_rep_len;
 	u32 longest_rep_offset;
@@ -1494,7 +1488,7 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
 
 	/* If there's a long match with a recent offset, take it. 
*/ if (longest_rep_len >= ctx->params.alg_params.slow.nice_match_length) { lzx_skip_bytes(ctx, longest_rep_len); - return (struct raw_match) { + return (struct lz_match) { .len = longest_rep_len, .offset = longest_rep_offset, }; @@ -1522,18 +1516,54 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx) ctx->optimum[1].prev.link = 0; /* Calculate the cost to reach any position up to and including that - * reached by the longest match. */ - matchptr = matches; - for (unsigned len = 2; len <= longest_len; len++) { - u32 offset = matchptr->offset; - - ctx->optimum[len].queue = ctx->queue; - ctx->optimum[len].prev.link = 0; - ctx->optimum[len].prev.match_offset = offset; - ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs, - &ctx->optimum[len].queue); - if (len == matchptr->len) - matchptr++; + * reached by the longest match. + * + * Note: We consider only the lowest-offset match that reaches each + * position. + * + * Note: Some of the cost calculation stays the same for each offset, + * regardless of how many lengths it gets used for. Therefore, to + * improve performance, we hand-code the cost calculation instead of + * calling lzx_match_cost() to do a from-scratch cost evaluation at each + * length. */ + for (unsigned i = 0, len = 2; i < num_matches; i++) { + u32 offset; + struct lzx_lru_queue queue; + u32 position_cost; + unsigned position_slot; + unsigned num_extra_bits; + + offset = matches[i].offset; + queue = ctx->queue; + position_cost = 0; + + position_slot = lzx_get_position_slot(offset, &queue); + num_extra_bits = lzx_get_num_extra_bits(position_slot); + if (num_extra_bits >= 3) { + position_cost += num_extra_bits - 3; + position_cost += ctx->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7]; + } else { + position_cost += num_extra_bits; + } + + do { + unsigned len_header; + unsigned main_symbol; + u32 cost; + + cost = position_cost; + + len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS); + main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS; + cost += ctx->costs.main[main_symbol]; + if (len_header == LZX_NUM_PRIMARY_LENS) + cost += ctx->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS]; + + ctx->optimum[len].queue = queue; + ctx->optimum[len].prev.link = 0; + ctx->optimum[len].prev.match_offset = offset; + ctx->optimum[len].cost = cost; + } while (++len <= matches[i].len); } end_pos = longest_len; @@ -1683,25 +1713,59 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx) ctx->optimum[cur_pos + 1].prev.link = cur_pos; } - /* Consider coding a match. */ - matchptr = matches; - for (unsigned len = 2; len <= longest_len; len++) { + /* Consider coding a match. + * + * The hard-coded cost calculation is done for the same reason + * stated in the comment for the similar loop earlier. + * Actually, it is *this* one that has the biggest effect on + * performance; overall LZX compression is > 10% faster with + * this code compared to calling lzx_match_cost() with each + * length. 
*/ + for (unsigned i = 0, len = 2; i < num_matches; i++) { u32 offset; struct lzx_lru_queue queue; + u32 position_cost; + unsigned position_slot; + unsigned num_extra_bits; - offset = matchptr->offset; + offset = matches[i].offset; queue = ctx->optimum[cur_pos].queue; - - cost = ctx->optimum[cur_pos].cost + - lzx_match_cost(len, offset, &ctx->costs, &queue); - if (cost < ctx->optimum[cur_pos + len].cost) { - ctx->optimum[cur_pos + len].queue = queue; - ctx->optimum[cur_pos + len].prev.link = cur_pos; - ctx->optimum[cur_pos + len].prev.match_offset = offset; - ctx->optimum[cur_pos + len].cost = cost; + position_cost = ctx->optimum[cur_pos].cost; + + position_slot = lzx_get_position_slot(offset, &queue); + num_extra_bits = lzx_get_num_extra_bits(position_slot); + if (num_extra_bits >= 3) { + position_cost += num_extra_bits - 3; + position_cost += ctx->costs.aligned[ + (offset + LZX_OFFSET_OFFSET) & 7]; + } else { + position_cost += num_extra_bits; } - if (len == matchptr->len) - matchptr++; + + do { + unsigned len_header; + unsigned main_symbol; + u32 cost; + + cost = position_cost; + + len_header = min(len - LZX_MIN_MATCH_LEN, + LZX_NUM_PRIMARY_LENS); + main_symbol = ((position_slot << 3) | len_header) + + LZX_NUM_CHARS; + cost += ctx->costs.main[main_symbol]; + if (len_header == LZX_NUM_PRIMARY_LENS) { + cost += ctx->costs.len[len - + LZX_MIN_MATCH_LEN - + LZX_NUM_PRIMARY_LENS]; + } + if (cost < ctx->optimum[cur_pos + len].cost) { + ctx->optimum[cur_pos + len].queue = queue; + ctx->optimum[cur_pos + len].prev.link = cur_pos; + ctx->optimum[cur_pos + len].prev.match_offset = offset; + ctx->optimum[cur_pos + len].cost = cost; + } + } while (++len <= matches[i].len); } if (longest_rep_len >= LZX_MIN_MATCH_LEN) { @@ -1788,61 +1852,82 @@ lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec, const struct lzx_lru_queue orig_queue = ctx->queue; unsigned num_passes_remaining = num_passes; struct lzx_freqs freqs; + const u8 *window_ptr; + const u8 *window_end; + struct lzx_item *next_chosen_match; + struct lz_match lz_match; + struct lzx_item lzx_item; LZX_ASSERT(num_passes >= 1); LZX_ASSERT(lz_bt_get_position(&ctx->mf) == spec->window_pos); ctx->match_window_end = spec->window_pos + spec->block_size; - spec->chosen_matches = &ctx->chosen_matches[spec->window_pos]; ctx->matches_cached = false; /* The first optimal parsing pass is done using the cost model already * set in ctx->costs. Each later pass is done using a cost model - * computed from the previous pass. */ - do { - const u8 *window_ptr; - const u8 *window_end; - struct lzx_match *next_chosen_match; + * computed from the previous pass. + * + * To improve performance we only generate the array containing the + * matches and literals in intermediate form on the final pass. 
*/ - --num_passes_remaining; + while (--num_passes_remaining) { ctx->match_window_pos = spec->window_pos; ctx->cache_ptr = ctx->cached_matches; memset(&freqs, 0, sizeof(freqs)); window_ptr = &ctx->window[spec->window_pos]; window_end = window_ptr + spec->block_size; - next_chosen_match = spec->chosen_matches; while (window_ptr != window_end) { - struct raw_match raw_match; - struct lzx_match lzx_match; - raw_match = lzx_get_near_optimal_match(ctx); + lz_match = lzx_get_near_optimal_match(ctx); - LZX_ASSERT(!(raw_match.len == LZX_MIN_MATCH_LEN && - raw_match.offset == ctx->max_window_size - + LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN && + lz_match.offset == ctx->max_window_size - LZX_MIN_MATCH_LEN)); - if (raw_match.len >= LZX_MIN_MATCH_LEN) { - lzx_match.data = lzx_tally_match(raw_match.len, - raw_match.offset, - &freqs, - &ctx->queue); - window_ptr += raw_match.len; + if (lz_match.len >= LZX_MIN_MATCH_LEN) { + lzx_tally_match(lz_match.len, lz_match.offset, + &freqs, &ctx->queue); + window_ptr += lz_match.len; } else { - lzx_match.data = lzx_tally_literal(*window_ptr, - &freqs); + lzx_tally_literal(*window_ptr, &freqs); window_ptr += 1; } - *next_chosen_match++ = lzx_match; } - spec->num_chosen_matches = next_chosen_match - spec->chosen_matches; lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms); - if (num_passes_remaining) { - lzx_set_costs(ctx, &spec->codes.lens); - ctx->queue = orig_queue; - ctx->matches_cached = true; - } - } while (num_passes_remaining); + lzx_set_costs(ctx, &spec->codes.lens); + ctx->queue = orig_queue; + ctx->matches_cached = true; + } + ctx->match_window_pos = spec->window_pos; + ctx->cache_ptr = ctx->cached_matches; + memset(&freqs, 0, sizeof(freqs)); + window_ptr = &ctx->window[spec->window_pos]; + window_end = window_ptr + spec->block_size; + + spec->chosen_items = &ctx->chosen_items[spec->window_pos]; + next_chosen_match = spec->chosen_items; + + while (window_ptr != window_end) { + lz_match = lzx_get_near_optimal_match(ctx); + + LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN && + lz_match.offset == ctx->max_window_size - + LZX_MIN_MATCH_LEN)); + if (lz_match.len >= LZX_MIN_MATCH_LEN) { + lzx_item.data = lzx_tally_match(lz_match.len, + lz_match.offset, + &freqs, &ctx->queue); + window_ptr += lz_match.len; + } else { + lzx_item.data = lzx_tally_literal(*window_ptr, &freqs); + window_ptr += 1; + } + *next_chosen_match++ = lzx_item; + } + spec->num_chosen_items = next_chosen_match - spec->chosen_items; + lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms); spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes); } @@ -1893,7 +1978,7 @@ lzx_prepare_blocks(struct lzx_compressor * ctx) * * ctx->block_specs[] * ctx->num_blocks - * ctx->chosen_matches[] + * ctx->chosen_items[] */ static void lzx_prepare_block_fast(struct lzx_compressor * ctx) @@ -1919,7 +2004,7 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx) /* Initialize symbol frequencies and match offset LRU queue. */ memset(&record_ctx.freqs, 0, sizeof(struct lzx_freqs)); lzx_lru_queue_init(&record_ctx.queue); - record_ctx.matches = ctx->chosen_matches; + record_ctx.items = ctx->chosen_items; /* Determine series of matches/literals to output. 
*/
*/ lz_analyze_block(ctx->window, @@ -1935,8 +2020,8 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx) spec->block_type = LZX_BLOCKTYPE_ALIGNED; spec->window_pos = 0; spec->block_size = ctx->window_size; - spec->num_chosen_matches = (record_ctx.matches - ctx->chosen_matches); - spec->chosen_matches = ctx->chosen_matches; + spec->num_chosen_items = record_ctx.items - ctx->chosen_items; + spec->chosen_items = ctx->chosen_items; lzx_make_huffman_codes(&record_ctx.freqs, &spec->codes, ctx->num_main_syms); ctx->num_blocks = 1; @@ -1996,7 +2081,7 @@ lzx_compress(const void *uncompressed_data, size_t uncompressed_size, LZX_DEBUG("Flushing bitstream..."); compressed_size = flush_output_bitstream(&ostream); - if (compressed_size == ~(input_idx_t)0) { + if (compressed_size == (u32)~0UL) { LZX_DEBUG("Data did not compress to %zu bytes or less!", compressed_size_avail); return 0; @@ -2056,7 +2141,7 @@ lzx_free_compressor(void *_ctx) struct lzx_compressor *ctx = _ctx; if (ctx) { - FREE(ctx->chosen_matches); + FREE(ctx->chosen_items); FREE(ctx->cached_matches); FREE(ctx->optimum); lz_bt_destroy(&ctx->mf); @@ -2130,8 +2215,6 @@ lzx_create_compressor(size_t window_size, if (!lzx_window_size_valid(window_size)) return WIMLIB_ERR_INVALID_PARAM; - LZX_DEBUG("Allocating memory."); - ctx = CALLOC(1, sizeof(struct lzx_compressor)); if (ctx == NULL) goto oom; @@ -2172,7 +2255,7 @@ lzx_create_compressor(size_t window_size, min(params->alg_params.slow.nice_match_length, LZX_MAX_MATCH_LEN)) * sizeof(ctx->optimum[0])); - if (!ctx->optimum) + if (ctx->optimum == NULL) goto oom; } @@ -2184,8 +2267,8 @@ lzx_create_compressor(size_t window_size, LZX_CACHE_LEN - (LZX_MAX_MATCHES_PER_POS + 1); } - ctx->chosen_matches = MALLOC(window_size * sizeof(ctx->chosen_matches[0])); - if (ctx->chosen_matches == NULL) + ctx->chosen_items = MALLOC(window_size * sizeof(ctx->chosen_items[0])); + if (ctx->chosen_items == NULL) goto oom; memcpy(&ctx->params, params, sizeof(struct wimlib_lzx_compressor_params)); @@ -2217,7 +2300,7 @@ lzx_get_needed_memory(size_t max_block_size, sizeof(((struct lzx_compressor*)0)->block_specs[0]); if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) { - size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_matches[0]); + size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_items[0]); size += lz_bt_get_needed_memory(max_block_size); size += (LZX_OPTIM_ARRAY_SIZE + min(params->alg_params.slow.nice_match_length,
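
The two sketches below are illustrative notes, not part of the patch.

First, the 32-bit intermediate representation stored in `struct lzx_item'.
The hunks above show only part of the layout: bit 31 is the match/literal
flag, lzx_write_match() reads the match length minus 2 from the low 8 bits,
and the two LZX_ASSERT()s bound the position slot to 6 bits and the position
footer to 17 bits, so slot, footer, and length exactly fill the remaining 31
bits. pack_match() below is a hypothetical helper sketching that packing
under those assumptions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LZX_MIN_MATCH_LEN 2

/* Pack a match in the assumed lzx_item layout:
 *   bit  31     1 (match, not literal)
 *   bits 30-25  position slot   (< 64 by the first assertion)
 *   bits 24-8   position footer (< 2^17 by the second assertion)
 *   bits 7-0    match length minus 2                             */
static uint32_t
pack_match(unsigned position_slot, uint32_t position_footer, unsigned len)
{
	assert(position_slot < 64);
	assert(position_footer < ((uint32_t)1 << 17));
	assert(len >= LZX_MIN_MATCH_LEN && len - LZX_MIN_MATCH_LEN < 256);
	return 0x80000000 |
	       ((uint32_t)position_slot << 25) |
	       (position_footer << 8) |
	       (len - LZX_MIN_MATCH_LEN);
}

int
main(void)
{
	uint32_t data = pack_match(5, 1, 10);

	/* Decode the same fields back out, mirroring the high-bit test
	 * that lzx_write_items() performs on each item. */
	if (data & 0x80000000) {
		printf("match: slot=%u footer=%u len=%u\n",
		       (data >> 25) & 0x3f,
		       (data >> 8) & 0x1ffff,
		       (data & 0xff) + LZX_MIN_MATCH_LEN);
	} else {
		printf("literal: 0x%02x\n", data & 0xff);
	}
	return 0;
}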
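Second, the cost split that the hand-coded do/while loops above exploit: for
a fixed offset, the position-slot and extra-bits cost is constant, so it is
computed once per offset ('position_cost') and only the main-symbol (and, for
long matches, length-symbol) cost is added per length. In this sketch,
get_position_slot() and get_num_extra_bits() are simplified stand-ins that
ignore the repeat-offset (LRU) queue and the 17-bit cap on extra bits, and
the cost tables are hypothetical zero-filled placeholders:

#include <stdint.h>
#include <stdio.h>

#define LZX_MIN_MATCH_LEN    2
#define LZX_NUM_PRIMARY_LENS 7
#define LZX_NUM_CHARS        256
#define LZX_OFFSET_OFFSET    2

/* Hypothetical per-symbol costs in bits; the real code derives these
 * from the Huffman code lengths of the previous pass. */
static uint8_t main_costs[LZX_NUM_CHARS + 64 * 8];
static uint8_t len_costs[249];
static uint8_t aligned_costs[8];

/* Slot for a formatted offset, ignoring repeat-offset slots. */
static unsigned
get_position_slot(uint32_t formatted_offset)
{
	if (formatted_offset < 4)
		return formatted_offset;
	unsigned n = 31 - __builtin_clz(formatted_offset); /* GCC/Clang builtin */
	return (n << 1) | ((formatted_offset >> (n - 1)) & 1);
}

static unsigned
get_num_extra_bits(unsigned slot)
{
	return (slot < 2) ? 0 : (slot >> 1) - 1; /* real code caps this at 17 */
}

static uint32_t
match_cost(unsigned len, uint32_t offset)
{
	uint32_t formatted = offset + LZX_OFFSET_OFFSET;
	unsigned slot = get_position_slot(formatted);
	unsigned extra = get_num_extra_bits(slot);
	uint32_t cost = 0;
	unsigned len_header;

	/* Offset-dependent part: in the patch this is 'position_cost',
	 * computed once per offset outside the do/while loop.  The low 3
	 * extra bits are costed via the aligned offset code when present. */
	if (extra >= 3)
		cost += (extra - 3) + aligned_costs[formatted & 7];
	else
		cost += extra;

	/* Length-dependent part: recomputed for each length in the loop. */
	len_header = len - LZX_MIN_MATCH_LEN;
	if (len_header > LZX_NUM_PRIMARY_LENS)
		len_header = LZX_NUM_PRIMARY_LENS;
	cost += main_costs[((slot << 3) | len_header) + LZX_NUM_CHARS];
	if (len_header == LZX_NUM_PRIMARY_LENS)
		cost += len_costs[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
	return cost;
}

int
main(void)
{
	printf("cost(len=10, offset=100) = %u bits\n", match_cost(10, 100));
	return 0;
}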