From: Eric Biggers <ebiggers3@gmail.com>
Date: Thu, 2 Jan 2014 01:54:32 +0000 (-0600)
Subject: Update LZMS compressor
X-Git-Tag: v1.6.0~27
X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=62e209e9aeaa36ba9e3c2a174428805b7264e0e7

Update LZMS compressor
---

diff --git a/doc/imagex-capture.1.in b/doc/imagex-capture.1.in
index bee9b48b..deefd852 100644
--- a/doc/imagex-capture.1.in
+++ b/doc/imagex-capture.1.in
@@ -199,8 +199,9 @@ used in packed resources created if the \fB--pack-streams\fR option is
 specified.
 .TP
 \fB--compress-slow\fR
-Like \fB--compress\fR=\fImaximum\fR, but spend even more time compressing the
-data to achieve a very slightly better compression ratio.
+Spend even more time compressing the data to achieve a very slightly better
+compression ratio.  This currently only has an effect for LZX ("maximum", the
+default) and LZMS ("recovery") compression.
 .TP
 \fB--chunk-size\fR=\fISIZE\fR
 Set the WIM compression chunk size to \fISIZE\fR.  Using this option is not
diff --git a/doc/imagex-optimize.1.in b/doc/imagex-optimize.1.in
index 234b3310..d718f55e 100644
--- a/doc/imagex-optimize.1.in
+++ b/doc/imagex-optimize.1.in
@@ -38,14 +38,19 @@ better compression than Microsoft's, but is slightly slower.
 .TP
 \fB--recompress-slow\fR, \fB--compress-slow\fR
 Spend even more time compressing the data in order to achieve a more optimal
-compression ratio.  Compared to the default \fB--recompress\fR, this will make
-compression about twice as slow and will increase the compression ratio by maybe
-1%, depending on the data.  This option implies \fB--recompress\fR and
-\fB--compress\fR=\fImaximum\fR (recompress using LZX compression).
+compression ratio.  For LZX ("maximum") compression, compared to the default
+\fB--recompress\fR this will make compression about twice as slow and may
+improve the compression ratio by maybe 1%, depending on the data.  For LZMS
+("recovery") compression this option also has an effect.  For XPRESS ("fast")
+compression this option has no effect; however you may use \fB--compress\fR=LZX
+\fB--recompress-slow\fR to change the compression type to LZX and recompress
+slowly, as per this option.  In any case, this option implies
+\fB--recompress\fR.
 .TP
 \fB--compress\fR=\fITYPE\fR
 Recompress the WIM file using the specified compression type.  \fITYPE\fR may be
-"none", "fast", or "maximum".  This implies \fB--recompress\fR.
+"none", "fast" (or "XPRESS"), or "maximum" (or "LZX").  This implies
+\fB--recompress\fR.
 .IP ""
 \fITYPE\fR may also be "recovery" (or "LZMS"); however, this will result in
 reduced compatibility.  See the documentation for this option to
diff --git a/include/wimlib.h b/include/wimlib.h
index c0aaf13b..372bb760 100644
--- a/include/wimlib.h
+++ b/include/wimlib.h
@@ -4061,6 +4061,42 @@ struct wimlib_lzx_compressor_params {
 	} alg_params;
 };
 
+/** LZMS compression parameters that can optionally be passed to
+ * wimlib_create_compressor() with the compression type
+ * ::WIMLIB_COMPRESSION_TYPE_LZMS.  */
+struct wimlib_lzms_compressor_params {
+	/** hdr.size Must be set to the size of this structure, in bytes.  */
+	struct wimlib_compressor_params_header hdr;
+
+	/** Minimum match length to output.  This must be at least 2.  Suggested
+	 * value: 2  */
+	uint32_t min_match_length;
+
+	/** Maximum match length to output.  This must be at least @p
+	 * min_match_length.  Suggested value: @p UINT32_MAX.  */
+	uint32_t max_match_length;
+
+	/** Matches with length (in bytes) greater than or equal to this value
+	 * are immediately taken without spending time on minimum-cost
+	 * measurements.  The minimum of @p max_match_length and @p
+	 * nice_match_length may not exceed 65536.  Suggested value: 32.  */
+	uint32_t nice_match_length;
+
+	/** Maximum depth to search for matches at each position.  Suggested
+	 * value: 50.  */
+	uint32_t max_search_depth;
+
+	/** Maximum number of potentially good matches to consider at each
+	 * position.  Suggested value: 3.  */
+	uint32_t max_matches_per_pos;
+
+	/** Length of the array for the near-optimal LZ parsing algorithm.  This
+	 * must be at least 1.  Suggested value: 1024.  */
+	uint32_t optim_array_length;
+
+	uint64_t reserved2[4];
+};
+
 /** Opaque compressor handle.  */
 struct wimlib_compressor;
 
diff --git a/include/wimlib/lz_optimal.h b/include/wimlib/lz_optimal.h
index f352ea47..32bbb8b5 100644
--- a/include/wimlib/lz_optimal.h
+++ b/include/wimlib/lz_optimal.h
@@ -204,7 +204,8 @@ lz_match_chooser_reverse_list(struct lz_match_chooser *mc, input_idx_t cur_pos)
  * be the number of matches found (which may be 0) and a pointer to the returned
  * matches must be written into @matches_ret.  Matches must be of distinct
  * lengths and sorted in decreasing order by length.  Furthermore, match lengths
- * may not exceed the @max_match_len passed to lz_match_chooser_init().  */
+ * may not exceed the @max_match_len passed to lz_match_chooser_init(), and all
+ * match lengths must be at least 2.  */
 typedef u32 (*lz_get_matches_t)(LZ_COMPRESSOR *ctx,
 				const LZ_ADAPTIVE_STATE *state,
 				struct raw_match **matches_ret);
diff --git a/programs/imagex.c b/programs/imagex.c
index 0a0934d9..d2f58b2f 100644
--- a/programs/imagex.c
+++ b/programs/imagex.c
@@ -434,11 +434,11 @@ get_compression_type(const tchar *optarg)
 	}
 }
 
-static int
+static void
 set_compress_slow(void)
 {
 	int ret;
-	static const struct wimlib_lzx_compressor_params slow_params = {
+	static const struct wimlib_lzx_compressor_params lzx_slow_params = {
 		.hdr = {
 			.size = sizeof(struct wimlib_lzx_compressor_params),
 		},
@@ -456,11 +456,24 @@ set_compress_slow(void)
 			},
 		},
 	};
-	ret = wimlib_set_default_compressor_params(WIMLIB_COMPRESSION_TYPE_LZX,
-						   &slow_params.hdr);
-	if (ret)
-		imagex_error(T("Couldn't set slow compression parameters.!"));
-	return ret;
+
+	static const struct wimlib_lzms_compressor_params lzms_slow_params = {
+		.hdr = {
+			.size = sizeof(struct wimlib_lzms_compressor_params),
+		},
+		.min_match_length = 2,
+		.max_match_length = UINT32_MAX,
+		.nice_match_length = 96,
+		.max_search_depth = 100,
+		.max_matches_per_pos = 10,
+		.optim_array_length = 1024,
+	};
+
+	wimlib_set_default_compressor_params(WIMLIB_COMPRESSION_TYPE_LZX,
+					     &lzx_slow_params.hdr);
+
+	wimlib_set_default_compressor_params(WIMLIB_COMPRESSION_TYPE_LZMS,
+					     &lzms_slow_params.hdr);
 }
 
 struct string_set {
@@ -1768,6 +1781,7 @@ imagex_capture_or_append(int argc, tchar **argv, int cmd)
 	struct wimlib_capture_source *capture_sources;
 	size_t num_sources;
 	bool name_defaulted;
+	bool compress_slow = false;
 
 	for_opt(c, capture_or_append_options) {
 		switch (c) {
@@ -1791,10 +1805,7 @@ imagex_capture_or_append(int argc, tchar **argv, int cmd)
 				goto out_err;
 			break;
 		case IMAGEX_COMPRESS_SLOW_OPTION:
-			ret = set_compress_slow();
-			if (ret)
-				goto out_err;
-			compression_type = WIMLIB_COMPRESSION_TYPE_LZX;
+			compress_slow = true;
 			break;
 		case IMAGEX_CHUNK_SIZE_OPTION:
 			chunk_size = parse_chunk_size(optarg);
@@ -1888,19 +1899,26 @@ imagex_capture_or_append(int argc, tchar **argv, int cmd)
 	source = argv[0];
 	wimfile = argv[1];
 
-	/* Set default compression type.  */
+	/* Set default compression type and parameters.  */
+
+
 	if (compression_type == WIMLIB_COMPRESSION_TYPE_INVALID) {
-		struct wimlib_lzx_compressor_params params;
-		memset(&params, 0, sizeof(params));
-		params.hdr.size = sizeof(params);
-		params.algorithm = WIMLIB_LZX_ALGORITHM_FAST;
-		params.use_defaults = 1;
-
-		wimlib_set_default_compressor_params(WIMLIB_COMPRESSION_TYPE_LZX,
-						     &params.hdr);
 		compression_type = WIMLIB_COMPRESSION_TYPE_LZX;
+
+		if (!compress_slow) {
+			struct wimlib_lzx_compressor_params params = {
+				.hdr.size = sizeof(params),
+				.algorithm = WIMLIB_LZX_ALGORITHM_FAST,
+				.use_defaults = 1,
+			};
+			wimlib_set_default_compressor_params(WIMLIB_COMPRESSION_TYPE_LZX,
+							     &params.hdr);
+		}
 	}
 
+	if (compress_slow)
+		set_compress_slow();
+
 	if (!tstrcmp(wimfile, T("-"))) {
 		/* Writing captured WIM to standard output.  */
 	#if 0
@@ -3415,10 +3433,7 @@ imagex_optimize(int argc, tchar **argv, int cmd)
 			break;
 		case IMAGEX_COMPRESS_SLOW_OPTION:
 			write_flags |= WIMLIB_WRITE_FLAG_RECOMPRESS;
-			compression_type = WIMLIB_COMPRESSION_TYPE_LZX;
-			ret = set_compress_slow();
-			if (ret)
-				goto out_err;
+			set_compress_slow();
 			break;
 		case IMAGEX_CHUNK_SIZE_OPTION:
 			chunk_size = parse_chunk_size(optarg);
diff --git a/src/lzms-common.c b/src/lzms-common.c
index 03b70736..3609e5a5 100644
--- a/src/lzms-common.c
+++ b/src/lzms-common.c
@@ -47,6 +47,7 @@ u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
 unsigned
 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
 {
+	/* TODO:  Speed this up.  */
 	unsigned slot = 0;
 
 	while (slot_base_tab[slot + 1] <= value)
diff --git a/src/lzms-compress.c b/src/lzms-compress.c
index b98ee760..6df10806 100644
--- a/src/lzms-compress.c
+++ b/src/lzms-compress.c
@@ -46,6 +46,7 @@
 
 #include <string.h>
 #include <limits.h>
+#include <pthread.h>
 
 #define LZMS_OPTIM_ARRAY_SIZE	1024
 
@@ -55,6 +56,7 @@ struct lzms_adaptive_state {
 	u8 main_state;
 	u8 match_state;
 	u8 lz_match_state;
+	u8 lz_repeat_match_state[LZMS_NUM_RECENT_OFFSETS - 1];
 };
 #define LZ_ADAPTIVE_STATE struct lzms_adaptive_state
 #define LZ_COMPRESSOR	  struct lzms_compressor
@@ -184,9 +186,11 @@ struct lzms_compressor {
 	/* Size of the data in @buffer.  */
 	u32 window_size;
 
+#if 0
 	/* Temporary array used by lz_analyze_block(); must be at least as long
 	 * as the window.  */
 	u32 *prev_tab;
+#endif
 
 	/* Suffix array match-finder.  */
 	struct lz_sarray lz_sarray;
@@ -434,7 +438,11 @@ static void
 lzms_huffman_encode_symbol(struct lzms_huffman_encoder *enc, u32 sym)
 {
 	LZMS_ASSERT(sym < enc->num_syms);
-	if (enc->num_syms_written == enc->rebuild_freq) {
+	lzms_output_bitstream_put_bits(enc->os,
+				       enc->codewords[sym],
+				       enc->lens[sym]);
+	++enc->sym_freqs[sym];
+	if (++enc->num_syms_written == enc->rebuild_freq) {
 		/* Adaptive code needs to be rebuilt.  */
 		LZMS_DEBUG("Rebuilding code (num_syms=%u)", enc->num_syms);
 		make_canonical_huffman_code(enc->num_syms,
@@ -450,11 +458,6 @@ lzms_huffman_encode_symbol(struct lzms_huffman_encoder *enc, u32 sym)
 		}
 		enc->num_syms_written = 0;
 	}
-	lzms_output_bitstream_put_bits(enc->os,
-				       enc->codewords[sym],
-				       enc->lens[sym]);
-	++enc->num_syms_written;
-	++enc->sym_freqs[sym];
 }
 
 /* Encode a number as a Huffman symbol specifying a slot, plus a number of
@@ -524,15 +527,17 @@ lzms_encode_lz_match(struct lzms_compressor *ctx, u32 length, u32 offset)
 {
 	int recent_offset_idx;
 
+	LZMS_DEBUG("Position %u: Encoding LZ match {length=%u, offset=%u}",
+		   ctx->cur_window_pos, length, offset);
+
+	LZMS_ASSERT(length <= ctx->window_size - ctx->cur_window_pos);
+	LZMS_ASSERT(offset <= ctx->cur_window_pos);
 	LZMS_ASSERT(!memcmp(&ctx->window[ctx->cur_window_pos],
 			    &ctx->window[ctx->cur_window_pos - offset],
 			    length));
 
 	lzms_begin_encode_item(ctx);
 
-	LZMS_DEBUG("Position %u: Encoding LZ match {length=%u, offset=%u}",
-		   ctx->cur_window_pos, length, offset);
-
 	/* Main bit: 1 = a match, not a literal.  */
 	lzms_range_encode_bit(&ctx->main_range_encoder, 1);
 
@@ -549,7 +554,7 @@ lzms_encode_lz_match(struct lzms_compressor *ctx, u32 length, u32 offset)
 	if (recent_offset_idx == LZMS_NUM_RECENT_OFFSETS) {
 		/* Explicit offset.  */
 
-		/* LZ match bit: 0 = explicit offset, not a repeat offset.  */
+		/* LZ match bit: 0 = explicit offset, not a recent offset.  */
 		lzms_range_encode_bit(&ctx->lz_match_range_encoder, 0);
 
 		/* Encode the match offset.  */
@@ -557,9 +562,9 @@ lzms_encode_lz_match(struct lzms_compressor *ctx, u32 length, u32 offset)
 	} else {
 		int i;
 
-		/* Repeat offset.  */
+		/* Recent offset.  */
 
-		/* LZ match bit: 1 = repeat offset, not an explicit offset.  */
+		/* LZ match bit: 1 = recent offset, not an explicit offset.  */
 		lzms_range_encode_bit(&ctx->lz_match_range_encoder, 1);
 
 		/* Encode the recent offset index.  A 1 bit is encoded for each
@@ -587,6 +592,7 @@ lzms_encode_lz_match(struct lzms_compressor *ctx, u32 length, u32 offset)
 	lzms_end_encode_item(ctx, length);
 }
 
+#if 0
 static void
 lzms_record_literal(u8 literal, void *_ctx)
 {
@@ -626,12 +632,13 @@ lzms_fast_encode(struct lzms_compressor *ctx)
 			 ctx->prev_tab);
 
 }
+#endif
 
 /* Fast heuristic cost evaluation to use in the inner loop of the match-finder.
- * Unlike lzms_get_match_cost(), which does a true cost evaluation, this simply
- * prioritize matches based on their offset.  */
+ * Unlike lzms_get_lz_match_cost(), which does a true cost evaluation, this
+ * simply prioritize matches based on their offset.  */
 static input_idx_t
-lzms_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_lru)
+lzms_lz_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_lru)
 {
 	const struct lzms_lz_lru_queues *lru = _lru;
 
@@ -642,36 +649,124 @@ lzms_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_lru)
 	return offset;
 }
 
+#define LZMS_COST_SHIFT 5
+
+/*#define LZMS_RC_COSTS_USE_FLOATING_POINT*/
+
 static u32
-lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 *cur_state, int bit)
+lzms_rc_costs[LZMS_PROBABILITY_MAX];
+
+#ifdef LZMS_RC_COSTS_USE_FLOATING_POINT
+#  include <math.h>
+#endif
+
+static void
+lzms_do_init_rc_costs(void)
 {
-	u32 prob;
-	u32 cost;
+	/* Fill in a table that maps range coding probabilities needed to code a
+	 * bit X (0 or 1) to the number of bits (scaled by a constant factor, to
+	 * handle fractional costs) needed to code that bit X.
+	 *
+	 * Consider the range of the range decoder.  To eliminate exactly half
+	 * the range (logical probability of 0.5), we need exactly 1 bit.  For
+	 * lower probabilities we need more bits and for higher probabilities we
+	 * need fewer bits.  In general, a logical probability of N will
+	 * eliminate the proportion 1 - N of the range; this information takes
+	 * log2(1 / N) bits to encode.
+	 *
+	 * The below loop is simply calculating this number of bits for each
+	 * possible probability allowed by the LZMS compression format, but
+	 * without using real numbers.  To handle fractional probabilities, each
+	 * cost is multiplied by (1 << LZMS_COST_SHIFT).  These techniques are
+	 * based on those used by LZMA.
+	 *
+	 * Note that in LZMS, a probability x really means x / 64, and 0 / 64 is
+	 * really interpreted as 1 / 64 and 64 / 64 is really interpreted as
+	 * 63 / 64.
+	 */
+	for (u32 i = 0; i < LZMS_PROBABILITY_MAX; i++) {
+		u32 prob = i;
+
+		if (prob == 0)
+			prob = 1;
+		else if (prob == LZMS_PROBABILITY_MAX)
+			prob = LZMS_PROBABILITY_MAX - 1;
+
+	#ifdef LZMS_RC_COSTS_USE_FLOATING_POINT
+		lzms_rc_costs[i] = log2((double)LZMS_PROBABILITY_MAX / prob) *
+					(1 << LZMS_COST_SHIFT);
+	#else
+		u32 w = prob;
+		u32 bit_count = 0;
+		for (u32 j = 0; j < LZMS_COST_SHIFT; j++) {
+			w *= w;
+			bit_count <<= 1;
+			while (w >= (1U << 16)) {
+				w >>= 1;
+				++bit_count;
+			}
+		}
+		lzms_rc_costs[i] = (LZMS_PROBABILITY_BITS << LZMS_COST_SHIFT) -
+				   (15 + bit_count);
+	#endif
+	}
+}
 
-	prob = enc->prob_entries[*cur_state & enc->mask].num_recent_zero_bits;
-	if (prob == 0)
-		prob = 1;
-	else if (prob == LZMS_PROBABILITY_MAX)
-		prob = LZMS_PROBABILITY_MAX - 1;
+static void
+lzms_init_rc_costs(void)
+{
+	static bool done = false;
+	static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+	if (!done) {
+		pthread_mutex_lock(&mutex);
+		if (!done) {
+			lzms_do_init_rc_costs();
+			done = true;
+		}
+		pthread_mutex_unlock(&mutex);
+	}
+}
 
-	if (bit == 0)
-		prob = LZMS_PROBABILITY_MAX - prob;
+/*
+ * Return the cost to range-encode the specified bit when in the specified
+ * state.
+ *
+ * @enc		The range encoder to use.
+ * @cur_state	Current state, which indicates the probability entry to choose.
+ *		Updated by this function.
+ * @bit		The bit to encode (0 or 1).
+ */
+static u32
+lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 *cur_state, int bit)
+{
+	u32 prob_zero;
+	u32 prob_correct;
 
-	cost = prob * 2; /* TODO */
+	prob_zero = enc->prob_entries[*cur_state & enc->mask].num_recent_zero_bits;
 
 	*cur_state = (*cur_state << 1) | bit;
 
-	return cost;
-}
+	if (prob_zero == 0)
+		prob_zero = 1;
+	else if (prob_zero == LZMS_PROBABILITY_MAX)
+		prob_zero = LZMS_PROBABILITY_MAX - 1;
 
-#define LZMS_COST_SCALE 64
+	if (bit == 0)
+		prob_correct = prob_zero;
+	else
+		prob_correct = LZMS_PROBABILITY_MAX - prob_zero;
+
+	return lzms_rc_costs[prob_correct];
+}
 
 static u32
 lzms_huffman_symbol_cost(const struct lzms_huffman_encoder *enc, u32 sym)
 {
-	return enc->lens[sym] * LZMS_COST_SCALE;
+	return enc->lens[sym] << LZMS_COST_SHIFT;
 }
 
+/* Compute the cost to encode a number with lzms_encode_value().  */
 static u32
 lzms_value_cost(const struct lzms_huffman_encoder *enc, u32 value)
 {
@@ -686,14 +781,14 @@ lzms_value_cost(const struct lzms_huffman_encoder *enc, u32 value)
 	num_extra_bits = bsr32(enc->slot_base_tab[slot + 1] -
 			       enc->slot_base_tab[slot]);
 
-	cost += num_extra_bits * LZMS_COST_SCALE;
+	cost += num_extra_bits << LZMS_COST_SHIFT;
 
 	return cost;
 }
 
 static u32
 lzms_get_matches(struct lzms_compressor *ctx,
-		 const struct lzms_adaptive_state *cost_state,
+		 const struct lzms_adaptive_state *state,
 		 struct raw_match **matches_ret)
 {
 	u32 num_matches;
@@ -701,12 +796,18 @@ lzms_get_matches(struct lzms_compressor *ctx,
 
 	num_matches = lz_sarray_get_matches(&ctx->lz_sarray,
 					    matches,
-					    lzms_match_cost_fast,
-					    &cost_state->lru);
+					    lzms_lz_match_cost_fast,
+					    &state->lru);
+#if 0
+	fprintf(stderr, "Pos %u: %u matches\n",
+		lz_sarray_get_pos(&ctx->lz_sarray) - 1, num_matches);
+	for (u32 i = 0; i < num_matches; i++)
+		fprintf(stderr, "\tLen %u Offset %u\n", matches[i].len, matches[i].offset);
+#endif
 
 #ifdef ENABLE_LZMS_DEBUG
+	LZMS_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) > 0);
 	u32 curpos = lz_sarray_get_pos(&ctx->lz_sarray) - 1;
-	LZMS_ASSERT(curpos >= 0);
 	for (u32 i = 0; i < num_matches; i++) {
 		LZMS_ASSERT(matches[i].len <= ctx->window_size - curpos);
 		LZMS_ASSERT(matches[i].offset > 0);
@@ -714,12 +815,11 @@ lzms_get_matches(struct lzms_compressor *ctx,
 		LZMS_ASSERT(!memcmp(&ctx->window[curpos],
 				    &ctx->window[curpos - matches[i].offset],
 				    matches[i].len));
-		if (i > 0)
-			LZMS_ASSERT(matches[i - 1].len > matches[i].len);
+		if (i < num_matches - 1)
+			LZMS_ASSERT(matches[i].len > matches[i + 1].len);
 
 	}
 #endif
-
 	*matches_ret = matches;
 	return num_matches;
 }
@@ -733,68 +833,72 @@ lzms_skip_bytes(struct lzms_compressor *ctx, input_idx_t n)
 
 static u32
 lzms_get_prev_literal_cost(struct lzms_compressor *ctx,
-			   struct lzms_adaptive_state *cost_state)
+			   struct lzms_adaptive_state *state)
 {
 	u8 literal = ctx->window[lz_sarray_get_pos(&ctx->lz_sarray) - 1];
 	u32 cost = 0;
 
-	cost_state->lru.upcoming_offset = 0;
-	lzms_update_lz_lru_queues(&cost_state->lru);
+	state->lru.upcoming_offset = 0;
+	lzms_update_lz_lru_queues(&state->lru);
 
 	cost += lzms_rc_bit_cost(&ctx->main_range_encoder,
-				 &cost_state->main_state, 0);
+				 &state->main_state, 0);
+
 	cost += lzms_huffman_symbol_cost(&ctx->literal_encoder, literal);
 
 	return cost;
 }
 
 static u32
-lzms_get_match_cost(struct lzms_compressor *ctx,
-		    struct lzms_adaptive_state *cost_state,
-		    input_idx_t length, input_idx_t offset)
+lzms_get_lz_match_cost(struct lzms_compressor *ctx,
+		       struct lzms_adaptive_state *state,
+		       input_idx_t length, input_idx_t offset)
 {
 	u32 cost = 0;
 	int recent_offset_idx;
 
 	cost += lzms_rc_bit_cost(&ctx->main_range_encoder,
-				 &cost_state->main_state, 1);
+				 &state->main_state, 1);
 	cost += lzms_rc_bit_cost(&ctx->match_range_encoder,
-				 &cost_state->match_state, 0);
+				 &state->match_state, 0);
 
 	for (recent_offset_idx = 0;
 	     recent_offset_idx < LZMS_NUM_RECENT_OFFSETS;
 	     recent_offset_idx++)
-		if (offset == cost_state->lru.recent_offsets[recent_offset_idx])
+		if (offset == state->lru.recent_offsets[recent_offset_idx])
 			break;
 
 	if (recent_offset_idx == LZMS_NUM_RECENT_OFFSETS) {
 		/* Explicit offset.  */
 		cost += lzms_rc_bit_cost(&ctx->lz_match_range_encoder,
-					 &cost_state->lz_match_state, 0);
+					 &state->lz_match_state, 0);
 
 		cost += lzms_value_cost(&ctx->lz_offset_encoder, offset);
 	} else {
 		int i;
 
-		/* Repeat offset.  */
+		/* Recent offset.  */
 		cost += lzms_rc_bit_cost(&ctx->lz_match_range_encoder,
-					 &cost_state->lz_match_state, 1);
+					 &state->lz_match_state, 1);
 
 		for (i = 0; i < recent_offset_idx; i++)
-			cost++; /* TODO */
+			cost += lzms_rc_bit_cost(&ctx->lz_repeat_match_range_encoders[i],
+						 &state->lz_repeat_match_state[i], 0);
 
 		if (i < LZMS_NUM_RECENT_OFFSETS - 1)
-			cost++; /* TODO */
+			cost += lzms_rc_bit_cost(&ctx->lz_repeat_match_range_encoders[i],
+						 &state->lz_repeat_match_state[i], 1);
+
 
 		/* Initial update of the LZ match offset LRU queue.  */
 		for (; i < LZMS_NUM_RECENT_OFFSETS; i++)
-			cost_state->lru.recent_offsets[i] = cost_state->lru.recent_offsets[i + 1];
+			state->lru.recent_offsets[i] = state->lru.recent_offsets[i + 1];
 	}
 
 	cost += lzms_value_cost(&ctx->length_encoder, length);
 
-	cost_state->lru.upcoming_offset = offset;
-	lzms_update_lz_lru_queues(&cost_state->lru);
+	state->lru.upcoming_offset = offset;
+	lzms_update_lz_lru_queues(&state->lru);
 
 	return cost;
 }
@@ -802,23 +906,46 @@ lzms_get_match_cost(struct lzms_compressor *ctx,
 static struct raw_match
 lzms_get_near_optimal_match(struct lzms_compressor *ctx)
 {
-	struct lzms_adaptive_state initial_state = {
-		.lru = ctx->lru.lz,
-		.main_state = ctx->main_range_encoder.state,
-		.match_state = ctx->match_range_encoder.state,
-		.lz_match_state = ctx->lz_match_range_encoder.state,
-	};
+	struct lzms_adaptive_state initial_state;
+
+	initial_state.lru = ctx->lru.lz;
+	initial_state.main_state = ctx->main_range_encoder.state;
+	initial_state.match_state = ctx->match_range_encoder.state;
+	initial_state.lz_match_state = ctx->lz_match_range_encoder.state;
+	for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++)
+		initial_state.lz_repeat_match_state[i] =
+			ctx->lz_repeat_match_range_encoders[i].state;
 	return lz_get_near_optimal_match(&ctx->mc,
 					 lzms_get_matches,
 					 lzms_skip_bytes,
 					 lzms_get_prev_literal_cost,
-					 lzms_get_match_cost,
+					 lzms_get_lz_match_cost,
 					 ctx,
 					 &initial_state);
 }
 
+/*
+ * The main loop for the LZMS compressor.
+ *
+ * Notes:
+ *
+ * - This uses near-optimal LZ parsing backed by a suffix-array match-finder.
+ *   More details can be found in the corresponding files (lz_optimal.h,
+ *   lz_sarray.{h,c}).
+ *
+ * - This does not output any delta matches.  It would take a specialized
+ *   algorithm to find them, then more code in lz_optimal.h and here to handle
+ *   evaluating and outputting them.
+ *
+ * - The costs of literals and matches are estimated using the range encoder
+ *   states and the semi-adaptive Huffman codes.  Except for range encoding
+ *   states, costs are assumed to be constant throughout a single run of the
+ *   parsing algorithm, which can parse up to LZMS_OPTIM_ARRAY_SIZE bytes of
+ *   data.  This introduces a source of inaccuracy because the probabilities and
+ *   Huffman codes can change over this part of the data.
+ */
 static void
-lzms_slow_encode(struct lzms_compressor *ctx)
+lzms_normal_encode(struct lzms_compressor *ctx)
 {
 	struct raw_match match;
 
@@ -828,17 +955,12 @@ lzms_slow_encode(struct lzms_compressor *ctx)
 	/* Reset the match-chooser.  */
 	lz_match_chooser_begin(&ctx->mc);
 
-	/* TODO */
 	while (ctx->cur_window_pos != ctx->window_size) {
-
 		match = lzms_get_near_optimal_match(ctx);
-		if (match.len <= 1) {
-			/* Literal  */
+		if (match.len <= 1)
 			lzms_encode_literal(ctx, ctx->window[ctx->cur_window_pos]);
-		} else {
-			/* LZ match  */
+		else
 			lzms_encode_lz_match(ctx, match.len, match.offset);
-		}
 	}
 }
 
@@ -864,11 +986,17 @@ lzms_init_huffman_encoder(struct lzms_huffman_encoder *enc,
 {
 	enc->os = os;
 	enc->slot_base_tab = slot_base_tab;
-	enc->num_syms_written = rebuild_freq;
+	enc->num_syms_written = 0;
 	enc->rebuild_freq = rebuild_freq;
 	enc->num_syms = num_syms;
 	for (unsigned i = 0; i < num_syms; i++)
 		enc->sym_freqs[i] = 1;
+
+	make_canonical_huffman_code(enc->num_syms,
+				    LZMS_MAX_CODEWORD_LEN,
+				    enc->sym_freqs,
+				    enc->lens,
+				    enc->codewords);
 }
 
 /* Initialize the LZMS compressor.  */
@@ -891,9 +1019,6 @@ lzms_init_compressor(struct lzms_compressor *ctx, const u8 *udata, u32 ulen,
 	 * (writing backwards).  */
 	lzms_output_bitstream_init(&ctx->os, cdata, clen16);
 
-	/* Initialize position and length slot bases if not done already.  */
-	lzms_init_slot_bases();
-
 	/* Calculate the number of position slots needed for this compressed
 	 * block.  */
 	num_position_slots = lzms_get_position_slot(ulen - 1) + 1;
@@ -979,8 +1104,8 @@ lzms_finalize(struct lzms_compressor *ctx, u8 *cdata, size_t csize_avail)
 
 	/* Now the compressed buffer contains the data output by the forwards
 	 * bitstream, then empty space, then data output by the backwards
-	 * bitstream.  Move the data output by the forwards bitstream to be
-	 * adjacent to the data output by the backwards bitstream, and calculate
+	 * bitstream.  Move the data output by the backwards bitstream to be
+	 * adjacent to the data output by the forward bitstream, and calculate
 	 * the compressed size that this results in.  */
 	num_forwards_bytes = (u8*)ctx->rc.out - (u8*)cdata;
 	num_backwards_bytes = ((u8*)cdata + csize_avail) - (u8*)ctx->os.out;
@@ -991,7 +1116,7 @@ lzms_finalize(struct lzms_compressor *ctx, u8 *cdata, size_t csize_avail)
 	LZMS_DEBUG("num_forwards_bytes=%zu, num_backwards_bytes=%zu, "
 		   "compressed_size=%zu",
 		   num_forwards_bytes, num_backwards_bytes, compressed_size);
-	LZMS_ASSERT(!(compressed_size & 1));
+	LZMS_ASSERT(compressed_size % 2 == 0);
 	return compressed_size;
 }
 
@@ -1015,10 +1140,12 @@ lzms_compress(const void *uncompressed_data, size_t uncompressed_size,
 	}
 
 	/* Don't bother compressing extremely small inputs.  */
-	if (uncompressed_size < 4)
+	if (uncompressed_size < 4) {
+		LZMS_DEBUG("Input too small to bother compressing.");
 		return 0;
+	}
 
-	/* Cap the available compressed size to a 32-bit integer, and round it
+	/* Cap the available compressed size to a 32-bit integer and round it
 	 * down to the nearest multiple of 2.  */
 	if (compressed_size_avail > UINT32_MAX)
 		compressed_size_avail = UINT32_MAX;
@@ -1033,12 +1160,12 @@ lzms_compress(const void *uncompressed_data, size_t uncompressed_size,
 	lzms_x86_filter(ctx->window, ctx->window_size,
 			ctx->last_target_usages, false);
 
-	/* Determine and output a literal/match sequence that decompresses to
-	 * the preprocessed data.  */
-	if (1)
-		lzms_slow_encode(ctx);
-	else
-		lzms_fast_encode(ctx);
+	/* Compute and encode a literal/match sequence that decompresses to the
+	 * preprocessed data.  */
+	lzms_normal_encode(ctx);
+#if 0
+	lzms_fast_encode(ctx);
+#endif
 
 	/* Get and return the compressed data size.  */
 	compressed_size = lzms_finalize(ctx, compressed_data,
@@ -1103,25 +1230,51 @@ lzms_free_compressor(void *_ctx)
 
 	if (ctx) {
 		FREE(ctx->window);
+#if 0
 		FREE(ctx->prev_tab);
+#endif
 		lz_sarray_destroy(&ctx->lz_sarray);
 		lz_match_chooser_destroy(&ctx->mc);
 		FREE(ctx);
 	}
 }
 
+static const struct wimlib_lzms_compressor_params default_params = {
+	.hdr = sizeof(struct wimlib_lzms_compressor_params),
+	.min_match_length = 2,
+	.max_match_length = UINT32_MAX,
+	.nice_match_length = 32,
+	.max_search_depth = 50,
+	.max_matches_per_pos = 3,
+	.optim_array_length = 1024,
+};
+
 static int
 lzms_create_compressor(size_t max_block_size,
-		       const struct wimlib_compressor_params_header *params,
+		       const struct wimlib_compressor_params_header *_params,
 		       void **ctx_ret)
 {
 	struct lzms_compressor *ctx;
+	const struct wimlib_lzms_compressor_params *params;
 
 	if (max_block_size == 0 || max_block_size >= INT32_MAX) {
 		LZMS_DEBUG("Invalid max_block_size (%u)", max_block_size);
 		return WIMLIB_ERR_INVALID_PARAM;
 	}
 
+	if (_params)
+		params = (const struct wimlib_lzms_compressor_params*)_params;
+	else
+		params = &default_params;
+
+	if (params->max_match_length < params->min_match_length ||
+	    params->min_match_length < 2 ||
+	    params->optim_array_length == 0 ||
+	    min(params->max_match_length, params->nice_match_length) > 65536) {
+		LZMS_DEBUG("Invalid compression parameter!");
+		return WIMLIB_ERR_INVALID_PARAM;
+	}
+
 	ctx = CALLOC(1, sizeof(struct lzms_compressor));
 	if (ctx == NULL)
 		goto oom;
@@ -1130,24 +1283,31 @@ lzms_create_compressor(size_t max_block_size,
 	if (ctx->window == NULL)
 		goto oom;
 
+#if 0
 	ctx->prev_tab = MALLOC(max_block_size * sizeof(ctx->prev_tab[0]));
 	if (ctx->prev_tab == NULL)
 		goto oom;
+#endif
 
-	if (!lz_sarray_init(&ctx->lz_sarray,
-			    max_block_size,
-			    2,
-			    max_block_size,
-			    100,
-			    10))
+	if (!lz_sarray_init(&ctx->lz_sarray, max_block_size,
+			    params->min_match_length,
+			    params->max_match_length,
+			    params->max_search_depth,
+			    params->max_matches_per_pos))
 		goto oom;
 
 	if (!lz_match_chooser_init(&ctx->mc,
-				   LZMS_OPTIM_ARRAY_SIZE,
-				   32,
-				   max_block_size))
+				   params->optim_array_length,
+				   params->nice_match_length,
+				   params->max_match_length))
 		goto oom;
 
+	/* Initialize position and length slot bases if not done already.  */
+	lzms_init_slot_bases();
+
+	/* Initialize range encoding cost table if not done already.  */
+	lzms_init_rc_costs();
+
 	ctx->max_block_size = max_block_size;
 
 	*ctx_ret = ctx;
diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c
index f2fcb60e..83629008 100644
--- a/src/lzms-decompress.c
+++ b/src/lzms-decompress.c
@@ -795,6 +795,7 @@ lzms_decode_delta_match(struct lzms_decompressor *ctx)
 	return lzms_copy_delta_match(ctx, length, power, raw_offset);
 }
 
+/* Decode a LZ or delta match.  */
 static int
 lzms_decode_match(struct lzms_decompressor *ctx)
 {
@@ -831,7 +832,6 @@ lzms_decode_item(struct lzms_decompressor *ctx)
 	if (ret)
 		return ret;
 
-	/* Update LRU queues  */
         lzms_update_lru_queues(&ctx->lru);
 	return 0;
 }
@@ -886,9 +886,6 @@ lzms_init_decompressor(struct lzms_decompressor *ctx,
 	 * backwards)  */
 	lzms_input_bitstream_init(&ctx->is, cdata, clen / 2);
 
-	/* Initialize position and length slot bases if not done already.  */
-	lzms_init_slot_bases();
-
 	/* Calculate the number of position slots needed for this compressed
 	 * block.  */
 	num_position_slots = lzms_get_position_slot(ulen - 1) + 1;
@@ -1034,6 +1031,9 @@ lzms_create_decompressor(size_t max_block_size,
 	if (ctx == NULL)
 		return WIMLIB_ERR_NOMEM;
 
+	/* Initialize position and length slot bases if not done already.  */
+	lzms_init_slot_bases();
+
 	*ctx_ret = ctx;
 	return 0;
 }