X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flzx_compress.c;h=8dd12b385e6379c71ae527c228e95f94fa94ce8c;hb=refs%2Fheads%2Flzx_lcpit;hp=520c07e33572081b7756a9c3622733522569817a;hpb=ab64cfca6f15354ccef5a86441cd9c35898a220a;p=wimlib

diff --git a/src/lzx_compress.c b/src/lzx_compress.c
index 520c07e3..8dd12b38 100644
--- a/src/lzx_compress.c
+++ b/src/lzx_compress.c
@@ -5,7 +5,7 @@
  */
 
 /*
- * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers
+ * Copyright (C) 2012-2016 Eric Biggers
  *
  * This file is free software; you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -65,33 +65,41 @@
 #endif
 
 /*
- * Start a new LZX block (with new Huffman codes) after this many bytes.
+ * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
+ * unless the last block has to be shorter.
+ */
+#define MIN_BLOCK_LENGTH	6500
+
+/*
+ * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
+ * the final length might be larger due to matches extending beyond the end of the
+ * block.  Specifically:
  *
- * Note: actual block sizes may slightly exceed this value.
+ *  - The greedy parser may choose an arbitrarily long match starting at the
+ *    SOFT_MAX_BLOCK_LENGTH'th byte.
  *
- * TODO: recursive splitting and cost evaluation might be good for an extremely
- * high compression mode, but otherwise it is almost always far too slow for how
- * much it helps.  Perhaps some sort of heuristic would be useful?
+ *  - The lazy parser may choose a sequence of literals starting at the
+ *    SOFT_MAX_BLOCK_LENGTH'th byte when it sees a sequence of increasingly good
+ *    matches.  The final match may be of arbitrary length.  The length of the
+ *    literal sequence is approximately limited by the "nice match length"
+ *    parameter.
  */
-#define LZX_DIV_BLOCK_SIZE	32768
+#define SOFT_MAX_BLOCK_LENGTH	100000
 
 /*
- * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the
- * match cache for each byte position.  This value should be high enough so that
- * nearly the time, all matches found in a given block can fit in the match
- * cache.  However, fallback behavior (immediately terminating the block) on
- * cache overflow is still required.
+ * The number of observed matches or literals that represents sufficient data to
+ * decide whether the current block should be terminated or not.
  */
-#define LZX_CACHE_PER_POS	7
+#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 500
 
 /*
  * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache,
- * excluding the extra "overflow" entries.  The per-position multiplier is '1 +
- * LZX_CACHE_PER_POS' instead of 'LZX_CACHE_PER_POS' because there is an
- * overhead of one lz_match per position, used to hold the match count at that
- * position.
+ * excluding the extra "overflow" entries.  This value should be high enough so
+ * that nearly all the time, all matches found in a given block can fit in the
+ * match cache.  However, fallback behavior (immediately terminating the block)
+ * on cache overflow is still required.
  */
-#define LZX_CACHE_LENGTH	(LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS))
+#define LZX_CACHE_LENGTH	(SOFT_MAX_BLOCK_LENGTH * 5)
 
 /*
  * LZX_MAX_MATCHES_PER_POS is an upper bound on the number of matches that can
@@ -111,7 +119,7 @@
  * unknown.  In reality, each token in LZX requires a whole number of bits to
  * output.
  */
-#define LZX_BIT_COST		16
+#define LZX_BIT_COST		64
 
 /*
 * Should the compressor take into account the costs of aligned offset symbols?
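The change of LZX_BIT_COST from 16 to 64 refines the granularity of this fixed-point cost scale: one cost unit becomes 1/64 of a bit, so fractional-bit estimates (such as the default literal cost of 560 units, i.e. 8.75 bits, set later in this patch) can be expressed even though every real codeword occupies a whole number of bits. The short standalone sketch below is not part of the patch and uses a hypothetical helper; it only illustrates the scale:

/*
 * Standalone sketch, not code from this patch: with LZX_BIT_COST = 64, one
 * cost unit is 1/64 of a bit, so the cost model can carry fractional-bit
 * estimates even though every real codeword is a whole number of bits.
 */
#include <stdio.h>

#define LZX_BIT_COST 64		/* cost units per bit */

/* Hypothetical helper: scaled cost of a 'len'-bit codeword plus 'extra_bits'
 * verbatim offset bits (each verbatim bit costs exactly one bit). */
static unsigned
scaled_cost(unsigned len, unsigned extra_bits)
{
	return (len + extra_bits) * LZX_BIT_COST;
}

int
main(void)
{
	/* A 9-bit main codeword plus 12 verbatim offset bits: 21.00 bits. */
	printf("%u units = %.2f bits\n", scaled_cost(9, 12),
	       scaled_cost(9, 12) / (double)LZX_BIT_COST);

	/* The default literal cost of 560 units set later in this patch
	 * corresponds to 560 / 64 = 8.75 bits. */
	printf("560 units = %.2f bits\n", 560.0 / LZX_BIT_COST);
	return 0;
}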
@@ -136,9 +144,9 @@ * These are the compressor-side limits on the codeword lengths for each Huffman * code. To make outputting bits slightly faster, some of these limits are * lower than the limits defined by the LZX format. This does not significantly - * affect the compression ratio, at least for the block sizes we use. + * affect the compression ratio, at least for the block lengths we use. */ -#define MAIN_CODEWORD_LIMIT 12 /* 64-bit: can buffer 4 main symbols */ +#define MAIN_CODEWORD_LIMIT 16 #define LENGTH_CODEWORD_LIMIT 12 #define ALIGNED_CODEWORD_LIMIT 7 #define PRE_CODEWORD_LIMIT 7 @@ -154,7 +162,7 @@ /* Matchfinders with 16-bit positions */ #define mf_pos_t u16 #define MF_SUFFIX _16 -#include "wimlib/bt_matchfinder.h" +#include "wimlib/lcpit_matchfinder.h" #include "wimlib/hc_matchfinder.h" /* Matchfinders with 32-bit positions */ @@ -162,7 +170,7 @@ #undef MF_SUFFIX #define mf_pos_t u32 #define MF_SUFFIX _32 -#include "wimlib/bt_matchfinder.h" +#include "wimlib/lcpit_matchfinder.h" #include "wimlib/hc_matchfinder.h" struct lzx_output_bitstream; @@ -214,6 +222,17 @@ struct lzx_freqs { u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; }; +/* Block split statistics. See "Block splitting algorithm" below. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) +struct block_split_stats { + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; +}; + /* * Represents a run of literals followed by a match or end-of-block. This * struct is needed to temporarily store items chosen by the parser, since items @@ -274,6 +293,9 @@ struct lzx_optimum_node { u32 item; #define OPTIMUM_OFFSET_SHIFT 9 #define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1) +#define OPTIMUM_EXTRA_FLAG 0x80000000 + u32 extra_match; + u32 extra_literal; } _aligned_attribute(8); /* @@ -394,6 +416,9 @@ struct lzx_compressor { /* The Huffman symbol frequency counters for the current block. */ struct lzx_freqs freqs; + /* Block split statistics. */ + struct block_split_stats split_stats; + /* The Huffman codes for the current and previous blocks. The one with * index 'codes_index' is for the current block, and the other one is * for the previous block. */ @@ -405,7 +430,7 @@ struct lzx_compressor { * number of matches that can ever be chosen for a single block, plus * one for the special entry at the end. */ struct lzx_sequence chosen_sequences[ - DIV_ROUND_UP(LZX_DIV_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1]; + DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, LZX_MIN_MATCH_LEN) + 1]; /* Tables for mapping adjusted offsets to offset slots */ @@ -428,24 +453,18 @@ struct lzx_compressor { /* Data for near-optimal parsing */ struct { /* - * The graph nodes for the current block. + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. * - * We need at least 'LZX_DIV_BLOCK_SIZE + - * LZX_MAX_MATCH_LEN - 1' nodes because that is the - * maximum block size that may be used. Add 1 because - * we need a node to represent end-of-block. - * - * It is possible that nodes past end-of-block are - * accessed during match consideration, but this can - * only occur if the block was truncated at - * LZX_DIV_BLOCK_SIZE. So the same bound still applies. 
- * Note that since nodes past the end of the block will - * never actually have an effect on the items that are - * chosen for the block, it makes no difference what - * their costs are initialized to (if anything). + * This array must be large enough to accommodate the + * worst-case number of nodes, which occurs if we find a + * match of length LZX_MAX_MATCH_LEN at position + * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of length + * SOFT_MAX_BLOCK_LENGTH - 1 + LZX_MAX_MATCH_LEN. Add one + * for the end-of-block node. */ - struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE + - LZX_MAX_MATCH_LEN - 1 + 1]; + struct lzx_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 + + LZX_MAX_MATCH_LEN + 1]; /* The cost model for the current block */ struct lzx_costs costs; @@ -477,11 +496,7 @@ struct lzx_compressor { LZX_MAX_MATCHES_PER_POS + LZX_MAX_MATCH_LEN - 1]; - /* Binary trees matchfinder (MUST BE LAST!!!) */ - union { - struct bt_matchfinder_16 bt_mf_16; - struct bt_matchfinder_32 bt_mf_32; - }; + struct lcpit_matchfinder lcpit_mf; }; }; }; @@ -510,10 +525,6 @@ lzx_is_16_bit(size_t max_bufsize) ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \ CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__)); -#define CALL_BT_MF(is_16_bit, c, funcname, ...) \ - ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \ - CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__)); - /* * Structure to keep track of the current state of sending bits to the * compressed output buffer. @@ -586,13 +597,13 @@ lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits) if (os->end - os->next < 6) return; - put_unaligned_u16_le(os->bitbuf >> ((os->bitcount - 16) & + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) & shift_mask), os->next + 0); if (max_num_bits > 16) - put_unaligned_u16_le(os->bitbuf >> ((os->bitcount - 32) & + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) & shift_mask), os->next + 2); if (max_num_bits > 32) - put_unaligned_u16_le(os->bitbuf >> ((os->bitcount - 48) & + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) & shift_mask), os->next + 4); os->next += (os->bitcount >> 4) << 1; os->bitcount &= 15; @@ -617,17 +628,19 @@ lzx_flush_output(struct lzx_output_bitstream *os) return 0; if (os->bitcount != 0) { - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next); + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next); os->next += 2; } return os->next - os->start; } -/* Build the main, length, and aligned offset Huffman codes used in LZX. +/* + * Build the main, length, and aligned offset Huffman codes used in LZX. * * This takes as input the frequency tables for each code and produces as output - * a set of tables that map symbols to codewords and codeword lengths. */ + * a set of tables that map symbols to codewords and codeword lengths. + */ static void lzx_make_huffman_codes(struct lzx_compressor *c) { @@ -893,27 +906,24 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* Verify optimization is enabled on 64-bit */ STATIC_ASSERT(sizeof(machine_word_t) < 8 || - CAN_BUFFER(4 * MAIN_CODEWORD_LIMIT)); + CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)); - if (CAN_BUFFER(4 * MAIN_CODEWORD_LIMIT)) { + if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) { - /* 64-bit: write 4 literals at a time. */ - while (litrunlen >= 4) { + /* 64-bit: write 3 literals at a time. 
*/ + while (litrunlen >= 3) { unsigned lit0 = block_data[0]; unsigned lit1 = block_data[1]; unsigned lit2 = block_data[2]; - unsigned lit3 = block_data[3]; lzx_add_bits(os, codes->codewords.main[lit0], codes->lens.main[lit0]); lzx_add_bits(os, codes->codewords.main[lit1], codes->lens.main[lit1]); lzx_add_bits(os, codes->codewords.main[lit2], codes->lens.main[lit2]); - lzx_add_bits(os, codes->codewords.main[lit3], - codes->lens.main[lit3]); - lzx_flush_bits(os, 4 * MAIN_CODEWORD_LIMIT); - block_data += 4; - litrunlen -= 4; + lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT); + block_data += 3; + litrunlen -= 3; } if (litrunlen--) { unsigned lit = *block_data++; @@ -923,14 +933,7 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, unsigned lit = *block_data++; lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); - if (litrunlen--) { - unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], - codes->lens.main[lit]); - lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT); - } else { - lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT); - } + lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT); } else { lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT); } @@ -1026,7 +1029,7 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, static void lzx_write_compressed_block(const u8 *block_begin, int block_type, - u32 block_size, + u32 block_length, unsigned window_order, unsigned num_main_syms, const struct lzx_sequence sequences[], @@ -1038,30 +1041,33 @@ lzx_write_compressed_block(const u8 *block_begin, * LZX_BLOCKTYPE_* constants. */ lzx_write_bits(os, block_type, 3); - /* Output the block size. + /* + * Output the block length. * - * The original LZX format seemed to always encode the block size in 3 + * The original LZX format seemed to always encode the block length in 3 * bytes. However, the implementation in WIMGAPI, as used in WIM files, - * uses the first bit to indicate whether the block is the default size - * (32768) or a different size given explicitly by the next 16 bits. + * uses the first bit to indicate whether the block is the default + * length (32768) or a different length given explicitly by the next 16 + * bits. * * By default, this compressor uses a window size of 32768 and therefore * follows the WIMGAPI behavior. However, this compressor also supports * window sizes greater than 32768 bytes, which do not appear to be * supported by WIMGAPI. In such cases, we retain the default size bit - * to mean a size of 32768 bytes but output non-default block size in 24 - * bits rather than 16. The compatibility of this behavior is unknown - * because WIMs created with chunk size greater than 32768 can seemingly - * only be opened by wimlib anyway. */ - if (block_size == LZX_DEFAULT_BLOCK_SIZE) { + * to mean a size of 32768 bytes but output non-default block length in + * 24 bits rather than 16. The compatibility of this behavior is + * unknown because WIMs created with chunk size greater than 32768 can + * seemingly only be opened by wimlib anyway. + */ + if (block_length == LZX_DEFAULT_BLOCK_SIZE) { lzx_write_bits(os, 1, 1); } else { lzx_write_bits(os, 0, 1); if (window_order >= 16) - lzx_write_bits(os, block_size >> 16, 8); + lzx_write_bits(os, block_length >> 16, 8); - lzx_write_bits(os, block_size & 0xFFFF, 16); + lzx_write_bits(os, block_length & 0xFFFF, 16); } /* If it's an aligned offset block, output the aligned offset code. 
*/
@@ -1130,16 +1136,16 @@ lzx_comp_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
 }
 
 /*
- * Finish an LZX block:
+ * Flush an LZX block:
  *
- *  - build the Huffman codes
- *  - decide whether to output the block as VERBATIM or ALIGNED
- *  - output the block
- *  - swap the indices of the current and previous Huffman codes
+ * 1. Build the Huffman codes.
+ * 2. Decide whether to output the block as VERBATIM or ALIGNED.
+ * 3. Write the block.
+ * 4. Swap the indices of the current and previous Huffman codes.
  */
 static void
-lzx_finish_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
-		 const u8 *block_begin, u32 block_size, u32 seq_idx)
+lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
+		const u8 *block_begin, u32 block_length, u32 seq_idx)
 {
 	int block_type;
 
@@ -1149,7 +1155,7 @@ lzx_finish_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
 			       &c->codes[c->codes_index]);
 	lzx_write_compressed_block(block_begin,
 				   block_type,
-				   block_size,
+				   block_length,
 				   c->window_order,
 				   c->num_main_syms,
 				   &c->chosen_sequences[seq_idx],
@@ -1237,6 +1243,121 @@ lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
 	last_seq->adjusted_offset_and_match_hdr = 0x80000000;
 }
 
+/******************************************************************************/
+
+/*
+ * Block splitting algorithm.  The problem is to decide when it is worthwhile to
+ * start a new block with new entropy codes.  There is a theoretically optimal
+ * solution: recursively consider every possible block split, considering the
+ * exact cost of each block, and choose the minimum cost approach.  But this is
+ * far too slow.  Instead, as an approximation, we can count symbols and after
+ * every N symbols, compare the expected distribution of symbols based on the
+ * previous data with the actual distribution.  If they differ "by enough", then
+ * start a new block.
+ *
+ * As an optimization and heuristic, we don't distinguish between every symbol
+ * but rather we combine many symbols into a single "observation type".  For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not.  The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols.  For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simple heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough.  We also add a number
+ * proportional to the block length so that the algorithm is more likely to end
+ * long blocks than short blocks.  This reflects the general expectation that it
+ * will become increasingly beneficial to start a new block as the current
+ * block grows larger.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered.  With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized. 
Since the final choices + * cannot be used, we can use preliminary "greedy" choices instead. + */ + +/* Initialize the block split statistics when starting a new block. */ +static void +init_block_split_stats(struct block_split_stats *stats) +{ + for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->new_observations[i] = 0; + stats->observations[i] = 0; + } + stats->num_new_observations = 0; + stats->num_observations = 0; +} + +/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the + * literal, for 8 possible literal observation types. */ +static inline void +observe_literal(struct block_split_stats *stats, u8 lit) +{ + stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; + stats->num_new_observations++; +} + +/* Match observation. Heuristic: use one observation type for "short match" and + * one observation type for "long match". */ +static inline void +observe_match(struct block_split_stats *stats, unsigned length) +{ + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++; + stats->num_new_observations++; +} + +static bool +do_end_block_check(struct block_split_stats *stats, u32 block_length) +{ + if (stats->num_observations > 0) { + + /* Note: to avoid slow divisions, we do not divide by + * 'num_observations', but rather do all math with the numbers + * multiplied by 'num_observations'. */ + u32 total_delta = 0; + for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * stats->num_new_observations; + u32 actual = stats->new_observations[i] * stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + total_delta += delta; + } + + /* Ready to end the block? */ + if (total_delta + (block_length / 1024) * stats->num_observations >= + stats->num_new_observations * 51 / 64 * stats->num_observations) + return true; + } + + for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->num_observations += stats->new_observations[i]; + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_new_observations = 0; + return false; +} + +static inline bool +should_end_block(struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) +{ + /* Ready to check block split statistics? */ + if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || + in_next - in_block_begin < MIN_BLOCK_LENGTH || + in_end - in_next < MIN_BLOCK_LENGTH) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); +} + +/******************************************************************************/ + /* * Given the minimum-cost path computed through the item graph for the current * block, walk the path and count how many of each symbol in each Huffman-coded @@ -1248,10 +1369,12 @@ lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen) * computes frequencies. */ static inline void -lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) +lzx_tally_item_list(struct lzx_compressor *c, u32 block_length, bool is_16_bit) { - u32 node_idx = block_size; + u32 node_idx = block_length; + for (;;) { + u32 item; u32 len; u32 offset_data; unsigned v; @@ -1260,21 +1383,37 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) /* Tally literals until either a match or the beginning of the * block is reached. 
*/ for (;;) { - u32 item = c->optimum_nodes[node_idx].item; + item = c->optimum_nodes[node_idx].item; + if (item & OPTIMUM_LEN_MASK) + break; + c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++; + node_idx--; + } - len = item & OPTIMUM_LEN_MASK; - offset_data = item >> OPTIMUM_OFFSET_SHIFT; + if (item & OPTIMUM_EXTRA_FLAG) { - if (len != 0) /* Not a literal? */ + if (node_idx == 0) break; - /* Tally the main symbol for the literal. */ - c->freqs.main[offset_data]++; + /* Tally a rep0 match. */ + len = item & OPTIMUM_LEN_MASK; + v = len - LZX_MIN_MATCH_LEN; + if (v >= LZX_NUM_PRIMARY_LENS) { + c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++; + v = LZX_NUM_PRIMARY_LENS; + } + c->freqs.main[LZX_NUM_CHARS + v]++; + + /* Tally a literal. */ + c->freqs.main[c->optimum_nodes[node_idx].extra_literal]++; - if (--node_idx == 0) /* Beginning of block was reached? */ - return; + item = c->optimum_nodes[node_idx].extra_match; + node_idx -= len + 1; } + len = item & OPTIMUM_LEN_MASK; + offset_data = item >> OPTIMUM_OFFSET_SHIFT; + node_idx -= len; /* Tally a match. */ @@ -1294,9 +1433,6 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); v += offset_slot * LZX_NUM_LEN_HEADERS; c->freqs.main[LZX_NUM_CHARS + v]++; - - if (node_idx == 0) /* Beginning of block was reached? */ - return; } } @@ -1310,9 +1446,9 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) * which the lzx_sequences begin. */ static inline u32 -lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) +lzx_record_item_list(struct lzx_compressor *c, u32 block_length, bool is_16_bit) { - u32 node_idx = block_size; + u32 node_idx = block_length; u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1; u32 lit_start_node; @@ -1321,29 +1457,54 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) lit_start_node = node_idx; for (;;) { + u32 item; u32 len; u32 offset_data; unsigned v; unsigned offset_slot; - /* Record literals until either a match or the beginning of the + /* Tally literals until either a match or the beginning of the * block is reached. */ for (;;) { - u32 item = c->optimum_nodes[node_idx].item; + item = c->optimum_nodes[node_idx].item; + if (item & OPTIMUM_LEN_MASK) + break; + c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++; + node_idx--; + } - len = item & OPTIMUM_LEN_MASK; - offset_data = item >> OPTIMUM_OFFSET_SHIFT; + if (item & OPTIMUM_EXTRA_FLAG) { - if (len != 0) /* Not a literal? */ + if (node_idx == 0) break; - /* Tally the main symbol for the literal. */ - c->freqs.main[offset_data]++; + /* Save the literal run length for the next sequence + * (the "previous sequence" when walking backwards). */ + len = item & OPTIMUM_LEN_MASK; + c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx; + seq_idx--; + lit_start_node = node_idx - len; + + /* Tally a rep0 match. */ + v = len - LZX_MIN_MATCH_LEN; + c->chosen_sequences[seq_idx].adjusted_length = v; + if (v >= LZX_NUM_PRIMARY_LENS) { + c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++; + v = LZX_NUM_PRIMARY_LENS; + } + c->freqs.main[LZX_NUM_CHARS + v]++; + c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v; - if (--node_idx == 0) /* Beginning of block was reached? */ - goto out; + /* Tally a literal. 
*/ + c->freqs.main[c->optimum_nodes[node_idx].extra_literal]++; + + item = c->optimum_nodes[node_idx].extra_match; + node_idx -= len + 1; } + len = item & OPTIMUM_LEN_MASK; + offset_data = item >> OPTIMUM_OFFSET_SHIFT; + /* Save the literal run length for the next sequence (the * "previous sequence" when walking backwards). */ c->chosen_sequences[seq_idx--].litrunlen = lit_start_node - node_idx; @@ -1374,12 +1535,8 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) /* Save the adjusted offset and match header. */ c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = (offset_data << 9) | v; - - if (node_idx == 0) /* Beginning of block was reached? */ - goto out; } -out: /* Save the literal run length for the first sequence. */ c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx; @@ -1391,10 +1548,11 @@ out: /* * Find an inexpensive path through the graph of possible match/literal choices * for the current block. The nodes of the graph are - * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in + * c->optimum_nodes[0...block_length]. They correspond directly to the bytes in * the current block, plus one extra node for end-of-block. The edges of the * graph are matches and literals. The goal is to find the minimum cost path - * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]'. + * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_length]', given the cost + * model 'c->costs'. * * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and * proceeding forwards one node at a time. At each node, a selection of matches @@ -1422,15 +1580,14 @@ out: static inline struct lzx_lru_queue lzx_find_min_cost_path(struct lzx_compressor * const restrict c, const u8 * const restrict block_begin, - const u32 block_size, + const u32 block_length, const struct lzx_lru_queue initial_queue, bool is_16_bit) { struct lzx_optimum_node *cur_node = c->optimum_nodes; - struct lzx_optimum_node * const end_node = &c->optimum_nodes[block_size]; struct lz_match *cache_ptr = c->match_cache; const u8 *in_next = block_begin; - const u8 * const block_end = block_begin + block_size; + const u8 * const block_end = block_begin + block_length; /* Instead of storing the match offset LRU queues in the * 'lzx_optimum_node' structures, we save memory (and cache lines) by @@ -1445,15 +1602,16 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, /* Initially, the cost to reach each node is "infinity". */ memset(c->optimum_nodes, 0xFF, - (block_size + 1) * sizeof(c->optimum_nodes[0])); + (block_length + 1) * sizeof(c->optimum_nodes[0])); QUEUE(block_begin) = initial_queue; - /* The following loop runs 'block_size' iterations, one per node. */ + /* The following loop runs 'block_length' iterations, one per node. 
*/ do { unsigned num_matches; unsigned literal; u32 cost; + struct lz_match *matches; /* * A selection of matches for the block was already saved in @@ -1481,9 +1639,9 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, num_matches = cache_ptr->length; cache_ptr++; + matches = cache_ptr; if (num_matches) { - struct lz_match *end_matches = cache_ptr + num_matches; unsigned next_len = LZX_MIN_MATCH_LEN; unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN); const u8 *matchptr; @@ -1502,10 +1660,8 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, (cur_node + next_len)->item = (0 << OPTIMUM_OFFSET_SHIFT) | next_len; } - if (unlikely(++next_len > max_len)) { - cache_ptr = end_matches; + if (unlikely(++next_len > max_len)) goto done_matches; - } } while (in_next[next_len - 1] == matchptr[next_len - 1]); R0_done: @@ -1528,10 +1684,8 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, (cur_node + next_len)->item = (1 << OPTIMUM_OFFSET_SHIFT) | next_len; } - if (unlikely(++next_len > max_len)) { - cache_ptr = end_matches; + if (unlikely(++next_len > max_len)) goto done_matches; - } } while (in_next[next_len - 1] == matchptr[next_len - 1]); R1_done: @@ -1554,35 +1708,36 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, (cur_node + next_len)->item = (2 << OPTIMUM_OFFSET_SHIFT) | next_len; } - if (unlikely(++next_len > max_len)) { - cache_ptr = end_matches; + if (unlikely(++next_len > max_len)) goto done_matches; - } } while (in_next[next_len - 1] == matchptr[next_len - 1]); R2_done: - - while (next_len > cache_ptr->length) - if (++cache_ptr == end_matches) + matches = cache_ptr; + cache_ptr += num_matches - 1; + while (next_len > cache_ptr->length) { + if (cache_ptr == matches) goto done_matches; + cache_ptr--; + } /* Consider explicit offset matches */ - do { + for (;;) { u32 offset = cache_ptr->offset; u32 offset_data = offset + LZX_OFFSET_ADJUSTMENT; unsigned offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); u32 base_cost = cur_node->cost; + u32 cost; #if LZX_CONSIDER_ALIGNED_COSTS if (offset_data >= 16) base_cost += c->costs.aligned[offset_data & LZX_ALIGNED_OFFSET_BITMASK]; #endif - do { - u32 cost = base_cost + - c->costs.match_cost[offset_slot][ + cost = base_cost + + c->costs.match_cost[offset_slot][ next_len - LZX_MIN_MATCH_LEN]; if (cost < (cur_node + next_len)->cost) { (cur_node + next_len)->cost = cost; @@ -1590,10 +1745,38 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, (offset_data << OPTIMUM_OFFSET_SHIFT) | next_len; } } while (++next_len <= cache_ptr->length); - } while (++cache_ptr != end_matches); + + if (cache_ptr == matches) { + /* Consider match + lit + rep0 */ + u32 remaining = block_end - (in_next + next_len); + if (likely(remaining >= 2)) { + const u8 *strptr = in_next + next_len; + const u8 *matchptr = strptr - offset; + if (unlikely(load_u16_unaligned(strptr) == load_u16_unaligned(matchptr))) { + u32 rep0_len = lz_extend(strptr, matchptr, 2, + min(remaining, LZX_MAX_MATCH_LEN)); + u8 lit = strptr[-1]; + cost += c->costs.main[lit] + + c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN]; + u32 total_len = next_len + rep0_len; + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = + OPTIMUM_EXTRA_FLAG | rep0_len; + (cur_node + total_len)->extra_literal = lit; + (cur_node + total_len)->extra_match = + (offset_data << OPTIMUM_OFFSET_SHIFT) | (next_len - 1); + } + } + } + break; + } + cache_ptr--; + } } 
done_matches: + cache_ptr = matches + num_matches; /* Consider coding a literal. @@ -1618,12 +1801,21 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, } else { /* Match: queue update is needed. */ unsigned len = cur_node->item & OPTIMUM_LEN_MASK; - u32 offset_data = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + u32 offset_data = (cur_node->item & + ~OPTIMUM_EXTRA_FLAG) >> OPTIMUM_OFFSET_SHIFT; if (offset_data >= LZX_NUM_RECENT_OFFSETS) { /* Explicit offset match: insert offset at front */ QUEUE(in_next) = lzx_lru_queue_push(QUEUE(in_next - len), offset_data - LZX_OFFSET_ADJUSTMENT); + } else if (cur_node->item & OPTIMUM_EXTRA_FLAG) { + /* Explicit offset match, then literal, then + * rep0 match: insert offset at front */ + len += 1 + (cur_node->extra_match & OPTIMUM_LEN_MASK); + QUEUE(in_next) = + lzx_lru_queue_push(QUEUE(in_next - len), + (cur_node->extra_match >> OPTIMUM_OFFSET_SHIFT) - + LZX_OFFSET_ADJUSTMENT); } else { /* Repeat offset match: swap offset to front */ QUEUE(in_next) = @@ -1631,7 +1823,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, offset_data); } } - } while (cur_node != end_node); + } while (in_next != block_end); /* Return the match offset queue at the end of the minimum cost path. */ return QUEUE(block_end); @@ -1672,14 +1864,14 @@ lzx_compute_match_costs(struct lzx_compressor *c) /* Set default LZX Huffman symbol costs to bootstrap the iterative optimization * algorithm. */ static void -lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) +lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_length) { u32 i; bool have_byte[256]; unsigned num_used_bytes; - /* The costs below are hard coded to use a scaling factor of 16. */ - STATIC_ASSERT(LZX_BIT_COST == 16); + /* The costs below are hard coded to use a scaling factor of 64. */ + STATIC_ASSERT(LZX_BIT_COST == 64); /* * Heuristics: @@ -1696,7 +1888,7 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) for (i = 0; i < 256; i++) have_byte[i] = false; - for (i = 0; i < block_size; i++) + for (i = 0; i < block_length; i++) have_byte[block[i]] = true; num_used_bytes = 0; @@ -1704,13 +1896,13 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) num_used_bytes += have_byte[i]; for (i = 0; i < 256; i++) - c->costs.main[i] = 140 - (256 - num_used_bytes) / 4; + c->costs.main[i] = 560 - (256 - num_used_bytes); for (; i < c->num_main_syms; i++) - c->costs.main[i] = 170; + c->costs.main[i] = 680; for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) - c->costs.len[i] = 103 + (i / 4); + c->costs.len[i] = 412 + i; #if LZX_CONSIDER_ALIGNED_COSTS for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) @@ -1722,7 +1914,7 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) /* Update the current cost model to reflect the computed Huffman codes. */ static void -lzx_update_costs(struct lzx_compressor *c) +lzx_set_costs_from_codes(struct lzx_compressor *c) { unsigned i; const struct lzx_lens *lens = &c->codes[c->codes_index].lens; @@ -1747,11 +1939,20 @@ lzx_update_costs(struct lzx_compressor *c) lzx_compute_match_costs(c); } +/* + * Choose a "near-optimal" literal/match sequence to use for the current block. + * Because the cost of each Huffman symbol is unknown until the Huffman codes + * have been built and the Huffman codes themselves depend on the symbol + * frequencies, this uses an iterative optimization algorithm to approximate an + * optimal solution. 
The first optimization pass for the block uses default + * costs. Additional passes use costs taken from the Huffman codes computed in + * the previous pass. + */ static inline struct lzx_lru_queue lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, struct lzx_output_bitstream * const restrict os, const u8 * const restrict block_begin, - const u32 block_size, + const u32 block_length, const struct lzx_lru_queue initial_queue, bool is_16_bit) { @@ -1759,25 +1960,26 @@ lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, struct lzx_lru_queue new_queue; u32 seq_idx; - /* The first optimization pass uses a default cost model. Each - * additional optimization pass uses a cost model derived from the - * Huffman code computed in the previous pass. */ + lzx_set_default_costs(c, block_begin, block_length); - lzx_set_default_costs(c, block_begin, block_size); - lzx_reset_symbol_frequencies(c); - do { - new_queue = lzx_find_min_cost_path(c, block_begin, block_size, + for (;;) { + new_queue = lzx_find_min_cost_path(c, block_begin, block_length, initial_queue, is_16_bit); - if (num_passes_remaining > 1) { - lzx_tally_item_list(c, block_size, is_16_bit); - lzx_make_huffman_codes(c); - lzx_update_costs(c); - lzx_reset_symbol_frequencies(c); - } - } while (--num_passes_remaining); - seq_idx = lzx_record_item_list(c, block_size, is_16_bit); - lzx_finish_block(c, os, block_begin, block_size, seq_idx); + if (--num_passes_remaining == 0) + break; + + /* At least one iteration remains; update the costs. */ + lzx_reset_symbol_frequencies(c); + lzx_tally_item_list(c, block_length, is_16_bit); + lzx_make_huffman_codes(c); + lzx_set_costs_from_codes(c); + } + + /* Done optimizing. Generate the sequence list and flush the block. */ + lzx_reset_symbol_frequencies(c); + seq_idx = lzx_record_item_list(c, block_length, is_16_bit); + lzx_flush_block(c, os, block_begin, block_length, seq_idx); return new_queue; } @@ -1795,63 +1997,51 @@ lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, * simpler "greedy" or "lazy" parse while still being relatively fast. */ static inline void -lzx_compress_near_optimal(struct lzx_compressor *c, - struct lzx_output_bitstream *os, +lzx_compress_near_optimal(struct lzx_compressor * restrict c, + const u8 * const restrict in_begin, + struct lzx_output_bitstream * restrict os, bool is_16_bit) { - const u8 * const in_begin = c->in_buffer; const u8 * in_next = in_begin; const u8 * const in_end = in_begin + c->in_nbytes; - u32 max_len = LZX_MAX_MATCH_LEN; - u32 nice_len = min(c->nice_match_length, max_len); - u32 next_hashes[2] = {}; struct lzx_lru_queue queue; - CALL_BT_MF(is_16_bit, c, bt_matchfinder_init); + lcpit_matchfinder_load_buffer(&c->lcpit_mf, in_begin, c->in_nbytes); lzx_lru_queue_init(&queue); do { /* Starting a new block */ const u8 * const in_block_begin = in_next; - const u8 * const in_block_end = - in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + const u8 * const in_max_block_end = + in_next + min(SOFT_MAX_BLOCK_LENGTH, in_end - in_next); + struct lz_match *cache_ptr = c->match_cache; + const u8 *next_observation = in_next; + const u8 *next_pause_point = min(in_next + MIN_BLOCK_LENGTH, + in_max_block_end - LZX_MAX_MATCH_LEN - 1); + + init_block_split_stats(&c->split_stats); /* Run the block through the matchfinder and cache the matches. 
*/ - struct lz_match *cache_ptr = c->match_cache; + enter_mf_loop: do { - struct lz_match *lz_matchptr; - u32 best_len; - - /* If approaching the end of the input buffer, adjust - * 'max_len' and 'nice_len' accordingly. */ - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = min(max_len, nice_len); - if (unlikely(max_len < - BT_MATCHFINDER_REQUIRED_NBYTES)) - { - in_next++; - cache_ptr->length = 0; - cache_ptr++; - continue; + u32 num_matches; + u32 best_len = 0; + + num_matches = lcpit_matchfinder_get_matches(&c->lcpit_mf, cache_ptr + 1); + cache_ptr->length = num_matches; + if (num_matches) + best_len = cache_ptr[1].length; + + if (in_next >= next_observation) { + if (best_len) { + observe_match(&c->split_stats, best_len); + next_observation = in_next + best_len; + } else { + observe_literal(&c->split_stats, *in_next); + next_observation = in_next + 1; } } - /* Check for matches. */ - lz_matchptr = CALL_BT_MF(is_16_bit, c, - bt_matchfinder_get_matches, - in_begin, - in_next - in_begin, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &best_len, - cache_ptr + 1); - in_next++; - cache_ptr->length = lz_matchptr - (cache_ptr + 1); - cache_ptr = lz_matchptr; - /* * If there was a very long match found, then don't * cache any matches for the bytes covered by that @@ -1864,37 +2054,47 @@ lzx_compress_near_optimal(struct lzx_compressor *c, * data must be highly compressible, so it doesn't * matter as much what we do. */ - if (best_len >= nice_len) { - --best_len; - do { - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = min(max_len, nice_len); - if (unlikely(max_len < - BT_MATCHFINDER_REQUIRED_NBYTES)) - { - in_next++; - cache_ptr->length = 0; - cache_ptr++; - continue; - } - } - CALL_BT_MF(is_16_bit, c, - bt_matchfinder_skip_position, - in_begin, - in_next - in_begin, - max_len, - nice_len, - c->max_search_depth, - next_hashes); - in_next++; + if (best_len >= c->nice_match_length) { + best_len = lz_extend(in_next, in_next - cache_ptr[1].offset, + best_len, + min(LZX_MAX_MATCH_LEN, + in_end - in_next)); + cache_ptr[1].length = best_len; + lcpit_matchfinder_skip_bytes(&c->lcpit_mf, best_len - 1); + cache_ptr += 1 + num_matches; + for (u32 i = 0; i < best_len - 1; i++) { cache_ptr->length = 0; cache_ptr++; - } while (--best_len); + } + in_next += best_len; + next_observation = in_next; + } else { + cache_ptr += 1 + num_matches; + in_next++; } - } while (in_next < in_block_end && + } while (in_next < next_pause_point && likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH])); + if (unlikely(cache_ptr >= &c->match_cache[LZX_CACHE_LENGTH])) + goto flush_block; + + if (in_next >= in_max_block_end) + goto flush_block; + + if (c->split_stats.num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK) { + if (do_end_block_check(&c->split_stats, in_next - in_block_begin)) + goto flush_block; + if (in_max_block_end - in_next <= MIN_BLOCK_LENGTH) + next_observation = in_max_block_end; + } + + next_pause_point = min(in_next + + NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 - + c->split_stats.num_new_observations, + in_max_block_end - LZX_MAX_MATCH_LEN - 1); + goto enter_mf_loop; + + flush_block: /* We've finished running the block through the matchfinder. * Now choose a match/literal sequence and write the block. 
*/ @@ -1908,14 +2108,14 @@ static void lzx_compress_near_optimal_16(struct lzx_compressor *c, struct lzx_output_bitstream *os) { - lzx_compress_near_optimal(c, os, true); + lzx_compress_near_optimal(c, c->in_buffer, os, true); } static void lzx_compress_near_optimal_32(struct lzx_compressor *c, struct lzx_output_bitstream *os) { - lzx_compress_near_optimal(c, os, false); + lzx_compress_near_optimal(c, c->in_buffer, os, false); } /* @@ -2012,8 +2212,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, /* Starting a new block */ const u8 * const in_block_begin = in_next; - const u8 * const in_block_end = - in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + const u8 * const in_max_block_end = + in_next + min(SOFT_MAX_BLOCK_LENGTH, in_end - in_next); struct lzx_sequence *next_seq = c->chosen_sequences; unsigned cur_len; u32 cur_offset; @@ -2030,6 +2230,7 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, u32 litrunlen = 0; lzx_reset_symbol_frequencies(c); + init_block_split_stats(&c->split_stats); do { if (unlikely(max_len > in_end - in_next)) { @@ -2058,10 +2259,14 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, { /* There was no match found, or the only match found * was a distant length 3 match. Output a literal. */ - lzx_record_literal(c, *in_next++, &litrunlen); + lzx_record_literal(c, *in_next, &litrunlen); + observe_literal(&c->split_stats, *in_next); + in_next++; continue; } + observe_match(&c->split_stats, cur_len); + if (cur_offset == recent_offsets[0]) { in_next++; cur_offset_data = 0; @@ -2174,11 +2379,12 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, in_end - in_begin, skip_len, next_hashes); - } while (in_next < in_block_end); + } while (in_next < in_max_block_end && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); lzx_finish_sequence(next_seq, litrunlen); - lzx_finish_block(c, os, in_block_begin, in_next - in_block_begin, 0); + lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0); } while (in_next != in_end); } @@ -2233,11 +2439,11 @@ lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level) hc_matchfinder_size_32(max_bufsize); } else { if (lzx_is_16_bit(max_bufsize)) - return offsetof(struct lzx_compressor, bt_mf_16) + - bt_matchfinder_size_16(max_bufsize); + return offsetof(struct lzx_compressor, lcpit_mf) + + sizeof(struct lcpit_matchfinder); else - return offsetof(struct lzx_compressor, bt_mf_32) + - bt_matchfinder_size_32(max_bufsize); + return offsetof(struct lzx_compressor, lcpit_mf) + + sizeof(struct lcpit_matchfinder); } } @@ -2253,6 +2459,8 @@ lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level, size += lzx_get_compressor_size(max_bufsize, compression_level); if (!destructive) size += max_bufsize; /* in_buffer */ + if (compression_level > LZX_MAX_FAST_LEVEL) + size += lcpit_matchfinder_get_needed_memory(max_bufsize); return size; } @@ -2343,10 +2551,17 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, if (c->nice_match_length > LZX_MAX_MATCH_LEN) c->nice_match_length = LZX_MAX_MATCH_LEN; + if (!lcpit_matchfinder_init(&c->lcpit_mf, max_bufsize, + LZX_MIN_MATCH_LEN, c->nice_match_length)) + goto oom2; + lzx_init_offset_slot_tabs(c); *c_ret = c; return 0; +oom2: + if (!c->destructive) + FREE(c->in_buffer); oom1: FREE(c); oom0: @@ -2395,6 +2610,7 @@ lzx_free_compressor(void *_c) { struct lzx_compressor *c = _c; + lcpit_matchfinder_destroy(&c->lcpit_mf); if 
(!c->destructive) FREE(c->in_buffer); FREE(c);
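For readers following the new block splitting code above, the following standalone sketch restates the end-of-block check outside the compressor, with a worked example. It is not part of the patch, and the observation counts in main() are hypothetical, but the 51/64 threshold, the block_length / 1024 term, and the cross-multiplication that avoids division follow do_end_block_check().

/*
 * Standalone sketch, not code from this patch: the end-of-block test used by
 * the block splitting algorithm, with a worked example.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_OBSERVATION_TYPES 10	/* 8 literal types + 2 match types */

struct split_stats {
	uint32_t new_observations[NUM_OBSERVATION_TYPES];	/* since last check */
	uint32_t observations[NUM_OBSERVATION_TYPES];		/* earlier in the block */
	uint32_t num_new_observations;
	uint32_t num_observations;
};

/* Return true if the recent symbol mix differs enough from the block so far
 * that starting a new block (with fresh Huffman codes) looks worthwhile. */
static bool
should_split(const struct split_stats *s, uint32_t block_length)
{
	if (s->num_observations == 0)
		return false;

	/* Sum of |actual - expected|, with both sides scaled by
	 * num_observations * num_new_observations so no division is needed. */
	uint64_t total_delta = 0;
	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		uint64_t expected = (uint64_t)s->observations[i] * s->num_new_observations;
		uint64_t actual = (uint64_t)s->new_observations[i] * s->num_observations;
		total_delta += (actual > expected) ? actual - expected : expected - actual;
	}

	/* Split when the scaled delta exceeds ~51/64 of the new observation
	 * count, plus a term that grows with the block length so that long
	 * blocks are ended more readily than short ones. */
	return total_delta + (uint64_t)(block_length / 1024) * s->num_observations >=
	       (uint64_t)s->num_new_observations * 51 / 64 * s->num_observations;
}

int
main(void)
{
	/* 1000 earlier observations were mostly one literal type; the most
	 * recent 500 are mostly "long match" observations. */
	struct split_stats s = { .num_new_observations = 500, .num_observations = 1000 };
	s.observations[0] = 900;
	s.observations[9] = 100;
	s.new_observations[0] = 100;
	s.new_observations[9] = 400;

	printf("split at 20000 bytes into the block? %s\n",
	       should_split(&s, 20000) ? "yes" : "no");	/* prints "yes" */
	return 0;
}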