X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flzx_compress.c;h=e43c370136e717b1e91f43bfab355778d745beee;hb=0852fe9daca5413edf943fb418ad8eca9ff82644;hp=d36f8aaa380ae747723f8e0f475862464af5145b;hpb=5ec910e4e9126a37eed1ff199d55a1952c76e0f7;p=wimlib

diff --git a/src/lzx_compress.c b/src/lzx_compress.c
index d36f8aaa..e43c3701 100644
--- a/src/lzx_compress.c
+++ b/src/lzx_compress.c
@@ -5,7 +5,7 @@
  */

 /*
- * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers
+ * Copyright (C) 2012-2016 Eric Biggers
  *
  * This file is free software; you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -65,33 +65,35 @@
 #endif

 /*
- * Start a new LZX block (with new Huffman codes) after this many bytes.
- *
- * Note: actual block sizes may slightly exceed this value.
- *
- * TODO: recursive splitting and cost evaluation might be good for an extremely
- * high compression mode, but otherwise it is almost always far too slow for how
- * much it helps. Perhaps some sort of heuristic would be useful?
+ * The compressor always chooses a block of at least MIN_BLOCK_SIZE bytes,
+ * except if the last block has to be shorter.
  */
-#define LZX_DIV_BLOCK_SIZE 32768
+#define MIN_BLOCK_SIZE 6500

 /*
- * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the
- * match cache for each byte position. This value should be high enough so that
- * nearly the time, all matches found in a given block can fit in the match
- * cache. However, fallback behavior (immediately terminating the block) on
- * cache overflow is still required.
+ * The compressor attempts to end blocks after SOFT_MAX_BLOCK_SIZE bytes, but
+ * the final size might be larger due to matches extending beyond the end of the
+ * block. Specifically:
+ *
+ *  - The greedy parser may choose an arbitrarily long match starting at the
+ *    SOFT_MAX_BLOCK_SIZE'th byte.
+ *
+ *  - The lazy parser may choose a sequence of literals starting at the
+ *    SOFT_MAX_BLOCK_SIZE'th byte when it sees a sequence of increasingly good
+ *    matches. The final match may be of arbitrary length. The length of the
+ *    literal sequence is approximately limited by the "nice match length"
+ *    parameter.
  */
-#define LZX_CACHE_PER_POS 7
+#define SOFT_MAX_BLOCK_SIZE 100000

 /*
  * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache,
- * excluding the extra "overflow" entries. The per-position multiplier is '1 +
- * LZX_CACHE_PER_POS' instead of 'LZX_CACHE_PER_POS' because there is an
- * overhead of one lz_match per position, used to hold the match count at that
- * position.
+ * excluding the extra "overflow" entries. This value should be high enough so
+ * that nearly all the time, all matches found in a given block can fit in the
+ * match cache. However, fallback behavior (immediately terminating the block)
+ * on cache overflow is still required.
  */
-#define LZX_CACHE_LENGTH (LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS))
+#define LZX_CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5)

 /*
  * LZX_MAX_MATCHES_PER_POS is an upper bound on the number of matches that can
@@ -214,6 +216,17 @@ struct lzx_freqs {
 	u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };

+/* Block split statistics. See "Block splitting algorithm" below. 
*/ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) +struct block_split_stats { + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; +}; + /* * Represents a run of literals followed by a match or end-of-block. This * struct is needed to temporarily store items chosen by the parser, since items @@ -232,9 +245,10 @@ struct lzx_sequence { u16 adjusted_length; /* If bit 31 is clear, then this field contains the match header in bits - * 0-8 and the match offset minus LZX_OFFSET_ADJUSTMENT in bits 9-30. - * Otherwise, this sequence's literal run was the last literal run in - * the block, so there is no match that follows it. */ + * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a + * recent offset code in bits 9-30. Otherwise (if bit 31 is set), this + * sequence's literal run was the last literal run in the block, so + * there is no match that follows it. */ u32 adjusted_offset_and_match_hdr; }; @@ -331,15 +345,6 @@ lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset) }; } -/* Pop a match offset off the front (most recently used) end of the queue. */ -static inline u32 -lzx_lru_queue_pop(struct lzx_lru_queue *queue_p) -{ - u32 offset = queue_p->R & LZX_QUEUE64_OFFSET_MASK; - queue_p->R >>= LZX_QUEUE64_OFFSET_SHIFT; - return offset; -} - /* Swap a match offset to the front of the queue. */ static inline struct lzx_lru_queue lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx) @@ -402,6 +407,9 @@ struct lzx_compressor { /* The Huffman symbol frequency counters for the current block. */ struct lzx_freqs freqs; + /* Block split statistics. */ + struct block_split_stats split_stats; + /* The Huffman codes for the current and previous blocks. The one with * index 'codes_index' is for the current block, and the other one is * for the previous block. */ @@ -410,8 +418,10 @@ struct lzx_compressor { /* The matches and literals that the parser has chosen for the current * block. The required length of this array is limited by the maximum - * number of matches that can ever be chosen for a single block. */ - struct lzx_sequence chosen_sequences[DIV_ROUND_UP(LZX_DIV_BLOCK_SIZE, LZX_MIN_MATCH_LEN)]; + * number of matches that can ever be chosen for a single block, plus + * one for the special entry at the end. */ + struct lzx_sequence chosen_sequences[ + DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1]; /* Tables for mapping adjusted offsets to offset slots */ @@ -434,24 +444,18 @@ struct lzx_compressor { /* Data for near-optimal parsing */ struct { /* - * The graph nodes for the current block. + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. * - * We need at least 'LZX_DIV_BLOCK_SIZE + - * LZX_MAX_MATCH_LEN - 1' nodes because that is the - * maximum block size that may be used. Add 1 because - * we need a node to represent end-of-block. - * - * It is possible that nodes past end-of-block are - * accessed during match consideration, but this can - * only occur if the block was truncated at - * LZX_DIV_BLOCK_SIZE. So the same bound still applies. - * Note that since nodes past the end of the block will - * never actually have an effect on the items that are - * chosen for the block, it makes no difference what - * their costs are initialized to (if anything). 
+ * This array must be large enough to accommodate the + * worst-case number of nodes, which occurs if we find a + * match of length LZX_MAX_MATCH_LEN at position + * SOFT_MAX_BLOCK_SIZE - 1, producing a block of length + * SOFT_MAX_BLOCK_SIZE - 1 + LZX_MAX_MATCH_LEN. Add one + * for the end-of-block node. */ - struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE + - LZX_MAX_MATCH_LEN - 1 + 1]; + struct lzx_optimum_node optimum_nodes[SOFT_MAX_BLOCK_SIZE - 1 + + LZX_MAX_MATCH_LEN + 1]; /* The cost model for the current block */ struct lzx_costs costs; @@ -548,7 +552,7 @@ struct lzx_output_bitstream { /* Can the specified number of bits always be added to 'bitbuf' after any * pending 16-bit coding units have been flushed? */ -#define CAN_BUFFER(n) ((n) <= (8 * sizeof(machine_word_t)) - 16) +#define CAN_BUFFER(n) ((n) <= (8 * sizeof(machine_word_t)) - 15) /* * Initialize the output bitstream. @@ -585,13 +589,21 @@ lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits) static inline void lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits) { + /* Masking the number of bits to shift is only needed to avoid undefined + * behavior; we don't actually care about the results of bad shifts. On + * x86, the explicit masking generates no extra code. */ + const u32 shift_mask = 8 * sizeof(os->bitbuf) - 1; + if (os->end - os->next < 6) return; - put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 16), os->next + 0); + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) & + shift_mask), os->next + 0); if (max_num_bits > 16) - put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 32), os->next + 2); + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) & + shift_mask), os->next + 2); if (max_num_bits > 32) - put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 48), os->next + 4); + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) & + shift_mask), os->next + 4); os->next += (os->bitcount >> 4) << 1; os->bitcount &= 15; } @@ -615,7 +627,7 @@ lzx_flush_output(struct lzx_output_bitstream *os) return 0; if (os->bitcount != 0) { - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next); + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next); os->next += 2; } @@ -634,7 +646,7 @@ lzx_make_huffman_codes(struct lzx_compressor *c) STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 && MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN); - STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 9 && + STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 && LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN); STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS && ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN); @@ -901,23 +913,30 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, unsigned lit1 = block_data[1]; unsigned lit2 = block_data[2]; unsigned lit3 = block_data[3]; - lzx_add_bits(os, codes->codewords.main[lit0], codes->lens.main[lit0]); - lzx_add_bits(os, codes->codewords.main[lit1], codes->lens.main[lit1]); - lzx_add_bits(os, codes->codewords.main[lit2], codes->lens.main[lit2]); - lzx_add_bits(os, codes->codewords.main[lit3], codes->lens.main[lit3]); + lzx_add_bits(os, codes->codewords.main[lit0], + codes->lens.main[lit0]); + lzx_add_bits(os, codes->codewords.main[lit1], + codes->lens.main[lit1]); + lzx_add_bits(os, codes->codewords.main[lit2], + codes->lens.main[lit2]); + lzx_add_bits(os, codes->codewords.main[lit3], + codes->lens.main[lit3]); lzx_flush_bits(os, 4 * MAIN_CODEWORD_LIMIT); block_data += 4; litrunlen -= 4; } if (litrunlen--) { 
unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); if (litrunlen--) { unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); if (litrunlen--) { unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT); } else { lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT); @@ -930,7 +949,8 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* 32-bit: write 1 literal at a time. */ do { unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); lzx_flush_bits(os, MAIN_CODEWORD_LIMIT); } while (--litrunlen); } @@ -970,8 +990,10 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* If needed, output the length symbol for the match. */ if (adjusted_length >= LZX_NUM_PRIMARY_LENS) { - lzx_add_bits(os, codes->codewords.len[adjusted_length - LZX_NUM_PRIMARY_LENS], - codes->lens.len[adjusted_length - LZX_NUM_PRIMARY_LENS]); + lzx_add_bits(os, codes->codewords.len[adjusted_length - + LZX_NUM_PRIMARY_LENS], + codes->lens.len[adjusted_length - + LZX_NUM_PRIMARY_LENS]); if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT); } @@ -989,11 +1011,15 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, 14); - lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK], - codes->lens.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]); + lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK], + codes->lens.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]); if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT); } else { + STATIC_ASSERT(CAN_BUFFER(17)); + lzx_add_bits(os, extra_bits, num_extra_bits); if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, 17); @@ -1221,6 +1247,121 @@ lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen) last_seq->adjusted_offset_and_match_hdr = 0x80000000; } +/******************************************************************************/ + +/* + * Block splitting algorithm. The problem is to decide when it is worthwhile to + * start a new block with new entropy codes. There is a theoretically optimal + * solution: recursively consider every possible block split, considering the + * exact cost of each block, and choose the minimum cost approach. But this is + * far too slow. Instead, as an approximation, we can count symbols and after + * every N symbols, compare the expected distribution of symbols based on the + * previous data with the actual distribution. If they differ "by enough", then + * start a new block. + * + * As an optimization and heuristic, we don't distinguish between every symbol + * but rather we combine many symbols into a single "observation type". For + * literals we only look at the high bits and low bits, and for matches we only + * look at whether the match is long or not. 
The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols. For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simple heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough. We also add a number
+ * proportional to the block size so that the algorithm is more likely to end
+ * large blocks than small blocks. This reflects the general expectation that
+ * it will become increasingly beneficial to start a new block as the current
+ * block grows larger.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered. With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized. Since the final choices
+ * cannot be used, we can use preliminary "greedy" choices instead.
+ */
+
+/* Initialize the block split statistics when starting a new block. */
+static void
+init_block_split_stats(struct block_split_stats *stats)
+{
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->new_observations[i] = 0;
+		stats->observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	stats->num_observations = 0;
+}
+
+/* Literal observation. Heuristic: use the top 2 bits and low 1 bit of the
+ * literal, for 8 possible literal observation types. */
+static inline void
+observe_literal(struct block_split_stats *stats, u8 lit)
+{
+	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+	stats->num_new_observations++;
+}
+
+/* Match observation. Heuristic: use one observation type for "short match" and
+ * one observation type for "long match". */
+static inline void
+observe_match(struct block_split_stats *stats, unsigned length)
+{
+	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
+	stats->num_new_observations++;
+}
+
+static bool
+do_end_block_check(struct block_split_stats *stats, u32 block_size)
+{
+	if (stats->num_observations > 0) {
+
+		/* Note: to avoid slow divisions, we do not divide by
+		 * 'num_observations', but rather do all math with the numbers
+		 * multiplied by 'num_observations'. */
+		u32 total_delta = 0;
+		for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+			u32 expected = stats->observations[i] * stats->num_new_observations;
+			u32 actual = stats->new_observations[i] * stats->num_observations;
+			u32 delta = (actual > expected) ? actual - expected :
+							  expected - actual;
+			total_delta += delta;
+		}
+
+		/* Ready to end the block? */
+		if (total_delta + (block_size >> 10) * stats->num_observations >=
+		    200 * stats->num_observations)
+			return true;
+	}
+
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->num_observations += stats->new_observations[i];
+		stats->observations[i] += stats->new_observations[i];
+		stats->new_observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	return false;
+}
+
+static inline bool
+should_end_block(struct block_split_stats *stats,
+		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
+{
+	/* Ready to check block split statistics? 
*/ + if (stats->num_new_observations < 250 || + in_next - in_block_begin < MIN_BLOCK_SIZE || + in_end - in_next < MIN_BLOCK_SIZE) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); +} + +/******************************************************************************/ + /* * Given the minimum-cost path computed through the item graph for the current * block, walk the path and count how many of each symbol in each Huffman-coded @@ -1585,8 +1726,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, * of coding the literal is integrated into the queue update * code below. */ literal = *in_next++; - cost = cur_node->cost + - c->costs.main[lzx_main_symbol_for_literal(literal)]; + cost = cur_node->cost + c->costs.main[literal]; /* Advance to the next position. */ cur_node++; @@ -1626,13 +1766,15 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, static void lzx_compute_match_costs(struct lzx_compressor *c) { - unsigned num_offset_slots = lzx_get_num_offset_slots(c->window_order); + unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) / + LZX_NUM_LEN_HEADERS; struct lzx_costs *costs = &c->costs; for (unsigned offset_slot = 0; offset_slot < num_offset_slots; offset_slot++) { u32 extra_cost = (u32)lzx_extra_offset_bits[offset_slot] * LZX_BIT_COST; - unsigned main_symbol = lzx_main_symbol_for_match(offset_slot, 0); + unsigned main_symbol = LZX_NUM_CHARS + (offset_slot * + LZX_NUM_LEN_HEADERS); unsigned i; #if LZX_CONSIDER_ALIGNED_COSTS @@ -1710,15 +1852,21 @@ lzx_update_costs(struct lzx_compressor *c) unsigned i; const struct lzx_lens *lens = &c->codes[c->codes_index].lens; - for (i = 0; i < c->num_main_syms; i++) - c->costs.main[i] = (lens->main[i] ? lens->main[i] : 15) * LZX_BIT_COST; + for (i = 0; i < c->num_main_syms; i++) { + c->costs.main[i] = (lens->main[i] ? lens->main[i] : + MAIN_CODEWORD_LIMIT) * LZX_BIT_COST; + } - for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) - c->costs.len[i] = (lens->len[i] ? lens->len[i] : 15) * LZX_BIT_COST; + for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) { + c->costs.len[i] = (lens->len[i] ? lens->len[i] : + LENGTH_CODEWORD_LIMIT) * LZX_BIT_COST; + } #if LZX_CONSIDER_ALIGNED_COSTS - for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) - c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : 7) * LZX_BIT_COST; + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : + ALIGNED_CODEWORD_LIMIT) * LZX_BIT_COST; + } #endif lzx_compute_match_costs(c); @@ -1781,7 +1929,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c, const u8 * const in_end = in_begin + c->in_nbytes; u32 max_len = LZX_MAX_MATCH_LEN; u32 nice_len = min(c->nice_match_length, max_len); - u32 next_hash = 0; + u32 next_hashes[2] = {}; struct lzx_lru_queue queue; CALL_BT_MF(is_16_bit, c, bt_matchfinder_init); @@ -1790,8 +1938,11 @@ lzx_compress_near_optimal(struct lzx_compressor *c, do { /* Starting a new block */ const u8 * const in_block_begin = in_next; - const u8 * const in_block_end = - in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + const u8 * const in_max_block_end = + in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next); + const u8 *next_observation = in_next; + + init_block_split_stats(&c->split_stats); /* Run the block through the matchfinder and cache the matches. 
*/ struct lz_match *cache_ptr = c->match_cache; @@ -1804,7 +1955,9 @@ lzx_compress_near_optimal(struct lzx_compressor *c, if (unlikely(max_len > in_end - in_next)) { max_len = in_end - in_next; nice_len = min(max_len, nice_len); - if (unlikely(max_len < 5)) { + if (unlikely(max_len < + BT_MATCHFINDER_REQUIRED_NBYTES)) + { in_next++; cache_ptr->length = 0; cache_ptr++; @@ -1813,15 +1966,30 @@ lzx_compress_near_optimal(struct lzx_compressor *c, } /* Check for matches. */ - lz_matchptr = CALL_BT_MF(is_16_bit, c, bt_matchfinder_get_matches, + lz_matchptr = CALL_BT_MF(is_16_bit, c, + bt_matchfinder_get_matches, in_begin, in_next - in_begin, max_len, nice_len, c->max_search_depth, - &next_hash, + next_hashes, &best_len, cache_ptr + 1); + + if (in_next >= next_observation) { + best_len = 0; + if (lz_matchptr > cache_ptr + 1) + best_len = (lz_matchptr - 1)->length; + if (best_len >= 2) { + observe_match(&c->split_stats, best_len); + next_observation = in_next + best_len; + } else { + observe_literal(&c->split_stats, *in_next); + next_observation = in_next + 1; + } + } + in_next++; cache_ptr->length = lz_matchptr - (cache_ptr + 1); cache_ptr = lz_matchptr; @@ -1844,27 +2012,31 @@ lzx_compress_near_optimal(struct lzx_compressor *c, if (unlikely(max_len > in_end - in_next)) { max_len = in_end - in_next; nice_len = min(max_len, nice_len); - if (unlikely(max_len < 5)) { + if (unlikely(max_len < + BT_MATCHFINDER_REQUIRED_NBYTES)) + { in_next++; cache_ptr->length = 0; cache_ptr++; continue; } } - CALL_BT_MF(is_16_bit, c, bt_matchfinder_skip_position, + CALL_BT_MF(is_16_bit, c, + bt_matchfinder_skip_position, in_begin, in_next - in_begin, max_len, nice_len, c->max_search_depth, - &next_hash); + next_hashes); in_next++; cache_ptr->length = 0; cache_ptr++; } while (--best_len); } - } while (in_next < in_block_end && - likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH])); + } while (in_next < in_max_block_end && + likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH]) && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); /* We've finished running the block through the matchfinder. * Now choose a match/literal sequence and write the block. */ @@ -1983,8 +2155,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, /* Starting a new block */ const u8 * const in_block_begin = in_next; - const u8 * const in_block_end = - in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + const u8 * const in_max_block_end = + in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next); struct lzx_sequence *next_seq = c->chosen_sequences; unsigned cur_len; u32 cur_offset; @@ -2001,6 +2173,7 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, u32 litrunlen = 0; lzx_reset_symbol_frequencies(c); + init_block_split_stats(&c->split_stats); do { if (unlikely(max_len > in_end - in_next)) { @@ -2010,7 +2183,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, /* Find the longest match at the current position. */ - cur_len = CALL_HC_MF(is_16_bit, c, hc_matchfinder_longest_match, + cur_len = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_longest_match, in_begin, in_next - in_begin, 2, @@ -2028,10 +2202,14 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, { /* There was no match found, or the only match found * was a distant length 3 match. Output a literal. 
*/ - lzx_record_literal(c, *in_next++, &litrunlen); + lzx_record_literal(c, *in_next, &litrunlen); + observe_literal(&c->split_stats, *in_next); + in_next++; continue; } + observe_match(&c->split_stats, cur_len); + if (cur_offset == recent_offsets[0]) { in_next++; cur_offset_data = 0; @@ -2076,7 +2254,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, nice_len = min(max_len, nice_len); } - next_len = CALL_HC_MF(is_16_bit, c, hc_matchfinder_longest_match, + next_len = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_longest_match, in_begin, in_next - in_begin, cur_len - 2, @@ -2136,13 +2315,15 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, lzx_record_match(c, cur_len, cur_offset_data, recent_offsets, is_16_bit, &litrunlen, &next_seq); - in_next = CALL_HC_MF(is_16_bit, c, hc_matchfinder_skip_positions, + in_next = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_skip_positions, in_begin, in_next - in_begin, in_end - in_begin, skip_len, next_hashes); - } while (in_next < in_block_end); + } while (in_next < in_max_block_end && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); lzx_finish_sequence(next_seq, litrunlen); @@ -2258,8 +2439,8 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, c->impl = lzx_compress_lazy_16; else c->impl = lzx_compress_lazy_32; - c->max_search_depth = (36 * compression_level) / 20; - c->nice_match_length = (72 * compression_level) / 20; + c->max_search_depth = (60 * compression_level) / 20; + c->nice_match_length = (80 * compression_level) / 20; /* lzx_compress_lazy() needs max_search_depth >= 2 because it * halves the max_search_depth when attempting a lazy match, and @@ -2278,7 +2459,7 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, /* Scale nice_match_length and max_search_depth with the * compression level. */ c->max_search_depth = (24 * compression_level) / 50; - c->nice_match_length = (32 * compression_level) / 50; + c->nice_match_length = (48 * compression_level) / 50; /* Set a number of optimization passes appropriate for the * compression level. */ @@ -2339,7 +2520,7 @@ lzx_compress(const void *restrict in, size_t in_nbytes, else memcpy(c->in_buffer, in, in_nbytes); c->in_nbytes = in_nbytes; - lzx_do_e8_preprocessing(c->in_buffer, in_nbytes); + lzx_preprocess(c->in_buffer, in_nbytes); /* Initially, the previous Huffman codeword lengths are all zeroes. */ c->codes_index = 0; @@ -2354,7 +2535,7 @@ lzx_compress(const void *restrict in, size_t in_nbytes, /* Flush the output bitstream and return the compressed size or 0. */ result = lzx_flush_output(&os); if (!result && c->destructive) - lzx_undo_e8_preprocessing(c->in_buffer, c->in_nbytes); + lzx_postprocess(c->in_buffer, c->in_nbytes); return result; }
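
For readers who want to see the block-splitting heuristic in isolation, below is a minimal standalone sketch. It is not part of the patch; the observation counts, block size, and array contents are invented. It only mirrors the division-free comparison that do_end_block_check() performs on the aggregate observation counts.

/*
 * Standalone sketch (not part of the patch; all values are hypothetical).
 * "expected" is the count each observation type would have if the new batch
 * followed the same distribution as the block so far; both sides of the
 * comparison are scaled by the total counts instead of dividing.
 */
#include <stdio.h>
#include <stdint.h>

#define NUM_TYPES 10	/* 8 literal types + 2 match types, as in the patch */

int main(void)
{
	/* Observations accumulated over the block so far (made-up counts). */
	uint32_t observations[NUM_TYPES]     = { 40, 10,  5, 5, 60, 20, 5, 5, 90,  10 };
	/* The most recent batch of observations (made-up counts). */
	uint32_t new_observations[NUM_TYPES] = {  5, 30, 10, 5, 10, 60, 5, 5, 20, 100 };
	uint32_t block_size = 20000;	/* bytes in the block so far */
	uint32_t num_obs = 0, num_new = 0, total_delta = 0;

	for (int i = 0; i < NUM_TYPES; i++) {
		num_obs += observations[i];
		num_new += new_observations[i];
	}

	/* Compare the new batch against the block-so-far distribution without
	 * dividing: scale the expected counts by num_new and the actual counts
	 * by num_obs, then sum the absolute differences. */
	for (int i = 0; i < NUM_TYPES; i++) {
		uint32_t expected = observations[i] * num_new;
		uint32_t actual   = new_observations[i] * num_obs;
		total_delta += (actual > expected) ? actual - expected
						   : expected - actual;
	}

	/* Same shape as the patch's check: the (block_size >> 10) term makes
	 * ending the block more likely as the block grows. */
	uint32_t delta_side     = total_delta + (block_size >> 10) * num_obs;
	uint32_t threshold_side = 200 * num_obs;

	printf("delta side = %u, threshold side = %u -> %s\n",
	       (unsigned)delta_side, (unsigned)threshold_side,
	       delta_side >= threshold_side ? "start a new block" : "keep going");
	return 0;
}

With these invented numbers, the aggregate distribution of the new batch differs sharply from the block so far, so the check reports that a new block should be started; with a batch that closely matches the accumulated distribution it would report "keep going" instead.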