X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flzx_compress.c;h=9fffa4c211534e218dccf6456134faf9d546a65d;hb=2544ba6be482ef32e23eb60ccfcf154eabf6c7e6;hp=ac65c5dcaf15f7aa01a8803dbb578472524da08f;hpb=7e71e0ea5e7c2958cef2cc3522367a7da82e4728;p=wimlib diff --git a/src/lzx_compress.c b/src/lzx_compress.c index ac65c5dc..9fffa4c2 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -5,7 +5,7 @@ */ /* - * Copyright (C) 2012, 2013, 2014, 2015 Eric Biggers + * Copyright (C) 2012-2016 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -65,33 +65,41 @@ #endif /* - * Start a new LZX block (with new Huffman codes) after this many bytes. + * The compressor always chooses a block of at least MIN_BLOCK_SIZE bytes, + * except if the last block has to be shorter. + */ +#define MIN_BLOCK_SIZE 6500 + +/* + * The compressor attempts to end blocks after SOFT_MAX_BLOCK_SIZE bytes, but + * the final size might be larger due to matches extending beyond the end of the + * block. Specifically: * - * Note: actual block sizes may slightly exceed this value. + * - The greedy parser may choose an arbitrarily long match starting at the + * SOFT_MAX_BLOCK_SIZE'th byte. * - * TODO: recursive splitting and cost evaluation might be good for an extremely - * high compression mode, but otherwise it is almost always far too slow for how - * much it helps. Perhaps some sort of heuristic would be useful? + * - The lazy parser may choose a sequence of literals starting at the + * SOFT_MAX_BLOCK_SIZE'th byte when it sees a sequence of increasing good + * matches. The final match may be of arbitrary length. The length of the + * literal sequence is approximately limited by the "nice match length" + * parameter. */ -#define LZX_DIV_BLOCK_SIZE 32768 +#define SOFT_MAX_BLOCK_SIZE 100000 /* - * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the - * match cache for each byte position. This value should be high enough so that - * nearly the time, all matches found in a given block can fit in the match - * cache. However, fallback behavior (immediately terminating the block) on - * cache overflow is still required. + * The number of observed matches or literals that represents sufficient data to + * decide whether the current block should be terminated or not. */ -#define LZX_CACHE_PER_POS 7 +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 500 /* * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache, - * excluding the extra "overflow" entries. The per-position multiplier is '1 + - * LZX_CACHE_PER_POS' instead of 'LZX_CACHE_PER_POS' because there is an - * overhead of one lz_match per position, used to hold the match count at that - * position. + * excluding the extra "overflow" entries. This value should be high enough so + * that nearly the time, all matches found in a given block can fit in the match + * cache. However, fallback behavior (immediately terminating the block) on + * cache overflow is still required. */ -#define LZX_CACHE_LENGTH (LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS)) +#define LZX_CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5) /* * LZX_MAX_MATCHES_PER_POS is an upper bound on the number of matches that can @@ -111,13 +119,12 @@ * unknown. In reality, each token in LZX requires a whole number of bits to * output. 
*/ -#define LZX_BIT_COST 16 +#define LZX_BIT_COST 64 /* - * Consideration of aligned offset costs is disabled for now, due to - * insufficient benefit gained from the time spent. + * Should the compressor take into account the costs of aligned offset symbols? */ -#define LZX_CONSIDER_ALIGNED_COSTS 0 +#define LZX_CONSIDER_ALIGNED_COSTS 1 /* * LZX_MAX_FAST_LEVEL is the maximum compression level at which we use the @@ -126,13 +133,12 @@ #define LZX_MAX_FAST_LEVEL 34 /* - * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table - * for finding length 2 matches. This can be as high as 16 (in which case the - * hash function is trivial), but using a smaller hash table speeds up - * compression due to reduced cache pressure. + * BT_MATCHFINDER_HASH2_ORDER is the log base 2 of the number of entries in the + * hash table for finding length 2 matches. This could be as high as 16, but + * using a smaller hash table speeds up compression due to reduced cache + * pressure. */ -#define LZX_HASH2_ORDER 12 -#define LZX_HASH2_LENGTH (1UL << LZX_HASH2_ORDER) +#define BT_MATCHFINDER_HASH2_ORDER 12 /* * These are the compressor-side limits on the codeword lengths for each Huffman @@ -140,7 +146,7 @@ * lower than the limits defined by the LZX format. This does not significantly * affect the compression ratio, at least for the block sizes we use. */ -#define MAIN_CODEWORD_LIMIT 12 /* 64-bit: can buffer 4 main symbols */ +#define MAIN_CODEWORD_LIMIT 16 #define LENGTH_CODEWORD_LIMIT 12 #define ALIGNED_CODEWORD_LIMIT 7 #define PRE_CODEWORD_LIMIT 7 @@ -154,16 +160,16 @@ #include "wimlib/util.h" /* Matchfinders with 16-bit positions */ -#define pos_t u16 -#define MF_SUFFIX _16 +#define mf_pos_t u16 +#define MF_SUFFIX _16 #include "wimlib/bt_matchfinder.h" #include "wimlib/hc_matchfinder.h" /* Matchfinders with 32-bit positions */ -#undef pos_t +#undef mf_pos_t #undef MF_SUFFIX -#define pos_t u32 -#define MF_SUFFIX _32 +#define mf_pos_t u32 +#define MF_SUFFIX _32 #include "wimlib/bt_matchfinder.h" #include "wimlib/hc_matchfinder.h" @@ -216,6 +222,17 @@ struct lzx_freqs { u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; }; +/* Block split statistics. See "Block splitting algorithm" below. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) +struct block_split_stats { + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; +}; + /* * Represents a run of literals followed by a match or end-of-block. This * struct is needed to temporarily store items chosen by the parser, since items @@ -234,9 +251,10 @@ struct lzx_sequence { u16 adjusted_length; /* If bit 31 is clear, then this field contains the match header in bits - * 0-8 and the match offset minus LZX_OFFSET_ADJUSTMENT in bits 9-30. - * Otherwise, this sequence's literal run was the last literal run in - * the block, so there is no match that follows it. */ + * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a + * recent offset code in bits 9-30. Otherwise (if bit 31 is set), this + * sequence's literal run was the last literal run in the block, so + * there is no match that follows it. 
*/ u32 adjusted_offset_and_match_hdr; }; @@ -275,6 +293,9 @@ struct lzx_optimum_node { u32 item; #define OPTIMUM_OFFSET_SHIFT 9 #define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1) +#define OPTIMUM_EXTRA_FLAG 0x80000000 + u32 extra_match; + u32 extra_literal; } _aligned_attribute(8); /* @@ -333,15 +354,6 @@ lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset) }; } -/* Pop a match offset off the front (most recently used) end of the queue. */ -static inline u32 -lzx_lru_queue_pop(struct lzx_lru_queue *queue_p) -{ - u32 offset = queue_p->R & LZX_QUEUE64_OFFSET_MASK; - queue_p->R >>= LZX_QUEUE64_OFFSET_SHIFT; - return offset; -} - /* Swap a match offset to the front of the queue. */ static inline struct lzx_lru_queue lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx) @@ -404,6 +416,9 @@ struct lzx_compressor { /* The Huffman symbol frequency counters for the current block. */ struct lzx_freqs freqs; + /* Block split statistics. */ + struct block_split_stats split_stats; + /* The Huffman codes for the current and previous blocks. The one with * index 'codes_index' is for the current block, and the other one is * for the previous block. */ @@ -412,8 +427,10 @@ struct lzx_compressor { /* The matches and literals that the parser has chosen for the current * block. The required length of this array is limited by the maximum - * number of matches that can ever be chosen for a single block. */ - struct lzx_sequence chosen_sequences[DIV_ROUND_UP(LZX_DIV_BLOCK_SIZE, LZX_MIN_MATCH_LEN)]; + * number of matches that can ever be chosen for a single block, plus + * one for the special entry at the end. */ + struct lzx_sequence chosen_sequences[ + DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1]; /* Tables for mapping adjusted offsets to offset slots */ @@ -436,24 +453,18 @@ struct lzx_compressor { /* Data for near-optimal parsing */ struct { /* - * The graph nodes for the current block. + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. * - * We need at least 'LZX_DIV_BLOCK_SIZE + - * LZX_MAX_MATCH_LEN - 1' nodes because that is the - * maximum block size that may be used. Add 1 because - * we need a node to represent end-of-block. - * - * It is possible that nodes past end-of-block are - * accessed during match consideration, but this can - * only occur if the block was truncated at - * LZX_DIV_BLOCK_SIZE. So the same bound still applies. - * Note that since nodes past the end of the block will - * never actually have an effect on the items that are - * chosen for the block, it makes no difference what - * their costs are initialized to (if anything). + * This array must be large enough to accommodate the + * worst-case number of nodes, which occurs if we find a + * match of length LZX_MAX_MATCH_LEN at position + * SOFT_MAX_BLOCK_SIZE - 1, producing a block of length + * SOFT_MAX_BLOCK_SIZE - 1 + LZX_MAX_MATCH_LEN. Add one + * for the end-of-block node. */ - struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE + - LZX_MAX_MATCH_LEN - 1 + 1]; + struct lzx_optimum_node optimum_nodes[SOFT_MAX_BLOCK_SIZE - 1 + + LZX_MAX_MATCH_LEN + 1]; /* The cost model for the current block */ struct lzx_costs costs; @@ -485,9 +496,6 @@ struct lzx_compressor { LZX_MAX_MATCHES_PER_POS + LZX_MAX_MATCH_LEN - 1]; - /* Hash table for finding length 2 matches */ - u32 hash2_tab[LZX_HASH2_LENGTH]; - /* Binary trees matchfinder (MUST BE LAST!!!) 
*/ union { struct bt_matchfinder_16 bt_mf_16; @@ -553,7 +561,7 @@ struct lzx_output_bitstream { /* Can the specified number of bits always be added to 'bitbuf' after any * pending 16-bit coding units have been flushed? */ -#define CAN_BUFFER(n) ((n) <= (8 * sizeof(machine_word_t)) - 16) +#define CAN_BUFFER(n) ((n) <= (8 * sizeof(machine_word_t)) - 15) /* * Initialize the output bitstream. @@ -590,13 +598,21 @@ lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits) static inline void lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits) { + /* Masking the number of bits to shift is only needed to avoid undefined + * behavior; we don't actually care about the results of bad shifts. On + * x86, the explicit masking generates no extra code. */ + const u32 shift_mask = 8 * sizeof(os->bitbuf) - 1; + if (os->end - os->next < 6) return; - put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 16), os->next + 0); + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) & + shift_mask), os->next + 0); if (max_num_bits > 16) - put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 32), os->next + 2); + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) & + shift_mask), os->next + 2); if (max_num_bits > 32) - put_unaligned_u16_le(os->bitbuf >> (os->bitcount - 48), os->next + 4); + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) & + shift_mask), os->next + 4); os->next += (os->bitcount >> 4) << 1; os->bitcount &= 15; } @@ -620,7 +636,7 @@ lzx_flush_output(struct lzx_output_bitstream *os) return 0; if (os->bitcount != 0) { - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next); + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next); os->next += 2; } @@ -639,7 +655,7 @@ lzx_make_huffman_codes(struct lzx_compressor *c) STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 && MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN); - STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 9 && + STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 && LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN); STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS && ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN); @@ -896,37 +912,34 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* Verify optimization is enabled on 64-bit */ STATIC_ASSERT(sizeof(machine_word_t) < 8 || - CAN_BUFFER(4 * MAIN_CODEWORD_LIMIT)); + CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)); - if (CAN_BUFFER(4 * MAIN_CODEWORD_LIMIT)) { + if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) { - /* 64-bit: write 4 literals at a time. */ - while (litrunlen >= 4) { + /* 64-bit: write 3 literals at a time. 
*/ + while (litrunlen >= 3) { unsigned lit0 = block_data[0]; unsigned lit1 = block_data[1]; unsigned lit2 = block_data[2]; - unsigned lit3 = block_data[3]; - lzx_add_bits(os, codes->codewords.main[lit0], codes->lens.main[lit0]); - lzx_add_bits(os, codes->codewords.main[lit1], codes->lens.main[lit1]); - lzx_add_bits(os, codes->codewords.main[lit2], codes->lens.main[lit2]); - lzx_add_bits(os, codes->codewords.main[lit3], codes->lens.main[lit3]); - lzx_flush_bits(os, 4 * MAIN_CODEWORD_LIMIT); - block_data += 4; - litrunlen -= 4; + lzx_add_bits(os, codes->codewords.main[lit0], + codes->lens.main[lit0]); + lzx_add_bits(os, codes->codewords.main[lit1], + codes->lens.main[lit1]); + lzx_add_bits(os, codes->codewords.main[lit2], + codes->lens.main[lit2]); + lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT); + block_data += 3; + litrunlen -= 3; } if (litrunlen--) { unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); if (litrunlen--) { unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); - if (litrunlen--) { - unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); - lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT); - } else { - lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT); - } + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); + lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT); } else { lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT); } @@ -935,7 +948,8 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* 32-bit: write 1 literal at a time. */ do { unsigned lit = *block_data++; - lzx_add_bits(os, codes->codewords.main[lit], codes->lens.main[lit]); + lzx_add_bits(os, codes->codewords.main[lit], + codes->lens.main[lit]); lzx_flush_bits(os, MAIN_CODEWORD_LIMIT); } while (--litrunlen); } @@ -975,8 +989,10 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, /* If needed, output the length symbol for the match. */ if (adjusted_length >= LZX_NUM_PRIMARY_LENS) { - lzx_add_bits(os, codes->codewords.len[adjusted_length - LZX_NUM_PRIMARY_LENS], - codes->lens.len[adjusted_length - LZX_NUM_PRIMARY_LENS]); + lzx_add_bits(os, codes->codewords.len[adjusted_length - + LZX_NUM_PRIMARY_LENS], + codes->lens.len[adjusted_length - + LZX_NUM_PRIMARY_LENS]); if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT); } @@ -994,11 +1010,15 @@ lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, 14); - lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK], - codes->lens.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]); + lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK], + codes->lens.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]); if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT); } else { + STATIC_ASSERT(CAN_BUFFER(17)); + lzx_add_bits(os, extra_bits, num_extra_bits); if (!CAN_BUFFER(MAX_MATCH_BITS)) lzx_flush_bits(os, 17); @@ -1023,9 +1043,6 @@ lzx_write_compressed_block(const u8 *block_begin, const struct lzx_lens * prev_lens, struct lzx_output_bitstream * os) { - LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED || - block_type == LZX_BLOCKTYPE_VERBATIM); - /* The first three bits indicate the type of block and are one of the * LZX_BLOCKTYPE_* constants. 
 */
 	lzx_write_bits(os, block_type, 3);
@@ -1229,6 +1246,121 @@ lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
 	last_seq->adjusted_offset_and_match_hdr = 0x80000000;
 }
 
+/******************************************************************************/
+
+/*
+ * Block splitting algorithm. The problem is to decide when it is worthwhile to
+ * start a new block with new entropy codes. There is a theoretically optimal
+ * solution: recursively consider every possible block split, considering the
+ * exact cost of each block, and choose the minimum cost approach. But this is
+ * far too slow. Instead, as an approximation, we can count symbols and after
+ * every N symbols, compare the expected distribution of symbols based on the
+ * previous data with the actual distribution. If they differ "by enough", then
+ * start a new block.
+ *
+ * As an optimization and heuristic, we don't distinguish between every symbol
+ * but rather we combine many symbols into a single "observation type". For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not. The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols. For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simple heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough. We also add a number
+ * proportional to the block size so that the algorithm is more likely to end
+ * large blocks than small blocks. This reflects the general expectation that
+ * it will become increasingly beneficial to start a new block as the current
+ * block grows larger.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered. With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized. Since the final choices
+ * cannot be used, we can use preliminary "greedy" choices instead.
+ */
+
+/* Initialize the block split statistics when starting a new block. */
+static void
+init_block_split_stats(struct block_split_stats *stats)
+{
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->new_observations[i] = 0;
+		stats->observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	stats->num_observations = 0;
+}
+
+/* Literal observation. Heuristic: use the top 2 bits and low 1 bit of the
+ * literal, for 8 possible literal observation types. */
+static inline void
+observe_literal(struct block_split_stats *stats, u8 lit)
+{
+	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+	stats->num_new_observations++;
+}
+
+/* Match observation. Heuristic: use one observation type for "short match" and
+ * one observation type for "long match".
*/ +static inline void +observe_match(struct block_split_stats *stats, unsigned length) +{ + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++; + stats->num_new_observations++; +} + +static bool +do_end_block_check(struct block_split_stats *stats, u32 block_size) +{ + if (stats->num_observations > 0) { + + /* Note: to avoid slow divisions, we do not divide by + * 'num_observations', but rather do all math with the numbers + * multiplied by 'num_observations'. */ + u32 total_delta = 0; + for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * stats->num_new_observations; + u32 actual = stats->new_observations[i] * stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + total_delta += delta; + } + + /* Ready to end the block? */ + if (total_delta + (block_size / 1024) * stats->num_observations >= + stats->num_new_observations * 51 / 64 * stats->num_observations) + return true; + } + + for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->num_observations += stats->new_observations[i]; + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_new_observations = 0; + return false; +} + +static inline bool +should_end_block(struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) +{ + /* Ready to check block split statistics? */ + if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || + in_next - in_block_begin < MIN_BLOCK_SIZE || + in_end - in_next < MIN_BLOCK_SIZE) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); +} + +/******************************************************************************/ + /* * Given the minimum-cost path computed through the item graph for the current * block, walk the path and count how many of each symbol in each Huffman-coded @@ -1243,7 +1375,9 @@ static inline void lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) { u32 node_idx = block_size; + for (;;) { + u32 item; u32 len; u32 offset_data; unsigned v; @@ -1252,21 +1386,37 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) /* Tally literals until either a match or the beginning of the * block is reached. */ for (;;) { - u32 item = c->optimum_nodes[node_idx].item; + item = c->optimum_nodes[node_idx].item; + if (item & OPTIMUM_LEN_MASK) + break; + c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++; + node_idx--; + } - len = item & OPTIMUM_LEN_MASK; - offset_data = item >> OPTIMUM_OFFSET_SHIFT; + if (item & OPTIMUM_EXTRA_FLAG) { - if (len != 0) /* Not a literal? */ + if (node_idx == 0) break; - /* Tally the main symbol for the literal. */ - c->freqs.main[offset_data]++; + /* Tally a rep0 match. */ + len = item & OPTIMUM_LEN_MASK; + v = len - LZX_MIN_MATCH_LEN; + if (v >= LZX_NUM_PRIMARY_LENS) { + c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++; + v = LZX_NUM_PRIMARY_LENS; + } + c->freqs.main[LZX_NUM_CHARS + v]++; - if (--node_idx == 0) /* Beginning of block was reached? */ - return; + /* Tally a literal. */ + c->freqs.main[c->optimum_nodes[node_idx].extra_literal]++; + + item = c->optimum_nodes[node_idx].extra_match; + node_idx -= len + 1; } + len = item & OPTIMUM_LEN_MASK; + offset_data = item >> OPTIMUM_OFFSET_SHIFT; + node_idx -= len; /* Tally a match. 
*/ @@ -1286,9 +1436,6 @@ lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); v += offset_slot * LZX_NUM_LEN_HEADERS; c->freqs.main[LZX_NUM_CHARS + v]++; - - if (node_idx == 0) /* Beginning of block was reached? */ - return; } } @@ -1313,29 +1460,54 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) lit_start_node = node_idx; for (;;) { + u32 item; u32 len; u32 offset_data; unsigned v; unsigned offset_slot; - /* Record literals until either a match or the beginning of the + /* Tally literals until either a match or the beginning of the * block is reached. */ for (;;) { - u32 item = c->optimum_nodes[node_idx].item; + item = c->optimum_nodes[node_idx].item; + if (item & OPTIMUM_LEN_MASK) + break; + c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++; + node_idx--; + } - len = item & OPTIMUM_LEN_MASK; - offset_data = item >> OPTIMUM_OFFSET_SHIFT; + if (item & OPTIMUM_EXTRA_FLAG) { - if (len != 0) /* Not a literal? */ + if (node_idx == 0) break; - /* Tally the main symbol for the literal. */ - c->freqs.main[offset_data]++; + /* Save the literal run length for the next sequence + * (the "previous sequence" when walking backwards). */ + len = item & OPTIMUM_LEN_MASK; + c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx; + seq_idx--; + lit_start_node = node_idx - len; + + /* Tally a rep0 match. */ + v = len - LZX_MIN_MATCH_LEN; + c->chosen_sequences[seq_idx].adjusted_length = v; + if (v >= LZX_NUM_PRIMARY_LENS) { + c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++; + v = LZX_NUM_PRIMARY_LENS; + } + c->freqs.main[LZX_NUM_CHARS + v]++; + c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v; - if (--node_idx == 0) /* Beginning of block was reached? */ - goto out; + /* Tally a literal. */ + c->freqs.main[c->optimum_nodes[node_idx].extra_literal]++; + + item = c->optimum_nodes[node_idx].extra_match; + node_idx -= len + 1; } + len = item & OPTIMUM_LEN_MASK; + offset_data = item >> OPTIMUM_OFFSET_SHIFT; + /* Save the literal run length for the next sequence (the * "previous sequence" when walking backwards). */ c->chosen_sequences[seq_idx--].litrunlen = lit_start_node - node_idx; @@ -1366,12 +1538,8 @@ lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit) /* Save the adjusted offset and match header. */ c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = (offset_data << 9) | v; - - if (node_idx == 0) /* Beginning of block was reached? */ - goto out; } -out: /* Save the literal run length for the first sequence. 
*/ c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx; @@ -1419,7 +1587,6 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, bool is_16_bit) { struct lzx_optimum_node *cur_node = c->optimum_nodes; - struct lzx_optimum_node * const end_node = &c->optimum_nodes[block_size]; struct lz_match *cache_ptr = c->match_cache; const u8 *in_next = block_begin; const u8 * const block_end = block_begin + block_size; @@ -1559,28 +1726,56 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, goto done_matches; /* Consider explicit offset matches */ - do { + for (;;) { u32 offset = cache_ptr->offset; u32 offset_data = offset + LZX_OFFSET_ADJUSTMENT; unsigned offset_slot = lzx_comp_get_offset_slot(c, offset_data, is_16_bit); + u32 base_cost = cur_node->cost; + u32 cost; + + #if LZX_CONSIDER_ALIGNED_COSTS + if (offset_data >= 16) + base_cost += c->costs.aligned[offset_data & + LZX_ALIGNED_OFFSET_BITMASK]; + #endif do { - u32 cost = cur_node->cost + - c->costs.match_cost[offset_slot][ + cost = base_cost + + c->costs.match_cost[offset_slot][ next_len - LZX_MIN_MATCH_LEN]; - #if LZX_CONSIDER_ALIGNED_COSTS - if (lzx_extra_offset_bits[offset_slot] >= - LZX_NUM_ALIGNED_OFFSET_BITS) - cost += c->costs.aligned[offset_data & - LZX_ALIGNED_OFFSET_BITMASK]; - #endif if (cost < (cur_node + next_len)->cost) { (cur_node + next_len)->cost = cost; (cur_node + next_len)->item = (offset_data << OPTIMUM_OFFSET_SHIFT) | next_len; } } while (++next_len <= cache_ptr->length); - } while (++cache_ptr != end_matches); + + if (++cache_ptr == end_matches) { + /* Consider match + lit + rep0 */ + u32 remaining = block_end - (in_next + next_len); + if (likely(remaining >= 2)) { + const u8 *strptr = in_next + next_len; + const u8 *matchptr = strptr - offset; + if (unlikely(load_u16_unaligned(strptr) == load_u16_unaligned(matchptr))) { + u32 rep0_len = lz_extend(strptr, matchptr, 2, + min(remaining, LZX_MAX_MATCH_LEN)); + u8 lit = strptr[-1]; + cost += c->costs.main[lit] + + c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN]; + u32 total_len = next_len + rep0_len; + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = + OPTIMUM_EXTRA_FLAG | rep0_len; + (cur_node + total_len)->extra_literal = lit; + (cur_node + total_len)->extra_match = + (offset_data << OPTIMUM_OFFSET_SHIFT) | (next_len - 1); + } + } + } + break; + } + } } done_matches: @@ -1591,8 +1786,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, * of coding the literal is integrated into the queue update * code below. */ literal = *in_next++; - cost = cur_node->cost + - c->costs.main[lzx_main_symbol_for_literal(literal)]; + cost = cur_node->cost + c->costs.main[literal]; /* Advance to the next position. */ cur_node++; @@ -1609,12 +1803,21 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, } else { /* Match: queue update is needed. 
*/ unsigned len = cur_node->item & OPTIMUM_LEN_MASK; - u32 offset_data = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + u32 offset_data = (cur_node->item & + ~OPTIMUM_EXTRA_FLAG) >> OPTIMUM_OFFSET_SHIFT; if (offset_data >= LZX_NUM_RECENT_OFFSETS) { /* Explicit offset match: insert offset at front */ QUEUE(in_next) = lzx_lru_queue_push(QUEUE(in_next - len), offset_data - LZX_OFFSET_ADJUSTMENT); + } else if (cur_node->item & OPTIMUM_EXTRA_FLAG) { + /* Explicit offset match, then literal, then + * rep0 match: insert offset at front */ + len += 1 + (cur_node->extra_match & OPTIMUM_LEN_MASK); + QUEUE(in_next) = + lzx_lru_queue_push(QUEUE(in_next - len), + (cur_node->extra_match >> OPTIMUM_OFFSET_SHIFT) - + LZX_OFFSET_ADJUSTMENT); } else { /* Repeat offset match: swap offset to front */ QUEUE(in_next) = @@ -1622,7 +1825,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, offset_data); } } - } while (cur_node != end_node); + } while (in_next != block_end); /* Return the match offset queue at the end of the minimum cost path. */ return QUEUE(block_end); @@ -1632,17 +1835,19 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, static void lzx_compute_match_costs(struct lzx_compressor *c) { - unsigned num_offset_slots = lzx_get_num_offset_slots(c->window_order); + unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) / + LZX_NUM_LEN_HEADERS; struct lzx_costs *costs = &c->costs; for (unsigned offset_slot = 0; offset_slot < num_offset_slots; offset_slot++) { u32 extra_cost = (u32)lzx_extra_offset_bits[offset_slot] * LZX_BIT_COST; - unsigned main_symbol = lzx_main_symbol_for_match(offset_slot, 0); + unsigned main_symbol = LZX_NUM_CHARS + (offset_slot * + LZX_NUM_LEN_HEADERS); unsigned i; #if LZX_CONSIDER_ALIGNED_COSTS - if (lzx_extra_offset_bits[offset_slot] >= LZX_NUM_ALIGNED_OFFSET_BITS) + if (offset_slot >= 8) extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * LZX_BIT_COST; #endif @@ -1667,8 +1872,8 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) bool have_byte[256]; unsigned num_used_bytes; - /* The costs below are hard coded to use a scaling factor of 16. */ - STATIC_ASSERT(LZX_BIT_COST == 16); + /* The costs below are hard coded to use a scaling factor of 64. */ + STATIC_ASSERT(LZX_BIT_COST == 64); /* * Heuristics: @@ -1693,13 +1898,13 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) num_used_bytes += have_byte[i]; for (i = 0; i < 256; i++) - c->costs.main[i] = 140 - (256 - num_used_bytes) / 4; + c->costs.main[i] = 560 - (256 - num_used_bytes); for (; i < c->num_main_syms; i++) - c->costs.main[i] = 170; + c->costs.main[i] = 680; for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) - c->costs.len[i] = 103 + (i / 4); + c->costs.len[i] = 412 + i; #if LZX_CONSIDER_ALIGNED_COSTS for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) @@ -1716,15 +1921,21 @@ lzx_update_costs(struct lzx_compressor *c) unsigned i; const struct lzx_lens *lens = &c->codes[c->codes_index].lens; - for (i = 0; i < c->num_main_syms; i++) - c->costs.main[i] = (lens->main[i] ? lens->main[i] : 15) * LZX_BIT_COST; + for (i = 0; i < c->num_main_syms; i++) { + c->costs.main[i] = (lens->main[i] ? lens->main[i] : + MAIN_CODEWORD_LIMIT) * LZX_BIT_COST; + } - for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) - c->costs.len[i] = (lens->len[i] ? lens->len[i] : 15) * LZX_BIT_COST; + for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) { + c->costs.len[i] = (lens->len[i] ? 
lens->len[i] : + LENGTH_CODEWORD_LIMIT) * LZX_BIT_COST; + } #if LZX_CONSIDER_ALIGNED_COSTS - for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) - c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : 7) * LZX_BIT_COST; + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : + ALIGNED_CODEWORD_LIMIT) * LZX_BIT_COST; + } #endif lzx_compute_match_costs(c); @@ -1778,49 +1989,44 @@ lzx_optimize_and_write_block(struct lzx_compressor * const restrict c, * simpler "greedy" or "lazy" parse while still being relatively fast. */ static inline void -lzx_compress_near_optimal(struct lzx_compressor *c, - struct lzx_output_bitstream *os, +lzx_compress_near_optimal(struct lzx_compressor * restrict c, + const u8 * const restrict in_begin, + struct lzx_output_bitstream * restrict os, bool is_16_bit) { - const u8 * const in_begin = c->in_buffer; const u8 * in_next = in_begin; const u8 * const in_end = in_begin + c->in_nbytes; - unsigned max_len = LZX_MAX_MATCH_LEN; - unsigned nice_len = min(c->nice_match_length, max_len); - u32 next_hash; + u32 max_len = LZX_MAX_MATCH_LEN; + u32 nice_len = min(c->nice_match_length, max_len); + u32 next_hashes[2] = {}; struct lzx_lru_queue queue; CALL_BT_MF(is_16_bit, c, bt_matchfinder_init); - memset(c->hash2_tab, 0, sizeof(c->hash2_tab)); - next_hash = bt_matchfinder_hash_3_bytes(in_next); lzx_lru_queue_init(&queue); do { /* Starting a new block */ const u8 * const in_block_begin = in_next; - const u8 * const in_block_end = - in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + const u8 * const in_max_block_end = + in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next); + const u8 *next_observation = in_next; + + init_block_split_stats(&c->split_stats); /* Run the block through the matchfinder and cache the matches. */ struct lz_match *cache_ptr = c->match_cache; do { struct lz_match *lz_matchptr; - u32 hash2; - pos_t cur_match; - unsigned best_len; + u32 best_len; /* If approaching the end of the input buffer, adjust * 'max_len' and 'nice_len' accordingly. */ if (unlikely(max_len > in_end - in_next)) { max_len = in_end - in_next; nice_len = min(max_len, nice_len); - - /* This extra check is needed to ensure that we - * never output a length 2 match of the very - * last two bytes with the very first two bytes, - * since such a match has an offset too large to - * be represented. */ - if (unlikely(max_len < 3)) { + if (unlikely(max_len < + BT_MATCHFINDER_REQUIRED_NBYTES)) + { in_next++; cache_ptr->length = 0; cache_ptr++; @@ -1828,37 +2034,34 @@ lzx_compress_near_optimal(struct lzx_compressor *c, } } - lz_matchptr = cache_ptr + 1; - - /* Check for a length 2 match. */ - hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER); - cur_match = c->hash2_tab[hash2]; - c->hash2_tab[hash2] = in_next - in_begin; - if (cur_match != 0 && - (LZX_HASH2_ORDER == 16 || - load_u16_unaligned(&in_begin[cur_match]) == - load_u16_unaligned(in_next))) - { - lz_matchptr->length = 2; - lz_matchptr->offset = in_next - &in_begin[cur_match]; - lz_matchptr++; - } - - /* Check for matches of length >= 3. */ - lz_matchptr = CALL_BT_MF(is_16_bit, c, bt_matchfinder_get_matches, + /* Check for matches. 
*/ + lz_matchptr = CALL_BT_MF(is_16_bit, c, + bt_matchfinder_get_matches, in_begin, - in_next, - 3, + in_next - in_begin, max_len, nice_len, c->max_search_depth, - &next_hash, + next_hashes, &best_len, - lz_matchptr); - in_next++; + cache_ptr + 1); + cache_ptr->length = lz_matchptr - (cache_ptr + 1); cache_ptr = lz_matchptr; + if (in_next >= next_observation) { + best_len = cache_ptr[-1].length; + if (best_len) { + observe_match(&c->split_stats, best_len); + next_observation = in_next + best_len; + } else { + observe_literal(&c->split_stats, *in_next); + next_observation = in_next + 1; + } + } + + in_next++; + /* * If there was a very long match found, then don't * cache any matches for the bytes covered by that @@ -1877,29 +2080,30 @@ lzx_compress_near_optimal(struct lzx_compressor *c, if (unlikely(max_len > in_end - in_next)) { max_len = in_end - in_next; nice_len = min(max_len, nice_len); - if (unlikely(max_len < 3)) { + if (unlikely(max_len < + BT_MATCHFINDER_REQUIRED_NBYTES)) + { in_next++; cache_ptr->length = 0; cache_ptr++; continue; } } - c->hash2_tab[lz_hash_2_bytes(in_next, LZX_HASH2_ORDER)] = - in_next - in_begin; - CALL_BT_MF(is_16_bit, c, bt_matchfinder_skip_position, + CALL_BT_MF(is_16_bit, c, + bt_matchfinder_skip_position, in_begin, - in_next, - in_end, + in_next - in_begin, nice_len, c->max_search_depth, - &next_hash); + next_hashes); in_next++; cache_ptr->length = 0; cache_ptr++; } while (--best_len); } - } while (in_next < in_block_end && - likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH])); + } while (in_next < in_max_block_end && + likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH]) && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); /* We've finished running the block through the matchfinder. * Now choose a match/literal sequence and write the block. */ @@ -1914,14 +2118,14 @@ static void lzx_compress_near_optimal_16(struct lzx_compressor *c, struct lzx_output_bitstream *os) { - lzx_compress_near_optimal(c, os, true); + lzx_compress_near_optimal(c, c->in_buffer, os, true); } static void lzx_compress_near_optimal_32(struct lzx_compressor *c, struct lzx_output_bitstream *os) { - lzx_compress_near_optimal(c, os, false); + lzx_compress_near_optimal(c, c->in_buffer, os, false); } /* @@ -1940,7 +2144,6 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next, unsigned *rep_max_idx_ret) { STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); - LZX_ASSERT(bytes_remaining >= 2); const unsigned max_len = min(bytes_remaining, LZX_MAX_MATCH_LEN); const u16 next_2_bytes = load_u16_unaligned(in_next); @@ -2019,8 +2222,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, /* Starting a new block */ const u8 * const in_block_begin = in_next; - const u8 * const in_block_end = - in_next + min(LZX_DIV_BLOCK_SIZE, in_end - in_next); + const u8 * const in_max_block_end = + in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next); struct lzx_sequence *next_seq = c->chosen_sequences; unsigned cur_len; u32 cur_offset; @@ -2037,6 +2240,7 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, u32 litrunlen = 0; lzx_reset_symbol_frequencies(c); + init_block_split_stats(&c->split_stats); do { if (unlikely(max_len > in_end - in_next)) { @@ -2046,7 +2250,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, /* Find the longest match at the current position. 
*/ - cur_len = CALL_HC_MF(is_16_bit, c, hc_matchfinder_longest_match, + cur_len = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_longest_match, in_begin, in_next - in_begin, 2, @@ -2064,10 +2269,14 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, { /* There was no match found, or the only match found * was a distant length 3 match. Output a literal. */ - lzx_record_literal(c, *in_next++, &litrunlen); + lzx_record_literal(c, *in_next, &litrunlen); + observe_literal(&c->split_stats, *in_next); + in_next++; continue; } + observe_match(&c->split_stats, cur_len); + if (cur_offset == recent_offsets[0]) { in_next++; cur_offset_data = 0; @@ -2112,7 +2321,8 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, nice_len = min(max_len, nice_len); } - next_len = CALL_HC_MF(is_16_bit, c, hc_matchfinder_longest_match, + next_len = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_longest_match, in_begin, in_next - in_begin, cur_len - 2, @@ -2172,13 +2382,15 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os, lzx_record_match(c, cur_len, cur_offset_data, recent_offsets, is_16_bit, &litrunlen, &next_seq); - in_next = CALL_HC_MF(is_16_bit, c, hc_matchfinder_skip_positions, + in_next = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_skip_positions, in_begin, in_next - in_begin, in_end - in_begin, skip_len, next_hashes); - } while (in_next < in_block_end); + } while (in_next < in_max_block_end && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); lzx_finish_sequence(next_seq, litrunlen); @@ -2294,8 +2506,8 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, c->impl = lzx_compress_lazy_16; else c->impl = lzx_compress_lazy_32; - c->max_search_depth = (36 * compression_level) / 20; - c->nice_match_length = (72 * compression_level) / 20; + c->max_search_depth = (60 * compression_level) / 20; + c->nice_match_length = (80 * compression_level) / 20; /* lzx_compress_lazy() needs max_search_depth >= 2 because it * halves the max_search_depth when attempting a lazy match, and @@ -2314,7 +2526,7 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, /* Scale nice_match_length and max_search_depth with the * compression level. */ c->max_search_depth = (24 * compression_level) / 50; - c->nice_match_length = (32 * compression_level) / 50; + c->nice_match_length = (48 * compression_level) / 50; /* Set a number of optimization passes appropriate for the * compression level. */ @@ -2375,7 +2587,7 @@ lzx_compress(const void *restrict in, size_t in_nbytes, else memcpy(c->in_buffer, in, in_nbytes); c->in_nbytes = in_nbytes; - lzx_do_e8_preprocessing(c->in_buffer, in_nbytes); + lzx_preprocess(c->in_buffer, in_nbytes); /* Initially, the previous Huffman codeword lengths are all zeroes. */ c->codes_index = 0; @@ -2390,7 +2602,7 @@ lzx_compress(const void *restrict in, size_t in_nbytes, /* Flush the output bitstream and return the compressed size or 0. */ result = lzx_flush_output(&os); if (!result && c->destructive) - lzx_undo_e8_preprocessing(c->in_buffer, c->in_nbytes); + lzx_postprocess(c->in_buffer, c->in_nbytes); return result; }
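
The block splitting heuristic added above is easiest to see in isolation. The following standalone sketch reuses the statistics structure and the init/observe/check routines as introduced in this diff, and drives them from a hypothetical main() that feeds 8000 ASCII-like literals followed by 8000 high-byte literals. The direct call to do_end_block_check() every NUM_OBSERVATIONS_PER_BLOCK_CHECK observations stands in for should_end_block(), which in the compressor also enforces MIN_BLOCK_SIZE at both ends of the block; everything in main(), including the synthetic input, is illustrative and not part of wimlib.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t  u8;
typedef uint32_t u32;

#define NUM_LITERAL_OBSERVATION_TYPES	8
#define NUM_MATCH_OBSERVATION_TYPES	2
#define NUM_OBSERVATION_TYPES		(NUM_LITERAL_OBSERVATION_TYPES + \
					 NUM_MATCH_OBSERVATION_TYPES)
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 500

struct block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};

/* Reset the statistics when starting a new block. */
static void
init_block_split_stats(struct block_split_stats *stats)
{
	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Bucket a literal by its top 2 bits and low bit: 8 observation types. */
static void
observe_literal(struct block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}

/* Compare the distribution of the new observations against the distribution
 * accumulated so far.  All math is scaled by num_observations and
 * num_new_observations so that no division is needed. */
static bool
do_end_block_check(struct block_split_stats *stats, u32 block_size)
{
	if (stats->num_observations > 0) {
		u32 total_delta = 0;
		for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] *
				       stats->num_new_observations;
			u32 actual = stats->new_observations[i] *
				     stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected
							: expected - actual;
			total_delta += delta;
		}
		/* End the block if the difference is large enough; the
		 * block_size term makes a split ever more likely as the
		 * block grows. */
		if (total_delta + (block_size / 1024) * stats->num_observations >=
		    stats->num_new_observations * 51 / 64 * stats->num_observations)
			return true;
	}
	/* Not splitting: merge the new observations into the running totals. */
	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}

int
main(void)
{
	struct block_split_stats stats;
	u32 block_begin = 0;

	init_block_split_stats(&stats);

	/* 8000 ASCII-ish bytes followed by 8000 high bytes.  The heuristic
	 * should suggest one split shortly after the transition at 8000. */
	for (u32 pos = 0; pos < 16000; pos++) {
		u8 lit = (pos < 8000) ? (u8)('a' + (pos % 26))
				      : (u8)(0x80 + (pos % 77));
		observe_literal(&stats, lit);
		if (stats.num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
		    do_end_block_check(&stats, pos + 1 - block_begin)) {
			printf("split suggested at position %u\n", pos + 1);
			block_begin = pos + 1;
			init_block_split_stats(&stats);
		}
	}
	return 0;
}

With this input, the observation types seen before and after the transition are disjoint, so the cross-multiplied delta exceeds the 51/64 threshold at the first check past position 8000 and a split is suggested there; within either homogeneous region the check never fires.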
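
The minimum-cost path search above saves and restores the recent-offsets queue by plain assignment (the QUEUE(in_next) updates), which works because struct lzx_lru_queue packs all three recent offsets into one 64-bit word. The sketch below shows one way such a packing can be implemented; the 21-bit field width and the lru_queue_* helper names are assumptions made for illustration, not wimlib's definitions (wimlib's equivalents are LZX_QUEUE64_OFFSET_SHIFT/MASK and the lzx_lru_queue_push/swap functions visible in the diff).

#include <stdint.h>
#include <stdio.h>

#define LZX_NUM_RECENT_OFFSETS	3
#define QUEUE_OFFSET_SHIFT	21	/* assumed: wide enough for any offset */
#define QUEUE_OFFSET_MASK	(((uint64_t)1 << QUEUE_OFFSET_SHIFT) - 1)

/* All three recent offsets in one word, so the whole queue can be copied,
 * saved, and restored with a single assignment.  Stale bits above the three
 * fields are never read because every access masks to the field width. */
struct lzx_lru_queue {
	uint64_t R;
};

/* LZX initializes the recent offsets to 1, 1, 1. */
static inline struct lzx_lru_queue
lru_queue_init(void)
{
	struct lzx_lru_queue q = { 0 };
	for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
		q.R = (q.R << QUEUE_OFFSET_SHIFT) | 1;
	return q;
}

/* Read the i'th recent offset (0 = most recent). */
static inline uint32_t
lru_queue_get(struct lzx_lru_queue q, unsigned i)
{
	return (q.R >> (i * QUEUE_OFFSET_SHIFT)) & QUEUE_OFFSET_MASK;
}

/* Explicit offset match: push the new offset onto the front; the oldest
 * offset falls off the end. */
static inline struct lzx_lru_queue
lru_queue_push(struct lzx_lru_queue q, uint32_t offset)
{
	return (struct lzx_lru_queue){
		.R = (q.R << QUEUE_OFFSET_SHIFT) | offset,
	};
}

/* Repeat offset match: move the i'th recent offset to the front, leaving the
 * other offsets in place. */
static inline struct lzx_lru_queue
lru_queue_swap(struct lzx_lru_queue q, unsigned i)
{
	uint64_t r0 = lru_queue_get(q, 0);
	uint64_t ri = lru_queue_get(q, i);
	uint64_t R = q.R;

	R &= ~(QUEUE_OFFSET_MASK << (i * QUEUE_OFFSET_SHIFT));
	R |= r0 << (i * QUEUE_OFFSET_SHIFT);
	R &= ~QUEUE_OFFSET_MASK;
	R |= ri;
	return (struct lzx_lru_queue){ .R = R };
}

int
main(void)
{
	struct lzx_lru_queue q = lru_queue_init();

	q = lru_queue_push(q, 1000);	/* explicit offset match, offset 1000 */
	q = lru_queue_push(q, 64);	/* explicit offset match, offset 64 */
	q = lru_queue_swap(q, 2);	/* repeat offset match against R2 */

	/* Prints R0 = 1, R1 = 1000, R2 = 64. */
	for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
		printf("R%u = %u\n", i, (unsigned)lru_queue_get(q, i));
	return 0;
}

Because the queue is a single word, a snapshot costs one store and one load, which is what lets lzx_find_min_cost_path() carry the queue along the path with simple assignments instead of copying an array of offsets.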