X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flzx_compress.c;h=d38c5819d31af3f80cc6414da762f0c93ddf0812;hb=c78979435a730f9d9abb5e6371db2618b235de89;hp=45bb019bb94341c1a6cf146239fffe8ff15ea943;hpb=3e8aa757aaa63297f0d54007adf46411778fb6a8;p=wimlib

diff --git a/src/lzx_compress.c b/src/lzx_compress.c
index 45bb019b..d38c5819 100644
--- a/src/lzx_compress.c
+++ b/src/lzx_compress.c
@@ -79,12 +79,28 @@
  * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the
  * match cache for each byte position.  This value should be high enough so that
  * nearly the time, all matches found in a given block can fit in the match
- * cache.  However, fallback behavior on cache overflow is still required.
+ * cache.  However, fallback behavior (immediately terminating the block) on
+ * cache overflow is still required.
  */
 #define LZX_CACHE_PER_POS	6
 
-#define LZX_CACHE_LEN		(LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
+/*
+ * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache,
+ * excluding the extra "overflow" entries.  The per-position multiplier is '1 +
+ * LZX_CACHE_PER_POS' instead of 'LZX_CACHE_PER_POS' because there is an
+ * overhead of one lz_match per position, used to hold the match count at that
+ * position.
+ */
+#define LZX_CACHE_LENGTH	(LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS))
 
+/*
+ * LZX_MAX_MATCHES_PER_POS is an upper bound on the number of matches that can
+ * ever be saved in the match cache for a single position.  Since each match we
+ * save for a single position has a distinct length, we can use the number of
+ * possible match lengths in LZX as this bound.  This bound is guaranteed to be
+ * valid in all cases, although if 'nice_match_length < LZX_MAX_MATCH_LEN', then
+ * it will never actually be reached.
+ */
 #define LZX_MAX_MATCHES_PER_POS	LZX_NUM_LENS
 
 /*
@@ -104,7 +120,8 @@
 #define LZX_CONSIDER_ALIGNED_COSTS	0
 
 /*
- * The maximum compression level at which we use the faster algorithm.
+ * LZX_MAX_FAST_LEVEL is the maximum compression level at which we use the
+ * faster algorithm.
  */
 #define LZX_MAX_FAST_LEVEL	34
 
@@ -129,7 +146,6 @@
 #include "wimlib/bt_matchfinder.h"
 #include "wimlib/compress_common.h"
 #include "wimlib/compressor_ops.h"
-#include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/hc_matchfinder.h"
 #include "wimlib/lz_extend.h"
@@ -361,9 +377,26 @@ struct lzx_compressor {
 	struct lzx_codes codes[2];
 	unsigned codes_index;
 
-	/* The match/literal sequence the algorithm chose for the current block.
+	/*
+	 * The match/literal sequence the algorithm chose for the current block.
+	 *
+	 * Notes on how large this array actually needs to be:
+	 *
+	 * - In lzx_compress_near_optimal(), the maximum block size is
+	 *   'LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN - 1' bytes.  This occurs if
+	 *   a match of the maximum length is found on the last byte.  Although
+	 *   it is impossible for this particular case to actually result in a
+	 *   parse of all literals, we reserve this many spaces anyway.
+	 *
+	 * - The worst case for lzx_compress_lazy() is a block of almost all
+	 *   literals that ends with a series of matches of increasing scores,
+	 *   causing a sequence of literals to be chosen before the last match
+	 *   is finally chosen.  The number of items actually chosen in this
+	 *   scenario is limited by the number of distinct match scores that
+	 *   exist for matches shorter than 'nice_match_length'.  Having
+	 *   'LZX_MAX_MATCH_LEN - 1' extra spaces is plenty for now.
 	 */
-	struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN + 1];
+	struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN - 1];
 
 	/* Table mapping match offset => offset slot for small offsets  */
 #define LZX_NUM_FAST_OFFSETS 32768
@@ -378,17 +411,55 @@ struct lzx_compressor {
 
 		/* Data for near-optimal parsing  */
 		struct {
-			/* The graph nodes for the current block  */
+			/*
+			 * The graph nodes for the current block.
+			 *
+			 * We need at least 'LZX_DIV_BLOCK_SIZE +
+			 * LZX_MAX_MATCH_LEN - 1' nodes because that is the
+			 * maximum block size that may be used.  Add 1 because
+			 * we need a node to represent end-of-block.
+			 *
+			 * It is possible that nodes past end-of-block are
+			 * accessed during match consideration, but this can
+			 * only occur if the block was truncated at
+			 * LZX_DIV_BLOCK_SIZE.  So the same bound still applies.
+			 * Note that since nodes past the end of the block will
+			 * never actually have an effect on the items that are
+			 * chosen for the block, it makes no difference what
+			 * their costs are initialized to (if anything).
+			 */
 			struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE +
-							      LZX_MAX_MATCH_LEN + 1];
+							      LZX_MAX_MATCH_LEN - 1 + 1];
 
 			/* The cost model for the current block  */
 			struct lzx_costs costs;
 
-			/* Cached matches for the current block  */
-			struct lz_match match_cache[LZX_CACHE_LEN + 1 +
-						    LZX_MAX_MATCHES_PER_POS];
-			struct lz_match *cache_overflow_mark;
+			/*
+			 * Cached matches for the current block.  This array
+			 * contains the matches that were found at each position
+			 * in the block.  Specifically, for each position, there
+			 * is a special 'struct lz_match' whose 'length' field
+			 * contains the number of matches that were found at
+			 * that position; this is followed by the matches
+			 * themselves, if any, sorted by strictly increasing
+			 * length and strictly increasing offset.
+			 *
+			 * Note: in rare cases, there will be a very high number
+			 * of matches in the block and this array will overflow.
+			 * If this happens, we force the end of the current
+			 * block.  LZX_CACHE_LENGTH is the length at which we
+			 * actually check for overflow.  The extra slots beyond
+			 * this are enough to absorb the worst case overflow,
+			 * which occurs if starting at
+			 * &match_cache[LZX_CACHE_LENGTH - 1], we write the
+			 * match count header, then write
+			 * LZX_MAX_MATCHES_PER_POS matches, then skip searching
+			 * for matches at 'LZX_MAX_MATCH_LEN - 1' positions and
+			 * write the match count header for each.
+			 */
+			struct lz_match match_cache[LZX_CACHE_LENGTH +
+						    LZX_MAX_MATCHES_PER_POS +
+						    LZX_MAX_MATCH_LEN - 1];
 
 			/* Hash table for finding length 2 matches  */
 			pos_t hash2_tab[LZX_HASH2_LENGTH]
@@ -400,17 +471,6 @@ struct lzx_compressor {
 	};
 };
 
-/* Compute a hash value for the next 2 bytes of uncompressed data.  */
-static inline u32
-lz_hash_2_bytes(const u8 *in_next)
-{
-	u16 next_2_bytes = load_u16_unaligned(in_next);
-	if (LZX_HASH2_ORDER == 16)
-		return next_2_bytes;
-	else
-		return lz_hash(next_2_bytes, LZX_HASH2_ORDER);
-}
-
 /*
  * Structure to keep track of the current state of sending bits to the
  * compressed output buffer.
@@ -426,14 +486,15 @@ struct lzx_output_bitstream {
 	u32 bitcount;
 
 	/* Pointer to the start of the output buffer.  */
-	le16 *start;
+	u8 *start;
 
 	/* Pointer to the position in the output buffer at which the next coding
 	 * unit should be written.  */
-	le16 *next;
+	u8 *next;
 
-	/* Pointer past the end of the output buffer.  */
-	le16 *end;
+	/* Pointer just past the end of the output buffer, rounded down to a
+	 * 2-byte boundary.  */
+	u8 *end;
 };
 
 /*
@@ -453,7 +514,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
 	os->bitcount = 0;
 	os->start = buffer;
 	os->next = os->start;
-	os->end = os->start + size / sizeof(le16);
+	os->end = os->start + (size & ~1);
 }
 
 /*
@@ -463,9 +524,9 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
  * bits in @bits cannot be set.  At most 17 bits can be written at once.
  *
  * @max_num_bits is a compile-time constant that specifies the maximum number of
- * bits that can ever be written at the call site.  Currently, it is used to
- * optimize away the conditional code for writing a second 16-bit coding unit
- * when writing fewer than 17 bits.
+ * bits that can ever be written at the call site.  It is used to optimize away
+ * the conditional code for writing a second 16-bit coding unit when writing
+ * fewer than 17 bits.
  *
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * lzx_flush_output() will return 0 when it gets called.
@@ -493,16 +554,20 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
 		os->bitcount -= 16;
 
 		/* Write a coding unit, unless it would overflow the buffer.  */
-		if (os->next != os->end)
-			put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++);
+		if (os->next != os->end) {
+			put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next);
+			os->next += 2;
+		}
 
 		/* If writing 17 bits, a second coding unit might need to be
 		 * written.  But because 'max_num_bits' is a compile-time
 		 * constant, the compiler will optimize away this code at most
 		 * call sites.  */
 		if (max_num_bits == 17 && os->bitcount == 16) {
-			if (os->next != os->end)
-				put_unaligned_u16_le(os->bitbuf, os->next++);
+			if (os->next != os->end) {
+				put_unaligned_u16_le(os->bitbuf, os->next);
+				os->next += 2;
+			}
 			os->bitcount = 0;
 		}
 	}
@@ -511,8 +576,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
 /* Use when @num_bits is a compile-time constant.  Otherwise use
  * lzx_write_varbits().  */
 static inline void
-lzx_write_bits(struct lzx_output_bitstream *os,
-	       const u32 bits, const unsigned num_bits)
+lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
 	lzx_write_varbits(os, bits, num_bits, num_bits);
 }
@@ -527,10 +591,12 @@ lzx_flush_output(struct lzx_output_bitstream *os)
 	if (os->next == os->end)
 		return 0;
 
-	if (os->bitcount != 0)
-		put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++);
+	if (os->bitcount != 0) {
+		put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next);
+		os->next += 2;
+	}
 
-	return (const u8 *)os->next - (const u8 *)os->start;
+	return os->next - os->start;
 }
 
 /* Build the main, length, and aligned offset Huffman codes used in LZX.
@@ -1144,7 +1210,7 @@ lzx_tally_item_list(struct lzx_compressor *c, struct lzx_optimum_node *cur_node)
  * Also, note that because of the presence of the recent offsets queue (which is
  * a type of adaptive state), the algorithm cannot work backwards and compute
  * "cost to end" instead of "cost to beginning".  Furthermore, the way the
- * algorithm handles this adaptive state in the "minimum-cost" parse is actually
+ * algorithm handles this adaptive state in the "minimum cost" parse is actually
  * only an approximation.  It's possible for the globally optimal, minimum cost
  * path to contain a prefix, ending at a position, where that path prefix is
  * *not* the minimum cost path to that position.  This can happen if such a path
@@ -1365,7 +1431,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
 		}
 	} while (cur_node != end_node);
 
-	/* Return the match offset queue at the end of the minimum-cost path. */
+	/* Return the match offset queue at the end of the minimum cost path. */
 	return QUEUE(block_end);
 }
 
@@ -1561,9 +1627,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
 				 * match of the very last two bytes with the
 				 * very first two bytes, since such a match has
 				 * an offset too large to be represented.  */
-				if (unlikely(max_len <
-					     max(LZ_HASH_REQUIRED_NBYTES, 3)))
-				{
+				if (unlikely(max_len < 3)) {
 					in_next++;
 					cache_ptr->length = 0;
 					cache_ptr++;
@@ -1574,7 +1638,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
 			lz_matchptr = cache_ptr + 1;
 
 			/* Check for a length 2 match.  */
-			hash2 = lz_hash_2_bytes(in_next);
+			hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER);
 			cur_match = c->hash2_tab[hash2];
 			c->hash2_tab[hash2] = in_next - in_begin;
 			if (matchfinder_node_valid(cur_match) &&
@@ -1621,16 +1685,14 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
 					if (unlikely(max_len > in_end - in_next)) {
 						max_len = in_end - in_next;
 						nice_len = min(max_len, nice_len);
-						if (unlikely(max_len <
-							     max(LZ_HASH_REQUIRED_NBYTES, 3)))
-						{
+						if (unlikely(max_len < 3)) {
 							in_next++;
 							cache_ptr->length = 0;
 							cache_ptr++;
 							continue;
 						}
 					}
-					c->hash2_tab[lz_hash_2_bytes(in_next)] =
+					c->hash2_tab[lz_hash_2_bytes(in_next, LZX_HASH2_ORDER)] =
 						in_next - in_begin;
 					bt_matchfinder_skip_position(&c->bt_mf,
 								     in_begin,
@@ -1645,7 +1707,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
 				} while (--best_len);
 			}
 		} while (in_next < in_block_end &&
-			 likely(cache_ptr < c->cache_overflow_mark));
+			 likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH]));
 
 		/* We've finished running the block through the matchfinder.
 		 * Now choose a match/literal sequence and write the block.  */
@@ -1991,9 +2053,13 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
 
 		c->impl = lzx_compress_lazy;
 		c->max_search_depth = (36 * compression_level) / 20;
-		c->nice_match_length = min((72 * compression_level) / 20,
-					   LZX_MAX_MATCH_LEN);
+		c->nice_match_length = (72 * compression_level) / 20;
 
+		/* lzx_compress_lazy() needs max_search_depth >= 2 because it
+		 * halves the max_search_depth when attempting a lazy match, and
+		 * max_search_depth cannot be 0.  */
+		if (c->max_search_depth < 2)
+			c->max_search_depth = 2;
 	} else {
 
 		/* Normal / high compression: Use near-optimal parsing.  */
@@ -2003,8 +2069,7 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
 		/* Scale nice_match_length and max_search_depth with the
 		 * compression level.  */
 		c->max_search_depth = (24 * compression_level) / 50;
-		c->nice_match_length = min((32 * compression_level) / 50,
-					   LZX_MAX_MATCH_LEN);
+		c->nice_match_length = (32 * compression_level) / 50;
 
 		/* Set a number of optimization passes appropriate for the
 		 * compression level.  */
@@ -2028,10 +2093,15 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
 			if (compression_level >= 300)
 				c->num_optim_passes++;
 		}
-
-		c->cache_overflow_mark = &c->match_cache[LZX_CACHE_LEN];
 	}
 
+	/* max_search_depth == 0 is invalid.  */
+	if (c->max_search_depth < 1)
+		c->max_search_depth = 1;
+
+	if (c->nice_match_length > LZX_MAX_MATCH_LEN)
+		c->nice_match_length = LZX_MAX_MATCH_LEN;
+
 	lzx_init_offset_slot_fast(c);
 	*c_ret = c;
 	return 0;