X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzx_compress.c;h=5e1be485f3bc9ff0590db093edc8f09599c3f0b5;hp=4434cde34a02bd13a8b1685765207c2d8b15bee1;hb=80c2fe3e6463cfd0eca5bead23a08731b6db9576;hpb=e7a3df0a6bf2af6500611f6c464dc36cab3332d8

diff --git a/src/lzx_compress.c b/src/lzx_compress.c
index 4434cde3..5e1be485 100644
--- a/src/lzx_compress.c
+++ b/src/lzx_compress.c
@@ -82,7 +82,7 @@
  * cache. However, fallback behavior (immediately terminating the block) on
  * cache overflow is still required.
  */
-#define LZX_CACHE_PER_POS	6
+#define LZX_CACHE_PER_POS	7
 
 /*
  * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache,
@@ -105,7 +105,7 @@
 
 /*
  * LZX_BIT_COST is a scaling factor that represents the cost to output one bit.
- * THis makes it possible to consider fractional bit costs.
+ * This makes it possible to consider fractional bit costs.
  *
  * Note: this is only useful as a statistical trick for when the true costs are
  * unknown. In reality, each token in LZX requires a whole number of bits to
@@ -120,14 +120,15 @@
 #define LZX_CONSIDER_ALIGNED_COSTS	0
 
 /*
- * The maximum compression level at which we use the faster algorithm.
+ * LZX_MAX_FAST_LEVEL is the maximum compression level at which we use the
+ * faster algorithm.
  */
 #define LZX_MAX_FAST_LEVEL	34
 
 /*
  * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table
  * for finding length 2 matches. This can be as high as 16 (in which case the
- * hash function is trivial), but using a smaller hash table actually speeds up
+ * hash function is trivial), but using a smaller hash table speeds up
  * compression due to reduced cache pressure.
  */
 #define LZX_HASH2_ORDER	12
@@ -145,7 +146,6 @@
 #include "wimlib/bt_matchfinder.h"
 #include "wimlib/compress_common.h"
 #include "wimlib/compressor_ops.h"
-#include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/hc_matchfinder.h"
 #include "wimlib/lz_extend.h"
@@ -368,6 +368,10 @@ struct lzx_compressor {
 	/* Pointer to the compress() implementation chosen at allocation time */
 	void (*impl)(struct lzx_compressor *, struct lzx_output_bitstream *);
 
+	/* If true, the compressor need not preserve the input buffer if it
+	 * compresses the data successfully. */
+	bool destructive;
+
 	/* The Huffman symbol frequency counters for the current block. */
 	struct lzx_freqs freqs;
 
@@ -442,7 +446,7 @@ struct lzx_compressor {
 			 * contains the number of matches that were found at
 			 * that position; this is followed by the matches
 			 * themselves, if any, sorted by strictly increasing
-			 * length and strictly increasing offset.
+			 * length.
 			 *
 			 * Note: in rare cases, there will be a very high number
 			 * of matches in the block and this array will overflow.
@@ -462,8 +466,7 @@ struct lzx_compressor {
 				       LZX_MAX_MATCH_LEN - 1];
 
 			/* Hash table for finding length 2 matches */
-			pos_t hash2_tab[LZX_HASH2_LENGTH]
-				_aligned_attribute(MATCHFINDER_ALIGNMENT);
+			pos_t hash2_tab[LZX_HASH2_LENGTH];
 
 			/* Binary trees matchfinder (MUST BE LAST!!!) */
 			struct bt_matchfinder bt_mf;
@@ -486,14 +489,15 @@ struct lzx_output_bitstream {
 	u32 bitcount;
 
 	/* Pointer to the start of the output buffer. */
-	le16 *start;
+	u8 *start;
 
 	/* Pointer to the position in the output buffer at which the next coding
 	 * unit should be written. */
-	le16 *next;
+	u8 *next;
 
-	/* Pointer past the end of the output buffer. */
-	le16 *end;
+	/* Pointer just past the end of the output buffer, rounded down to a
+	 * 2-byte boundary. */
+	u8 *end;
 };
 
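An aside on the change from le16 * to u8 * above: the stream is still emitted as 16-bit little-endian coding units, but the pointers now count bytes, and rounding the buffer size down to a 2-byte boundary in lzx_init_output() (next hunk) guarantees that 'end' always lands on a whole coding unit. A minimal self-contained sketch of the same scheme, using illustrative names rather than wimlib's own helpers:

	#include <stdint.h>
	#include <stddef.h>

	struct bitstream {
		uint32_t bitbuf;   /* bits not yet written to the buffer */
		uint32_t bitcount; /* number of valid bits in bitbuf */
		uint8_t *next;     /* next write position, in bytes */
		uint8_t *end;      /* buffer end, rounded down to 2 bytes */
	};

	static void bs_init(struct bitstream *bs, void *buf, size_t size)
	{
		bs->bitbuf = 0;
		bs->bitcount = 0;
		bs->next = buf;
		bs->end = bs->next + (size & ~(size_t)1); /* whole units only */
	}

	/* Write 'num_bits' (at most 16 here) high-order-first. */
	static void bs_write(struct bitstream *bs, uint32_t bits, unsigned num_bits)
	{
		bs->bitbuf = (bs->bitbuf << num_bits) | bits;
		bs->bitcount += num_bits;
		if (bs->bitcount >= 16) {
			bs->bitcount -= 16;
			if (bs->next != bs->end) {
				uint16_t unit = bs->bitbuf >> bs->bitcount;
				bs->next[0] = unit & 0xFF;  /* little-endian */
				bs->next[1] = unit >> 8;
				bs->next += 2;
			}
		}
	}

The real lzx_write_varbits() additionally handles a 17-bit write, which can spill a second coding unit in a single call.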
 /*
@@ -513,7 +517,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
 	os->bitcount = 0;
 	os->start = buffer;
 	os->next = os->start;
-	os->end = os->start + size / sizeof(le16);
+	os->end = os->start + (size & ~1);
 }
 
 /*
@@ -523,9 +527,9 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
  * bits in @bits cannot be set. At most 17 bits can be written at once.
  *
  * @max_num_bits is a compile-time constant that specifies the maximum number of
- * bits that can ever be written at the call site. Currently, it is used to
- * optimize away the conditional code for writing a second 16-bit coding unit
- * when writing fewer than 17 bits.
+ * bits that can ever be written at the call site. It is used to optimize away
+ * the conditional code for writing a second 16-bit coding unit when writing
+ * fewer than 17 bits.
  *
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * lzx_flush_output() will return 0 when it gets called.
@@ -553,16 +557,20 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
 		os->bitcount -= 16;
 
 		/* Write a coding unit, unless it would overflow the buffer. */
-		if (os->next != os->end)
-			put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++);
+		if (os->next != os->end) {
+			put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next);
+			os->next += 2;
+		}
 
 		/* If writing 17 bits, a second coding unit might need to be
 		 * written. But because 'max_num_bits' is a compile-time
 		 * constant, the compiler will optimize away this code at most
 		 * call sites. */
 		if (max_num_bits == 17 && os->bitcount == 16) {
-			if (os->next != os->end)
-				put_unaligned_u16_le(os->bitbuf, os->next++);
+			if (os->next != os->end) {
+				put_unaligned_u16_le(os->bitbuf, os->next);
+				os->next += 2;
+			}
 			os->bitcount = 0;
 		}
 	}
@@ -571,8 +579,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
 /* Use when @num_bits is a compile-time constant. Otherwise use
  * lzx_write_varbits(). */
 static inline void
-lzx_write_bits(struct lzx_output_bitstream *os,
-	       const u32 bits, const unsigned num_bits)
+lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
 	lzx_write_varbits(os, bits, num_bits, num_bits);
 }
@@ -587,10 +594,12 @@ lzx_flush_output(struct lzx_output_bitstream *os)
 	if (os->next == os->end)
 		return 0;
 
-	if (os->bitcount != 0)
-		put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++);
+	if (os->bitcount != 0) {
+		put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next);
+		os->next += 2;
+	}
 
-	return (const u8 *)os->next - (const u8 *)os->start;
+	return os->next - os->start;
 }
 
 /* Build the main, length, and aligned offset Huffman codes used in LZX.
@@ -1111,8 +1120,8 @@ lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 of
 	extra_bits = (offset + LZX_OFFSET_ADJUSTMENT) -
 		     lzx_offset_slot_base[offset_slot];
 
-	BUILD_BUG_ON(LZX_MAINCODE_MAX_NUM_SYMBOLS > (1 << 10));
-	BUILD_BUG_ON(LZX_LENCODE_NUM_SYMBOLS > (1 << 8));
+	STATIC_ASSERT(LZX_MAINCODE_MAX_NUM_SYMBOLS <= (1 << 10));
+	STATIC_ASSERT(LZX_LENCODE_NUM_SYMBOLS <= (1 << 8));
 	*(*next_chosen_item)++ = (struct lzx_item) {
 		.data = (u64)main_symbol |
 			((u64)len_symbol << 10) |
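The two STATIC_ASSERT()s just above also document how a chosen match is packed into the 64-bit 'data' field of struct lzx_item: 10 bits for the main symbol and 8 bits for the length symbol, with the extra offset bits stored above them. A hedged sketch of that packing (only the first two shifts appear in this excerpt; placing the extra bits at bit 18 is an assumption for illustration):

	#include <stdint.h>

	/* Pack one match into a 64-bit word: bits 0-9 hold the main
	 * symbol and bits 10-17 the length symbol, widths guaranteed by
	 * the STATIC_ASSERTs above; the extra offset bits go higher
	 * (bit 18 here is illustrative only). */
	static uint64_t
	pack_lzx_item(unsigned main_symbol, unsigned len_symbol,
		      uint64_t extra_bits)
	{
		return (uint64_t)main_symbol |
		       ((uint64_t)len_symbol << 10) |
		       (extra_bits << 18);
	}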
@@ -1204,7 +1213,7 @@ lzx_tally_item_list(struct lzx_compressor *c, struct lzx_optimum_node *cur_node)
  * Also, note that because of the presence of the recent offsets queue (which is
  * a type of adaptive state), the algorithm cannot work backwards and compute
  * "cost to end" instead of "cost to beginning". Furthermore, the way the
- * algorithm handles this adaptive state in the "minimum-cost" parse is actually
+ * algorithm handles this adaptive state in the "minimum cost" parse is actually
  * only an approximation. It's possible for the globally optimal, minimum cost
  * path to contain a prefix, ending at a position, where that path prefix is
  * *not* the minimum cost path to that position. This can happen if such a path
@@ -1232,7 +1241,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
 	 * it is no longer needed. */
 	struct lzx_lru_queue queues[512];
 
-	BUILD_BUG_ON(ARRAY_LEN(queues) < LZX_MAX_MATCH_LEN + 1);
+	STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
 #define QUEUE(in) (queues[(uintptr_t)(in) % ARRAY_LEN(queues)])
 
 	/* Initially, the cost to reach each node is "infinity". */
@@ -1284,7 +1293,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
 	matchptr = in_next - lzx_lru_queue_R0(QUEUE(in_next));
 	if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
 		goto R0_done;
-	BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2);
+	STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
 	do {
 		u32 cost = cur_node->cost +
 			   c->costs.match_cost[0][
@@ -1425,7 +1434,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
 		}
 	} while (cur_node != end_node);
 
-	/* Return the match offset queue at the end of the minimum-cost path. */
+	/* Return the match offset queue at the end of the minimum cost path. */
 	return QUEUE(block_end);
 }
 
@@ -1469,7 +1478,7 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size)
 	unsigned num_used_bytes;
 
 	/* The costs below are hard coded to use a scaling factor of 16. */
-	BUILD_BUG_ON(LZX_BIT_COST != 16);
+	STATIC_ASSERT(LZX_BIT_COST == 16);
 
 	/*
 	 * Heuristics:
@@ -1590,7 +1599,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
 	struct lzx_lru_queue queue;
 
 	bt_matchfinder_init(&c->bt_mf);
-	matchfinder_init(c->hash2_tab, LZX_HASH2_LENGTH);
+	memset(c->hash2_tab, 0, sizeof(c->hash2_tab));
 	next_hash = bt_matchfinder_hash_3_bytes(in_next);
 	lzx_lru_queue_init(&queue);
 
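One invariant worth spelling out from the path search above: lzx_find_min_cost_path() keeps only 512 recent-offsets queues and indexes them by buffer address modulo 512, so a slot is reused once the search moves 512 positions ahead. The STATIC_ASSERT that ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1 is what makes the reuse safe, since no edge in the path graph spans more positions than a maximum-length match. In miniature (simplified model, not wimlib's code):

	#include <stdint.h>
	#include <stddef.h>

	#define MAX_MATCH_LEN	257	/* LZX's maximum match length */
	#define RING_LEN	512	/* must be >= MAX_MATCH_LEN + 1 */

	struct lru_queue { uint32_t R0, R1, R2; }; /* three recent offsets */

	static struct lru_queue ring[RING_LEN];

	/* The queue belonging to input position 'pos'. An edge in the
	 * path graph (a literal or a match) advances at most
	 * MAX_MATCH_LEN positions, so every slot is consumed before the
	 * ring wraps around and overwrites it. */
	static struct lru_queue *
	queue_for(size_t pos)
	{
		return &ring[pos % RING_LEN];
	}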
@@ -1614,13 +1623,11 @@
 			max_len = in_end - in_next;
 			nice_len = min(max_len, nice_len);
 
-			/* This extra check is needed to ensure that
-			 * reading the next 3 bytes when looking for a
-			 * length 2 match is valid. In addition, we
-			 * cannot allow ourselves to find a length 2
-			 * match of the very last two bytes with the
-			 * very first two bytes, since such a match has
-			 * an offset too large to be represented. */
+			/* This extra check is needed to ensure that we
+			 * never output a length 2 match of the very
+			 * last two bytes with the very first two bytes,
+			 * since such a match has an offset too large to
+			 * be represented. */
 			if (unlikely(max_len < 3)) {
 				in_next++;
 				cache_ptr->length = 0;
@@ -1635,11 +1642,10 @@
 			hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER);
 			cur_match = c->hash2_tab[hash2];
 			c->hash2_tab[hash2] = in_next - in_begin;
-			if (matchfinder_node_valid(cur_match) &&
+			if (cur_match != 0 &&
 			    (LZX_HASH2_ORDER == 16 ||
 			     load_u16_unaligned(&in_begin[cur_match]) ==
-			     load_u16_unaligned(in_next)) &&
-			    in_begin[cur_match + 2] != in_next[2]
+			     load_u16_unaligned(in_next)))
 			{
 				lz_matchptr->length = 2;
 				lz_matchptr->offset = in_next - &in_begin[cur_match];
@@ -1727,7 +1733,7 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next,
 				     struct lzx_lru_queue queue,
 				     unsigned *rep_max_idx_ret)
 {
-	BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3);
+	STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
 	LZX_ASSERT(bytes_remaining >= 2);
 
 	const unsigned max_len = min(bytes_remaining, LZX_MAX_MATCH_LEN);
@@ -2005,7 +2011,8 @@ lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
 }
 
 static u64
-lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level)
+lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
+		      bool destructive)
 {
 	u64 size = 0;
 
@@ -2013,13 +2020,14 @@ lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level)
 		return 0;
 
 	size += lzx_get_compressor_size(max_bufsize, compression_level);
-	size += max_bufsize; /* in_buffer */
+	if (!destructive)
+		size += max_bufsize; /* in_buffer */
 
 	return size;
 }
 
 static int
 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
-		      void **c_ret)
+		      bool destructive, void **c_ret)
 {
 	unsigned window_order;
 	struct lzx_compressor *c;
@@ -2028,18 +2036,20 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
 	if (window_order == 0)
 		return WIMLIB_ERR_INVALID_PARAM;
 
-	c = ALIGNED_MALLOC(lzx_get_compressor_size(max_bufsize,
-						   compression_level),
-			   MATCHFINDER_ALIGNMENT);
+	c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
 	if (!c)
 		goto oom0;
 
+	c->destructive = destructive;
+
 	c->num_main_syms = lzx_get_num_main_syms(window_order);
 	c->window_order = window_order;
 
-	c->in_buffer = MALLOC(max_bufsize);
-	if (!c->in_buffer)
-		goto oom1;
+	if (!c->destructive) {
+		c->in_buffer = MALLOC(max_bufsize);
+		if (!c->in_buffer)
+			goto oom1;
+	}
 
 	if (compression_level <= LZX_MAX_FAST_LEVEL) {
 
@@ -2101,24 +2111,28 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
 	return 0;
 
 oom1:
-	ALIGNED_FREE(c);
+	FREE(c);
 oom0:
 	return WIMLIB_ERR_NOMEM;
 }
 
 static size_t
-lzx_compress(const void *in, size_t in_nbytes,
-	     void *out, size_t out_nbytes_avail, void *_c)
+lzx_compress(const void *restrict in, size_t in_nbytes,
+	     void *restrict out, size_t out_nbytes_avail, void *restrict _c)
 {
 	struct lzx_compressor *c = _c;
 	struct lzx_output_bitstream os;
+	size_t result;
 
 	/* Don't bother trying to compress very small inputs. */
 	if (in_nbytes < 100)
 		return 0;
 
 	/* Copy the input data into the internal buffer and preprocess it. */
-	memcpy(c->in_buffer, in, in_nbytes);
+	if (c->destructive)
+		c->in_buffer = (void *)in;
+	else
+		memcpy(c->in_buffer, in, in_nbytes);
 	c->in_nbytes = in_nbytes;
 	lzx_do_e8_preprocessing(c->in_buffer, in_nbytes);
 
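The memcpy()-vs-alias choice above is the heart of destructive mode: when the caller promises not to need its buffer back, the compressor skips both the extra allocation (see lzx_get_needed_memory()) and the copy, and lets lzx_do_e8_preprocessing() modify the caller's data in place. The trade-off in isolation, with hypothetical helper names:

	#include <stdbool.h>
	#include <string.h>

	/* Choose between copying the input (safe) and aliasing it
	 * (fast). With 'destructive' set, the preprocessing step will
	 * scribble on the caller's buffer, so the caller must treat it
	 * as clobbered unless compression fails and the transform is
	 * undone. */
	static void *
	get_working_buffer(void *scratch, const void *in, size_t in_nbytes,
			   bool destructive)
	{
		if (destructive)
			return (void *)in;      /* zero-copy; input mutated */
		memcpy(scratch, in, in_nbytes); /* copy; input preserved */
		return scratch;
	}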
@@ -2133,7 +2147,10 @@ lzx_compress(const void *in, size_t in_nbytes,
 	(*c->impl)(c, &os);
 
 	/* Flush the output bitstream and return the compressed size or 0. */
-	return lzx_flush_output(&os);
+	result = lzx_flush_output(&os);
+	if (!result && c->destructive)
+		lzx_undo_e8_preprocessing(c->in_buffer, c->in_nbytes);
+	return result;
 }
 
 static void
@@ -2141,8 +2158,9 @@ lzx_free_compressor(void *_c)
 {
 	struct lzx_compressor *c = _c;
 
-	FREE(c->in_buffer);
-	ALIGNED_FREE(c);
+	if (!c->destructive)
+		FREE(c->in_buffer);
+	FREE(c);
 }
 
 const struct compressor_ops lzx_compressor_ops = {
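The undo call added in the first hunk above closes the destructive-mode contract: if the block did not compress (lzx_flush_output() returned 0), the caller will store the data uncompressed, so the in-place e8 transformation must be reverted first. From the caller's side this looks roughly like the following, using wimlib's public API as I understand it (the destructive flag is ORed into the compression level; this harness is a sketch, not from the wimlib documentation):

	#include <wimlib.h>

	/* Compress 'buf' in place-destructively; returns the compressed
	 * size, 0 if incompressible, or -1 on error. */
	int
	compress_block_destructively(void *buf, size_t size,
				     void *out, size_t out_avail)
	{
		struct wimlib_compressor *comp;
		size_t csize;
		int ret;

		ret = wimlib_create_compressor(WIMLIB_COMPRESSION_TYPE_LZX,
					       size,
					       50 | WIMLIB_COMPRESSOR_FLAG_DESTRUCTIVE,
					       &comp);
		if (ret != 0)
			return -1;

		csize = wimlib_compress(buf, size, out, out_avail, comp);
		/* csize == 0 means the data didn't fit compressed; 'buf'
		 * has been restored by the undo path above and can be
		 * stored raw. Otherwise 'buf' must be treated as
		 * clobbered. */
		wimlib_free_compressor(comp);
		return (int)csize;
	}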