X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzms_compress.c;h=f42a46ee2fe8d82292265dd5d9685c7bf9f5bbdf;hp=abf39f5e89f194d5ff0fb087b28a3c4c73b57100;hb=0096c56e9a0ee15f708523075d67ea7a1d77b456;hpb=a141cf94ae7562406ea1bc32c026ac85b37751a6

diff --git a/src/lzms_compress.c b/src/lzms_compress.c
index abf39f5e..f42a46ee 100644
--- a/src/lzms_compress.c
+++ b/src/lzms_compress.c
@@ -173,7 +173,7 @@ struct lzms_item {
 static inline void
 check_that_powers_fit_in_bitfield(void)
 {
-	BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
+	STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
 }
 
 /* A stripped-down version of the adaptive state in LZMS which excludes the
@@ -249,7 +249,7 @@ struct lzms_optimum_node {
 	 *
 	 * Note: this adaptive state structure also does not include the
 	 * probability entries or current Huffman codewords.  Those aren't
-	 * maintained per-position and are only updated occassionally.
+	 * maintained per-position and are only updated occasionally.
 	 */
 	struct lzms_adaptive_state state;
 } _aligned_attribute(64);
@@ -287,6 +287,10 @@ struct lzms_compressor {
 	 */
 	bool use_delta_matches;
 
+	/* If true, the compressor need not preserve the input buffer if it
+	 * compresses the data successfully.  */
+	bool destructive;
+
 	/* 'last_target_usages' is a large array that is only needed for
 	 * preprocessing, so it is in union with fields that don't need to be
 	 * initialized until after preprocessing.  */
@@ -303,7 +307,8 @@ struct lzms_compressor {
 	u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
 
 	/* The per-byte graph nodes for near-optimal parsing  */
-	struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH];
+	struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
+					       1 + MAX_FAST_LENGTH];
 
 	/* Table: length => current cost for small match lengths  */
 	u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
@@ -323,14 +328,7 @@ struct lzms_compressor {
 	unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
 	unsigned delta_state;
 	unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
-	struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS];
-	struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS];
-	struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS];
-	struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS]
-						  [LZMS_NUM_LZ_REP_PROBS];
-	struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS];
-	struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS]
-						     [LZMS_NUM_DELTA_REP_PROBS];
+	struct lzms_probabilites probs;
 
 	/* Huffman codes  */
 
@@ -500,7 +498,7 @@ lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
 		do {
 			if (likely(rc->next >= rc->begin)) {
 				if (rc->next != rc->end) {
-					put_unaligned_u16_le(rc->cache +
+					put_unaligned_le16(rc->cache +
 							     (u16)(rc->lower_bound >> 32),
 							     rc->next++);
 				}
@@ -582,42 +580,42 @@ static void
 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
 {
 	lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
-			c->main_probs, &c->rc);
+			c->probs.main, &c->rc);
 }
 
 static void
 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
 {
 	lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
-			c->match_probs, &c->rc);
+			c->probs.match, &c->rc);
 }
 
 static void
 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
 {
 	lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
-			c->lz_probs, &c->rc);
+			c->probs.lz, &c->rc);
 }
 
 static void
 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
 {
 	lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
-			c->lz_rep_probs[idx], &c->rc);
+			c->probs.lz_rep[idx], &c->rc);
 }
 
 static void
 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
 {
 	lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
-			c->delta_probs, &c->rc);
+			c->probs.delta, &c->rc);
 }
 
 static void
 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
 {
 	lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
-			c->delta_rep_probs[idx], &c->rc);
+			c->probs.delta_rep[idx], &c->rc);
 }
 
 /******************************************************************************
@@ -660,7 +658,7 @@ lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
 
 		/* Write a coding unit, unless it would underflow the buffer. */
 		if (os->next != os->begin)
-			put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
+			put_unaligned_le16(os->bitbuf >> os->bitcount, --os->next);
 
 		/* Optimization for call sites that never write more than 16
 		 * bits at once.  */
@@ -681,7 +679,7 @@ lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
 		return false;
 
 	if (os->bitcount != 0)
-		put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
+		put_unaligned_le16(os->bitbuf << (16 - os->bitcount), --os->next);
 
 	return true;
 }
@@ -942,7 +940,7 @@ lzms_encode_item_list(struct lzms_compressor *c,
 }
 
 /******************************************************************************
- *                             Cost evalution                                 *
+ *                             Cost evaluation                                *
  ******************************************************************************/
 
 /*
@@ -977,7 +975,7 @@ static inline void
 check_cost_shift(void)
 {
 	/* lzms_bit_costs is hard-coded to the current COST_SHIFT.  */
-	BUILD_BUG_ON(COST_SHIFT != 6);
+	STATIC_ASSERT(COST_SHIFT == 6);
 }
 
 #if 0
@@ -1018,7 +1016,7 @@ lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
 static inline u32
 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
 {
-	return lzms_bit_0_cost(main_state, c->main_probs) +
+	return lzms_bit_0_cost(main_state, c->probs.main) +
 		((u32)c->literal_lens[literal] << COST_SHIFT);
 }
 
@@ -1122,7 +1120,7 @@ lzms_update_lru_queues(struct lzms_adaptive_state *state)
 static inline void
 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
 {
-	*state_p = ((*state_p << 1) | bit) % num_states;
+	*state_p = ((*state_p << 1) | bit) & (num_states - 1);
 }
 
 static inline void
@@ -1182,7 +1180,7 @@ static void
 lzms_init_delta_matchfinder(struct lzms_compressor *c)
 {
 	/* Set all entries to use an invalid power, which will never match.  */
-	BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
+	STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
 	memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
 
 	/* Initialize the next hash code for each power.  We can just use zeroes
@@ -1197,7 +1195,7 @@ lzms_init_delta_matchfinder(struct lzms_compressor *c)
  * delta context with the specified @span.
  */
 static inline u32
-lzms_delta_hash(const u8 *p, u32 span)
+lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
 {
 	/* A delta match has a certain span and an offset that is a multiple of
 	 * that span.  To reduce wasted space we use a single combined hash
@@ -1205,11 +1203,11 @@ lzms_delta_hash(const u8 *p, u32 span)
 	 * include in the hash code computation the span and the low-order bits
 	 * of the current position.  */
 
-	BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
+	STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
 	u8 d0 = *(p + 0) - *(p + 0 - span);
 	u8 d1 = *(p + 1) - *(p + 1 - span);
 	u8 d2 = *(p + 2) - *(p + 2 - span);
-	u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) |
+	u32 v = ((span + (pos & (span - 1))) << 24) |
 		((u32)d2 << 16) | ((u32)d1 << 8) | d0;
 	return lz_hash(v, DELTA_HASH_ORDER);
 }
@@ -1245,12 +1243,12 @@ lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
 			const u32 span = (u32)1 << power;
 			if (unlikely(pos < span))
 				continue;
-			const u32 next_hash = lzms_delta_hash(in_next + 1, span);
+			const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
 			const u32 hash = c->next_delta_hashes[power];
 			c->delta_hash_table[hash] =
 				(power << DELTA_SOURCE_POWER_SHIFT) | pos;
 			c->next_delta_hashes[power] = next_hash;
-			prefetch(&c->delta_hash_table[next_hash]);
+			prefetchw(&c->delta_hash_table[next_hash]);
 		}
 	} while (in_next++, pos++, --count);
 }
@@ -1282,10 +1280,10 @@ lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
  * can be reached using a match or literal from the current position.  This is
  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
  * the graph edges are possible matches/literals to code, and the cost of each
- * edge is the estimated number of bits that will be required to output the
- * corresponding match or literal.  But one difference is that we actually
- * compute the lowest-cost path in pieces, where each piece is terminated when
- * there are no choices to be made.
+ * edge is the estimated number of bits (scaled by 2^COST_SHIFT) that will be
+ * required to output the corresponding match or literal.  But one difference is
+ * that we actually compute the lowest-cost path in pieces, where each piece is
+ * terminated when there are no choices to be made.
  *
  * The costs of literals and matches are estimated using the range encoder
  * states and the semi-adaptive Huffman codes.  Except for range encoding
@@ -1375,19 +1373,19 @@ begin:
 
 				u32 base_cost = cur_node->cost +
 						lzms_bit_1_cost(cur_node->state.main_state,
-								c->main_probs) +
+								c->probs.main) +
 						lzms_bit_0_cost(cur_node->state.match_state,
-								c->match_probs) +
+								c->probs.match) +
 						lzms_bit_1_cost(cur_node->state.lz_state,
-								c->lz_probs);
+								c->probs.lz);
 
 				for (int i = 0; i < rep_idx; i++)
 					base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
-								     c->lz_rep_probs[i]);
+								     c->probs.lz_rep[i]);
 
 				if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
 					base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
-								     c->lz_rep_probs[rep_idx]);
+								     c->probs.lz_rep[rep_idx]);
 
 				u32 len = 2;
 				do {
@@ -1437,10 +1435,10 @@ begin:
 					main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
 
 					/* add LZ-rep0 cost  */
-					cost += lzms_bit_1_cost(main_state, c->main_probs) +
-						lzms_bit_0_cost(match_state, c->match_probs) +
-						lzms_bit_1_cost(lz_state, c->lz_probs) +
-						lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) +
+					cost += lzms_bit_1_cost(main_state, c->probs.main) +
+						lzms_bit_0_cost(match_state, c->probs.match) +
+						lzms_bit_1_cost(lz_state, c->probs.lz) +
+						lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
 						lzms_fast_length_cost(c, rep0_len);
 
 					const u32 total_len = rep_len + 1 + rep0_len;
@@ -1527,19 +1525,19 @@ begin:
 
 				u32 base_cost = cur_node->cost +
 						lzms_bit_1_cost(cur_node->state.main_state,
-								c->main_probs) +
+								c->probs.main) +
 						lzms_bit_1_cost(cur_node->state.match_state,
-								c->match_probs) +
+								c->probs.match) +
 						lzms_bit_1_cost(cur_node->state.delta_state,
-								c->delta_probs);
+								c->probs.delta);
 
 				for (int i = 0; i < rep_idx; i++)
 					base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
-								     c->delta_rep_probs[i]);
+								     c->probs.delta_rep[i]);
 
 				if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
 					base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
-								     c->delta_rep_probs[rep_idx]);
+								     c->probs.delta_rep[rep_idx]);
 
 				u32 len = 2;
 				do {
@@ -1596,11 +1594,11 @@ begin:
 
 			u32 base_cost = cur_node->cost +
 					lzms_bit_1_cost(cur_node->state.main_state,
-							c->main_probs) +
+							c->probs.main) +
 					lzms_bit_0_cost(cur_node->state.match_state,
-							c->match_probs) +
+							c->probs.match) +
 					lzms_bit_0_cost(cur_node->state.lz_state,
-							c->lz_probs);
+							c->probs.lz);
 
 			if (c->try_lzmatch_lit_lzrep0 &&
 			    likely(in_end - (in_next + c->matches[0].length) >= 3))
@@ -1656,11 +1654,11 @@ begin:
 					main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
 
 					/* add LZ-rep0 cost  */
-					cost += lzms_bit_1_cost(main_state, c->main_probs) +
-						lzms_bit_0_cost(match_state, c->match_probs) +
-						lzms_bit_1_cost(lz_state, c->lz_probs) +
+					cost += lzms_bit_1_cost(main_state, c->probs.main) +
+						lzms_bit_0_cost(match_state, c->probs.match) +
+						lzms_bit_1_cost(lz_state, c->probs.lz) +
 						lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
-								c->lz_rep_probs[0]) +
+								c->probs.lz_rep[0]) +
 						lzms_fast_length_cost(c, rep0_len);
 
 					const u32 total_len = len + 1 + rep0_len;
@@ -1714,7 +1712,7 @@ begin:
 			const u32 pos = in_next - c->in_buffer;
 
 			/* Consider each possible power (log2 of span)  */
-			BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS);
+			STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
 			for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
 
 				const u32 span = (u32)1 << power;
@@ -1722,13 +1720,13 @@ begin:
 				if (unlikely(pos < span))
 					continue;
 
-				const u32 next_hash = lzms_delta_hash(in_next + 1, span);
+				const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
 				const u32 hash = c->next_delta_hashes[power];
 				const u32 cur_match = c->delta_hash_table[hash];
 
 				c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
 				c->next_delta_hashes[power] = next_hash;
-				prefetch(&c->delta_hash_table[next_hash]);
+				prefetchw(&c->delta_hash_table[next_hash]);
 
 				if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
 					continue;
@@ -1743,7 +1741,7 @@ begin:
 
 				/* Check the first 3 bytes before entering the
 				 * extension loop.  */
-				BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
+				STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
 				if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
 				     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
 				    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
@@ -1755,11 +1753,16 @@ begin:
 				/* Extend the delta match to its full length.  */
 				const u32 len = lzms_extend_delta_match(in_next,
 									matchptr,
-									3,
+									NBYTES_HASHED_FOR_DELTA,
 									in_end - in_next,
 									span);
 
 				const u32 raw_offset = offset >> power;
+
+				if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
+							  (LZMS_NUM_DELTA_REPS - 1)))
+					continue;
+
 				const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
 						 raw_offset;
 				const u32 source = DELTA_SOURCE_TAG |
@@ -1790,11 +1793,11 @@ begin:
 
 				u32 base_cost = cur_node->cost +
 						lzms_bit_1_cost(cur_node->state.main_state,
-								c->main_probs) +
+								c->probs.main) +
 						lzms_bit_1_cost(cur_node->state.match_state,
-								c->match_probs) +
+								c->probs.match) +
 						lzms_bit_0_cost(cur_node->state.delta_state,
-								c->delta_probs) +
+								c->probs.delta) +
 						lzms_delta_source_cost(c, power, raw_offset);
 
 				u32 l = NBYTES_HASHED_FOR_DELTA;
@@ -1848,13 +1851,13 @@ begin:
 
 				/* Add cost of LZ-rep0  */
 				const u32 cost = cur_and_lit_cost +
-						 lzms_bit_1_cost(main_state, c->main_probs) +
+						 lzms_bit_1_cost(main_state, c->probs.main) +
 						 lzms_bit_0_cost(cur_node->state.match_state,
-								 c->match_probs) +
+								 c->probs.match) +
 						 lzms_bit_1_cost(cur_node->state.lz_state,
-								 c->lz_probs) +
+								 c->probs.lz) +
 						 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
-								 c->lz_rep_probs[0]) +
+								 c->probs.lz_rep[0]) +
 						 lzms_fast_length_cost(c, rep0_len);
 
 				const u32 total_len = 1 + rep0_len;
@@ -1885,7 +1888,7 @@ begin:
 		 * Finalize the adaptive state that results from taking this
 		 * lowest-cost path.  */
 		struct lzms_item item_to_take = cur_node->item;
-		struct lzms_optimum_node *source_node = cur_node - (item_to_take.length);
+		struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
 		int next_item_idx = -1;
 		for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
 			item_to_take = cur_node->extra_items[i];
@@ -1912,9 +1915,9 @@ begin:
 
 					if (source >= LZMS_NUM_DELTA_REPS) {
 						/* Explicit offset delta match  */
-						u32 pair = source - (LZMS_NUM_DELTA_REPS - 1);
 						lzms_update_delta_state(&cur_node->state, 0);
-						cur_node->state.upcoming_delta_pair = pair;
+						cur_node->state.upcoming_delta_pair =
+							source - (LZMS_NUM_DELTA_REPS - 1);
 					} else {
 						/* Repeat offset delta match  */
 						int rep_idx = source;
@@ -2013,14 +2016,7 @@ lzms_init_states_and_probabilities(struct lzms_compressor *c)
 	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
 		c->delta_rep_states[i] = 0;
 
-	lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
-	lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
-	lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
-	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
-		lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS);
-	lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
-	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
-		lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS);
+	lzms_init_probabilities(&c->probs);
 }
 
 static void
@@ -2100,7 +2096,8 @@ lzms_finalize(struct lzms_compressor *c)
 }
 
 static u64
-lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level)
+lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
+		       bool destructive)
 {
 	u64 size = 0;
 
@@ -2109,8 +2106,8 @@ lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level)
 
 	size += sizeof(struct lzms_compressor);
 
-	/* in_buffer */
-	size += max_bufsize;
+	if (!destructive)
+		size += max_bufsize; /* in_buffer */
 
 	/* mf */
 	size += lcpit_matchfinder_get_needed_memory(max_bufsize);
@@ -2120,7 +2117,7 @@ lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level)
 
 static int
 lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
-		       void **c_ret)
+		       bool destructive, void **c_ret)
 {
 	struct lzms_compressor *c;
 	u32 nice_match_len;
@@ -2132,6 +2129,8 @@ lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
 	if (!c)
 		goto oom0;
 
+	c->destructive = destructive;
+
 	/* Scale nice_match_len with the compression level.  But to allow an
 	 * optimization for length cost calculations, don't allow nice_match_len
 	 * to exceed MAX_FAST_LENGTH.  */
@@ -2142,9 +2141,11 @@ lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
 	c->try_lit_lzrep0 = (compression_level >= 60);
 	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
 
-	c->in_buffer = MALLOC(max_bufsize);
-	if (!c->in_buffer)
-		goto oom1;
+	if (!c->destructive) {
+		c->in_buffer = MALLOC(max_bufsize);
+		if (!c->in_buffer)
+			goto oom1;
+	}
 
 	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
 		goto oom2;
@@ -2156,7 +2157,8 @@ lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
 	return 0;
 
 oom2:
-	FREE(c->in_buffer);
+	if (!c->destructive)
+		FREE(c->in_buffer);
 oom1:
 	ALIGNED_FREE(c);
 oom0:
@@ -2164,17 +2166,21 @@ oom0:
 }
 
 static size_t
-lzms_compress(const void *in, size_t in_nbytes,
-	      void *out, size_t out_nbytes_avail, void *_c)
+lzms_compress(const void *restrict in, size_t in_nbytes,
+	      void *restrict out, size_t out_nbytes_avail, void *restrict _c)
 {
 	struct lzms_compressor *c = _c;
+	size_t result;
 
 	/* Don't bother trying to compress extremely small inputs.  */
 	if (in_nbytes < 4)
 		return 0;
 
 	/* Copy the input data into the internal buffer and preprocess it.  */
-	memcpy(c->in_buffer, in, in_nbytes);
+	if (c->destructive)
+		c->in_buffer = (void *)in;
+	else
+		memcpy(c->in_buffer, in, in_nbytes);
 	c->in_nbytes = in_nbytes;
 	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
 
@@ -2187,13 +2193,16 @@ lzms_compress(const void *in, size_t in_nbytes,
 	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
 	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
 	lzms_init_states_and_probabilities(c);
-	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(in_nbytes));
+	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
 
 	/* The main loop: parse and encode.  */
 	lzms_near_optimal_parse(c);
 
 	/* Return the compressed data size or 0.  */
-	return lzms_finalize(c);
+	result = lzms_finalize(c);
+	if (!result && c->destructive)
+		lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
+	return result;
 }
 
 static void
@@ -2201,7 +2210,8 @@ lzms_free_compressor(void *_c)
 {
 	struct lzms_compressor *c = _c;
 
-	FREE(c->in_buffer);
+	if (!c->destructive)
+		FREE(c->in_buffer);
 	lcpit_matchfinder_destroy(&c->mf);
 	ALIGNED_FREE(c);
 }