wimlib.net Git - wimlib/blobdiff - src/lzx_compress.c
Get rid of matchfinder_common.h and manual memsets
index 45bb019bb94341c1a6cf146239fffe8ff15ea943..5e1be485f3bc9ff0590db093edc8f09599c3f0b5 100644 (file)
  * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the
  * match cache for each byte position.  This value should be high enough so that
  * nearly all the time, all matches found in a given block can fit in the match
- * cache.  However, fallback behavior on cache overflow is still required.
+ * cache.  However, fallback behavior (immediately terminating the block) on
+ * cache overflow is still required.
  */
-#define LZX_CACHE_PER_POS      6
+#define LZX_CACHE_PER_POS      7
 
-#define LZX_CACHE_LEN          (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
+/*
+ * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache,
+ * excluding the extra "overflow" entries.  The per-position multiplier is '1 +
+ * LZX_CACHE_PER_POS' instead of 'LZX_CACHE_PER_POS' because there is an
+ * overhead of one lz_match per position, used to hold the match count at that
+ * position.
+ */
+#define LZX_CACHE_LENGTH       (LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS))
 
+/*
+ * LZX_MAX_MATCHES_PER_POS is an upper bound on the number of matches that can
+ * ever be saved in the match cache for a single position.  Since each match we
+ * save for a single position has a distinct length, we can use the number of
+ * possible match lengths in LZX as this bound.  This bound is guaranteed to be
+ * valid in all cases, although if 'nice_match_length < LZX_MAX_MATCH_LEN', then
+ * it will never actually be reached.
+ */
 #define LZX_MAX_MATCHES_PER_POS        LZX_NUM_LENS
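
For a sense of scale, here is a small self-checking sketch of what these constants work out to.  The values of LZX_MIN_MATCH_LEN, LZX_MAX_MATCH_LEN, LZX_NUM_LENS, and LZX_DIV_BLOCK_SIZE are assumptions (they are defined elsewhere in wimlib, not in this diff):

    #include <assert.h>

    /* Assumed values; these definitions are not part of this diff. */
    #define LZX_MIN_MATCH_LEN  2
    #define LZX_MAX_MATCH_LEN  257
    #define LZX_NUM_LENS       (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
    #define LZX_DIV_BLOCK_SIZE 32768
    #define LZX_CACHE_PER_POS  7
    #define LZX_CACHE_LENGTH   (LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS))

    int main(void)
    {
            /* 256 distinct match lengths bound the matches per position. */
            assert(LZX_NUM_LENS == 256);

            /* One count header plus up to 7 matches per position gives
             * 262144 cache entries, about 2 MiB at 8 bytes per entry. */
            assert(LZX_CACHE_LENGTH == 262144);
            return 0;
    }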
 
 /*
  * LZX_BIT_COST is a scaling factor that represents the cost to output one bit.
- * THis makes it possible to consider fractional bit costs.
+ * This makes it possible to consider fractional bit costs.
  *
  * Note: this is only useful as a statistical trick for when the true costs are
  * unknown.  In reality, each token in LZX requires a whole number of bits to
 #define LZX_CONSIDER_ALIGNED_COSTS     0
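
The point of the LZX_BIT_COST comment above is a fixed-point trick.  A minimal sketch, assuming a scaling factor of 16 (the define itself is outside this diff's context): a symbol of probability p can be priced at roughly -log2(p) * 16 in integer cost units, so fractional bit costs are representable without floating point in the hot path.  This illustrates the idea only; it is not wimlib's cost function:

    #include <math.h>
    #include <stdint.h>

    #define LZX_BIT_COST 16  /* assumed; definition not shown in this diff */

    /* Illustrative only: cost of a symbol of probability p, in 1/16-bit
     * units.  approx_cost(0.5) == 16 (one bit); approx_cost(0.25) == 32. */
    static uint32_t approx_cost(double p)
    {
            return (uint32_t)(-log2(p) * LZX_BIT_COST + 0.5);
    }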
 
 /*
- * The maximum compression level at which we use the faster algorithm.
+ * LZX_MAX_FAST_LEVEL is the maximum compression level at which we use the
+ * faster algorithm.
  */
 #define LZX_MAX_FAST_LEVEL     34
 
 /*
  * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table
  * for finding length 2 matches.  This can be as high as 16 (in which case the
- * hash function is trivial), but using a smaller hash table actually speeds up
+ * hash function is trivial), but using a smaller hash table speeds up
  * compression due to reduced cache pressure.
  */
 #define LZX_HASH2_ORDER                12
 #include "wimlib/bt_matchfinder.h"
 #include "wimlib/compress_common.h"
 #include "wimlib/compressor_ops.h"
-#include "wimlib/endianness.h"
 #include "wimlib/error.h"
 #include "wimlib/hc_matchfinder.h"
 #include "wimlib/lz_extend.h"
@@ -352,6 +368,10 @@ struct lzx_compressor {
        /* Pointer to the compress() implementation chosen at allocation time */
        void (*impl)(struct lzx_compressor *, struct lzx_output_bitstream *);
 
+       /* If true, the compressor need not preserve the input buffer if it
+        * compresses the data successfully.  */
+       bool destructive;
+
        /* The Huffman symbol frequency counters for the current block.  */
        struct lzx_freqs freqs;
 
@@ -361,9 +381,26 @@ struct lzx_compressor {
        struct lzx_codes codes[2];
        unsigned codes_index;
 
-       /* The match/literal sequence the algorithm chose for the current block.
+       /*
+        * The match/literal sequence the algorithm chose for the current block.
+        *
+        * Notes on how large this array actually needs to be:
+        *
+        * - In lzx_compress_near_optimal(), the maximum block size is
+        *   'LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN - 1' bytes.  This occurs if
+        *   a match of the maximum length is found on the last byte.  Although
+        *   it is impossible for this particular case to actually result in a
+        *   parse of all literals, we reserve this many spaces anyway.
+        *
+        * - The worst case for lzx_compress_lazy() is a block of almost all
+        *   literals that ends with a series of matches of increasing scores,
+        *   causing a sequence of literals to be chosen before the last match
+        *   is finally chosen.  The number of items actually chosen in this
+        *   scenario is limited by the number of distinct match scores that
+        *   exist for matches shorter than 'nice_match_length'.  Having
+        *   'LZX_MAX_MATCH_LEN - 1' extra spaces is plenty for now.
         */
-       struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN + 1];
+       struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN - 1];
 
        /* Table mapping match offset => offset slot for small offsets  */
 #define LZX_NUM_FAST_OFFSETS 32768
@@ -378,21 +415,58 @@ struct lzx_compressor {
 
                /* Data for near-optimal parsing  */
                struct {
-                       /* The graph nodes for the current block  */
+                       /*
+                        * The graph nodes for the current block.
+                        *
+                        * We need at least 'LZX_DIV_BLOCK_SIZE +
+                        * LZX_MAX_MATCH_LEN - 1' nodes because that is the
+                        * maximum block size that may be used.  Add 1 because
+                        * we need a node to represent end-of-block.
+                        *
+                        * It is possible that nodes past end-of-block are
+                        * accessed during match consideration, but this can
+                        * only occur if the block was truncated at
+                        * LZX_DIV_BLOCK_SIZE.  So the same bound still applies.
+                        * Note that since nodes past the end of the block will
+                        * never actually have an effect on the items that are
+                        * chosen for the block, it makes no difference what
+                        * their costs are initialized to (if anything).
+                        */
                        struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE +
-                                                             LZX_MAX_MATCH_LEN + 1];
+                                                             LZX_MAX_MATCH_LEN - 1 + 1];
 
                        /* The cost model for the current block  */
                        struct lzx_costs costs;
 
-                       /* Cached matches for the current block  */
-                       struct lz_match match_cache[LZX_CACHE_LEN + 1 +
-                                                   LZX_MAX_MATCHES_PER_POS];
-                       struct lz_match *cache_overflow_mark;
+                       /*
+                        * Cached matches for the current block.  This array
+                        * contains the matches that were found at each position
+                        * in the block.  Specifically, for each position, there
+                        * is a special 'struct lz_match' whose 'length' field
+                        * contains the number of matches that were found at
+                        * that position; this is followed by the matches
+                        * themselves, if any, sorted by strictly increasing
+                        * length.
+                        *
+                        * Note: in rare cases, there will be a very high number
+                        * of matches in the block and this array will overflow.
+                        * If this happens, we force the end of the current
+                        * block.  LZX_CACHE_LENGTH is the length at which we
+                        * actually check for overflow.  The extra slots beyond
+                        * this are enough to absorb the worst case overflow,
+                        * which occurs if starting at
+                        * &match_cache[LZX_CACHE_LENGTH - 1], we write the
+                        * match count header, then write
+                        * LZX_MAX_MATCHES_PER_POS matches, then skip searching
+                        * for matches at 'LZX_MAX_MATCH_LEN - 1' positions and
+                        * write the match count header for each.
+                        */
+                       struct lz_match match_cache[LZX_CACHE_LENGTH +
+                                                   LZX_MAX_MATCHES_PER_POS +
+                                                   LZX_MAX_MATCH_LEN - 1];
 
                        /* Hash table for finding length 2 matches  */
-                       pos_t hash2_tab[LZX_HASH2_LENGTH]
-                               _aligned_attribute(MATCHFINDER_ALIGNMENT);
+                       pos_t hash2_tab[LZX_HASH2_LENGTH];
 
                        /* Binary trees matchfinder (MUST BE LAST!!!)  */
                        struct bt_matchfinder bt_mf;
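
The match_cache layout documented above lends itself to a simple sequential walk: read a count header, then that many matches, then the next header.  A minimal consumer sketch, where the two-u32 'struct lz_match' layout and the consider_match() callback are assumptions for illustration:

    #include <stdint.h>
    typedef uint32_t u32;

    struct lz_match { u32 length; u32 offset; };  /* assumed layout */

    static void consider_match(u32 pos, u32 len, u32 offset)  /* hypothetical */
    {
            (void)pos; (void)len; (void)offset;
    }

    static void
    walk_match_cache(const struct lz_match *cache_ptr, u32 block_size)
    {
            for (u32 pos = 0; pos < block_size; pos++) {
                    /* Header entry: 'length' holds the match count here. */
                    u32 num_matches = cache_ptr->length;
                    const struct lz_match *matches = cache_ptr + 1;

                    /* The matches follow their header, sorted by strictly
                     * increasing length. */
                    for (u32 i = 0; i < num_matches; i++)
                            consider_match(pos, matches[i].length,
                                           matches[i].offset);

                    cache_ptr = matches + num_matches;  /* next header */
            }
    }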
@@ -400,17 +474,6 @@ struct lzx_compressor {
        };
 };
 
-/* Compute a hash value for the next 2 bytes of uncompressed data.  */
-static inline u32
-lz_hash_2_bytes(const u8 *in_next)
-{
-       u16 next_2_bytes = load_u16_unaligned(in_next);
-       if (LZX_HASH2_ORDER == 16)
-               return next_2_bytes;
-       else
-               return lz_hash(next_2_bytes, LZX_HASH2_ORDER);
-}
-
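
The deleted helper does not simply disappear: the call sites below now pass the table order explicitly, as lz_hash_2_bytes(in_next, LZX_HASH2_ORDER), so the function has presumably moved into a shared matchfinder header in a parameterized form.  A sketch reconstructed from the removed body (an assumption; the shared header is not part of this diff):

    static inline u32
    lz_hash_2_bytes(const u8 *in_next, unsigned order)
    {
            u16 next_2_bytes = load_u16_unaligned(in_next);

            if (order == 16)  /* 16-bit table: the hash is trivial */
                    return next_2_bytes;
            return lz_hash(next_2_bytes, order);
    }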
 /*
  * Structure to keep track of the current state of sending bits to the
  * compressed output buffer.
@@ -426,14 +489,15 @@ struct lzx_output_bitstream {
        u32 bitcount;
 
        /* Pointer to the start of the output buffer.  */
-       le16 *start;
+       u8 *start;
 
        /* Pointer to the position in the output buffer at which the next coding
         * unit should be written.  */
-       le16 *next;
+       u8 *next;
 
-       /* Pointer past the end of the output buffer.  */
-       le16 *end;
+       /* Pointer just past the end of the output buffer, rounded down to a
+        * 2-byte boundary.  */
+       u8 *end;
 };
 
 /*
@@ -453,7 +517,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
        os->bitcount = 0;
        os->start = buffer;
        os->next = os->start;
-       os->end = os->start + size / sizeof(le16);
+       os->end = os->start + (size & ~1);
 }
 
 /*
@@ -463,9 +527,9 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
  * bits in @bits cannot be set.  At most 17 bits can be written at once.
  *
  * @max_num_bits is a compile-time constant that specifies the maximum number of
- * bits that can ever be written at the call site.  Currently, it is used to
- * optimize away the conditional code for writing a second 16-bit coding unit
- * when writing fewer than 17 bits.
+ * bits that can ever be written at the call site.  It is used to optimize away
+ * the conditional code for writing a second 16-bit coding unit when writing
+ * fewer than 17 bits.
  *
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * lzx_flush_output() will return 0 when it gets called.
@@ -493,16 +557,20 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
                os->bitcount -= 16;
 
                /* Write a coding unit, unless it would overflow the buffer.  */
-               if (os->next != os->end)
-                       put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++);
+               if (os->next != os->end) {
+                       put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next);
+                       os->next += 2;
+               }
 
                /* If writing 17 bits, a second coding unit might need to be
                 * written.  But because 'max_num_bits' is a compile-time
                 * constant, the compiler will optimize away this code at most
                 * call sites.  */
                if (max_num_bits == 17 && os->bitcount == 16) {
-                       if (os->next != os->end)
-                               put_unaligned_u16_le(os->bitbuf, os->next++);
+                       if (os->next != os->end) {
+                               put_unaligned_u16_le(os->bitbuf, os->next);
+                               os->next += 2;
+                       }
                        os->bitcount = 0;
                }
        }
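
A worked instance of the 17-bit corner case handled above, tracking only the bitcount arithmetic (a sketch, not the real data path): with 15 bits already pending, writing 17 more brings bitcount to 32, the first coding unit leaves exactly 16 pending, and the max_num_bits == 17 branch emits the second unit.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t bitcount = 15;  /* bits already pending in bitbuf */

            bitcount += 17;          /* lzx_write_varbits(os, bits, 17, 17) */
            assert(bitcount == 32);
            bitcount -= 16;          /* first 16-bit coding unit emitted */
            assert(bitcount == 16);  /* triggers the second-unit branch */
            bitcount = 0;            /* second coding unit emitted */
            return 0;
    }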
@@ -511,8 +579,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os,
 /* Use when @num_bits is a compile-time constant.  Otherwise use
  * lzx_write_varbits().  */
 static inline void
-lzx_write_bits(struct lzx_output_bitstream *os,
-              const u32 bits, const unsigned num_bits)
+lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 {
        lzx_write_varbits(os, bits, num_bits, num_bits);
 }
@@ -527,10 +594,12 @@ lzx_flush_output(struct lzx_output_bitstream *os)
        if (os->next == os->end)
                return 0;
 
-       if (os->bitcount != 0)
-               put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++);
+       if (os->bitcount != 0) {
+               put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next);
+               os->next += 2;
+       }
 
-       return (const u8 *)os->next - (const u8 *)os->start;
+       return os->next - os->start;
 }
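
Taken together, the bitstream functions above are used in the pattern sketched below (buffer size and bit patterns are arbitrary; this is a usage sketch, not code from the file):

    static size_t example_block(void)
    {
            u8 buf[256];
            struct lzx_output_bitstream os;

            lzx_init_output(&os, buf, sizeof(buf)); /* end = start + (256 & ~1) */
            lzx_write_bits(&os, 0x3, 2);            /* compile-time bit count */
            lzx_write_varbits(&os, 0x1234, 13, 17); /* variable count, <= 17 */
            return lzx_flush_output(&os);           /* 0 if buffer overflowed */
    }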
 
 /* Build the main, length, and aligned offset Huffman codes used in LZX.
@@ -1051,8 +1120,8 @@ lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 of
                extra_bits = (offset + LZX_OFFSET_ADJUSTMENT) -
                             lzx_offset_slot_base[offset_slot];
 
-               BUILD_BUG_ON(LZX_MAINCODE_MAX_NUM_SYMBOLS > (1 << 10));
-               BUILD_BUG_ON(LZX_LENCODE_NUM_SYMBOLS > (1 << 8));
+               STATIC_ASSERT(LZX_MAINCODE_MAX_NUM_SYMBOLS <= (1 << 10));
+               STATIC_ASSERT(LZX_LENCODE_NUM_SYMBOLS <= (1 << 8));
                *(*next_chosen_item)++ = (struct lzx_item) {
                        .data = (u64)main_symbol |
                                ((u64)len_symbol << 10) |
@@ -1144,7 +1213,7 @@ lzx_tally_item_list(struct lzx_compressor *c, struct lzx_optimum_node *cur_node)
  * Also, note that because of the presence of the recent offsets queue (which is
  * a type of adaptive state), the algorithm cannot work backwards and compute
  * "cost to end" instead of "cost to beginning".  Furthermore, the way the
- * algorithm handles this adaptive state in the "minimum-cost" parse is actually
+ * algorithm handles this adaptive state in the "minimum cost" parse is actually
  * only an approximation.  It's possible for the globally optimal, minimum cost
  * path to contain a prefix, ending at a position, where that path prefix is
  * *not* the minimum cost path to that position.  This can happen if such a path
@@ -1172,7 +1241,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
         * it is no longer needed.  */
        struct lzx_lru_queue queues[512];
 
-       BUILD_BUG_ON(ARRAY_LEN(queues) < LZX_MAX_MATCH_LEN + 1);
+       STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
 #define QUEUE(in) (queues[(uintptr_t)(in) % ARRAY_LEN(queues)])
 
        /* Initially, the cost to reach each node is "infinity".  */
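
Why 512 slots suffice for the queue ring: two positions whose queues must be live at the same time are at most LZX_MAX_MATCH_LEN apart, and any two pointers that close map to distinct slots under modulo 512.  A self-checking sketch, assuming LZX_MAX_MATCH_LEN == 257 (the LZX maximum, defined outside this diff):

    #include <assert.h>
    #include <stdint.h>

    #define LZX_MAX_MATCH_LEN 257  /* assumed; defined outside this diff */
    #define NUM_QUEUES        512

    int main(void)
    {
            /* Pointers at most LZX_MAX_MATCH_LEN apart never share a
             * slot, because the distance is less than the ring size. */
            for (uintptr_t p = 0; p < 4 * NUM_QUEUES; p++)
                    for (uintptr_t d = 1; d <= LZX_MAX_MATCH_LEN; d++)
                            assert(p % NUM_QUEUES != (p + d) % NUM_QUEUES);
            return 0;
    }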
@@ -1224,7 +1293,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
                        matchptr = in_next - lzx_lru_queue_R0(QUEUE(in_next));
                        if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
                                goto R0_done;
-                       BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2);
+                       STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
                        do {
                                u32 cost = cur_node->cost +
                                           c->costs.match_cost[0][
@@ -1365,7 +1434,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
                }
        } while (cur_node != end_node);
 
-       /* Return the match offset queue at the end of the minimum-cost path. */
+       /* Return the match offset queue at the end of the minimum cost path. */
        return QUEUE(block_end);
 }
 
@@ -1409,7 +1478,7 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size)
        unsigned num_used_bytes;
 
        /* The costs below are hard coded to use a scaling factor of 16.  */
-       BUILD_BUG_ON(LZX_BIT_COST != 16);
+       STATIC_ASSERT(LZX_BIT_COST == 16);
 
        /*
         * Heuristics:
@@ -1530,7 +1599,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
        struct lzx_lru_queue queue;
 
        bt_matchfinder_init(&c->bt_mf);
-       matchfinder_init(c->hash2_tab, LZX_HASH2_LENGTH);
+       memset(c->hash2_tab, 0, sizeof(c->hash2_tab));
        next_hash = bt_matchfinder_hash_3_bytes(in_next);
        lzx_lru_queue_init(&queue);
 
@@ -1554,16 +1623,12 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
                                max_len = in_end - in_next;
                                nice_len = min(max_len, nice_len);
 
-                               /* This extra check is needed to ensure that
-                                * reading the next 3 bytes when looking for a
-                                * length 2 match is valid.  In addition, we
-                                * cannot allow ourselves to find a length 2
-                                * match of the very last two bytes with the
-                                * very first two bytes, since such a match has
-                                * an offset too large to be represented.  */
-                               if (unlikely(max_len <
-                                            max(LZ_HASH_REQUIRED_NBYTES, 3)))
-                               {
+                               /* This extra check is needed to ensure that we
+                                * never output a length 2 match of the very
+                                * last two bytes with the very first two bytes,
+                                * since such a match has an offset too large to
+                                * be represented.  */
+                               if (unlikely(max_len < 3)) {
                                        in_next++;
                                        cache_ptr->length = 0;
                                        cache_ptr++;
@@ -1574,14 +1639,13 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
                        lz_matchptr = cache_ptr + 1;
 
                        /* Check for a length 2 match.  */
-                       hash2 = lz_hash_2_bytes(in_next);
+                       hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER);
                        cur_match = c->hash2_tab[hash2];
                        c->hash2_tab[hash2] = in_next - in_begin;
-                       if (matchfinder_node_valid(cur_match) &&
+                       if (cur_match != 0 &&
                            (LZX_HASH2_ORDER == 16 ||
                             load_u16_unaligned(&in_begin[cur_match]) ==
-                            load_u16_unaligned(in_next)) &&
-                           in_begin[cur_match + 2] != in_next[2])
+                            load_u16_unaligned(in_next)))
                        {
                                lz_matchptr->length = 2;
                                lz_matchptr->offset = in_next - &in_begin[cur_match];
@@ -1621,16 +1685,14 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
                                        if (unlikely(max_len > in_end - in_next)) {
                                                max_len = in_end - in_next;
                                                nice_len = min(max_len, nice_len);
-                                               if (unlikely(max_len <
-                                                            max(LZ_HASH_REQUIRED_NBYTES, 3)))
-                                               {
+                                               if (unlikely(max_len < 3)) {
                                                        in_next++;
                                                        cache_ptr->length = 0;
                                                        cache_ptr++;
                                                        continue;
                                                }
                                        }
-                                       c->hash2_tab[lz_hash_2_bytes(in_next)] =
+                                       c->hash2_tab[lz_hash_2_bytes(in_next, LZX_HASH2_ORDER)] =
                                                in_next - in_begin;
                                        bt_matchfinder_skip_position(&c->bt_mf,
                                                                     in_begin,
@@ -1645,7 +1707,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c,
                                } while (--best_len);
                        }
                } while (in_next < in_block_end &&
-                        likely(cache_ptr < c->cache_overflow_mark));
+                        likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH]));
 
                /* We've finished running the block through the matchfinder.
                 * Now choose a match/literal sequence and write the block.  */
@@ -1671,7 +1733,7 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next,
                                     struct lzx_lru_queue queue,
                                     unsigned *rep_max_idx_ret)
 {
-       BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3);
+       STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
        LZX_ASSERT(bytes_remaining >= 2);
 
        const unsigned max_len = min(bytes_remaining, LZX_MAX_MATCH_LEN);
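
The queue searched here follows the usual LZX recent-offsets semantics, sketched below under that assumption (R0 is the most recent offset; using R1 or R2 swaps it with R0, and an explicit offset pushes onto the front, evicting R2):

    #include <stdint.h>
    typedef uint32_t u32;

    struct lru { u32 R[3]; };  /* R0, R1, R2: three recent offsets */

    /* Explicit-offset match: push to the front, evicting R2. */
    static void lru_push(struct lru *q, u32 offset)
    {
            q->R[2] = q->R[1];
            q->R[1] = q->R[0];
            q->R[0] = offset;
    }

    /* Repeat-offset match: the used offset swaps with R0 (a no-op for
     * R0 itself). */
    static void lru_swap_to_front(struct lru *q, unsigned idx)
    {
            u32 tmp = q->R[idx];

            q->R[idx] = q->R[0];
            q->R[0] = tmp;
    }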
@@ -1949,7 +2011,8 @@ lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
 }
 
 static u64
-lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level)
+lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
+                     bool destructive)
 {
        u64 size = 0;
 
@@ -1957,13 +2020,14 @@ lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level)
                return 0;
 
        size += lzx_get_compressor_size(max_bufsize, compression_level);
-       size += max_bufsize; /* in_buffer */
+       if (!destructive)
+               size += max_bufsize; /* in_buffer */
        return size;
 }
 
 static int
 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
-                     void **c_ret)
+                     bool destructive, void **c_ret)
 {
        unsigned window_order;
        struct lzx_compressor *c;
@@ -1972,18 +2036,20 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
        if (window_order == 0)
                return WIMLIB_ERR_INVALID_PARAM;
 
-       c = ALIGNED_MALLOC(lzx_get_compressor_size(max_bufsize,
-                                                  compression_level),
-                          MATCHFINDER_ALIGNMENT);
+       c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
        if (!c)
                goto oom0;
 
+       c->destructive = destructive;
+
        c->num_main_syms = lzx_get_num_main_syms(window_order);
        c->window_order = window_order;
 
-       c->in_buffer = MALLOC(max_bufsize);
-       if (!c->in_buffer)
-               goto oom1;
+       if (!c->destructive) {
+               c->in_buffer = MALLOC(max_bufsize);
+               if (!c->in_buffer)
+                       goto oom1;
+       }
 
        if (compression_level <= LZX_MAX_FAST_LEVEL) {
 
@@ -1991,9 +2057,13 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
 
                c->impl = lzx_compress_lazy;
                c->max_search_depth = (36 * compression_level) / 20;
-               c->nice_match_length = min((72 * compression_level) / 20,
-                                          LZX_MAX_MATCH_LEN);
+               c->nice_match_length = (72 * compression_level) / 20;
 
+               /* lzx_compress_lazy() needs max_search_depth >= 2 because it
+                * halves the max_search_depth when attempting a lazy match, and
+                * max_search_depth cannot be 0.  */
+               if (c->max_search_depth < 2)
+                       c->max_search_depth = 2;
        } else {
 
                /* Normal / high compression: Use near-optimal parsing.  */
@@ -2003,8 +2073,7 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
                /* Scale nice_match_length and max_search_depth with the
                 * compression level.  */
                c->max_search_depth = (24 * compression_level) / 50;
-               c->nice_match_length = min((32 * compression_level) / 50,
-                                          LZX_MAX_MATCH_LEN);
+               c->nice_match_length = (32 * compression_level) / 50;
 
                /* Set a number of optimization passes appropriate for the
                 * compression level.  */
@@ -2028,33 +2097,42 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
                        if (compression_level >= 300)
                                c->num_optim_passes++;
                }
-
-               c->cache_overflow_mark = &c->match_cache[LZX_CACHE_LEN];
        }
 
+       /* max_search_depth == 0 is invalid.  */
+       if (c->max_search_depth < 1)
+               c->max_search_depth = 1;
+
+       if (c->nice_match_length > LZX_MAX_MATCH_LEN)
+               c->nice_match_length = LZX_MAX_MATCH_LEN;
+
        lzx_init_offset_slot_fast(c);
        *c_ret = c;
        return 0;
 
 oom1:
-       ALIGNED_FREE(c);
+       FREE(c);
 oom0:
        return WIMLIB_ERR_NOMEM;
 }
 
 static size_t
-lzx_compress(const void *in, size_t in_nbytes,
-            void *out, size_t out_nbytes_avail, void *_c)
+lzx_compress(const void *restrict in, size_t in_nbytes,
+            void *restrict out, size_t out_nbytes_avail, void *restrict _c)
 {
        struct lzx_compressor *c = _c;
        struct lzx_output_bitstream os;
+       size_t result;
 
        /* Don't bother trying to compress very small inputs.  */
        if (in_nbytes < 100)
                return 0;
 
-       /* Copy the input data into the internal buffer and preprocess it.  */
+       /* Preprocess the input data, copying it into the internal buffer
+        * first unless the compressor is operating destructively.  */
-       memcpy(c->in_buffer, in, in_nbytes);
+       if (c->destructive)
+               c->in_buffer = (void *)in;
+       else
+               memcpy(c->in_buffer, in, in_nbytes);
        c->in_nbytes = in_nbytes;
        lzx_do_e8_preprocessing(c->in_buffer, in_nbytes);
 
@@ -2069,7 +2147,10 @@ lzx_compress(const void *in, size_t in_nbytes,
        (*c->impl)(c, &os);
 
        /* Flush the output bitstream and return the compressed size or 0.  */
-       return lzx_flush_output(&os);
+       result = lzx_flush_output(&os);
+       if (!result && c->destructive)
+               lzx_undo_e8_preprocessing(c->in_buffer, c->in_nbytes);
+       return result;
 }
 
 static void
@@ -2077,8 +2158,9 @@ lzx_free_compressor(void *_c)
 {
        struct lzx_compressor *c = _c;
 
-       FREE(c->in_buffer);
-       ALIGNED_FREE(c);
+       if (!c->destructive)
+               FREE(c->in_buffer);
+       FREE(c);
 }
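
Putting the allocation, compression, and teardown entry points together, a lifecycle sketch (these are internal functions normally reached through lzx_compressor_ops; the compression level of 50 and the non-destructive mode are arbitrary choices for illustration):

    static size_t try_compress(const void *in, size_t in_nbytes,
                               void *out, size_t out_nbytes_avail)
    {
            void *c;
            size_t n;

            if (lzx_create_compressor(in_nbytes, 50, false, &c) != 0)
                    return 0;  /* invalid window size or out of memory */

            /* destructive == false: the input is copied and preserved.
             * With true, 'in' may be modified even on success. */
            n = lzx_compress(in, in_nbytes, out, out_nbytes_avail, c);

            lzx_free_compressor(c);
            return n;  /* 0 means "store the data uncompressed" */
    }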
 
 const struct compressor_ops lzx_compressor_ops = {