]> wimlib.net Git - wimlib/blobdiff - src/xpress_compress.c
Stop force-inlining everything marked 'inline'
[wimlib] / src / xpress_compress.c
index 04105ae06fb53c319f1e5a6c0f21091c2e8a9b00..1b430912de7521d5b2abb622f6b1a694f7ab6281 100644 (file)
 #define MIN_LEVEL_FOR_NEAR_OPTIMAL     60
 
 /*
- * The window order for the matchfinder.  This must be the base 2 logarithm of
- * the maximum buffer size.
+ * Matchfinder definitions.  For XPRESS, only a 16-bit matchfinder is needed.
  */
-#define MATCHFINDER_WINDOW_ORDER       16
+#define mf_pos_t       u16
+#define MF_SUFFIX
 
 /*
- * Although XPRESS can potentially use a sliding window, it isn't well suited
- * for large buffers of data because there is no way to reset the Huffman code.
- * Therefore, we only allow buffers in which there is no restriction on match
- * offsets (no sliding window).  This simplifies the code and allows some
+ * Note: although XPRESS can potentially use a sliding window, it isn't well
+ * suited for large buffers of data because there is no way to reset the Huffman
+ * code.  Therefore, we only allow buffers in which there is no restriction on
+ * match offsets (no sliding window).  This simplifies the code and allows some
  * optimizations.
  */
-#define MATCHFINDER_IS_SLIDING         0
-
-#include <string.h>
 
 #include "wimlib/bitops.h"
 #include "wimlib/compress_common.h"
@@ -122,21 +119,21 @@ struct xpress_compressor {
        union {
                /* Data for greedy or lazy parsing  */
                struct {
-                       struct hc_matchfinder hc_mf;
                        struct xpress_item *chosen_items;
-                       u8 nonoptimal_end[0];
+                       struct hc_matchfinder hc_mf;
+                       /* hc_mf must be last!  */
                };
 
        #if SUPPORT_NEAR_OPTIMAL_PARSING
                /* Data for near-optimal parsing  */
                struct {
-                       struct bt_matchfinder bt_mf;
                        struct xpress_optimum_node *optimum_nodes;
                        struct lz_match *match_cache;
                        struct lz_match *cache_overflow_mark;
                        unsigned num_optim_passes;
                        u32 costs[XPRESS_NUM_SYMBOLS];
-                       u8 optimal_end[0];
+                       struct bt_matchfinder bt_mf;
+                       /* bt_mf must be last!  */
                };
        #endif
        };
@@ -215,7 +212,7 @@ struct xpress_output_bitstream {
        /* Pointer to the start of the output buffer.  */
        u8 *start;
 
-       /* Pointer to the location in the ouput buffer at which to write the
+       /* Pointer to the location in the output buffer at which to write the
         * next 16 bits.  */
        u8 *next_bits;
 
@@ -282,7 +279,7 @@ xpress_init_output(struct xpress_output_bitstream *os, void *buffer, size_t size
  * If the output buffer space is exhausted, then the bits will be ignored, and
  * xpress_flush_output() will return 0 when it gets called.
  */
-static inline void
+static forceinline void
 xpress_write_bits(struct xpress_output_bitstream *os,
                  const u32 bits, const unsigned num_bits)
 {
@@ -295,7 +292,7 @@ xpress_write_bits(struct xpress_output_bitstream *os,
        if (os->bitcount > 16) {
                os->bitcount -= 16;
                if (os->end - os->next_byte >= 2) {
-                       put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next_bits);
+                       put_unaligned_le16(os->bitbuf >> os->bitcount, os->next_bits);
                        os->next_bits = os->next_bits2;
                        os->next_bits2 = os->next_byte;
                        os->next_byte += 2;
@@ -306,7 +303,7 @@ xpress_write_bits(struct xpress_output_bitstream *os,
 /*
  * Interweave a literal byte into the output bitstream.
  */
-static inline void
+static forceinline void
 xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 {
        if (os->next_byte < os->end)
@@ -316,11 +313,11 @@ xpress_write_byte(struct xpress_output_bitstream *os, u8 byte)
 /*
  * Interweave two literal bytes into the output bitstream.
  */
-static inline void
+static forceinline void
 xpress_write_u16(struct xpress_output_bitstream *os, u16 v)
 {
        if (os->end - os->next_byte >= 2) {
-               put_unaligned_u16_le(v, os->next_byte);
+               put_unaligned_le16(v, os->next_byte);
                os->next_byte += 2;
        }
 }
@@ -335,13 +332,13 @@ xpress_flush_output(struct xpress_output_bitstream *os)
        if (os->end - os->next_byte < 2)
                return 0;
 
-       put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits);
-       put_unaligned_u16_le(0, os->next_bits2);
+       put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next_bits);
+       put_unaligned_le16(0, os->next_bits2);
 
        return os->next_byte - os->start;
 }
 
-static inline void
+static forceinline void
 xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
                                unsigned adjusted_len)
 {
@@ -356,7 +353,7 @@ xpress_write_extra_length_bytes(struct xpress_output_bitstream *os,
 }
 
 /* Output a match or literal.  */
-static inline void
+static forceinline void
 xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os,
                  const u32 codewords[], const u8 lens[])
 {
@@ -397,11 +394,11 @@ xpress_write_item_list(struct xpress_output_bitstream *os,
                       struct xpress_optimum_node *optimum_nodes,
                       size_t count, const u32 codewords[], const u8 lens[])
 {
-       struct xpress_optimum_node *cur_optimum_ptr = optimum_nodes;
-       struct xpress_optimum_node *end_optimum_ptr = optimum_nodes + count;
+       struct xpress_optimum_node *cur_node = optimum_nodes;
+       struct xpress_optimum_node *end_node = optimum_nodes + count;
        do {
-               unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK;
-               unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT;
+               unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
+               unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
 
                if (length == 1) {
                        /* Literal  */
@@ -411,22 +408,22 @@ xpress_write_item_list(struct xpress_output_bitstream *os,
                } else {
                        /* Match  */
                        unsigned adjusted_len;
-                       unsigned offset_high_bit;
+                       unsigned log2_offset;
                        unsigned len_hdr;
                        unsigned sym;
 
                        adjusted_len = length - XPRESS_MIN_MATCH_LEN;
-                       offset_high_bit = fls32(offset);
+                       log2_offset = bsr32(offset);
                        len_hdr = min(0xF, adjusted_len);
-                       sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
+                       sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr);
 
                        xpress_write_bits(os, codewords[sym], lens[sym]);
                        xpress_write_extra_length_bytes(os, adjusted_len);
-                       xpress_write_bits(os, offset - (1U << offset_high_bit),
-                                         offset_high_bit);
+                       xpress_write_bits(os, offset - (1U << log2_offset),
+                                         log2_offset);
                }
-               cur_optimum_ptr += length;
-       } while (cur_optimum_ptr != end_optimum_ptr);
+               cur_node += length;
+       } while (cur_node != end_node);
 }
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 
@@ -487,7 +484,7 @@ xpress_write(struct xpress_compressor *c, void *out, size_t out_nbytes_avail,
 
 /* Tally the Huffman symbol for a literal and return the intermediate
  * representation of that literal.  */
-static inline struct xpress_item
+static forceinline struct xpress_item
 xpress_record_literal(struct xpress_compressor *c, unsigned literal)
 {
        c->freqs[literal]++;
@@ -499,21 +496,21 @@ xpress_record_literal(struct xpress_compressor *c, unsigned literal)
 
 /* Tally the Huffman symbol for a match and return the intermediate
  * representation of that match.  */
-static inline struct xpress_item
+static forceinline struct xpress_item
 xpress_record_match(struct xpress_compressor *c, unsigned length, unsigned offset)
 {
        unsigned adjusted_len = length - XPRESS_MIN_MATCH_LEN;
        unsigned len_hdr = min(adjusted_len, 0xF);
-       unsigned offset_high_bit = fls32(offset);
-       unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
+       unsigned log2_offset = bsr32(offset);
+       unsigned sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr);
 
        c->freqs[sym]++;
 
        return (struct xpress_item) {
                .data = (u64)sym |
                        ((u64)adjusted_len << 9) |
-                       ((u64)offset_high_bit << 25) |
-                       ((u64)(offset ^ (1U << offset_high_bit)) << 29),
+                       ((u64)log2_offset << 25) |
+                       ((u64)(offset ^ (1U << log2_offset)) << 29),
        };
 }
 
@@ -527,11 +524,12 @@ xpress_compress_greedy(struct xpress_compressor * restrict c,
                       const void * restrict in, size_t in_nbytes,
                       void * restrict out, size_t out_nbytes_avail)
 {
-       const u8 * const in_base = in;
-       const u8 *       in_next = in_base;
-       const u8 * const in_end = in_base + in_nbytes;
+       const u8 * const in_begin = in;
+       const u8 *       in_next = in_begin;
+       const u8 * const in_end = in_begin + in_nbytes;
        struct xpress_item *next_chosen_item = c->chosen_items;
        unsigned len_3_too_far;
+       u32 next_hashes[2] = {};
 
        if (in_nbytes <= 8192)
                len_3_too_far = 2048;
@@ -545,12 +543,13 @@ xpress_compress_greedy(struct xpress_compressor * restrict c,
                unsigned offset;
 
                length = hc_matchfinder_longest_match(&c->hc_mf,
-                                                     in_base,
-                                                     in_next,
+                                                     in_begin,
+                                                     in_next - in_begin,
                                                      XPRESS_MIN_MATCH_LEN - 1,
                                                      in_end - in_next,
                                                      min(in_end - in_next, c->nice_match_length),
                                                      c->max_search_depth,
+                                                     next_hashes,
                                                      &offset);
                if (length >= XPRESS_MIN_MATCH_LEN &&
                    !(length == XPRESS_MIN_MATCH_LEN && offset >= len_3_too_far))
@@ -560,10 +559,11 @@ xpress_compress_greedy(struct xpress_compressor * restrict c,
                                xpress_record_match(c, length, offset);
                        in_next += 1;
                        hc_matchfinder_skip_positions(&c->hc_mf,
-                                                     in_base,
-                                                     in_next,
-                                                     in_end,
-                                                     length - 1);
+                                                     in_begin,
+                                                     in_next - in_begin,
+                                                     in_end - in_begin,
+                                                     length - 1,
+                                                     next_hashes);
                        in_next += length - 1;
                } else {
                        /* No match found  */
@@ -587,11 +587,12 @@ xpress_compress_lazy(struct xpress_compressor * restrict c,
                     const void * restrict in, size_t in_nbytes,
                     void * restrict out, size_t out_nbytes_avail)
 {
-       const u8 * const in_base = in;
-       const u8 *       in_next = in_base;
-       const u8 * const in_end = in_base + in_nbytes;
+       const u8 * const in_begin = in;
+       const u8 *       in_next = in_begin;
+       const u8 * const in_end = in_begin + in_nbytes;
        struct xpress_item *next_chosen_item = c->chosen_items;
        unsigned len_3_too_far;
+       u32 next_hashes[2] = {};
 
        if (in_nbytes <= 8192)
                len_3_too_far = 2048;
@@ -608,12 +609,13 @@ xpress_compress_lazy(struct xpress_compressor * restrict c,
 
                /* Find the longest match at the current position.  */
                cur_len = hc_matchfinder_longest_match(&c->hc_mf,
-                                                      in_base,
-                                                      in_next,
+                                                      in_begin,
+                                                      in_next - in_begin,
                                                       XPRESS_MIN_MATCH_LEN - 1,
                                                       in_end - in_next,
                                                       min(in_end - in_next, c->nice_match_length),
                                                       c->max_search_depth,
+                                                      next_hashes,
                                                       &cur_offset);
                in_next += 1;
 
@@ -637,10 +639,11 @@ xpress_compress_lazy(struct xpress_compressor * restrict c,
                                xpress_record_match(c, cur_len, cur_offset);
 
                        hc_matchfinder_skip_positions(&c->hc_mf,
-                                                     in_base,
-                                                     in_next,
-                                                     in_end,
-                                                     cur_len - 1);
+                                                     in_begin,
+                                                     in_next - in_begin,
+                                                     in_end - in_begin,
+                                                     cur_len - 1,
+                                                     next_hashes);
                        in_next += cur_len - 1;
                        continue;
                }
@@ -662,12 +665,13 @@ xpress_compress_lazy(struct xpress_compressor * restrict c,
                 * longest_match() inlined at each.
                 */
                next_len = hc_matchfinder_longest_match(&c->hc_mf,
-                                                       in_base,
-                                                       in_next,
+                                                       in_begin,
+                                                       in_next - in_begin,
                                                        cur_len,
                                                        in_end - in_next,
                                                        min(in_end - in_next, c->nice_match_length),
                                                        c->max_search_depth / 2,
+                                                       next_hashes,
                                                        &next_offset);
                in_next += 1;
 
@@ -685,10 +689,11 @@ xpress_compress_lazy(struct xpress_compressor * restrict c,
                        *next_chosen_item++ =
                                xpress_record_match(c, cur_len, cur_offset);
                        hc_matchfinder_skip_positions(&c->hc_mf,
-                                                     in_base,
-                                                     in_next,
-                                                     in_end,
-                                                     cur_len - 2);
+                                                     in_begin,
+                                                     in_next - in_begin,
+                                                     in_end - in_begin,
+                                                     cur_len - 2,
+                                                     next_hashes);
                        in_next += cur_len - 2;
                        continue;
                }
@@ -729,13 +734,13 @@ xpress_update_costs(struct xpress_compressor *c)
  */
 static void
 xpress_tally_item_list(struct xpress_compressor *c,
-                      struct xpress_optimum_node *end_optimum_ptr)
+                      struct xpress_optimum_node *end_node)
 {
-       struct xpress_optimum_node *cur_optimum_ptr = c->optimum_nodes;
+       struct xpress_optimum_node *cur_node = c->optimum_nodes;
 
        do {
-               unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK;
-               unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT;
+               unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
+               unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
 
                if (length == 1) {
                        /* Literal  */
@@ -745,19 +750,19 @@ xpress_tally_item_list(struct xpress_compressor *c,
                } else {
                        /* Match  */
                        unsigned adjusted_len;
-                       unsigned offset_high_bit;
+                       unsigned log2_offset;
                        unsigned len_hdr;
                        unsigned sym;
 
                        adjusted_len = length - XPRESS_MIN_MATCH_LEN;
-                       offset_high_bit = fls32(offset);
+                       log2_offset = bsr32(offset);
                        len_hdr = min(0xF, adjusted_len);
-                       sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr);
+                       sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr);
 
                        c->freqs[sym]++;
                }
-               cur_optimum_ptr += length;
-       } while (cur_optimum_ptr != end_optimum_ptr);
+               cur_node += length;
+       } while (cur_node != end_node);
 }
 
 /*
@@ -776,10 +781,10 @@ static void
 xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                          struct lz_match *end_cache_ptr)
 {
-       struct xpress_optimum_node *cur_optimum_ptr = c->optimum_nodes + in_nbytes;
+       struct xpress_optimum_node *cur_node = c->optimum_nodes + in_nbytes;
        struct lz_match *cache_ptr = end_cache_ptr;
 
-       cur_optimum_ptr->cost_to_end = 0;
+       cur_node->cost_to_end = 0;
        do {
                unsigned literal;
                u32 best_item;
@@ -788,7 +793,7 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                struct lz_match *match;
                unsigned len;
 
-               cur_optimum_ptr--;
+               cur_node--;
                cache_ptr--;
 
                literal = cache_ptr->offset;
@@ -796,14 +801,14 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                /* Consider coding a literal.  */
                best_item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
                best_cost_to_end = c->costs[literal] +
-                                  (cur_optimum_ptr + 1)->cost_to_end;
+                                  (cur_node + 1)->cost_to_end;
 
                num_matches = cache_ptr->length;
 
                if (num_matches == 0) {
                        /* No matches; the only choice is the literal.  */
-                       cur_optimum_ptr->cost_to_end = best_cost_to_end;
-                       cur_optimum_ptr->item = best_item;
+                       cur_node->cost_to_end = best_cost_to_end;
+                       cur_node->item = best_item;
                        continue;
                }
 
@@ -822,12 +827,12 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                        /* All lengths are small.  Optimize accordingly.  */
                        do {
                                unsigned offset;
-                               unsigned offset_high_bit;
+                               unsigned log2_offset;
                                u32 offset_cost;
 
                                offset = match->offset;
-                               offset_high_bit = fls32(offset);
-                               offset_cost = offset_high_bit;
+                               log2_offset = bsr32(offset);
+                               offset_cost = log2_offset;
                                do {
                                        unsigned len_hdr;
                                        unsigned sym;
@@ -835,10 +840,10 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
 
                                        len_hdr = len - XPRESS_MIN_MATCH_LEN;
                                        sym = XPRESS_NUM_CHARS +
-                                             ((offset_high_bit << 4) | len_hdr);
+                                             ((log2_offset << 4) | len_hdr);
                                        cost_to_end =
                                                offset_cost + c->costs[sym] +
-                                               (cur_optimum_ptr + len)->cost_to_end;
+                                               (cur_node + len)->cost_to_end;
                                        if (cost_to_end < best_cost_to_end) {
                                                best_cost_to_end = cost_to_end;
                                                best_item =
@@ -851,12 +856,12 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                        /* Some lengths are big.  */
                        do {
                                unsigned offset;
-                               unsigned offset_high_bit;
+                               unsigned log2_offset;
                                u32 offset_cost;
 
                                offset = match->offset;
-                               offset_high_bit = fls32(offset);
-                               offset_cost = offset_high_bit;
+                               log2_offset = bsr32(offset);
+                               offset_cost = log2_offset;
                                do {
                                        unsigned adjusted_len;
                                        unsigned len_hdr;
@@ -866,10 +871,10 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                                        adjusted_len = len - XPRESS_MIN_MATCH_LEN;
                                        len_hdr = min(adjusted_len, 0xF);
                                        sym = XPRESS_NUM_CHARS +
-                                             ((offset_high_bit << 4) | len_hdr);
+                                             ((log2_offset << 4) | len_hdr);
                                        cost_to_end =
                                                offset_cost + c->costs[sym] +
-                                               (cur_optimum_ptr + len)->cost_to_end;
+                                               (cur_node + len)->cost_to_end;
                                        if (adjusted_len >= 0xF) {
                                                cost_to_end += 8;
                                                if (adjusted_len - 0xF >= 0xFF)
@@ -885,9 +890,9 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes,
                        } while (++match != cache_ptr);
                }
                cache_ptr -= num_matches;
-               cur_optimum_ptr->cost_to_end = best_cost_to_end;
-               cur_optimum_ptr->item = best_item;
-       } while (cur_optimum_ptr != c->optimum_nodes);
+               cur_node->cost_to_end = best_cost_to_end;
+               cur_node->item = best_item;
+       } while (cur_node != c->optimum_nodes);
 }
 
 /*
@@ -899,81 +904,83 @@ static struct lz_match *
 xpress_find_matches(struct xpress_compressor * restrict c,
                    const void * restrict in, size_t in_nbytes)
 {
-       const u8 * const in_base = in;
-       const u8 *in_next = in_base;
-       const u8 * const in_end = in_base + in_nbytes;
+       const u8 * const in_begin = in;
+       const u8 *in_next = in_begin;
        struct lz_match *cache_ptr = c->match_cache;
-       unsigned long prev_hash = 0;
+       u32 next_hashes[2] = {};
+       u32 max_len = in_nbytes;
+       u32 nice_len = min(max_len, c->nice_match_length);
 
        bt_matchfinder_init(&c->bt_mf);
 
-       do {
-               unsigned num_matches;
+       for (;;) {
+               struct lz_match *matches;
+               u32 best_len;
 
                /* If we've found so many matches that the cache might overflow
                 * if we keep finding more, then stop finding matches.  This
                 * case is very unlikely.  */
-               if (unlikely(cache_ptr >= c->cache_overflow_mark)) {
-                       do {
-                               cache_ptr->length = 0;
-                               cache_ptr->offset = *in_next++;
-                               cache_ptr++;
-                       } while (in_next != in_end);
-                       return cache_ptr;
-               }
+               if (unlikely(cache_ptr >= c->cache_overflow_mark ||
+                            max_len < BT_MATCHFINDER_REQUIRED_NBYTES))
+                       break;
+
+               matches = cache_ptr;
 
                /* Find matches with the current position using the binary tree
                 * matchfinder and save them in the next available slots in
                 * the match cache.  */
-               num_matches =
+               cache_ptr =
                        bt_matchfinder_get_matches(&c->bt_mf,
-                                                  in_base,
-                                                  in_next,
-                                                  XPRESS_MIN_MATCH_LEN,
-                                                  in_end - in_next,
-                                                  min(in_end - in_next, c->nice_match_length),
+                                                  in_begin,
+                                                  in_next - in_begin,
+                                                  max_len,
+                                                  nice_len,
                                                   c->max_search_depth,
-                                                  &prev_hash,
+                                                  next_hashes,
+                                                  &best_len,
                                                   cache_ptr);
-               cache_ptr += num_matches;
-               cache_ptr->length = num_matches;
-               cache_ptr->offset = *in_next;
-               in_next++;
+               cache_ptr->length = cache_ptr - matches;
+               cache_ptr->offset = *in_next++;
                cache_ptr++;
+               max_len--;
+               nice_len = min(nice_len, max_len);
 
-               if (num_matches) {
-                       /*
-                        * If there was a very long match found, then don't
-                        * cache any matches for the bytes covered by that
-                        * match.  This avoids degenerate behavior when
-                        * compressing highly redundant data, where the number
-                        * of matches can be very large.
-                        *
-                        * This heuristic doesn't actually hurt the compression
-                        * ratio very much.  If there's a long match, then the
-                        * data must be highly compressible, so it doesn't
-                        * matter as much what we do.
-                        */
-                       unsigned best_len = cache_ptr[-2].length;
-                       if (best_len >= c->nice_match_length) {
-                               --best_len;
-                               do {
-                                       bt_matchfinder_skip_position(&c->bt_mf,
-                                                                    in_base,
-                                                                    in_next,
-                                                                    in_end,
-                                                                    min(in_end - in_next,
-                                                                        c->nice_match_length),
-                                                                    c->max_search_depth,
-                                                                    &prev_hash);
-
-                                       cache_ptr->length = 0;
-                                       cache_ptr->offset = *in_next++;
-                                       cache_ptr++;
-                               } while (--best_len);
-                       }
+               /*
+                * If there was a very long match found, then don't cache any
+                * matches for the bytes covered by that match.  This avoids
+                * degenerate behavior when compressing highly redundant data,
+                * where the number of matches can be very large.
+                *
+                * This heuristic doesn't actually hurt the compression ratio
+                * very much.  If there's a long match, then the data must be
+                * highly compressible, so it doesn't matter as much what we do.
+                */
+               if (best_len >= nice_len) {
+                       if (unlikely(best_len +
+                                    BT_MATCHFINDER_REQUIRED_NBYTES >= max_len))
+                               break;
+                       --best_len;
+                       do {
+                               bt_matchfinder_skip_position(&c->bt_mf,
+                                                            in_begin,
+                                                            in_next - in_begin,
+                                                            nice_len,
+                                                            c->max_search_depth,
+                                                            next_hashes);
+                               cache_ptr->length = 0;
+                               cache_ptr->offset = *in_next++;
+                               cache_ptr++;
+                               max_len--;
+                               nice_len = min(nice_len, max_len);
+                       } while (--best_len);
                }
-       } while (in_next != in_end);
+       }
+
+       while (max_len--) {
+               cache_ptr->length = 0;
+               cache_ptr->offset = *in_next++;
+               cache_ptr++;
+       }
 
        return cache_ptr;
 }
@@ -1019,23 +1026,40 @@ xpress_compress_near_optimal(struct xpress_compressor * restrict c,
 
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 
+static size_t
+xpress_get_compressor_size(size_t max_bufsize, unsigned compression_level)
+{
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       if (compression_level >= MIN_LEVEL_FOR_NEAR_OPTIMAL)
+               return offsetof(struct xpress_compressor, bt_mf) +
+                       bt_matchfinder_size(max_bufsize);
+#endif
+
+       return offsetof(struct xpress_compressor, hc_mf) +
+               hc_matchfinder_size(max_bufsize);
+}
+
 static u64
-xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level)
+xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level,
+                        bool destructive)
 {
-       size_t size = 0;
+       u64 size = 0;
 
        if (max_bufsize > XPRESS_MAX_BUFSIZE)
                return 0;
 
+       size += xpress_get_compressor_size(max_bufsize, compression_level);
+
        if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL ||
            !SUPPORT_NEAR_OPTIMAL_PARSING) {
-               size += offsetof(struct xpress_compressor, nonoptimal_end);
+               /* chosen_items  */
                size += max_bufsize * sizeof(struct xpress_item);
        }
 #if SUPPORT_NEAR_OPTIMAL_PARSING
        else {
-               size += offsetof(struct xpress_compressor, optimal_end);
+               /* optimum_nodes  */
                size += (max_bufsize + 1) * sizeof(struct xpress_optimum_node);
+               /* match_cache */
                size += ((max_bufsize * CACHE_RESERVE_PER_POS) +
                         XPRESS_MAX_MATCH_LEN + max_bufsize) *
                                sizeof(struct lz_match);
@@ -1046,56 +1070,44 @@ xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level)
 
 static int
 xpress_create_compressor(size_t max_bufsize, unsigned compression_level,
-                        void **c_ret)
+                        bool destructive, void **c_ret)
 {
        struct xpress_compressor *c;
 
        if (max_bufsize > XPRESS_MAX_BUFSIZE)
                return WIMLIB_ERR_INVALID_PARAM;
 
-       if (compression_level < 30) {
-               c = ALIGNED_MALLOC(offsetof(struct xpress_compressor,
-                                           nonoptimal_end),
-                                  MATCHFINDER_ALIGNMENT);
-               if (!c)
-                       return WIMLIB_ERR_NOMEM;
-               c->impl = xpress_compress_greedy;
-               c->max_search_depth = (compression_level * 24) / 16;
-               c->nice_match_length = (compression_level * 48) / 16;
-               c->chosen_items = MALLOC(max_bufsize * sizeof(struct xpress_item));
-               if (!c->chosen_items) {
-                       ALIGNED_FREE(c);
-                       return WIMLIB_ERR_NOMEM;
-               }
-       } else if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL ||
-                  !SUPPORT_NEAR_OPTIMAL_PARSING)
+       c = MALLOC(xpress_get_compressor_size(max_bufsize, compression_level));
+       if (!c)
+               goto oom0;
+
+       if (compression_level < MIN_LEVEL_FOR_NEAR_OPTIMAL ||
+           !SUPPORT_NEAR_OPTIMAL_PARSING)
        {
-               c = ALIGNED_MALLOC(offsetof(struct xpress_compressor,
-                                           nonoptimal_end),
-                                  MATCHFINDER_ALIGNMENT);
-               if (!c)
-                       return WIMLIB_ERR_NOMEM;
-
-               c->impl = xpress_compress_lazy;
-               c->max_search_depth = (compression_level * 24) / 32;
-               c->nice_match_length = (compression_level * 48) / 32;
+
                c->chosen_items = MALLOC(max_bufsize * sizeof(struct xpress_item));
-               if (!c->chosen_items) {
-                       ALIGNED_FREE(c);
-                       return WIMLIB_ERR_NOMEM;
+               if (!c->chosen_items)
+                       goto oom1;
+
+               if (compression_level < 30) {
+                       c->impl = xpress_compress_greedy;
+                       c->max_search_depth = (compression_level * 30) / 16;
+                       c->nice_match_length = (compression_level * 60) / 16;
+               } else {
+                       c->impl = xpress_compress_lazy;
+                       c->max_search_depth = (compression_level * 30) / 32;
+                       c->nice_match_length = (compression_level * 60) / 32;
+
+                       /* xpress_compress_lazy() needs max_search_depth >= 2
+                        * because it halves the max_search_depth when
+                        * attempting a lazy match, and max_search_depth cannot
+                        * be 0.  */
+                       if (c->max_search_depth < 2)
+                               c->max_search_depth = 2;
                }
        }
 #if SUPPORT_NEAR_OPTIMAL_PARSING
        else {
-               c = ALIGNED_MALLOC(offsetof(struct xpress_compressor,
-                                           optimal_end),
-                                  MATCHFINDER_ALIGNMENT);
-               if (!c)
-                       return WIMLIB_ERR_NOMEM;
-               c->impl = xpress_compress_near_optimal;
-               c->max_search_depth = (compression_level * 32) / 100;
-               c->nice_match_length = (compression_level * 50) / 100;
-               c->num_optim_passes = compression_level / 40;
 
                c->optimum_nodes = MALLOC((max_bufsize + 1) *
                                          sizeof(struct xpress_optimum_node));
@@ -1105,24 +1117,41 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level,
                if (!c->optimum_nodes || !c->match_cache) {
                        FREE(c->optimum_nodes);
                        FREE(c->match_cache);
-                       ALIGNED_FREE(c);
-                       return WIMLIB_ERR_NOMEM;
+                       goto oom1;
                }
                c->cache_overflow_mark =
                        &c->match_cache[max_bufsize * CACHE_RESERVE_PER_POS];
+
+               c->impl = xpress_compress_near_optimal;
+               c->max_search_depth = (compression_level * 28) / 100;
+               c->nice_match_length = (compression_level * 56) / 100;
+               c->num_optim_passes = compression_level / 40;
        }
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 
+       /* max_search_depth == 0 is invalid.  */
+       if (c->max_search_depth < 1)
+               c->max_search_depth = 1;
+
        *c_ret = c;
        return 0;
+
+oom1:
+       FREE(c);
+oom0:
+       return WIMLIB_ERR_NOMEM;
 }
 
 static size_t
-xpress_compress(const void *in, size_t in_nbytes,
-               void *out, size_t out_nbytes_avail, void *_c)
+xpress_compress(const void *restrict in, size_t in_nbytes,
+               void *restrict out, size_t out_nbytes_avail, void *restrict _c)
 {
        struct xpress_compressor *c = _c;
 
+       /* Don't bother trying to compress very small inputs.  */
+       if (in_nbytes < 25)
+               return 0;
+
        if (out_nbytes_avail <= XPRESS_NUM_SYMBOLS / 2 + 4)
                return 0;
 
@@ -1136,16 +1165,14 @@ xpress_free_compressor(void *_c)
 {
        struct xpress_compressor *c = _c;
 
-       if (c) {
-       #if SUPPORT_NEAR_OPTIMAL_PARSING
-               if (c->impl == xpress_compress_near_optimal) {
-                       FREE(c->optimum_nodes);
-                       FREE(c->match_cache);
-               } else
-       #endif
-                       FREE(c->chosen_items);
-               ALIGNED_FREE(c);
-       }
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+       if (c->impl == xpress_compress_near_optimal) {
+               FREE(c->optimum_nodes);
+               FREE(c->match_cache);
+       } else
+#endif
+               FREE(c->chosen_items);
+       FREE(c);
 }
 
 const struct compressor_ops xpress_compressor_ops = {