]> wimlib.net Git - wimlib/blobdiff - src/lzx-compress.c
lzx-compress.c: Minor cleanups
[wimlib] / src / lzx-compress.c
index e65b6c8329e35faa4c1b72f5bd69a86b25e77e87..cdbb63699004edbed7066f9344f6d1c6f30207d2 100644 (file)
 
 #define LZX_DIV_BLOCK_SIZE     32768
 
-#define LZX_CACHE_PER_POS      10
+#define LZX_CACHE_PER_POS      8
 
 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
-#define LZX_CACHE_SIZE (LZX_CACHE_LEN * sizeof(struct raw_match))
-
-/* Dependent on behavior of lz_bt_get_matches().  */
+#define LZX_CACHE_SIZE (LZX_CACHE_LEN * sizeof(struct lz_match))
 #define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
 
 /* Codewords for the LZX main, length, and aligned offset Huffman codes  */
@@ -281,7 +279,7 @@ struct lzx_lens {
  *
  * If a codeword has zero frequency, it must still be assigned some nonzero cost
  * --- generally a high cost, since even if it gets used in the next iteration,
- * it probably will not be used very times.  */
+ * it probably will not be used very many times.  */
 struct lzx_costs {
        u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
        u8 len[LZX_LENCODE_NUM_SYMBOLS];
@@ -296,13 +294,13 @@ struct lzx_codes {
 
 /* Tables for tallying symbol frequencies in the three LZX alphabets  */
 struct lzx_freqs {
-       input_idx_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
-       input_idx_t len[LZX_LENCODE_NUM_SYMBOLS];
-       input_idx_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
+       u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
+       u32 len[LZX_LENCODE_NUM_SYMBOLS];
+       u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 };
 
 /* LZX intermediate match/literal format  */
-struct lzx_match {
+struct lzx_item {
        /* Bit     Description
         *
         * 31      1 if a match, 0 if a literal.
@@ -327,16 +325,16 @@ struct lzx_block_spec {
        int block_type;
 
        /* 0-based position in the window at which this block starts.  */
-       input_idx_t window_pos;
+       u32 window_pos;
 
        /* The number of bytes of uncompressed data this block represents.  */
-       input_idx_t block_size;
+       u32 block_size;
 
        /* The match/literal sequence for this block.  */
-       struct lzx_match *chosen_matches;
+       struct lzx_item *chosen_items;
 
-       /* The length of the @chosen_matches sequence.  */
-       input_idx_t num_chosen_matches;
+       /* The length of the @chosen_items sequence.  */
+       u32 num_chosen_items;
 
        /* Huffman codes for this block.  */
        struct lzx_codes codes;
@@ -365,10 +363,10 @@ struct lzx_compressor {
 
        /* Number of bytes of data to be compressed, which is the number of
         * bytes of data in @window that are actually valid.  */
-       input_idx_t window_size;
+       u32 window_size;
 
        /* Allocated size of the @window.  */
-       input_idx_t max_window_size;
+       u32 max_window_size;
 
        /* Number of symbols in the main alphabet (depends on the
         * @max_window_size since it determines the maximum allowed offset).  */
@@ -379,7 +377,7 @@ struct lzx_compressor {
 
        /* Space for the sequences of matches/literals that were chosen for each
         * block.  */
-       struct lzx_match *chosen_matches;
+       struct lzx_item *chosen_items;
 
        /* Information about the LZX blocks the preprocessed input was divided
         * into.  */
@@ -399,27 +397,27 @@ struct lzx_compressor {
        struct lzx_costs costs;
 
        /* Fast algorithm only:  Array of hash table links.  */
-       input_idx_t *prev_tab;
+       u32 *prev_tab;
 
        /* Slow algorithm only: Binary tree match-finder.  */
        struct lz_bt mf;
 
        /* Position in window of next match to return.  */
-       input_idx_t match_window_pos;
+       u32 match_window_pos;
 
        /* The end-of-block position.  We can't allow any matches to span this
         * position.  */
-       input_idx_t match_window_end;
+       u32 match_window_end;
 
        /* Matches found by the match-finder are cached in the following array
         * to achieve a slight speedup when the same matches are needed on
         * subsequent passes.  This is suboptimal because different matches may
         * be preferred with different cost models, but seems to be a worthwhile
         * speedup.  */
-       struct raw_match *cached_matches;
-       struct raw_match *cache_ptr;
+       struct lz_match *cached_matches;
+       struct lz_match *cache_ptr;
        bool matches_cached;
-       struct raw_match *cache_limit;
+       struct lz_match *cache_limit;
 
        /* Match-chooser state.
         * When matches have been chosen, optimum_cur_idx is set to the position
@@ -455,22 +453,22 @@ struct lzx_mc_pos_data {
                        /* Position of the start of the match or literal that
                         * was taken to get to this position in the approximate
                         * minimum-cost parse.  */
-                       input_idx_t link;
+                       u32 link;
 
                        /* Offset (as in an LZ (length, offset) pair) of the
                         * match or literal that was taken to get to this
                         * position in the approximate minimum-cost parse.  */
-                       input_idx_t match_offset;
+                       u32 match_offset;
                } prev;
                struct {
                        /* Position at which the match or literal starting at
                         * this position ends in the minimum-cost parse.  */
-                       input_idx_t link;
+                       u32 link;
 
                        /* Offset (as in an LZ (length, offset) pair) of the
                         * match or literal starting at this position in the
                         * approximate minimum-cost parse.  */
-                       input_idx_t match_offset;
+                       u32 match_offset;
                } next;
        };
 
@@ -559,7 +557,7 @@ lzx_make_huffman_codes(const struct lzx_freqs *freqs,
  */
 static void
 lzx_write_match(struct output_bitstream *out, int block_type,
-               struct lzx_match match, const struct lzx_codes *codes)
+               struct lzx_item match, const struct lzx_codes *codes)
 {
        /* low 8 bits are the match length minus 2 */
        unsigned match_len_minus_2 = match.data & 0xff;
@@ -645,7 +643,7 @@ static unsigned
 lzx_build_precode(const u8 lens[restrict],
                  const u8 prev_lens[restrict],
                  const unsigned num_syms,
-                 input_idx_t precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS],
+                 u32 precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS],
                  u8 output_syms[restrict num_syms],
                  u8 precode_lens[restrict LZX_PRECODE_NUM_SYMBOLS],
                  u32 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS],
@@ -814,7 +812,7 @@ lzx_write_compressed_code(struct output_bitstream *out,
                          const u8 prev_lens[restrict],
                          unsigned num_syms)
 {
-       input_idx_t precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
+       u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
        u8 output_syms[num_syms];
        u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
        u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
@@ -885,12 +883,12 @@ lzx_write_compressed_code(struct output_bitstream *out,
 static void
 lzx_write_matches_and_literals(struct output_bitstream *ostream,
                               int block_type,
-                              const struct lzx_match match_tab[],
+                              const struct lzx_item match_tab[],
                               unsigned match_count,
                               const struct lzx_codes *codes)
 {
        for (unsigned i = 0; i < match_count; i++) {
-               struct lzx_match match = match_tab[i];
+               struct lzx_item match = match_tab[i];
 
                /* The high bit of the 32-bit intermediate representation
                 * indicates whether the item is an actual LZ-style match (1) or
@@ -945,8 +943,8 @@ lzx_write_compressed_block(int block_type,
                           unsigned block_size,
                           unsigned max_window_size,
                           unsigned num_main_syms,
-                          struct lzx_match * chosen_matches,
-                          unsigned num_chosen_matches,
+                          struct lzx_item * chosen_items,
+                          unsigned num_chosen_items,
                           const struct lzx_codes * codes,
                           const struct lzx_codes * prev_codes,
                           struct output_bitstream * ostream)
@@ -1024,7 +1022,7 @@ lzx_write_compressed_block(int block_type,
 
        /* Write the actual matches and literals.  */
        lzx_write_matches_and_literals(ostream, block_type,
-                                      chosen_matches, num_chosen_matches,
+                                      chosen_items, num_chosen_items,
                                       codes);
 
        LZX_DEBUG("Done writing block.");
@@ -1039,17 +1037,17 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea
        for (unsigned i = 0; i < ctx->num_blocks; i++) {
                const struct lzx_block_spec *spec = &ctx->block_specs[i];
 
-               LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_matches=%u)...",
+               LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_items=%u)...",
                          i + 1, ctx->num_blocks,
                          spec->block_type, spec->block_size,
-                         spec->num_chosen_matches);
+                         spec->num_chosen_items);
 
                lzx_write_compressed_block(spec->block_type,
                                           spec->block_size,
                                           ctx->max_window_size,
                                           ctx->num_main_syms,
-                                          spec->chosen_matches,
-                                          spec->num_chosen_matches,
+                                          spec->chosen_items,
+                                          spec->num_chosen_items,
                                           &spec->codes,
                                           prev_codes,
                                           ostream);
@@ -1060,7 +1058,7 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea
 
 /* Constructs an LZX match from a literal byte and updates the main code symbol
  * frequencies.  */
-static u32
+static inline u32
 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
 {
        freqs->main[lit]++;
@@ -1071,7 +1069,7 @@ lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
  * queue and the frequency of symbols in the main, length, and aligned offset
  * alphabets.  The return value is a 32-bit number that provides the match in an
  * intermediate representation documented below.  */
-static u32
+static inline u32
 lzx_tally_match(unsigned match_len, u32 match_offset,
                struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
 {
@@ -1123,7 +1121,7 @@ lzx_tally_match(unsigned match_len, u32 match_offset,
                freqs->aligned[position_footer & 7]++;
 
        /* Pack the position slot, position footer, and match length into an
-        * intermediate representation.  See `struct lzx_match' for details.
+        * intermediate representation.  See `struct lzx_item' for details.
         */
        LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
        LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
@@ -1141,7 +1139,7 @@ lzx_tally_match(unsigned match_len, u32 match_offset,
 struct lzx_record_ctx {
        struct lzx_freqs freqs;
        struct lzx_lru_queue queue;
-       struct lzx_match *matches;
+       struct lzx_item *matches;
 };
 
 static void
@@ -1172,7 +1170,7 @@ lzx_literal_cost(u8 c, const struct lzx_costs * costs)
  * well as costs for the codewords in the main, length, and aligned Huffman
  * codes, return the approximate number of bits it will take to represent this
  * match in the compressed output.  Take into account the match offset LRU
- * queue and optionally update it.  */
+ * queue and also updates it.  */
 static u32
 lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs,
               struct lzx_lru_queue *queue)
@@ -1250,21 +1248,25 @@ lzx_set_costs(struct lzx_compressor * ctx, const struct lzx_lens * lens)
  * value is the number of matches found.  */
 static unsigned
 lzx_get_matches(struct lzx_compressor *ctx,
-               const struct raw_match **matches_ret)
+               const struct lz_match **matches_ret)
 {
-       struct raw_match *cache_ptr;
-       struct raw_match *matches;
+       struct lz_match *cache_ptr;
+       struct lz_match *matches;
        unsigned num_matches;
 
        LZX_ASSERT(ctx->match_window_pos < ctx->match_window_end);
 
        cache_ptr = ctx->cache_ptr;
        matches = cache_ptr + 1;
-       if (ctx->matches_cached) {
-               num_matches = cache_ptr->len;
+       if (likely(cache_ptr <= ctx->cache_limit)) {
+               if (ctx->matches_cached) {
+                       num_matches = cache_ptr->len;
+               } else {
+                       num_matches = lz_bt_get_matches(&ctx->mf, matches);
+                       cache_ptr->len = num_matches;
+               }
        } else {
-               num_matches = lz_bt_get_matches(&ctx->mf, matches);
-               cache_ptr->len = num_matches;
+               num_matches = 0;
        }
 
        /* Don't allow matches to span the end of an LZX block.  */
@@ -1322,18 +1324,18 @@ lzx_get_matches(struct lzx_compressor *ctx,
 static void
 lzx_skip_bytes(struct lzx_compressor *ctx, unsigned n)
 {
-       struct raw_match *cache_ptr;
+       struct lz_match *cache_ptr;
 
        LZX_ASSERT(n <= ctx->match_window_end - ctx->match_window_pos);
 
        cache_ptr = ctx->cache_ptr;
        ctx->match_window_pos += n;
        if (ctx->matches_cached) {
-               while (n--)
+               while (n-- && cache_ptr <= ctx->cache_limit)
                        cache_ptr += 1 + cache_ptr->len;
        } else {
                lz_bt_skip_positions(&ctx->mf, n);
-               while (n--) {
+               while (n-- && cache_ptr <= ctx->cache_limit) {
                        cache_ptr->len = 0;
                        cache_ptr += 1;
                }
@@ -1347,7 +1349,7 @@ lzx_skip_bytes(struct lzx_compressor *ctx, unsigned n)
  *
  * Returns the first match in the list.
  */
-static struct raw_match
+static struct lz_match
 lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos)
 {
        unsigned prev_link, saved_prev_link;
@@ -1373,7 +1375,7 @@ lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos)
 
        ctx->optimum_cur_idx = ctx->optimum[0].next.link;
 
-       return (struct raw_match)
+       return (struct lz_match)
                { .len = ctx->optimum_cur_idx,
                  .offset = ctx->optimum[0].next.match_offset,
                };
@@ -1441,13 +1443,12 @@ lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos)
  * The return value is a (length, offset) pair specifying the match or literal
  * chosen.  For literals, the length is 0 or 1 and the offset is meaningless.
  */
-static struct raw_match
+static struct lz_match
 lzx_get_near_optimal_match(struct lzx_compressor *ctx)
 {
        unsigned num_matches;
-       const struct raw_match *matches;
-       const struct raw_match *matchptr;
-       struct raw_match match;
+       const struct lz_match *matches;
+       struct lz_match match;
        unsigned longest_len;
        unsigned longest_rep_len;
        u32 longest_rep_offset;
@@ -1492,7 +1493,7 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
        /* If there's a long match with a recent offset, take it.  */
        if (longest_rep_len >= ctx->params.alg_params.slow.nice_match_length) {
                lzx_skip_bytes(ctx, longest_rep_len);
-               return (struct raw_match) {
+               return (struct lz_match) {
                        .len = longest_rep_len,
                        .offset = longest_rep_offset,
                };
@@ -1520,18 +1521,54 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
        ctx->optimum[1].prev.link = 0;
 
        /* Calculate the cost to reach any position up to and including that
-        * reached by the longest match.  */
-       matchptr = matches;
-       for (unsigned len = 2; len <= longest_len; len++) {
-               u32 offset = matchptr->offset;
-
-               ctx->optimum[len].queue = ctx->queue;
-               ctx->optimum[len].prev.link = 0;
-               ctx->optimum[len].prev.match_offset = offset;
-               ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs,
-                                                       &ctx->optimum[len].queue);
-               if (len == matchptr->len)
-                       matchptr++;
+        * reached by the longest match.
+        *
+        * Note: We consider only the lowest-offset match that reaches each
+        * position.
+        *
+        * Note: Some of the cost calculation stays the same for each offset,
+        * regardless of how many lengths it gets used for.  Therefore, to
+        * improve performance, we hand-code the cost calculation instead of
+        * calling lzx_match_cost() to do a from-scratch cost evaluation at each
+        * length.  */
+       for (unsigned i = 0, len = 2; i < num_matches; i++) {
+               u32 offset;
+               struct lzx_lru_queue queue;
+               u32 position_cost;
+               unsigned position_slot;
+               unsigned num_extra_bits;
+
+               offset = matches[i].offset;
+               queue = ctx->queue;
+               position_cost = 0;
+
+               position_slot = lzx_get_position_slot(offset, &queue);
+               num_extra_bits = lzx_get_num_extra_bits(position_slot);
+               if (num_extra_bits >= 3) {
+                       position_cost += num_extra_bits - 3;
+                       position_cost += ctx->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
+               } else {
+                       position_cost += num_extra_bits;
+               }
+
+               do {
+                       unsigned len_header;
+                       unsigned main_symbol;
+                       u32 cost;
+
+                       cost = position_cost;
+
+                       len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
+                       main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
+                       cost += ctx->costs.main[main_symbol];
+                       if (len_header == LZX_NUM_PRIMARY_LENS)
+                               cost += ctx->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
+
+                       ctx->optimum[len].queue = queue;
+                       ctx->optimum[len].prev.link = 0;
+                       ctx->optimum[len].prev.match_offset = offset;
+                       ctx->optimum[len].cost = cost;
+               } while (++len <= matches[i].len);
        }
        end_pos = longest_len;
 
@@ -1681,25 +1718,59 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
                        ctx->optimum[cur_pos + 1].prev.link = cur_pos;
                }
 
-               /* Consider coding a match.  */
-               matchptr = matches;
-               for (unsigned len = 2; len <= longest_len; len++) {
+               /* Consider coding a match.
+                *
+                * The hard-coded cost calculation is done for the same reason
+                * stated in the comment for the similar loop earlier.
+                * Actually, it is *this* one that has the biggest effect on
+                * performance; overall LZX compression is > 10% faster with
+                * this code compared to calling lzx_match_cost() with each
+                * length.  */
+               for (unsigned i = 0, len = 2; i < num_matches; i++) {
                        u32 offset;
                        struct lzx_lru_queue queue;
+                       u32 position_cost;
+                       unsigned position_slot;
+                       unsigned num_extra_bits;
 
-                       offset = matchptr->offset;
+                       offset = matches[i].offset;
                        queue = ctx->optimum[cur_pos].queue;
-
-                       cost = ctx->optimum[cur_pos].cost +
-                               lzx_match_cost(len, offset, &ctx->costs, &queue);
-                       if (cost < ctx->optimum[cur_pos + len].cost) {
-                               ctx->optimum[cur_pos + len].queue = queue;
-                               ctx->optimum[cur_pos + len].prev.link = cur_pos;
-                               ctx->optimum[cur_pos + len].prev.match_offset = offset;
-                               ctx->optimum[cur_pos + len].cost = cost;
+                       position_cost = ctx->optimum[cur_pos].cost;
+
+                       position_slot = lzx_get_position_slot(offset, &queue);
+                       num_extra_bits = lzx_get_num_extra_bits(position_slot);
+                       if (num_extra_bits >= 3) {
+                               position_cost += num_extra_bits - 3;
+                               position_cost += ctx->costs.aligned[
+                                               (offset + LZX_OFFSET_OFFSET) & 7];
+                       } else {
+                               position_cost += num_extra_bits;
                        }
-                       if (len == matchptr->len)
-                               matchptr++;
+
+                       do {
+                               unsigned len_header;
+                               unsigned main_symbol;
+                               u32 cost;
+
+                               cost = position_cost;
+
+                               len_header = min(len - LZX_MIN_MATCH_LEN,
+                                                LZX_NUM_PRIMARY_LENS);
+                               main_symbol = ((position_slot << 3) | len_header) +
+                                               LZX_NUM_CHARS;
+                               cost += ctx->costs.main[main_symbol];
+                               if (len_header == LZX_NUM_PRIMARY_LENS) {
+                                       cost += ctx->costs.len[len -
+                                                       LZX_MIN_MATCH_LEN -
+                                                       LZX_NUM_PRIMARY_LENS];
+                               }
+                               if (cost < ctx->optimum[cur_pos + len].cost) {
+                                       ctx->optimum[cur_pos + len].queue = queue;
+                                       ctx->optimum[cur_pos + len].prev.link = cur_pos;
+                                       ctx->optimum[cur_pos + len].prev.match_offset = offset;
+                                       ctx->optimum[cur_pos + len].cost = cost;
+                               }
+                       } while (++len <= matches[i].len);
                }
 
                if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
@@ -1786,61 +1857,82 @@ lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec,
        const struct lzx_lru_queue orig_queue = ctx->queue;
        unsigned num_passes_remaining = num_passes;
        struct lzx_freqs freqs;
+       const u8 *window_ptr;
+       const u8 *window_end;
+       struct lzx_item *next_chosen_match;
+       struct lz_match lz_match;
+       struct lzx_item lzx_item;
 
        LZX_ASSERT(num_passes >= 1);
        LZX_ASSERT(lz_bt_get_position(&ctx->mf) == spec->window_pos);
 
        ctx->match_window_end = spec->window_pos + spec->block_size;
-       spec->chosen_matches = &ctx->chosen_matches[spec->window_pos];
        ctx->matches_cached = false;
 
        /* The first optimal parsing pass is done using the cost model already
         * set in ctx->costs.  Each later pass is done using a cost model
-        * computed from the previous pass.  */
-       do {
-               const u8 *window_ptr;
-               const u8 *window_end;
-               struct lzx_match *next_chosen_match;
+        * computed from the previous pass.
+        *
+        * To improve performance we only generate the array containing the
+        * matches and literals in intermediate form on the final pass.  */
 
-               --num_passes_remaining;
+       while (--num_passes_remaining) {
                ctx->match_window_pos = spec->window_pos;
                ctx->cache_ptr = ctx->cached_matches;
                memset(&freqs, 0, sizeof(freqs));
                window_ptr = &ctx->window[spec->window_pos];
                window_end = window_ptr + spec->block_size;
-               next_chosen_match = spec->chosen_matches;
 
                while (window_ptr != window_end) {
-                       struct raw_match raw_match;
-                       struct lzx_match lzx_match;
 
-                       raw_match = lzx_get_near_optimal_match(ctx);
+                       lz_match = lzx_get_near_optimal_match(ctx);
 
-                       LZX_ASSERT(!(raw_match.len == LZX_MIN_MATCH_LEN &&
-                                    raw_match.offset == ctx->max_window_size -
+                       LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
+                                    lz_match.offset == ctx->max_window_size -
                                                         LZX_MIN_MATCH_LEN));
-                       if (raw_match.len >= LZX_MIN_MATCH_LEN) {
-                               lzx_match.data = lzx_tally_match(raw_match.len,
-                                                                raw_match.offset,
-                                                                &freqs,
-                                                                &ctx->queue);
-                               window_ptr += raw_match.len;
+                       if (lz_match.len >= LZX_MIN_MATCH_LEN) {
+                               lzx_tally_match(lz_match.len, lz_match.offset,
+                                               &freqs, &ctx->queue);
+                               window_ptr += lz_match.len;
                        } else {
-                               lzx_match.data = lzx_tally_literal(*window_ptr,
-                                                                  &freqs);
+                               lzx_tally_literal(*window_ptr, &freqs);
                                window_ptr += 1;
                        }
-                       *next_chosen_match++ = lzx_match;
                }
-               spec->num_chosen_matches = next_chosen_match - spec->chosen_matches;
                lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
-               if (num_passes_remaining) {
-                       lzx_set_costs(ctx, &spec->codes.lens);
-                       ctx->queue = orig_queue;
-                       ctx->matches_cached = true;
-               }
-       } while (num_passes_remaining);
+               lzx_set_costs(ctx, &spec->codes.lens);
+               ctx->queue = orig_queue;
+               ctx->matches_cached = true;
+       }
 
+       ctx->match_window_pos = spec->window_pos;
+       ctx->cache_ptr = ctx->cached_matches;
+       memset(&freqs, 0, sizeof(freqs));
+       window_ptr = &ctx->window[spec->window_pos];
+       window_end = window_ptr + spec->block_size;
+
+       spec->chosen_items = &ctx->chosen_items[spec->window_pos];
+       next_chosen_match = spec->chosen_items;
+
+       while (window_ptr != window_end) {
+               lz_match = lzx_get_near_optimal_match(ctx);
+
+               LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
+                            lz_match.offset == ctx->max_window_size -
+                                                LZX_MIN_MATCH_LEN));
+               if (lz_match.len >= LZX_MIN_MATCH_LEN) {
+                       lzx_item.data = lzx_tally_match(lz_match.len,
+                                                        lz_match.offset,
+                                                        &freqs, &ctx->queue);
+                       window_ptr += lz_match.len;
+               } else {
+                       lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
+                       window_ptr += 1;
+               }
+               *next_chosen_match++ = lzx_item;
+       }
+       spec->num_chosen_items = next_chosen_match - spec->chosen_items;
+       lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
        spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
 }
 
@@ -1891,7 +1983,7 @@ lzx_prepare_blocks(struct lzx_compressor * ctx)
  *
  *     ctx->block_specs[]
  *     ctx->num_blocks
- *     ctx->chosen_matches[]
+ *     ctx->chosen_items[]
  */
 static void
 lzx_prepare_block_fast(struct lzx_compressor * ctx)
@@ -1917,7 +2009,7 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx)
        /* Initialize symbol frequencies and match offset LRU queue.  */
        memset(&record_ctx.freqs, 0, sizeof(struct lzx_freqs));
        lzx_lru_queue_init(&record_ctx.queue);
-       record_ctx.matches = ctx->chosen_matches;
+       record_ctx.matches = ctx->chosen_items;
 
        /* Determine series of matches/literals to output.  */
        lz_analyze_block(ctx->window,
@@ -1933,8 +2025,8 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx)
        spec->block_type = LZX_BLOCKTYPE_ALIGNED;
        spec->window_pos = 0;
        spec->block_size = ctx->window_size;
-       spec->num_chosen_matches = (record_ctx.matches - ctx->chosen_matches);
-       spec->chosen_matches = ctx->chosen_matches;
+       spec->num_chosen_items = (record_ctx.matches - ctx->chosen_items);
+       spec->chosen_items = ctx->chosen_items;
        lzx_make_huffman_codes(&record_ctx.freqs, &spec->codes,
                               ctx->num_main_syms);
        ctx->num_blocks = 1;
@@ -1994,7 +2086,7 @@ lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
 
        LZX_DEBUG("Flushing bitstream...");
        compressed_size = flush_output_bitstream(&ostream);
-       if (compressed_size == ~(input_idx_t)0) {
+       if (compressed_size == (u32)~0UL) {
                LZX_DEBUG("Data did not compress to %zu bytes or less!",
                          compressed_size_avail);
                return 0;
@@ -2054,7 +2146,7 @@ lzx_free_compressor(void *_ctx)
        struct lzx_compressor *ctx = _ctx;
 
        if (ctx) {
-               FREE(ctx->chosen_matches);
+               FREE(ctx->chosen_items);
                FREE(ctx->cached_matches);
                FREE(ctx->optimum);
                lz_bt_destroy(&ctx->mf);
@@ -2128,8 +2220,6 @@ lzx_create_compressor(size_t window_size,
        if (!lzx_window_size_valid(window_size))
                return WIMLIB_ERR_INVALID_PARAM;
 
-       LZX_DEBUG("Allocating memory.");
-
        ctx = CALLOC(1, sizeof(struct lzx_compressor));
        if (ctx == NULL)
                goto oom;
@@ -2170,7 +2260,7 @@ lzx_create_compressor(size_t window_size,
                                       min(params->alg_params.slow.nice_match_length,
                                           LZX_MAX_MATCH_LEN)) *
                                                sizeof(ctx->optimum[0]));
-               if (!ctx->optimum)
+               if (ctx->optimum == NULL)
                        goto oom;
        }
 
@@ -2182,8 +2272,8 @@ lzx_create_compressor(size_t window_size,
                                   LZX_CACHE_LEN - (LZX_MAX_MATCHES_PER_POS + 1);
        }
 
-       ctx->chosen_matches = MALLOC(window_size * sizeof(ctx->chosen_matches[0]));
-       if (ctx->chosen_matches == NULL)
+       ctx->chosen_items = MALLOC(window_size * sizeof(ctx->chosen_items[0]));
+       if (ctx->chosen_items == NULL)
                goto oom;
 
        memcpy(&ctx->params, params, sizeof(struct wimlib_lzx_compressor_params));
@@ -2215,7 +2305,7 @@ lzx_get_needed_memory(size_t max_block_size,
                sizeof(((struct lzx_compressor*)0)->block_specs[0]);
 
        if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
-               size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_matches[0]);
+               size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_items[0]);
                size += lz_bt_get_needed_memory(max_block_size);
                size += (LZX_OPTIM_ARRAY_SIZE +
                         min(params->alg_params.slow.nice_match_length,