diff --git a/src/lzx-compress.c b/src/lzx-compress.c
index c8f61099f0f92c98b9ec9ea47790fc3523eabc58..e792fb1e1043c3e5a18e4289c6e10bcdec594b2f 100644
--- a/src/lzx-compress.c
+++ b/src/lzx-compress.c
@@ -1058,7 +1058,7 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea
 
 /* Constructs an LZX match from a literal byte and updates the main code symbol
  * frequencies.  */
-static u32
+static inline u32
 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
 {
        freqs->main[lit]++;
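
The tally helpers above feed a classic two-pass entropy-coding scheme: the
parser first counts how often each symbol would be used, and Huffman codes
are then built from those counts (see lzx_make_huffman_codes() further
down).  A minimal standalone sketch of that pattern, using toy names rather
than wimlib's real types:

        #include <stdint.h>
        #include <string.h>

        #define NUM_LITERALS 256

        struct toy_freqs {
                uint32_t main[NUM_LITERALS];
        };

        /* Analogous to lzx_tally_literal(): bump the literal's frequency
         * and return the literal itself as its intermediate form.  */
        static inline uint32_t toy_tally_literal(uint8_t lit,
                                                 struct toy_freqs *freqs)
        {
                freqs->main[lit]++;
                return lit;
        }

        /* First pass over a buffer: gather the symbol statistics from
         * which Huffman codes would subsequently be built.  */
        static void toy_tally_buffer(const uint8_t *buf, size_t len,
                                     struct toy_freqs *freqs)
        {
                memset(freqs, 0, sizeof(*freqs));
                for (size_t i = 0; i < len; i++)
                        toy_tally_literal(buf[i], freqs);
        }
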
@@ -1069,7 +1069,7 @@ lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
  * queue and the frequency of symbols in the main, length, and aligned offset
  * alphabets.  The return value is a 32-bit number that provides the match in an
  * intermediate representation documented below.  */
-static u32
+static inline u32
 lzx_tally_match(unsigned match_len, u32 match_offset,
                struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
 {
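
The "intermediate representation documented below" is a single 32-bit word
per match or literal.  The actual field layout is the one documented inside
lzx_tally_match() and is not reproduced here; the following packing is
purely hypothetical, only to illustrate round-tripping a match through one
u32:

        #include <stdint.h>

        #define TOY_MATCH_FLAG 0x80000000u

        /* Hypothetical layout (NOT the real LZX one): bit 31 = match flag,
         * bits 30-25 = position slot, bits 24-8 = position footer,
         * bits 7-0 = length.  */
        static inline uint32_t toy_pack_match(unsigned len,
                                              unsigned pos_slot,
                                              unsigned pos_footer)
        {
                return TOY_MATCH_FLAG | (pos_slot << 25) |
                       (pos_footer << 8) | len;
        }

        static inline void toy_unpack_match(uint32_t data, unsigned *len,
                                            unsigned *pos_slot,
                                            unsigned *pos_footer)
        {
                *len = data & 0xff;
                *pos_footer = (data >> 8) & 0x1ffff;
                *pos_slot = (data >> 25) & 0x3f;
        }
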
@@ -1170,7 +1170,7 @@ lzx_literal_cost(u8 c, const struct lzx_costs * costs)
  * well as costs for the codewords in the main, length, and aligned Huffman
  * codes, return the approximate number of bits it will take to represent this
  * match in the compressed output.  Take into account the match offset LRU
- * queue and optionally update it.  */
+ * queue and also update it.  */
 static u32
 lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs,
               struct lzx_lru_queue *queue)
@@ -1448,7 +1448,6 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
 {
        unsigned num_matches;
        const struct raw_match *matches;
-       const struct raw_match *matchptr;
        struct raw_match match;
        unsigned longest_len;
        unsigned longest_rep_len;
@@ -1522,18 +1521,54 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
        ctx->optimum[1].prev.link = 0;
 
        /* Calculate the cost to reach any position up to and including that
-        * reached by the longest match.  */
-       matchptr = matches;
-       for (unsigned len = 2; len <= longest_len; len++) {
-               u32 offset = matchptr->offset;
-
-               ctx->optimum[len].queue = ctx->queue;
-               ctx->optimum[len].prev.link = 0;
-               ctx->optimum[len].prev.match_offset = offset;
-               ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs,
-                                                       &ctx->optimum[len].queue);
-               if (len == matchptr->len)
-                       matchptr++;
+        * reached by the longest match.
+        *
+        * Note: We consider only the lowest-offset match that reaches each
+        * position.
+        *
+        * Note: Part of the cost calculation is the same for a given offset,
+        * no matter how many lengths that offset is used for.  Therefore, to
+        * improve performance, we hand-code the cost calculation instead of
+        * calling lzx_match_cost() to do a from-scratch cost evaluation at
+        * each length.  */
+       for (unsigned i = 0, len = 2; i < num_matches; i++) {
+               u32 offset;
+               struct lzx_lru_queue queue;
+               u32 position_cost;
+               unsigned position_slot;
+               unsigned num_extra_bits;
+
+               offset = matches[i].offset;
+               queue = ctx->queue;
+               position_cost = 0;
+
+               position_slot = lzx_get_position_slot(offset, &queue);
+               num_extra_bits = lzx_get_num_extra_bits(position_slot);
+               if (num_extra_bits >= 3) {
+                       position_cost += num_extra_bits - 3;
+                       position_cost += ctx->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
+               } else {
+                       position_cost += num_extra_bits;
+               }
+
+               do {
+                       unsigned len_header;
+                       unsigned main_symbol;
+                       u32 cost;
+
+                       cost = position_cost;
+
+                       len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
+                       main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
+                       cost += ctx->costs.main[main_symbol];
+                       if (len_header == LZX_NUM_PRIMARY_LENS)
+                               cost += ctx->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
+
+                       ctx->optimum[len].queue = queue;
+                       ctx->optimum[len].prev.link = 0;
+                       ctx->optimum[len].prev.match_offset = offset;
+                       ctx->optimum[len].cost = cost;
+               } while (++len <= matches[i].len);
        }
        end_pos = longest_len;
 
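
The new loop exploits the fact that the matches arrive sorted by strictly
increasing length, each entry being the lowest-offset match for the lengths
it covers, and that the offset-dependent "position" part of the cost does
not depend on the length.  That part is therefore computed once per offset
and reused for every covered length.  A self-contained sketch of the same
hoisting pattern, with toy stand-ins for wimlib's cost tables:

        #include <stdint.h>

        static uint32_t toy_offset_cost(uint32_t offset)
        {
                /* In LZX, the offset maps to a position slot; when the
                 * slot has >= 3 extra bits, the low 3 extra bits are coded
                 * with the aligned offset code and only the remainder are
                 * sent verbatim (cf. the num_extra_bits logic above).
                 * Here we simply charge one bit per significant bit.  */
                uint32_t bits = 0;

                while (offset) {
                        bits++;
                        offset >>= 1;
                }
                return bits;
        }

        static uint32_t toy_length_cost(unsigned len)
        {
                /* Flat main-symbol cost, plus one bit for lengths that
                 * would spill into the separate length code.  */
                return 4 + (len > 9);
        }

        struct toy_match {
                uint32_t offset;
                unsigned len;
        };

        static void toy_fill_costs(const struct toy_match *matches,
                                   unsigned num_matches,
                                   uint32_t *cost_to_reach)
        {
                /* The offset cost is hoisted out of the per-length loop,
                 * exactly as in the hunk above.  */
                for (unsigned i = 0, len = 2; i < num_matches; i++) {
                        uint32_t position_cost =
                                toy_offset_cost(matches[i].offset);

                        do {
                                cost_to_reach[len] = position_cost +
                                                     toy_length_cost(len);
                        } while (++len <= matches[i].len);
                }
        }
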
@@ -1683,25 +1718,59 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx)
                        ctx->optimum[cur_pos + 1].prev.link = cur_pos;
                }
 
-               /* Consider coding a match.  */
-               matchptr = matches;
-               for (unsigned len = 2; len <= longest_len; len++) {
+               /* Consider coding a match.
+                *
+                * The cost calculation is hand-coded here for the same
+                * reason stated in the comment for the similar loop
+                * earlier.  In fact, it is *this* loop that has the
+                * biggest effect on performance; overall LZX compression
+                * is > 10% faster with this code than with calling
+                * lzx_match_cost() for each length.  */
+               for (unsigned i = 0, len = 2; i < num_matches; i++) {
                        u32 offset;
                        struct lzx_lru_queue queue;
+                       u32 position_cost;
+                       unsigned position_slot;
+                       unsigned num_extra_bits;
 
-                       offset = matchptr->offset;
+                       offset = matches[i].offset;
                        queue = ctx->optimum[cur_pos].queue;
-
-                       cost = ctx->optimum[cur_pos].cost +
-                               lzx_match_cost(len, offset, &ctx->costs, &queue);
-                       if (cost < ctx->optimum[cur_pos + len].cost) {
-                               ctx->optimum[cur_pos + len].queue = queue;
-                               ctx->optimum[cur_pos + len].prev.link = cur_pos;
-                               ctx->optimum[cur_pos + len].prev.match_offset = offset;
-                               ctx->optimum[cur_pos + len].cost = cost;
+                       position_cost = ctx->optimum[cur_pos].cost;
+
+                       position_slot = lzx_get_position_slot(offset, &queue);
+                       num_extra_bits = lzx_get_num_extra_bits(position_slot);
+                       if (num_extra_bits >= 3) {
+                               position_cost += num_extra_bits - 3;
+                               position_cost += ctx->costs.aligned[
+                                               (offset + LZX_OFFSET_OFFSET) & 7];
+                       } else {
+                               position_cost += num_extra_bits;
                        }
-                       if (len == matchptr->len)
-                               matchptr++;
+
+                       do {
+                               unsigned len_header;
+                               unsigned main_symbol;
+                               u32 cost;
+
+                               cost = position_cost;
+
+                               len_header = min(len - LZX_MIN_MATCH_LEN,
+                                                LZX_NUM_PRIMARY_LENS);
+                               main_symbol = ((position_slot << 3) | len_header) +
+                                               LZX_NUM_CHARS;
+                               cost += ctx->costs.main[main_symbol];
+                               if (len_header == LZX_NUM_PRIMARY_LENS) {
+                                       cost += ctx->costs.len[len -
+                                                       LZX_MIN_MATCH_LEN -
+                                                       LZX_NUM_PRIMARY_LENS];
+                               }
+                               if (cost < ctx->optimum[cur_pos + len].cost) {
+                                       ctx->optimum[cur_pos + len].queue = queue;
+                                       ctx->optimum[cur_pos + len].prev.link = cur_pos;
+                                       ctx->optimum[cur_pos + len].prev.match_offset = offset;
+                                       ctx->optimum[cur_pos + len].cost = cost;
+                               }
+                       } while (++len <= matches[i].len);
                }
 
                if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
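
Each candidate match is applied as a shortest-path style relaxation: the
cost of reaching cur_pos plus the cost of the match "edge" is compared
against the best cost known so far for cur_pos + len, and the cheaper path
wins, recording a backlink for later traceback.  A minimal sketch of that
update with toy names:

        #include <stdint.h>

        struct toy_node {
                uint32_t cost;        /* best known cost to reach here */
                unsigned prev_link;   /* position the best path came from */
                uint32_t prev_offset; /* offset of the match taken, if any */
        };

        static inline void toy_relax(struct toy_node *optimum,
                                     unsigned cur_pos, unsigned len,
                                     uint32_t edge_cost, uint32_t offset)
        {
                uint32_t cost = optimum[cur_pos].cost + edge_cost;

                if (cost < optimum[cur_pos + len].cost) {
                        optimum[cur_pos + len].cost = cost;
                        optimum[cur_pos + len].prev_link = cur_pos;
                        optimum[cur_pos + len].prev_offset = offset;
                }
        }
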
@@ -1788,33 +1857,33 @@ lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec,
        const struct lzx_lru_queue orig_queue = ctx->queue;
        unsigned num_passes_remaining = num_passes;
        struct lzx_freqs freqs;
+       const u8 *window_ptr;
+       const u8 *window_end;
+       struct lzx_match *next_chosen_match;
+       struct raw_match raw_match;
+       struct lzx_match lzx_match;
 
        LZX_ASSERT(num_passes >= 1);
        LZX_ASSERT(lz_bt_get_position(&ctx->mf) == spec->window_pos);
 
        ctx->match_window_end = spec->window_pos + spec->block_size;
-       spec->chosen_matches = &ctx->chosen_matches[spec->window_pos];
        ctx->matches_cached = false;
 
        /* The first optimal parsing pass is done using the cost model already
         * set in ctx->costs.  Each later pass is done using a cost model
-        * computed from the previous pass.  */
-       do {
-               const u8 *window_ptr;
-               const u8 *window_end;
-               struct lzx_match *next_chosen_match;
+        * computed from the previous pass.
+        *
+        * To improve performance, we generate the array containing the
+        * matches and literals in intermediate form only on the final
+        * pass.  */
 
-               --num_passes_remaining;
+       while (--num_passes_remaining) {
                ctx->match_window_pos = spec->window_pos;
                ctx->cache_ptr = ctx->cached_matches;
                memset(&freqs, 0, sizeof(freqs));
                window_ptr = &ctx->window[spec->window_pos];
                window_end = window_ptr + spec->block_size;
-               next_chosen_match = spec->chosen_matches;
 
                while (window_ptr != window_end) {
-                       struct raw_match raw_match;
-                       struct lzx_match lzx_match;
 
                        raw_match = lzx_get_near_optimal_match(ctx);
 
@@ -1822,27 +1891,48 @@ lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec,
                                     raw_match.offset == ctx->max_window_size -
                                                         LZX_MIN_MATCH_LEN));
                        if (raw_match.len >= LZX_MIN_MATCH_LEN) {
-                               lzx_match.data = lzx_tally_match(raw_match.len,
-                                                                raw_match.offset,
-                                                                &freqs,
-                                                                &ctx->queue);
+                               lzx_tally_match(raw_match.len, raw_match.offset,
+                                               &freqs, &ctx->queue);
                                window_ptr += raw_match.len;
                        } else {
-                               lzx_match.data = lzx_tally_literal(*window_ptr,
-                                                                  &freqs);
+                               lzx_tally_literal(*window_ptr, &freqs);
                                window_ptr += 1;
                        }
-                       *next_chosen_match++ = lzx_match;
                }
-               spec->num_chosen_matches = next_chosen_match - spec->chosen_matches;
                lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
-               if (num_passes_remaining) {
-                       lzx_set_costs(ctx, &spec->codes.lens);
-                       ctx->queue = orig_queue;
-                       ctx->matches_cached = true;
-               }
-       } while (num_passes_remaining);
+               lzx_set_costs(ctx, &spec->codes.lens);
+               ctx->queue = orig_queue;
+               ctx->matches_cached = true;
+       }
 
+       ctx->match_window_pos = spec->window_pos;
+       ctx->cache_ptr = ctx->cached_matches;
+       memset(&freqs, 0, sizeof(freqs));
+       window_ptr = &ctx->window[spec->window_pos];
+       window_end = window_ptr + spec->block_size;
+
+       spec->chosen_matches = &ctx->chosen_matches[spec->window_pos];
+       next_chosen_match = spec->chosen_matches;
+
+       while (window_ptr != window_end) {
+               raw_match = lzx_get_near_optimal_match(ctx);
+
+               LZX_ASSERT(!(raw_match.len == LZX_MIN_MATCH_LEN &&
+                            raw_match.offset == ctx->max_window_size -
+                                                LZX_MIN_MATCH_LEN));
+               if (raw_match.len >= LZX_MIN_MATCH_LEN) {
+                       lzx_match.data = lzx_tally_match(raw_match.len,
+                                                        raw_match.offset,
+                                                        &freqs, &ctx->queue);
+                       window_ptr += raw_match.len;
+               } else {
+                       lzx_match.data = lzx_tally_literal(*window_ptr, &freqs);
+                       window_ptr += 1;
+               }
+               *next_chosen_match++ = lzx_match;
+       }
+       spec->num_chosen_matches = next_chosen_match - spec->chosen_matches;
+       lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
        spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
 }
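
Not shown in these hunks is how the prev.link chain built in
lzx_get_near_optimal_match() is consumed: once the end of the parsed region
is reached, the backward links are walked from end_pos toward position 0,
typically being reversed along the way so the chosen items can be read out
in forward order.  An illustrative, self-contained version of that
reversal:

        struct toy_path_node {
                unsigned prev_link;
        };

        /* Turn the backward prev links into forward links and return the
         * first position on the optimal path after position 0.  */
        static unsigned toy_reverse_links(struct toy_path_node *optimum,
                                          unsigned end_pos)
        {
                unsigned node = end_pos;
                unsigned next = (unsigned)-1;   /* end-of-path sentinel */

                while (node != 0) {
                        unsigned prev = optimum[node].prev_link;

                        optimum[node].prev_link = next; /* now forward */
                        next = node;
                        node = prev;
                }
                return next;
        }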