};
/* LZX intermediate match/literal format */
-struct lzx_match {
+struct lzx_item {
/* Bit Description
*
* 31 1 if a match, 0 if a literal.
input_idx_t block_size;
/* The match/literal sequence for this block. */
- struct lzx_match *chosen_matches;
+ struct lzx_item *chosen_items;
- /* The length of the @chosen_matches sequence. */
- input_idx_t num_chosen_matches;
+ /* The length of the @chosen_items sequence. */
+ input_idx_t num_chosen_items;
/* Huffman codes for this block. */
struct lzx_codes codes;
/* Space for the sequences of matches/literals that were chosen for each
* block. */
- struct lzx_match *chosen_matches;
+ struct lzx_item *chosen_items;
/* Information about the LZX blocks the preprocessed input was divided
* into. */
*/
static void
lzx_write_match(struct output_bitstream *out, int block_type,
- struct lzx_match match, const struct lzx_codes *codes)
+ struct lzx_item match, const struct lzx_codes *codes)
{
/* low 8 bits are the match length minus 2 */
unsigned match_len_minus_2 = match.data & 0xff;
static void
lzx_write_matches_and_literals(struct output_bitstream *ostream,
int block_type,
- const struct lzx_match match_tab[],
+ const struct lzx_item match_tab[],
unsigned match_count,
const struct lzx_codes *codes)
{
for (unsigned i = 0; i < match_count; i++) {
- struct lzx_match match = match_tab[i];
+ struct lzx_item match = match_tab[i];
/* The high bit of the 32-bit intermediate representation
* indicates whether the item is an actual LZ-style match (1) or
unsigned block_size,
unsigned max_window_size,
unsigned num_main_syms,
- struct lzx_match * chosen_matches,
- unsigned num_chosen_matches,
+ struct lzx_item * chosen_items,
+ unsigned num_chosen_items,
const struct lzx_codes * codes,
const struct lzx_codes * prev_codes,
struct output_bitstream * ostream)
/* Write the actual matches and literals. */
lzx_write_matches_and_literals(ostream, block_type,
- chosen_matches, num_chosen_matches,
+ chosen_items, num_chosen_items,
codes);
LZX_DEBUG("Done writing block.");
for (unsigned i = 0; i < ctx->num_blocks; i++) {
const struct lzx_block_spec *spec = &ctx->block_specs[i];
- LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_matches=%u)...",
+ LZX_DEBUG("Writing block %u/%u (type=%d, size=%u, num_chosen_items=%u)...",
i + 1, ctx->num_blocks,
spec->block_type, spec->block_size,
- spec->num_chosen_matches);
+ spec->num_chosen_items);
lzx_write_compressed_block(spec->block_type,
spec->block_size,
ctx->max_window_size,
ctx->num_main_syms,
- spec->chosen_matches,
- spec->num_chosen_matches,
+ spec->chosen_items,
+ spec->num_chosen_items,
&spec->codes,
prev_codes,
ostream);
/* Constructs an LZX match from a literal byte and updates the main code symbol
* frequencies. */
-static u32
+static inline u32
lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
{
freqs->main[lit]++;
* queue and the frequency of symbols in the main, length, and aligned offset
* alphabets. The return value is a 32-bit number that provides the match in an
* intermediate representation documented below. */
-static u32
+static inline u32
lzx_tally_match(unsigned match_len, u32 match_offset,
struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
{
freqs->aligned[position_footer & 7]++;
/* Pack the position slot, position footer, and match length into an
- * intermediate representation. See `struct lzx_match' for details.
+ * intermediate representation. See `struct lzx_item' for details.
*/
LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
struct lzx_record_ctx {
struct lzx_freqs freqs;
struct lzx_lru_queue queue;
- struct lzx_match *matches;
+ struct lzx_item *matches;
};
static void
* well as costs for the codewords in the main, length, and aligned Huffman
* codes, return the approximate number of bits it will take to represent this
* match in the compressed output. Take into account the match offset LRU
- * queue and optionally update it. */
+ * queue and also update it. */
static u32
lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs,
struct lzx_lru_queue *queue)
{
unsigned num_matches;
const struct raw_match *matches;
- const struct raw_match *matchptr;
struct raw_match match;
unsigned longest_len;
unsigned longest_rep_len;
ctx->optimum[1].prev.link = 0;
/* Calculate the cost to reach any position up to and including that
- * reached by the longest match. */
- matchptr = matches;
- for (unsigned len = 2; len <= longest_len; len++) {
- u32 offset = matchptr->offset;
-
- ctx->optimum[len].queue = ctx->queue;
- ctx->optimum[len].prev.link = 0;
- ctx->optimum[len].prev.match_offset = offset;
- ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs,
- &ctx->optimum[len].queue);
- if (len == matchptr->len)
- matchptr++;
+ * reached by the longest match.
+ *
+ * Note: We consider only the lowest-offset match that reaches each
+ * position.
+ *
+ * Note: Some of the cost calculation stays the same for each offset,
+ * regardless of how many lengths it gets used for. Therefore, to
+ * improve performance, we hand-code the cost calculation instead of
+ * calling lzx_match_cost() to do a from-scratch cost evaluation at each
+ * length. */
+ for (unsigned i = 0, len = 2; i < num_matches; i++) {
+ u32 offset;
+ struct lzx_lru_queue queue;
+ u32 position_cost;
+ unsigned position_slot;
+ unsigned num_extra_bits;
+
+ offset = matches[i].offset;
+ queue = ctx->queue;
+ position_cost = 0;
+
+ position_slot = lzx_get_position_slot(offset, &queue);
+ num_extra_bits = lzx_get_num_extra_bits(position_slot);
+ if (num_extra_bits >= 3) {
+ position_cost += num_extra_bits - 3;
+ position_cost += ctx->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
+ } else {
+ position_cost += num_extra_bits;
+ }
+
+ do {
+ unsigned len_header;
+ unsigned main_symbol;
+ u32 cost;
+
+ cost = position_cost;
+
+ len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
+ main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
+ cost += ctx->costs.main[main_symbol];
+ if (len_header == LZX_NUM_PRIMARY_LENS)
+ cost += ctx->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
+
+ ctx->optimum[len].queue = queue;
+ ctx->optimum[len].prev.link = 0;
+ ctx->optimum[len].prev.match_offset = offset;
+ ctx->optimum[len].cost = cost;
+ } while (++len <= matches[i].len);
}
end_pos = longest_len;
ctx->optimum[cur_pos + 1].prev.link = cur_pos;
}
- /* Consider coding a match. */
- matchptr = matches;
- for (unsigned len = 2; len <= longest_len; len++) {
+ /* Consider coding a match.
+ *
+ * The hard-coded cost calculation is done for the same reason
+ * stated in the comment for the similar loop earlier.
+ * Actually, it is *this* one that has the biggest effect on
+ * performance; overall LZX compression is > 10% faster with
+ * this code compared to calling lzx_match_cost() with each
+ * length. */
+ for (unsigned i = 0, len = 2; i < num_matches; i++) {
u32 offset;
struct lzx_lru_queue queue;
+ u32 position_cost;
+ unsigned position_slot;
+ unsigned num_extra_bits;
- offset = matchptr->offset;
+ offset = matches[i].offset;
queue = ctx->optimum[cur_pos].queue;
-
- cost = ctx->optimum[cur_pos].cost +
- lzx_match_cost(len, offset, &ctx->costs, &queue);
- if (cost < ctx->optimum[cur_pos + len].cost) {
- ctx->optimum[cur_pos + len].queue = queue;
- ctx->optimum[cur_pos + len].prev.link = cur_pos;
- ctx->optimum[cur_pos + len].prev.match_offset = offset;
- ctx->optimum[cur_pos + len].cost = cost;
+ position_cost = ctx->optimum[cur_pos].cost;
+
+ position_slot = lzx_get_position_slot(offset, &queue);
+ num_extra_bits = lzx_get_num_extra_bits(position_slot);
+ if (num_extra_bits >= 3) {
+ position_cost += num_extra_bits - 3;
+ position_cost += ctx->costs.aligned[
+ (offset + LZX_OFFSET_OFFSET) & 7];
+ } else {
+ position_cost += num_extra_bits;
}
- if (len == matchptr->len)
- matchptr++;
+
+ do {
+ unsigned len_header;
+ unsigned main_symbol;
+ u32 cost;
+
+ cost = position_cost;
+
+ len_header = min(len - LZX_MIN_MATCH_LEN,
+ LZX_NUM_PRIMARY_LENS);
+ main_symbol = ((position_slot << 3) | len_header) +
+ LZX_NUM_CHARS;
+ cost += ctx->costs.main[main_symbol];
+ if (len_header == LZX_NUM_PRIMARY_LENS) {
+ cost += ctx->costs.len[len -
+ LZX_MIN_MATCH_LEN -
+ LZX_NUM_PRIMARY_LENS];
+ }
+ if (cost < ctx->optimum[cur_pos + len].cost) {
+ ctx->optimum[cur_pos + len].queue = queue;
+ ctx->optimum[cur_pos + len].prev.link = cur_pos;
+ ctx->optimum[cur_pos + len].prev.match_offset = offset;
+ ctx->optimum[cur_pos + len].cost = cost;
+ }
+ } while (++len <= matches[i].len);
}
if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
const struct lzx_lru_queue orig_queue = ctx->queue;
unsigned num_passes_remaining = num_passes;
struct lzx_freqs freqs;
+ const u8 *window_ptr;
+ const u8 *window_end;
+ struct lzx_item *next_chosen_match;
+ struct raw_match raw_match;
+ struct lzx_item lzx_item;
LZX_ASSERT(num_passes >= 1);
LZX_ASSERT(lz_bt_get_position(&ctx->mf) == spec->window_pos);
ctx->match_window_end = spec->window_pos + spec->block_size;
- spec->chosen_matches = &ctx->chosen_matches[spec->window_pos];
ctx->matches_cached = false;
/* The first optimal parsing pass is done using the cost model already
* set in ctx->costs. Each later pass is done using a cost model
- * computed from the previous pass. */
- do {
- const u8 *window_ptr;
- const u8 *window_end;
- struct lzx_match *next_chosen_match;
+ * computed from the previous pass.
+ *
+ * To improve performance, we only generate the array containing the
+ * matches and literals in intermediate form on the final pass. */
- --num_passes_remaining;
+ while (--num_passes_remaining) {
ctx->match_window_pos = spec->window_pos;
ctx->cache_ptr = ctx->cached_matches;
memset(&freqs, 0, sizeof(freqs));
window_ptr = &ctx->window[spec->window_pos];
window_end = window_ptr + spec->block_size;
- next_chosen_match = spec->chosen_matches;
while (window_ptr != window_end) {
- struct raw_match raw_match;
- struct lzx_match lzx_match;
raw_match = lzx_get_near_optimal_match(ctx);
raw_match.offset == ctx->max_window_size -
LZX_MIN_MATCH_LEN));
if (raw_match.len >= LZX_MIN_MATCH_LEN) {
- lzx_match.data = lzx_tally_match(raw_match.len,
- raw_match.offset,
- &freqs,
- &ctx->queue);
+ lzx_tally_match(raw_match.len, raw_match.offset,
+ &freqs, &ctx->queue);
window_ptr += raw_match.len;
} else {
- lzx_match.data = lzx_tally_literal(*window_ptr,
- &freqs);
+ lzx_tally_literal(*window_ptr, &freqs);
window_ptr += 1;
}
- *next_chosen_match++ = lzx_match;
}
- spec->num_chosen_matches = next_chosen_match - spec->chosen_matches;
lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
- if (num_passes_remaining) {
- lzx_set_costs(ctx, &spec->codes.lens);
- ctx->queue = orig_queue;
- ctx->matches_cached = true;
- }
- } while (num_passes_remaining);
+ lzx_set_costs(ctx, &spec->codes.lens);
+ ctx->queue = orig_queue;
+ ctx->matches_cached = true;
+ }
+ ctx->match_window_pos = spec->window_pos;
+ ctx->cache_ptr = ctx->cached_matches;
+ memset(&freqs, 0, sizeof(freqs));
+ window_ptr = &ctx->window[spec->window_pos];
+ window_end = window_ptr + spec->block_size;
+
+ spec->chosen_items = &ctx->chosen_items[spec->window_pos];
+ next_chosen_match = spec->chosen_items;
+
+ while (window_ptr != window_end) {
+ raw_match = lzx_get_near_optimal_match(ctx);
+
+ LZX_ASSERT(!(raw_match.len == LZX_MIN_MATCH_LEN &&
+ raw_match.offset == ctx->max_window_size -
+ LZX_MIN_MATCH_LEN));
+ if (raw_match.len >= LZX_MIN_MATCH_LEN) {
+ lzx_item.data = lzx_tally_match(raw_match.len,
+ raw_match.offset,
+ &freqs, &ctx->queue);
+ window_ptr += raw_match.len;
+ } else {
+ lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
+ window_ptr += 1;
+ }
+ *next_chosen_match++ = lzx_item;
+ }
+ spec->num_chosen_items = next_chosen_match - spec->chosen_items;
+ lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
}
*
* ctx->block_specs[]
* ctx->num_blocks
- * ctx->chosen_matches[]
+ * ctx->chosen_items[]
*/
static void
lzx_prepare_block_fast(struct lzx_compressor * ctx)
/* Initialize symbol frequencies and match offset LRU queue. */
memset(&record_ctx.freqs, 0, sizeof(struct lzx_freqs));
lzx_lru_queue_init(&record_ctx.queue);
- record_ctx.matches = ctx->chosen_matches;
+ record_ctx.matches = ctx->chosen_items;
/* Determine series of matches/literals to output. */
lz_analyze_block(ctx->window,
spec->block_type = LZX_BLOCKTYPE_ALIGNED;
spec->window_pos = 0;
spec->block_size = ctx->window_size;
- spec->num_chosen_matches = (record_ctx.matches - ctx->chosen_matches);
- spec->chosen_matches = ctx->chosen_matches;
+ spec->num_chosen_items = (record_ctx.matches - ctx->chosen_items);
+ spec->chosen_items = ctx->chosen_items;
lzx_make_huffman_codes(&record_ctx.freqs, &spec->codes,
ctx->num_main_syms);
ctx->num_blocks = 1;
struct lzx_compressor *ctx = _ctx;
if (ctx) {
- FREE(ctx->chosen_matches);
+ FREE(ctx->chosen_items);
FREE(ctx->cached_matches);
FREE(ctx->optimum);
lz_bt_destroy(&ctx->mf);
LZX_CACHE_LEN - (LZX_MAX_MATCHES_PER_POS + 1);
}
- ctx->chosen_matches = MALLOC(window_size * sizeof(ctx->chosen_matches[0]));
- if (ctx->chosen_matches == NULL)
+ ctx->chosen_items = MALLOC(window_size * sizeof(ctx->chosen_items[0]));
+ if (ctx->chosen_items == NULL)
goto oom;
memcpy(&ctx->params, params, sizeof(struct wimlib_lzx_compressor_params));
sizeof(((struct lzx_compressor*)0)->block_specs[0]);
if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
- size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_matches[0]);
+ size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_items[0]);
size += lz_bt_get_needed_memory(max_block_size);
size += (LZX_OPTIM_ARRAY_SIZE +
min(params->alg_params.slow.nice_match_length,