From 14a1547f3404346127a82e643809bc837425ec8e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 11 Jun 2014 22:39:29 -0500 Subject: [PATCH] lzx-compress.c: Don't do redundant work in cost calculations --- src/lzx-compress.c | 125 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 97 insertions(+), 28 deletions(-) diff --git a/src/lzx-compress.c b/src/lzx-compress.c index c8f61099..aedd05a8 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -1170,7 +1170,7 @@ lzx_literal_cost(u8 c, const struct lzx_costs * costs) * well as costs for the codewords in the main, length, and aligned Huffman * codes, return the approximate number of bits it will take to represent this * match in the compressed output. Take into account the match offset LRU - * queue and optionally update it. */ + * queue and also updates it. */ static u32 lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs, struct lzx_lru_queue *queue) @@ -1448,7 +1448,6 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx) { unsigned num_matches; const struct raw_match *matches; - const struct raw_match *matchptr; struct raw_match match; unsigned longest_len; unsigned longest_rep_len; @@ -1522,18 +1521,54 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx) ctx->optimum[1].prev.link = 0; /* Calculate the cost to reach any position up to and including that - * reached by the longest match. */ - matchptr = matches; - for (unsigned len = 2; len <= longest_len; len++) { - u32 offset = matchptr->offset; - - ctx->optimum[len].queue = ctx->queue; - ctx->optimum[len].prev.link = 0; - ctx->optimum[len].prev.match_offset = offset; - ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs, - &ctx->optimum[len].queue); - if (len == matchptr->len) - matchptr++; + * reached by the longest match. + * + * Note: We consider only the lowest-offset match that reaches each + * position. + * + * Note: Some of the cost calculation stays the same for each offset, + * regardless of how many lengths it gets used for. Therefore, to + * improve performance, we hand-code the cost calculation instead of + * calling lzx_match_cost() to do a from-scratch cost evaluation at each + * length. */ + for (unsigned i = 0, len = 2; i < num_matches; i++) { + u32 offset; + struct lzx_lru_queue queue; + u32 position_cost; + unsigned position_slot; + unsigned num_extra_bits; + + offset = matches[i].offset; + queue = ctx->queue; + position_cost = 0; + + position_slot = lzx_get_position_slot(offset, &queue); + num_extra_bits = lzx_get_num_extra_bits(position_slot); + if (num_extra_bits >= 3) { + position_cost += num_extra_bits - 3; + position_cost += ctx->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7]; + } else { + position_cost += num_extra_bits; + } + + do { + unsigned len_header; + unsigned main_symbol; + u32 cost; + + cost = position_cost; + + len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS); + main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS; + cost += ctx->costs.main[main_symbol]; + if (len_header == LZX_NUM_PRIMARY_LENS) + cost += ctx->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS]; + + ctx->optimum[len].queue = queue; + ctx->optimum[len].prev.link = 0; + ctx->optimum[len].prev.match_offset = offset; + ctx->optimum[len].cost = cost; + } while (++len <= matches[i].len); } end_pos = longest_len; @@ -1683,25 +1718,59 @@ lzx_get_near_optimal_match(struct lzx_compressor *ctx) ctx->optimum[cur_pos + 1].prev.link = cur_pos; } - /* Consider coding a match. */ - matchptr = matches; - for (unsigned len = 2; len <= longest_len; len++) { + /* Consider coding a match. + * + * The hard-coded cost calculation is done for the same reason + * stated in the comment for the similar loop earlier. + * Actually, it is *this* one that has the biggest effect on + * performance; overall LZX compression is > 10% faster with + * this code compared to calling lzx_match_cost() with each + * length. */ + for (unsigned i = 0, len = 2; i < num_matches; i++) { u32 offset; struct lzx_lru_queue queue; + u32 position_cost; + unsigned position_slot; + unsigned num_extra_bits; - offset = matchptr->offset; + offset = matches[i].offset; queue = ctx->optimum[cur_pos].queue; - - cost = ctx->optimum[cur_pos].cost + - lzx_match_cost(len, offset, &ctx->costs, &queue); - if (cost < ctx->optimum[cur_pos + len].cost) { - ctx->optimum[cur_pos + len].queue = queue; - ctx->optimum[cur_pos + len].prev.link = cur_pos; - ctx->optimum[cur_pos + len].prev.match_offset = offset; - ctx->optimum[cur_pos + len].cost = cost; + position_cost = ctx->optimum[cur_pos].cost; + + position_slot = lzx_get_position_slot(offset, &queue); + num_extra_bits = lzx_get_num_extra_bits(position_slot); + if (num_extra_bits >= 3) { + position_cost += num_extra_bits - 3; + position_cost += ctx->costs.aligned[ + (offset + LZX_OFFSET_OFFSET) & 7]; + } else { + position_cost += num_extra_bits; } - if (len == matchptr->len) - matchptr++; + + do { + unsigned len_header; + unsigned main_symbol; + u32 cost; + + cost = position_cost; + + len_header = min(len - LZX_MIN_MATCH_LEN, + LZX_NUM_PRIMARY_LENS); + main_symbol = ((position_slot << 3) | len_header) + + LZX_NUM_CHARS; + cost += ctx->costs.main[main_symbol]; + if (len_header == LZX_NUM_PRIMARY_LENS) { + cost += ctx->costs.len[len - + LZX_MIN_MATCH_LEN - + LZX_NUM_PRIMARY_LENS]; + } + if (cost < ctx->optimum[cur_pos + len].cost) { + ctx->optimum[cur_pos + len].queue = queue; + ctx->optimum[cur_pos + len].prev.link = cur_pos; + ctx->optimum[cur_pos + len].prev.match_offset = offset; + ctx->optimum[cur_pos + len].cost = cost; + } + } while (++len <= matches[i].len); } if (longest_rep_len >= LZX_MIN_MATCH_LEN) { -- 2.43.0