#define LZX_DIV_BLOCK_SIZE 32768
-#define LZX_CACHE_PER_POS 10
+#define LZX_CACHE_PER_POS 8
#define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
#define LZX_CACHE_SIZE (LZX_CACHE_LEN * sizeof(struct raw_match))
-
-/* Dependent on behavior of lz_bt_get_matches(). */
#define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
/* Codewords for the LZX main, length, and aligned offset Huffman codes */
/* Constructs an LZX match from a literal byte and updates the main code symbol
* frequencies. */
-static u32
+static inline u32
lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
{
freqs->main[lit]++;
* queue and the frequency of symbols in the main, length, and aligned offset
* alphabets. The return value is a 32-bit number that provides the match in an
* intermediate representation documented below. */
-static u32
+static inline u32
lzx_tally_match(unsigned match_len, u32 match_offset,
struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
{
* well as costs for the codewords in the main, length, and aligned Huffman
* codes, return the approximate number of bits it will take to represent this
* match in the compressed output. Take into account the match offset LRU
- * queue and optionally update it. */
+ * queue and also update it. */
static u32
lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs,
struct lzx_lru_queue *queue)
cache_ptr = ctx->cache_ptr;
matches = cache_ptr + 1;
- if (ctx->matches_cached) {
- num_matches = cache_ptr->len;
+ if (likely(cache_ptr <= ctx->cache_limit)) {
+ if (ctx->matches_cached) {
+ num_matches = cache_ptr->len;
+ } else {
+ num_matches = lz_bt_get_matches(&ctx->mf, matches);
+ cache_ptr->len = num_matches;
+ }
} else {
- num_matches = lz_bt_get_matches(&ctx->mf, matches);
- cache_ptr->len = num_matches;
+ num_matches = 0;
}
/* Don't allow matches to span the end of an LZX block. */
cache_ptr = ctx->cache_ptr;
ctx->match_window_pos += n;
if (ctx->matches_cached) {
- while (n--)
+ while (n-- && cache_ptr <= ctx->cache_limit)
cache_ptr += 1 + cache_ptr->len;
} else {
lz_bt_skip_positions(&ctx->mf, n);
- while (n--) {
+ while (n-- && cache_ptr <= ctx->cache_limit) {
cache_ptr->len = 0;
cache_ptr += 1;
}
{
unsigned num_matches;
const struct raw_match *matches;
- const struct raw_match *matchptr;
struct raw_match match;
unsigned longest_len;
unsigned longest_rep_len;
ctx->optimum[1].prev.link = 0;
/* Calculate the cost to reach any position up to and including that
- * reached by the longest match. */
- matchptr = matches;
- for (unsigned len = 2; len <= longest_len; len++) {
- u32 offset = matchptr->offset;
-
- ctx->optimum[len].queue = ctx->queue;
- ctx->optimum[len].prev.link = 0;
- ctx->optimum[len].prev.match_offset = offset;
- ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs,
- &ctx->optimum[len].queue);
- if (len == matchptr->len)
- matchptr++;
+ * reached by the longest match.
+ *
+ * Note: We consider only the lowest-offset match that reaches each
+ * position.
+ *
+ * Note: Some of the cost calculation stays the same for each offset,
+ * regardless of how many lengths it gets used for. Therefore, to
+ * improve performance, we hand-code the cost calculation instead of
+ * calling lzx_match_cost() to do a from-scratch cost evaluation at each
+ * length. */
+ for (unsigned i = 0, len = 2; i < num_matches; i++) {
+ u32 offset;
+ struct lzx_lru_queue queue;
+ u32 position_cost;
+ unsigned position_slot;
+ unsigned num_extra_bits;
+
+ offset = matches[i].offset;
+ queue = ctx->queue;
+ position_cost = 0;
+
+ position_slot = lzx_get_position_slot(offset, &queue);
+ num_extra_bits = lzx_get_num_extra_bits(position_slot);
+ if (num_extra_bits >= 3) {
+ position_cost += num_extra_bits - 3;
+ position_cost += ctx->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
+ } else {
+ position_cost += num_extra_bits;
+ }
+
+ do {
+ unsigned len_header;
+ unsigned main_symbol;
+ u32 cost;
+
+ cost = position_cost;
+
+ len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
+ main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
+ cost += ctx->costs.main[main_symbol];
+ if (len_header == LZX_NUM_PRIMARY_LENS)
+ cost += ctx->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
+
+ ctx->optimum[len].queue = queue;
+ ctx->optimum[len].prev.link = 0;
+ ctx->optimum[len].prev.match_offset = offset;
+ ctx->optimum[len].cost = cost;
+ } while (++len <= matches[i].len);
}
end_pos = longest_len;
ctx->optimum[cur_pos + 1].prev.link = cur_pos;
}
- /* Consider coding a match. */
- matchptr = matches;
- for (unsigned len = 2; len <= longest_len; len++) {
+ /* Consider coding a match.
+ *
+ * The hard-coded cost calculation is done for the same reason
+ * stated in the comment for the similar loop earlier.
+ * Actually, it is *this* one that has the biggest effect on
+ * performance; overall LZX compression is > 10% faster with
 * this code compared to calling lzx_match_cost() at each
 * length. */
+ for (unsigned i = 0, len = 2; i < num_matches; i++) {
u32 offset;
struct lzx_lru_queue queue;
+ u32 position_cost;
+ unsigned position_slot;
+ unsigned num_extra_bits;
- offset = matchptr->offset;
+ offset = matches[i].offset;
queue = ctx->optimum[cur_pos].queue;
-
- cost = ctx->optimum[cur_pos].cost +
- lzx_match_cost(len, offset, &ctx->costs, &queue);
- if (cost < ctx->optimum[cur_pos + len].cost) {
- ctx->optimum[cur_pos + len].queue = queue;
- ctx->optimum[cur_pos + len].prev.link = cur_pos;
- ctx->optimum[cur_pos + len].prev.match_offset = offset;
- ctx->optimum[cur_pos + len].cost = cost;
+ position_cost = ctx->optimum[cur_pos].cost;
+
+ position_slot = lzx_get_position_slot(offset, &queue);
+ num_extra_bits = lzx_get_num_extra_bits(position_slot);
+ if (num_extra_bits >= 3) {
+ position_cost += num_extra_bits - 3;
+ position_cost += ctx->costs.aligned[
+ (offset + LZX_OFFSET_OFFSET) & 7];
+ } else {
+ position_cost += num_extra_bits;
}
- if (len == matchptr->len)
- matchptr++;
+
+ do {
+ unsigned len_header;
+ unsigned main_symbol;
+ u32 cost;
+
+ cost = position_cost;
+
+ len_header = min(len - LZX_MIN_MATCH_LEN,
+ LZX_NUM_PRIMARY_LENS);
+ main_symbol = ((position_slot << 3) | len_header) +
+ LZX_NUM_CHARS;
+ cost += ctx->costs.main[main_symbol];
+ if (len_header == LZX_NUM_PRIMARY_LENS) {
+ cost += ctx->costs.len[len -
+ LZX_MIN_MATCH_LEN -
+ LZX_NUM_PRIMARY_LENS];
+ }
+ if (cost < ctx->optimum[cur_pos + len].cost) {
+ ctx->optimum[cur_pos + len].queue = queue;
+ ctx->optimum[cur_pos + len].prev.link = cur_pos;
+ ctx->optimum[cur_pos + len].prev.match_offset = offset;
+ ctx->optimum[cur_pos + len].cost = cost;
+ }
+ } while (++len <= matches[i].len);
}
if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
const struct lzx_lru_queue orig_queue = ctx->queue;
unsigned num_passes_remaining = num_passes;
struct lzx_freqs freqs;
+ const u8 *window_ptr;
+ const u8 *window_end;
+ struct lzx_match *next_chosen_match;
+ struct raw_match raw_match;
+ struct lzx_match lzx_match;
LZX_ASSERT(num_passes >= 1);
LZX_ASSERT(lz_bt_get_position(&ctx->mf) == spec->window_pos);
ctx->match_window_end = spec->window_pos + spec->block_size;
- spec->chosen_matches = &ctx->chosen_matches[spec->window_pos];
ctx->matches_cached = false;
/* The first optimal parsing pass is done using the cost model already
* set in ctx->costs. Each later pass is done using a cost model
- * computed from the previous pass. */
- do {
- const u8 *window_ptr;
- const u8 *window_end;
- struct lzx_match *next_chosen_match;
+ * computed from the previous pass.
+ *
+ * To improve performance we only generate the array containing the
+ * matches and literals in intermediate form on the final pass. */
- --num_passes_remaining;
+ while (--num_passes_remaining) {
ctx->match_window_pos = spec->window_pos;
ctx->cache_ptr = ctx->cached_matches;
memset(&freqs, 0, sizeof(freqs));
window_ptr = &ctx->window[spec->window_pos];
window_end = window_ptr + spec->block_size;
- next_chosen_match = spec->chosen_matches;
while (window_ptr != window_end) {
- struct raw_match raw_match;
- struct lzx_match lzx_match;
raw_match = lzx_get_near_optimal_match(ctx);
raw_match.offset == ctx->max_window_size -
LZX_MIN_MATCH_LEN));
if (raw_match.len >= LZX_MIN_MATCH_LEN) {
- lzx_match.data = lzx_tally_match(raw_match.len,
- raw_match.offset,
- &freqs,
- &ctx->queue);
+ lzx_tally_match(raw_match.len, raw_match.offset,
+ &freqs, &ctx->queue);
window_ptr += raw_match.len;
} else {
- lzx_match.data = lzx_tally_literal(*window_ptr,
- &freqs);
+ lzx_tally_literal(*window_ptr, &freqs);
window_ptr += 1;
}
- *next_chosen_match++ = lzx_match;
}
- spec->num_chosen_matches = next_chosen_match - spec->chosen_matches;
lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
- if (num_passes_remaining) {
- lzx_set_costs(ctx, &spec->codes.lens);
- ctx->queue = orig_queue;
- ctx->matches_cached = true;
- }
- } while (num_passes_remaining);
+ lzx_set_costs(ctx, &spec->codes.lens);
+ ctx->queue = orig_queue;
+ ctx->matches_cached = true;
+ }
+
+ ctx->match_window_pos = spec->window_pos;
+ ctx->cache_ptr = ctx->cached_matches;
+ memset(&freqs, 0, sizeof(freqs));
+ window_ptr = &ctx->window[spec->window_pos];
+ window_end = window_ptr + spec->block_size;
+ spec->chosen_matches = &ctx->chosen_matches[spec->window_pos];
+ next_chosen_match = spec->chosen_matches;
+
+ while (window_ptr != window_end) {
+ raw_match = lzx_get_near_optimal_match(ctx);
+
+ LZX_ASSERT(!(raw_match.len == LZX_MIN_MATCH_LEN &&
+ raw_match.offset == ctx->max_window_size -
+ LZX_MIN_MATCH_LEN));
+ if (raw_match.len >= LZX_MIN_MATCH_LEN) {
+ lzx_match.data = lzx_tally_match(raw_match.len,
+ raw_match.offset,
+ &freqs, &ctx->queue);
+ window_ptr += raw_match.len;
+ } else {
+ lzx_match.data = lzx_tally_literal(*window_ptr, &freqs);
+ window_ptr += 1;
+ }
+ *next_chosen_match++ = lzx_match;
+ }
+ spec->num_chosen_matches = next_chosen_match - spec->chosen_matches;
+ lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms);
spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
}