+
+/* Set the cost model @c->costs from the Huffman codeword lengths specified in
+ * @lens.
+ *
+ * The cost model and codeword lengths are almost the same thing, but the
+ * Huffman codewords with length 0 correspond to symbols with zero frequency
+ * that still need to be assigned actual costs. The specific values assigned
+ * are arbitrary, but they should be fairly high (near the maximum codeword
+ * length) to take into account the fact that uses of these symbols are expected
+ * to be rare. */
+static void
+lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens,
+ unsigned nostat)
+{
+ unsigned i;
+
+ /* Main code */
+ for (i = 0; i < c->num_main_syms; i++)
+ c->costs.main[i] = lens->main[i] ? lens->main[i] : nostat;
+
+ /* Length code */
+ for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
+ c->costs.len[i] = lens->len[i] ? lens->len[i] : nostat;
+
+ /* Aligned offset code */
+ for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
+ c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : nostat / 2;
+}
+
+/* Don't allow matches to span the end of an LZX block. */
+static inline u32
+maybe_truncate_matches(struct lz_match matches[], u32 num_matches,
+ struct lzx_compressor *c)
+{
+ if (c->match_window_end < c->cur_window_size && num_matches != 0) {
+ u32 limit = c->match_window_end - c->match_window_pos;
+
+ if (limit >= LZX_MIN_MATCH_LEN) {
+
+ u32 i = num_matches - 1;
+ do {
+ if (matches[i].len >= limit) {
+ matches[i].len = limit;
+
+ /* Truncation might produce multiple
+ * matches with length 'limit'. Keep at
+ * most 1. */
+ num_matches = i + 1;
+ }
+ } while (i--);
+ } else {
+ num_matches = 0;
+ }
+ }
+ return num_matches;
+}
+
+static unsigned
+lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ if (likely(cache_ptr <= c->cache_limit)) {
+ num_matches = lz_mf_get_matches(c->mf, matches);
+ cache_ptr->len = num_matches;
+ c->cache_ptr = matches + num_matches;
+ } else {
+ num_matches = 0;
+ }
+ c->match_window_pos++;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ if (likely(cache_ptr <= c->cache_limit)) {
+ num_matches = lz_mf_get_matches(c->mf, matches);
+ num_matches = maybe_truncate_matches(matches, num_matches, c);
+ cache_ptr->len = num_matches;
+ c->cache_ptr = matches + num_matches;
+ } else {
+ num_matches = 0;
+ }
+ c->match_window_pos++;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+lzx_get_matches_usecache(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ if (cache_ptr <= c->cache_limit) {
+ num_matches = cache_ptr->len;
+ c->cache_ptr = matches + num_matches;
+ } else {
+ num_matches = 0;
+ }
+ c->match_window_pos++;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+lzx_get_matches_usecache_nocheck(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ num_matches = cache_ptr->len;
+ c->cache_ptr = matches + num_matches;
+ c->match_window_pos++;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+lzx_get_matches_nocache_singleblock(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ matches = c->cache_ptr;
+ num_matches = lz_mf_get_matches(c->mf, matches);
+ c->match_window_pos++;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+lzx_get_matches_nocache_multiblock(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ matches = c->cache_ptr;
+ num_matches = lz_mf_get_matches(c->mf, matches);
+ num_matches = maybe_truncate_matches(matches, num_matches, c);
+ c->match_window_pos++;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+/*
+ * Find matches at the next position in the window.
+ *
+ * Returns the number of matches found and sets *matches_ret to point to the
+ * matches array. The matches will be sorted by strictly increasing length and
+ * offset.
+ */
+static inline unsigned
+lzx_get_matches(struct lzx_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ return (*c->get_matches_func)(c, matches_ret);
+}
+
+static void
+lzx_skip_bytes_fillcache(struct lzx_compressor *c, unsigned n)
+{
+ struct lz_match *cache_ptr;
+
+ cache_ptr = c->cache_ptr;
+ c->match_window_pos += n;
+ lz_mf_skip_positions(c->mf, n);
+ if (cache_ptr <= c->cache_limit) {
+ do {
+ cache_ptr->len = 0;
+ cache_ptr += 1;
+ } while (--n && cache_ptr <= c->cache_limit);
+ }
+ c->cache_ptr = cache_ptr;
+}
+
+static void
+lzx_skip_bytes_usecache(struct lzx_compressor *c, unsigned n)
+{
+ struct lz_match *cache_ptr;
+
+ cache_ptr = c->cache_ptr;
+ c->match_window_pos += n;
+ if (cache_ptr <= c->cache_limit) {
+ do {
+ cache_ptr += 1 + cache_ptr->len;
+ } while (--n && cache_ptr <= c->cache_limit);
+ }
+ c->cache_ptr = cache_ptr;
+}
+
+static void
+lzx_skip_bytes_usecache_nocheck(struct lzx_compressor *c, unsigned n)
+{
+ struct lz_match *cache_ptr;
+
+ cache_ptr = c->cache_ptr;
+ c->match_window_pos += n;
+ do {
+ cache_ptr += 1 + cache_ptr->len;
+ } while (--n);
+ c->cache_ptr = cache_ptr;
+}
+
+static void
+lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n)
+{
+ c->match_window_pos += n;
+ lz_mf_skip_positions(c->mf, n);
+}
+
+/*
+ * Skip the specified number of positions in the window (don't search for
+ * matches at them).
+ */
+static inline void
+lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
+{
+ return (*c->skip_bytes_func)(c, n);
+}
+
/*
 * Reverse the linked list of near-optimal matches so that they can be returned
 * in forwards order.
 *
 * During parsing, each position in c->optimum records, in its 'prev' fields,
 * the start index of the match/literal taken to reach it along the
 * minimum-cost path (and the match offset used).  This walks that chain
 * backwards from @cur_pos to position 0, filling in the 'next' fields so the
 * items can then be consumed front-to-back.
 *
 * Returns the first match in the list.  Its 'len' is the end index of the
 * first item (the list starts at index 0, so the end index equals the item's
 * length); lengths below LZX_MIN_MATCH_LEN denote literals.
 */
static struct lz_match
lzx_match_chooser_reverse_list(struct lzx_compressor *c, unsigned cur_pos)
{
        unsigned prev_link, saved_prev_link;
        unsigned prev_match_offset, saved_prev_match_offset;

        c->optimum_end_idx = cur_pos;

        /* Save the predecessor fields before they are overwritten: 'prev' and
         * 'next' may share storage, so each node's incoming data must be read
         * before its outgoing links are written.  */
        saved_prev_link = c->optimum[cur_pos].prev.link;
        saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset;

        do {
                prev_link = saved_prev_link;
                prev_match_offset = saved_prev_match_offset;

                /* Read the next-older node's 'prev' data first... */
                saved_prev_link = c->optimum[prev_link].prev.link;
                saved_prev_match_offset = c->optimum[prev_link].prev.match_offset;

                /* ...then point it forwards at the node we came from.  */
                c->optimum[prev_link].next.link = cur_pos;
                c->optimum[prev_link].next.match_offset = prev_match_offset;

                cur_pos = prev_link;
        } while (cur_pos != 0);

        /* Position 0's 'next' link is the end index of the first item.  */
        c->optimum_cur_idx = c->optimum[0].next.link;

        return (struct lz_match)
                { .len = c->optimum_cur_idx,
                  .offset = c->optimum[0].next.match_offset,
                };
}
+
+/*
+ * lzx_choose_near_optimal_match() -
+ *
+ * Choose an approximately optimal match or literal to use at the next position
+ * in the string, or "window", being LZ-encoded.
+ *
+ * This is based on algorithms used in 7-Zip, including the DEFLATE encoder
+ * and the LZMA encoder, written by Igor Pavlov.
+ *
+ * Unlike a greedy parser that always takes the longest match, or even a "lazy"
+ * parser with one match/literal look-ahead like zlib, the algorithm used here
+ * may look ahead many matches/literals to determine the approximately optimal
+ * match/literal to code next. The motivation is that the compression ratio is
+ * improved if the compressor can do things like use a shorter-than-possible
+ * match in order to allow a longer match later, and also take into account the
+ * estimated real cost of coding each match/literal based on the underlying
+ * entropy encoding.
+ *
+ * Still, this is not a true optimal parser for several reasons:
+ *
+ * - Real compression formats use entropy encoding of the literal/match
+ * sequence, so the real cost of coding each match or literal is unknown until
+ * the parse is fully determined. It can be approximated based on iterative
+ * parses, but the end result is not guaranteed to be globally optimal.
+ *
+ * - Very long matches are chosen immediately. This is because locations with
+ * long matches are likely to have many possible alternatives that would cause
+ * slow optimal parsing, but also such locations are already highly
+ * compressible so it is not too harmful to just grab the longest match.
+ *
+ * - Not all possible matches at each location are considered because the
+ * underlying match-finder limits the number and type of matches produced at
+ * each position. For example, for a given match length it's usually not
+ * worth it to only consider matches other than the lowest-offset match,
+ * except in the case of a repeat offset.
+ *
+ * - Although we take into account the adaptive state (in LZX, the recent offset
+ * queue), coding decisions made with respect to the adaptive state will be
+ * locally optimal but will not necessarily be globally optimal. This is
+ * because the algorithm only keeps the least-costly path to get to a given
+ * location and does not take into account that a slightly more costly path
+ * could result in a different adaptive state that ultimately results in a
+ * lower global cost.
+ *
+ * - The array space used by this function is bounded, so in degenerate cases it
+ * is forced to start returning matches/literals before the algorithm has
+ * really finished.
+ *
+ * Each call to this function does one of two things:
+ *
+ * 1. Build a sequence of near-optimal matches/literals, up to some point, that
+ * will be returned by subsequent calls to this function, then return the
+ * first one.
+ *
+ * OR
+ *
+ * 2. Return the next match/literal previously computed by a call to this
+ * function.
+ *
+ * The return value is a (length, offset) pair specifying the match or literal
+ * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
+ */
/* See the long comment above for the overall algorithm description.  */
static struct lz_match
lzx_choose_near_optimal_item(struct lzx_compressor *c)
{
        unsigned num_matches;
        const struct lz_match *matches;
        struct lz_match match;
        u32 longest_len;
        u32 longest_rep_len;
        unsigned longest_rep_slot;
        unsigned cur_pos;
        unsigned end_pos;
        struct lzx_mc_pos_data *optimum = c->optimum;

        if (c->optimum_cur_idx != c->optimum_end_idx) {
                /* Case 2: Return the next match/literal already found.  */
                match.len = optimum[c->optimum_cur_idx].next.link -
                                    c->optimum_cur_idx;
                match.offset = optimum[c->optimum_cur_idx].next.match_offset;

                c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link;
                return match;
        }

        /* Case 1: Compute a new list of matches/literals to return.  */

        c->optimum_cur_idx = 0;
        c->optimum_end_idx = 0;

        /* Search for matches at repeat offsets.  As a heuristic, we only keep
         * the one with the longest match length.  */
        longest_rep_len = LZX_MIN_MATCH_LEN - 1;
        if (c->match_window_pos >= 1) {
                unsigned limit = min(LZX_MAX_MATCH_LEN,
                                     c->match_window_end - c->match_window_pos);
                for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
                        u32 offset = c->queue.R[i];
                        const u8 *strptr = &c->cur_window[c->match_window_pos];
                        const u8 *matchptr = strptr - offset;
                        unsigned len = 0;
                        while (len < limit && strptr[len] == matchptr[len])
                                len++;
                        if (len > longest_rep_len) {
                                longest_rep_len = len;
                                longest_rep_slot = i;
                        }
                }
        }

        /* If there's a long match with a repeat offset, choose it immediately. */
        if (longest_rep_len >= c->params.nice_match_length) {
                /* The match-finder was not queried for this position, so skip
                 * the full match length (not length - 1 as below).  */
                lzx_skip_bytes(c, longest_rep_len);
                return (struct lz_match) {
                        .len = longest_rep_len,
                        .offset = c->queue.R[longest_rep_slot],
                };
        }

        /* Find other matches.  Note: this advances c->match_window_pos.  */
        num_matches = lzx_get_matches(c, &matches);

        /* If there's a long match, choose it immediately. */
        if (num_matches) {
                /* Matches are sorted by increasing length, so the last one is
                 * the longest.  */
                longest_len = matches[num_matches - 1].len;
                if (longest_len >= c->params.nice_match_length) {
                        lzx_skip_bytes(c, longest_len - 1);
                        return matches[num_matches - 1];
                }
        } else {
                longest_len = 1;
        }

        /* Calculate the cost to reach the next position by coding a literal.
         * lzx_get_matches() already advanced match_window_pos, so the literal
         * being considered is at match_window_pos - 1.  */
        optimum[1].queue = c->queue;
        optimum[1].cost = lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
                                           &c->costs);
        optimum[1].prev.link = 0;

        /* Calculate the cost to reach any position up to and including that
         * reached by the longest match.
         *
         * Note: We consider only the lowest-offset match that reaches each
         * position.
         *
         * Note: Some of the cost calculation stays the same for each offset,
         * regardless of how many lengths it gets used for.  Therefore, to
         * improve performance, we hand-code the cost calculation instead of
         * calling lzx_match_cost() to do a from-scratch cost evaluation at
         * each length.  */
        for (unsigned i = 0, len = 2; i < num_matches; i++) {
                u32 offset;
                struct lzx_lru_queue queue;
                u32 position_cost;
                unsigned position_slot;
                unsigned num_extra_bits;

                offset = matches[i].offset;
                /* lzx_get_position_slot() may update the queue (repeat offset
                 * handling), so work on a copy.  */
                queue = c->queue;
                position_cost = 0;

                position_slot = lzx_get_position_slot(offset, &queue);
                num_extra_bits = lzx_get_num_extra_bits(position_slot);
                if (num_extra_bits >= 3) {
                        /* The low 3 extra bits are charged via the aligned
                         * offset code; the rest are verbatim bits.  */
                        position_cost += num_extra_bits - 3;
                        position_cost += c->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
                } else {
                        position_cost += num_extra_bits;
                }

                /* 'len' carries over between matches: since matches are
                 * sorted by increasing length, match i covers the lengths
                 * from where the previous match left off up to its own.  */
                do {
                        unsigned len_header;
                        unsigned main_symbol;
                        u32 cost;

                        cost = position_cost;

                        /* Lengths above LZX_NUM_PRIMARY_LENS also need a
                         * length-code symbol.  */
                        len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
                        main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
                        cost += c->costs.main[main_symbol];
                        if (len_header == LZX_NUM_PRIMARY_LENS)
                                cost += c->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];

                        optimum[len].queue = queue;
                        optimum[len].prev.link = 0;
                        optimum[len].prev.match_offset = offset;
                        optimum[len].cost = cost;
                } while (++len <= matches[i].len);
        }
        /* Positions 1..longest_len now have initialized costs.  */
        end_pos = longest_len;

        if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
                u32 cost;

                while (end_pos < longest_rep_len)
                        optimum[++end_pos].cost = MC_INFINITE_COST;

                cost = lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
                                         &c->costs);
                /* '<=' prefers the repeat-offset match on cost ties.  */
                if (cost <= optimum[longest_rep_len].cost) {
                        optimum[longest_rep_len].queue = c->queue;
                        /* Using a repeat offset moves it to the front of the
                         * queue.  */
                        swap(optimum[longest_rep_len].queue.R[0],
                             optimum[longest_rep_len].queue.R[longest_rep_slot]);
                        optimum[longest_rep_len].prev.link = 0;
                        optimum[longest_rep_len].prev.match_offset =
                                optimum[longest_rep_len].queue.R[0];
                        optimum[longest_rep_len].cost = cost;
                }
        }

        /* Step forward, calculating the estimated minimum cost to reach each
         * position.  The algorithm may find multiple paths to reach each
         * position; only the lowest-cost path is saved.
         *
         * The progress of the parse is tracked in the @optimum array, which
         * for each position contains the minimum cost to reach that position,
         * the index of the start of the match/literal taken to reach that
         * position through the minimum-cost path, the offset of the match
         * taken (not relevant for literals), and the adaptive state that will
         * exist at that position after the minimum-cost path is taken.  The
         * @cur_pos variable stores the position at which the algorithm is
         * currently considering coding choices, and the @end_pos variable
         * stores the greatest position at which the costs of coding choices
         * have been saved.
         *
         * The loop terminates when any one of the following conditions
         * occurs:
         *
         * 1. A match with length greater than or equal to @nice_match_length
         *    is found.  When this occurs, the algorithm chooses this match
         *    unconditionally, and consequently the near-optimal match/literal
         *    sequence up to and including that match is fully determined and
         *    it can begin returning the match/literal list.
         *
         * 2. @cur_pos reaches a position not overlapped by a preceding match.
         *    In such cases, the near-optimal match/literal sequence up to
         *    @cur_pos is fully determined and it can begin returning the
         *    match/literal list.
         *
         * 3. Failing either of the above in a degenerate case, the loop
         *    terminates when space in the @optimum array is exhausted.  This
         *    terminates the algorithm and forces it to start returning
         *    matches/literals even though they may not be globally optimal.
         *
         * Upon loop termination, a nonempty list of matches/literals will
         * have been produced and stored in the @optimum array.  These
         * matches/literals are linked in reverse order, so the last thing
         * this function does is reverse this list and return the first
         * match/literal, leaving the rest to be returned immediately by
         * subsequent calls to this function.
         */
        cur_pos = 0;
        for (;;) {
                u32 cost;

                /* Advance to next position.  */
                cur_pos++;

                /* Check termination conditions (2) and (3) noted above.  */
                if (cur_pos == end_pos || cur_pos == LZX_OPTIM_ARRAY_LENGTH)
                        return lzx_match_chooser_reverse_list(c, cur_pos);

                /* Search for matches at repeat offsets.  Again, as a
                 * heuristic we only keep the longest one.  Note: the queue
                 * used is the adaptive state saved for @cur_pos, not the
                 * global one.  */
                longest_rep_len = LZX_MIN_MATCH_LEN - 1;
                unsigned limit = min(LZX_MAX_MATCH_LEN,
                                     c->match_window_end - c->match_window_pos);
                for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
                        u32 offset = optimum[cur_pos].queue.R[i];
                        const u8 *strptr = &c->cur_window[c->match_window_pos];
                        const u8 *matchptr = strptr - offset;
                        unsigned len = 0;
                        while (len < limit && strptr[len] == matchptr[len])
                                len++;
                        if (len > longest_rep_len) {
                                longest_rep_len = len;
                                longest_rep_slot = i;
                        }
                }

                /* If we found a long match at a repeat offset, choose it
                 * immediately.  */
                if (longest_rep_len >= c->params.nice_match_length) {
                        /* Build the list of matches to return and get
                         * the first one.  */
                        match = lzx_match_chooser_reverse_list(c, cur_pos);

                        /* Append the long match to the end of the list.  */
                        optimum[cur_pos].next.match_offset =
                                optimum[cur_pos].queue.R[longest_rep_slot];
                        optimum[cur_pos].next.link = cur_pos + longest_rep_len;
                        c->optimum_end_idx = cur_pos + longest_rep_len;

                        /* Skip over the remaining bytes of the long match.  */
                        lzx_skip_bytes(c, longest_rep_len);

                        /* Return first match in the list.  */
                        return match;
                }

                /* Find other matches.  */
                num_matches = lzx_get_matches(c, &matches);

                /* If there's a long match, choose it immediately.  */
                if (num_matches) {
                        longest_len = matches[num_matches - 1].len;
                        if (longest_len >= c->params.nice_match_length) {
                                /* Build the list of matches to return and get
                                 * the first one.  */
                                match = lzx_match_chooser_reverse_list(c, cur_pos);

                                /* Append the long match to the end of the list.  */
                                optimum[cur_pos].next.match_offset =
                                        matches[num_matches - 1].offset;
                                optimum[cur_pos].next.link = cur_pos + longest_len;
                                c->optimum_end_idx = cur_pos + longest_len;

                                /* Skip over the remaining bytes of the long match.  */
                                lzx_skip_bytes(c, longest_len - 1);

                                /* Return first match in the list.  */
                                return match;
                        }
                } else {
                        longest_len = 1;
                }

                /* If we are reaching any positions for the first time, we need
                 * to initialize their costs to infinity.  */
                while (end_pos < cur_pos + longest_len)
                        optimum[++end_pos].cost = MC_INFINITE_COST;

                /* Consider coding a literal.  As above, the literal for this
                 * position is at match_window_pos - 1 because
                 * lzx_get_matches() has already advanced the position.  */
                cost = optimum[cur_pos].cost +
                        lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
                                         &c->costs);
                if (cost < optimum[cur_pos + 1].cost) {
                        optimum[cur_pos + 1].queue = optimum[cur_pos].queue;
                        optimum[cur_pos + 1].cost = cost;
                        optimum[cur_pos + 1].prev.link = cur_pos;
                }

                /* Consider coding a match.
                 *
                 * The hard-coded cost calculation is done for the same reason
                 * stated in the comment for the similar loop earlier.
                 * Actually, it is *this* one that has the biggest effect on
                 * performance; overall LZX compression is > 10% faster with
                 * this code compared to calling lzx_match_cost() with each
                 * length.  */
                for (unsigned i = 0, len = 2; i < num_matches; i++) {
                        u32 offset;
                        u32 position_cost;
                        unsigned position_slot;
                        unsigned num_extra_bits;

                        offset = matches[i].offset;
                        position_cost = optimum[cur_pos].cost;

                        /* Yet another optimization: instead of calling
                         * lzx_get_position_slot(), hand-inline the search of
                         * the repeat offset queue.  Then we can omit the
                         * extra_bits calculation for repeat offset matches,
                         * and also only compute the updated queue if we
                         * actually do find a new lowest cost path.  */
                        for (position_slot = 0; position_slot < LZX_NUM_RECENT_OFFSETS; position_slot++)
                                if (offset == optimum[cur_pos].queue.R[position_slot])
                                        goto have_position_cost;

                        position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);

                        num_extra_bits = lzx_get_num_extra_bits(position_slot);
                        if (num_extra_bits >= 3) {
                                position_cost += num_extra_bits - 3;
                                position_cost += c->costs.aligned[
                                                (offset + LZX_OFFSET_OFFSET) & 7];
                        } else {
                                position_cost += num_extra_bits;
                        }

                have_position_cost:

                        do {
                                unsigned len_header;
                                unsigned main_symbol;
                                u32 cost;

                                cost = position_cost;

                                len_header = min(len - LZX_MIN_MATCH_LEN,
                                                 LZX_NUM_PRIMARY_LENS);
                                main_symbol = ((position_slot << 3) | len_header) +
                                                LZX_NUM_CHARS;
                                cost += c->costs.main[main_symbol];
                                if (len_header == LZX_NUM_PRIMARY_LENS) {
                                        cost += c->costs.len[len -
                                                        LZX_MIN_MATCH_LEN -
                                                        LZX_NUM_PRIMARY_LENS];
                                }
                                if (cost < optimum[cur_pos + len].cost) {
                                        if (position_slot < LZX_NUM_RECENT_OFFSETS) {
                                                /* Repeat offset: move it to
                                                 * the front of the queue.  */
                                                optimum[cur_pos + len].queue = optimum[cur_pos].queue;
                                                swap(optimum[cur_pos + len].queue.R[0],
                                                     optimum[cur_pos + len].queue.R[position_slot]);
                                        } else {
                                                /* New offset: push it onto
                                                 * the queue, evicting R[2].  */
                                                optimum[cur_pos + len].queue.R[0] = offset;
                                                optimum[cur_pos + len].queue.R[1] = optimum[cur_pos].queue.R[0];
                                                optimum[cur_pos + len].queue.R[2] = optimum[cur_pos].queue.R[1];
                                        }
                                        optimum[cur_pos + len].prev.link = cur_pos;
                                        optimum[cur_pos + len].prev.match_offset = offset;
                                        optimum[cur_pos + len].cost = cost;
                                }
                        } while (++len <= matches[i].len);
                }

                /* Consider coding a repeat offset match.
                 *
                 * As a heuristic, we only consider the longest length of the
                 * longest repeat offset match.  This does not, however,
                 * necessarily mean that we will never consider any other
                 * repeat offsets, because above we detect repeat offset
                 * matches that were found by the regular match-finder.
                 * Therefore, this special handling of the longest
                 * repeat-offset match is only helpful for coding a repeat
                 * offset match that was *not* found by the match-finder,
                 * e.g. due to being obscured by a less distant match that is
                 * at least as long.
                 *
                 * Note: an alternative, used in LZMA, is to consider every
                 * length of every repeat offset match.  This is a more
                 * thorough search, and it makes it unnecessary to detect
                 * repeat offset matches that were found by the regular
                 * match-finder.  But by my tests, for LZX the LZMA method
                 * slows down the compressor by ~10% and doesn't actually help
                 * the compression ratio too much.
                 *
                 * Also tested a compromise approach: consider every 3rd
                 * length of the longest repeat offset match.  Still didn't
                 * seem quite worth it, though.
                 */
                if (longest_rep_len >= LZX_MIN_MATCH_LEN) {

                        while (end_pos < cur_pos + longest_rep_len)
                                optimum[++end_pos].cost = MC_INFINITE_COST;

                        cost = optimum[cur_pos].cost +
                                lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
                                                  &c->costs);
                        /* '<=' prefers the repeat-offset match on cost ties.  */
                        if (cost <= optimum[cur_pos + longest_rep_len].cost) {
                                optimum[cur_pos + longest_rep_len].queue =
                                        optimum[cur_pos].queue;
                                swap(optimum[cur_pos + longest_rep_len].queue.R[0],
                                     optimum[cur_pos + longest_rep_len].queue.R[longest_rep_slot]);
                                optimum[cur_pos + longest_rep_len].prev.link =
                                        cur_pos;
                                optimum[cur_pos + longest_rep_len].prev.match_offset =
                                        optimum[cur_pos + longest_rep_len].queue.R[0];
                                optimum[cur_pos + longest_rep_len].cost =
                                        cost;
                        }
                }
        }
}
+
+static struct lz_match
+lzx_choose_lazy_item(struct lzx_compressor *c)
+{
+ const struct lz_match *matches;
+ struct lz_match cur_match;
+ struct lz_match next_match;
+ u32 num_matches;
+
+ if (c->prev_match.len) {
+ cur_match = c->prev_match;
+ c->prev_match.len = 0;
+ } else {
+ num_matches = lzx_get_matches(c, &matches);
+ if (num_matches == 0 ||
+ (matches[num_matches - 1].len <= 3 &&
+ (matches[num_matches - 1].len <= 2 ||
+ matches[num_matches - 1].offset > 4096)))
+ {
+ return (struct lz_match) { };
+ }
+
+ cur_match = matches[num_matches - 1];
+ }
+
+ if (cur_match.len >= c->params.nice_match_length) {
+ lzx_skip_bytes(c, cur_match.len - 1);
+ return cur_match;
+ }
+
+ num_matches = lzx_get_matches(c, &matches);
+ if (num_matches == 0 ||
+ (matches[num_matches - 1].len <= 3 &&
+ (matches[num_matches - 1].len <= 2 ||
+ matches[num_matches - 1].offset > 4096)))
+ {
+ lzx_skip_bytes(c, cur_match.len - 2);
+ return cur_match;
+ }
+
+ next_match = matches[num_matches - 1];
+
+ if (next_match.len <= cur_match.len) {
+ lzx_skip_bytes(c, cur_match.len - 2);
+ return cur_match;
+ } else {
+ c->prev_match = next_match;
+ return (struct lz_match) { };
+ }
+}
+
+/*
+ * Return the next match or literal to use, delegating to the currently selected
+ * match-choosing algorithm.
+ *
+ * If the length of the returned 'struct lz_match' is less than
+ * LZX_MIN_MATCH_LEN, then it is really a literal.
+ */
+static inline struct lz_match
+lzx_choose_item(struct lzx_compressor *c)
+{
+ return (*c->params.choose_item_func)(c);
+}
+
+/* Set default symbol costs for the LZX Huffman codes. */
+static void
+lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
+{
+ unsigned i;
+
+ /* Main code (part 1): Literal symbols */
+ for (i = 0; i < LZX_NUM_CHARS; i++)
+ costs->main[i] = 8;
+
+ /* Main code (part 2): Match header symbols */
+ for (; i < num_main_syms; i++)
+ costs->main[i] = 10;
+
+ /* Length code */
+ for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
+ costs->len[i] = 8;
+
+ /* Aligned offset code */
+ for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
+ costs->aligned[i] = 3;
+}
+
+/* Given the frequencies of symbols in an LZX-compressed block and the
+ * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
+ * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
+ * will take fewer bits to output. */
+static int
+lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
+ const struct lzx_codes * codes)
+{
+ unsigned aligned_cost = 0;
+ unsigned verbatim_cost = 0;
+
+ /* Verbatim blocks have a constant 3 bits per position footer. Aligned
+ * offset blocks have an aligned offset symbol per position footer, plus
+ * an extra 24 bits per block to output the lengths necessary to
+ * reconstruct the aligned offset code itself. */
+ for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
+ verbatim_cost += 3 * freqs->aligned[i];
+ aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
+ }
+ aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
+ if (aligned_cost < verbatim_cost)
+ return LZX_BLOCKTYPE_ALIGNED;
+ else
+ return LZX_BLOCKTYPE_VERBATIM;
+}
+
/* Find a sequence of matches/literals with which to output the specified LZX
 * block, then set the block's type to that which has the minimum cost to
 * output (either verbatim or aligned).
 *
 * Runs c->params.num_optim_passes passes over the block.  Preliminary passes
 * only gather symbol frequencies to refine the cost model; the final pass
 * additionally records the chosen items into spec->chosen_items.  */
static void
lzx_choose_items_for_block(struct lzx_compressor *c, struct lzx_block_spec *spec)
{
        /* Saved so that each pass starts from the same recent-offset state.  */
        const struct lzx_lru_queue orig_queue = c->queue;
        u32 num_passes_remaining = c->params.num_optim_passes;
        struct lzx_freqs freqs;
        const u8 *window_ptr;
        const u8 *window_end;
        struct lzx_item *next_chosen_item;
        struct lz_match lz_match;
        struct lzx_item lzx_item;

        LZX_ASSERT(num_passes_remaining >= 1);
        LZX_ASSERT(lz_mf_get_position(c->mf) == spec->window_pos);

        c->match_window_end = spec->window_pos + spec->block_size;

        /* Install the match-getting/skipping implementations for the first
         * pass: fill the match cache if more passes will follow, otherwise
         * don't cache at all.  The "singleblock" variants skip end-of-block
         * match truncation when the block spans the whole window.  */
        if (c->params.num_optim_passes > 1) {
                if (spec->block_size == c->cur_window_size)
                        c->get_matches_func = lzx_get_matches_fillcache_singleblock;
                else
                        c->get_matches_func = lzx_get_matches_fillcache_multiblock;
                c->skip_bytes_func = lzx_skip_bytes_fillcache;
        } else {
                if (spec->block_size == c->cur_window_size)
                        c->get_matches_func = lzx_get_matches_nocache_singleblock;
                else
                        c->get_matches_func = lzx_get_matches_nocache_multiblock;
                c->skip_bytes_func = lzx_skip_bytes_nocache;
        }

        /* The first optimal parsing pass is done using the cost model already
         * set in c->costs.  Each later pass is done using a cost model
         * computed from the previous pass.
         *
         * To improve performance we only generate the array containing the
         * matches and literals in intermediate form on the final pass.  */

        /* Runs num_optim_passes - 1 times; the final pass follows below.  */
        while (--num_passes_remaining) {
                c->match_window_pos = spec->window_pos;
                c->cache_ptr = c->cached_matches;
                memset(&freqs, 0, sizeof(freqs));
                window_ptr = &c->cur_window[spec->window_pos];
                window_end = window_ptr + spec->block_size;

                while (window_ptr != window_end) {

                        lz_match = lzx_choose_item(c);

                        LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
                                     lz_match.offset == c->max_window_size -
                                                        LZX_MIN_MATCH_LEN));
                        if (lz_match.len >= LZX_MIN_MATCH_LEN) {
                                lzx_tally_match(lz_match.len, lz_match.offset,
                                                &freqs, &c->queue);
                                window_ptr += lz_match.len;
                        } else {
                                lzx_tally_literal(*window_ptr, &freqs);
                                window_ptr += 1;
                        }
                }
                /* Rebuild the cost model from this pass's frequencies; 15 is
                 * the cost assigned to symbols that never occurred.  */
                lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
                lzx_set_costs(c, &spec->codes.lens, 15);
                c->queue = orig_queue;
                /* Subsequent passes replay matches from the cache.  If the
                 * fill pass ran past cache_limit, the cache is incomplete and
                 * the bounds-checking "usecache" variants must be used.  */
                if (c->cache_ptr <= c->cache_limit) {
                        c->get_matches_func = lzx_get_matches_usecache_nocheck;
                        c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
                } else {
                        c->get_matches_func = lzx_get_matches_usecache;
                        c->skip_bytes_func = lzx_skip_bytes_usecache;
                }
        }

        /* Final pass: same as above, but also record the chosen items.  */
        c->match_window_pos = spec->window_pos;
        c->cache_ptr = c->cached_matches;
        memset(&freqs, 0, sizeof(freqs));
        window_ptr = &c->cur_window[spec->window_pos];
        window_end = window_ptr + spec->block_size;

        spec->chosen_items = &c->chosen_items[spec->window_pos];
        next_chosen_item = spec->chosen_items;

        /* Cost assigned to never-seen symbols when the cost model is updated
         * mid-block below; ramps from 9 up to the maximum of 15.  */
        unsigned unseen_cost = 9;
        while (window_ptr != window_end) {

                lz_match = lzx_choose_item(c);

                LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
                             lz_match.offset == c->max_window_size -
                                                LZX_MIN_MATCH_LEN));
                if (lz_match.len >= LZX_MIN_MATCH_LEN) {
                        lzx_item.data = lzx_tally_match(lz_match.len,
                                                        lz_match.offset,
                                                        &freqs, &c->queue);
                        window_ptr += lz_match.len;
                } else {
                        lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
                        window_ptr += 1;
                }
                *next_chosen_item++ = lzx_item;

                /* When doing one-pass "near-optimal" parsing, update the cost
                 * model occassionally.  */
                if (unlikely((next_chosen_item - spec->chosen_items) % 2048 == 0) &&
                    c->params.choose_item_func == lzx_choose_near_optimal_item &&
                    c->params.num_optim_passes == 1)
                {
                        lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
                        lzx_set_costs(c, &spec->codes.lens, unseen_cost);
                        if (unseen_cost < 15)
                                unseen_cost++;
                }
        }
        spec->num_chosen_items = next_chosen_item - spec->chosen_items;
        lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
        spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
}
+
+/* Prepare the input window into one or more LZX blocks ready to be output. */
+static void
+lzx_prepare_blocks(struct lzx_compressor *c)
+{
+	unsigned block_idx;
+
+	/* Near-optimal parsing requires an initial cost model before any
+	 * symbol statistics have been gathered. */
+	if (c->params.choose_item_func == lzx_choose_near_optimal_item)
+		lzx_set_default_costs(&c->costs, c->num_main_syms);
+
+	/* Lay out the block specifications.
+	 * TODO: The compression ratio could be slightly improved by performing
+	 * data-dependent block splitting instead of using fixed-size blocks.
+	 * Doing so well is a computationally hard problem, however. */
+	c->num_blocks = DIV_ROUND_UP(c->cur_window_size, LZX_DIV_BLOCK_SIZE);
+	for (block_idx = 0; block_idx < c->num_blocks; block_idx++) {
+		struct lzx_block_spec *spec = &c->block_specs[block_idx];
+
+		spec->window_pos = LZX_DIV_BLOCK_SIZE * block_idx;
+		spec->block_size = min(c->cur_window_size - spec->window_pos,
+				       LZX_DIV_BLOCK_SIZE);
+	}
+
+	/* Load the window into the match-finder. */
+	lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
+
+	/* Reset per-window parsing state, then determine the sequence of
+	 * matches/literals to output for each block. */
+	lzx_lru_queue_init(&c->queue);
+	c->optimum_cur_idx = 0;
+	c->optimum_end_idx = 0;
+	c->prev_match.len = 0;
+	for (block_idx = 0; block_idx < c->num_blocks; block_idx++)
+		lzx_choose_items_for_block(c, &c->block_specs[block_idx]);
+}
+
+/* Translate a caller-supplied compression level (and maximum window size)
+ * into the internal LZX compression parameters: the parsing strategy, the
+ * number of optimization passes, the match-finder algorithm, and the match
+ * length/search-depth tuning knobs. */
+static void
+lzx_build_params(unsigned int compression_level,
+		 u32 max_window_size,
+		 struct lzx_compressor_params *lzx_params)
+{
+	if (compression_level >= 25) {
+		/* Slower "near-optimal" parsing with iterative cost modeling. */
+		lzx_params->choose_item_func = lzx_choose_near_optimal_item;
+		lzx_params->num_optim_passes = compression_level / 20;
+		lzx_params->mf_algo =
+			(max_window_size <= 32768 &&
+			 lzx_params->num_optim_passes == 1) ?
+				LZ_MF_HASH_CHAINS : LZ_MF_BINARY_TREES;
+		lzx_params->min_match_length = (compression_level >= 45) ? 2 : 3;
+		lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50,
+						    LZX_MAX_MATCH_LEN);
+		lzx_params->max_search_depth = min(((u64)compression_level * 50) / 50,
+						   LZX_MAX_MATCH_LEN);
+	} else {
+		/* Faster "lazy" parsing. */
+		lzx_params->choose_item_func = lzx_choose_lazy_item;
+		lzx_params->num_optim_passes = 1;
+		lzx_params->mf_algo = (max_window_size <= 262144) ?
+				LZ_MF_HASH_CHAINS : LZ_MF_BINARY_TREES;
+		lzx_params->min_match_length = 3;
+		lzx_params->nice_match_length = 25 + compression_level * 2;
+		lzx_params->max_search_depth = 25 + compression_level;
+	}
+}
+
+/* Fill in a generic match-finder parameter structure from the LZX-specific
+ * compression parameters.  Fields not listed here are zeroed. */
+static void
+lzx_build_mf_params(const struct lzx_compressor_params *lzx_params,
+		    u32 max_window_size, struct lz_mf_params *mf_params)
+{
+	*mf_params = (struct lz_mf_params) {
+		.algorithm        = lzx_params->mf_algo,
+		.max_window_size  = max_window_size,
+		.min_match_len    = lzx_params->min_match_length,
+		.max_match_len    = LZX_MAX_MATCH_LEN,
+		.max_search_depth = lzx_params->max_search_depth,
+		.nice_match_len   = lzx_params->nice_match_length,
+	};
+}
+
+static void
+lzx_free_compressor(void *_c);
+
+/* Return the total memory, in bytes, needed to create an LZX compressor with
+ * the specified maximum window size and compression level, or 0 if the window
+ * size is invalid.  This must account for every allocation made by
+ * lzx_create_compressor(). */
+static u64
+lzx_get_needed_memory(size_t max_window_size, unsigned int compression_level)
+{
+	struct lzx_compressor_params params;
+	u64 size = 0;
+
+	if (!lzx_window_size_valid(max_window_size))
+		return 0;
+
+	/* BUGFIX: '&params' had been corrupted into the mojibake '¶ms'
+	 * (an HTML-entity decoding artifact), which does not compile. */
+	lzx_build_params(compression_level, max_window_size, &params);
+
+	/* Compressor structure itself. */
+	size += sizeof(struct lzx_compressor);
+
+	/* Uncompressed data buffer. */
+	size += max_window_size;
+
+	/* Per-block specifications. */
+	size += DIV_ROUND_UP(max_window_size, LZX_DIV_BLOCK_SIZE) *
+		sizeof(struct lzx_block_spec);
+
+	/* Chosen match/literal items (worst case: one per input byte). */
+	size += max_window_size * sizeof(struct lzx_item);
+
+	/* Match-finder. */
+	size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
+
+	/* Match-choosing position array (near-optimal parsing only). */
+	if (params.choose_item_func == lzx_choose_near_optimal_item) {
+		size += (LZX_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
+			sizeof(struct lzx_mc_pos_data);
+	}
+
+	/* Match cache: the full cache for multi-pass parsing, or just enough
+	 * for one position's matches otherwise. */
+	if (params.num_optim_passes > 1)
+		size += LZX_CACHE_LEN * sizeof(struct lz_match);
+	else
+		size += LZX_MAX_MATCHES_PER_POS * sizeof(struct lz_match);
+	return size;
+}
+
+/* Allocate and initialize an LZX compressor for the specified maximum window
+ * size and compression level.  On success, store it in *c_ret and return 0;
+ * otherwise return a WIMLIB_ERR_* code.  On any allocation failure, all
+ * partially-made allocations are released via lzx_free_compressor(). */
+static int
+lzx_create_compressor(size_t max_window_size, unsigned int compression_level,
+		      void **c_ret)
+{
+	struct lzx_compressor *c;
+	struct lzx_compressor_params params;
+	struct lz_mf_params mf_params;
+
+	if (!lzx_window_size_valid(max_window_size))
+		return WIMLIB_ERR_INVALID_PARAM;
+
+	/* BUGFIX: '&params' had been corrupted into the mojibake '¶ms'
+	 * (an HTML-entity decoding artifact) in the next two calls, which
+	 * does not compile. */
+	lzx_build_params(compression_level, max_window_size, &params);
+	lzx_build_mf_params(&params, max_window_size, &mf_params);
+	if (!lz_mf_params_valid(&mf_params))
+		return WIMLIB_ERR_INVALID_PARAM;
+
+	c = CALLOC(1, sizeof(struct lzx_compressor));
+	if (!c)
+		goto oom;
+
+	c->params = params;
+	c->num_main_syms = lzx_get_num_main_syms(max_window_size);
+	c->max_window_size = max_window_size;
+
+	/* Buffer for the (preprocessed) uncompressed data. */
+	c->cur_window = ALIGNED_MALLOC(max_window_size, 16);
+	if (!c->cur_window)
+		goto oom;
+
+	/* One block specification per LZX_DIV_BLOCK_SIZE-sized chunk. */
+	c->block_specs = MALLOC(DIV_ROUND_UP(max_window_size,
+					     LZX_DIV_BLOCK_SIZE) *
+				sizeof(struct lzx_block_spec));
+	if (!c->block_specs)
+		goto oom;
+
+	/* Chosen match/literal items (worst case: one per input byte). */
+	c->chosen_items = MALLOC(max_window_size * sizeof(struct lzx_item));
+	if (!c->chosen_items)
+		goto oom;
+
+	c->mf = lz_mf_alloc(&mf_params);
+	if (!c->mf)
+		goto oom;
+
+	/* Match-choosing position array (near-optimal parsing only). */
+	if (params.choose_item_func == lzx_choose_near_optimal_item) {
+		c->optimum = MALLOC((LZX_OPTIM_ARRAY_LENGTH +
+				     params.nice_match_length) *
+				    sizeof(struct lzx_mc_pos_data));
+		if (!c->optimum)
+			goto oom;
+	}
+
+	/* Match cache: the full cache for multi-pass parsing (with room left
+	 * before cache_limit for one position's worth of matches plus a
+	 * header entry), or just one position's matches otherwise. */
+	if (params.num_optim_passes > 1) {
+		c->cached_matches = MALLOC(LZX_CACHE_LEN *
+					   sizeof(struct lz_match));
+		if (!c->cached_matches)
+			goto oom;
+		c->cache_limit = c->cached_matches + LZX_CACHE_LEN -
+				 (LZX_MAX_MATCHES_PER_POS + 1);
+	} else {
+		c->cached_matches = MALLOC(LZX_MAX_MATCHES_PER_POS *
+					   sizeof(struct lz_match));
+		if (!c->cached_matches)
+			goto oom;
+	}
+
+	*c_ret = c;
+	return 0;
+
+oom:
+	lzx_free_compressor(c);
+	return WIMLIB_ERR_NOMEM;
+}
+
+/* Compress a buffer of data into the LZX format.  Returns the compressed
+ * size, or 0 if the data was not compressed to fit in the available space. */
+static size_t
+lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
+	     void *compressed_data, size_t compressed_size_avail, void *_c)
+{
+	struct lzx_compressor *c = _c;
+	struct lzx_output_bitstream os;
+
+	/* Don't bother compressing very small inputs. */
+	if (uncompressed_size < 100)
+		return 0;
+
+	/* Copy the input into a private buffer so that the required e8
+	 * preprocessing does not modify the caller's data, then preprocess. */
+	memcpy(c->cur_window, uncompressed_data, uncompressed_size);
+	c->cur_window_size = uncompressed_size;
+	lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size);
+
+	/* Choose the matches/literals making up each block. */
+	lzx_prepare_blocks(c);
+
+	/* Write out all blocks; lzx_flush_output() reports 0 if the output
+	 * buffer overflowed. */
+	lzx_init_output(&os, compressed_data, compressed_size_avail);
+	lzx_write_all_blocks(c, &os);
+	return lzx_flush_output(&os);
+}
+
+/* Free an LZX compressor allocated by lzx_create_compressor(), including all
+ * of its sub-allocations.  A NULL pointer is silently ignored. */
+static void
+lzx_free_compressor(void *_c)
+{
+	struct lzx_compressor *c = _c;
+
+	if (!c)
+		return;
+
+	ALIGNED_FREE(c->cur_window);
+	FREE(c->block_specs);
+	FREE(c->chosen_items);
+	lz_mf_free(c->mf);
+	FREE(c->optimum);
+	FREE(c->cached_matches);
+	FREE(c);
+}
+
+/* Operations vector through which the generic compressor layer drives the
+ * LZX compression format. */
+const struct compressor_ops lzx_compressor_ops = {
+	.get_needed_memory = lzx_get_needed_memory,
+	.create_compressor = lzx_create_compressor,
+	.compress = lzx_compress,
+	.free_compressor = lzx_free_compressor,
+};