+static unsigned
+xpress_get_matches_fillcache(struct xpress_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ if (likely(cache_ptr <= c->cache_limit)) {
+ num_matches = lz_mf_get_matches(c->mf, matches);
+ cache_ptr->len = num_matches;
+ c->cache_ptr = matches + num_matches;
+ } else {
+ num_matches = 0;
+ }
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+xpress_get_matches_usecache(struct xpress_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ if (cache_ptr <= c->cache_limit) {
+ num_matches = cache_ptr->len;
+ c->cache_ptr = matches + num_matches;
+ } else {
+ num_matches = 0;
+ }
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+xpress_get_matches_usecache_nocheck(struct xpress_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ struct lz_match *cache_ptr;
+ struct lz_match *matches;
+ unsigned num_matches;
+
+ cache_ptr = c->cache_ptr;
+ matches = cache_ptr + 1;
+ num_matches = cache_ptr->len;
+ c->cache_ptr = matches + num_matches;
+ *matches_ret = matches;
+ return num_matches;
+}
+
+static unsigned
+xpress_get_matches_noncaching(struct xpress_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ *matches_ret = c->cached_matches;
+ return lz_mf_get_matches(c->mf, c->cached_matches);
+}
+
+/*
+ * Find matches at the next position in the window.
+ *
+ * This uses a wrapper function around the underlying match-finder.
+ *
+ * Returns the number of matches found and sets *matches_ret to point to the
+ * matches array. The matches will be sorted by strictly increasing length and
+ * offset.
+ */
+static inline unsigned
+xpress_get_matches(struct xpress_compressor *c,
+ const struct lz_match **matches_ret)
+{
+ return (*c->get_matches_func)(c, matches_ret);
+}
+
+static void
+xpress_skip_bytes_fillcache(struct xpress_compressor *c, unsigned n)
+{
+ struct lz_match *cache_ptr;
+
+ cache_ptr = c->cache_ptr;
+ lz_mf_skip_positions(c->mf, n);
+ if (cache_ptr <= c->cache_limit) {
+ do {
+ cache_ptr->len = 0;
+ cache_ptr += 1;
+ } while (--n && likely(cache_ptr <= c->cache_limit));
+ }
+ c->cache_ptr = cache_ptr;
+}
+
+static void
+xpress_skip_bytes_usecache(struct xpress_compressor *c, unsigned n)
+{
+ struct lz_match *cache_ptr;
+
+ cache_ptr = c->cache_ptr;
+ if (likely(cache_ptr <= c->cache_limit)) {
+ do {
+ cache_ptr += 1 + cache_ptr->len;
+ } while (--n && likely(cache_ptr <= c->cache_limit));
+ }
+ c->cache_ptr = cache_ptr;
+}
+
+static void
+xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, unsigned n)
+{
+ struct lz_match *cache_ptr;
+
+ cache_ptr = c->cache_ptr;
+ do {
+ cache_ptr += 1 + cache_ptr->len;
+ } while (--n);
+ c->cache_ptr = cache_ptr;
+}
+
+static void
+xpress_skip_bytes_noncaching(struct xpress_compressor *c, unsigned n)
+{
+ lz_mf_skip_positions(c->mf, n);
+}
+
+/*
+ * Skip the specified number of positions in the window (don't search for
+ * matches at them).
+ *
+ * This uses a wrapper function around the underlying match-finder.
+ */
+static inline void
+xpress_skip_bytes(struct xpress_compressor *c, unsigned n)
+{
+ return (*c->skip_bytes_func)(c, n);
+}
+
+/* Set default XPRESS Huffman symbol costs to bootstrap the iterative
+ * optimization algorithm. */
+static void
+xpress_set_default_costs(u8 costs[])
+{
+ unsigned i;
+
+ /* Literal symbols */
+ for (i = 0; i < XPRESS_NUM_CHARS; i++)
+ costs[i] = 8;
+
+ /* Match symbols */
+ for (; i < XPRESS_NUM_SYMBOLS; i++)
+ costs[i] = 10;
+}
+
+/* Copy the Huffman codeword lengths array @lens to the Huffman symbol costs
+ * array @costs, but also assign a default cost to each 0-length (unused)
+ * codeword. */
+static void
+xpress_set_costs(u8 costs[], const u8 lens[])
+{
+ for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++)
+ costs[i] = lens[i] ? lens[i] : XPRESS_MAX_CODEWORD_LEN;
+}
+
+/*
+ * Consider coding each match in @matches.
+ *
+ * @matches must be sorted by strictly increasing length and strictly
+ * increasing offset. This is guaranteed by the match-finder.
+ *
+ * We consider each length from the minimum (3) to the longest
+ * (matches[num_matches - 1].len). For each length, we consider only
+ * the smallest offset for which that length is available. Although
+ * this is not guaranteed to be optimal due to the possibility of a
+ * larger offset costing less than a smaller offset to code, this is a
+ * very useful heuristic.
+ */
+static inline void
+xpress_consider_matches(struct xpress_compressor *c,
+ struct xpress_mc_pos_data *cur_optimum_ptr,
+ const struct lz_match matches[],
+ unsigned num_matches)
+{
+ unsigned i = 0;
+ unsigned len = XPRESS_MIN_MATCH_LEN;
+ u32 cost;
+ u32 position_cost;
+ unsigned offset;
+ unsigned offset_bsr;
+ unsigned adjusted_len;
+ unsigned len_hdr;
+ unsigned sym;
+
+ if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) {
+ /* All lengths are small. Optimize accordingly. */
+ do {
+ offset = matches[i].offset;
+ offset_bsr = bsr32(offset);
+ len_hdr = len - XPRESS_MIN_MATCH_LEN;
+ sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+
+ position_cost = cur_optimum_ptr->cost + offset_bsr;
+ do {
+ cost = position_cost + c->costs[sym];
+ if (cost < (cur_optimum_ptr + len)->cost) {
+ (cur_optimum_ptr + len)->cost = cost;
+ (cur_optimum_ptr + len)->mc_item_data =
+ (offset << MC_OFFSET_SHIFT) | len;
+ }
+ sym++;
+ } while (++len <= matches[i].len);
+ } while (++i != num_matches);
+ } else {
+ /* Some lengths are big. */
+ do {
+ offset = matches[i].offset;
+ offset_bsr = bsr32(offset);
+ position_cost = cur_optimum_ptr->cost + offset_bsr;
+ do {
+ adjusted_len = len - XPRESS_MIN_MATCH_LEN;
+ len_hdr = min(adjusted_len, 0xf);
+ sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr);
+
+ cost = position_cost + c->costs[sym];
+ if (adjusted_len >= 0xf) {
+ cost += 8;
+ if (adjusted_len - 0xf >= 0xff)
+ cost += 16;
+ }
+
+ if (cost < (cur_optimum_ptr + len)->cost) {
+ (cur_optimum_ptr + len)->cost = cost;
+ (cur_optimum_ptr + len)->mc_item_data =
+ (offset << MC_OFFSET_SHIFT) | len;
+ }
+ } while (++len <= matches[i].len);
+ } while (++i != num_matches);
+ }
+}
+
+/*
+ * The main near-optimal parsing routine.
+ *
+ * Briefly, the algorithm does an approximate minimum-cost path search to find a
+ * "near-optimal" sequence of matches and literals to output, based on the
+ * current cost model. The algorithm steps forward, position by position (byte
+ * by byte), and updates the minimum cost path to reach each later position that
+ * can be reached using a match or literal from the current position. This is
+ * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
+ * the graph edges are possible matches/literals to code, and the cost of each
+ * edge is the estimated number of bits that will be required to output the
+ * corresponding match or literal. But one difference is that we actually
+ * compute the lowest-cost path in pieces, where each piece is terminated when
+ * there are no choices to be made.
+ *
+ * If next_chosen_item != NULL, then all items chosen will be recorded (saved in
+ * the chosen_items array). Otherwise, all items chosen will only be tallied
+ * (symbol frequencies tallied in c->freqs).
+ */
+static void
+xpress_optim_pass(struct xpress_compressor *c,
+ struct xpress_item **next_chosen_item)
+{
+ const u8 *window_end;
+ const u8 *window_ptr;
+ struct xpress_mc_pos_data *cur_optimum_ptr;
+ struct xpress_mc_pos_data *end_optimum_ptr;
+ const struct lz_match *matches;
+ unsigned num_matches;
+ unsigned longest_len;
+ unsigned literal;
+ u32 cost;
+
+ window_ptr = c->cur_window;
+ window_end = &c->cur_window[c->cur_window_size];
+
+begin:
+ /* Start building a new list of items, which will correspond to the next
+ * piece of the overall minimum-cost path. */
+
+ if (window_ptr == window_end)
+ return;
+
+ cur_optimum_ptr = c->optimum;
+ cur_optimum_ptr->cost = 0;
+ end_optimum_ptr = cur_optimum_ptr;
+
+ /* The following loop runs once for each per byte in the window, except
+ * in a couple shortcut cases. */
+ for (;;) {
+
+ /* Find matches with the current position. */
+ num_matches = xpress_get_matches(c, &matches);
+
+ if (num_matches) {
+
+ longest_len = matches[num_matches - 1].len;
+
+ /* If there's a very long match, choose it immediately.
+ */
+ if (longest_len >= c->params.nice_match_length) {
+
+ xpress_skip_bytes(c, longest_len - 1);
+ window_ptr += longest_len;
+
+ if (cur_optimum_ptr != c->optimum)
+ xpress_declare_item_list(c, cur_optimum_ptr,
+ next_chosen_item);
+
+ xpress_declare_match(c, longest_len,
+ matches[num_matches - 1].offset,
+ next_chosen_item);
+ goto begin;
+ }
+
+ /* If reaching any positions for the first time,
+ * initialize their costs to "infinity". */
+ while (end_optimum_ptr < cur_optimum_ptr + longest_len)
+ (++end_optimum_ptr)->cost = MC_INFINITE_COST;
+
+ /* Consider coding a match. */
+ xpress_consider_matches(c, cur_optimum_ptr,
+ matches, num_matches);
+ } else {
+ /* No matches found. The only choice at this position
+ * is to code a literal. */
+
+ if (end_optimum_ptr == cur_optimum_ptr) {
+ #if 1
+ /* Optimization for single literals. */
+ if (likely(cur_optimum_ptr == c->optimum)) {
+ xpress_declare_literal(c, *window_ptr++,
+ next_chosen_item);
+ if (window_ptr == window_end)
+ return;
+ continue;
+ }
+ #endif
+ (++end_optimum_ptr)->cost = MC_INFINITE_COST;
+ }
+ }
+
+ /* Consider coding a literal. */
+ literal = *window_ptr++;
+ cost = cur_optimum_ptr->cost + c->costs[literal];
+ if (cost < (cur_optimum_ptr + 1)->cost) {
+ (cur_optimum_ptr + 1)->cost = cost;
+ (cur_optimum_ptr + 1)->mc_item_data =
+ ((u32)literal << MC_OFFSET_SHIFT) | 1;
+ }
+
+ /* Advance to the next position. */
+ cur_optimum_ptr++;
+
+ /*
+ * This loop will terminate when either of the following
+ * conditions is true:
+ *
+ * (1) cur_optimum_ptr == end_optimum_ptr
+ *
+ * There are no paths that extend beyond the current
+ * position. In this case, any path to a later position
+ * must pass through the current position, so we can go
+ * ahead and choose the list of items that led to this
+ * position.
+ *
+ * (2) cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH]
+ *
+ * This bounds the number of times the algorithm can step
+ * forward before it is guaranteed to start choosing items.
+ * This limits the memory usage. But
+ * XPRESS_OPTIM_ARRAY_LENGTH is high enough that on most
+ * inputs this limit is never reached.
+ *
+ * Note: no check for end-of-window is needed because
+ * end-of-window will trigger condition (1).
+ */
+ if (cur_optimum_ptr == end_optimum_ptr ||
+ cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH])
+ break;
+ }
+
+ /* Choose the current list of items that constitute the minimum-cost
+ * path to the current position. */
+ xpress_declare_item_list(c, cur_optimum_ptr, next_chosen_item);
+ goto begin;
+}
+
+/* Near-optimal parsing */
+static u32
+xpress_choose_near_optimal_items(struct xpress_compressor *c)
+{
+ u32 num_passes_remaining = c->params.num_optim_passes;
+ struct xpress_item *next_chosen_item;
+ struct xpress_item **next_chosen_item_ptr;
+
+ /* Choose appropriate match-finder wrapper functions. */
+ if (c->params.num_optim_passes > 1) {
+ c->get_matches_func = xpress_get_matches_fillcache;
+ c->skip_bytes_func = xpress_skip_bytes_fillcache;
+ } else {
+ c->get_matches_func = xpress_get_matches_noncaching;
+ c->skip_bytes_func = xpress_skip_bytes_noncaching;
+ }
+
+ /* The first optimization pass will use a default cost model. Each
+ * additional optimization pass will use a cost model computed from the
+ * previous pass.