+ u32 mc_item_data;
+#define MC_OFFSET_SHIFT 9
+#define MC_LEN_MASK ((1 << MC_OFFSET_SHIFT) - 1)
+
+ /* The state of the LZX recent match offsets queue at this position.
+ * This is filled in lazily, only after the minimum-cost path to this
+ * position is found.
+ *
+ * Note: the way we handle this adaptive state in the "minimum-cost"
+ * parse is actually only an approximation. It's possible for the
+ * globally optimal, minimum cost path to contain a prefix, ending at a
+ * position, where that path prefix is *not* the minimum cost path to
+ * that position. This can happen if such a path prefix results in a
+ * different adaptive state which results in lower costs later. We do
+ * not solve this problem; we only consider the lowest cost to reach
+ * each position, which seems to be an acceptable approximation. */
+ struct lzx_lru_queue queue _aligned_attribute(16);
+
+} _aligned_attribute(16);
+
+/* State of the LZX compressor */
+struct lzx_compressor {
+
+ /* Internal compression parameters */
+ struct lzx_compressor_params params;
+
+ /* The preprocessed buffer of data being compressed */
+ u8 *cur_window;
+
+ /* Number of bytes of data to be compressed, which is the number of
+ * bytes of data in @cur_window that are actually valid. */
+ u32 cur_window_size;
+
+ /* log2 order of the LZX window size for LZ match offset encoding
+ * purposes. Will be >= LZX_MIN_WINDOW_ORDER and <=
+ * LZX_MAX_WINDOW_ORDER.
+ *
+ * Note: 1 << @window_order is normally equal to @max_window_size,
+ * a.k.a. the allocated size of @cur_window, but it will be greater than
+ * @max_window_size in the event that the compressor was created with a
+ * non-power-of-2 block size. (See lzx_get_window_order().) */
+ unsigned window_order;
+
+ /* Number of symbols in the main alphabet. This depends on
+ * @window_order, since @window_order determines the maximum possible
+ * offset. It does not, however, depend on the *actual* size of the
+ * current data buffer being processed, which might be less than 1 <<
+ * @window_order. */
+ unsigned num_main_syms;
+
+ /* Lempel-Ziv match-finder */
+ struct lz_mf *mf;
+
+ /* Match-finder wrapper functions and data for near-optimal parsing.
+ *
+ * When doing more than one match-choosing pass over the data, matches
+ * found by the match-finder are cached to achieve a slight speedup when
+ * the same matches are needed on subsequent passes. This is suboptimal
+ * because different matches may be preferred with different cost
+ * models, but it is a very worthwhile speedup. */
+ unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **);
+ void (*skip_bytes_func)(struct lzx_compressor *, unsigned n);
+ u32 match_window_pos;
+ u32 match_window_end;
+ struct lz_match *cached_matches;
+ struct lz_match *cache_ptr;
+ struct lz_match *cache_limit;
+
+ /* Position data for near-optimal parsing. */
+ struct lzx_mc_pos_data optimum[LZX_OPTIM_ARRAY_LENGTH + LZX_MAX_MATCH_LEN];
+
+ /* The cost model currently being used for near-optimal parsing. */
+ struct lzx_costs costs;
+
+ /* The current match offset LRU queue. */
+ struct lzx_lru_queue queue;
+
+ /* Frequency counters for the current block. */
+ struct lzx_freqs freqs;
+
+ /* The Huffman codes for the current and previous blocks. */
+ struct lzx_codes codes[2];
+
+ /* Which 'struct lzx_codes' is being used for the current block. The
+ * other was used for the previous block (if this isn't the first
+ * block). */
+ unsigned int codes_index;
+
+ /* Dummy lengths that are always 0. */
+ struct lzx_lens zero_lens;
+
+ /* Matches/literals that were chosen for the current block. */
+ struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE];
+
+ /* Table mapping match offset => offset slot for small offsets */
+#define LZX_NUM_FAST_OFFSETS 32768
+ u8 offset_slot_fast[LZX_NUM_FAST_OFFSETS];
+};
+
+/*
+ * Structure to keep track of the current state of sending bits to the
+ * compressed output buffer.
+ *
+ * The LZX bitstream is encoded as a sequence of 16-bit coding units.
+ */
+struct lzx_output_bitstream {
+
+ /* Bits that haven't yet been written to the output buffer. The
+ * oldest (first-written) bits occupy the higher-order positions;
+ * see lzx_write_varbits(). */
+ u32 bitbuf;
+
+ /* Number of bits currently held in @bitbuf. At most 15 between
+ * calls to lzx_write_varbits(). */
+ u32 bitcount;
+
+ /* Pointer to the start of the output buffer. */
+ le16 *start;
+
+ /* Pointer to the position in the output buffer at which the next coding
+ * unit should be written. */
+ le16 *next;
+
+ /* Pointer past the end of the output buffer. Once @next reaches this,
+ * further coding units are silently dropped, and lzx_flush_output()
+ * reports the overflow by returning 0. */
+ le16 *end;
+};
+
+/*
+ * Initialize the output bitstream.
+ *
+ * @os
+ *	The output bitstream structure to initialize.
+ * @buffer
+ *	The buffer being written to.
+ * @size
+ *	Size of @buffer, in bytes.
+ */
+static void
+lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
+{
+	/* No bits are buffered yet. */
+	os->bitcount = 0;
+	os->bitbuf = 0;
+
+	/* The buffer is consumed in whole 16-bit coding units, so any odd
+	 * trailing byte is unusable. */
+	os->start = buffer;
+	os->next = os->start;
+	os->end = os->start + size / sizeof(le16);
+}
+
+/*
+ * Write some bits to the output bitstream.
+ *
+ * The bits are given by the low-order @num_bits bits of @bits. Higher-order
+ * bits in @bits cannot be set. At most 17 bits can be written at once.
+ *
+ * @max_num_bits is a compile-time constant that specifies the maximum number of
+ * bits that can ever be written at the call site. Currently, it is used to
+ * optimize away the conditional code for writing a second 16-bit coding unit
+ * when writing fewer than 17 bits.
+ *
+ * If the output buffer space is exhausted, then the bits will be silently
+ * dropped, and lzx_flush_output() will return 0 when it gets called.
+ */
+static inline void
+lzx_write_varbits(struct lzx_output_bitstream *os,
+ const u32 bits, const unsigned int num_bits,
+ const unsigned int max_num_bits)
+{
+ /* This code is optimized for LZX, which never needs to write more than
+ * 17 bits at once. */
+ LZX_ASSERT(num_bits <= 17);
+ LZX_ASSERT(num_bits <= max_num_bits);
+ LZX_ASSERT(os->bitcount <= 15);
+
+ /* Add the bits to the bit buffer variable. @bitcount will be at most
+ * 15, so there will be just enough space for the maximum possible
+ * @num_bits of 17 (15 + 17 = 32 = the width of @bitbuf). */
+ os->bitcount += num_bits;
+ os->bitbuf = (os->bitbuf << num_bits) | bits;
+
+ /* Check whether any coding units need to be written. */
+ if (os->bitcount >= 16) {
+
+ os->bitcount -= 16;
+
+ /* Write a coding unit, unless it would overflow the buffer. */
+ if (os->next != os->end)
+ *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount);
+
+ /* If writing 17 bits, a second coding unit might need to be
+ * written. @bitcount == 16 here can only happen when 17 bits
+ * were just added to a buffer already holding 15 bits
+ * (15 + 17 - 16 = 16). Because 'max_num_bits' is a
+ * compile-time constant, the compiler will optimize away this
+ * code at most call sites. */
+ if (max_num_bits == 17 && os->bitcount == 16) {
+ if (os->next != os->end)
+ *os->next++ = cpu_to_le16(os->bitbuf);
+ os->bitcount = 0;
+ }
+ }
+}
+
+/* Use when @num_bits is a compile-time constant. Otherwise use
+ * lzx_write_varbits(). */
+static inline void
+lzx_write_bits(struct lzx_output_bitstream *os,
+ const u32 bits, const unsigned int num_bits)
+{
+ /* Passing @num_bits as @max_num_bits lets the compiler prune the
+ * second-coding-unit path in lzx_write_varbits() whenever
+ * @num_bits < 17. */
+ lzx_write_varbits(os, bits, num_bits, num_bits);
+}
+
+/*
+ * Flush the last coding unit to the output buffer if needed.
+ *
+ * Returns the total number of bytes written to the output buffer, or 0 if
+ * the buffer space was exhausted at some point (overflow).
+ */
+static u32
+lzx_flush_output(struct lzx_output_bitstream *os)
+{
+	/* A completely full buffer means at least one earlier coding unit
+	 * was dropped by lzx_write_varbits(). */
+	if (os->end == os->next)
+		return 0;
+
+	/* Pad any remaining bits with zeroes on the right to form a whole
+	 * 16-bit coding unit. */
+	if (os->bitcount > 0)
+		*os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
+
+	return (const u8 *)os->next - (const u8 *)os->start;
+}
+
+/* Build the main, length, and aligned offset Huffman codes used in LZX.
+ *
+ * Input: the symbol frequency tables gathered for each code. Output: a set
+ * of canonical Huffman codes — codeword lengths and codewords indexed by
+ * symbol — stored into @codes. */
+static void
+lzx_make_huffman_codes(const struct lzx_freqs *freqs, struct lzx_codes *codes,
+		       unsigned num_main_syms)
+{
+	make_canonical_huffman_code(num_main_syms, LZX_MAX_MAIN_CODEWORD_LEN,
+				    freqs->main, codes->lens.main,
+				    codes->codewords.main);
+
+	make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
+				    LZX_MAX_LEN_CODEWORD_LEN, freqs->len,
+				    codes->lens.len, codes->codewords.len);
+
+	make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
+				    LZX_MAX_ALIGNED_CODEWORD_LEN,
+				    freqs->aligned, codes->lens.aligned,
+				    codes->codewords.aligned);
+}
+
+/*
+ * Compute the precode "items" with which the codeword lengths @lens will be
+ * encoded, as deltas from @prev_lens with run-length encoding.
+ *
+ * Each item packs a precode symbol into its low 5 bits (the caller extracts
+ * it with '& 0x1F'). For symbols 17 and 18 (runs of zeroes), the extra
+ * repeat-count bits are stored starting at bit 5. For symbol 19 (a run of
+ * identical nonzero lengths), bit 5 holds a single extra bit and the
+ * length-delta symbol starts at bit 6.
+ *
+ * @precode_freqs is incremented for each precode symbol used; the caller must
+ * zero-initialize it beforehand (see lzx_write_compressed_code()).
+ *
+ * Returns the number of items written to @precode_items.
+ */
+static unsigned
+lzx_compute_precode_items(const u8 lens[restrict],
+ const u8 prev_lens[restrict],
+ const unsigned num_lens,
+ u32 precode_freqs[restrict],
+ unsigned precode_items[restrict])
+{
+ unsigned *itemptr;
+ unsigned run_start;
+ unsigned run_end;
+ unsigned extra_bits;
+ int delta;
+ u8 len;
+
+ itemptr = precode_items;
+ run_start = 0;
+ do {
+ /* Find the next run of codeword lengths. */
+
+ /* len = the length being repeated */
+ len = lens[run_start];
+
+ run_end = run_start + 1;
+
+ /* Fast case for a single length: output the delta from the
+ * previous block's length, wrapped into [0, 16] (i.e. the
+ * delta is taken modulo 17). */
+ if (likely(run_end == num_lens || len != lens[run_end])) {
+ delta = prev_lens[run_start] - len;
+ if (delta < 0)
+ delta += 17;
+ precode_freqs[delta]++;
+ *itemptr++ = delta;
+ run_start++;
+ continue;
+ }
+
+ /* Extend the run. */
+ do {
+ run_end++;
+ } while (run_end != num_lens && len == lens[run_end]);
+
+ if (len == 0) {
+ /* Run of zeroes. */
+
+ /* Symbol 18: RLE 20 to 51 zeroes at a time. */
+ while ((run_end - run_start) >= 20) {
+ extra_bits = min((run_end - run_start) - 20, 0x1f);
+ precode_freqs[18]++;
+ *itemptr++ = 18 | (extra_bits << 5);
+ run_start += 20 + extra_bits;
+ }
+
+ /* Symbol 17: RLE 4 to 19 zeroes at a time. */
+ if ((run_end - run_start) >= 4) {
+ extra_bits = min((run_end - run_start) - 4, 0xf);
+ precode_freqs[17]++;
+ *itemptr++ = 17 | (extra_bits << 5);
+ run_start += 4 + extra_bits;
+ }
+ } else {
+
+ /* A run of nonzero lengths. */
+
+ /* Symbol 19: RLE 4 to 5 of any length at a time.
+ * The repeated length itself is encoded as a mod-17
+ * delta from the previous block, stored at bit 6. */
+ while ((run_end - run_start) >= 4) {
+ extra_bits = (run_end - run_start) > 4;
+ delta = prev_lens[run_start] - len;
+ if (delta < 0)
+ delta += 17;
+ precode_freqs[19]++;
+ precode_freqs[delta]++;
+ *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
+ run_start += 4 + extra_bits;
+ }
+ }
+
+ /* Output any remaining lengths without RLE (fewer than 4
+ * left in the run). */
+ while (run_start != run_end) {
+ delta = prev_lens[run_start] - len;
+ if (delta < 0)
+ delta += 17;
+ precode_freqs[delta]++;
+ *itemptr++ = delta;
+ run_start++;
+ }
+ } while (run_start != num_lens);
+
+ return itemptr - precode_items;
+}
+
+/*
+ * Output a Huffman code in the compressed form used in LZX.
+ *
+ * The Huffman code is represented in the output as a logical series of codeword
+ * lengths from which the Huffman code, which must be in canonical form, can be
+ * reconstructed.
+ *
+ * The codeword lengths are themselves compressed using a separate Huffman code,
+ * the "precode", which contains a symbol for each possible codeword length in
+ * the larger code as well as several special symbols to represent repeated
+ * codeword lengths (a form of run-length encoding). The precode is itself
+ * constructed in canonical form, and its codeword lengths are represented
+ * literally in 20 4-bit fields that immediately precede the compressed codeword
+ * lengths of the larger code.
+ *
+ * Furthermore, the codeword lengths of the larger code are actually represented
+ * as deltas from the codeword lengths of the corresponding code in the previous
+ * block.
+ *
+ * @os:
+ * Bitstream to which to write the compressed Huffman code.
+ * @lens:
+ * The codeword lengths, indexed by symbol, in the Huffman code.
+ * @prev_lens:
+ * The codeword lengths, indexed by symbol, in the corresponding Huffman
+ * code in the previous block, or all zeroes if this is the first block.
+ * @num_lens:
+ * The number of symbols in the Huffman code.
+ */
+static void
+lzx_write_compressed_code(struct lzx_output_bitstream *os,
+ const u8 lens[restrict],
+ const u8 prev_lens[restrict],
+ unsigned num_lens)
+{
+ u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
+ u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
+ u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
+ unsigned precode_items[num_lens];
+ unsigned num_precode_items;
+ unsigned precode_item;
+ unsigned precode_sym;
+ unsigned i;
+
+ for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
+ precode_freqs[i] = 0;
+
+ /* Compute the "items" (RLE / literal tokens and extra bits) with which
+ * the codeword lengths in the larger code will be output. */
+ num_precode_items = lzx_compute_precode_items(lens,
+ prev_lens,
+ num_lens,
+ precode_freqs,
+ precode_items);
+
+ /* Build the precode. */
+ make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
+ LZX_MAX_PRE_CODEWORD_LEN,
+ precode_freqs, precode_lens,
+ precode_codewords);
+
+ /* Output the lengths of the codewords in the precode. */
+ for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
+ lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
+
+ /* Output the encoded lengths of the codewords in the larger code. */
+ for (i = 0; i < num_precode_items; i++) {
+ precode_item = precode_items[i];
+ precode_sym = precode_item & 0x1F;
+ lzx_write_varbits(os, precode_codewords[precode_sym],
+ precode_lens[precode_sym],
+ LZX_MAX_PRE_CODEWORD_LEN);
+ if (precode_sym >= 17) {
+ if (precode_sym == 17) {
+ lzx_write_bits(os, precode_item >> 5, 4);
+ } else if (precode_sym == 18) {
+ lzx_write_bits(os, precode_item >> 5, 5);
+ } else {
+ lzx_write_bits(os, (precode_item >> 5) & 1, 1);
+ precode_sym = precode_item >> 6;
+ lzx_write_varbits(os, precode_codewords[precode_sym],
+ precode_lens[precode_sym],
+ LZX_MAX_PRE_CODEWORD_LEN);
+ }
+ }