4 * A compressor for the LZX compression format, as used in WIM archives.
8 * Copyright (C) 2012-2017 Eric Biggers
10 * This file is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU Lesser General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option) any later version.
15 * This file is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this file; if not, see http://www.gnu.org/licenses/.
26 * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
27 * compression format, as used in the WIM (Windows IMaging) file format.
29 * Two different LZX-compatible algorithms are implemented: "near-optimal" and
30 * "lazy". "Near-optimal" is significantly slower than "lazy", but results in a
31 * better compression ratio. The "near-optimal" algorithm is used at the
32 * default compression level.
34 * This file may need some slight modifications to be used outside of the WIM
35 * format. In particular, in other situations the LZX block header might be
36 * slightly different, and sliding window support might be required.
38 * LZX is a compression format closely related to DEFLATE, the format used by zlib and
39 * gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain
40 * details are quite similar, such as the method for storing Huffman codes.
41 * However, the main differences are:
43 * - LZX preprocesses the data in an attempt to make x86 machine code slightly
44 * more compressible before the main compression step.
46 * - LZX uses a "main" alphabet which combines literals and matches, with the
47 * match symbols containing a "length header" (giving all or part of the match
48 * length) and an "offset slot" (giving, roughly speaking, the order of
49 * magnitude of the match offset).
51 * - LZX does not have static Huffman blocks (that is, the kind with preset
52 * Huffman codes); however it does have two types of dynamic Huffman blocks
53 * ("verbatim" and "aligned").
55 * - LZX has a minimum match length of 2 rather than 3. Length 2 matches can be
56 * useful, but generally only if the compressor is smart about choosing them.
58 * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
59 * of match offsets. This is very useful for certain types of files, such as
60 * binary files that have repeating records.
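 *
 * To illustrate the last point with a small, purely illustrative example: in a
 * file of fixed-size records where a field repeats every 24 bytes, the first
 * such match is coded with an explicit offset of 24, which is then pushed onto
 * the LRU queue. Subsequent matches at offset 24 can be coded using offset
 * slot 0 (a repeat offset), which is normally much cheaper than re-sending the
 * explicit offset.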
63 /******************************************************************************/
64 /* General parameters */
65 /*----------------------------------------------------------------------------*/
68 * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL. It
69 * uses the slower algorithm at levels > MAX_FAST_LEVEL.
71 #define MAX_FAST_LEVEL 34
74 * The compressor-side limits on the codeword lengths (in bits) for each Huffman
75 * code. To make outputting bits slightly faster, some of these limits are
76 * lower than the limits defined by the LZX format. This does not significantly
77 * affect the compression ratio.
79 #define MAIN_CODEWORD_LIMIT 16
80 #define LENGTH_CODEWORD_LIMIT 12
81 #define ALIGNED_CODEWORD_LIMIT 7
82 #define PRE_CODEWORD_LIMIT 7
85 /******************************************************************************/
86 /* Block splitting parameters */
87 /*----------------------------------------------------------------------------*/
90 * The compressor always outputs blocks of at least this size in bytes, except
91 * for the last block which may need to be smaller.
93 #define MIN_BLOCK_SIZE 6500
96 * The compressor attempts to end a block when it reaches this size in bytes.
97 * The final size might be slightly larger due to matches extending beyond the
98 * end of the block. Specifically:
100 * - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
101 * bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'.
103 * - The lazy compressor may choose a sequence of literals starting at position
104 * 'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better
105 * matches. The final match may be up to LZX_MAX_MATCH_LEN bytes. The
106 * length of the literal sequence is approximately limited by the "nice match length" parameter.
109 #define SOFT_MAX_BLOCK_SIZE 100000
112 * The number of observed items (matches and literals) that represents
113 * sufficient data for the compressor to decide whether the current block should be ended.
116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 400
119 /******************************************************************************/
120 /* Parameters for slower algorithm */
121 /*----------------------------------------------------------------------------*/
124 * The log base 2 of the number of entries in the hash table for finding length
125 * 2 matches. This could be as high as 16, but using a smaller hash table
126 * speeds up compression due to reduced cache pressure.
128 #define BT_MATCHFINDER_HASH2_ORDER 12
131 * The number of lz_match structures in the match cache, excluding the extra
132 * "overflow" entries. This value should be high enough so that nearly the
133 * time, all matches found in a given block can fit in the match cache.
134 * However, fallback behavior (immediately terminating the block) on cache
135 * overflow is still required.
137 #define CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5)
140 * An upper bound on the number of matches that can ever be saved in the match
141 * cache for a single position. Since each match we save for a single position
142 * has a distinct length, we can use the number of possible match lengths in LZX
143 * as this bound. This bound is guaranteed to be valid in all cases, although
144 * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be reached.
147 #define MAX_MATCHES_PER_POS LZX_NUM_LENS
150 * A scaling factor that makes it possible to consider fractional bit costs. A
151 * single bit has a cost of BIT_COST.
153 * Note: this is only useful as a statistical trick for when the true costs are
154 * unknown. Ultimately, each token in LZX requires a whole number of bits to output.
160 * Should the compressor take into account the costs of aligned offset symbols
161 * instead of assuming that all are equally likely?
163 #define CONSIDER_ALIGNED_COSTS 1
166 * Should the "minimum" cost path search algorithm consider "gap" matches, where
167 * a normal match is followed by a literal, then by a match with the same
168 * offset? This is one specific, somewhat common situation in which the true
169 * minimum cost path is often different from the path found by looking only one step ahead.
172 #define CONSIDER_GAP_MATCHES 1
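/*
 * A purely illustrative example of a "gap match": if the data contains
 * "size=100;" at some earlier position and "size=200;" at the current
 * position, the cheapest encoding is often a match covering "size=", a
 * literal for the changed digit, and then another match covering "00;" at
 * the same offset as the first match (i.e. a rep0 match). A parser that
 * looks only one step ahead tends to miss this pattern.
 */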
174 /******************************************************************************/
176 /*----------------------------------------------------------------------------*/
182 #include "wimlib/compress_common.h"
183 #include "wimlib/compressor_ops.h"
184 #include "wimlib/error.h"
185 #include "wimlib/lz_extend.h"
186 #include "wimlib/lzx_common.h"
187 #include "wimlib/unaligned.h"
188 #include "wimlib/util.h"
190 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including
191 * bt_matchfinder.h. */
193 /* Matchfinders with 16-bit positions */
195 #define MF_SUFFIX _16
196 #include "wimlib/bt_matchfinder.h"
197 #include "wimlib/hc_matchfinder.h"
199 /* Matchfinders with 32-bit positions */
203 #define MF_SUFFIX _32
204 #include "wimlib/bt_matchfinder.h"
205 #include "wimlib/hc_matchfinder.h"
207 /******************************************************************************/
208 /* Compressor structure */
209 /*----------------------------------------------------------------------------*/
211 /* Codewords for the Huffman codes */
212 struct lzx_codewords {
213 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
214 u32 len[LZX_LENCODE_NUM_SYMBOLS];
215 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
219 * Codeword lengths, in bits, for the Huffman codes.
221 * A codeword length of 0 means the corresponding codeword has zero frequency.
223 * The main and length codes each have one extra entry for use as a sentinel.
224 * See lzx_write_compressed_code().
227 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
228 u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
229 u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
232 /* Codewords and lengths for the Huffman codes */
234 struct lzx_codewords codewords;
235 struct lzx_lens lens;
238 /* Symbol frequency counters for the Huffman-encoded alphabets */
240 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
241 u32 len[LZX_LENCODE_NUM_SYMBOLS];
242 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
245 /* Block split statistics. See the "Block splitting algorithm" section later in
246 * this file for details. */
247 #define NUM_LITERAL_OBSERVATION_TYPES 8
248 #define NUM_MATCH_OBSERVATION_TYPES 2
249 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
250 NUM_MATCH_OBSERVATION_TYPES)
251 struct lzx_block_split_stats {
252 u32 new_observations[NUM_OBSERVATION_TYPES];
253 u32 observations[NUM_OBSERVATION_TYPES];
254 u32 num_new_observations;
255 u32 num_observations;
259 * Represents a run of literals followed by a match or end-of-block. This
260 * structure is needed to temporarily store items chosen by the compressor,
261 * since items cannot be written until all items for the block have been chosen
262 * and the block's Huffman codes have been computed.
264 struct lzx_sequence {
267 * Bits 9..31: the number of literals in this run. This may be 0 and
268 * can be at most about SOFT_MAX_BLOCK_SIZE. The literals are not
269 * stored explicitly in this structure; instead, they are read directly
270 * from the uncompressed data.
272 * Bits 0..8: the length of the match which follows the literals, or 0
273 * if this literal run was the last in the block, so there is no match
274 * which follows it. This can be at most LZX_MAX_MATCH_LEN.
276 u32 litrunlen_and_matchlen;
277 #define SEQ_MATCHLEN_BITS 9
278 #define SEQ_MATCHLEN_MASK (((u32)1 << SEQ_MATCHLEN_BITS) - 1)
281 * If 'matchlen' doesn't indicate end-of-block, then this contains:
283 * Bits 10..31: either the offset plus LZX_OFFSET_ADJUSTMENT or a recent
284 * offset code, depending on the offset slot encoded in the main symbol.
286 * Bits 0..9: the main symbol.
288 u32 adjusted_offset_and_mainsym;
289 #define SEQ_MAINSYM_BITS 10
290 #define SEQ_MAINSYM_MASK (((u32)1 << SEQ_MAINSYM_BITS) - 1)
291 } _aligned_attribute(8);
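/*
 * Purely illustrative sketch (hypothetical helper, not used anywhere): how the
 * two packed fields described above would be unpacked. The same shifts and
 * masks are applied in lzx_write_sequences().
 */
static forceinline void
lzx_sequence_unpack_example(const struct lzx_sequence *seq,
			    u32 *litrunlen, unsigned *matchlen,
			    u32 *adjusted_offset, unsigned *mainsym)
{
	*litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
	*matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
	*adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
	*mainsym = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
}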
294 * This structure represents a byte position in the input buffer and a node in
295 * the graph of possible match/literal choices.
297 * Logically, each incoming edge to this node is labeled with a literal or a
298 * match that can be taken to reach this position from an earlier position; and
299 * each outgoing edge from this node is labeled with a literal or a match that
300 * can be taken to advance from this position to a later position.
302 struct lzx_optimum_node {
304 /* The cost, in bits, of the lowest-cost path that has been found to
305 * reach this position. This can change as progressively lower cost
306 * paths are found to reach this position. */
310 * The best arrival to this node, i.e. the match or literal that was
311 * used to arrive to this position at the given 'cost'. This can change
312 * as progressively lower cost paths are found to reach this position.
314 * For non-gap matches, this variable is divided into two bitfields
315 * whose meanings depend on the item type:
318 * Low bits are 0, high bits are the literal.
320 * Explicit offset matches:
321 * Low bits are the match length, high bits are the offset plus
322 * LZX_OFFSET_ADJUSTMENT.
324 * Repeat offset matches:
325 * Low bits are the match length, high bits are the queue index.
327 * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
328 * behavior applies --- see the code.
331 #define OPTIMUM_OFFSET_SHIFT SEQ_MATCHLEN_BITS
332 #define OPTIMUM_LEN_MASK SEQ_MATCHLEN_MASK
333 #if CONSIDER_GAP_MATCHES
334 # define OPTIMUM_GAP_MATCH 0x80000000
337 } _aligned_attribute(8);
339 /* The cost model for near-optimal parsing */
343 * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a
344 * length 'len' match which has an offset belonging to 'offset_slot'.
345 * The cost includes the main symbol, the length symbol if required, and
346 * the extra offset bits if any, excluding any entropy-coded bits
347 * (aligned offset bits). It does *not* include the cost of the aligned
348 * offset symbol which may be required.
350 u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
352 /* Cost of each symbol in the main code */
353 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
355 /* Cost of each symbol in the length code */
356 u32 len[LZX_LENCODE_NUM_SYMBOLS];
358 #if CONSIDER_ALIGNED_COSTS
359 /* Cost of each symbol in the aligned offset code */
360 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
364 struct lzx_output_bitstream;
366 /* The main LZX compressor structure */
367 struct lzx_compressor {
369 /* The buffer for preprocessed input data, if not using destructive compression */
373 /* If true, then the compressor need not preserve the input buffer if it
374 * compresses the data successfully */
377 /* Pointer to the compress() implementation chosen at allocation time */
378 void (*impl)(struct lzx_compressor *, const u8 *, size_t,
379 struct lzx_output_bitstream *);
381 /* The log base 2 of the window size for match offset encoding purposes.
382 * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
383 unsigned window_order;
385 /* The number of symbols in the main alphabet. This depends on the
386 * window order, since the window order determines the maximum possible match offset.
388 unsigned num_main_syms;
390 /* The "nice" match length: if a match of this length is found, then it
391 * is chosen immediately without further consideration. */
392 unsigned nice_match_length;
394 /* The maximum search depth: at most this many potential matches are
395 * considered at each position. */
396 unsigned max_search_depth;
398 /* The number of optimization passes per block */
399 unsigned num_optim_passes;
401 /* The symbol frequency counters for the current block */
402 struct lzx_freqs freqs;
404 /* Block split statistics for the current block */
405 struct lzx_block_split_stats split_stats;
407 /* The Huffman codes for the current and previous blocks. The one with
408 * index 'codes_index' is for the current block, and the other one is
409 * for the previous block. */
410 struct lzx_codes codes[2];
411 unsigned codes_index;
413 /* The matches and literals that the compressor has chosen for the
414 * current block. The required length of this array is limited by the
415 * maximum number of matches that can ever be chosen for a single block,
416 * plus one for the special entry at the end. */
417 struct lzx_sequence chosen_sequences[
418 DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
420 /* Tables for mapping adjusted offsets to offset slots */
421 u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
422 u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
425 /* Data for lzx_compress_lazy() */
427 /* Hash chains matchfinder (MUST BE LAST!!!) */
429 struct hc_matchfinder_16 hc_mf_16;
430 struct hc_matchfinder_32 hc_mf_32;
434 /* Data for lzx_compress_near_optimal() */
437 * Array of nodes, one per position, for running the
438 * minimum-cost path algorithm.
440 * This array must be large enough to accommodate the
441 * worst-case number of nodes, which occurs if the
442 * compressor finds a match of length LZX_MAX_MATCH_LEN
443 * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
444 * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
445 * LZX_MAX_MATCH_LEN'. Add one for the end-of-block node.
448 struct lzx_optimum_node optimum_nodes[
449 SOFT_MAX_BLOCK_SIZE - 1 +
450 LZX_MAX_MATCH_LEN + 1];
452 /* The cost model for the current optimization pass */
453 struct lzx_costs costs;
456 * Cached matches for the current block. This array
457 * contains the matches that were found at each position
458 * in the block. Specifically, for each position, there
459 * is a special 'struct lz_match' whose 'length' field
460 * contains the number of matches that were found at
461 * that position; this is followed by the matches
462 * themselves, if any, sorted by strictly increasing length.
465 * Note: in rare cases, there will be a very high number
466 * of matches in the block and this array will overflow.
467 * If this happens, we force the end of the current
468 * block. CACHE_LENGTH is the length at which we
469 * actually check for overflow. The extra slots beyond
470 * this are enough to absorb the worst case overflow,
471 * which occurs if starting at &match_cache[CACHE_LENGTH
472 * - 1], we write the match count header, then write
473 * MAX_MATCHES_PER_POS matches, then skip searching for
474 * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
475 * write the match count header for each.
477 struct lz_match match_cache[CACHE_LENGTH +
478 MAX_MATCHES_PER_POS +
479 LZX_MAX_MATCH_LEN - 1];
481 /* Binary trees matchfinder (MUST BE LAST!!!) */
483 struct bt_matchfinder_16 bt_mf_16;
484 struct bt_matchfinder_32 bt_mf_32;
490 /******************************************************************************/
491 /* Matchfinder utilities */
492 /*----------------------------------------------------------------------------*/
495 * Will a matchfinder using 16-bit positions be sufficient for compressing
496 * buffers of up to the specified size? The limit could be 65536 bytes, but we
497 * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
498 * This requires that the limit be no more than the length of offset_slot_tab_1 (32768).
501 static forceinline bool
502 lzx_is_16_bit(size_t max_bufsize)
504 STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
505 return max_bufsize <= 32768;
509 * Return the offset slot for the specified adjusted match offset.
511 static forceinline unsigned
512 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
515 if (__builtin_constant_p(adjusted_offset) &&
516 adjusted_offset < LZX_NUM_RECENT_OFFSETS)
517 return adjusted_offset;
518 if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
519 return c->offset_slot_tab_1[adjusted_offset];
520 return c->offset_slot_tab_2[adjusted_offset >> 14];
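/*
 * Example of the lookups above (illustrative): an adjusted offset of 5 is
 * resolved through offset_slot_tab_1[5], while an adjusted offset of 100000
 * is resolved through offset_slot_tab_2[100000 >> 14] = offset_slot_tab_2[6].
 * The '>> 14' indexing works because, as best understood from the format, for
 * adjusted offsets >= 32768 each offset slot covers a range that is a
 * multiple of 2^14 in size and aligned to a 2^14 boundary, so the high bits
 * alone determine the slot.
 */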
524 * For a match that has the specified length and adjusted offset, tally its main
525 * symbol, and if needed its length symbol; then return its main symbol.
527 static forceinline unsigned
528 lzx_tally_main_and_lensyms(struct lzx_compressor *c, unsigned length,
529 u32 adjusted_offset, bool is_16_bit)
533 if (length >= LZX_MIN_SECONDARY_LEN) {
534 /* Length symbol needed */
535 c->freqs.len[length - LZX_MIN_SECONDARY_LEN]++;
536 mainsym = LZX_NUM_CHARS + LZX_NUM_PRIMARY_LENS;
538 /* No length symbol needed */
539 mainsym = LZX_NUM_CHARS + length - LZX_MIN_MATCH_LEN;
542 mainsym += LZX_NUM_LEN_HEADERS *
543 lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
544 c->freqs.main[mainsym]++;
549 * The following macros call either the 16-bit or the 32-bit version of a
550 * matchfinder function based on the value of 'is_16_bit', which will be known
551 * at compilation time.
554 #define CALL_HC_MF(is_16_bit, c, funcname, ...) \
555 ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
556 CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__))
558 #define CALL_BT_MF(is_16_bit, c, funcname, ...) \
559 ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
560 CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__))
562 /******************************************************************************/
563 /* Output bitstream */
564 /*----------------------------------------------------------------------------*/
567 * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
568 * units. Bits are ordered from most significant to least significant within each coding unit.
573 * Structure to keep track of the current state of sending bits to the
574 * compressed output buffer.
576 struct lzx_output_bitstream {
578 /* Bits that haven't yet been written to the output buffer */
579 machine_word_t bitbuf;
581 /* Number of bits currently held in @bitbuf */
582 machine_word_t bitcount;
584 /* Pointer to the start of the output buffer */
587 /* Pointer to the position in the output buffer at which the next coding
588 * unit should be written */
591 /* Pointer to just past the end of the output buffer, rounded down by
592 * one byte if needed to make 'end - start' a multiple of 2 */
596 /* Can the specified number of bits always be added to 'bitbuf' after all
597 * pending 16-bit coding units have been flushed? */
598 #define CAN_BUFFER(n) ((n) <= WORDBITS - 15)
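/*
 * Worked example of the layout described above (illustrative numbers): if the
 * first value written is the 3-bit value 0b101 and the next is the 13-bit
 * value 0b0000000000001, the first 16-bit coding unit is 0b1010000000000001 =
 * 0xA001, which is stored in the output as the bytes 0x01 0xA0 (little
 * endian).
 */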
600 /* Initialize the output bitstream to write to the specified buffer. */
602 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
608 os->end = (u8 *)buffer + (size & ~1);
612 * Add some bits to the bitbuffer variable of the output bitstream. The caller
613 * must make sure there is enough room.
615 static forceinline void
616 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
618 os->bitbuf = (os->bitbuf << num_bits) | bits;
619 os->bitcount += num_bits;
623 * Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits'
624 * specifies the maximum number of bits that may have been added since the last flush.
627 static forceinline void
628 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
630 /* Masking the number of bits to shift is only needed to avoid undefined
631 * behavior; we don't actually care about the results of bad shifts. On
632 * x86, the explicit masking generates no extra code. */
633 const u32 shift_mask = WORDBITS - 1;
635 if (os->end - os->next < 6)
637 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
638 shift_mask), os->next + 0);
639 if (max_num_bits > 16)
640 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
641 shift_mask), os->next + 2);
642 if (max_num_bits > 32)
643 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
644 shift_mask), os->next + 4);
645 os->next += (os->bitcount >> 4) << 1;
649 /* Add at most 16 bits to the bitbuffer and flush it. */
650 static forceinline void
651 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
653 lzx_add_bits(os, bits, num_bits);
654 lzx_flush_bits(os, 16);
658 * Flush the last coding unit to the output buffer if needed. Return the total
659 * number of bytes written to the output buffer, or 0 if an overflow occurred.
662 lzx_flush_output(struct lzx_output_bitstream *os)
664 if (os->end - os->next < 6)
667 if (os->bitcount != 0) {
668 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
672 return os->next - os->start;
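/*
 * Sketch of the intended usage of the bitstream routines (hypothetical
 * caller; the real callers are the lzx_write_*() functions below):
 *
 *	struct lzx_output_bitstream os;
 *
 *	lzx_init_output(&os, out_buf, out_buf_size);
 *	lzx_write_bits(&os, value, num_bits);	(repeated as needed)
 *	...
 *	out_nbytes = lzx_flush_output(&os);	(0 means the buffer overflowed)
 */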
675 /******************************************************************************/
676 /* Preparing Huffman codes */
677 /*----------------------------------------------------------------------------*/
680 * Build the Huffman codes. This takes as input the frequency tables for each
681 * code and produces as output a set of tables that map symbols to codewords and codeword lengths.
685 lzx_build_huffman_codes(struct lzx_compressor *c)
687 const struct lzx_freqs *freqs = &c->freqs;
688 struct lzx_codes *codes = &c->codes[c->codes_index];
690 STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
691 MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
692 make_canonical_huffman_code(c->num_main_syms,
696 codes->codewords.main);
698 STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
699 LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
700 make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
701 LENGTH_CODEWORD_LIMIT,
704 codes->codewords.len);
706 STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
707 ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
708 make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
709 ALIGNED_CODEWORD_LIMIT,
712 codes->codewords.aligned);
715 /* Reset the symbol frequencies for the current block. */
717 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
719 memset(&c->freqs, 0, sizeof(c->freqs));
723 lzx_compute_precode_items(const u8 lens[restrict],
724 const u8 prev_lens[restrict],
725 u32 precode_freqs[restrict],
726 unsigned precode_items[restrict])
735 itemptr = precode_items;
738 while (!((len = lens[run_start]) & 0x80)) {
740 /* len = the length being repeated */
742 /* Find the next run of codeword lengths. */
744 run_end = run_start + 1;
746 /* Fast case for a single length. */
747 if (likely(len != lens[run_end])) {
748 delta = prev_lens[run_start] - len;
751 precode_freqs[delta]++;
757 /* Extend the run. */
760 } while (len == lens[run_end]);
765 /* Symbol 18: RLE 20 to 51 zeroes at a time. */
766 while ((run_end - run_start) >= 20) {
767 extra_bits = min((run_end - run_start) - 20, 0x1F);
769 *itemptr++ = 18 | (extra_bits << 5);
770 run_start += 20 + extra_bits;
773 /* Symbol 17: RLE 4 to 19 zeroes at a time. */
774 if ((run_end - run_start) >= 4) {
775 extra_bits = min((run_end - run_start) - 4, 0xF);
777 *itemptr++ = 17 | (extra_bits << 5);
778 run_start += 4 + extra_bits;
782 /* A run of nonzero lengths. */
784 /* Symbol 19: RLE 4 to 5 of any length at a time. */
785 while ((run_end - run_start) >= 4) {
786 extra_bits = (run_end - run_start) > 4;
787 delta = prev_lens[run_start] - len;
791 precode_freqs[delta]++;
792 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
793 run_start += 4 + extra_bits;
797 /* Output any remaining lengths without RLE. */
798 while (run_start != run_end) {
799 delta = prev_lens[run_start] - len;
802 precode_freqs[delta]++;
808 return itemptr - precode_items;
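/*
 * Worked example (illustrative only): if the previous lengths were all 0 and
 * the current lengths begin with twenty-five 0's followed by the lengths 5
 * and 6, the run of twenty-five 0's becomes a single item with precode symbol
 * 18 and extra_bits = 25 - 20 = 5, while the lengths 5 and 6, not being part
 * of a long enough run, are each emitted as an individual delta-encoded item.
 */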
811 /******************************************************************************/
812 /* Outputting compressed data */
813 /*----------------------------------------------------------------------------*/
816 * Output a Huffman code in the compressed form used in LZX.
818 * The Huffman code is represented in the output as a logical series of codeword
819 * lengths from which the Huffman code, which must be in canonical form, can be reconstructed.
822 * The codeword lengths are themselves compressed using a separate Huffman code,
823 * the "precode", which contains a symbol for each possible codeword length in
824 * the larger code as well as several special symbols to represent repeated
825 * codeword lengths (a form of run-length encoding). The precode is itself
826 * constructed in canonical form, and its codeword lengths are represented
827 * literally in 20 4-bit fields that immediately precede the compressed codeword
828 * lengths of the larger code.
830 * Furthermore, the codeword lengths of the larger code are actually represented
831 * as deltas from the codeword lengths of the corresponding code in the previous block.
835 * Bitstream to which to write the compressed Huffman code.
837 * The codeword lengths, indexed by symbol, in the Huffman code.
839 * The codeword lengths, indexed by symbol, in the corresponding Huffman
840 * code in the previous block, or all zeroes if this is the first block.
842 * The number of symbols in the Huffman code.
845 lzx_write_compressed_code(struct lzx_output_bitstream *os,
846 const u8 lens[restrict],
847 const u8 prev_lens[restrict],
850 u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
851 u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
852 u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
853 unsigned precode_items[num_lens];
854 unsigned num_precode_items;
855 unsigned precode_item;
856 unsigned precode_sym;
858 u8 saved = lens[num_lens];
859 *(u8 *)(lens + num_lens) = 0x80;
861 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
862 precode_freqs[i] = 0;
864 /* Compute the "items" (RLE / literal tokens and extra bits) with which
865 * the codeword lengths in the larger code will be output. */
866 num_precode_items = lzx_compute_precode_items(lens,
871 /* Build the precode. */
872 STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
873 PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
874 make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT,
875 precode_freqs, precode_lens,
878 /* Output the lengths of the codewords in the precode. */
879 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
880 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
882 /* Output the encoded lengths of the codewords in the larger code. */
883 for (i = 0; i < num_precode_items; i++) {
884 precode_item = precode_items[i];
885 precode_sym = precode_item & 0x1F;
886 lzx_add_bits(os, precode_codewords[precode_sym],
887 precode_lens[precode_sym]);
888 if (precode_sym >= 17) {
889 if (precode_sym == 17) {
890 lzx_add_bits(os, precode_item >> 5, 4);
891 } else if (precode_sym == 18) {
892 lzx_add_bits(os, precode_item >> 5, 5);
894 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
895 precode_sym = precode_item >> 6;
896 lzx_add_bits(os, precode_codewords[precode_sym],
897 precode_lens[precode_sym]);
900 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
901 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
904 *(u8 *)(lens + num_lens) = saved;
908 * Write all matches and literal bytes (which were precomputed) in an LZX
909 * compressed block to the output bitstream in the final compressed representation.
913 * The output bitstream.
915 * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
916 * LZX_BLOCKTYPE_VERBATIM).
918 * The uncompressed data of the block.
920 * The matches and literals to output, given as a series of sequences.
922 * The main, length, and aligned offset Huffman codes for the block.
925 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
926 const u8 *block_data, const struct lzx_sequence sequences[],
927 const struct lzx_codes *codes)
929 const struct lzx_sequence *seq = sequences;
930 unsigned min_aligned_offset_slot;
932 if (block_type == LZX_BLOCKTYPE_ALIGNED)
933 min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
935 min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
938 /* Output the next sequence. */
940 u32 litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
941 unsigned matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
942 STATIC_ASSERT((u32)~SEQ_MATCHLEN_MASK >> SEQ_MATCHLEN_BITS >=
943 SOFT_MAX_BLOCK_SIZE);
945 unsigned main_symbol;
946 unsigned offset_slot;
947 unsigned num_extra_bits;
950 /* Output the literal run of the sequence. */
952 if (litrunlen) { /* Is the literal run nonempty? */
954 /* Verify optimization is enabled on 64-bit */
955 STATIC_ASSERT(WORDBITS < 64 ||
956 CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
958 if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
960 /* 64-bit: write 3 literals at a time. */
961 while (litrunlen >= 3) {
962 unsigned lit0 = block_data[0];
963 unsigned lit1 = block_data[1];
964 unsigned lit2 = block_data[2];
965 lzx_add_bits(os, codes->codewords.main[lit0],
966 codes->lens.main[lit0]);
967 lzx_add_bits(os, codes->codewords.main[lit1],
968 codes->lens.main[lit1]);
969 lzx_add_bits(os, codes->codewords.main[lit2],
970 codes->lens.main[lit2]);
971 lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
976 unsigned lit = *block_data++;
977 lzx_add_bits(os, codes->codewords.main[lit],
978 codes->lens.main[lit]);
980 unsigned lit = *block_data++;
981 lzx_add_bits(os, codes->codewords.main[lit],
982 codes->lens.main[lit]);
983 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
985 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
989 /* 32-bit: write 1 literal at a time. */
991 unsigned lit = *block_data++;
992 lzx_add_bits(os, codes->codewords.main[lit],
993 codes->lens.main[lit]);
994 lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
995 } while (--litrunlen);
999 /* Was this the last literal run? */
1003 /* Nope; output the match. */
1005 block_data += matchlen;
1007 adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
1008 main_symbol = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
1010 offset_slot = (main_symbol - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;
1011 num_extra_bits = lzx_extra_offset_bits[offset_slot];
1012 extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
1013 LZX_OFFSET_ADJUSTMENT);
1015 #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT + \
1016 LENGTH_CODEWORD_LIMIT + \
1017 LZX_MAX_NUM_EXTRA_BITS - \
1018 LZX_NUM_ALIGNED_OFFSET_BITS + \
1019 ALIGNED_CODEWORD_LIMIT)
1021 /* Verify optimization is enabled on 64-bit */
1022 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
1024 /* Output the main symbol for the match. */
1026 lzx_add_bits(os, codes->codewords.main[main_symbol],
1027 codes->lens.main[main_symbol]);
1028 if (!CAN_BUFFER(MAX_MATCH_BITS))
1029 lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
1031 /* If needed, output the length symbol for the match. */
1033 if (matchlen >= LZX_MIN_SECONDARY_LEN) {
1034 lzx_add_bits(os, codes->codewords.len[matchlen -
1035 LZX_MIN_SECONDARY_LEN],
1036 codes->lens.len[matchlen -
1037 LZX_MIN_SECONDARY_LEN]);
1038 if (!CAN_BUFFER(MAX_MATCH_BITS))
1039 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
1042 /* Output the extra offset bits for the match. In aligned
1043 * offset blocks, the lowest 3 bits of the adjusted offset are
1044 * Huffman-encoded using the aligned offset code, provided that
1045 * there are at least 3 extra offset bits required. All other
1046 * extra offset bits are output verbatim. */
1048 if (offset_slot >= min_aligned_offset_slot) {
1050 lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1051 num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1052 if (!CAN_BUFFER(MAX_MATCH_BITS))
1053 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS -
1054 LZX_NUM_ALIGNED_OFFSET_BITS);
1056 lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1057 LZX_ALIGNED_OFFSET_BITMASK],
1058 codes->lens.aligned[adjusted_offset &
1059 LZX_ALIGNED_OFFSET_BITMASK]);
1060 if (!CAN_BUFFER(MAX_MATCH_BITS))
1061 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1063 STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS));
1065 lzx_add_bits(os, extra_bits, num_extra_bits);
1066 if (!CAN_BUFFER(MAX_MATCH_BITS))
1067 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS);
1070 if (CAN_BUFFER(MAX_MATCH_BITS))
1071 lzx_flush_bits(os, MAX_MATCH_BITS);
1073 /* Advance to the next sequence. */
1079 lzx_write_compressed_block(const u8 *block_begin,
1082 unsigned window_order,
1083 unsigned num_main_syms,
1084 const struct lzx_sequence sequences[],
1085 const struct lzx_codes * codes,
1086 const struct lzx_lens * prev_lens,
1087 struct lzx_output_bitstream * os)
1089 /* The first three bits indicate the type of block and are one of the
1090 * LZX_BLOCKTYPE_* constants. */
1091 lzx_write_bits(os, block_type, 3);
1094 * Output the block size.
1096 * The original LZX format encoded the block size in 24 bits. However,
1097 * the LZX format used in WIM archives uses 1 bit to specify whether the
1098 * block has the default size of 32768 bytes, then optionally 16 bits to
1099 * specify a non-default size. This works fine for Microsoft's WIM
1100 * software (WIMGAPI), which never compresses more than 32768 bytes at a
1101 * time with LZX. However, as an extension, our LZX compressor supports
1102 * compressing up to 2097152 bytes, with a corresponding increase in
1103 * window size. It is possible for blocks in these larger buffers to
1104 * exceed 65535 bytes; such blocks cannot have their size represented in 16 bits.
1107 * The chosen solution was to use 24 bits for the block size when
1108 * possibly required --- specifically, when the compressor has been
1109 * allocated to be capable of compressing more than 32768 bytes at once
1110 * (which also causes the number of main symbols to be increased).
1112 if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1113 lzx_write_bits(os, 1, 1);
1115 lzx_write_bits(os, 0, 1);
1117 if (window_order >= 16)
1118 lzx_write_bits(os, block_size >> 16, 8);
1120 lzx_write_bits(os, block_size & 0xFFFF, 16);
1123 /* If it's an aligned offset block, output the aligned offset code. */
1124 if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1125 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1126 lzx_write_bits(os, codes->lens.aligned[i],
1127 LZX_ALIGNEDCODE_ELEMENT_SIZE);
1131 /* Output the main code (two parts). */
1132 lzx_write_compressed_code(os, codes->lens.main,
1135 lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1136 prev_lens->main + LZX_NUM_CHARS,
1137 num_main_syms - LZX_NUM_CHARS);
1139 /* Output the length code. */
1140 lzx_write_compressed_code(os, codes->lens.len,
1142 LZX_LENCODE_NUM_SYMBOLS);
1144 /* Output the compressed matches and literals. */
1145 lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1149 * Given the frequencies of symbols in an LZX-compressed block and the
1150 * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1151 * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1152 * will take fewer bits to output.
1155 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1156 const struct lzx_codes * codes)
1158 u32 verbatim_cost = 0;
1159 u32 aligned_cost = 0;
1161 /* A verbatim block requires 3 bits in each place that an aligned offset
1162 * symbol would be used in an aligned offset block. */
1163 for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1164 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1165 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1168 /* Account for the cost of sending the codeword lengths of the aligned offset code. */
1170 aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1171 LZX_ALIGNEDCODE_NUM_SYMBOLS;
1173 if (aligned_cost < verbatim_cost)
1174 return LZX_BLOCKTYPE_ALIGNED;
1176 return LZX_BLOCKTYPE_VERBATIM;
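/*
 * Numerical illustration (hypothetical figures): if a block contains 1000
 * aligned offset symbols and the aligned offset code gives them an average
 * codeword length of 2.5 bits, the aligned encoding costs about 2500 bits
 * plus the small fixed cost of sending the 8 aligned codeword lengths,
 * versus 3000 bits for the verbatim encoding, so ALIGNED would be chosen.
 */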
1180 * Flush an LZX block:
1182 * 1. Build the Huffman codes.
1183 * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1184 * 3. Write the block.
1185 * 4. Swap the indices of the current and previous Huffman codes.
1187 * Note: we never output UNCOMPRESSED blocks. This probably should be
1188 * implemented sometime, but it doesn't make much difference.
1191 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1192 const u8 *block_begin, u32 block_size, u32 seq_idx)
1196 lzx_build_huffman_codes(c);
1198 block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1199 &c->codes[c->codes_index]);
1200 lzx_write_compressed_block(block_begin,
1205 &c->chosen_sequences[seq_idx],
1206 &c->codes[c->codes_index],
1207 &c->codes[c->codes_index ^ 1].lens,
1209 c->codes_index ^= 1;
1212 /******************************************************************************/
1213 /* Block splitting algorithm */
1214 /*----------------------------------------------------------------------------*/
1217 * The problem of block splitting is to decide when it is worthwhile to start a
1218 * new block with new entropy codes. There is a theoretically optimal solution:
1219 * recursively consider every possible block split, considering the exact cost
1220 * of each block, and choose the minimum cost approach. But this is far too
1221 * slow. Instead, as an approximation, we can count symbols and after every N
1222 * symbols, compare the expected distribution of symbols based on the previous
1223 * data with the actual distribution. If they differ "by enough", then start a new block.
1226 * As an optimization and heuristic, we don't distinguish between every symbol
1227 * but rather we combine many symbols into a single "observation type". For
1228 * literals we only look at the high bits and low bits, and for matches we only
1229 * look at whether the match is long or not. The assumption is that for typical
1230 * "real" data, places that are good block boundaries will tend to be noticeable
1231 * based only on changes in these aggregate frequencies, without looking for
1232 * subtle differences in individual symbols. For example, a change from ASCII
1233 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1234 * to many matches (generally more compressible), would be easily noticed based
1235 * on the aggregates.
1237 * For determining whether the frequency distributions are "different enough" to
1238 * start a new block, the simple heuristic of splitting when the sum of absolute
1239 * differences exceeds a constant seems to be good enough.
1241 * Finally, for an approximation, it is not strictly necessary that the exact
1242 * symbols being used are considered. With "near-optimal parsing", for example,
1243 * the actual symbols that will be used are unknown until after the block
1244 * boundary is chosen and the block has been optimized. Since the final choices
1245 * cannot be used, we can use preliminary "greedy" choices instead.
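/*
 * Worked example of the scaled comparison used below (illustrative numbers):
 * comparing the rate new_observations[i] / num_new_observations against
 * observations[i] / num_observations directly would require a division, so
 * both sides are cross-multiplied instead. With observations[i] = 60 out of
 * num_observations = 1200 (5%) and new_observations[i] = 40 out of
 * num_new_observations = 400 (10%), the code compares 40 * 1200 = 48000
 * against 60 * 400 = 24000, contributing a delta of 24000 in the scaled
 * units.
 */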
1248 /* Initialize the block split statistics when starting a new block. */
1250 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1252 memset(stats, 0, sizeof(*stats));
1255 /* Literal observation. Heuristic: use the top 2 bits and the low bit of the
1256 * literal, for 8 possible literal observation types. */
1257 static forceinline void
1258 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1260 stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1261 stats->num_new_observations++;
1264 /* Match observation. Heuristic: use one observation type for "short match" and
1265 * one observation type for "long match". */
1266 static forceinline void
1267 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1269 stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1270 stats->num_new_observations++;
1274 lzx_should_end_block(struct lzx_block_split_stats *stats)
1276 if (stats->num_observations > 0) {
1278 /* Note: to avoid slow divisions, we do not divide by
1279 * 'num_observations', but rather do all math with the numbers
1280 * multiplied by 'num_observations'. */
1281 u32 total_delta = 0;
1282 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1283 u32 expected = stats->observations[i] *
1284 stats->num_new_observations;
1285 u32 actual = stats->new_observations[i] *
1286 stats->num_observations;
1287 u32 delta = (actual > expected) ? actual - expected :
1289 total_delta += delta;
1292 /* Ready to end the block? */
1294 stats->num_new_observations * 7 / 8 * stats->num_observations)
1298 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1299 stats->num_observations += stats->new_observations[i];
1300 stats->observations[i] += stats->new_observations[i];
1301 stats->new_observations[i] = 0;
1303 stats->num_new_observations = 0;
1307 /******************************************************************************/
1308 /* Slower ("near-optimal") compression algorithm */
1309 /*----------------------------------------------------------------------------*/
1312 * Least-recently-used queue for match offsets.
1314 * This is represented as a 64-bit integer for efficiency. There are three
1315 * offsets of 21 bits each. Bit 64 is garbage.
1317 struct lzx_lru_queue {
1319 } _aligned_attribute(8);
1321 #define LZX_QUEUE_OFFSET_SHIFT 21
1322 #define LZX_QUEUE_OFFSET_MASK (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1324 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1325 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1326 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1328 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1329 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1330 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1332 #define LZX_QUEUE_INITIALIZER { \
1333 ((u64)1 << LZX_QUEUE_R0_SHIFT) | \
1334 ((u64)1 << LZX_QUEUE_R1_SHIFT) | \
1335 ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1337 static forceinline u64
1338 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1340 return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1343 static forceinline u64
1344 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1346 return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1349 static forceinline u64
1350 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1352 return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1355 /* Push a match offset onto the front (most recently used) end of the queue. */
1356 static forceinline struct lzx_lru_queue
1357 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1359 return (struct lzx_lru_queue) {
1360 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1364 /* Swap a match offset to the front of the queue. */
1365 static forceinline struct lzx_lru_queue
1366 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1368 unsigned shift = idx * 21;
1369 const u64 mask = LZX_QUEUE_R0_MASK;
1370 const u64 mask_high = mask << shift;
1372 return (struct lzx_lru_queue) {
1373 (queue.R & ~(mask | mask_high)) |
1374 ((queue.R & mask_high) >> shift) |
1375 ((queue.R & mask) << shift)
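/*
 * Illustrative example of the queue operations above: starting from the
 * initial state (R0, R1, R2) = (1, 1, 1), lzx_lru_queue_push(queue, 300)
 * produces (300, 1, 1); the old R2 simply falls off the end. Applying
 * lzx_lru_queue_swap(queue, 2) to that result exchanges R0 and R2, giving
 * (1, 1, 300).
 */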
1379 static forceinline u32
1380 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1383 struct lzx_sequence *seq =
1384 &c->chosen_sequences[ARRAY_LEN(c->chosen_sequences) - 1];
1385 u32 node_idx = block_size;
1386 u32 litrun_end; /* if record=true: end of the current literal run */
1389 /* The last sequence has matchlen 0 */
1390 seq->litrunlen_and_matchlen = 0;
1391 litrun_end = node_idx;
1397 u32 adjusted_offset;
1400 /* Tally literals until either a match or the beginning of the
1401 * block is reached. Note: the item in the node at the
1402 * beginning of the block (c->optimum_nodes[0]) has all bits
1403 * set, causing this loop to end when it is reached. */
1405 item = c->optimum_nodes[node_idx].item;
1406 if (item & OPTIMUM_LEN_MASK)
1408 c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1412 #if CONSIDER_GAP_MATCHES
1413 if (item & OPTIMUM_GAP_MATCH) {
1416 /* Tally/record the rep0 match after the gap. */
1417 matchlen = item & OPTIMUM_LEN_MASK;
1418 mainsym = lzx_tally_main_and_lensyms(c, matchlen, 0,
1421 seq->litrunlen_and_matchlen |=
1422 (litrun_end - node_idx) <<
1425 seq->litrunlen_and_matchlen = matchlen;
1426 seq->adjusted_offset_and_mainsym = mainsym;
1427 litrun_end = node_idx - matchlen;
1430 /* Tally the literal in the gap. */
1431 c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1433 /* Fall through and tally the match before the gap.
1434 * (It was temporarily saved in the 'cost' field of the
1435 * previous node, which was free to reuse.) */
1436 item = c->optimum_nodes[--node_idx].cost;
1437 node_idx -= matchlen;
1439 #else /* CONSIDER_GAP_MATCHES */
1442 #endif /* !CONSIDER_GAP_MATCHES */
1444 /* Tally/record a match. */
1445 matchlen = item & OPTIMUM_LEN_MASK;
1446 adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
1447 mainsym = lzx_tally_main_and_lensyms(c, matchlen,
1450 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET +
1451 LZX_OFFSET_ADJUSTMENT)
1452 c->freqs.aligned[adjusted_offset &
1453 LZX_ALIGNED_OFFSET_BITMASK]++;
1455 seq->litrunlen_and_matchlen |=
1456 (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
1458 seq->litrunlen_and_matchlen = matchlen;
1459 seq->adjusted_offset_and_mainsym =
1460 (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
1461 litrun_end = node_idx - matchlen;
1463 node_idx -= matchlen;
1466 /* Record the literal run length for the first sequence. */
1468 seq->litrunlen_and_matchlen |=
1469 (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
1472 /* Return the index in chosen_sequences at which the sequences begin. */
1473 return seq - &c->chosen_sequences[0];
1477 * Given the minimum-cost path computed through the item graph for the current
1478 * block, walk the path and count how many of each symbol in each Huffman-coded
1479 * alphabet would be required to output the items (matches and literals) along
1482 * Note that the path will be walked backwards (from the end of the block to the
1483 * beginning of the block), but this doesn't matter because this function only
1484 * computes frequencies.
1486 static forceinline void
1487 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1489 lzx_walk_item_list(c, block_size, is_16_bit, false);
1493 * Like lzx_tally_item_list(), but this function also generates the list of
1494 * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1495 * ready to be output to the bitstream after the Huffman codes are computed.
1496 * The lzx_sequences will be written to decreasing memory addresses as the path
1497 * is walked backwards, which means they will end up in the expected
1498 * first-to-last order. The return value is the index in c->chosen_sequences at
1499 * which the lzx_sequences begin.
1501 static forceinline u32
1502 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1504 return lzx_walk_item_list(c, block_size, is_16_bit, true);
1508 * Find an inexpensive path through the graph of possible match/literal choices
1509 * for the current block. The nodes of the graph are
1510 * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in
1511 * the current block, plus one extra node for end-of-block. The edges of the
1512 * graph are matches and literals. The goal is to find the minimum cost path
1513 * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1516 * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1517 * proceeding forwards one node at a time. At each node, a selection of matches
1518 * (len >= 2), as well as the literal byte (len = 1), is considered. An item of
1519 * length 'len' provides a new path to reach the node 'len' bytes later. If
1520 * such a path is the lowest cost found so far to reach that later node, then
1521 * that later node is updated with the new cost and the "arrival" which provided
1524 * Note that although this algorithm is based on minimum cost path search, due
1525 * to various simplifying assumptions the result is not guaranteed to be the
1526 * true minimum cost, or "optimal", path over the graph of all valid LZX
1527 * representations of this block.
1529 * Also, note that because of the presence of the recent offsets queue (which is
1530 * a type of adaptive state), the algorithm cannot work backwards and compute
1531 * "cost to end" instead of "cost to beginning". Furthermore, the way the
1532 * algorithm handles this adaptive state in the "minimum cost" parse is actually
1533 * only an approximation. It's possible for the globally optimal, minimum cost
1534 * path to contain a prefix, ending at a position, where that path prefix is
1535 * *not* the minimum cost path to that position. This can happen if such a path
1536 * prefix results in a different adaptive state which results in lower costs
1537 * later. The algorithm does not solve this problem in general; it only looks
1538 * one step ahead, with the exception of special consideration for "gap matches".
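 *
 * Schematically, the forward pass described above performs (illustrative
 * pseudo-code, not the actual implementation):
 *
 *	for each position i in the block, in order:
 *		for each candidate item (literal or match) of length 'len' at i:
 *			cost = optimum_nodes[i].cost + cost of the item;
 *			if cost improves on optimum_nodes[i + len].cost:
 *				record 'cost' and the item ("arrival") in
 *				optimum_nodes[i + len];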
1541 static forceinline struct lzx_lru_queue
1542 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1543 const u8 * const restrict block_begin,
1544 const u32 block_size,
1545 const struct lzx_lru_queue initial_queue,
1548 struct lzx_optimum_node *cur_node = c->optimum_nodes;
1549 struct lzx_optimum_node * const end_node = cur_node + block_size;
1550 struct lz_match *cache_ptr = c->match_cache;
1551 const u8 *in_next = block_begin;
1552 const u8 * const block_end = block_begin + block_size;
1555 * Instead of storing the match offset LRU queues in the
1556 * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1557 * storing them in a smaller array. This works because the algorithm
1558 * only requires a limited history of the adaptive state. Once a given
1559 * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1560 * (more if gap match consideration is enabled; we just round up to 512
1561 * so it's a power of 2), it is no longer needed.
1563 * The QUEUE() macro finds the queue for the given node. This macro has
1564 * been optimized by taking advantage of 'struct lzx_lru_queue' and
1565 * 'struct lzx_optimum_node' both being 8 bytes in size and alignment.
1567 struct lzx_lru_queue queues[512];
1568 STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1569 STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1570 #define QUEUE(node) \
1571 (*(struct lzx_lru_queue *)((char *)queues + \
1572 ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0])))))
1573 /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/
1575 #if CONSIDER_GAP_MATCHES
1576 u32 matches_before_gap[ARRAY_LEN(queues)];
1577 #define MATCH_BEFORE_GAP(node) \
1578 (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1579 ARRAY_LEN(matches_before_gap)])
1583 * Initially, the cost to reach each node is "infinity".
1585 * The first node actually should have cost 0, but "infinity"
1586 * (0xFFFFFFFF) works just as well because it immediately overflows.
1588 * The following statement also intentionally sets the 'item' of the
1589 * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
1590 * use as a sentinel. See lzx_walk_item_list().
1592 memset(c->optimum_nodes, 0xFF,
1593 (block_size + 1) * sizeof(c->optimum_nodes[0]));
1595 /* Initialize the recent offsets queue for the first node. */
1596 QUEUE(cur_node) = initial_queue;
1598 do { /* For each node in the block in position order... */
1600 unsigned num_matches;
1605 * A selection of matches for the block was already saved in
1606 * memory so that we don't have to run the uncompressed data
1607 * through the matchfinder on every optimization pass. However,
1608 * we still search for repeat offset matches during each
1609 * optimization pass because we cannot predict the state of the
1610 * recent offsets queue. But as a heuristic, we don't bother
1611 * searching for repeat offset matches if the general-purpose
1612 * matchfinder failed to find any matches.
1614 * Note that a match of length n at some offset implies there is
1615 * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1616 * that same offset. In other words, we don't necessarily need
1617 * to use the full length of a match. The key heuristic that
1618 * saves a significant amount of time is that for each
1619 * distinct length, we only consider the smallest offset for
1620 * which that length is available. This heuristic also applies
1621 * to repeat offsets, which we order specially: R0 < R1 < R2 <
1622 * any explicit offset. Of course, this heuristic may produce
1623 * suboptimal results because offset slots in LZX are subject to
1624 * entropy encoding, but in practice this is a useful approximation.
1628 num_matches = cache_ptr->length;
1632 struct lz_match *end_matches = cache_ptr + num_matches;
1633 unsigned next_len = LZX_MIN_MATCH_LEN;
1634 unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1637 /* Consider rep0 matches. */
1638 matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1639 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1641 STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1643 u32 cost = cur_node->cost +
1644 c->costs.match_cost[0][
1645 next_len - LZX_MIN_MATCH_LEN];
1646 if (cost <= (cur_node + next_len)->cost) {
1647 (cur_node + next_len)->cost = cost;
1648 (cur_node + next_len)->item =
1649 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1651 if (unlikely(++next_len > max_len)) {
1652 cache_ptr = end_matches;
1655 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1659 /* Consider rep1 matches. */
1660 matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1661 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1663 if (matchptr[next_len - 1] != in_next[next_len - 1])
1665 for (unsigned len = 2; len < next_len - 1; len++)
1666 if (matchptr[len] != in_next[len])
1669 u32 cost = cur_node->cost +
1670 c->costs.match_cost[1][
1671 next_len - LZX_MIN_MATCH_LEN];
1672 if (cost <= (cur_node + next_len)->cost) {
1673 (cur_node + next_len)->cost = cost;
1674 (cur_node + next_len)->item =
1675 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1677 if (unlikely(++next_len > max_len)) {
1678 cache_ptr = end_matches;
1681 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1685 /* Consider rep2 matches. */
1686 matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1687 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1689 if (matchptr[next_len - 1] != in_next[next_len - 1])
1691 for (unsigned len = 2; len < next_len - 1; len++)
1692 if (matchptr[len] != in_next[len])
1695 u32 cost = cur_node->cost +
1696 c->costs.match_cost[2][
1697 next_len - LZX_MIN_MATCH_LEN];
1698 if (cost <= (cur_node + next_len)->cost) {
1699 (cur_node + next_len)->cost = cost;
1700 (cur_node + next_len)->item =
1701 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1703 if (unlikely(++next_len > max_len)) {
1704 cache_ptr = end_matches;
1707 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1711 while (next_len > cache_ptr->length)
1712 if (++cache_ptr == end_matches)
1715 /* Consider explicit offset matches. */
1717 u32 offset = cache_ptr->offset;
1718 u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
1719 unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1720 u32 base_cost = cur_node->cost;
1723 #if CONSIDER_ALIGNED_COSTS
1724 if (offset >= LZX_MIN_ALIGNED_OFFSET)
1725 base_cost += c->costs.aligned[adjusted_offset &
1726 LZX_ALIGNED_OFFSET_BITMASK];
1730 c->costs.match_cost[offset_slot][
1731 next_len - LZX_MIN_MATCH_LEN];
1732 if (cost < (cur_node + next_len)->cost) {
1733 (cur_node + next_len)->cost = cost;
1734 (cur_node + next_len)->item =
1735 (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len;
1737 } while (++next_len <= cache_ptr->length);
1739 if (++cache_ptr == end_matches) {
1740 #if CONSIDER_GAP_MATCHES
1741 /* Also consider the longest explicit
1742 * offset match as a "gap match": match
1744 s32 remaining = (block_end - in_next) - (s32)next_len;
1745 if (likely(remaining >= 2)) {
1746 const u8 *strptr = in_next + next_len;
1747 const u8 *matchptr = strptr - offset;
1748 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1749 STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1750 STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1751 unsigned limit = min(remaining,
1752 min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1753 LZX_MAX_MATCH_LEN));
1754 unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit);
1755 u8 lit = strptr[-1];
1756 cost += c->costs.main[lit] +
1757 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1758 unsigned total_len = next_len + rep0_len;
1759 if (cost < (cur_node + total_len)->cost) {
1760 (cur_node + total_len)->cost = cost;
1761 (cur_node + total_len)->item =
1763 ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1765 MATCH_BEFORE_GAP(cur_node + total_len) =
1766 (adjusted_offset << OPTIMUM_OFFSET_SHIFT) |
1771 #endif /* CONSIDER_GAP_MATCHES */
1779 /* Consider coding a literal.
1781 * To avoid an extra branch, actually checking the preferability
1782 * of coding the literal is integrated into the queue update
1784 literal = *in_next++;
1785 cost = cur_node->cost + c->costs.main[literal];
1787 /* Advance to the next position. */
1790 /* The lowest-cost path to the current position is now known.
1791 * Finalize the recent offsets queue that results from taking
1792 * this lowest-cost path. */
1794 if (cost <= cur_node->cost) {
1795 /* Literal: queue remains unchanged. */
1796 cur_node->cost = cost;
1797 cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1798 QUEUE(cur_node) = QUEUE(cur_node - 1);
1800 /* Match: queue update is needed. */
1801 unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1802 #if CONSIDER_GAP_MATCHES
1803 s32 adjusted_offset = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1804 STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1806 u32 adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1809 if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) {
1810 /* Explicit offset match: insert offset at front. */
1812 lzx_lru_queue_push(QUEUE(cur_node - len),
1813 adjusted_offset - LZX_OFFSET_ADJUSTMENT);
1815 #if CONSIDER_GAP_MATCHES
1816 else if (adjusted_offset < 0) {
1817 /* "Gap match": Explicit offset match, then a
1818 * literal, then rep0 match. Save the explicit
1819 * offset match information in the cost field of
1820 * the previous node, which isn't needed
1821 * anymore. Then insert the offset at the front
1823 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1824 (cur_node - 1)->cost = match_before_gap;
1826 lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1827 (match_before_gap & OPTIMUM_LEN_MASK)),
1828 (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1829 LZX_OFFSET_ADJUSTMENT);
1833 /* Repeat offset match: swap offset to front. */
1835 lzx_lru_queue_swap(QUEUE(cur_node - len),
1839 } while (cur_node != end_node);
1841 /* Return the recent offsets queue at the end of the path. */
1842 return QUEUE(cur_node);
1846 * Given the costs for the main and length codewords (c->costs.main and
1847 * c->costs.len), initialize the match cost array (c->costs.match_cost) which
1848 * directly provides the cost of every possible (length, offset slot) pair.
1851 lzx_compute_match_costs(struct lzx_compressor *c)
1853 unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1854 LZX_NUM_LEN_HEADERS;
1855 struct lzx_costs *costs = &c->costs;
1856 unsigned main_symbol = LZX_NUM_CHARS;
1858 for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1861 u32 extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST;
1864 #if CONSIDER_ALIGNED_COSTS
1865 if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT)
1866 extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
1869 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1870 costs->match_cost[offset_slot][i] =
1871 costs->main[main_symbol++] + extra_cost;
1874 extra_cost += costs->main[main_symbol++];
1876 for (; i < LZX_NUM_LENS; i++) {
1877 costs->match_cost[offset_slot][i] =
1878 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1885 * Fast approximation for log2f(x). This is not as accurate as the standard C
1886 * version. It does not need to be perfectly accurate because it is only used
1887 * for estimating symbol costs, which is very approximate anyway.
1897 /* Extract the exponent and subtract 127 to remove the bias. This gives
1898 * the integer part of the result. */
1899 float res = ((u.i >> 23) & 0xFF) - 127;
1901 /* Set the exponent to 0 (plus bias of 127). This transforms the number
1902 * to the range [1, 2) while retaining the same mantissa. */
1903 u.i = (u.i & ~(0xFF << 23)) | (127 << 23);
1906 * Approximate the log2 of the transformed number using a degree 2
1907 * interpolating polynomial for log2(x) over the interval [1, 2). Then
1908 * add this to the extracted exponent to produce the final approximation
1911 * The coefficients of the interpolating polynomial used here were found
1912 * using the script tools/log2_interpolation.r.
1914 return res - 1.653124006f + u.f * (1.9941812f - u.f * 0.3347490189f);
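/*
 * A worked example (illustrative only): for log2f_fast(12.0f), the biased
 * exponent field is 130, so 'res' starts at 3.  The mantissa step remaps the
 * value to 1.5, and the polynomial evaluates to roughly 0.585, for a total of
 * about 3.585, close to the exact log2(12) = 3.5849.
 */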
1919 * Return the estimated cost of a symbol which has been estimated to have the
1920 * given probability.
1923 lzx_cost_for_probability(float prob)
1926 * The basic formula is:
1928 * entropy = -log2(probability)
1930 * Use this to get the cost in fractional bits. Then multiply by our
1931 * scaling factor of BIT_COST and convert to an integer.
1933 * In addition, the minimum cost is BIT_COST (one bit) because the
1934 * entropy coding method will be Huffman codes.
1936 * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may
1937 * be positive due to inaccuracy in our log2 approximation. Therefore,
1938 * we cannot, in general, assume the computed cost is non-negative, and
1939 * we should make sure negative costs get rounded up correctly.
1941 s32 cost = -log2f_fast(prob) * BIT_COST;
1942 return max(cost, BIT_COST);
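/*
 * Worked example (illustrative only): with BIT_COST == 64, a symbol estimated
 * to occur with probability 0.25 costs about -log2(0.25) * 64 = 128, i.e. two
 * bits, whereas lzx_cost_for_probability(0.99f) is clamped to the one-bit
 * minimum of BIT_COST.
 */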
1946 * Mapping: number of used literals => heuristic probability of a literal times
1947 * 6870. Generated by running this R command:
1949 * cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
1951 static const u8 literal_scaled_probs[257] = {
1952 255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
1953 219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
1954 189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
1955 162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
1956 139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
1957 120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
1958 103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
1959 86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
1960 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
1961 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
1962 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
1963 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
1964 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
1965 27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
1966 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
1967 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
1971 * Mapping: length symbol => default cost of that symbol. This is derived from
1972 * sample data but has been slightly edited to add more bias towards the
1973 * shortest lengths, which are the most common.
1975 static const u16 lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
1976 300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
1977 503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
1978 659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
1979 666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
1980 706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
1981 728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
1982 738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
1983 747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
1984 753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
1985 754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
1986 758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
1987 750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
1988 757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
1989 762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
1990 763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
1991 762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
1992 764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
1993 762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
1996 /* Set default costs to bootstrap the iterative optimization algorithm. */
1998 lzx_set_default_costs(struct lzx_compressor *c)
2001 u32 num_literals = 0;
2002 u32 num_used_literals = 0;
2003 float inv_num_matches = 1.0f / c->freqs.main[LZX_NUM_CHARS];
2004 float inv_num_items;
2005 float prob_match = 1.0f;
2007 float base_literal_prob;
2009 /* Some numbers here have been hardcoded to assume a bit cost of 64. */
2010 STATIC_ASSERT(BIT_COST == 64);
2012 /* Estimate the number of literals that will be used. 'num_literals' is
2013 * the total number, whereas 'num_used_literals' is the number of
2014 * distinct symbols. */
2015 for (i = 0; i < LZX_NUM_CHARS; i++) {
2016 num_literals += c->freqs.main[i];
2017 num_used_literals += (c->freqs.main[i] != 0);
2020 /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'. We
2021 * don't attempt to estimate which ones will be used. */
2023 inv_num_items = 1.0f / (num_literals + c->freqs.main[LZX_NUM_CHARS]);
2024 base_literal_prob = literal_scaled_probs[num_used_literals] *
2027 /* Literal costs. We use two different methods to compute the
2028 * probability of each literal and mix together their results. */
2029 for (i = 0; i < LZX_NUM_CHARS; i++) {
2030 u32 freq = c->freqs.main[i];
2032 float prob = 0.5f * ((freq * inv_num_items) +
2034 c->costs.main[i] = lzx_cost_for_probability(prob);
2037 c->costs.main[i] = 11 * BIT_COST;
2041 /* Match header costs. We just assume that all match headers are
2042 * equally probable, but we do take into account the relative cost of a
2043 * match header vs. a literal depending on how common matches are
2044 * expected to be vs. literals. */
2045 prob_match = max(prob_match, 0.15f);
2046 match_cost = lzx_cost_for_probability(prob_match / (c->num_main_syms -
2048 for (; i < c->num_main_syms; i++)
2049 c->costs.main[i] = match_cost;
2051 /* Length symbol costs. These are just set to fixed values which
2052 * reflect the fact that the smallest lengths are typically the most common,
2053 * and therefore are typically the cheapest. */
2054 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2055 c->costs.len[i] = lzx_default_len_costs[i];
2057 #if CONSIDER_ALIGNED_COSTS
2058 /* Aligned offset symbol costs. These are derived from the estimated
2059 * probability of each aligned offset symbol. */
2060 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2061 /* We intentionally tallied the frequencies in the wrong slots,
2062 * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the
2063 * fixup here is faster: a constant 8 subtractions here vs. one
2064 * addition for every match. */
2065 unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK;
2066 if (c->freqs.aligned[j] != 0) {
2067 float prob = c->freqs.aligned[j] * inv_num_matches;
2068 c->costs.aligned[i] = lzx_cost_for_probability(prob);
2070 c->costs.aligned[i] =
2071 (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST;
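/*
 * Worked example of the index remapping above (illustrative; assumes
 * LZX_OFFSET_ADJUSTMENT == 2 and an 8-symbol aligned offset code): because
 * the matchfinding loop tallied 'offset & LZX_ALIGNED_OFFSET_BITMASK' rather
 * than '(offset + LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK', the
 * frequency for aligned symbol i == 5 is found in slot j == (5 - 2) & 7 == 3.
 * Doing these 8 remapped reads here is cheaper than adjusting every tallied
 * match.
 */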
2077 /* Update the current cost model to reflect the computed Huffman codes. */
2079 lzx_set_costs_from_codes(struct lzx_compressor *c)
2082 const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
2084 for (i = 0; i < c->num_main_syms; i++) {
2085 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
2086 MAIN_CODEWORD_LIMIT) * BIT_COST;
2089 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
2090 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
2091 LENGTH_CODEWORD_LIMIT) * BIT_COST;
2094 #if CONSIDER_ALIGNED_COSTS
2095 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2096 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
2097 ALIGNED_CODEWORD_LIMIT) * BIT_COST;
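/*
 * Illustrative example: after the Huffman codes for a pass have been built, a
 * main symbol that received a 5-bit codeword now costs 5 * BIT_COST == 320,
 * while a symbol that was not used at all (codeword length 0) is charged as if
 * it had a codeword of the maximum permitted length (e.g. MAIN_CODEWORD_LIMIT
 * bits), so unused symbols are strongly, but not infinitely, discouraged on
 * the next pass.
 */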
2103 * Choose a "near-optimal" literal/match sequence to use for the current block,
2104 * then flush the block. Because the cost of each Huffman symbol is unknown
2105 * until the Huffman codes have been built and the Huffman codes themselves
2106 * depend on the symbol frequencies, this uses an iterative optimization
2107 * algorithm to approximate an optimal solution. The first optimization pass
2108 * for the block uses default costs; additional passes use costs derived from
2109 * the Huffman codes computed in the previous pass.
2111 static forceinline struct lzx_lru_queue
2112 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
2113 struct lzx_output_bitstream * const restrict os,
2114 const u8 * const restrict block_begin,
2115 const u32 block_size,
2116 const struct lzx_lru_queue initial_queue,
2119 unsigned num_passes_remaining = c->num_optim_passes;
2120 struct lzx_lru_queue new_queue;
2123 lzx_set_default_costs(c);
2126 lzx_compute_match_costs(c);
2127 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
2128 initial_queue, is_16_bit);
2130 if (--num_passes_remaining == 0)
2133 /* At least one optimization pass remains. Update the costs. */
2134 lzx_reset_symbol_frequencies(c);
2135 lzx_tally_item_list(c, block_size, is_16_bit);
2136 lzx_build_huffman_codes(c);
2137 lzx_set_costs_from_codes(c);
2140 /* Done optimizing. Generate the sequence list and flush the block. */
2141 lzx_reset_symbol_frequencies(c);
2142 seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
2143 lzx_flush_block(c, os, block_begin, block_size, seq_idx);
2148 * This is the "near-optimal" LZX compressor.
2150 * For each block, it performs a relatively thorough graph search to find an
2151 * inexpensive (in terms of compressed size) way to output the block.
2153 * Note: there are actually many things this algorithm leaves on the table in
2154 * terms of compression ratio. So although it may be "near-optimal", it is
2155 * certainly not "optimal". The goal is not to produce the optimal compression
2156 * ratio, which for LZX is probably impossible within any practical amount of
2157 * time, but rather to produce a compression ratio significantly better than a
2158 * simpler "greedy" or "lazy" parse while still being relatively fast.
2160 static forceinline void
2161 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2162 const u8 * const restrict in_begin, size_t in_nbytes,
2163 struct lzx_output_bitstream * restrict os,
2166 const u8 * in_next = in_begin;
2167 const u8 * const in_end = in_begin + in_nbytes;
2168 u32 max_len = LZX_MAX_MATCH_LEN;
2169 u32 nice_len = min(c->nice_match_length, max_len);
2170 u32 next_hashes[2] = {0, 0};
2171 struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2173 /* Initialize the matchfinder. */
2174 CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2177 /* Starting a new block */
2179 const u8 * const in_block_begin = in_next;
2180 const u8 * const in_max_block_end =
2181 in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2182 struct lz_match *cache_ptr = c->match_cache;
2183 const u8 *next_search_pos = in_next;
2184 const u8 *next_observation = in_next;
2185 const u8 *next_pause_point =
2186 min(in_next + min(MIN_BLOCK_SIZE,
2187 in_max_block_end - in_next),
2188 in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2189 in_max_block_end - in_next));
2191 lzx_init_block_split_stats(&c->split_stats);
2192 lzx_reset_symbol_frequencies(c);
2194 if (in_next >= next_pause_point)
2198 * Run the input buffer through the matchfinder, caching the
2199 * matches, until we decide to end the block.
2201 * For a tighter matchfinding loop, we compute a "pause point",
2202 * which is the next position at which we may need to check
2203 * whether to end the block or to decrease max_len. We then
2204 * only do these extra checks upon reaching the pause point.
2206 resume_matchfinding:
2208 if (in_next >= next_search_pos) {
2209 /* Search for matches at this position. */
2210 struct lz_match *lz_matchptr;
2213 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2214 bt_matchfinder_get_matches,
2219 c->max_search_depth,
2223 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2224 cache_ptr = lz_matchptr;
2226 /* Accumulate literal/match statistics for block
2227 * splitting and for generating the initial cost
2229 if (in_next >= next_observation) {
2230 best_len = cache_ptr[-1].length;
2231 if (best_len >= 3) {
2232 /* Match (len >= 3) */
2235 * Note: for performance reasons this has
2236 * been simplified significantly:
2238 * - We wait until later to account for
2239 * LZX_OFFSET_ADJUSTMENT.
2240 * - We don't account for repeat offsets.
2241 * - We don't account for different match headers.
2243 c->freqs.aligned[cache_ptr[-1].offset &
2244 LZX_ALIGNED_OFFSET_BITMASK]++;
2245 c->freqs.main[LZX_NUM_CHARS]++;
2247 lzx_observe_match(&c->split_stats, best_len);
2248 next_observation = in_next + best_len;
2251 c->freqs.main[*in_next]++;
2252 lzx_observe_literal(&c->split_stats, *in_next);
2253 next_observation = in_next + 1;
2258 * If there was a very long match found, then
2259 * don't cache any matches for the bytes covered
2260 * by that match. This avoids degenerate
2261 * behavior when compressing highly redundant
2262 * data, where the number of matches can be very
2265 * This heuristic doesn't actually hurt the
2266 * compression ratio *too* much. If there's a
2267 * long match, then the data must be highly
2268 * compressible, so it doesn't matter as much
2271 if (best_len >= nice_len)
2272 next_search_pos = in_next + best_len;
2274 /* Don't search for matches at this position. */
2275 CALL_BT_MF(is_16_bit, c,
2276 bt_matchfinder_skip_position,
2280 c->max_search_depth,
2282 cache_ptr->length = 0;
2285 } while (++in_next < next_pause_point &&
2286 likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2290 /* Adjust max_len and nice_len if we're nearing the end of the
2291 * input buffer. In addition, if we are so close to the end of
2292 * the input buffer that there cannot be any more matches, then
2293 * just advance through the last few positions and record no
2295 if (unlikely(max_len > in_end - in_next)) {
2296 max_len = in_end - in_next;
2297 nice_len = min(max_len, nice_len);
2298 if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2299 while (in_next != in_end) {
2300 cache_ptr->length = 0;
2307 /* End the block if the match cache may overflow. */
2308 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2311 /* End the block if the soft maximum size has been reached. */
2312 if (in_next >= in_max_block_end)
2315 /* End the block if the block splitting algorithm thinks this is
2316 * a good place to do so. */
2317 if (c->split_stats.num_new_observations >=
2318 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2319 in_max_block_end - in_next >= MIN_BLOCK_SIZE &&
2320 lzx_should_end_block(&c->split_stats))
2323 /* It's not time to end the block yet. Compute the next pause
2324 * point and resume matchfinding. */
2326 min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2327 c->split_stats.num_new_observations,
2328 in_max_block_end - in_next),
2329 in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2330 in_max_block_end - in_next));
2331 goto resume_matchfinding;
2334 /* We've decided on a block boundary and cached matches. Now
2335 * choose a match/literal sequence and flush the block. */
2336 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2337 in_next - in_block_begin,
2339 } while (in_next != in_end);
2343 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2344 size_t in_nbytes, struct lzx_output_bitstream *os)
2346 lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2350 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2351 size_t in_nbytes, struct lzx_output_bitstream *os)
2353 lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2356 /******************************************************************************/
2357 /* Faster ("lazy") compression algorithm */
2358 /*----------------------------------------------------------------------------*/
2361 * Called when the compressor chooses to use a literal. This tallies the
2362 * Huffman symbol for the literal, increments the current literal run length,
2363 * and "observes" the literal for the block split statistics.
2365 static forceinline void
2366 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2368 lzx_observe_literal(&c->split_stats, literal);
2369 c->freqs.main[literal]++;
2374 * Called when the compressor chooses to use a match. This tallies the Huffman
2375 * symbol(s) for a match, saves the match data and the length of the preceding
2376 * literal run, updates the recent offsets queue, and "observes" the match for
2377 * the block split statistics.
2379 static forceinline void
2380 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
2381 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2382 u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2384 struct lzx_sequence *next_seq = *next_seq_p;
2387 lzx_observe_match(&c->split_stats, length);
2389 mainsym = lzx_tally_main_and_lensyms(c, length, adjusted_offset,
2391 next_seq->litrunlen_and_matchlen =
2392 (*litrunlen_p << SEQ_MATCHLEN_BITS) | length;
2393 next_seq->adjusted_offset_and_mainsym =
2394 (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
2396 /* Update the recent offsets queue. */
2397 if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
2398 /* Repeat offset match. */
2399 swap(recent_offsets[0], recent_offsets[adjusted_offset]);
2401 /* Explicit offset match. */
2403 /* Tally the aligned offset symbol if needed. */
2404 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
2405 c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
2407 recent_offsets[2] = recent_offsets[1];
2408 recent_offsets[1] = recent_offsets[0];
2409 recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
2412 /* Reset the literal run length and advance to the next sequence. */
2413 *next_seq_p = next_seq + 1;
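/*
 * Worked example of the queue update above (illustrative only): starting from
 * recent_offsets = {100, 8, 1}, an explicit offset match at offset 40 shifts
 * the queue to {40, 100, 8}, whereas a rep1 match (adjusted_offset == 1) only
 * swaps the first two entries, giving {8, 100, 1}.
 */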
2418 * Called when the compressor ends a block. This finishes the last lzx_sequence,
2419 * which is just a literal run with no following match. This literal run might
2422 static forceinline void
2423 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2425 last_seq->litrunlen_and_matchlen = litrunlen << SEQ_MATCHLEN_BITS;
2429 * Find the longest repeat offset match with the current position. If a match
2430 * is found, return its length and set *best_rep_idx_ret to the index of its
2431 * offset in @recent_offsets. Otherwise, return 0.
2433 * Don't bother with length 2 matches; consider matches of length >= 3 only.
2434 * Also assume that max_len >= 3.
2437 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2438 const u32 recent_offsets[],
2439 const unsigned max_len,
2440 unsigned *best_rep_idx_ret)
2442 STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2444 const u32 seq3 = load_u24_unaligned(in_next);
2446 unsigned best_rep_len = 0;
2447 unsigned best_rep_idx = 0;
2450 /* Check for rep0 match (most recent offset) */
2451 matchptr = in_next - recent_offsets[0];
2452 if (load_u24_unaligned(matchptr) == seq3)
2453 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2455 /* Check for rep1 match (second most recent offset) */
2456 matchptr = in_next - recent_offsets[1];
2457 if (load_u24_unaligned(matchptr) == seq3) {
2458 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2459 if (rep_len > best_rep_len) {
2460 best_rep_len = rep_len;
2465 /* Check for rep2 match (third most recent offset) */
2466 matchptr = in_next - recent_offsets[2];
2467 if (load_u24_unaligned(matchptr) == seq3) {
2468 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2469 if (rep_len > best_rep_len) {
2470 best_rep_len = rep_len;
2475 *best_rep_idx_ret = best_rep_idx;
2476 return best_rep_len;
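/*
 * Illustrative usage (a sketch mirroring the call sites later in this file,
 * not an additional code path): the lazy parser asks for the best repeat
 * offset match and then compares its heuristic score against the score of the
 * best explicit offset match:
 *
 *	unsigned rep_idx;
 *	unsigned rep_len = lzx_find_longest_repeat_offset_match(in_next,
 *								 recent_offsets,
 *								 max_len,
 *								 &rep_idx);
 *	if (rep_len != 0 &&
 *	    lzx_repeat_offset_match_score(rep_len, rep_idx) >= cur_score)
 *		... choose the repeat offset match ...
 */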
2480 * Fast heuristic scoring for lazy parsing: how "good" is this match?
2481 * This is mainly determined by the length: longer matches are better.
2482 * However, we also give a bonus to close (small offset) matches and to repeat
2483 * offset matches, since those require fewer bits to encode.
2486 static forceinline unsigned
2487 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2489 unsigned score = len;
2491 if (adjusted_offset < 4096)
2493 if (adjusted_offset < 256)
2499 static forceinline unsigned
2500 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2506 * This is the "lazy" LZX compressor. The basic idea is that before it chooses
2507 * a match, it checks to see if there's a longer match at the next position. If
2508 * yes, it chooses a literal and continues to the next position. If no, it
2509 * chooses the match.
2511 * Some additional heuristics are used as well. Repeat offset matches are
2512 * considered favorably and sometimes are chosen immediately. In addition, long
2513 * matches (at least "nice_len" bytes) are chosen immediately as well. Finally,
2514 * when we decide whether a match is "better" than another, we take the offset
2515 * into consideration as well as the length.
2517 static forceinline void
2518 lzx_compress_lazy(struct lzx_compressor * restrict c,
2519 const u8 * const restrict in_begin, size_t in_nbytes,
2520 struct lzx_output_bitstream * restrict os, bool is_16_bit)
2522 const u8 * in_next = in_begin;
2523 const u8 * const in_end = in_begin + in_nbytes;
2524 unsigned max_len = LZX_MAX_MATCH_LEN;
2525 unsigned nice_len = min(c->nice_match_length, max_len);
2526 STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2527 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
2528 u32 next_hashes[2] = {0, 0};
2530 /* Initialize the matchfinder. */
2531 CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2534 /* Starting a new block */
2536 const u8 * const in_block_begin = in_next;
2537 const u8 * const in_max_block_end =
2538 in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2539 struct lzx_sequence *next_seq = c->chosen_sequences;
2543 u32 cur_adjusted_offset;
2547 u32 next_adjusted_offset;
2548 unsigned next_score;
2549 unsigned best_rep_len;
2550 unsigned best_rep_idx;
2554 lzx_reset_symbol_frequencies(c);
2555 lzx_init_block_split_stats(&c->split_stats);
2558 /* Adjust max_len and nice_len if we're nearing the end
2559 * of the input buffer. */
2560 if (unlikely(max_len > in_end - in_next)) {
2561 max_len = in_end - in_next;
2562 nice_len = min(max_len, nice_len);
2565 /* Find the longest match (subject to the
2566 * max_search_depth cutoff parameter) with the current
2567 * position. Don't bother with length 2 matches; only
2568 * look for matches of length >= 3. */
2569 cur_len = CALL_HC_MF(is_16_bit, c,
2570 hc_matchfinder_longest_match,
2576 c->max_search_depth,
2580 /* If there was no match found, or the only match found
2581 * was a distant short match, then choose a literal. */
2584 cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2585 cur_offset != recent_offsets[0] &&
2586 cur_offset != recent_offsets[1] &&
2587 cur_offset != recent_offsets[2]))
2589 lzx_choose_literal(c, *in_next, &litrunlen);
2594 /* Heuristic: if this match has the most recent offset,
2595 * then go ahead and choose it as a rep0 match. */
2596 if (cur_offset == recent_offsets[0]) {
2598 skip_len = cur_len - 1;
2599 cur_adjusted_offset = 0;
2600 goto choose_cur_match;
2603 /* Compute the longest match's score as an explicit
2605 cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT;
2606 cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset);
2608 /* Find the longest repeat offset match at this
2609 * position. If we find one and it's "better" than the
2610 * explicit offset match we found, then go ahead and
2611 * choose the repeat offset match immediately. */
2612 best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2618 if (best_rep_len != 0 &&
2619 (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2620 best_rep_idx)) >= cur_score)
2622 cur_len = best_rep_len;
2623 cur_adjusted_offset = best_rep_idx;
2624 skip_len = best_rep_len - 1;
2625 goto choose_cur_match;
2630 * We have a match at the current position. If the
2631 * match is very long, then choose it immediately.
2632 * Otherwise, see if there's a better match at the next
2636 if (cur_len >= nice_len) {
2637 skip_len = cur_len - 1;
2638 goto choose_cur_match;
2641 if (unlikely(max_len > in_end - in_next)) {
2642 max_len = in_end - in_next;
2643 nice_len = min(max_len, nice_len);
2646 next_len = CALL_HC_MF(is_16_bit, c,
2647 hc_matchfinder_longest_match,
2653 c->max_search_depth / 2,
2657 if (next_len <= cur_len - 2) {
2658 /* No potentially better match was found. */
2660 skip_len = cur_len - 2;
2661 goto choose_cur_match;
2664 next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT;
2665 next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset);
2667 best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2673 if (best_rep_len != 0 &&
2674 (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2675 best_rep_idx)) >= next_score)
2678 if (rep_score > cur_score) {
2679 /* The next match is better, and it's a
2680 * repeat offset match. */
2681 lzx_choose_literal(c, *(in_next - 2),
2683 cur_len = best_rep_len;
2684 cur_adjusted_offset = best_rep_idx;
2685 skip_len = cur_len - 1;
2686 goto choose_cur_match;
2689 if (next_score > cur_score) {
2690 /* The next match is better, and it's an
2691 * explicit offset match. */
2692 lzx_choose_literal(c, *(in_next - 2),
2695 cur_adjusted_offset = next_adjusted_offset;
2696 cur_score = next_score;
2697 goto have_cur_match;
2701 /* The original match was better; choose it. */
2702 skip_len = cur_len - 2;
2705 /* Choose a match and have the matchfinder skip over its
2706 * remaining bytes. */
2707 lzx_choose_match(c, cur_len, cur_adjusted_offset,
2708 recent_offsets, is_16_bit,
2709 &litrunlen, &next_seq);
2710 CALL_HC_MF(is_16_bit, c,
2711 hc_matchfinder_skip_bytes,
2717 in_next += skip_len;
2719 /* Keep going until it's time to end the block. */
2720 } while (in_next < in_max_block_end &&
2721 !(c->split_stats.num_new_observations >=
2722 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2723 in_next - in_block_begin >= MIN_BLOCK_SIZE &&
2724 in_end - in_next >= MIN_BLOCK_SIZE &&
2725 lzx_should_end_block(&c->split_stats)));
2727 /* Flush the block. */
2728 lzx_finish_sequence(next_seq, litrunlen);
2729 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2731 /* Keep going until we've reached the end of the input buffer. */
2732 } while (in_next != in_end);
2736 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2737 struct lzx_output_bitstream *os)
2739 lzx_compress_lazy(c, in, in_nbytes, os, true);
2743 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2744 struct lzx_output_bitstream *os)
2746 lzx_compress_lazy(c, in, in_nbytes, os, false);
2749 /******************************************************************************/
2750 /* Compressor operations */
2751 /*----------------------------------------------------------------------------*/
2754 * Generate tables for mapping match offsets (actually, "adjusted" match
2755 * offsets) to offset slots.
2758 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2760 u32 adjusted_offset = 0;
2764 for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2767 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2768 LZX_OFFSET_ADJUSTMENT)
2770 c->offset_slot_tab_1[adjusted_offset] = slot;
2773 /* slots [30, 49] */
2774 for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2775 adjusted_offset += (u32)1 << 14)
2777 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2778 LZX_OFFSET_ADJUSTMENT)
2780 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
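/*
 * A minimal sketch of how these two tables are consulted (the real lookup
 * helper, lzx_get_offset_slot(), is defined elsewhere in this file and also
 * handles the 16-bit case, so details may differ):
 *
 *	if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 *		slot = c->offset_slot_tab_1[adjusted_offset];
 *	else
 *		slot = c->offset_slot_tab_2[adjusted_offset >> 14];
 *
 * Small adjusted offsets resolve with a direct table lookup; larger ones fall
 * back to a much smaller table indexed by the high bits of the offset.
 */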
2785 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2787 if (compression_level <= MAX_FAST_LEVEL) {
2788 if (lzx_is_16_bit(max_bufsize))
2789 return offsetof(struct lzx_compressor, hc_mf_16) +
2790 hc_matchfinder_size_16(max_bufsize);
2792 return offsetof(struct lzx_compressor, hc_mf_32) +
2793 hc_matchfinder_size_32(max_bufsize);
2795 if (lzx_is_16_bit(max_bufsize))
2796 return offsetof(struct lzx_compressor, bt_mf_16) +
2797 bt_matchfinder_size_16(max_bufsize);
2799 return offsetof(struct lzx_compressor, bt_mf_32) +
2800 bt_matchfinder_size_32(max_bufsize);
2804 /* Compute the amount of memory needed to allocate an LZX compressor. */
2806 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2811 if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2814 size += lzx_get_compressor_size(max_bufsize, compression_level);
2816 size += max_bufsize; /* account for in_buffer */
2820 /* Allocate an LZX compressor. */
2822 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2823 bool destructive, void **c_ret)
2825 unsigned window_order;
2826 struct lzx_compressor *c;
2828 /* Validate the maximum buffer size and get the window order from it. */
2829 window_order = lzx_get_window_order(max_bufsize);
2830 if (window_order == 0)
2831 return WIMLIB_ERR_INVALID_PARAM;
2833 /* Allocate the compressor. */
2834 c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2838 c->window_order = window_order;
2839 c->num_main_syms = lzx_get_num_main_syms(window_order);
2840 c->destructive = destructive;
2842 /* Allocate the buffer for preprocessed data if needed. */
2843 if (!c->destructive) {
2844 c->in_buffer = MALLOC(max_bufsize);
2849 if (compression_level <= MAX_FAST_LEVEL) {
2851 /* Fast compression: Use lazy parsing. */
2852 if (lzx_is_16_bit(max_bufsize))
2853 c->impl = lzx_compress_lazy_16;
2855 c->impl = lzx_compress_lazy_32;
2857 /* Scale max_search_depth and nice_match_length with the
2858 * compression level. */
2859 c->max_search_depth = (60 * compression_level) / 20;
2860 c->nice_match_length = (80 * compression_level) / 20;
2862 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2863 * halves the max_search_depth when attempting a lazy match, and
2864 * max_search_depth must be at least 1. */
2865 c->max_search_depth = max(c->max_search_depth, 2);
2868 /* Normal / high compression: Use near-optimal parsing. */
2869 if (lzx_is_16_bit(max_bufsize))
2870 c->impl = lzx_compress_near_optimal_16;
2872 c->impl = lzx_compress_near_optimal_32;
2874 /* Scale max_search_depth and nice_match_length with the
2875 * compression level. */
2876 c->max_search_depth = (24 * compression_level) / 50;
2877 c->nice_match_length = (48 * compression_level) / 50;
2879 /* Also scale num_optim_passes with the compression level. But
2880 * the more passes there are, the less they help --- so don't
2881 * add them linearly. */
2882 c->num_optim_passes = 1;
2883 c->num_optim_passes += (compression_level >= 45);
2884 c->num_optim_passes += (compression_level >= 70);
2885 c->num_optim_passes += (compression_level >= 100);
2886 c->num_optim_passes += (compression_level >= 150);
2887 c->num_optim_passes += (compression_level >= 200);
2888 c->num_optim_passes += (compression_level >= 300);
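/* For example (illustrative only): compression level 70 ends up with 3
 * optimization passes, level 150 with 5, and level 300 with 7, so raising the
 * level adds passes only gradually. */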
2890 /* max_search_depth must be at least 1. */
2891 c->max_search_depth = max(c->max_search_depth, 1);
2894 /* Prepare the offset => offset slot mapping. */
2895 lzx_init_offset_slot_tabs(c);
2903 return WIMLIB_ERR_NOMEM;
2906 /* Compress a buffer of data. */
2908 lzx_compress(const void *restrict in, size_t in_nbytes,
2909 void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2911 struct lzx_compressor *c = _c;
2912 struct lzx_output_bitstream os;
2915 /* Don't bother trying to compress very small inputs. */
2919 /* If the compressor is in "destructive" mode, then we can directly
2920 * preprocess the input data. Otherwise, we need to copy it into an
2921 * internal buffer first. */
2922 if (!c->destructive) {
2923 memcpy(c->in_buffer, in, in_nbytes);
2927 /* Preprocess the input data. */
2928 lzx_preprocess((void *)in, in_nbytes);
2930 /* Initially, the previous Huffman codeword lengths are all zeroes. */
2932 memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2934 /* Initialize the output bitstream. */
2935 lzx_init_output(&os, out, out_nbytes_avail);
2937 /* Call the compression level-specific compress() function. */
2938 (*c->impl)(c, in, in_nbytes, &os);
2940 /* Flush the output bitstream. */
2941 result = lzx_flush_output(&os);
2943 /* If the data did not compress to less than its original size and we
2944 * preprocessed the original buffer, then postprocess it to restore it
2945 * to its original state. */
2946 if (result == 0 && c->destructive)
2947 lzx_postprocess((void *)in, in_nbytes);
2949 /* Return the number of compressed bytes, or 0 if the input did not
2950 * compress to less than its original size. */
2954 /* Free an LZX compressor. */
2956 lzx_free_compressor(void *_c)
2958 struct lzx_compressor *c = _c;
2960 if (!c->destructive)
2965 const struct compressor_ops lzx_compressor_ops = {
2966 .get_needed_memory = lzx_get_needed_memory,
2967 .create_compressor = lzx_create_compressor,
2968 .compress = lzx_compress,
2969 .free_compressor = lzx_free_compressor,