/*
 * A compressor for the LZX compression format, as used in WIM archives.
 *
 * Copyright (C) 2012-2016 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */
/*
 * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
 * compression format, as used in the WIM (Windows IMaging) file format.
 *
 * Two different LZX-compatible algorithms are implemented: "near-optimal" and
 * "lazy". "Near-optimal" is significantly slower than "lazy", but results in a
 * better compression ratio. The "near-optimal" algorithm is used at the
 * default compression level.
 *
 * This file may need some slight modifications to be used outside of the WIM
 * format. In particular, in other situations the LZX block header might be
 * slightly different, and sliding window support might be required.
 *
 * LZX is a compression format derived from DEFLATE, the format used by zlib and
 * gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain
 * details are quite similar, such as the method for storing Huffman codes.
 * However, the main differences are:
 *
 * - LZX preprocesses the data to attempt to make x86 machine code slightly more
 *   compressible before attempting to compress it further.
 *
 * - LZX uses a "main" alphabet which combines literals and matches, with the
 *   match symbols containing a "length header" (giving all or part of the match
 *   length) and an "offset slot" (giving, roughly speaking, the order of
 *   magnitude of the match offset).
 *
 * - LZX does not have static Huffman blocks (that is, the kind with preset
 *   Huffman codes); however it does have two types of dynamic Huffman blocks
 *   ("verbatim" and "aligned").
 *
 * - LZX has a minimum match length of 2 rather than 3. Length 2 matches can be
 *   useful, but generally only if the compressor is smart about choosing them.
 *
 * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
 *   of match offsets. This is very useful for certain types of files, such as
 *   binary files that have repeating records.
 */
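/*
 * Example of that last point (an illustrative sketch, not normative): after
 * coding a match at offset 100 and then a match at offset 300, the queue holds
 * (R0, R1, R2) = (300, 100, 1), so a third match at offset 100 can be coded
 * using offset slot 1 instead of an explicit offset. See the lzx_lru_queue
 * helpers later in this file.
 */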
/******************************************************************************/
/*                              General parameters                            */
/*----------------------------------------------------------------------------*/

/*
 * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL. It
 * uses the slower algorithm at levels > MAX_FAST_LEVEL.
 */
#define MAX_FAST_LEVEL		34

/*
 * The compressor-side limits on the codeword lengths (in bits) for each Huffman
 * code. To make outputting bits slightly faster, some of these limits are
 * lower than the limits defined by the LZX format. This does not significantly
 * affect the compression ratio.
 */
#define MAIN_CODEWORD_LIMIT	16
#define LENGTH_CODEWORD_LIMIT	12
#define ALIGNED_CODEWORD_LIMIT	7
#define PRE_CODEWORD_LIMIT	7
/******************************************************************************/
/*                          Block splitting parameters                        */
/*----------------------------------------------------------------------------*/

/*
 * The compressor always outputs blocks of at least this size in bytes, except
 * for the last block, which may need to be smaller.
 */
#define MIN_BLOCK_SIZE		6500

/*
 * The compressor attempts to end a block when it reaches this size in bytes.
 * The final size might be slightly larger due to matches extending beyond the
 * end of the block. Specifically:
 *
 *  - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
 *    bytes starting at the SOFT_MAX_BLOCK_SIZE'th byte.
 *
 *  - The lazy compressor may choose a sequence of literals starting at the
 *    SOFT_MAX_BLOCK_SIZE'th byte when it sees a sequence of increasingly better
 *    matches. The final match may be up to LZX_MAX_MATCH_LEN bytes. The
 *    length of the literal sequence is approximately limited by the "nice match
 *    length" parameter.
 */
#define SOFT_MAX_BLOCK_SIZE	100000

/*
 * The number of observed items (matches and literals) that represents
 * sufficient data for the compressor to decide whether the current block should
 * be ended or not.
 */
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK	400
/******************************************************************************/
/*                       Parameters for slower algorithm                      */
/*----------------------------------------------------------------------------*/

/*
 * The log base 2 of the number of entries in the hash table for finding length
 * 2 matches. This could be as high as 16, but using a smaller hash table
 * speeds up compression due to reduced cache pressure.
 */
#define BT_MATCHFINDER_HASH2_ORDER	12

/*
 * The number of lz_match structures in the match cache, excluding the extra
 * "overflow" entries. This value should be high enough so that nearly all the
 * time, all matches found in a given block can fit in the match cache.
 * However, fallback behavior (immediately terminating the block) on cache
 * overflow is still required.
 */
#define CACHE_LENGTH	(SOFT_MAX_BLOCK_SIZE * 5)

/*
 * An upper bound on the number of matches that can ever be saved in the match
 * cache for a single position. Since each match we save for a single position
 * has a distinct length, we can use the number of possible match lengths in LZX
 * as this bound. This bound is guaranteed to be valid in all cases, although
 * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be
 * reached.
 */
#define MAX_MATCHES_PER_POS	LZX_NUM_LENS

/*
 * A scaling factor that makes it possible to consider fractional bit costs. A
 * single bit has a cost of (1 << COST_SHIFT).
 *
 * Note: this is only useful as a statistical trick for when the true costs are
 * unknown. In reality, each token in LZX requires a whole number of bits to
 * output.
 */
#define COST_SHIFT	6
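/*
 * For example (illustrative): with COST_SHIFT == 6, one bit costs 1 << 6 = 64,
 * so a symbol whose estimated cost is 5.25 bits can be represented as the
 * integer 336 (5.25 * 64). Only relative comparisons of path costs matter, so
 * the scale factor never needs to be divided back out.
 */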
/*
 * Should the compressor take into account the costs of aligned offset symbols?
 */
#define CONSIDER_ALIGNED_COSTS	1

/*
 * Should the "minimum" cost path search algorithm consider "gap" matches, where
 * a normal match is followed by a literal, then by a match with the same
 * offset? This is one specific, somewhat common situation in which the true
 * minimum cost path is often different from the path found by looking only one
 * step ahead.
 */
#define CONSIDER_GAP_MATCHES	1
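/*
 * Illustrative example of a "gap match" (hypothetical data): in the buffer
 * "1234a1234b1234", positions 5-8 match "1234" at offset 5, position 9 is the
 * literal 'b', and positions 10-13 again match "1234" at offset 5; that is,
 * a match, then a literal, then a repeat offset match with the same offset.
 */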
/******************************************************************************/
/*                                  Includes                                  */
/*----------------------------------------------------------------------------*/

#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/error.h"
#include "wimlib/lz_extend.h"
#include "wimlib/lzx_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"

/* Note: BT_MATCHFINDER_HASH2_ORDER must be defined *before* including
 * bt_matchfinder.h. */

/* Matchfinders with 16-bit positions */
#define MF_SUFFIX	_16
#include "wimlib/bt_matchfinder.h"
#include "wimlib/hc_matchfinder.h"

/* Matchfinders with 32-bit positions */
#undef MF_SUFFIX
#define MF_SUFFIX	_32
#include "wimlib/bt_matchfinder.h"
#include "wimlib/hc_matchfinder.h"
/******************************************************************************/
/*                            Compressor structure                            */
/*----------------------------------------------------------------------------*/

/* Codewords for the Huffman codes */
struct lzx_codewords {
	u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
	u32 len[LZX_LENCODE_NUM_SYMBOLS];
	u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};

/*
 * Codeword lengths, in bits, for the Huffman codes.
 *
 * A codeword length of 0 means the corresponding codeword has zero frequency.
 *
 * The main and length codes each have one extra entry for use as a sentinel.
 * See lzx_write_compressed_code().
 */
struct lzx_lens {
	u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
	u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
	u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};

/* Codewords and lengths for the Huffman codes */
struct lzx_codes {
	struct lzx_codewords codewords;
	struct lzx_lens lens;
};

/* Symbol frequency counters for the Huffman-encoded alphabets */
struct lzx_freqs {
	u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
	u32 len[LZX_LENCODE_NUM_SYMBOLS];
	u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};
/* Block split statistics. See the "Block splitting algorithm" section later in
 * this file for details. */
#define NUM_LITERAL_OBSERVATION_TYPES	8
#define NUM_MATCH_OBSERVATION_TYPES	2
#define NUM_OBSERVATION_TYPES	(NUM_LITERAL_OBSERVATION_TYPES + \
				 NUM_MATCH_OBSERVATION_TYPES)
struct lzx_block_split_stats {
	u32 new_observations[NUM_OBSERVATION_TYPES];
	u32 observations[NUM_OBSERVATION_TYPES];
	u32 num_new_observations;
	u32 num_observations;
};
/*
 * Represents a run of literals followed by a match or end-of-block. This
 * structure is needed to temporarily store items chosen by the compressor,
 * since items cannot be written until all items for the block have been chosen
 * and the block's Huffman codes have been computed.
 */
struct lzx_sequence {

	/* The number of literals in the run. This may be 0. The literals are
	 * not stored explicitly in this structure; instead, they are read
	 * directly from the uncompressed data. */
	u16 litrunlen;

	/* If the next field doesn't indicate end-of-block, then this is the
	 * match length minus LZX_MIN_MATCH_LEN. */
	u16 adjusted_length;

	/* If bit 31 is clear, then this field contains the match header in bits
	 * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a
	 * recent offset code in bits 9-30. Otherwise (if bit 31 is set), this
	 * sequence's literal run was the last literal run in the block, so
	 * there is no match that follows it. */
	u32 adjusted_offset_and_match_hdr;
};
/*
 * This structure represents a byte position in the input buffer and a node in
 * the graph of possible match/literal choices.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 */
struct lzx_optimum_node {

	/* The cost, in bits, of the lowest-cost path that has been found to
	 * reach this position. This can change as progressively lower cost
	 * paths are found to reach this position. */
	u32 cost;

	/*
	 * The match or literal that was taken to reach this position. This can
	 * change as progressively lower cost paths are found to reach this
	 * position.
	 *
	 * For non-gap matches, this variable is divided into two bitfields
	 * whose meanings depend on the item type:
	 *
	 * Literals:
	 *	Low bits are 0, high bits are the literal.
	 *
	 * Explicit offset matches:
	 *	Low bits are the match length, high bits are the offset plus 2.
	 *
	 * Repeat offset matches:
	 *	Low bits are the match length, high bits are the queue index.
	 *
	 * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
	 * behavior applies --- see the code.
	 */
	u32 item;
#define OPTIMUM_OFFSET_SHIFT	9
#define OPTIMUM_LEN_MASK	((1 << OPTIMUM_OFFSET_SHIFT) - 1)
#if CONSIDER_GAP_MATCHES
#  define OPTIMUM_GAP_MATCH	0x80000000
#endif

} _aligned_attribute(8);
/* The cost model for near-optimal parsing */
struct lzx_costs {

	/* 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost for a
	 * length 'len' match that has an offset belonging to 'offset_slot'. */
	u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];

	/* Cost for each symbol in the main code */
	u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];

	/* Cost for each symbol in the length code */
	u32 len[LZX_LENCODE_NUM_SYMBOLS];

#if CONSIDER_ALIGNED_COSTS
	/* Cost for each symbol in the aligned code */
	u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
#endif
};
struct lzx_output_bitstream;

/* The main LZX compressor structure */
struct lzx_compressor {

	/* The buffer for preprocessed input data, if not using destructive
	 * compression */
	void *in_buffer;

	/* If true, the compressor need not preserve the input buffer if it
	 * compresses the data successfully. */
	bool destructive;

	/* Pointer to the compress() implementation chosen at allocation time */
	void (*impl)(struct lzx_compressor *, const u8 *, size_t,
		     struct lzx_output_bitstream *);
	/* The log base 2 of the window size for LZ match offset encoding
	 * purposes. This will be >= LZX_MIN_WINDOW_ORDER and <=
	 * LZX_MAX_WINDOW_ORDER. */
	unsigned window_order;

	/* The number of symbols in the main alphabet. This depends on the
	 * window order, since the window order determines the maximum possible
	 * match offset. */
	unsigned num_main_syms;

	/* The "nice" match length: if a match of this length is found, then it
	 * is chosen immediately without further consideration. */
	unsigned nice_match_length;

	/* The maximum search depth: at most this many potential matches are
	 * considered at each position. */
	unsigned max_search_depth;

	/* The number of optimization passes per block */
	unsigned num_optim_passes;

	/* The symbol frequency counters for the current block */
	struct lzx_freqs freqs;

	/* Block split statistics */
	struct lzx_block_split_stats split_stats;

	/* The Huffman codes for the current and previous blocks. The one with
	 * index 'codes_index' is for the current block, and the other one is
	 * for the previous block. */
	struct lzx_codes codes[2];
	unsigned codes_index;

	/* The matches and literals that the compressor has chosen for the
	 * current block. The required length of this array is limited by the
	 * maximum number of matches that can ever be chosen for a single block,
	 * plus one for the special entry at the end. */
	struct lzx_sequence chosen_sequences[
			DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];

	/* Tables for mapping adjusted offsets to offset slots */
	u8 offset_slot_tab_1[32768];	/* offset slots [0, 29] */
	u8 offset_slot_tab_2[128];	/* offset slots [30, 49] */
	union {
	/* Data for lzx_compress_lazy() */
	struct {
		/* Hash chains matchfinder (MUST BE LAST!!!) */
		union {
			struct hc_matchfinder_16 hc_mf_16;
			struct hc_matchfinder_32 hc_mf_32;
		};
	};

	/* Data for lzx_compress_near_optimal() */
	struct {
		/*
		 * Array of nodes, one per position, for running the
		 * minimum-cost path algorithm.
		 *
		 * This array must be large enough to accommodate the
		 * worst-case number of nodes, which occurs if the
		 * compressor finds a match of length LZX_MAX_MATCH_LEN
		 * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
		 * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
		 * LZX_MAX_MATCH_LEN'. Add one for the end-of-block
		 * node.
		 */
		struct lzx_optimum_node optimum_nodes[
				SOFT_MAX_BLOCK_SIZE - 1 +
				LZX_MAX_MATCH_LEN + 1];
		/* The cost model for the current optimization pass */
		struct lzx_costs costs;

		/*
		 * Cached matches for the current block. This array
		 * contains the matches that were found at each position
		 * in the block. Specifically, for each position, there
		 * is a special 'struct lz_match' whose 'length' field
		 * contains the number of matches that were found at
		 * that position; this is followed by the matches
		 * themselves, if any, sorted by strictly increasing
		 * length.
		 *
		 * Note: in rare cases, there will be a very high number
		 * of matches in the block and this array will overflow.
		 * If this happens, we force the end of the current
		 * block. CACHE_LENGTH is the length at which we
		 * actually check for overflow. The extra slots beyond
		 * this are enough to absorb the worst case overflow,
		 * which occurs if starting at &match_cache[CACHE_LENGTH
		 * - 1], we write the match count header, then write
		 * MAX_MATCHES_PER_POS matches, then skip searching for
		 * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
		 * write the match count header for each.
		 */
		struct lz_match match_cache[CACHE_LENGTH +
					    MAX_MATCHES_PER_POS +
					    LZX_MAX_MATCH_LEN - 1];

		/* Binary trees matchfinder (MUST BE LAST!!!) */
		union {
			struct bt_matchfinder_16 bt_mf_16;
			struct bt_matchfinder_32 bt_mf_32;
		};
	};
	};
};
/******************************************************************************/
/*                            Matchfinder utilities                           */
/*----------------------------------------------------------------------------*/

/*
 * Will a matchfinder using 16-bit positions be sufficient for compressing
 * buffers of up to the specified size? The limit could be 65536 bytes, but we
 * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
 * This requires that the limit be no more than the length of offset_slot_tab_1
 * (32768).
 */
static inline bool
lzx_is_16_bit(size_t max_bufsize)
{
	STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
	return max_bufsize <= 32768;
}
/*
 * Return the offset slot for the specified adjusted match offset.
 */
static inline unsigned
lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
		    bool is_16_bit)
{
	if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
		return c->offset_slot_tab_1[adjusted_offset];
	return c->offset_slot_tab_2[adjusted_offset >> 14];
}
/*
 * The following macros call either the 16-bit or the 32-bit version of a
 * matchfinder function based on the value of 'is_16_bit', which will be known
 * at compilation time.
 */

#define CALL_HC_MF(is_16_bit, c, funcname, ...)				       \
	((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
		       CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__));

#define CALL_BT_MF(is_16_bit, c, funcname, ...)				       \
	((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
		       CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__));
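/*
 * For example (an illustrative sketch; the real call sites appear later in
 * this file), a call such as
 *
 *	CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
 *
 * expands to bt_matchfinder_init_16(&c->bt_mf_16) or
 * bt_matchfinder_init_32(&c->bt_mf_32), and the branch on 'is_16_bit' is
 * optimized out whenever the value is a compile-time constant.
 */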
/******************************************************************************/
/*                              Output bitstream                              */
/*----------------------------------------------------------------------------*/

/*
 * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
 * units. Bits are ordered from most significant to least significant within
 * each coding unit.
 */

/*
 * Structure to keep track of the current state of sending bits to the
 * compressed output buffer.
 */
struct lzx_output_bitstream {

	/* Bits that haven't yet been written to the output buffer */
	machine_word_t bitbuf;

	/* Number of bits currently held in @bitbuf */
	machine_word_t bitcount;

	/* Pointer to the start of the output buffer */
	u8 *start;

	/* Pointer to the position in the output buffer at which the next coding
	 * unit should be written */
	u8 *next;

	/* Pointer just past the end of the output buffer, rounded down to a
	 * 2-byte boundary */
	u8 *end;
};

/* Can the specified number of bits always be added to 'bitbuf' after any
 * pending 16-bit coding units have been flushed? */
#define CAN_BUFFER(n)	((n) <= WORDBITS - 15)
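/*
 * Concretely: on a 64-bit target (WORDBITS == 64), CAN_BUFFER(n) holds for
 * n <= 49, so e.g. three main-code symbols (3 * MAIN_CODEWORD_LIMIT = 48 bits)
 * can be accumulated between flushes; on a 32-bit target it holds only for
 * n <= 17.
 */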
/* Initialize the output bitstream to write to the specified buffer. */
static void
lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->start = buffer;
	os->next = os->start;
	os->end = os->start + (size & ~1);
}

/*
 * Add some bits to the bitbuffer variable of the output bitstream. The caller
 * must make sure there is enough room.
 */
static inline void
lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
{
	os->bitbuf = (os->bitbuf << num_bits) | bits;
	os->bitcount += num_bits;
}
/*
 * Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits'
 * specifies the maximum number of bits that may have been added since the last
 * flush.
 */
static inline void
lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
{
	/* Masking the number of bits to shift is only needed to avoid undefined
	 * behavior; we don't actually care about the results of bad shifts. On
	 * x86, the explicit masking generates no extra code. */
	const u32 shift_mask = WORDBITS - 1;

	if (os->end - os->next < 6)
		return;

	put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
					  shift_mask), os->next + 0);
	if (max_num_bits > 16)
		put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
						  shift_mask), os->next + 2);
	if (max_num_bits > 32)
		put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
						  shift_mask), os->next + 4);

	os->next += (os->bitcount >> 4) << 1;
	os->bitcount &= 15;
}
/* Add at most 16 bits to the bitbuffer and flush it. */
static inline void
lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
{
	lzx_add_bits(os, bits, num_bits);
	lzx_flush_bits(os, 16);
}

/*
 * Flush the last coding unit to the output buffer if needed. Return the total
 * number of bytes written to the output buffer, or 0 if an overflow occurred.
 */
static u32
lzx_flush_output(struct lzx_output_bitstream *os)
{
	if (os->end - os->next < 6)
		return 0;

	if (os->bitcount != 0) {
		put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
		os->next += 2;
	}

	return os->next - os->start;
}
/******************************************************************************/
/*                           Preparing Huffman codes                          */
/*----------------------------------------------------------------------------*/

/*
 * Build the Huffman codes. This takes as input the frequency tables for each
 * code and produces as output a set of tables that map symbols to codewords and
 * codeword lengths.
 */
static void
lzx_build_huffman_codes(struct lzx_compressor *c)
{
	const struct lzx_freqs *freqs = &c->freqs;
	struct lzx_codes *codes = &c->codes[c->codes_index];

	STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
		      MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
	make_canonical_huffman_code(c->num_main_syms,
				    MAIN_CODEWORD_LIMIT,
				    freqs->main,
				    codes->lens.main,
				    codes->codewords.main);

	STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
		      LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
	make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
				    LENGTH_CODEWORD_LIMIT,
				    freqs->len,
				    codes->lens.len,
				    codes->codewords.len);

	STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
		      ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
	make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
				    ALIGNED_CODEWORD_LIMIT,
				    freqs->aligned,
				    codes->lens.aligned,
				    codes->codewords.aligned);
}

/* Reset the symbol frequencies for the current block. */
static void
lzx_reset_symbol_frequencies(struct lzx_compressor *c)
{
	memset(&c->freqs, 0, sizeof(c->freqs));
}
static unsigned
lzx_compute_precode_items(const u8 lens[restrict],
			  const u8 prev_lens[restrict],
			  u32 precode_freqs[restrict],
			  unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	int delta;
	u8 len;

	itemptr = precode_items;
	run_start = 0;

	while (!((len = lens[run_start]) & 0x80)) {

		/* len = the length being repeated */

		/* Find the next run of codeword lengths. */

		run_end = run_start + 1;

		/* Fast case for a single length. */
		if (likely(len != lens[run_end])) {
			delta = prev_lens[run_start] - len;
			if (delta < 0)
				delta += 17;
			precode_freqs[delta]++;
			*itemptr++ = delta;
			run_start++;
			continue;
		}

		/* Extend the run. */
		do {
			run_end++;
		} while (len == lens[run_end]);

		if (len == 0) {
			/* Symbol 18: RLE 20 to 51 zeroes at a time. */
			while ((run_end - run_start) >= 20) {
				extra_bits = min((run_end - run_start) - 20, 0x1f);
				precode_freqs[18]++;
				*itemptr++ = 18 | (extra_bits << 5);
				run_start += 20 + extra_bits;
			}

			/* Symbol 17: RLE 4 to 19 zeroes at a time. */
			if ((run_end - run_start) >= 4) {
				extra_bits = min((run_end - run_start) - 4, 0xf);
				precode_freqs[17]++;
				*itemptr++ = 17 | (extra_bits << 5);
				run_start += 4 + extra_bits;
			}
		} else {

			/* A run of nonzero lengths. */

			/* Symbol 19: RLE 4 to 5 of any length at a time. */
			while ((run_end - run_start) >= 4) {
				extra_bits = (run_end - run_start) > 4;
				delta = prev_lens[run_start] - len;
				if (delta < 0)
					delta += 17;
				precode_freqs[19]++;
				precode_freqs[delta]++;
				*itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
				run_start += 4 + extra_bits;
			}
		}

		/* Output any remaining lengths without RLE. */
		while (run_start != run_end) {
			delta = prev_lens[run_start] - len;
			if (delta < 0)
				delta += 17;
			precode_freqs[delta]++;
			*itemptr++ = delta;
			run_start++;
		}
	}

	return itemptr - precode_items;
}
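/*
 * Worked example (illustrative): a single codeword length of 5 whose
 * corresponding length in the previous block was 7 is coded as the literal
 * delta symbol 7 - 5 = 2, while a run of 25 zero lengths becomes the single
 * item 18 | (5 << 5), since extra_bits = min(25 - 20, 0x1f) = 5 lets symbol 18
 * consume all 20 + 5 = 25 zeroes.
 */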
/******************************************************************************/
/*                          Outputting compressed data                        */
/*----------------------------------------------------------------------------*/

/*
 * Output a Huffman code in the compressed form used in LZX.
 *
 * The Huffman code is represented in the output as a logical series of codeword
 * lengths from which the Huffman code, which must be in canonical form, can be
 * reconstructed.
 *
 * The codeword lengths are themselves compressed using a separate Huffman code,
 * the "precode", which contains a symbol for each possible codeword length in
 * the larger code as well as several special symbols to represent repeated
 * codeword lengths (a form of run-length encoding). The precode is itself
 * constructed in canonical form, and its codeword lengths are represented
 * literally in 20 4-bit fields that immediately precede the compressed codeword
 * lengths of the larger code.
 *
 * Furthermore, the codeword lengths of the larger code are actually represented
 * as deltas from the codeword lengths of the corresponding code in the previous
 * block.
 *
 * @os
 *	Bitstream to which to write the compressed Huffman code.
 * @lens
 *	The codeword lengths, indexed by symbol, in the Huffman code.
 * @prev_lens
 *	The codeword lengths, indexed by symbol, in the corresponding Huffman
 *	code in the previous block, or all zeroes if this is the first block.
 * @num_lens
 *	The number of symbols in the Huffman code.
 */
static void
lzx_write_compressed_code(struct lzx_output_bitstream *os,
			  const u8 lens[restrict],
			  const u8 prev_lens[restrict],
			  unsigned num_lens)
{
	u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
	u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
	u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
	unsigned precode_items[num_lens];
	unsigned num_precode_items;
	unsigned precode_item;
	unsigned precode_sym;
	unsigned i;
	u8 saved = lens[num_lens];
	*(u8 *)(lens + num_lens) = 0x80;

	for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
		precode_freqs[i] = 0;

	/* Compute the "items" (RLE / literal tokens and extra bits) with which
	 * the codeword lengths in the larger code will be output. */
	num_precode_items = lzx_compute_precode_items(lens,
						      prev_lens,
						      precode_freqs,
						      precode_items);

	/* Build the precode. */
	STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
		      PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
	make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
				    PRE_CODEWORD_LIMIT,
				    precode_freqs, precode_lens,
				    precode_codewords);

	/* Output the lengths of the codewords in the precode. */
	for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
		lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);

	/* Output the encoded lengths of the codewords in the larger code. */
	for (i = 0; i < num_precode_items; i++) {
		precode_item = precode_items[i];
		precode_sym = precode_item & 0x1F;
		lzx_add_bits(os, precode_codewords[precode_sym],
			     precode_lens[precode_sym]);
		if (precode_sym >= 17) {
			if (precode_sym == 17) {
				lzx_add_bits(os, precode_item >> 5, 4);
			} else if (precode_sym == 18) {
				lzx_add_bits(os, precode_item >> 5, 5);
			} else {
				lzx_add_bits(os, (precode_item >> 5) & 1, 1);
				precode_sym = precode_item >> 6;
				lzx_add_bits(os, precode_codewords[precode_sym],
					     precode_lens[precode_sym]);
			}
		}
		STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
		lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
	}

	*(u8 *)(lens + num_lens) = saved;
}
/*
 * Write all matches and literal bytes (which were precomputed) in an LZX
 * compressed block to the output bitstream in the final compressed
 * representation.
 *
 * @os
 *	The output bitstream.
 * @block_type
 *	The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
 *	LZX_BLOCKTYPE_VERBATIM).
 * @block_data
 *	The uncompressed data of the block.
 * @sequences
 *	The matches and literals to output, given as a series of sequences.
 * @codes
 *	The main, length, and aligned offset Huffman codes for the current
 *	LZX compressed block.
 */
static void
lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
		    const u8 *block_data, const struct lzx_sequence sequences[],
		    const struct lzx_codes *codes)
{
	const struct lzx_sequence *seq = sequences;
	u32 ones_if_aligned = 0 - (block_type == LZX_BLOCKTYPE_ALIGNED);

	for (;;) {
		/* Output the next sequence. */

		unsigned litrunlen = seq->litrunlen;
		unsigned match_hdr;
		unsigned main_symbol;
		unsigned adjusted_length;
		u32 adjusted_offset;
		unsigned offset_slot;
		unsigned num_extra_bits;
		u32 extra_bits;

		/* Output the literal run of the sequence. */

		if (litrunlen) {  /* Is the literal run nonempty? */

			/* Verify optimization is enabled on 64-bit */
			STATIC_ASSERT(WORDBITS < 64 ||
				      CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));

			if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {

				/* 64-bit: write 3 literals at a time. */
				while (litrunlen >= 3) {
					unsigned lit0 = block_data[0];
					unsigned lit1 = block_data[1];
					unsigned lit2 = block_data[2];
					lzx_add_bits(os, codes->codewords.main[lit0],
						     codes->lens.main[lit0]);
					lzx_add_bits(os, codes->codewords.main[lit1],
						     codes->lens.main[lit1]);
					lzx_add_bits(os, codes->codewords.main[lit2],
						     codes->lens.main[lit2]);
					lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
					block_data += 3;
					litrunlen -= 3;
				}

				/* Output the remaining 0, 1, or 2 literals. */
				if (litrunlen--) {
					unsigned lit = *block_data++;
					lzx_add_bits(os, codes->codewords.main[lit],
						     codes->lens.main[lit]);
					if (litrunlen--) {
						unsigned lit = *block_data++;
						lzx_add_bits(os, codes->codewords.main[lit],
							     codes->lens.main[lit]);
						lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
					} else {
						lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
					}
				}
			} else {
				/* 32-bit: write 1 literal at a time. */
				do {
					unsigned lit = *block_data++;
					lzx_add_bits(os, codes->codewords.main[lit],
						     codes->lens.main[lit]);
					lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
				} while (--litrunlen);
			}
		}
		/* Was this the last literal run? */
		if (seq->adjusted_offset_and_match_hdr & 0x80000000)
			return;

		/* Nope; output the match. */

		match_hdr = seq->adjusted_offset_and_match_hdr & 0x1FF;
		main_symbol = LZX_NUM_CHARS + match_hdr;
		adjusted_length = seq->adjusted_length;

		block_data += adjusted_length + LZX_MIN_MATCH_LEN;

		offset_slot = match_hdr / LZX_NUM_LEN_HEADERS;
		adjusted_offset = seq->adjusted_offset_and_match_hdr >> 9;

		num_extra_bits = lzx_extra_offset_bits[offset_slot];
		extra_bits = adjusted_offset - lzx_offset_slot_base[offset_slot];

	#define MAX_MATCH_BITS	(MAIN_CODEWORD_LIMIT + LENGTH_CODEWORD_LIMIT + \
				 14 + ALIGNED_CODEWORD_LIMIT)

		/* Verify optimization is enabled on 64-bit */
		STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
		/* Output the main symbol for the match. */

		lzx_add_bits(os, codes->codewords.main[main_symbol],
			     codes->lens.main[main_symbol]);
		if (!CAN_BUFFER(MAX_MATCH_BITS))
			lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);

		/* If needed, output the length symbol for the match. */

		if (adjusted_length >= LZX_NUM_PRIMARY_LENS) {
			lzx_add_bits(os, codes->codewords.len[adjusted_length -
							      LZX_NUM_PRIMARY_LENS],
				     codes->lens.len[adjusted_length -
						     LZX_NUM_PRIMARY_LENS]);
			if (!CAN_BUFFER(MAX_MATCH_BITS))
				lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
		}

		/* Output the extra offset bits for the match. In aligned
		 * offset blocks, the lowest 3 bits of the adjusted offset are
		 * Huffman-encoded using the aligned offset code, provided that
		 * at least 3 extra offset bits are required. All other
		 * extra offset bits are output verbatim. */

		if ((adjusted_offset & ones_if_aligned) >= 16) {

			lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
				     num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
			if (!CAN_BUFFER(MAX_MATCH_BITS))
				lzx_flush_bits(os, 14);

			lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
								  LZX_ALIGNED_OFFSET_BITMASK],
				     codes->lens.aligned[adjusted_offset &
							 LZX_ALIGNED_OFFSET_BITMASK]);
			if (!CAN_BUFFER(MAX_MATCH_BITS))
				lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
		} else {
			STATIC_ASSERT(CAN_BUFFER(17));

			lzx_add_bits(os, extra_bits, num_extra_bits);
			if (!CAN_BUFFER(MAX_MATCH_BITS))
				lzx_flush_bits(os, 17);
		}

		if (CAN_BUFFER(MAX_MATCH_BITS))
			lzx_flush_bits(os, MAX_MATCH_BITS);

		/* Advance to the next sequence. */
		seq++;
	}
}
static void
lzx_write_compressed_block(const u8 *block_begin,
			   int block_type,
			   u32 block_size,
			   unsigned window_order,
			   unsigned num_main_syms,
			   const struct lzx_sequence sequences[],
			   const struct lzx_codes * codes,
			   const struct lzx_lens * prev_lens,
			   struct lzx_output_bitstream * os)
{
	/* The first three bits indicate the type of block and are one of the
	 * LZX_BLOCKTYPE_* constants. */
	lzx_write_bits(os, block_type, 3);

	/*
	 * Output the block size.
	 *
	 * The original LZX format encoded the block size in 24 bits. However,
	 * the LZX format used in WIM archives uses 1 bit to specify whether the
	 * block has the default size of 32768 bytes, then optionally 16 bits to
	 * specify a non-default size. This works fine for Microsoft's WIM
	 * software (WIMGAPI), which never compresses more than 32768 bytes at a
	 * time with LZX. However, as an extension, our LZX compressor supports
	 * compressing up to 2097152 bytes, with a corresponding increase in
	 * window size. It is possible for blocks in these larger buffers to
	 * exceed 65535 bytes; such blocks cannot have their size represented in
	 * 16 bits.
	 *
	 * The chosen solution was to use 24 bits for the block size when
	 * possibly required --- specifically, when the compressor has been
	 * allocated to be capable of compressing more than 32768 bytes at once.
	 */
	if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
		lzx_write_bits(os, 1, 1);
	} else {
		lzx_write_bits(os, 0, 1);

		if (window_order >= 16)
			lzx_write_bits(os, block_size >> 16, 8);

		lzx_write_bits(os, block_size & 0xFFFF, 16);
	}

	/* If it's an aligned offset block, output the aligned offset code. */
	if (block_type == LZX_BLOCKTYPE_ALIGNED) {
		for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
			lzx_write_bits(os, codes->lens.aligned[i],
				       LZX_ALIGNEDCODE_ELEMENT_SIZE);
		}
	}

	/* Output the main code (two parts). */
	lzx_write_compressed_code(os, codes->lens.main,
				  prev_lens->main,
				  LZX_NUM_CHARS);
	lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
				  prev_lens->main + LZX_NUM_CHARS,
				  num_main_syms - LZX_NUM_CHARS);

	/* Output the length code. */
	lzx_write_compressed_code(os, codes->lens.len,
				  prev_lens->len,
				  LZX_LENCODE_NUM_SYMBOLS);

	/* Output the compressed matches and literals. */
	lzx_write_sequences(os, block_type, block_begin, sequences, codes);
}
/*
 * Given the frequencies of symbols in an LZX-compressed block and the
 * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
 * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
 * will take fewer bits to output.
 */
static int
lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
			       const struct lzx_codes * codes)
{
	u32 aligned_cost = 0;
	u32 verbatim_cost = 0;

	/* A verbatim block requires 3 bits in each place that an aligned offset
	 * symbol would be used in an aligned offset block. */
	for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
		verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
		aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
	}

	/* Account for the cost of sending the codeword lengths of the aligned
	 * offset code. */
	aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
			LZX_ALIGNEDCODE_NUM_SYMBOLS;

	if (aligned_cost < verbatim_cost)
		return LZX_BLOCKTYPE_ALIGNED;
	else
		return LZX_BLOCKTYPE_VERBATIM;
}
/*
 * Flush an LZX block:
 *
 * 1. Build the Huffman codes.
 * 2. Decide whether to output the block as VERBATIM or ALIGNED.
 * 3. Write the block.
 * 4. Swap the indices of the current and previous Huffman codes.
 */
static void
lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
		const u8 *block_begin, u32 block_size, u32 seq_idx)
{
	int block_type;

	lzx_build_huffman_codes(c);

	block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
						    &c->codes[c->codes_index]);
	lzx_write_compressed_block(block_begin,
				   block_type,
				   block_size,
				   c->window_order,
				   c->num_main_syms,
				   &c->chosen_sequences[seq_idx],
				   &c->codes[c->codes_index],
				   &c->codes[c->codes_index ^ 1].lens,
				   os);
	c->codes_index ^= 1;
}
/******************************************************************************/
/*                          Block splitting algorithm                         */
/*----------------------------------------------------------------------------*/

/*
 * The problem of block splitting is to decide when it is worthwhile to start a
 * new block with new entropy codes. There is a theoretically optimal solution:
 * recursively consider every possible block split, considering the exact cost
 * of each block, and choose the minimum cost approach. But this is far too
 * slow. Instead, as an approximation, we can count symbols and, after every N
 * symbols, compare the expected distribution of symbols based on the previous
 * data with the actual distribution. If they differ "by enough", then start a
 * new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type". For
 * literals we only look at the high bits and low bits, and for matches we only
 * look at whether the match is long or not. The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
 * based only on changes in these aggregate frequencies, without looking for
 * subtle differences in individual symbols. For example, a change from ASCII
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
 * to many matches (generally more compressible), would be easily noticed based
 * on the aggregates.
 *
 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
 * differences exceeds a constant seems to be good enough.
 *
 * Finally, since this is only an approximation, it is not strictly necessary
 * that the exact symbols being used are considered. With "near-optimal
 * parsing", for example, the actual symbols that will be used are unknown until
 * after the block boundary is chosen and the block has been optimized. Since
 * the final choices cannot be used, we can use preliminary "greedy" choices
 * instead.
 */
/* Initialize the block split statistics when starting a new block. */
static void
lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
{
	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->new_observations[i] = 0;
		stats->observations[i] = 0;
	}
	stats->num_new_observations = 0;
	stats->num_observations = 0;
}

/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the
 * literal, for 8 possible literal observation types. */
static inline void
lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
{
	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
	stats->num_new_observations++;
}
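/*
 * For example (illustrative): the literal 'A' (0x41 = 0b01000001) has top two
 * bits 01 and low bit 1, so lzx_observe_literal() above counts it as
 * observation type ((0x41 >> 5) & 0x6) | (0x41 & 1) == 2 | 1 == 3.
 */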
/* Match observation. Heuristic: use one observation type for "short match" and
 * one observation type for "long match". */
static inline void
lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
{
	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
	stats->num_new_observations++;
}
static bool
lzx_end_block_check(struct lzx_block_split_stats *stats)
{
	if (stats->num_observations > 0) {

		/* Note: to avoid slow divisions, we do not divide by
		 * 'num_observations', but rather do all math with the numbers
		 * multiplied by 'num_observations'. */
		u32 total_delta = 0;

		for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
			u32 expected = stats->observations[i] *
				       stats->num_new_observations;
			u32 actual = stats->new_observations[i] *
				     stats->num_observations;
			u32 delta = (actual > expected) ? actual - expected :
							  expected - actual;
			total_delta += delta;
		}

		/* Ready to end the block? */
		if (total_delta >=
		    stats->num_new_observations * 7 / 8 * stats->num_observations)
			return true;
	}

	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
		stats->num_observations += stats->new_observations[i];
		stats->observations[i] += stats->new_observations[i];
		stats->new_observations[i] = 0;
	}
	stats->num_new_observations = 0;
	return false;
}
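/*
 * Equivalently (dividing both sides by num_observations * num_new_observations
 * and ignoring integer division rounding), the check above ends the block when
 * the L1 distance between the old and new relative frequency distributions,
 *
 *	sum over i of |new_observations[i] / num_new_observations -
 *		       observations[i] / num_observations|,
 *
 * reaches approximately 7/8.
 */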
static bool
lzx_should_end_block(struct lzx_block_split_stats *stats,
		     const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
	/* Ready to check block split statistics? */
	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
	    in_next - in_block_begin < MIN_BLOCK_SIZE ||
	    in_end - in_next < MIN_BLOCK_SIZE)
		return false;

	return lzx_end_block_check(stats);
}
/******************************************************************************/
/*                 Slower ("near-optimal") compression algorithm              */
/*----------------------------------------------------------------------------*/

/*
 * Least-recently-used queue for match offsets.
 *
 * This is represented as a 64-bit integer for efficiency. There are three
 * offsets of 21 bits each. Bit 64 is garbage.
 */
struct lzx_lru_queue {
	u64 R;
};

#define LZX_QUEUE_OFFSET_SHIFT	21
#define LZX_QUEUE_OFFSET_MASK	(((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)

#define LZX_QUEUE_R0_SHIFT	(0 * LZX_QUEUE_OFFSET_SHIFT)
#define LZX_QUEUE_R1_SHIFT	(1 * LZX_QUEUE_OFFSET_SHIFT)
#define LZX_QUEUE_R2_SHIFT	(2 * LZX_QUEUE_OFFSET_SHIFT)

#define LZX_QUEUE_R0_MASK	(LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
#define LZX_QUEUE_R1_MASK	(LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
#define LZX_QUEUE_R2_MASK	(LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)

#define LZX_QUEUE_INITIALIZER {			\
	((u64)1 << LZX_QUEUE_R0_SHIFT) |	\
	((u64)1 << LZX_QUEUE_R1_SHIFT) |	\
	((u64)1 << LZX_QUEUE_R2_SHIFT) }
static inline u32
lzx_lru_queue_R0(struct lzx_lru_queue queue)
{
	return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
}

static inline u32
lzx_lru_queue_R1(struct lzx_lru_queue queue)
{
	return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
}

static inline u32
lzx_lru_queue_R2(struct lzx_lru_queue queue)
{
	return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
}

/* Push a match offset onto the front (most recently used) end of the queue. */
static inline struct lzx_lru_queue
lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
{
	return (struct lzx_lru_queue) {
		.R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
	};
}

/* Swap a match offset to the front of the queue. */
static inline struct lzx_lru_queue
lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
{
	unsigned shift = idx * 21;
	const u64 mask = LZX_QUEUE_R0_MASK;
	const u64 mask_high = mask << shift;

	return (struct lzx_lru_queue) {
		.R = (queue.R & ~(mask | mask_high)) |
		     ((queue.R & mask_high) >> shift) |
		     ((queue.R & mask) << shift),
	};
}
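/*
 * Example (illustrative) of the queue operations defined above:
 *
 *	struct lzx_lru_queue q = LZX_QUEUE_INITIALIZER;	// (R0, R1, R2) = (1, 1, 1)
 *
 *	q = lzx_lru_queue_push(q, 100);	// (100, 1, 1)
 *	q = lzx_lru_queue_push(q, 300);	// (300, 100, 1)
 *	q = lzx_lru_queue_swap(q, 2);	// (1, 100, 300)
 */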
static inline u32
lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
		   bool record)
{
	u32 node_idx = block_size;
	u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1;
	u32 lit_start_node;

	if (record) {
		/* Special value to mark last sequence */
		c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = 0x80000000;
		lit_start_node = node_idx;
	}

	for (;;) {
		u32 item;
		u32 len;
		u32 offset_data;
		unsigned v;
		unsigned offset_slot;

		/* Tally literals until either a match or the beginning of the
		 * block is reached. Note: the item in the node at the
		 * beginning of the block has all bits set, causing this loop to
		 * end when it is reached. */
		for (;;) {
			item = c->optimum_nodes[node_idx].item;
			if (item & OPTIMUM_LEN_MASK)
				break;
			c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
			node_idx--;
		}

#if CONSIDER_GAP_MATCHES
		if (item & OPTIMUM_GAP_MATCH) {

			if (node_idx == 0)
				break;

			/* Save the literal run length for the next sequence
			 * (the "previous sequence" when walking backwards). */
			len = item & OPTIMUM_LEN_MASK;
			if (record) {
				c->chosen_sequences[seq_idx--].litrunlen =
						lit_start_node - node_idx;
				lit_start_node = node_idx - len;
			}

			/* Tally the rep0 match after the gap. */
			v = len - LZX_MIN_MATCH_LEN;
			if (record)
				c->chosen_sequences[seq_idx].adjusted_length = v;
			if (v >= LZX_NUM_PRIMARY_LENS) {
				c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
				v = LZX_NUM_PRIMARY_LENS;
			}
			c->freqs.main[LZX_NUM_CHARS + v]++;
			if (record)
				c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v;

			/* Tally the literal in the gap. */
			c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;

			/* Fall through and tally the match before the gap.
			 * (It was temporarily saved in the 'cost' field of the
			 * previous node, which was free to reuse.) */
			item = c->optimum_nodes[--node_idx].cost;
			node_idx -= len;
		}
#else /* CONSIDER_GAP_MATCHES */
		if (node_idx == 0)
			break;
#endif /* !CONSIDER_GAP_MATCHES */

		len = item & OPTIMUM_LEN_MASK;
		offset_data = item >> OPTIMUM_OFFSET_SHIFT;

		/* Save the literal run length for the next sequence (the
		 * "previous sequence" when walking backwards). */
		if (record) {
			c->chosen_sequences[seq_idx--].litrunlen =
					lit_start_node - node_idx;
			lit_start_node = node_idx;
		}

		node_idx -= len;

		/* Record a match. */

		/* Tally the aligned offset symbol if needed. */
		if (offset_data >= 16)
			c->freqs.aligned[offset_data & LZX_ALIGNED_OFFSET_BITMASK]++;

		/* Save the adjusted length. */
		v = len - LZX_MIN_MATCH_LEN;
		if (record)
			c->chosen_sequences[seq_idx].adjusted_length = v;

		/* Tally the length symbol if needed. */
		if (v >= LZX_NUM_PRIMARY_LENS) {
			c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
			v = LZX_NUM_PRIMARY_LENS;
		}

		/* Tally the main symbol. */
		offset_slot = lzx_get_offset_slot(c, offset_data, is_16_bit);
		v += offset_slot * LZX_NUM_LEN_HEADERS;
		c->freqs.main[LZX_NUM_CHARS + v]++;

		/* Save the adjusted offset and match header. */
		if (record)
			c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr =
					(offset_data << 9) | v;
	}

	/* Save the literal run length for the first sequence. */
	if (record)
		c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx;

	/* Return the index in c->chosen_sequences at which the lzx_sequences
	 * begin. */
	return seq_idx;
}
/*
 * Given the minimum-cost path computed through the item graph for the current
 * block, walk the path and count how many of each symbol in each Huffman-coded
 * alphabet would be required to output the items (matches and literals) along
 * the path.
 *
 * Note that the path will be walked backwards (from the end of the block to the
 * beginning of the block), but this doesn't matter because this function only
 * computes frequencies.
 */
static void
lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
{
	lzx_walk_item_list(c, block_size, is_16_bit, false);
}

/*
 * Like lzx_tally_item_list(), but this function also generates the list of
 * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
 * ready to be output to the bitstream after the Huffman codes are computed.
 * The lzx_sequences will be written to decreasing memory addresses as the path
 * is walked backwards, which means they will end up in the expected
 * first-to-last order. The return value is the index in c->chosen_sequences at
 * which the lzx_sequences begin.
 */
static u32
lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
{
	return lzx_walk_item_list(c, block_size, is_16_bit, true);
}
/*
 * Find an inexpensive path through the graph of possible match/literal choices
 * for the current block. The nodes of the graph are
 * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in
 * the current block, plus one extra node for end-of-block. The edges of the
 * graph are matches and literals. The goal is to find the minimum cost path
 * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
 * model 'c->costs'.
 *
 * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
 * proceeding forwards one node at a time. At each node, a selection of matches
 * (len >= 2), as well as the literal byte (len = 1), is considered. An item of
 * length 'len' provides a new path to reach the node 'len' bytes later. If
 * such a path is the lowest cost found so far to reach that later node, then
 * that later node is updated with the new path.
 *
 * Note that although this algorithm is based on minimum cost path search, due
 * to various simplifying assumptions the result is not guaranteed to be the
 * true minimum cost, or "optimal", path over the graph of all valid LZX
 * representations of this block.
 *
 * Also, note that because of the presence of the recent offsets queue (which is
 * a type of adaptive state), the algorithm cannot work backwards and compute
 * "cost to end" instead of "cost to beginning". Furthermore, the way the
 * algorithm handles this adaptive state in the "minimum cost" parse is actually
 * only an approximation. It's possible for the globally optimal, minimum cost
 * path to contain a prefix, ending at a position, where that path prefix is
 * *not* the minimum cost path to that position. This can happen if such a path
 * prefix results in a different adaptive state which results in lower costs
 * later. The algorithm does not solve this problem; it only considers the
 * lowest cost to reach each individual position.
 */
static inline struct lzx_lru_queue
lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
		       const u8 * const restrict block_begin,
		       const u32 block_size,
		       const struct lzx_lru_queue initial_queue,
		       bool is_16_bit)
{
	struct lzx_optimum_node *cur_node = c->optimum_nodes;
	struct lzx_optimum_node * const end_node = cur_node + block_size;
	struct lz_match *cache_ptr = c->match_cache;
	const u8 *in_next = block_begin;
	const u8 * const block_end = block_begin + block_size;

	/*
	 * Instead of storing the match offset LRU queues in the
	 * 'lzx_optimum_node' structures, we save memory (and cache lines) by
	 * storing them in a smaller array. This works because the algorithm
	 * only requires a limited history of the adaptive state. Once a given
	 * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
	 * (more if gap match consideration is enabled; we just round up to 512
	 * so it's a power of 2), it is no longer needed.
	 */
	struct lzx_lru_queue queues[512];
	STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
	STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
#define QUEUE(node)							\
	(*(struct lzx_lru_queue *)((char *)queues +			\
		((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(*(node))))))
	/*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/

#if CONSIDER_GAP_MATCHES
	u32 matches_before_gap[ARRAY_LEN(queues)];
#define MATCH_BEFORE_GAP(node)						\
	(matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) %	\
			    ARRAY_LEN(matches_before_gap)])
#endif

	/*
	 * Initially, the cost to reach each node is "infinity".
	 *
	 * The first node actually should have cost 0, but "infinity"
	 * (0xFFFFFFFF) works just as well because it immediately overflows.
	 *
	 * This statement also intentionally sets the 'item' of the first node,
	 * which would otherwise have no meaning, to 0xFFFFFFFF for use as a
	 * sentinel. See lzx_walk_item_list().
	 */
	memset(c->optimum_nodes, 0xFF,
	       (block_size + 1) * sizeof(c->optimum_nodes[0]));

	/* Initialize the recent offsets queue for the first node. */
	QUEUE(cur_node) = initial_queue;
	do { /* For each node in the block in position order... */

		unsigned num_matches;
		unsigned literal;
		u32 cost;

		/*
		 * A selection of matches for the block was already saved in
		 * memory so that we don't have to run the uncompressed data
		 * through the matchfinder on every optimization pass. However,
		 * we still search for repeat offset matches during each
		 * optimization pass because we cannot predict the state of the
		 * recent offsets queue. But as a heuristic, we don't bother
		 * searching for repeat offset matches if the general-purpose
		 * matchfinder failed to find any matches.
		 *
		 * Note that a match of length n at some offset implies there is
		 * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
		 * that same offset. In other words, we don't necessarily need
		 * to use the full length of a match. The key heuristic that
		 * saves a significant amount of time is that for each
		 * distinct length, we only consider the smallest offset for
		 * which that length is available. This heuristic also applies
		 * to repeat offsets, which we order specially: R0 < R1 < R2 <
		 * any explicit offset. Of course, this heuristic may
		 * produce suboptimal results because offset slots in LZX are
		 * subject to entropy encoding, but in practice this is a useful
		 * heuristic.
		 */

		num_matches = cache_ptr->length;
		cache_ptr++;

		if (num_matches) {
			struct lz_match *end_matches = cache_ptr + num_matches;
			unsigned next_len = LZX_MIN_MATCH_LEN;
			unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
			const u8 *matchptr;

			/* Consider rep0 matches. */
			matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
			if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
				goto rep1;
			STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
			do {
				u32 cost = cur_node->cost +
					   c->costs.match_cost[0][
							next_len - LZX_MIN_MATCH_LEN];
				if (cost <= (cur_node + next_len)->cost) {
					(cur_node + next_len)->cost = cost;
					(cur_node + next_len)->item =
						(0 << OPTIMUM_OFFSET_SHIFT) | next_len;
				}
				if (unlikely(++next_len > max_len)) {
					cache_ptr = end_matches;
					goto done_matches;
				}
			} while (in_next[next_len - 1] == matchptr[next_len - 1]);

		rep1:
			/* Consider rep1 matches. */
			matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
			if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
				goto rep2;
			if (matchptr[next_len - 1] != in_next[next_len - 1])
				goto rep2;
			for (unsigned len = 2; len < next_len - 1; len++)
				if (matchptr[len] != in_next[len])
					goto rep2;
			do {
				u32 cost = cur_node->cost +
					   c->costs.match_cost[1][
							next_len - LZX_MIN_MATCH_LEN];
				if (cost <= (cur_node + next_len)->cost) {
					(cur_node + next_len)->cost = cost;
					(cur_node + next_len)->item =
						(1 << OPTIMUM_OFFSET_SHIFT) | next_len;
				}
				if (unlikely(++next_len > max_len)) {
					cache_ptr = end_matches;
					goto done_matches;
				}
			} while (in_next[next_len - 1] == matchptr[next_len - 1]);

		rep2:
			/* Consider rep2 matches. */
			matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
			if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
				goto explicit_offsets;
			if (matchptr[next_len - 1] != in_next[next_len - 1])
				goto explicit_offsets;
			for (unsigned len = 2; len < next_len - 1; len++)
				if (matchptr[len] != in_next[len])
					goto explicit_offsets;
			do {
				u32 cost = cur_node->cost +
					   c->costs.match_cost[2][
							next_len - LZX_MIN_MATCH_LEN];
				if (cost <= (cur_node + next_len)->cost) {
					(cur_node + next_len)->cost = cost;
					(cur_node + next_len)->item =
						(2 << OPTIMUM_OFFSET_SHIFT) | next_len;
				}
				if (unlikely(++next_len > max_len)) {
					cache_ptr = end_matches;
					goto done_matches;
				}
			} while (in_next[next_len - 1] == matchptr[next_len - 1]);

		explicit_offsets:
			while (next_len > cache_ptr->length)
				if (++cache_ptr == end_matches)
					goto done_matches;

			/* Consider explicit offset matches. */
			do {
				u32 offset = cache_ptr->offset;
				u32 offset_data = offset + LZX_OFFSET_ADJUSTMENT;
				unsigned offset_slot = lzx_get_offset_slot(c, offset_data, is_16_bit);
				u32 base_cost = cur_node->cost;
				u32 cost;

			#if CONSIDER_ALIGNED_COSTS
				if (offset >= 16 - LZX_OFFSET_ADJUSTMENT)
					base_cost += c->costs.aligned[offset_data &
								      LZX_ALIGNED_OFFSET_BITMASK];
			#endif
				do {
					cost = base_cost +
					       c->costs.match_cost[offset_slot][
							next_len - LZX_MIN_MATCH_LEN];
					if (cost < (cur_node + next_len)->cost) {
						(cur_node + next_len)->cost = cost;
						(cur_node + next_len)->item =
							(offset_data << OPTIMUM_OFFSET_SHIFT) | next_len;
					}
				} while (++next_len <= cache_ptr->length);

			if (++cache_ptr == end_matches) {
			#if CONSIDER_GAP_MATCHES
				/* Also consider the longest explicit
				 * offset match as a "gap match": match
				 * + literal + rep0 match. */
				s32 remaining = (block_end - in_next) - next_len;
				if (likely(remaining >= 2)) {
					const u8 *strptr = in_next + next_len;
					const u8 *matchptr = strptr - offset;
					if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
						STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
						STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
						u32 limit = min(remaining,
								min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
								    LZX_MAX_MATCH_LEN));
						u32 rep0_len = lz_extend(strptr, matchptr, 2, limit);
						u8 lit = strptr[-1];
						cost += c->costs.main[lit] +
							c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
						u32 total_len = next_len + rep0_len;
						if (cost < (cur_node + total_len)->cost) {
							(cur_node + total_len)->cost = cost;
							(cur_node + total_len)->item =
								OPTIMUM_GAP_MATCH |
								((u32)lit << OPTIMUM_OFFSET_SHIFT) |
								rep0_len;
							MATCH_BEFORE_GAP(cur_node + total_len) =
								(offset_data << OPTIMUM_OFFSET_SHIFT) |
								(next_len - 1);
						}
					}
				}
			#endif /* CONSIDER_GAP_MATCHES */
				break;
			}
		} while (1);

	done_matches:
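
		/*
		 * Illustration (not from the original source): the "gap match"
		 * considered above covers a span of the form
		 *
		 *	[explicit offset match][one literal][rep0 match]
		 *
		 * where both matches use the same offset, e.g. a
		 * record-structured file in which one byte changes between
		 * repetitions of a record. Coding the second match as rep0
		 * reuses the offset just pushed by the explicit match, which
		 * is cheaper than coding the offset twice.
		 */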

		/* Consider coding a literal.
		 *
		 * To avoid an extra branch, actually checking the preferability
		 * of coding the literal is integrated into the queue update
		 * code below. */
		literal = *in_next++;
		cost = cur_node->cost + c->costs.main[literal];

		/* Advance to the next position. */
		cur_node++;

		/* The lowest-cost path to the current position is now known.
		 * Finalize the recent offsets queue that results from taking
		 * this lowest-cost path. */

		if (cost <= cur_node->cost) {
			/* Literal: queue remains unchanged. */
			cur_node->cost = cost;
			cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
			QUEUE(cur_node) = QUEUE(cur_node - 1);
		} else {
			/* Match: queue update is needed. */
			unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
		#if CONSIDER_GAP_MATCHES
			s32 offset_data = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
			STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
		#else
			u32 offset_data = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
		#endif

			if (offset_data >= LZX_NUM_RECENT_OFFSETS) {
				/* Explicit offset match: insert offset at front. */
				QUEUE(cur_node) =
					lzx_lru_queue_push(QUEUE(cur_node - len),
							   offset_data - LZX_OFFSET_ADJUSTMENT);
			} else
		#if CONSIDER_GAP_MATCHES
			if (offset_data < 0) {
				/* "Gap match": Explicit offset match, then a
				 * literal, then rep0 match. Save the explicit
				 * offset match information in the cost field of
				 * the previous node, which isn't needed
				 * anymore. Then insert the offset at the front
				 * of the queue. */
				u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
				(cur_node - 1)->cost = match_before_gap;
				QUEUE(cur_node) =
					lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
								 (match_before_gap & OPTIMUM_LEN_MASK)),
							   (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
							   LZX_OFFSET_ADJUSTMENT);
			} else
		#endif
			{
				/* Repeat offset match: swap offset to front. */
				QUEUE(cur_node) =
					lzx_lru_queue_swap(QUEUE(cur_node - len),
							   offset_data);
			}
		}
	} while (cur_node != end_node);

	/* Return the recent offsets queue at the end of the path. */
	return QUEUE(cur_node);
}
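
/*
 * Worked example (illustrative, not from the original source) of the recent
 * offsets queue operations assumed above: if the queue holds {100, 60, 8},
 * then taking an explicit offset match with offset 30 gives
 * lzx_lru_queue_push() -> {30, 100, 60} (the oldest offset, 8, is evicted),
 * while taking a rep2 match gives lzx_lru_queue_swap() -> {8, 60, 100}.
 * Literals leave the queue untouched, which is why the literal branch simply
 * copies QUEUE(cur_node - 1).
 */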

/* Given the costs for the main and length codewords, compute 'match_costs'. */
static void
lzx_compute_match_costs(struct lzx_compressor *c)
{
	unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
				    LZX_NUM_LEN_HEADERS;
	struct lzx_costs *costs = &c->costs;

	for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
	     offset_slot++)
	{
		u32 extra_cost = (u32)lzx_extra_offset_bits[offset_slot] <<
				 COST_SHIFT;
		unsigned main_symbol = LZX_NUM_CHARS + (offset_slot *
							LZX_NUM_LEN_HEADERS);
		unsigned i;

	#if CONSIDER_ALIGNED_COSTS
		if (offset_slot >= 8)
			extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS << COST_SHIFT;
	#endif

		for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
			costs->match_cost[offset_slot][i] =
				costs->main[main_symbol++] + extra_cost;
		}

		extra_cost += costs->main[main_symbol];

		for (; i < LZX_NUM_LENS; i++) {
			costs->match_cost[offset_slot][i] =
				costs->len[i - LZX_NUM_PRIMARY_LENS] +
				extra_cost;
		}
	}
}
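
/*
 * Worked example (illustrative): a match whose adjusted length
 * v = length - LZX_MIN_MATCH_LEN falls in the primary range costs one main
 * symbol plus the raw offset bits:
 *
 *	match_cost[slot][v] = main[LZX_NUM_CHARS + slot*LZX_NUM_LEN_HEADERS + v]
 *			      + (lzx_extra_offset_bits[slot] << COST_SHIFT)
 *
 * (less the aligned-bits correction for slots >= 8 when
 * CONSIDER_ALIGNED_COSTS is enabled). Longer matches pay for the "long
 * length" main symbol plus a length symbol, which is exactly what the second
 * loop above precomputes.
 */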

/* Set default LZX Huffman symbol costs to bootstrap the iterative optimization
 * algorithm. */
static void
lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size)
{
	unsigned i;
	bool have_byte[256];
	unsigned num_used_bytes;

	/* The costs below are hard coded to use a COST_SHIFT of 6. */
	STATIC_ASSERT(COST_SHIFT == 6);

	/*
	 * Heuristics:
	 *
	 * - Use smaller initial costs for literal symbols when the input buffer
	 *   contains fewer distinct bytes.
	 *
	 * - Assume that match symbols are more costly than literal symbols.
	 *
	 * - Assume that length symbols for shorter lengths are less costly than
	 *   length symbols for longer lengths.
	 */

	for (i = 0; i < 256; i++)
		have_byte[i] = false;

	for (i = 0; i < block_size; i++)
		have_byte[block[i]] = true;

	num_used_bytes = 0;
	for (i = 0; i < 256; i++)
		num_used_bytes += have_byte[i];

	for (i = 0; i < 256; i++)
		c->costs.main[i] = 560 - (256 - num_used_bytes);

	for (; i < c->num_main_syms; i++)
		c->costs.main[i] = 680;

	for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
		c->costs.len[i] = 412 + i;

#if CONSIDER_ALIGNED_COSTS
	for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
		c->costs.aligned[i] = LZX_NUM_ALIGNED_OFFSET_BITS << COST_SHIFT;
#endif

	lzx_compute_match_costs(c);
}
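
/*
 * Worked example (illustrative): with COST_SHIFT == 6, one bit costs 64
 * units. A block that uses 200 distinct byte values starts with literal
 * costs of 560 - (256 - 200) = 504 units, i.e. 504/64 = 7.875 bits, while
 * match symbols start at 680/64 = 10.625 bits. Inputs using fewer distinct
 * bytes therefore begin the first optimization pass with cheaper literals.
 */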

/* Update the current cost model to reflect the computed Huffman codes. */
static void
lzx_set_costs_from_codes(struct lzx_compressor *c)
{
	unsigned i;
	const struct lzx_lens *lens = &c->codes[c->codes_index].lens;

	for (i = 0; i < c->num_main_syms; i++) {
		c->costs.main[i] = (lens->main[i] ? lens->main[i] :
				    MAIN_CODEWORD_LIMIT) << COST_SHIFT;
	}

	for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
		c->costs.len[i] = (lens->len[i] ? lens->len[i] :
				   LENGTH_CODEWORD_LIMIT) << COST_SHIFT;
	}

#if CONSIDER_ALIGNED_COSTS
	for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
		c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
				       ALIGNED_CODEWORD_LIMIT) << COST_SHIFT;
	}
#endif

	lzx_compute_match_costs(c);
}

/*
 * Choose a "near-optimal" literal/match sequence to use for the current block,
 * then flush the block. Because the cost of each Huffman symbol is unknown
 * until the Huffman codes have been built and the Huffman codes themselves
 * depend on the symbol frequencies, this uses an iterative optimization
 * algorithm to approximate an optimal solution. The first optimization pass
 * for the block uses default costs. Additional passes use costs taken from the
 * Huffman codes computed in the previous pass.
 */
static inline struct lzx_lru_queue
lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
			     struct lzx_output_bitstream * const restrict os,
			     const u8 * const restrict block_begin,
			     const u32 block_size,
			     const struct lzx_lru_queue initial_queue,
			     bool is_16_bit)
{
	unsigned num_passes_remaining = c->num_optim_passes;
	struct lzx_lru_queue new_queue;
	u32 seq_idx;

	lzx_set_default_costs(c, block_begin, block_size);

	for (;;) {
		new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
						   initial_queue, is_16_bit);

		if (--num_passes_remaining == 0)
			break;

		/* At least one optimization pass remains. Update the costs. */
		lzx_reset_symbol_frequencies(c);
		lzx_tally_item_list(c, block_size, is_16_bit);
		lzx_build_huffman_codes(c);
		lzx_set_costs_from_codes(c);
	}

	/* Done optimizing. Generate the sequence list and flush the block. */
	lzx_reset_symbol_frequencies(c);
	seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
	lzx_flush_block(c, os, block_begin, block_size, seq_idx);

	return new_queue;
}

/*
 * This is the "near-optimal" LZX compressor.
 *
 * For each block, it performs a relatively thorough graph search to find an
 * inexpensive (in terms of compressed size) way to output the block.
 *
 * Note: there are actually many things this algorithm leaves on the table in
 * terms of compression ratio. So although it may be "near-optimal", it is
 * certainly not "optimal". The goal is not to produce the optimal compression
 * ratio, which for LZX is probably impossible within any practical amount of
 * time, but rather to produce a compression ratio significantly better than a
 * simpler "greedy" or "lazy" parse while still being relatively fast.
 */
static inline void
lzx_compress_near_optimal(struct lzx_compressor * restrict c,
			  const u8 * const restrict in_begin, size_t in_nbytes,
			  struct lzx_output_bitstream * restrict os,
			  bool is_16_bit)
{
	const u8 * in_next = in_begin;
	const u8 * const in_end = in_begin + in_nbytes;
	u32 max_len = LZX_MAX_MATCH_LEN;
	u32 nice_len = min(c->nice_match_length, max_len);
	u32 next_hashes[2] = {0, 0};
	struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;

	/* Initialize the matchfinder. */
	CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);

	do {
		/* Starting a new block */
		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
		struct lz_match *cache_ptr = c->match_cache;
		const u8 *next_search_pos = in_next;
		const u8 *next_observation =
			(in_max_block_end - in_next < MIN_BLOCK_SIZE) ?
			in_max_block_end : in_next;
		const u8 *next_pause_point =
			min(in_next + min(MIN_BLOCK_SIZE,
					  in_max_block_end - in_next),
			    in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
						   in_max_block_end - in_next));

		lzx_init_block_split_stats(&c->split_stats);

		/*
		 * Run the input buffer through the matchfinder, caching the
		 * matches, until we decide to end the block.
		 *
		 * For a tighter matchfinding loop, we compute a "pause point",
		 * which is the next position at which we may need to check
		 * whether to end the block or to decrease max_len. We then
		 * only do these extra checks upon reaching the pause point.
		 */
	resume_matchfinding:
		do {
			if (in_next >= next_search_pos) {
				/* Search for matches at this position. */
				struct lz_match *lz_matchptr;
				u32 best_len;

				lz_matchptr = CALL_BT_MF(is_16_bit, c,
							 bt_matchfinder_get_matches,
							 in_begin,
							 in_next - in_begin,
							 max_len,
							 nice_len,
							 c->max_search_depth,
							 next_hashes,
							 &best_len,
							 cache_ptr + 1);
				cache_ptr->length = lz_matchptr - (cache_ptr + 1);
				cache_ptr = lz_matchptr;

				/* Accumulate block split statistics. */
				if (in_next >= next_observation) {
					if (best_len >= 4) {
						lzx_observe_match(&c->split_stats,
								  best_len);
						next_observation = in_next + best_len;
					} else {
						lzx_observe_literal(&c->split_stats,
								    *in_next);
						next_observation = in_next + 1;
					}
				}

				/*
				 * If there was a very long match found, then
				 * don't cache any matches for the bytes covered
				 * by that match. This avoids degenerate
				 * behavior when compressing highly redundant
				 * data, where the number of matches can be very
				 * large.
				 *
				 * This heuristic doesn't actually hurt the
				 * compression ratio very much. If there's a
				 * long match, then the data must be highly
				 * compressible, so it doesn't matter as much
				 * what we do.
				 */
				if (best_len >= nice_len)
					next_search_pos = in_next + best_len;
			} else {
				/* Don't search for matches at this position. */
				CALL_BT_MF(is_16_bit, c,
					   bt_matchfinder_skip_position,
					   in_begin,
					   in_next - in_begin,
					   nice_len,
					   c->max_search_depth,
					   next_hashes);
				cache_ptr->length = 0;
				cache_ptr++;
			}
		} while (++in_next < next_pause_point &&
			 likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));

		/* Adjust max_len and nice_len if we're nearing the end of the
		 * input buffer. In addition, if we are so close to the end of
		 * the input buffer that there cannot be any more matches, then
		 * just advance through the last few positions and record no
		 * matches. */
		if (unlikely(max_len > in_end - in_next)) {
			max_len = in_end - in_next;
			nice_len = min(max_len, nice_len);
			if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
				while (in_next != in_end) {
					cache_ptr->length = 0;
					cache_ptr++;
					in_next++;
				}
			}
		}

		/* End the block if the match cache may overflow. */
		if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
			goto end_block;

		/* End the block if the soft maximum size has been reached. */
		if (in_next >= in_max_block_end)
			goto end_block;

		/* End the block if the block splitting algorithm thinks this is
		 * a good place to do so. */
		if (c->split_stats.num_new_observations >=
		    NUM_OBSERVATIONS_PER_BLOCK_CHECK)
		{
			if (in_max_block_end - in_next < MIN_BLOCK_SIZE)
				next_observation = in_max_block_end;
			else if (lzx_end_block_check(&c->split_stats))
				goto end_block;
		}

		/* It's not time to end the block yet. Compute the next pause
		 * point and resume the matchfinding. */
		next_pause_point =
			min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
					  c->split_stats.num_new_observations,
					  in_max_block_end - in_next),
			    in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
						   in_max_block_end - in_next));
		goto resume_matchfinding;

	end_block:
		/* We've decided on a block boundary and cached matches. Now
		 * choose a match/literal sequence and flush the block. */
		queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
						     in_next - in_block_begin,
						     queue, is_16_bit);
	} while (in_next != in_end);
}

static void
lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
			     size_t in_nbytes, struct lzx_output_bitstream *os)
{
	lzx_compress_near_optimal(c, in, in_nbytes, os, true);
}

static void
lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
			     size_t in_nbytes, struct lzx_output_bitstream *os)
{
	lzx_compress_near_optimal(c, in, in_nbytes, os, false);
}

/******************************************************************************/
/* Faster ("lazy") compression algorithm */
/*----------------------------------------------------------------------------*/

/*
 * Called when the compressor decides to use a literal. This tallies the
 * Huffman symbol for the literal and increments the current literal run length.
 */
static inline void
lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
{
	lzx_observe_literal(&c->split_stats, literal);
	c->freqs.main[literal]++;
	++*litrunlen_p;
}

/*
 * Called when the compressor decides to use a match. This tallies the Huffman
 * symbol(s) for a match, saves the match data and the length of the preceding
 * literal run, and updates the recent offsets queue.
 */
static inline void
lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 offset_data,
		 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
		 u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
{
	u32 litrunlen = *litrunlen_p;
	struct lzx_sequence *next_seq = *next_seq_p;
	unsigned offset_slot;
	unsigned v;

	lzx_observe_match(&c->split_stats, length);

	v = length - LZX_MIN_MATCH_LEN;

	/* Save the literal run length and adjusted length. */
	next_seq->litrunlen = litrunlen;
	next_seq->adjusted_length = v;

	/* Compute the length header, then tally the length symbol if needed. */
	if (v >= LZX_NUM_PRIMARY_LENS) {
		c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
		v = LZX_NUM_PRIMARY_LENS;
	}

	/* Compute the offset slot. */
	offset_slot = lzx_get_offset_slot(c, offset_data, is_16_bit);

	/* Compute the match header. */
	v += offset_slot * LZX_NUM_LEN_HEADERS;

	/* Save the adjusted offset and match header. */
	next_seq->adjusted_offset_and_match_hdr = (offset_data << 9) | v;

	/* Tally the main symbol. */
	c->freqs.main[LZX_NUM_CHARS + v]++;

	/* Update the recent offsets queue. */
	if (offset_data < LZX_NUM_RECENT_OFFSETS) {
		/* Repeat offset match. */
		swap(recent_offsets[0], recent_offsets[offset_data]);
	} else {
		/* Explicit offset match. */

		/* Tally the aligned offset symbol if needed. */
		if (offset_data >= 16)
			c->freqs.aligned[offset_data & LZX_ALIGNED_OFFSET_BITMASK]++;

		recent_offsets[2] = recent_offsets[1];
		recent_offsets[1] = recent_offsets[0];
		recent_offsets[0] = offset_data - LZX_OFFSET_ADJUSTMENT;
	}

	/* Reset the literal run length and advance to the next sequence. */
	*next_seq_p = next_seq + 1;
	*litrunlen_p = 0;
}

/*
 * Called when the compressor ends a block. This finishes the last
 * lzx_sequence, which is just a literal run with no following match. This
 * literal run might be empty.
 */
static inline void
lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
{
	last_seq->litrunlen = litrunlen;

	/* Special value to mark last sequence */
	last_seq->adjusted_offset_and_match_hdr = 0x80000000;
}
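
/*
 * Illustration (not from the original source): a block of 5 literals, one
 * match, and 2 trailing literals becomes two lzx_sequences. The first has
 * litrunlen == 5 and the match packed as (offset_data << 9) | match_hdr in
 * adjusted_offset_and_match_hdr; the second has litrunlen == 2 and
 * adjusted_offset_and_match_hdr == 0x80000000, the end-of-block marker
 * written above.
 */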

/*
 * Find the longest repeat offset match with the current position. If a match
 * is found, return its length and set *best_rep_idx_ret to the index of its
 * offset in @recent_offsets. Otherwise, return 0.
 *
 * Don't bother with length 2 matches; consider matches of length >= 3 only.
 */
static unsigned
lzx_find_longest_repeat_offset_match(const u8 * const in_next,
				     const u32 recent_offsets[],
				     const unsigned max_len,
				     unsigned *best_rep_idx_ret)
{
	STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */

	const u32 seq3 = load_u24_unaligned(in_next);
	const u8 *matchptr;
	unsigned best_rep_len;
	unsigned best_rep_idx;
	unsigned rep_len;

	/* Check for rep0 match (most recent offset) */
	matchptr = in_next - recent_offsets[0];
	if (load_u24_unaligned(matchptr) == seq3)
		best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
	else
		best_rep_len = 0;
	best_rep_idx = 0;

	/* Check for rep1 match (second most recent offset) */
	matchptr = in_next - recent_offsets[1];
	if (load_u24_unaligned(matchptr) == seq3) {
		rep_len = lz_extend(in_next, matchptr, 3, max_len);
		if (rep_len > best_rep_len) {
			best_rep_len = rep_len;
			best_rep_idx = 1;
		}
	}

	/* Check for rep2 match (third most recent offset) */
	matchptr = in_next - recent_offsets[2];
	if (load_u24_unaligned(matchptr) == seq3) {
		rep_len = lz_extend(in_next, matchptr, 3, max_len);
		if (rep_len > best_rep_len) {
			best_rep_len = rep_len;
			best_rep_idx = 2;
		}
	}

	*best_rep_idx_ret = best_rep_idx;
	return best_rep_len;
}
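
/*
 * Usage sketch (illustrative): on input "abcabcabc..." with
 * recent_offsets[0] == 3, the 3 bytes at in_next - 3 equal the 3 bytes at
 * in_next, so the rep0 check above fires and lz_extend() grows the match up
 * to max_len; the function returns that length with *best_rep_idx_ret set
 * to 0. A length-2 agreement alone is ignored, per the comment above.
 */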

/*
 * Fast heuristic scoring for lazy parsing: how "good" is this match?
 * This is mainly determined by the length: longer matches are better.
 * However, we also give a bonus to close (small offset) matches and to repeat
 * offset matches, since those require fewer bits to encode.
 */
static inline unsigned
lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
{
	unsigned score = len;

	if (adjusted_offset < 4096)
		score++;
	if (adjusted_offset < 256)
		score++;

	return score;
}

static inline unsigned
lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
{
	return rep_len + 3 - rep_idx;
}
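
/*
 * Worked example (illustrative, assuming the scoring above): an explicit
 * offset match of length 8 at adjusted offset 100 scores 8 + 1 + 1 = 10
 * (both closeness bonuses apply), while a rep0 match of length 7 scores
 * 7 + 3 - 0 = 10. The lazy parser below compares repeat scores with ">=",
 * so the tie goes to the repeat offset match, which needs no explicit
 * offset bits.
 */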

/*
 * This is the "lazy" LZX compressor. The basic idea is that before it chooses
 * a match, it checks to see if there's a longer match at the next position. If
 * yes, it chooses a literal and continues to the next position. If no, it
 * chooses the match.
 *
 * Some additional heuristics are used as well. Repeat offset matches are
 * considered favorably and sometimes are chosen immediately. In addition, long
 * matches (at least "nice_len" bytes) are chosen immediately as well. Finally,
 * when we decide whether a match is "better" than another, we take the offset
 * into consideration as well as the length.
 */
static inline void
lzx_compress_lazy(struct lzx_compressor * restrict c,
		  const u8 * const restrict in_begin, size_t in_nbytes,
		  struct lzx_output_bitstream * restrict os, bool is_16_bit)
{
	const u8 * in_next = in_begin;
	const u8 * const in_end = in_begin + in_nbytes;
	unsigned max_len = LZX_MAX_MATCH_LEN;
	unsigned nice_len = min(c->nice_match_length, max_len);
	STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
	u32 recent_offsets[3] = {1, 1, 1};
	u32 next_hashes[2] = {0, 0};

	/* Initialize the matchfinder. */
	CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);

	do {
		/* Starting a new block */

		const u8 * const in_block_begin = in_next;
		const u8 * const in_max_block_end =
			in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
		struct lzx_sequence *next_seq = c->chosen_sequences;
		unsigned cur_len;
		u32 cur_offset;
		u32 cur_offset_data;
		unsigned cur_score;
		unsigned next_len;
		u32 next_offset;
		u32 next_offset_data;
		unsigned next_score;
		unsigned best_rep_len;
		unsigned best_rep_idx;
		unsigned rep_score;
		unsigned skip_len;
		u32 litrunlen = 0;

		lzx_reset_symbol_frequencies(c);
		lzx_init_block_split_stats(&c->split_stats);

		do {
			/* Adjust max_len and nice_len if we're nearing the end
			 * of the input buffer. */
			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = min(max_len, nice_len);
			}

			/* Find the longest match (subject to the
			 * max_search_depth cutoff parameter) with the current
			 * position. Don't bother with length 2 matches; only
			 * look for matches of length >= 3. */
			cur_len = CALL_HC_MF(is_16_bit, c,
					     hc_matchfinder_longest_match,
					     in_begin,
					     in_next - in_begin,
					     2,
					     max_len,
					     nice_len,
					     c->max_search_depth,
					     next_hashes,
					     &cur_offset);

			/* If there was no match found, or the only match found
			 * was a distant short match, then choose a literal. */
			if (cur_len < 3 ||
			    (cur_len == 3 &&
			     cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
			     cur_offset != recent_offsets[0] &&
			     cur_offset != recent_offsets[1] &&
			     cur_offset != recent_offsets[2]))
			{
				lzx_choose_literal(c, *in_next, &litrunlen);
				in_next++;
				continue;
			}

			/* Heuristic: if this match has the most recent offset,
			 * then go ahead and choose it as a rep0 match. */
			if (cur_offset == recent_offsets[0]) {
				in_next++;
				cur_offset_data = 0;
				skip_len = cur_len - 1;
				goto choose_cur_match;
			}

			/* Compute the longest match's score as an explicit
			 * offset match. */
			cur_offset_data = cur_offset + LZX_OFFSET_ADJUSTMENT;
			cur_score = lzx_explicit_offset_match_score(cur_len, cur_offset_data);

			/* Find the longest repeat offset match at this
			 * position. If we find one and it's "better" than the
			 * explicit offset match we found, then go ahead and
			 * choose the repeat offset match immediately. */
			best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
									    recent_offsets,
									    cur_len,
									    &best_rep_idx);
			in_next++;

			if (best_rep_len != 0 &&
			    (rep_score = lzx_repeat_offset_match_score(best_rep_len,
								       best_rep_idx)) >= cur_score)
			{
				cur_len = best_rep_len;
				cur_offset_data = best_rep_idx;
				skip_len = best_rep_len - 1;
				goto choose_cur_match;
			}

		have_cur_match:
			/*
			 * We have a match at the current position. If the
			 * match is very long, then choose it immediately.
			 * Otherwise, see if there's a better match at the next
			 * position.
			 */
			if (cur_len >= nice_len) {
				skip_len = cur_len - 1;
				goto choose_cur_match;
			}

			if (unlikely(max_len > in_end - in_next)) {
				max_len = in_end - in_next;
				nice_len = min(max_len, nice_len);
			}

			next_len = CALL_HC_MF(is_16_bit, c,
					      hc_matchfinder_longest_match,
					      in_begin,
					      in_next - in_begin,
					      cur_len - 2,
					      max_len,
					      nice_len,
					      c->max_search_depth / 2,
					      next_hashes,
					      &next_offset);

			if (next_len <= cur_len - 2) {
				/* No potentially better match was found. */
				in_next++;
				skip_len = cur_len - 2;
				goto choose_cur_match;
			}

			next_offset_data = next_offset + LZX_OFFSET_ADJUSTMENT;
			next_score = lzx_explicit_offset_match_score(next_len, next_offset_data);

			best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
									    recent_offsets,
									    next_len,
									    &best_rep_idx);
			in_next++;

			if (best_rep_len != 0 &&
			    (rep_score = lzx_repeat_offset_match_score(best_rep_len,
								       best_rep_idx)) >= next_score)
			{
				if (rep_score > cur_score) {
					/* The next match is better, and it's a
					 * repeat offset match. */
					lzx_choose_literal(c, *(in_next - 2),
							   &litrunlen);
					cur_len = best_rep_len;
					cur_offset_data = best_rep_idx;
					skip_len = cur_len - 1;
					goto choose_cur_match;
				}
			} else {
				if (next_score > cur_score) {
					/* The next match is better, and it's an
					 * explicit offset match. */
					lzx_choose_literal(c, *(in_next - 2),
							   &litrunlen);
					cur_len = next_len;
					cur_offset_data = next_offset_data;
					cur_score = next_score;
					goto have_cur_match;
				}
			}

			/* The original match was better; choose it. */
			skip_len = cur_len - 2;

		choose_cur_match:
			/* Choose a match and have the matchfinder skip over its
			 * remaining bytes. */
			lzx_choose_match(c, cur_len, cur_offset_data,
					 recent_offsets, is_16_bit,
					 &litrunlen, &next_seq);
			in_next = CALL_HC_MF(is_16_bit, c,
					     hc_matchfinder_skip_positions,
					     in_begin,
					     in_next - in_begin,
					     in_end - in_begin,
					     skip_len,
					     next_hashes);

		/* Keep going until it's time to end the block. */
		} while (in_next < in_max_block_end &&
			 !lzx_should_end_block(&c->split_stats, in_block_begin,
					       in_next, in_end));

		/* Flush the block. */
		lzx_finish_sequence(next_seq, litrunlen);
		lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);

	/* Keep going until we've reached the end of the input buffer. */
	} while (in_next != in_end);
}

static void
lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
		     struct lzx_output_bitstream *os)
{
	lzx_compress_lazy(c, in, in_nbytes, os, true);
}

static void
lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
		     struct lzx_output_bitstream *os)
{
	lzx_compress_lazy(c, in, in_nbytes, os, false);
}
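
/*
 * Condensed sketch (illustrative, not a separate implementation) of the
 * decision rule coded above:
 *
 *	find match M at position i;
 *	if M is rep0, or a repeat match scores >= score(M): take it now;
 *	if length(M) >= nice_len: take M now;
 *	find match N at position i + 1 (with halved search depth);
 *	if score(N) > score(M): emit in[i] as a literal, continue from N;
 *	else: take M;
 *
 * Deferring the decision by one position costs at most one literal but can
 * replace a short match with a much longer one.
 */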

/******************************************************************************/
/* Compressor operations */
/*----------------------------------------------------------------------------*/

/*
 * Generate tables for mapping match offsets (actually, "adjusted" match
 * offsets) to offset slots.
 */
static void
lzx_init_offset_slot_tabs(struct lzx_compressor *c)
{
	u32 adjusted_offset = 0;
	unsigned slot = 0;

	/* slots [0, 29] */
	for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
	     adjusted_offset++)
	{
		if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_1[adjusted_offset] = slot;
	}

	/* slots [30, 49] */
	for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
	     adjusted_offset += (u32)1 << 14)
	{
		if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
	}
}
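
/*
 * Sketch of the intended two-level lookup, as assumed from the table
 * construction above (lzx_get_offset_slot(), used earlier in this file, is
 * the real consumer of these tables):
 *
 *	if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 *		slot = c->offset_slot_tab_1[adjusted_offset];
 *	else
 *		slot = c->offset_slot_tab_2[adjusted_offset >> 14];
 *
 * The second table can be indexed by adjusted_offset >> 14 because the loop
 * above steps in units of 1 << 14; each slot in [30, 49] covers at least
 * that many consecutive offsets.
 */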

static size_t
lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
{
	if (compression_level <= MAX_FAST_LEVEL) {
		if (lzx_is_16_bit(max_bufsize))
			return offsetof(struct lzx_compressor, hc_mf_16) +
			       hc_matchfinder_size_16(max_bufsize);
		else
			return offsetof(struct lzx_compressor, hc_mf_32) +
			       hc_matchfinder_size_32(max_bufsize);
	} else {
		if (lzx_is_16_bit(max_bufsize))
			return offsetof(struct lzx_compressor, bt_mf_16) +
			       bt_matchfinder_size_16(max_bufsize);
		else
			return offsetof(struct lzx_compressor, bt_mf_32) +
			       bt_matchfinder_size_32(max_bufsize);
	}
}

/* Compute the amount of memory needed to allocate an LZX compressor. */
static u64
lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
		      bool destructive)
{
	u64 size = 0;

	if (max_bufsize > LZX_MAX_WINDOW_SIZE)
		return 0;

	size += lzx_get_compressor_size(max_bufsize, compression_level);
	if (!destructive)
		size += max_bufsize; /* account for in_buffer */
	return size;
}

/* Allocate an LZX compressor. */
static int
lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
		      bool destructive, void **c_ret)
{
	unsigned window_order;
	struct lzx_compressor *c;

	/* Validate the maximum buffer size and get the window order from it. */
	window_order = lzx_get_window_order(max_bufsize);
	if (window_order == 0)
		return WIMLIB_ERR_INVALID_PARAM;

	/* Allocate the compressor. */
	c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
	if (!c)
		goto oom0;

	c->window_order = window_order;
	c->num_main_syms = lzx_get_num_main_syms(window_order);
	c->destructive = destructive;

	/* Allocate the buffer for preprocessed data if needed. */
	if (!c->destructive) {
		c->in_buffer = MALLOC(max_bufsize);
		if (!c->in_buffer)
			goto oom1;
	}

	if (compression_level <= MAX_FAST_LEVEL) {

		/* Fast compression: Use lazy parsing. */
		if (lzx_is_16_bit(max_bufsize))
			c->impl = lzx_compress_lazy_16;
		else
			c->impl = lzx_compress_lazy_32;

		/* Scale max_search_depth and nice_match_length with the
		 * compression level. */
		c->max_search_depth = (60 * compression_level) / 20;
		c->nice_match_length = (80 * compression_level) / 20;

		/* lzx_compress_lazy() needs max_search_depth >= 2 because it
		 * halves the max_search_depth when attempting a lazy match, and
		 * max_search_depth must be at least 1. */
		c->max_search_depth = max(c->max_search_depth, 2);
	} else {
		/* Normal / high compression: Use near-optimal parsing. */
		if (lzx_is_16_bit(max_bufsize))
			c->impl = lzx_compress_near_optimal_16;
		else
			c->impl = lzx_compress_near_optimal_32;

		/* Scale max_search_depth and nice_match_length with the
		 * compression level. */
		c->max_search_depth = (24 * compression_level) / 50;
		c->nice_match_length = (48 * compression_level) / 50;

		/* Also scale num_optim_passes with the compression level. But
		 * the more passes there are, the less they help --- so don't
		 * add them linearly. */
		c->num_optim_passes = 1;
		c->num_optim_passes += (compression_level >= 45);
		c->num_optim_passes += (compression_level >= 70);
		c->num_optim_passes += (compression_level >= 100);
		c->num_optim_passes += (compression_level >= 150);
		c->num_optim_passes += (compression_level >= 200);
		c->num_optim_passes += (compression_level >= 300);
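
		/* For example (illustrative): level 50 yields 1 + 1 = 2
		 * optimization passes, level 100 yields 4, and level 300
		 * yields the maximum of 7. */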

		/* max_search_depth must be at least 1. */
		c->max_search_depth = max(c->max_search_depth, 1);
	}

	/* Prepare the offset => offset slot mapping. */
	lzx_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom1:
	FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}

/* Compress a buffer of data. */
static size_t
lzx_compress(const void *restrict in, size_t in_nbytes,
	     void *restrict out, size_t out_nbytes_avail, void *restrict _c)
{
	struct lzx_compressor *c = _c;
	struct lzx_output_bitstream os;
	size_t result;

	/* Don't bother trying to compress very small inputs. */
	if (in_nbytes < 100)	/* small-input threshold (assumed value) */
		return 0;

	/* Copy the input data into the internal buffer if needed. */
	if (!c->destructive) {
		memcpy(c->in_buffer, in, in_nbytes);
		in = c->in_buffer;
	}

	/* Preprocess the input data. */
	lzx_preprocess((void *)in, in_nbytes);

	/* Initially, the previous Huffman codeword lengths are all zeroes. */
	c->codes_index = 0;
	memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));

	/* Initialize the output bitstream. */
	lzx_init_output(&os, out, out_nbytes_avail);

	/* Call the compression level-specific compress() function. */
	(*c->impl)(c, in, in_nbytes, &os);

	/* Flush the output bitstream. */
	result = lzx_flush_output(&os);

	/* If the data did not compress to less than its original size and we
	 * preprocessed the original buffer, then postprocess it to restore it
	 * to its original state. */
	if (result == 0 && c->destructive)
		lzx_postprocess((void *)in, in_nbytes);

	/* Return the number of compressed bytes, or 0 if the input did not
	 * compress to less than its original size. */
	return result;
}

/* Free an LZX compressor. */
static void
lzx_free_compressor(void *_c)
{
	struct lzx_compressor *c = _c;

	if (!c->destructive)
		FREE(c->in_buffer);
	FREE(c);
}

const struct compressor_ops lzx_compressor_ops = {
	.get_needed_memory = lzx_get_needed_memory,
	.create_compressor = lzx_create_compressor,
	.compress = lzx_compress,
	.free_compressor = lzx_free_compressor,
};