4 * A compressor for the LZX compression format, as used in WIM archives.
8 * Copyright (C) 2012-2016 Eric Biggers
10 * This file is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU Lesser General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option) any
15 * This file is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this file; if not, see http://www.gnu.org/licenses/.
26 * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
27 * compression format, as used in the WIM (Windows IMaging) file format.
29 * Two different LZX-compatible algorithms are implemented: "near-optimal" and
30 * "lazy". "Near-optimal" is significantly slower than "lazy", but results in a
31 * better compression ratio. The "near-optimal" algorithm is used at the
32 * default compression level.
34 * This file may need some slight modifications to be used outside of the WIM
35 * format. In particular, in other situations the LZX block header might be
36 * slightly different, and sliding window support might be required.
38 * LZX is a compression format derived from DEFLATE, the format used by zlib and
39 * gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain
40 * details are quite similar, such as the method for storing Huffman codes.
41 * However, the main differences are:
43 * - LZX preprocesses the data to attempt to make x86 machine code slightly more
44 * compressible before attempting to compress it further.
46 * - LZX uses a "main" alphabet which combines literals and matches, with the
47 * match symbols containing a "length header" (giving all or part of the match
48 * length) and an "offset slot" (giving, roughly speaking, the order of
49 * magnitude of the match offset).
51 * - LZX does not have static Huffman blocks (that is, the kind with preset
52 * Huffman codes); however it does have two types of dynamic Huffman blocks
53 * ("verbatim" and "aligned").
55 * - LZX has a minimum match length of 2 rather than 3. Length 2 matches can be
56 * useful, but generally only if the compressor is smart about choosing them.
58 * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
59 * of match offsets. This is very useful for certain types of files, such as
60 * binary files that have repeating records.
63 /******************************************************************************/
64 /* General parameters */
65 /*----------------------------------------------------------------------------*/
68 * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL. It
69 * uses the slower algorithm at levels > MAX_FAST_LEVEL.
71 #define MAX_FAST_LEVEL 34
74 * The compressor-side limits on the codeword lengths (in bits) for each Huffman
75 * code. To make outputting bits slightly faster, some of these limits are
76 * lower than the limits defined by the LZX format. This does not significantly
77 * affect the compression ratio.
79 #define MAIN_CODEWORD_LIMIT 16
80 #define LENGTH_CODEWORD_LIMIT 12
81 #define ALIGNED_CODEWORD_LIMIT 7
82 #define PRE_CODEWORD_LIMIT 7
85 /******************************************************************************/
86 /* Block splitting parameters */
87 /*----------------------------------------------------------------------------*/
90 * The compressor always outputs blocks of at least this size in bytes, except
91 * for the last block which may need to be smaller.
93 #define MIN_BLOCK_SIZE 6500
96 * The compressor attempts to end a block when it reaches this size in bytes.
97 * The final size might be slightly larger due to matches extending beyond the
98 * end of the block. Specifically:
100 * - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
101 * bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'.
103 * - The lazy compressor may choose a sequence of literals starting at position
104 * 'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better
105 * matches. The final match may be up to LZX_MAX_MATCH_LEN bytes. The
106 * length of the literal sequence is approximately limited by the "nice match
109 #define SOFT_MAX_BLOCK_SIZE 100000
112 * The number of observed items (matches and literals) that represents
113 * sufficient data for the compressor to decide whether the current block should
116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 400
119 /******************************************************************************/
120 /* Parameters for slower algorithm */
121 /*----------------------------------------------------------------------------*/
124 * The log base 2 of the number of entries in the hash table for finding length
125 * 2 matches. This could be as high as 16, but using a smaller hash table
126 * speeds up compression due to reduced cache pressure.
128 #define BT_MATCHFINDER_HASH2_ORDER 12
131 * The number of lz_match structures in the match cache, excluding the extra
132 * "overflow" entries. This value should be high enough so that nearly the
133 * time, all matches found in a given block can fit in the match cache.
134 * However, fallback behavior (immediately terminating the block) on cache
135 * overflow is still required.
137 #define CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5)
140 * An upper bound on the number of matches that can ever be saved in the match
141 * cache for a single position. Since each match we save for a single position
142 * has a distinct length, we can use the number of possible match lengths in LZX
143 * as this bound. This bound is guaranteed to be valid in all cases, although
144 * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be
147 #define MAX_MATCHES_PER_POS LZX_NUM_LENS
150 * A scaling factor that makes it possible to consider fractional bit costs. A
151 * single bit has a cost of BIT_COST.
153 * Note: this is only useful as a statistical trick for when the true costs are
154 * unknown. Ultimately, each token in LZX requires a whole number of bits to
160 * Should the compressor take into account the costs of aligned offset symbols
161 * instead of assuming that all are equally likely?
163 #define CONSIDER_ALIGNED_COSTS 1
166 * Should the "minimum" cost path search algorithm consider "gap" matches, where
167 * a normal match is followed by a literal, then by a match with the same
168 * offset? This is one specific, somewhat common situation in which the true
169 * minimum cost path is often different from the path found by looking only one
172 #define CONSIDER_GAP_MATCHES 1
174 /******************************************************************************/
176 /*----------------------------------------------------------------------------*/
182 #include "wimlib/compress_common.h"
183 #include "wimlib/compressor_ops.h"
184 #include "wimlib/error.h"
185 #include "wimlib/lz_extend.h"
186 #include "wimlib/lzx_common.h"
187 #include "wimlib/unaligned.h"
188 #include "wimlib/util.h"
190 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including
191 * bt_matchfinder.h. */
193 /* Matchfinders with 16-bit positions */
195 #define MF_SUFFIX _16
196 #include "wimlib/bt_matchfinder.h"
197 #include "wimlib/hc_matchfinder.h"
199 /* Matchfinders with 32-bit positions */
203 #define MF_SUFFIX _32
204 #include "wimlib/bt_matchfinder.h"
205 #include "wimlib/hc_matchfinder.h"
207 /******************************************************************************/
208 /* Compressor structure */
209 /*----------------------------------------------------------------------------*/
211 /* Codewords for the Huffman codes */
212 struct lzx_codewords {
213 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
214 u32 len[LZX_LENCODE_NUM_SYMBOLS];
215 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
219 * Codeword lengths, in bits, for the Huffman codes.
221 * A codeword length of 0 means the corresponding codeword has zero frequency.
223 * The main and length codes each have one extra entry for use as a sentinel.
224 * See lzx_write_compressed_code().
227 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
228 u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
229 u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
232 /* Codewords and lengths for the Huffman codes */
234 struct lzx_codewords codewords;
235 struct lzx_lens lens;
238 /* Symbol frequency counters for the Huffman-encoded alphabets */
240 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
241 u32 len[LZX_LENCODE_NUM_SYMBOLS];
242 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
245 /* Block split statistics. See the "Block splitting algorithm" section later in
246 * this file for details. */
247 #define NUM_LITERAL_OBSERVATION_TYPES 8
248 #define NUM_MATCH_OBSERVATION_TYPES 2
249 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
250 NUM_MATCH_OBSERVATION_TYPES)
251 struct lzx_block_split_stats {
252 u32 new_observations[NUM_OBSERVATION_TYPES];
253 u32 observations[NUM_OBSERVATION_TYPES];
254 u32 num_new_observations;
255 u32 num_observations;
259 * Represents a run of literals followed by a match or end-of-block. This
260 * structure is needed to temporarily store items chosen by the compressor,
261 * since items cannot be written until all items for the block have been chosen
262 * and the block's Huffman codes have been computed.
264 struct lzx_sequence {
266 /* The number of literals in the run. This may be 0. The literals are
267 * not stored explicitly in this structure; instead, they are read
268 * directly from the uncompressed data. */
271 /* If the next field doesn't indicate end-of-block, then this is the
272 * match length minus LZX_MIN_MATCH_LEN. */
273 u32 adjusted_length : 8;
275 /* If bit 31 is clear, then this field contains the match header in bits
276 * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a
277 * recent offset code in bits 9-30. Otherwise (if bit 31 is set), this
278 * sequence's literal run was the last literal run in the block, so
279 * there is no match that follows it. */
280 u32 adjusted_offset_and_match_hdr;
284 * This structure represents a byte position in the input buffer and a node in
285 * the graph of possible match/literal choices.
287 * Logically, each incoming edge to this node is labeled with a literal or a
288 * match that can be taken to reach this position from an earlier position; and
289 * each outgoing edge from this node is labeled with a literal or a match that
290 * can be taken to advance from this position to a later position.
292 struct lzx_optimum_node {
294 /* The cost, in bits, of the lowest-cost path that has been found to
295 * reach this position. This can change as progressively lower cost
296 * paths are found to reach this position. */
300 * The best arrival to this node, i.e. the match or literal that was
301 * used to arrive to this position at the given 'cost'. This can change
302 * as progressively lower cost paths are found to reach this position.
304 * For non-gap matches, this variable is divided into two bitfields
305 * whose meanings depend on the item type:
308 * Low bits are 0, high bits are the literal.
310 * Explicit offset matches:
311 * Low bits are the match length, high bits are the offset plus
312 * LZX_OFFSET_ADJUSTMENT.
314 * Repeat offset matches:
315 * Low bits are the match length, high bits are the queue index.
317 * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
318 * behavior applies --- see the code.
321 #define OPTIMUM_OFFSET_SHIFT 9
322 #define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1)
323 #if CONSIDER_GAP_MATCHES
324 # define OPTIMUM_GAP_MATCH 0x80000000
327 } _aligned_attribute(8);
329 /* The cost model for near-optimal parsing */
333 * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a
334 * length 'len' match which has an offset belonging to 'offset_slot'.
335 * The cost includes the main symbol, the length symbol if required, and
336 * the extra offset bits if any, excluding any entropy-coded bits
337 * (aligned offset bits). It does *not* include the cost of the aligned
338 * offset symbol which may be required.
340 u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
342 /* Cost of each symbol in the main code */
343 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
345 /* Cost of each symbol in the length code */
346 u32 len[LZX_LENCODE_NUM_SYMBOLS];
348 #if CONSIDER_ALIGNED_COSTS
349 /* Cost of each symbol in the aligned offset code */
350 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
354 struct lzx_output_bitstream;
356 /* The main LZX compressor structure */
357 struct lzx_compressor {
359 /* The buffer for preprocessed input data, if not using destructive
363 /* If true, then the compressor need not preserve the input buffer if it
364 * compresses the data successfully */
367 /* Pointer to the compress() implementation chosen at allocation time */
368 void (*impl)(struct lzx_compressor *, const u8 *, size_t,
369 struct lzx_output_bitstream *);
371 /* The log base 2 of the window size for match offset encoding purposes.
372 * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
373 unsigned window_order;
375 /* The number of symbols in the main alphabet. This depends on the
376 * window order, since the window order determines the maximum possible
378 unsigned num_main_syms;
380 /* The "nice" match length: if a match of this length is found, then it
381 * is chosen immediately without further consideration. */
382 unsigned nice_match_length;
384 /* The maximum search depth: at most this many potential matches are
385 * considered at each position. */
386 unsigned max_search_depth;
388 /* The number of optimization passes per block */
389 unsigned num_optim_passes;
391 /* The symbol frequency counters for the current block */
392 struct lzx_freqs freqs;
394 /* Block split statistics for the current block */
395 struct lzx_block_split_stats split_stats;
397 /* The Huffman codes for the current and previous blocks. The one with
398 * index 'codes_index' is for the current block, and the other one is
399 * for the previous block. */
400 struct lzx_codes codes[2];
401 unsigned codes_index;
403 /* The matches and literals that the compressor has chosen for the
404 * current block. The required length of this array is limited by the
405 * maximum number of matches that can ever be chosen for a single block,
406 * plus one for the special entry at the end. */
407 struct lzx_sequence chosen_sequences[
408 DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
410 /* Tables for mapping adjusted offsets to offset slots */
411 u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
412 u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
415 /* Data for lzx_compress_lazy() */
417 /* Hash chains matchfinder (MUST BE LAST!!!) */
419 struct hc_matchfinder_16 hc_mf_16;
420 struct hc_matchfinder_32 hc_mf_32;
424 /* Data for lzx_compress_near_optimal() */
427 * Array of nodes, one per position, for running the
428 * minimum-cost path algorithm.
430 * This array must be large enough to accommodate the
431 * worst-case number of nodes, which occurs if the
432 * compressor finds a match of length LZX_MAX_MATCH_LEN
433 * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
434 * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
435 * LZX_MAX_MATCH_LEN'. Add one for the end-of-block
438 struct lzx_optimum_node optimum_nodes[
439 SOFT_MAX_BLOCK_SIZE - 1 +
440 LZX_MAX_MATCH_LEN + 1];
442 /* The cost model for the current optimization pass */
443 struct lzx_costs costs;
446 * Cached matches for the current block. This array
447 * contains the matches that were found at each position
448 * in the block. Specifically, for each position, there
449 * is a special 'struct lz_match' whose 'length' field
450 * contains the number of matches that were found at
451 * that position; this is followed by the matches
452 * themselves, if any, sorted by strictly increasing
455 * Note: in rare cases, there will be a very high number
456 * of matches in the block and this array will overflow.
457 * If this happens, we force the end of the current
458 * block. CACHE_LENGTH is the length at which we
459 * actually check for overflow. The extra slots beyond
460 * this are enough to absorb the worst case overflow,
461 * which occurs if starting at &match_cache[CACHE_LENGTH
462 * - 1], we write the match count header, then write
463 * MAX_MATCHES_PER_POS matches, then skip searching for
464 * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
465 * write the match count header for each.
467 struct lz_match match_cache[CACHE_LENGTH +
468 MAX_MATCHES_PER_POS +
469 LZX_MAX_MATCH_LEN - 1];
471 /* Binary trees matchfinder (MUST BE LAST!!!) */
473 struct bt_matchfinder_16 bt_mf_16;
474 struct bt_matchfinder_32 bt_mf_32;
480 /******************************************************************************/
481 /* Matchfinder utilities */
482 /*----------------------------------------------------------------------------*/
485 * Will a matchfinder using 16-bit positions be sufficient for compressing
486 * buffers of up to the specified size? The limit could be 65536 bytes, but we
487 * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
488 * This requires that the limit be no more than the length of offset_slot_tab_1
491 static forceinline bool
492 lzx_is_16_bit(size_t max_bufsize)
494 STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
495 return max_bufsize <= 32768;
499 * Return the offset slot for the specified adjusted match offset.
501 static forceinline unsigned
502 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
505 if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
506 return c->offset_slot_tab_1[adjusted_offset];
507 return c->offset_slot_tab_2[adjusted_offset >> 14];
511 * The following macros call either the 16-bit or the 32-bit version of a
512 * matchfinder function based on the value of 'is_16_bit', which will be known
513 * at compilation time.
516 #define CALL_HC_MF(is_16_bit, c, funcname, ...) \
517 ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
518 CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__));
520 #define CALL_BT_MF(is_16_bit, c, funcname, ...) \
521 ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
522 CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__));
524 /******************************************************************************/
525 /* Output bitstream */
526 /*----------------------------------------------------------------------------*/
529 * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
530 * units. Bits are ordered from most significant to least significant within
535 * Structure to keep track of the current state of sending bits to the
536 * compressed output buffer.
538 struct lzx_output_bitstream {
540 /* Bits that haven't yet been written to the output buffer */
541 machine_word_t bitbuf;
543 /* Number of bits currently held in @bitbuf */
544 machine_word_t bitcount;
546 /* Pointer to the start of the output buffer */
549 /* Pointer to the position in the output buffer at which the next coding
550 * unit should be written */
553 /* Pointer to just past the end of the output buffer, rounded down by
554 * one byte if needed to make 'end - start' a multiple of 2 */
558 /* Can the specified number of bits always be added to 'bitbuf' after all
559 * pending 16-bit coding units have been flushed? */
560 #define CAN_BUFFER(n) ((n) <= WORDBITS - 15)
562 /* Initialize the output bitstream to write to the specified buffer. */
564 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
570 os->end = (u8 *)buffer + (size & ~1);
574 * Add some bits to the bitbuffer variable of the output bitstream. The caller
575 * must make sure there is enough room.
577 static forceinline void
578 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
580 os->bitbuf = (os->bitbuf << num_bits) | bits;
581 os->bitcount += num_bits;
585 * Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits'
586 * specifies the maximum number of bits that may have been added since the last
589 static forceinline void
590 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
592 /* Masking the number of bits to shift is only needed to avoid undefined
593 * behavior; we don't actually care about the results of bad shifts. On
594 * x86, the explicit masking generates no extra code. */
595 const u32 shift_mask = WORDBITS - 1;
597 if (os->end - os->next < 6)
599 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
600 shift_mask), os->next + 0);
601 if (max_num_bits > 16)
602 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
603 shift_mask), os->next + 2);
604 if (max_num_bits > 32)
605 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
606 shift_mask), os->next + 4);
607 os->next += (os->bitcount >> 4) << 1;
611 /* Add at most 16 bits to the bitbuffer and flush it. */
612 static forceinline void
613 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
615 lzx_add_bits(os, bits, num_bits);
616 lzx_flush_bits(os, 16);
620 * Flush the last coding unit to the output buffer if needed. Return the total
621 * number of bytes written to the output buffer, or 0 if an overflow occurred.
624 lzx_flush_output(struct lzx_output_bitstream *os)
626 if (os->end - os->next < 6)
629 if (os->bitcount != 0) {
630 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
634 return os->next - os->start;
637 /******************************************************************************/
638 /* Preparing Huffman codes */
639 /*----------------------------------------------------------------------------*/
642 * Build the Huffman codes. This takes as input the frequency tables for each
643 * code and produces as output a set of tables that map symbols to codewords and
647 lzx_build_huffman_codes(struct lzx_compressor *c)
649 const struct lzx_freqs *freqs = &c->freqs;
650 struct lzx_codes *codes = &c->codes[c->codes_index];
652 STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
653 MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
654 make_canonical_huffman_code(c->num_main_syms,
658 codes->codewords.main);
660 STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
661 LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
662 make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
663 LENGTH_CODEWORD_LIMIT,
666 codes->codewords.len);
668 STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
669 ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
670 make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
671 ALIGNED_CODEWORD_LIMIT,
674 codes->codewords.aligned);
677 /* Reset the symbol frequencies for the current block. */
679 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
681 memset(&c->freqs, 0, sizeof(c->freqs));
685 lzx_compute_precode_items(const u8 lens[restrict],
686 const u8 prev_lens[restrict],
687 u32 precode_freqs[restrict],
688 unsigned precode_items[restrict])
697 itemptr = precode_items;
700 while (!((len = lens[run_start]) & 0x80)) {
702 /* len = the length being repeated */
704 /* Find the next run of codeword lengths. */
706 run_end = run_start + 1;
708 /* Fast case for a single length. */
709 if (likely(len != lens[run_end])) {
710 delta = prev_lens[run_start] - len;
713 precode_freqs[delta]++;
719 /* Extend the run. */
722 } while (len == lens[run_end]);
727 /* Symbol 18: RLE 20 to 51 zeroes at a time. */
728 while ((run_end - run_start) >= 20) {
729 extra_bits = min((run_end - run_start) - 20, 0x1F);
731 *itemptr++ = 18 | (extra_bits << 5);
732 run_start += 20 + extra_bits;
735 /* Symbol 17: RLE 4 to 19 zeroes at a time. */
736 if ((run_end - run_start) >= 4) {
737 extra_bits = min((run_end - run_start) - 4, 0xF);
739 *itemptr++ = 17 | (extra_bits << 5);
740 run_start += 4 + extra_bits;
744 /* A run of nonzero lengths. */
746 /* Symbol 19: RLE 4 to 5 of any length at a time. */
747 while ((run_end - run_start) >= 4) {
748 extra_bits = (run_end - run_start) > 4;
749 delta = prev_lens[run_start] - len;
753 precode_freqs[delta]++;
754 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
755 run_start += 4 + extra_bits;
759 /* Output any remaining lengths without RLE. */
760 while (run_start != run_end) {
761 delta = prev_lens[run_start] - len;
764 precode_freqs[delta]++;
770 return itemptr - precode_items;
773 /******************************************************************************/
774 /* Outputting compressed data */
775 /*----------------------------------------------------------------------------*/
778 * Output a Huffman code in the compressed form used in LZX.
780 * The Huffman code is represented in the output as a logical series of codeword
781 * lengths from which the Huffman code, which must be in canonical form, can be
784 * The codeword lengths are themselves compressed using a separate Huffman code,
785 * the "precode", which contains a symbol for each possible codeword length in
786 * the larger code as well as several special symbols to represent repeated
787 * codeword lengths (a form of run-length encoding). The precode is itself
788 * constructed in canonical form, and its codeword lengths are represented
789 * literally in 20 4-bit fields that immediately precede the compressed codeword
790 * lengths of the larger code.
792 * Furthermore, the codeword lengths of the larger code are actually represented
793 * as deltas from the codeword lengths of the corresponding code in the previous
797 * Bitstream to which to write the compressed Huffman code.
799 * The codeword lengths, indexed by symbol, in the Huffman code.
801 * The codeword lengths, indexed by symbol, in the corresponding Huffman
802 * code in the previous block, or all zeroes if this is the first block.
804 * The number of symbols in the Huffman code.
807 lzx_write_compressed_code(struct lzx_output_bitstream *os,
808 const u8 lens[restrict],
809 const u8 prev_lens[restrict],
812 u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
813 u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
814 u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
815 unsigned precode_items[num_lens];
816 unsigned num_precode_items;
817 unsigned precode_item;
818 unsigned precode_sym;
820 u8 saved = lens[num_lens];
821 *(u8 *)(lens + num_lens) = 0x80;
823 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
824 precode_freqs[i] = 0;
826 /* Compute the "items" (RLE / literal tokens and extra bits) with which
827 * the codeword lengths in the larger code will be output. */
828 num_precode_items = lzx_compute_precode_items(lens,
833 /* Build the precode. */
834 STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
835 PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
836 make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT,
837 precode_freqs, precode_lens,
840 /* Output the lengths of the codewords in the precode. */
841 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
842 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
844 /* Output the encoded lengths of the codewords in the larger code. */
845 for (i = 0; i < num_precode_items; i++) {
846 precode_item = precode_items[i];
847 precode_sym = precode_item & 0x1F;
848 lzx_add_bits(os, precode_codewords[precode_sym],
849 precode_lens[precode_sym]);
850 if (precode_sym >= 17) {
851 if (precode_sym == 17) {
852 lzx_add_bits(os, precode_item >> 5, 4);
853 } else if (precode_sym == 18) {
854 lzx_add_bits(os, precode_item >> 5, 5);
856 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
857 precode_sym = precode_item >> 6;
858 lzx_add_bits(os, precode_codewords[precode_sym],
859 precode_lens[precode_sym]);
862 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
863 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
866 *(u8 *)(lens + num_lens) = saved;
870 * Write all matches and literal bytes (which were precomputed) in an LZX
871 * compressed block to the output bitstream in the final compressed
875 * The output bitstream.
877 * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
878 * LZX_BLOCKTYPE_VERBATIM).
880 * The uncompressed data of the block.
882 * The matches and literals to output, given as a series of sequences.
884 * The main, length, and aligned offset Huffman codes for the block.
887 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
888 const u8 *block_data, const struct lzx_sequence sequences[],
889 const struct lzx_codes *codes)
891 const struct lzx_sequence *seq = sequences;
892 unsigned min_aligned_offset_slot;
894 if (block_type == LZX_BLOCKTYPE_ALIGNED)
895 min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
897 min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
900 /* Output the next sequence. */
902 unsigned litrunlen = seq->litrunlen;
904 unsigned main_symbol;
905 unsigned adjusted_length;
907 unsigned offset_slot;
908 unsigned num_extra_bits;
911 /* Output the literal run of the sequence. */
913 if (litrunlen) { /* Is the literal run nonempty? */
915 /* Verify optimization is enabled on 64-bit */
916 STATIC_ASSERT(WORDBITS < 64 ||
917 CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
919 if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
921 /* 64-bit: write 3 literals at a time. */
922 while (litrunlen >= 3) {
923 unsigned lit0 = block_data[0];
924 unsigned lit1 = block_data[1];
925 unsigned lit2 = block_data[2];
926 lzx_add_bits(os, codes->codewords.main[lit0],
927 codes->lens.main[lit0]);
928 lzx_add_bits(os, codes->codewords.main[lit1],
929 codes->lens.main[lit1]);
930 lzx_add_bits(os, codes->codewords.main[lit2],
931 codes->lens.main[lit2]);
932 lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
937 unsigned lit = *block_data++;
938 lzx_add_bits(os, codes->codewords.main[lit],
939 codes->lens.main[lit]);
941 unsigned lit = *block_data++;
942 lzx_add_bits(os, codes->codewords.main[lit],
943 codes->lens.main[lit]);
944 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
946 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
950 /* 32-bit: write 1 literal at a time. */
952 unsigned lit = *block_data++;
953 lzx_add_bits(os, codes->codewords.main[lit],
954 codes->lens.main[lit]);
955 lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
956 } while (--litrunlen);
960 /* Was this the last literal run? */
961 if (seq->adjusted_offset_and_match_hdr & 0x80000000)
964 /* Nope; output the match. */
966 match_hdr = seq->adjusted_offset_and_match_hdr & 0x1FF;
967 main_symbol = LZX_NUM_CHARS + match_hdr;
968 adjusted_length = seq->adjusted_length;
970 block_data += adjusted_length + LZX_MIN_MATCH_LEN;
972 offset_slot = match_hdr / LZX_NUM_LEN_HEADERS;
973 adjusted_offset = seq->adjusted_offset_and_match_hdr >> 9;
975 num_extra_bits = lzx_extra_offset_bits[offset_slot];
976 extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
977 LZX_OFFSET_ADJUSTMENT);
979 #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT + \
980 LENGTH_CODEWORD_LIMIT + \
981 LZX_MAX_NUM_EXTRA_BITS - \
982 LZX_NUM_ALIGNED_OFFSET_BITS + \
983 ALIGNED_CODEWORD_LIMIT)
985 /* Verify optimization is enabled on 64-bit */
986 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
988 /* Output the main symbol for the match. */
990 lzx_add_bits(os, codes->codewords.main[main_symbol],
991 codes->lens.main[main_symbol]);
992 if (!CAN_BUFFER(MAX_MATCH_BITS))
993 lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
995 /* If needed, output the length symbol for the match. */
997 if (adjusted_length >= LZX_NUM_PRIMARY_LENS) {
998 lzx_add_bits(os, codes->codewords.len[adjusted_length -
999 LZX_NUM_PRIMARY_LENS],
1000 codes->lens.len[adjusted_length -
1001 LZX_NUM_PRIMARY_LENS]);
1002 if (!CAN_BUFFER(MAX_MATCH_BITS))
1003 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
1006 /* Output the extra offset bits for the match. In aligned
1007 * offset blocks, the lowest 3 bits of the adjusted offset are
1008 * Huffman-encoded using the aligned offset code, provided that
1009 * there are at least extra 3 offset bits required. All other
1010 * extra offset bits are output verbatim. */
1012 if (offset_slot >= min_aligned_offset_slot) {
1014 lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1015 num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1016 if (!CAN_BUFFER(MAX_MATCH_BITS))
1017 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS -
1018 LZX_NUM_ALIGNED_OFFSET_BITS);
1020 lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1021 LZX_ALIGNED_OFFSET_BITMASK],
1022 codes->lens.aligned[adjusted_offset &
1023 LZX_ALIGNED_OFFSET_BITMASK]);
1024 if (!CAN_BUFFER(MAX_MATCH_BITS))
1025 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1027 STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS));
1029 lzx_add_bits(os, extra_bits, num_extra_bits);
1030 if (!CAN_BUFFER(MAX_MATCH_BITS))
1031 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS);
1034 if (CAN_BUFFER(MAX_MATCH_BITS))
1035 lzx_flush_bits(os, MAX_MATCH_BITS);
1037 /* Advance to the next sequence. */
1043 lzx_write_compressed_block(const u8 *block_begin,
1046 unsigned window_order,
1047 unsigned num_main_syms,
1048 const struct lzx_sequence sequences[],
1049 const struct lzx_codes * codes,
1050 const struct lzx_lens * prev_lens,
1051 struct lzx_output_bitstream * os)
1053 /* The first three bits indicate the type of block and are one of the
1054 * LZX_BLOCKTYPE_* constants. */
1055 lzx_write_bits(os, block_type, 3);
1058 * Output the block size.
1060 * The original LZX format encoded the block size in 24 bits. However,
1061 * the LZX format used in WIM archives uses 1 bit to specify whether the
1062 * block has the default size of 32768 bytes, then optionally 16 bits to
1063 * specify a non-default size. This works fine for Microsoft's WIM
1064 * software (WIMGAPI), which never compresses more than 32768 bytes at a
1065 * time with LZX. However, as an extension, our LZX compressor supports
1066 * compressing up to 2097152 bytes, with a corresponding increase in
1067 * window size. It is possible for blocks in these larger buffers to
1068 * exceed 65535 bytes; such blocks cannot have their size represented in
1071 * The chosen solution was to use 24 bits for the block size when
1072 * possibly required --- specifically, when the compressor has been
1073 * allocated to be capable of compressing more than 32768 bytes at once
1074 * (which also causes the number of main symbols to be increased).
1076 if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1077 lzx_write_bits(os, 1, 1);
1079 lzx_write_bits(os, 0, 1);
1081 if (window_order >= 16)
1082 lzx_write_bits(os, block_size >> 16, 8);
1084 lzx_write_bits(os, block_size & 0xFFFF, 16);
1087 /* If it's an aligned offset block, output the aligned offset code. */
1088 if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1089 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1090 lzx_write_bits(os, codes->lens.aligned[i],
1091 LZX_ALIGNEDCODE_ELEMENT_SIZE);
1095 /* Output the main code (two parts). */
1096 lzx_write_compressed_code(os, codes->lens.main,
1099 lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1100 prev_lens->main + LZX_NUM_CHARS,
1101 num_main_syms - LZX_NUM_CHARS);
1103 /* Output the length code. */
1104 lzx_write_compressed_code(os, codes->lens.len,
1106 LZX_LENCODE_NUM_SYMBOLS);
1108 /* Output the compressed matches and literals. */
1109 lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1113 * Given the frequencies of symbols in an LZX-compressed block and the
1114 * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1115 * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1116 * will take fewer bits to output.
1119 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1120 const struct lzx_codes * codes)
1122 u32 verbatim_cost = 0;
1123 u32 aligned_cost = 0;
1125 /* A verbatim block requires 3 bits in each place that an aligned offset
1126 * symbol would be used in an aligned offset block. */
1127 for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1128 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1129 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1132 /* Account for the cost of sending the codeword lengths of the aligned
1134 aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1135 LZX_ALIGNEDCODE_NUM_SYMBOLS;
1137 if (aligned_cost < verbatim_cost)
1138 return LZX_BLOCKTYPE_ALIGNED;
1140 return LZX_BLOCKTYPE_VERBATIM;
1144 * Flush an LZX block:
1146 * 1. Build the Huffman codes.
1147 * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1148 * 3. Write the block.
1149 * 4. Swap the indices of the current and previous Huffman codes.
1151 * Note: we never output UNCOMPRESSED blocks. This probably should be
1152 * implemented sometime, but it doesn't make much difference.
1155 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1156 const u8 *block_begin, u32 block_size, u32 seq_idx)
1160 lzx_build_huffman_codes(c);
1162 block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1163 &c->codes[c->codes_index]);
1164 lzx_write_compressed_block(block_begin,
1169 &c->chosen_sequences[seq_idx],
1170 &c->codes[c->codes_index],
1171 &c->codes[c->codes_index ^ 1].lens,
1173 c->codes_index ^= 1;
1176 /******************************************************************************/
1177 /* Block splitting algorithm */
1178 /*----------------------------------------------------------------------------*/
1181 * The problem of block splitting is to decide when it is worthwhile to start a
1182 * new block with new entropy codes. There is a theoretically optimal solution:
1183 * recursively consider every possible block split, considering the exact cost
1184 * of each block, and choose the minimum cost approach. But this is far too
1185 * slow. Instead, as an approximation, we can count symbols and after every N
1186 * symbols, compare the expected distribution of symbols based on the previous
1187 * data with the actual distribution. If they differ "by enough", then start a
1190 * As an optimization and heuristic, we don't distinguish between every symbol
1191 * but rather we combine many symbols into a single "observation type". For
1192 * literals we only look at the high bits and low bits, and for matches we only
1193 * look at whether the match is long or not. The assumption is that for typical
1194 * "real" data, places that are good block boundaries will tend to be noticable
1195 * based only on changes in these aggregate frequencies, without looking for
1196 * subtle differences in individual symbols. For example, a change from ASCII
1197 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1198 * to many matches (generally more compressible), would be easily noticed based
1199 * on the aggregates.
1201 * For determining whether the frequency distributions are "different enough" to
1202 * start a new block, the simply heuristic of splitting when the sum of absolute
1203 * differences exceeds a constant seems to be good enough.
1205 * Finally, for an approximation, it is not strictly necessary that the exact
1206 * symbols being used are considered. With "near-optimal parsing", for example,
1207 * the actual symbols that will be used are unknown until after the block
1208 * boundary is chosen and the block has been optimized. Since the final choices
1209 * cannot be used, we can use preliminary "greedy" choices instead.
1212 /* Initialize the block split statistics when starting a new block. */
1214 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1216 memset(stats, 0, sizeof(*stats));
1219 /* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the
1220 * literal, for 8 possible literal observation types. */
1221 static forceinline void
1222 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1224 stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1225 stats->num_new_observations++;
1228 /* Match observation. Heuristic: use one observation type for "short match" and
1229 * one observation type for "long match". */
1230 static forceinline void
1231 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1233 stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1234 stats->num_new_observations++;
1238 lzx_should_end_block(struct lzx_block_split_stats *stats)
1240 if (stats->num_observations > 0) {
1242 /* Note: to avoid slow divisions, we do not divide by
1243 * 'num_observations', but rather do all math with the numbers
1244 * multiplied by 'num_observations'. */
1245 u32 total_delta = 0;
1246 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1247 u32 expected = stats->observations[i] *
1248 stats->num_new_observations;
1249 u32 actual = stats->new_observations[i] *
1250 stats->num_observations;
1251 u32 delta = (actual > expected) ? actual - expected :
1253 total_delta += delta;
1256 /* Ready to end the block? */
1258 stats->num_new_observations * 7 / 8 * stats->num_observations)
1262 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1263 stats->num_observations += stats->new_observations[i];
1264 stats->observations[i] += stats->new_observations[i];
1265 stats->new_observations[i] = 0;
1267 stats->num_new_observations = 0;
1271 /******************************************************************************/
1272 /* Slower ("near-optimal") compression algorithm */
1273 /*----------------------------------------------------------------------------*/
1276 * Least-recently-used queue for match offsets.
1278 * This is represented as a 64-bit integer for efficiency. There are three
1279 * offsets of 21 bits each. Bit 64 is garbage.
1281 struct lzx_lru_queue {
1283 } _aligned_attribute(8);
1285 #define LZX_QUEUE_OFFSET_SHIFT 21
1286 #define LZX_QUEUE_OFFSET_MASK (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1288 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1289 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1290 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1292 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1293 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1294 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1296 #define LZX_QUEUE_INITIALIZER { \
1297 ((u64)1 << LZX_QUEUE_R0_SHIFT) | \
1298 ((u64)1 << LZX_QUEUE_R1_SHIFT) | \
1299 ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1301 static forceinline u64
1302 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1304 return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1307 static forceinline u64
1308 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1310 return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1313 static forceinline u64
1314 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1316 return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1319 /* Push a match offset onto the front (most recently used) end of the queue. */
1320 static forceinline struct lzx_lru_queue
1321 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1323 return (struct lzx_lru_queue) {
1324 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1328 /* Swap a match offset to the front of the queue. */
1329 static forceinline struct lzx_lru_queue
1330 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1332 unsigned shift = idx * 21;
1333 const u64 mask = LZX_QUEUE_R0_MASK;
1334 const u64 mask_high = mask << shift;
1336 return (struct lzx_lru_queue) {
1337 (queue.R & ~(mask | mask_high)) |
1338 ((queue.R & mask_high) >> shift) |
1339 ((queue.R & mask) << shift)
1343 static forceinline u32
1344 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1347 u32 node_idx = block_size;
1348 u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1;
1352 /* Special value to mark last sequence */
1353 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = 0x80000000;
1354 lit_start_node = node_idx;
1360 u32 adjusted_offset;
1362 unsigned offset_slot;
1364 /* Tally literals until either a match or the beginning of the
1365 * block is reached. Note: the item in the node at the
1366 * beginning of the block has all bits set, causing this loop to
1367 * end when it is reached. */
1369 item = c->optimum_nodes[node_idx].item;
1370 if (item & OPTIMUM_LEN_MASK)
1372 c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1376 #if CONSIDER_GAP_MATCHES
1377 if (item & OPTIMUM_GAP_MATCH) {
1382 /* Record the literal run length for the next sequence
1383 * (the "previous sequence" when walking backwards). */
1384 len = item & OPTIMUM_LEN_MASK;
1386 c->chosen_sequences[seq_idx--].litrunlen =
1387 lit_start_node - node_idx;
1388 lit_start_node = node_idx - len;
1391 /* Tally the rep0 match after the gap. */
1392 v = len - LZX_MIN_MATCH_LEN;
1394 c->chosen_sequences[seq_idx].adjusted_length = v;
1395 if (v >= LZX_NUM_PRIMARY_LENS) {
1396 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1397 v = LZX_NUM_PRIMARY_LENS;
1399 c->freqs.main[LZX_NUM_CHARS + v]++;
1401 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v;
1403 /* Tally the literal in the gap. */
1404 c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1406 /* Fall through and tally the match before the gap.
1407 * (It was temporarily saved in the 'cost' field of the
1408 * previous node, which was free to reuse.) */
1409 item = c->optimum_nodes[--node_idx].cost;
1412 #else /* CONSIDER_GAP_MATCHES */
1415 #endif /* !CONSIDER_GAP_MATCHES */
1417 len = item & OPTIMUM_LEN_MASK;
1418 adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
1420 /* Record the literal run length for the next sequence (the
1421 * "previous sequence" when walking backwards). */
1423 c->chosen_sequences[seq_idx--].litrunlen =
1424 lit_start_node - node_idx;
1426 lit_start_node = node_idx;
1431 /* Record a match. */
1433 /* Tally the aligned offset symbol if needed. */
1434 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
1435 c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
1437 /* Record the adjusted length. */
1438 v = len - LZX_MIN_MATCH_LEN;
1440 c->chosen_sequences[seq_idx].adjusted_length = v;
1442 /* Tally the length symbol if needed. */
1443 if (v >= LZX_NUM_PRIMARY_LENS) {
1444 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1445 v = LZX_NUM_PRIMARY_LENS;
1448 /* Tally the main symbol. */
1449 offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1450 v += offset_slot * LZX_NUM_LEN_HEADERS;
1451 c->freqs.main[LZX_NUM_CHARS + v]++;
1453 /* Record the adjusted offset and match header. */
1455 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr =
1456 (adjusted_offset << 9) | v;
1460 /* Record the literal run length for the first sequence. */
1462 c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx;
1464 /* Return the index in chosen_sequences at which the sequences begin. */
1469 * Given the minimum-cost path computed through the item graph for the current
1470 * block, walk the path and count how many of each symbol in each Huffman-coded
1471 * alphabet would be required to output the items (matches and literals) along
1474 * Note that the path will be walked backwards (from the end of the block to the
1475 * beginning of the block), but this doesn't matter because this function only
1476 * computes frequencies.
1478 static forceinline void
1479 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1481 lzx_walk_item_list(c, block_size, is_16_bit, false);
1485 * Like lzx_tally_item_list(), but this function also generates the list of
1486 * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1487 * ready to be output to the bitstream after the Huffman codes are computed.
1488 * The lzx_sequences will be written to decreasing memory addresses as the path
1489 * is walked backwards, which means they will end up in the expected
1490 * first-to-last order. The return value is the index in c->chosen_sequences at
1491 * which the lzx_sequences begin.
1493 static forceinline u32
1494 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1496 return lzx_walk_item_list(c, block_size, is_16_bit, true);
1500 * Find an inexpensive path through the graph of possible match/literal choices
1501 * for the current block. The nodes of the graph are
1502 * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in
1503 * the current block, plus one extra node for end-of-block. The edges of the
1504 * graph are matches and literals. The goal is to find the minimum cost path
1505 * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1508 * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1509 * proceeding forwards one node at a time. At each node, a selection of matches
1510 * (len >= 2), as well as the literal byte (len = 1), is considered. An item of
1511 * length 'len' provides a new path to reach the node 'len' bytes later. If
1512 * such a path is the lowest cost found so far to reach that later node, then
1513 * that later node is updated with the new cost and the "arrival" which provided
1516 * Note that although this algorithm is based on minimum cost path search, due
1517 * to various simplifying assumptions the result is not guaranteed to be the
1518 * true minimum cost, or "optimal", path over the graph of all valid LZX
1519 * representations of this block.
1521 * Also, note that because of the presence of the recent offsets queue (which is
1522 * a type of adaptive state), the algorithm cannot work backwards and compute
1523 * "cost to end" instead of "cost to beginning". Furthermore, the way the
1524 * algorithm handles this adaptive state in the "minimum cost" parse is actually
1525 * only an approximation. It's possible for the globally optimal, minimum cost
1526 * path to contain a prefix, ending at a position, where that path prefix is
1527 * *not* the minimum cost path to that position. This can happen if such a path
1528 * prefix results in a different adaptive state which results in lower costs
1529 * later. The algorithm does not solve this problem in general; it only looks
1530 * one step ahead, with the exception of special consideration for "gap
1533 static forceinline struct lzx_lru_queue
1534 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1535 const u8 * const restrict block_begin,
1536 const u32 block_size,
1537 const struct lzx_lru_queue initial_queue,
1540 struct lzx_optimum_node *cur_node = c->optimum_nodes;
1541 struct lzx_optimum_node * const end_node = cur_node + block_size;
1542 struct lz_match *cache_ptr = c->match_cache;
1543 const u8 *in_next = block_begin;
1544 const u8 * const block_end = block_begin + block_size;
1547 * Instead of storing the match offset LRU queues in the
1548 * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1549 * storing them in a smaller array. This works because the algorithm
1550 * only requires a limited history of the adaptive state. Once a given
1551 * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1552 * (more if gap match consideration is enabled; we just round up to 512
1553 * so it's a power of 2), it is no longer needed.
1555 * The QUEUE() macro finds the queue for the given node. This macro has
1556 * been optimized by taking advantage of 'struct lzx_lru_queue' and
1557 * 'struct lzx_optimum_node' both being 8 bytes in size and alignment.
1559 struct lzx_lru_queue queues[512];
1560 STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1561 STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1562 #define QUEUE(node) \
1563 (*(struct lzx_lru_queue *)((char *)queues + \
1564 ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0])))))
1565 /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/
1567 #if CONSIDER_GAP_MATCHES
1568 u32 matches_before_gap[ARRAY_LEN(queues)];
1569 #define MATCH_BEFORE_GAP(node) \
1570 (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1571 ARRAY_LEN(matches_before_gap)])
1575 * Initially, the cost to reach each node is "infinity".
1577 * The first node actually should have cost 0, but "infinity"
1578 * (0xFFFFFFFF) works just as well because it immediately overflows.
1580 * The following statement also intentionally sets the 'item' of the
1581 * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
1582 * use as a sentinel. See lzx_walk_item_list().
1584 memset(c->optimum_nodes, 0xFF,
1585 (block_size + 1) * sizeof(c->optimum_nodes[0]));
1587 /* Initialize the recent offsets queue for the first node. */
1588 QUEUE(cur_node) = initial_queue;
1590 do { /* For each node in the block in position order... */
1592 unsigned num_matches;
1597 * A selection of matches for the block was already saved in
1598 * memory so that we don't have to run the uncompressed data
1599 * through the matchfinder on every optimization pass. However,
1600 * we still search for repeat offset matches during each
1601 * optimization pass because we cannot predict the state of the
1602 * recent offsets queue. But as a heuristic, we don't bother
1603 * searching for repeat offset matches if the general-purpose
1604 * matchfinder failed to find any matches.
1606 * Note that a match of length n at some offset implies there is
1607 * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1608 * that same offset. In other words, we don't necessarily need
1609 * to use the full length of a match. The key heuristic that
1610 * saves a significicant amount of time is that for each
1611 * distinct length, we only consider the smallest offset for
1612 * which that length is available. This heuristic also applies
1613 * to repeat offsets, which we order specially: R0 < R1 < R2 <
1614 * any explicit offset. Of course, this heuristic may be
1615 * produce suboptimal results because offset slots in LZX are
1616 * subject to entropy encoding, but in practice this is a useful
1620 num_matches = cache_ptr->length;
1624 struct lz_match *end_matches = cache_ptr + num_matches;
1625 unsigned next_len = LZX_MIN_MATCH_LEN;
1626 unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1629 /* Consider rep0 matches. */
1630 matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1631 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1633 STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1635 u32 cost = cur_node->cost +
1636 c->costs.match_cost[0][
1637 next_len - LZX_MIN_MATCH_LEN];
1638 if (cost <= (cur_node + next_len)->cost) {
1639 (cur_node + next_len)->cost = cost;
1640 (cur_node + next_len)->item =
1641 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1643 if (unlikely(++next_len > max_len)) {
1644 cache_ptr = end_matches;
1647 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1651 /* Consider rep1 matches. */
1652 matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1653 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1655 if (matchptr[next_len - 1] != in_next[next_len - 1])
1657 for (unsigned len = 2; len < next_len - 1; len++)
1658 if (matchptr[len] != in_next[len])
1661 u32 cost = cur_node->cost +
1662 c->costs.match_cost[1][
1663 next_len - LZX_MIN_MATCH_LEN];
1664 if (cost <= (cur_node + next_len)->cost) {
1665 (cur_node + next_len)->cost = cost;
1666 (cur_node + next_len)->item =
1667 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1669 if (unlikely(++next_len > max_len)) {
1670 cache_ptr = end_matches;
1673 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1677 /* Consider rep2 matches. */
1678 matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1679 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1681 if (matchptr[next_len - 1] != in_next[next_len - 1])
1683 for (unsigned len = 2; len < next_len - 1; len++)
1684 if (matchptr[len] != in_next[len])
1687 u32 cost = cur_node->cost +
1688 c->costs.match_cost[2][
1689 next_len - LZX_MIN_MATCH_LEN];
1690 if (cost <= (cur_node + next_len)->cost) {
1691 (cur_node + next_len)->cost = cost;
1692 (cur_node + next_len)->item =
1693 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1695 if (unlikely(++next_len > max_len)) {
1696 cache_ptr = end_matches;
1699 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1703 while (next_len > cache_ptr->length)
1704 if (++cache_ptr == end_matches)
1707 /* Consider explicit offset matches. */
1709 u32 offset = cache_ptr->offset;
1710 u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
1711 unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1712 u32 base_cost = cur_node->cost;
1715 #if CONSIDER_ALIGNED_COSTS
1716 if (offset >= LZX_MIN_ALIGNED_OFFSET)
1717 base_cost += c->costs.aligned[adjusted_offset &
1718 LZX_ALIGNED_OFFSET_BITMASK];
1722 c->costs.match_cost[offset_slot][
1723 next_len - LZX_MIN_MATCH_LEN];
1724 if (cost < (cur_node + next_len)->cost) {
1725 (cur_node + next_len)->cost = cost;
1726 (cur_node + next_len)->item =
1727 (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len;
1729 } while (++next_len <= cache_ptr->length);
1731 if (++cache_ptr == end_matches) {
1732 #if CONSIDER_GAP_MATCHES
1733 /* Also consider the longest explicit
1734 * offset match as a "gap match": match
1736 s32 remaining = (block_end - in_next) - (s32)next_len;
1737 if (likely(remaining >= 2)) {
1738 const u8 *strptr = in_next + next_len;
1739 const u8 *matchptr = strptr - offset;
1740 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1741 STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1742 STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1743 unsigned limit = min(remaining,
1744 min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1745 LZX_MAX_MATCH_LEN));
1746 unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit);
1747 u8 lit = strptr[-1];
1748 cost += c->costs.main[lit] +
1749 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1750 unsigned total_len = next_len + rep0_len;
1751 if (cost < (cur_node + total_len)->cost) {
1752 (cur_node + total_len)->cost = cost;
1753 (cur_node + total_len)->item =
1755 ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1757 MATCH_BEFORE_GAP(cur_node + total_len) =
1758 (adjusted_offset << OPTIMUM_OFFSET_SHIFT) |
1763 #endif /* CONSIDER_GAP_MATCHES */
1771 /* Consider coding a literal.
1773 * To avoid an extra branch, actually checking the preferability
1774 * of coding the literal is integrated into the queue update
1776 literal = *in_next++;
1777 cost = cur_node->cost + c->costs.main[literal];
1779 /* Advance to the next position. */
1782 /* The lowest-cost path to the current position is now known.
1783 * Finalize the recent offsets queue that results from taking
1784 * this lowest-cost path. */
1786 if (cost <= cur_node->cost) {
1787 /* Literal: queue remains unchanged. */
1788 cur_node->cost = cost;
1789 cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1790 QUEUE(cur_node) = QUEUE(cur_node - 1);
1792 /* Match: queue update is needed. */
1793 unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1794 #if CONSIDER_GAP_MATCHES
1795 s32 adjusted_offset = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1796 STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1798 u32 adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1801 if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) {
1802 /* Explicit offset match: insert offset at front. */
1804 lzx_lru_queue_push(QUEUE(cur_node - len),
1805 adjusted_offset - LZX_OFFSET_ADJUSTMENT);
1807 #if CONSIDER_GAP_MATCHES
1808 else if (adjusted_offset < 0) {
1809 /* "Gap match": Explicit offset match, then a
1810 * literal, then rep0 match. Save the explicit
1811 * offset match information in the cost field of
1812 * the previous node, which isn't needed
1813 * anymore. Then insert the offset at the front
1815 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1816 (cur_node - 1)->cost = match_before_gap;
1818 lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1819 (match_before_gap & OPTIMUM_LEN_MASK)),
1820 (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1821 LZX_OFFSET_ADJUSTMENT);
1825 /* Repeat offset match: swap offset to front. */
1827 lzx_lru_queue_swap(QUEUE(cur_node - len),
1831 } while (cur_node != end_node);
1833 /* Return the recent offsets queue at the end of the path. */
1834 return QUEUE(cur_node);
1838 * Given the costs for the main and length codewords (c->costs.main and
1839 * c->costs.len), initialize the match cost array (c->costs.match_cost) which
1840 * directly provides the cost of every possible (length, offset slot) pair.
1843 lzx_compute_match_costs(struct lzx_compressor *c)
1845 unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1846 LZX_NUM_LEN_HEADERS;
1847 struct lzx_costs *costs = &c->costs;
1848 unsigned main_symbol = LZX_NUM_CHARS;
1850 for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1853 u32 extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST;
1856 #if CONSIDER_ALIGNED_COSTS
1857 if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT)
1858 extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
1861 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1862 costs->match_cost[offset_slot][i] =
1863 costs->main[main_symbol++] + extra_cost;
1866 extra_cost += costs->main[main_symbol++];
1868 for (; i < LZX_NUM_LENS; i++) {
1869 costs->match_cost[offset_slot][i] =
1870 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1877 * Fast approximation for log2f(x). This is not as accurate as the standard C
1878 * version. It does not need to be perfectly accurate because it is only used
1879 * for estimating symbol costs, which is very approximate anyway.
1889 /* Extract the exponent and subtract 127 to remove the bias. This gives
1890 * the integer part of the result. */
1891 float res = ((u.i >> 23) & 0xFF) - 127;
1893 /* Set the exponent to 0 (plus bias of 127). This transforms the number
1894 * to the range [1, 2) while retaining the same mantissa. */
1895 u.i = (u.i & ~(0xFF << 23)) | (127 << 23);
1898 * Approximate the log2 of the transformed number using a degree 2
1899 * interpolating polynomial for log2(x) over the interval [1, 2). Then
1900 * add this to the extracted exponent to produce the final approximation
1903 * The coefficients of the interpolating polynomial used here were found
1904 * using the script tools/log2_interpolation.r.
1906 return res - 1.653124006f + u.f * (1.9941812f - u.f * 0.3347490189f);
1911 * Return the estimated cost of a symbol which has been estimated to have the
1912 * given probability.
1915 lzx_cost_for_probability(float prob)
1918 * The basic formula is:
1920 * entropy = -log2(probability)
1922 * Use this to get the cost in fractional bits. Then multiply by our
1923 * scaling factor of BIT_COST and convert to an integer.
1925 * In addition, the minimum cost is BIT_COST (one bit) because the
1926 * entropy coding method will be Huffman codes.
1928 * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may
1929 * be positive due to inaccuracy in our log2 approximation. Therefore,
1930 * we cannot, in general, assume the computed cost is non-negative, and
1931 * we should make sure negative costs get rounded up correctly.
1933 s32 cost = -log2f_fast(prob) * BIT_COST;
1934 return max(cost, BIT_COST);
1938 * Mapping: number of used literals => heuristic probability of a literal times
1939 * 6870. Generated by running this R command:
1941 * cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
1943 static const u8 literal_scaled_probs[257] = {
1944 255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
1945 219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
1946 189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
1947 162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
1948 139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
1949 120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
1950 103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
1951 86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
1952 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
1953 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
1954 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
1955 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
1956 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
1957 27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
1958 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
1959 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
1963 * Mapping: length symbol => default cost of that symbol. This is derived from
1964 * sample data but has been slightly edited to add more bias towards the
1965 * shortest lengths, which are the most common.
1967 static const u16 lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
1968 300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
1969 503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
1970 659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
1971 666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
1972 706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
1973 728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
1974 738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
1975 747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
1976 753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
1977 754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
1978 758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
1979 750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
1980 757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
1981 762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
1982 763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
1983 762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
1984 764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
1985 762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
1988 /* Set default costs to bootstrap the iterative optimization algorithm. */
1990 lzx_set_default_costs(struct lzx_compressor *c)
1993 u32 num_literals = 0;
1994 u32 num_used_literals = 0;
1995 float inv_num_matches = 1.0f / c->freqs.main[LZX_NUM_CHARS];
1996 float inv_num_items;
1997 float prob_match = 1.0f;
1999 float base_literal_prob;
2001 /* Some numbers here have been hardcoded to assume a bit cost of 64. */
2002 STATIC_ASSERT(BIT_COST == 64);
2004 /* Estimate the number of literals that will used. 'num_literals' is
2005 * the total number, whereas 'num_used_literals' is the number of
2006 * distinct symbols. */
2007 for (i = 0; i < LZX_NUM_CHARS; i++) {
2008 num_literals += c->freqs.main[i];
2009 num_used_literals += (c->freqs.main[i] != 0);
2012 /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'. We
2013 * don't attempt to estimate which ones will be used. */
2015 inv_num_items = 1.0f / (num_literals + c->freqs.main[LZX_NUM_CHARS]);
2016 base_literal_prob = literal_scaled_probs[num_used_literals] *
2019 /* Literal costs. We use two different methods to compute the
2020 * probability of each literal and mix together their results. */
2021 for (i = 0; i < LZX_NUM_CHARS; i++) {
2022 u32 freq = c->freqs.main[i];
2024 float prob = 0.5f * ((freq * inv_num_items) +
2026 c->costs.main[i] = lzx_cost_for_probability(prob);
2029 c->costs.main[i] = 11 * BIT_COST;
2033 /* Match header costs. We just assume that all match headers are
2034 * equally probable, but we do take into account the relative cost of a
2035 * match header vs. a literal depending on how common matches are
2036 * expected to be vs. literals. */
2037 prob_match = max(prob_match, 0.15f);
2038 match_cost = lzx_cost_for_probability(prob_match / (c->num_main_syms -
2040 for (; i < c->num_main_syms; i++)
2041 c->costs.main[i] = match_cost;
2043 /* Length symbol costs. These are just set to fixed values which
2044 * reflect the fact the smallest lengths are typically the most common,
2045 * and therefore are typically the cheapest. */
2046 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2047 c->costs.len[i] = lzx_default_len_costs[i];
2049 #if CONSIDER_ALIGNED_COSTS
2050 /* Aligned offset symbol costs. These are derived from the estimated
2051 * probability of each aligned offset symbol. */
2052 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2053 /* We intentionally tallied the frequencies in the wrong slots,
2054 * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the
2055 * fixup here is faster: a constant 8 subtractions here vs. one
2056 * addition for every match. */
2057 unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK;
2058 if (c->freqs.aligned[j] != 0) {
2059 float prob = c->freqs.aligned[j] * inv_num_matches;
2060 c->costs.aligned[i] = lzx_cost_for_probability(prob);
2062 c->costs.aligned[i] =
2063 (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST;
2069 /* Update the current cost model to reflect the computed Huffman codes. */
2071 lzx_set_costs_from_codes(struct lzx_compressor *c)
2074 const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
2076 for (i = 0; i < c->num_main_syms; i++) {
2077 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
2078 MAIN_CODEWORD_LIMIT) * BIT_COST;
2081 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
2082 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
2083 LENGTH_CODEWORD_LIMIT) * BIT_COST;
2086 #if CONSIDER_ALIGNED_COSTS
2087 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2088 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
2089 ALIGNED_CODEWORD_LIMIT) * BIT_COST;
2095 * Choose a "near-optimal" literal/match sequence to use for the current block,
2096 * then flush the block. Because the cost of each Huffman symbol is unknown
2097 * until the Huffman codes have been built and the Huffman codes themselves
2098 * depend on the symbol frequencies, this uses an iterative optimization
2099 * algorithm to approximate an optimal solution. The first optimization pass
2100 * for the block uses default costs; additional passes use costs derived from
2101 * the Huffman codes computed in the previous pass.
2103 static forceinline struct lzx_lru_queue
2104 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
2105 struct lzx_output_bitstream * const restrict os,
2106 const u8 * const restrict block_begin,
2107 const u32 block_size,
2108 const struct lzx_lru_queue initial_queue,
2111 unsigned num_passes_remaining = c->num_optim_passes;
2112 struct lzx_lru_queue new_queue;
2115 lzx_set_default_costs(c);
2118 lzx_compute_match_costs(c);
2119 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
2120 initial_queue, is_16_bit);
2122 if (--num_passes_remaining == 0)
2125 /* At least one optimization pass remains. Update the costs. */
2126 lzx_reset_symbol_frequencies(c);
2127 lzx_tally_item_list(c, block_size, is_16_bit);
2128 lzx_build_huffman_codes(c);
2129 lzx_set_costs_from_codes(c);
2132 /* Done optimizing. Generate the sequence list and flush the block. */
2133 lzx_reset_symbol_frequencies(c);
2134 seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
2135 lzx_flush_block(c, os, block_begin, block_size, seq_idx);
2140 * This is the "near-optimal" LZX compressor.
2142 * For each block, it performs a relatively thorough graph search to find an
2143 * inexpensive (in terms of compressed size) way to output the block.
2145 * Note: there are actually many things this algorithm leaves on the table in
2146 * terms of compression ratio. So although it may be "near-optimal", it is
2147 * certainly not "optimal". The goal is not to produce the optimal compression
2148 * ratio, which for LZX is probably impossible within any practical amount of
2149 * time, but rather to produce a compression ratio significantly better than a
2150 * simpler "greedy" or "lazy" parse while still being relatively fast.
2152 static forceinline void
2153 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2154 const u8 * const restrict in_begin, size_t in_nbytes,
2155 struct lzx_output_bitstream * restrict os,
2158 const u8 * in_next = in_begin;
2159 const u8 * const in_end = in_begin + in_nbytes;
2160 u32 max_len = LZX_MAX_MATCH_LEN;
2161 u32 nice_len = min(c->nice_match_length, max_len);
2162 u32 next_hashes[2] = {0, 0};
2163 struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2165 /* Initialize the matchfinder. */
2166 CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2169 /* Starting a new block */
2171 const u8 * const in_block_begin = in_next;
2172 const u8 * const in_max_block_end =
2173 in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2174 struct lz_match *cache_ptr = c->match_cache;
2175 const u8 *next_search_pos = in_next;
2176 const u8 *next_observation = in_next;
2177 const u8 *next_pause_point =
2178 min(in_next + min(MIN_BLOCK_SIZE,
2179 in_max_block_end - in_next),
2180 in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2181 in_max_block_end - in_next));
2183 lzx_init_block_split_stats(&c->split_stats);
2184 lzx_reset_symbol_frequencies(c);
2186 if (in_next >= next_pause_point)
2190 * Run the input buffer through the matchfinder, caching the
2191 * matches, until we decide to end the block.
2193 * For a tighter matchfinding loop, we compute a "pause point",
2194 * which is the next position at which we may need to check
2195 * whether to end the block or to decrease max_len. We then
2196 * only do these extra checks upon reaching the pause point.
2198 resume_matchfinding:
2200 if (in_next >= next_search_pos) {
2201 /* Search for matches at this position. */
2202 struct lz_match *lz_matchptr;
2205 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2206 bt_matchfinder_get_matches,
2211 c->max_search_depth,
2215 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2216 cache_ptr = lz_matchptr;
2218 /* Accumulate literal/match statistics for block
2219 * splitting and for generating the initial cost
2221 if (in_next >= next_observation) {
2222 best_len = cache_ptr[-1].length;
2223 if (best_len >= 3) {
2224 /* Match (len >= 3) */
2227 * Note: for performance reasons this has
2228 * been simplified significantly:
2230 * - We wait until later to account for
2231 * LZX_OFFSET_ADJUSTMENT.
2232 * - We don't account for repeat offsets.
2233 * - We don't account for different match headers.
2235 c->freqs.aligned[cache_ptr[-1].offset &
2236 LZX_ALIGNED_OFFSET_BITMASK]++;
2237 c->freqs.main[LZX_NUM_CHARS]++;
2239 lzx_observe_match(&c->split_stats, best_len);
2240 next_observation = in_next + best_len;
2243 c->freqs.main[*in_next]++;
2244 lzx_observe_literal(&c->split_stats, *in_next);
2245 next_observation = in_next + 1;
2250 * If there was a very long match found, then
2251 * don't cache any matches for the bytes covered
2252 * by that match. This avoids degenerate
2253 * behavior when compressing highly redundant
2254 * data, where the number of matches can be very
2257 * This heuristic doesn't actually hurt the
2258 * compression ratio *too* much. If there's a
2259 * long match, then the data must be highly
2260 * compressible, so it doesn't matter as much
2263 if (best_len >= nice_len)
2264 next_search_pos = in_next + best_len;
2266 /* Don't search for matches at this position. */
2267 CALL_BT_MF(is_16_bit, c,
2268 bt_matchfinder_skip_position,
2272 c->max_search_depth,
2274 cache_ptr->length = 0;
2277 } while (++in_next < next_pause_point &&
2278 likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2282 /* Adjust max_len and nice_len if we're nearing the end of the
2283 * input buffer. In addition, if we are so close to the end of
2284 * the input buffer that there cannot be any more matches, then
2285 * just advance through the last few positions and record no
2287 if (unlikely(max_len > in_end - in_next)) {
2288 max_len = in_end - in_next;
2289 nice_len = min(max_len, nice_len);
2290 if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2291 while (in_next != in_end) {
2292 cache_ptr->length = 0;
2299 /* End the block if the match cache may overflow. */
2300 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2303 /* End the block if the soft maximum size has been reached. */
2304 if (in_next >= in_max_block_end)
2307 /* End the block if the block splitting algorithm thinks this is
2308 * a good place to do so. */
2309 if (c->split_stats.num_new_observations >=
2310 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2311 in_max_block_end - in_next >= MIN_BLOCK_SIZE &&
2312 lzx_should_end_block(&c->split_stats))
2315 /* It's not time to end the block yet. Compute the next pause
2316 * point and resume matchfinding. */
2318 min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2319 c->split_stats.num_new_observations,
2320 in_max_block_end - in_next),
2321 in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2322 in_max_block_end - in_next));
2323 goto resume_matchfinding;
2326 /* We've decided on a block boundary and cached matches. Now
2327 * choose a match/literal sequence and flush the block. */
2328 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2329 in_next - in_block_begin,
2331 } while (in_next != in_end);
2335 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2336 size_t in_nbytes, struct lzx_output_bitstream *os)
2338 lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2342 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2343 size_t in_nbytes, struct lzx_output_bitstream *os)
2345 lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2348 /******************************************************************************/
2349 /* Faster ("lazy") compression algorithm */
2350 /*----------------------------------------------------------------------------*/
2353 * Called when the compressor chooses to use a literal. This tallies the
2354 * Huffman symbol for the literal, increments the current literal run length,
2355 * and "observes" the literal for the block split statistics.
2357 static forceinline void
2358 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2360 lzx_observe_literal(&c->split_stats, literal);
2361 c->freqs.main[literal]++;
2366 * Called when the compressor chooses to use a match. This tallies the Huffman
2367 * symbol(s) for a match, saves the match data and the length of the preceding
2368 * literal run, updates the recent offsets queue, and "observes" the match for
2369 * the block split statistics.
2371 static forceinline void
2372 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
2373 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2374 u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2376 u32 litrunlen = *litrunlen_p;
2377 struct lzx_sequence *next_seq = *next_seq_p;
2378 unsigned offset_slot;
2381 lzx_observe_match(&c->split_stats, length);
2383 v = length - LZX_MIN_MATCH_LEN;
2385 /* Save the literal run length and adjusted length. */
2386 next_seq->litrunlen = litrunlen;
2387 next_seq->adjusted_length = v;
2389 /* Compute the length header, then tally the length symbol if needed. */
2390 if (v >= LZX_NUM_PRIMARY_LENS) {
2391 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
2392 v = LZX_NUM_PRIMARY_LENS;
2395 /* Compute the offset slot. */
2396 offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
2398 /* Compute the match header. */
2399 v += offset_slot * LZX_NUM_LEN_HEADERS;
2401 /* Save the adjusted offset and match header. */
2402 next_seq->adjusted_offset_and_match_hdr = (adjusted_offset << 9) | v;
2404 /* Tally the main symbol. */
2405 c->freqs.main[LZX_NUM_CHARS + v]++;
2407 /* Update the recent offsets queue. */
2408 if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
2409 /* Repeat offset match. */
2410 swap(recent_offsets[0], recent_offsets[adjusted_offset]);
2412 /* Explicit offset match. */
2414 /* Tally the aligned offset symbol if needed. */
2415 if (adjusted_offset >= 16)
2416 c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
2418 recent_offsets[2] = recent_offsets[1];
2419 recent_offsets[1] = recent_offsets[0];
2420 recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
2423 /* Reset the literal run length and advance to the next sequence. */
2424 *next_seq_p = next_seq + 1;
2429 * Called when the compressor ends a block. This finshes the last lzx_sequence,
2430 * which is just a literal run with no following match. This literal run might
2433 static forceinline void
2434 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2436 last_seq->litrunlen = litrunlen;
2438 /* Special value to mark last sequence */
2439 last_seq->adjusted_offset_and_match_hdr = 0x80000000;
2443 * Find the longest repeat offset match with the current position. If a match
2444 * is found, return its length and set *best_rep_idx_ret to the index of its
2445 * offset in @recent_offsets. Otherwise, return 0.
2447 * Don't bother with length 2 matches; consider matches of length >= 3 only.
2448 * Also assume that max_len >= 3.
2451 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2452 const u32 recent_offsets[],
2453 const unsigned max_len,
2454 unsigned *best_rep_idx_ret)
2456 STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2458 const u32 seq3 = load_u24_unaligned(in_next);
2460 unsigned best_rep_len = 0;
2461 unsigned best_rep_idx = 0;
2464 /* Check for rep0 match (most recent offset) */
2465 matchptr = in_next - recent_offsets[0];
2466 if (load_u24_unaligned(matchptr) == seq3)
2467 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2469 /* Check for rep1 match (second most recent offset) */
2470 matchptr = in_next - recent_offsets[1];
2471 if (load_u24_unaligned(matchptr) == seq3) {
2472 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2473 if (rep_len > best_rep_len) {
2474 best_rep_len = rep_len;
2479 /* Check for rep2 match (third most recent offset) */
2480 matchptr = in_next - recent_offsets[2];
2481 if (load_u24_unaligned(matchptr) == seq3) {
2482 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2483 if (rep_len > best_rep_len) {
2484 best_rep_len = rep_len;
2489 *best_rep_idx_ret = best_rep_idx;
2490 return best_rep_len;
2494 * Fast heuristic scoring for lazy parsing: how "good" is this match?
2495 * This is mainly determined by the length: longer matches are better.
2496 * However, we also give a bonus to close (small offset) matches and to repeat
2497 * offset matches, since those require fewer bits to encode.
2500 static forceinline unsigned
2501 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2503 unsigned score = len;
2505 if (adjusted_offset < 4096)
2507 if (adjusted_offset < 256)
2513 static forceinline unsigned
2514 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2520 * This is the "lazy" LZX compressor. The basic idea is that before it chooses
2521 * a match, it checks to see if there's a longer match at the next position. If
2522 * yes, it chooses a literal and continues to the next position. If no, it
2523 * chooses the match.
2525 * Some additional heuristics are used as well. Repeat offset matches are
2526 * considered favorably and sometimes are chosen immediately. In addition, long
2527 * matches (at least "nice_len" bytes) are chosen immediately as well. Finally,
2528 * when we decide whether a match is "better" than another, we take the offset
2529 * into consideration as well as the length.
2531 static forceinline void
2532 lzx_compress_lazy(struct lzx_compressor * restrict c,
2533 const u8 * const restrict in_begin, size_t in_nbytes,
2534 struct lzx_output_bitstream * restrict os, bool is_16_bit)
2536 const u8 * in_next = in_begin;
2537 const u8 * const in_end = in_begin + in_nbytes;
2538 unsigned max_len = LZX_MAX_MATCH_LEN;
2539 unsigned nice_len = min(c->nice_match_length, max_len);
2540 STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2541 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
2542 u32 next_hashes[2] = {0, 0};
2544 /* Initialize the matchfinder. */
2545 CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2548 /* Starting a new block */
2550 const u8 * const in_block_begin = in_next;
2551 const u8 * const in_max_block_end =
2552 in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2553 struct lzx_sequence *next_seq = c->chosen_sequences;
2557 u32 cur_adjusted_offset;
2561 u32 next_adjusted_offset;
2562 unsigned next_score;
2563 unsigned best_rep_len;
2564 unsigned best_rep_idx;
2568 lzx_reset_symbol_frequencies(c);
2569 lzx_init_block_split_stats(&c->split_stats);
2572 /* Adjust max_len and nice_len if we're nearing the end
2573 * of the input buffer. */
2574 if (unlikely(max_len > in_end - in_next)) {
2575 max_len = in_end - in_next;
2576 nice_len = min(max_len, nice_len);
2579 /* Find the longest match (subject to the
2580 * max_search_depth cutoff parameter) with the current
2581 * position. Don't bother with length 2 matches; only
2582 * look for matches of length >= 3. */
2583 cur_len = CALL_HC_MF(is_16_bit, c,
2584 hc_matchfinder_longest_match,
2590 c->max_search_depth,
2594 /* If there was no match found, or the only match found
2595 * was a distant short match, then choose a literal. */
2598 cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2599 cur_offset != recent_offsets[0] &&
2600 cur_offset != recent_offsets[1] &&
2601 cur_offset != recent_offsets[2]))
2603 lzx_choose_literal(c, *in_next, &litrunlen);
2608 /* Heuristic: if this match has the most recent offset,
2609 * then go ahead and choose it as a rep0 match. */
2610 if (cur_offset == recent_offsets[0]) {
2612 skip_len = cur_len - 1;
2613 cur_adjusted_offset = 0;
2614 goto choose_cur_match;
2617 /* Compute the longest match's score as an explicit
2619 cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT;
2620 cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset);
2622 /* Find the longest repeat offset match at this
2623 * position. If we find one and it's "better" than the
2624 * explicit offset match we found, then go ahead and
2625 * choose the repeat offset match immediately. */
2626 best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2632 if (best_rep_len != 0 &&
2633 (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2634 best_rep_idx)) >= cur_score)
2636 cur_len = best_rep_len;
2637 cur_adjusted_offset = best_rep_idx;
2638 skip_len = best_rep_len - 1;
2639 goto choose_cur_match;
2644 * We have a match at the current position. If the
2645 * match is very long, then choose it immediately.
2646 * Otherwise, see if there's a better match at the next
2650 if (cur_len >= nice_len) {
2651 skip_len = cur_len - 1;
2652 goto choose_cur_match;
2655 if (unlikely(max_len > in_end - in_next)) {
2656 max_len = in_end - in_next;
2657 nice_len = min(max_len, nice_len);
2660 next_len = CALL_HC_MF(is_16_bit, c,
2661 hc_matchfinder_longest_match,
2667 c->max_search_depth / 2,
2671 if (next_len <= cur_len - 2) {
2672 /* No potentially better match was found. */
2674 skip_len = cur_len - 2;
2675 goto choose_cur_match;
2678 next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT;
2679 next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset);
2681 best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2687 if (best_rep_len != 0 &&
2688 (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2689 best_rep_idx)) >= next_score)
2692 if (rep_score > cur_score) {
2693 /* The next match is better, and it's a
2694 * repeat offset match. */
2695 lzx_choose_literal(c, *(in_next - 2),
2697 cur_len = best_rep_len;
2698 cur_adjusted_offset = best_rep_idx;
2699 skip_len = cur_len - 1;
2700 goto choose_cur_match;
2703 if (next_score > cur_score) {
2704 /* The next match is better, and it's an
2705 * explicit offset match. */
2706 lzx_choose_literal(c, *(in_next - 2),
2709 cur_adjusted_offset = next_adjusted_offset;
2710 cur_score = next_score;
2711 goto have_cur_match;
2715 /* The original match was better; choose it. */
2716 skip_len = cur_len - 2;
2719 /* Choose a match and have the matchfinder skip over its
2720 * remaining bytes. */
2721 lzx_choose_match(c, cur_len, cur_adjusted_offset,
2722 recent_offsets, is_16_bit,
2723 &litrunlen, &next_seq);
2724 in_next = CALL_HC_MF(is_16_bit, c,
2725 hc_matchfinder_skip_positions,
2732 /* Keep going until it's time to end the block. */
2733 } while (in_next < in_max_block_end &&
2734 !(c->split_stats.num_new_observations >=
2735 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2736 in_next - in_block_begin >= MIN_BLOCK_SIZE &&
2737 in_end - in_next >= MIN_BLOCK_SIZE &&
2738 lzx_should_end_block(&c->split_stats)));
2740 /* Flush the block. */
2741 lzx_finish_sequence(next_seq, litrunlen);
2742 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2744 /* Keep going until we've reached the end of the input buffer. */
2745 } while (in_next != in_end);
2749 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2750 struct lzx_output_bitstream *os)
2752 lzx_compress_lazy(c, in, in_nbytes, os, true);
2756 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2757 struct lzx_output_bitstream *os)
2759 lzx_compress_lazy(c, in, in_nbytes, os, false);
2762 /******************************************************************************/
2763 /* Compressor operations */
2764 /*----------------------------------------------------------------------------*/
2767 * Generate tables for mapping match offsets (actually, "adjusted" match
2768 * offsets) to offset slots.
2771 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2773 u32 adjusted_offset = 0;
2777 for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2780 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2781 LZX_OFFSET_ADJUSTMENT)
2783 c->offset_slot_tab_1[adjusted_offset] = slot;
2786 /* slots [30, 49] */
2787 for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2788 adjusted_offset += (u32)1 << 14)
2790 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2791 LZX_OFFSET_ADJUSTMENT)
2793 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
2798 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2800 if (compression_level <= MAX_FAST_LEVEL) {
2801 if (lzx_is_16_bit(max_bufsize))
2802 return offsetof(struct lzx_compressor, hc_mf_16) +
2803 hc_matchfinder_size_16(max_bufsize);
2805 return offsetof(struct lzx_compressor, hc_mf_32) +
2806 hc_matchfinder_size_32(max_bufsize);
2808 if (lzx_is_16_bit(max_bufsize))
2809 return offsetof(struct lzx_compressor, bt_mf_16) +
2810 bt_matchfinder_size_16(max_bufsize);
2812 return offsetof(struct lzx_compressor, bt_mf_32) +
2813 bt_matchfinder_size_32(max_bufsize);
2817 /* Compute the amount of memory needed to allocate an LZX compressor. */
2819 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2824 if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2827 size += lzx_get_compressor_size(max_bufsize, compression_level);
2829 size += max_bufsize; /* account for in_buffer */
2833 /* Allocate an LZX compressor. */
2835 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2836 bool destructive, void **c_ret)
2838 unsigned window_order;
2839 struct lzx_compressor *c;
2841 /* Validate the maximum buffer size and get the window order from it. */
2842 window_order = lzx_get_window_order(max_bufsize);
2843 if (window_order == 0)
2844 return WIMLIB_ERR_INVALID_PARAM;
2846 /* Allocate the compressor. */
2847 c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2851 c->window_order = window_order;
2852 c->num_main_syms = lzx_get_num_main_syms(window_order);
2853 c->destructive = destructive;
2855 /* Allocate the buffer for preprocessed data if needed. */
2856 if (!c->destructive) {
2857 c->in_buffer = MALLOC(max_bufsize);
2862 if (compression_level <= MAX_FAST_LEVEL) {
2864 /* Fast compression: Use lazy parsing. */
2865 if (lzx_is_16_bit(max_bufsize))
2866 c->impl = lzx_compress_lazy_16;
2868 c->impl = lzx_compress_lazy_32;
2870 /* Scale max_search_depth and nice_match_length with the
2871 * compression level. */
2872 c->max_search_depth = (60 * compression_level) / 20;
2873 c->nice_match_length = (80 * compression_level) / 20;
2875 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2876 * halves the max_search_depth when attempting a lazy match, and
2877 * max_search_depth must be at least 1. */
2878 c->max_search_depth = max(c->max_search_depth, 2);
2881 /* Normal / high compression: Use near-optimal parsing. */
2882 if (lzx_is_16_bit(max_bufsize))
2883 c->impl = lzx_compress_near_optimal_16;
2885 c->impl = lzx_compress_near_optimal_32;
2887 /* Scale max_search_depth and nice_match_length with the
2888 * compression level. */
2889 c->max_search_depth = (24 * compression_level) / 50;
2890 c->nice_match_length = (48 * compression_level) / 50;
2892 /* Also scale num_optim_passes with the compression level. But
2893 * the more passes there are, the less they help --- so don't
2894 * add them linearly. */
2895 c->num_optim_passes = 1;
2896 c->num_optim_passes += (compression_level >= 45);
2897 c->num_optim_passes += (compression_level >= 70);
2898 c->num_optim_passes += (compression_level >= 100);
2899 c->num_optim_passes += (compression_level >= 150);
2900 c->num_optim_passes += (compression_level >= 200);
2901 c->num_optim_passes += (compression_level >= 300);
2903 /* max_search_depth must be at least 1. */
2904 c->max_search_depth = max(c->max_search_depth, 1);
2907 /* Prepare the offset => offset slot mapping. */
2908 lzx_init_offset_slot_tabs(c);
2916 return WIMLIB_ERR_NOMEM;
2919 /* Compress a buffer of data. */
2921 lzx_compress(const void *restrict in, size_t in_nbytes,
2922 void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2924 struct lzx_compressor *c = _c;
2925 struct lzx_output_bitstream os;
2928 /* Don't bother trying to compress very small inputs. */
2932 /* If the compressor is in "destructive" mode, then we can directly
2933 * preprocess the input data. Otherwise, we need to copy it into an
2934 * internal buffer first. */
2935 if (!c->destructive) {
2936 memcpy(c->in_buffer, in, in_nbytes);
2940 /* Preprocess the input data. */
2941 lzx_preprocess((void *)in, in_nbytes);
2943 /* Initially, the previous Huffman codeword lengths are all zeroes. */
2945 memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2947 /* Initialize the output bitstream. */
2948 lzx_init_output(&os, out, out_nbytes_avail);
2950 /* Call the compression level-specific compress() function. */
2951 (*c->impl)(c, in, in_nbytes, &os);
2953 /* Flush the output bitstream. */
2954 result = lzx_flush_output(&os);
2956 /* If the data did not compress to less than its original size and we
2957 * preprocessed the original buffer, then postprocess it to restore it
2958 * to its original state. */
2959 if (result == 0 && c->destructive)
2960 lzx_postprocess((void *)in, in_nbytes);
2962 /* Return the number of compressed bytes, or 0 if the input did not
2963 * compress to less than its original size. */
2967 /* Free an LZX compressor. */
2969 lzx_free_compressor(void *_c)
2971 struct lzx_compressor *c = _c;
2973 if (!c->destructive)
2978 const struct compressor_ops lzx_compressor_ops = {
2979 .get_needed_memory = lzx_get_needed_memory,
2980 .create_compressor = lzx_create_compressor,
2981 .compress = lzx_compress,
2982 .free_compressor = lzx_free_compressor,