4 * A compressor that produces output compatible with the LZX compression format.
8 * Copyright (C) 2012, 2013, 2014 Eric Biggers
10 * This file is part of wimlib, a library for working with WIM files.
12 * wimlib is free software; you can redistribute it and/or modify it under the
13 * terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your option)
17 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19 * A PARTICULAR PURPOSE. See the GNU General Public License for more
22 * You should have received a copy of the GNU General Public License
23 * along with wimlib; if not, see http://www.gnu.org/licenses/.
28 * This file contains a compressor for the LZX ("Lempel-Ziv eXtended"?)
29 * compression format, as used in the WIM (Windows IMaging) file format. This
30 * code may need some slight modifications to be used outside of the WIM format.
31 * In particular, in other situations the LZX block header might be slightly
32 * different, and a sliding window rather than a fixed-size window might be
35 * ----------------------------------------------------------------------------
39 * The primary reference for LZX is the specification released by Microsoft.
40 * However, the comments in lzx-decompress.c provide more information about LZX
41 * and note some errors in the Microsoft specification.
43 * LZX shares many similarities with DEFLATE, the format used by zlib and gzip.
44 * Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain details
45 * are quite similar, such as the method for storing Huffman codes. However,
46 * the main differences are:
48 * - LZX preprocesses the data to attempt to make x86 machine code slightly more
49 * compressible before attempting to compress it further.
51 * - LZX uses a "main" alphabet which combines literals and matches, with the
52 * match symbols containing a "length header" (giving all or part of the match
53 * length) and a "position slot" (giving, roughly speaking, the order of
54 * magnitude of the match offset).
56 * - LZX does not have static Huffman blocks (that is, the kind with preset
57 * Huffman codes); however it does have two types of dynamic Huffman blocks
58 * ("verbatim" and "aligned").
60 * - LZX has a minimum match length of 2 rather than 3.
62 * - In LZX, match offsets 0 through 2 actually represent entries in an LRU
63 * queue of match offsets. This is very useful for certain types of files,
64 * such as binary files that have repeating records.
66 * ----------------------------------------------------------------------------
68 * Algorithmic Overview
70 * At a high level, any implementation of LZX compression must operate as
73 * 1. Preprocess the input data to translate the targets of 32-bit x86 call
74 * instructions to absolute offsets. (Actually, this is required for WIM,
75 * but might not be in other places LZX is used.)
77 * 2. Find a sequence of LZ77-style matches and literal bytes that expands to
78 * the preprocessed data.
80 * 3. Divide the match/literal sequence into one or more LZX blocks, each of
81 * which may be "uncompressed", "verbatim", or "aligned".
83 * 4. Output each LZX block.
85 * Step (1) is fairly straightforward. It requires looking for 0xe8 bytes in
86 * the input data and performing a translation on the 4 bytes following each
89 * Step (4) is complicated, but it is mostly determined by the LZX format. The
90 * only real choice we have is what algorithm to use to build the length-limited
91 * canonical Huffman codes. See lzx_write_all_blocks() for details.
93 * That leaves steps (2) and (3) as where all the hard stuff happens. Focusing
94 * on step (2), we need to do LZ77-style parsing on the input data, or "window",
95 * to divide it into a sequence of matches and literals. Each position in the
96 * window might have multiple matches associated with it, and we need to choose
97 * which one, if any, to actually use. Therefore, the problem can really be
98 * divided into two areas of concern: (a) finding matches at a given position,
99 * which we shall call "match-finding", and (b) choosing whether to use a
100 * match or a literal at a given position, and if using a match, which one (if
101 * there is more than one available). We shall call this "match-choosing". We
102 * first consider match-finding, then match-choosing.
104 * ----------------------------------------------------------------------------
108 * Given a position in the window, we want to find LZ77-style "matches" with
109 * that position at previous positions in the window. With LZX, the minimum
110 * match length is 2 and the maximum match length is 257. The only restriction
111 * on offsets is that LZX does not allow the last 2 bytes of the window to match
112 * the beginning of the window.
114 * There are a number of algorithms that can be used for this, including hash
115 * chains, binary trees, and suffix arrays. Binary trees generally work well
116 * for LZX compression since it uses medium-size windows (2^15 to 2^21 bytes).
117 * However, when compressing in a fast mode where many positions are skipped
118 * (not searched for matches), hash chains are faster.
120 * Since the match-finders are not specific to LZX, I will not explain them in
121 * detail here. Instead, see lz_hash_chains.c and lz_binary_trees.c.
123 * ----------------------------------------------------------------------------
127 * Usually, choosing the longest match is best because it encodes the most data
128 * in that one item. However, sometimes the longest match is not optimal
129 * because (a) choosing a long match now might prevent using an even longer
130 * match later, or (b) more generally, what we actually care about is the number
131 * of bits it will ultimately take to output each match or literal, which is
132 * actually dependent on the entropy encoding using by the underlying
133 * compression format. Consequently, a longer match usually, but not always,
134 * takes fewer bits to encode than multiple shorter matches or literals that
135 * cover the same data.
137 * This problem of choosing the truly best match/literal sequence is probably
138 * impossible to solve efficiently when combined with entropy encoding. If we
139 * knew how many bits it takes to output each match/literal, then we could
140 * choose the optimal sequence using shortest-path search a la Dijkstra's
141 * algorithm. However, with entropy encoding, the chosen match/literal sequence
142 * affects its own encoding. Therefore, we can't know how many bits it will
143 * take to actually output any one match or literal until we have actually
144 * chosen the full sequence of matches and literals.
146 * Notwithstanding the entropy encoding problem, we also aren't guaranteed to
147 * choose the optimal match/literal sequence unless the match-finder (see
148 * section "Match-finder") provides the match-chooser with all possible matches
149 * at each position. However, this is not computationally efficient. For
150 * example, there might be many matches of the same length, and usually (but not
151 * always) the best choice is the one with the smallest offset. So in practice,
152 * it's fine to only consider the smallest offset for a given match length at a
153 * given position. (Actually, for LZX, it's also worth considering repeat
156 * In addition, as mentioned earlier, in LZX we have the choice of using
157 * multiple blocks, each of which resets the Huffman codes. This expands the
158 * search space even further. Therefore, to simplify the problem, we currently
159 * we don't attempt to actually choose the LZX blocks based on the data.
160 * Instead, we just divide the data into fixed-size blocks of LZX_DIV_BLOCK_SIZE
161 * bytes each, and always use verbatim or aligned blocks (never uncompressed).
162 * A previous version of this code recursively split the input data into
163 * equal-sized blocks, up to a maximum depth, and chose the lowest-cost block
164 * divisions. However, this made compression much slower and did not actually
165 * help very much. It remains an open question whether a sufficiently fast and
166 * useful block-splitting algorithm is possible for LZX. Essentially the same
167 * problem also applies to DEFLATE. The Microsoft LZX compressor seemingly does
168 * do block splitting, although I don't know how fast or useful it is,
171 * Now, back to the entropy encoding problem. The "solution" is to use an
172 * iterative approach to compute a good, but not necessarily optimal,
173 * match/literal sequence. Start with a fixed assignment of symbol costs and
174 * choose an "optimal" match/literal sequence based on those costs, using
175 * shortest-path seach a la Dijkstra's algorithm. Then, for each iteration of
176 * the optimization, update the costs based on the entropy encoding of the
177 * current match/literal sequence, then choose a new match/literal sequence
178 * based on the updated costs. Usually, the actual cost to output the current
179 * match/literal sequence will decrease in each iteration until it converges on
180 * a fixed point. This result may not be the truly optimal match/literal
181 * sequence, but it usually is much better than one chosen by doing a "greedy"
182 * parse where we always chooe the longest match.
184 * An alternative to both greedy parsing and iterative, near-optimal parsing is
185 * "lazy" parsing. Briefly, "lazy" parsing considers just the longest match at
186 * each position, but it waits to choose that match until it has also examined
187 * the next position. This is actually a useful approach; it's used by zlib,
188 * for example. Therefore, for fast compression we combine lazy parsing with
189 * the hash chain max-finder. For normal/high compression we combine
190 * near-optimal parsing with the binary tree match-finder.
197 #include "wimlib/compressor_ops.h"
198 #include "wimlib/compress_common.h"
199 #include "wimlib/endianness.h"
200 #include "wimlib/error.h"
201 #include "wimlib/lz_mf.h"
202 #include "wimlib/lzx.h"
203 #include "wimlib/util.h"
206 #define LZX_OPTIM_ARRAY_LENGTH 4096
208 #define LZX_DIV_BLOCK_SIZE 32768
210 #define LZX_CACHE_PER_POS 8
212 #define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
214 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
216 /* Codewords for the LZX main, length, and aligned offset Huffman codes */
217 struct lzx_codewords {
218 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
219 u32 len[LZX_LENCODE_NUM_SYMBOLS];
220 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
223 /* Codeword lengths (in bits) for the LZX main, length, and aligned offset
226 * A 0 length means the codeword has zero frequency.
229 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
230 u8 len[LZX_LENCODE_NUM_SYMBOLS];
231 u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
234 /* Costs for the LZX main, length, and aligned offset Huffman symbols.
236 * If a codeword has zero frequency, it must still be assigned some nonzero cost
237 * --- generally a high cost, since even if it gets used in the next iteration,
238 * it probably will not be used very many times. */
240 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
241 u8 len[LZX_LENCODE_NUM_SYMBOLS];
242 u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
245 /* The LZX main, length, and aligned offset Huffman codes */
247 struct lzx_codewords codewords;
248 struct lzx_lens lens;
251 /* Tables for tallying symbol frequencies in the three LZX alphabets */
253 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
254 u32 len[LZX_LENCODE_NUM_SYMBOLS];
255 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
258 /* LZX intermediate match/literal format */
262 * 31 1 if a match, 0 if a literal.
264 * 30-25 position slot. This can be at most 50, so it will fit in 6
267 * 8-24 position footer. This is the offset of the real formatted
268 * offset from the position base. This can be at most 17 bits
269 * (since lzx_extra_bits[LZX_MAX_POSITION_SLOTS - 1] is 17).
271 * 0-7 length of match, minus 2. This can be at most
272 * (LZX_MAX_MATCH_LEN - 2) == 255, so it will fit in 8 bits. */
276 /* Specification for an LZX block. */
277 struct lzx_block_spec {
279 /* One of the LZX_BLOCKTYPE_* constants indicating which type of this
283 /* 0-based position in the window at which this block starts. */
286 /* The number of bytes of uncompressed data this block represents. */
289 /* The match/literal sequence for this block. */
290 struct lzx_item *chosen_items;
292 /* The length of the @chosen_items sequence. */
293 u32 num_chosen_items;
295 /* Huffman codes for this block. */
296 struct lzx_codes codes;
299 struct lzx_compressor;
301 struct lzx_compressor_params {
302 struct lz_match (*choose_item_func)(struct lzx_compressor *);
303 enum lz_mf_algo mf_algo;
304 u32 num_optim_passes;
305 u32 min_match_length;
306 u32 nice_match_length;
307 u32 max_search_depth;
310 /* State of the LZX compressor. */
311 struct lzx_compressor {
313 /* The buffer of data to be compressed.
315 * 0xe8 byte preprocessing is done directly on the data here before
316 * further compression.
318 * Note that this compressor does *not* use a real sliding window!!!!
319 * It's not needed in the WIM format, since every chunk is compressed
320 * independently. This is by design, to allow random access to the
324 /* Number of bytes of data to be compressed, which is the number of
325 * bytes of data in @cur_window that are actually valid. */
328 /* Allocated size of @cur_window. */
331 /* log2 order of the LZX window size for LZ match offset encoding
332 * purposes. Will be >= LZX_MIN_WINDOW_ORDER and <=
333 * LZX_MAX_WINDOW_ORDER.
335 * Note: 1 << @window_order is normally equal to @max_window_size, but
336 * it will be greater than @max_window_size in the event that the
337 * compressor was created with a non-power-of-2 block size. (See
338 * lzx_get_window_order().) */
339 unsigned window_order;
341 /* Compression parameters. */
342 struct lzx_compressor_params params;
344 unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **);
345 void (*skip_bytes_func)(struct lzx_compressor *, unsigned n);
347 /* Number of symbols in the main alphabet (depends on the @window_order
348 * since it determines the maximum allowed offset). */
349 unsigned num_main_syms;
351 /* The current match offset LRU queue. */
352 struct lzx_lru_queue queue;
354 /* Space for the sequences of matches/literals that were chosen for each
356 struct lzx_item *chosen_items;
358 /* Information about the LZX blocks the preprocessed input was divided
360 struct lzx_block_spec *block_specs;
362 /* Number of LZX blocks the input was divided into; a.k.a. the number of
363 * elements of @block_specs that are valid. */
366 /* This is simply filled in with zeroes and used to avoid special-casing
367 * the output of the first compressed Huffman code, which conceptually
368 * has a delta taken from a code with all symbols having zero-length
370 struct lzx_codes zero_codes;
372 /* The current cost model. */
373 struct lzx_costs costs;
375 /* Lempel-Ziv match-finder. */
378 /* Position in window of next match to return. */
379 u32 match_window_pos;
381 /* The end-of-block position. We can't allow any matches to span this
383 u32 match_window_end;
385 /* When doing more than one match-choosing pass over the data, matches
386 * found by the match-finder are cached in the following array to
387 * achieve a slight speedup when the same matches are needed on
388 * subsequent passes. This is suboptimal because different matches may
389 * be preferred with different cost models, but seems to be a worthwhile
391 struct lz_match *cached_matches;
392 struct lz_match *cache_ptr;
393 struct lz_match *cache_limit;
395 /* Match-chooser state, used when doing near-optimal parsing.
397 * When matches have been chosen, optimum_cur_idx is set to the position
398 * in the window of the next match/literal to return and optimum_end_idx
399 * is set to the position in the window at the end of the last
400 * match/literal to return. */
401 struct lzx_mc_pos_data *optimum;
402 unsigned optimum_cur_idx;
403 unsigned optimum_end_idx;
405 /* Previous match, used when doing lazy parsing. */
406 struct lz_match prev_match;
410 * Match chooser position data:
412 * An array of these structures is used during the match-choosing algorithm.
413 * They correspond to consecutive positions in the window and are used to keep
414 * track of the cost to reach each position, and the match/literal choices that
415 * need to be chosen to reach that position.
417 struct lzx_mc_pos_data {
418 /* The approximate minimum cost, in bits, to reach this position in the
419 * window which has been found so far. */
421 #define MC_INFINITE_COST ((u32)~0UL)
423 /* The union here is just for clarity, since the fields are used in two
424 * slightly different ways. Initially, the @prev structure is filled in
425 * first, and links go from later in the window to earlier in the
426 * window. Later, @next structure is filled in and links go from
427 * earlier in the window to later in the window. */
430 /* Position of the start of the match or literal that
431 * was taken to get to this position in the approximate
432 * minimum-cost parse. */
435 /* Offset (as in an LZ (length, offset) pair) of the
436 * match or literal that was taken to get to this
437 * position in the approximate minimum-cost parse. */
441 /* Position at which the match or literal starting at
442 * this position ends in the minimum-cost parse. */
445 /* Offset (as in an LZ (length, offset) pair) of the
446 * match or literal starting at this position in the
447 * approximate minimum-cost parse. */
452 /* Adaptive state that exists after an approximate minimum-cost path to
453 * reach this position is taken.
455 * Note: we update this whenever we update the pending minimum-cost
456 * path. This is in contrast to LZMA, which also has an optimal parser
457 * that maintains a repeat offset queue per position, but will only
458 * compute the queue once that position is actually reached in the
459 * parse, meaning that matches are being considered *starting* at that
460 * position. However, the two methods seem to have approximately the
461 * same performance if appropriate optimizations are used. Intuitively
462 * the LZMA method seems faster, but it actually suffers from 1-2 extra
463 * hard-to-predict branches at each position. Probably it works better
464 * for LZMA than LZX because LZMA has a larger adaptive state than LZX,
465 * and the LZMA encoder considers more possibilities. */
466 struct lzx_lru_queue queue;
471 * Structure to keep track of the current state of sending bits to the
472 * compressed output buffer.
474 * The LZX bitstream is encoded as a sequence of 16-bit coding units.
476 struct lzx_output_bitstream {
478 /* Bits that haven't yet been written to the output buffer. */
481 /* Number of bits currently held in @bitbuf. */
484 /* Pointer to the start of the output buffer. */
487 /* Pointer to the position in the output buffer at which the next coding
488 * unit should be written. */
491 /* Pointer past the end of the output buffer. */
496 * Initialize the output bitstream.
499 * The output bitstream structure to initialize.
501 * The buffer being written to.
503 * Size of @buffer, in bytes.
506 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
511 os->next = os->start;
512 os->end = os->start + size / sizeof(le16);
516 * Write some bits to the output bitstream.
518 * The bits are given by the low-order @num_bits bits of @bits. Higher-order
519 * bits in @bits cannot be set. At most 17 bits can be written at once.
521 * @max_bits is a compile-time constant that specifies the maximum number of
522 * bits that can ever be written at the call site. Currently, it is used to
523 * optimize away the conditional code for writing a second 16-bit coding unit
524 * when writing fewer than 17 bits.
526 * If the output buffer space is exhausted, then the bits will be ignored, and
527 * lzx_flush_output() will return 0 when it gets called.
529 static _always_inline_attribute void
530 lzx_write_varbits(struct lzx_output_bitstream *os,
531 const u32 bits, const unsigned int num_bits,
532 const unsigned int max_num_bits)
534 /* This code is optimized for LZX, which never needs to write more than
535 * 17 bits at once. */
536 LZX_ASSERT(num_bits <= 17);
537 LZX_ASSERT(num_bits <= max_num_bits);
538 LZX_ASSERT(os->bitcount <= 15);
540 /* Add the bits to the bit buffer variable. @bitcount will be at most
541 * 15, so there will be just enough space for the maximum possible
542 * @num_bits of 17. */
543 os->bitcount += num_bits;
544 os->bitbuf = (os->bitbuf << num_bits) | bits;
546 /* Check whether any coding units need to be written. */
547 if (os->bitcount >= 16) {
551 /* Write a coding unit, unless it would overflow the buffer. */
552 if (os->next != os->end)
553 *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount);
555 /* If writing 17 bits, a second coding unit might need to be
556 * written. But because 'max_num_bits' is a compile-time
557 * constant, the compiler will optimize away this code at most
559 if (max_num_bits == 17 && os->bitcount == 16) {
560 if (os->next != os->end)
561 *os->next++ = cpu_to_le16(os->bitbuf);
567 /* Use when @num_bits is a compile-time constant. Otherwise use
568 * lzx_write_varbits(). */
569 static _always_inline_attribute void
570 lzx_write_bits(struct lzx_output_bitstream *os,
571 const u32 bits, const unsigned int num_bits)
573 lzx_write_varbits(os, bits, num_bits, num_bits);
577 * Flush the last coding unit to the output buffer if needed. Return the total
578 * number of bytes written to the output buffer, or 0 if an overflow occurred.
581 lzx_flush_output(struct lzx_output_bitstream *os)
583 if (os->next == os->end)
586 if (os->bitcount != 0)
587 *os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
589 return (const u8 *)os->next - (const u8 *)os->start;
592 /* Returns the LZX position slot that corresponds to a given match offset,
593 * taking into account the recent offset queue and updating it if the offset is
596 lzx_get_position_slot(u32 offset, struct lzx_lru_queue *queue)
598 unsigned position_slot;
600 /* See if the offset was recently used. */
601 for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
602 if (offset == queue->R[i]) {
605 /* Bring the repeat offset to the front of the
606 * queue. Note: this is, in fact, not a real
607 * LRU queue because repeat matches are simply
608 * swapped to the front. */
609 swap(queue->R[0], queue->R[i]);
611 /* The resulting position slot is simply the first index
612 * at which the offset was found in the queue. */
617 /* The offset was not recently used; look up its real position slot. */
618 position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
620 /* Bring the new offset to the front of the queue. */
621 for (int i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--)
622 queue->R[i] = queue->R[i - 1];
623 queue->R[0] = offset;
625 return position_slot;
628 /* Build the main, length, and aligned offset Huffman codes used in LZX.
630 * This takes as input the frequency tables for each code and produces as output
631 * a set of tables that map symbols to codewords and codeword lengths. */
633 lzx_make_huffman_codes(const struct lzx_freqs *freqs,
634 struct lzx_codes *codes,
635 unsigned num_main_syms)
637 make_canonical_huffman_code(num_main_syms,
638 LZX_MAX_MAIN_CODEWORD_LEN,
641 codes->codewords.main);
643 make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
644 LZX_MAX_LEN_CODEWORD_LEN,
647 codes->codewords.len);
649 make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
650 LZX_MAX_ALIGNED_CODEWORD_LEN,
653 codes->codewords.aligned);
657 * Output a precomputed LZX match.
660 * The bitstream to which to write the match.
662 * The type of the LZX block (LZX_BLOCKTYPE_ALIGNED or
663 * LZX_BLOCKTYPE_VERBATIM)
667 * Pointer to a structure that contains the codewords for the main, length,
668 * and aligned offset Huffman codes for the current LZX compressed block.
671 lzx_write_match(struct lzx_output_bitstream *os, int block_type,
672 struct lzx_item match, const struct lzx_codes *codes)
674 unsigned match_len_minus_2 = match.data & 0xff;
675 u32 position_footer = (match.data >> 8) & 0x1ffff;
676 unsigned position_slot = (match.data >> 25) & 0x3f;
679 unsigned main_symbol;
680 unsigned num_extra_bits;
682 /* If the match length is less than MIN_MATCH_LEN (= 2) +
683 * NUM_PRIMARY_LENS (= 7), the length header contains the match length
684 * minus MIN_MATCH_LEN, and there is no length footer.
686 * Otherwise, the length header contains NUM_PRIMARY_LENS, and the
687 * length footer contains the match length minus NUM_PRIMARY_LENS minus
689 if (match_len_minus_2 < LZX_NUM_PRIMARY_LENS) {
690 len_header = match_len_minus_2;
692 len_header = LZX_NUM_PRIMARY_LENS;
693 len_footer = match_len_minus_2 - LZX_NUM_PRIMARY_LENS;
696 /* Combine the position slot with the length header into a single symbol
697 * that will be encoded with the main code.
699 * The actual main symbol is offset by LZX_NUM_CHARS because values
700 * under LZX_NUM_CHARS are used to indicate a literal byte rather than a
702 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
704 /* Output main symbol. */
705 lzx_write_varbits(os, codes->codewords.main[main_symbol],
706 codes->lens.main[main_symbol],
707 LZX_MAX_MAIN_CODEWORD_LEN);
709 /* If there is a length footer, output it using the
710 * length Huffman code. */
711 if (len_header == LZX_NUM_PRIMARY_LENS) {
712 lzx_write_varbits(os, codes->codewords.len[len_footer],
713 codes->lens.len[len_footer],
714 LZX_MAX_LEN_CODEWORD_LEN);
717 /* Output the position footer. */
719 num_extra_bits = lzx_get_num_extra_bits(position_slot);
721 if ((block_type == LZX_BLOCKTYPE_ALIGNED) && (num_extra_bits >= 3)) {
723 /* Aligned offset blocks: The low 3 bits of the position footer
724 * are Huffman-encoded using the aligned offset code. The
725 * remaining bits are output literally. */
727 lzx_write_varbits(os,
728 position_footer >> 3, num_extra_bits - 3, 14);
730 lzx_write_varbits(os,
731 codes->codewords.aligned[position_footer & 7],
732 codes->lens.aligned[position_footer & 7],
733 LZX_MAX_ALIGNED_CODEWORD_LEN);
735 /* Verbatim blocks, or fewer than 3 extra bits: All position
736 * footer bits are output literally. */
737 lzx_write_varbits(os, position_footer, num_extra_bits, 17);
741 /* Output an LZX literal (encoded with the main Huffman code). */
743 lzx_write_literal(struct lzx_output_bitstream *os, unsigned literal,
744 const struct lzx_codes *codes)
746 lzx_write_varbits(os, codes->codewords.main[literal],
747 codes->lens.main[literal], LZX_MAX_MAIN_CODEWORD_LEN);
751 lzx_build_precode(const u8 lens[restrict],
752 const u8 prev_lens[restrict],
753 const unsigned num_syms,
754 u32 precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS],
755 u8 output_syms[restrict num_syms],
756 u8 precode_lens[restrict LZX_PRECODE_NUM_SYMBOLS],
757 u32 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS],
758 unsigned *num_additional_bits_ret)
760 memset(precode_freqs, 0,
761 LZX_PRECODE_NUM_SYMBOLS * sizeof(precode_freqs[0]));
763 /* Since the code word lengths use a form of RLE encoding, the goal here
764 * is to find each run of identical lengths when going through them in
765 * symbol order (including runs of length 1). For each run, as many
766 * lengths are encoded using RLE as possible, and the rest are output
769 * output_syms[] will be filled in with the length symbols that will be
770 * output, including RLE codes, not yet encoded using the precode.
772 * cur_run_len keeps track of how many code word lengths are in the
773 * current run of identical lengths. */
774 unsigned output_syms_idx = 0;
775 unsigned cur_run_len = 1;
776 unsigned num_additional_bits = 0;
777 for (unsigned i = 1; i <= num_syms; i++) {
779 if (i != num_syms && lens[i] == lens[i - 1]) {
780 /* Still in a run--- keep going. */
785 /* Run ended! Check if it is a run of zeroes or a run of
788 /* The symbol that was repeated in the run--- not to be confused
789 * with the length *of* the run (cur_run_len) */
790 unsigned len_in_run = lens[i - 1];
792 if (len_in_run == 0) {
793 /* A run of 0's. Encode it in as few length
794 * codes as we can. */
796 /* The magic length 18 indicates a run of 20 + n zeroes,
797 * where n is an uncompressed literal 5-bit integer that
798 * follows the magic length. */
799 while (cur_run_len >= 20) {
800 unsigned additional_bits;
802 additional_bits = min(cur_run_len - 20, 0x1f);
803 num_additional_bits += 5;
805 output_syms[output_syms_idx++] = 18;
806 output_syms[output_syms_idx++] = additional_bits;
807 cur_run_len -= 20 + additional_bits;
810 /* The magic length 17 indicates a run of 4 + n zeroes,
811 * where n is an uncompressed literal 4-bit integer that
812 * follows the magic length. */
813 while (cur_run_len >= 4) {
814 unsigned additional_bits;
816 additional_bits = min(cur_run_len - 4, 0xf);
817 num_additional_bits += 4;
819 output_syms[output_syms_idx++] = 17;
820 output_syms[output_syms_idx++] = additional_bits;
821 cur_run_len -= 4 + additional_bits;
826 /* A run of nonzero lengths. */
828 /* The magic length 19 indicates a run of 4 + n
829 * nonzeroes, where n is a literal bit that follows the
830 * magic length, and where the value of the lengths in
831 * the run is given by an extra length symbol, encoded
832 * with the precode, that follows the literal bit.
834 * The extra length symbol is encoded as a difference
835 * from the length of the codeword for the first symbol
836 * in the run in the previous code.
838 while (cur_run_len >= 4) {
839 unsigned additional_bits;
842 additional_bits = (cur_run_len > 4);
843 num_additional_bits += 1;
844 delta = (signed char)prev_lens[i - cur_run_len] -
845 (signed char)len_in_run;
849 precode_freqs[(unsigned char)delta]++;
850 output_syms[output_syms_idx++] = 19;
851 output_syms[output_syms_idx++] = additional_bits;
852 output_syms[output_syms_idx++] = delta;
853 cur_run_len -= 4 + additional_bits;
857 /* Any remaining lengths in the run are outputted without RLE,
858 * as a difference from the length of that codeword in the
860 while (cur_run_len > 0) {
863 delta = (signed char)prev_lens[i - cur_run_len] -
864 (signed char)len_in_run;
868 precode_freqs[(unsigned char)delta]++;
869 output_syms[output_syms_idx++] = delta;
876 /* Build the precode from the frequencies of the length symbols. */
878 make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
879 LZX_MAX_PRE_CODEWORD_LEN,
880 precode_freqs, precode_lens,
883 *num_additional_bits_ret = num_additional_bits;
885 return output_syms_idx;
889 * Output a Huffman code in the compressed form used in LZX.
891 * The Huffman code is represented in the output as a logical series of codeword
892 * lengths from which the Huffman code, which must be in canonical form, can be
895 * The codeword lengths are themselves compressed using a separate Huffman code,
896 * the "precode", which contains a symbol for each possible codeword length in
897 * the larger code as well as several special symbols to represent repeated
898 * codeword lengths (a form of run-length encoding). The precode is itself
899 * constructed in canonical form, and its codeword lengths are represented
900 * literally in 20 4-bit fields that immediately precede the compressed codeword
901 * lengths of the larger code.
903 * Furthermore, the codeword lengths of the larger code are actually represented
904 * as deltas from the codeword lengths of the corresponding code in the previous
908 * Bitstream to which to write the compressed Huffman code.
910 * The codeword lengths, indexed by symbol, in the Huffman code.
912 * The codeword lengths, indexed by symbol, in the corresponding Huffman
913 * code in the previous block, or all zeroes if this is the first block.
915 * The number of symbols in the Huffman code.
918 lzx_write_compressed_code(struct lzx_output_bitstream *os,
919 const u8 lens[restrict],
920 const u8 prev_lens[restrict],
923 u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
924 u8 output_syms[num_syms];
925 u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
926 u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
928 unsigned num_output_syms;
932 num_output_syms = lzx_build_precode(lens,
941 /* Write the lengths of the precode codes to the output. */
942 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
943 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
945 /* Write the length symbols, encoded with the precode, to the output. */
947 for (i = 0; i < num_output_syms; ) {
948 precode_sym = output_syms[i++];
950 lzx_write_varbits(os, precode_codewords[precode_sym],
951 precode_lens[precode_sym],
952 LZX_MAX_PRE_CODEWORD_LEN);
953 switch (precode_sym) {
955 lzx_write_bits(os, output_syms[i++], 4);
958 lzx_write_bits(os, output_syms[i++], 5);
961 lzx_write_bits(os, output_syms[i++], 1);
962 lzx_write_varbits(os, precode_codewords[output_syms[i]],
963 precode_lens[output_syms[i]],
964 LZX_MAX_PRE_CODEWORD_LEN);
974 * Write all matches and literal bytes (which were precomputed) in an LZX
975 * compressed block to the output bitstream in the final compressed
979 * The output bitstream.
981 * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
982 * LZX_BLOCKTYPE_VERBATIM).
984 * The array of matches/literals to output.
986 * Number of matches/literals to output (length of @items).
988 * The main, length, and aligned offset Huffman codes for the current
989 * LZX compressed block.
992 lzx_write_items(struct lzx_output_bitstream *os, int block_type,
993 const struct lzx_item items[], u32 num_items,
994 const struct lzx_codes *codes)
996 for (u32 i = 0; i < num_items; i++) {
997 /* The high bit of the 32-bit intermediate representation
998 * indicates whether the item is an actual LZ-style match (1) or
999 * a literal byte (0). */
1000 if (items[i].data & 0x80000000)
1001 lzx_write_match(os, block_type, items[i], codes);
1003 lzx_write_literal(os, items[i].data, codes);
1007 /* Write an LZX aligned offset or verbatim block to the output. */
1009 lzx_write_compressed_block(int block_type,
1011 unsigned window_order,
1012 unsigned num_main_syms,
1013 struct lzx_item * chosen_items,
1014 u32 num_chosen_items,
1015 const struct lzx_codes * codes,
1016 const struct lzx_codes * prev_codes,
1017 struct lzx_output_bitstream * os)
1019 LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED ||
1020 block_type == LZX_BLOCKTYPE_VERBATIM);
1022 /* The first three bits indicate the type of block and are one of the
1023 * LZX_BLOCKTYPE_* constants. */
1024 lzx_write_bits(os, block_type, 3);
1026 /* Output the block size.
1028 * The original LZX format seemed to always encode the block size in 3
1029 * bytes. However, the implementation in WIMGAPI, as used in WIM files,
1030 * uses the first bit to indicate whether the block is the default size
1031 * (32768) or a different size given explicitly by the next 16 bits.
1033 * By default, this compressor uses a window size of 32768 and therefore
1034 * follows the WIMGAPI behavior. However, this compressor also supports
1035 * window sizes greater than 32768 bytes, which do not appear to be
1036 * supported by WIMGAPI. In such cases, we retain the default size bit
1037 * to mean a size of 32768 bytes but output non-default block size in 24
1038 * bits rather than 16. The compatibility of this behavior is unknown
1039 * because WIMs created with chunk size greater than 32768 can seemingly
1040 * only be opened by wimlib anyway. */
1041 if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1042 lzx_write_bits(os, 1, 1);
1044 lzx_write_bits(os, 0, 1);
1046 if (window_order >= 16)
1047 lzx_write_bits(os, block_size >> 16, 8);
1049 lzx_write_bits(os, block_size & 0xFFFF, 16);
1052 /* Output the aligned offset code. */
1053 if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1054 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1055 lzx_write_bits(os, codes->lens.aligned[i],
1056 LZX_ALIGNEDCODE_ELEMENT_SIZE);
1060 /* Output the main code (two parts). */
1061 lzx_write_compressed_code(os, codes->lens.main,
1062 prev_codes->lens.main,
1064 lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1065 prev_codes->lens.main + LZX_NUM_CHARS,
1066 num_main_syms - LZX_NUM_CHARS);
1068 /* Output the length code. */
1069 lzx_write_compressed_code(os, codes->lens.len,
1070 prev_codes->lens.len,
1071 LZX_LENCODE_NUM_SYMBOLS);
1073 /* Output the compressed matches and literals. */
1074 lzx_write_items(os, block_type, chosen_items, num_chosen_items, codes);
1077 /* Write out the LZX blocks that were computed. */
1079 lzx_write_all_blocks(struct lzx_compressor *c, struct lzx_output_bitstream *os)
1082 const struct lzx_codes *prev_codes = &c->zero_codes;
1083 for (unsigned i = 0; i < c->num_blocks; i++) {
1084 const struct lzx_block_spec *spec = &c->block_specs[i];
1086 lzx_write_compressed_block(spec->block_type,
1091 spec->num_chosen_items,
1096 prev_codes = &spec->codes;
1100 /* Constructs an LZX match from a literal byte and updates the main code symbol
1103 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
1109 /* Constructs an LZX match from an offset and a length, and updates the LRU
1110 * queue and the frequency of symbols in the main, length, and aligned offset
1111 * alphabets. The return value is a 32-bit number that provides the match in an
1112 * intermediate representation documented below. */
1114 lzx_tally_match(unsigned match_len, u32 match_offset,
1115 struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
1117 unsigned position_slot;
1118 u32 position_footer;
1120 unsigned main_symbol;
1121 unsigned len_footer;
1122 unsigned adjusted_match_len;
1124 LZX_ASSERT(match_len >= LZX_MIN_MATCH_LEN && match_len <= LZX_MAX_MATCH_LEN);
1126 /* The match offset shall be encoded as a position slot (itself encoded
1127 * as part of the main symbol) and a position footer. */
1128 position_slot = lzx_get_position_slot(match_offset, queue);
1129 position_footer = (match_offset + LZX_OFFSET_OFFSET) &
1130 (((u32)1 << lzx_get_num_extra_bits(position_slot)) - 1);
1132 /* The match length shall be encoded as a length header (itself encoded
1133 * as part of the main symbol) and an optional length footer. */
1134 adjusted_match_len = match_len - LZX_MIN_MATCH_LEN;
1135 if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) {
1136 /* No length footer needed. */
1137 len_header = adjusted_match_len;
1139 /* Length footer needed. It will be encoded using the length
1141 len_header = LZX_NUM_PRIMARY_LENS;
1142 len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS;
1143 freqs->len[len_footer]++;
1146 /* Account for the main symbol. */
1147 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1149 freqs->main[main_symbol]++;
1151 /* In an aligned offset block, 3 bits of the position footer are output
1152 * as an aligned offset symbol. Account for this, although we may
1153 * ultimately decide to output the block as verbatim. */
1155 /* The following check is equivalent to:
1157 * if (lzx_extra_bits[position_slot] >= 3)
1159 * Note that this correctly excludes position slots that correspond to
1160 * recent offsets. */
1161 if (position_slot >= 8)
1162 freqs->aligned[position_footer & 7]++;
1164 /* Pack the position slot, position footer, and match length into an
1165 * intermediate representation. See `struct lzx_item' for details.
1167 LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
1168 LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
1169 LZX_ASSERT(LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1 <= 256);
1171 LZX_ASSERT(position_slot <= (1U << (31 - 25)) - 1);
1172 LZX_ASSERT(position_footer <= (1U << (25 - 8)) - 1);
1173 LZX_ASSERT(adjusted_match_len <= (1U << (8 - 0)) - 1);
1175 (position_slot << 25) |
1176 (position_footer << 8) |
1177 (adjusted_match_len);
1180 /* Returns the cost, in bits, to output a literal byte using the specified cost
1183 lzx_literal_cost(u8 c, const struct lzx_costs * costs)
1185 return costs->main[c];
1188 /* Returns the cost, in bits, to output a repeat offset match of the specified
1189 * length and position slot (repeat index) using the specified cost model. */
1191 lzx_repmatch_cost(u32 len, unsigned position_slot, const struct lzx_costs *costs)
1193 unsigned len_header, main_symbol;
1196 len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1197 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1199 /* Account for main symbol. */
1200 cost += costs->main[main_symbol];
1202 /* Account for extra length information. */
1203 if (len_header == LZX_NUM_PRIMARY_LENS)
1204 cost += costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1209 /* Set the cost model @c->costs from the Huffman codeword lengths specified in
1212 * The cost model and codeword lengths are almost the same thing, but the
1213 * Huffman codewords with length 0 correspond to symbols with zero frequency
1214 * that still need to be assigned actual costs. The specific values assigned
1215 * are arbitrary, but they should be fairly high (near the maximum codeword
1216 * length) to take into account the fact that uses of these symbols are expected
1219 lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens,
1225 for (i = 0; i < c->num_main_syms; i++)
1226 c->costs.main[i] = lens->main[i] ? lens->main[i] : nostat;
1229 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1230 c->costs.len[i] = lens->len[i] ? lens->len[i] : nostat;
1232 /* Aligned offset code */
1233 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1234 c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : nostat / 2;
1237 /* Don't allow matches to span the end of an LZX block. */
1239 maybe_truncate_matches(struct lz_match matches[], u32 num_matches,
1240 struct lzx_compressor *c)
1242 if (c->match_window_end < c->cur_window_size && num_matches != 0) {
1243 u32 limit = c->match_window_end - c->match_window_pos;
1245 if (limit >= LZX_MIN_MATCH_LEN) {
1247 u32 i = num_matches - 1;
1249 if (matches[i].len >= limit) {
1250 matches[i].len = limit;
1252 /* Truncation might produce multiple
1253 * matches with length 'limit'. Keep at
1255 num_matches = i + 1;
1266 lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
1267 const struct lz_match **matches_ret)
1269 struct lz_match *cache_ptr;
1270 struct lz_match *matches;
1271 unsigned num_matches;
1273 cache_ptr = c->cache_ptr;
1274 matches = cache_ptr + 1;
1275 if (likely(cache_ptr <= c->cache_limit)) {
1276 num_matches = lz_mf_get_matches(c->mf, matches);
1277 cache_ptr->len = num_matches;
1278 c->cache_ptr = matches + num_matches;
1282 c->match_window_pos++;
1283 *matches_ret = matches;
1288 lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
1289 const struct lz_match **matches_ret)
1291 struct lz_match *cache_ptr;
1292 struct lz_match *matches;
1293 unsigned num_matches;
1295 cache_ptr = c->cache_ptr;
1296 matches = cache_ptr + 1;
1297 if (likely(cache_ptr <= c->cache_limit)) {
1298 num_matches = lz_mf_get_matches(c->mf, matches);
1299 num_matches = maybe_truncate_matches(matches, num_matches, c);
1300 cache_ptr->len = num_matches;
1301 c->cache_ptr = matches + num_matches;
1305 c->match_window_pos++;
1306 *matches_ret = matches;
1311 lzx_get_matches_usecache(struct lzx_compressor *c,
1312 const struct lz_match **matches_ret)
1314 struct lz_match *cache_ptr;
1315 struct lz_match *matches;
1316 unsigned num_matches;
1318 cache_ptr = c->cache_ptr;
1319 matches = cache_ptr + 1;
1320 if (cache_ptr <= c->cache_limit) {
1321 num_matches = cache_ptr->len;
1322 c->cache_ptr = matches + num_matches;
1326 c->match_window_pos++;
1327 *matches_ret = matches;
1332 lzx_get_matches_usecache_nocheck(struct lzx_compressor *c,
1333 const struct lz_match **matches_ret)
1335 struct lz_match *cache_ptr;
1336 struct lz_match *matches;
1337 unsigned num_matches;
1339 cache_ptr = c->cache_ptr;
1340 matches = cache_ptr + 1;
1341 num_matches = cache_ptr->len;
1342 c->cache_ptr = matches + num_matches;
1343 c->match_window_pos++;
1344 *matches_ret = matches;
1349 lzx_get_matches_nocache_singleblock(struct lzx_compressor *c,
1350 const struct lz_match **matches_ret)
1352 struct lz_match *matches;
1353 unsigned num_matches;
1355 matches = c->cache_ptr;
1356 num_matches = lz_mf_get_matches(c->mf, matches);
1357 c->match_window_pos++;
1358 *matches_ret = matches;
1363 lzx_get_matches_nocache_multiblock(struct lzx_compressor *c,
1364 const struct lz_match **matches_ret)
1366 struct lz_match *matches;
1367 unsigned num_matches;
1369 matches = c->cache_ptr;
1370 num_matches = lz_mf_get_matches(c->mf, matches);
1371 num_matches = maybe_truncate_matches(matches, num_matches, c);
1372 c->match_window_pos++;
1373 *matches_ret = matches;
1378 * Find matches at the next position in the window.
1380 * Returns the number of matches found and sets *matches_ret to point to the
1381 * matches array. The matches will be sorted by strictly increasing length and
1384 static inline unsigned
1385 lzx_get_matches(struct lzx_compressor *c,
1386 const struct lz_match **matches_ret)
1388 return (*c->get_matches_func)(c, matches_ret);
1392 lzx_skip_bytes_fillcache(struct lzx_compressor *c, unsigned n)
1394 struct lz_match *cache_ptr;
1396 cache_ptr = c->cache_ptr;
1397 c->match_window_pos += n;
1398 lz_mf_skip_positions(c->mf, n);
1399 if (cache_ptr <= c->cache_limit) {
1403 } while (--n && cache_ptr <= c->cache_limit);
1405 c->cache_ptr = cache_ptr;
1409 lzx_skip_bytes_usecache(struct lzx_compressor *c, unsigned n)
1411 struct lz_match *cache_ptr;
1413 cache_ptr = c->cache_ptr;
1414 c->match_window_pos += n;
1415 if (cache_ptr <= c->cache_limit) {
1417 cache_ptr += 1 + cache_ptr->len;
1418 } while (--n && cache_ptr <= c->cache_limit);
1420 c->cache_ptr = cache_ptr;
1424 lzx_skip_bytes_usecache_nocheck(struct lzx_compressor *c, unsigned n)
1426 struct lz_match *cache_ptr;
1428 cache_ptr = c->cache_ptr;
1429 c->match_window_pos += n;
1431 cache_ptr += 1 + cache_ptr->len;
1433 c->cache_ptr = cache_ptr;
1437 lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n)
1439 c->match_window_pos += n;
1440 lz_mf_skip_positions(c->mf, n);
1444 * Skip the specified number of positions in the window (don't search for
1448 lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
1450 return (*c->skip_bytes_func)(c, n);
1454 * Reverse the linked list of near-optimal matches so that they can be returned
1455 * in forwards order.
1457 * Returns the first match in the list.
1459 static struct lz_match
1460 lzx_match_chooser_reverse_list(struct lzx_compressor *c, unsigned cur_pos)
1462 unsigned prev_link, saved_prev_link;
1463 unsigned prev_match_offset, saved_prev_match_offset;
1465 c->optimum_end_idx = cur_pos;
1467 saved_prev_link = c->optimum[cur_pos].prev.link;
1468 saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset;
1471 prev_link = saved_prev_link;
1472 prev_match_offset = saved_prev_match_offset;
1474 saved_prev_link = c->optimum[prev_link].prev.link;
1475 saved_prev_match_offset = c->optimum[prev_link].prev.match_offset;
1477 c->optimum[prev_link].next.link = cur_pos;
1478 c->optimum[prev_link].next.match_offset = prev_match_offset;
1480 cur_pos = prev_link;
1481 } while (cur_pos != 0);
1483 c->optimum_cur_idx = c->optimum[0].next.link;
1485 return (struct lz_match)
1486 { .len = c->optimum_cur_idx,
1487 .offset = c->optimum[0].next.match_offset,
1492 * lzx_choose_near_optimal_match() -
1494 * Choose an approximately optimal match or literal to use at the next position
1495 * in the string, or "window", being LZ-encoded.
1497 * This is based on algorithms used in 7-Zip, including the DEFLATE encoder
1498 * and the LZMA encoder, written by Igor Pavlov.
1500 * Unlike a greedy parser that always takes the longest match, or even a "lazy"
1501 * parser with one match/literal look-ahead like zlib, the algorithm used here
1502 * may look ahead many matches/literals to determine the approximately optimal
1503 * match/literal to code next. The motivation is that the compression ratio is
1504 * improved if the compressor can do things like use a shorter-than-possible
1505 * match in order to allow a longer match later, and also take into account the
1506 * estimated real cost of coding each match/literal based on the underlying
1509 * Still, this is not a true optimal parser for several reasons:
1511 * - Real compression formats use entropy encoding of the literal/match
1512 * sequence, so the real cost of coding each match or literal is unknown until
1513 * the parse is fully determined. It can be approximated based on iterative
1514 * parses, but the end result is not guaranteed to be globally optimal.
1516 * - Very long matches are chosen immediately. This is because locations with
1517 * long matches are likely to have many possible alternatives that would cause
1518 * slow optimal parsing, but also such locations are already highly
1519 * compressible so it is not too harmful to just grab the longest match.
1521 * - Not all possible matches at each location are considered because the
1522 * underlying match-finder limits the number and type of matches produced at
1523 * each position. For example, for a given match length it's usually not
1524 * worth it to only consider matches other than the lowest-offset match,
1525 * except in the case of a repeat offset.
1527 * - Although we take into account the adaptive state (in LZX, the recent offset
1528 * queue), coding decisions made with respect to the adaptive state will be
1529 * locally optimal but will not necessarily be globally optimal. This is
1530 * because the algorithm only keeps the least-costly path to get to a given
1531 * location and does not take into account that a slightly more costly path
1532 * could result in a different adaptive state that ultimately results in a
1533 * lower global cost.
1535 * - The array space used by this function is bounded, so in degenerate cases it
1536 * is forced to start returning matches/literals before the algorithm has
1539 * Each call to this function does one of two things:
1541 * 1. Build a sequence of near-optimal matches/literals, up to some point, that
1542 * will be returned by subsequent calls to this function, then return the
1547 * 2. Return the next match/literal previously computed by a call to this
1550 * The return value is a (length, offset) pair specifying the match or literal
1551 * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
1553 static struct lz_match
1554 lzx_choose_near_optimal_item(struct lzx_compressor *c)
1556 unsigned num_matches;
1557 const struct lz_match *matches;
1558 struct lz_match match;
1560 u32 longest_rep_len;
1561 unsigned longest_rep_slot;
1564 struct lzx_mc_pos_data *optimum = c->optimum;
1566 if (c->optimum_cur_idx != c->optimum_end_idx) {
1567 /* Case 2: Return the next match/literal already found. */
1568 match.len = optimum[c->optimum_cur_idx].next.link -
1570 match.offset = optimum[c->optimum_cur_idx].next.match_offset;
1572 c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link;
1576 /* Case 1: Compute a new list of matches/literals to return. */
1578 c->optimum_cur_idx = 0;
1579 c->optimum_end_idx = 0;
1581 /* Search for matches at repeat offsets. As a heuristic, we only keep
1582 * the one with the longest match length. */
1583 longest_rep_len = LZX_MIN_MATCH_LEN - 1;
1584 if (c->match_window_pos >= 1) {
1585 unsigned limit = min(LZX_MAX_MATCH_LEN,
1586 c->match_window_end - c->match_window_pos);
1587 for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
1588 u32 offset = c->queue.R[i];
1589 const u8 *strptr = &c->cur_window[c->match_window_pos];
1590 const u8 *matchptr = strptr - offset;
1592 while (len < limit && strptr[len] == matchptr[len])
1594 if (len > longest_rep_len) {
1595 longest_rep_len = len;
1596 longest_rep_slot = i;
1601 /* If there's a long match with a repeat offset, choose it immediately. */
1602 if (longest_rep_len >= c->params.nice_match_length) {
1603 lzx_skip_bytes(c, longest_rep_len);
1604 return (struct lz_match) {
1605 .len = longest_rep_len,
1606 .offset = c->queue.R[longest_rep_slot],
1610 /* Find other matches. */
1611 num_matches = lzx_get_matches(c, &matches);
1613 /* If there's a long match, choose it immediately. */
1615 longest_len = matches[num_matches - 1].len;
1616 if (longest_len >= c->params.nice_match_length) {
1617 lzx_skip_bytes(c, longest_len - 1);
1618 return matches[num_matches - 1];
1624 /* Calculate the cost to reach the next position by coding a literal. */
1625 optimum[1].queue = c->queue;
1626 optimum[1].cost = lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
1628 optimum[1].prev.link = 0;
1630 /* Calculate the cost to reach any position up to and including that
1631 * reached by the longest match.
1633 * Note: We consider only the lowest-offset match that reaches each
1636 * Note: Some of the cost calculation stays the same for each offset,
1637 * regardless of how many lengths it gets used for. Therefore, to
1638 * improve performance, we hand-code the cost calculation instead of
1639 * calling lzx_match_cost() to do a from-scratch cost evaluation at each
1641 for (unsigned i = 0, len = 2; i < num_matches; i++) {
1643 struct lzx_lru_queue queue;
1645 unsigned position_slot;
1646 unsigned num_extra_bits;
1648 offset = matches[i].offset;
1652 position_slot = lzx_get_position_slot(offset, &queue);
1653 num_extra_bits = lzx_get_num_extra_bits(position_slot);
1654 if (num_extra_bits >= 3) {
1655 position_cost += num_extra_bits - 3;
1656 position_cost += c->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
1658 position_cost += num_extra_bits;
1662 unsigned len_header;
1663 unsigned main_symbol;
1666 cost = position_cost;
1668 len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1669 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1670 cost += c->costs.main[main_symbol];
1671 if (len_header == LZX_NUM_PRIMARY_LENS)
1672 cost += c->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1674 optimum[len].queue = queue;
1675 optimum[len].prev.link = 0;
1676 optimum[len].prev.match_offset = offset;
1677 optimum[len].cost = cost;
1678 } while (++len <= matches[i].len);
1680 end_pos = longest_len;
1682 if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
1685 while (end_pos < longest_rep_len)
1686 optimum[++end_pos].cost = MC_INFINITE_COST;
1688 cost = lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
1690 if (cost <= optimum[longest_rep_len].cost) {
1691 optimum[longest_rep_len].queue = c->queue;
1692 swap(optimum[longest_rep_len].queue.R[0],
1693 optimum[longest_rep_len].queue.R[longest_rep_slot]);
1694 optimum[longest_rep_len].prev.link = 0;
1695 optimum[longest_rep_len].prev.match_offset =
1696 optimum[longest_rep_len].queue.R[0];
1697 optimum[longest_rep_len].cost = cost;
1701 /* Step forward, calculating the estimated minimum cost to reach each
1702 * position. The algorithm may find multiple paths to reach each
1703 * position; only the lowest-cost path is saved.
1705 * The progress of the parse is tracked in the @optimum array, which for
1706 * each position contains the minimum cost to reach that position, the
1707 * index of the start of the match/literal taken to reach that position
1708 * through the minimum-cost path, the offset of the match taken (not
1709 * relevant for literals), and the adaptive state that will exist at
1710 * that position after the minimum-cost path is taken. The @cur_pos
1711 * variable stores the position at which the algorithm is currently
1712 * considering coding choices, and the @end_pos variable stores the
1713 * greatest position at which the costs of coding choices have been
1716 * The loop terminates when any one of the following conditions occurs:
1718 * 1. A match with length greater than or equal to @nice_match_length is
1719 * found. When this occurs, the algorithm chooses this match
1720 * unconditionally, and consequently the near-optimal match/literal
1721 * sequence up to and including that match is fully determined and it
1722 * can begin returning the match/literal list.
1724 * 2. @cur_pos reaches a position not overlapped by a preceding match.
1725 * In such cases, the near-optimal match/literal sequence up to
1726 * @cur_pos is fully determined and it can begin returning the
1727 * match/literal list.
1729 * 3. Failing either of the above in a degenerate case, the loop
1730 * terminates when space in the @optimum array is exhausted.
1731 * This terminates the algorithm and forces it to start returning
1732 * matches/literals even though they may not be globally optimal.
1734 * Upon loop termination, a nonempty list of matches/literals will have
1735 * been produced and stored in the @optimum array. These
1736 * matches/literals are linked in reverse order, so the last thing this
1737 * function does is reverse this list and return the first
1738 * match/literal, leaving the rest to be returned immediately by
1739 * subsequent calls to this function.
1745 /* Advance to next position. */
1748 /* Check termination conditions (2) and (3) noted above. */
1749 if (cur_pos == end_pos || cur_pos == LZX_OPTIM_ARRAY_LENGTH)
1750 return lzx_match_chooser_reverse_list(c, cur_pos);
1752 /* Search for matches at repeat offsets. Again, as a heuristic
1753 * we only keep the longest one. */
1754 longest_rep_len = LZX_MIN_MATCH_LEN - 1;
1755 unsigned limit = min(LZX_MAX_MATCH_LEN,
1756 c->match_window_end - c->match_window_pos);
1757 for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
1758 u32 offset = optimum[cur_pos].queue.R[i];
1759 const u8 *strptr = &c->cur_window[c->match_window_pos];
1760 const u8 *matchptr = strptr - offset;
1762 while (len < limit && strptr[len] == matchptr[len])
1764 if (len > longest_rep_len) {
1765 longest_rep_len = len;
1766 longest_rep_slot = i;
1770 /* If we found a long match at a repeat offset, choose it
1772 if (longest_rep_len >= c->params.nice_match_length) {
1773 /* Build the list of matches to return and get
1775 match = lzx_match_chooser_reverse_list(c, cur_pos);
1777 /* Append the long match to the end of the list. */
1778 optimum[cur_pos].next.match_offset =
1779 optimum[cur_pos].queue.R[longest_rep_slot];
1780 optimum[cur_pos].next.link = cur_pos + longest_rep_len;
1781 c->optimum_end_idx = cur_pos + longest_rep_len;
1783 /* Skip over the remaining bytes of the long match. */
1784 lzx_skip_bytes(c, longest_rep_len);
1786 /* Return first match in the list. */
1790 /* Find other matches. */
1791 num_matches = lzx_get_matches(c, &matches);
1793 /* If there's a long match, choose it immediately. */
1795 longest_len = matches[num_matches - 1].len;
1796 if (longest_len >= c->params.nice_match_length) {
1797 /* Build the list of matches to return and get
1799 match = lzx_match_chooser_reverse_list(c, cur_pos);
1801 /* Append the long match to the end of the list. */
1802 optimum[cur_pos].next.match_offset =
1803 matches[num_matches - 1].offset;
1804 optimum[cur_pos].next.link = cur_pos + longest_len;
1805 c->optimum_end_idx = cur_pos + longest_len;
1807 /* Skip over the remaining bytes of the long match. */
1808 lzx_skip_bytes(c, longest_len - 1);
1810 /* Return first match in the list. */
1817 /* If we are reaching any positions for the first time, we need
1818 * to initialize their costs to infinity. */
1819 while (end_pos < cur_pos + longest_len)
1820 optimum[++end_pos].cost = MC_INFINITE_COST;
1822 /* Consider coding a literal. */
1823 cost = optimum[cur_pos].cost +
1824 lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
1826 if (cost < optimum[cur_pos + 1].cost) {
1827 optimum[cur_pos + 1].queue = optimum[cur_pos].queue;
1828 optimum[cur_pos + 1].cost = cost;
1829 optimum[cur_pos + 1].prev.link = cur_pos;
1832 /* Consider coding a match.
1834 * The hard-coded cost calculation is done for the same reason
1835 * stated in the comment for the similar loop earlier.
1836 * Actually, it is *this* one that has the biggest effect on
1837 * performance; overall LZX compression is > 10% faster with
1838 * this code compared to calling lzx_match_cost() with each
1840 for (unsigned i = 0, len = 2; i < num_matches; i++) {
1843 unsigned position_slot;
1844 unsigned num_extra_bits;
1846 offset = matches[i].offset;
1847 position_cost = optimum[cur_pos].cost;
1849 /* Yet another optimization: instead of calling
1850 * lzx_get_position_slot(), hand-inline the search of
1851 * the repeat offset queue. Then we can omit the
1852 * extra_bits calculation for repeat offset matches, and
1853 * also only compute the updated queue if we actually do
1854 * find a new lowest cost path. */
1855 for (position_slot = 0; position_slot < LZX_NUM_RECENT_OFFSETS; position_slot++)
1856 if (offset == optimum[cur_pos].queue.R[position_slot])
1857 goto have_position_cost;
1859 position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
1861 num_extra_bits = lzx_get_num_extra_bits(position_slot);
1862 if (num_extra_bits >= 3) {
1863 position_cost += num_extra_bits - 3;
1864 position_cost += c->costs.aligned[
1865 (offset + LZX_OFFSET_OFFSET) & 7];
1867 position_cost += num_extra_bits;
1873 unsigned len_header;
1874 unsigned main_symbol;
1877 cost = position_cost;
1879 len_header = min(len - LZX_MIN_MATCH_LEN,
1880 LZX_NUM_PRIMARY_LENS);
1881 main_symbol = ((position_slot << 3) | len_header) +
1883 cost += c->costs.main[main_symbol];
1884 if (len_header == LZX_NUM_PRIMARY_LENS) {
1885 cost += c->costs.len[len -
1887 LZX_NUM_PRIMARY_LENS];
1889 if (cost < optimum[cur_pos + len].cost) {
1890 if (position_slot < LZX_NUM_RECENT_OFFSETS) {
1891 optimum[cur_pos + len].queue = optimum[cur_pos].queue;
1892 swap(optimum[cur_pos + len].queue.R[0],
1893 optimum[cur_pos + len].queue.R[position_slot]);
1895 optimum[cur_pos + len].queue.R[0] = offset;
1896 optimum[cur_pos + len].queue.R[1] = optimum[cur_pos].queue.R[0];
1897 optimum[cur_pos + len].queue.R[2] = optimum[cur_pos].queue.R[1];
1899 optimum[cur_pos + len].prev.link = cur_pos;
1900 optimum[cur_pos + len].prev.match_offset = offset;
1901 optimum[cur_pos + len].cost = cost;
1903 } while (++len <= matches[i].len);
1906 /* Consider coding a repeat offset match.
1908 * As a heuristic, we only consider the longest length of the
1909 * longest repeat offset match. This does not, however,
1910 * necessarily mean that we will never consider any other repeat
1911 * offsets, because above we detect repeat offset matches that
1912 * were found by the regular match-finder. Therefore, this
1913 * special handling of the longest repeat-offset match is only
1914 * helpful for coding a repeat offset match that was *not* found
1915 * by the match-finder, e.g. due to being obscured by a less
1916 * distant match that is at least as long.
1918 * Note: an alternative, used in LZMA, is to consider every
1919 * length of every repeat offset match. This is a more thorough
1920 * search, and it makes it unnecessary to detect repeat offset
1921 * matches that were found by the regular match-finder. But by
1922 * my tests, for LZX the LZMA method slows down the compressor
1923 * by ~10% and doesn't actually help the compression ratio too
1926 * Also tested a compromise approach: consider every 3rd length
1927 * of the longest repeat offset match. Still didn't seem quite
1930 if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
1932 while (end_pos < cur_pos + longest_rep_len)
1933 optimum[++end_pos].cost = MC_INFINITE_COST;
1935 cost = optimum[cur_pos].cost +
1936 lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
1938 if (cost <= optimum[cur_pos + longest_rep_len].cost) {
1939 optimum[cur_pos + longest_rep_len].queue =
1940 optimum[cur_pos].queue;
1941 swap(optimum[cur_pos + longest_rep_len].queue.R[0],
1942 optimum[cur_pos + longest_rep_len].queue.R[longest_rep_slot]);
1943 optimum[cur_pos + longest_rep_len].prev.link =
1945 optimum[cur_pos + longest_rep_len].prev.match_offset =
1946 optimum[cur_pos + longest_rep_len].queue.R[0];
1947 optimum[cur_pos + longest_rep_len].cost =
1954 static struct lz_match
1955 lzx_choose_lazy_item(struct lzx_compressor *c)
1957 const struct lz_match *matches;
1958 struct lz_match cur_match;
1959 struct lz_match next_match;
1962 if (c->prev_match.len) {
1963 cur_match = c->prev_match;
1964 c->prev_match.len = 0;
1966 num_matches = lzx_get_matches(c, &matches);
1967 if (num_matches == 0 ||
1968 (matches[num_matches - 1].len <= 3 &&
1969 (matches[num_matches - 1].len <= 2 ||
1970 matches[num_matches - 1].offset > 4096)))
1972 return (struct lz_match) { };
1975 cur_match = matches[num_matches - 1];
1978 if (cur_match.len >= c->params.nice_match_length) {
1979 lzx_skip_bytes(c, cur_match.len - 1);
1983 num_matches = lzx_get_matches(c, &matches);
1984 if (num_matches == 0 ||
1985 (matches[num_matches - 1].len <= 3 &&
1986 (matches[num_matches - 1].len <= 2 ||
1987 matches[num_matches - 1].offset > 4096)))
1989 lzx_skip_bytes(c, cur_match.len - 2);
1993 next_match = matches[num_matches - 1];
1995 if (next_match.len <= cur_match.len) {
1996 lzx_skip_bytes(c, cur_match.len - 2);
1999 c->prev_match = next_match;
2000 return (struct lz_match) { };
2005 * Return the next match or literal to use, delegating to the currently selected
2006 * match-choosing algorithm.
2008 * If the length of the returned 'struct lz_match' is less than
2009 * LZX_MIN_MATCH_LEN, then it is really a literal.
2011 static inline struct lz_match
2012 lzx_choose_item(struct lzx_compressor *c)
2014 return (*c->params.choose_item_func)(c);
2017 /* Set default symbol costs for the LZX Huffman codes. */
2019 lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
2023 /* Main code (part 1): Literal symbols */
2024 for (i = 0; i < LZX_NUM_CHARS; i++)
2027 /* Main code (part 2): Match header symbols */
2028 for (; i < num_main_syms; i++)
2029 costs->main[i] = 10;
2032 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2035 /* Aligned offset code */
2036 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
2037 costs->aligned[i] = 3;
2040 /* Given the frequencies of symbols in an LZX-compressed block and the
2041 * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
2042 * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
2043 * will take fewer bits to output. */
2045 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
2046 const struct lzx_codes * codes)
2048 unsigned aligned_cost = 0;
2049 unsigned verbatim_cost = 0;
2051 /* Verbatim blocks have a constant 3 bits per position footer. Aligned
2052 * offset blocks have an aligned offset symbol per position footer, plus
2053 * an extra 24 bits per block to output the lengths necessary to
2054 * reconstruct the aligned offset code itself. */
2055 for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2056 verbatim_cost += 3 * freqs->aligned[i];
2057 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
2059 aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
2060 if (aligned_cost < verbatim_cost)
2061 return LZX_BLOCKTYPE_ALIGNED;
2063 return LZX_BLOCKTYPE_VERBATIM;
2066 /* Find a sequence of matches/literals with which to output the specified LZX
2067 * block, then set the block's type to that which has the minimum cost to output
2068 * (either verbatim or aligned). */
2070 lzx_choose_items_for_block(struct lzx_compressor *c, struct lzx_block_spec *spec)
2072 const struct lzx_lru_queue orig_queue = c->queue;
2073 u32 num_passes_remaining = c->params.num_optim_passes;
2074 struct lzx_freqs freqs;
2075 const u8 *window_ptr;
2076 const u8 *window_end;
2077 struct lzx_item *next_chosen_item;
2078 struct lz_match lz_match;
2079 struct lzx_item lzx_item;
2081 LZX_ASSERT(num_passes_remaining >= 1);
2082 LZX_ASSERT(lz_mf_get_position(c->mf) == spec->window_pos);
2084 c->match_window_end = spec->window_pos + spec->block_size;
2086 if (c->params.num_optim_passes > 1) {
2087 if (spec->block_size == c->cur_window_size)
2088 c->get_matches_func = lzx_get_matches_fillcache_singleblock;
2090 c->get_matches_func = lzx_get_matches_fillcache_multiblock;
2091 c->skip_bytes_func = lzx_skip_bytes_fillcache;
2093 if (spec->block_size == c->cur_window_size)
2094 c->get_matches_func = lzx_get_matches_nocache_singleblock;
2096 c->get_matches_func = lzx_get_matches_nocache_multiblock;
2097 c->skip_bytes_func = lzx_skip_bytes_nocache;
2100 /* The first optimal parsing pass is done using the cost model already
2101 * set in c->costs. Each later pass is done using a cost model
2102 * computed from the previous pass.
2104 * To improve performance we only generate the array containing the
2105 * matches and literals in intermediate form on the final pass. */
2107 while (--num_passes_remaining) {
2108 c->match_window_pos = spec->window_pos;
2109 c->cache_ptr = c->cached_matches;
2110 memset(&freqs, 0, sizeof(freqs));
2111 window_ptr = &c->cur_window[spec->window_pos];
2112 window_end = window_ptr + spec->block_size;
2114 while (window_ptr != window_end) {
2116 lz_match = lzx_choose_item(c);
2118 LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
2119 lz_match.offset == c->max_window_size -
2120 LZX_MIN_MATCH_LEN));
2121 if (lz_match.len >= LZX_MIN_MATCH_LEN) {
2122 lzx_tally_match(lz_match.len, lz_match.offset,
2124 window_ptr += lz_match.len;
2126 lzx_tally_literal(*window_ptr, &freqs);
2130 lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
2131 lzx_set_costs(c, &spec->codes.lens, 15);
2132 c->queue = orig_queue;
2133 if (c->cache_ptr <= c->cache_limit) {
2134 c->get_matches_func = lzx_get_matches_usecache_nocheck;
2135 c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
2137 c->get_matches_func = lzx_get_matches_usecache;
2138 c->skip_bytes_func = lzx_skip_bytes_usecache;
2142 c->match_window_pos = spec->window_pos;
2143 c->cache_ptr = c->cached_matches;
2144 memset(&freqs, 0, sizeof(freqs));
2145 window_ptr = &c->cur_window[spec->window_pos];
2146 window_end = window_ptr + spec->block_size;
2148 spec->chosen_items = &c->chosen_items[spec->window_pos];
2149 next_chosen_item = spec->chosen_items;
2151 unsigned unseen_cost = 9;
2152 while (window_ptr != window_end) {
2154 lz_match = lzx_choose_item(c);
2156 LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
2157 lz_match.offset == c->max_window_size -
2158 LZX_MIN_MATCH_LEN));
2159 if (lz_match.len >= LZX_MIN_MATCH_LEN) {
2160 lzx_item.data = lzx_tally_match(lz_match.len,
2163 window_ptr += lz_match.len;
2165 lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
2168 *next_chosen_item++ = lzx_item;
2170 /* When doing one-pass "near-optimal" parsing, update the cost
2171 * model occassionally. */
2172 if (unlikely((next_chosen_item - spec->chosen_items) % 2048 == 0) &&
2173 c->params.choose_item_func == lzx_choose_near_optimal_item &&
2174 c->params.num_optim_passes == 1)
2176 lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
2177 lzx_set_costs(c, &spec->codes.lens, unseen_cost);
2178 if (unseen_cost < 15)
2182 spec->num_chosen_items = next_chosen_item - spec->chosen_items;
2183 lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
2184 spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
2187 /* Prepare the input window into one or more LZX blocks ready to be output. */
2189 lzx_prepare_blocks(struct lzx_compressor *c)
2191 /* Set up a default cost model. */
2192 if (c->params.choose_item_func == lzx_choose_near_optimal_item)
2193 lzx_set_default_costs(&c->costs, c->num_main_syms);
2195 /* Set up the block specifications.
2196 * TODO: The compression ratio could be slightly improved by performing
2197 * data-dependent block splitting instead of using fixed-size blocks.
2198 * Doing so well is a computationally hard problem, however. */
2199 c->num_blocks = DIV_ROUND_UP(c->cur_window_size, LZX_DIV_BLOCK_SIZE);
2200 for (unsigned i = 0; i < c->num_blocks; i++) {
2201 u32 pos = LZX_DIV_BLOCK_SIZE * i;
2202 c->block_specs[i].window_pos = pos;
2203 c->block_specs[i].block_size = min(c->cur_window_size - pos,
2204 LZX_DIV_BLOCK_SIZE);
2207 /* Load the window into the match-finder. */
2208 lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
2210 /* Determine sequence of matches/literals to output for each block. */
2211 lzx_lru_queue_init(&c->queue);
2212 c->optimum_cur_idx = 0;
2213 c->optimum_end_idx = 0;
2214 c->prev_match.len = 0;
2215 for (unsigned i = 0; i < c->num_blocks; i++)
2216 lzx_choose_items_for_block(c, &c->block_specs[i]);
2220 lzx_build_params(unsigned int compression_level,
2221 u32 max_window_size,
2222 struct lzx_compressor_params *lzx_params)
2224 if (compression_level < 25) {
2225 lzx_params->choose_item_func = lzx_choose_lazy_item;
2226 lzx_params->num_optim_passes = 1;
2227 if (max_window_size <= 262144)
2228 lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
2230 lzx_params->mf_algo = LZ_MF_BINARY_TREES;
2231 lzx_params->min_match_length = 3;
2232 lzx_params->nice_match_length = 25 + compression_level * 2;
2233 lzx_params->max_search_depth = 25 + compression_level;
2235 lzx_params->choose_item_func = lzx_choose_near_optimal_item;
2236 lzx_params->num_optim_passes = compression_level / 20;
2237 if (max_window_size <= 32768 && lzx_params->num_optim_passes == 1)
2238 lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
2240 lzx_params->mf_algo = LZ_MF_BINARY_TREES;
2241 lzx_params->min_match_length = (compression_level >= 45) ? 2 : 3;
2242 lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50,
2244 lzx_params->max_search_depth = min(((u64)compression_level * 50) / 50,
2250 lzx_build_mf_params(const struct lzx_compressor_params *lzx_params,
2251 u32 max_window_size, struct lz_mf_params *mf_params)
2253 memset(mf_params, 0, sizeof(*mf_params));
2255 mf_params->algorithm = lzx_params->mf_algo;
2256 mf_params->max_window_size = max_window_size;
2257 mf_params->min_match_len = lzx_params->min_match_length;
2258 mf_params->max_match_len = LZX_MAX_MATCH_LEN;
2259 mf_params->max_search_depth = lzx_params->max_search_depth;
2260 mf_params->nice_match_len = lzx_params->nice_match_length;
2264 lzx_free_compressor(void *_c);
2267 lzx_get_needed_memory(size_t max_block_size, unsigned int compression_level)
2269 struct lzx_compressor_params params;
2271 unsigned window_order;
2272 u32 max_window_size;
2274 window_order = lzx_get_window_order(max_block_size);
2275 if (window_order == 0)
2277 max_window_size = max_block_size;
2279 lzx_build_params(compression_level, max_window_size, ¶ms);
2281 size += sizeof(struct lzx_compressor);
2283 size += max_window_size;
2285 size += DIV_ROUND_UP(max_window_size, LZX_DIV_BLOCK_SIZE) *
2286 sizeof(struct lzx_block_spec);
2288 size += max_window_size * sizeof(struct lzx_item);
2290 size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
2291 if (params.choose_item_func == lzx_choose_near_optimal_item) {
2292 size += (LZX_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
2293 sizeof(struct lzx_mc_pos_data);
2295 if (params.num_optim_passes > 1)
2296 size += LZX_CACHE_LEN * sizeof(struct lz_match);
2298 size += LZX_MAX_MATCHES_PER_POS * sizeof(struct lz_match);
2303 lzx_create_compressor(size_t max_block_size, unsigned int compression_level,
2306 struct lzx_compressor *c;
2307 struct lzx_compressor_params params;
2308 struct lz_mf_params mf_params;
2309 unsigned window_order;
2310 u32 max_window_size;
2312 window_order = lzx_get_window_order(max_block_size);
2313 if (window_order == 0)
2314 return WIMLIB_ERR_INVALID_PARAM;
2315 max_window_size = max_block_size;
2317 lzx_build_params(compression_level, max_window_size, ¶ms);
2318 lzx_build_mf_params(¶ms, max_window_size, &mf_params);
2319 if (!lz_mf_params_valid(&mf_params))
2320 return WIMLIB_ERR_INVALID_PARAM;
2322 c = CALLOC(1, sizeof(struct lzx_compressor));
2327 c->num_main_syms = lzx_get_num_main_syms(window_order);
2328 c->max_window_size = max_window_size;
2329 c->window_order = window_order;
2331 c->cur_window = ALIGNED_MALLOC(max_window_size, 16);
2335 c->block_specs = MALLOC(DIV_ROUND_UP(max_window_size,
2336 LZX_DIV_BLOCK_SIZE) *
2337 sizeof(struct lzx_block_spec));
2338 if (!c->block_specs)
2341 c->chosen_items = MALLOC(max_window_size * sizeof(struct lzx_item));
2342 if (!c->chosen_items)
2345 c->mf = lz_mf_alloc(&mf_params);
2349 if (params.choose_item_func == lzx_choose_near_optimal_item) {
2350 c->optimum = MALLOC((LZX_OPTIM_ARRAY_LENGTH +
2351 params.nice_match_length) *
2352 sizeof(struct lzx_mc_pos_data));
2357 if (params.num_optim_passes > 1) {
2358 c->cached_matches = MALLOC(LZX_CACHE_LEN *
2359 sizeof(struct lz_match));
2360 if (!c->cached_matches)
2362 c->cache_limit = c->cached_matches + LZX_CACHE_LEN -
2363 (LZX_MAX_MATCHES_PER_POS + 1);
2365 c->cached_matches = MALLOC(LZX_MAX_MATCHES_PER_POS *
2366 sizeof(struct lz_match));
2367 if (!c->cached_matches)
2375 lzx_free_compressor(c);
2376 return WIMLIB_ERR_NOMEM;
2380 lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
2381 void *compressed_data, size_t compressed_size_avail, void *_c)
2383 struct lzx_compressor *c = _c;
2384 struct lzx_output_bitstream os;
2386 /* Don't bother compressing very small inputs. */
2387 if (uncompressed_size < 100)
2390 /* The input data must be preprocessed. To avoid changing the original
2391 * input, copy it to a temporary buffer. */
2392 memcpy(c->cur_window, uncompressed_data, uncompressed_size);
2393 c->cur_window_size = uncompressed_size;
2395 /* Preprocess the data. */
2396 lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size);
2398 /* Prepare the compressed data. */
2399 lzx_prepare_blocks(c);
2401 /* Generate the compressed data and return its size, or 0 if an overflow
2403 lzx_init_output(&os, compressed_data, compressed_size_avail);
2404 lzx_write_all_blocks(c, &os);
2405 return lzx_flush_output(&os);
2409 lzx_free_compressor(void *_c)
2411 struct lzx_compressor *c = _c;
2414 ALIGNED_FREE(c->cur_window);
2415 FREE(c->block_specs);
2416 FREE(c->chosen_items);
2419 FREE(c->cached_matches);
2424 const struct compressor_ops lzx_compressor_ops = {
2425 .get_needed_memory = lzx_get_needed_memory,
2426 .create_compressor = lzx_create_compressor,
2427 .compress = lzx_compress,
2428 .free_compressor = lzx_free_compressor,