4 * A compressor that produces output compatible with the LZX compression format.
8 * Copyright (C) 2012, 2013, 2014 Eric Biggers
10 * This file is part of wimlib, a library for working with WIM files.
12 * wimlib is free software; you can redistribute it and/or modify it under the
13 * terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your option)
17 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19 * A PARTICULAR PURPOSE. See the GNU General Public License for more
22 * You should have received a copy of the GNU General Public License
23 * along with wimlib; if not, see http://www.gnu.org/licenses/.
28 * This file contains a compressor for the LZX ("Lempel-Ziv eXtended"?)
29 * compression format, as used in the WIM (Windows IMaging) file format. This
30 * code may need some slight modifications to be used outside of the WIM format.
31 * In particular, in other situations the LZX block header might be slightly
32 * different, and a sliding window rather than a fixed-size window might be
 * required.
35 * ----------------------------------------------------------------------------
39 * The primary reference for LZX is the specification released by Microsoft.
40 * However, the comments in lzx-decompress.c provide more information about LZX
41 * and note some errors in the Microsoft specification.
43 * LZX shares many similarities with DEFLATE, the format used by zlib and gzip.
44 * Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain details
45 * are quite similar, such as the method for storing Huffman codes. However,
46 * the main differences are:
48 * - LZX preprocesses the data to attempt to make x86 machine code slightly more
49 * compressible before attempting to compress it further.
51 * - LZX uses a "main" alphabet which combines literals and matches, with the
52 * match symbols containing a "length header" (giving all or part of the match
53 * length) and a "position slot" (giving, roughly speaking, the order of
54 * magnitude of the match offset).
56 * - LZX does not have static Huffman blocks (that is, the kind with preset
57 * Huffman codes); however it does have two types of dynamic Huffman blocks
58 * ("verbatim" and "aligned").
60 * - LZX has a minimum match length of 2 rather than 3.
62 * - In LZX, match offsets 0 through 2 actually represent entries in an LRU
63 * queue of match offsets. This is very useful for certain types of files,
64 * such as binary files that have repeating records.
66 * ----------------------------------------------------------------------------
68 * Algorithmic Overview
70 * At a high level, any implementation of LZX compression must operate as
 * follows:
73 * 1. Preprocess the input data to translate the targets of 32-bit x86 call
74 * instructions to absolute offsets. (Actually, this is required for WIM,
75 * but might not be in other places LZX is used.)
77 * 2. Find a sequence of LZ77-style matches and literal bytes that expands to
78 * the preprocessed data.
80 * 3. Divide the match/literal sequence into one or more LZX blocks, each of
81 * which may be "uncompressed", "verbatim", or "aligned".
83 * 4. Output each LZX block.
85 * Step (1) is fairly straightforward. It requires looking for 0xe8 bytes in
86 * the input data and performing a translation on the 4 bytes following each
 * one.
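 *
 * A minimal sketch of that translation loop follows (get_unaligned_le32() and
 * put_unaligned_le32() are hypothetical helpers here, and 12000000 is the
 * fixed "file size" constant conventionally used for WIM; the last few bytes
 * of the buffer are left untranslated):
 *
 *	for (u8 *p = data; p < data + size - 10; p++) {
 *		if (*p != 0xe8)
 *			continue;
 *		s32 rel = get_unaligned_le32(p + 1);
 *		s32 pos = p - data;
 *		if (rel >= -pos && rel < 12000000) {
 *			s32 abs = (rel >= 0) ? rel + pos : rel + 12000000;
 *			put_unaligned_le32(abs, p + 1);
 *		}
 *		p += 4;
 *	}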
89 * Step (4) is complicated, but it is mostly determined by the LZX format. The
90 * only real choice we have is what algorithm to use to build the length-limited
91 * canonical Huffman codes. See lzx_write_all_blocks() for details.
93 * That leaves steps (2) and (3) as where all the hard stuff happens. Focusing
94 * on step (2), we need to do LZ77-style parsing on the input data, or "window",
95 * to divide it into a sequence of matches and literals. Each position in the
96 * window might have multiple matches associated with it, and we need to choose
97 * which one, if any, to actually use. Therefore, the problem can really be
98 * divided into two areas of concern: (a) finding matches at a given position,
99 * which we shall call "match-finding", and (b) choosing whether to use a
100 * match or a literal at a given position, and if using a match, which one (if
101 * there is more than one available). We shall call this "match-choosing". We
102 * first consider match-finding, then match-choosing.
104 * ----------------------------------------------------------------------------
108 * Given a position in the window, we want to find LZ77-style "matches": that is,
109 * sequences at earlier positions in the window that match the bytes beginning at
 * the current position. With LZX, the minimum
110 * match length is 2 and the maximum match length is 257. The only restriction
111 * on offsets is that LZX does not allow the last 2 bytes of the window to match
112 * the beginning of the window.
114 * There are a number of algorithms that can be used for this, including hash
115 * chains, binary trees, and suffix arrays. Binary trees generally work well
116 * for LZX compression since it uses medium-size windows (2^15 to 2^21 bytes).
117 * However, when compressing in a fast mode where many positions are skipped
118 * (not searched for matches), hash chains are faster.
120 * Since the match-finders are not specific to LZX, I will not explain them in
121 * detail here. Instead, see lz_hash_chains.c and lz_binary_trees.c.
123 * ----------------------------------------------------------------------------
127 * Usually, choosing the longest match is best because it encodes the most data
128 * in that one item. However, sometimes the longest match is not optimal
129 * because (a) choosing a long match now might prevent using an even longer
130 * match later, or (b) more generally, what we actually care about is the number
131 * of bits it will ultimately take to output each match or literal, which is
132 * actually dependent on the entropy encoding used by the underlying
133 * compression format. Consequently, a longer match usually, but not always,
134 * takes fewer bits to encode than multiple shorter matches or literals that
135 * cover the same data.
137 * This problem of choosing the truly best match/literal sequence is probably
138 * impossible to solve efficiently when combined with entropy encoding. If we
139 * knew how many bits it takes to output each match/literal, then we could
140 * choose the optimal sequence using shortest-path search a la Dijkstra's
141 * algorithm. However, with entropy encoding, the chosen match/literal sequence
142 * affects its own encoding. Therefore, we can't know how many bits it will
143 * take to actually output any one match or literal until we have actually
144 * chosen the full sequence of matches and literals.
146 * Notwithstanding the entropy encoding problem, we also aren't guaranteed to
147 * choose the optimal match/literal sequence unless the match-finder (see
148 * section "Match-finder") provides the match-chooser with all possible matches
149 * at each position. However, this is not computationally efficient. For
150 * example, there might be many matches of the same length, and usually (but not
151 * always) the best choice is the one with the smallest offset. So in practice,
152 * it's fine to only consider the smallest offset for a given match length at a
153 * given position. (Actually, for LZX, it's also worth considering repeat
 * offsets.)
156 * In addition, as mentioned earlier, in LZX we have the choice of using
157 * multiple blocks, each of which resets the Huffman codes. This expands the
158 * search space even further. Therefore, to simplify the problem, we
159 * currently don't attempt to actually choose the LZX blocks based on the data.
160 * Instead, we just divide the data into fixed-size blocks of LZX_DIV_BLOCK_SIZE
161 * bytes each, and always use verbatim or aligned blocks (never uncompressed).
162 * A previous version of this code recursively split the input data into
163 * equal-sized blocks, up to a maximum depth, and chose the lowest-cost block
164 * divisions. However, this made compression much slower and did not actually
165 * help very much. It remains an open question whether a sufficiently fast and
166 * useful block-splitting algorithm is possible for LZX. Essentially the same
167 * problem also applies to DEFLATE. The Microsoft LZX compressor seemingly does
168 * do block splitting, although I don't know how fast or useful it is.
171 * Now, back to the entropy encoding problem. The "solution" is to use an
172 * iterative approach to compute a good, but not necessarily optimal,
173 * match/literal sequence. Start with a fixed assignment of symbol costs and
174 * choose an "optimal" match/literal sequence based on those costs, using
175 * shortest-path search a la Dijkstra's algorithm. Then, for each iteration of
176 * the optimization, update the costs based on the entropy encoding of the
177 * current match/literal sequence, then choose a new match/literal sequence
178 * based on the updated costs. Usually, the actual cost to output the current
179 * match/literal sequence will decrease in each iteration until it converges on
180 * a fixed point. This result may not be the truly optimal match/literal
181 * sequence, but it usually is much better than one chosen by doing a "greedy"
182 * parse where we always choose the longest match.
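 *
 * In outline, the iteration looks like the following (pseudocode only;
 * lzx_set_costs() and make_canonical_huffman_code() below are the real
 * routines involved, the other step names are descriptive):
 *
 *	initialize c->costs with default symbol costs;
 *	for (pass = 0; pass < num_optim_passes; pass++) {
 *		parse the block into matches/literals using c->costs
 *		  (shortest-path search);
 *		tally the chosen symbols into a struct lzx_freqs;
 *		build Huffman codes from those frequencies;
 *		lzx_set_costs(c, &codes.lens, ...);  (costs <- codeword lengths)
 *	}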
184 * An alternative to both greedy parsing and iterative, near-optimal parsing is
185 * "lazy" parsing. Briefly, "lazy" parsing considers just the longest match at
186 * each position, but it waits to choose that match until it has also examined
187 * the next position. This is actually a useful approach; it's used by zlib,
188 * for example. Therefore, for fast compression we combine lazy parsing with
189 * the hash chain match-finder. For normal/high compression we combine
190 * near-optimal parsing with the binary tree match-finder.
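 *
 * In outline, the lazy decision at each window position is (pseudocode; the
 * fast-mode parser in this file follows this shape):
 *
 *	cur_len = length of longest match at position i;
 *	next_len = length of longest match at position i + 1;
 *	if (next_len > cur_len)
 *		output a literal at i and reconsider at i + 1;
 *	else
 *		output the match found at i;
 */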
197 #include "wimlib/compressor_ops.h"
198 #include "wimlib/compress_common.h"
199 #include "wimlib/endianness.h"
200 #include "wimlib/error.h"
201 #include "wimlib/lz_mf.h"
202 #include "wimlib/lz_repsearch.h"
203 #include "wimlib/lzx.h"
204 #include "wimlib/util.h"
207 #define LZX_OPTIM_ARRAY_LENGTH 4096
209 #define LZX_DIV_BLOCK_SIZE 32768
211 #define LZX_CACHE_PER_POS 8
213 #define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
215 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
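/* Note on the sizing above: for each window position, the match cache stores
 * one struct lz_match header entry, whose 'len' field holds the match count,
 * followed by the matches themselves. LZX_CACHE_LEN therefore budgets an
 * average of (LZX_CACHE_PER_POS + 1) entries per position of a block, and
 * c->cache_limit guards against running past the end. See
 * lzx_get_matches_fillcache_singleblock() below. */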
217 /* Codewords for the LZX main, length, and aligned offset Huffman codes */
218 struct lzx_codewords {
219 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
220 u32 len[LZX_LENCODE_NUM_SYMBOLS];
221	u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};
224 /* Codeword lengths (in bits) for the LZX main, length, and aligned offset
 * Huffman codes.
 *
227 * A 0 length means the codeword has zero frequency. */
struct lzx_lens {
230 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
231 u8 len[LZX_LENCODE_NUM_SYMBOLS];
232	u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};
235 /* Costs for the LZX main, length, and aligned offset Huffman symbols.
237 * If a codeword has zero frequency, it must still be assigned some nonzero cost
238 * --- generally a high cost, since even if it gets used in the next iteration,
239 * it probably will not be used very many times. */
struct lzx_costs {
241 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
242 u8 len[LZX_LENCODE_NUM_SYMBOLS];
243	u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};
246 /* The LZX main, length, and aligned offset Huffman codes */
struct lzx_codes {
248 struct lzx_codewords codewords;
249	struct lzx_lens lens;
};
252 /* Tables for tallying symbol frequencies in the three LZX alphabets */
struct lzx_freqs {
254 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
255 u32 len[LZX_LENCODE_NUM_SYMBOLS];
256	u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
};
259 /* LZX intermediate match/literal format */
263 * 31 1 if a match, 0 if a literal.
265 * 30-25 position slot. This can be at most 50, so it will fit in 6
268 * 8-24 position footer. This is the offset of the real formatted
269 * offset from the position base. This can be at most 17 bits
270 * (since lzx_extra_bits[LZX_MAX_POSITION_SLOTS - 1] is 17).
272 * 0-7 length of match, minus 2. This can be at most
273 * (LZX_MAX_MATCH_LEN - 2) == 255, so it will fit in 8 bits. */
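struct lzx_item {
	u32 data;
};

/* For example, unpacking a match item (this mirrors lzx_write_match() below):
 *
 *	unsigned adjusted_len  = item.data & 0xff;		(bits 0-7)
 *	u32 position_footer    = (item.data >> 8) & 0x1ffff;	(bits 8-24)
 *	unsigned position_slot = (item.data >> 25) & 0x3f;	(bits 25-30)
 *	int is_match           = (item.data & 0x80000000) != 0;	(bit 31)
 */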
277 /* Specification for an LZX block. */
278 struct lzx_block_spec {
280	/* One of the LZX_BLOCKTYPE_* constants indicating the type of this
	 * block. */
	int block_type;
284	/* 0-based position in the window at which this block starts. */
	u32 window_pos;
287	/* The number of bytes of uncompressed data this block represents. */
	u32 block_size;
290 /* The match/literal sequence for this block. */
291 struct lzx_item *chosen_items;
293 /* The length of the @chosen_items sequence. */
294 u32 num_chosen_items;
296 /* Huffman codes for this block. */
297	struct lzx_codes codes;
};
300 struct lzx_compressor;
302 struct lzx_compressor_params {
303 struct lz_match (*choose_item_func)(struct lzx_compressor *);
304 enum lz_mf_algo mf_algo;
305 u32 num_optim_passes;
306 u32 min_match_length;
307 u32 nice_match_length;
308	u32 max_search_depth;
};
311 /* State of the LZX compressor. */
312 struct lzx_compressor {
314 /* The buffer of data to be compressed.
316 * 0xe8 byte preprocessing is done directly on the data here before
317 * further compression.
319 * Note that this compressor does *not* use a real sliding window!!!!
320 * It's not needed in the WIM format, since every chunk is compressed
321	 * independently. This is by design, to allow random access to the
	 * chunks. */
	u8 *cur_window;
325 /* Number of bytes of data to be compressed, which is the number of
326	 * bytes of data in @cur_window that are actually valid. */
	u32 cur_window_size;
329	/* Allocated size of @cur_window. */
	u32 max_window_size;
332 /* log2 order of the LZX window size for LZ match offset encoding
333 * purposes. Will be >= LZX_MIN_WINDOW_ORDER and <=
334 * LZX_MAX_WINDOW_ORDER.
336 * Note: 1 << @window_order is normally equal to @max_window_size, but
337 * it will be greater than @max_window_size in the event that the
338 * compressor was created with a non-power-of-2 block size. (See
339 * lzx_get_window_order().) */
340 unsigned window_order;
342 /* Compression parameters. */
343 struct lzx_compressor_params params;
345 unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **);
346 void (*skip_bytes_func)(struct lzx_compressor *, unsigned n);
348 /* Number of symbols in the main alphabet (depends on the @window_order
349 * since it determines the maximum allowed offset). */
350 unsigned num_main_syms;
352 /* The current match offset LRU queue. */
353 struct lzx_lru_queue queue;
355	/* Space for the sequences of matches/literals that were chosen for each
	 * block. */
357 struct lzx_item *chosen_items;
359	/* Information about the LZX blocks the preprocessed input was divided
	 * into. */
361 struct lzx_block_spec *block_specs;
363 /* Number of LZX blocks the input was divided into; a.k.a. the number of
364	 * elements of @block_specs that are valid. */
	unsigned num_blocks;
367 /* This is simply filled in with zeroes and used to avoid special-casing
368 * the output of the first compressed Huffman code, which conceptually
369	 * has a delta taken from a code with all symbols having zero-length
	 * codewords. */
371 struct lzx_codes zero_codes;
373 /* The current cost model. */
374 struct lzx_costs costs;
376	/* Lempel-Ziv match-finder. */
	struct lz_mf *mf;
379 /* Position in window of next match to return. */
380 u32 match_window_pos;
382	/* The end-of-block position. We can't allow any matches to span this
	 * position. */
384 u32 match_window_end;
386 /* When doing more than one match-choosing pass over the data, matches
387 * found by the match-finder are cached in the following array to
388 * achieve a slight speedup when the same matches are needed on
389 * subsequent passes. This is suboptimal because different matches may
390	 * be preferred with different cost models, but seems to be a worthwhile
	 * speedup. */
392 struct lz_match *cached_matches;
393 struct lz_match *cache_ptr;
394 struct lz_match *cache_limit;
396 /* Match-chooser state, used when doing near-optimal parsing.
398 * When matches have been chosen, optimum_cur_idx is set to the position
399 * in the window of the next match/literal to return and optimum_end_idx
400 * is set to the position in the window at the end of the last
401 * match/literal to return. */
402 struct lzx_mc_pos_data *optimum;
403 unsigned optimum_cur_idx;
404 unsigned optimum_end_idx;
406 /* Previous match, used when doing lazy parsing. */
407	struct lz_match prev_match;
};
411 * Match chooser position data:
413 * An array of these structures is used during the match-choosing algorithm.
414 * They correspond to consecutive positions in the window and are used to keep
415 * track of the cost to reach each position, and the match/literal choices that
416 * need to be chosen to reach that position.
 */
418 struct lzx_mc_pos_data {
419 /* The approximate minimum cost, in bits, to reach this position in the
420	 * window which has been found so far. */
	u32 cost;
422 #define MC_INFINITE_COST ((u32)~0UL)
424	/* The union here is just for clarity, since the fields are used in two
425	 * slightly different ways. Initially, the @prev structure is filled in
426	 * first, and links go from later in the window to earlier in the
427	 * window. Later, the @next structure is filled in and links go from
428	 * earlier in the window to later in the window. */
	union {
		struct {
431			/* Position of the start of the match or literal that
432			 * was taken to get to this position in the approximate
433			 * minimum-cost parse. */
			u32 link;

436			/* Offset (as in an LZ (length, offset) pair) of the
437			 * match or literal that was taken to get to this
438			 * position in the approximate minimum-cost parse. */
			u32 match_offset;
		} prev;
		struct {
442			/* Position at which the match or literal starting at
443			 * this position ends in the minimum-cost parse. */
			u32 link;

446			/* Offset (as in an LZ (length, offset) pair) of the
447			 * match or literal starting at this position in the
448			 * approximate minimum-cost parse. */
			u32 match_offset;
		} next;
	};
453 /* Adaptive state that exists after an approximate minimum-cost path to
454 * reach this position is taken.
456 * Note: we update this whenever we update the pending minimum-cost
457 * path. This is in contrast to LZMA, which also has an optimal parser
458 * that maintains a repeat offset queue per position, but will only
459 * compute the queue once that position is actually reached in the
460 * parse, meaning that matches are being considered *starting* at that
461 * position. However, the two methods seem to have approximately the
462 * same performance if appropriate optimizations are used. Intuitively
463 * the LZMA method seems faster, but it actually suffers from 1-2 extra
464 * hard-to-predict branches at each position. Probably it works better
465 * for LZMA than LZX because LZMA has a larger adaptive state than LZX,
466 * and the LZMA encoder considers more possibilities. */
467	struct lzx_lru_queue queue;
};
472 * Structure to keep track of the current state of sending bits to the
473 * compressed output buffer.
475 * The LZX bitstream is encoded as a sequence of 16-bit coding units.
 */
477 struct lzx_output_bitstream {
479	/* Bits that haven't yet been written to the output buffer. */
	u32 bitbuf;

482	/* Number of bits currently held in @bitbuf. */
	unsigned bitcount;

485	/* Pointer to the start of the output buffer. */
	le16 *start;

488	/* Pointer to the position in the output buffer at which the next coding
489	 * unit should be written. */
	le16 *next;

492	/* Pointer past the end of the output buffer. */
	le16 *end;
};
497 * Initialize the output bitstream.
 *
 * @os
500 *	The output bitstream structure to initialize.
 * @buffer
502 *	The buffer being written to.
 * @size
504 *	Size of @buffer, in bytes.
 */
static void
507 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->start = buffer;
512	os->next = os->start;
513	os->end = os->start + size / sizeof(le16);
}
517 * Write some bits to the output bitstream.
519 * The bits are given by the low-order @num_bits bits of @bits. Higher-order
520 * bits in @bits cannot be set. At most 17 bits can be written at once.
522 * @max_bits is a compile-time constant that specifies the maximum number of
523 * bits that can ever be written at the call site. Currently, it is used to
524 * optimize away the conditional code for writing a second 16-bit coding unit
525 * when writing fewer than 17 bits.
527 * If the output buffer space is exhausted, then the bits will be ignored, and
528 * lzx_flush_output() will return 0 when it gets called.
530 static _always_inline_attribute void
531 lzx_write_varbits(struct lzx_output_bitstream *os,
532 const u32 bits, const unsigned int num_bits,
533		  const unsigned int max_num_bits)
{
535 /* This code is optimized for LZX, which never needs to write more than
536 * 17 bits at once. */
537 LZX_ASSERT(num_bits <= 17);
538 LZX_ASSERT(num_bits <= max_num_bits);
539 LZX_ASSERT(os->bitcount <= 15);
541 /* Add the bits to the bit buffer variable. @bitcount will be at most
542 * 15, so there will be just enough space for the maximum possible
543 * @num_bits of 17. */
544 os->bitcount += num_bits;
545 os->bitbuf = (os->bitbuf << num_bits) | bits;
547 /* Check whether any coding units need to be written. */
548	if (os->bitcount >= 16) {

		os->bitcount -= 16;
552 /* Write a coding unit, unless it would overflow the buffer. */
553 if (os->next != os->end)
554 *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount);
556 /* If writing 17 bits, a second coding unit might need to be
557 * written. But because 'max_num_bits' is a compile-time
558		 * constant, the compiler will optimize away this code at most
		 * call sites. */
560 if (max_num_bits == 17 && os->bitcount == 16) {
561 if (os->next != os->end)
562				*os->next++ = cpu_to_le16(os->bitbuf);
			os->bitcount = 0;
		}
	}
}
568 /* Use when @num_bits is a compile-time constant. Otherwise use
569 * lzx_write_varbits(). */
570 static _always_inline_attribute void
571 lzx_write_bits(struct lzx_output_bitstream *os,
572	       const u32 bits, const unsigned int num_bits)
{
574	lzx_write_varbits(os, bits, num_bits, num_bits);
}
578 * Flush the last coding unit to the output buffer if needed. Return the total
579 * number of bytes written to the output buffer, or 0 if an overflow occurred.
static u32
582 lzx_flush_output(struct lzx_output_bitstream *os)
{
584	if (os->next == os->end)
		return 0;

587	if (os->bitcount != 0)
588		*os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
590 return (const u8 *)os->next - (const u8 *)os->start;
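}

/* Typical usage of the output bitstream (a sketch; the buffer name and the
 * particular values written here are made up for illustration):
 *
 *	struct lzx_output_bitstream os;
 *	lzx_init_output(&os, compressed_buf, compressed_buf_size);
 *	lzx_write_bits(&os, LZX_BLOCKTYPE_ALIGNED, 3);
 *	lzx_write_varbits(&os, codeword, codeword_len, LZX_MAX_MAIN_CODEWORD_LEN);
 *	u32 out_size = lzx_flush_output(&os);	(0 means the buffer overflowed)
 */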
593 /* Returns the LZX position slot that corresponds to a given match offset,
594 * taking into account the recent offset queue and updating it if the offset is
 * found. */
static unsigned
597 lzx_get_position_slot(u32 offset, struct lzx_lru_queue *queue)
{
599 unsigned position_slot;
601 /* See if the offset was recently used. */
602 for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
603 if (offset == queue->R[i]) {
606 /* Bring the repeat offset to the front of the
607 * queue. Note: this is, in fact, not a real
608 * LRU queue because repeat matches are simply
609 * swapped to the front. */
610 swap(queue->R[0], queue->R[i]);
612 /* The resulting position slot is simply the first index
613			 * at which the offset was found in the queue. */
			return i;
		}
	}
618 /* The offset was not recently used; look up its real position slot. */
619 position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
621 /* Bring the new offset to the front of the queue. */
622 for (int i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--)
623 queue->R[i] = queue->R[i - 1];
624 queue->R[0] = offset;
626 return position_slot;
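}

/* Worked example (hypothetical queue contents): with R = {8, 1, 257}, an
 * offset of 1 is found at R[1], so the function swaps R[0] and R[1], giving
 * R = {1, 8, 257}, and returns position slot 1. An offset not in the queue,
 * say 100, is instead mapped through
 * lzx_get_position_slot_raw(100 + LZX_OFFSET_OFFSET) and pushed onto the
 * front of the queue, evicting the oldest entry R[2]. */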
629 /* Build the main, length, and aligned offset Huffman codes used in LZX.
631 * This takes as input the frequency tables for each code and produces as output
632 * a set of tables that map symbols to codewords and codeword lengths. */
static void
634 lzx_make_huffman_codes(const struct lzx_freqs *freqs,
635 struct lzx_codes *codes,
636		       unsigned num_main_syms)
{
638 make_canonical_huffman_code(num_main_syms,
639				    LZX_MAX_MAIN_CODEWORD_LEN,
				    freqs->main,
				    codes->lens.main,
642 codes->codewords.main);
644 make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
645				    LZX_MAX_LEN_CODEWORD_LEN,
				    freqs->len,
				    codes->lens.len,
648 codes->codewords.len);
650 make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
651				    LZX_MAX_ALIGNED_CODEWORD_LEN,
				    freqs->aligned,
				    codes->lens.aligned,
654				    codes->codewords.aligned);
}
658 * Output a precomputed LZX match.
661 * The bitstream to which to write the match.
663 * The type of the LZX block (LZX_BLOCKTYPE_ALIGNED or
664 * LZX_BLOCKTYPE_VERBATIM)
668 * Pointer to a structure that contains the codewords for the main, length,
669 * and aligned offset Huffman codes for the current LZX compressed block.
static void
672 lzx_write_match(struct lzx_output_bitstream *os, int block_type,
673		struct lzx_item match, const struct lzx_codes *codes)
{
675 unsigned match_len_minus_2 = match.data & 0xff;
676 u32 position_footer = (match.data >> 8) & 0x1ffff;
677	unsigned position_slot = (match.data >> 25) & 0x3f;
	unsigned len_header;
	unsigned len_footer;
680 unsigned main_symbol;
681 unsigned num_extra_bits;
683 /* If the match length is less than MIN_MATCH_LEN (= 2) +
684 * NUM_PRIMARY_LENS (= 7), the length header contains the match length
685 * minus MIN_MATCH_LEN, and there is no length footer.
687 * Otherwise, the length header contains NUM_PRIMARY_LENS, and the
688 * length footer contains the match length minus NUM_PRIMARY_LENS minus
690	if (match_len_minus_2 < LZX_NUM_PRIMARY_LENS) {
691		len_header = match_len_minus_2;
	} else {
693		len_header = LZX_NUM_PRIMARY_LENS;
694		len_footer = match_len_minus_2 - LZX_NUM_PRIMARY_LENS;
	}
697 /* Combine the position slot with the length header into a single symbol
698 * that will be encoded with the main code.
700 * The actual main symbol is offset by LZX_NUM_CHARS because values
701 * under LZX_NUM_CHARS are used to indicate a literal byte rather than a
703 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
705 /* Output main symbol. */
706 lzx_write_varbits(os, codes->codewords.main[main_symbol],
707 codes->lens.main[main_symbol],
708 LZX_MAX_MAIN_CODEWORD_LEN);
710 /* If there is a length footer, output it using the
711 * length Huffman code. */
712 if (len_header == LZX_NUM_PRIMARY_LENS) {
713 lzx_write_varbits(os, codes->codewords.len[len_footer],
714 codes->lens.len[len_footer],
715				  LZX_MAX_LEN_CODEWORD_LEN);
	}
718 /* Output the position footer. */
720 num_extra_bits = lzx_get_num_extra_bits(position_slot);
722 if ((block_type == LZX_BLOCKTYPE_ALIGNED) && (num_extra_bits >= 3)) {
724 /* Aligned offset blocks: The low 3 bits of the position footer
725 * are Huffman-encoded using the aligned offset code. The
726 * remaining bits are output literally. */
728 lzx_write_varbits(os,
729 position_footer >> 3, num_extra_bits - 3, 14);
731 lzx_write_varbits(os,
732 codes->codewords.aligned[position_footer & 7],
733 codes->lens.aligned[position_footer & 7],
734				  LZX_MAX_ALIGNED_CODEWORD_LEN);
	} else {
736 /* Verbatim blocks, or fewer than 3 extra bits: All position
737 * footer bits are output literally. */
738		lzx_write_varbits(os, position_footer, num_extra_bits, 17);
	}
}
742 /* Output an LZX literal (encoded with the main Huffman code). */
static void
744 lzx_write_literal(struct lzx_output_bitstream *os, unsigned literal,
745		  const struct lzx_codes *codes)
{
747 lzx_write_varbits(os, codes->codewords.main[literal],
748			  codes->lens.main[literal], LZX_MAX_MAIN_CODEWORD_LEN);
}
static unsigned
752 lzx_compute_precode_items(const u8 lens[restrict],
753 const u8 prev_lens[restrict],
754 const unsigned num_lens,
755 u32 precode_freqs[restrict],
756			  unsigned precode_items[restrict])
{
	unsigned *itemptr;
	unsigned run_start;
	unsigned run_end;
	unsigned extra_bits;
	int delta;
	u8 len;
765	itemptr = precode_items;
	run_start = 0;
	do {
768 /* Find the next run of codeword lengths. */
770 /* len = the length being repeated */
771 len = lens[run_start];
773 run_end = run_start + 1;
775 /* Fast case for a single length. */
776 if (likely(run_end == num_lens || len != lens[run_end])) {
777			delta = prev_lens[run_start] - len;
			if (delta < 0)
				delta += 17;
780			precode_freqs[delta]++;
			*itemptr++ = delta;
			run_start++;
			continue;
		}
786		/* Extend the run. */
		do {
			run_end++;
789		} while (run_end != num_lens && len == lens[run_end]);

		if (len == 0) {
			/* A run of zero lengths. */
794 /* Symbol 18: RLE 20 to 51 zeroes at a time. */
795 while ((run_end - run_start) >= 20) {
796				extra_bits = min((run_end - run_start) - 20, 0x1f);
				precode_freqs[18]++;
798 *itemptr++ = 18 | (extra_bits << 5);
799				run_start += 20 + extra_bits;
			}
802 /* Symbol 17: RLE 4 to 19 zeroes at a time. */
803 if ((run_end - run_start) >= 4) {
804				extra_bits = min((run_end - run_start) - 4, 0xf);
				precode_freqs[17]++;
806 *itemptr++ = 17 | (extra_bits << 5);
807				run_start += 4 + extra_bits;
			}
		} else {
811 /* A run of nonzero lengths. */
813 /* Symbol 19: RLE 4 to 5 of any length at a time. */
814 while ((run_end - run_start) >= 4) {
815 extra_bits = (run_end - run_start) > 4;
816				delta = prev_lens[run_start] - len;
				if (delta < 0)
					delta += 17;
				precode_freqs[19]++;
820 precode_freqs[delta]++;
821 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
822				run_start += 4 + extra_bits;
			}
		}
826 /* Output any remaining lengths without RLE. */
827 while (run_start != run_end) {
828			delta = prev_lens[run_start] - len;
			if (delta < 0)
				delta += 17;
831			precode_freqs[delta]++;
			*itemptr++ = delta;
			run_start++;
		}
835 } while (run_start != num_lens);
837 return itemptr - precode_items;
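}

/* Worked example (hypothetical input): a run of 25 zero lengths becomes one
 * item for symbol 18 with extra_bits = min(25 - 20, 0x1f) = 5, consuming all
 * 25 zeroes. A run of exactly 4 equal nonzero lengths becomes one item for
 * symbol 19 with extra_bits = 0, followed (when output) by one extra bit and
 * the length delta encoded with the precode itself. */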
841 * Output a Huffman code in the compressed form used in LZX.
843 * The Huffman code is represented in the output as a logical series of codeword
844 * lengths from which the Huffman code, which must be in canonical form, can be
847 * The codeword lengths are themselves compressed using a separate Huffman code,
848 * the "precode", which contains a symbol for each possible codeword length in
849 * the larger code as well as several special symbols to represent repeated
850 * codeword lengths (a form of run-length encoding). The precode is itself
851 * constructed in canonical form, and its codeword lengths are represented
852 * literally in 20 4-bit fields that immediately precede the compressed codeword
853 * lengths of the larger code.
855 * Furthermore, the codeword lengths of the larger code are actually represented
856 * as deltas from the codeword lengths of the corresponding code in the previous
 * block.
860 * Bitstream to which to write the compressed Huffman code.
862 * The codeword lengths, indexed by symbol, in the Huffman code.
864 * The codeword lengths, indexed by symbol, in the corresponding Huffman
865 * code in the previous block, or all zeroes if this is the first block.
867 * The number of symbols in the Huffman code.
static void
870 lzx_write_compressed_code(struct lzx_output_bitstream *os,
871 const u8 lens[restrict],
872			  const u8 prev_lens[restrict],
			  unsigned num_lens)
{
875 u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
876 u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
877 u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
878 unsigned precode_items[num_lens];
879 unsigned num_precode_items;
880 unsigned precode_item;
881	unsigned precode_sym;
	unsigned i;
884 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
885 precode_freqs[i] = 0;
887 /* Compute the "items" (RLE / literal tokens and extra bits) with which
888 * the codeword lengths in the larger code will be output. */
889	num_precode_items = lzx_compute_precode_items(lens,
						      prev_lens,
						      num_lens,
						      precode_freqs,
						      precode_items);
895 /* Build the precode. */
896 make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
897 LZX_MAX_PRE_CODEWORD_LEN,
898				    precode_freqs, precode_lens,
				    precode_codewords);
901 /* Output the lengths of the codewords in the precode. */
902 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
903 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
905 /* Output the encoded lengths of the codewords in the larger code. */
906 for (i = 0; i < num_precode_items; i++) {
907 precode_item = precode_items[i];
908 precode_sym = precode_item & 0x1F;
909 lzx_write_varbits(os, precode_codewords[precode_sym],
910 precode_lens[precode_sym],
911 LZX_MAX_PRE_CODEWORD_LEN);
912 if (precode_sym >= 17) {
913 if (precode_sym == 17) {
914 lzx_write_bits(os, precode_item >> 5, 4);
915 } else if (precode_sym == 18) {
916				lzx_write_bits(os, precode_item >> 5, 5);
			} else {
918 lzx_write_bits(os, (precode_item >> 5) & 1, 1);
919 precode_sym = precode_item >> 6;
920 lzx_write_varbits(os, precode_codewords[precode_sym],
921 precode_lens[precode_sym],
922						   LZX_MAX_PRE_CODEWORD_LEN);
			}
		}
	}
}
929 * Write all matches and literal bytes (which were precomputed) in an LZX
930 * compressed block to the output bitstream in the final compressed
934 * The output bitstream.
936 * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
937 * LZX_BLOCKTYPE_VERBATIM).
939 * The array of matches/literals to output.
941 * Number of matches/literals to output (length of @items).
943 * The main, length, and aligned offset Huffman codes for the current
944 * LZX compressed block.
static void
947 lzx_write_items(struct lzx_output_bitstream *os, int block_type,
948 const struct lzx_item items[], u32 num_items,
949		const struct lzx_codes *codes)
{
951 for (u32 i = 0; i < num_items; i++) {
952 /* The high bit of the 32-bit intermediate representation
953 * indicates whether the item is an actual LZ-style match (1) or
954 * a literal byte (0). */
955 if (items[i].data & 0x80000000)
956			lzx_write_match(os, block_type, items[i], codes);
		else
958			lzx_write_literal(os, items[i].data, codes);
	}
}
962 /* Write an LZX aligned offset or verbatim block to the output. */
static void
964 lzx_write_compressed_block(int block_type,
			   u32 block_size,
966 unsigned window_order,
967 unsigned num_main_syms,
968 struct lzx_item * chosen_items,
969 u32 num_chosen_items,
970 const struct lzx_codes * codes,
971 const struct lzx_codes * prev_codes,
972			   struct lzx_output_bitstream * os)
{
974 LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED ||
975 block_type == LZX_BLOCKTYPE_VERBATIM);
977 /* The first three bits indicate the type of block and are one of the
978 * LZX_BLOCKTYPE_* constants. */
979 lzx_write_bits(os, block_type, 3);
981 /* Output the block size.
983 * The original LZX format seemed to always encode the block size in 3
984 * bytes. However, the implementation in WIMGAPI, as used in WIM files,
985 * uses the first bit to indicate whether the block is the default size
986 * (32768) or a different size given explicitly by the next 16 bits.
988 * By default, this compressor uses a window size of 32768 and therefore
989 * follows the WIMGAPI behavior. However, this compressor also supports
990 * window sizes greater than 32768 bytes, which do not appear to be
991 * supported by WIMGAPI. In such cases, we retain the default size bit
992 * to mean a size of 32768 bytes but output non-default block size in 24
993 * bits rather than 16. The compatibility of this behavior is unknown
994 * because WIMs created with chunk size greater than 32768 can seemingly
995 * only be opened by wimlib anyway. */
996 if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
997		lzx_write_bits(os, 1, 1);
	} else {
999		lzx_write_bits(os, 0, 1);

1001		if (window_order >= 16)
1002			lzx_write_bits(os, block_size >> 16, 8);

1004		lzx_write_bits(os, block_size & 0xFFFF, 16);
	}
1007 /* Output the aligned offset code. */
1008 if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1009 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1010 lzx_write_bits(os, codes->lens.aligned[i],
1011				       LZX_ALIGNEDCODE_ELEMENT_SIZE);
		}
	}
1015 /* Output the main code (two parts). */
1016 lzx_write_compressed_code(os, codes->lens.main,
1017				  prev_codes->lens.main,
				  LZX_NUM_CHARS);
1019 lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1020 prev_codes->lens.main + LZX_NUM_CHARS,
1021 num_main_syms - LZX_NUM_CHARS);
1023 /* Output the length code. */
1024 lzx_write_compressed_code(os, codes->lens.len,
1025 prev_codes->lens.len,
1026 LZX_LENCODE_NUM_SYMBOLS);
1028 /* Output the compressed matches and literals. */
1029	lzx_write_items(os, block_type, chosen_items, num_chosen_items, codes);
}
1032 /* Write out the LZX blocks that were computed. */
static void
1034 lzx_write_all_blocks(struct lzx_compressor *c, struct lzx_output_bitstream *os)
{
1037 const struct lzx_codes *prev_codes = &c->zero_codes;
1038 for (unsigned i = 0; i < c->num_blocks; i++) {
1039 const struct lzx_block_spec *spec = &c->block_specs[i];
1041		lzx_write_compressed_block(spec->block_type,
					   spec->block_size,
					   c->window_order,
					   c->num_main_syms,
					   spec->chosen_items,
1046					   spec->num_chosen_items,
					   &spec->codes,
					   prev_codes,
					   os);
1051		prev_codes = &spec->codes;
	}
}
1055 /* Constructs an LZX match from a literal byte and updates the main code symbol
 * frequencies. */
static u32
1058 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
{
	freqs->main[lit]++;
	return (u32)lit;
}
1064 /* Constructs an LZX match from an offset and a length, and updates the LRU
1065 * queue and the frequency of symbols in the main, length, and aligned offset
1066 * alphabets. The return value is a 32-bit number that provides the match in an
1067 * intermediate representation documented below. */
static u32
1069 lzx_tally_match(unsigned match_len, u32 match_offset,
1070		struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
{
1072 unsigned position_slot;
1073	u32 position_footer;
	unsigned len_header;
1075 unsigned main_symbol;
1076 unsigned len_footer;
1077 unsigned adjusted_match_len;
1079 LZX_ASSERT(match_len >= LZX_MIN_MATCH_LEN && match_len <= LZX_MAX_MATCH_LEN);
1081 /* The match offset shall be encoded as a position slot (itself encoded
1082 * as part of the main symbol) and a position footer. */
1083 position_slot = lzx_get_position_slot(match_offset, queue);
1084 position_footer = (match_offset + LZX_OFFSET_OFFSET) &
1085 (((u32)1 << lzx_get_num_extra_bits(position_slot)) - 1);
1087 /* The match length shall be encoded as a length header (itself encoded
1088 * as part of the main symbol) and an optional length footer. */
1089 adjusted_match_len = match_len - LZX_MIN_MATCH_LEN;
1090	if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) {
1091		/* No length footer needed. */
1092		len_header = adjusted_match_len;
	} else {
1094		/* Length footer needed. It will be encoded using the length
		 * code. */
1096		len_header = LZX_NUM_PRIMARY_LENS;
1097		len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS;
1098		freqs->len[len_footer]++;
	}
1101 /* Account for the main symbol. */
1102 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1104 freqs->main[main_symbol]++;
1106 /* In an aligned offset block, 3 bits of the position footer are output
1107 * as an aligned offset symbol. Account for this, although we may
1108 * ultimately decide to output the block as verbatim. */
1110 /* The following check is equivalent to:
1112 * if (lzx_extra_bits[position_slot] >= 3)
1114 * Note that this correctly excludes position slots that correspond to
1115 * recent offsets. */
1116 if (position_slot >= 8)
1117 freqs->aligned[position_footer & 7]++;
1119 /* Pack the position slot, position footer, and match length into an
1120	 * intermediate representation. See `struct lzx_item' for details. */
1122 LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
1123 LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
1124 LZX_ASSERT(LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1 <= 256);
1126 LZX_ASSERT(position_slot <= (1U << (31 - 25)) - 1);
1127 LZX_ASSERT(position_footer <= (1U << (25 - 8)) - 1);
1128 LZX_ASSERT(adjusted_match_len <= (1U << (8 - 0)) - 1);
	return 0x80000000 |
1130		(position_slot << 25) |
1131 (position_footer << 8) |
1132		(adjusted_match_len);
}
1135 /* Returns the cost, in bits, to output a literal byte using the specified cost
 * model. */
static u32
1138 lzx_literal_cost(u8 c, const struct lzx_costs * costs)
{
1140	return costs->main[c];
}
1143 /* Returns the cost, in bits, to output a repeat offset match of the specified
1144 * length and position slot (repeat index) using the specified cost model. */
static u32
1146 lzx_repmatch_cost(u32 len, unsigned position_slot, const struct lzx_costs *costs)
{
1148	unsigned len_header, main_symbol;
	u32 cost = 0;
1151 len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1152 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1154 /* Account for main symbol. */
1155 cost += costs->main[main_symbol];
1157 /* Account for extra length information. */
1158 if (len_header == LZX_NUM_PRIMARY_LENS)
1159		cost += costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];

	return cost;
}
1164 /* Set the cost model @c->costs from the Huffman codeword lengths specified in
 * @lens.
1167 * The cost model and codeword lengths are almost the same thing, but the
1168 * Huffman codewords with length 0 correspond to symbols with zero frequency
1169 * that still need to be assigned actual costs. The specific values assigned
1170 * are arbitrary, but they should be fairly high (near the maximum codeword
1171 * length) to take into account the fact that uses of these symbols are expected
 * to be rare. */
static void
1174 lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens,
	      unsigned nostat)
{
	unsigned i;
1180 for (i = 0; i < c->num_main_syms; i++)
1181 c->costs.main[i] = lens->main[i] ? lens->main[i] : nostat;
1184 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1185 c->costs.len[i] = lens->len[i] ? lens->len[i] : nostat;
1187 /* Aligned offset code */
1188 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1189 c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : nostat / 2;
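}

/* For example, if @nostat were 15, an unused main or length symbol would be
 * assigned a cost of 15 bits while an unused aligned offset symbol would be
 * assigned a cost of 7 bits (nostat / 2), presumably reflecting the much
 * shorter maximum codeword length of the aligned offset code. (The value 15
 * here is purely illustrative.) */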
1192 /* Don't allow matches to span the end of an LZX block. */
static u32
1194 maybe_truncate_matches(struct lz_match matches[], u32 num_matches,
1195		       struct lzx_compressor *c)
{
1197 if (c->match_window_end < c->cur_window_size && num_matches != 0) {
1198 u32 limit = c->match_window_end - c->match_window_pos;
1200 if (limit >= LZX_MIN_MATCH_LEN) {
1202			u32 i = num_matches - 1;
			do {
1204 if (matches[i].len >= limit) {
1205 matches[i].len = limit;
1207 /* Truncation might produce multiple
1208				 * matches with length 'limit'. Keep at
				 * most 1. */
1210					num_matches = i + 1;
				}
			} while (i--);
		} else {
			num_matches = 0;
		}
	}
	return num_matches;
}
static unsigned
1221 lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
1222				      const struct lz_match **matches_ret)
{
1224 struct lz_match *cache_ptr;
1225 struct lz_match *matches;
1226 unsigned num_matches;
1228 cache_ptr = c->cache_ptr;
1229 matches = cache_ptr + 1;
1230 if (likely(cache_ptr <= c->cache_limit)) {
1231 num_matches = lz_mf_get_matches(c->mf, matches);
1232 cache_ptr->len = num_matches;
1233		c->cache_ptr = matches + num_matches;
	} else {
		num_matches = 0;
	}
1237 c->match_window_pos++;
1238	*matches_ret = matches;
	return num_matches;
}
static unsigned
1243 lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
1244				     const struct lz_match **matches_ret)
{
1246 struct lz_match *cache_ptr;
1247 struct lz_match *matches;
1248 unsigned num_matches;
1250 cache_ptr = c->cache_ptr;
1251 matches = cache_ptr + 1;
1252 if (likely(cache_ptr <= c->cache_limit)) {
1253 num_matches = lz_mf_get_matches(c->mf, matches);
1254 num_matches = maybe_truncate_matches(matches, num_matches, c);
1255 cache_ptr->len = num_matches;
1256		c->cache_ptr = matches + num_matches;
	} else {
		num_matches = 0;
	}
1260 c->match_window_pos++;
1261	*matches_ret = matches;
	return num_matches;
}
static unsigned
1266 lzx_get_matches_usecache(struct lzx_compressor *c,
1267			 const struct lz_match **matches_ret)
{
1269 struct lz_match *cache_ptr;
1270 struct lz_match *matches;
1271 unsigned num_matches;
1273 cache_ptr = c->cache_ptr;
1274 matches = cache_ptr + 1;
1275 if (cache_ptr <= c->cache_limit) {
1276 num_matches = cache_ptr->len;
1277		c->cache_ptr = matches + num_matches;
	} else {
		num_matches = 0;
	}
1281 c->match_window_pos++;
1282	*matches_ret = matches;
	return num_matches;
}
static unsigned
1287 lzx_get_matches_usecache_nocheck(struct lzx_compressor *c,
1288				 const struct lz_match **matches_ret)
{
1290 struct lz_match *cache_ptr;
1291 struct lz_match *matches;
1292 unsigned num_matches;
1294 cache_ptr = c->cache_ptr;
1295 matches = cache_ptr + 1;
1296 num_matches = cache_ptr->len;
1297 c->cache_ptr = matches + num_matches;
1298 c->match_window_pos++;
1299	*matches_ret = matches;
	return num_matches;
}
static unsigned
1304 lzx_get_matches_nocache_singleblock(struct lzx_compressor *c,
1305				    const struct lz_match **matches_ret)
{
1307 struct lz_match *matches;
1308 unsigned num_matches;
1310 matches = c->cache_ptr;
1311 num_matches = lz_mf_get_matches(c->mf, matches);
1312 c->match_window_pos++;
1313	*matches_ret = matches;
	return num_matches;
}
static unsigned
1318 lzx_get_matches_nocache_multiblock(struct lzx_compressor *c,
1319				   const struct lz_match **matches_ret)
{
1321 struct lz_match *matches;
1322 unsigned num_matches;
1324 matches = c->cache_ptr;
1325 num_matches = lz_mf_get_matches(c->mf, matches);
1326 num_matches = maybe_truncate_matches(matches, num_matches, c);
1327 c->match_window_pos++;
1328	*matches_ret = matches;
	return num_matches;
}
1333 * Find matches at the next position in the window.
1335 * Returns the number of matches found and sets *matches_ret to point to the
1336 * matches array. The matches will be sorted by strictly increasing length and
 * increasing offset.
1339 static inline unsigned
1340 lzx_get_matches(struct lzx_compressor *c,
1341		const struct lz_match **matches_ret)
{
1343	return (*c->get_matches_func)(c, matches_ret);
}
static void
1347 lzx_skip_bytes_fillcache(struct lzx_compressor *c, unsigned n)
{
1349 struct lz_match *cache_ptr;
1351 cache_ptr = c->cache_ptr;
1352 c->match_window_pos += n;
1353 lz_mf_skip_positions(c->mf, n);
1354	if (cache_ptr <= c->cache_limit) {
		do {
			cache_ptr->len = 0;
			cache_ptr += 1;
1358		} while (--n && cache_ptr <= c->cache_limit);
1360		c->cache_ptr = cache_ptr;
	}
}
static void
1364 lzx_skip_bytes_usecache(struct lzx_compressor *c, unsigned n)
{
1366 struct lz_match *cache_ptr;
1368 cache_ptr = c->cache_ptr;
1369 c->match_window_pos += n;
1370	if (cache_ptr <= c->cache_limit) {
		do {
1372			cache_ptr += 1 + cache_ptr->len;
1373		} while (--n && cache_ptr <= c->cache_limit);
1375		c->cache_ptr = cache_ptr;
	}
}
static void
1379 lzx_skip_bytes_usecache_nocheck(struct lzx_compressor *c, unsigned n)
{
1381 struct lz_match *cache_ptr;
1383 cache_ptr = c->cache_ptr;
1384	c->match_window_pos += n;
	do {
1386		cache_ptr += 1 + cache_ptr->len;
	} while (--n);
1388	c->cache_ptr = cache_ptr;
}
static void
1392 lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n)
{
1394	c->match_window_pos += n;
1395	lz_mf_skip_positions(c->mf, n);
}
1399 * Skip the specified number of positions in the window (don't search for
 * matches at them). */
static inline void
1403 lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
{
1405	return (*c->skip_bytes_func)(c, n);
}
1409 * Reverse the linked list of near-optimal matches so that they can be returned
1410 * in forwards order.
1412 * Returns the first match in the list.
1414 static struct lz_match
1415 lzx_match_chooser_reverse_list(struct lzx_compressor *c, unsigned cur_pos)
{
1417 unsigned prev_link, saved_prev_link;
1418 unsigned prev_match_offset, saved_prev_match_offset;
1420 c->optimum_end_idx = cur_pos;
1422 saved_prev_link = c->optimum[cur_pos].prev.link;
1423	saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset;

	do {
1426 prev_link = saved_prev_link;
1427 prev_match_offset = saved_prev_match_offset;
1429 saved_prev_link = c->optimum[prev_link].prev.link;
1430 saved_prev_match_offset = c->optimum[prev_link].prev.match_offset;
1432 c->optimum[prev_link].next.link = cur_pos;
1433 c->optimum[prev_link].next.match_offset = prev_match_offset;
1435 cur_pos = prev_link;
1436 } while (cur_pos != 0);
1438 c->optimum_cur_idx = c->optimum[0].next.link;
1440 return (struct lz_match)
1441 { .len = c->optimum_cur_idx,
1442		  .offset = c->optimum[0].next.match_offset,
		};
}
1447 * Find the longest repeat offset match.
1449 * If no match of at least LZX_MIN_MATCH_LEN bytes is found, then return 0.
1451 * If a match of at least LZX_MIN_MATCH_LEN bytes is found, then return its
1452 * length and set *slot_ret to the index of its offset in @queue.
 */
static u32
1455 lzx_repsearch(const u8 * const strptr, const u32 bytes_remaining,
1456	      const struct lzx_lru_queue *queue, unsigned *slot_ret)
{
1458 BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2);
1459 return lz_repsearch(strptr, bytes_remaining, LZX_MAX_MATCH_LEN,
1460			    queue->R, LZX_NUM_RECENT_OFFSETS, slot_ret);
}
1464 * lzx_choose_near_optimal_item() -
1466 * Choose an approximately optimal match or literal to use at the next position
1467 * in the string, or "window", being LZ-encoded.
1469 * This is based on algorithms used in 7-Zip, including the DEFLATE encoder
1470 * and the LZMA encoder, written by Igor Pavlov.
1472 * Unlike a greedy parser that always takes the longest match, or even a "lazy"
1473 * parser with one match/literal look-ahead like zlib, the algorithm used here
1474 * may look ahead many matches/literals to determine the approximately optimal
1475 * match/literal to code next. The motivation is that the compression ratio is
1476 * improved if the compressor can do things like use a shorter-than-possible
1477 * match in order to allow a longer match later, and also take into account the
1478 * estimated real cost of coding each match/literal based on the underlying
1481 * Still, this is not a true optimal parser for several reasons:
1483 * - Real compression formats use entropy encoding of the literal/match
1484 * sequence, so the real cost of coding each match or literal is unknown until
1485 * the parse is fully determined. It can be approximated based on iterative
1486 * parses, but the end result is not guaranteed to be globally optimal.
1488 * - Very long matches are chosen immediately. This is because locations with
1489 * long matches are likely to have many possible alternatives that would cause
1490 * slow optimal parsing, but also such locations are already highly
1491 * compressible so it is not too harmful to just grab the longest match.
1493 * - Not all possible matches at each location are considered because the
1494 * underlying match-finder limits the number and type of matches produced at
1495 * each position. For example, for a given match length it's usually not
1496 * worth it to consider matches other than the lowest-offset match,
1497 * except in the case of a repeat offset.
1499 * - Although we take into account the adaptive state (in LZX, the recent offset
1500 * queue), coding decisions made with respect to the adaptive state will be
1501 * locally optimal but will not necessarily be globally optimal. This is
1502 * because the algorithm only keeps the least-costly path to get to a given
1503 * location and does not take into account that a slightly more costly path
1504 * could result in a different adaptive state that ultimately results in a
1505 * lower global cost.
1507 * - The array space used by this function is bounded, so in degenerate cases it
1508 * is forced to start returning matches/literals before the algorithm has
 * really finished.
1511 * Each call to this function does one of two things:
1513 * 1. Build a sequence of near-optimal matches/literals, up to some point, that
1514 * will be returned by subsequent calls to this function, then return the
 *    first one.
1519 * 2. Return the next match/literal previously computed by a call to this
 *    function.
1522 * The return value is a (length, offset) pair specifying the match or literal
1523 * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
 */
1525 static struct lz_match
1526 lzx_choose_near_optimal_item(struct lzx_compressor *c)
{
1528 unsigned num_matches;
1529 const struct lz_match *matches;
1530	struct lz_match match;
	unsigned longest_len;
1532	u32 longest_rep_len;
1533	unsigned longest_rep_slot;
	unsigned cur_pos;
	unsigned end_pos;
	u32 cost;
1536 struct lzx_mc_pos_data *optimum = c->optimum;
1538 if (c->optimum_cur_idx != c->optimum_end_idx) {
1539 /* Case 2: Return the next match/literal already found. */
1540		match.len = optimum[c->optimum_cur_idx].next.link -
			    c->optimum_cur_idx;
1542 match.offset = optimum[c->optimum_cur_idx].next.match_offset;
1544		c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link;
		return match;
	}
1548 /* Case 1: Compute a new list of matches/literals to return. */
1550 c->optimum_cur_idx = 0;
1551 c->optimum_end_idx = 0;
1553 /* Search for matches at repeat offsets. As a heuristic, we only keep
1554 * the one with the longest match length. */
1555 if (likely(c->match_window_pos >= 1)) {
1556 longest_rep_len = lzx_repsearch(&c->cur_window[c->match_window_pos],
1557						c->match_window_end - c->match_window_pos,
						&c->queue,
						&longest_rep_slot);
	} else {
1561		longest_rep_len = 0;
	}
1564 /* If there's a long match with a repeat offset, choose it immediately. */
1565 if (longest_rep_len >= c->params.nice_match_length) {
1566 lzx_skip_bytes(c, longest_rep_len);
1567 return (struct lz_match) {
1568 .len = longest_rep_len,
1569			.offset = c->queue.R[longest_rep_slot],
		};
	}
1573 /* Find other matches. */
1574 num_matches = lzx_get_matches(c, &matches);
1576 /* If there's a long match, choose it immediately. */
	if (num_matches) {
1578		longest_len = matches[num_matches - 1].len;
1579 if (longest_len >= c->params.nice_match_length) {
1580 lzx_skip_bytes(c, longest_len - 1);
1581			return matches[num_matches - 1];
		}
	} else {
		longest_len = 1;
	}
1587 /* Calculate the cost to reach the next position by coding a literal. */
1588 optimum[1].queue = c->queue;
1589	optimum[1].cost = lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
					   &c->costs);
1591 optimum[1].prev.link = 0;
1593 /* Calculate the cost to reach any position up to and including that
1594 * reached by the longest match.
1596	 * Note: We consider only the lowest-offset match that reaches each
	 * position.
1599 * Note: Some of the cost calculation stays the same for each offset,
1600 * regardless of how many lengths it gets used for. Therefore, to
1601 * improve performance, we hand-code the cost calculation instead of
1602	 * calling lzx_match_cost() to do a from-scratch cost evaluation at each
	 * length. */
1604 for (unsigned i = 0, len = 2; i < num_matches; i++) {
		u32 offset;
1606		struct lzx_lru_queue queue;
		u32 position_cost;
1608		unsigned position_slot;
1609		unsigned num_extra_bits;

1611		offset = matches[i].offset;
		queue = c->queue;
		position_cost = 0;
1615 position_slot = lzx_get_position_slot(offset, &queue);
1616 num_extra_bits = lzx_get_num_extra_bits(position_slot);
1617 if (num_extra_bits >= 3) {
1618 position_cost += num_extra_bits - 3;
1619 position_cost += c->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
		} else {
1621			position_cost += num_extra_bits;
		}
		do {
1625			unsigned len_header;
1626			unsigned main_symbol;
			u32 cost;

1629			cost = position_cost;
1631 len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1632 main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1633 cost += c->costs.main[main_symbol];
1634 if (len_header == LZX_NUM_PRIMARY_LENS)
1635 cost += c->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1637 optimum[len].queue = queue;
1638 optimum[len].prev.link = 0;
1639 optimum[len].prev.match_offset = offset;
1640 optimum[len].cost = cost;
1641		} while (++len <= matches[i].len);
	}
1643 end_pos = longest_len;
1645 if (longest_rep_len) {
1647 LZX_ASSERT(longest_rep_len >= LZX_MIN_MATCH_LEN);
1651 while (end_pos < longest_rep_len)
1652 optimum[++end_pos].cost = MC_INFINITE_COST;
1654		cost = lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
					 &c->costs);
1656 if (cost <= optimum[longest_rep_len].cost) {
1657 optimum[longest_rep_len].queue = c->queue;
1658 swap(optimum[longest_rep_len].queue.R[0],
1659 optimum[longest_rep_len].queue.R[longest_rep_slot]);
1660 optimum[longest_rep_len].prev.link = 0;
1661 optimum[longest_rep_len].prev.match_offset =
1662 optimum[longest_rep_len].queue.R[0];
1663			optimum[longest_rep_len].cost = cost;
		}
	}
1667 /* Step forward, calculating the estimated minimum cost to reach each
1668 * position. The algorithm may find multiple paths to reach each
1669 * position; only the lowest-cost path is saved.
1671 * The progress of the parse is tracked in the @optimum array, which for
1672 * each position contains the minimum cost to reach that position, the
1673 * index of the start of the match/literal taken to reach that position
1674 * through the minimum-cost path, the offset of the match taken (not
1675 * relevant for literals), and the adaptive state that will exist at
1676 * that position after the minimum-cost path is taken. The @cur_pos
1677 * variable stores the position at which the algorithm is currently
1678 * considering coding choices, and the @end_pos variable stores the
1679	 * greatest position at which the costs of coding choices have been
	 * evaluated.
1682 * The loop terminates when any one of the following conditions occurs:
1684 * 1. A match with length greater than or equal to @nice_match_length is
1685 * found. When this occurs, the algorithm chooses this match
1686 * unconditionally, and consequently the near-optimal match/literal
1687 * sequence up to and including that match is fully determined and it
1688 * can begin returning the match/literal list.
1690 * 2. @cur_pos reaches a position not overlapped by a preceding match.
1691 * In such cases, the near-optimal match/literal sequence up to
1692 * @cur_pos is fully determined and it can begin returning the
1693 * match/literal list.
1695 * 3. Failing either of the above in a degenerate case, the loop
1696 * terminates when space in the @optimum array is exhausted.
1697 * This terminates the algorithm and forces it to start returning
1698 * matches/literals even though they may not be globally optimal.
1700 * Upon loop termination, a nonempty list of matches/literals will have
1701 * been produced and stored in the @optimum array. These
1702 * matches/literals are linked in reverse order, so the last thing this
1703 * function does is reverse this list and return the first
1704 * match/literal, leaving the rest to be returned immediately by
1705	 * subsequent calls to this function. */

	cur_pos = 0;

	for (;;) {
1711		/* Advance to next position. */
		cur_pos++;
1714 /* Check termination conditions (2) and (3) noted above. */
1715 if (cur_pos == end_pos || cur_pos == LZX_OPTIM_ARRAY_LENGTH)
1716 return lzx_match_chooser_reverse_list(c, cur_pos);
1718 /* Search for matches at repeat offsets. Again, as a heuristic
1719 * we only keep the longest one. */
1720 longest_rep_len = lzx_repsearch(&c->cur_window[c->match_window_pos],
1721 c->match_window_end - c->match_window_pos,
1722						&optimum[cur_pos].queue,
						&longest_rep_slot);
1725		/* If we found a long match at a repeat offset, choose it
		 * immediately. */
1727		if (longest_rep_len >= c->params.nice_match_length) {
1728			/* Build the list of matches to return and get
			 * the first one. */
1730			match = lzx_match_chooser_reverse_list(c, cur_pos);
1732 /* Append the long match to the end of the list. */
1733 optimum[cur_pos].next.match_offset =
1734 optimum[cur_pos].queue.R[longest_rep_slot];
1735 optimum[cur_pos].next.link = cur_pos + longest_rep_len;
1736 c->optimum_end_idx = cur_pos + longest_rep_len;
1738 /* Skip over the remaining bytes of the long match. */
1739 lzx_skip_bytes(c, longest_rep_len);
1741			/* Return first match in the list. */
			return match;
		}
		/* Find other matches.  */
		num_matches = lzx_get_matches(c, &matches);

		/* If there's a long match, choose it immediately.  */
		if (num_matches) {
			longest_len = matches[num_matches - 1].len;
			if (longest_len >= c->params.nice_match_length) {
				/* Build the list of matches to return and get
				 * the first one.  */
				match = lzx_match_chooser_reverse_list(c, cur_pos);

				/* Append the long match to the end of the
				 * list.  */
				optimum[cur_pos].next.match_offset =
					matches[num_matches - 1].offset;
				optimum[cur_pos].next.link = cur_pos + longest_len;
				c->optimum_end_idx = cur_pos + longest_len;

				/* Skip over the remaining bytes of the long
				 * match.  */
				lzx_skip_bytes(c, longest_len - 1);

				/* Return first match in the list.  */
				return match;
			}
		} else {
			longest_len = 1;
		}

		/* If we are reaching any positions for the first time, we need
		 * to initialize their costs to infinity.  */
		while (end_pos < cur_pos + longest_len)
			optimum[++end_pos].cost = MC_INFINITE_COST;
		/* Consider coding a literal.  */
		cost = optimum[cur_pos].cost +
			lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
					 &c->costs);
		if (cost < optimum[cur_pos + 1].cost) {
			optimum[cur_pos + 1].queue = optimum[cur_pos].queue;
			optimum[cur_pos + 1].cost = cost;
			optimum[cur_pos + 1].prev.link = cur_pos;
		}
		/* Consider coding a match.
		 *
		 * The hard-coded cost calculation is done for the same reason
		 * stated in the comment for the similar loop earlier.
		 * Actually, it is *this* one that has the biggest effect on
		 * performance; overall LZX compression is > 10% faster with
		 * this code compared to calling lzx_match_cost() with each
		 * length.  */
		for (unsigned i = 0, len = 2; i < num_matches; i++) {
			u32 offset;
			u32 position_cost;
			unsigned position_slot;
			unsigned num_extra_bits;

			offset = matches[i].offset;
			position_cost = optimum[cur_pos].cost;

			/* Yet another optimization: instead of calling
			 * lzx_get_position_slot(), hand-inline the search of
			 * the repeat offset queue.  Then we can omit the
			 * extra_bits calculation for repeat offset matches,
			 * and also only compute the updated queue if we
			 * actually do find a new lowest cost path.  */
			for (position_slot = 0; position_slot < LZX_NUM_RECENT_OFFSETS; position_slot++)
				if (offset == optimum[cur_pos].queue.R[position_slot])
					goto have_position_cost;

			position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);

			num_extra_bits = lzx_get_num_extra_bits(position_slot);
			if (num_extra_bits >= 3) {
				position_cost += num_extra_bits - 3;
				position_cost += c->costs.aligned[
						(offset + LZX_OFFSET_OFFSET) & 7];
			} else {
				position_cost += num_extra_bits;
			}

		have_position_cost:

			do {
				unsigned len_header;
				unsigned main_symbol;
				u32 cost;

				cost = position_cost;

				len_header = min(len - LZX_MIN_MATCH_LEN,
						 LZX_NUM_PRIMARY_LENS);
				main_symbol = ((position_slot << 3) | len_header) +
						LZX_NUM_CHARS;
				cost += c->costs.main[main_symbol];
				if (len_header == LZX_NUM_PRIMARY_LENS) {
					cost += c->costs.len[len -
							     LZX_MIN_MATCH_LEN -
							     LZX_NUM_PRIMARY_LENS];
				}
				if (cost < optimum[cur_pos + len].cost) {
					if (position_slot < LZX_NUM_RECENT_OFFSETS) {
						optimum[cur_pos + len].queue = optimum[cur_pos].queue;
						swap(optimum[cur_pos + len].queue.R[0],
						     optimum[cur_pos + len].queue.R[position_slot]);
					} else {
						optimum[cur_pos + len].queue.R[0] = offset;
						optimum[cur_pos + len].queue.R[1] = optimum[cur_pos].queue.R[0];
						optimum[cur_pos + len].queue.R[2] = optimum[cur_pos].queue.R[1];
					}
					optimum[cur_pos + len].prev.link = cur_pos;
					optimum[cur_pos + len].prev.match_offset = offset;
					optimum[cur_pos + len].cost = cost;
				}
			} while (++len <= matches[i].len);
		}
		/* Consider coding a repeat offset match.
		 *
		 * As a heuristic, we only consider the longest length of the
		 * longest repeat offset match.  This does not, however,
		 * necessarily mean that we will never consider any other
		 * repeat offsets, because above we detect repeat offset
		 * matches that were found by the regular match-finder.
		 * Therefore, this special handling of the longest
		 * repeat-offset match is only helpful for coding a repeat
		 * offset match that was *not* found by the match-finder,
		 * e.g. due to being obscured by a less distant match that is
		 * at least as long.
		 *
		 * Note: an alternative, used in LZMA, is to consider every
		 * length of every repeat offset match.  This is a more
		 * thorough search, and it makes it unnecessary to detect
		 * repeat offset matches that were found by the regular
		 * match-finder.  But by my tests, for LZX the LZMA method
		 * slows down the compressor by ~10% and doesn't actually help
		 * the compression ratio too much.
		 *
		 * Also tested a compromise approach: consider every 3rd length
		 * of the longest repeat offset match.  Still didn't seem quite
		 * worth it, though.  (The queue-update semantics used here are
		 * illustrated in the sketch following this function.)  */
		if (longest_rep_len) {

			LZX_ASSERT(longest_rep_len >= LZX_MIN_MATCH_LEN);

			while (end_pos < cur_pos + longest_rep_len)
				optimum[++end_pos].cost = MC_INFINITE_COST;

			cost = optimum[cur_pos].cost +
				lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
						  &c->costs);
			if (cost <= optimum[cur_pos + longest_rep_len].cost) {
				optimum[cur_pos + longest_rep_len].queue =
					optimum[cur_pos].queue;
				swap(optimum[cur_pos + longest_rep_len].queue.R[0],
				     optimum[cur_pos + longest_rep_len].queue.R[longest_rep_slot]);
				optimum[cur_pos + longest_rep_len].prev.link =
					cur_pos;
				optimum[cur_pos + longest_rep_len].prev.match_offset =
					optimum[cur_pos + longest_rep_len].queue.R[0];
				optimum[cur_pos + longest_rep_len].cost =
					cost;
			}
		}
	}
}
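/*
 * Illustrative sketch only (not called by the compressor): the recent-offsets
 * queue update performed by the swap() logic above.  Using a repeat offset
 * moves it to the front of the queue, with the previous front offset taking
 * its old slot; an explicit offset instead becomes the new R[0] and pushes
 * R[0] and R[1] back (see the R[0..2] assignments above).  The function name
 * is hypothetical; 'struct lzx_lru_queue' is the real type used throughout.
 */
static inline void
lzx_lru_queue_swap_sketch(struct lzx_lru_queue *queue, unsigned slot)
{
	/* Swap the matched slot to the front of the queue.  */
	u32 offset = queue->R[slot];
	queue->R[slot] = queue->R[0];
	queue->R[0] = offset;
}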
/* Lazy (one-step lookahead) match chooser: take the longest match at the
 * current position unless the next position yields a strictly longer one.
 * The small-match filter used below is illustrated in the sketch after this
 * function.  */
static struct lz_match
lzx_choose_lazy_item(struct lzx_compressor *c)
{
	const struct lz_match *matches;
	struct lz_match cur_match;
	struct lz_match next_match;
	u32 num_matches;

	if (c->prev_match.len) {
		cur_match = c->prev_match;
		c->prev_match.len = 0;
	} else {
		num_matches = lzx_get_matches(c, &matches);
		if (num_matches == 0 ||
		    (matches[num_matches - 1].len <= 3 &&
		     (matches[num_matches - 1].len <= 2 ||
		      matches[num_matches - 1].offset > 4096)))
			return (struct lz_match) { };

		cur_match = matches[num_matches - 1];
	}

	if (cur_match.len >= c->params.nice_match_length) {
		lzx_skip_bytes(c, cur_match.len - 1);
		return cur_match;
	}

	num_matches = lzx_get_matches(c, &matches);
	if (num_matches == 0 ||
	    (matches[num_matches - 1].len <= 3 &&
	     (matches[num_matches - 1].len <= 2 ||
	      matches[num_matches - 1].offset > 4096)))
	{
		lzx_skip_bytes(c, cur_match.len - 2);
		return cur_match;
	}

	next_match = matches[num_matches - 1];

	if (next_match.len <= cur_match.len) {
		lzx_skip_bytes(c, cur_match.len - 2);
		return cur_match;
	} else {
		c->prev_match = next_match;
		return (struct lz_match) { };
	}
}
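/*
 * Illustrative sketch only: the filter lzx_choose_lazy_item() applies to the
 * longest match returned by the match-finder.  Length-2 matches are rejected,
 * and length-3 matches are additionally rejected when the offset exceeds
 * 4096, since their encoded cost tends to exceed that of coding the bytes as
 * literals.  The 4096 threshold is the heuristic hard-coded above, not a
 * format requirement; the function name is hypothetical, and 'bool' is
 * assumed available as elsewhere in wimlib.
 */
static inline bool
lzx_lazy_match_usable_sketch(const struct lz_match *match)
{
	return !(match->len <= 3 &&
		 (match->len <= 2 || match->offset > 4096));
}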
/*
 * Return the next match or literal to use, delegating to the currently
 * selected match-choosing algorithm.
 *
 * If the length of the returned 'struct lz_match' is less than
 * LZX_MIN_MATCH_LEN, then it is really a literal.
 */
static inline struct lz_match
lzx_choose_item(struct lzx_compressor *c)
{
	return (*c->params.choose_item_func)(c);
}
/* Set default symbol costs for the LZX Huffman codes.  */
static void
lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
{
	unsigned i;

	/* Main code (part 1): Literal symbols  */
	for (i = 0; i < LZX_NUM_CHARS; i++)
		costs->main[i] = 8;

	/* Main code (part 2): Match header symbols  */
	for (; i < num_main_syms; i++)
		costs->main[i] = 10;

	/* Length code  */
	for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
		costs->len[i] = 8;

	/* Aligned offset code  */
	for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
		costs->aligned[i] = 3;
}
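/*
 * Worked example of the default model above: a literal is estimated at 8
 * bits, as if literals were coded uniformly, while a match costs at least 10
 * bits for its main symbol plus any offset extra bits and aligned bits.  So
 * before real statistics exist, a length-2 match at a distant offset (many
 * extra bits) can be estimated as costlier than the 16 bits of two literals
 * and is avoided, while the same match at a near offset is preferred.
 */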
/* Given the frequencies of symbols in an LZX-compressed block and the
 * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
 * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
 * will take fewer bits to output.  */
static int
lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
			       const struct lzx_codes * codes)
{
	unsigned aligned_cost = 0;
	unsigned verbatim_cost = 0;

	/* Verbatim blocks have a constant 3 bits per position footer.  Aligned
	 * offset blocks have an aligned offset symbol per position footer,
	 * plus an extra 24 bits per block to output the lengths necessary to
	 * reconstruct the aligned offset code itself.  */
	for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
		verbatim_cost += 3 * freqs->aligned[i];
		aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
	}
	aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
	if (aligned_cost < verbatim_cost)
		return LZX_BLOCKTYPE_ALIGNED;
	else
		return LZX_BLOCKTYPE_VERBATIM;
}
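/*
 * Worked example (made-up frequencies): with 1000 position footers whose
 * aligned symbols average 2.5 bits under the computed code, aligned_cost is
 * 2500 + 24 = 2524 bits versus verbatim_cost = 3 * 1000 = 3000 bits, so the
 * block is emitted as LZX_BLOCKTYPE_ALIGNED.  The fixed 24 = 3 * 8 bits
 * account for the eight 3-bit code lengths in the aligned block header.
 */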
/* Find a sequence of matches/literals with which to output the specified LZX
 * block, then set the block's type to that which has the minimum cost to
 * output (either verbatim or aligned).  */
static void
lzx_choose_items_for_block(struct lzx_compressor *c, struct lzx_block_spec *spec)
{
	const struct lzx_lru_queue orig_queue = c->queue;
	u32 num_passes_remaining = c->params.num_optim_passes;
	struct lzx_freqs freqs;
	const u8 *window_ptr;
	const u8 *window_end;
	struct lzx_item *next_chosen_item;
	struct lz_match lz_match;
	struct lzx_item lzx_item;

	LZX_ASSERT(num_passes_remaining >= 1);
	LZX_ASSERT(lz_mf_get_position(c->mf) == spec->window_pos);

	c->match_window_end = spec->window_pos + spec->block_size;

	if (c->params.num_optim_passes > 1) {
		if (spec->block_size == c->cur_window_size)
			c->get_matches_func = lzx_get_matches_fillcache_singleblock;
		else
			c->get_matches_func = lzx_get_matches_fillcache_multiblock;
		c->skip_bytes_func = lzx_skip_bytes_fillcache;
	} else {
		if (spec->block_size == c->cur_window_size)
			c->get_matches_func = lzx_get_matches_nocache_singleblock;
		else
			c->get_matches_func = lzx_get_matches_nocache_multiblock;
		c->skip_bytes_func = lzx_skip_bytes_nocache;
	}

	/* The first optimal parsing pass is done using the cost model already
	 * set in c->costs.  Each later pass is done using a cost model
	 * computed from the previous pass.  (An example of the pass structure
	 * is given after this function.)
	 *
	 * To improve performance we only generate the array containing the
	 * matches and literals in intermediate form on the final pass.  */

	while (--num_passes_remaining) {
		c->match_window_pos = spec->window_pos;
		c->cache_ptr = c->cached_matches;
		memset(&freqs, 0, sizeof(freqs));
		window_ptr = &c->cur_window[spec->window_pos];
		window_end = window_ptr + spec->block_size;

		while (window_ptr != window_end) {

			lz_match = lzx_choose_item(c);

			LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
				     lz_match.offset == c->max_window_size -
							LZX_MIN_MATCH_LEN));
			if (lz_match.len >= LZX_MIN_MATCH_LEN) {
				lzx_tally_match(lz_match.len, lz_match.offset,
						&freqs, &c->queue);
				window_ptr += lz_match.len;
			} else {
				lzx_tally_literal(*window_ptr, &freqs);
				window_ptr += 1;
			}
		}

		lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
		lzx_set_costs(c, &spec->codes.lens, 15);
		c->queue = orig_queue;
		if (c->cache_ptr <= c->cache_limit) {
			c->get_matches_func = lzx_get_matches_usecache_nocheck;
			c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
		} else {
			c->get_matches_func = lzx_get_matches_usecache;
			c->skip_bytes_func = lzx_skip_bytes_usecache;
		}
	}

	c->match_window_pos = spec->window_pos;
	c->cache_ptr = c->cached_matches;
	memset(&freqs, 0, sizeof(freqs));
	window_ptr = &c->cur_window[spec->window_pos];
	window_end = window_ptr + spec->block_size;

	spec->chosen_items = &c->chosen_items[spec->window_pos];
	next_chosen_item = spec->chosen_items;

	unsigned unseen_cost = 9;
	while (window_ptr != window_end) {

		lz_match = lzx_choose_item(c);

		LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
			     lz_match.offset == c->max_window_size -
						LZX_MIN_MATCH_LEN));
		if (lz_match.len >= LZX_MIN_MATCH_LEN) {
			lzx_item.data = lzx_tally_match(lz_match.len,
							lz_match.offset,
							&freqs, &c->queue);
			window_ptr += lz_match.len;
		} else {
			lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
			window_ptr += 1;
		}
		*next_chosen_item++ = lzx_item;

		/* When doing one-pass "near-optimal" parsing, update the cost
		 * model occasionally.  */
		if (unlikely((next_chosen_item - spec->chosen_items) % 2048 == 0) &&
		    c->params.choose_item_func == lzx_choose_near_optimal_item &&
		    c->params.num_optim_passes == 1)
		{
			lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
			lzx_set_costs(c, &spec->codes.lens, unseen_cost);
			if (unseen_cost < 15)
				unseen_cost++;
		}
	}

	spec->num_chosen_items = next_chosen_item - spec->chosen_items;
	lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
	spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
}
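/*
 * Example of the pass structure above: with num_optim_passes == 2, the block
 * is parsed once with the incoming cost model purely to gather symbol
 * frequencies, Huffman codes are rebuilt from those frequencies, and the
 * second (final) pass re-parses with the refined costs while recording the
 * chosen items.  Cached match-finder results make the later passes cheap.
 */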
/* Prepare the input window into one or more LZX blocks ready to be output.
 * (A worked example of the block splitting follows this function.)  */
static void
lzx_prepare_blocks(struct lzx_compressor *c)
{
	/* Set up a default cost model.  */
	if (c->params.choose_item_func == lzx_choose_near_optimal_item)
		lzx_set_default_costs(&c->costs, c->num_main_syms);

	/* Set up the block specifications.
	 * TODO: The compression ratio could be slightly improved by performing
	 * data-dependent block splitting instead of using fixed-size blocks.
	 * Doing so well is a computationally hard problem, however.  */
	c->num_blocks = DIV_ROUND_UP(c->cur_window_size, LZX_DIV_BLOCK_SIZE);
	for (unsigned i = 0; i < c->num_blocks; i++) {
		u32 pos = LZX_DIV_BLOCK_SIZE * i;
		c->block_specs[i].window_pos = pos;
		c->block_specs[i].block_size = min(c->cur_window_size - pos,
						   LZX_DIV_BLOCK_SIZE);
	}

	/* Load the window into the match-finder.  */
	lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);

	/* Determine sequence of matches/literals to output for each block.  */
	lzx_lru_queue_init(&c->queue);
	c->optimum_cur_idx = 0;
	c->optimum_end_idx = 0;
	c->prev_match.len = 0;
	for (unsigned i = 0; i < c->num_blocks; i++)
		lzx_choose_items_for_block(c, &c->block_specs[i]);
}
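/*
 * Worked example of the fixed-size splitting above, assuming
 * LZX_DIV_BLOCK_SIZE is 32768: a 100000-byte window yields
 * DIV_ROUND_UP(100000, 32768) = 4 block specs covering 32768, 32768, 32768,
 * and 1696 bytes at window positions 0, 32768, 65536, and 98304.
 */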
/* Build the compressor parameters appropriate for the requested compression
 * level and window size.  */
static void
lzx_build_params(unsigned int compression_level,
		 u32 max_window_size,
		 struct lzx_compressor_params *lzx_params)
{
	if (compression_level < 25) {
		lzx_params->choose_item_func = lzx_choose_lazy_item;
		lzx_params->num_optim_passes = 1;
		if (max_window_size <= 262144)
			lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
		else
			lzx_params->mf_algo = LZ_MF_BINARY_TREES;
		lzx_params->min_match_length = 3;
		lzx_params->nice_match_length = 25 + compression_level * 2;
		lzx_params->max_search_depth = 25 + compression_level;
	} else {
		lzx_params->choose_item_func = lzx_choose_near_optimal_item;
		lzx_params->num_optim_passes = compression_level / 20;
		if (max_window_size <= 32768 && lzx_params->num_optim_passes == 1)
			lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
		else
			lzx_params->mf_algo = LZ_MF_BINARY_TREES;
		lzx_params->min_match_length = (compression_level >= 45) ? 2 : 3;
		lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50,
						    LZX_MAX_MATCH_LEN);
		lzx_params->max_search_depth = min(((u64)compression_level * 50) / 50,
						   LZX_MAX_MATCH_LEN);
	}
}
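/*
 * Worked example of the tuning above: at level 50, the near-optimal chooser
 * is selected with num_optim_passes = 50 / 20 = 2, min_match_length = 2,
 * nice_match_length = min(50 * 32 / 50, LZX_MAX_MATCH_LEN) = 32, and
 * max_search_depth = 50.  At level 20, the lazy chooser is selected instead,
 * with nice_match_length = 25 + 20 * 2 = 65 and max_search_depth = 45.
 */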
static void
lzx_build_mf_params(const struct lzx_compressor_params *lzx_params,
		    u32 max_window_size, struct lz_mf_params *mf_params)
{
	memset(mf_params, 0, sizeof(*mf_params));

	mf_params->algorithm = lzx_params->mf_algo;
	mf_params->max_window_size = max_window_size;
	mf_params->min_match_len = lzx_params->min_match_length;
	mf_params->max_match_len = LZX_MAX_MATCH_LEN;
	mf_params->max_search_depth = lzx_params->max_search_depth;
	mf_params->nice_match_len = lzx_params->nice_match_length;
}

static void
lzx_free_compressor(void *_c);
static u64
lzx_get_needed_memory(size_t max_block_size, unsigned int compression_level)
{
	struct lzx_compressor_params params;
	u64 size = 0;
	unsigned window_order;
	u32 max_window_size;

	window_order = lzx_get_window_order(max_block_size);
	if (window_order == 0)
		return 0;
	max_window_size = max_block_size;

	lzx_build_params(compression_level, max_window_size, &params);

	size += sizeof(struct lzx_compressor);

	size += max_window_size;

	size += DIV_ROUND_UP(max_window_size, LZX_DIV_BLOCK_SIZE) *
		sizeof(struct lzx_block_spec);

	size += max_window_size * sizeof(struct lzx_item);

	size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
	if (params.choose_item_func == lzx_choose_near_optimal_item) {
		size += (LZX_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
			sizeof(struct lzx_mc_pos_data);
	}
	if (params.num_optim_passes > 1)
		size += LZX_CACHE_LEN * sizeof(struct lz_match);
	else
		size += LZX_MAX_MATCHES_PER_POS * sizeof(struct lz_match);

	return size;
}
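/*
 * Note on the accounting above: the dominant terms are the window copy
 * (max_window_size bytes), the chosen_items array (max_window_size *
 * sizeof(struct lzx_item)), and the match-finder, whose size is reported by
 * lz_mf_get_needed_memory().  The optimum array and the match cache are only
 * charged for the parameter sets that actually allocate them below.
 */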
static int
lzx_create_compressor(size_t max_block_size, unsigned int compression_level,
		      void **c_ret)
{
	struct lzx_compressor *c;
	struct lzx_compressor_params params;
	struct lz_mf_params mf_params;
	unsigned window_order;
	u32 max_window_size;

	window_order = lzx_get_window_order(max_block_size);
	if (window_order == 0)
		return WIMLIB_ERR_INVALID_PARAM;
	max_window_size = max_block_size;

	lzx_build_params(compression_level, max_window_size, &params);
	lzx_build_mf_params(&params, max_window_size, &mf_params);
	if (!lz_mf_params_valid(&mf_params))
		return WIMLIB_ERR_INVALID_PARAM;

	c = CALLOC(1, sizeof(struct lzx_compressor));
	if (!c)
		goto oom;

	c->params = params;
	c->num_main_syms = lzx_get_num_main_syms(window_order);
	c->max_window_size = max_window_size;
	c->window_order = window_order;

	c->cur_window = ALIGNED_MALLOC(max_window_size, 16);
	if (!c->cur_window)
		goto oom;

	c->block_specs = MALLOC(DIV_ROUND_UP(max_window_size,
					     LZX_DIV_BLOCK_SIZE) *
				sizeof(struct lzx_block_spec));
	if (!c->block_specs)
		goto oom;

	c->chosen_items = MALLOC(max_window_size * sizeof(struct lzx_item));
	if (!c->chosen_items)
		goto oom;

	c->mf = lz_mf_alloc(&mf_params);
	if (!c->mf)
		goto oom;

	if (params.choose_item_func == lzx_choose_near_optimal_item) {
		c->optimum = MALLOC((LZX_OPTIM_ARRAY_LENGTH +
				     params.nice_match_length) *
				    sizeof(struct lzx_mc_pos_data));
		if (!c->optimum)
			goto oom;
	}

	if (params.num_optim_passes > 1) {
		c->cached_matches = MALLOC(LZX_CACHE_LEN *
					   sizeof(struct lz_match));
		if (!c->cached_matches)
			goto oom;
		c->cache_limit = c->cached_matches + LZX_CACHE_LEN -
				 (LZX_MAX_MATCHES_PER_POS + 1);
	} else {
		c->cached_matches = MALLOC(LZX_MAX_MATCHES_PER_POS *
					   sizeof(struct lz_match));
		if (!c->cached_matches)
			goto oom;
	}

	*c_ret = c;
	return 0;

oom:
	lzx_free_compressor(c);
	return WIMLIB_ERR_NOMEM;
}
static size_t
lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
	     void *compressed_data, size_t compressed_size_avail, void *_c)
{
	struct lzx_compressor *c = _c;
	struct lzx_output_bitstream os;

	/* Don't bother compressing very small inputs.  */
	if (uncompressed_size < 100)
		return 0;

	/* The input data must be preprocessed.  To avoid changing the original
	 * input, copy it to a temporary buffer.  */
	memcpy(c->cur_window, uncompressed_data, uncompressed_size);
	c->cur_window_size = uncompressed_size;

	/* Preprocess the data.  */
	lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size);

	/* Prepare the compressed data.  */
	lzx_prepare_blocks(c);

	/* Generate the compressed data and return its size, or 0 if an
	 * overflow occurred.  */
	lzx_init_output(&os, compressed_data, compressed_size_avail);
	lzx_write_all_blocks(c, &os);
	return lzx_flush_output(&os);
}
static void
lzx_free_compressor(void *_c)
{
	struct lzx_compressor *c = _c;

	if (c) {
		ALIGNED_FREE(c->cur_window);
		FREE(c->block_specs);
		FREE(c->chosen_items);
		lz_mf_free(c->mf);
		FREE(c->optimum);
		FREE(c->cached_matches);
		FREE(c);
	}
}
const struct compressor_ops lzx_compressor_ops = {
	.get_needed_memory = lzx_get_needed_memory,
	.create_compressor = lzx_create_compressor,
	.compress	   = lzx_compress,
	.free_compressor   = lzx_free_compressor,
};
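/*
 * Hedged usage sketch (not part of wimlib itself, hence guarded out): how a
 * caller might drive the ops table above directly.  Real callers should use
 * the public wimlib compressor API instead; the function name is
 * hypothetical, level 50 is an assumed compression level, and error handling
 * is abbreviated.
 */
#if 0
static size_t
compress_with_lzx_ops_sketch(const void *in, size_t in_size,
			     void *out, size_t out_avail)
{
	void *c;
	size_t csize = 0;

	/* create_compressor() returns 0 on success, per the code above.  */
	if (lzx_compressor_ops.create_compressor(in_size, 50, &c) == 0) {
		/* compress() returns the compressed size, or 0 on overflow. */
		csize = lzx_compressor_ops.compress(in, in_size,
						    out, out_avail, c);
		lzx_compressor_ops.free_compressor(c);
	}
	return csize;
}
#endif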