4 * A compressor for the LZMS compression format.
8 * Copyright (C) 2013, 2014, 2015 Eric Biggers
10 * This file is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU Lesser General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option) any
15 * This file is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this file; if not, see http://www.gnu.org/licenses/.
31 #include "wimlib/compress_common.h"
32 #include "wimlib/compressor_ops.h"
33 #include "wimlib/error.h"
34 #include "wimlib/lcpit_matchfinder.h"
35 #include "wimlib/lz_extend.h"
36 #include "wimlib/lz_hash.h"
37 #include "wimlib/lzms_common.h"
38 #include "wimlib/unaligned.h"
39 #include "wimlib/util.h"
42 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
43 * looked up directly in 'fast_length_slot_tab' and the length cost can be
44 * looked up directly in 'fast_length_cost_tab'.
46 * We also limit the 'nice_match_len' parameter to this value. Consequently, if
47 * the longest match found is shorter than 'nice_match_len', then it must also
48 * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups
49 * of length costs using 'fast_length_cost_tab' without having to keep checking
50 * whether the length exceeds MAX_FAST_LENGTH or not.
52 #define MAX_FAST_LENGTH 255
54 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
55 * step forward before forcing the pending items to be encoded. If this value
56 * is increased, then there will be fewer forced flushes, but the probability
57 * entries and Huffman codes will be more likely to become outdated. */
58 #define NUM_OPTIM_NODES 2048
60 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
61 * bit costs. A single bit has a cost of (1 << COST_SHIFT). */
62 #define COST_SHIFT 6
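/*
 * Added illustration (not part of the original file): with COST_SHIFT == 6, a
 * whole bit costs 64 scaled units, so a bit whose predicted probability is 3/4
 * costs about -log2(0.75) * 64 ~= 27 scaled units.  A hypothetical helper that
 * computes such a scaled cost directly might look like:
 */
#if 0
#include <math.h>
static u32
example_scaled_bit_cost(double prob) /* hypothetical helper, for illustration */
{
	return (u32)round(-log2(prob) * (1 << COST_SHIFT));
}
#endif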
64 /* Length of the hash table for finding delta matches */
65 #define DELTA_HASH_ORDER 17
66 #define DELTA_HASH_LENGTH ((u32)1 << DELTA_HASH_ORDER)
68 /* The number of bytes to hash when finding delta matches; also taken to be the
69 * minimum length of an explicit offset delta match */
70 #define NBYTES_HASHED_FOR_DELTA 3
72 /* The number of delta match powers to consider (must be <=
73 * LZMS_NUM_DELTA_POWER_SYMS) */
74 #define NUM_POWERS_TO_CONSIDER 6
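/* Added note: since a delta match's span is ((u32)1 << power), considering
 * powers 0 through NUM_POWERS_TO_CONSIDER - 1 means considering spans of 1, 2,
 * 4, 8, 16, and 32 bytes. */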
76 /* This structure tracks the state of writing bits as a series of 16-bit coding
77 * units, starting at the end of the output buffer and proceeding backwards. */
78 struct lzms_output_bitstream {
80 /* Bits that haven't yet been written to the output buffer */
83 /* Number of bits currently held in @bitbuf */
86 /* Pointer to one past the next position in the output buffer at which
87 * to output a 16-bit coding unit */
90 /* Pointer to the beginning of the output buffer (this is the "end" when
91 * writing backwards!) */
95 /* This structure tracks the state of range encoding and its output, which
96 * starts at the beginning of the output buffer and proceeds forwards. */
97 struct lzms_range_encoder {
99 /* The lower boundary of the current range. Logically, this is a 33-bit
100 * integer whose high bit is needed to detect carries. */
103 /* The size of the current range */
106 /* The next 16-bit coding unit to output */
109 /* The number of 16-bit coding units whose output has been delayed due
110 * to possible carrying. The first such coding unit is @cache; all
111 * subsequent such coding units are 0xffff. */
114 /* Pointer to the beginning of the output buffer */
117 /* Pointer to the position in the output buffer at which the next coding
118 * unit must be written */
121 /* Pointer to just past the end of the output buffer */
125 /* Bookkeeping information for an adaptive Huffman code */
126 struct lzms_huffman_rebuild_info {
128 /* The remaining number of symbols to encode until this code must be
129 * rebuilt */
130 unsigned num_syms_until_rebuild;
132 /* The number of symbols in this code */
135 /* The rebuild frequency of this code, in symbols */
136 unsigned rebuild_freq;
138 /* The Huffman codeword of each symbol in this code */
141 /* The length of each Huffman codeword, in bits */
144 /* The frequency of each symbol in this code */
149 * The compressor-internal representation of a match or literal.
151 * Literals have length=1; matches have length > 1. (We disallow matches of
152 * length 1, even though this is a valid length in LZMS.)
154 * The source is encoded as follows:
156 * - Literals: the literal byte itself
157 * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
158 * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
159 * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
160 * the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
161 * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
162 * the index of the (power, raw_offset) pair in recent_delta_pairs
169 #define DELTA_SOURCE_TAG ((u32)1 << 31)
170 #define DELTA_SOURCE_POWER_SHIFT 28
171 #define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
174 check_that_powers_fit_in_bitfield(void)
176 STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
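/*
 * Added worked example: an explicit offset delta match with power == 2 and raw
 * offset 1000 is represented as
 *
 *	DELTA_SOURCE_TAG | (2 << DELTA_SOURCE_POWER_SHIFT) |
 *		(1000 + LZMS_NUM_DELTA_REPS - 1)
 *
 * mirroring how 'pair' and 'source' are built in lzms_near_optimal_parse()
 * below, whereas a repeat offset LZ match at queue index 1 is represented
 * simply as source == 1.
 */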
179 /* A stripped-down version of the adaptive state in LZMS which excludes the
180 * probability entries and Huffman codes */
181 struct lzms_adaptive_state {
183 /* Recent offsets for LZ matches */
184 u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
185 u32 prev_lz_offset; /* 0 means none */
186 u32 upcoming_lz_offset; /* 0 means none */
188 /* Recent (power, raw offset) pairs for delta matches.
189 * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
190 * offset, and the high bits are the power. */
191 u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
192 u32 prev_delta_pair; /* 0 means none */
193 u32 upcoming_delta_pair; /* 0 means none */
195 /* States for predicting the probabilities of item types */
199 u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
201 u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
202 } _aligned_attribute(64);
205 * This structure represents a byte position in the preprocessed input data and
206 * a node in the graph of possible match/literal choices.
208 * Logically, each incoming edge to this node is labeled with a literal or a
209 * match that can be taken to reach this position from an earlier position; and
210 * each outgoing edge from this node is labeled with a literal or a match that
211 * can be taken to advance from this position to a later position.
213 struct lzms_optimum_node {
216 * The cost of the lowest-cost path that has been found to reach this
217 * position. This can change as progressively lower cost paths are
218 * found to reach this position.
221 #define INFINITE_COST UINT32_MAX
224 * @item is the last item that was taken to reach this position with the
225 * stored @cost. This can change as progressively lower
226 * cost paths are found to reach this position.
228 * In some cases we look ahead more than one item. If we looked ahead n
229 * items to reach this position, then @item is the last item taken,
230 * @extra_items contains the other items ordered from second-to-last to
231 * first, and @num_extra_items is n - 1.
233 unsigned num_extra_items;
234 struct lzms_item item;
235 struct lzms_item extra_items[2];
238 * The adaptive state that exists at this position. This is filled in
239 * lazily, only after the minimum-cost path to this position is found.
241 * Note: the way the algorithm handles this adaptive state in the
242 * "minimum-cost" parse is actually only an approximation. It's
243 * possible for the globally optimal, minimum cost path to contain a
244 * prefix, ending at a position, where that path prefix is *not* the
245 * minimum cost path to that position. This can happen if such a path
246 * prefix results in a different adaptive state which results in lower
247 * costs later. Although the algorithm does do some heuristic
248 * multi-item lookaheads, it does not solve this problem in general.
250 * Note: this adaptive state structure also does not include the
251 * probability entries or current Huffman codewords. Those aren't
252 * maintained per-position and are only updated occasionally. */
254 struct lzms_adaptive_state state;
255 } _aligned_attribute(64);
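/*
 * Added sketch (simplified from the real parser below, not a drop-in helper):
 * the core relaxation step of the minimum-cost path search updates a later
 * node whenever reaching it through the current node is cheaper:
 */
#if 0
static void
example_relax_edge(struct lzms_optimum_node *cur_node, u32 len, u32 source,
		   u32 edge_cost) /* hypothetical helper, for illustration */
{
	u32 cost = cur_node->cost + edge_cost;
	if (cost < (cur_node + len)->cost) {
		(cur_node + len)->cost = cost;
		(cur_node + len)->item = (struct lzms_item) {
			.length = len,
			.source = source,
		};
		(cur_node + len)->num_extra_items = 0;
	}
}
#endif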
257 /* The main compressor structure */
258 struct lzms_compressor {
260 /* The matchfinder for LZ matches */
261 struct lcpit_matchfinder mf;
263 /* The preprocessed buffer of data being compressed */
266 /* The number of bytes of data to be compressed, which is the number of
267 * bytes of data in @in_buffer that are actually valid */
271 * Boolean flags to enable consideration of various types of multi-step
272 * operations during parsing.
274 * Among other cases, multi-step operations can help with gaps where two
275 * matches are separated by a non-matching byte.
277 * This idea is borrowed from Igor Pavlov's LZMA encoder.
280 bool try_lzrep_lit_lzrep0;
281 bool try_lzmatch_lit_lzrep0;
284 * If true, the compressor can use delta matches. This slows down
285 * compression. It improves the compression ratio greatly, slightly, or
286 * not at all, depending on the input data.
288 bool use_delta_matches;
290 /* If true, the compressor need not preserve the input buffer if it
291 * compresses the data successfully. */
294 /* 'last_target_usages' is a large array that is only needed for
295 * preprocessing, so it is in union with fields that don't need to be
296 * initialized until after preprocessing. */
300 /* Temporary space to store matches found by the LZ matchfinder */
301 struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];
303 /* Hash table for finding delta matches */
304 u32 delta_hash_table[DELTA_HASH_LENGTH];
306 /* For each delta power, the hash code for the next sequence */
307 u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
309 /* The per-byte graph nodes for near-optimal parsing */
310 struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
311 1 + MAX_FAST_LENGTH];
313 /* Table: length => current cost for small match lengths */
314 u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
316 /* Range encoder which outputs to the beginning of the compressed data
317 * buffer, proceeding forwards */
318 struct lzms_range_encoder rc;
320 /* Bitstream which outputs to the end of the compressed data buffer,
321 * proceeding backwards */
322 struct lzms_output_bitstream os;
324 /* States and probability entries for item type disambiguation */
326 unsigned match_state;
328 unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
329 unsigned delta_state;
330 unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
331 struct lzms_probabilites probs;
335 struct lzms_huffman_rebuild_info literal_rebuild_info;
336 u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
337 u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
338 u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];
340 struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
341 u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
342 u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
343 u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
345 struct lzms_huffman_rebuild_info length_rebuild_info;
346 u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
347 u8 length_lens[LZMS_NUM_LENGTH_SYMS];
348 u32 length_freqs[LZMS_NUM_LENGTH_SYMS];
350 struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
351 u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
352 u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
353 u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
355 struct lzms_huffman_rebuild_info delta_power_rebuild_info;
356 u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
357 u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
358 u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];
362 s32 last_target_usages[65536];
366 /* Table: length => length slot for small match lengths */
367 u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];
369 /* Tables for mapping offsets to offset slots */
371 /* slots [0, 167); 0 <= num_extra_bits <= 10 */
372 u8 offset_slot_tab_1[0xe4a5];
374 /* slots [167, 427); 11 <= num_extra_bits <= 15 */
375 u16 offset_slot_tab_2[0x3d0000 >> 11];
377 /* slots [427, 799); 16 <= num_extra_bits */
378 u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
381 /******************************************************************************
382 * Offset and length slot acceleration *
383 ******************************************************************************/
385 /* Generate the acceleration table for length slots. */
387 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
389 unsigned slot = 0;
390 for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
391 if (len >= lzms_length_slot_base[slot + 1])
392 slot++;
393 c->fast_length_slot_tab[len] = slot;
397 /* Generate the acceleration tables for offset slots. */
399 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
401 unsigned slot = 0;
402 u32 offset;
404 /* slots [0, 167); 0 <= num_extra_bits <= 10 */
405 for (offset = 1; offset < 0xe4a5; offset++) {
406 if (offset >= lzms_offset_slot_base[slot + 1])
407 slot++;
408 c->offset_slot_tab_1[offset] = slot;
411 /* slots [167, 427); 11 <= num_extra_bits <= 15 */
412 for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
413 if (offset >= lzms_offset_slot_base[slot + 1])
414 slot++;
415 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
418 /* slots [427, 799); 16 <= num_extra_bits */
419 for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
420 if (offset >= lzms_offset_slot_base[slot + 1])
421 slot++;
422 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
427 * Return the length slot for the specified match length, using the compressor's
428 * acceleration table if the length is small enough.
430 static inline unsigned
431 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
433 if (likely(length <= MAX_FAST_LENGTH))
434 return c->fast_length_slot_tab[length];
435 return lzms_get_length_slot(length);
439 * Return the offset slot for the specified match offset, using the compressor's
440 * acceleration tables to speed up the mapping.
442 static inline unsigned
443 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
445 if (offset < 0xe4a5)
446 return c->offset_slot_tab_1[offset];
447 offset -= 0xe4a5;
448 if (offset < 0x3d0000)
449 return c->offset_slot_tab_2[offset >> 11];
450 return c->offset_slot_tab_3[offset >> 16];
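/* Added worked example: offset 0x12345 is >= 0xe4a5, so it is first reduced to
 * 0x12345 - 0xe4a5 == 0x3ea0, which is < 0x3d0000; the slot therefore comes
 * from offset_slot_tab_2[0x3ea0 >> 11] == offset_slot_tab_2[7]. */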
453 /******************************************************************************
455 ******************************************************************************/
458 * Initialize the range encoder @rc to write forwards to the specified buffer
459 * @out that is @count 16-bit integers long.
462 lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
464 rc->lower_bound = 0;
465 rc->range_size = 0xffffffff;
466 rc->cache = 0;
467 rc->cache_size = 1;
468 rc->begin = out;
469 rc->next = out - 1;
470 rc->end = out + count;
474 * Attempt to flush bits from the range encoder.
476 * The basic idea is that we're writing bits from @rc->lower_bound to the
477 * output. However, due to carrying, the writing of coding units with the
478 * maximum value, as well as one prior coding unit, must be delayed until it is
479 * determined whether a carry is needed.
481 * This is based on the public domain code for LZMA written by Igor Pavlov, but
482 * with the following differences:
484 * - In LZMS, 16-bit coding units are required rather than 8-bit.
486 * - In LZMS, the first coding unit is not ignored by the decompressor, so
487 * the encoder cannot output a dummy value to that position.
490 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
492 if ((u32)(rc->lower_bound) < 0xffff0000 ||
493 (u32)(rc->lower_bound >> 32) != 0)
495 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
496 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
497 * is 1). */
498 do {
499 if (likely(rc->next >= rc->begin)) {
500 if (rc->next != rc->end) {
501 put_unaligned_u16_le(rc->cache +
502 (u16)(rc->lower_bound >> 32),
503 rc->next++);
504 }
505 } else {
506 rc->next++;
507 }
508 rc->cache = 0xffff;
509 } while (--rc->cache_size != 0);
511 rc->cache = (rc->lower_bound >> 16) & 0xffff;
512 }
513 rc->cache_size++;
514 rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
515 }
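/* Added numeric sketch: suppose cache == 0x1234 and two 0xffff coding units
 * are pending (cache_size == 3).  If a carry out of the low 32 bits of
 * lower_bound occurs, the loop above emits 0x1235, 0x0000, 0x0000 -- the carry
 * propagates through the 0xffff units; with no carry it emits 0x1234, 0xffff,
 * 0xffff unchanged. */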
518 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
520 for (int i = 0; i < 4; i++)
521 lzms_range_encoder_shift_low(rc);
522 return rc->next != rc->end;
526 * Encode the next bit using the range encoder.
528 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
529 * bit is 0 rather than 1.
532 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
534 /* Normalize if needed. */
535 if (rc->range_size <= 0xffff) {
536 rc->range_size <<= 16;
537 lzms_range_encoder_shift_low(rc);
540 u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
541 if (bit == 0) {
542 rc->range_size = bound;
543 } else {
544 rc->lower_bound += bound;
545 rc->range_size -= bound;
546 }
547 }
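/* Added numeric sketch, assuming LZMS_PROBABILITY_DENOMINATOR == 64 (as the
 * cost table below indicates): with range_size == 0x10000000 and prob == 48,
 * bound == (0x10000000 >> LZMS_PROBABILITY_BITS) * 48 == 0x0c000000.  A 0 bit
 * narrows the range to those 0x0c000000 values; a 1 bit advances lower_bound
 * past them and keeps the remaining 0x04000000. */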
550 * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and
551 * updating the state and its corresponding probability entry.
554 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
555 struct lzms_probability_entry *probs,
556 struct lzms_range_encoder *rc)
558 struct lzms_probability_entry *prob_entry;
559 u32 prob;
561 /* Load the probability entry for the current state. */
562 prob_entry = &probs[*state_p];
564 /* Update the state based on the next bit. */
565 *state_p = ((*state_p << 1) | bit) & (num_states - 1);
567 /* Get the probability that the bit is 0. */
568 prob = lzms_get_probability(prob_entry);
570 /* Update the probability entry. */
571 lzms_update_probability_entry(prob_entry, bit);
573 /* Encode the bit using the range encoder. */
574 lzms_range_encode_bit(rc, bit, prob);
577 /* Helper functions for encoding bits in the various decision classes */
580 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
582 lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
583 c->probs.main, &c->rc);
587 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
589 lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
590 c->probs.match, &c->rc);
594 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
596 lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
597 c->probs.lz, &c->rc);
601 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
603 lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
604 c->probs.lz_rep[idx], &c->rc);
608 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
610 lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
611 c->probs.delta, &c->rc);
615 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
617 lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
618 c->probs.delta_rep[idx], &c->rc);
621 /******************************************************************************
622 * Huffman encoding and verbatim bits *
623 ******************************************************************************/
626 * Initialize the output bitstream @os to write backwards to the specified
627 * buffer @out that is @count 16-bit integers long.
630 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
631 le16 *out, size_t count)
633 os->bitbuf = 0;
634 os->bitcount = 0;
635 os->next = out + count;
636 os->begin = out;
640 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
641 * output bitstream @os.
643 * @max_num_bits is a compile-time constant that specifies the maximum number of
644 * bits that can ever be written at this call site.
647 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
648 const unsigned num_bits, const unsigned max_num_bits)
650 /* Add the bits to the bit buffer variable. */
651 os->bitcount += num_bits;
652 os->bitbuf = (os->bitbuf << num_bits) | bits;
654 /* Check whether any coding units need to be written. */
655 while (os->bitcount >= 16) {
657 os->bitcount -= 16;
659 /* Write a coding unit, unless it would underflow the buffer. */
660 if (os->next != os->begin)
661 put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);
663 /* Optimization for call sites that never write more than 16
664 * bits at once */
665 if (max_num_bits <= 16)
666 return;
671 * Flush the output bitstream, ensuring that all bits written to it have been
672 * written to memory. Return %true if all bits have been output successfully,
673 * or %false if an overrun occurred.
676 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
678 if (os->next == os->begin)
679 return false;
681 if (os->bitcount != 0)
682 put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);
684 return true;
685 }
688 lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
690 make_canonical_huffman_code(rebuild_info->num_syms,
691 LZMS_MAX_CODEWORD_LENGTH,
692 rebuild_info->freqs,
693 rebuild_info->lens,
694 rebuild_info->codewords);
695 rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
699 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
700 unsigned num_syms, unsigned rebuild_freq,
701 u32 *codewords, u8 *lens, u32 *freqs)
703 rebuild_info->num_syms = num_syms;
704 rebuild_info->rebuild_freq = rebuild_freq;
705 rebuild_info->codewords = codewords;
706 rebuild_info->lens = lens;
707 rebuild_info->freqs = freqs;
708 lzms_init_symbol_frequencies(freqs, num_syms);
709 lzms_build_huffman_code(rebuild_info);
713 lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
715 lzms_build_huffman_code(rebuild_info);
716 lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
720 * Encode a symbol using the specified Huffman code. Then, if the Huffman code
721 * needs to be rebuilt, rebuild it and return true; otherwise return false.
724 lzms_huffman_encode_symbol(unsigned sym,
725 const u32 *codewords, const u8 *lens, u32 *freqs,
726 struct lzms_output_bitstream *os,
727 struct lzms_huffman_rebuild_info *rebuild_info)
729 lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
730 ++freqs[sym];
731 if (--rebuild_info->num_syms_until_rebuild == 0) {
732 lzms_rebuild_huffman_code(rebuild_info);
733 return true;
734 }
735 return false;
736 }
738 /* Helper routines to encode symbols using the various Huffman codes */
741 lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
743 return lzms_huffman_encode_symbol(sym, c->literal_codewords,
744 c->literal_lens, c->literal_freqs,
745 &c->os, &c->literal_rebuild_info);
749 lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
751 return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
752 c->lz_offset_lens, c->lz_offset_freqs,
753 &c->os, &c->lz_offset_rebuild_info);
757 lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
759 return lzms_huffman_encode_symbol(sym, c->length_codewords,
760 c->length_lens, c->length_freqs,
761 &c->os, &c->length_rebuild_info);
765 lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
767 return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
768 c->delta_offset_lens, c->delta_offset_freqs,
769 &c->os, &c->delta_offset_rebuild_info);
773 lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
775 return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
776 c->delta_power_lens, c->delta_power_freqs,
777 &c->os, &c->delta_power_rebuild_info);
781 lzms_update_fast_length_costs(struct lzms_compressor *c);
784 * Encode a match length. If this causes the Huffman code for length symbols to
785 * be rebuilt, also update the length costs array used by the parser.
788 lzms_encode_length(struct lzms_compressor *c, u32 length)
790 unsigned slot = lzms_comp_get_length_slot(c, length);
792 if (lzms_encode_length_symbol(c, slot))
793 lzms_update_fast_length_costs(c);
795 lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
796 lzms_extra_length_bits[slot],
797 LZMS_MAX_EXTRA_LENGTH_BITS);
800 /* Encode the offset of an LZ match. */
802 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
804 unsigned slot = lzms_comp_get_offset_slot(c, offset);
806 lzms_encode_lz_offset_symbol(c, slot);
807 lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
808 lzms_extra_offset_bits[slot],
809 LZMS_MAX_EXTRA_OFFSET_BITS);
812 /* Encode the raw offset of a delta match. */
814 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
816 unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
818 lzms_encode_delta_offset_symbol(c, slot);
819 lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
820 lzms_extra_offset_bits[slot],
821 LZMS_MAX_EXTRA_OFFSET_BITS);
824 /******************************************************************************
826 ******************************************************************************/
828 /* Encode the specified item, which may be a literal or any type of match. */
830 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
832 /* Main bit: 0 = literal, 1 = match */
833 int main_bit = (length > 1);
834 lzms_encode_main_bit(c, main_bit);
836 if (!main_bit) {
837 /* Literal */
838 unsigned literal = source;
839 lzms_encode_literal_symbol(c, literal);
840 } else {
841 /* Match */
843 /* Match bit: 0 = LZ match, 1 = delta match */
844 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
845 lzms_encode_match_bit(c, match_bit);
847 if (!match_bit) {
848 /* LZ match */
850 /* LZ bit: 0 = explicit offset, 1 = repeat offset */
851 int lz_bit = (source < LZMS_NUM_LZ_REPS);
852 lzms_encode_lz_bit(c, lz_bit);
854 if (!lz_bit) {
855 /* Explicit offset LZ match */
856 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
857 lzms_encode_lz_offset(c, offset);
858 } else {
859 /* Repeat offset LZ match */
860 int rep_idx = source;
861 for (int i = 0; i < rep_idx; i++)
862 lzms_encode_lz_rep_bit(c, 1, i);
863 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
864 lzms_encode_lz_rep_bit(c, 0, rep_idx);
865 }
866 } else {
867 /* Delta match */
869 source &= ~DELTA_SOURCE_TAG;
871 /* Delta bit: 0 = explicit offset, 1 = repeat offset */
872 int delta_bit = (source < LZMS_NUM_DELTA_REPS);
873 lzms_encode_delta_bit(c, delta_bit);
875 if (!delta_bit) {
876 /* Explicit offset delta match */
877 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
878 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
879 (LZMS_NUM_DELTA_REPS - 1);
880 lzms_encode_delta_power_symbol(c, power);
881 lzms_encode_delta_raw_offset(c, raw_offset);
882 } else {
883 /* Repeat offset delta match */
884 int rep_idx = source;
885 for (int i = 0; i < rep_idx; i++)
886 lzms_encode_delta_rep_bit(c, 1, i);
887 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
888 lzms_encode_delta_rep_bit(c, 0, rep_idx);
889 }
890 }
892 /* Match length (encoded the same way for any match type) */
893 lzms_encode_length(c, length);
897 /* Encode a list of matches and literals chosen by the parsing algorithm. */
899 lzms_encode_nonempty_item_list(struct lzms_compressor *c,
900 struct lzms_optimum_node *end_node)
902 /* Since we've stored at each node the item we took to arrive at that
903 * node, we can trace our chosen path in backwards order. However, for
904 * encoding we need to trace our chosen path in forwards order. To make
905 * this possible, the following loop moves the items from their
906 * destination nodes to their source nodes, which effectively reverses
907 * the path. (Think of it like reversing a singly-linked list.) */
908 struct lzms_optimum_node *cur_node = end_node;
909 struct lzms_item saved_item = cur_node->item;
910 do {
911 struct lzms_item item = saved_item;
912 if (cur_node->num_extra_items > 0) {
913 /* Handle an arrival via multi-item lookahead. */
914 unsigned i = 0;
915 struct lzms_optimum_node *orig_node = cur_node;
916 do {
917 cur_node -= item.length;
918 cur_node->item = item;
919 item = orig_node->extra_items[i];
920 } while (++i != orig_node->num_extra_items);
922 cur_node -= item.length;
923 saved_item = cur_node->item;
924 cur_node->item = item;
925 } while (cur_node != c->optimum_nodes);
927 /* Now trace the chosen path in forwards order, encoding each item. */
928 do {
929 lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
930 cur_node += cur_node->item.length;
931 } while (cur_node != end_node);
935 lzms_encode_item_list(struct lzms_compressor *c,
936 struct lzms_optimum_node *end_node)
938 if (end_node != c->optimum_nodes)
939 lzms_encode_nonempty_item_list(c, end_node);
942 /******************************************************************************
944 ******************************************************************************/
947 * If p is the predicted probability of the next bit being a 0, then the number
948 * of bits required to encode a 0 bit using a binary range encoder is the real
949 * number -log2(p), and the number of bits required to encode a 1 bit is the
950 * real number -log2(1 - p). To avoid computing either of these expressions at
951 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
952 * probability to cost for each possible probability. Specifically, the array
953 * indices are the numerators of the possible probabilities in LZMS, where the
954 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
955 * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
956 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
957 * adjacent values, since these probabilities are not actually permitted. This
958 * allows us to use the num_recent_zero_bits value from the
959 * lzms_probability_entry as the array index without fixing up these two special
960 * cases. */
962 static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
963 384, 384, 320, 283, 256, 235, 219, 204,
964 192, 181, 171, 163, 155, 147, 140, 134,
965 128, 122, 117, 112, 107, 103, 99, 94,
966 91, 87, 83, 80, 76, 73, 70, 67,
967 64, 61, 58, 56, 53, 51, 48, 46,
968 43, 41, 39, 37, 35, 33, 30, 29,
969 27, 25, 23, 21, 19, 17, 16, 14,
970 12, 11, 9, 8, 6, 4, 3, 1,
971 1
972 };
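/* Added check of one entry: index 48 corresponds to probability 48/64, and
 * round(-log2(48.0 / 64) * (1 << COST_SHIFT)) == round(0.415 * 64) == 27,
 * which is the value stored above; the reference generator below produces the
 * whole table the same way. */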
975 check_cost_shift(void)
977 /* lzms_bit_costs is hard-coded to the current COST_SHIFT. */
978 STATIC_ASSERT(COST_SHIFT == 6);
984 static void
985 lzms_compute_bit_costs(void)
986 {
987 for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
988 u32 prob = i;
989 if (prob == 0)
990 prob++;
991 else if (prob == LZMS_PROBABILITY_DENOMINATOR)
992 prob--;
994 lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
995 (1 << COST_SHIFT));
996 }
997 }
1000 /* Return the cost to encode a 0 bit in the specified context. */
1002 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1004 return lzms_bit_costs[probs[state].num_recent_zero_bits];
1007 /* Return the cost to encode a 1 bit in the specified context. */
1009 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1011 return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1012 probs[state].num_recent_zero_bits];
1015 /* Return the cost to encode a literal, including the main bit. */
1017 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1019 return lzms_bit_0_cost(main_state, c->probs.main) +
1020 ((u32)c->literal_lens[literal] << COST_SHIFT);
1023 /* Update 'fast_length_cost_tab' to use the latest Huffman code. */
1025 lzms_update_fast_length_costs(struct lzms_compressor *c)
1027 int slot = -1;
1028 u32 cost = 0;
1029 for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1030 if (len >= lzms_length_slot_base[slot + 1]) {
1031 slot++;
1032 cost = (u32)(c->length_lens[slot] +
1033 lzms_extra_length_bits[slot]) << COST_SHIFT;
1035 c->fast_length_cost_tab[len] = cost;
1039 /* Return the cost to encode the specified match length, which must not exceed
1040 * MAX_FAST_LENGTH. */
1042 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
1044 return c->fast_length_cost_tab[length];
1047 /* Return the cost to encode the specified LZ match offset. */
1049 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1051 unsigned slot = lzms_comp_get_offset_slot(c, offset);
1052 u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1053 return num_bits << COST_SHIFT;
1056 /* Return the cost to encode the specified delta power and raw offset. */
1058 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1060 unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1061 u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1062 lzms_extra_offset_bits[slot];
1063 return num_bits << COST_SHIFT;
1066 /******************************************************************************
1068 ******************************************************************************/
1071 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1073 for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1074 state->recent_lz_offsets[i] = i + 1;
1075 state->prev_lz_offset = 0;
1076 state->upcoming_lz_offset = 0;
1078 for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1079 state->recent_delta_pairs[i] = i + 1;
1080 state->prev_delta_pair = 0;
1081 state->upcoming_delta_pair = 0;
1083 state->main_state = 0;
1084 state->match_state = 0;
1085 state->lz_state = 0;
1086 for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1087 state->lz_rep_states[i] = 0;
1088 state->delta_state = 0;
1089 for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1090 state->delta_rep_states[i] = 0;
1094 * Update the LRU queues for match sources when advancing by one item.
1096 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
1097 * complex because:
1098 * - there are separate queues for LZ and delta matches
1099 * - updates to the queues are delayed by one encoded item (this prevents
1100 * sources from being bumped up to index 0 too early)
1103 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1105 if (state->prev_lz_offset != 0) {
1106 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1107 state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1108 state->recent_lz_offsets[0] = state->prev_lz_offset;
1110 state->prev_lz_offset = state->upcoming_lz_offset;
1112 if (state->prev_delta_pair != 0) {
1113 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1114 state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1115 state->recent_delta_pairs[0] = state->prev_delta_pair;
1117 state->prev_delta_pair = state->upcoming_delta_pair;
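/* Added timeline sketch: the offset recorded as 'upcoming' when an item is
 * chosen becomes prev_lz_offset during that item's queue update, and is
 * promoted into recent_lz_offsets[0] only by the following item's update --
 * this is the one-item delay described above. */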
1121 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
1123 *state_p = ((*state_p << 1) | bit) & (num_states - 1);
1127 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
1129 lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
1133 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
1135 lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
1139 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
1141 lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
1145 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1147 for (int i = 0; i < rep_idx; i++)
1148 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1150 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1151 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1155 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
1157 lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
1161 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1163 for (int i = 0; i < rep_idx; i++)
1164 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1166 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1167 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1170 /******************************************************************************
1172 ******************************************************************************/
1174 /* Note: this code just handles finding delta matches. The code for finding LZ
1175 * matches is elsewhere. */
1178 /* Initialize the delta matchfinder for a new input buffer. */
1180 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1182 /* Set all entries to use an invalid power, which will never match. */
1183 STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1184 memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1186 /* Initialize the next hash code for each power. We can just use zeroes
1187 * initially; it doesn't really matter. */
1188 for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1189 c->next_delta_hashes[i] = 0;
1193 * Compute a DELTA_HASH_ORDER-bit hash code for the first
1194 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1195 * delta context with the specified @span.
1198 lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
1200 /* A delta match has a certain span and an offset that is a multiple of
1201 * that span. To reduce wasted space we use a single combined hash
1202 * table for all spans and positions, but to minimize collisions we
1203 * include in the hash code computation the span and the low-order bits
1204 * of the current position. */
1206 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1207 u8 d0 = *(p + 0) - *(p + 0 - span);
1208 u8 d1 = *(p + 1) - *(p + 1 - span);
1209 u8 d2 = *(p + 2) - *(p + 2 - span);
1210 u32 v = ((span + (pos & (span - 1))) << 24) |
1211 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1212 return lz_hash(v, DELTA_HASH_ORDER);
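/* Added worked example: with span == 4, the byte sequence 10, 20, 30, 40, 14,
 * 24, 34, 44 yields the deltas 4, 4, 4, 4 at its last four positions, so data
 * that differs only by a constant stride (e.g. arrays of increasing integers)
 * turns into highly repetitive delta sequences that are easy to match. */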
1216 * Given a match between @in_next and @matchptr in a delta context with the
1217 * specified @span and having the initial @len, extend the match as far as
1218 * possible, up to a limit of @max_len.
1221 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1222 u32 len, u32 max_len, u32 span)
1224 while (len < max_len &&
1225 (u8)(*(in_next + len) - *(in_next + len - span)) ==
1226 (u8)(*(matchptr + len) - *(matchptr + len - span)))
1227 len++;
1228 return len;
1229 }
1234 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1235 const u8 *in_next, u32 count)
1237 u32 pos = in_next - c->in_buffer;
1238 if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1239 return;
1240 do {
1241 /* Update the hash table for each power. */
1242 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1243 const u32 span = (u32)1 << power;
1244 if (unlikely(pos < span))
1245 continue;
1246 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1247 const u32 hash = c->next_delta_hashes[power];
1248 c->delta_hash_table[hash] =
1249 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1250 c->next_delta_hashes[power] = next_hash;
1251 prefetchw(&c->delta_hash_table[next_hash]);
1253 } while (in_next++, pos++, --count);
1257 * Skip the next @count bytes (don't search for matches at them). @in_next
1258 * points to the first byte to skip. The return value is @in_next + count.
1261 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1263 lcpit_matchfinder_skip_bytes(&c->mf, count);
1264 if (c->use_delta_matches)
1265 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1266 return in_next + count;
1269 /******************************************************************************
1270 * "Near-optimal" parsing *
1271 ******************************************************************************/
1274 * The main near-optimal parsing routine.
1276 * Briefly, the algorithm does an approximate minimum-cost path search to find a
1277 * "near-optimal" sequence of matches and literals to output, based on the
1278 * current cost model. The algorithm steps forward, position by position (byte
1279 * by byte), and updates the minimum cost path to reach each later position that
1280 * can be reached using a match or literal from the current position. This is
1281 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1282 * the graph edges are possible matches/literals to code, and the cost of each
1283 * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
1284 * required to output the corresponding match or literal. But one difference is
1285 * that we actually compute the lowest-cost path in pieces, where each piece is
1286 * terminated when there are no choices to be made.
1288 * The costs of literals and matches are estimated using the range encoder
1289 * states and the semi-adaptive Huffman codes. Except for range encoding
1290 * states, costs are assumed to be constant throughout a single run of the
1291 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This
1292 * introduces a source of non-optimality because the probabilities and Huffman
1293 * codes can change over this part of the data. And of course, there are
1294 * various other reasons why the result isn't optimal in terms of compression
1295 * ratio.
1298 lzms_near_optimal_parse(struct lzms_compressor *c)
1300 const u8 *in_next = c->in_buffer;
1301 const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1302 struct lzms_optimum_node *cur_node;
1303 struct lzms_optimum_node *end_node;
1304 u32 num_matches;
1305 /* Set initial length costs for lengths <= MAX_FAST_LENGTH. */
1306 lzms_update_fast_length_costs(c);
1308 /* Set up the initial adaptive state. */
1309 lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1311 begin:
1312 /* Start building a new list of items, which will correspond to the next
1313 * piece of the overall minimum-cost path. */
1315 cur_node = c->optimum_nodes;
1316 cur_node->cost = 0;
1317 end_node = cur_node;
1319 if (in_next == in_end)
1320 return;
1322 /* The following loop runs once per byte in the input buffer, except in
1323 * a few shortcut cases. */
1327 /* Repeat offset LZ matches */
1328 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1329 in_end - in_next >= 2))
1331 for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1333 /* Looking for a repeat offset LZ match at queue
1334 * index @rep_idx */
1336 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1337 const u8 * const matchptr = in_next - offset;
1339 /* Check the first 2 bytes before entering the extension loop. */
1340 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1341 continue;
1343 /* Extend the match to its full length. */
1344 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1346 /* Early out for long repeat offset LZ match */
1347 if (rep_len >= c->mf.nice_match_len) {
1349 in_next = lzms_skip_bytes(c, rep_len, in_next);
1351 lzms_encode_item_list(c, cur_node);
1352 lzms_encode_item(c, rep_len, rep_idx);
1354 c->optimum_nodes[0].state = cur_node->state;
1355 cur_node = &c->optimum_nodes[0];
1357 cur_node->state.upcoming_lz_offset =
1358 cur_node->state.recent_lz_offsets[rep_idx];
1359 cur_node->state.upcoming_delta_pair = 0;
1360 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1361 cur_node->state.recent_lz_offsets[i] =
1362 cur_node->state.recent_lz_offsets[i + 1];
1363 lzms_update_lru_queues(&cur_node->state);
1364 lzms_update_main_state(&cur_node->state, 1);
1365 lzms_update_match_state(&cur_node->state, 0);
1366 lzms_update_lz_state(&cur_node->state, 1);
1367 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1368 goto begin;
1369 }
1371 while (end_node < cur_node + rep_len)
1372 (++end_node)->cost = INFINITE_COST;
1374 u32 base_cost = cur_node->cost +
1375 lzms_bit_1_cost(cur_node->state.main_state,
1376 c->probs.main) +
1377 lzms_bit_0_cost(cur_node->state.match_state,
1378 c->probs.match) +
1379 lzms_bit_1_cost(cur_node->state.lz_state,
1380 c->probs.lz);
1382 for (int i = 0; i < rep_idx; i++)
1383 base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1384 c->probs.lz_rep[i]);
1386 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1387 base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1388 c->probs.lz_rep[rep_idx]);
1390 u32 len = 2;
1391 do {
1392 u32 cost = base_cost + lzms_fast_length_cost(c, len);
1393 if (cost < (cur_node + len)->cost) {
1394 (cur_node + len)->cost = cost;
1395 (cur_node + len)->item = (struct lzms_item) {
1396 .length = len,
1397 .source = rep_idx,
1398 };
1399 (cur_node + len)->num_extra_items = 0;
1401 } while (++len <= rep_len);
1404 /* try LZ-rep + lit + LZ-rep0 */
1405 if (c->try_lzrep_lit_lzrep0 &&
1406 in_end - (in_next + rep_len) >= 3 &&
1407 load_u16_unaligned(in_next + rep_len + 1) ==
1408 load_u16_unaligned(matchptr + rep_len + 1))
1409 {
1410 const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1411 matchptr + rep_len + 1,
1412 2,
1413 min(c->mf.nice_match_len,
1414 in_end - (in_next + rep_len + 1)));
1416 unsigned main_state = cur_node->state.main_state;
1417 unsigned match_state = cur_node->state.match_state;
1418 unsigned lz_state = cur_node->state.lz_state;
1419 unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1421 /* update states for LZ-rep */
1422 main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1423 match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1424 lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1425 lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1426 LZMS_NUM_LZ_REP_PROBS;
1429 u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1431 /* add literal cost */
1432 cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1434 /* update state for literal */
1435 main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1437 /* add LZ-rep0 cost */
1438 cost += lzms_bit_1_cost(main_state, c->probs.main) +
1439 lzms_bit_0_cost(match_state, c->probs.match) +
1440 lzms_bit_1_cost(lz_state, c->probs.lz) +
1441 lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
1442 lzms_fast_length_cost(c, rep0_len);
1444 const u32 total_len = rep_len + 1 + rep0_len;
1446 while (end_node < cur_node + total_len)
1447 (++end_node)->cost = INFINITE_COST;
1449 if (cost < (cur_node + total_len)->cost) {
1450 (cur_node + total_len)->cost = cost;
1451 (cur_node + total_len)->item = (struct lzms_item) {
1452 .length = rep0_len,
1453 .source = 0,
1454 };
1455 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1456 .length = 1,
1457 .source = *(in_next + rep_len),
1458 };
1459 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1460 .length = rep_len,
1461 .source = rep_idx,
1462 };
1463 (cur_node + total_len)->num_extra_items = 2;
1469 /* Repeat offset delta matches */
1470 if (c->use_delta_matches &&
1471 likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1472 (in_end - in_next >= 2)))
1474 for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1476 /* Looking for a repeat offset delta match at
1477 * queue index @rep_idx */
1479 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1480 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1481 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1482 const u32 span = (u32)1 << power;
1483 const u32 offset = raw_offset << power;
1484 const u8 * const matchptr = in_next - offset;
1486 /* Check the first 2 bytes before entering the
1487 * extension loop. */
1488 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1489 (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1490 ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1491 (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1492 continue;
1494 /* Extend the match to its full length. */
1495 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1496 2, in_end - in_next,
1497 span);
1499 /* Early out for long repeat offset delta match */
1500 if (rep_len >= c->mf.nice_match_len) {
1502 in_next = lzms_skip_bytes(c, rep_len, in_next);
1504 lzms_encode_item_list(c, cur_node);
1505 lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1507 c->optimum_nodes[0].state = cur_node->state;
1508 cur_node = &c->optimum_nodes[0];
1510 cur_node->state.upcoming_delta_pair = pair;
1511 cur_node->state.upcoming_lz_offset = 0;
1512 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1513 cur_node->state.recent_delta_pairs[i] =
1514 cur_node->state.recent_delta_pairs[i + 1];
1515 lzms_update_lru_queues(&cur_node->state);
1516 lzms_update_main_state(&cur_node->state, 1);
1517 lzms_update_match_state(&cur_node->state, 1);
1518 lzms_update_delta_state(&cur_node->state, 1);
1519 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1520 goto begin;
1521 }
1523 while (end_node < cur_node + rep_len)
1524 (++end_node)->cost = INFINITE_COST;
1526 u32 base_cost = cur_node->cost +
1527 lzms_bit_1_cost(cur_node->state.main_state,
1528 c->probs.main) +
1529 lzms_bit_1_cost(cur_node->state.match_state,
1530 c->probs.match) +
1531 lzms_bit_1_cost(cur_node->state.delta_state,
1532 c->probs.delta);
1534 for (int i = 0; i < rep_idx; i++)
1535 base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1536 c->probs.delta_rep[i]);
1538 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1539 base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1540 c->probs.delta_rep[rep_idx]);
1542 u32 len = 2;
1543 do {
1544 u32 cost = base_cost + lzms_fast_length_cost(c, len);
1545 if (cost < (cur_node + len)->cost) {
1546 (cur_node + len)->cost = cost;
1547 (cur_node + len)->item = (struct lzms_item) {
1548 .length = len,
1549 .source = DELTA_SOURCE_TAG | rep_idx,
1550 };
1551 (cur_node + len)->num_extra_items = 0;
1553 } while (++len <= rep_len);
1557 /* Explicit offset LZ matches */
1558 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1560 if (num_matches) {
1561 u32 best_len = c->matches[0].length;
1563 /* Early out for long explicit offset LZ match */
1564 if (best_len >= c->mf.nice_match_len) {
1566 const u32 offset = c->matches[0].offset;
1568 /* Extend the match as far as possible.
1569 * This is necessary because the LCP-interval
1570 * tree matchfinder only reports up to
1571 * nice_match_len bytes. */
1572 best_len = lz_extend(in_next, in_next - offset,
1573 best_len, in_end - in_next);
1575 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1577 lzms_encode_item_list(c, cur_node);
1578 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1580 c->optimum_nodes[0].state = cur_node->state;
1581 cur_node = &c->optimum_nodes[0];
1583 cur_node->state.upcoming_lz_offset = offset;
1584 cur_node->state.upcoming_delta_pair = 0;
1585 lzms_update_lru_queues(&cur_node->state);
1586 lzms_update_main_state(&cur_node->state, 1);
1587 lzms_update_match_state(&cur_node->state, 0);
1588 lzms_update_lz_state(&cur_node->state, 0);
1589 goto begin;
1590 }
1592 while (end_node < cur_node + best_len)
1593 (++end_node)->cost = INFINITE_COST;
1595 u32 base_cost = cur_node->cost +
1596 lzms_bit_1_cost(cur_node->state.main_state,
1597 c->probs.main) +
1598 lzms_bit_0_cost(cur_node->state.match_state,
1599 c->probs.match) +
1600 lzms_bit_0_cost(cur_node->state.lz_state,
1601 c->probs.lz);
1603 if (c->try_lzmatch_lit_lzrep0 &&
1604 likely(in_end - (in_next + c->matches[0].length) >= 3))
1605 {
1606 /* try LZ-match + lit + LZ-rep0 */
1608 u32 l = 2;
1609 u32 i = num_matches - 1;
1610 do {
1611 const u32 len = c->matches[i].length;
1612 const u32 offset = c->matches[i].offset;
1613 const u32 position_cost = base_cost +
1614 lzms_lz_offset_cost(c, offset);
1615 do {
1616 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1617 if (cost < (cur_node + l)->cost) {
1618 (cur_node + l)->cost = cost;
1619 (cur_node + l)->item = (struct lzms_item) {
1620 .length = l,
1621 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1622 };
1623 (cur_node + l)->num_extra_items = 0;
1625 } while (++l <= len);
1627 const u8 * const matchptr = in_next - offset;
1628 if (load_u16_unaligned(matchptr + len + 1) !=
1629 load_u16_unaligned(in_next + len + 1))
1630 continue;
1632 const u32 rep0_len = lz_extend(in_next + len + 1,
1633 matchptr + len + 1,
1634 2,
1635 min(c->mf.nice_match_len,
1636 in_end - (in_next + len + 1)));
1638 unsigned main_state = cur_node->state.main_state;
1639 unsigned match_state = cur_node->state.match_state;
1640 unsigned lz_state = cur_node->state.lz_state;
1642 /* update states for LZ-match */
1643 main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1644 match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1645 lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1648 u32 cost = position_cost + lzms_fast_length_cost(c, len);
1650 /* add literal cost */
1651 cost += lzms_literal_cost(c, main_state, *(in_next + len));
1653 /* update state for literal */
1654 main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1656 /* add LZ-rep0 cost */
1657 cost += lzms_bit_1_cost(main_state, c->probs.main) +
1658 lzms_bit_0_cost(match_state, c->probs.match) +
1659 lzms_bit_1_cost(lz_state, c->probs.lz) +
1660 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1661 c->probs.lz_rep[0]) +
1662 lzms_fast_length_cost(c, rep0_len);
1664 const u32 total_len = len + 1 + rep0_len;
1666 while (end_node < cur_node + total_len)
1667 (++end_node)->cost = INFINITE_COST;
1669 if (cost < (cur_node + total_len)->cost) {
1670 (cur_node + total_len)->cost = cost;
1671 (cur_node + total_len)->item = (struct lzms_item) {
1672 .length = rep0_len,
1673 .source = 0,
1674 };
1675 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1676 .length = 1,
1677 .source = *(in_next + len),
1678 };
1679 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1680 .length = len,
1681 .source = offset + LZMS_NUM_LZ_REPS - 1,
1682 };
1683 (cur_node + total_len)->num_extra_items = 2;
1684 }
1685 } while (i--);
1686 } else {
1687 u32 l = 2;
1688 u32 i = num_matches - 1;
1689 do {
1690 u32 position_cost = base_cost +
1691 lzms_lz_offset_cost(c, c->matches[i].offset);
1692 do {
1693 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1694 if (cost < (cur_node + l)->cost) {
1695 (cur_node + l)->cost = cost;
1696 (cur_node + l)->item = (struct lzms_item) {
1697 .length = l,
1698 .source = c->matches[i].offset +
1699 (LZMS_NUM_LZ_REPS - 1),
1700 };
1701 (cur_node + l)->num_extra_items = 0;
1703 } while (++l <= c->matches[i].length);
1704 } while (i--);
1705 }
1706 }
1708 /* Explicit offset delta matches */
1709 if (c->use_delta_matches &&
1710 likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1712 const u32 pos = in_next - c->in_buffer;
1714 /* Consider each possible power (log2 of span) */
1715 STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
1716 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1718 const u32 span = (u32)1 << power;
1720 if (unlikely(pos < span))
1721 continue;
1723 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1724 const u32 hash = c->next_delta_hashes[power];
1725 const u32 cur_match = c->delta_hash_table[hash];
1727 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1728 c->next_delta_hashes[power] = next_hash;
1729 prefetchw(&c->delta_hash_table[next_hash]);
1731 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1732 continue;
1734 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1736 /* The offset must be a multiple of span. */
1737 if (offset & (span - 1))
1738 continue;
1740 const u8 * const matchptr = in_next - offset;
1742 /* Check the first 3 bytes before entering the
1743 * extension loop. */
1744 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1745 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1746 (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1747 ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1748 (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1749 ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1750 (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1751 continue;
1753 /* Extend the delta match to its full length. */
1754 const u32 len = lzms_extend_delta_match(in_next,
1755 matchptr,
1756 NBYTES_HASHED_FOR_DELTA,
1757 in_end - in_next,
1758 span);
1760 const u32 raw_offset = offset >> power;
1762 if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
1763 (LZMS_NUM_DELTA_REPS - 1)))
1764 continue;
1766 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1767 raw_offset;
1768 const u32 source = DELTA_SOURCE_TAG |
1769 (pair + LZMS_NUM_DELTA_REPS - 1);
1771 /* Early out for long explicit offset delta match */
1772 if (len >= c->mf.nice_match_len) {
1774 in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1776 lzms_encode_item_list(c, cur_node);
1777 lzms_encode_item(c, len, source);
1779 c->optimum_nodes[0].state = cur_node->state;
1780 cur_node = &c->optimum_nodes[0];
1782 cur_node->state.upcoming_lz_offset = 0;
1783 cur_node->state.upcoming_delta_pair = pair;
1784 lzms_update_lru_queues(&cur_node->state);
1785 lzms_update_main_state(&cur_node->state, 1);
1786 lzms_update_match_state(&cur_node->state, 1);
1787 lzms_update_delta_state(&cur_node->state, 0);
1788 goto begin;
1789 }
1791 while (end_node < cur_node + len)
1792 (++end_node)->cost = INFINITE_COST;
1794 u32 base_cost = cur_node->cost +
1795 lzms_bit_1_cost(cur_node->state.main_state,
1796 c->probs.main) +
1797 lzms_bit_1_cost(cur_node->state.match_state,
1798 c->probs.match) +
1799 lzms_bit_0_cost(cur_node->state.delta_state,
1800 c->probs.delta) +
1801 lzms_delta_source_cost(c, power, raw_offset);
1803 u32 l = NBYTES_HASHED_FOR_DELTA;
1804 do {
1805 u32 cost = base_cost + lzms_fast_length_cost(c, l);
1806 if (cost < (cur_node + l)->cost) {
1807 (cur_node + l)->cost = cost;
1808 (cur_node + l)->item = (struct lzms_item) {
1809 .length = l,
1810 .source = source,
1811 };
1812 (cur_node + l)->num_extra_items = 0;
1814 } while (++l <= len);
1819 if (end_node < cur_node + 1)
1820 (++end_node)->cost = INFINITE_COST;
1821 const u32 cur_and_lit_cost = cur_node->cost +
1822 lzms_literal_cost(c, cur_node->state.main_state,
1823 *in_next);
1824 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1825 (cur_node + 1)->cost = cur_and_lit_cost;
1826 (cur_node + 1)->item = (struct lzms_item) {
1827 .length = 1,
1828 .source = *in_next,
1829 };
1830 (cur_node + 1)->num_extra_items = 0;
1831 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1832 /* try lit + LZ-rep0 */
1833 const u32 offset =
1834 (cur_node->state.prev_lz_offset) ?
1835 cur_node->state.prev_lz_offset :
1836 cur_node->state.recent_lz_offsets[0];
1838 if (load_u16_unaligned(in_next + 1) ==
1839 load_u16_unaligned(in_next + 1 - offset))
1841 const u32 rep0_len = lz_extend(in_next + 1,
1842 in_next + 1 - offset,
1843 2,
1844 min(in_end - (in_next + 1),
1845 c->mf.nice_match_len));
1847 unsigned main_state = cur_node->state.main_state;
1849 /* Update main_state after literal */
1850 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1852 /* Add cost of LZ-rep0 */
1853 const u32 cost = cur_and_lit_cost +
1854 lzms_bit_1_cost(main_state, c->probs.main) +
1855 lzms_bit_0_cost(cur_node->state.match_state,
1856 c->probs.match) +
1857 lzms_bit_1_cost(cur_node->state.lz_state,
1858 c->probs.lz) +
1859 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1860 c->probs.lz_rep[0]) +
1861 lzms_fast_length_cost(c, rep0_len);
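				/* The literal's own cost is already included
				 * in cur_and_lit_cost.  main_state had to be
				 * advanced by hand above (shifting in a 0 bit
				 * for the literal) because node states are
				 * only finalized once the lowest-cost path is
				 * known; a literal leaves the other states
				 * unchanged. */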
				const u32 total_len = 1 + rep0_len;

				while (end_node < cur_node + total_len)
					(++end_node)->cost = INFINITE_COST;

				if (cost < (cur_node + total_len)->cost) {
					(cur_node + total_len)->cost = cost;
					(cur_node + total_len)->item = (struct lzms_item) {
						.length = rep0_len,
						.source = 0,
					};
					(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
						.length = 1,
						.source = *in_next,
					};
					(cur_node + total_len)->num_extra_items = 1;
				}
			}
		}
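		/* For multi-item steps, the last item goes in 'item' and the
		 * earlier ones in 'extra_items' in reverse order; here the path
		 * is literal-then-rep0-match, so the match is the main item and
		 * the literal is extra_items[0].  The finalization code below
		 * consumes them in the same order. */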
		/* Advance to the next position. */
		in_next++;
		cur_node++;
		/* The lowest-cost path to the current position is now known.
		 * Finalize the adaptive state that results from taking this
		 * lowest-cost path. */
		struct lzms_item item_to_take = cur_node->item;
		struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
		int next_item_idx = -1;
		for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
			item_to_take = cur_node->extra_items[i];
			source_node -= item_to_take.length;
			next_item_idx++;
		}
		cur_node->state = source_node->state;
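		/* E.g. for the lit + LZ-rep0 case above, this walks back over
		 * the literal as well, so source_node ends up at
		 * cur_node - rep0_len - 1, the node the whole two-item step
		 * started from. */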
		for (;;) {
			const u32 length = item_to_take.length;
			u32 source = item_to_take.source;

			cur_node->state.upcoming_lz_offset = 0;
			cur_node->state.upcoming_delta_pair = 0;
			if (length > 1) {
				/* Match */

				lzms_update_main_state(&cur_node->state, 1);

				if (source & DELTA_SOURCE_TAG) {
					/* Delta match */

					lzms_update_match_state(&cur_node->state, 1);
					source &= ~DELTA_SOURCE_TAG;

					if (source >= LZMS_NUM_DELTA_REPS) {
						/* Explicit offset delta match */
						lzms_update_delta_state(&cur_node->state, 0);
						cur_node->state.upcoming_delta_pair =
							source - (LZMS_NUM_DELTA_REPS - 1);
					} else {
						/* Repeat offset delta match */
						int rep_idx = source;

						lzms_update_delta_state(&cur_node->state, 1);
						lzms_update_delta_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_delta_pair =
							cur_node->state.recent_delta_pairs[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
							cur_node->state.recent_delta_pairs[i] =
								cur_node->state.recent_delta_pairs[i + 1];
					}
				} else {
					/* LZ match */
					lzms_update_match_state(&cur_node->state, 0);

					if (source >= LZMS_NUM_LZ_REPS) {
						/* Explicit offset LZ match */
						lzms_update_lz_state(&cur_node->state, 0);
						cur_node->state.upcoming_lz_offset =
							source - (LZMS_NUM_LZ_REPS - 1);
					} else {
						/* Repeat offset LZ match */
						int rep_idx = source;

						lzms_update_lz_state(&cur_node->state, 1);
						lzms_update_lz_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_lz_offset =
							cur_node->state.recent_lz_offsets[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
							cur_node->state.recent_lz_offsets[i] =
								cur_node->state.recent_lz_offsets[i + 1];
					}
				}
			} else {
				/* Literal */
				lzms_update_main_state(&cur_node->state, 0);
			}
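			/* Using a repeat offset removes it from its current
			 * queue position (the shift loops above); it re-enters
			 * at the front of the queue via the 'upcoming' field on
			 * the next call to lzms_update_lru_queues(). */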
			lzms_update_lru_queues(&cur_node->state);

			if (next_item_idx < 0)
				break;
			if (next_item_idx == 0)
				item_to_take = cur_node->item;
			else
				item_to_take = cur_node->extra_items[next_item_idx - 1];
			--next_item_idx;
		}
		/*
		 * This loop will terminate when either of the following
		 * conditions is true:
		 *
		 * (1) cur_node == end_node
		 *
		 *	There are no paths that extend beyond the current
		 *	position.  In this case, any path to a later position
		 *	must pass through the current position, so we can go
		 *	ahead and choose the list of items that led to this
		 *	position.
		 *
		 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
		 *
		 *	This bounds the number of times the algorithm can step
		 *	forward before it is guaranteed to start choosing items.
		 *	This limits the memory usage.  It also guarantees that
		 *	the parser will not go too long without updating the
		 *	probability tables.
		 *
		 * Note: no check for end-of-buffer is needed because
		 * end-of-buffer will trigger condition (1).
		 */
		if (cur_node == end_node ||
		    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
		{
			lzms_encode_nonempty_item_list(c, cur_node);
			c->optimum_nodes[0].state = cur_node->state;
			goto begin;
		}
	}
}
static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
	c->main_state = 0;
	c->match_state = 0;
	c->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		c->lz_rep_states[i] = 0;
	c->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		c->delta_rep_states[i] = 0;

	lzms_init_probabilities(&c->probs);
}
static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
	lzms_init_huffman_code(&c->literal_rebuild_info,
			       LZMS_NUM_LITERAL_SYMS,
			       LZMS_LITERAL_CODE_REBUILD_FREQ,
			       c->literal_codewords,
			       c->literal_lens,
			       c->literal_freqs);

	lzms_init_huffman_code(&c->lz_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
			       c->lz_offset_codewords,
			       c->lz_offset_lens,
			       c->lz_offset_freqs);

	lzms_init_huffman_code(&c->length_rebuild_info,
			       LZMS_NUM_LENGTH_SYMS,
			       LZMS_LENGTH_CODE_REBUILD_FREQ,
			       c->length_codewords,
			       c->length_lens,
			       c->length_freqs);

	lzms_init_huffman_code(&c->delta_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
			       c->delta_offset_codewords,
			       c->delta_offset_lens,
			       c->delta_offset_freqs);

	lzms_init_huffman_code(&c->delta_power_rebuild_info,
			       LZMS_NUM_DELTA_POWER_SYMS,
			       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
			       c->delta_power_codewords,
			       c->delta_power_lens,
			       c->delta_power_freqs);
}
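/* Note: each of these codes is adaptive.  Symbol frequencies are accumulated
 * as symbols are encoded, and every REBUILD_FREQ symbols the code is rebuilt
 * from those frequencies, mirroring what the decompressor does. */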
/*
 * Flush the output streams, prepare the final compressed data, and return its
 * size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit
 * in the available space.
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
	size_t num_forwards_units;
	size_t num_backwards_units;

	/* Flush both the forwards and backwards streams, and make sure they
	 * didn't cross each other and start overwriting each other's data. */
	if (!lzms_output_bitstream_flush(&c->os))
		return 0;

	if (!lzms_range_encoder_flush(&c->rc))
		return 0;

	if (c->rc.next > c->os.next)
		return 0;
	/* Now the compressed buffer contains the data output by the forwards
	 * bitstream, then empty space, then data output by the backwards
	 * bitstream.  Move the data output by the backwards bitstream to be
	 * adjacent to the data output by the forwards bitstream, and calculate
	 * the compressed size that this results in. */
	num_forwards_units = c->rc.next - c->rc.begin;
	num_backwards_units = c->rc.end - c->os.next;

	memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));

	return (num_forwards_units + num_backwards_units) * sizeof(le16);
}
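/* Worked example (illustrative numbers): with a 1000-unit output buffer, if
 * the range encoder wrote 100 units at the front and the bitstream wrote 50
 * units at the back, the memmove() above copies those 50 units to units
 * 100..149, and the compressed size is 150 * sizeof(le16) = 300 bytes. */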
static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
		       bool destructive)
{
	u64 size = 0;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return 0;

	size += sizeof(struct lzms_compressor);

	if (!destructive)
		size += max_bufsize; /* in_buffer */

	size += lcpit_matchfinder_get_needed_memory(max_bufsize);

	return size;
}
static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
		       bool destructive, void **c_ret)
{
	struct lzms_compressor *c;
	u32 nice_match_len;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return WIMLIB_ERR_INVALID_PARAM;

	c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
	if (!c)
		goto oom0;

	c->destructive = destructive;

	/* Scale nice_match_len with the compression level.  But to allow an
	 * optimization for length cost calculations, don't allow
	 * nice_match_len to exceed MAX_FAST_LENGTH. */
	nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
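	/* E.g. the default level 50 gives nice_match_len = 50 * 63 / 50 = 63;
	 * level 100 gives 126; levels of 203 and above clamp to
	 * MAX_FAST_LENGTH (255). */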
	c->use_delta_matches = (compression_level >= 35);
	c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
	c->try_lit_lzrep0 = (compression_level >= 60);
	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
	if (!c->destructive) {
		c->in_buffer = MALLOC(max_bufsize);
		if (!c->in_buffer)
			goto oom1;
	}

	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
		goto oom2;

	lzms_init_fast_length_slot_tab(c);
	lzms_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom2:
	if (!c->destructive)
		FREE(c->in_buffer);
oom1:
	ALIGNED_FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}
static size_t
lzms_compress(const void *restrict in, size_t in_nbytes,
	      void *restrict out, size_t out_nbytes_avail, void *restrict _c)
{
	struct lzms_compressor *c = _c;
	size_t result;

	/* Don't bother trying to compress extremely small inputs. */
	if (in_nbytes < 4)
		return 0;

	/* Copy the input data into the internal buffer and preprocess it. */
	if (c->destructive)
		c->in_buffer = (void *)in;
	else
		memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
	/* Prepare the matchfinders. */
	lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
	if (c->use_delta_matches)
		lzms_init_delta_matchfinder(c);

	/* Initialize the encoder structures. */
	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
	lzms_init_states_and_probabilities(c);
	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
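	/* Note: the range encoder and the output bitstream share 'out': the
	 * range encoder writes 16-bit units forwards from the start, while the
	 * bitstream writes backwards from the end.  lzms_finalize() joins the
	 * two regions after checking that they did not collide. */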
	/* The main loop: parse and encode. */
	lzms_near_optimal_parse(c);

	/* Return the compressed data size, or 0 if the data didn't fit.  In
	 * the latter case, if we modified the caller's buffer in place
	 * (destructive mode), undo the x86 filter so the caller gets its
	 * input back unchanged. */
	result = lzms_finalize(c);
	if (!result && c->destructive)
		lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
	return result;
}
static void
lzms_free_compressor(void *_c)
{
	struct lzms_compressor *c = _c;

	if (!c->destructive)
		FREE(c->in_buffer);
	lcpit_matchfinder_destroy(&c->mf);
	ALIGNED_FREE(c);
}
const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory = lzms_get_needed_memory,
	.create_compressor = lzms_create_compressor,
	.compress = lzms_compress,
	.free_compressor = lzms_free_compressor,
};
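/*
 * Illustrative usage sketch (not part of this file; error handling omitted):
 * the generic compressor layer drives these ops roughly as follows for a
 * 'bufsize'-byte input.  'in', 'out', and 'bufsize' are assumed names here.
 *
 *	void *c;
 *	if (lzms_compressor_ops.create_compressor(bufsize, 50, false, &c) == 0) {
 *		size_t csize = lzms_compressor_ops.compress(in, bufsize, out,
 *							    bufsize - 1, c);
 *		// csize == 0 means the data didn't fit in bufsize - 1 bytes
 *		lzms_compressor_ops.free_compressor(c);
 *	}
 */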