/*
 * A compressor for the LZMS compression format.
 *
 * Copyright (C) 2013, 2014, 2015 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */

#include <limits.h>
#include <string.h>
#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/error.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash.h"
#include "wimlib/lzms_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"
/*
 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
 * looked up directly in 'fast_length_slot_tab' and the length cost can be
 * looked up directly in 'fast_length_cost_tab'.
 *
 * We also limit the 'nice_match_len' parameter to this value. Consequently, if
 * the longest match found is shorter than 'nice_match_len', then it must also
 * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups
 * of length costs using 'fast_length_cost_tab' without having to keep checking
 * whether the length exceeds MAX_FAST_LENGTH or not.
 */
#define MAX_FAST_LENGTH		255
/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
 * step forward before forcing the pending items to be encoded. If this value
 * is increased, then there will be fewer forced flushes, but the probability
 * entries and Huffman codes will be more likely to become outdated. */
#define NUM_OPTIM_NODES		2048
/* COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs. A single bit has a cost of (1 << COST_SHIFT). */
#define COST_SHIFT		6
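/*
 * Illustrative note: with COST_SHIFT == 6, costs are measured in units of
 * 1/64 bit, so a cost of 96 represents 1.5 bits and a bit with predicted
 * probability 1/2 costs exactly 64 units (see 'lzms_bit_costs' below).
 */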
/* Length of the hash table for finding delta matches */
#define DELTA_HASH_ORDER	17
#define DELTA_HASH_LENGTH	((u32)1 << DELTA_HASH_ORDER)
/* The number of bytes to hash when finding delta matches; also taken to be the
 * minimum length of an explicit offset delta match */
#define NBYTES_HASHED_FOR_DELTA	3
/* The number of delta match powers to consider (must be <=
 * LZMS_NUM_DELTA_POWER_SYMS) */
#define NUM_POWERS_TO_CONSIDER	6
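/*
 * Illustrative note: with NUM_POWERS_TO_CONSIDER == 6, delta matches are
 * searched for at spans of 1, 2, 4, 8, 16, and 32 bytes, since the span
 * corresponding to a power is computed as (u32)1 << power.
 */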
/* This structure tracks the state of writing bits as a series of 16-bit coding
 * units, starting at the end of the output buffer and proceeding backwards. */
struct lzms_output_bitstream {

	/* Bits that haven't yet been written to the output buffer */
	u64 bitbuf;

	/* Number of bits currently held in @bitbuf */
	unsigned bitcount;

	/* Pointer to the beginning of the output buffer (this is the "end" when
	 * writing backwards!) */
	u8 *begin;

	/* Pointer to just past the next position in the output buffer at which
	 * to output a 16-bit coding unit */
	u8 *next;
};
/* This structure tracks the state of range encoding and its output, which
 * starts at the beginning of the output buffer and proceeds forwards. */
struct lzms_range_encoder {

	/* The lower boundary of the current range. Logically, this is a 33-bit
	 * integer whose high bit is needed to detect carries. */
	u64 lower_bound;

	/* The size of the current range */
	u32 range_size;

	/* The next 16-bit coding unit to output */
	u32 cache;

	/* The number of 16-bit coding units whose output has been delayed due
	 * to possible carrying. The first such coding unit is @cache; all
	 * subsequent such coding units are 0xffff. */
	u32 cache_size;

	/* Pointer to the beginning of the output buffer */
	u8 *begin;

	/* Pointer to the position in the output buffer at which the next coding
	 * unit must be written */
	u8 *next;

	/* Pointer to just past the end of the output buffer */
	u8 *end;
};
/* Bookkeeping information for an adaptive Huffman code */
struct lzms_huffman_rebuild_info {

	/* The remaining number of symbols to encode until this code must be
	 * rebuilt */
	unsigned num_syms_until_rebuild;

	/* The number of symbols in this code */
	unsigned num_syms;

	/* The rebuild frequency of this code, in symbols */
	unsigned rebuild_freq;

	/* The Huffman codeword of each symbol in this code */
	u32 *codewords;

	/* The length of each Huffman codeword, in bits */
	u8 *lens;

	/* The frequency of each symbol in this code */
	u32 *freqs;
};
/*
 * The compressor-internal representation of a match or literal.
 *
 * Literals have length=1; matches have length > 1. (We disallow matches of
 * length 1, even though this is a valid length in LZMS.)
 *
 * The source is encoded as follows:
 *
 * - Literals: the literal byte itself
 * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
 * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
 * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
 *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
 * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
 *   the index of the (power, raw_offset) pair in recent_delta_pairs
 */
struct lzms_item {
	u32 length;
	u32 source;
};
#define DELTA_SOURCE_TAG		((u32)1 << 31)
#define DELTA_SOURCE_POWER_SHIFT	28
#define DELTA_SOURCE_RAW_OFFSET_MASK	(((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
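/*
 * Illustrative examples of 'source' values, assuming LZMS_NUM_LZ_REPS == 3
 * and LZMS_NUM_DELTA_REPS == 3 (the actual values are defined elsewhere in
 * wimlib):
 *
 *	- literal 'A':                          length == 1, source == 0x41
 *	- LZ match with explicit offset 1000:   source == 1000 + 2 == 1002
 *	- LZ match repeating queue entry 1:     source == 1
 *	- delta match repeating queue entry 0:  source == DELTA_SOURCE_TAG | 0
 */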
static _unused_attribute void
check_that_powers_fit_in_bitfield(void)
{
	STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
}
/* A stripped-down version of the adaptive state in LZMS which excludes the
 * probability entries and Huffman codes */
struct lzms_adaptive_state {

	/* Recent offsets for LZ matches */
	u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
	u32 prev_lz_offset; /* 0 means none */
	u32 upcoming_lz_offset; /* 0 means none */

	/* Recent (power, raw offset) pairs for delta matches.
	 * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
	 * offset, and the high bits are the power. */
	u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
	u32 prev_delta_pair; /* 0 means none */
	u32 upcoming_delta_pair; /* 0 means none */

	/* States for predicting the probabilities of item types */
	u8 main_state;
	u8 match_state;
	u8 lz_state;
	u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
	u8 delta_state;
	u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
} _aligned_attribute(64);
/*
 * This structure represents a byte position in the preprocessed input data and
 * a node in the graph of possible match/literal choices.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 */
struct lzms_optimum_node {

	/*
	 * The cost of the lowest-cost path that has been found to reach this
	 * position. This can change as progressively lower cost paths are
	 * found to reach this position.
	 */
	u32 cost;
#define INFINITE_COST UINT32_MAX

	/*
	 * @item is the last item that was taken to reach this position with
	 * the stored @cost. This can change as progressively lower cost paths
	 * are found to reach this position.
	 *
	 * In some cases we look ahead more than one item. If we looked ahead n
	 * items to reach this position, then @item is the last item taken,
	 * @extra_items contains the other items ordered from second-to-last to
	 * first, and @num_extra_items is n - 1.
	 */
	unsigned num_extra_items;
	struct lzms_item item;
	struct lzms_item extra_items[2];

	/*
	 * The adaptive state that exists at this position. This is filled in
	 * lazily, only after the minimum-cost path to this position is found.
	 *
	 * Note: the way the algorithm handles this adaptive state in the
	 * "minimum-cost" parse is actually only an approximation. It's
	 * possible for the globally optimal, minimum cost path to contain a
	 * prefix, ending at a position, where that path prefix is *not* the
	 * minimum cost path to that position. This can happen if such a path
	 * prefix results in a different adaptive state which results in lower
	 * costs later. Although the algorithm does do some heuristic
	 * multi-item lookaheads, it does not solve this problem in general.
	 *
	 * Note: this adaptive state structure also does not include the
	 * probability entries or current Huffman codewords. Those aren't
	 * maintained per-position and are only updated occasionally.
	 */
	struct lzms_adaptive_state state;
} _aligned_attribute(64);
/* The main compressor structure */
struct lzms_compressor {

	/* The matchfinder for LZ matches */
	struct lcpit_matchfinder mf;

	/* The preprocessed buffer of data being compressed */
	u8 *in_buffer;

	/* The number of bytes of data to be compressed, which is the number of
	 * bytes of data in @in_buffer that are actually valid */
	size_t in_nbytes;

	/*
	 * Boolean flags to enable consideration of various types of multi-step
	 * operations during parsing.
	 *
	 * Among other cases, multi-step operations can help with gaps where two
	 * matches are separated by a non-matching byte.
	 *
	 * This idea is borrowed from Igor Pavlov's LZMA encoder.
	 */
	bool try_lit_lzrep0;
	bool try_lzrep_lit_lzrep0;
	bool try_lzmatch_lit_lzrep0;

	/*
	 * If true, the compressor can use delta matches. This slows down
	 * compression. It improves the compression ratio greatly, slightly, or
	 * not at all, depending on the input data.
	 */
	bool use_delta_matches;

	/* If true, the compressor need not preserve the input buffer if it
	 * compresses the data successfully. */
	bool destructive;

	/* 'last_target_usages' is a large array that is only needed for
	 * preprocessing, so it is in union with fields that don't need to be
	 * initialized until after preprocessing. */
	union {
	struct {

	/* Temporary space to store matches found by the LZ matchfinder */
	struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];

	/* Hash table for finding delta matches */
	u32 delta_hash_table[DELTA_HASH_LENGTH];

	/* For each delta power, the hash code for the next sequence */
	u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];

	/* The per-byte graph nodes for near-optimal parsing */
	struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
					       1 + MAX_FAST_LENGTH];

	/* Table: length => current cost for small match lengths */
	u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];

	/* Range encoder which outputs to the beginning of the compressed data
	 * buffer, proceeding forwards */
	struct lzms_range_encoder rc;

	/* Bitstream which outputs to the end of the compressed data buffer,
	 * proceeding backwards */
	struct lzms_output_bitstream os;

	/* States and probability entries for item type disambiguation */
	unsigned main_state;
	unsigned match_state;
	unsigned lz_state;
	unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
	unsigned delta_state;
	unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
	struct lzms_probabilites probs;

	/* Huffman codes */

	struct lzms_huffman_rebuild_info literal_rebuild_info;
	u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
	u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
	u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];

	struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
	u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
	u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
	u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

	struct lzms_huffman_rebuild_info length_rebuild_info;
	u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
	u8 length_lens[LZMS_NUM_LENGTH_SYMS];
	u32 length_freqs[LZMS_NUM_LENGTH_SYMS];

	struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
	u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
	u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
	u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

	struct lzms_huffman_rebuild_info delta_power_rebuild_info;
	u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
	u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
	u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];

	}; /* anonymous struct */

	s32 last_target_usages[65536];

	}; /* anonymous union */

	/* Table: length => length slot for small match lengths */
	u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];

	/* Tables for mapping offsets to offset slots */

	/* slots [0, 167); 0 <= num_extra_bits <= 10 */
	u8 offset_slot_tab_1[0xe4a5];

	/* slots [167, 427); 11 <= num_extra_bits <= 15 */
	u16 offset_slot_tab_2[0x3d0000 >> 11];

	/* slots [427, 799); 16 <= num_extra_bits */
	u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
};
/******************************************************************************
 *                    Offset and length slot acceleration                     *
 ******************************************************************************/
/* Generate the acceleration table for length slots. */
static void
lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
{
	unsigned slot = 0;
	for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
		if (len >= lzms_length_slot_base[slot + 1])
			slot++;
		c->fast_length_slot_tab[len] = slot;
	}
}
/* Generate the acceleration tables for offset slots. */
static void
lzms_init_offset_slot_tabs(struct lzms_compressor *c)
{
	u32 offset;
	unsigned slot = 0;

	/* slots [0, 167); 0 <= num_extra_bits <= 10 */
	for (offset = 1; offset < 0xe4a5; offset++) {
		if (offset >= lzms_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_1[offset] = slot;
	}

	/* slots [167, 427); 11 <= num_extra_bits <= 15 */
	for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
		if (offset >= lzms_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
	}

	/* slots [427, 799); 16 <= num_extra_bits */
	for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
		if (offset >= lzms_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
	}
}
/*
 * Return the length slot for the specified match length, using the compressor's
 * acceleration table if the length is small enough.
 */
static forceinline unsigned
lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
{
	if (likely(length <= MAX_FAST_LENGTH))
		return c->fast_length_slot_tab[length];
	return lzms_get_length_slot(length);
}
/*
 * Return the offset slot for the specified match offset, using the compressor's
 * acceleration tables to speed up the mapping.
 */
static forceinline unsigned
lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
{
	if (offset < 0xe4a5)
		return c->offset_slot_tab_1[offset];
	offset -= 0xe4a5;
	if (offset < 0x3d0000)
		return c->offset_slot_tab_2[offset >> 11];
	return c->offset_slot_tab_3[offset >> 16];
}
/******************************************************************************
 *                              Range encoding                                *
 ******************************************************************************/
/*
 * Initialize the range encoder @rc to write forwards to the specified buffer
 * @out that is @size bytes long.
 */
static void
lzms_range_encoder_init(struct lzms_range_encoder *rc, u8 *out, size_t size)
{
	rc->lower_bound = 0;
	rc->range_size = 0xffffffff;
	rc->cache = 0;
	rc->cache_size = 1;
	rc->begin = out;
	rc->next = out - sizeof(le16);
	rc->end = out + (size & ~1);
}
/*
 * Attempt to flush bits from the range encoder.
 *
 * The basic idea is that we're writing bits from @rc->lower_bound to the
 * output. However, due to carrying, the writing of coding units with the
 * maximum value, as well as one prior coding unit, must be delayed until it is
 * determined whether a carry is needed.
 *
 * This is based on the public domain code for LZMA written by Igor Pavlov, but
 * with the following differences:
 *
 *	- In LZMS, 16-bit coding units are required rather than 8-bit.
 *
 *	- In LZMS, the first coding unit is not ignored by the decompressor, so
 *	  the encoder cannot output a dummy value to that position.
 */
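/*
 * Worked example of why the delay is needed: suppose the next 16-bit unit of
 * @lower_bound to be written is 0xffff. A later addition to @lower_bound could
 * still carry out of the low bits, in which case this unit must actually be
 * output as 0x0000 with the carry added to the unit before it. So 0xffff
 * units are merely counted in @cache_size; once a unit less than 0xffff (or a
 * carry) arrives, @cache is written with the carry added, and the delayed
 * units are written as 0xffff (no carry) or 0x0000 (carry).
 */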
static void
lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
{
	if ((u32)(rc->lower_bound) < 0xffff0000 ||
	    (u32)(rc->lower_bound >> 32) != 0)
	{
		/* Carry not needed (rc->lower_bound < 0xffff0000), or carry
		 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
		 * is 1) */
		do {
			if (likely(rc->next >= rc->begin)) {
				if (rc->next != rc->end) {
					put_unaligned_le16(rc->cache +
							   (u16)(rc->lower_bound >> 32),
							   rc->next);
					rc->next += sizeof(le16);
				}
			} else {
				rc->next += sizeof(le16);
			}
			rc->cache = 0xffff;
		} while (--rc->cache_size != 0);

		rc->cache = (rc->lower_bound >> 16) & 0xffff;
	}
	rc->cache_size++;
	rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
}
static bool
lzms_range_encoder_flush(struct lzms_range_encoder *rc)
{
	for (int i = 0; i < 4; i++)
		lzms_range_encoder_shift_low(rc);
	return rc->next != rc->end;
}
/*
 * Encode the next bit using the range encoder.
 *
 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
 * bit is 0 rather than 1.
 */
static forceinline void
lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
{
	/* Normalize if needed. */
	if (rc->range_size <= 0xffff) {
		rc->range_size <<= 16;
		lzms_range_encoder_shift_low(rc);
	}

	u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;

	if (bit == 0) {
		rc->range_size = bound;
	} else {
		rc->lower_bound += bound;
		rc->range_size -= bound;
	}
}
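/*
 * Worked example, assuming LZMS_PROBABILITY_BITS == 6 (i.e. probabilities out
 * of a denominator of 64): if rc->range_size == 0x01000000 and prob == 48,
 * then bound == (0x01000000 >> 6) * 48 == 0x00c00000. Encoding a 0 bit keeps
 * the low 3/4 of the range (range_size becomes 0x00c00000); encoding a 1 bit
 * keeps the high 1/4 (lower_bound advances by 0x00c00000 and range_size
 * becomes 0x00400000).
 */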
/*
 * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and
 * updating the state and its corresponding probability entry.
 */
static forceinline void
lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
		struct lzms_probability_entry *probs,
		struct lzms_range_encoder *rc)
{
	struct lzms_probability_entry *prob_entry;
	u32 prob;

	/* Load the probability entry for the current state. */
	prob_entry = &probs[*state_p];

	/* Update the state based on the next bit. */
	*state_p = ((*state_p << 1) | bit) & (num_states - 1);

	/* Get the probability that the bit is 0. */
	prob = lzms_get_probability(prob_entry);

	/* Update the probability entry. */
	lzms_update_probability_entry(prob_entry, bit);

	/* Encode the bit using the range encoder. */
	lzms_range_encode_bit(rc, bit, prob);
}
/* Helper functions for encoding bits in the various decision classes */

static void
lzms_encode_main_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
			c->probs.main, &c->rc);
}

static void
lzms_encode_match_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
			c->probs.match, &c->rc);
}

static void
lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
			c->probs.lz, &c->rc);
}

static void
lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
	lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
			c->probs.lz_rep[idx], &c->rc);
}

static void
lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
			c->probs.delta, &c->rc);
}

static void
lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
	lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
			c->probs.delta_rep[idx], &c->rc);
}
/******************************************************************************
 *                   Huffman encoding and verbatim bits                       *
 ******************************************************************************/
/*
 * Initialize the output bitstream @os to write backwards to the specified
 * buffer @out that is @size bytes long.
 */
static void
lzms_output_bitstream_init(struct lzms_output_bitstream *os,
			   u8 *out, size_t size)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->begin = out;
	os->next = out + (size & ~1);
}
/*
 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
 * output bitstream @os.
 *
 * @max_num_bits is a compile-time constant that specifies the maximum number of
 * bits that can ever be written at this call site.
 */
static forceinline void
lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
		const unsigned num_bits, const unsigned max_num_bits)
{
	/* Add the bits to the bit buffer variable. */
	os->bitcount += num_bits;
	os->bitbuf = (os->bitbuf << num_bits) | bits;

	/* Check whether any coding units need to be written. */
	while (os->bitcount >= 16) {

		os->bitcount -= 16;

		/* Write a coding unit, unless it would underflow the buffer. */
		if (os->next != os->begin) {
			os->next -= sizeof(le16);
			put_unaligned_le16(os->bitbuf >> os->bitcount, os->next);
		}

		/* Optimization for call sites that never write more than 16
		 * bits at once. */
		if (max_num_bits <= 16)
			break;
	}
}
/*
 * Flush the output bitstream, ensuring that all bits written to it have been
 * written to memory. Return %true if all bits have been output successfully,
 * or %false if an overrun occurred.
 */
static bool
lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
{
	if (os->next == os->begin)
		return false;

	if (os->bitcount != 0) {
		os->next -= sizeof(le16);
		put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
	}

	return true;
}
static void
lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
	make_canonical_huffman_code(rebuild_info->num_syms,
				    LZMS_MAX_CODEWORD_LENGTH,
				    rebuild_info->freqs,
				    rebuild_info->lens,
				    rebuild_info->codewords);
	rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
}
static void
lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
		       unsigned num_syms, unsigned rebuild_freq,
		       u32 *codewords, u8 *lens, u32 *freqs)
{
	rebuild_info->num_syms = num_syms;
	rebuild_info->rebuild_freq = rebuild_freq;
	rebuild_info->codewords = codewords;
	rebuild_info->lens = lens;
	rebuild_info->freqs = freqs;
	lzms_init_symbol_frequencies(freqs, num_syms);
	lzms_build_huffman_code(rebuild_info);
}
static void
lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
	lzms_build_huffman_code(rebuild_info);
	lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
}
/*
 * Encode a symbol using the specified Huffman code. Then, if the Huffman code
 * needs to be rebuilt, rebuild it and return true; otherwise return false.
 */
static forceinline bool
lzms_huffman_encode_symbol(unsigned sym,
			   const u32 *codewords, const u8 *lens, u32 *freqs,
			   struct lzms_output_bitstream *os,
			   struct lzms_huffman_rebuild_info *rebuild_info)
{
	lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
	++freqs[sym];
	if (--rebuild_info->num_syms_until_rebuild == 0) {
		lzms_rebuild_huffman_code(rebuild_info);
		return true;
	}
	return false;
}
/* Helper routines to encode symbols using the various Huffman codes */

static bool
lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->literal_codewords,
					  c->literal_lens, c->literal_freqs,
					  &c->os, &c->literal_rebuild_info);
}

static bool
lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
					  c->lz_offset_lens, c->lz_offset_freqs,
					  &c->os, &c->lz_offset_rebuild_info);
}

static bool
lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->length_codewords,
					  c->length_lens, c->length_freqs,
					  &c->os, &c->length_rebuild_info);
}

static bool
lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
					  c->delta_offset_lens, c->delta_offset_freqs,
					  &c->os, &c->delta_offset_rebuild_info);
}

static bool
lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
					  c->delta_power_lens, c->delta_power_freqs,
					  &c->os, &c->delta_power_rebuild_info);
}
static void
lzms_update_fast_length_costs(struct lzms_compressor *c);
/*
 * Encode a match length. If this causes the Huffman code for length symbols to
 * be rebuilt, also update the length costs array used by the parser.
 */
static void
lzms_encode_length(struct lzms_compressor *c, u32 length)
{
	unsigned slot = lzms_comp_get_length_slot(c, length);

	if (lzms_encode_length_symbol(c, slot))
		lzms_update_fast_length_costs(c);

	lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
			lzms_extra_length_bits[slot],
			LZMS_MAX_EXTRA_LENGTH_BITS);
}
/* Encode the offset of an LZ match. */
static void
lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, offset);

	lzms_encode_lz_offset_symbol(c, slot);
	lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
			lzms_extra_offset_bits[slot],
			LZMS_MAX_EXTRA_OFFSET_BITS);
}
/* Encode the raw offset of a delta match. */
static void
lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);

	lzms_encode_delta_offset_symbol(c, slot);
	lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
			lzms_extra_offset_bits[slot],
			LZMS_MAX_EXTRA_OFFSET_BITS);
}
/******************************************************************************
 *                              Item encoding                                 *
 ******************************************************************************/
/* Encode the specified item, which may be a literal or any type of match. */
static void
lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
{
	/* Main bit: 0 = literal, 1 = match */
	int main_bit = (length > 1);
	lzms_encode_main_bit(c, main_bit);

	if (!main_bit) {
		/* Literal */
		unsigned literal = source;
		lzms_encode_literal_symbol(c, literal);
	} else {
		/* Match */

		/* Match bit: 0 = LZ match, 1 = delta match */
		int match_bit = (source & DELTA_SOURCE_TAG) != 0;
		lzms_encode_match_bit(c, match_bit);

		if (!match_bit) {
			/* LZ match */

			/* LZ bit: 0 = explicit offset, 1 = repeat offset */
			int lz_bit = (source < LZMS_NUM_LZ_REPS);
			lzms_encode_lz_bit(c, lz_bit);

			if (!lz_bit) {
				/* Explicit offset LZ match */
				u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
				lzms_encode_lz_offset(c, offset);
			} else {
				/* Repeat offset LZ match */
				int rep_idx = source;
				for (int i = 0; i < rep_idx; i++)
					lzms_encode_lz_rep_bit(c, 1, i);
				if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
					lzms_encode_lz_rep_bit(c, 0, rep_idx);
			}
		} else {
			/* Delta match */

			source &= ~DELTA_SOURCE_TAG;

			/* Delta bit: 0 = explicit offset, 1 = repeat offset */
			int delta_bit = (source < LZMS_NUM_DELTA_REPS);
			lzms_encode_delta_bit(c, delta_bit);

			if (!delta_bit) {
				/* Explicit offset delta match */
				u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
				u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
						 (LZMS_NUM_DELTA_REPS - 1);
				lzms_encode_delta_power_symbol(c, power);
				lzms_encode_delta_raw_offset(c, raw_offset);
			} else {
				/* Repeat offset delta match */
				int rep_idx = source;
				for (int i = 0; i < rep_idx; i++)
					lzms_encode_delta_rep_bit(c, 1, i);
				if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
					lzms_encode_delta_rep_bit(c, 0, rep_idx);
			}
		}

		/* Match length (encoded the same way for any match type) */
		lzms_encode_length(c, length);
	}
}
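/*
 * Illustrative trace, assuming LZMS_NUM_LZ_REP_DECISIONS >= 3: a repeat offset
 * LZ match with rep_idx == 2 is emitted as main bit 1 (match), match bit 0
 * (LZ), LZ bit 1 (repeat), LZ-rep bits 1, 1, 0 (selecting queue index 2), and
 * finally the encoded length. A literal is just main bit 0 followed by its
 * literal symbol.
 */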
/* Encode a list of matches and literals chosen by the parsing algorithm. */
static void
lzms_encode_nonempty_item_list(struct lzms_compressor *c,
			       struct lzms_optimum_node *end_node)
{
	/* Since we've stored at each node the item we took to arrive at that
	 * node, we can trace our chosen path in backwards order. However, for
	 * encoding we need to trace our chosen path in forwards order. To make
	 * this possible, the following loop moves the items from their
	 * destination nodes to their source nodes, which effectively reverses
	 * the path. (Think of it like reversing a singly-linked list.) */
	struct lzms_optimum_node *cur_node = end_node;
	struct lzms_item saved_item = cur_node->item;
	do {
		struct lzms_item item = saved_item;
		if (cur_node->num_extra_items > 0) {
			/* Handle an arrival via multi-item lookahead. */
			unsigned i = 0;
			struct lzms_optimum_node *orig_node = cur_node;
			do {
				cur_node -= item.length;
				cur_node->item = item;
				item = orig_node->extra_items[i];
			} while (++i != orig_node->num_extra_items);
		}
		cur_node -= item.length;
		saved_item = cur_node->item;
		cur_node->item = item;
	} while (cur_node != c->optimum_nodes);

	/* Now trace the chosen path in forwards order, encoding each item. */
	do {
		lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
		cur_node += cur_node->item.length;
	} while (cur_node != end_node);
}
static forceinline void
lzms_encode_item_list(struct lzms_compressor *c,
		      struct lzms_optimum_node *end_node)
{
	if (end_node != c->optimum_nodes)
		lzms_encode_nonempty_item_list(c, end_node);
}
/******************************************************************************
 *                             Cost evaluation                                *
 ******************************************************************************/
/*
 * If p is the predicted probability of the next bit being a 0, then the number
 * of bits required to encode a 0 bit using a binary range encoder is the real
 * number -log2(p), and the number of bits required to encode a 1 bit is the
 * real number -log2(1 - p). To avoid computing either of these expressions at
 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
 * probability to cost for each possible probability. Specifically, the array
 * indices are the numerators of the possible probabilities in LZMS, where the
 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
 * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
 * adjacent values, since these probabilities are not actually permitted. This
 * allows us to use the num_recent_zero_bits value from the
 * lzms_probability_entry as the array index without fixing up these two special
 * cases.
 */
static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
	384, 384, 320, 283, 256, 235, 219, 204,
	192, 181, 171, 163, 155, 147, 140, 134,
	128, 122, 117, 112, 107, 103, 99,  94,
	91,  87,  83,  80,  76,  73,  70,  67,
	64,  61,  58,  56,  53,  51,  48,  46,
	43,  41,  39,  37,  35,  33,  30,  29,
	27,  25,  23,  21,  19,  17,  16,  14,
	12,  11,  9,   8,   6,   4,   3,   1,
	1
};
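/*
 * Quick sanity check on the table: index 32 is probability 32/64 == 1/2, and
 * lzms_bit_costs[32] == 64, i.e. exactly one bit at COST_SHIFT == 6. Index 48
 * is probability 3/4, and lzms_bit_costs[48] == 27 == round(-log2(0.75) * 64).
 */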
static _unused_attribute void
check_cost_shift(void)
{
	/* lzms_bit_costs is hard-coded to the current COST_SHIFT. */
	STATIC_ASSERT(COST_SHIFT == 6);
}
#if 0
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
	for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
		u32 prob = i;
		if (prob == 0)
			prob++;
		else if (prob == LZMS_PROBABILITY_DENOMINATOR)
			prob--;

		lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
					  (1 << COST_SHIFT));
	}
}
#endif
/* Return the cost to encode a 0 bit in the specified context. */
static forceinline u32
lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
{
	return lzms_bit_costs[probs[state].num_recent_zero_bits];
}
/* Return the cost to encode a 1 bit in the specified context. */
static forceinline u32
lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
{
	return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
			      probs[state].num_recent_zero_bits];
}
/* Return the cost to encode a literal, including the main bit. */
static forceinline u32
lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
{
	return lzms_bit_0_cost(main_state, c->probs.main) +
	       ((u32)c->literal_lens[literal] << COST_SHIFT);
}
/* Update 'fast_length_cost_tab' to use the latest Huffman code. */
static void
lzms_update_fast_length_costs(struct lzms_compressor *c)
{
	int slot = -1;
	u32 cost = 0;
	for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
		if (len >= lzms_length_slot_base[slot + 1]) {
			slot++;
			cost = (u32)(c->length_lens[slot] +
				     lzms_extra_length_bits[slot]) << COST_SHIFT;
		}
		c->fast_length_cost_tab[len] = cost;
	}
}
/* Return the cost to encode the specified match length, which must not exceed
 * MAX_FAST_LENGTH. */
static forceinline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
	return c->fast_length_cost_tab[length];
}
/* Return the cost to encode the specified LZ match offset. */
static forceinline u32
lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, offset);
	u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
	return num_bits << COST_SHIFT;
}
/* Return the cost to encode the specified delta power and raw offset. */
static forceinline u32
lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
	u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
		       lzms_extra_offset_bits[slot];
	return num_bits << COST_SHIFT;
}
/******************************************************************************
 *                             Adaptive state                                 *
 ******************************************************************************/
static void
lzms_init_adaptive_state(struct lzms_adaptive_state *state)
{
	for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
		state->recent_lz_offsets[i] = i + 1;
	state->prev_lz_offset = 0;
	state->upcoming_lz_offset = 0;

	for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
		state->recent_delta_pairs[i] = i + 1;
	state->prev_delta_pair = 0;
	state->upcoming_delta_pair = 0;

	state->main_state = 0;
	state->match_state = 0;
	state->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		state->lz_rep_states[i] = 0;
	state->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		state->delta_rep_states[i] = 0;
}
/*
 * Update the LRU queues for match sources when advancing by one item.
 *
 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
 * complex:
 *
 *	- there are separate queues for LZ and delta matches
 *	- updates to the queues are delayed by one encoded item (this prevents
 *	  sources from being bumped up to index 0 too early)
 */
static forceinline void
lzms_update_lru_queues(struct lzms_adaptive_state *state)
{
	if (state->prev_lz_offset != 0) {
		for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
			state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
		state->recent_lz_offsets[0] = state->prev_lz_offset;
	}
	state->prev_lz_offset = state->upcoming_lz_offset;

	if (state->prev_delta_pair != 0) {
		for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
			state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
		state->recent_delta_pairs[0] = state->prev_delta_pair;
	}
	state->prev_delta_pair = state->upcoming_delta_pair;
}
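/*
 * Illustration of the delayed update: if the item chosen at position i is an
 * LZ match, its offset is first stored in @upcoming_lz_offset. The update
 * performed when advancing to position i + 1 moves it into @prev_lz_offset,
 * and only the update for position i + 2 pushes it onto the front of
 * @recent_lz_offsets.
 */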
static forceinline void
lzms_update_state(u8 *state_p, int bit, unsigned num_states)
{
	*state_p = ((*state_p << 1) | bit) & (num_states - 1);
}
static forceinline void
lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
{
	lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
}

static forceinline void
lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
{
	lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
}

static forceinline void
lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
{
	lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
}

static forceinline void
lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
	for (int i = 0; i < rep_idx; i++)
		lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);

	if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
		lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
}

static forceinline void
lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
{
	lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
}

static forceinline void
lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
	for (int i = 0; i < rep_idx; i++)
		lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);

	if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
		lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
}
/******************************************************************************
 *                              Matchfinding                                  *
 ******************************************************************************/
/* Note: this code just handles finding delta matches. The code for finding LZ
 * matches is elsewhere. */
/* Initialize the delta matchfinder for a new input buffer. */
static void
lzms_init_delta_matchfinder(struct lzms_compressor *c)
{
	/* Set all entries to use an invalid power, which will never match. */
	STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
	memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));

	/* Initialize the next hash code for each power. We can just use zeroes
	 * initially; it doesn't really matter. */
	for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
		c->next_delta_hashes[i] = 0;
}
/*
 * Compute a DELTA_HASH_ORDER-bit hash code for the first
 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
 * delta context with the specified @span.
 */
static forceinline u32
lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
{
	/* A delta match has a certain span and an offset that is a multiple of
	 * that span. To reduce wasted space we use a single combined hash
	 * table for all spans and positions, but to minimize collisions we
	 * include in the hash code computation the span and the low-order bits
	 * of the current position. */

	STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
	u8 d0 = *(p + 0) - *(p + 0 - span);
	u8 d1 = *(p + 1) - *(p + 1 - span);
	u8 d2 = *(p + 2) - *(p + 2 - span);
	u32 v = ((span + (pos & (span - 1))) << 24) |
		((u32)d2 << 16) | ((u32)d1 << 8) | d0;
	return lz_hash(v, DELTA_HASH_ORDER);
}
/*
 * Given a match between @in_next and @matchptr in a delta context with the
 * specified @span and having the initial @len, extend the match as far as
 * possible, up to a limit of @max_len.
 */
static forceinline u32
lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
			u32 len, u32 max_len, u32 span)
{
	while (len < max_len &&
	       (u8)(*(in_next + len) - *(in_next + len - span)) ==
	       (u8)(*(matchptr + len) - *(matchptr + len - span)))
		len++;
	return len;
}
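/*
 * Illustrative example of a delta match: in the byte sequence
 *
 *	10 20 30 40 14 24 34 44
 *
 * the bytes at positions 4..7, taken with span 4, differ from the bytes one
 * span earlier by 4 4 4 4. Another region with the same per-span differences
 * (and matching position alignment) hashes to the same value in
 * lzms_delta_hash() and extends into a delta match via
 * lzms_extend_delta_match(), even though the literal bytes themselves differ.
 */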
static void
lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
				  const u8 *in_next, u32 count)
{
	u32 pos = in_next - c->in_buffer;
	if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
		return;
	do {
		/* Update the hash table for each power. */
		for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
			const u32 span = (u32)1 << power;
			if (unlikely(pos < span))
				continue;
			const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
			const u32 hash = c->next_delta_hashes[power];
			c->delta_hash_table[hash] =
				(power << DELTA_SOURCE_POWER_SHIFT) | pos;
			c->next_delta_hashes[power] = next_hash;
			prefetchw(&c->delta_hash_table[next_hash]);
		}
	} while (in_next++, pos++, --count);
}
/*
 * Skip the next @count bytes (don't search for matches at them). @in_next
 * points to the first byte to skip. The return value is @in_next + count.
 */
static const u8 *
lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
{
	lcpit_matchfinder_skip_bytes(&c->mf, count);
	if (c->use_delta_matches)
		lzms_delta_matchfinder_skip_bytes(c, in_next, count);
	return in_next + count;
}
/******************************************************************************
 *                         "Near-optimal" parsing                             *
 ******************************************************************************/
/*
 * The main near-optimal parsing routine.
 *
 * Briefly, the algorithm does an approximate minimum-cost path search to find a
 * "near-optimal" sequence of matches and literals to output, based on the
 * current cost model. The algorithm steps forward, position by position (byte
 * by byte), and updates the minimum cost path to reach each later position that
 * can be reached using a match or literal from the current position. This is
 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
 * the graph edges are possible matches/literals to code, and the cost of each
 * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
 * required to output the corresponding match or literal. But one difference is
 * that we actually compute the lowest-cost path in pieces, where each piece is
 * terminated when there are no choices to be made.
 *
 * The costs of literals and matches are estimated using the range encoder
 * states and the semi-adaptive Huffman codes. Except for range encoding
 * states, costs are assumed to be constant throughout a single run of the
 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This
 * introduces a source of non-optimality because the probabilities and Huffman
 * codes can change over this part of the data. And of course, there are
 * various other reasons why the result isn't optimal in terms of compression
 * ratio.
 */
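/*
 * Sketch of the relaxation step used throughout the loop below (illustrative
 * only): when an item of length 'len' and estimated cost 'edge_cost' is
 * considered at 'cur_node', the parser does
 *
 *	if (cur_node->cost + edge_cost < (cur_node + len)->cost) {
 *		(cur_node + len)->cost = cur_node->cost + edge_cost;
 *		(cur_node + len)->item = the considered item;
 *	}
 *
 * so that when the algorithm later advances to 'cur_node + len', the stored
 * cost is the minimum over all in-edges considered so far.
 */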
static void
lzms_near_optimal_parse(struct lzms_compressor *c)
{
	const u8 *in_next = c->in_buffer;
	const u8 * const in_end = &c->in_buffer[c->in_nbytes];
	struct lzms_optimum_node *cur_node;
	struct lzms_optimum_node *end_node;
	u32 num_matches;

	/* Set initial length costs for lengths <= MAX_FAST_LENGTH. */
	lzms_update_fast_length_costs(c);

	/* Set up the initial adaptive state. */
	lzms_init_adaptive_state(&c->optimum_nodes[0].state);

begin:
	/* Start building a new list of items, which will correspond to the next
	 * piece of the overall minimum-cost path. */

	cur_node = c->optimum_nodes;
	cur_node->cost = 0;
	end_node = cur_node;

	if (in_next == in_end)
		return;

	/* The following loop runs once per byte in the input buffer, except in
	 * a few shortcut cases. */
	for (;;) {
		/* Repeat offset LZ matches */
		if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
			   in_end - in_next >= 2))
		{
			for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {

				/* Looking for a repeat offset LZ match at queue
				 * index @rep_idx */

				const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
				const u8 * const matchptr = in_next - offset;

				/* Check the first 2 bytes before entering the extension loop. */
				if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
					continue;

				/* Extend the match to its full length. */
				const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);

				/* Early out for long repeat offset LZ match */
				if (rep_len >= c->mf.nice_match_len) {

					in_next = lzms_skip_bytes(c, rep_len, in_next);

					lzms_encode_item_list(c, cur_node);
					lzms_encode_item(c, rep_len, rep_idx);

					c->optimum_nodes[0].state = cur_node->state;
					cur_node = &c->optimum_nodes[0];

					cur_node->state.upcoming_lz_offset =
						cur_node->state.recent_lz_offsets[rep_idx];
					cur_node->state.upcoming_delta_pair = 0;
					for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
						cur_node->state.recent_lz_offsets[i] =
							cur_node->state.recent_lz_offsets[i + 1];
					lzms_update_lru_queues(&cur_node->state);
					lzms_update_main_state(&cur_node->state, 1);
					lzms_update_match_state(&cur_node->state, 0);
					lzms_update_lz_state(&cur_node->state, 1);
					lzms_update_lz_rep_states(&cur_node->state, rep_idx);
					goto begin;
				}

				while (end_node < cur_node + rep_len)
					(++end_node)->cost = INFINITE_COST;

				u32 base_cost = cur_node->cost +
						lzms_bit_1_cost(cur_node->state.main_state,
								c->probs.main) +
						lzms_bit_0_cost(cur_node->state.match_state,
								c->probs.match) +
						lzms_bit_1_cost(cur_node->state.lz_state,
								c->probs.lz);

				for (int i = 0; i < rep_idx; i++)
					base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
								     c->probs.lz_rep[i]);

				if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
					base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
								     c->probs.lz_rep[rep_idx]);

				u32 len = 2;
				do {
					u32 cost = base_cost + lzms_fast_length_cost(c, len);
					if (cost < (cur_node + len)->cost) {
						(cur_node + len)->cost = cost;
						(cur_node + len)->item = (struct lzms_item) {
							.length = len,
							.source = rep_idx,
						};
						(cur_node + len)->num_extra_items = 0;
					}
				} while (++len <= rep_len);
				/* try LZ-rep + lit + LZ-rep0 */
				if (c->try_lzrep_lit_lzrep0 &&
				    in_end - (in_next + rep_len) >= 3 &&
				    load_u16_unaligned(in_next + rep_len + 1) ==
				    load_u16_unaligned(matchptr + rep_len + 1))
				{
					const u32 rep0_len = lz_extend(in_next + rep_len + 1,
								       matchptr + rep_len + 1,
								       2,
								       min(c->mf.nice_match_len,
									   in_end - (in_next + rep_len + 1)));

					unsigned main_state = cur_node->state.main_state;
					unsigned match_state = cur_node->state.match_state;
					unsigned lz_state = cur_node->state.lz_state;
					unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];

					/* update states for LZ-rep */
					main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
					match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
					lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
					lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
							LZMS_NUM_LZ_REP_PROBS;

					/* add cost of LZ-rep */
					u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);

					/* add literal cost */
					cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));

					/* update state for literal */
					main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;

					/* add LZ-rep0 cost */
					cost += lzms_bit_1_cost(main_state, c->probs.main) +
						lzms_bit_0_cost(match_state, c->probs.match) +
						lzms_bit_1_cost(lz_state, c->probs.lz) +
						lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
						lzms_fast_length_cost(c, rep0_len);

					const u32 total_len = rep_len + 1 + rep0_len;

					while (end_node < cur_node + total_len)
						(++end_node)->cost = INFINITE_COST;

					if (cost < (cur_node + total_len)->cost) {
						(cur_node + total_len)->cost = cost;
						(cur_node + total_len)->item = (struct lzms_item) {
							.length = rep0_len,
							.source = 0,
						};
						(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
							.length = 1,
							.source = *(in_next + rep_len),
						};
						(cur_node + total_len)->extra_items[1] = (struct lzms_item) {
							.length = rep_len,
							.source = rep_idx,
						};
						(cur_node + total_len)->num_extra_items = 2;
					}
				}
			}
		}
		/* Repeat offset delta matches */
		if (c->use_delta_matches &&
		    likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
			   (in_end - in_next >= 2)))
		{
			for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {

				/* Looking for a repeat offset delta match at
				 * queue index @rep_idx */

				const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
				const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
				const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
				const u32 span = (u32)1 << power;
				const u32 offset = raw_offset << power;
				const u8 * const matchptr = in_next - offset;

				/* Check the first 2 bytes before entering the
				 * extension loop. */
				if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
				     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
				    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
				     (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
					continue;

				/* Extend the match to its full length. */
				const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
									    2, in_end - in_next,
									    span);

				/* Early out for long repeat offset delta match */
				if (rep_len >= c->mf.nice_match_len) {

					in_next = lzms_skip_bytes(c, rep_len, in_next);

					lzms_encode_item_list(c, cur_node);
					lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);

					c->optimum_nodes[0].state = cur_node->state;
					cur_node = &c->optimum_nodes[0];

					cur_node->state.upcoming_delta_pair = pair;
					cur_node->state.upcoming_lz_offset = 0;
					for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
						cur_node->state.recent_delta_pairs[i] =
							cur_node->state.recent_delta_pairs[i + 1];
					lzms_update_lru_queues(&cur_node->state);
					lzms_update_main_state(&cur_node->state, 1);
					lzms_update_match_state(&cur_node->state, 1);
					lzms_update_delta_state(&cur_node->state, 1);
					lzms_update_delta_rep_states(&cur_node->state, rep_idx);
					goto begin;
				}

				while (end_node < cur_node + rep_len)
					(++end_node)->cost = INFINITE_COST;

				u32 base_cost = cur_node->cost +
						lzms_bit_1_cost(cur_node->state.main_state,
								c->probs.main) +
						lzms_bit_1_cost(cur_node->state.match_state,
								c->probs.match) +
						lzms_bit_1_cost(cur_node->state.delta_state,
								c->probs.delta);

				for (int i = 0; i < rep_idx; i++)
					base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
								     c->probs.delta_rep[i]);

				if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
					base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
								     c->probs.delta_rep[rep_idx]);

				u32 len = 2;
				do {
					u32 cost = base_cost + lzms_fast_length_cost(c, len);
					if (cost < (cur_node + len)->cost) {
						(cur_node + len)->cost = cost;
						(cur_node + len)->item = (struct lzms_item) {
							.length = len,
							.source = DELTA_SOURCE_TAG | rep_idx,
						};
						(cur_node + len)->num_extra_items = 0;
					}
				} while (++len <= rep_len);
			}
		}
		/* Explicit offset LZ matches */
		num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
		if (num_matches) {

			u32 best_len = c->matches[0].length;

			/* Early out for long explicit offset LZ match */
			if (best_len >= c->mf.nice_match_len) {

				const u32 offset = c->matches[0].offset;

				/* Extend the match as far as possible.
				 * This is necessary because the LCP-interval
				 * tree matchfinder only reports up to
				 * nice_match_len bytes. */
				best_len = lz_extend(in_next, in_next - offset,
						     best_len, in_end - in_next);

				in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);

				lzms_encode_item_list(c, cur_node);
				lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);

				c->optimum_nodes[0].state = cur_node->state;
				cur_node = &c->optimum_nodes[0];

				cur_node->state.upcoming_lz_offset = offset;
				cur_node->state.upcoming_delta_pair = 0;
				lzms_update_lru_queues(&cur_node->state);
				lzms_update_main_state(&cur_node->state, 1);
				lzms_update_match_state(&cur_node->state, 0);
				lzms_update_lz_state(&cur_node->state, 0);
				goto begin;
			}

			while (end_node < cur_node + best_len)
				(++end_node)->cost = INFINITE_COST;

			u32 base_cost = cur_node->cost +
					lzms_bit_1_cost(cur_node->state.main_state,
							c->probs.main) +
					lzms_bit_0_cost(cur_node->state.match_state,
							c->probs.match) +
					lzms_bit_0_cost(cur_node->state.lz_state,
							c->probs.lz);
			if (c->try_lzmatch_lit_lzrep0 &&
			    likely(in_end - (in_next + c->matches[0].length) >= 3))
			{
				/* try LZ-match + lit + LZ-rep0 */

				u32 l = 2;
				u32 i = num_matches - 1;
				do {
					const u32 len = c->matches[i].length;
					const u32 offset = c->matches[i].offset;
					const u32 position_cost = base_cost +
								  lzms_lz_offset_cost(c, offset);
					do {
						u32 cost = position_cost + lzms_fast_length_cost(c, l);
						if (cost < (cur_node + l)->cost) {
							(cur_node + l)->cost = cost;
							(cur_node + l)->item = (struct lzms_item) {
								.length = l,
								.source = offset + (LZMS_NUM_LZ_REPS - 1),
							};
							(cur_node + l)->num_extra_items = 0;
						}
					} while (++l <= len);

					const u8 * const matchptr = in_next - offset;
					if (load_u16_unaligned(matchptr + len + 1) !=
					    load_u16_unaligned(in_next + len + 1))
						continue;

					const u32 rep0_len = lz_extend(in_next + len + 1,
								       matchptr + len + 1,
								       2,
								       min(c->mf.nice_match_len,
									   in_end - (in_next + len + 1)));

					unsigned main_state = cur_node->state.main_state;
					unsigned match_state = cur_node->state.match_state;
					unsigned lz_state = cur_node->state.lz_state;

					/* update states for LZ-match */
					main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
					match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
					lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;

					/* add cost of LZ-match */
					u32 cost = position_cost + lzms_fast_length_cost(c, len);

					/* add literal cost */
					cost += lzms_literal_cost(c, main_state, *(in_next + len));

					/* update state for literal */
					main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;

					/* add LZ-rep0 cost */
					cost += lzms_bit_1_cost(main_state, c->probs.main) +
						lzms_bit_0_cost(match_state, c->probs.match) +
						lzms_bit_1_cost(lz_state, c->probs.lz) +
						lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
								c->probs.lz_rep[0]) +
						lzms_fast_length_cost(c, rep0_len);

					const u32 total_len = len + 1 + rep0_len;

					while (end_node < cur_node + total_len)
						(++end_node)->cost = INFINITE_COST;

					if (cost < (cur_node + total_len)->cost) {
						(cur_node + total_len)->cost = cost;
						(cur_node + total_len)->item = (struct lzms_item) {
							.length = rep0_len,
							.source = 0,
						};
						(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
							.length = 1,
							.source = *(in_next + len),
						};
						(cur_node + total_len)->extra_items[1] = (struct lzms_item) {
							.length = len,
							.source = offset + LZMS_NUM_LZ_REPS - 1,
						};
						(cur_node + total_len)->num_extra_items = 2;
					}
				} while (i--);
			} else {
				u32 l = 2;
				u32 i = num_matches - 1;
				do {
					u32 position_cost = base_cost +
							    lzms_lz_offset_cost(c, c->matches[i].offset);
					do {
						u32 cost = position_cost + lzms_fast_length_cost(c, l);
						if (cost < (cur_node + l)->cost) {
							(cur_node + l)->cost = cost;
							(cur_node + l)->item = (struct lzms_item) {
								.length = l,
								.source = c->matches[i].offset +
									  (LZMS_NUM_LZ_REPS - 1),
							};
							(cur_node + l)->num_extra_items = 0;
						}
					} while (++l <= c->matches[i].length);
				} while (i--);
			}
		}
		/* Explicit offset delta matches */
		if (c->use_delta_matches &&
		    likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
		{
			const u32 pos = in_next - c->in_buffer;

			/* Consider each possible power (log2 of span) */
			STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
			for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {

				const u32 span = (u32)1 << power;

				if (unlikely(pos < span))
					continue;

				const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
				const u32 hash = c->next_delta_hashes[power];
				const u32 cur_match = c->delta_hash_table[hash];

				c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
				c->next_delta_hashes[power] = next_hash;
				prefetchw(&c->delta_hash_table[next_hash]);

				if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
					continue;

				const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);

				/* The offset must be a multiple of span. */
				if (offset & (span - 1))
					continue;

				const u8 * const matchptr = in_next - offset;

				/* Check the first 3 bytes before entering the
				 * extension loop. */
				STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
				if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
				     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
				    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
				     (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
				    ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
				     (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
					continue;

				/* Extend the delta match to its full length. */
				const u32 len = lzms_extend_delta_match(in_next,
									matchptr,
									NBYTES_HASHED_FOR_DELTA,
									in_end - in_next,
									span);

				const u32 raw_offset = offset >> power;

				if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
					     (LZMS_NUM_DELTA_REPS - 1)))
					continue;

				const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
						 raw_offset;
				const u32 source = DELTA_SOURCE_TAG |
						   (pair + LZMS_NUM_DELTA_REPS - 1);

				/* Early out for long explicit offset delta match */
				if (len >= c->mf.nice_match_len) {

					in_next = lzms_skip_bytes(c, len - 1, in_next + 1);

					lzms_encode_item_list(c, cur_node);
					lzms_encode_item(c, len, source);

					c->optimum_nodes[0].state = cur_node->state;
					cur_node = &c->optimum_nodes[0];

					cur_node->state.upcoming_lz_offset = 0;
					cur_node->state.upcoming_delta_pair = pair;
					lzms_update_lru_queues(&cur_node->state);
					lzms_update_main_state(&cur_node->state, 1);
					lzms_update_match_state(&cur_node->state, 1);
					lzms_update_delta_state(&cur_node->state, 0);
					goto begin;
				}

				while (end_node < cur_node + len)
					(++end_node)->cost = INFINITE_COST;

				u32 base_cost = cur_node->cost +
						lzms_bit_1_cost(cur_node->state.main_state,
								c->probs.main) +
						lzms_bit_1_cost(cur_node->state.match_state,
								c->probs.match) +
						lzms_bit_0_cost(cur_node->state.delta_state,
								c->probs.delta) +
						lzms_delta_source_cost(c, power, raw_offset);

				u32 l = NBYTES_HASHED_FOR_DELTA;
				do {
					u32 cost = base_cost + lzms_fast_length_cost(c, l);
					if (cost < (cur_node + l)->cost) {
						(cur_node + l)->cost = cost;
						(cur_node + l)->item = (struct lzms_item) {
							.length = l,
							.source = source,
						};
						(cur_node + l)->num_extra_items = 0;
					}
				} while (++l <= len);
			}
		}
1824 if (end_node < cur_node + 1)
1825 (++end_node)->cost = INFINITE_COST;
1826 const u32 cur_and_lit_cost = cur_node->cost +
1827 lzms_literal_cost(c, cur_node->state.main_state,
1829 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1830 (cur_node + 1)->cost = cur_and_lit_cost;
1831 (cur_node + 1)->item = (struct lzms_item) {
1835 (cur_node + 1)->num_extra_items = 0;
		} else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
			/* try lit + LZ-rep0 */
			const u32 offset =
				(cur_node->state.prev_lz_offset) ?
					cur_node->state.prev_lz_offset :
					cur_node->state.recent_lz_offsets[0];

			if (load_u16_unaligned(in_next + 1) ==
			    load_u16_unaligned(in_next + 1 - offset))
			{
				const u32 rep0_len = lz_extend(in_next + 1,
							       in_next + 1 - offset,
							       2,
							       min(in_end - (in_next + 1),
								   c->mf.nice_match_len));
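				/* (This probe runs only when the literal
				 * alone did not improve the next node: a
				 * literal followed immediately by a rep0
				 * match can still beat both pure paths, and
				 * matching 2 bytes up front keeps the probe
				 * cheap.) */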
				unsigned main_state = cur_node->state.main_state;

				/* Update main_state after literal */
				main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;

				/* Add cost of LZ-rep0 */
				const u32 cost = cur_and_lit_cost +
						 lzms_bit_1_cost(main_state, c->probs.main) +
						 lzms_bit_0_cost(cur_node->state.match_state,
								 c->probs.match) +
						 lzms_bit_1_cost(cur_node->state.lz_state,
								 c->probs.lz) +
						 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
								 c->probs.lz_rep[0]) +
						 lzms_fast_length_cost(c, rep0_len);
				const u32 total_len = 1 + rep0_len;

				while (end_node < cur_node + total_len)
					(++end_node)->cost = INFINITE_COST;

				if (cost < (cur_node + total_len)->cost) {
					(cur_node + total_len)->cost = cost;
					(cur_node + total_len)->item = (struct lzms_item) {
						.length = rep0_len,
						.source = 0,
					};
					(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
						.length = 1,
						.source = *in_next,
					};
					(cur_node + total_len)->num_extra_items = 1;
				}
			}
		}
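		/* (Note the item layout for multi-item paths: 'item' holds
		 * the final item of the path, the rep0 match, while
		 * 'extra_items' holds the preceding items in reverse order,
		 * here just the literal.) */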
		/* Advance to the next position. */
		in_next++;
		cur_node++;
		/* The lowest-cost path to the current position is now known.
		 * Finalize the adaptive state that results from taking this
		 * lowest-cost path. */
		struct lzms_item item_to_take = cur_node->item;
		struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
		int next_item_idx = -1;
		for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
			item_to_take = cur_node->extra_items[i];
			source_node -= item_to_take.length;
			next_item_idx++;
		}
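		/* (source_node now points at the node where the chosen path
		 * begins; its saved state is the starting point for replaying
		 * the path's items in order below.) */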
		cur_node->state = source_node->state;
		for (;;) {
			const u32 length = item_to_take.length;
			u32 source = item_to_take.source;

			cur_node->state.upcoming_lz_offset = 0;
			cur_node->state.upcoming_delta_pair = 0;
			if (length > 1) {
				/* Match */

				lzms_update_main_state(&cur_node->state, 1);
				if (source & DELTA_SOURCE_TAG) {
					/* Delta match */

					lzms_update_match_state(&cur_node->state, 1);
					source &= ~DELTA_SOURCE_TAG;

					if (source >= LZMS_NUM_DELTA_REPS) {
						/* Explicit offset delta match */
						lzms_update_delta_state(&cur_node->state, 0);
						cur_node->state.upcoming_delta_pair =
							source - (LZMS_NUM_DELTA_REPS - 1);
					} else {
						/* Repeat offset delta match */
						int rep_idx = source;

						lzms_update_delta_state(&cur_node->state, 1);
						lzms_update_delta_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_delta_pair =
							cur_node->state.recent_delta_pairs[rep_idx];
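						/* (The used pair is removed
						 * from the queue below; LZMS's
						 * delayed LRU update later
						 * reinserts it at the front
						 * via 'upcoming_delta_pair'.) */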
						for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
							cur_node->state.recent_delta_pairs[i] =
								cur_node->state.recent_delta_pairs[i + 1];
					}
				} else {
					/* LZ match */

					lzms_update_match_state(&cur_node->state, 0);

					if (source >= LZMS_NUM_LZ_REPS) {
						/* Explicit offset LZ match */
						lzms_update_lz_state(&cur_node->state, 0);
						cur_node->state.upcoming_lz_offset =
							source - (LZMS_NUM_LZ_REPS - 1);
					} else {
						/* Repeat offset LZ match */
						int rep_idx = source;

						lzms_update_lz_state(&cur_node->state, 1);
						lzms_update_lz_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_lz_offset =
							cur_node->state.recent_lz_offsets[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
							cur_node->state.recent_lz_offsets[i] =
								cur_node->state.recent_lz_offsets[i + 1];
					}
				}
			} else {
				/* Literal */
				lzms_update_main_state(&cur_node->state, 0);
			}
			lzms_update_lru_queues(&cur_node->state);

			if (next_item_idx < 0)
				break;
			if (next_item_idx == 0)
				item_to_take = cur_node->item;
			else
				item_to_take = cur_node->extra_items[next_item_idx - 1];
			--next_item_idx;
		}
		/*
		 * This loop will terminate when either of the following
		 * conditions is true:
		 *
		 * (1) cur_node == end_node
		 *
		 *	There are no paths that extend beyond the current
		 *	position.  In this case, any path to a later position
		 *	must pass through the current position, so we can go
		 *	ahead and choose the list of items that led to this
		 *	position.
		 *
		 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
		 *
		 *	This bounds the number of times the algorithm can step
		 *	forward before it is guaranteed to start choosing items.
		 *	This limits the memory usage.  It also guarantees that
		 *	the parser will not go too long without updating the
		 *	probability tables.
		 *
		 * Note: no check for end-of-buffer is needed because
		 * end-of-buffer will trigger condition (1).
		 */
		if (cur_node == end_node ||
		    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
		{
			lzms_encode_nonempty_item_list(c, cur_node);
			c->optimum_nodes[0].state = cur_node->state;
			goto begin;
		}
	} while (in_next != in_end);
}
static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
	c->main_state = 0;
	c->match_state = 0;
	c->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		c->lz_rep_states[i] = 0;
	c->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		c->delta_rep_states[i] = 0;

	lzms_init_probabilities(&c->probs);
}
static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
	lzms_init_huffman_code(&c->literal_rebuild_info,
			       LZMS_NUM_LITERAL_SYMS,
			       LZMS_LITERAL_CODE_REBUILD_FREQ,
			       c->literal_codewords,
			       c->literal_lens,
			       c->literal_freqs);

	lzms_init_huffman_code(&c->lz_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
			       c->lz_offset_codewords,
			       c->lz_offset_lens,
			       c->lz_offset_freqs);

	lzms_init_huffman_code(&c->length_rebuild_info,
			       LZMS_NUM_LENGTH_SYMS,
			       LZMS_LENGTH_CODE_REBUILD_FREQ,
			       c->length_codewords,
			       c->length_lens,
			       c->length_freqs);

	lzms_init_huffman_code(&c->delta_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
			       c->delta_offset_codewords,
			       c->delta_offset_lens,
			       c->delta_offset_freqs);

	lzms_init_huffman_code(&c->delta_power_rebuild_info,
			       LZMS_NUM_DELTA_POWER_SYMS,
			       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
			       c->delta_power_codewords,
			       c->delta_power_lens,
			       c->delta_power_freqs);
}
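/* (The five codes above are adaptive: symbol frequencies are accumulated
 * while encoding, and each code is rebuilt after every *_CODE_REBUILD_FREQ
 * symbols of its type, so the Huffman codes track the local statistics of
 * the data.) */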
/*
 * Flush the output streams, prepare the final compressed data, and return its
 * size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit
 * in the available space.
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
	size_t num_forwards_bytes;
	size_t num_backwards_bytes;

	/* Flush both the forwards and backwards streams, and make sure they
	 * didn't cross each other and start overwriting each other's data. */
	if (!lzms_output_bitstream_flush(&c->os))
		return 0;

	if (!lzms_range_encoder_flush(&c->rc))
		return 0;

	if (c->rc.next > c->os.next)
		return 0;
	/* Now the compressed buffer contains the data output by the forwards
	 * bitstream, then empty space, then data output by the backwards
	 * bitstream.  Move the data output by the backwards bitstream to be
	 * adjacent to the data output by the forwards bitstream, and calculate
	 * the compressed size that this results in. */
	num_forwards_bytes = c->rc.next - c->rc.begin;
	num_backwards_bytes = c->rc.end - c->os.next;
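	/* (Layout just before the memmove, when compression succeeded:
	 *
	 *   | forwards data | free space | backwards data |
	 *   ^ rc.begin       ^ rc.next    ^ os.next        ^ rc.end
	 *
	 * The decompressor reads the range-coded stream forwards from the
	 * front of the buffer and the Huffman bitstream backwards from the
	 * back.) */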
	memmove(c->rc.next, c->os.next, num_backwards_bytes);

	return num_forwards_bytes + num_backwards_bytes;
}
static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
		       bool destructive)
{
	u64 size = 0;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return 0;

	size += sizeof(struct lzms_compressor);

	if (!destructive)
		size += max_bufsize; /* in_buffer */

	size += lcpit_matchfinder_get_needed_memory(max_bufsize);

	return size;
}
static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
		       bool destructive, void **c_ret)
{
	struct lzms_compressor *c;
	u32 nice_match_len;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return WIMLIB_ERR_INVALID_PARAM;

	c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
	if (!c)
		goto oom0;

	c->destructive = destructive;
	/* Scale nice_match_len with the compression level.  But to allow an
	 * optimization for length cost calculations, don't allow
	 * nice_match_len to exceed MAX_FAST_LENGTH. */
	nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
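	/* (For example, the default level 50 gives nice_match_len =
	 * (50 * 63) / 50 = 63, while levels of 203 and above reach the
	 * MAX_FAST_LENGTH cap of 255.) */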
	c->use_delta_matches = (compression_level >= 35);
	c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
	c->try_lit_lzrep0 = (compression_level >= 60);
	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
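	/* (The costlier heuristics are gated by level: delta matching from
	 * level 35, the "LZ match, then literal, then rep0" probe from level
	 * 45, and the remaining multi-item probes only from level 60, above
	 * the default level of 50.) */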
	if (!c->destructive) {
		c->in_buffer = MALLOC(max_bufsize);
		if (!c->in_buffer)
			goto oom1;
	}

	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
		goto oom2;

	lzms_init_fast_length_slot_tab(c);
	lzms_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom2:
	if (!c->destructive)
		FREE(c->in_buffer);
oom1:
	ALIGNED_FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}
static size_t
lzms_compress(const void *restrict in, size_t in_nbytes,
	      void *restrict out, size_t out_nbytes_avail, void *restrict _c)
{
	struct lzms_compressor *c = _c;
	size_t result;

	/* Don't bother trying to compress extremely small inputs. */
	if (in_nbytes < 4)
		return 0;

	/* Copy the input data into the internal buffer and preprocess it. */
	if (c->destructive)
		c->in_buffer = (void *)in;
	else
		memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
	/* Prepare the matchfinders. */
	lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
	if (c->use_delta_matches)
		lzms_init_delta_matchfinder(c);

	/* Initialize the encoder structures. */
	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail);
	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail);
	lzms_init_states_and_probabilities(c);
	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
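	/* (Note that both encoders share the one output buffer: the
	 * range-coded stream grows forwards from the start while the Huffman
	 * bitstream grows backwards from the end, and lzms_finalize() later
	 * joins the two.) */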
	/* The main loop: parse and encode. */
	lzms_near_optimal_parse(c);

	/* Return the compressed data size or 0. */
	result = lzms_finalize(c);
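	/* (In destructive mode the caller's own buffer was filtered in place
	 * above, so if compression failed, undo the x86 filter before
	 * returning.) */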
	if (!result && c->destructive)
		lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);

	return result;
}
static void
lzms_free_compressor(void *_c)
{
	struct lzms_compressor *c = _c;

	if (!c->destructive)
		FREE(c->in_buffer);
	lcpit_matchfinder_destroy(&c->mf);
	ALIGNED_FREE(c);
}
const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory = lzms_get_needed_memory,
	.create_compressor = lzms_create_compressor,
	.compress = lzms_compress,
	.free_compressor = lzms_free_compressor,
};