4 * A compressor for the LZMS compression format.
8 * Copyright (C) 2013, 2014, 2015 Eric Biggers
10 * This file is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU Lesser General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option) any
15 * This file is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this file; if not, see https://www.gnu.org/licenses/.
31 #include "wimlib/compress_common.h"
32 #include "wimlib/compressor_ops.h"
33 #include "wimlib/error.h"
34 #include "wimlib/lcpit_matchfinder.h"
35 #include "wimlib/lzms_common.h"
36 #include "wimlib/matchfinder_common.h"
37 #include "wimlib/unaligned.h"
38 #include "wimlib/util.h"
41 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
42 * looked up directly in 'fast_length_slot_tab' and the length cost can be
43 * looked up directly in 'fast_length_cost_tab'.
45 * We also limit the 'nice_match_len' parameter to this value. Consequently, if
46 * the longest match found is shorter than 'nice_match_len', then it must also
47 * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups
48 * of length costs using 'fast_length_cost_tab' without having to keep checking
49 * whether the length exceeds MAX_FAST_LENGTH or not.
51 #define MAX_FAST_LENGTH 255
53 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
54 * step forward before forcing the pending items to be encoded. If this value
55 * is increased, then there will be fewer forced flushes, but the probability
56 * entries and Huffman codes will be more likely to become outdated. */
57 #define NUM_OPTIM_NODES 2048
59 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
60 * bit costs. A single bit has a cost of (1 << COST_SHIFT). */
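/*
 * For example, with COST_SHIFT == 6 (the value that check_cost_shift() below
 * asserts), a whole bit costs 1 << 6 = 64, so a stored cost of 96 corresponds
 * to 1.5 bits and a stored cost of 16 to 0.25 bits.
 */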
63 /* Length of the hash table for finding delta matches */
64 #define DELTA_HASH_ORDER 17
65 #define DELTA_HASH_LENGTH ((u32)1 << DELTA_HASH_ORDER)
67 /* The number of bytes to hash when finding delta matches; also taken to be the
68 * minimum length of an explicit offset delta match */
69 #define NBYTES_HASHED_FOR_DELTA 3
71 /* The number of delta match powers to consider (must be <=
72 * LZMS_NUM_DELTA_POWER_SYMS) */
73 #define NUM_POWERS_TO_CONSIDER 6
75 /* This structure tracks the state of writing bits as a series of 16-bit coding
76 * units, starting at the end of the output buffer and proceeding backwards. */
77 struct lzms_output_bitstream {
79 /* Bits that haven't yet been written to the output buffer */
82 /* Number of bits currently held in @bitbuf */
85 /* Pointer to the beginning of the output buffer (this is the "end" when
86 * writing backwards!) */
89 /* Pointer to just past the next position in the output buffer at which
90 * to output a 16-bit coding unit */
94 /* This structure tracks the state of range encoding and its output, which
95 * starts at the beginning of the output buffer and proceeds forwards. */
96 struct lzms_range_encoder {
98 /* The lower boundary of the current range. Logically, this is a 33-bit
99 * integer whose high bit is needed to detect carries. */
102 /* The size of the current range */
105 /* The next 16-bit coding unit to output */
108 /* The number of 16-bit coding units whose output has been delayed due
109 * to possible carrying. The first such coding unit is @cache; all
110 * subsequent such coding units are 0xffff. */
113 /* Pointer to the beginning of the output buffer */
116 /* Pointer to the position in the output buffer at which the next coding
117 * unit must be written */
120 /* Pointer to just past the end of the output buffer */
124 /* Bookkeeping information for an adaptive Huffman code */
125 struct lzms_huffman_rebuild_info {
127 /* The remaining number of symbols to encode until this code must be rebuilt */
129 unsigned num_syms_until_rebuild;
131 /* The number of symbols in this code */
134 /* The rebuild frequency of this code, in symbols */
135 unsigned rebuild_freq;
137 /* The Huffman codeword of each symbol in this code */
140 /* The length of each Huffman codeword, in bits */
143 /* The frequency of each symbol in this code */
148 * The compressor-internal representation of a match or literal.
150 * Literals have length=1; matches have length > 1. (We disallow matches of
151 * length 1, even though this is a valid length in LZMS.)
153 * The source is encoded as follows:
155 * - Literals: the literal byte itself
156 * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
157 * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
158 * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
159 * the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
160 * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
161 * the index of the (power, raw_offset) pair in recent_delta_pairs
168 #define DELTA_SOURCE_TAG ((u32)1 << 31)
169 #define DELTA_SOURCE_POWER_SHIFT 28
170 #define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
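/*
 * Illustrative example of the source encoding described above (the numbers are
 * made up): an explicit offset delta match with power 2 and raw offset 100 is
 * represented as
 *
 *     source = DELTA_SOURCE_TAG | (2 << DELTA_SOURCE_POWER_SHIFT) |
 *              (100 + LZMS_NUM_DELTA_REPS - 1);
 *
 * whereas a repeat offset delta match that reuses recent_delta_pairs[1] is
 * simply source = DELTA_SOURCE_TAG | 1.
 */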
172 static void __attribute__((unused))
173 check_that_powers_fit_in_bitfield(void)
175 STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
178 /* A stripped-down version of the adaptive state in LZMS which excludes the
179 * probability entries and Huffman codes */
180 struct lzms_adaptive_state {
182 /* Recent offsets for LZ matches */
183 u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
184 u32 prev_lz_offset; /* 0 means none */
185 u32 upcoming_lz_offset; /* 0 means none */
187 /* Recent (power, raw offset) pairs for delta matches.
188 * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
189 * offset, and the high bits are the power. */
190 u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
191 u32 prev_delta_pair; /* 0 means none */
192 u32 upcoming_delta_pair; /* 0 means none */
194 /* States for predicting the probabilities of item types */
198 u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
200 u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
201 } __attribute__((aligned(64)));
204 * This structure represents a byte position in the preprocessed input data and
205 * a node in the graph of possible match/literal choices.
207 * Logically, each incoming edge to this node is labeled with a literal or a
208 * match that can be taken to reach this position from an earlier position; and
209 * each outgoing edge from this node is labeled with a literal or a match that
210 * can be taken to advance from this position to a later position.
212 struct lzms_optimum_node {
215 * The cost of the lowest-cost path that has been found to reach this
216 * position. This can change as progressively lower cost paths are
217 * found to reach this position.
220 #define INFINITE_COST UINT32_MAX
223 * @item is the last item that was taken to reach this position with the
224 * stored @cost. This can change as progressively lower
225 * cost paths are found to reach this position.
227 * In some cases we look ahead more than one item. If we looked ahead n
228 * items to reach this position, then @item is the last item taken,
229 * @extra_items contains the other items ordered from second-to-last to
230 * first, and @num_extra_items is n - 1.
232 unsigned num_extra_items;
233 struct lzms_item item;
234 struct lzms_item extra_items[2];
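/*
 * Example: if this position was reached via the lookahead sequence
 * "LZ match, literal, LZ-rep0 match", then @item is the LZ-rep0 match,
 * @extra_items[0] is the literal, @extra_items[1] is the LZ match, and
 * @num_extra_items is 2.
 */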
237 * The adaptive state that exists at this position. This is filled in
238 * lazily, only after the minimum-cost path to this position is found.
240 * Note: the way the algorithm handles this adaptive state in the
241 * "minimum-cost" parse is actually only an approximation. It's
242 * possible for the globally optimal, minimum cost path to contain a
243 * prefix, ending at a position, where that path prefix is *not* the
244 * minimum cost path to that position. This can happen if such a path
245 * prefix results in a different adaptive state which results in lower
246 * costs later. Although the algorithm does do some heuristic
247 * multi-item lookaheads, it does not solve this problem in general.
249 * Note: this adaptive state structure also does not include the
250 * probability entries or current Huffman codewords. Those aren't
251 * maintained per-position and are only updated occasionally.
253 struct lzms_adaptive_state state;
254 } __attribute__((aligned(64)));
256 /* The main compressor structure */
257 struct lzms_compressor {
259 /* The matchfinder for LZ matches */
260 struct lcpit_matchfinder mf;
262 /* The preprocessed buffer of data being compressed */
265 /* The number of bytes of data to be compressed, which is the number of
266 * bytes of data in @in_buffer that are actually valid */
270 * Boolean flags to enable consideration of various types of multi-step
271 * operations during parsing.
273 * Among other cases, multi-step operations can help with gaps where two
274 * matches are separated by a non-matching byte.
276 * This idea is borrowed from Igor Pavlov's LZMA encoder.
279 bool try_lzrep_lit_lzrep0;
280 bool try_lzmatch_lit_lzrep0;
283 * If true, the compressor can use delta matches. This slows down
284 * compression. It improves the compression ratio greatly, slightly, or
285 * not at all, depending on the input data.
287 bool use_delta_matches;
289 /* If true, the compressor need not preserve the input buffer if it
290 * compresses the data successfully. */
293 /* 'last_target_usages' is a large array that is only needed for
294 * preprocessing, so it is in union with fields that don't need to be
295 * initialized until after preprocessing. */
299 /* Temporary space to store matches found by the LZ matchfinder */
300 struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];
302 /* Hash table for finding delta matches */
303 u32 delta_hash_table[DELTA_HASH_LENGTH];
305 /* For each delta power, the hash code for the next sequence */
306 u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
308 /* The per-byte graph nodes for near-optimal parsing */
309 struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
310 1 + MAX_FAST_LENGTH];
312 /* Table: length => current cost for small match lengths */
313 u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
315 /* Range encoder which outputs to the beginning of the compressed data
316 * buffer, proceeding forwards */
317 struct lzms_range_encoder rc;
319 /* Bitstream which outputs to the end of the compressed data buffer,
320 * proceeding backwards */
321 struct lzms_output_bitstream os;
323 /* States and probability entries for item type disambiguation */
325 unsigned match_state;
327 unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
328 unsigned delta_state;
329 unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
330 struct lzms_probabilites probs;
334 struct lzms_huffman_rebuild_info literal_rebuild_info;
335 u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
336 u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
337 u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];
339 struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
340 u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
341 u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
342 u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
344 struct lzms_huffman_rebuild_info length_rebuild_info;
345 u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
346 u8 length_lens[LZMS_NUM_LENGTH_SYMS];
347 u32 length_freqs[LZMS_NUM_LENGTH_SYMS];
349 struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
350 u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
351 u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
352 u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
354 struct lzms_huffman_rebuild_info delta_power_rebuild_info;
355 u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
356 u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
357 u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];
361 s32 last_target_usages[65536];
365 /* Table: length => length slot for small match lengths */
366 u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];
368 /* Tables for mapping offsets to offset slots */
370 /* slots [0, 167); 0 <= num_extra_bits <= 10 */
371 u8 offset_slot_tab_1[0xe4a5];
373 /* slots [167, 427); 11 <= num_extra_bits <= 15 */
374 u16 offset_slot_tab_2[0x3d0000 >> 11];
376 /* slots [427, 799); 16 <= num_extra_bits */
377 u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
380 /******************************************************************************
381 * Offset and length slot acceleration *
382 ******************************************************************************/
384 /* Generate the acceleration table for length slots. */
386 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
389 for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
390 if (len >= lzms_length_slot_base[slot + 1])
392 c->fast_length_slot_tab[len] = slot;
396 /* Generate the acceleration tables for offset slots. */
398 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
403 /* slots [0, 167); 0 <= num_extra_bits <= 10 */
404 for (offset = 1; offset < 0xe4a5; offset++) {
405 if (offset >= lzms_offset_slot_base[slot + 1])
407 c->offset_slot_tab_1[offset] = slot;
410 /* slots [167, 427); 11 <= num_extra_bits <= 15 */
411 for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
412 if (offset >= lzms_offset_slot_base[slot + 1])
414 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
417 /* slots [427, 799); 16 <= num_extra_bits */
418 for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
419 if (offset >= lzms_offset_slot_base[slot + 1])
421 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
426 * Return the length slot for the specified match length, using the compressor's
427 * acceleration table if the length is small enough.
429 static forceinline unsigned
430 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
432 if (likely(length <= MAX_FAST_LENGTH))
433 return c->fast_length_slot_tab[length];
434 return lzms_get_length_slot(length);
438 * Return the offset slot for the specified match offset, using the compressor's
439 * acceleration tables to speed up the mapping.
441 static forceinline unsigned
442 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
445 return c->offset_slot_tab_1[offset];
447 if (offset < 0x3d0000)
448 return c->offset_slot_tab_2[offset >> 11];
449 return c->offset_slot_tab_3[offset >> 16];
452 /******************************************************************************
454 ******************************************************************************/
457 * Initialize the range encoder @rc to write forwards to the specified buffer
458 * @out that is @size bytes long.
461 lzms_range_encoder_init(struct lzms_range_encoder *rc, u8 *out, size_t size)
464 rc->range_size = 0xffffffff;
468 rc->next = out - sizeof(le16);
469 rc->end = out + (size & ~1);
473 * Attempt to flush bits from the range encoder.
475 * The basic idea is that we're writing bits from @rc->lower_bound to the
476 * output. However, due to carrying, the writing of coding units with the
477 * maximum value, as well as one prior coding unit, must be delayed until it is
478 * determined whether a carry is needed.
480 * This is based on the public domain code for LZMA written by Igor Pavlov, but
481 * with the following differences:
483 * - In LZMS, 16-bit coding units are required rather than 8-bit.
485 * - In LZMS, the first coding unit is not ignored by the decompressor, so
486 * the encoder cannot output a dummy value to that position.
489 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
491 if ((u32)(rc->lower_bound) < 0xffff0000 ||
492 (u32)(rc->lower_bound >> 32) != 0)
494 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
495 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit is 1). */
498 if (likely(rc->next >= rc->begin)) {
499 if (rc->next != rc->end) {
500 put_unaligned_le16(rc->cache +
501 (u16)(rc->lower_bound >> 32),
503 rc->next += sizeof(le16);
506 rc->next += sizeof(le16);
509 } while (--rc->cache_size != 0);
511 rc->cache = (rc->lower_bound >> 16) & 0xffff;
514 rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
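/*
 * Worked example of the delayed-carry behavior: suppose the delayed output is
 * currently the coding unit 0x12AB followed by three 0xffff units (that is,
 * cache == 0x12AB and cache_size == 4).  If a later addition to @lower_bound
 * carries out of the low 32 bits, those units are flushed as 0x12AC 0x0000
 * 0x0000 0x0000; if no carry occurs, they are flushed unchanged as 0x12AB
 * 0xffff 0xffff 0xffff.
 */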
518 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
520 for (int i = 0; i < 4; i++)
521 lzms_range_encoder_shift_low(rc);
522 return rc->next != rc->end;
526 * Encode the next bit using the range encoder.
528 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
529 * bit is 0 rather than 1.
531 static forceinline void
532 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
534 /* Normalize if needed. */
535 if (rc->range_size <= 0xffff) {
536 rc->range_size <<= 16;
537 lzms_range_encoder_shift_low(rc);
540 u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
542 rc->range_size = bound;
544 rc->lower_bound += bound;
545 rc->range_size -= bound;
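/*
 * Numeric example (assuming LZMS_PROBABILITY_BITS == 6, i.e. a probability
 * denominator of 64): with range_size == 0x10000 and prob == 48 (a 75% chance
 * that the bit is 0), bound = (0x10000 >> 6) * 48 = 0xC000.  Encoding a 0
 * shrinks the range to 0xC000, while encoding a 1 advances lower_bound by
 * 0xC000 and leaves a range of 0x4000.
 */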
550 * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and
551 * updating the state and its corresponding probability entry.
553 static forceinline void
554 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
555 struct lzms_probability_entry *probs,
556 struct lzms_range_encoder *rc)
558 struct lzms_probability_entry *prob_entry;
561 /* Load the probability entry for the current state. */
562 prob_entry = &probs[*state_p];
564 /* Update the state based on the next bit. */
565 *state_p = ((*state_p << 1) | bit) & (num_states - 1);
567 /* Get the probability that the bit is 0. */
568 prob = lzms_get_probability(prob_entry);
570 /* Update the probability entry. */
571 lzms_update_probability_entry(prob_entry, bit);
573 /* Encode the bit using the range encoder. */
574 lzms_range_encode_bit(rc, bit, prob);
577 /* Helper functions for encoding bits in the various decision classes */
580 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
582 lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
583 c->probs.main, &c->rc);
587 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
589 lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
590 c->probs.match, &c->rc);
594 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
596 lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
597 c->probs.lz, &c->rc);
601 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
603 lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
604 c->probs.lz_rep[idx], &c->rc);
608 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
610 lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
611 c->probs.delta, &c->rc);
615 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
617 lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
618 c->probs.delta_rep[idx], &c->rc);
621 /******************************************************************************
622 * Huffman encoding and verbatim bits *
623 ******************************************************************************/
626 * Initialize the output bitstream @os to write backwards to the specified
627 * buffer @out that is @size bytes long.
630 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
631 u8 *out, size_t size)
636 os->next = out + (size & ~1);
640 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
641 * output bitstream @os.
643 * @max_num_bits is a compile-time constant that specifies the maximum number of
644 * bits that can ever be written at this call site.
646 static forceinline void
647 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
648 const unsigned num_bits, const unsigned max_num_bits)
650 /* Add the bits to the bit buffer variable. */
651 os->bitcount += num_bits;
652 os->bitbuf = (os->bitbuf << num_bits) | bits;
654 /* Check whether any coding units need to be written. */
655 while (os->bitcount >= 16) {
659 /* Write a coding unit, unless it would underflow the buffer. */
660 if (os->next != os->begin) {
661 os->next -= sizeof(le16);
662 put_unaligned_le16(os->bitbuf >> os->bitcount, os->next);
665 /* Optimization for call sites that never write more than 16 bits at once */
667 if (max_num_bits <= 16)
673 * Flush the output bitstream, ensuring that all bits written to it have been
674 * written to memory. Return %true if all bits have been output successfully,
675 * or %false if an overrun occurred.
678 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
680 if (os->next == os->begin)
683 if (os->bitcount != 0) {
684 os->next -= sizeof(le16);
685 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
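/*
 * Example of the backwards bitstream: after 20 bits have been written in
 * total, one 16-bit coding unit holding the 16 oldest (highest-order) bits
 * has been emitted at a position that moves downward through the buffer, and
 * the 4 newest bits stay buffered in @bitbuf until more bits arrive or the
 * stream is flushed.
 */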
692 lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
694 make_canonical_huffman_code(rebuild_info->num_syms,
695 LZMS_MAX_CODEWORD_LENGTH,
698 rebuild_info->codewords);
699 rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
703 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
704 unsigned num_syms, unsigned rebuild_freq,
705 u32 *codewords, u8 *lens, u32 *freqs)
707 rebuild_info->num_syms = num_syms;
708 rebuild_info->rebuild_freq = rebuild_freq;
709 rebuild_info->codewords = codewords;
710 rebuild_info->lens = lens;
711 rebuild_info->freqs = freqs;
712 lzms_init_symbol_frequencies(freqs, num_syms);
713 lzms_build_huffman_code(rebuild_info);
717 lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
719 lzms_build_huffman_code(rebuild_info);
720 lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
724 * Encode a symbol using the specified Huffman code. Then, if the Huffman code
725 * needs to be rebuilt, rebuild it and return true; otherwise return false.
727 static forceinline bool
728 lzms_huffman_encode_symbol(unsigned sym,
729 const u32 *codewords, const u8 *lens, u32 *freqs,
730 struct lzms_output_bitstream *os,
731 struct lzms_huffman_rebuild_info *rebuild_info)
733 lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
735 if (--rebuild_info->num_syms_until_rebuild == 0) {
736 lzms_rebuild_huffman_code(rebuild_info);
742 /* Helper routines to encode symbols using the various Huffman codes */
745 lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
747 return lzms_huffman_encode_symbol(sym, c->literal_codewords,
748 c->literal_lens, c->literal_freqs,
749 &c->os, &c->literal_rebuild_info);
753 lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
755 return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
756 c->lz_offset_lens, c->lz_offset_freqs,
757 &c->os, &c->lz_offset_rebuild_info);
761 lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
763 return lzms_huffman_encode_symbol(sym, c->length_codewords,
764 c->length_lens, c->length_freqs,
765 &c->os, &c->length_rebuild_info);
769 lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
771 return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
772 c->delta_offset_lens, c->delta_offset_freqs,
773 &c->os, &c->delta_offset_rebuild_info);
777 lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
779 return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
780 c->delta_power_lens, c->delta_power_freqs,
781 &c->os, &c->delta_power_rebuild_info);
785 lzms_update_fast_length_costs(struct lzms_compressor *c);
788 * Encode a match length. If this causes the Huffman code for length symbols to
789 * be rebuilt, also update the length costs array used by the parser.
792 lzms_encode_length(struct lzms_compressor *c, u32 length)
794 unsigned slot = lzms_comp_get_length_slot(c, length);
796 if (lzms_encode_length_symbol(c, slot))
797 lzms_update_fast_length_costs(c);
799 lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
800 lzms_extra_length_bits[slot],
801 LZMS_MAX_EXTRA_LENGTH_BITS);
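/*
 * For illustration (the real values live in lzms_length_slot_base[] and
 * lzms_extra_length_bits[]): if a match length of 18 falls in a slot whose
 * base is 16 and which uses 2 extra bits, then the slot's Huffman symbol is
 * encoded first and the remainder 18 - 16 = 2 is written as 2 verbatim bits.
 */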
804 /* Encode the offset of an LZ match. */
806 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
808 unsigned slot = lzms_comp_get_offset_slot(c, offset);
810 lzms_encode_lz_offset_symbol(c, slot);
811 lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
812 lzms_extra_offset_bits[slot],
813 LZMS_MAX_EXTRA_OFFSET_BITS);
816 /* Encode the raw offset of a delta match. */
818 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
820 unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
822 lzms_encode_delta_offset_symbol(c, slot);
823 lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
824 lzms_extra_offset_bits[slot],
825 LZMS_MAX_EXTRA_OFFSET_BITS);
828 /******************************************************************************
830 ******************************************************************************/
832 /* Encode the specified item, which may be a literal or any type of match. */
834 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
836 /* Main bit: 0 = literal, 1 = match */
837 int main_bit = (length > 1);
838 lzms_encode_main_bit(c, main_bit);
842 unsigned literal = source;
843 lzms_encode_literal_symbol(c, literal);
847 /* Match bit: 0 = LZ match, 1 = delta match */
848 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
849 lzms_encode_match_bit(c, match_bit);
854 /* LZ bit: 0 = explicit offset, 1 = repeat offset */
855 int lz_bit = (source < LZMS_NUM_LZ_REPS);
856 lzms_encode_lz_bit(c, lz_bit);
859 /* Explicit offset LZ match */
860 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
861 lzms_encode_lz_offset(c, offset);
863 /* Repeat offset LZ match */
864 int rep_idx = source;
865 for (int i = 0; i < rep_idx; i++)
866 lzms_encode_lz_rep_bit(c, 1, i);
867 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
868 lzms_encode_lz_rep_bit(c, 0, rep_idx);
873 source &= ~DELTA_SOURCE_TAG;
875 /* Delta bit: 0 = explicit offset, 1 = repeat offset */
876 int delta_bit = (source < LZMS_NUM_DELTA_REPS);
877 lzms_encode_delta_bit(c, delta_bit);
880 /* Explicit offset delta match */
881 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
882 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
883 (LZMS_NUM_DELTA_REPS - 1);
884 lzms_encode_delta_power_symbol(c, power);
885 lzms_encode_delta_raw_offset(c, raw_offset);
887 /* Repeat offset delta match */
888 int rep_idx = source;
889 for (int i = 0; i < rep_idx; i++)
890 lzms_encode_delta_rep_bit(c, 1, i);
891 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
892 lzms_encode_delta_rep_bit(c, 0, rep_idx);
896 /* Match length (encoded the same way for any match type) */
897 lzms_encode_length(c, length);
901 /* Encode a list of matches and literals chosen by the parsing algorithm. */
903 lzms_encode_nonempty_item_list(struct lzms_compressor *c,
904 struct lzms_optimum_node *end_node)
906 /* Since we've stored at each node the item we took to arrive at that
907 * node, we can trace our chosen path in backwards order. However, for
908 * encoding we need to trace our chosen path in forwards order. To make
909 * this possible, the following loop moves the items from their
910 * destination nodes to their source nodes, which effectively reverses
911 * the path. (Think of it like reversing a singly-linked list.) */
912 struct lzms_optimum_node *cur_node = end_node;
913 struct lzms_item saved_item = cur_node->item;
915 struct lzms_item item = saved_item;
916 if (cur_node->num_extra_items > 0) {
917 /* Handle an arrival via multi-item lookahead. */
919 struct lzms_optimum_node *orig_node = cur_node;
921 cur_node -= item.length;
922 cur_node->item = item;
923 item = orig_node->extra_items[i];
924 } while (++i != orig_node->num_extra_items);
926 cur_node -= item.length;
927 saved_item = cur_node->item;
928 cur_node->item = item;
929 } while (cur_node != c->optimum_nodes);
931 /* Now trace the chosen path in forwards order, encoding each item. */
933 lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
934 cur_node += cur_node->item.length;
935 } while (cur_node != end_node);
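/*
 * Example: if the chosen path consists of items A (length 3), B (length 2)
 * and C (length 4), starting at node 0 and ending at node 9, then the
 * backwards pass leaves A stored at node 0, B at node 3 and C at node 5, and
 * the forwards pass then encodes A, B and C in that order.
 */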
938 static forceinline void
939 lzms_encode_item_list(struct lzms_compressor *c,
940 struct lzms_optimum_node *end_node)
942 if (end_node != c->optimum_nodes)
943 lzms_encode_nonempty_item_list(c, end_node);
946 /******************************************************************************
948 ******************************************************************************/
951 * If p is the predicted probability of the next bit being a 0, then the number
952 * of bits required to encode a 0 bit using a binary range encoder is the real
953 * number -log2(p), and the number of bits required to encode a 1 bit is the
954 * real number -log2(1 - p). To avoid computing either of these expressions at
955 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
956 * probability to cost for each possible probability. Specifically, the array
957 * indices are the numerators of the possible probabilities in LZMS, where the
958 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
959 * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
960 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
961 * adjacent values, since these probabilities are not actually permitted. This
962 * allows us to use the num_recent_zero_bits value from the
963 * lzms_probability_entry as the array index without fixing up these two special cases. */
966 static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
967 384, 384, 320, 283, 256, 235, 219, 204,
968 192, 181, 171, 163, 155, 147, 140, 134,
969 128, 122, 117, 112, 107, 103, 99, 94,
970 91, 87, 83, 80, 76, 73, 70, 67,
971 64, 61, 58, 56, 53, 51, 48, 46,
972 43, 41, 39, 37, 35, 33, 30, 29,
973 27, 25, 23, 21, 19, 17, 16, 14,
974 12, 11, 9, 8, 6, 4, 3, 1,
978 static void __attribute__((unused))
979 check_cost_shift(void)
981 /* lzms_bit_costs is hard-coded to the current COST_SHIFT. */
982 STATIC_ASSERT(COST_SHIFT == 6);
989 lzms_compute_bit_costs(void)
991 for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
995 else if (prob == LZMS_PROBABILITY_DENOMINATOR)
998 lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
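/*
 * A sketch (kept disabled) completing the fragment above: how the values in
 * lzms_bit_costs[] can be regenerated.  Enabling it would require <math.h>
 * and a writable (non-const) copy of the table.
 */
#if 0
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
	for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
		/* 0% and 100% are not permitted; clamp to the adjacent values. */
		u32 prob = i;

		if (prob == 0)
			prob++;
		else if (prob == LZMS_PROBABILITY_DENOMINATOR)
			prob--;

		lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
					  (1 << COST_SHIFT));
	}
}
#endif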
1004 /* Return the cost to encode a 0 bit in the specified context. */
1005 static forceinline u32
1006 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1008 return lzms_bit_costs[probs[state].num_recent_zero_bits];
1011 /* Return the cost to encode a 1 bit in the specified context. */
1012 static forceinline u32
1013 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1015 return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1016 probs[state].num_recent_zero_bits];
1019 /* Return the cost to encode a literal, including the main bit. */
1020 static forceinline u32
1021 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1023 return lzms_bit_0_cost(main_state, c->probs.main) +
1024 ((u32)c->literal_lens[literal] << COST_SHIFT);
1027 /* Update 'fast_length_cost_tab' to use the latest Huffman code. */
1029 lzms_update_fast_length_costs(struct lzms_compressor *c)
1033 for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1034 if (len >= lzms_length_slot_base[slot + 1]) {
1036 cost = (u32)(c->length_lens[slot] +
1037 lzms_extra_length_bits[slot]) << COST_SHIFT;
1039 c->fast_length_cost_tab[len] = cost;
1043 /* Return the cost to encode the specified match length, which must not exceed
1044 * MAX_FAST_LENGTH. */
1045 static forceinline u32
1046 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
1048 return c->fast_length_cost_tab[length];
1051 /* Return the cost to encode the specified LZ match offset. */
1052 static forceinline u32
1053 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1055 unsigned slot = lzms_comp_get_offset_slot(c, offset);
1056 u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1057 return num_bits << COST_SHIFT;
1060 /* Return the cost to encode the specified delta power and raw offset. */
1061 static forceinline u32
1062 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1064 unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1065 u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1066 lzms_extra_offset_bits[slot];
1067 return num_bits << COST_SHIFT;
1070 /******************************************************************************
1072 ******************************************************************************/
1075 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1077 for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1078 state->recent_lz_offsets[i] = i + 1;
1079 state->prev_lz_offset = 0;
1080 state->upcoming_lz_offset = 0;
1082 for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1083 state->recent_delta_pairs[i] = i + 1;
1084 state->prev_delta_pair = 0;
1085 state->upcoming_delta_pair = 0;
1087 state->main_state = 0;
1088 state->match_state = 0;
1089 state->lz_state = 0;
1090 for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1091 state->lz_rep_states[i] = 0;
1092 state->delta_state = 0;
1093 for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1094 state->delta_rep_states[i] = 0;
1098 * Update the LRU queues for match sources when advancing by one item.
1100 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more complex:
1102 * - there are separate queues for LZ and delta matches
1103 * - updates to the queues are delayed by one encoded item (this prevents
1104 *   sources from being bumped up to index 0 too early)
1107 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1109 if (state->prev_lz_offset != 0) {
1110 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1111 state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1112 state->recent_lz_offsets[0] = state->prev_lz_offset;
1114 state->prev_lz_offset = state->upcoming_lz_offset;
1116 if (state->prev_delta_pair != 0) {
1117 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1118 state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1119 state->recent_delta_pairs[0] = state->prev_delta_pair;
1121 state->prev_delta_pair = state->upcoming_delta_pair;
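/*
 * Example of the one-item delay: when an explicit offset LZ match is chosen,
 * its offset is first placed in @upcoming_lz_offset.  The queue update for
 * that same item only moves it into @prev_lz_offset; the update for the
 * *next* item is what finally pushes it onto the front of @recent_lz_offsets.
 */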
1124 static forceinline void
1125 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
1127 *state_p = ((*state_p << 1) | bit) & (num_states - 1);
1130 static forceinline void
1131 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
1133 lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
1136 static forceinline void
1137 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
1139 lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
1142 static forceinline void
1143 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
1145 lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
1148 static forceinline void
1149 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1151 for (int i = 0; i < rep_idx; i++)
1152 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1154 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1155 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1158 static forceinline void
1159 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
1161 lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
1164 static forceinline void
1165 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1167 for (int i = 0; i < rep_idx; i++)
1168 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1170 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1171 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1174 /******************************************************************************
1176 ******************************************************************************/
1178 /* Note: this code just handles finding delta matches. The code for finding LZ
1179 * matches is elsewhere. */
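/*
 * Example of a delta match: in a buffer of consecutive ascending bytes
 * (0x01 0x02 0x03 ...), every position delta-matches every earlier position
 * with span 1, because the difference between each byte and the byte one
 * span earlier is +1 everywhere.  Larger (power-of-2) spans catch the same
 * kind of regularity in data with wider strides, e.g. fixed-size records.
 */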
1182 /* Initialize the delta matchfinder for a new input buffer. */
1184 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1186 /* Set all entries to use an invalid power, which will never match. */
1187 STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1188 memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1190 /* Initialize the next hash code for each power. We can just use zeroes
1191 * initially; it doesn't really matter. */
1192 for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1193 c->next_delta_hashes[i] = 0;
1197 * Compute a DELTA_HASH_ORDER-bit hash code for the first
1198 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1199 * delta context with the specified @span.
1201 static forceinline u32
1202 lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
1204 /* A delta match has a certain span and an offset that is a multiple of
1205 * that span. To reduce wasted space we use a single combined hash
1206 * table for all spans and positions, but to minimize collisions we
1207 * include in the hash code computation the span and the low-order bits
1208 * of the current position. */
1210 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1211 u8 d0 = *(p + 0) - *(p + 0 - span);
1212 u8 d1 = *(p + 1) - *(p + 1 - span);
1213 u8 d2 = *(p + 2) - *(p + 2 - span);
1214 u32 v = ((span + (pos & (span - 1))) << 24) |
1215 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1216 return lz_hash(v, DELTA_HASH_ORDER);
1220 * Given a match between @in_next and @matchptr in a delta context with the
1221 * specified @span and having the initial @len, extend the match as far as
1222 * possible, up to a limit of @max_len.
1224 static forceinline u32
1225 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1226 u32 len, u32 max_len, u32 span)
1228 while (len < max_len &&
1229 (u8)(*(in_next + len) - *(in_next + len - span)) ==
1230 (u8)(*(matchptr + len) - *(matchptr + len - span)))
1238 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1239 const u8 *in_next, u32 count)
1241 u32 pos = in_next - c->in_buffer;
1242 if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1245 /* Update the hash table for each power. */
1246 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1247 const u32 span = (u32)1 << power;
1248 if (unlikely(pos < span))
1250 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1251 const u32 hash = c->next_delta_hashes[power];
1252 c->delta_hash_table[hash] =
1253 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1254 c->next_delta_hashes[power] = next_hash;
1255 prefetchw(&c->delta_hash_table[next_hash]);
1257 } while (in_next++, pos++, --count);
1261 * Skip the next @count bytes (don't search for matches at them). @in_next
1262 * points to the first byte to skip. The return value is @in_next + count.
1265 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1267 lcpit_matchfinder_skip_bytes(&c->mf, count);
1268 if (c->use_delta_matches)
1269 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1270 return in_next + count;
1273 /******************************************************************************
1274 * "Near-optimal" parsing *
1275 ******************************************************************************/
1278 * The main near-optimal parsing routine.
1280 * Briefly, the algorithm does an approximate minimum-cost path search to find a
1281 * "near-optimal" sequence of matches and literals to output, based on the
1282 * current cost model. The algorithm steps forward, position by position (byte
1283 * by byte), and updates the minimum cost path to reach each later position that
1284 * can be reached using a match or literal from the current position. This is
1285 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1286 * the graph edges are possible matches/literals to code, and the cost of each
1287 * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
1288 * required to output the corresponding match or literal. But one difference is
1289 * that we actually compute the lowest-cost path in pieces, where each piece is
1290 * terminated when there are no choices to be made.
1292 * The costs of literals and matches are estimated using the range encoder
1293 * states and the semi-adaptive Huffman codes. Except for range encoding
1294 * states, costs are assumed to be constant throughout a single run of the
1295 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This
1296 * introduces a source of non-optimality because the probabilities and Huffman
1297 * codes can change over this part of the data. And of course, there are
1298 * various other reasons why the result isn't optimal in terms of compression ratio. */
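/*
 * Schematic sketch of one parse run (simplified; not the exact control flow
 * of the function below):
 *
 *	node[0].cost = 0; all later nodes start at INFINITE_COST;
 *	for each position i, in order:
 *		for each match or literal usable at position i:
 *			for each length 'len' it can be taken with:
 *				cost = node[i].cost + estimated item cost;
 *				if (cost < node[i + len].cost)
 *					record (cost, item) at node[i + len];
 *	when the run ends (forced flush, very long match, or end of input),
 *	trace the recorded items backwards from the final node and encode them
 *	in forwards order.
 */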
1302 lzms_near_optimal_parse(struct lzms_compressor *c)
1304 const u8 *in_next = c->in_buffer;
1305 const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1306 struct lzms_optimum_node *cur_node;
1307 struct lzms_optimum_node *end_node;
1309 /* Set initial length costs for lengths <= MAX_FAST_LENGTH. */
1310 lzms_update_fast_length_costs(c);
1312 /* Set up the initial adaptive state. */
1313 lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1316 /* Start building a new list of items, which will correspond to the next
1317 * piece of the overall minimum-cost path. */
1319 cur_node = c->optimum_nodes;
1321 end_node = cur_node;
1323 if (in_next == in_end)
1326 /* The following loop runs once for each byte in the input buffer,
1327 * except in a few shortcut cases. */
1331 /* Repeat offset LZ matches */
1332 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1333 in_end - in_next >= 2))
1335 for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1337 /* Looking for a repeat offset LZ match at queue index @rep_idx */
1340 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1341 const u8 * const matchptr = in_next - offset;
1343 /* Check the first 2 bytes before entering the extension loop. */
1344 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1347 /* Extend the match to its full length. */
1348 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1350 /* Early out for long repeat offset LZ match */
1351 if (rep_len >= c->mf.nice_match_len) {
1353 in_next = lzms_skip_bytes(c, rep_len, in_next);
1355 lzms_encode_item_list(c, cur_node);
1356 lzms_encode_item(c, rep_len, rep_idx);
1358 c->optimum_nodes[0].state = cur_node->state;
1359 cur_node = &c->optimum_nodes[0];
1361 cur_node->state.upcoming_lz_offset =
1362 cur_node->state.recent_lz_offsets[rep_idx];
1363 cur_node->state.upcoming_delta_pair = 0;
1364 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1365 cur_node->state.recent_lz_offsets[i] =
1366 cur_node->state.recent_lz_offsets[i + 1];
1367 lzms_update_lru_queues(&cur_node->state);
1368 lzms_update_main_state(&cur_node->state, 1);
1369 lzms_update_match_state(&cur_node->state, 0);
1370 lzms_update_lz_state(&cur_node->state, 1);
1371 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1375 while (end_node < cur_node + rep_len)
1376 (++end_node)->cost = INFINITE_COST;
1378 u32 base_cost = cur_node->cost +
1379 lzms_bit_1_cost(cur_node->state.main_state,
1381 lzms_bit_0_cost(cur_node->state.match_state,
1383 lzms_bit_1_cost(cur_node->state.lz_state,
1386 for (int i = 0; i < rep_idx; i++)
1387 base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1388 c->probs.lz_rep[i]);
1390 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1391 base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1392 c->probs.lz_rep[rep_idx]);
1396 u32 cost = base_cost + lzms_fast_length_cost(c, len);
1397 if (cost < (cur_node + len)->cost) {
1398 (cur_node + len)->cost = cost;
1399 (cur_node + len)->item = (struct lzms_item) {
1403 (cur_node + len)->num_extra_items = 0;
1405 } while (++len <= rep_len);
1408 /* try LZ-rep + lit + LZ-rep0 */
1409 if (c->try_lzrep_lit_lzrep0 &&
1410 in_end - (in_next + rep_len) >= 3 &&
1411 load_u16_unaligned(in_next + rep_len + 1) ==
1412 load_u16_unaligned(matchptr + rep_len + 1))
1414 const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1415 matchptr + rep_len + 1,
1417 min(c->mf.nice_match_len,
1418 in_end - (in_next + rep_len + 1)));
1420 unsigned main_state = cur_node->state.main_state;
1421 unsigned match_state = cur_node->state.match_state;
1422 unsigned lz_state = cur_node->state.lz_state;
1423 unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1425 /* update states for LZ-rep */
1426 main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1427 match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1428 lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1429 lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1430 LZMS_NUM_LZ_REP_PROBS;
1433 u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1435 /* add literal cost */
1436 cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1438 /* update state for literal */
1439 main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1441 /* add LZ-rep0 cost */
1442 cost += lzms_bit_1_cost(main_state, c->probs.main) +
1443 lzms_bit_0_cost(match_state, c->probs.match) +
1444 lzms_bit_1_cost(lz_state, c->probs.lz) +
1445 lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
1446 lzms_fast_length_cost(c, rep0_len);
1448 const u32 total_len = rep_len + 1 + rep0_len;
1450 while (end_node < cur_node + total_len)
1451 (++end_node)->cost = INFINITE_COST;
1453 if (cost < (cur_node + total_len)->cost) {
1454 (cur_node + total_len)->cost = cost;
1455 (cur_node + total_len)->item = (struct lzms_item) {
1459 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1461 .source = *(in_next + rep_len),
1463 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1467 (cur_node + total_len)->num_extra_items = 2;
1473 /* Repeat offset delta matches */
1474 if (c->use_delta_matches &&
1475 likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1476 (in_end - in_next >= 2)))
1478 for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1480 /* Looking for a repeat offset delta match at
1481 * queue index @rep_idx */
1483 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1484 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1485 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1486 const u32 span = (u32)1 << power;
1487 const u32 offset = raw_offset << power;
1488 const u8 * const matchptr = in_next - offset;
1490 /* Check the first 2 bytes before entering the
1491 * extension loop. */
1492 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1493 (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1494 ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1495 (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1498 /* Extend the match to its full length. */
1499 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1500 2, in_end - in_next,
1503 /* Early out for long repeat offset delta match */
1504 if (rep_len >= c->mf.nice_match_len) {
1506 in_next = lzms_skip_bytes(c, rep_len, in_next);
1508 lzms_encode_item_list(c, cur_node);
1509 lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1511 c->optimum_nodes[0].state = cur_node->state;
1512 cur_node = &c->optimum_nodes[0];
1514 cur_node->state.upcoming_delta_pair = pair;
1515 cur_node->state.upcoming_lz_offset = 0;
1516 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1517 cur_node->state.recent_delta_pairs[i] =
1518 cur_node->state.recent_delta_pairs[i + 1];
1519 lzms_update_lru_queues(&cur_node->state);
1520 lzms_update_main_state(&cur_node->state, 1);
1521 lzms_update_match_state(&cur_node->state, 1);
1522 lzms_update_delta_state(&cur_node->state, 1);
1523 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1527 while (end_node < cur_node + rep_len)
1528 (++end_node)->cost = INFINITE_COST;
1530 u32 base_cost = cur_node->cost +
1531 lzms_bit_1_cost(cur_node->state.main_state,
1533 lzms_bit_1_cost(cur_node->state.match_state,
1535 lzms_bit_1_cost(cur_node->state.delta_state,
1538 for (int i = 0; i < rep_idx; i++)
1539 base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1540 c->probs.delta_rep[i]);
1542 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1543 base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1544 c->probs.delta_rep[rep_idx]);
1548 u32 cost = base_cost + lzms_fast_length_cost(c, len);
1549 if (cost < (cur_node + len)->cost) {
1550 (cur_node + len)->cost = cost;
1551 (cur_node + len)->item = (struct lzms_item) {
1553 .source = DELTA_SOURCE_TAG | rep_idx,
1555 (cur_node + len)->num_extra_items = 0;
1557 } while (++len <= rep_len);
1561 /* Explicit offset LZ matches */
1562 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1565 u32 best_len = c->matches[0].length;
1567 /* Early out for long explicit offset LZ match */
1568 if (best_len >= c->mf.nice_match_len) {
1570 const u32 offset = c->matches[0].offset;
1572 /* Extend the match as far as possible.
1573 * This is necessary because the LCP-interval
1574 * tree matchfinder only reports up to
1575 * nice_match_len bytes. */
1576 best_len = lz_extend(in_next, in_next - offset,
1577 best_len, in_end - in_next);
1579 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1581 lzms_encode_item_list(c, cur_node);
1582 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1584 c->optimum_nodes[0].state = cur_node->state;
1585 cur_node = &c->optimum_nodes[0];
1587 cur_node->state.upcoming_lz_offset = offset;
1588 cur_node->state.upcoming_delta_pair = 0;
1589 lzms_update_lru_queues(&cur_node->state);
1590 lzms_update_main_state(&cur_node->state, 1);
1591 lzms_update_match_state(&cur_node->state, 0);
1592 lzms_update_lz_state(&cur_node->state, 0);
1596 while (end_node < cur_node + best_len)
1597 (++end_node)->cost = INFINITE_COST;
1599 u32 base_cost = cur_node->cost +
1600 lzms_bit_1_cost(cur_node->state.main_state,
1602 lzms_bit_0_cost(cur_node->state.match_state,
1604 lzms_bit_0_cost(cur_node->state.lz_state,
1607 if (c->try_lzmatch_lit_lzrep0 &&
1608 likely(in_end - (in_next + c->matches[0].length) >= 3))
1610 /* try LZ-match + lit + LZ-rep0 */
1613 u32 i = num_matches - 1;
1615 const u32 len = c->matches[i].length;
1616 const u32 offset = c->matches[i].offset;
1617 const u32 position_cost = base_cost +
1618 lzms_lz_offset_cost(c, offset);
1620 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1621 if (cost < (cur_node + l)->cost) {
1622 (cur_node + l)->cost = cost;
1623 (cur_node + l)->item = (struct lzms_item) {
1625 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1627 (cur_node + l)->num_extra_items = 0;
1629 } while (++l <= len);
1631 const u8 * const matchptr = in_next - offset;
1632 if (load_u16_unaligned(matchptr + len + 1) !=
1633 load_u16_unaligned(in_next + len + 1))
1636 const u32 rep0_len = lz_extend(in_next + len + 1,
1639 min(c->mf.nice_match_len,
1640 in_end - (in_next + len + 1)));
1642 unsigned main_state = cur_node->state.main_state;
1643 unsigned match_state = cur_node->state.match_state;
1644 unsigned lz_state = cur_node->state.lz_state;
1646 /* update states for LZ-match */
1647 main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1648 match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1649 lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1652 u32 cost = position_cost + lzms_fast_length_cost(c, len);
1654 /* add literal cost */
1655 cost += lzms_literal_cost(c, main_state, *(in_next + len));
1657 /* update state for literal */
1658 main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1660 /* add LZ-rep0 cost */
1661 cost += lzms_bit_1_cost(main_state, c->probs.main) +
1662 lzms_bit_0_cost(match_state, c->probs.match) +
1663 lzms_bit_1_cost(lz_state, c->probs.lz) +
1664 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1665 c->probs.lz_rep[0]) +
1666 lzms_fast_length_cost(c, rep0_len);
1668 const u32 total_len = len + 1 + rep0_len;
1670 while (end_node < cur_node + total_len)
1671 (++end_node)->cost = INFINITE_COST;
1673 if (cost < (cur_node + total_len)->cost) {
1674 (cur_node + total_len)->cost = cost;
1675 (cur_node + total_len)->item = (struct lzms_item) {
1679 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1681 .source = *(in_next + len),
1683 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1685 .source = offset + LZMS_NUM_LZ_REPS - 1,
1687 (cur_node + total_len)->num_extra_items = 2;
1692 u32 i = num_matches - 1;
1694 u32 position_cost = base_cost +
1695 lzms_lz_offset_cost(c, c->matches[i].offset);
1697 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1698 if (cost < (cur_node + l)->cost) {
1699 (cur_node + l)->cost = cost;
1700 (cur_node + l)->item = (struct lzms_item) {
1702 .source = c->matches[i].offset +
1703 (LZMS_NUM_LZ_REPS - 1),
1705 (cur_node + l)->num_extra_items = 0;
1707 } while (++l <= c->matches[i].length);
1712 /* Explicit offset delta matches */
1713 if (c->use_delta_matches &&
1714 likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1716 const u32 pos = in_next - c->in_buffer;
1718 /* Consider each possible power (log2 of span) */
1719 STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
1720 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1722 const u32 span = (u32)1 << power;
1724 if (unlikely(pos < span))
1727 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1728 const u32 hash = c->next_delta_hashes[power];
1729 const u32 cur_match = c->delta_hash_table[hash];
1731 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1732 c->next_delta_hashes[power] = next_hash;
1733 prefetchw(&c->delta_hash_table[next_hash]);
1735 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1738 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1740 /* The offset must be a multiple of span. */
1741 if (offset & (span - 1))
1744 const u8 * const matchptr = in_next - offset;
1746 /* Check the first 3 bytes before entering the
1747 * extension loop. */
1748 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1749 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1750 (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1751 ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1752 (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1753 ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1754 (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1757 /* Extend the delta match to its full length. */
1758 const u32 len = lzms_extend_delta_match(in_next,
1760 NBYTES_HASHED_FOR_DELTA,
1764 const u32 raw_offset = offset >> power;
1766 if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
1767 (LZMS_NUM_DELTA_REPS - 1)))
1770 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1772 const u32 source = DELTA_SOURCE_TAG |
1773 (pair + LZMS_NUM_DELTA_REPS - 1);
1775 /* Early out for long explicit offset delta match */
1776 if (len >= c->mf.nice_match_len) {
1778 in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1780 lzms_encode_item_list(c, cur_node);
1781 lzms_encode_item(c, len, source);
1783 c->optimum_nodes[0].state = cur_node->state;
1784 cur_node = &c->optimum_nodes[0];
1786 cur_node->state.upcoming_lz_offset = 0;
1787 cur_node->state.upcoming_delta_pair = pair;
1788 lzms_update_lru_queues(&cur_node->state);
1789 lzms_update_main_state(&cur_node->state, 1);
1790 lzms_update_match_state(&cur_node->state, 1);
1791 lzms_update_delta_state(&cur_node->state, 0);
1795 while (end_node < cur_node + len)
1796 (++end_node)->cost = INFINITE_COST;
1798 u32 base_cost = cur_node->cost +
1799 lzms_bit_1_cost(cur_node->state.main_state,
1801 lzms_bit_1_cost(cur_node->state.match_state,
1803 lzms_bit_0_cost(cur_node->state.delta_state,
1805 lzms_delta_source_cost(c, power, raw_offset);
1807 u32 l = NBYTES_HASHED_FOR_DELTA;
1809 u32 cost = base_cost + lzms_fast_length_cost(c, l);
1810 if (cost < (cur_node + l)->cost) {
1811 (cur_node + l)->cost = cost;
1812 (cur_node + l)->item = (struct lzms_item) {
1816 (cur_node + l)->num_extra_items = 0;
1818 } while (++l <= len);
1823 if (end_node < cur_node + 1)
1824 (++end_node)->cost = INFINITE_COST;
1825 const u32 cur_and_lit_cost = cur_node->cost +
1826 lzms_literal_cost(c, cur_node->state.main_state,
1828 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1829 (cur_node + 1)->cost = cur_and_lit_cost;
1830 (cur_node + 1)->item = (struct lzms_item) {
1834 (cur_node + 1)->num_extra_items = 0;
1835 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1836 /* try lit + LZ-rep0 */
			const u32 offset =
				(cur_node->state.prev_lz_offset) ?
					cur_node->state.prev_lz_offset :
					cur_node->state.recent_lz_offsets[0];

			if (load_u16_unaligned(in_next + 1) ==
			    load_u16_unaligned(in_next + 1 - offset))
			{
				const u32 rep0_len = lz_extend(in_next + 1,
							       in_next + 1 - offset,
							       2,
							       min(in_end - (in_next + 1),
								   c->mf.nice_match_len));

				unsigned main_state = cur_node->state.main_state;

				/* Update main_state after literal */
				main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;

				/* Add cost of LZ-rep0 */
				const u32 cost = cur_and_lit_cost +
						 lzms_bit_1_cost(main_state, c->probs.main) +
						 lzms_bit_0_cost(cur_node->state.match_state,
								 c->probs.match) +
						 lzms_bit_1_cost(cur_node->state.lz_state,
								 c->probs.lz) +
						 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
								 c->probs.lz_rep[0]) +
						 lzms_fast_length_cost(c, rep0_len);

				const u32 total_len = 1 + rep0_len;

				while (end_node < cur_node + total_len)
					(++end_node)->cost = INFINITE_COST;

				if (cost < (cur_node + total_len)->cost) {
					(cur_node + total_len)->cost = cost;
					(cur_node + total_len)->item = (struct lzms_item) {
						.length = rep0_len,
						.source = 0,
					};
					(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
						.length = 1,
						.source = *in_next,
					};
					(cur_node + total_len)->num_extra_items = 1;
				}
			}
		}

		/* Advance to the next position. */
		cur_node++;
		in_next++;

		/* The lowest-cost path to the current position is now known.
		 * Finalize the adaptive state that results from taking this
		 * lowest-cost path. */
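		/* The path is recovered by walking backwards: 'item' is the
		 * last item on the path to this node and 'extra_items[]' holds
		 * the items before it.  Start from the state of the node where
		 * the path began, then replay the items earliest-to-latest,
		 * mirroring the state updates the encoder will perform. */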
		struct lzms_item item_to_take = cur_node->item;
		struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
		int next_item_idx = -1;
		for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
			item_to_take = cur_node->extra_items[i];
			source_node -= item_to_take.length;
			next_item_idx++;
		}
		cur_node->state = source_node->state;
		for (;;) {
			const u32 length = item_to_take.length;
			u32 source = item_to_take.source;

			cur_node->state.upcoming_lz_offset = 0;
			cur_node->state.upcoming_delta_pair = 0;
			if (length > 1) {
				/* Match */

				lzms_update_main_state(&cur_node->state, 1);

				if (source & DELTA_SOURCE_TAG) {
					/* Delta match */

					lzms_update_match_state(&cur_node->state, 1);
					source &= ~DELTA_SOURCE_TAG;

					if (source >= LZMS_NUM_DELTA_REPS) {
						/* Explicit offset delta match */
						lzms_update_delta_state(&cur_node->state, 0);
						cur_node->state.upcoming_delta_pair =
							source - (LZMS_NUM_DELTA_REPS - 1);
					} else {
						/* Repeat offset delta match */
						int rep_idx = source;

						lzms_update_delta_state(&cur_node->state, 1);
						lzms_update_delta_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_delta_pair =
							cur_node->state.recent_delta_pairs[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
							cur_node->state.recent_delta_pairs[i] =
								cur_node->state.recent_delta_pairs[i + 1];
					}
				} else {
					lzms_update_match_state(&cur_node->state, 0);

					if (source >= LZMS_NUM_LZ_REPS) {
						/* Explicit offset LZ match */
						lzms_update_lz_state(&cur_node->state, 0);
						cur_node->state.upcoming_lz_offset =
							source - (LZMS_NUM_LZ_REPS - 1);
					} else {
						/* Repeat offset LZ match */
						int rep_idx = source;

						lzms_update_lz_state(&cur_node->state, 1);
						lzms_update_lz_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_lz_offset =
							cur_node->state.recent_lz_offsets[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
							cur_node->state.recent_lz_offsets[i] =
								cur_node->state.recent_lz_offsets[i + 1];
					}
				}
			} else {
				/* Literal */
				lzms_update_main_state(&cur_node->state, 0);
			}

			lzms_update_lru_queues(&cur_node->state);

			if (next_item_idx < 0)
				break;
			if (next_item_idx == 0)
				item_to_take = cur_node->item;
			else
				item_to_take = cur_node->extra_items[next_item_idx - 1];
			--next_item_idx;
		}

		/*
		 * This loop will terminate when either of the following
		 * conditions is true:
		 *
		 * (1) cur_node == end_node
		 *
		 *	There are no paths that extend beyond the current
		 *	position.  In this case, any path to a later position
		 *	must pass through the current position, so we can go
		 *	ahead and choose the list of items that led to this
		 *	position.
		 *
		 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
		 *
		 *	This bounds the number of times the algorithm can step
		 *	forward before it is guaranteed to start choosing items.
		 *	This limits the memory usage.  It also guarantees that
		 *	the parser will not go too long without updating the
		 *	probability tables.
		 *
		 * Note: no check for end-of-buffer is needed because
		 * end-of-buffer will trigger condition (1).
		 */
		if (cur_node == end_node ||
		    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
		{
			lzms_encode_nonempty_item_list(c, cur_node);
			c->optimum_nodes[0].state = cur_node->state;
			goto begin;
		}
	}
}

static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
	c->main_state = 0;
	c->match_state = 0;
	c->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		c->lz_rep_states[i] = 0;
	c->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		c->delta_rep_states[i] = 0;

	lzms_init_probabilities(&c->probs);
}

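/*
 * Initialize the Huffman codes for the literals, lengths, LZ offsets, delta
 * offsets, and delta powers.  The number of offset slots in use depends on the
 * size of the data being compressed.  Each code is periodically rebuilt from
 * recent symbol frequencies (per its *_CODE_REBUILD_FREQ), so it adapts to the
 * data as encoding proceeds.
 */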
static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
	lzms_init_huffman_code(&c->literal_rebuild_info,
			       LZMS_NUM_LITERAL_SYMS,
			       LZMS_LITERAL_CODE_REBUILD_FREQ,
			       c->literal_codewords,
			       c->literal_lens,
			       c->literal_freqs);

	lzms_init_huffman_code(&c->lz_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
			       c->lz_offset_codewords,
			       c->lz_offset_lens,
			       c->lz_offset_freqs);

	lzms_init_huffman_code(&c->length_rebuild_info,
			       LZMS_NUM_LENGTH_SYMS,
			       LZMS_LENGTH_CODE_REBUILD_FREQ,
			       c->length_codewords,
			       c->length_lens,
			       c->length_freqs);

	lzms_init_huffman_code(&c->delta_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
			       c->delta_offset_codewords,
			       c->delta_offset_lens,
			       c->delta_offset_freqs);

	lzms_init_huffman_code(&c->delta_power_rebuild_info,
			       LZMS_NUM_DELTA_POWER_SYMS,
			       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
			       c->delta_power_codewords,
			       c->delta_power_lens,
			       c->delta_power_freqs);
}

/*
 * Flush the output streams, prepare the final compressed data, and return its
 * size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit
 * in the available space.
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
	size_t num_forwards_bytes;
	size_t num_backwards_bytes;

	/* Flush both the forwards and backwards streams, and make sure they
	 * didn't cross each other and start overwriting each other's data. */
	if (!lzms_output_bitstream_flush(&c->os))
		return 0;

	if (!lzms_range_encoder_flush(&c->rc))
		return 0;

	if (c->rc.next > c->os.next)
		return 0;

	/* Now the compressed buffer contains the data output by the forwards
	 * bitstream, then empty space, then data output by the backwards
	 * bitstream.  Move the data output by the backwards bitstream to be
	 * adjacent to the data output by the forwards bitstream, and calculate
	 * the compressed size that this results in. */
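	/*
	 * Illustration of the buffer layout at this point (rc = range encoder,
	 * os = backwards bitstream):
	 *
	 *   | rc output |        unused         |       os output       |
	 *   ^rc.begin   ^rc.next                ^os.next                ^rc.end
	 */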
	num_forwards_bytes = c->rc.next - c->rc.begin;
	num_backwards_bytes = c->rc.end - c->os.next;

	memmove(c->rc.next, c->os.next, num_backwards_bytes);

	return num_forwards_bytes + num_backwards_bytes;
}

static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
		       bool destructive)
{
	u64 size = 0;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return 0;

	size += sizeof(struct lzms_compressor);

	if (!destructive)
		size += max_bufsize; /* in_buffer */

	size += lcpit_matchfinder_get_needed_memory(max_bufsize);

	return size;
}

static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
		       bool destructive, void **c_ret)
{
	struct lzms_compressor *c;
	u32 nice_match_len;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return WIMLIB_ERR_INVALID_PARAM;

	c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
	if (!c)
		goto oom0;

	c->destructive = destructive;

	/* Scale nice_match_len with the compression level.  But to allow an
	 * optimization for length cost calculations, don't allow nice_match_len
	 * to exceed MAX_FAST_LENGTH. */
	nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
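	/* For example, the default level (50) gives nice_match_len == 63,
	 * while levels of roughly 203 and above are clamped to
	 * MAX_FAST_LENGTH (255). */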
	c->use_delta_matches = (compression_level >= 35);
	c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
	c->try_lit_lzrep0 = (compression_level >= 60);
	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
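
	/* Higher levels enable progressively more expensive parser heuristics:
	 * delta matchfinding first, then the various literal/rep0 combination
	 * checks.  The thresholds above are tuning choices. */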
	if (!c->destructive) {
		c->in_buffer = MALLOC(max_bufsize);
		if (!c->in_buffer)
			goto oom1;
	}

	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
		goto oom2;

	lzms_init_fast_length_slot_tab(c);
	lzms_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom2:
	if (!c->destructive)
		FREE(c->in_buffer);
oom1:
	ALIGNED_FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}

static size_t
lzms_compress(const void *restrict in, size_t in_nbytes,
	      void *restrict out, size_t out_nbytes_avail, void *restrict _c)
{
	struct lzms_compressor *c = _c;
	size_t result;

	/* Don't bother trying to compress extremely small inputs. */
	if (in_nbytes < 4)
		return 0;

	/* Copy the input data into the internal buffer and preprocess it. */
	if (c->destructive)
		c->in_buffer = (void *)in;
	else
		memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);

	/* Prepare the matchfinders. */
	lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
	if (c->use_delta_matches)
		lzms_init_delta_matchfinder(c);

	/* Initialize the encoder structures. */
	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail);
	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail);
	lzms_init_states_and_probabilities(c);
	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));

	/* The main loop: parse and encode. */
	lzms_near_optimal_parse(c);

	/* Return the compressed data size or 0. */
	result = lzms_finalize(c);
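	/* In destructive mode the caller's buffer was filtered in place, so if
	 * the compressed result will not be used, undo the x86 filter to
	 * restore the original data. */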
	if (!result && c->destructive)
		lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
	return result;
}

static void
lzms_free_compressor(void *_c)
{
	struct lzms_compressor *c = _c;

	if (!c->destructive)
		FREE(c->in_buffer);
	lcpit_matchfinder_destroy(&c->mf);
	ALIGNED_FREE(c);
}

const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory = lzms_get_needed_memory,
	.create_compressor = lzms_create_compressor,
	.compress = lzms_compress,
	.free_compressor = lzms_free_compressor,
};