/*
 * lzms_compress.c
 *
 * A compressor for the LZMS compression format.
 */

/*
 * Copyright (C) 2013, 2014, 2015 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */
#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/error.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash.h"
#include "wimlib/lzms_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"
/*
 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
 * looked up directly in 'fast_length_slot_tab' and the length cost can be
 * looked up directly in 'fast_length_cost_tab'.
 *
 * We also limit the 'nice_match_len' parameter to this value. Consequently, if
 * the longest match found is shorter than 'nice_match_len', then it must also
 * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups
 * of length costs using 'fast_length_cost_tab' without having to keep checking
 * whether the length exceeds MAX_FAST_LENGTH or not.
 */
#define MAX_FAST_LENGTH		255
/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
 * step forward before forcing the pending items to be encoded. If this value
 * is increased, then there will be fewer forced flushes, but the probability
 * entries and Huffman codes will be more likely to become outdated. */
#define NUM_OPTIM_NODES		2048
/* COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs. A single bit has a cost of (1 << COST_SHIFT). */
#define COST_SHIFT		6
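/*
 * Illustrative calculation (not from the original source): with
 * COST_SHIFT == 6, one bit costs 1 << 6 = 64 cost units. A fractional cost
 * such as -log2(0.75) ~= 0.415 bits is then represented as
 * round(0.415 * 64) = 27 cost units, which is exactly the granularity used
 * by the 'lzms_bit_costs' table defined later in this file.
 */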
/* Length of the hash table for finding delta matches */
#define DELTA_HASH_ORDER	17
#define DELTA_HASH_LENGTH	((u32)1 << DELTA_HASH_ORDER)

/* The number of bytes to hash when finding delta matches; also taken to be the
 * minimum length of an explicit offset delta match */
#define NBYTES_HASHED_FOR_DELTA	3
/* The number of delta match powers to consider (must be <=
 * LZMS_NUM_DELTA_POWER_SYMS) */
#define NUM_POWERS_TO_CONSIDER	6
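/*
 * Illustrative note (not from the original source): since a delta match's
 * span is 1 << power, NUM_POWERS_TO_CONSIDER == 6 means the spans considered
 * are 1, 2, 4, 8, 16, and 32 bytes.
 */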
/* This structure tracks the state of writing bits as a series of 16-bit coding
 * units, starting at the end of the output buffer and proceeding backwards. */
struct lzms_output_bitstream {

	/* Bits that haven't yet been written to the output buffer */
	u64 bitbuf;

	/* Number of bits currently held in @bitbuf */
	unsigned bitcount;

	/* Pointer to one past the next position in the output buffer at which
	 * to output a 16-bit coding unit */
	le16 *next;

	/* Pointer to the beginning of the output buffer (this is the "end" when
	 * writing backwards!) */
	le16 *begin;
};
/* This structure tracks the state of range encoding and its output, which
 * starts at the beginning of the output buffer and proceeds forwards. */
struct lzms_range_encoder {

	/* The lower boundary of the current range. Logically, this is a 33-bit
	 * integer whose high bit is needed to detect carries. */
	u64 lower_bound;

	/* The size of the current range */
	u32 range_size;

	/* The next 16-bit coding unit to output */
	u32 cache;

	/* The number of 16-bit coding units whose output has been delayed due
	 * to possible carrying. The first such coding unit is @cache; all
	 * subsequent such coding units are 0xffff. */
	u32 cache_size;

	/* Pointer to the beginning of the output buffer */
	le16 *begin;

	/* Pointer to the position in the output buffer at which the next coding
	 * unit must be written */
	le16 *next;

	/* Pointer to just past the end of the output buffer */
	le16 *end;
};
/* Bookkeeping information for an adaptive Huffman code */
struct lzms_huffman_rebuild_info {

	/* The remaining number of symbols to encode until this code must be
	 * rebuilt */
	unsigned num_syms_until_rebuild;

	/* The number of symbols in this code */
	unsigned num_syms;

	/* The rebuild frequency of this code, in symbols */
	unsigned rebuild_freq;

	/* The Huffman codeword of each symbol in this code */
	u32 *codewords;

	/* The length of each Huffman codeword, in bits */
	u8 *lens;

	/* The frequency of each symbol in this code */
	u32 *freqs;
};
/*
 * The compressor-internal representation of a match or literal.
 *
 * Literals have length=1; matches have length > 1. (We disallow matches of
 * length 1, even though this is a valid length in LZMS.)
 *
 * The source is encoded as follows:
 *
 * - Literals: the literal byte itself
 * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
 * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
 * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
 *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
 * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
 *   the index of the (power, raw_offset) pair in recent_delta_pairs
 */
struct lzms_item {
	u32 length;
	u32 source;
};

#define DELTA_SOURCE_TAG		((u32)1 << 31)
#define DELTA_SOURCE_POWER_SHIFT	28
#define DELTA_SOURCE_RAW_OFFSET_MASK	(((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)

static inline void
check_that_powers_fit_in_bitfield(void)
{
	BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
}
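/*
 * Illustrative sketch (not part of the original compressor): how an explicit
 * offset delta match source would be packed according to the encoding
 * described above. 'lzms_example_pack_delta_source' is a hypothetical helper
 * used only for this example.
 */
static inline u32
lzms_example_pack_delta_source(u32 power, u32 raw_offset)
{
	/* The tag marks this as a delta source; the power occupies the bits
	 * just below the tag; the low bits hold the biased raw offset. */
	return DELTA_SOURCE_TAG |
	       (power << DELTA_SOURCE_POWER_SHIFT) |
	       (raw_offset + (LZMS_NUM_DELTA_REPS - 1));
}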
/* A stripped-down version of the adaptive state in LZMS which excludes the
 * probability entries and Huffman codes */
struct lzms_adaptive_state {

	/* Recent offsets for LZ matches */
	u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
	u32 prev_lz_offset; /* 0 means none */
	u32 upcoming_lz_offset; /* 0 means none */

	/* Recent (power, raw offset) pairs for delta matches.
	 * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
	 * offset, and the high bits are the power. */
	u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
	u32 prev_delta_pair; /* 0 means none */
	u32 upcoming_delta_pair; /* 0 means none */

	/* States for predicting the probabilities of item types */
	u8 main_state;
	u8 match_state;
	u8 lz_state;
	u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
	u8 delta_state;
	u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
} _aligned_attribute(64);
/*
 * This structure represents a byte position in the preprocessed input data and
 * a node in the graph of possible match/literal choices.
 *
 * Logically, each incoming edge to this node is labeled with a literal or a
 * match that can be taken to reach this position from an earlier position; and
 * each outgoing edge from this node is labeled with a literal or a match that
 * can be taken to advance from this position to a later position.
 */
struct lzms_optimum_node {

	/*
	 * The cost of the lowest-cost path that has been found to reach this
	 * position. This can change as progressively lower cost paths are
	 * found to reach this position.
	 */
	u32 cost;
#define INFINITE_COST UINT32_MAX

	/*
	 * @item is the last item that was taken to reach this position with
	 * the stored @cost. This can change as progressively lower cost
	 * paths are found to reach this position.
	 *
	 * In some cases we look ahead more than one item. If we looked ahead n
	 * items to reach this position, then @item is the last item taken,
	 * @extra_items contains the other items ordered from second-to-last to
	 * first, and @num_extra_items is n - 1.
	 */
	unsigned num_extra_items;
	struct lzms_item item;
	struct lzms_item extra_items[2];

	/*
	 * The adaptive state that exists at this position. This is filled in
	 * lazily, only after the minimum-cost path to this position is found.
	 *
	 * Note: the way the algorithm handles this adaptive state in the
	 * "minimum-cost" parse is actually only an approximation. It's
	 * possible for the globally optimal, minimum cost path to contain a
	 * prefix, ending at a position, where that path prefix is *not* the
	 * minimum cost path to that position. This can happen if such a path
	 * prefix results in a different adaptive state which results in lower
	 * costs later. Although the algorithm does do some heuristic
	 * multi-item lookaheads, it does not solve this problem in general.
	 *
	 * Note: this adaptive state structure also does not include the
	 * probability entries or current Huffman codewords. Those aren't
	 * maintained per-position and are only updated occasionally.
	 */
	struct lzms_adaptive_state state;
} _aligned_attribute(64);
/* The main compressor structure */
struct lzms_compressor {

	/* The matchfinder for LZ matches */
	struct lcpit_matchfinder mf;

	/* The preprocessed buffer of data being compressed */
	u8 *in_buffer;

	/* The number of bytes of data to be compressed, which is the number of
	 * bytes of data in @in_buffer that are actually valid */
	u32 in_nbytes;

	/*
	 * Boolean flags to enable consideration of various types of multi-step
	 * operations during parsing.
	 *
	 * Among other cases, multi-step operations can help with gaps where two
	 * matches are separated by a non-matching byte.
	 *
	 * This idea is borrowed from Igor Pavlov's LZMA encoder.
	 */
	bool try_lit_lzrep0;
	bool try_lzrep_lit_lzrep0;
	bool try_lzmatch_lit_lzrep0;

	/*
	 * If true, the compressor can use delta matches. This slows down
	 * compression. It improves the compression ratio greatly, slightly, or
	 * not at all, depending on the input data.
	 */
	bool use_delta_matches;

	/* 'last_target_usages' is a large array that is only needed for
	 * preprocessing, so it is in union with fields that don't need to be
	 * initialized until after preprocessing. */
	union {
	struct {

	/* Temporary space to store matches found by the LZ matchfinder */
	struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];

	/* Hash table for finding delta matches */
	u32 delta_hash_table[DELTA_HASH_LENGTH];

	/* For each delta power, the hash code for the next sequence */
	u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];

	/* The per-byte graph nodes for near-optimal parsing */
	struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH];

	/* Table: length => current cost for small match lengths */
	u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];

	/* Range encoder which outputs to the beginning of the compressed data
	 * buffer, proceeding forwards */
	struct lzms_range_encoder rc;

	/* Bitstream which outputs to the end of the compressed data buffer,
	 * proceeding backwards */
	struct lzms_output_bitstream os;

	/* States and probability entries for item type disambiguation */
	unsigned main_state;
	unsigned match_state;
	unsigned lz_state;
	unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
	unsigned delta_state;
	unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
	struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS];
	struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS];
	struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS];
	struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS]
						  [LZMS_NUM_LZ_REP_PROBS];
	struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS];
	struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS]
						     [LZMS_NUM_DELTA_REP_PROBS];

	struct lzms_huffman_rebuild_info literal_rebuild_info;
	u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
	u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
	u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];

	struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
	u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
	u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
	u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

	struct lzms_huffman_rebuild_info length_rebuild_info;
	u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
	u8 length_lens[LZMS_NUM_LENGTH_SYMS];
	u32 length_freqs[LZMS_NUM_LENGTH_SYMS];

	struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
	u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
	u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
	u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];

	struct lzms_huffman_rebuild_info delta_power_rebuild_info;
	u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
	u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
	u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];

	}; /* struct */

	s32 last_target_usages[65536];

	}; /* union */

	/* Table: length => length slot for small match lengths */
	u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];

	/* Tables for mapping offsets to offset slots */

	/* slots [0, 167); 0 <= num_extra_bits <= 10 */
	u8 offset_slot_tab_1[0xe4a5];

	/* slots [167, 427); 11 <= num_extra_bits <= 15 */
	u16 offset_slot_tab_2[0x3d0000 >> 11];

	/* slots [427, 799); 16 <= num_extra_bits */
	u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
};
/******************************************************************************
 *                    Offset and length slot acceleration                     *
 ******************************************************************************/
/* Generate the acceleration table for length slots. */
static void
lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
{
	unsigned slot = 0;
	for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
		if (len >= lzms_length_slot_base[slot + 1])
			slot++;
		c->fast_length_slot_tab[len] = slot;
	}
}
/* Generate the acceleration tables for offset slots. */
static void
lzms_init_offset_slot_tabs(struct lzms_compressor *c)
{
	u32 offset;
	unsigned slot = 0;

	/* slots [0, 167); 0 <= num_extra_bits <= 10 */
	for (offset = 1; offset < 0xe4a5; offset++) {
		if (offset >= lzms_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_1[offset] = slot;
	}

	/* slots [167, 427); 11 <= num_extra_bits <= 15 */
	for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
		if (offset >= lzms_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
	}

	/* slots [427, 799); 16 <= num_extra_bits */
	for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
		if (offset >= lzms_offset_slot_base[slot + 1])
			slot++;
		c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
	}
}
/*
 * Return the length slot for the specified match length, using the compressor's
 * acceleration table if the length is small enough.
 */
static inline unsigned
lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
{
	if (likely(length <= MAX_FAST_LENGTH))
		return c->fast_length_slot_tab[length];
	return lzms_get_length_slot(length);
}
/*
 * Return the offset slot for the specified match offset, using the compressor's
 * acceleration tables to speed up the mapping.
 */
static inline unsigned
lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
{
	if (offset < 0xe4a5)
		return c->offset_slot_tab_1[offset];
	offset -= 0xe4a5;
	if (offset < 0x3d0000)
		return c->offset_slot_tab_2[offset >> 11];
	return c->offset_slot_tab_3[offset >> 16];
}
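/*
 * Illustrative note (not from the original source): the boundary constant
 * 0xe4a5 (58533) used above is the base offset of slot 167, the first slot
 * that needs more than 10 extra bits. Offsets below it resolve in the
 * byte-indexed 'offset_slot_tab_1'; offsets at or past 0xe4a5 + 0x3d0000
 * fall through to 'offset_slot_tab_3', which has 1 << 16 granularity.
 */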
/******************************************************************************
 *                              Range encoding                                *
 ******************************************************************************/
/*
 * Initialize the range encoder @rc to write forwards to the specified buffer
 * @out that is @count 16-bit integers long.
 */
static void
lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count)
{
	rc->lower_bound = 0;
	rc->range_size = 0xffffffff;
	rc->cache = 0;
	rc->cache_size = 1;
	rc->begin = out;
	rc->next = out - 1;
	rc->end = out + count;
}
/*
 * Attempt to flush bits from the range encoder.
 *
 * The basic idea is that we're writing bits from @rc->lower_bound to the
 * output. However, due to carrying, the writing of coding units with the
 * maximum value, as well as one prior coding unit, must be delayed until it is
 * determined whether a carry is needed.
 *
 * This is based on the public domain code for LZMA written by Igor Pavlov, but
 * with the following differences:
 *
 *	- In LZMS, 16-bit coding units are required rather than 8-bit.
 *
 *	- In LZMS, the first coding unit is not ignored by the decompressor, so
 *	  the encoder cannot output a dummy value to that position.
 */
static void
lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
{
	if ((u32)(rc->lower_bound) < 0xffff0000 ||
	    (u32)(rc->lower_bound >> 32) != 0)
	{
		/* Carry not needed (rc->lower_bound < 0xffff0000), or carry
		 * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
		 * is 1). */
		do {
			if (likely(rc->next >= rc->begin)) {
				if (rc->next != rc->end) {
					put_unaligned_u16_le(rc->cache +
							     (u16)(rc->lower_bound >> 32),
							     rc->next);
				}
			}
			rc->next++;
			rc->cache = 0xffff;
		} while (--rc->cache_size != 0);

		rc->cache = (rc->lower_bound >> 16) & 0xffff;
	}
	++rc->cache_size;
	rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
}
static bool
lzms_range_encoder_flush(struct lzms_range_encoder *rc)
{
	for (int i = 0; i < 4; i++)
		lzms_range_encoder_shift_low(rc);
	return rc->next != rc->end;
}
/*
 * Encode the next bit using the range encoder.
 *
 * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
 * bit is 0 rather than 1.
 */
static inline void
lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
{
	/* Normalize if needed. */
	if (rc->range_size <= 0xffff) {
		rc->range_size <<= 16;
		lzms_range_encoder_shift_low(rc);
	}

	u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;

	if (bit == 0) {
		rc->range_size = bound;
	} else {
		rc->lower_bound += bound;
		rc->range_size -= bound;
	}
}
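/*
 * Worked example (illustrative, not from the original source): suppose
 * rc->range_size == 0x10000000 and prob == 48, with a probability
 * denominator of 64 (so LZMS_PROBABILITY_BITS == 6). Then
 * bound = (0x10000000 >> 6) * 48 = 0x0c000000. Encoding a 0 bit shrinks the
 * range to 0x0c000000 (3/4 of it); encoding a 1 bit advances lower_bound by
 * 0x0c000000 and leaves a range of 0x04000000 (the remaining 1/4).
 */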
/*
 * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and
 * updating the state and its corresponding probability entry.
 */
static inline void
lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
		struct lzms_probability_entry *probs,
		struct lzms_range_encoder *rc)
{
	struct lzms_probability_entry *prob_entry;
	u32 prob;

	/* Load the probability entry for the current state. */
	prob_entry = &probs[*state_p];

	/* Update the state based on the next bit. */
	*state_p = ((*state_p << 1) | bit) & (num_states - 1);

	/* Get the probability that the bit is 0. */
	prob = lzms_get_probability(prob_entry);

	/* Update the probability entry. */
	lzms_update_probability_entry(prob_entry, bit);

	/* Encode the bit using the range encoder. */
	lzms_range_encode_bit(rc, bit, prob);
}
/* Helper functions for encoding bits in the various decision classes */

static void
lzms_encode_main_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
			c->main_probs, &c->rc);
}

static void
lzms_encode_match_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
			c->match_probs, &c->rc);
}

static void
lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
			c->lz_probs, &c->rc);
}

static void
lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
	lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
			c->lz_rep_probs[idx], &c->rc);
}

static void
lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
{
	lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
			c->delta_probs, &c->rc);
}

static void
lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
{
	lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
			c->delta_rep_probs[idx], &c->rc);
}
/******************************************************************************
 *                   Huffman encoding and verbatim bits                       *
 ******************************************************************************/
/*
 * Initialize the output bitstream @os to write backwards to the specified
 * buffer @out that is @count 16-bit integers long.
 */
static void
lzms_output_bitstream_init(struct lzms_output_bitstream *os,
			   le16 *out, size_t count)
{
	os->bitbuf = 0;
	os->bitcount = 0;
	os->next = out + count;
	os->begin = out;
}
/*
 * Write some bits, contained in the low-order @num_bits bits of @bits, to the
 * output bitstream @os.
 *
 * @max_num_bits is a compile-time constant that specifies the maximum number of
 * bits that can ever be written at this call site.
 */
static inline void
lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
		const unsigned num_bits, const unsigned max_num_bits)
{
	/* Add the bits to the bit buffer variable. */
	os->bitcount += num_bits;
	os->bitbuf = (os->bitbuf << num_bits) | bits;

	/* Check whether any coding units need to be written. */
	while (os->bitcount >= 16) {

		os->bitcount -= 16;

		/* Write a coding unit, unless it would underflow the buffer. */
		if (os->next != os->begin)
			put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next);

		/* Optimization for call sites that never write more than 16
		 * bits at once. */
		if (max_num_bits <= 16)
			break;
	}
}
/*
 * Flush the output bitstream, ensuring that all bits written to it have been
 * written to memory. Return %true if all bits have been output successfully,
 * or %false if an overrun occurred.
 */
static bool
lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
{
	if (os->next == os->begin)
		return false;

	if (os->bitcount != 0)
		put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next);

	return true;
}
static void
lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
	make_canonical_huffman_code(rebuild_info->num_syms,
				    LZMS_MAX_CODEWORD_LENGTH,
				    rebuild_info->freqs,
				    rebuild_info->lens,
				    rebuild_info->codewords);
	rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
}
static void
lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
		       unsigned num_syms, unsigned rebuild_freq,
		       u32 *codewords, u8 *lens, u32 *freqs)
{
	rebuild_info->num_syms = num_syms;
	rebuild_info->rebuild_freq = rebuild_freq;
	rebuild_info->codewords = codewords;
	rebuild_info->lens = lens;
	rebuild_info->freqs = freqs;
	lzms_init_symbol_frequencies(freqs, num_syms);
	lzms_build_huffman_code(rebuild_info);
}

static void
lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
{
	lzms_build_huffman_code(rebuild_info);
	lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
}
/*
 * Encode a symbol using the specified Huffman code. Then, if the Huffman code
 * needs to be rebuilt, rebuild it and return true; otherwise return false.
 */
static inline bool
lzms_huffman_encode_symbol(unsigned sym,
			   const u32 *codewords, const u8 *lens, u32 *freqs,
			   struct lzms_output_bitstream *os,
			   struct lzms_huffman_rebuild_info *rebuild_info)
{
	lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
	++freqs[sym];
	if (--rebuild_info->num_syms_until_rebuild == 0) {
		lzms_rebuild_huffman_code(rebuild_info);
		return true;
	}
	return false;
}
/* Helper routines to encode symbols using the various Huffman codes */

static bool
lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->literal_codewords,
					  c->literal_lens, c->literal_freqs,
					  &c->os, &c->literal_rebuild_info);
}

static bool
lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
					  c->lz_offset_lens, c->lz_offset_freqs,
					  &c->os, &c->lz_offset_rebuild_info);
}

static bool
lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->length_codewords,
					  c->length_lens, c->length_freqs,
					  &c->os, &c->length_rebuild_info);
}

static bool
lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
					  c->delta_offset_lens, c->delta_offset_freqs,
					  &c->os, &c->delta_offset_rebuild_info);
}

static bool
lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
{
	return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
					  c->delta_power_lens, c->delta_power_freqs,
					  &c->os, &c->delta_power_rebuild_info);
}
static void
lzms_update_fast_length_costs(struct lzms_compressor *c);
/*
 * Encode a match length. If this causes the Huffman code for length symbols to
 * be rebuilt, also update the length costs array used by the parser.
 */
static void
lzms_encode_length(struct lzms_compressor *c, u32 length)
{
	unsigned slot = lzms_comp_get_length_slot(c, length);

	if (lzms_encode_length_symbol(c, slot))
		lzms_update_fast_length_costs(c);

	lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
			lzms_extra_length_bits[slot],
			LZMS_MAX_EXTRA_LENGTH_BITS);
}
/* Encode the offset of an LZ match. */
static void
lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, offset);

	lzms_encode_lz_offset_symbol(c, slot);
	lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
			lzms_extra_offset_bits[slot],
			LZMS_MAX_EXTRA_OFFSET_BITS);
}
/* Encode the raw offset of a delta match. */
static void
lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);

	lzms_encode_delta_offset_symbol(c, slot);
	lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
			lzms_extra_offset_bits[slot],
			LZMS_MAX_EXTRA_OFFSET_BITS);
}
/******************************************************************************
 *                               Item encoding                                *
 ******************************************************************************/
/* Encode the specified item, which may be a literal or any type of match. */
static void
lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
{
	/* Main bit: 0 = literal, 1 = match */
	int main_bit = (length > 1);
	lzms_encode_main_bit(c, main_bit);

	if (!main_bit) {
		/* Literal */
		unsigned literal = source;
		lzms_encode_literal_symbol(c, literal);
	} else {
		/* Match */

		/* Match bit: 0 = LZ match, 1 = delta match */
		int match_bit = (source & DELTA_SOURCE_TAG) != 0;
		lzms_encode_match_bit(c, match_bit);

		if (!match_bit) {
			/* LZ match */

			/* LZ bit: 0 = explicit offset, 1 = repeat offset */
			int lz_bit = (source < LZMS_NUM_LZ_REPS);
			lzms_encode_lz_bit(c, lz_bit);

			if (!lz_bit) {
				/* Explicit offset LZ match */
				u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
				lzms_encode_lz_offset(c, offset);
			} else {
				/* Repeat offset LZ match */
				int rep_idx = source;
				for (int i = 0; i < rep_idx; i++)
					lzms_encode_lz_rep_bit(c, 1, i);
				if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
					lzms_encode_lz_rep_bit(c, 0, rep_idx);
			}
		} else {
			/* Delta match */

			source &= ~DELTA_SOURCE_TAG;

			/* Delta bit: 0 = explicit offset, 1 = repeat offset */
			int delta_bit = (source < LZMS_NUM_DELTA_REPS);
			lzms_encode_delta_bit(c, delta_bit);

			if (!delta_bit) {
				/* Explicit offset delta match */
				u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
				u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
						 (LZMS_NUM_DELTA_REPS - 1);
				lzms_encode_delta_power_symbol(c, power);
				lzms_encode_delta_raw_offset(c, raw_offset);
			} else {
				/* Repeat offset delta match */
				int rep_idx = source;
				for (int i = 0; i < rep_idx; i++)
					lzms_encode_delta_rep_bit(c, 1, i);
				if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
					lzms_encode_delta_rep_bit(c, 0, rep_idx);
			}
		}

		/* Match length (encoded the same way for any match type) */
		lzms_encode_length(c, length);
	}
}
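/*
 * Illustrative example (not from the original source): a repeat offset LZ
 * match at queue index 1 is signaled by the bit sequence main=1 (match),
 * match=0 (LZ), lz=1 (repeat), rep[0]=1, rep[1]=0 (assuming
 * LZMS_NUM_LZ_REP_DECISIONS == 2), followed by the encoded match length.
 */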
/* Encode a list of matches and literals chosen by the parsing algorithm. */
static void
lzms_encode_nonempty_item_list(struct lzms_compressor *c,
			       struct lzms_optimum_node *end_node)
{
	/* Since we've stored at each node the item we took to arrive at that
	 * node, we can trace our chosen path in backwards order. However, for
	 * encoding we need to trace our chosen path in forwards order. To make
	 * this possible, the following loop moves the items from their
	 * destination nodes to their source nodes, which effectively reverses
	 * the path. (Think of it like reversing a singly-linked list.) */
	struct lzms_optimum_node *cur_node = end_node;
	struct lzms_item saved_item = cur_node->item;
	do {
		struct lzms_item item = saved_item;
		if (cur_node->num_extra_items > 0) {
			/* Handle an arrival via multi-item lookahead. */
			unsigned i = 0;
			struct lzms_optimum_node *orig_node = cur_node;
			do {
				cur_node -= item.length;
				cur_node->item = item;
				item = orig_node->extra_items[i];
			} while (++i != orig_node->num_extra_items);
		}
		cur_node -= item.length;
		saved_item = cur_node->item;
		cur_node->item = item;
	} while (cur_node != c->optimum_nodes);

	/* Now trace the chosen path in forwards order, encoding each item. */
	do {
		lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
		cur_node += cur_node->item.length;
	} while (cur_node != end_node);
}

static void
lzms_encode_item_list(struct lzms_compressor *c,
		      struct lzms_optimum_node *end_node)
{
	if (end_node != c->optimum_nodes)
		lzms_encode_nonempty_item_list(c, end_node);
}
/******************************************************************************
 *                             Cost evaluation                                *
 ******************************************************************************/
/*
 * If p is the predicted probability of the next bit being a 0, then the number
 * of bits required to encode a 0 bit using a binary range encoder is the real
 * number -log2(p), and the number of bits required to encode a 1 bit is the
 * real number -log2(1 - p). To avoid computing either of these expressions at
 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
 * probability to cost for each possible probability. Specifically, the array
 * indices are the numerators of the possible probabilities in LZMS, where the
 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
 * bit costs multiplied by 1 << COST_SHIFT and rounded to the nearest integer.
 * Furthermore, the values stored for 0% and 100% probabilities are equal to the
 * adjacent values, since these probabilities are not actually permitted. This
 * allows us to use the num_recent_zero_bits value from the
 * lzms_probability_entry as the array index without fixing up these two special
 * cases.
 */
static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
	384, 384, 320, 283, 256, 235, 219, 204,
	192, 181, 171, 163, 155, 147, 140, 134,
	128, 122, 117, 112, 107, 103, 99, 94,
	91, 87, 83, 80, 76, 73, 70, 67,
	64, 61, 58, 56, 53, 51, 48, 46,
	43, 41, 39, 37, 35, 33, 30, 29,
	27, 25, 23, 21, 19, 17, 16, 14,
	12, 11, 9, 8, 6, 4, 3, 1,
	1,
};
static inline void
check_cost_shift(void)
{
	/* lzms_bit_costs is hard-coded to the current COST_SHIFT. */
	BUILD_BUG_ON(COST_SHIFT != 6);
}
/* The code below can be used to regenerate the 'lzms_bit_costs' table above.
 * It is left disabled since the table is hard-coded. */
#if 0
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
	for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
		u32 prob = i;
		if (prob == 0)
			prob++;
		else if (prob == LZMS_PROBABILITY_DENOMINATOR)
			prob--;

		lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
					  (1 << COST_SHIFT));
	}
}
#endif
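/*
 * Spot check (illustrative, not from the original source): for i == 48, the
 * probability of a 0 bit is 48/64 = 0.75, so the cost is -log2(0.75) ~= 0.415
 * bits, and round(0.415 * (1 << COST_SHIFT)) = round(26.6) = 27, matching
 * lzms_bit_costs[48] above.
 */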
/* Return the cost to encode a 0 bit in the specified context. */
static inline u32
lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
{
	return lzms_bit_costs[probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a 1 bit in the specified context. */
static inline u32
lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
{
	return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
			      probs[state].num_recent_zero_bits];
}
/* Return the cost to encode a literal, including the main bit. */
static inline u32
lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
{
	return lzms_bit_0_cost(main_state, c->main_probs) +
	       ((u32)c->literal_lens[literal] << COST_SHIFT);
}
/* Update 'fast_length_cost_tab' to use the latest Huffman code. */
static void
lzms_update_fast_length_costs(struct lzms_compressor *c)
{
	int slot = -1;
	u32 cost = 0;

	for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
		if (len >= lzms_length_slot_base[slot + 1]) {
			slot++;
			cost = (u32)(c->length_lens[slot] +
				     lzms_extra_length_bits[slot]) << COST_SHIFT;
		}
		c->fast_length_cost_tab[len] = cost;
	}
}
/* Return the cost to encode the specified match length, which must not exceed
 * MAX_FAST_LENGTH. */
static inline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
	return c->fast_length_cost_tab[length];
}
/* Return the cost to encode the specified LZ match offset. */
static inline u32
lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, offset);
	u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
	return num_bits << COST_SHIFT;
}
/* Return the cost to encode the specified delta power and raw offset. */
static inline u32
lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
	u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
		       lzms_extra_offset_bits[slot];
	return num_bits << COST_SHIFT;
}
/******************************************************************************
 *                             Adaptive state                                 *
 ******************************************************************************/
static void
lzms_init_adaptive_state(struct lzms_adaptive_state *state)
{
	for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
		state->recent_lz_offsets[i] = i + 1;
	state->prev_lz_offset = 0;
	state->upcoming_lz_offset = 0;

	for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
		state->recent_delta_pairs[i] = i + 1;
	state->prev_delta_pair = 0;
	state->upcoming_delta_pair = 0;

	state->main_state = 0;
	state->match_state = 0;
	state->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		state->lz_rep_states[i] = 0;
	state->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		state->delta_rep_states[i] = 0;
}
/*
 * Update the LRU queues for match sources when advancing by one item.
 *
 * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
 * complex because:
 *	- there are separate queues for LZ and delta matches
 *	- updates to the queues are delayed by one encoded item (this prevents
 *	  sources from being bumped up to index 0 too early)
 */
static void
lzms_update_lru_queues(struct lzms_adaptive_state *state)
{
	if (state->prev_lz_offset != 0) {
		for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
			state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
		state->recent_lz_offsets[0] = state->prev_lz_offset;
	}
	state->prev_lz_offset = state->upcoming_lz_offset;

	if (state->prev_delta_pair != 0) {
		for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
			state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
		state->recent_delta_pairs[0] = state->prev_delta_pair;
	}
	state->prev_delta_pair = state->upcoming_delta_pair;
}
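/*
 * Illustrative trace of the delayed update (not from the original source):
 * suppose an explicit offset LZ match with offset 100 is chosen, which sets
 * upcoming_lz_offset = 100. The next call to lzms_update_lru_queues() only
 * moves 100 into prev_lz_offset; the call after that (i.e. one encoded item
 * later) finally pushes 100 onto the front of recent_lz_offsets, where it
 * becomes usable as a repeat offset.
 */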
static inline void
lzms_update_state(u8 *state_p, int bit, unsigned num_states)
{
	*state_p = ((*state_p << 1) | bit) % num_states;
}

static inline void
lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
{
	lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
}

static inline void
lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
{
	lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
}

static inline void
lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
{
	lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
}

static inline void
lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
	for (int i = 0; i < rep_idx; i++)
		lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);

	if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
		lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
}

static inline void
lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
{
	lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
}

static inline void
lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
{
	for (int i = 0; i < rep_idx; i++)
		lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);

	if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
		lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
}
/******************************************************************************
 *                              Matchfinding                                  *
 ******************************************************************************/

/* Note: this code just handles finding delta matches. The code for finding LZ
 * matches is elsewhere. */
/* Initialize the delta matchfinder for a new input buffer. */
static void
lzms_init_delta_matchfinder(struct lzms_compressor *c)
{
	/* Set all entries to use an invalid power, which will never match. */
	BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
	memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));

	/* Initialize the next hash code for each power. We can just use zeroes
	 * initially; it doesn't really matter. */
	for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
		c->next_delta_hashes[i] = 0;
}
/*
 * Compute a DELTA_HASH_ORDER-bit hash code for the first
 * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
 * delta context with the specified @span.
 */
static inline u32
lzms_delta_hash(const u8 *p, u32 span)
{
	/* A delta match has a certain span and an offset that is a multiple of
	 * that span. To reduce wasted space we use a single combined hash
	 * table for all spans and positions, but to minimize collisions we
	 * include in the hash code computation the span and the low-order bits
	 * of the current position. */

	BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
	u8 d0 = *(p + 0) - *(p + 0 - span);
	u8 d1 = *(p + 1) - *(p + 1 - span);
	u8 d2 = *(p + 2) - *(p + 2 - span);
	u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) |
		((u32)d2 << 16) | ((u32)d1 << 8) | d0;
	return lz_hash(v, DELTA_HASH_ORDER);
}
/*
 * Given a match between @in_next and @matchptr in a delta context with the
 * specified @span and having the initial @len, extend the match as far as
 * possible, up to a limit of @max_len.
 */
static inline u32
lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
			u32 len, u32 max_len, u32 span)
{
	while (len < max_len &&
	       (u8)(*(in_next + len) - *(in_next + len - span)) ==
	       (u8)(*(matchptr + len) - *(matchptr + len - span)))
		len++;
	return len;
}
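/*
 * Illustrative example (not from the original source): with span == 1, the
 * byte sequences {5, 7, 9, 11} and {20, 22, 24, 26} form a delta match,
 * because the position-wise differences ({2, 2, 2} for both) are identical
 * even though the raw bytes differ at every position.
 */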
static void
lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
				  const u8 *in_next, u32 count)
{
	u32 pos = in_next - c->in_buffer;
	if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
		return;
	do {
		/* Update the hash table for each power. */
		for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
			const u32 span = (u32)1 << power;
			if (unlikely(pos < span))
				continue;
			const u32 next_hash = lzms_delta_hash(in_next + 1, span);
			const u32 hash = c->next_delta_hashes[power];
			c->delta_hash_table[hash] =
				(power << DELTA_SOURCE_POWER_SHIFT) | pos;
			c->next_delta_hashes[power] = next_hash;
			prefetch(&c->delta_hash_table[next_hash]);
		}
	} while (in_next++, pos++, --count);
}
/*
 * Skip the next @count bytes (don't search for matches at them). @in_next
 * points to the first byte to skip. The return value is @in_next + count.
 */
static const u8 *
lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
{
	lcpit_matchfinder_skip_bytes(&c->mf, count);
	if (c->use_delta_matches)
		lzms_delta_matchfinder_skip_bytes(c, in_next, count);
	return in_next + count;
}
/******************************************************************************
 *                         "Near-optimal" parsing                             *
 ******************************************************************************/
/*
 * The main near-optimal parsing routine.
 *
 * Briefly, the algorithm does an approximate minimum-cost path search to find a
 * "near-optimal" sequence of matches and literals to output, based on the
 * current cost model. The algorithm steps forward, position by position (byte
 * by byte), and updates the minimum cost path to reach each later position that
 * can be reached using a match or literal from the current position. This is
 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
 * the graph edges are possible matches/literals to code, and the cost of each
 * edge is the estimated number of bits that will be required to output the
 * corresponding match or literal. But one difference is that we actually
 * compute the lowest-cost path in pieces, where each piece is terminated when
 * there are no choices to be made.
 *
 * The costs of literals and matches are estimated using the range encoder
 * states and the semi-adaptive Huffman codes. Except for range encoding
 * states, costs are assumed to be constant throughout a single run of the
 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This
 * introduces a source of non-optimality because the probabilities and Huffman
 * codes can change over this part of the data. And of course, there are
 * various other reasons why the result isn't optimal in terms of compression
 * ratio.
 */
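/*
 * Sketch of the core relaxation step (illustrative pseudocode in C syntax,
 * not the original code; the real loop below specializes this per item type):
 *
 *	for each item (literal or match) usable at cur_node:
 *		u32 cost = cur_node->cost + item_cost(item);
 *		if (cost < (cur_node + item.length)->cost) {
 *			(cur_node + item.length)->cost = cost;
 *			(cur_node + item.length)->item = item;
 *		}
 */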
static void
lzms_near_optimal_parse(struct lzms_compressor *c)
{
	const u8 *in_next = c->in_buffer;
	const u8 * const in_end = &c->in_buffer[c->in_nbytes];
	struct lzms_optimum_node *cur_node;
	struct lzms_optimum_node *end_node;

	/* Set initial length costs for lengths <= MAX_FAST_LENGTH. */
	lzms_update_fast_length_costs(c);

	/* Set up the initial adaptive state. */
	lzms_init_adaptive_state(&c->optimum_nodes[0].state);

begin:
	/* Start building a new list of items, which will correspond to the next
	 * piece of the overall minimum-cost path. */

	cur_node = c->optimum_nodes;
	cur_node->cost = 0;
	end_node = cur_node;

	if (in_next == in_end)
		return;

	/* The following loop runs once per byte in the input buffer, except in
	 * a few shortcut cases. */
	for (;;) {
		u32 num_matches;
		/* Repeat offset LZ matches */
		if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
			   in_end - in_next >= 2))
		{
			for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {

				/* Looking for a repeat offset LZ match at queue
				 * index @rep_idx */

				const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
				const u8 * const matchptr = in_next - offset;

				/* Check the first 2 bytes before entering the extension loop. */
				if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
					continue;

				/* Extend the match to its full length. */
				const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);

				/* Early out for long repeat offset LZ match */
				if (rep_len >= c->mf.nice_match_len) {

					in_next = lzms_skip_bytes(c, rep_len, in_next);

					lzms_encode_item_list(c, cur_node);
					lzms_encode_item(c, rep_len, rep_idx);

					c->optimum_nodes[0].state = cur_node->state;
					cur_node = &c->optimum_nodes[0];

					cur_node->state.upcoming_lz_offset =
						cur_node->state.recent_lz_offsets[rep_idx];
					cur_node->state.upcoming_delta_pair = 0;
					for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
						cur_node->state.recent_lz_offsets[i] =
							cur_node->state.recent_lz_offsets[i + 1];
					lzms_update_lru_queues(&cur_node->state);
					lzms_update_main_state(&cur_node->state, 1);
					lzms_update_match_state(&cur_node->state, 0);
					lzms_update_lz_state(&cur_node->state, 1);
					lzms_update_lz_rep_states(&cur_node->state, rep_idx);
					goto begin;
				}

				while (end_node < cur_node + rep_len)
					(++end_node)->cost = INFINITE_COST;

				u32 base_cost = cur_node->cost +
						lzms_bit_1_cost(cur_node->state.main_state,
								c->main_probs) +
						lzms_bit_0_cost(cur_node->state.match_state,
								c->match_probs) +
						lzms_bit_1_cost(cur_node->state.lz_state,
								c->lz_probs);

				for (int i = 0; i < rep_idx; i++)
					base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
								     c->lz_rep_probs[i]);

				if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
					base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
								     c->lz_rep_probs[rep_idx]);

				u32 len = 2;
				do {
					u32 cost = base_cost + lzms_fast_length_cost(c, len);
					if (cost < (cur_node + len)->cost) {
						(cur_node + len)->cost = cost;
						(cur_node + len)->item = (struct lzms_item) {
							.length = len,
							.source = rep_idx,
						};
						(cur_node + len)->num_extra_items = 0;
					}
				} while (++len <= rep_len);

				/* try LZ-rep + lit + LZ-rep0 */
				if (c->try_lzrep_lit_lzrep0 &&
				    in_end - (in_next + rep_len) >= 3 &&
				    load_u16_unaligned(in_next + rep_len + 1) ==
				    load_u16_unaligned(matchptr + rep_len + 1))
				{
					const u32 rep0_len = lz_extend(in_next + rep_len + 1,
								       matchptr + rep_len + 1,
								       2,
								       min(c->mf.nice_match_len,
									   in_end - (in_next + rep_len + 1)));

					unsigned main_state = cur_node->state.main_state;
					unsigned match_state = cur_node->state.match_state;
					unsigned lz_state = cur_node->state.lz_state;
					unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];

					/* update states for LZ-rep */
					main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
					match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
					lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
					lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
							LZMS_NUM_LZ_REP_PROBS;

					/* LZ-rep cost */
					u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);

					/* add literal cost */
					cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));

					/* update state for literal */
					main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;

					/* add LZ-rep0 cost */
					cost += lzms_bit_1_cost(main_state, c->main_probs) +
						lzms_bit_0_cost(match_state, c->match_probs) +
						lzms_bit_1_cost(lz_state, c->lz_probs) +
						lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) +
						lzms_fast_length_cost(c, rep0_len);

					const u32 total_len = rep_len + 1 + rep0_len;

					while (end_node < cur_node + total_len)
						(++end_node)->cost = INFINITE_COST;

					if (cost < (cur_node + total_len)->cost) {
						(cur_node + total_len)->cost = cost;
						(cur_node + total_len)->item = (struct lzms_item) {
							.length = rep0_len,
							.source = 0,
						};
						(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
							.length = 1,
							.source = *(in_next + rep_len),
						};
						(cur_node + total_len)->extra_items[1] = (struct lzms_item) {
							.length = rep_len,
							.source = rep_idx,
						};
						(cur_node + total_len)->num_extra_items = 2;
					}
				}
			}
		}
		/* Repeat offset delta matches */
		if (c->use_delta_matches &&
		    likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
			   (in_end - in_next >= 2)))
		{
			for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {

				/* Looking for a repeat offset delta match at
				 * queue index @rep_idx */

				const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
				const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
				const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
				const u32 span = (u32)1 << power;
				const u32 offset = raw_offset << power;
				const u8 * const matchptr = in_next - offset;

				/* Check the first 2 bytes before entering the
				 * extension loop. */
				if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
				     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
				    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
				     (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
					continue;

				/* Extend the match to its full length. */
				const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
									    2, in_end - in_next,
									    span);

				/* Early out for long repeat offset delta match */
				if (rep_len >= c->mf.nice_match_len) {

					in_next = lzms_skip_bytes(c, rep_len, in_next);

					lzms_encode_item_list(c, cur_node);
					lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);

					c->optimum_nodes[0].state = cur_node->state;
					cur_node = &c->optimum_nodes[0];

					cur_node->state.upcoming_delta_pair = pair;
					cur_node->state.upcoming_lz_offset = 0;
					for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
						cur_node->state.recent_delta_pairs[i] =
							cur_node->state.recent_delta_pairs[i + 1];
					lzms_update_lru_queues(&cur_node->state);
					lzms_update_main_state(&cur_node->state, 1);
					lzms_update_match_state(&cur_node->state, 1);
					lzms_update_delta_state(&cur_node->state, 1);
					lzms_update_delta_rep_states(&cur_node->state, rep_idx);
					goto begin;
				}

				while (end_node < cur_node + rep_len)
					(++end_node)->cost = INFINITE_COST;

				u32 base_cost = cur_node->cost +
						lzms_bit_1_cost(cur_node->state.main_state,
								c->main_probs) +
						lzms_bit_1_cost(cur_node->state.match_state,
								c->match_probs) +
						lzms_bit_1_cost(cur_node->state.delta_state,
								c->delta_probs);

				for (int i = 0; i < rep_idx; i++)
					base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
								     c->delta_rep_probs[i]);

				if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
					base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
								     c->delta_rep_probs[rep_idx]);

				u32 len = 2;
				do {
					u32 cost = base_cost + lzms_fast_length_cost(c, len);
					if (cost < (cur_node + len)->cost) {
						(cur_node + len)->cost = cost;
						(cur_node + len)->item = (struct lzms_item) {
							.length = len,
							.source = DELTA_SOURCE_TAG | rep_idx,
						};
						(cur_node + len)->num_extra_items = 0;
					}
				} while (++len <= rep_len);
			}
		}
		/* Explicit offset LZ matches */
		num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
		if (num_matches) {

			u32 best_len = c->matches[0].length;

			/* Early out for long explicit offset LZ match */
			if (best_len >= c->mf.nice_match_len) {

				const u32 offset = c->matches[0].offset;

				/* Extend the match as far as possible.
				 * This is necessary because the LCP-interval
				 * tree matchfinder only reports up to
				 * nice_match_len bytes. */
				best_len = lz_extend(in_next, in_next - offset,
						     best_len, in_end - in_next);

				in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);

				lzms_encode_item_list(c, cur_node);
				lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);

				c->optimum_nodes[0].state = cur_node->state;
				cur_node = &c->optimum_nodes[0];

				cur_node->state.upcoming_lz_offset = offset;
				cur_node->state.upcoming_delta_pair = 0;
				lzms_update_lru_queues(&cur_node->state);
				lzms_update_main_state(&cur_node->state, 1);
				lzms_update_match_state(&cur_node->state, 0);
				lzms_update_lz_state(&cur_node->state, 0);
				goto begin;
			}

			while (end_node < cur_node + best_len)
				(++end_node)->cost = INFINITE_COST;

			u32 base_cost = cur_node->cost +
					lzms_bit_1_cost(cur_node->state.main_state,
							c->main_probs) +
					lzms_bit_0_cost(cur_node->state.match_state,
							c->match_probs) +
					lzms_bit_0_cost(cur_node->state.lz_state,
							c->lz_probs);

			if (c->try_lzmatch_lit_lzrep0 &&
			    likely(in_end - (in_next + c->matches[0].length) >= 3))
			{
				/* try LZ-match + lit + LZ-rep0 */

				u32 l = 2;
				u32 i = num_matches - 1;
				do {
					const u32 len = c->matches[i].length;
					const u32 offset = c->matches[i].offset;
					const u32 position_cost = base_cost +
								  lzms_lz_offset_cost(c, offset);
					do {
						u32 cost = position_cost + lzms_fast_length_cost(c, l);
						if (cost < (cur_node + l)->cost) {
							(cur_node + l)->cost = cost;
							(cur_node + l)->item = (struct lzms_item) {
								.length = l,
								.source = offset + (LZMS_NUM_LZ_REPS - 1),
							};
							(cur_node + l)->num_extra_items = 0;
						}
					} while (++l <= len);

					const u8 * const matchptr = in_next - offset;
					if (load_u16_unaligned(matchptr + len + 1) !=
					    load_u16_unaligned(in_next + len + 1))
						continue;

					const u32 rep0_len = lz_extend(in_next + len + 1,
								       matchptr + len + 1,
								       2,
								       min(c->mf.nice_match_len,
									   in_end - (in_next + len + 1)));

					unsigned main_state = cur_node->state.main_state;
					unsigned match_state = cur_node->state.match_state;
					unsigned lz_state = cur_node->state.lz_state;

					/* update states for LZ-match */
					main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
					match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
					lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;

					/* LZ-match cost */
					u32 cost = position_cost + lzms_fast_length_cost(c, len);

					/* add literal cost */
					cost += lzms_literal_cost(c, main_state, *(in_next + len));

					/* update state for literal */
					main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;

					/* add LZ-rep0 cost */
					cost += lzms_bit_1_cost(main_state, c->main_probs) +
						lzms_bit_0_cost(match_state, c->match_probs) +
						lzms_bit_1_cost(lz_state, c->lz_probs) +
						lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
								c->lz_rep_probs[0]) +
						lzms_fast_length_cost(c, rep0_len);

					const u32 total_len = len + 1 + rep0_len;

					while (end_node < cur_node + total_len)
						(++end_node)->cost = INFINITE_COST;

					if (cost < (cur_node + total_len)->cost) {
						(cur_node + total_len)->cost = cost;
						(cur_node + total_len)->item = (struct lzms_item) {
							.length = rep0_len,
							.source = 0,
						};
						(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
							.length = 1,
							.source = *(in_next + len),
						};
						(cur_node + total_len)->extra_items[1] = (struct lzms_item) {
							.length = len,
							.source = offset + LZMS_NUM_LZ_REPS - 1,
						};
						(cur_node + total_len)->num_extra_items = 2;
					}
				} while (i--);
			} else {
				u32 l = 2;
				u32 i = num_matches - 1;
				do {
					u32 position_cost = base_cost +
							    lzms_lz_offset_cost(c, c->matches[i].offset);
					do {
						u32 cost = position_cost + lzms_fast_length_cost(c, l);
						if (cost < (cur_node + l)->cost) {
							(cur_node + l)->cost = cost;
							(cur_node + l)->item = (struct lzms_item) {
								.length = l,
								.source = c->matches[i].offset +
									  (LZMS_NUM_LZ_REPS - 1),
							};
							(cur_node + l)->num_extra_items = 0;
						}
					} while (++l <= c->matches[i].length);
				} while (i--);
			}
		}
		/* Explicit offset delta matches */
		if (c->use_delta_matches &&
		    likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
		{
			const u32 pos = in_next - c->in_buffer;

			/* Consider each possible power (log2 of span) */
			BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS);
			for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {

				const u32 span = (u32)1 << power;

				if (unlikely(pos < span))
					continue;

				const u32 next_hash = lzms_delta_hash(in_next + 1, span);
				const u32 hash = c->next_delta_hashes[power];
				const u32 cur_match = c->delta_hash_table[hash];

				c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
				c->next_delta_hashes[power] = next_hash;
				prefetch(&c->delta_hash_table[next_hash]);

				if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
					continue;

				const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);

				/* The offset must be a multiple of span. */
				if (offset & (span - 1))
					continue;

				const u8 * const matchptr = in_next - offset;

				/* Check the first 3 bytes before entering the
				 * extension loop. */
				BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3);
				if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
				     (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
				    ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
				     (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
				    ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
				     (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
					continue;

				/* Extend the delta match to its full length. */
				const u32 len = lzms_extend_delta_match(in_next,
									matchptr,
									NBYTES_HASHED_FOR_DELTA,
									in_end - in_next,
									span);

				const u32 raw_offset = offset >> power;
				const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
						 raw_offset;
				const u32 source = DELTA_SOURCE_TAG |
						   (pair + LZMS_NUM_DELTA_REPS - 1);

				/* Early out for long explicit offset delta match */
				if (len >= c->mf.nice_match_len) {

					in_next = lzms_skip_bytes(c, len - 1, in_next + 1);

					lzms_encode_item_list(c, cur_node);
					lzms_encode_item(c, len, source);

					c->optimum_nodes[0].state = cur_node->state;
					cur_node = &c->optimum_nodes[0];

					cur_node->state.upcoming_lz_offset = 0;
					cur_node->state.upcoming_delta_pair = pair;
					lzms_update_lru_queues(&cur_node->state);
					lzms_update_main_state(&cur_node->state, 1);
					lzms_update_match_state(&cur_node->state, 1);
					lzms_update_delta_state(&cur_node->state, 0);
					goto begin;
				}

				while (end_node < cur_node + len)
					(++end_node)->cost = INFINITE_COST;

				u32 base_cost = cur_node->cost +
						lzms_bit_1_cost(cur_node->state.main_state,
								c->main_probs) +
						lzms_bit_1_cost(cur_node->state.match_state,
								c->match_probs) +
						lzms_bit_0_cost(cur_node->state.delta_state,
								c->delta_probs) +
						lzms_delta_source_cost(c, power, raw_offset);

				u32 l = NBYTES_HASHED_FOR_DELTA;
				do {
					u32 cost = base_cost + lzms_fast_length_cost(c, l);
					if (cost < (cur_node + l)->cost) {
						(cur_node + l)->cost = cost;
						(cur_node + l)->item = (struct lzms_item) {
							.length = l,
							.source = source,
						};
						(cur_node + l)->num_extra_items = 0;
					}
				} while (++l <= len);
			}
		}
1816 if (end_node < cur_node + 1)
1817 (++end_node)->cost = INFINITE_COST;
1818 const u32 cur_and_lit_cost = cur_node->cost +
1819 lzms_literal_cost(c, cur_node->state.main_state,
1821 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1822 (cur_node + 1)->cost = cur_and_lit_cost;
1823 (cur_node + 1)->item = (struct lzms_item) {
1827 (cur_node + 1)->num_extra_items = 0;
1828 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1829 /* try lit + LZ-rep0 */
1831 (cur_node->state.prev_lz_offset) ?
1832 cur_node->state.prev_lz_offset :
1833 cur_node->state.recent_lz_offsets[0];
1835 if (load_u16_unaligned(in_next + 1) ==
1836 load_u16_unaligned(in_next + 1 - offset))
1838 const u32 rep0_len = lz_extend(in_next + 1,
1839 in_next + 1 - offset,
1841 min(in_end - (in_next + 1),
1842 c->mf.nice_match_len));
				unsigned main_state = cur_node->state.main_state;

				/* Update main_state after the literal */
				main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;

				/* Add the cost of the LZ-rep0 match */
				const u32 cost = cur_and_lit_cost +
						 lzms_bit_1_cost(main_state, c->main_probs) +
						 lzms_bit_0_cost(cur_node->state.match_state,
								 c->match_probs) +
						 lzms_bit_1_cost(cur_node->state.lz_state,
								 c->lz_probs) +
						 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
								 c->lz_rep_probs[0]) +
						 lzms_fast_length_cost(c, rep0_len);

				const u32 total_len = 1 + rep0_len;

				while (end_node < cur_node + total_len)
					(++end_node)->cost = INFINITE_COST;

				if (cost < (cur_node + total_len)->cost) {
					(cur_node + total_len)->cost = cost;
					(cur_node + total_len)->item = (struct lzms_item) {
						.length = rep0_len,
						.source = 0,	/* LZ-rep0 */
					};
					(cur_node + total_len)->extra_items[0] = (struct lzms_item) {
						.length = 1,
						.source = *in_next,
					};
					(cur_node + total_len)->num_extra_items = 1;
				}
			}
		}

		/* Advance to the next position. */
		in_next++;
		cur_node++;

		/* The lowest-cost path to the current position is now known.
		 * Finalize the adaptive state that results from taking this
		 * lowest-cost path. */
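		/*
		 * Added note: each node stores the final item of the cheapest
		 * path to it, plus up to two 'extra' items for multi-item
		 * choices such as literal + LZ-rep0.  The code below first
		 * walks those items backwards to find the node this path came
		 * from, copies that node's state, then replays the items in
		 * forward order to recompute the adaptive state.
		 */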
		struct lzms_item item_to_take = cur_node->item;
		struct lzms_optimum_node *source_node = cur_node - (item_to_take.length);
		int next_item_idx = -1;
		for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
			item_to_take = cur_node->extra_items[i];
			source_node -= item_to_take.length;
			next_item_idx++;
		}
		cur_node->state = source_node->state;
		for (;;) {
			const u32 length = item_to_take.length;
			u32 source = item_to_take.source;

			/* Offsets and delta pairs enter the LRU queues with a
			 * one-item delay, so stage them in the 'upcoming'
			 * fields; lzms_update_lru_queues() commits them. */
			cur_node->state.upcoming_lz_offset = 0;
			cur_node->state.upcoming_delta_pair = 0;
			if (length > 1) {
				/* Match */

				lzms_update_main_state(&cur_node->state, 1);

				if (source & DELTA_SOURCE_TAG) {
					/* Delta match */

					lzms_update_match_state(&cur_node->state, 1);
					source &= ~DELTA_SOURCE_TAG;

					if (source >= LZMS_NUM_DELTA_REPS) {
						/* Explicit offset delta match */
						u32 pair = source - (LZMS_NUM_DELTA_REPS - 1);
						lzms_update_delta_state(&cur_node->state, 0);
						cur_node->state.upcoming_delta_pair = pair;
					} else {
						/* Repeat offset delta match */
						int rep_idx = source;

						lzms_update_delta_state(&cur_node->state, 1);
						lzms_update_delta_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_delta_pair =
							cur_node->state.recent_delta_pairs[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
							cur_node->state.recent_delta_pairs[i] =
								cur_node->state.recent_delta_pairs[i + 1];
					}
				} else {
					/* LZ match */

					lzms_update_match_state(&cur_node->state, 0);

					if (source >= LZMS_NUM_LZ_REPS) {
						/* Explicit offset LZ match */
						lzms_update_lz_state(&cur_node->state, 0);
						cur_node->state.upcoming_lz_offset =
							source - (LZMS_NUM_LZ_REPS - 1);
					} else {
						/* Repeat offset LZ match */
						int rep_idx = source;

						lzms_update_lz_state(&cur_node->state, 1);
						lzms_update_lz_rep_states(&cur_node->state, rep_idx);

						cur_node->state.upcoming_lz_offset =
							cur_node->state.recent_lz_offsets[rep_idx];

						for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
							cur_node->state.recent_lz_offsets[i] =
								cur_node->state.recent_lz_offsets[i + 1];
					}
				}
			} else {
				/* Literal */
				lzms_update_main_state(&cur_node->state, 0);
			}

			lzms_update_lru_queues(&cur_node->state);

			if (next_item_idx < 0)
				break;
			if (next_item_idx == 0)
				item_to_take = cur_node->item;
			else
				item_to_take = cur_node->extra_items[next_item_idx - 1];
			--next_item_idx;
		}

		/*
		 * This loop will terminate when either of the following
		 * conditions is true:
		 *
		 * (1) cur_node == end_node
		 *
		 *	There are no paths that extend beyond the current
		 *	position.  In this case, any path to a later position
		 *	must pass through the current position, so we can go
		 *	ahead and choose the list of items that led to this
		 *	position.
		 *
		 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
		 *
		 *	This bounds the number of times the algorithm can step
		 *	forward before it is guaranteed to start choosing items.
		 *	This limits the memory usage.  It also guarantees that
		 *	the parser will not go too long without updating the
		 *	probability tables.
		 *
		 * Note: no check for end-of-buffer is needed because
		 * end-of-buffer will trigger condition (1).
		 */
		if (cur_node == end_node ||
		    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
		{
			lzms_encode_nonempty_item_list(c, cur_node);
			c->optimum_nodes[0].state = cur_node->state;
			goto begin;
		}
	} while (in_next != in_end);
}

static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
	c->main_state = 0;
	c->match_state = 0;
	c->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		c->lz_rep_states[i] = 0;
	c->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		c->delta_rep_states[i] = 0;

	lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
	lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
	lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS);
	lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS);
}

static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
	lzms_init_huffman_code(&c->literal_rebuild_info,
			       LZMS_NUM_LITERAL_SYMS,
			       LZMS_LITERAL_CODE_REBUILD_FREQ,
			       c->literal_codewords,
			       c->literal_lens,
			       c->literal_freqs);

	lzms_init_huffman_code(&c->lz_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
			       c->lz_offset_codewords,
			       c->lz_offset_lens,
			       c->lz_offset_freqs);

	lzms_init_huffman_code(&c->length_rebuild_info,
			       LZMS_NUM_LENGTH_SYMS,
			       LZMS_LENGTH_CODE_REBUILD_FREQ,
			       c->length_codewords,
			       c->length_lens,
			       c->length_freqs);

	lzms_init_huffman_code(&c->delta_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
			       c->delta_offset_codewords,
			       c->delta_offset_lens,
			       c->delta_offset_freqs);

	lzms_init_huffman_code(&c->delta_power_rebuild_info,
			       LZMS_NUM_DELTA_POWER_SYMS,
			       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
			       c->delta_power_codewords,
			       c->delta_power_lens,
			       c->delta_power_freqs);
}

/*
 * Flush the output streams, prepare the final compressed data, and return its
 * size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit in
 * the available space.
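 *
 * Added note (illustrative, not from the original source): the range encoder
 * writes 16-bit units forwards from the start of the output buffer while the
 * bitstream writes backwards from the end:
 *
 *	[ range encoder --> ...... free space ...... <-- bitstream ]
 *
 * Compression fails only if the two streams meet; otherwise the gap between
 * them is closed with a memmove below.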
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
	size_t num_forwards_units;
	size_t num_backwards_units;

	/* Flush both the forwards and backwards streams, and make sure they
	 * didn't cross each other and start overwriting each other's data. */
	if (!lzms_output_bitstream_flush(&c->os))
		return 0;

	if (!lzms_range_encoder_flush(&c->rc))
		return 0;

	if (c->rc.next > c->os.next)
		return 0;

	/* Now the compressed buffer contains the data output by the forwards
	 * bitstream, then empty space, then data output by the backwards
	 * bitstream.  Move the data output by the backwards bitstream to be
	 * adjacent to the data output by the forwards bitstream, and calculate
	 * the compressed size that this results in. */
	num_forwards_units = c->rc.next - c->rc.begin;
	num_backwards_units = c->rc.end - c->os.next;

	memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));

	return (num_forwards_units + num_backwards_units) * sizeof(le16);
}

static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level)
{
	u64 size = 0;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return 0;

	/* lzms_compressor itself */
	size += sizeof(struct lzms_compressor);

	/* in_buffer */
	size += max_bufsize;

	/* matchfinder */
	size += lcpit_matchfinder_get_needed_memory(max_bufsize);

	return size;
}

static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
		       void **c_ret)
{
	struct lzms_compressor *c;
	u32 nice_match_len;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return WIMLIB_ERR_INVALID_PARAM;

	c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
	if (!c)
		goto oom0;

	/* Scale nice_match_len with the compression level.  But to allow an
	 * optimization for length cost calculations, don't allow nice_match_len
	 * to exceed MAX_FAST_LENGTH. */
	nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
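
	/* Worked example of the formula above: level 50 gives
	 * nice_match_len = (50 * 63) / 50 = 63, level 100 gives 126, and
	 * levels of about 203 and up saturate at MAX_FAST_LENGTH (255). */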

	c->use_delta_matches = (compression_level >= 35);
	c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
	c->try_lit_lzrep0 = (compression_level >= 60);
	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);

	c->in_buffer = MALLOC(max_bufsize);
	if (!c->in_buffer)
		goto oom1;

	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
		goto oom2;

	lzms_init_fast_length_slot_tab(c);
	lzms_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom2:
	FREE(c->in_buffer);
oom1:
	ALIGNED_FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}

static size_t
lzms_compress(const void *in, size_t in_nbytes,
	      void *out, size_t out_nbytes_avail, void *_c)
{
	struct lzms_compressor *c = _c;

	/* Don't bother trying to compress extremely small inputs. */
	if (in_nbytes < 4)
		return 0;

	/* Copy the input data into the internal buffer and preprocess it. */
	memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
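
	/* Added note: lzms_x86_filter() is the LZMS x86 machine code
	 * preprocessor.  It rewrites the targets of certain x86 CALL/JMP
	 * instructions into a more compressible form; passing 'false' selects
	 * the forward (compression-side) transform, and running the same
	 * filter with 'true' undoes it. */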

	/* Prepare the matchfinders. */
	lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
	if (c->use_delta_matches)
		lzms_init_delta_matchfinder(c);

	/* Initialize the encoder structures. */
	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
	lzms_init_states_and_probabilities(c);
	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(in_nbytes));

	/* The main loop: parse and encode. */
	lzms_near_optimal_parse(c);

	/* Return the compressed data size, or 0 if the data did not fit. */
	return lzms_finalize(c);
}

static void
lzms_free_compressor(void *_c)
{
	struct lzms_compressor *c = _c;

	FREE(c->in_buffer);
	lcpit_matchfinder_destroy(&c->mf);
	ALIGNED_FREE(c);
}

const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory = lzms_get_needed_memory,
	.create_compressor = lzms_create_compressor,
	.compress = lzms_compress,
	.free_compressor = lzms_free_compressor,
};
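
/*
 * Added usage sketch (illustrative only; the real calling code lives in
 * wimlib's generic compressor layer, and 'bufsize'/'level' here are
 * hypothetical):
 *
 *	void *c;
 *	if (lzms_compressor_ops.create_compressor(bufsize, level, &c) == 0) {
 *		size_t csize = lzms_compressor_ops.compress(in, in_size,
 *							    out, out_size, c);
 *		// csize == 0 means the output did not fit in out_size bytes
 *		lzms_compressor_ops.free_compressor(c);
 *	}
 */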