/*
 * lzms_compress.c
 *
 * A compressor for the LZMS compression format.
 */

/*
 * Copyright (C) 2013, 2014, 2015 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <limits.h>
#include <string.h>

#include "wimlib/compress_common.h"
#include "wimlib/compressor_ops.h"
#include "wimlib/error.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash.h"
#include "wimlib/lzms_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"

/*
 * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
 * looked up directly in 'fast_length_slot_tab' and the length cost can be
 * looked up directly in 'fast_length_cost_tab'.
 *
 * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
 * the longest match found is shorter than 'nice_match_len', then it must also
 * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
 * of length costs using 'fast_length_cost_tab' without having to keep checking
 * whether the length exceeds MAX_FAST_LENGTH or not.
 */
#define MAX_FAST_LENGTH		255

/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
 * step forward before forcing the pending items to be encoded.  If this value
 * is increased, then there will be fewer forced flushes, but the probability
 * entries and Huffman codes will be more likely to become outdated.  */
#define NUM_OPTIM_NODES		2048

/* COST_SHIFT is a scaling factor that makes it possible to consider fractional
 * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
#define COST_SHIFT		6

/* Length of the hash table for finding delta matches  */
#define DELTA_HASH_ORDER	17
#define DELTA_HASH_LENGTH	((u32)1 << DELTA_HASH_ORDER)

/* The number of bytes to hash when finding delta matches; also taken to be the
 * minimum length of an explicit offset delta match  */
#define NBYTES_HASHED_FOR_DELTA	3

/* The number of delta match powers to consider (must be <=
 * LZMS_NUM_DELTA_POWER_SYMS)  */
#define NUM_POWERS_TO_CONSIDER	6

/* This structure tracks the state of writing bits as a series of 16-bit coding
 * units, starting at the end of the output buffer and proceeding backwards.  */
struct lzms_output_bitstream {

	/* Bits that haven't yet been written to the output buffer  */
	u64 bitbuf;

	/* Number of bits currently held in @bitbuf  */
	unsigned bitcount;

	/* Pointer to one past the next position in the output buffer at which
	 * to output a 16-bit coding unit  */
	le16 *next;

	/* Pointer to the beginning of the output buffer (this is the "end"
	 * when writing backwards!)  */
	le16 *begin;
};

/* This structure tracks the state of range encoding and its output, which
 * starts at the beginning of the output buffer and proceeds forwards.  */
struct lzms_range_encoder {

	/* The lower boundary of the current range.  Logically, this is a
	 * 33-bit integer whose high bit is needed to detect carries.
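	 * For example, if an addition causes bit 32 to become set, then a
	 * carry must be propagated into the coding units that have already
	 * been buffered (@cache and any pending 0xffff units); see
	 * lzms_range_encoder_shift_low().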
*/ u64 lower_bound; /* The size of the current range */ u32 range_size; /* The next 16-bit coding unit to output */ u16 cache; /* The number of 16-bit coding units whose output has been delayed due * to possible carrying. The first such coding unit is @cache; all * subsequent such coding units are 0xffff. */ u32 cache_size; /* Pointer to the beginning of the output buffer */ le16 *begin; /* Pointer to the position in the output buffer at which the next coding * unit must be written */ le16 *next; /* Pointer to just past the end of the output buffer */ le16 *end; }; /* Bookkeeping information for an adaptive Huffman code */ struct lzms_huffman_rebuild_info { /* The remaining number of symbols to encode until this code must be * rebuilt */ unsigned num_syms_until_rebuild; /* The number of symbols in this code */ unsigned num_syms; /* The rebuild frequency of this code, in symbols */ unsigned rebuild_freq; /* The Huffman codeword of each symbol in this code */ u32 *codewords; /* The length of each Huffman codeword, in bits */ u8 *lens; /* The frequency of each symbol in this code */ u32 *freqs; }; /* * The compressor-internal representation of a match or literal. * * Literals have length=1; matches have length > 1. (We disallow matches of * length 1, even though this is a valid length in LZMS.) * * The source is encoded as follows: * * - Literals: the literal byte itself * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1) * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are * the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1) * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is * the index of the (power, raw_offset) pair in recent_delta_pairs */ struct lzms_item { u32 length; u32 source; }; #define DELTA_SOURCE_TAG ((u32)1 << 31) #define DELTA_SOURCE_POWER_SHIFT 28 #define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1) static inline void check_that_powers_fit_in_bitfield(void) { BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT))); } /* A stripped-down version of the adaptive state in LZMS which excludes the * probability entries and Huffman codes */ struct lzms_adaptive_state { /* Recent offsets for LZ matches */ u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1]; u32 prev_lz_offset; /* 0 means none */ u32 upcoming_lz_offset; /* 0 means none */ /* Recent (power, raw offset) pairs for delta matches. * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw * offset, and the high bits are the power. */ u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1]; u32 prev_delta_pair; /* 0 means none */ u32 upcoming_delta_pair; /* 0 means none */ /* States for predicting the probabilities of item types */ u8 main_state; u8 match_state; u8 lz_state; u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; u8 delta_state; u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; } _aligned_attribute(64); /* * This structure represents a byte position in the preprocessed input data and * a node in the graph of possible match/literal choices. * * Logically, each incoming edge to this node is labeled with a literal or a * match that can be taken to reach this position from an earlier position; and * each outgoing edge from this node is labeled with a literal or a match that * can be taken to advance from this position to a later position. 
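 *
 * For example, if a length 3 match can be coded at position 10, then there is
 * an edge from the node for position 10 to the node for position 13, and the
 * weight of that edge is the estimated number of bits needed to encode that
 * match.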
 */
struct lzms_optimum_node {

	/*
	 * The cost of the lowest-cost path that has been found to reach this
	 * position.  This can change as progressively lower cost paths are
	 * found to reach this position.
	 */
	u32 cost;
#define INFINITE_COST UINT32_MAX

	/*
	 * @item is the last item that was taken to reach this position with
	 * the stored @cost.  This can change as progressively lower cost
	 * paths are found to reach this position.
	 *
	 * In some cases we look ahead more than one item.  If we looked ahead n
	 * items to reach this position, then @item is the last item taken,
	 * @extra_items contains the other items ordered from second-to-last to
	 * first, and @num_extra_items is n - 1.
	 */
	unsigned num_extra_items;
	struct lzms_item item;
	struct lzms_item extra_items[2];

	/*
	 * The adaptive state that exists at this position.  This is filled in
	 * lazily, only after the minimum-cost path to this position is found.
	 *
	 * Note: the way the algorithm handles this adaptive state in the
	 * "minimum-cost" parse is actually only an approximation.  It's
	 * possible for the globally optimal, minimum cost path to contain a
	 * prefix, ending at a position, where that path prefix is *not* the
	 * minimum cost path to that position.  This can happen if such a path
	 * prefix results in a different adaptive state which results in lower
	 * costs later.  Although the algorithm does do some heuristic
	 * multi-item lookaheads, it does not solve this problem in general.
	 *
	 * Note: this adaptive state structure also does not include the
	 * probability entries or current Huffman codewords.  Those aren't
	 * maintained per-position and are only updated occasionally.
	 */
	struct lzms_adaptive_state state;
} _aligned_attribute(64);

/* The main compressor structure  */
struct lzms_compressor {

	/* The matchfinder for LZ matches  */
	struct lcpit_matchfinder mf;

	/* The preprocessed buffer of data being compressed  */
	u8 *in_buffer;

	/* The number of bytes of data to be compressed, which is the number of
	 * bytes of data in @in_buffer that are actually valid  */
	size_t in_nbytes;

	/*
	 * Boolean flags to enable consideration of various types of multi-step
	 * operations during parsing.
	 *
	 * Among other cases, multi-step operations can help with gaps where two
	 * matches are separated by a non-matching byte.
	 *
	 * This idea is borrowed from Igor Pavlov's LZMA encoder.
	 */
	bool try_lit_lzrep0;
	bool try_lzrep_lit_lzrep0;
	bool try_lzmatch_lit_lzrep0;

	/*
	 * If true, the compressor can use delta matches.  This slows down
	 * compression.  It improves the compression ratio greatly, slightly, or
	 * not at all, depending on the input data.
	 */
	bool use_delta_matches;

	/* If true, the compressor need not preserve the input buffer if it
	 * compresses the data successfully.  */
	bool destructive;

	/* 'last_target_usages' is a large array that is only needed for
	 * preprocessing, so it is in union with fields that don't need to be
	 * initialized until after preprocessing.
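	 * (Preprocessing is the lzms_x86_filter() pass, which runs before any
	 * of the encoding fields below are touched, so the overlap is safe.)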
*/ union { struct { /* Temporary space to store matches found by the LZ matchfinder */ struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1]; /* Hash table for finding delta matches */ u32 delta_hash_table[DELTA_HASH_LENGTH]; /* For each delta power, the hash code for the next sequence */ u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER]; /* The per-byte graph nodes for near-optimal parsing */ struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH]; /* Table: length => current cost for small match lengths */ u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1]; /* Range encoder which outputs to the beginning of the compressed data * buffer, proceeding forwards */ struct lzms_range_encoder rc; /* Bitstream which outputs to the end of the compressed data buffer, * proceeding backwards */ struct lzms_output_bitstream os; /* States and probability entries for item type disambiguation */ unsigned main_state; unsigned match_state; unsigned lz_state; unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; unsigned delta_state; unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS]; struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS]; struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS]; struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS] [LZMS_NUM_LZ_REP_PROBS]; struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS]; struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS] [LZMS_NUM_DELTA_REP_PROBS]; /* Huffman codes */ struct lzms_huffman_rebuild_info literal_rebuild_info; u32 literal_codewords[LZMS_NUM_LITERAL_SYMS]; u8 literal_lens[LZMS_NUM_LITERAL_SYMS]; u32 literal_freqs[LZMS_NUM_LITERAL_SYMS]; struct lzms_huffman_rebuild_info lz_offset_rebuild_info; u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS]; u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS]; u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; struct lzms_huffman_rebuild_info length_rebuild_info; u32 length_codewords[LZMS_NUM_LENGTH_SYMS]; u8 length_lens[LZMS_NUM_LENGTH_SYMS]; u32 length_freqs[LZMS_NUM_LENGTH_SYMS]; struct lzms_huffman_rebuild_info delta_offset_rebuild_info; u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS]; u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS]; u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; struct lzms_huffman_rebuild_info delta_power_rebuild_info; u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS]; u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS]; u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS]; }; /* struct */ s32 last_target_usages[65536]; }; /* union */ /* Table: length => length slot for small match lengths */ u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1]; /* Tables for mapping offsets to offset slots */ /* slots [0, 167); 0 <= num_extra_bits <= 10 */ u8 offset_slot_tab_1[0xe4a5]; /* slots [167, 427); 11 <= num_extra_bits <= 15 */ u16 offset_slot_tab_2[0x3d0000 >> 11]; /* slots [427, 799); 16 <= num_extra_bits */ u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16]; }; /****************************************************************************** * Offset and length slot acceleration * ******************************************************************************/ /* Generate the acceleration table for length slots. 
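 * Afterwards, fast_length_slot_tab[len] gives the same result as
 * lzms_get_length_slot(len) for every len <= MAX_FAST_LENGTH, but with a
 * single table lookup.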
*/ static void lzms_init_fast_length_slot_tab(struct lzms_compressor *c) { unsigned slot = 0; for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) { if (len >= lzms_length_slot_base[slot + 1]) slot++; c->fast_length_slot_tab[len] = slot; } } /* Generate the acceleration tables for offset slots. */ static void lzms_init_offset_slot_tabs(struct lzms_compressor *c) { u32 offset; unsigned slot = 0; /* slots [0, 167); 0 <= num_extra_bits <= 10 */ for (offset = 1; offset < 0xe4a5; offset++) { if (offset >= lzms_offset_slot_base[slot + 1]) slot++; c->offset_slot_tab_1[offset] = slot; } /* slots [167, 427); 11 <= num_extra_bits <= 15 */ for (; offset < 0x3de4a5; offset += (u32)1 << 11) { if (offset >= lzms_offset_slot_base[slot + 1]) slot++; c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot; } /* slots [427, 799); 16 <= num_extra_bits */ for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) { if (offset >= lzms_offset_slot_base[slot + 1]) slot++; c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot; } } /* * Return the length slot for the specified match length, using the compressor's * acceleration table if the length is small enough. */ static inline unsigned lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length) { if (likely(length <= MAX_FAST_LENGTH)) return c->fast_length_slot_tab[length]; return lzms_get_length_slot(length); } /* * Return the offset slot for the specified match offset, using the compressor's * acceleration tables to speed up the mapping. */ static inline unsigned lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset) { if (offset < 0xe4a5) return c->offset_slot_tab_1[offset]; offset -= 0xe4a5; if (offset < 0x3d0000) return c->offset_slot_tab_2[offset >> 11]; return c->offset_slot_tab_3[offset >> 16]; } /****************************************************************************** * Range encoding * ******************************************************************************/ /* * Initialize the range encoder @rc to write forwards to the specified buffer * @out that is @count 16-bit integers long. */ static void lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count) { rc->lower_bound = 0; rc->range_size = 0xffffffff; rc->cache = 0; rc->cache_size = 1; rc->begin = out; rc->next = out - 1; rc->end = out + count; } /* * Attempt to flush bits from the range encoder. * * The basic idea is that we're writing bits from @rc->lower_bound to the * output. However, due to carrying, the writing of coding units with the * maximum value, as well as one prior coding unit, must be delayed until it is * determined whether a carry is needed. * * This is based on the public domain code for LZMA written by Igor Pavlov, but * with the following differences: * * - In LZMS, 16-bit coding units are required rather than 8-bit. * * - In LZMS, the first coding unit is not ignored by the decompressor, so * the encoder cannot output a dummy value to that position. */ static void lzms_range_encoder_shift_low(struct lzms_range_encoder *rc) { if ((u32)(rc->lower_bound) < 0xffff0000 || (u32)(rc->lower_bound >> 32) != 0) { /* Carry not needed (rc->lower_bound < 0xffff0000), or carry * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit * is 1). 
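	 * Either way, it is now safe to output the delayed coding units: the
	 * carry bit (bit 32 of lower_bound) is simply added to @cache and to
	 * each pending 0xffff unit, where 0xffff + 1 wraps around to 0x0000
	 * and thereby propagates the carry.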
*/ do { if (likely(rc->next >= rc->begin)) { if (rc->next != rc->end) { put_unaligned_u16_le(rc->cache + (u16)(rc->lower_bound >> 32), rc->next++); } } else { rc->next++; } rc->cache = 0xffff; } while (--rc->cache_size != 0); rc->cache = (rc->lower_bound >> 16) & 0xffff; } ++rc->cache_size; rc->lower_bound = (rc->lower_bound & 0xffff) << 16; } static bool lzms_range_encoder_flush(struct lzms_range_encoder *rc) { for (int i = 0; i < 4; i++) lzms_range_encoder_shift_low(rc); return rc->next != rc->end; } /* * Encode the next bit using the range encoder. * * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next * bit is 0 rather than 1. */ static inline void lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob) { /* Normalize if needed. */ if (rc->range_size <= 0xffff) { rc->range_size <<= 16; lzms_range_encoder_shift_low(rc); } u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob; if (bit == 0) { rc->range_size = bound; } else { rc->lower_bound += bound; rc->range_size -= bound; } } /* * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and * updating the state and its corresponding probability entry. */ static inline void lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states, struct lzms_probability_entry *probs, struct lzms_range_encoder *rc) { struct lzms_probability_entry *prob_entry; u32 prob; /* Load the probability entry for the current state. */ prob_entry = &probs[*state_p]; /* Update the state based on the next bit. */ *state_p = ((*state_p << 1) | bit) & (num_states - 1); /* Get the probability that the bit is 0. */ prob = lzms_get_probability(prob_entry); /* Update the probability entry. */ lzms_update_probability_entry(prob_entry, bit); /* Encode the bit using the range encoder. */ lzms_range_encode_bit(rc, bit, prob); } /* Helper functions for encoding bits in the various decision classes */ static void lzms_encode_main_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS, c->main_probs, &c->rc); } static void lzms_encode_match_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS, c->match_probs, &c->rc); } static void lzms_encode_lz_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS, c->lz_probs, &c->rc); } static void lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx) { lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS, c->lz_rep_probs[idx], &c->rc); } static void lzms_encode_delta_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS, c->delta_probs, &c->rc); } static void lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx) { lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS, c->delta_rep_probs[idx], &c->rc); } /****************************************************************************** * Huffman encoding and verbatim bits * ******************************************************************************/ /* * Initialize the output bitstream @os to write backwards to the specified * buffer @out that is @count 16-bit integers long. */ static void lzms_output_bitstream_init(struct lzms_output_bitstream *os, le16 *out, size_t count) { os->bitbuf = 0; os->bitcount = 0; os->next = out + count; os->begin = out; } /* * Write some bits, contained in the low-order @num_bits bits of @bits, to the * output bitstream @os. 
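 * For example, lzms_write_bits(os, 0x5, 3, 16) appends the bits 1, 0, 1 to the
 * bitstream.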
* * @max_num_bits is a compile-time constant that specifies the maximum number of * bits that can ever be written at this call site. */ static inline void lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits, const unsigned num_bits, const unsigned max_num_bits) { /* Add the bits to the bit buffer variable. */ os->bitcount += num_bits; os->bitbuf = (os->bitbuf << num_bits) | bits; /* Check whether any coding units need to be written. */ while (os->bitcount >= 16) { os->bitcount -= 16; /* Write a coding unit, unless it would underflow the buffer. */ if (os->next != os->begin) put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); /* Optimization for call sites that never write more than 16 * bits at once. */ if (max_num_bits <= 16) break; } } /* * Flush the output bitstream, ensuring that all bits written to it have been * written to memory. Return %true if all bits have been output successfully, * or %false if an overrun occurred. */ static bool lzms_output_bitstream_flush(struct lzms_output_bitstream *os) { if (os->next == os->begin) return false; if (os->bitcount != 0) put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); return true; } static void lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info) { make_canonical_huffman_code(rebuild_info->num_syms, LZMS_MAX_CODEWORD_LENGTH, rebuild_info->freqs, rebuild_info->lens, rebuild_info->codewords); rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq; } static void lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info, unsigned num_syms, unsigned rebuild_freq, u32 *codewords, u8 *lens, u32 *freqs) { rebuild_info->num_syms = num_syms; rebuild_info->rebuild_freq = rebuild_freq; rebuild_info->codewords = codewords; rebuild_info->lens = lens; rebuild_info->freqs = freqs; lzms_init_symbol_frequencies(freqs, num_syms); lzms_build_huffman_code(rebuild_info); } static void lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info) { lzms_build_huffman_code(rebuild_info); lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms); } /* * Encode a symbol using the specified Huffman code. Then, if the Huffman code * needs to be rebuilt, rebuild it and return true; otherwise return false. 
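 * The return value lets the caller refresh any cached costs that depend on the
 * Huffman code; see lzms_encode_length(), which refreshes
 * 'fast_length_cost_tab' when the length code is rebuilt.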
*/ static inline bool lzms_huffman_encode_symbol(unsigned sym, const u32 *codewords, const u8 *lens, u32 *freqs, struct lzms_output_bitstream *os, struct lzms_huffman_rebuild_info *rebuild_info) { lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH); ++freqs[sym]; if (--rebuild_info->num_syms_until_rebuild == 0) { lzms_rebuild_huffman_code(rebuild_info); return true; } return false; } /* Helper routines to encode symbols using the various Huffman codes */ static bool lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym) { return lzms_huffman_encode_symbol(sym, c->literal_codewords, c->literal_lens, c->literal_freqs, &c->os, &c->literal_rebuild_info); } static bool lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym) { return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords, c->lz_offset_lens, c->lz_offset_freqs, &c->os, &c->lz_offset_rebuild_info); } static bool lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym) { return lzms_huffman_encode_symbol(sym, c->length_codewords, c->length_lens, c->length_freqs, &c->os, &c->length_rebuild_info); } static bool lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym) { return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords, c->delta_offset_lens, c->delta_offset_freqs, &c->os, &c->delta_offset_rebuild_info); } static bool lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym) { return lzms_huffman_encode_symbol(sym, c->delta_power_codewords, c->delta_power_lens, c->delta_power_freqs, &c->os, &c->delta_power_rebuild_info); } static void lzms_update_fast_length_costs(struct lzms_compressor *c); /* * Encode a match length. If this causes the Huffman code for length symbols to * be rebuilt, also update the length costs array used by the parser. */ static void lzms_encode_length(struct lzms_compressor *c, u32 length) { unsigned slot = lzms_comp_get_length_slot(c, length); if (lzms_encode_length_symbol(c, slot)) lzms_update_fast_length_costs(c); lzms_write_bits(&c->os, length - lzms_length_slot_base[slot], lzms_extra_length_bits[slot], LZMS_MAX_EXTRA_LENGTH_BITS); } /* Encode the offset of an LZ match. */ static void lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset) { unsigned slot = lzms_comp_get_offset_slot(c, offset); lzms_encode_lz_offset_symbol(c, slot); lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot], lzms_extra_offset_bits[slot], LZMS_MAX_EXTRA_OFFSET_BITS); } /* Encode the raw offset of a delta match. */ static void lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset) { unsigned slot = lzms_comp_get_offset_slot(c, raw_offset); lzms_encode_delta_offset_symbol(c, slot); lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot], lzms_extra_offset_bits[slot], LZMS_MAX_EXTRA_OFFSET_BITS); } /****************************************************************************** * Item encoding * ******************************************************************************/ /* Encode the specified item, which may be a literal or any type of match. 
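 * The item type is conveyed through a series of binary decisions that mirror
 * the decoder: the main bit (literal vs. match), then for matches the match
 * bit (LZ vs. delta), then the LZ or delta bit (explicit offset vs. repeat
 * offset), and finally, for repeat offset matches, the rep index bits.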
*/ static void lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source) { /* Main bit: 0 = literal, 1 = match */ int main_bit = (length > 1); lzms_encode_main_bit(c, main_bit); if (!main_bit) { /* Literal */ unsigned literal = source; lzms_encode_literal_symbol(c, literal); } else { /* Match */ /* Match bit: 0 = LZ match, 1 = delta match */ int match_bit = (source & DELTA_SOURCE_TAG) != 0; lzms_encode_match_bit(c, match_bit); if (!match_bit) { /* LZ match */ /* LZ bit: 0 = explicit offset, 1 = repeat offset */ int lz_bit = (source < LZMS_NUM_LZ_REPS); lzms_encode_lz_bit(c, lz_bit); if (!lz_bit) { /* Explicit offset LZ match */ u32 offset = source - (LZMS_NUM_LZ_REPS - 1); lzms_encode_lz_offset(c, offset); } else { /* Repeat offset LZ match */ int rep_idx = source; for (int i = 0; i < rep_idx; i++) lzms_encode_lz_rep_bit(c, 1, i); if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) lzms_encode_lz_rep_bit(c, 0, rep_idx); } } else { /* Delta match */ source &= ~DELTA_SOURCE_TAG; /* Delta bit: 0 = explicit offset, 1 = repeat offset */ int delta_bit = (source < LZMS_NUM_DELTA_REPS); lzms_encode_delta_bit(c, delta_bit); if (!delta_bit) { /* Explicit offset delta match */ u32 power = source >> DELTA_SOURCE_POWER_SHIFT; u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) - (LZMS_NUM_DELTA_REPS - 1); lzms_encode_delta_power_symbol(c, power); lzms_encode_delta_raw_offset(c, raw_offset); } else { /* Repeat offset delta match */ int rep_idx = source; for (int i = 0; i < rep_idx; i++) lzms_encode_delta_rep_bit(c, 1, i); if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) lzms_encode_delta_rep_bit(c, 0, rep_idx); } } /* Match length (encoded the same way for any match type) */ lzms_encode_length(c, length); } } /* Encode a list of matches and literals chosen by the parsing algorithm. */ static void lzms_encode_nonempty_item_list(struct lzms_compressor *c, struct lzms_optimum_node *end_node) { /* Since we've stored at each node the item we took to arrive at that * node, we can trace our chosen path in backwards order. However, for * encoding we need to trace our chosen path in forwards order. To make * this possible, the following loop moves the items from their * destination nodes to their source nodes, which effectively reverses * the path. (Think of it like reversing a singly-linked list.) */ struct lzms_optimum_node *cur_node = end_node; struct lzms_item saved_item = cur_node->item; do { struct lzms_item item = saved_item; if (cur_node->num_extra_items > 0) { /* Handle an arrival via multi-item lookahead. */ unsigned i = 0; struct lzms_optimum_node *orig_node = cur_node; do { cur_node -= item.length; cur_node->item = item; item = orig_node->extra_items[i]; } while (++i != orig_node->num_extra_items); } cur_node -= item.length; saved_item = cur_node->item; cur_node->item = item; } while (cur_node != c->optimum_nodes); /* Now trace the chosen path in forwards order, encoding each item. 
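	 * ('cur_node' has now walked all the way back to c->optimum_nodes,
	 * the first node of the path.)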
 */
	do {
		lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
		cur_node += cur_node->item.length;
	} while (cur_node != end_node);
}

static inline void
lzms_encode_item_list(struct lzms_compressor *c,
		      struct lzms_optimum_node *end_node)
{
	if (end_node != c->optimum_nodes)
		lzms_encode_nonempty_item_list(c, end_node);
}

/******************************************************************************
 *                             Cost evaluation                                *
 ******************************************************************************/

/*
 * If p is the predicted probability of the next bit being a 0, then the number
 * of bits required to encode a 0 bit using a binary range encoder is the real
 * number -log2(p), and the number of bits required to encode a 1 bit is the
 * real number -log2(1 - p).  To avoid computing either of these expressions at
 * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
 * probability to cost for each possible probability.  Specifically, the array
 * indices are the numerators of the possible probabilities in LZMS, where the
 * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
 * bit costs multiplied by (1 << COST_SHIFT).
 */
static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
	384, 384, 320, 283, 256, 235, 219, 204,
	192, 181, 171, 163, 155, 147, 140, 134,
	128, 122, 117, 112, 107, 103,  99,  94,
	 91,  87,  83,  80,  76,  73,  70,  67,
	 64,  61,  58,  56,  53,  51,  48,  46,
	 43,  41,  39,  37,  35,  33,  30,  29,
	 27,  25,  23,  21,  19,  17,  16,  14,
	 12,  11,   9,   8,   6,   4,   3,   1,
	  1,
};

static inline void
check_cost_shift(void)
{
	/* lzms_bit_costs is hard-coded for COST_SHIFT == 6.  */
	BUILD_BUG_ON(COST_SHIFT != 6);
}

#if 0 /* Code to generate the 'lzms_bit_costs' table  */
#include <math.h>

static void
lzms_compute_bit_costs(void)
{
	for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
		u32 prob = i;
		if (prob == 0)
			prob++;
		else if (prob == LZMS_PROBABILITY_DENOMINATOR)
			prob--;
		lzms_bit_costs[i] = round(-log2((double)prob /
						LZMS_PROBABILITY_DENOMINATOR) *
					  (1 << COST_SHIFT));
	}
}
#endif

/* Return the cost to encode a 0 bit in the specified context.  */
static inline u32
lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
{
	return lzms_bit_costs[probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a 1 bit in the specified context.  */
static inline u32
lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
{
	return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
			      probs[state].num_recent_zero_bits];
}

/* Return the cost to encode a literal, including the main bit.  */
static inline u32
lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
{
	return lzms_bit_0_cost(main_state, c->main_probs) +
	       ((u32)c->literal_lens[literal] << COST_SHIFT);
}

/* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
static void
lzms_update_fast_length_costs(struct lzms_compressor *c)
{
	int slot = -1;
	u32 cost = 0;

	for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
		if (len >= lzms_length_slot_base[slot + 1]) {
			slot++;
			cost = (u32)(c->length_lens[slot] +
				     lzms_extra_length_bits[slot]) << COST_SHIFT;
		}
		c->fast_length_cost_tab[len] = cost;
	}
}

/* Return the cost to encode the specified match length, which must not exceed
 * MAX_FAST_LENGTH.  */
static inline u32
lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
{
	return c->fast_length_cost_tab[length];
}

/* Return the cost to encode the specified LZ match offset.  */
static inline u32
lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
{
	unsigned slot = lzms_comp_get_offset_slot(c, offset);
	u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
	return num_bits << COST_SHIFT;
}

/* Return the cost to encode the specified delta power and raw offset.
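 * This is the Huffman cost of the power symbol, plus the Huffman cost of the
 * raw offset's slot symbol, plus that slot's number of extra offset bits.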
*/ static inline u32 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset) { unsigned slot = lzms_comp_get_offset_slot(c, raw_offset); u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] + lzms_extra_offset_bits[slot]; return num_bits << COST_SHIFT; } /****************************************************************************** * Adaptive state * ******************************************************************************/ static void lzms_init_adaptive_state(struct lzms_adaptive_state *state) { for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++) state->recent_lz_offsets[i] = i + 1; state->prev_lz_offset = 0; state->upcoming_lz_offset = 0; for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++) state->recent_delta_pairs[i] = i + 1; state->prev_delta_pair = 0; state->upcoming_delta_pair = 0; state->main_state = 0; state->match_state = 0; state->lz_state = 0; for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) state->lz_rep_states[i] = 0; state->delta_state = 0; for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) state->delta_rep_states[i] = 0; } /* * Update the LRU queues for match sources when advancing by one item. * * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more * complex because: * - there are separate queues for LZ and delta matches * - updates to the queues are delayed by one encoded item (this prevents * sources from being bumped up to index 0 too early) */ static void lzms_update_lru_queues(struct lzms_adaptive_state *state) { if (state->prev_lz_offset != 0) { for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--) state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i]; state->recent_lz_offsets[0] = state->prev_lz_offset; } state->prev_lz_offset = state->upcoming_lz_offset; if (state->prev_delta_pair != 0) { for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--) state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i]; state->recent_delta_pairs[0] = state->prev_delta_pair; } state->prev_delta_pair = state->upcoming_delta_pair; } static inline void lzms_update_state(u8 *state_p, int bit, unsigned num_states) { *state_p = ((*state_p << 1) | bit) % num_states; } static inline void lzms_update_main_state(struct lzms_adaptive_state *state, int is_match) { lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS); } static inline void lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta) { lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS); } static inline void lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep) { lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS); } static inline void lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx) { for (int i = 0; i < rep_idx; i++) lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS); if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS); } static inline void lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep) { lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS); } static inline void lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx) { for (int i = 0; i < rep_idx; i++) lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS); if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS); } 
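/*
 * Illustrative sketch (not part of the compressor, and not called anywhere):
 * demonstrates the one-item delay in the LRU queue updates described above.
 */
#if 0
static void
lzms_lru_delay_example(void)
{
	struct lzms_adaptive_state state;

	lzms_init_adaptive_state(&state);

	/* Suppose an explicit offset LZ match with offset 1000 was chosen. */
	state.upcoming_lz_offset = 1000;
	state.upcoming_delta_pair = 0;

	/* Advancing past that match moves the offset only into
	 * 'prev_lz_offset'; recent_lz_offsets[0] still holds its old value,
	 * so the very next item cannot yet code offset 1000 as a repeat. */
	lzms_update_lru_queues(&state);

	/* Advancing past one more item finally bumps the offset to the front
	 * of the queue: recent_lz_offsets[0] == 1000. */
	state.upcoming_lz_offset = 0;
	state.upcoming_delta_pair = 0;
	lzms_update_lru_queues(&state);
}
#endif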
/****************************************************************************** * Matchfinding * ******************************************************************************/ /* Note: this code just handles finding delta matches. The code for finding LZ * matches is elsewhere. */ /* Initialize the delta matchfinder for a new input buffer. */ static void lzms_init_delta_matchfinder(struct lzms_compressor *c) { /* Set all entries to use an invalid power, which will never match. */ BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT))); memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table)); /* Initialize the next hash code for each power. We can just use zeroes * initially; it doesn't really matter. */ for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++) c->next_delta_hashes[i] = 0; } /* * Compute a DELTA_HASH_ORDER-bit hash code for the first * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a * delta context with the specified @span. */ static inline u32 lzms_delta_hash(const u8 *p, u32 span) { /* A delta match has a certain span and an offset that is a multiple of * that span. To reduce wasted space we use a single combined hash * table for all spans and positions, but to minimize collisions we * include in the hash code computation the span and the low-order bits * of the current position. */ BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3); u8 d0 = *(p + 0) - *(p + 0 - span); u8 d1 = *(p + 1) - *(p + 1 - span); u8 d2 = *(p + 2) - *(p + 2 - span); u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) | ((u32)d2 << 16) | ((u32)d1 << 8) | d0; return lz_hash(v, DELTA_HASH_ORDER); } /* * Given a match between @in_next and @matchptr in a delta context with the * specified @span and having the initial @len, extend the match as far as * possible, up to a limit of @max_len. */ static inline u32 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr, u32 len, u32 max_len, u32 span) { while (len < max_len && (u8)(*(in_next + len) - *(in_next + len - span)) == (u8)(*(matchptr + len) - *(matchptr + len - span))) { len++; } return len; } static void lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c, const u8 *in_next, u32 count) { u32 pos = in_next - c->in_buffer; if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1)) return; do { /* Update the hash table for each power. */ for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { const u32 span = (u32)1 << power; if (unlikely(pos < span)) continue; const u32 next_hash = lzms_delta_hash(in_next + 1, span); const u32 hash = c->next_delta_hashes[power]; c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos; c->next_delta_hashes[power] = next_hash; prefetch(&c->delta_hash_table[next_hash]); } } while (in_next++, pos++, --count); } /* * Skip the next @count bytes (don't search for matches at them). @in_next * points to the first byte to skip. The return value is @in_next + count. */ static const u8 * lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next) { lcpit_matchfinder_skip_bytes(&c->mf, count); if (c->use_delta_matches) lzms_delta_matchfinder_skip_bytes(c, in_next, count); return in_next + count; } /****************************************************************************** * "Near-optimal" parsing * ******************************************************************************/ /* * The main near-optimal parsing routine. 
 *
 * Briefly, the algorithm does an approximate minimum-cost path search to find a
 * "near-optimal" sequence of matches and literals to output, based on the
 * current cost model.  The algorithm steps forward, position by position (byte
 * by byte), and updates the minimum cost path to reach each later position that
 * can be reached using a match or literal from the current position.  This is
 * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
 * the graph edges are possible matches/literals to code, and the cost of each
 * edge is the estimated number of bits that will be required to output the
 * corresponding match or literal.  But one difference is that we actually
 * compute the lowest-cost path in pieces, where each piece is terminated when
 * there are no choices to be made.
 *
 * The costs of literals and matches are estimated using the range encoder
 * states and the semi-adaptive Huffman codes.  Except for range encoding
 * states, costs are assumed to be constant throughout a single run of the
 * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
 * introduces a source of non-optimality because the probabilities and Huffman
 * codes can change over this part of the data.  And of course, there are
 * various other reasons why the result isn't optimal in terms of compression
 * ratio.
 */
static void
lzms_near_optimal_parse(struct lzms_compressor *c)
{
	const u8 *in_next = c->in_buffer;
	const u8 * const in_end = &c->in_buffer[c->in_nbytes];
	struct lzms_optimum_node *cur_node;
	struct lzms_optimum_node *end_node;

	/* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
	lzms_update_fast_length_costs(c);

	/* Set up the initial adaptive state.  */
	lzms_init_adaptive_state(&c->optimum_nodes[0].state);

begin:
	/* Start building a new list of items, which will correspond to the
	 * next piece of the overall minimum-cost path.  */

	cur_node = c->optimum_nodes;
	cur_node->cost = 0;
	end_node = cur_node;

	if (in_next == in_end)
		return;

	/* The following loop runs once for each byte in the input buffer,
	 * except in a few shortcut cases.  */
	for (;;) {
		u32 num_matches;

		/* Repeat offset LZ matches  */
		if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
			   in_end - in_next >= 2))
		{
			for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {

				/* Looking for a repeat offset LZ match at queue
				 * index @rep_idx  */

				const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
				const u8 * const matchptr = in_next - offset;

				/* Check the first 2 bytes before entering the
				 * extension loop.  */
				if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
					continue;

				/* Extend the match to its full length.
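				 * (The first 2 bytes are already known to
				 * match, so the extension starts at length 2.)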
*/ const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next); /* Early out for long repeat offset LZ match */ if (rep_len >= c->mf.nice_match_len) { in_next = lzms_skip_bytes(c, rep_len, in_next); lzms_encode_item_list(c, cur_node); lzms_encode_item(c, rep_len, rep_idx); c->optimum_nodes[0].state = cur_node->state; cur_node = &c->optimum_nodes[0]; cur_node->state.upcoming_lz_offset = cur_node->state.recent_lz_offsets[rep_idx]; cur_node->state.upcoming_delta_pair = 0; for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++) cur_node->state.recent_lz_offsets[i] = cur_node->state.recent_lz_offsets[i + 1]; lzms_update_lru_queues(&cur_node->state); lzms_update_main_state(&cur_node->state, 1); lzms_update_match_state(&cur_node->state, 0); lzms_update_lz_state(&cur_node->state, 1); lzms_update_lz_rep_states(&cur_node->state, rep_idx); goto begin; } while (end_node < cur_node + rep_len) (++end_node)->cost = INFINITE_COST; u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, c->main_probs) + lzms_bit_0_cost(cur_node->state.match_state, c->match_probs) + lzms_bit_1_cost(cur_node->state.lz_state, c->lz_probs); for (int i = 0; i < rep_idx; i++) base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i], c->lz_rep_probs[i]); if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx], c->lz_rep_probs[rep_idx]); u32 len = 2; do { u32 cost = base_cost + lzms_fast_length_cost(c, len); if (cost < (cur_node + len)->cost) { (cur_node + len)->cost = cost; (cur_node + len)->item = (struct lzms_item) { .length = len, .source = rep_idx, }; (cur_node + len)->num_extra_items = 0; } } while (++len <= rep_len); /* try LZ-rep + lit + LZ-rep0 */ if (c->try_lzrep_lit_lzrep0 && in_end - (in_next + rep_len) >= 3 && load_u16_unaligned(in_next + rep_len + 1) == load_u16_unaligned(matchptr + rep_len + 1)) { const u32 rep0_len = lz_extend(in_next + rep_len + 1, matchptr + rep_len + 1, 2, min(c->mf.nice_match_len, in_end - (in_next + rep_len + 1))); unsigned main_state = cur_node->state.main_state; unsigned match_state = cur_node->state.match_state; unsigned lz_state = cur_node->state.lz_state; unsigned lz_rep0_state = cur_node->state.lz_rep_states[0]; /* update states for LZ-rep */ main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS; match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS; lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS; lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) % LZMS_NUM_LZ_REP_PROBS; /* LZ-rep cost */ u32 cost = base_cost + lzms_fast_length_cost(c, rep_len); /* add literal cost */ cost += lzms_literal_cost(c, main_state, *(in_next + rep_len)); /* update state for literal */ main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; /* add LZ-rep0 cost */ cost += lzms_bit_1_cost(main_state, c->main_probs) + lzms_bit_0_cost(match_state, c->match_probs) + lzms_bit_1_cost(lz_state, c->lz_probs) + lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) + lzms_fast_length_cost(c, rep0_len); const u32 total_len = rep_len + 1 + rep0_len; while (end_node < cur_node + total_len) (++end_node)->cost = INFINITE_COST; if (cost < (cur_node + total_len)->cost) { (cur_node + total_len)->cost = cost; (cur_node + total_len)->item = (struct lzms_item) { .length = rep0_len, .source = 0, }; (cur_node + total_len)->extra_items[0] = (struct lzms_item) { .length = 1, .source = *(in_next + rep_len), }; (cur_node + total_len)->extra_items[1] = (struct lzms_item) { .length = rep_len, .source = rep_idx, }; (cur_node + 
total_len)->num_extra_items = 2; } } } } /* Repeat offset delta matches */ if (c->use_delta_matches && likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 && (in_end - in_next >= 2))) { for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) { /* Looking for a repeat offset delta match at * queue index @rep_idx */ const u32 pair = cur_node->state.recent_delta_pairs[rep_idx]; const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT; const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK; const u32 span = (u32)1 << power; const u32 offset = raw_offset << power; const u8 * const matchptr = in_next - offset; /* Check the first 2 bytes before entering the * extension loop. */ if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != (u8)(*(matchptr + 1) - *(matchptr + 1 - span)))) continue; /* Extend the match to its full length. */ const u32 rep_len = lzms_extend_delta_match(in_next, matchptr, 2, in_end - in_next, span); /* Early out for long repeat offset delta match */ if (rep_len >= c->mf.nice_match_len) { in_next = lzms_skip_bytes(c, rep_len, in_next); lzms_encode_item_list(c, cur_node); lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx); c->optimum_nodes[0].state = cur_node->state; cur_node = &c->optimum_nodes[0]; cur_node->state.upcoming_delta_pair = pair; cur_node->state.upcoming_lz_offset = 0; for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++) cur_node->state.recent_delta_pairs[i] = cur_node->state.recent_delta_pairs[i + 1]; lzms_update_lru_queues(&cur_node->state); lzms_update_main_state(&cur_node->state, 1); lzms_update_match_state(&cur_node->state, 1); lzms_update_delta_state(&cur_node->state, 1); lzms_update_delta_rep_states(&cur_node->state, rep_idx); goto begin; } while (end_node < cur_node + rep_len) (++end_node)->cost = INFINITE_COST; u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, c->main_probs) + lzms_bit_1_cost(cur_node->state.match_state, c->match_probs) + lzms_bit_1_cost(cur_node->state.delta_state, c->delta_probs); for (int i = 0; i < rep_idx; i++) base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i], c->delta_rep_probs[i]); if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx], c->delta_rep_probs[rep_idx]); u32 len = 2; do { u32 cost = base_cost + lzms_fast_length_cost(c, len); if (cost < (cur_node + len)->cost) { (cur_node + len)->cost = cost; (cur_node + len)->item = (struct lzms_item) { .length = len, .source = DELTA_SOURCE_TAG | rep_idx, }; (cur_node + len)->num_extra_items = 0; } } while (++len <= rep_len); } } /* Explicit offset LZ matches */ num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches); if (num_matches) { u32 best_len = c->matches[0].length; /* Early out for long explicit offset LZ match */ if (best_len >= c->mf.nice_match_len) { const u32 offset = c->matches[0].offset; /* Extend the match as far as possible. * This is necessary because the LCP-interval * tree matchfinder only reports up to * nice_match_len bytes. 
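				 * Also, only best_len - 1 bytes are skipped
				 * below, because the call to
				 * lcpit_matchfinder_get_matches() already
				 * advanced the matchfinder by one position.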
*/ best_len = lz_extend(in_next, in_next - offset, best_len, in_end - in_next); in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1); lzms_encode_item_list(c, cur_node); lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1); c->optimum_nodes[0].state = cur_node->state; cur_node = &c->optimum_nodes[0]; cur_node->state.upcoming_lz_offset = offset; cur_node->state.upcoming_delta_pair = 0; lzms_update_lru_queues(&cur_node->state); lzms_update_main_state(&cur_node->state, 1); lzms_update_match_state(&cur_node->state, 0); lzms_update_lz_state(&cur_node->state, 0); goto begin; } while (end_node < cur_node + best_len) (++end_node)->cost = INFINITE_COST; u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, c->main_probs) + lzms_bit_0_cost(cur_node->state.match_state, c->match_probs) + lzms_bit_0_cost(cur_node->state.lz_state, c->lz_probs); if (c->try_lzmatch_lit_lzrep0 && likely(in_end - (in_next + c->matches[0].length) >= 3)) { /* try LZ-match + lit + LZ-rep0 */ u32 l = 2; u32 i = num_matches - 1; do { const u32 len = c->matches[i].length; const u32 offset = c->matches[i].offset; const u32 position_cost = base_cost + lzms_lz_offset_cost(c, offset); do { u32 cost = position_cost + lzms_fast_length_cost(c, l); if (cost < (cur_node + l)->cost) { (cur_node + l)->cost = cost; (cur_node + l)->item = (struct lzms_item) { .length = l, .source = offset + (LZMS_NUM_LZ_REPS - 1), }; (cur_node + l)->num_extra_items = 0; } } while (++l <= len); const u8 * const matchptr = in_next - offset; if (load_u16_unaligned(matchptr + len + 1) != load_u16_unaligned(in_next + len + 1)) continue; const u32 rep0_len = lz_extend(in_next + len + 1, matchptr + len + 1, 2, min(c->mf.nice_match_len, in_end - (in_next + len + 1))); unsigned main_state = cur_node->state.main_state; unsigned match_state = cur_node->state.match_state; unsigned lz_state = cur_node->state.lz_state; /* update states for LZ-match */ main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS; match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS; lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS; /* LZ-match cost */ u32 cost = position_cost + lzms_fast_length_cost(c, len); /* add literal cost */ cost += lzms_literal_cost(c, main_state, *(in_next + len)); /* update state for literal */ main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; /* add LZ-rep0 cost */ cost += lzms_bit_1_cost(main_state, c->main_probs) + lzms_bit_0_cost(match_state, c->match_probs) + lzms_bit_1_cost(lz_state, c->lz_probs) + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], c->lz_rep_probs[0]) + lzms_fast_length_cost(c, rep0_len); const u32 total_len = len + 1 + rep0_len; while (end_node < cur_node + total_len) (++end_node)->cost = INFINITE_COST; if (cost < (cur_node + total_len)->cost) { (cur_node + total_len)->cost = cost; (cur_node + total_len)->item = (struct lzms_item) { .length = rep0_len, .source = 0, }; (cur_node + total_len)->extra_items[0] = (struct lzms_item) { .length = 1, .source = *(in_next + len), }; (cur_node + total_len)->extra_items[1] = (struct lzms_item) { .length = len, .source = offset + LZMS_NUM_LZ_REPS - 1, }; (cur_node + total_len)->num_extra_items = 2; } } while (i--); } else { u32 l = 2; u32 i = num_matches - 1; do { u32 position_cost = base_cost + lzms_lz_offset_cost(c, c->matches[i].offset); do { u32 cost = position_cost + lzms_fast_length_cost(c, l); if (cost < (cur_node + l)->cost) { (cur_node + l)->cost = cost; (cur_node + l)->item = (struct lzms_item) { .length = l, .source = c->matches[i].offset + 
(LZMS_NUM_LZ_REPS - 1), }; (cur_node + l)->num_extra_items = 0; } } while (++l <= c->matches[i].length); } while (i--); } } /* Explicit offset delta matches */ if (c->use_delta_matches && likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1)) { const u32 pos = in_next - c->in_buffer; /* Consider each possible power (log2 of span) */ BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS); for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { const u32 span = (u32)1 << power; if (unlikely(pos < span)) continue; const u32 next_hash = lzms_delta_hash(in_next + 1, span); const u32 hash = c->next_delta_hashes[power]; const u32 cur_match = c->delta_hash_table[hash]; c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos; c->next_delta_hashes[power] = next_hash; prefetch(&c->delta_hash_table[next_hash]); if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT) continue; const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK); /* The offset must be a multiple of span. */ if (offset & (span - 1)) continue; const u8 * const matchptr = in_next - offset; /* Check the first 3 bytes before entering the * extension loop. */ BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3); if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) || ((u8)(*(in_next + 2) - *(in_next + 2 - span)) != (u8)(*(matchptr + 2) - *(matchptr + 2 - span)))) continue; /* Extend the delta match to its full length. */ const u32 len = lzms_extend_delta_match(in_next, matchptr, 3, in_end - in_next, span); const u32 raw_offset = offset >> power; const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) | raw_offset; const u32 source = DELTA_SOURCE_TAG | (pair + LZMS_NUM_DELTA_REPS - 1); /* Early out for long explicit offset delta match */ if (len >= c->mf.nice_match_len) { in_next = lzms_skip_bytes(c, len - 1, in_next + 1); lzms_encode_item_list(c, cur_node); lzms_encode_item(c, len, source); c->optimum_nodes[0].state = cur_node->state; cur_node = &c->optimum_nodes[0]; cur_node->state.upcoming_lz_offset = 0; cur_node->state.upcoming_delta_pair = pair; lzms_update_lru_queues(&cur_node->state); lzms_update_main_state(&cur_node->state, 1); lzms_update_match_state(&cur_node->state, 1); lzms_update_delta_state(&cur_node->state, 0); goto begin; } while (end_node < cur_node + len) (++end_node)->cost = INFINITE_COST; u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, c->main_probs) + lzms_bit_1_cost(cur_node->state.match_state, c->match_probs) + lzms_bit_0_cost(cur_node->state.delta_state, c->delta_probs) + lzms_delta_source_cost(c, power, raw_offset); u32 l = NBYTES_HASHED_FOR_DELTA; do { u32 cost = base_cost + lzms_fast_length_cost(c, l); if (cost < (cur_node + l)->cost) { (cur_node + l)->cost = cost; (cur_node + l)->item = (struct lzms_item) { .length = l, .source = source, }; (cur_node + l)->num_extra_items = 0; } } while (++l <= len); } } /* Literal */ if (end_node < cur_node + 1) (++end_node)->cost = INFINITE_COST; const u32 cur_and_lit_cost = cur_node->cost + lzms_literal_cost(c, cur_node->state.main_state, *in_next); if (cur_and_lit_cost < (cur_node + 1)->cost) { (cur_node + 1)->cost = cur_and_lit_cost; (cur_node + 1)->item = (struct lzms_item) { .length = 1, .source = *in_next, }; (cur_node + 1)->num_extra_items = 0; } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) { /* try lit + LZ-rep0 */ const u32 offset = 
(cur_node->state.prev_lz_offset) ? cur_node->state.prev_lz_offset : cur_node->state.recent_lz_offsets[0]; if (load_u16_unaligned(in_next + 1) == load_u16_unaligned(in_next + 1 - offset)) { const u32 rep0_len = lz_extend(in_next + 1, in_next + 1 - offset, 2, min(in_end - (in_next + 1), c->mf.nice_match_len)); unsigned main_state = cur_node->state.main_state; /* Update main_state after literal */ main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS; /* Add cost of LZ-rep0 */ const u32 cost = cur_and_lit_cost + lzms_bit_1_cost(main_state, c->main_probs) + lzms_bit_0_cost(cur_node->state.match_state, c->match_probs) + lzms_bit_1_cost(cur_node->state.lz_state, c->lz_probs) + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], c->lz_rep_probs[0]) + lzms_fast_length_cost(c, rep0_len); const u32 total_len = 1 + rep0_len; while (end_node < cur_node + total_len) (++end_node)->cost = INFINITE_COST; if (cost < (cur_node + total_len)->cost) { (cur_node + total_len)->cost = cost; (cur_node + total_len)->item = (struct lzms_item) { .length = rep0_len, .source = 0, }; (cur_node + total_len)->extra_items[0] = (struct lzms_item) { .length = 1, .source = *in_next, }; (cur_node + total_len)->num_extra_items = 1; } } } /* Advance to the next position. */ in_next++; cur_node++; /* The lowest-cost path to the current position is now known. * Finalize the adaptive state that results from taking this * lowest-cost path. */ struct lzms_item item_to_take = cur_node->item; struct lzms_optimum_node *source_node = cur_node - (item_to_take.length); int next_item_idx = -1; for (unsigned i = 0; i < cur_node->num_extra_items; i++) { item_to_take = cur_node->extra_items[i]; source_node -= item_to_take.length; next_item_idx++; } cur_node->state = source_node->state; for (;;) { const u32 length = item_to_take.length; u32 source = item_to_take.source; cur_node->state.upcoming_lz_offset = 0; cur_node->state.upcoming_delta_pair = 0; if (length > 1) { /* Match */ lzms_update_main_state(&cur_node->state, 1); if (source & DELTA_SOURCE_TAG) { /* Delta match */ lzms_update_match_state(&cur_node->state, 1); source &= ~DELTA_SOURCE_TAG; if (source >= LZMS_NUM_DELTA_REPS) { /* Explicit offset delta match */ u32 pair = source - (LZMS_NUM_DELTA_REPS - 1); lzms_update_delta_state(&cur_node->state, 0); cur_node->state.upcoming_delta_pair = pair; } else { /* Repeat offset delta match */ int rep_idx = source; lzms_update_delta_state(&cur_node->state, 1); lzms_update_delta_rep_states(&cur_node->state, rep_idx); cur_node->state.upcoming_delta_pair = cur_node->state.recent_delta_pairs[rep_idx]; for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++) cur_node->state.recent_delta_pairs[i] = cur_node->state.recent_delta_pairs[i + 1]; } } else { lzms_update_match_state(&cur_node->state, 0); if (source >= LZMS_NUM_LZ_REPS) { /* Explicit offset LZ match */ lzms_update_lz_state(&cur_node->state, 0); cur_node->state.upcoming_lz_offset = source - (LZMS_NUM_LZ_REPS - 1); } else { /* Repeat offset LZ match */ int rep_idx = source; lzms_update_lz_state(&cur_node->state, 1); lzms_update_lz_rep_states(&cur_node->state, rep_idx); cur_node->state.upcoming_lz_offset = cur_node->state.recent_lz_offsets[rep_idx]; for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++) cur_node->state.recent_lz_offsets[i] = cur_node->state.recent_lz_offsets[i + 1]; } } } else { /* Literal */ lzms_update_main_state(&cur_node->state, 0); } lzms_update_lru_queues(&cur_node->state); if (next_item_idx < 0) break; if (next_item_idx == 0) item_to_take = cur_node->item; else item_to_take = 

	/*
	 * This loop will terminate when either of the following
	 * conditions is true:
	 *
	 * (1) cur_node == end_node
	 *
	 *	There are no paths that extend beyond the current
	 *	position.  In this case, any path to a later position
	 *	must pass through the current position, so we can go
	 *	ahead and choose the list of items that led to this
	 *	position.
	 *
	 * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
	 *
	 *	This bounds the number of times the algorithm can step
	 *	forward before it is guaranteed to start choosing items.
	 *	This limits the memory usage.  It also guarantees that
	 *	the parser will not go too long without updating the
	 *	probability tables.
	 *
	 * Note: no check for end-of-buffer is needed because
	 * end-of-buffer will trigger condition (1).
	 */
	if (cur_node == end_node ||
	    cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
	{
		lzms_encode_nonempty_item_list(c, cur_node);
		c->optimum_nodes[0].state = cur_node->state;
		goto begin;
	}
	}
}

static void
lzms_init_states_and_probabilities(struct lzms_compressor *c)
{
	c->main_state = 0;
	c->match_state = 0;
	c->lz_state = 0;
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		c->lz_rep_states[i] = 0;
	c->delta_state = 0;
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		c->delta_rep_states[i] = 0;

	lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS);
	lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS);
	lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS);
	for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
		lzms_init_probability_entries(c->lz_rep_probs[i],
					      LZMS_NUM_LZ_REP_PROBS);
	lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS);
	for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
		lzms_init_probability_entries(c->delta_rep_probs[i],
					      LZMS_NUM_DELTA_REP_PROBS);
}

static void
lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
{
	lzms_init_huffman_code(&c->literal_rebuild_info,
			       LZMS_NUM_LITERAL_SYMS,
			       LZMS_LITERAL_CODE_REBUILD_FREQ,
			       c->literal_codewords,
			       c->literal_lens,
			       c->literal_freqs);

	lzms_init_huffman_code(&c->lz_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
			       c->lz_offset_codewords,
			       c->lz_offset_lens,
			       c->lz_offset_freqs);

	lzms_init_huffman_code(&c->length_rebuild_info,
			       LZMS_NUM_LENGTH_SYMS,
			       LZMS_LENGTH_CODE_REBUILD_FREQ,
			       c->length_codewords,
			       c->length_lens,
			       c->length_freqs);

	lzms_init_huffman_code(&c->delta_offset_rebuild_info,
			       num_offset_slots,
			       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
			       c->delta_offset_codewords,
			       c->delta_offset_lens,
			       c->delta_offset_freqs);

	lzms_init_huffman_code(&c->delta_power_rebuild_info,
			       LZMS_NUM_DELTA_POWER_SYMS,
			       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
			       c->delta_power_codewords,
			       c->delta_power_lens,
			       c->delta_power_freqs);
}
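
/*
 * Layout of the output buffer while encoding (illustration added for
 * clarity; derived from the 'lzms_range_encoder' and
 * 'lzms_output_bitstream' fields used in lzms_finalize() below):
 *
 *	|<- range encoder data ->|<--- free space --->|<- bitstream data ->|
 *	rc.begin             rc.next              os.next             rc.end
 *	(written forwards)                         (written backwards)
 *
 * lzms_finalize() slides the backwards-written block left so that it
 * becomes adjacent to the forwards-written block.
 */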

/*
 * Flush the output streams, prepare the final compressed data, and return
 * its size in bytes.
 *
 * A return value of 0 indicates that the data could not be compressed to fit
 * in the available space.
 */
static size_t
lzms_finalize(struct lzms_compressor *c)
{
	size_t num_forwards_units;
	size_t num_backwards_units;

	/* Flush both the forwards and backwards streams, and make sure they
	 * didn't cross each other and start overwriting each other's data. */
	if (!lzms_output_bitstream_flush(&c->os))
		return 0;

	if (!lzms_range_encoder_flush(&c->rc))
		return 0;

	if (c->rc.next > c->os.next)
		return 0;

	/* Now the compressed buffer contains the data output by the forwards
	 * bitstream, then empty space, then data output by the backwards
	 * bitstream.  Move the data output by the backwards bitstream to be
	 * adjacent to the data output by the forwards bitstream, and
	 * calculate the compressed size that this results in. */
	num_forwards_units = c->rc.next - c->rc.begin;
	num_backwards_units = c->rc.end - c->os.next;

	memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16));

	return (num_forwards_units + num_backwards_units) * sizeof(le16);
}

static u64
lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
		       bool destructive)
{
	u64 size = 0;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return 0;

	size += sizeof(struct lzms_compressor);

	if (!destructive)
		size += max_bufsize; /* in_buffer */

	/* mf */
	size += lcpit_matchfinder_get_needed_memory(max_bufsize);

	return size;
}

static int
lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
		       bool destructive, void **c_ret)
{
	struct lzms_compressor *c;
	u32 nice_match_len;

	if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
		return WIMLIB_ERR_INVALID_PARAM;

	c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
	if (!c)
		goto oom0;

	c->destructive = destructive;

	/* Scale nice_match_len with the compression level.  But to allow an
	 * optimization for length cost calculations, don't allow
	 * nice_match_len to exceed MAX_FAST_LENGTH. */
	nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);

	c->use_delta_matches = (compression_level >= 35);
	c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
	c->try_lit_lzrep0 = (compression_level >= 60);
	c->try_lzrep_lit_lzrep0 = (compression_level >= 60);

	if (!c->destructive) {
		c->in_buffer = MALLOC(max_bufsize);
		if (!c->in_buffer)
			goto oom1;
	}

	if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
		goto oom2;

	lzms_init_fast_length_slot_tab(c);
	lzms_init_offset_slot_tabs(c);

	*c_ret = c;
	return 0;

oom2:
	if (!c->destructive)
		FREE(c->in_buffer);
oom1:
	ALIGNED_FREE(c);
oom0:
	return WIMLIB_ERR_NOMEM;
}

static size_t
lzms_compress(const void *in, size_t in_nbytes,
	      void *out, size_t out_nbytes_avail, void *_c)
{
	struct lzms_compressor *c = _c;
	size_t result;

	/* Don't bother trying to compress extremely small inputs. */
	if (in_nbytes < 4)
		return 0;

	/* Copy the input data into the internal buffer and preprocess it. */
	if (c->destructive)
		c->in_buffer = (void *)in;
	else
		memcpy(c->in_buffer, in, in_nbytes);
	c->in_nbytes = in_nbytes;
	lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);

	/* Prepare the matchfinders. */
	lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
	if (c->use_delta_matches)
		lzms_init_delta_matchfinder(c);

	/* Initialize the encoder structures. */
	lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
	lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
	lzms_init_states_and_probabilities(c);
	lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));

	/* The main loop: parse and encode. */
	lzms_near_optimal_parse(c);

	/* Return the compressed data size, or 0 if the data did not fit in
	 * the output buffer.  If it did not fit and the input buffer was
	 * modified in place, undo the preprocessing so the caller's data is
	 * restored. */
	result = lzms_finalize(c);
	if (!result && c->destructive)
		lzms_x86_filter(c->in_buffer, c->in_nbytes,
				c->last_target_usages, true);
	return result;
}

static void
lzms_free_compressor(void *_c)
{
	struct lzms_compressor *c = _c;

	if (!c->destructive)
		FREE(c->in_buffer);
	lcpit_matchfinder_destroy(&c->mf);
	ALIGNED_FREE(c);
}

const struct compressor_ops lzms_compressor_ops = {
	.get_needed_memory = lzms_get_needed_memory,
	.create_compressor = lzms_create_compressor,
	.compress          = lzms_compress,
	.free_compressor   = lzms_free_compressor,
};
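
/*
 * Illustrative usage sketch (added comment; a sketch only, not wimlib's
 * public API -- callers normally go through the library's generic compressor
 * layer, which dispatches through 'lzms_compressor_ops'):
 *
 *	void *c;
 *	if (lzms_create_compressor(max_bufsize, 50, false, &c) == 0) {
 *		size_t csize = lzms_compress(in, in_nbytes,
 *					     out, out_nbytes_avail, c);
 *		-- csize == 0 means the data did not fit in 'out'
 *		lzms_free_compressor(c);
 *	}
 *
 * The level 50 here is just an example value; the compression level scales
 * nice_match_len and enables the heuristics set in lzms_create_compressor().
 */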