X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzms_compress.c;h=9a84ebd72ceea965e1b880b5f75504258ddacf25;hp=096862e59c1eb39fc9ab5bb1b7b479406e7342ca;hb=a739a1c0e0d2091209544f8f155baa0df6935d6f;hpb=f328f54045b41f8258d7505de45ec8c71bff3e17 diff --git a/src/lzms_compress.c b/src/lzms_compress.c index 096862e5..9a84ebd7 100644 --- a/src/lzms_compress.c +++ b/src/lzms_compress.c @@ -5,7 +5,7 @@ */ /* - * Copyright (C) 2013, 2014 Eric Biggers + * Copyright (C) 2013, 2014, 2015 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -25,476 +25,545 @@ # include "config.h" #endif +#include +#include + #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" -#include "wimlib/endianness.h" #include "wimlib/error.h" -#include "wimlib/lz_mf.h" -#include "wimlib/lz_repsearch.h" +#include "wimlib/lcpit_matchfinder.h" +#include "wimlib/lz_extend.h" +#include "wimlib/lz_hash.h" #include "wimlib/lzms_common.h" #include "wimlib/unaligned.h" #include "wimlib/util.h" -#include -#include -#include +/* + * MAX_FAST_LENGTH is the maximum match length for which the length slot can be + * looked up directly in 'fast_length_slot_tab' and the length cost can be + * looked up directly in 'fast_length_cost_tab'. + * + * We also limit the 'nice_match_len' parameter to this value. Consequently, if + * the longest match found is shorter than 'nice_match_len', then it must also + * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups + * of length costs using 'fast_length_cost_tab' without having to keep checking + * whether the length exceeds MAX_FAST_LENGTH or not. + */ +#define MAX_FAST_LENGTH 255 -/* Stucture used for writing raw bits as a series of 16-bit little endian coding - * units. This starts at the *end* of the compressed data buffer and proceeds - * backwards. */ +/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will + * step forward before forcing the pending items to be encoded. If this value + * is increased, then there will be fewer forced flushes, but the probability + * entries and Huffman codes will be more likely to become outdated. */ +#define NUM_OPTIM_NODES 2048 + +/* COST_SHIFT is a scaling factor that makes it possible to consider fractional + * bit costs. A single bit has a cost of (1 << COST_SHIFT). */ +#define COST_SHIFT 6 + +/* Length of the hash table for finding delta matches */ +#define DELTA_HASH_ORDER 17 +#define DELTA_HASH_LENGTH ((u32)1 << DELTA_HASH_ORDER) + +/* The number of bytes to hash when finding delta matches; also taken to be the + * minimum length of an explicit offset delta match */ +#define NBYTES_HASHED_FOR_DELTA 3 + +/* The number of delta match powers to consider (must be <= + * LZMS_NUM_DELTA_POWER_SYMS) */ +#define NUM_POWERS_TO_CONSIDER 6 + +/* This structure tracks the state of writing bits as a series of 16-bit coding + * units, starting at the end of the output buffer and proceeding backwards. */ struct lzms_output_bitstream { - /* Bits that haven't yet been written to the output buffer. */ + /* Bits that haven't yet been written to the output buffer */ u64 bitbuf; - /* Number of bits currently held in @bitbuf. */ + /* Number of bits currently held in @bitbuf */ unsigned bitcount; - /* Pointer to one past the next position in the compressed data buffer - * at which to output a 16-bit coding unit. */ - le16 *next; - - /* Pointer to the beginning of the output buffer. 
(The "end" when + /* Pointer to the beginning of the output buffer (this is the "end" when * writing backwards!) */ - le16 *begin; + u8 *begin; + + /* Pointer to just past the next position in the output buffer at which + * to output a 16-bit coding unit */ + u8 *next; }; -/* Stucture used for range encoding (raw version). This starts at the - * *beginning* of the compressed data buffer and proceeds forward. */ -struct lzms_range_encoder_raw { +/* This structure tracks the state of range encoding and its output, which + * starts at the beginning of the output buffer and proceeds forwards. */ +struct lzms_range_encoder { - /* A 33-bit variable that holds the low boundary of the current range. - * The 33rd bit is needed to catch carries. */ - u64 low; + /* The lower boundary of the current range. Logically, this is a 33-bit + * integer whose high bit is needed to detect carries. */ + u64 lower_bound; - /* Size of the current range. */ - u32 range; + /* The size of the current range */ + u32 range_size; - /* Next 16-bit coding unit to output. */ + /* The next 16-bit coding unit to output */ u16 cache; - /* Number of 16-bit coding units whose output has been delayed due to - * possible carrying. The first such coding unit is @cache; all + /* The number of 16-bit coding units whose output has been delayed due + * to possible carrying. The first such coding unit is @cache; all * subsequent such coding units are 0xffff. */ u32 cache_size; - /* Pointer to the beginning of the output buffer. */ - le16 *begin; + /* Pointer to the beginning of the output buffer */ + u8 *begin; /* Pointer to the position in the output buffer at which the next coding - * unit must be written. */ - le16 *next; + * unit must be written */ + u8 *next; - /* Pointer just past the end of the output buffer. */ - le16 *end; + /* Pointer to just past the end of the output buffer */ + u8 *end; }; -/* Structure used for range encoding. This wraps around `struct - * lzms_range_encoder_raw' to use and maintain probability entries. */ -struct lzms_range_encoder { +/* Bookkeeping information for an adaptive Huffman code */ +struct lzms_huffman_rebuild_info { + + /* The remaining number of symbols to encode until this code must be + * rebuilt */ + unsigned num_syms_until_rebuild; + + /* The number of symbols in this code */ + unsigned num_syms; - /* Pointer to the raw range encoder, which has no persistent knowledge - * of probabilities. Multiple lzms_range_encoder's share the same - * lzms_range_encoder_raw. */ - struct lzms_range_encoder_raw *rc; + /* The rebuild frequency of this code, in symbols */ + unsigned rebuild_freq; - /* Bits recently encoded by this range encoder. This is used as an - * index into @prob_entries. */ - u32 state; + /* The Huffman codeword of each symbol in this code */ + u32 *codewords; - /* Bitmask for @state to prevent its value from exceeding the number of - * probability entries. */ - u32 mask; + /* The length of each Huffman codeword, in bits */ + u8 *lens; - /* Probability entries being used for this range encoder. */ - struct lzms_probability_entry prob_entries[LZMS_MAX_NUM_STATES]; + /* The frequency of each symbol in this code */ + u32 *freqs; }; -/* Structure used for Huffman encoding. */ -struct lzms_huffman_encoder { +/* + * The compressor-internal representation of a match or literal. + * + * Literals have length=1; matches have length > 1. (We disallow matches of + * length 1, even though this is a valid length in LZMS.) 
+ * + * The source is encoded as follows: + * + * - Literals: the literal byte itself + * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1) + * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets + * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are + * the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1) + * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is + * the index of the (power, raw_offset) pair in recent_delta_pairs + */ +struct lzms_item { + u32 length; + u32 source; +}; + +#define DELTA_SOURCE_TAG ((u32)1 << 31) +#define DELTA_SOURCE_POWER_SHIFT 28 +#define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1) + +static inline void +check_that_powers_fit_in_bitfield(void) +{ + STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT))); +} - /* Bitstream to write Huffman-encoded symbols and verbatim bits to. - * Multiple lzms_huffman_encoder's share the same lzms_output_bitstream. +/* A stripped-down version of the adaptive state in LZMS which excludes the + * probability entries and Huffman codes */ +struct lzms_adaptive_state { + + /* Recent offsets for LZ matches */ + u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1]; + u32 prev_lz_offset; /* 0 means none */ + u32 upcoming_lz_offset; /* 0 means none */ + + /* Recent (power, raw offset) pairs for delta matches. + * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw + * offset, and the high bits are the power. */ + u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1]; + u32 prev_delta_pair; /* 0 means none */ + u32 upcoming_delta_pair; /* 0 means none */ + + /* States for predicting the probabilities of item types */ + u8 main_state; + u8 match_state; + u8 lz_state; + u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; + u8 delta_state; + u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; +} _aligned_attribute(64); + +/* + * This structure represents a byte position in the preprocessed input data and + * a node in the graph of possible match/literal choices. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + */ +struct lzms_optimum_node { + + /* + * The cost of the lowest-cost path that has been found to reach this + * position. This can change as progressively lower cost paths are + * found to reach this position. */ - struct lzms_output_bitstream *os; + u32 cost; +#define INFINITE_COST UINT32_MAX - /* Number of symbols that have been written using this code far. Reset - * to 0 whenever the code is rebuilt. */ - u32 num_syms_written; + /* + * @item is the last item that was taken to reach this position to reach + * it with the stored @cost. This can change as progressively lower + * cost paths are found to reach this position. + * + * In some cases we look ahead more than one item. If we looked ahead n + * items to reach this position, then @item is the last item taken, + * @extra_items contains the other items ordered from second-to-last to + * first, and @num_extra_items is n - 1. + */ + unsigned num_extra_items; + struct lzms_item item; + struct lzms_item extra_items[2]; - /* When @num_syms_written reaches this number, the Huffman code must be - * rebuilt. 
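
As a quick illustration of the source encoding described in the 'struct lzms_item' comment above, items for a literal, an explicit offset LZ match, and a repeat offset LZ match could be constructed as follows (hypothetical helpers, shown for illustration only):

static struct lzms_item
example_literal_item(u8 byte)
{
	/* Literal: length 1, source is the byte itself */
	return (struct lzms_item) { .length = 1, .source = byte };
}

static struct lzms_item
example_explicit_lz_item(u32 length, u32 offset)
{
	/* Explicit offset LZ match: source is the offset, biased upwards so
	 * that the smallest values remain reserved for repeat offset indices */
	return (struct lzms_item) { .length = length,
				    .source = offset + (LZMS_NUM_LZ_REPS - 1) };
}

static struct lzms_item
example_rep_lz_item(u32 length, unsigned rep_idx)
{
	/* Repeat offset LZ match: source is the index into recent_lz_offsets */
	return (struct lzms_item) { .length = length, .source = rep_idx };
}
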
*/ - u32 rebuild_freq; + /* + * The adaptive state that exists at this position. This is filled in + * lazily, only after the minimum-cost path to this position is found. + * + * Note: the way the algorithm handles this adaptive state in the + * "minimum-cost" parse is actually only an approximation. It's + * possible for the globally optimal, minimum cost path to contain a + * prefix, ending at a position, where that path prefix is *not* the + * minimum cost path to that position. This can happen if such a path + * prefix results in a different adaptive state which results in lower + * costs later. Although the algorithm does do some heuristic + * multi-item lookaheads, it does not solve this problem in general. + * + * Note: this adaptive state structure also does not include the + * probability entries or current Huffman codewords. Those aren't + * maintained per-position and are only updated occasionally. + */ + struct lzms_adaptive_state state; +} _aligned_attribute(64); - /* Number of symbols in the represented Huffman code. */ - unsigned num_syms; +/* The main compressor structure */ +struct lzms_compressor { - /* Running totals of symbol frequencies. These are diluted slightly - * whenever the code is rebuilt. */ - u32 sym_freqs[LZMS_MAX_NUM_SYMS]; + /* The matchfinder for LZ matches */ + struct lcpit_matchfinder mf; - /* The length, in bits, of each symbol in the Huffman code. */ - u8 lens[LZMS_MAX_NUM_SYMS]; + /* The preprocessed buffer of data being compressed */ + u8 *in_buffer; - /* The codeword of each symbol in the Huffman code. */ - u32 codewords[LZMS_MAX_NUM_SYMS]; -}; + /* The number of bytes of data to be compressed, which is the number of + * bytes of data in @in_buffer that are actually valid */ + size_t in_nbytes; -/* Internal compression parameters */ -struct lzms_compressor_params { - u32 min_match_length; - u32 nice_match_length; - u32 max_search_depth; - u32 optim_array_length; -}; + /* + * Boolean flags to enable consideration of various types of multi-step + * operations during parsing. + * + * Among other cases, multi-step operations can help with gaps where two + * matches are separated by a non-matching byte. + * + * This idea is borrowed from Igor Pavlov's LZMA encoder. + */ + bool try_lit_lzrep0; + bool try_lzrep_lit_lzrep0; + bool try_lzmatch_lit_lzrep0; + + /* + * If true, the compressor can use delta matches. This slows down + * compression. It improves the compression ratio greatly, slightly, or + * not at all, depending on the input data. + */ + bool use_delta_matches; -/* State of the LZMS compressor */ -struct lzms_compressor { + /* If true, the compressor need not preserve the input buffer if it + * compresses the data successfully. */ + bool destructive; + + /* 'last_target_usages' is a large array that is only needed for + * preprocessing, so it is in union with fields that don't need to be + * initialized until after preprocessing. 
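
Building on the 'struct lzms_optimum_node' description above, the core cost-relaxation step of a minimum-cost parse can be sketched as follows (illustrative only; the helper name is hypothetical, and the real parser additionally propagates the adaptive state and performs multi-item lookaheads):

static void
example_consider_item(struct lzms_optimum_node *cur_node,
		      u32 length, u32 source, u32 item_cost)
{
	/* If taking this item gives a cheaper way to reach the node 'length'
	 * bytes ahead, record the new cost and the item taken to get there. */
	u32 cost = cur_node->cost + item_cost;

	if (cost < (cur_node + length)->cost) {
		(cur_node + length)->cost = cost;
		(cur_node + length)->item =
			(struct lzms_item) { .length = length, .source = source };
		(cur_node + length)->num_extra_items = 0;
	}
}
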
*/ + union { + struct { - /* Internal compression parameters */ - struct lzms_compressor_params params; + /* Temporary space to store matches found by the LZ matchfinder */ + struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1]; - /* Data currently being compressed */ - u8 *cur_window; - u32 cur_window_size; + /* Hash table for finding delta matches */ + u32 delta_hash_table[DELTA_HASH_LENGTH]; - /* Lempel-Ziv match-finder */ - struct lz_mf *mf; + /* For each delta power, the hash code for the next sequence */ + u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER]; - /* Temporary space to store found matches */ - struct lz_match *matches; + /* The per-byte graph nodes for near-optimal parsing */ + struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH + + 1 + MAX_FAST_LENGTH]; - /* Per-position data for near-optimal parsing */ - struct lzms_mc_pos_data *optimum; - struct lzms_mc_pos_data *optimum_end; + /* Table: length => current cost for small match lengths */ + u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1]; - /* Raw range encoder which outputs to the beginning of the compressed - * data buffer, proceeding forwards */ - struct lzms_range_encoder_raw rc; + /* Range encoder which outputs to the beginning of the compressed data + * buffer, proceeding forwards */ + struct lzms_range_encoder rc; /* Bitstream which outputs to the end of the compressed data buffer, * proceeding backwards */ struct lzms_output_bitstream os; - /* Range encoders */ - struct lzms_range_encoder main_range_encoder; - struct lzms_range_encoder match_range_encoder; - struct lzms_range_encoder lz_match_range_encoder; - struct lzms_range_encoder lz_repeat_match_range_encoders[LZMS_NUM_RECENT_OFFSETS - 1]; - struct lzms_range_encoder delta_match_range_encoder; - struct lzms_range_encoder delta_repeat_match_range_encoders[LZMS_NUM_RECENT_OFFSETS - 1]; - - /* Huffman encoders */ - struct lzms_huffman_encoder literal_encoder; - struct lzms_huffman_encoder lz_offset_encoder; - struct lzms_huffman_encoder length_encoder; - struct lzms_huffman_encoder delta_power_encoder; - struct lzms_huffman_encoder delta_offset_encoder; - - /* Used for preprocessing */ - s32 last_target_usages[65536]; + /* States and probability entries for item type disambiguation */ + unsigned main_state; + unsigned match_state; + unsigned lz_state; + unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; + unsigned delta_state; + unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; + struct lzms_probabilites probs; -#define LZMS_NUM_FAST_LENGTHS 256 - /* Table: length => length slot for small lengths */ - u8 length_slot_fast[LZMS_NUM_FAST_LENGTHS]; + /* Huffman codes */ - /* Table: length => current cost for small match lengths */ - u32 length_cost_fast[LZMS_NUM_FAST_LENGTHS]; + struct lzms_huffman_rebuild_info literal_rebuild_info; + u32 literal_codewords[LZMS_NUM_LITERAL_SYMS]; + u8 literal_lens[LZMS_NUM_LITERAL_SYMS]; + u32 literal_freqs[LZMS_NUM_LITERAL_SYMS]; -#define LZMS_NUM_FAST_OFFSETS 32768 - /* Table: offset => offset slot for small offsets */ - u8 offset_slot_fast[LZMS_NUM_FAST_OFFSETS]; -}; + struct lzms_huffman_rebuild_info lz_offset_rebuild_info; + u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS]; + u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS]; + u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; -struct lzms_lz_lru_queue { - u32 recent_offsets[LZMS_NUM_RECENT_OFFSETS + 1]; - u32 prev_offset; - u32 upcoming_offset; -}; + struct lzms_huffman_rebuild_info length_rebuild_info; + u32 
length_codewords[LZMS_NUM_LENGTH_SYMS]; + u8 length_lens[LZMS_NUM_LENGTH_SYMS]; + u32 length_freqs[LZMS_NUM_LENGTH_SYMS]; -static void -lzms_init_lz_lru_queue(struct lzms_lz_lru_queue *queue) -{ - for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++) - queue->recent_offsets[i] = i + 1; + struct lzms_huffman_rebuild_info delta_offset_rebuild_info; + u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS]; + u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS]; + u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; - queue->prev_offset = 0; - queue->upcoming_offset = 0; -} + struct lzms_huffman_rebuild_info delta_power_rebuild_info; + u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS]; + u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS]; + u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS]; -static void -lzms_update_lz_lru_queue(struct lzms_lz_lru_queue *queue) -{ - if (queue->prev_offset != 0) { - for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--) - queue->recent_offsets[i + 1] = queue->recent_offsets[i]; - queue->recent_offsets[0] = queue->prev_offset; - } - queue->prev_offset = queue->upcoming_offset; -} + }; /* struct */ -/* - * Match chooser position data: - * - * An array of these structures is used during the near-optimal match-choosing - * algorithm. They correspond to consecutive positions in the window and are - * used to keep track of the cost to reach each position, and the match/literal - * choices that need to be chosen to reach that position. - */ -struct lzms_mc_pos_data { + s32 last_target_usages[65536]; - /* The cost, in bits, of the lowest-cost path that has been found to - * reach this position. This can change as progressively lower cost - * paths are found to reach this position. */ - u32 cost; -#define MC_INFINITE_COST UINT32_MAX + }; /* union */ - /* The match or literal that was taken to reach this position. This can - * change as progressively lower cost paths are found to reach this - * position. - * - * This variable is divided into two bitfields. - * - * Literals: - * Low bits are 1, high bits are the literal. - * - * Explicit offset matches: - * Low bits are the match length, high bits are the offset plus 2. - * - * Repeat offset matches: - * Low bits are the match length, high bits are the queue index. - */ - u64 mc_item_data; -#define MC_OFFSET_SHIFT 32 -#define MC_LEN_MASK (((u64)1 << MC_OFFSET_SHIFT) - 1) + /* Table: length => length slot for small match lengths */ + u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1]; - /* The LZMS adaptive state that exists at this position. This is filled - * in lazily, only after the minimum-cost path to this position is - * found. - * - * Note: the way we handle this adaptive state in the "minimum-cost" - * parse is actually only an approximation. It's possible for the - * globally optimal, minimum cost path to contain a prefix, ending at a - * position, where that path prefix is *not* the minimum cost path to - * that position. This can happen if such a path prefix results in a - * different adaptive state which results in lower costs later. We do - * not solve this problem; we only consider the lowest cost to reach - * each position, which seems to be an acceptable approximation. - * - * Note: this adaptive state also does not include the probability - * entries or current Huffman codewords. Those aren't maintained - * per-position and are only updated occassionally. 
*/ - struct lzms_adaptive_state { - struct lzms_lz_lru_queue lru; - u8 main_state; - u8 match_state; - u8 lz_match_state; - u8 lz_repeat_match_state[LZMS_NUM_RECENT_OFFSETS - 1]; - } state; + /* Tables for mapping offsets to offset slots */ + + /* slots [0, 167); 0 <= num_extra_bits <= 10 */ + u8 offset_slot_tab_1[0xe4a5]; + + /* slots [167, 427); 11 <= num_extra_bits <= 15 */ + u16 offset_slot_tab_2[0x3d0000 >> 11]; + + /* slots [427, 799); 16 <= num_extra_bits */ + u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16]; }; +/****************************************************************************** + * Offset and length slot acceleration * + ******************************************************************************/ + +/* Generate the acceleration table for length slots. */ static void -lzms_init_fast_slots(struct lzms_compressor *c) +lzms_init_fast_length_slot_tab(struct lzms_compressor *c) { - /* Create table mapping small lengths to length slots. */ - for (unsigned slot = 0, i = 0; i < LZMS_NUM_FAST_LENGTHS; i++) { - while (i >= lzms_length_slot_base[slot + 1]) + unsigned slot = 0; + for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) { + if (len >= lzms_length_slot_base[slot + 1]) slot++; - c->length_slot_fast[i] = slot; + c->fast_length_slot_tab[len] = slot; } +} - /* Create table mapping small offsets to offset slots. */ - for (unsigned slot = 0, i = 0; i < LZMS_NUM_FAST_OFFSETS; i++) { - while (i >= lzms_offset_slot_base[slot + 1]) +/* Generate the acceleration tables for offset slots. */ +static void +lzms_init_offset_slot_tabs(struct lzms_compressor *c) +{ + u32 offset; + unsigned slot = 0; + + /* slots [0, 167); 0 <= num_extra_bits <= 10 */ + for (offset = 1; offset < 0xe4a5; offset++) { + if (offset >= lzms_offset_slot_base[slot + 1]) slot++; - c->offset_slot_fast[i] = slot; + c->offset_slot_tab_1[offset] = slot; } -} -static inline unsigned -lzms_get_length_slot_fast(const struct lzms_compressor *c, u32 length) -{ - if (likely(length < LZMS_NUM_FAST_LENGTHS)) - return c->length_slot_fast[length]; - else - return lzms_get_length_slot(length); -} + /* slots [167, 427); 11 <= num_extra_bits <= 15 */ + for (; offset < 0x3de4a5; offset += (u32)1 << 11) { + if (offset >= lzms_offset_slot_base[slot + 1]) + slot++; + c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot; + } -static inline unsigned -lzms_get_offset_slot_fast(const struct lzms_compressor *c, u32 offset) -{ - if (offset < LZMS_NUM_FAST_OFFSETS) - return c->offset_slot_fast[offset]; - else - return lzms_get_offset_slot(offset); + /* slots [427, 799); 16 <= num_extra_bits */ + for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) { + if (offset >= lzms_offset_slot_base[slot + 1]) + slot++; + c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot; + } } -/* Initialize the output bitstream @os to write backwards to the specified - * compressed data buffer @out that is @out_limit 16-bit integers long. */ -static void -lzms_output_bitstream_init(struct lzms_output_bitstream *os, - le16 *out, size_t out_limit) +/* + * Return the length slot for the specified match length, using the compressor's + * acceleration table if the length is small enough. 
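
As a worked example of the mapping built by lzms_init_offset_slot_tabs() above: an offset of 1000 is below 0xe4a5, so its slot is read directly from offset_slot_tab_1[1000]; an offset of 100000 exceeds 0xe4a5 (58533), so the second table is consulted at index (100000 - 0xe4a5) >> 11 = 20; only offsets of roughly four megabytes and up fall through to offset_slot_tab_3.
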
+ */ +static inline unsigned +lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length) { - os->bitbuf = 0; - os->bitcount = 0; - os->next = out + out_limit; - os->begin = out; + if (likely(length <= MAX_FAST_LENGTH)) + return c->fast_length_slot_tab[length]; + return lzms_get_length_slot(length); } /* - * Write some bits, contained in the low @num_bits bits of @bits (ordered from - * high-order to low-order), to the output bitstream @os. - * - * @max_num_bits is a compile-time constant that specifies the maximum number of - * bits that can ever be written at this call site. + * Return the offset slot for the specified match offset, using the compressor's + * acceleration tables to speed up the mapping. */ -static inline void -lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os, - u32 bits, unsigned num_bits, - unsigned max_num_bits) +static inline unsigned +lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset) { - LZMS_ASSERT(num_bits <= 48); - - /* Add the bits to the bit buffer variable. */ - os->bitcount += num_bits; - os->bitbuf = (os->bitbuf << num_bits) | bits; - - /* Check whether any coding units need to be written. */ - while (os->bitcount >= 16) { - - os->bitcount -= 16; - - /* Write a coding unit, unless it would underflow the buffer. */ - if (os->next != os->begin) - put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); - - /* Optimization for call sites that never write more than 16 - * bits at once. */ - if (max_num_bits <= 16) - break; - } + if (offset < 0xe4a5) + return c->offset_slot_tab_1[offset]; + offset -= 0xe4a5; + if (offset < 0x3d0000) + return c->offset_slot_tab_2[offset >> 11]; + return c->offset_slot_tab_3[offset >> 16]; } -/* Flush the output bitstream, ensuring that all bits written to it have been - * written to memory. Returns %true if all bits have been output successfully, - * or %false if an overrun occurred. */ -static bool -lzms_output_bitstream_flush(struct lzms_output_bitstream *os) -{ - if (os->next == os->begin) - return false; - - if (os->bitcount != 0) - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); - - return true; -} +/****************************************************************************** + * Range encoding * + ******************************************************************************/ -/* Initialize the range encoder @rc to write forwards to the specified - * compressed data buffer @out that is @out_limit 16-bit integers long. */ +/* + * Initialize the range encoder @rc to write forwards to the specified buffer + * @out that is @size bytes long. + */ static void -lzms_range_encoder_raw_init(struct lzms_range_encoder_raw *rc, - le16 *out, size_t out_limit) +lzms_range_encoder_init(struct lzms_range_encoder *rc, u8 *out, size_t size) { - rc->low = 0; - rc->range = 0xffffffff; + rc->lower_bound = 0; + rc->range_size = 0xffffffff; rc->cache = 0; rc->cache_size = 1; rc->begin = out; - rc->next = out - 1; - rc->end = out + out_limit; + rc->next = out - sizeof(le16); + rc->end = out + (size & ~1); } /* * Attempt to flush bits from the range encoder. * - * Note: this is based on the public domain code for LZMA written by Igor - * Pavlov. The only differences in this function are that in LZMS the bits must - * be output in 16-bit coding units instead of 8-bit coding units, and that in - * LZMS the first coding unit is not ignored by the decompressor, so the encoder - * cannot output a dummy value to that position. 
+ * The basic idea is that we're writing bits from @rc->lower_bound to the + * output. However, due to carrying, the writing of coding units with the + * maximum value, as well as one prior coding unit, must be delayed until it is + * determined whether a carry is needed. + * + * This is based on the public domain code for LZMA written by Igor Pavlov, but + * with the following differences: * - * The basic idea is that we're writing bits from @rc->low to the output. - * However, due to carrying, the writing of coding units with value 0xffff, as - * well as one prior coding unit, must be delayed until it is determined whether - * a carry is needed. + * - In LZMS, 16-bit coding units are required rather than 8-bit. + * + * - In LZMS, the first coding unit is not ignored by the decompressor, so + * the encoder cannot output a dummy value to that position. */ static void -lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc) +lzms_range_encoder_shift_low(struct lzms_range_encoder *rc) { - if ((u32)(rc->low) < 0xffff0000 || - (u32)(rc->low >> 32) != 0) + if ((u32)(rc->lower_bound) < 0xffff0000 || + (u32)(rc->lower_bound >> 32) != 0) { - /* Carry not needed (rc->low < 0xffff0000), or carry occurred - * ((rc->low >> 32) != 0, a.k.a. the carry bit is 1). */ + /* Carry not needed (rc->lower_bound < 0xffff0000), or carry + * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit + * is 1). */ do { if (likely(rc->next >= rc->begin)) { if (rc->next != rc->end) { - put_unaligned_u16_le(rc->cache + - (u16)(rc->low >> 32), - rc->next++); + put_unaligned_le16(rc->cache + + (u16)(rc->lower_bound >> 32), + rc->next); + rc->next += sizeof(le16); } } else { - rc->next++; + rc->next += sizeof(le16); } rc->cache = 0xffff; } while (--rc->cache_size != 0); - rc->cache = (rc->low >> 16) & 0xffff; + rc->cache = (rc->lower_bound >> 16) & 0xffff; } ++rc->cache_size; - rc->low = (rc->low & 0xffff) << 16; -} - -static void -lzms_range_encoder_raw_normalize(struct lzms_range_encoder_raw *rc) -{ - if (rc->range <= 0xffff) { - rc->range <<= 16; - lzms_range_encoder_raw_shift_low(rc); - } + rc->lower_bound = (rc->lower_bound & 0xffff) << 16; } static bool -lzms_range_encoder_raw_flush(struct lzms_range_encoder_raw *rc) +lzms_range_encoder_flush(struct lzms_range_encoder *rc) { - for (unsigned i = 0; i < 4; i++) - lzms_range_encoder_raw_shift_low(rc); + for (int i = 0; i < 4; i++) + lzms_range_encoder_shift_low(rc); return rc->next != rc->end; } -/* Encode the next bit using the range encoder (raw version). +/* + * Encode the next bit using the range encoder. * - * @prob is the chance out of LZMS_PROBABILITY_MAX that the next bit is 0. */ + * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next + * bit is 0 rather than 1. + */ static inline void -lzms_range_encoder_raw_encode_bit(struct lzms_range_encoder_raw *rc, - int bit, u32 prob) +lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob) { - lzms_range_encoder_raw_normalize(rc); + /* Normalize if needed. */ + if (rc->range_size <= 0xffff) { + rc->range_size <<= 16; + lzms_range_encoder_shift_low(rc); + } - u32 bound = (rc->range >> LZMS_PROBABILITY_BITS) * prob; + u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob; if (bit == 0) { - rc->range = bound; + rc->range_size = bound; } else { - rc->low += bound; - rc->range -= bound; + rc->lower_bound += bound; + rc->range_size -= bound; } } -/* Encode a bit using the specified range encoder. 
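
As a concrete example of the arithmetic in lzms_range_encode_bit() above, assuming LZMS_PROBABILITY_BITS is 6 (probabilities out of 64): with range_size = 0x10000000 and prob = 48, bound = (0x10000000 >> 6) * 48 = 0x0c000000, so encoding a 0 bit narrows the range to 0x0c000000, while encoding a 1 bit adds 0x0c000000 to lower_bound and leaves 0x04000000 of range.
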
This wraps around - * lzms_range_encoder_raw_encode_bit() to handle using and updating the - * appropriate state and probability entry. */ -static void -lzms_range_encode_bit(struct lzms_range_encoder *enc, int bit) +/* + * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and + * updating the state and its corresponding probability entry. + */ +static inline void +lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states, + struct lzms_probability_entry *probs, + struct lzms_range_encoder *rc) { struct lzms_probability_entry *prob_entry; u32 prob; - /* Load the probability entry corresponding to the current state. */ - prob_entry = &enc->prob_entries[enc->state]; + /* Load the probability entry for the current state. */ + prob_entry = &probs[*state_p]; /* Update the state based on the next bit. */ - enc->state = ((enc->state << 1) | bit) & enc->mask; + *state_p = ((*state_p << 1) | bit) & (num_states - 1); /* Get the probability that the bit is 0. */ prob = lzms_get_probability(prob_entry); @@ -502,471 +571,710 @@ lzms_range_encode_bit(struct lzms_range_encoder *enc, int bit) /* Update the probability entry. */ lzms_update_probability_entry(prob_entry, bit); - /* Encode the bit. */ - lzms_range_encoder_raw_encode_bit(enc->rc, bit, prob); + /* Encode the bit using the range encoder. */ + lzms_range_encode_bit(rc, bit, prob); } -/* Called when an adaptive Huffman code needs to be rebuilt. */ +/* Helper functions for encoding bits in the various decision classes */ + static void -lzms_rebuild_huffman_code(struct lzms_huffman_encoder *enc) -{ - make_canonical_huffman_code(enc->num_syms, - LZMS_MAX_CODEWORD_LEN, - enc->sym_freqs, - enc->lens, - enc->codewords); - - /* Dilute the frequencies. */ - for (unsigned i = 0; i < enc->num_syms; i++) { - enc->sym_freqs[i] >>= 1; - enc->sym_freqs[i] += 1; - } - enc->num_syms_written = 0; +lzms_encode_main_bit(struct lzms_compressor *c, int bit) +{ + lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS, + c->probs.main, &c->rc); } -/* Encode a symbol using the specified Huffman encoder. */ -static inline void -lzms_huffman_encode_symbol(struct lzms_huffman_encoder *enc, unsigned sym) +static void +lzms_encode_match_bit(struct lzms_compressor *c, int bit) { - lzms_output_bitstream_put_varbits(enc->os, - enc->codewords[sym], - enc->lens[sym], - LZMS_MAX_CODEWORD_LEN); - ++enc->sym_freqs[sym]; - if (++enc->num_syms_written == enc->rebuild_freq) - lzms_rebuild_huffman_code(enc); + lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS, + c->probs.match, &c->rc); } static void -lzms_update_fast_length_costs(struct lzms_compressor *c); +lzms_encode_lz_bit(struct lzms_compressor *c, int bit) +{ + lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS, + c->probs.lz, &c->rc); +} -/* Encode a match length. 
*/ static void -lzms_encode_length(struct lzms_compressor *c, u32 length) +lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx) { - unsigned slot; - unsigned num_extra_bits; - u32 extra_bits; + lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS, + c->probs.lz_rep[idx], &c->rc); +} - slot = lzms_get_length_slot_fast(c, length); +static void +lzms_encode_delta_bit(struct lzms_compressor *c, int bit) +{ + lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS, + c->probs.delta, &c->rc); +} - extra_bits = length - lzms_length_slot_base[slot]; - num_extra_bits = lzms_extra_length_bits[slot]; +static void +lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx) +{ + lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS, + c->probs.delta_rep[idx], &c->rc); +} - lzms_huffman_encode_symbol(&c->length_encoder, slot); - if (c->length_encoder.num_syms_written == 0) - lzms_update_fast_length_costs(c); +/****************************************************************************** + * Huffman encoding and verbatim bits * + ******************************************************************************/ - lzms_output_bitstream_put_varbits(c->length_encoder.os, - extra_bits, num_extra_bits, 30); +/* + * Initialize the output bitstream @os to write backwards to the specified + * buffer @out that is @size bytes long. + */ +static void +lzms_output_bitstream_init(struct lzms_output_bitstream *os, + u8 *out, size_t size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->begin = out; + os->next = out + (size & ~1); } -/* Encode an LZ match offset. */ -static void -lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset) +/* + * Write some bits, contained in the low-order @num_bits bits of @bits, to the + * output bitstream @os. + * + * @max_num_bits is a compile-time constant that specifies the maximum number of + * bits that can ever be written at this call site. + */ +static inline void +lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits, + const unsigned num_bits, const unsigned max_num_bits) { - unsigned slot; - unsigned num_extra_bits; - u32 extra_bits; + /* Add the bits to the bit buffer variable. */ + os->bitcount += num_bits; + os->bitbuf = (os->bitbuf << num_bits) | bits; - slot = lzms_get_offset_slot_fast(c, offset); + /* Check whether any coding units need to be written. */ + while (os->bitcount >= 16) { + + os->bitcount -= 16; - extra_bits = offset - lzms_offset_slot_base[slot]; - num_extra_bits = lzms_extra_offset_bits[slot]; + /* Write a coding unit, unless it would underflow the buffer. */ + if (os->next != os->begin) { + os->next -= sizeof(le16); + put_unaligned_le16(os->bitbuf >> os->bitcount, os->next); + } - lzms_huffman_encode_symbol(&c->lz_offset_encoder, slot); - lzms_output_bitstream_put_varbits(c->lz_offset_encoder.os, - extra_bits, num_extra_bits, 30); + /* Optimization for call sites that never write more than 16 + * bits at once. */ + if (max_num_bits <= 16) + break; + } +} + +/* + * Flush the output bitstream, ensuring that all bits written to it have been + * written to memory. Return %true if all bits have been output successfully, + * or %false if an overrun occurred. + */ +static bool +lzms_output_bitstream_flush(struct lzms_output_bitstream *os) +{ + if (os->next == os->begin) + return false; + + if (os->bitcount != 0) { + os->next -= sizeof(le16); + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next); + } + + return true; } -/* Encode a literal byte. 
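
To illustrate lzms_write_bits() and lzms_output_bitstream_flush() above: writing a 12-bit value followed by a 9-bit value brings bitcount to 21, so the top 16 buffered bits are stored as one coding unit at the next lower position in the output buffer and 5 bits remain buffered; a final flush would emit those 5 bits left-aligned in one more coding unit (shifted left by 16 - 5 = 11).
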
*/ static void -lzms_encode_literal(struct lzms_compressor *c, unsigned literal) +lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info) { - /* Main bit: 0 = a literal, not a match. */ - lzms_range_encode_bit(&c->main_range_encoder, 0); + make_canonical_huffman_code(rebuild_info->num_syms, + LZMS_MAX_CODEWORD_LENGTH, + rebuild_info->freqs, + rebuild_info->lens, + rebuild_info->codewords); + rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq; +} - /* Encode the literal using the current literal Huffman code. */ - lzms_huffman_encode_symbol(&c->literal_encoder, literal); +static void +lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info, + unsigned num_syms, unsigned rebuild_freq, + u32 *codewords, u8 *lens, u32 *freqs) +{ + rebuild_info->num_syms = num_syms; + rebuild_info->rebuild_freq = rebuild_freq; + rebuild_info->codewords = codewords; + rebuild_info->lens = lens; + rebuild_info->freqs = freqs; + lzms_init_symbol_frequencies(freqs, num_syms); + lzms_build_huffman_code(rebuild_info); } -/* Encode an LZ repeat offset match. */ static void -lzms_encode_lz_repeat_offset_match(struct lzms_compressor *c, - u32 length, unsigned rep_index) +lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info) { - unsigned i; + lzms_build_huffman_code(rebuild_info); + lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms); +} - /* Main bit: 1 = a match, not a literal. */ - lzms_range_encode_bit(&c->main_range_encoder, 1); +/* + * Encode a symbol using the specified Huffman code. Then, if the Huffman code + * needs to be rebuilt, rebuild it and return true; otherwise return false. + */ +static inline bool +lzms_huffman_encode_symbol(unsigned sym, + const u32 *codewords, const u8 *lens, u32 *freqs, + struct lzms_output_bitstream *os, + struct lzms_huffman_rebuild_info *rebuild_info) +{ + lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH); + ++freqs[sym]; + if (--rebuild_info->num_syms_until_rebuild == 0) { + lzms_rebuild_huffman_code(rebuild_info); + return true; + } + return false; +} - /* Match bit: 0 = an LZ match, not a delta match. */ - lzms_range_encode_bit(&c->match_range_encoder, 0); +/* Helper routines to encode symbols using the various Huffman codes */ - /* LZ match bit: 1 = repeat offset, not an explicit offset. */ - lzms_range_encode_bit(&c->lz_match_range_encoder, 1); +static bool +lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->literal_codewords, + c->literal_lens, c->literal_freqs, + &c->os, &c->literal_rebuild_info); +} - /* Encode the repeat offset index. A 1 bit is encoded for each index - * passed up. This sequence of 1 bits is terminated by a 0 bit, or - * automatically when (LZMS_NUM_RECENT_OFFSETS - 1) 1 bits have been - * encoded. */ - for (i = 0; i < rep_index; i++) - lzms_range_encode_bit(&c->lz_repeat_match_range_encoders[i], 1); +static bool +lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords, + c->lz_offset_lens, c->lz_offset_freqs, + &c->os, &c->lz_offset_rebuild_info); +} - if (i < LZMS_NUM_RECENT_OFFSETS - 1) - lzms_range_encode_bit(&c->lz_repeat_match_range_encoders[i], 0); +static bool +lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->length_codewords, + c->length_lens, c->length_freqs, + &c->os, &c->length_rebuild_info); +} - /* Encode the match length. 
*/ - lzms_encode_length(c, length); +static bool +lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords, + c->delta_offset_lens, c->delta_offset_freqs, + &c->os, &c->delta_offset_rebuild_info); } -/* Encode an LZ explicit offset match. */ -static void -lzms_encode_lz_explicit_offset_match(struct lzms_compressor *c, - u32 length, u32 offset) +static bool +lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym) { - /* Main bit: 1 = a match, not a literal. */ - lzms_range_encode_bit(&c->main_range_encoder, 1); + return lzms_huffman_encode_symbol(sym, c->delta_power_codewords, + c->delta_power_lens, c->delta_power_freqs, + &c->os, &c->delta_power_rebuild_info); +} - /* Match bit: 0 = an LZ match, not a delta match. */ - lzms_range_encode_bit(&c->match_range_encoder, 0); +static void +lzms_update_fast_length_costs(struct lzms_compressor *c); - /* LZ match bit: 0 = explicit offset, not a repeat offset. */ - lzms_range_encode_bit(&c->lz_match_range_encoder, 0); +/* + * Encode a match length. If this causes the Huffman code for length symbols to + * be rebuilt, also update the length costs array used by the parser. + */ +static void +lzms_encode_length(struct lzms_compressor *c, u32 length) +{ + unsigned slot = lzms_comp_get_length_slot(c, length); - /* Encode the match offset. */ - lzms_encode_lz_offset(c, offset); + if (lzms_encode_length_symbol(c, slot)) + lzms_update_fast_length_costs(c); - /* Encode the match length. */ - lzms_encode_length(c, length); + lzms_write_bits(&c->os, length - lzms_length_slot_base[slot], + lzms_extra_length_bits[slot], + LZMS_MAX_EXTRA_LENGTH_BITS); } +/* Encode the offset of an LZ match. */ static void -lzms_encode_item(struct lzms_compressor *c, u64 mc_item_data) +lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset) { - u32 len = mc_item_data & MC_LEN_MASK; - u32 offset_data = mc_item_data >> MC_OFFSET_SHIFT; + unsigned slot = lzms_comp_get_offset_slot(c, offset); - if (len == 1) - lzms_encode_literal(c, offset_data); - else if (offset_data < LZMS_NUM_RECENT_OFFSETS) - lzms_encode_lz_repeat_offset_match(c, len, offset_data); - else - lzms_encode_lz_explicit_offset_match(c, len, offset_data - LZMS_OFFSET_OFFSET); + lzms_encode_lz_offset_symbol(c, slot); + lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot], + lzms_extra_offset_bits[slot], + LZMS_MAX_EXTRA_OFFSET_BITS); } -/* Encode a list of matches and literals chosen by the parsing algorithm. */ +/* Encode the raw offset of a delta match. */ static void -lzms_encode_item_list(struct lzms_compressor *c, - struct lzms_mc_pos_data *cur_optimum_ptr) +lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset) { - struct lzms_mc_pos_data *end_optimum_ptr; - u64 saved_item; - u64 item; - - /* The list is currently in reverse order (last item to first item). - * Reverse it. */ - end_optimum_ptr = cur_optimum_ptr; - saved_item = cur_optimum_ptr->mc_item_data; - do { - item = saved_item; - cur_optimum_ptr -= item & MC_LEN_MASK; - saved_item = cur_optimum_ptr->mc_item_data; - cur_optimum_ptr->mc_item_data = item; - } while (cur_optimum_ptr != c->optimum); + unsigned slot = lzms_comp_get_offset_slot(c, raw_offset); - /* Walk the list of items from beginning to end, encoding each item. 
*/ - do { - lzms_encode_item(c, cur_optimum_ptr->mc_item_data); - cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK; - } while (cur_optimum_ptr != end_optimum_ptr); + lzms_encode_delta_offset_symbol(c, slot); + lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot], + lzms_extra_offset_bits[slot], + LZMS_MAX_EXTRA_OFFSET_BITS); } -/* Each bit costs 1 << LZMS_COST_SHIFT units. */ -#define LZMS_COST_SHIFT 6 +/****************************************************************************** + * Item encoding * + ******************************************************************************/ -/*#define LZMS_RC_COSTS_USE_FLOATING_POINT*/ +/* Encode the specified item, which may be a literal or any type of match. */ +static void +lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source) +{ + /* Main bit: 0 = literal, 1 = match */ + int main_bit = (length > 1); + lzms_encode_main_bit(c, main_bit); + + if (!main_bit) { + /* Literal */ + unsigned literal = source; + lzms_encode_literal_symbol(c, literal); + } else { + /* Match */ -static u32 -lzms_rc_costs[LZMS_PROBABILITY_MAX + 1]; + /* Match bit: 0 = LZ match, 1 = delta match */ + int match_bit = (source & DELTA_SOURCE_TAG) != 0; + lzms_encode_match_bit(c, match_bit); -#ifdef LZMS_RC_COSTS_USE_FLOATING_POINT -# include -#endif + if (!match_bit) { + /* LZ match */ -static void -lzms_do_init_rc_costs(void) -{ - /* Fill in a table that maps range coding probabilities needed to code a - * bit X (0 or 1) to the number of bits (scaled by a constant factor, to - * handle fractional costs) needed to code that bit X. - * - * Consider the range of the range decoder. To eliminate exactly half - * the range (logical probability of 0.5), we need exactly 1 bit. For - * lower probabilities we need more bits and for higher probabilities we - * need fewer bits. In general, a logical probability of N will - * eliminate the proportion 1 - N of the range; this information takes - * log2(1 / N) bits to encode. - * - * The below loop is simply calculating this number of bits for each - * possible probability allowed by the LZMS compression format, but - * without using real numbers. To handle fractional probabilities, each - * cost is multiplied by (1 << LZMS_COST_SHIFT). These techniques are - * based on those used by LZMA. - * - * Note that in LZMS, a probability x really means x / 64, and 0 / 64 is - * really interpreted as 1 / 64 and 64 / 64 is really interpreted as - * 63 / 64. 
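
To make this fixed-point convention concrete: with the probability denominator of 64 and a cost scale of 1 << COST_SHIFT = 64, a probability entry that predicts a 0 bit with probability 48/64 gives a 0 bit a cost of about -log2(48/64) * 64 ≈ 27 units, while a 1 bit under the same entry costs -log2(16/64) * 64 = 128 units (exactly 2 bits).
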
- */ - for (u32 i = 0; i <= LZMS_PROBABILITY_MAX; i++) { - u32 prob = i; + /* LZ bit: 0 = explicit offset, 1 = repeat offset */ + int lz_bit = (source < LZMS_NUM_LZ_REPS); + lzms_encode_lz_bit(c, lz_bit); - if (prob == 0) - prob = 1; - else if (prob == LZMS_PROBABILITY_MAX) - prob = LZMS_PROBABILITY_MAX - 1; - - #ifdef LZMS_RC_COSTS_USE_FLOATING_POINT - lzms_rc_costs[i] = log2((double)LZMS_PROBABILITY_MAX / prob) * - (1 << LZMS_COST_SHIFT); - #else - u32 w = prob; - u32 bit_count = 0; - for (u32 j = 0; j < LZMS_COST_SHIFT; j++) { - w *= w; - bit_count <<= 1; - while (w >= ((u32)1 << 16)) { - w >>= 1; - ++bit_count; + if (!lz_bit) { + /* Explicit offset LZ match */ + u32 offset = source - (LZMS_NUM_LZ_REPS - 1); + lzms_encode_lz_offset(c, offset); + } else { + /* Repeat offset LZ match */ + int rep_idx = source; + for (int i = 0; i < rep_idx; i++) + lzms_encode_lz_rep_bit(c, 1, i); + if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) + lzms_encode_lz_rep_bit(c, 0, rep_idx); + } + } else { + /* Delta match */ + + source &= ~DELTA_SOURCE_TAG; + + /* Delta bit: 0 = explicit offset, 1 = repeat offset */ + int delta_bit = (source < LZMS_NUM_DELTA_REPS); + lzms_encode_delta_bit(c, delta_bit); + + if (!delta_bit) { + /* Explicit offset delta match */ + u32 power = source >> DELTA_SOURCE_POWER_SHIFT; + u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) - + (LZMS_NUM_DELTA_REPS - 1); + lzms_encode_delta_power_symbol(c, power); + lzms_encode_delta_raw_offset(c, raw_offset); + } else { + /* Repeat offset delta match */ + int rep_idx = source; + for (int i = 0; i < rep_idx; i++) + lzms_encode_delta_rep_bit(c, 1, i); + if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) + lzms_encode_delta_rep_bit(c, 0, rep_idx); } } - lzms_rc_costs[i] = (LZMS_PROBABILITY_BITS << LZMS_COST_SHIFT) - - (15 + bit_count); - #endif + + /* Match length (encoded the same way for any match type) */ + lzms_encode_length(c, length); } } +/* Encode a list of matches and literals chosen by the parsing algorithm. */ static void -lzms_init_rc_costs(void) +lzms_encode_nonempty_item_list(struct lzms_compressor *c, + struct lzms_optimum_node *end_node) { - static pthread_once_t once = PTHREAD_ONCE_INIT; + /* Since we've stored at each node the item we took to arrive at that + * node, we can trace our chosen path in backwards order. However, for + * encoding we need to trace our chosen path in forwards order. To make + * this possible, the following loop moves the items from their + * destination nodes to their source nodes, which effectively reverses + * the path. (Think of it like reversing a singly-linked list.) */ + struct lzms_optimum_node *cur_node = end_node; + struct lzms_item saved_item = cur_node->item; + do { + struct lzms_item item = saved_item; + if (cur_node->num_extra_items > 0) { + /* Handle an arrival via multi-item lookahead. */ + unsigned i = 0; + struct lzms_optimum_node *orig_node = cur_node; + do { + cur_node -= item.length; + cur_node->item = item; + item = orig_node->extra_items[i]; + } while (++i != orig_node->num_extra_items); + } + cur_node -= item.length; + saved_item = cur_node->item; + cur_node->item = item; + } while (cur_node != c->optimum_nodes); - pthread_once(&once, lzms_do_init_rc_costs); + /* Now trace the chosen path in forwards order, encoding each item. 
*/ + do { + lzms_encode_item(c, cur_node->item.length, cur_node->item.source); + cur_node += cur_node->item.length; + } while (cur_node != end_node); } -/* Return the cost to range-encode the specified bit from the specified state.*/ -static inline u32 -lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 cur_state, int bit) +static inline void +lzms_encode_item_list(struct lzms_compressor *c, + struct lzms_optimum_node *end_node) { - u32 prob_zero; - u32 prob_correct; + if (end_node != c->optimum_nodes) + lzms_encode_nonempty_item_list(c, end_node); +} - prob_zero = enc->prob_entries[cur_state].num_recent_zero_bits; +/****************************************************************************** + * Cost evaluation * + ******************************************************************************/ - if (bit == 0) - prob_correct = prob_zero; - else - prob_correct = LZMS_PROBABILITY_MAX - prob_zero; +/* + * If p is the predicted probability of the next bit being a 0, then the number + * of bits required to encode a 0 bit using a binary range encoder is the real + * number -log2(p), and the number of bits required to encode a 1 bit is the + * real number -log2(1 - p). To avoid computing either of these expressions at + * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from + * probability to cost for each possible probability. Specifically, the array + * indices are the numerators of the possible probabilities in LZMS, where the + * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the + * bit costs multiplied by 1< + +static void +lzms_compute_bit_costs(void) +{ + for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) { + u32 prob = i; + if (prob == 0) + prob++; + else if (prob == LZMS_PROBABILITY_DENOMINATOR) + prob--; - return lzms_rc_costs[prob_correct]; + lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) * + (1 << COST_SHIFT)); + } } +#endif -/* Return the cost to Huffman-encode the specified symbol. */ +/* Return the cost to encode a 0 bit in the specified context. */ static inline u32 -lzms_huffman_symbol_cost(const struct lzms_huffman_encoder *enc, unsigned sym) +lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs) { - return (u32)enc->lens[sym] << LZMS_COST_SHIFT; + return lzms_bit_costs[probs[state].num_recent_zero_bits]; } -/* Return the cost to encode the specified literal byte. */ +/* Return the cost to encode a 1 bit in the specified context. */ static inline u32 -lzms_literal_cost(const struct lzms_compressor *c, unsigned literal, - const struct lzms_adaptive_state *state) +lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs) { - return lzms_rc_bit_cost(&c->main_range_encoder, state->main_state, 0) + - lzms_huffman_symbol_cost(&c->literal_encoder, literal); + return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR - + probs[state].num_recent_zero_bits]; } -/* Update the table that directly provides the costs for small lengths. */ +/* Return the cost to encode a literal, including the main bit. */ +static inline u32 +lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal) +{ + return lzms_bit_0_cost(main_state, c->probs.main) + + ((u32)c->literal_lens[literal] << COST_SHIFT); +} + +/* Update 'fast_length_cost_tab' to use the latest Huffman code. 
*/ static void lzms_update_fast_length_costs(struct lzms_compressor *c) { - u32 len; int slot = -1; u32 cost = 0; - - for (len = 1; len < LZMS_NUM_FAST_LENGTHS; len++) { - - while (len >= lzms_length_slot_base[slot + 1]) { + for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) { + if (len >= lzms_length_slot_base[slot + 1]) { slot++; - cost = (u32)(c->length_encoder.lens[slot] + - lzms_extra_length_bits[slot]) << LZMS_COST_SHIFT; + cost = (u32)(c->length_lens[slot] + + lzms_extra_length_bits[slot]) << COST_SHIFT; } - - c->length_cost_fast[len] = cost; + c->fast_length_cost_tab[len] = cost; } } -/* Return the cost to encode the specified match length, which must be less than - * LZMS_NUM_FAST_LENGTHS. */ +/* Return the cost to encode the specified match length, which must not exceed + * MAX_FAST_LENGTH. */ static inline u32 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length) { - LZMS_ASSERT(length < LZMS_NUM_FAST_LENGTHS); - return c->length_cost_fast[length]; + return c->fast_length_cost_tab[length]; } /* Return the cost to encode the specified LZ match offset. */ static inline u32 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset) { - unsigned slot = lzms_get_offset_slot_fast(c, offset); - - return (u32)(c->lz_offset_encoder.lens[slot] + - lzms_extra_offset_bits[slot]) << LZMS_COST_SHIFT; + unsigned slot = lzms_comp_get_offset_slot(c, offset); + u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot]; + return num_bits << COST_SHIFT; } -/* - * Consider coding the match at repeat offset index @rep_idx. Consider each - * length from the minimum (2) to the full match length (@rep_len). - */ -static inline void -lzms_consider_lz_repeat_offset_match(const struct lzms_compressor *c, - struct lzms_mc_pos_data *cur_optimum_ptr, - u32 rep_len, unsigned rep_idx) +/* Return the cost to encode the specified delta power and raw offset. 
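
Putting the cost helpers above together, the cost of an explicit-offset LZ match could be assembled roughly as follows (a sketch only; the helper is hypothetical, the length must not exceed MAX_FAST_LENGTH, and the actual match-consideration code is not part of this hunk):

static u32
example_explicit_lz_match_cost(const struct lzms_compressor *c,
			       const struct lzms_adaptive_state *state,
			       u32 length, u32 offset)
{
	/* main bit = 1 (match), match bit = 0 (LZ), LZ bit = 0 (explicit) */
	return lzms_bit_1_cost(state->main_state, c->probs.main) +
	       lzms_bit_0_cost(state->match_state, c->probs.match) +
	       lzms_bit_0_cost(state->lz_state, c->probs.lz) +
	       lzms_lz_offset_cost(c, offset) +
	       lzms_fast_length_cost(c, length);
}
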
*/ +static inline u32 +lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset) { - u32 len; - u32 base_cost; - u32 cost; - unsigned i; - - base_cost = cur_optimum_ptr->cost; - - base_cost += lzms_rc_bit_cost(&c->main_range_encoder, - cur_optimum_ptr->state.main_state, 1); - - base_cost += lzms_rc_bit_cost(&c->match_range_encoder, - cur_optimum_ptr->state.match_state, 0); + unsigned slot = lzms_comp_get_offset_slot(c, raw_offset); + u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] + + lzms_extra_offset_bits[slot]; + return num_bits << COST_SHIFT; +} - base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder, - cur_optimum_ptr->state.lz_match_state, 1); +/****************************************************************************** + * Adaptive state * + ******************************************************************************/ - for (i = 0; i < rep_idx; i++) - base_cost += lzms_rc_bit_cost(&c->lz_repeat_match_range_encoders[i], - cur_optimum_ptr->state.lz_repeat_match_state[i], 1); +static void +lzms_init_adaptive_state(struct lzms_adaptive_state *state) +{ + for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++) + state->recent_lz_offsets[i] = i + 1; + state->prev_lz_offset = 0; + state->upcoming_lz_offset = 0; - if (i < LZMS_NUM_RECENT_OFFSETS - 1) - base_cost += lzms_rc_bit_cost(&c->lz_repeat_match_range_encoders[i], - cur_optimum_ptr->state.lz_repeat_match_state[i], 0); + for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++) + state->recent_delta_pairs[i] = i + 1; + state->prev_delta_pair = 0; + state->upcoming_delta_pair = 0; - len = 2; - do { - cost = base_cost + lzms_fast_length_cost(c, len); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - ((u64)rep_idx << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= rep_len); + state->main_state = 0; + state->match_state = 0; + state->lz_state = 0; + for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) + state->lz_rep_states[i] = 0; + state->delta_state = 0; + for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) + state->delta_rep_states[i] = 0; } /* - * Consider coding each match in @matches as an explicit offset match. - * - * @matches must be sorted by strictly increasing length and strictly increasing - * offset. This is guaranteed by the match-finder. + * Update the LRU queues for match sources when advancing by one item. * - * We consider each length from the minimum (2) to the longest - * (matches[num_matches - 1].len). For each length, we consider only the - * smallest offset for which that length is available. Although this is not - * guaranteed to be optimal due to the possibility of a larger offset costing - * less than a smaller offset to code, this is a very useful heuristic. 
+ * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more + * complex because: + * - there are separate queues for LZ and delta matches + * - updates to the queues are delayed by one encoded item (this prevents + * sources from being bumped up to index 0 too early) */ -static inline void -lzms_consider_lz_explicit_offset_matches(const struct lzms_compressor *c, - struct lzms_mc_pos_data *cur_optimum_ptr, - const struct lz_match matches[], - u32 num_matches) -{ - u32 len; - u32 i; - u32 base_cost; - u32 position_cost; - u32 cost; - - base_cost = cur_optimum_ptr->cost; - - base_cost += lzms_rc_bit_cost(&c->main_range_encoder, - cur_optimum_ptr->state.main_state, 1); - - base_cost += lzms_rc_bit_cost(&c->match_range_encoder, - cur_optimum_ptr->state.match_state, 0); +static void +lzms_update_lru_queues(struct lzms_adaptive_state *state) +{ + if (state->prev_lz_offset != 0) { + for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--) + state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i]; + state->recent_lz_offsets[0] = state->prev_lz_offset; + } + state->prev_lz_offset = state->upcoming_lz_offset; - base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder, - cur_optimum_ptr->state.lz_match_state, 0); - len = 2; - i = 0; - do { - position_cost = base_cost + lzms_lz_offset_cost(c, matches[i].offset); - do { - cost = position_cost + lzms_fast_length_cost(c, len); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - ((u64)(matches[i].offset + LZMS_OFFSET_OFFSET) - << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= matches[i].len); - } while (++i != num_matches); + if (state->prev_delta_pair != 0) { + for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--) + state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i]; + state->recent_delta_pairs[0] = state->prev_delta_pair; + } + state->prev_delta_pair = state->upcoming_delta_pair; } -static void -lzms_init_adaptive_state(struct lzms_adaptive_state *state) +static inline void +lzms_update_state(u8 *state_p, int bit, unsigned num_states) { - unsigned i; - - lzms_init_lz_lru_queue(&state->lru); - state->main_state = 0; - state->match_state = 0; - state->lz_match_state = 0; - for (i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++) - state->lz_repeat_match_state[i] = 0; + *state_p = ((*state_p << 1) | bit) & (num_states - 1); } static inline void lzms_update_main_state(struct lzms_adaptive_state *state, int is_match) { - state->main_state = ((state->main_state << 1) | is_match) % LZMS_NUM_MAIN_STATES; + lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS); } static inline void lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta) { - state->match_state = ((state->match_state << 1) | is_delta) % LZMS_NUM_MATCH_STATES; + lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS); +} + +static inline void +lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep) +{ + lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS); } static inline void -lzms_update_lz_match_state(struct lzms_adaptive_state *state, int is_repeat_offset) +lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx) { - state->lz_match_state = ((state->lz_match_state << 1) | is_repeat_offset) % LZMS_NUM_LZ_MATCH_STATES; + for (int i = 0; i < rep_idx; i++) + lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS); + + if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) + 
lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS); } static inline void -lzms_update_lz_repeat_match_state(struct lzms_adaptive_state *state, int rep_idx) +lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep) { - int i; + lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS); +} + +static inline void +lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx) +{ + for (int i = 0; i < rep_idx; i++) + lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS); + + if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) + lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS); +} + +/****************************************************************************** + * Matchfinding * + ******************************************************************************/ - for (i = 0; i < rep_idx; i++) - state->lz_repeat_match_state[i] = - ((state->lz_repeat_match_state[i] << 1) | 1) % - LZMS_NUM_LZ_REPEAT_MATCH_STATES; +/* Note: this code just handles finding delta matches. The code for finding LZ + * matches is elsewhere. */ - if (i < LZMS_NUM_RECENT_OFFSETS - 1) - state->lz_repeat_match_state[i] = - ((state->lz_repeat_match_state[i] << 1) | 0) % - LZMS_NUM_LZ_REPEAT_MATCH_STATES; + +/* Initialize the delta matchfinder for a new input buffer. */ +static void +lzms_init_delta_matchfinder(struct lzms_compressor *c) +{ + /* Set all entries to use an invalid power, which will never match. */ + STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT))); + memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table)); + + /* Initialize the next hash code for each power. We can just use zeroes + * initially; it doesn't really matter. */ + for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++) + c->next_delta_hashes[i] = 0; } +/* + * Compute a DELTA_HASH_ORDER-bit hash code for the first + * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a + * delta context with the specified @span. + */ +static inline u32 +lzms_delta_hash(const u8 *p, const u32 pos, u32 span) +{ + /* A delta match has a certain span and an offset that is a multiple of + * that span. To reduce wasted space we use a single combined hash + * table for all spans and positions, but to minimize collisions we + * include in the hash code computation the span and the low-order bits + * of the current position. */ + + STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3); + u8 d0 = *(p + 0) - *(p + 0 - span); + u8 d1 = *(p + 1) - *(p + 1 - span); + u8 d2 = *(p + 2) - *(p + 2 - span); + u32 v = ((span + (pos & (span - 1))) << 24) | + ((u32)d2 << 16) | ((u32)d1 << 8) | d0; + return lz_hash(v, DELTA_HASH_ORDER); +} + +/* + * Given a match between @in_next and @matchptr in a delta context with the + * specified @span and having the initial @len, extend the match as far as + * possible, up to a limit of @max_len. + */ +static inline u32 +lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr, + u32 len, u32 max_len, u32 span) +{ + while (len < max_len && + (u8)(*(in_next + len) - *(in_next + len - span)) == + (u8)(*(matchptr + len) - *(matchptr + len - span))) + { + len++; + } + return len; +} + +static void +lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c, + const u8 *in_next, u32 count) +{ + u32 pos = in_next - c->in_buffer; + if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1)) + return; + do { + /* Update the hash table for each power. 
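 *
 * (Illustration of what is being hashed: each power selects a span of
 * 1 << power bytes, and a delta match at that span compares byte differences
 * rather than bytes, i.e. b[i] - b[i - span].  With span 1, any two stretches
 * in which every byte is exactly one greater than its predecessor delta-match
 * each other, regardless of their absolute byte values.)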
*/ + for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { + const u32 span = (u32)1 << power; + if (unlikely(pos < span)) + continue; + const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span); + const u32 hash = c->next_delta_hashes[power]; + c->delta_hash_table[hash] = + (power << DELTA_SOURCE_POWER_SHIFT) | pos; + c->next_delta_hashes[power] = next_hash; + prefetchw(&c->delta_hash_table[next_hash]); + } + } while (in_next++, pos++, --count); +} + +/* + * Skip the next @count bytes (don't search for matches at them). @in_next + * points to the first byte to skip. The return value is @in_next + count. + */ +static const u8 * +lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next) +{ + lcpit_matchfinder_skip_bytes(&c->mf, count); + if (c->use_delta_matches) + lzms_delta_matchfinder_skip_bytes(c, in_next, count); + return in_next + count; +} + +/****************************************************************************** + * "Near-optimal" parsing * + ******************************************************************************/ + /* * The main near-optimal parsing routine. * @@ -977,250 +1285,702 @@ lzms_update_lz_repeat_match_state(struct lzms_adaptive_state *state, int rep_idx * can be reached using a match or literal from the current position. This is * essentially Dijkstra's algorithm in disguise: the graph nodes are positions, * the graph edges are possible matches/literals to code, and the cost of each - * edge is the estimated number of bits that will be required to output the - * corresponding match or literal. But one difference is that we actually - * compute the lowest-cost path in pieces, where each piece is terminated when - * there are no choices to be made. - * - * Notes: + * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be + * required to output the corresponding match or literal. But one difference is + * that we actually compute the lowest-cost path in pieces, where each piece is + * terminated when there are no choices to be made. * - * - This does not output any delta matches. - * - * - The costs of literals and matches are estimated using the range encoder - * states and the semi-adaptive Huffman codes. Except for range encoding - * states, costs are assumed to be constant throughout a single run of the - * parsing algorithm, which can parse up to @optim_array_length bytes of data. - * This introduces a source of inaccuracy because the probabilities and - * Huffman codes can change over this part of the data. + * The costs of literals and matches are estimated using the range encoder + * states and the semi-adaptive Huffman codes. Except for range encoding + * states, costs are assumed to be constant throughout a single run of the + * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This + * introduces a source of non-optimality because the probabilities and Huffman + * codes can change over this part of the data. And of course, there are + * various other reasons why the result isn't optimal in terms of compression + * ratio. 
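 *
 * A minimal sketch of the relaxation step that the code below performs for
 * every candidate edge (simplified; the real code also records how each node
 * was reached, and 'edge_cost' here stands for the estimated cost of the
 * match or literal being considered):
 *
 *	u32 cost = cur_node->cost + edge_cost;
 *	if (cost < (cur_node + len)->cost)
 *		(cur_node + len)->cost = cost;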
*/ static void lzms_near_optimal_parse(struct lzms_compressor *c) { - const u8 *window_ptr; - const u8 *window_end; - struct lzms_mc_pos_data *cur_optimum_ptr; - struct lzms_mc_pos_data *end_optimum_ptr; - u32 num_matches; - u32 longest_len; - u32 rep_max_len; - unsigned rep_max_idx; - unsigned literal; - unsigned i; - u32 cost; - u32 len; - u32 offset_data; + const u8 *in_next = c->in_buffer; + const u8 * const in_end = &c->in_buffer[c->in_nbytes]; + struct lzms_optimum_node *cur_node; + struct lzms_optimum_node *end_node; - window_ptr = c->cur_window; - window_end = window_ptr + c->cur_window_size; + /* Set initial length costs for lengths <= MAX_FAST_LENGTH. */ + lzms_update_fast_length_costs(c); - lzms_init_adaptive_state(&c->optimum[0].state); + /* Set up the initial adaptive state. */ + lzms_init_adaptive_state(&c->optimum_nodes[0].state); begin: /* Start building a new list of items, which will correspond to the next * piece of the overall minimum-cost path. */ - cur_optimum_ptr = c->optimum; - cur_optimum_ptr->cost = 0; - end_optimum_ptr = cur_optimum_ptr; - - /* States should currently be consistent with the encoders. */ - LZMS_ASSERT(cur_optimum_ptr->state.main_state == c->main_range_encoder.state); - LZMS_ASSERT(cur_optimum_ptr->state.match_state == c->match_range_encoder.state); - LZMS_ASSERT(cur_optimum_ptr->state.lz_match_state == c->lz_match_range_encoder.state); - for (i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++) - LZMS_ASSERT(cur_optimum_ptr->state.lz_repeat_match_state[i] == - c->lz_repeat_match_range_encoders[i].state); + cur_node = c->optimum_nodes; + cur_node->cost = 0; + end_node = cur_node; - if (window_ptr == window_end) + if (in_next == in_end) return; - /* The following loop runs once for each per byte in the window, except - * in a couple shortcut cases. */ + /* The following loop runs once for each per byte in the input buffer, + * except in a few shortcut cases. */ for (;;) { + u32 num_matches; - /* Find explicit offset matches with the current position. */ - num_matches = lz_mf_get_matches(c->mf, c->matches); - - if (num_matches) { - /* - * Find the longest repeat offset match with the current - * position. - * - * Heuristics: - * - * - Only search for repeat offset matches if the - * match-finder already found at least one match. - * - * - Only consider the longest repeat offset match. It - * seems to be rare for the optimal parse to include a - * repeat offset match that doesn't have the longest - * length (allowing for the possibility that not all - * of that length is actually used). - */ - if (likely(window_ptr - c->cur_window >= LZMS_MAX_INIT_RECENT_OFFSET)) { - BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3); - rep_max_len = lz_repsearch3(window_ptr, - window_end - window_ptr, - cur_optimum_ptr->state.lru.recent_offsets, - &rep_max_idx); - } else { - rep_max_len = 0; - } + /* Repeat offset LZ matches */ + if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS && + in_end - in_next >= 2)) + { + for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) { - if (rep_max_len) { - /* If there's a very long repeat offset match, - * choose it immediately. 
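 *
 * (Illustration with made-up numbers: if nice_match_len is 64 and a 300-byte
 * repeat match is found here, it is encoded greedily in one piece and the
 * graph search restarts just after it, instead of 299 separate lengths being
 * added to the graph as edges.)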
*/ - if (rep_max_len >= c->params.nice_match_length) { + /* Looking for a repeat offset LZ match at queue + * index @rep_idx */ - lz_mf_skip_positions(c->mf, rep_max_len - 1); - window_ptr += rep_max_len; + const u32 offset = cur_node->state.recent_lz_offsets[rep_idx]; + const u8 * const matchptr = in_next - offset; - if (cur_optimum_ptr != c->optimum) - lzms_encode_item_list(c, cur_optimum_ptr); + /* Check the first 2 bytes before entering the extension loop. */ + if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr)) + continue; - lzms_encode_lz_repeat_offset_match(c, rep_max_len, - rep_max_idx); + /* Extend the match to its full length. */ + const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next); - c->optimum[0].state = cur_optimum_ptr->state; + /* Early out for long repeat offset LZ match */ + if (rep_len >= c->mf.nice_match_len) { - lzms_update_main_state(&c->optimum[0].state, 1); - lzms_update_match_state(&c->optimum[0].state, 0); - lzms_update_lz_match_state(&c->optimum[0].state, 1); - lzms_update_lz_repeat_match_state(&c->optimum[0].state, - rep_max_idx); + in_next = lzms_skip_bytes(c, rep_len, in_next); - c->optimum[0].state.lru.upcoming_offset = - c->optimum[0].state.lru.recent_offsets[rep_max_idx]; + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, rep_len, rep_idx); - for (i = rep_max_idx; i < LZMS_NUM_RECENT_OFFSETS; i++) - c->optimum[0].state.lru.recent_offsets[i] = - c->optimum[0].state.lru.recent_offsets[i + 1]; + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; - lzms_update_lz_lru_queue(&c->optimum[0].state.lru); + cur_node->state.upcoming_lz_offset = + cur_node->state.recent_lz_offsets[rep_idx]; + cur_node->state.upcoming_delta_pair = 0; + for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++) + cur_node->state.recent_lz_offsets[i] = + cur_node->state.recent_lz_offsets[i + 1]; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 0); + lzms_update_lz_state(&cur_node->state, 1); + lzms_update_lz_rep_states(&cur_node->state, rep_idx); goto begin; } - /* If reaching any positions for the first time, - * initialize their costs to "infinity". 
*/ - while (end_optimum_ptr < cur_optimum_ptr + rep_max_len) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; + while (end_node < cur_node + rep_len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->probs.main) + + lzms_bit_0_cost(cur_node->state.match_state, + c->probs.match) + + lzms_bit_1_cost(cur_node->state.lz_state, + c->probs.lz); + + for (int i = 0; i < rep_idx; i++) + base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i], + c->probs.lz_rep[i]); + + if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) + base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx], + c->probs.lz_rep[rep_idx]); + + u32 len = 2; + do { + u32 cost = base_cost + lzms_fast_length_cost(c, len); + if (cost < (cur_node + len)->cost) { + (cur_node + len)->cost = cost; + (cur_node + len)->item = (struct lzms_item) { + .length = len, + .source = rep_idx, + }; + (cur_node + len)->num_extra_items = 0; + } + } while (++len <= rep_len); + + + /* try LZ-rep + lit + LZ-rep0 */ + if (c->try_lzrep_lit_lzrep0 && + in_end - (in_next + rep_len) >= 3 && + load_u16_unaligned(in_next + rep_len + 1) == + load_u16_unaligned(matchptr + rep_len + 1)) + { + const u32 rep0_len = lz_extend(in_next + rep_len + 1, + matchptr + rep_len + 1, + 2, + min(c->mf.nice_match_len, + in_end - (in_next + rep_len + 1))); + + unsigned main_state = cur_node->state.main_state; + unsigned match_state = cur_node->state.match_state; + unsigned lz_state = cur_node->state.lz_state; + unsigned lz_rep0_state = cur_node->state.lz_rep_states[0]; + + /* update states for LZ-rep */ + main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS; + match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS; + lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS; + lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) % + LZMS_NUM_LZ_REP_PROBS; + + /* LZ-rep cost */ + u32 cost = base_cost + lzms_fast_length_cost(c, rep_len); + + /* add literal cost */ + cost += lzms_literal_cost(c, main_state, *(in_next + rep_len)); + + /* update state for literal */ + main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; + + /* add LZ-rep0 cost */ + cost += lzms_bit_1_cost(main_state, c->probs.main) + + lzms_bit_0_cost(match_state, c->probs.match) + + lzms_bit_1_cost(lz_state, c->probs.lz) + + lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) + + lzms_fast_length_cost(c, rep0_len); + + const u32 total_len = rep_len + 1 + rep0_len; + + while (end_node < cur_node + total_len) + (++end_node)->cost = INFINITE_COST; + + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = (struct lzms_item) { + .length = rep0_len, + .source = 0, + }; + (cur_node + total_len)->extra_items[0] = (struct lzms_item) { + .length = 1, + .source = *(in_next + rep_len), + }; + (cur_node + total_len)->extra_items[1] = (struct lzms_item) { + .length = rep_len, + .source = rep_idx, + }; + (cur_node + total_len)->num_extra_items = 2; + } + } + } + } - /* Consider coding a repeat offset match. 
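 *
 * (Illustration with made-up offsets: if the recent-offsets queue currently
 * holds { 1000, 64, 3 } and the data repeats at distance 64, the match can be
 * coded as repeat index 1 plus a length, which is normally much cheaper than
 * coding the offset 64 explicitly through an offset slot.)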
*/ - lzms_consider_lz_repeat_offset_match(c, cur_optimum_ptr, - rep_max_len, rep_max_idx); + /* Repeat offset delta matches */ + if (c->use_delta_matches && + likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 && + (in_end - in_next >= 2))) + { + for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) { + + /* Looking for a repeat offset delta match at + * queue index @rep_idx */ + + const u32 pair = cur_node->state.recent_delta_pairs[rep_idx]; + const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT; + const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK; + const u32 span = (u32)1 << power; + const u32 offset = raw_offset << power; + const u8 * const matchptr = in_next - offset; + + /* Check the first 2 bytes before entering the + * extension loop. */ + if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != + (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || + ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != + (u8)(*(matchptr + 1) - *(matchptr + 1 - span)))) + continue; + + /* Extend the match to its full length. */ + const u32 rep_len = lzms_extend_delta_match(in_next, matchptr, + 2, in_end - in_next, + span); + + /* Early out for long repeat offset delta match */ + if (rep_len >= c->mf.nice_match_len) { + + in_next = lzms_skip_bytes(c, rep_len, in_next); + + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx); + + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; + + cur_node->state.upcoming_delta_pair = pair; + cur_node->state.upcoming_lz_offset = 0; + for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++) + cur_node->state.recent_delta_pairs[i] = + cur_node->state.recent_delta_pairs[i + 1]; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 1); + lzms_update_delta_state(&cur_node->state, 1); + lzms_update_delta_rep_states(&cur_node->state, rep_idx); + goto begin; + } + + while (end_node < cur_node + rep_len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->probs.main) + + lzms_bit_1_cost(cur_node->state.match_state, + c->probs.match) + + lzms_bit_1_cost(cur_node->state.delta_state, + c->probs.delta); + + for (int i = 0; i < rep_idx; i++) + base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i], + c->probs.delta_rep[i]); + + if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) + base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx], + c->probs.delta_rep[rep_idx]); + + u32 len = 2; + do { + u32 cost = base_cost + lzms_fast_length_cost(c, len); + if (cost < (cur_node + len)->cost) { + (cur_node + len)->cost = cost; + (cur_node + len)->item = (struct lzms_item) { + .length = len, + .source = DELTA_SOURCE_TAG | rep_idx, + }; + (cur_node + len)->num_extra_items = 0; + } + } while (++len <= rep_len); } + } - longest_len = c->matches[num_matches - 1].len; + /* Explicit offset LZ matches */ + num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches); + if (num_matches) { - /* If there's a very long explicit offset match, choose - * it immediately. 
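 *
 * (A note on how sources are packed into the items built below, assuming
 * LZMS_NUM_LZ_REPS is 3 as in the example: source values 0, 1 and 2 mean
 * "repeat offset at that queue index", and an explicit offset x is stored as
 * x + 2, so that source - (LZMS_NUM_LZ_REPS - 1) recovers the offset.)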
*/ - if (longest_len >= c->params.nice_match_length) { + u32 best_len = c->matches[0].length; - lz_mf_skip_positions(c->mf, longest_len - 1); - window_ptr += longest_len; + /* Early out for long explicit offset LZ match */ + if (best_len >= c->mf.nice_match_len) { - if (cur_optimum_ptr != c->optimum) - lzms_encode_item_list(c, cur_optimum_ptr); + const u32 offset = c->matches[0].offset; - lzms_encode_lz_explicit_offset_match(c, longest_len, - c->matches[num_matches - 1].offset); + /* Extend the match as far as possible. + * This is necessary because the LCP-interval + * tree matchfinder only reports up to + * nice_match_len bytes. */ + best_len = lz_extend(in_next, in_next - offset, + best_len, in_end - in_next); - c->optimum[0].state = cur_optimum_ptr->state; + in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1); - lzms_update_main_state(&c->optimum[0].state, 1); - lzms_update_match_state(&c->optimum[0].state, 0); - lzms_update_lz_match_state(&c->optimum[0].state, 0); + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1); - c->optimum[0].state.lru.upcoming_offset = - c->matches[num_matches - 1].offset; + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; - lzms_update_lz_lru_queue(&c->optimum[0].state.lru); + cur_node->state.upcoming_lz_offset = offset; + cur_node->state.upcoming_delta_pair = 0; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 0); + lzms_update_lz_state(&cur_node->state, 0); goto begin; } - /* If reaching any positions for the first time, - * initialize their costs to "infinity". */ - while (end_optimum_ptr < cur_optimum_ptr + longest_len) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; + while (end_node < cur_node + best_len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->probs.main) + + lzms_bit_0_cost(cur_node->state.match_state, + c->probs.match) + + lzms_bit_0_cost(cur_node->state.lz_state, + c->probs.lz); + + if (c->try_lzmatch_lit_lzrep0 && + likely(in_end - (in_next + c->matches[0].length) >= 3)) + { + /* try LZ-match + lit + LZ-rep0 */ + + u32 l = 2; + u32 i = num_matches - 1; + do { + const u32 len = c->matches[i].length; + const u32 offset = c->matches[i].offset; + const u32 position_cost = base_cost + + lzms_lz_offset_cost(c, offset); + do { + u32 cost = position_cost + lzms_fast_length_cost(c, l); + if (cost < (cur_node + l)->cost) { + (cur_node + l)->cost = cost; + (cur_node + l)->item = (struct lzms_item) { + .length = l, + .source = offset + (LZMS_NUM_LZ_REPS - 1), + }; + (cur_node + l)->num_extra_items = 0; + } + } while (++l <= len); + + const u8 * const matchptr = in_next - offset; + if (load_u16_unaligned(matchptr + len + 1) != + load_u16_unaligned(in_next + len + 1)) + continue; + + const u32 rep0_len = lz_extend(in_next + len + 1, + matchptr + len + 1, + 2, + min(c->mf.nice_match_len, + in_end - (in_next + len + 1))); + + unsigned main_state = cur_node->state.main_state; + unsigned match_state = cur_node->state.match_state; + unsigned lz_state = cur_node->state.lz_state; + + /* update states for LZ-match */ + main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS; + match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS; + lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS; + + /* LZ-match cost */ + u32 cost = position_cost + lzms_fast_length_cost(c, len); + + /* add literal cost */ + cost += 
lzms_literal_cost(c, main_state, *(in_next + len)); + + /* update state for literal */ + main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; + + /* add LZ-rep0 cost */ + cost += lzms_bit_1_cost(main_state, c->probs.main) + + lzms_bit_0_cost(match_state, c->probs.match) + + lzms_bit_1_cost(lz_state, c->probs.lz) + + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], + c->probs.lz_rep[0]) + + lzms_fast_length_cost(c, rep0_len); + + const u32 total_len = len + 1 + rep0_len; + + while (end_node < cur_node + total_len) + (++end_node)->cost = INFINITE_COST; + + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = (struct lzms_item) { + .length = rep0_len, + .source = 0, + }; + (cur_node + total_len)->extra_items[0] = (struct lzms_item) { + .length = 1, + .source = *(in_next + len), + }; + (cur_node + total_len)->extra_items[1] = (struct lzms_item) { + .length = len, + .source = offset + LZMS_NUM_LZ_REPS - 1, + }; + (cur_node + total_len)->num_extra_items = 2; + } + } while (i--); + } else { + u32 l = 2; + u32 i = num_matches - 1; + do { + u32 position_cost = base_cost + + lzms_lz_offset_cost(c, c->matches[i].offset); + do { + u32 cost = position_cost + lzms_fast_length_cost(c, l); + if (cost < (cur_node + l)->cost) { + (cur_node + l)->cost = cost; + (cur_node + l)->item = (struct lzms_item) { + .length = l, + .source = c->matches[i].offset + + (LZMS_NUM_LZ_REPS - 1), + }; + (cur_node + l)->num_extra_items = 0; + } + } while (++l <= c->matches[i].length); + } while (i--); + } + } - /* Consider coding an explicit offset match. */ - lzms_consider_lz_explicit_offset_matches(c, cur_optimum_ptr, - c->matches, num_matches); - } else { - /* No matches found. The only choice at this position - * is to code a literal. */ + /* Explicit offset delta matches */ + if (c->use_delta_matches && + likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1)) + { + const u32 pos = in_next - c->in_buffer; - if (end_optimum_ptr == cur_optimum_ptr) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; - } + /* Consider each possible power (log2 of span) */ + STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS); + for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { - /* Consider coding a literal. + const u32 span = (u32)1 << power; - * To avoid an extra unpredictable brench, actually checking the - * preferability of coding a literal is integrated into the - * adaptive state update code below. */ - literal = *window_ptr++; - cost = cur_optimum_ptr->cost + - lzms_literal_cost(c, literal, &cur_optimum_ptr->state); + if (unlikely(pos < span)) + continue; - /* Advance to the next position. */ - cur_optimum_ptr++; + const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span); + const u32 hash = c->next_delta_hashes[power]; + const u32 cur_match = c->delta_hash_table[hash]; - /* The lowest-cost path to the current position is now known. - * Finalize the adaptive state that results from taking this - * lowest-cost path. 
*/ + c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos; + c->next_delta_hashes[power] = next_hash; + prefetchw(&c->delta_hash_table[next_hash]); - if (cost < cur_optimum_ptr->cost) { - /* Literal */ - cur_optimum_ptr->cost = cost; - cur_optimum_ptr->mc_item_data = ((u64)literal << MC_OFFSET_SHIFT) | 1; + if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT) + continue; - cur_optimum_ptr->state = (cur_optimum_ptr - 1)->state; + const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK); - lzms_update_main_state(&cur_optimum_ptr->state, 0); + /* The offset must be a multiple of span. */ + if (offset & (span - 1)) + continue; - cur_optimum_ptr->state.lru.upcoming_offset = 0; - } else { - /* LZ match */ - len = cur_optimum_ptr->mc_item_data & MC_LEN_MASK; - offset_data = cur_optimum_ptr->mc_item_data >> MC_OFFSET_SHIFT; + const u8 * const matchptr = in_next - offset; - cur_optimum_ptr->state = (cur_optimum_ptr - len)->state; + /* Check the first 3 bytes before entering the + * extension loop. */ + STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3); + if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != + (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || + ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != + (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) || + ((u8)(*(in_next + 2) - *(in_next + 2 - span)) != + (u8)(*(matchptr + 2) - *(matchptr + 2 - span)))) + continue; - lzms_update_main_state(&cur_optimum_ptr->state, 1); - lzms_update_match_state(&cur_optimum_ptr->state, 0); + /* Extend the delta match to its full length. */ + const u32 len = lzms_extend_delta_match(in_next, + matchptr, + NBYTES_HASHED_FOR_DELTA, + in_end - in_next, + span); - if (offset_data >= LZMS_NUM_RECENT_OFFSETS) { + const u32 raw_offset = offset >> power; - /* Explicit offset LZ match */ + if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK - + (LZMS_NUM_DELTA_REPS - 1))) + continue; - lzms_update_lz_match_state(&cur_optimum_ptr->state, 0); + const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) | + raw_offset; + const u32 source = DELTA_SOURCE_TAG | + (pair + LZMS_NUM_DELTA_REPS - 1); - cur_optimum_ptr->state.lru.upcoming_offset = - offset_data - LZMS_OFFSET_OFFSET; - } else { - /* Repeat offset LZ match */ + /* Early out for long explicit offset delta match */ + if (len >= c->mf.nice_match_len) { + + in_next = lzms_skip_bytes(c, len - 1, in_next + 1); - lzms_update_lz_match_state(&cur_optimum_ptr->state, 1); - lzms_update_lz_repeat_match_state(&cur_optimum_ptr->state, - offset_data); + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, len, source); - cur_optimum_ptr->state.lru.upcoming_offset = - cur_optimum_ptr->state.lru.recent_offsets[offset_data]; + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; - for (i = offset_data; i < LZMS_NUM_RECENT_OFFSETS; i++) - cur_optimum_ptr->state.lru.recent_offsets[i] = - cur_optimum_ptr->state.lru.recent_offsets[i + 1]; + cur_node->state.upcoming_lz_offset = 0; + cur_node->state.upcoming_delta_pair = pair; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 1); + lzms_update_delta_state(&cur_node->state, 0); + goto begin; + } + + while (end_node < cur_node + len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->probs.main) + + lzms_bit_1_cost(cur_node->state.match_state, + c->probs.match) + + lzms_bit_0_cost(cur_node->state.delta_state, + c->probs.delta) + + 
lzms_delta_source_cost(c, power, raw_offset); + + u32 l = NBYTES_HASHED_FOR_DELTA; + do { + u32 cost = base_cost + lzms_fast_length_cost(c, l); + if (cost < (cur_node + l)->cost) { + (cur_node + l)->cost = cost; + (cur_node + l)->item = (struct lzms_item) { + .length = l, + .source = source, + }; + (cur_node + l)->num_extra_items = 0; + } + } while (++l <= len); } } - lzms_update_lz_lru_queue(&cur_optimum_ptr->state.lru); + /* Literal */ + if (end_node < cur_node + 1) + (++end_node)->cost = INFINITE_COST; + const u32 cur_and_lit_cost = cur_node->cost + + lzms_literal_cost(c, cur_node->state.main_state, + *in_next); + if (cur_and_lit_cost < (cur_node + 1)->cost) { + (cur_node + 1)->cost = cur_and_lit_cost; + (cur_node + 1)->item = (struct lzms_item) { + .length = 1, + .source = *in_next, + }; + (cur_node + 1)->num_extra_items = 0; + } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) { + /* try lit + LZ-rep0 */ + const u32 offset = + (cur_node->state.prev_lz_offset) ? + cur_node->state.prev_lz_offset : + cur_node->state.recent_lz_offsets[0]; + + if (load_u16_unaligned(in_next + 1) == + load_u16_unaligned(in_next + 1 - offset)) + { + const u32 rep0_len = lz_extend(in_next + 1, + in_next + 1 - offset, + 2, + min(in_end - (in_next + 1), + c->mf.nice_match_len)); + + unsigned main_state = cur_node->state.main_state; + + /* Update main_state after literal */ + main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS; + + /* Add cost of LZ-rep0 */ + const u32 cost = cur_and_lit_cost + + lzms_bit_1_cost(main_state, c->probs.main) + + lzms_bit_0_cost(cur_node->state.match_state, + c->probs.match) + + lzms_bit_1_cost(cur_node->state.lz_state, + c->probs.lz) + + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], + c->probs.lz_rep[0]) + + lzms_fast_length_cost(c, rep0_len); + + const u32 total_len = 1 + rep0_len; + + while (end_node < cur_node + total_len) + (++end_node)->cost = INFINITE_COST; + + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = (struct lzms_item) { + .length = rep0_len, + .source = 0, + }; + (cur_node + total_len)->extra_items[0] = (struct lzms_item) { + .length = 1, + .source = *in_next, + }; + (cur_node + total_len)->num_extra_items = 1; + } + } + } + + /* Advance to the next position. */ + in_next++; + cur_node++; + + /* The lowest-cost path to the current position is now known. + * Finalize the adaptive state that results from taking this + * lowest-cost path. 
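 *
 * A worked example of the bookkeeping below (simplified): a node that was
 * reached by the combination "LZ match, then literal, then LZ-rep0" was
 * stored as
 *
 *	item            = { .length = rep0_len, .source = 0 }
 *	extra_items[0]  = { .length = 1,         .source = the literal byte }
 *	extra_items[1]  = { .length = match_len, .source = the match's source }
 *	num_extra_items = 2
 *
 * so walking 'item' and then 'extra_items' in order steps backwards over the
 * rep0 match, the literal and the match to the node where the combination
 * started; the adaptive state updates are then replayed forwards from there.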
*/ + struct lzms_item item_to_take = cur_node->item; + struct lzms_optimum_node *source_node = cur_node - item_to_take.length; + int next_item_idx = -1; + for (unsigned i = 0; i < cur_node->num_extra_items; i++) { + item_to_take = cur_node->extra_items[i]; + source_node -= item_to_take.length; + next_item_idx++; + } + cur_node->state = source_node->state; + for (;;) { + const u32 length = item_to_take.length; + u32 source = item_to_take.source; + + cur_node->state.upcoming_lz_offset = 0; + cur_node->state.upcoming_delta_pair = 0; + if (length > 1) { + /* Match */ + + lzms_update_main_state(&cur_node->state, 1); + + if (source & DELTA_SOURCE_TAG) { + /* Delta match */ + + lzms_update_match_state(&cur_node->state, 1); + source &= ~DELTA_SOURCE_TAG; + + if (source >= LZMS_NUM_DELTA_REPS) { + /* Explicit offset delta match */ + lzms_update_delta_state(&cur_node->state, 0); + cur_node->state.upcoming_delta_pair = + source - (LZMS_NUM_DELTA_REPS - 1); + } else { + /* Repeat offset delta match */ + int rep_idx = source; + + lzms_update_delta_state(&cur_node->state, 1); + lzms_update_delta_rep_states(&cur_node->state, rep_idx); + + cur_node->state.upcoming_delta_pair = + cur_node->state.recent_delta_pairs[rep_idx]; + + for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++) + cur_node->state.recent_delta_pairs[i] = + cur_node->state.recent_delta_pairs[i + 1]; + } + } else { + lzms_update_match_state(&cur_node->state, 0); + + if (source >= LZMS_NUM_LZ_REPS) { + /* Explicit offset LZ match */ + lzms_update_lz_state(&cur_node->state, 0); + cur_node->state.upcoming_lz_offset = + source - (LZMS_NUM_LZ_REPS - 1); + } else { + /* Repeat offset LZ match */ + int rep_idx = source; + + lzms_update_lz_state(&cur_node->state, 1); + lzms_update_lz_rep_states(&cur_node->state, rep_idx); + + cur_node->state.upcoming_lz_offset = + cur_node->state.recent_lz_offsets[rep_idx]; + + for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++) + cur_node->state.recent_lz_offsets[i] = + cur_node->state.recent_lz_offsets[i + 1]; + } + } + } else { + /* Literal */ + lzms_update_main_state(&cur_node->state, 0); + } + + lzms_update_lru_queues(&cur_node->state); + + if (next_item_idx < 0) + break; + if (next_item_idx == 0) + item_to_take = cur_node->item; + else + item_to_take = cur_node->extra_items[next_item_idx - 1]; + --next_item_idx; + } /* * This loop will terminate when either of the following * conditions is true: * - * (1) cur_optimum_ptr == end_optimum_ptr + * (1) cur_node == end_node * * There are no paths that extend beyond the current * position. In this case, any path to a later position @@ -1228,7 +1988,7 @@ begin: * ahead and choose the list of items that led to this * position. * - * (2) cur_optimum_ptr == c->optimum_end + * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES] * * This bounds the number of times the algorithm can step * forward before it is guaranteed to start choosing items. @@ -1236,129 +1996,82 @@ begin: * the parser will not go too long without updating the * probability tables. * - * Note: no check for end-of-window is needed because - * end-of-window will trigger condition (1). + * Note: no check for end-of-buffer is needed because + * end-of-buffer will trigger condition (1). 
*/ - if (cur_optimum_ptr == end_optimum_ptr || - cur_optimum_ptr == c->optimum_end) + if (cur_node == end_node || + cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]) { - c->optimum[0].state = cur_optimum_ptr->state; - break; + lzms_encode_nonempty_item_list(c, cur_node); + c->optimum_nodes[0].state = cur_node->state; + goto begin; } } - - /* Output the current list of items that constitute the minimum-cost - * path to the current position. */ - lzms_encode_item_list(c, cur_optimum_ptr); - goto begin; } static void -lzms_init_range_encoder(struct lzms_range_encoder *enc, - struct lzms_range_encoder_raw *rc, u32 num_states) +lzms_init_states_and_probabilities(struct lzms_compressor *c) { - enc->rc = rc; - enc->state = 0; - LZMS_ASSERT(is_power_of_2(num_states)); - enc->mask = num_states - 1; - lzms_init_probability_entries(enc->prob_entries, num_states); + c->main_state = 0; + c->match_state = 0; + c->lz_state = 0; + for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) + c->lz_rep_states[i] = 0; + c->delta_state = 0; + for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) + c->delta_rep_states[i] = 0; + + lzms_init_probabilities(&c->probs); } static void -lzms_init_huffman_encoder(struct lzms_huffman_encoder *enc, - struct lzms_output_bitstream *os, - unsigned num_syms, - unsigned rebuild_freq) -{ - enc->os = os; - enc->num_syms_written = 0; - enc->rebuild_freq = rebuild_freq; - enc->num_syms = num_syms; - for (unsigned i = 0; i < num_syms; i++) - enc->sym_freqs[i] = 1; - - make_canonical_huffman_code(enc->num_syms, - LZMS_MAX_CODEWORD_LEN, - enc->sym_freqs, - enc->lens, - enc->codewords); -} - -/* Prepare the LZMS compressor for compressing a block of data. */ -static void -lzms_prepare_compressor(struct lzms_compressor *c, const u8 *udata, u32 ulen, - le16 *cdata, u32 clen16) +lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots) { - unsigned num_offset_slots; - - /* Copy the uncompressed data into the @c->cur_window buffer. */ - memcpy(c->cur_window, udata, ulen); - c->cur_window_size = ulen; - - /* Initialize the raw range encoder (writing forwards). */ - lzms_range_encoder_raw_init(&c->rc, cdata, clen16); - - /* Initialize the output bitstream for Huffman symbols and verbatim bits - * (writing backwards). */ - lzms_output_bitstream_init(&c->os, cdata, clen16); - - /* Calculate the number of offset slots required. */ - num_offset_slots = lzms_get_offset_slot(ulen - 1) + 1; - - /* Initialize a Huffman encoder for each alphabet. */ - lzms_init_huffman_encoder(&c->literal_encoder, &c->os, - LZMS_NUM_LITERAL_SYMS, - LZMS_LITERAL_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->lz_offset_encoder, &c->os, - num_offset_slots, - LZMS_LZ_OFFSET_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->length_encoder, &c->os, - LZMS_NUM_LENGTH_SYMS, - LZMS_LENGTH_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->delta_offset_encoder, &c->os, - num_offset_slots, - LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->delta_power_encoder, &c->os, - LZMS_NUM_DELTA_POWER_SYMS, - LZMS_DELTA_POWER_CODE_REBUILD_FREQ); - - /* Initialize range encoders, all of which wrap around the same - * lzms_range_encoder_raw. 
*/ - lzms_init_range_encoder(&c->main_range_encoder, - &c->rc, LZMS_NUM_MAIN_STATES); - - lzms_init_range_encoder(&c->match_range_encoder, - &c->rc, LZMS_NUM_MATCH_STATES); - - lzms_init_range_encoder(&c->lz_match_range_encoder, - &c->rc, LZMS_NUM_LZ_MATCH_STATES); - - for (unsigned i = 0; i < ARRAY_LEN(c->lz_repeat_match_range_encoders); i++) - lzms_init_range_encoder(&c->lz_repeat_match_range_encoders[i], - &c->rc, LZMS_NUM_LZ_REPEAT_MATCH_STATES); - - lzms_init_range_encoder(&c->delta_match_range_encoder, - &c->rc, LZMS_NUM_DELTA_MATCH_STATES); - - for (unsigned i = 0; i < ARRAY_LEN(c->delta_repeat_match_range_encoders); i++) - lzms_init_range_encoder(&c->delta_repeat_match_range_encoders[i], - &c->rc, LZMS_NUM_DELTA_REPEAT_MATCH_STATES); - - /* Set initial length costs for lengths < LZMS_NUM_FAST_LENGTHS. */ - lzms_update_fast_length_costs(c); + lzms_init_huffman_code(&c->literal_rebuild_info, + LZMS_NUM_LITERAL_SYMS, + LZMS_LITERAL_CODE_REBUILD_FREQ, + c->literal_codewords, + c->literal_lens, + c->literal_freqs); + + lzms_init_huffman_code(&c->lz_offset_rebuild_info, + num_offset_slots, + LZMS_LZ_OFFSET_CODE_REBUILD_FREQ, + c->lz_offset_codewords, + c->lz_offset_lens, + c->lz_offset_freqs); + + lzms_init_huffman_code(&c->length_rebuild_info, + LZMS_NUM_LENGTH_SYMS, + LZMS_LENGTH_CODE_REBUILD_FREQ, + c->length_codewords, + c->length_lens, + c->length_freqs); + + lzms_init_huffman_code(&c->delta_offset_rebuild_info, + num_offset_slots, + LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ, + c->delta_offset_codewords, + c->delta_offset_lens, + c->delta_offset_freqs); + + lzms_init_huffman_code(&c->delta_power_rebuild_info, + LZMS_NUM_DELTA_POWER_SYMS, + LZMS_DELTA_POWER_CODE_REBUILD_FREQ, + c->delta_power_codewords, + c->delta_power_lens, + c->delta_power_freqs); } -/* Flush the output streams, prepare the final compressed data, and return its +/* + * Flush the output streams, prepare the final compressed data, and return its * size in bytes. * * A return value of 0 indicates that the data could not be compressed to fit in - * the available space. */ + * the available space. + */ static size_t -lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail) +lzms_finalize(struct lzms_compressor *c) { size_t num_forwards_bytes; size_t num_backwards_bytes; @@ -1368,7 +2081,7 @@ lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail) if (!lzms_output_bitstream_flush(&c->os)) return 0; - if (!lzms_range_encoder_raw_flush(&c->rc)) + if (!lzms_range_encoder_flush(&c->rc)) return 0; if (c->rc.next > c->os.next) @@ -1379,186 +2092,122 @@ lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail) * bitstream. Move the data output by the backwards bitstream to be * adjacent to the data output by the forward bitstream, and calculate * the compressed size that this results in. */ - num_forwards_bytes = (u8*)c->rc.next - (u8*)cdata; - num_backwards_bytes = ((u8*)cdata + csize_avail) - (u8*)c->os.next; + num_forwards_bytes = c->rc.next - c->rc.begin; + num_backwards_bytes = c->rc.end - c->os.next; - memmove(cdata + num_forwards_bytes, c->os.next, num_backwards_bytes); + memmove(c->rc.next, c->os.next, num_backwards_bytes); return num_forwards_bytes + num_backwards_bytes; } -/* Set internal compression parameters for the specified compression level and - * maximum window size. */ -static void -lzms_build_params(unsigned int compression_level, - struct lzms_compressor_params *params) -{ - /* Allow length 2 matches if the compression level is sufficiently high. 
- */ - if (compression_level >= 45) - params->min_match_length = 2; - else - params->min_match_length = 3; - - /* Scale nice_match_length and max_search_depth with the compression - * level. But to allow an optimization on length cost calculations, - * don't allow nice_match_length to exceed LZMS_NUM_FAST_LENGTH. */ - params->nice_match_length = ((u64)compression_level * 32) / 50; - if (params->nice_match_length < params->min_match_length) - params->nice_match_length = params->min_match_length; - if (params->nice_match_length > LZMS_NUM_FAST_LENGTHS) - params->nice_match_length = LZMS_NUM_FAST_LENGTHS; - params->max_search_depth = compression_level; - - params->optim_array_length = 1024; -} - -/* Given the internal compression parameters and maximum window size, build the - * Lempel-Ziv match-finder parameters. */ -static void -lzms_build_mf_params(const struct lzms_compressor_params *lzms_params, - u32 max_window_size, struct lz_mf_params *mf_params) -{ - memset(mf_params, 0, sizeof(*mf_params)); - - /* Choose an appropriate match-finding algorithm. */ - if (max_window_size <= 2097152) - mf_params->algorithm = LZ_MF_BINARY_TREES; - else if (max_window_size <= 33554432) - mf_params->algorithm = LZ_MF_LCP_INTERVAL_TREE; - else - mf_params->algorithm = LZ_MF_LINKED_SUFFIX_ARRAY; - - mf_params->max_window_size = max_window_size; - mf_params->min_match_len = lzms_params->min_match_length; - mf_params->max_search_depth = lzms_params->max_search_depth; - mf_params->nice_match_len = lzms_params->nice_match_length; -} - -static void -lzms_free_compressor(void *_c); - static u64 -lzms_get_needed_memory(size_t max_block_size, unsigned int compression_level) +lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level, + bool destructive) { - struct lzms_compressor_params params; - struct lz_mf_params mf_params; u64 size = 0; - if (max_block_size > LZMS_MAX_BUFFER_SIZE) + if (max_bufsize > LZMS_MAX_BUFFER_SIZE) return 0; - lzms_build_params(compression_level, ¶ms); - lzms_build_mf_params(¶ms, max_block_size, &mf_params); - size += sizeof(struct lzms_compressor); - /* cur_window */ - size += max_block_size; + if (!destructive) + size += max_bufsize; /* in_buffer */ /* mf */ - size += lz_mf_get_needed_memory(mf_params.algorithm, max_block_size); - - /* matches */ - size += min(params.max_search_depth, params.nice_match_length) * - sizeof(struct lz_match); - - /* optimum */ - size += (params.optim_array_length + params.nice_match_length) * - sizeof(struct lzms_mc_pos_data); + size += lcpit_matchfinder_get_needed_memory(max_bufsize); return size; } static int -lzms_create_compressor(size_t max_block_size, unsigned int compression_level, - void **ctx_ret) +lzms_create_compressor(size_t max_bufsize, unsigned compression_level, + bool destructive, void **c_ret) { struct lzms_compressor *c; - struct lzms_compressor_params params; - struct lz_mf_params mf_params; - - if (max_block_size > LZMS_MAX_BUFFER_SIZE) - return WIMLIB_ERR_INVALID_PARAM; + u32 nice_match_len; - lzms_build_params(compression_level, ¶ms); - lzms_build_mf_params(¶ms, max_block_size, &mf_params); - if (!lz_mf_params_valid(&mf_params)) + if (max_bufsize > LZMS_MAX_BUFFER_SIZE) return WIMLIB_ERR_INVALID_PARAM; - c = CALLOC(1, sizeof(struct lzms_compressor)); + c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64); if (!c) - goto oom; - - c->params = params; + goto oom0; - c->cur_window = MALLOC(max_block_size); - if (!c->cur_window) - goto oom; + c->destructive = destructive; - c->mf = lz_mf_alloc(&mf_params); - if (!c->mf) - goto 
oom; + /* Scale nice_match_len with the compression level. But to allow an + * optimization for length cost calculations, don't allow nice_match_len + * to exceed MAX_FAST_LENGTH. */ + nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH); - c->matches = MALLOC(min(params.max_search_depth, - params.nice_match_length) * - sizeof(struct lz_match)); - if (!c->matches) - goto oom; + c->use_delta_matches = (compression_level >= 35); + c->try_lzmatch_lit_lzrep0 = (compression_level >= 45); + c->try_lit_lzrep0 = (compression_level >= 60); + c->try_lzrep_lit_lzrep0 = (compression_level >= 60); - c->optimum = MALLOC((params.optim_array_length + - params.nice_match_length) * - sizeof(struct lzms_mc_pos_data)); - if (!c->optimum) - goto oom; - c->optimum_end = &c->optimum[params.optim_array_length]; + if (!c->destructive) { + c->in_buffer = MALLOC(max_bufsize); + if (!c->in_buffer) + goto oom1; + } - lzms_init_rc_costs(); + if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len)) + goto oom2; - lzms_init_fast_slots(c); + lzms_init_fast_length_slot_tab(c); + lzms_init_offset_slot_tabs(c); - *ctx_ret = c; + *c_ret = c; return 0; -oom: - lzms_free_compressor(c); +oom2: + if (!c->destructive) + FREE(c->in_buffer); +oom1: + ALIGNED_FREE(c); +oom0: return WIMLIB_ERR_NOMEM; } static size_t -lzms_compress(const void *uncompressed_data, size_t uncompressed_size, - void *compressed_data, size_t compressed_size_avail, void *_c) +lzms_compress(const void *restrict in, size_t in_nbytes, + void *restrict out, size_t out_nbytes_avail, void *restrict _c) { struct lzms_compressor *c = _c; + size_t result; - /* Don't bother compressing extremely small inputs. */ - if (uncompressed_size < 4) + /* Don't bother trying to compress extremely small inputs. */ + if (in_nbytes < 4) return 0; - /* Cap the available compressed size to a 32-bit integer and round it - * down to the nearest multiple of 2. */ - if (compressed_size_avail > UINT32_MAX) - compressed_size_avail = UINT32_MAX; - if (compressed_size_avail & 1) - compressed_size_avail--; - - /* Initialize the compressor structures. */ - lzms_prepare_compressor(c, uncompressed_data, uncompressed_size, - compressed_data, compressed_size_avail / 2); - - /* Preprocess the uncompressed data. */ - lzms_x86_filter(c->cur_window, c->cur_window_size, - c->last_target_usages, false); - - /* Load the window into the match-finder. */ - lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size); - - /* Compute and encode a literal/match sequence that decompresses to the - * preprocessed data. */ + /* Copy the input data into the internal buffer and preprocess it. */ + if (c->destructive) + c->in_buffer = (void *)in; + else + memcpy(c->in_buffer, in, in_nbytes); + c->in_nbytes = in_nbytes; + lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false); + + /* Prepare the matchfinders. */ + lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes); + if (c->use_delta_matches) + lzms_init_delta_matchfinder(c); + + /* Initialize the encoder structures. */ + lzms_range_encoder_init(&c->rc, out, out_nbytes_avail); + lzms_output_bitstream_init(&c->os, out, out_nbytes_avail); + lzms_init_states_and_probabilities(c); + lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes)); + + /* The main loop: parse and encode. */ lzms_near_optimal_parse(c); /* Return the compressed data size or 0. 
*/ - return lzms_finalize(c, compressed_data, compressed_size_avail); + result = lzms_finalize(c); + if (!result && c->destructive) + lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true); + return result; } static void @@ -1566,13 +2215,10 @@ lzms_free_compressor(void *_c) { struct lzms_compressor *c = _c; - if (c) { - FREE(c->cur_window); - lz_mf_free(c->mf); - FREE(c->matches); - FREE(c->optimum); - FREE(c); - } + if (!c->destructive) + FREE(c->in_buffer); + lcpit_matchfinder_destroy(&c->mf); + ALIGNED_FREE(c); } const struct compressor_ops lzms_compressor_ops = {