From: Eric Biggers Date: Fri, 13 Feb 2015 01:01:06 +0000 (-0600) Subject: Rewrite of LZMS compressor X-Git-Tag: v1.8.0~24 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=a141cf94ae7562406ea1bc32c026ac85b37751a6 Rewrite of LZMS compressor - Delta match support - Multi-step item consideration in selected cases - Various refactoring; updates to comments and names; some changes to decompressor as well - Remove pthreads dependency --- diff --git a/Makefile.am b/Makefile.am index b6846759..21b40bcf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -61,7 +61,6 @@ libwim_la_SOURCES = \ src/lcpit_matchfinder.c \ src/lcpit_matchfinder_templates.h \ src/lookup_table.c \ - src/lz_repsearch.c \ src/lzms_common.c \ src/lzms_compress.c \ src/lzms_decompress.c \ @@ -125,7 +124,6 @@ libwim_la_SOURCES = \ include/wimlib/lookup_table.h \ include/wimlib/lz_extend.h \ include/wimlib/lz_hash.h \ - include/wimlib/lz_repsearch.h \ include/wimlib/lzms_common.h \ include/wimlib/lzms_constants.h \ include/wimlib/lzx_common.h \ diff --git a/include/wimlib/lz_repsearch.h b/include/wimlib/lz_repsearch.h deleted file mode 100644 index b3ab80aa..00000000 --- a/include/wimlib/lz_repsearch.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * lz_repsearch.h - * - * Fast searching for repeat offset matches. - * - * Author: Eric Biggers - * Year: 2014, 2015 - * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. - */ - -#ifndef _LZ_REPSEARCH_H -#define _LZ_REPSEARCH_H - -#include "wimlib/lz_extend.h" -#include "wimlib/unaligned.h" - -extern u32 -lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len); - -/* - * Given a pointer to the current string and a queue of 3 recent match offsets, - * find the longest repeat offset match. - * - * If no match of at least 2 bytes is found, then return 0. - * - * If a match of at least 2 bytes is found, then return its length and set - * *rep_max_idx_ret to the index of its offset in @recent_offsets. -*/ -static inline u32 -lz_repsearch3(const u8 * const strptr, const u32 max_len, - const u32 recent_offsets[3], unsigned *rep_max_idx_ret) -{ - unsigned rep_max_idx; - u32 rep_len; - u32 rep_max_len; - const u16 str = load_u16_unaligned(strptr); - const u8 *matchptr; - - matchptr = strptr - recent_offsets[0]; - if (load_u16_unaligned(matchptr) == str) - rep_max_len = lz_extend_repmatch(strptr, matchptr, max_len); - else - rep_max_len = 0; - rep_max_idx = 0; - - matchptr = strptr - recent_offsets[1]; - if (load_u16_unaligned(matchptr) == str) { - rep_len = lz_extend_repmatch(strptr, matchptr, max_len); - if (rep_len > rep_max_len) { - rep_max_len = rep_len; - rep_max_idx = 1; - } - } - - matchptr = strptr - recent_offsets[2]; - if (load_u16_unaligned(matchptr) == str) { - rep_len = lz_extend_repmatch(strptr, matchptr, max_len); - if (rep_len > rep_max_len) { - rep_max_len = rep_len; - rep_max_idx = 2; - } - } - - *rep_max_idx_ret = rep_max_idx; - return rep_max_len; -} - -#endif /* _LZ_REPSEARCH_H */ diff --git a/include/wimlib/lzms_common.h b/include/wimlib/lzms_common.h index 10dc4d02..1cb8c566 100644 --- a/include/wimlib/lzms_common.h +++ b/include/wimlib/lzms_common.h @@ -11,36 +11,6 @@ #include "wimlib/lzms_constants.h" #include "wimlib/types.h" -//#define ENABLE_LZMS_DEBUG -#ifdef ENABLE_LZMS_DEBUG -# define LZMS_DEBUG DEBUG -# define LZMS_ASSERT wimlib_assert -# include "wimlib/assert.h" -# include "wimlib/error.h" -#else -# define LZMS_DEBUG(format, ...) -# define LZMS_ASSERT(...) 
-#endif - -extern void -lzms_x86_filter(u8 data[], s32 size, s32 last_target_usages[], bool undo); - -/* Probability entry for use by the range coder when in a specific state. */ -struct lzms_probability_entry { - - /* Number of zeroes in the most recent LZMS_PROBABILITY_MAX bits that - * have been coded using this probability entry. This is a cached value - * because it can be computed as LZMS_PROBABILITY_MAX minus the number - * of bits set in the low-order LZMS_PROBABILITY_MAX bits of - * @recent_bits. */ - u32 num_recent_zero_bits; - - /* The most recent LZMS_PROBABILITY_MAX bits that have been coded using - * this probability entry. The size of this variable, in bits, must be - * at least LZMS_PROBABILITY_MAX. */ - u64 recent_bits; -}; - /* Offset slot tables */ extern const u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1]; extern const u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS]; @@ -69,42 +39,62 @@ lzms_get_length_slot(u32 length) extern unsigned lzms_get_num_offset_slots(size_t uncompressed_size); -extern void -lzms_init_probability_entries(struct lzms_probability_entry *entries, size_t count); + +/* Probability entry for use by the range coder when in a specific state */ +struct lzms_probability_entry { + + /* The number of zeroes in the most recent LZMS_PROBABILITY_DENOMINATOR + * bits that have been decoded or encoded using this probability entry. + * The probability of the next bit being 0 is this value over + * LZMS_PROBABILITY_DENOMINATOR, except for the cases where this would + * imply 0% or 100% probability. */ + u32 num_recent_zero_bits; + + /* The most recent LZMS_PROBABILITY_DENOMINATOR bits that have been + * coded using this probability entry. The bits are ordered such that + * low order is newest and high order is oldest. */ + u64 recent_bits; +}; extern void -lzms_init_symbol_frequencies(u32 freqs[], size_t num_syms); +lzms_init_probability_entries(struct lzms_probability_entry *entries, size_t count); -/* Given a decoded bit, update the probability entry. */ +/* Given a decoded or encoded bit, update the probability entry. */ static inline void -lzms_update_probability_entry(struct lzms_probability_entry *prob_entry, int bit) +lzms_update_probability_entry(struct lzms_probability_entry *entry, int bit) { - s32 delta_zero_bits; + BUILD_BUG_ON(LZMS_PROBABILITY_DENOMINATOR != sizeof(entry->recent_bits) * 8); - BUILD_BUG_ON(LZMS_PROBABILITY_MAX != sizeof(prob_entry->recent_bits) * 8); + s32 delta_zero_bits = (s32)(entry->recent_bits >> + (LZMS_PROBABILITY_DENOMINATOR - 1)) - bit; - delta_zero_bits = (s32)(prob_entry->recent_bits >> (LZMS_PROBABILITY_MAX - 1)) - bit; - - prob_entry->num_recent_zero_bits += delta_zero_bits; - prob_entry->recent_bits <<= 1; - prob_entry->recent_bits |= bit; + entry->num_recent_zero_bits += delta_zero_bits; + entry->recent_bits = (entry->recent_bits << 1) | bit; } -/* Given a probability entry, return the chance out of LZMS_PROBABILITY_MAX that - * the next decoded bit will be a 0. */ +/* Given a probability entry, return the chance out of + * LZMS_PROBABILITY_DENOMINATOR that the next decoded bit will be a 0. */ static inline u32 lzms_get_probability(const struct lzms_probability_entry *prob_entry) { - u32 prob; - - prob = prob_entry->num_recent_zero_bits; + u32 prob = prob_entry->num_recent_zero_bits; /* 0% and 100% probabilities aren't allowed. 
*/ if (prob == 0) prob++; - if (prob == LZMS_PROBABILITY_MAX) + else if (prob == LZMS_PROBABILITY_DENOMINATOR) prob--; return prob; } +extern void +lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms); + +extern void +lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms); + +/* Pre/post-processing */ +extern void +lzms_x86_filter(u8 data[], s32 size, s32 last_target_usages[], bool undo); + #endif /* _LZMS_COMMON_H */ diff --git a/include/wimlib/lzms_constants.h b/include/wimlib/lzms_constants.h index aa22aaf2..500be242 100644 --- a/include/wimlib/lzms_constants.h +++ b/include/wimlib/lzms_constants.h @@ -7,42 +7,76 @@ #ifndef _LZMS_CONSTANTS_H #define _LZMS_CONSTANTS_H -#define LZMS_MIN_MATCH_LEN 1 -#define LZMS_MAX_MATCH_LEN 1073809578 +/* Limits on match lengths and offsets (in bytes) */ +#define LZMS_MIN_MATCH_LENGTH 1 +#define LZMS_MAX_MATCH_LENGTH 1073809578 +#define LZMS_MIN_MATCH_OFFSET 1 #define LZMS_MAX_MATCH_OFFSET 1180427428 -#define LZMS_MAX_BUFFER_SIZE (LZMS_MAX_MATCH_OFFSET + 1) -#define LZMS_NUM_RECENT_OFFSETS 3 -#define LZMS_MAX_INIT_RECENT_OFFSET (LZMS_NUM_RECENT_OFFSETS + 1) -#define LZMS_OFFSET_OFFSET (LZMS_NUM_RECENT_OFFSETS - 1) +/* The value to which buffer sizes should be limited. Microsoft's + * implementation seems to use 67108864 (2^26) bytes. However, since the format + * itself supports higher match lengths and offsets, we'll use 2^30 bytes. */ +#define LZMS_MAX_BUFFER_SIZE 1073741824 +/* The length of each LRU queue for match sources: + * + * 1. offsets of LZ matches + * 2. (power, raw offset) pairs of delta matches */ +#define LZMS_NUM_LZ_REPS 3 +#define LZMS_NUM_DELTA_REPS 3 + +/* The numbers of binary decision classes needed for encoding queue indices */ +#define LZMS_NUM_LZ_REP_DECISIONS (LZMS_NUM_LZ_REPS - 1) +#define LZMS_NUM_DELTA_REP_DECISIONS (LZMS_NUM_DELTA_REPS - 1) + +/* These are the numbers of probability entries for each binary decision class. + * The logarithm base 2 of each of these values is the number of recently + * encoded bits that are remembered for each decision class and are used to + * select the appropriate probability entry when decoding/encoding the next + * binary decision (bit). */ +#define LZMS_NUM_MAIN_PROBS 16 +#define LZMS_NUM_MATCH_PROBS 32 +#define LZMS_NUM_LZ_PROBS 64 +#define LZMS_NUM_LZ_REP_PROBS 64 +#define LZMS_NUM_DELTA_PROBS 64 +#define LZMS_NUM_DELTA_REP_PROBS 64 + +/* These values define the precision for probabilities in LZMS, which are always + * given as a numerator; the denominator is implied. */ #define LZMS_PROBABILITY_BITS 6 -#define LZMS_PROBABILITY_MAX (1U << LZMS_PROBABILITY_BITS) -#define LZMS_INITIAL_PROBABILITY 48 -#define LZMS_INITIAL_RECENT_BITS 0x0000000055555555ULL +#define LZMS_PROBABILITY_DENOMINATOR (1 << LZMS_PROBABILITY_BITS) -#define LZMS_NUM_MAIN_STATES 16 -#define LZMS_NUM_MATCH_STATES 32 -#define LZMS_NUM_LZ_MATCH_STATES 64 -#define LZMS_NUM_LZ_REPEAT_MATCH_STATES 64 -#define LZMS_NUM_DELTA_MATCH_STATES 64 -#define LZMS_NUM_DELTA_REPEAT_MATCH_STATES 64 -#define LZMS_MAX_NUM_STATES 64 +/* These values define the initial state of each probability entry. 
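+ * In the initial recent_bits value, the low 32 bits alternate between 1 and
+ * 0, so 48 of the 64 remembered bits are 0; this agrees with the initial
+ * probability of 48 out of LZMS_PROBABILITY_DENOMINATOR (64).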
*/ +#define LZMS_INITIAL_PROBABILITY 48 +#define LZMS_INITIAL_RECENT_BITS 0x0000000055555555 +/* The number of symbols in each Huffman-coded alphabet */ #define LZMS_NUM_LITERAL_SYMS 256 #define LZMS_NUM_LENGTH_SYMS 54 #define LZMS_NUM_DELTA_POWER_SYMS 8 #define LZMS_MAX_NUM_OFFSET_SYMS 799 #define LZMS_MAX_NUM_SYMS 799 -#define LZMS_MAX_CODEWORD_LEN 15 - +/* The rebuild frequencies, in symbols, for each Huffman code */ #define LZMS_LITERAL_CODE_REBUILD_FREQ 1024 #define LZMS_LZ_OFFSET_CODE_REBUILD_FREQ 1024 #define LZMS_LENGTH_CODE_REBUILD_FREQ 512 #define LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ 1024 #define LZMS_DELTA_POWER_CODE_REBUILD_FREQ 512 +/* The maximum length, in bits, of any Huffman codeword. This is guaranteed by + * the way frequencies are updated. */ +#define LZMS_MAX_CODEWORD_LENGTH 15 + +/* The maximum number of verbatim bits, in addition to the Huffman-encoded + * length slot symbol, that may be required to encode a match length */ +#define LZMS_MAX_EXTRA_LENGTH_BITS 30 + +/* The maximum number of verbatim bits, in addition to the Huffman-encoded + * offset slot symbol, that may be required to encode a match offset */ +#define LZMS_MAX_EXTRA_OFFSET_BITS 30 + +/* Parameters for x86 machine code pre/post-processing */ #define LZMS_X86_ID_WINDOW_SIZE 65535 #define LZMS_X86_MAX_TRANSLATION_OFFSET 1023 diff --git a/src/lz_repsearch.c b/src/lz_repsearch.c deleted file mode 100644 index 6c43f0e9..00000000 --- a/src/lz_repsearch.c +++ /dev/null @@ -1,22 +0,0 @@ -/* - * lz_repsearch.c - * - * Fast searching for repeat offset matches. - * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. - * - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_repsearch.h" -#include "wimlib/lz_extend.h" - -u32 -lz_extend_repmatch(const u8 *strptr, const u8 *matchptr, u32 max_len) -{ - return lz_extend(strptr, matchptr, 2, max_len); -} diff --git a/src/lzms_common.c b/src/lzms_common.c index d986b1dd..dfe022b0 100644 --- a/src/lzms_common.c +++ b/src/lzms_common.c @@ -302,8 +302,8 @@ const u32 lzms_length_slot_base[LZMS_NUM_LENGTH_SYMS + 1] = { 0x0000009b, 0x000000ab, 0x000000cb, 0x000000eb, 0x0000012b, 0x000001ab, 0x000002ab, 0x000004ab, 0x000008ab, 0x000108ab, 0x400108ab, - /* The last entry is extra; it is equal to LZMS_MAX_MATCH_LEN + 1 and is - * here to aid binary search. */ + /* The last entry is extra; it is equal to LZMS_MAX_MATCH_LENGTH + 1 and + * is here to aid binary search. 
*/ }; /* Table: length slot => number of extra length bits */ @@ -355,10 +355,17 @@ lzms_init_probability_entries(struct lzms_probability_entry *entries, size_t cou } void -lzms_init_symbol_frequencies(u32 freqs[], size_t num_syms) +lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms) +{ + for (unsigned sym = 0; sym < num_syms; sym++) + freqs[sym] = 1; +} + +void +lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms) { - for (size_t i = 0; i < num_syms; i++) - freqs[i] = 1; + for (unsigned sym = 0; sym < num_syms; sym++) + freqs[sym] = (freqs[sym] >> 1) + 1; } /* @@ -528,8 +535,6 @@ lzms_x86_filter(u8 data[restrict], s32 size, have_opcode: if (undo) { if (i - last_x86_pos <= max_trans_offset) { - LZMS_DEBUG("Undid x86 translation at position %d " - "(opcode 0x%02x)", i, data[i]); void *p32 = &data[i + opcode_nbytes]; u32 n = get_unaligned_u32_le(p32); put_unaligned_u32_le(n - i, p32); @@ -538,8 +543,6 @@ lzms_x86_filter(u8 data[restrict], s32 size, } else { target16 = i + get_unaligned_u16_le(&data[i + opcode_nbytes]); if (i - last_x86_pos <= max_trans_offset) { - LZMS_DEBUG("Did x86 translation at position %d " - "(opcode 0x%02x)", i, data[i]); void *p32 = &data[i + opcode_nbytes]; u32 n = get_unaligned_u32_le(p32); put_unaligned_u32_le(n + i, p32); diff --git a/src/lzms_compress.c b/src/lzms_compress.c index 1e2a4fce..abf39f5e 100644 --- a/src/lzms_compress.c +++ b/src/lzms_compress.c @@ -5,7 +5,7 @@ */ /* - * Copyright (C) 2013, 2014 Eric Biggers + * Copyright (C) 2013, 2014, 2015 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -26,411 +26,482 @@ #endif #include -#include #include #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" -#include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/lcpit_matchfinder.h" -#include "wimlib/lz_repsearch.h" +#include "wimlib/lz_extend.h" +#include "wimlib/lz_hash.h" #include "wimlib/lzms_common.h" #include "wimlib/unaligned.h" #include "wimlib/util.h" -/* Stucture used for writing raw bits as a series of 16-bit little endian coding - * units. This starts at the *end* of the compressed data buffer and proceeds - * backwards. */ +/* + * MAX_FAST_LENGTH is the maximum match length for which the length slot can be + * looked up directly in 'fast_length_slot_tab' and the length cost can be + * looked up directly in 'fast_length_cost_tab'. + * + * We also limit the 'nice_match_len' parameter to this value. Consequently, if + * the longest match found is shorter than 'nice_match_len', then it must also + * be shorter than MAX_FAST_LENGTH. This makes it possible to do fast lookups + * of length costs using 'fast_length_cost_tab' without having to keep checking + * whether the length exceeds MAX_FAST_LENGTH or not. + */ +#define MAX_FAST_LENGTH 255 + +/* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will + * step forward before forcing the pending items to be encoded. If this value + * is increased, then there will be fewer forced flushes, but the probability + * entries and Huffman codes will be more likely to become outdated. */ +#define NUM_OPTIM_NODES 2048 + +/* COST_SHIFT is a scaling factor that makes it possible to consider fractional + * bit costs. A single bit has a cost of (1 << COST_SHIFT). 
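+ * For example, with a COST_SHIFT of 6, a bit whose predicted probability is
+ * 1/2 costs 64 units, while a 0 bit predicted with probability 48/64 costs
+ * about -log2(48/64) * 64 = 27 units.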
*/ +#define COST_SHIFT 6 + +/* Length of the hash table for finding delta matches */ +#define DELTA_HASH_ORDER 17 +#define DELTA_HASH_LENGTH ((u32)1 << DELTA_HASH_ORDER) + +/* The number of bytes to hash when finding delta matches; also taken to be the + * minimum length of an explicit offset delta match */ +#define NBYTES_HASHED_FOR_DELTA 3 + +/* The number of delta match powers to consider (must be <= + * LZMS_NUM_DELTA_POWER_SYMS) */ +#define NUM_POWERS_TO_CONSIDER 6 + +/* This structure tracks the state of writing bits as a series of 16-bit coding + * units, starting at the end of the output buffer and proceeding backwards. */ struct lzms_output_bitstream { - /* Bits that haven't yet been written to the output buffer. */ + /* Bits that haven't yet been written to the output buffer */ u64 bitbuf; - /* Number of bits currently held in @bitbuf. */ + /* Number of bits currently held in @bitbuf */ unsigned bitcount; - /* Pointer to one past the next position in the compressed data buffer - * at which to output a 16-bit coding unit. */ + /* Pointer to one past the next position in the output buffer at which + * to output a 16-bit coding unit */ le16 *next; - /* Pointer to the beginning of the output buffer. (The "end" when + /* Pointer to the beginning of the output buffer (this is the "end" when * writing backwards!) */ le16 *begin; }; -/* Stucture used for range encoding (raw version). This starts at the - * *beginning* of the compressed data buffer and proceeds forward. */ -struct lzms_range_encoder_raw { +/* This structure tracks the state of range encoding and its output, which + * starts at the beginning of the output buffer and proceeds forwards. */ +struct lzms_range_encoder { - /* A 33-bit variable that holds the low boundary of the current range. - * The 33rd bit is needed to catch carries. */ - u64 low; + /* The lower boundary of the current range. Logically, this is a 33-bit + * integer whose high bit is needed to detect carries. */ + u64 lower_bound; - /* Size of the current range. */ - u32 range; + /* The size of the current range */ + u32 range_size; - /* Next 16-bit coding unit to output. */ + /* The next 16-bit coding unit to output */ u16 cache; - /* Number of 16-bit coding units whose output has been delayed due to - * possible carrying. The first such coding unit is @cache; all + /* The number of 16-bit coding units whose output has been delayed due + * to possible carrying. The first such coding unit is @cache; all * subsequent such coding units are 0xffff. */ u32 cache_size; - /* Pointer to the beginning of the output buffer. */ + /* Pointer to the beginning of the output buffer */ le16 *begin; /* Pointer to the position in the output buffer at which the next coding - * unit must be written. */ + * unit must be written */ le16 *next; - /* Pointer just past the end of the output buffer. */ + /* Pointer to just past the end of the output buffer */ le16 *end; }; -/* Structure used for range encoding. This wraps around `struct - * lzms_range_encoder_raw' to use and maintain probability entries. */ -struct lzms_range_encoder { +/* Bookkeeping information for an adaptive Huffman code */ +struct lzms_huffman_rebuild_info { - /* Pointer to the raw range encoder, which has no persistent knowledge - * of probabilities. Multiple lzms_range_encoder's share the same - * lzms_range_encoder_raw. 
*/ - struct lzms_range_encoder_raw *rc; + /* The remaining number of symbols to encode until this code must be + * rebuilt */ + unsigned num_syms_until_rebuild; - /* Bits recently encoded by this range encoder. This is used as an - * index into @prob_entries. */ - u32 state; + /* The number of symbols in this code */ + unsigned num_syms; - /* Bitmask for @state to prevent its value from exceeding the number of - * probability entries. */ - u32 mask; + /* The rebuild frequency of this code, in symbols */ + unsigned rebuild_freq; - /* Probability entries being used for this range encoder. */ - struct lzms_probability_entry prob_entries[LZMS_MAX_NUM_STATES]; + /* The Huffman codeword of each symbol in this code */ + u32 *codewords; + + /* The length of each Huffman codeword, in bits */ + u8 *lens; + + /* The frequency of each symbol in this code */ + u32 *freqs; }; -/* Structure used for Huffman encoding. */ -struct lzms_huffman_encoder { +/* + * The compressor-internal representation of a match or literal. + * + * Literals have length=1; matches have length > 1. (We disallow matches of + * length 1, even though this is a valid length in LZMS.) + * + * The source is encoded as follows: + * + * - Literals: the literal byte itself + * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1) + * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets + * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are + * the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1) + * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is + * the index of the (power, raw_offset) pair in recent_delta_pairs + */ +struct lzms_item { + u32 length; + u32 source; +}; + +#define DELTA_SOURCE_TAG ((u32)1 << 31) +#define DELTA_SOURCE_POWER_SHIFT 28 +#define DELTA_SOURCE_RAW_OFFSET_MASK (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1) + +static inline void +check_that_powers_fit_in_bitfield(void) +{ + BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT))); +} + +/* A stripped-down version of the adaptive state in LZMS which excludes the + * probability entries and Huffman codes */ +struct lzms_adaptive_state { + + /* Recent offsets for LZ matches */ + u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1]; + u32 prev_lz_offset; /* 0 means none */ + u32 upcoming_lz_offset; /* 0 means none */ + + /* Recent (power, raw offset) pairs for delta matches. + * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw + * offset, and the high bits are the power. */ + u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1]; + u32 prev_delta_pair; /* 0 means none */ + u32 upcoming_delta_pair; /* 0 means none */ + + /* States for predicting the probabilities of item types */ + u8 main_state; + u8 match_state; + u8 lz_state; + u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; + u8 delta_state; + u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; +} _aligned_attribute(64); + +/* + * This structure represents a byte position in the preprocessed input data and + * a node in the graph of possible match/literal choices. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + */ +struct lzms_optimum_node { - /* Bitstream to write Huffman-encoded symbols and verbatim bits to. 
- * Multiple lzms_huffman_encoder's share the same lzms_output_bitstream.
+ */
- struct lzms_output_bitstream *os;
+ u32 cost;
+#define INFINITE_COST UINT32_MAX
- /* Number of symbols that have been written using this code far. Reset
- * to 0 whenever the code is rebuilt. */
- u32 num_syms_written;
+ /*
+ * @item is the last item that was taken to reach this position with the
+ * stored @cost. This can change as progressively lower cost paths are
+ * found to reach this position.
+ *
+ * In some cases we look ahead more than one item. If we looked ahead n
+ * items to reach this position, then @item is the last item taken,
+ * @extra_items contains the other items ordered from second-to-last to
+ * first, and @num_extra_items is n - 1.
+ */
+ unsigned num_extra_items;
+ struct lzms_item item;
+ struct lzms_item extra_items[2];
- /* When @num_syms_written reaches this number, the Huffman code must be
- * rebuilt. */
- u32 rebuild_freq;
+ /*
+ * The adaptive state that exists at this position. This is filled in
+ * lazily, only after the minimum-cost path to this position is found.
+ *
+ * Note: the way the algorithm handles this adaptive state in the
+ * "minimum-cost" parse is actually only an approximation. It's
+ * possible for the globally optimal, minimum cost path to contain a
+ * prefix, ending at a position, where that path prefix is *not* the
+ * minimum cost path to that position. This can happen if such a path
+ * prefix results in a different adaptive state which results in lower
+ * costs later. Although the algorithm does do some heuristic
+ * multi-item lookaheads, it does not solve this problem in general.
+ *
+ * Note: this adaptive state structure also does not include the
+ * probability entries or current Huffman codewords. Those aren't
+ * maintained per-position and are only updated occasionally.
+ */
+ struct lzms_adaptive_state state;
+} _aligned_attribute(64);
- /* Number of symbols in the represented Huffman code. */
- unsigned num_syms;
+/* The main compressor structure */
+struct lzms_compressor {
- /* Running totals of symbol frequencies. These are diluted slightly
- * whenever the code is rebuilt. */
- u32 sym_freqs[LZMS_MAX_NUM_SYMS];
+ /* The matchfinder for LZ matches */
+ struct lcpit_matchfinder mf;
- /* The length, in bits, of each symbol in the Huffman code. */
- u8 lens[LZMS_MAX_NUM_SYMS];
+ /* The preprocessed buffer of data being compressed */
+ u8 *in_buffer;
- /* The codeword of each symbol in the Huffman code. */
- u32 codewords[LZMS_MAX_NUM_SYMS];
-};
+ /* The number of bytes of data to be compressed, which is the number of
+ * bytes of data in @in_buffer that are actually valid */
+ size_t in_nbytes;
-/* Internal compression parameters */
-struct lzms_compressor_params {
- u32 min_match_length;
- u32 nice_match_length;
- u32 optim_array_length;
-};
+ /*
+ * Boolean flags to enable consideration of various types of multi-step
+ * operations during parsing.
+ *
+ * Among other cases, multi-step operations can help with gaps where two
+ * matches are separated by a non-matching byte.
+ *
+ * This idea is borrowed from Igor Pavlov's LZMA encoder.
+ */
+ bool try_lit_lzrep0;
+ bool try_lzrep_lit_lzrep0;
+ bool try_lzmatch_lit_lzrep0;
+
+ /*
+ * If true, the compressor can use delta matches. This slows down
+ * compression. 
It improves the compression ratio greatly, slightly, or + * not at all, depending on the input data. + */ + bool use_delta_matches; -/* State of the LZMS compressor */ -struct lzms_compressor { + /* 'last_target_usages' is a large array that is only needed for + * preprocessing, so it is in union with fields that don't need to be + * initialized until after preprocessing. */ + union { + struct { - /* Internal compression parameters */ - struct lzms_compressor_params params; + /* Temporary space to store matches found by the LZ matchfinder */ + struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1]; - /* Data currently being compressed */ - u8 *cur_window; - u32 cur_window_size; + /* Hash table for finding delta matches */ + u32 delta_hash_table[DELTA_HASH_LENGTH]; - /* Lempel-Ziv match-finder */ - struct lcpit_matchfinder mf; + /* For each delta power, the hash code for the next sequence */ + u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER]; - /* Temporary space to store found matches */ - struct lz_match *matches; + /* The per-byte graph nodes for near-optimal parsing */ + struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH]; - /* Per-position data for near-optimal parsing */ - struct lzms_mc_pos_data *optimum; - struct lzms_mc_pos_data *optimum_end; + /* Table: length => current cost for small match lengths */ + u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1]; - /* Raw range encoder which outputs to the beginning of the compressed - * data buffer, proceeding forwards */ - struct lzms_range_encoder_raw rc; + /* Range encoder which outputs to the beginning of the compressed data + * buffer, proceeding forwards */ + struct lzms_range_encoder rc; /* Bitstream which outputs to the end of the compressed data buffer, * proceeding backwards */ struct lzms_output_bitstream os; - /* Range encoders */ - struct lzms_range_encoder main_range_encoder; - struct lzms_range_encoder match_range_encoder; - struct lzms_range_encoder lz_match_range_encoder; - struct lzms_range_encoder lz_repeat_match_range_encoders[LZMS_NUM_RECENT_OFFSETS - 1]; - struct lzms_range_encoder delta_match_range_encoder; - struct lzms_range_encoder delta_repeat_match_range_encoders[LZMS_NUM_RECENT_OFFSETS - 1]; - - /* Huffman encoders */ - struct lzms_huffman_encoder literal_encoder; - struct lzms_huffman_encoder lz_offset_encoder; - struct lzms_huffman_encoder length_encoder; - struct lzms_huffman_encoder delta_power_encoder; - struct lzms_huffman_encoder delta_offset_encoder; - - /* Used for preprocessing */ + /* States and probability entries for item type disambiguation */ + unsigned main_state; + unsigned match_state; + unsigned lz_state; + unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; + unsigned delta_state; + unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; + struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS]; + struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS]; + struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS]; + struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS] + [LZMS_NUM_LZ_REP_PROBS]; + struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS]; + struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS] + [LZMS_NUM_DELTA_REP_PROBS]; + + /* Huffman codes */ + + struct lzms_huffman_rebuild_info literal_rebuild_info; + u32 literal_codewords[LZMS_NUM_LITERAL_SYMS]; + u8 literal_lens[LZMS_NUM_LITERAL_SYMS]; + u32 literal_freqs[LZMS_NUM_LITERAL_SYMS]; + + struct lzms_huffman_rebuild_info 
lz_offset_rebuild_info; + u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS]; + u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS]; + u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; + + struct lzms_huffman_rebuild_info length_rebuild_info; + u32 length_codewords[LZMS_NUM_LENGTH_SYMS]; + u8 length_lens[LZMS_NUM_LENGTH_SYMS]; + u32 length_freqs[LZMS_NUM_LENGTH_SYMS]; + + struct lzms_huffman_rebuild_info delta_offset_rebuild_info; + u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS]; + u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS]; + u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; + + struct lzms_huffman_rebuild_info delta_power_rebuild_info; + u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS]; + u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS]; + u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS]; + + }; /* struct */ + s32 last_target_usages[65536]; -#define LZMS_NUM_FAST_LENGTHS 256 - /* Table: length => length slot for small lengths */ - u8 length_slot_fast[LZMS_NUM_FAST_LENGTHS]; + }; /* union */ - /* Table: length => current cost for small match lengths */ - u32 length_cost_fast[LZMS_NUM_FAST_LENGTHS]; + /* Table: length => length slot for small match lengths */ + u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1]; -#define LZMS_NUM_FAST_OFFSETS 32768 - /* Table: offset => offset slot for small offsets */ - u8 offset_slot_fast[LZMS_NUM_FAST_OFFSETS]; -}; + /* Tables for mapping offsets to offset slots */ -struct lzms_lz_lru_queue { - u32 recent_offsets[LZMS_NUM_RECENT_OFFSETS + 1]; - u32 prev_offset; - u32 upcoming_offset; -}; + /* slots [0, 167); 0 <= num_extra_bits <= 10 */ + u8 offset_slot_tab_1[0xe4a5]; -static void -lzms_init_lz_lru_queue(struct lzms_lz_lru_queue *queue) -{ - for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++) - queue->recent_offsets[i] = i + 1; + /* slots [167, 427); 11 <= num_extra_bits <= 15 */ + u16 offset_slot_tab_2[0x3d0000 >> 11]; - queue->prev_offset = 0; - queue->upcoming_offset = 0; -} + /* slots [427, 799); 16 <= num_extra_bits */ + u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16]; +}; + +/****************************************************************************** + * Offset and length slot acceleration * + ******************************************************************************/ +/* Generate the acceleration table for length slots. */ static void -lzms_update_lz_lru_queue(struct lzms_lz_lru_queue *queue) +lzms_init_fast_length_slot_tab(struct lzms_compressor *c) { - if (queue->prev_offset != 0) { - for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--) - queue->recent_offsets[i + 1] = queue->recent_offsets[i]; - queue->recent_offsets[0] = queue->prev_offset; + unsigned slot = 0; + for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) { + if (len >= lzms_length_slot_base[slot + 1]) + slot++; + c->fast_length_slot_tab[len] = slot; } - queue->prev_offset = queue->upcoming_offset; } -/* - * Match chooser position data: - * - * An array of these structures is used during the near-optimal match-choosing - * algorithm. They correspond to consecutive positions in the window and are - * used to keep track of the cost to reach each position, and the match/literal - * choices that need to be chosen to reach that position. - */ -struct lzms_mc_pos_data { - - /* The cost, in bits, of the lowest-cost path that has been found to - * reach this position. This can change as progressively lower cost - * paths are found to reach this position. 
*/ - u32 cost; -#define MC_INFINITE_COST UINT32_MAX - - /* The match or literal that was taken to reach this position. This can - * change as progressively lower cost paths are found to reach this - * position. - * - * This variable is divided into two bitfields. - * - * Literals: - * Low bits are 1, high bits are the literal. - * - * Explicit offset matches: - * Low bits are the match length, high bits are the offset plus 2. - * - * Repeat offset matches: - * Low bits are the match length, high bits are the queue index. - */ - u64 mc_item_data; -#define MC_OFFSET_SHIFT 32 -#define MC_LEN_MASK (((u64)1 << MC_OFFSET_SHIFT) - 1) - - /* The LZMS adaptive state that exists at this position. This is filled - * in lazily, only after the minimum-cost path to this position is - * found. - * - * Note: the way we handle this adaptive state in the "minimum-cost" - * parse is actually only an approximation. It's possible for the - * globally optimal, minimum cost path to contain a prefix, ending at a - * position, where that path prefix is *not* the minimum cost path to - * that position. This can happen if such a path prefix results in a - * different adaptive state which results in lower costs later. We do - * not solve this problem; we only consider the lowest cost to reach - * each position, which seems to be an acceptable approximation. - * - * Note: this adaptive state also does not include the probability - * entries or current Huffman codewords. Those aren't maintained - * per-position and are only updated occassionally. */ - struct lzms_adaptive_state { - struct lzms_lz_lru_queue lru; - u8 main_state; - u8 match_state; - u8 lz_match_state; - u8 lz_repeat_match_state[LZMS_NUM_RECENT_OFFSETS - 1]; - } state; -}; - +/* Generate the acceleration tables for offset slots. */ static void -lzms_init_fast_slots(struct lzms_compressor *c) +lzms_init_offset_slot_tabs(struct lzms_compressor *c) { - /* Create table mapping small lengths to length slots. */ - for (unsigned slot = 0, i = 0; i < LZMS_NUM_FAST_LENGTHS; i++) { - while (i >= lzms_length_slot_base[slot + 1]) + u32 offset; + unsigned slot = 0; + + /* slots [0, 167); 0 <= num_extra_bits <= 10 */ + for (offset = 1; offset < 0xe4a5; offset++) { + if (offset >= lzms_offset_slot_base[slot + 1]) slot++; - c->length_slot_fast[i] = slot; + c->offset_slot_tab_1[offset] = slot; } - /* Create table mapping small offsets to offset slots. */ - for (unsigned slot = 0, i = 0; i < LZMS_NUM_FAST_OFFSETS; i++) { - while (i >= lzms_offset_slot_base[slot + 1]) + /* slots [167, 427); 11 <= num_extra_bits <= 15 */ + for (; offset < 0x3de4a5; offset += (u32)1 << 11) { + if (offset >= lzms_offset_slot_base[slot + 1]) slot++; - c->offset_slot_fast[i] = slot; + c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot; } -} -static inline unsigned -lzms_get_length_slot_fast(const struct lzms_compressor *c, u32 length) -{ - if (likely(length < LZMS_NUM_FAST_LENGTHS)) - return c->length_slot_fast[length]; - else - return lzms_get_length_slot(length); + /* slots [427, 799); 16 <= num_extra_bits */ + for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) { + if (offset >= lzms_offset_slot_base[slot + 1]) + slot++; + c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot; + } } +/* + * Return the length slot for the specified match length, using the compressor's + * acceleration table if the length is small enough. 
+ */ static inline unsigned -lzms_get_offset_slot_fast(const struct lzms_compressor *c, u32 offset) -{ - if (offset < LZMS_NUM_FAST_OFFSETS) - return c->offset_slot_fast[offset]; - else - return lzms_get_offset_slot(offset); -} - -/* Initialize the output bitstream @os to write backwards to the specified - * compressed data buffer @out that is @out_limit 16-bit integers long. */ -static void -lzms_output_bitstream_init(struct lzms_output_bitstream *os, - le16 *out, size_t out_limit) +lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length) { - os->bitbuf = 0; - os->bitcount = 0; - os->next = out + out_limit; - os->begin = out; + if (likely(length <= MAX_FAST_LENGTH)) + return c->fast_length_slot_tab[length]; + return lzms_get_length_slot(length); } /* - * Write some bits, contained in the low @num_bits bits of @bits (ordered from - * high-order to low-order), to the output bitstream @os. - * - * @max_num_bits is a compile-time constant that specifies the maximum number of - * bits that can ever be written at this call site. + * Return the offset slot for the specified match offset, using the compressor's + * acceleration tables to speed up the mapping. */ -static inline void -lzms_output_bitstream_put_varbits(struct lzms_output_bitstream *os, - u32 bits, unsigned num_bits, - unsigned max_num_bits) +static inline unsigned +lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset) { - LZMS_ASSERT(num_bits <= 48); - - /* Add the bits to the bit buffer variable. */ - os->bitcount += num_bits; - os->bitbuf = (os->bitbuf << num_bits) | bits; - - /* Check whether any coding units need to be written. */ - while (os->bitcount >= 16) { - - os->bitcount -= 16; - - /* Write a coding unit, unless it would underflow the buffer. */ - if (os->next != os->begin) - put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); - - /* Optimization for call sites that never write more than 16 - * bits at once. */ - if (max_num_bits <= 16) - break; - } + if (offset < 0xe4a5) + return c->offset_slot_tab_1[offset]; + offset -= 0xe4a5; + if (offset < 0x3d0000) + return c->offset_slot_tab_2[offset >> 11]; + return c->offset_slot_tab_3[offset >> 16]; } -/* Flush the output bitstream, ensuring that all bits written to it have been - * written to memory. Returns %true if all bits have been output successfully, - * or %false if an overrun occurred. */ -static bool -lzms_output_bitstream_flush(struct lzms_output_bitstream *os) -{ - if (os->next == os->begin) - return false; - - if (os->bitcount != 0) - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); +/****************************************************************************** + * Range encoding * + ******************************************************************************/ - return true; -} - -/* Initialize the range encoder @rc to write forwards to the specified - * compressed data buffer @out that is @out_limit 16-bit integers long. */ +/* + * Initialize the range encoder @rc to write forwards to the specified buffer + * @out that is @count 16-bit integers long. 
+ */ static void -lzms_range_encoder_raw_init(struct lzms_range_encoder_raw *rc, - le16 *out, size_t out_limit) +lzms_range_encoder_init(struct lzms_range_encoder *rc, le16 *out, size_t count) { - rc->low = 0; - rc->range = 0xffffffff; + rc->lower_bound = 0; + rc->range_size = 0xffffffff; rc->cache = 0; rc->cache_size = 1; rc->begin = out; rc->next = out - 1; - rc->end = out + out_limit; + rc->end = out + count; } /* * Attempt to flush bits from the range encoder. * - * Note: this is based on the public domain code for LZMA written by Igor - * Pavlov. The only differences in this function are that in LZMS the bits must - * be output in 16-bit coding units instead of 8-bit coding units, and that in - * LZMS the first coding unit is not ignored by the decompressor, so the encoder - * cannot output a dummy value to that position. + * The basic idea is that we're writing bits from @rc->lower_bound to the + * output. However, due to carrying, the writing of coding units with the + * maximum value, as well as one prior coding unit, must be delayed until it is + * determined whether a carry is needed. + * + * This is based on the public domain code for LZMA written by Igor Pavlov, but + * with the following differences: * - * The basic idea is that we're writing bits from @rc->low to the output. - * However, due to carrying, the writing of coding units with value 0xffff, as - * well as one prior coding unit, must be delayed until it is determined whether - * a carry is needed. + * - In LZMS, 16-bit coding units are required rather than 8-bit. + * + * - In LZMS, the first coding unit is not ignored by the decompressor, so + * the encoder cannot output a dummy value to that position. */ static void -lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc) +lzms_range_encoder_shift_low(struct lzms_range_encoder *rc) { - if ((u32)(rc->low) < 0xffff0000 || - (u32)(rc->low >> 32) != 0) + if ((u32)(rc->lower_bound) < 0xffff0000 || + (u32)(rc->lower_bound >> 32) != 0) { - /* Carry not needed (rc->low < 0xffff0000), or carry occurred - * ((rc->low >> 32) != 0, a.k.a. the carry bit is 1). */ + /* Carry not needed (rc->lower_bound < 0xffff0000), or carry + * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit + * is 1). */ do { if (likely(rc->next >= rc->begin)) { if (rc->next != rc->end) { put_unaligned_u16_le(rc->cache + - (u16)(rc->low >> 32), + (u16)(rc->lower_bound >> 32), rc->next++); } } else { @@ -439,61 +510,61 @@ lzms_range_encoder_raw_shift_low(struct lzms_range_encoder_raw *rc) rc->cache = 0xffff; } while (--rc->cache_size != 0); - rc->cache = (rc->low >> 16) & 0xffff; + rc->cache = (rc->lower_bound >> 16) & 0xffff; } ++rc->cache_size; - rc->low = (rc->low & 0xffff) << 16; -} - -static void -lzms_range_encoder_raw_normalize(struct lzms_range_encoder_raw *rc) -{ - if (rc->range <= 0xffff) { - rc->range <<= 16; - lzms_range_encoder_raw_shift_low(rc); - } + rc->lower_bound = (rc->lower_bound & 0xffff) << 16; } static bool -lzms_range_encoder_raw_flush(struct lzms_range_encoder_raw *rc) +lzms_range_encoder_flush(struct lzms_range_encoder *rc) { - for (unsigned i = 0; i < 4; i++) - lzms_range_encoder_raw_shift_low(rc); + for (int i = 0; i < 4; i++) + lzms_range_encoder_shift_low(rc); return rc->next != rc->end; } -/* Encode the next bit using the range encoder (raw version). +/* + * Encode the next bit using the range encoder. * - * @prob is the chance out of LZMS_PROBABILITY_MAX that the next bit is 0. 
*/ + * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next + * bit is 0 rather than 1. + */ static inline void -lzms_range_encoder_raw_encode_bit(struct lzms_range_encoder_raw *rc, - int bit, u32 prob) +lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob) { - lzms_range_encoder_raw_normalize(rc); + /* Normalize if needed. */ + if (rc->range_size <= 0xffff) { + rc->range_size <<= 16; + lzms_range_encoder_shift_low(rc); + } - u32 bound = (rc->range >> LZMS_PROBABILITY_BITS) * prob; + u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob; if (bit == 0) { - rc->range = bound; + rc->range_size = bound; } else { - rc->low += bound; - rc->range -= bound; + rc->lower_bound += bound; + rc->range_size -= bound; } } -/* Encode a bit using the specified range encoder. This wraps around - * lzms_range_encoder_raw_encode_bit() to handle using and updating the - * appropriate state and probability entry. */ -static void -lzms_range_encode_bit(struct lzms_range_encoder *enc, int bit) +/* + * Encode a bit. This wraps around lzms_range_encode_bit() to handle using and + * updating the state and its corresponding probability entry. + */ +static inline void +lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states, + struct lzms_probability_entry *probs, + struct lzms_range_encoder *rc) { struct lzms_probability_entry *prob_entry; u32 prob; - /* Load the probability entry corresponding to the current state. */ - prob_entry = &enc->prob_entries[enc->state]; + /* Load the probability entry for the current state. */ + prob_entry = &probs[*state_p]; /* Update the state based on the next bit. */ - enc->state = ((enc->state << 1) | bit) & enc->mask; + *state_p = ((*state_p << 1) | bit) & (num_states - 1); /* Get the probability that the bit is 0. */ prob = lzms_get_probability(prob_entry); @@ -501,471 +572,706 @@ lzms_range_encode_bit(struct lzms_range_encoder *enc, int bit) /* Update the probability entry. */ lzms_update_probability_entry(prob_entry, bit); - /* Encode the bit. */ - lzms_range_encoder_raw_encode_bit(enc->rc, bit, prob); + /* Encode the bit using the range encoder. */ + lzms_range_encode_bit(rc, bit, prob); } -/* Called when an adaptive Huffman code needs to be rebuilt. */ +/* Helper functions for encoding bits in the various decision classes */ + static void -lzms_rebuild_huffman_code(struct lzms_huffman_encoder *enc) -{ - make_canonical_huffman_code(enc->num_syms, - LZMS_MAX_CODEWORD_LEN, - enc->sym_freqs, - enc->lens, - enc->codewords); - - /* Dilute the frequencies. */ - for (unsigned i = 0; i < enc->num_syms; i++) { - enc->sym_freqs[i] >>= 1; - enc->sym_freqs[i] += 1; - } - enc->num_syms_written = 0; +lzms_encode_main_bit(struct lzms_compressor *c, int bit) +{ + lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS, + c->main_probs, &c->rc); } -/* Encode a symbol using the specified Huffman encoder. 
*/ -static inline void -lzms_huffman_encode_symbol(struct lzms_huffman_encoder *enc, unsigned sym) +static void +lzms_encode_match_bit(struct lzms_compressor *c, int bit) { - lzms_output_bitstream_put_varbits(enc->os, - enc->codewords[sym], - enc->lens[sym], - LZMS_MAX_CODEWORD_LEN); - ++enc->sym_freqs[sym]; - if (++enc->num_syms_written == enc->rebuild_freq) - lzms_rebuild_huffman_code(enc); + lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS, + c->match_probs, &c->rc); } static void -lzms_update_fast_length_costs(struct lzms_compressor *c); +lzms_encode_lz_bit(struct lzms_compressor *c, int bit) +{ + lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS, + c->lz_probs, &c->rc); +} -/* Encode a match length. */ static void -lzms_encode_length(struct lzms_compressor *c, u32 length) +lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx) { - unsigned slot; - unsigned num_extra_bits; - u32 extra_bits; + lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS, + c->lz_rep_probs[idx], &c->rc); +} - slot = lzms_get_length_slot_fast(c, length); +static void +lzms_encode_delta_bit(struct lzms_compressor *c, int bit) +{ + lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS, + c->delta_probs, &c->rc); +} - extra_bits = length - lzms_length_slot_base[slot]; - num_extra_bits = lzms_extra_length_bits[slot]; +static void +lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx) +{ + lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS, + c->delta_rep_probs[idx], &c->rc); +} - lzms_huffman_encode_symbol(&c->length_encoder, slot); - if (c->length_encoder.num_syms_written == 0) - lzms_update_fast_length_costs(c); +/****************************************************************************** + * Huffman encoding and verbatim bits * + ******************************************************************************/ - lzms_output_bitstream_put_varbits(c->length_encoder.os, - extra_bits, num_extra_bits, 30); +/* + * Initialize the output bitstream @os to write backwards to the specified + * buffer @out that is @count 16-bit integers long. + */ +static void +lzms_output_bitstream_init(struct lzms_output_bitstream *os, + le16 *out, size_t count) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->next = out + count; + os->begin = out; } -/* Encode an LZ match offset. */ -static void -lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset) +/* + * Write some bits, contained in the low-order @num_bits bits of @bits, to the + * output bitstream @os. + * + * @max_num_bits is a compile-time constant that specifies the maximum number of + * bits that can ever be written at this call site. + */ +static inline void +lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits, + const unsigned num_bits, const unsigned max_num_bits) { - unsigned slot; - unsigned num_extra_bits; - u32 extra_bits; + /* Add the bits to the bit buffer variable. */ + os->bitcount += num_bits; + os->bitbuf = (os->bitbuf << num_bits) | bits; - slot = lzms_get_offset_slot_fast(c, offset); + /* Check whether any coding units need to be written. */ + while (os->bitcount >= 16) { - extra_bits = offset - lzms_offset_slot_base[slot]; - num_extra_bits = lzms_extra_offset_bits[slot]; + os->bitcount -= 16; - lzms_huffman_encode_symbol(&c->lz_offset_encoder, slot); - lzms_output_bitstream_put_varbits(c->lz_offset_encoder.os, - extra_bits, num_extra_bits, 30); + /* Write a coding unit, unless it would underflow the buffer. 
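+ * (Dropping units here is not treated as an error at this point; if the
+ * buffer fills up, lzms_output_bitstream_flush() reports the overrun by
+ * returning false.)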
*/ + if (os->next != os->begin) + put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); + + /* Optimization for call sites that never write more than 16 + * bits at once. */ + if (max_num_bits <= 16) + break; + } +} + +/* + * Flush the output bitstream, ensuring that all bits written to it have been + * written to memory. Return %true if all bits have been output successfully, + * or %false if an overrun occurred. + */ +static bool +lzms_output_bitstream_flush(struct lzms_output_bitstream *os) +{ + if (os->next == os->begin) + return false; + + if (os->bitcount != 0) + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); + + return true; } -/* Encode a literal byte. */ static void -lzms_encode_literal(struct lzms_compressor *c, unsigned literal) +lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info) { - /* Main bit: 0 = a literal, not a match. */ - lzms_range_encode_bit(&c->main_range_encoder, 0); + make_canonical_huffman_code(rebuild_info->num_syms, + LZMS_MAX_CODEWORD_LENGTH, + rebuild_info->freqs, + rebuild_info->lens, + rebuild_info->codewords); + rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq; +} - /* Encode the literal using the current literal Huffman code. */ - lzms_huffman_encode_symbol(&c->literal_encoder, literal); +static void +lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info, + unsigned num_syms, unsigned rebuild_freq, + u32 *codewords, u8 *lens, u32 *freqs) +{ + rebuild_info->num_syms = num_syms; + rebuild_info->rebuild_freq = rebuild_freq; + rebuild_info->codewords = codewords; + rebuild_info->lens = lens; + rebuild_info->freqs = freqs; + lzms_init_symbol_frequencies(freqs, num_syms); + lzms_build_huffman_code(rebuild_info); } -/* Encode an LZ repeat offset match. */ static void -lzms_encode_lz_repeat_offset_match(struct lzms_compressor *c, - u32 length, unsigned rep_index) +lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info) { - unsigned i; + lzms_build_huffman_code(rebuild_info); + lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms); +} - /* Main bit: 1 = a match, not a literal. */ - lzms_range_encode_bit(&c->main_range_encoder, 1); +/* + * Encode a symbol using the specified Huffman code. Then, if the Huffman code + * needs to be rebuilt, rebuild it and return true; otherwise return false. + */ +static inline bool +lzms_huffman_encode_symbol(unsigned sym, + const u32 *codewords, const u8 *lens, u32 *freqs, + struct lzms_output_bitstream *os, + struct lzms_huffman_rebuild_info *rebuild_info) +{ + lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH); + ++freqs[sym]; + if (--rebuild_info->num_syms_until_rebuild == 0) { + lzms_rebuild_huffman_code(rebuild_info); + return true; + } + return false; +} - /* Match bit: 0 = an LZ match, not a delta match. */ - lzms_range_encode_bit(&c->match_range_encoder, 0); +/* Helper routines to encode symbols using the various Huffman codes */ - /* LZ match bit: 1 = repeat offset, not an explicit offset. */ - lzms_range_encode_bit(&c->lz_match_range_encoder, 1); +static bool +lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->literal_codewords, + c->literal_lens, c->literal_freqs, + &c->os, &c->literal_rebuild_info); +} - /* Encode the repeat offset index. A 1 bit is encoded for each index - * passed up. 
This sequence of 1 bits is terminated by a 0 bit, or - * automatically when (LZMS_NUM_RECENT_OFFSETS - 1) 1 bits have been - * encoded. */ - for (i = 0; i < rep_index; i++) - lzms_range_encode_bit(&c->lz_repeat_match_range_encoders[i], 1); +static bool +lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords, + c->lz_offset_lens, c->lz_offset_freqs, + &c->os, &c->lz_offset_rebuild_info); +} - if (i < LZMS_NUM_RECENT_OFFSETS - 1) - lzms_range_encode_bit(&c->lz_repeat_match_range_encoders[i], 0); +static bool +lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->length_codewords, + c->length_lens, c->length_freqs, + &c->os, &c->length_rebuild_info); +} - /* Encode the match length. */ - lzms_encode_length(c, length); +static bool +lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym) +{ + return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords, + c->delta_offset_lens, c->delta_offset_freqs, + &c->os, &c->delta_offset_rebuild_info); } -/* Encode an LZ explicit offset match. */ -static void -lzms_encode_lz_explicit_offset_match(struct lzms_compressor *c, - u32 length, u32 offset) +static bool +lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym) { - /* Main bit: 1 = a match, not a literal. */ - lzms_range_encode_bit(&c->main_range_encoder, 1); + return lzms_huffman_encode_symbol(sym, c->delta_power_codewords, + c->delta_power_lens, c->delta_power_freqs, + &c->os, &c->delta_power_rebuild_info); +} - /* Match bit: 0 = an LZ match, not a delta match. */ - lzms_range_encode_bit(&c->match_range_encoder, 0); +static void +lzms_update_fast_length_costs(struct lzms_compressor *c); - /* LZ match bit: 0 = explicit offset, not a repeat offset. */ - lzms_range_encode_bit(&c->lz_match_range_encoder, 0); +/* + * Encode a match length. If this causes the Huffman code for length symbols to + * be rebuilt, also update the length costs array used by the parser. + */ +static void +lzms_encode_length(struct lzms_compressor *c, u32 length) +{ + unsigned slot = lzms_comp_get_length_slot(c, length); - /* Encode the match offset. */ - lzms_encode_lz_offset(c, offset); + if (lzms_encode_length_symbol(c, slot)) + lzms_update_fast_length_costs(c); - /* Encode the match length. */ - lzms_encode_length(c, length); + lzms_write_bits(&c->os, length - lzms_length_slot_base[slot], + lzms_extra_length_bits[slot], + LZMS_MAX_EXTRA_LENGTH_BITS); } +/* Encode the offset of an LZ match. */ static void -lzms_encode_item(struct lzms_compressor *c, u64 mc_item_data) +lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset) { - u32 len = mc_item_data & MC_LEN_MASK; - u32 offset_data = mc_item_data >> MC_OFFSET_SHIFT; + unsigned slot = lzms_comp_get_offset_slot(c, offset); - if (len == 1) - lzms_encode_literal(c, offset_data); - else if (offset_data < LZMS_NUM_RECENT_OFFSETS) - lzms_encode_lz_repeat_offset_match(c, len, offset_data); - else - lzms_encode_lz_explicit_offset_match(c, len, offset_data - LZMS_OFFSET_OFFSET); + lzms_encode_lz_offset_symbol(c, slot); + lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot], + lzms_extra_offset_bits[slot], + LZMS_MAX_EXTRA_OFFSET_BITS); } -/* Encode a list of matches and literals chosen by the parsing algorithm. */ +/* Encode the raw offset of a delta match. 
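+ * Raw offsets reuse the same slot mapping as LZ match offsets (via
+ * lzms_comp_get_offset_slot()), but are coded with a separate Huffman code.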
*/ static void -lzms_encode_item_list(struct lzms_compressor *c, - struct lzms_mc_pos_data *cur_optimum_ptr) +lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset) { - struct lzms_mc_pos_data *end_optimum_ptr; - u64 saved_item; - u64 item; - - /* The list is currently in reverse order (last item to first item). - * Reverse it. */ - end_optimum_ptr = cur_optimum_ptr; - saved_item = cur_optimum_ptr->mc_item_data; - do { - item = saved_item; - cur_optimum_ptr -= item & MC_LEN_MASK; - saved_item = cur_optimum_ptr->mc_item_data; - cur_optimum_ptr->mc_item_data = item; - } while (cur_optimum_ptr != c->optimum); + unsigned slot = lzms_comp_get_offset_slot(c, raw_offset); - /* Walk the list of items from beginning to end, encoding each item. */ - do { - lzms_encode_item(c, cur_optimum_ptr->mc_item_data); - cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK; - } while (cur_optimum_ptr != end_optimum_ptr); + lzms_encode_delta_offset_symbol(c, slot); + lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot], + lzms_extra_offset_bits[slot], + LZMS_MAX_EXTRA_OFFSET_BITS); } -/* Each bit costs 1 << LZMS_COST_SHIFT units. */ -#define LZMS_COST_SHIFT 6 +/****************************************************************************** + * Item encoding * + ******************************************************************************/ -/*#define LZMS_RC_COSTS_USE_FLOATING_POINT*/ +/* Encode the specified item, which may be a literal or any type of match. */ +static void +lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source) +{ + /* Main bit: 0 = literal, 1 = match */ + int main_bit = (length > 1); + lzms_encode_main_bit(c, main_bit); + + if (!main_bit) { + /* Literal */ + unsigned literal = source; + lzms_encode_literal_symbol(c, literal); + } else { + /* Match */ -static u32 -lzms_rc_costs[LZMS_PROBABILITY_MAX + 1]; + /* Match bit: 0 = LZ match, 1 = delta match */ + int match_bit = (source & DELTA_SOURCE_TAG) != 0; + lzms_encode_match_bit(c, match_bit); -#ifdef LZMS_RC_COSTS_USE_FLOATING_POINT -# include -#endif + if (!match_bit) { + /* LZ match */ -static void -lzms_do_init_rc_costs(void) -{ - /* Fill in a table that maps range coding probabilities needed to code a - * bit X (0 or 1) to the number of bits (scaled by a constant factor, to - * handle fractional costs) needed to code that bit X. - * - * Consider the range of the range decoder. To eliminate exactly half - * the range (logical probability of 0.5), we need exactly 1 bit. For - * lower probabilities we need more bits and for higher probabilities we - * need fewer bits. In general, a logical probability of N will - * eliminate the proportion 1 - N of the range; this information takes - * log2(1 / N) bits to encode. - * - * The below loop is simply calculating this number of bits for each - * possible probability allowed by the LZMS compression format, but - * without using real numbers. To handle fractional probabilities, each - * cost is multiplied by (1 << LZMS_COST_SHIFT). These techniques are - * based on those used by LZMA. - * - * Note that in LZMS, a probability x really means x / 64, and 0 / 64 is - * really interpreted as 1 / 64 and 64 / 64 is really interpreted as - * 63 / 64. 
- */ - for (u32 i = 0; i <= LZMS_PROBABILITY_MAX; i++) { - u32 prob = i; + /* LZ bit: 0 = explicit offset, 1 = repeat offset */ + int lz_bit = (source < LZMS_NUM_LZ_REPS); + lzms_encode_lz_bit(c, lz_bit); - if (prob == 0) - prob = 1; - else if (prob == LZMS_PROBABILITY_MAX) - prob = LZMS_PROBABILITY_MAX - 1; - - #ifdef LZMS_RC_COSTS_USE_FLOATING_POINT - lzms_rc_costs[i] = log2((double)LZMS_PROBABILITY_MAX / prob) * - (1 << LZMS_COST_SHIFT); - #else - u32 w = prob; - u32 bit_count = 0; - for (u32 j = 0; j < LZMS_COST_SHIFT; j++) { - w *= w; - bit_count <<= 1; - while (w >= ((u32)1 << 16)) { - w >>= 1; - ++bit_count; + if (!lz_bit) { + /* Explicit offset LZ match */ + u32 offset = source - (LZMS_NUM_LZ_REPS - 1); + lzms_encode_lz_offset(c, offset); + } else { + /* Repeat offset LZ match */ + int rep_idx = source; + for (int i = 0; i < rep_idx; i++) + lzms_encode_lz_rep_bit(c, 1, i); + if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) + lzms_encode_lz_rep_bit(c, 0, rep_idx); + } + } else { + /* Delta match */ + + source &= ~DELTA_SOURCE_TAG; + + /* Delta bit: 0 = explicit offset, 1 = repeat offset */ + int delta_bit = (source < LZMS_NUM_DELTA_REPS); + lzms_encode_delta_bit(c, delta_bit); + + if (!delta_bit) { + /* Explicit offset delta match */ + u32 power = source >> DELTA_SOURCE_POWER_SHIFT; + u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) - + (LZMS_NUM_DELTA_REPS - 1); + lzms_encode_delta_power_symbol(c, power); + lzms_encode_delta_raw_offset(c, raw_offset); + } else { + /* Repeat offset delta match */ + int rep_idx = source; + for (int i = 0; i < rep_idx; i++) + lzms_encode_delta_rep_bit(c, 1, i); + if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) + lzms_encode_delta_rep_bit(c, 0, rep_idx); } } - lzms_rc_costs[i] = (LZMS_PROBABILITY_BITS << LZMS_COST_SHIFT) - - (15 + bit_count); - #endif + + /* Match length (encoded the same way for any match type) */ + lzms_encode_length(c, length); } } +/* Encode a list of matches and literals chosen by the parsing algorithm. */ static void -lzms_init_rc_costs(void) +lzms_encode_nonempty_item_list(struct lzms_compressor *c, + struct lzms_optimum_node *end_node) { - static pthread_once_t once = PTHREAD_ONCE_INIT; + /* Since we've stored at each node the item we took to arrive at that + * node, we can trace our chosen path in backwards order. However, for + * encoding we need to trace our chosen path in forwards order. To make + * this possible, the following loop moves the items from their + * destination nodes to their source nodes, which effectively reverses + * the path. (Think of it like reversing a singly-linked list.) */ + struct lzms_optimum_node *cur_node = end_node; + struct lzms_item saved_item = cur_node->item; + do { + struct lzms_item item = saved_item; + if (cur_node->num_extra_items > 0) { + /* Handle an arrival via multi-item lookahead. */ + unsigned i = 0; + struct lzms_optimum_node *orig_node = cur_node; + do { + cur_node -= item.length; + cur_node->item = item; + item = orig_node->extra_items[i]; + } while (++i != orig_node->num_extra_items); + } + cur_node -= item.length; + saved_item = cur_node->item; + cur_node->item = item; + } while (cur_node != c->optimum_nodes); - pthread_once(&once, lzms_do_init_rc_costs); + /* Now trace the chosen path in forwards order, encoding each item. 
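+	 * As a small worked example: if the chosen path were literal(1),
+	 * match(3), match(2), the items initially sit at their destination
+	 * nodes 1, 4 and 6; the loop above walks 6 -> 4 -> 1 -> 0 and leaves
+	 * each item at its source node (0, 1 and 4), so the walk below can
+	 * simply advance by item.length at each step.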
+	 */
+	do {
+		lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
+		cur_node += cur_node->item.length;
+	} while (cur_node != end_node);
 }
 
-/* Return the cost to range-encode the specified bit from the specified state.*/
-static inline u32
-lzms_rc_bit_cost(const struct lzms_range_encoder *enc, u8 cur_state, int bit)
+static inline void
+lzms_encode_item_list(struct lzms_compressor *c,
+		      struct lzms_optimum_node *end_node)
 {
-	u32 prob_zero;
-	u32 prob_correct;
+	if (end_node != c->optimum_nodes)
+		lzms_encode_nonempty_item_list(c, end_node);
+}
 
-	prob_zero = enc->prob_entries[cur_state].num_recent_zero_bits;
+/******************************************************************************
+ *                            Cost evaluation                                 *
+ ******************************************************************************/
 
-	if (bit == 0)
-		prob_correct = prob_zero;
-	else
-		prob_correct = LZMS_PROBABILITY_MAX - prob_zero;
+/*
+ * If p is the predicted probability of the next bit being a 0, then the number
+ * of bits required to encode a 0 bit using a binary range encoder is the real
+ * number -log2(p), and the number of bits required to encode a 1 bit is the
+ * real number -log2(1 - p). To avoid computing either of these expressions at
+ * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
+ * probability to cost for each possible probability. Specifically, the array
+ * indices are the numerators of the possible probabilities in LZMS, where the
+ * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
+ * bit costs multiplied by 1 << COST_SHIFT and rounded to the nearest integer.
+ */
+static u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
+	384, 384, 320, 283, 256, 235, 219, 204,
+	192, 181, 171, 163, 155, 147, 140, 134,
+	128, 122, 117, 112, 107, 103,  99,  94,
+	 91,  87,  83,  80,  76,  73,  70,  67,
+	 64,  61,  58,  56,  53,  51,  48,  46,
+	 43,  41,  39,  37,  35,  33,  30,  29,
+	 27,  25,  23,  21,  19,  17,  16,  14,
+	 12,  11,   9,   8,   6,   4,   3,   1,
+	  1
+};
+
+#if 0	/* Generator for the 'lzms_bit_costs' table above */
+#include <math.h>
 
+static void
+lzms_compute_bit_costs(void)
+{
+	for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
+		u32 prob = i;
+		if (prob == 0)
+			prob++;
+		else if (prob == LZMS_PROBABILITY_DENOMINATOR)
+			prob--;
+
+		lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
+					  (1 << COST_SHIFT));
+	}
 }
+#endif
 
-/* Return the cost to Huffman-encode the specified symbol. */
+/* Return the cost to encode a 0 bit in the specified context. */
 static inline u32
-lzms_huffman_symbol_cost(const struct lzms_huffman_encoder *enc, unsigned sym)
+lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
 {
-	return (u32)enc->lens[sym] << LZMS_COST_SHIFT;
+	return lzms_bit_costs[probs[state].num_recent_zero_bits];
 }
 
-/* Return the cost to encode the specified literal byte. */
+/* Return the cost to encode a 1 bit in the specified context. */
 static inline u32
-lzms_literal_cost(const struct lzms_compressor *c, unsigned literal,
-		  const struct lzms_adaptive_state *state)
+lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
 {
-	return lzms_rc_bit_cost(&c->main_range_encoder, state->main_state, 0) +
-	       lzms_huffman_symbol_cost(&c->literal_encoder, literal);
+	return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
+			      probs[state].num_recent_zero_bits];
 }
 
-/* Update the table that directly provides the costs for small lengths. */
+/* Return the cost to encode a literal, including the main bit. */
+static inline u32
+lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
+{
+	return lzms_bit_0_cost(main_state, c->main_probs) +
+	       ((u32)c->literal_lens[literal] << COST_SHIFT);
+}
+
+/* Update 'fast_length_cost_tab' to use the latest Huffman code.
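+ * Each entry is the cost of the length slot's current Huffman codeword plus
+ * the slot's extra bits, i.e. conceptually:
+ *
+ *	fast_length_cost_tab[len] =
+ *		(length_lens[slot(len)] + lzms_extra_length_bits[slot(len)]) << COST_SHIFT;
+ *
+ * where slot(len) stands for lzms_comp_get_length_slot(c, len).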
*/ static void lzms_update_fast_length_costs(struct lzms_compressor *c) { - u32 len; int slot = -1; u32 cost = 0; - - for (len = 1; len < LZMS_NUM_FAST_LENGTHS; len++) { - - while (len >= lzms_length_slot_base[slot + 1]) { + for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) { + if (len >= lzms_length_slot_base[slot + 1]) { slot++; - cost = (u32)(c->length_encoder.lens[slot] + - lzms_extra_length_bits[slot]) << LZMS_COST_SHIFT; + cost = (u32)(c->length_lens[slot] + + lzms_extra_length_bits[slot]) << COST_SHIFT; } - - c->length_cost_fast[len] = cost; + c->fast_length_cost_tab[len] = cost; } } -/* Return the cost to encode the specified match length, which must be less than - * LZMS_NUM_FAST_LENGTHS. */ +/* Return the cost to encode the specified match length, which must not exceed + * MAX_FAST_LENGTH. */ static inline u32 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length) { - LZMS_ASSERT(length < LZMS_NUM_FAST_LENGTHS); - return c->length_cost_fast[length]; + return c->fast_length_cost_tab[length]; } /* Return the cost to encode the specified LZ match offset. */ static inline u32 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset) { - unsigned slot = lzms_get_offset_slot_fast(c, offset); - - return (u32)(c->lz_offset_encoder.lens[slot] + - lzms_extra_offset_bits[slot]) << LZMS_COST_SHIFT; + unsigned slot = lzms_comp_get_offset_slot(c, offset); + u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot]; + return num_bits << COST_SHIFT; } -/* - * Consider coding the match at repeat offset index @rep_idx. Consider each - * length from the minimum (2) to the full match length (@rep_len). - */ -static inline void -lzms_consider_lz_repeat_offset_match(const struct lzms_compressor *c, - struct lzms_mc_pos_data *cur_optimum_ptr, - u32 rep_len, unsigned rep_idx) +/* Return the cost to encode the specified delta power and raw offset. 
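+ * This sums the codeword lengths of the power symbol and the raw offset's
+ * slot symbol, plus the slot's extra bits. E.g., a 4-bit power codeword, a
+ * 7-bit offset codeword, and 3 extra bits (made-up numbers) would cost
+ * (4 + 7 + 3) << COST_SHIFT.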
*/ +static inline u32 +lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset) { - u32 len; - u32 base_cost; - u32 cost; - unsigned i; - - base_cost = cur_optimum_ptr->cost; - - base_cost += lzms_rc_bit_cost(&c->main_range_encoder, - cur_optimum_ptr->state.main_state, 1); - - base_cost += lzms_rc_bit_cost(&c->match_range_encoder, - cur_optimum_ptr->state.match_state, 0); + unsigned slot = lzms_comp_get_offset_slot(c, raw_offset); + u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] + + lzms_extra_offset_bits[slot]; + return num_bits << COST_SHIFT; +} - base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder, - cur_optimum_ptr->state.lz_match_state, 1); +/****************************************************************************** + * Adaptive state * + ******************************************************************************/ - for (i = 0; i < rep_idx; i++) - base_cost += lzms_rc_bit_cost(&c->lz_repeat_match_range_encoders[i], - cur_optimum_ptr->state.lz_repeat_match_state[i], 1); +static void +lzms_init_adaptive_state(struct lzms_adaptive_state *state) +{ + for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++) + state->recent_lz_offsets[i] = i + 1; + state->prev_lz_offset = 0; + state->upcoming_lz_offset = 0; - if (i < LZMS_NUM_RECENT_OFFSETS - 1) - base_cost += lzms_rc_bit_cost(&c->lz_repeat_match_range_encoders[i], - cur_optimum_ptr->state.lz_repeat_match_state[i], 0); + for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++) + state->recent_delta_pairs[i] = i + 1; + state->prev_delta_pair = 0; + state->upcoming_delta_pair = 0; - len = 2; - do { - cost = base_cost + lzms_fast_length_cost(c, len); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - ((u64)rep_idx << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= rep_len); + state->main_state = 0; + state->match_state = 0; + state->lz_state = 0; + for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) + state->lz_rep_states[i] = 0; + state->delta_state = 0; + for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) + state->delta_rep_states[i] = 0; } /* - * Consider coding each match in @matches as an explicit offset match. - * - * @matches must be sorted by strictly decreasing length. This is guaranteed by - * the match-finder. + * Update the LRU queues for match sources when advancing by one item. * - * We consider each length from the minimum (2) to the longest - * (matches[num_matches - 1].len). For each length, we consider only the - * smallest offset for which that length is available. Although this is not - * guaranteed to be optimal due to the possibility of a larger offset costing - * less than a smaller offset to code, this is a very useful heuristic. 
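+ *
+ * (Sketch of the delayed update: a source chosen at some position is first
+ * recorded in the 'upcoming' slot; the next update moves it to the 'prev'
+ * slot; and only the update after that pushes it onto the front of the
+ * recent queue.)
+ *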
+ * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more + * complex because: + * - there are separate queues for LZ and delta matches + * - updates to the queues are delayed by one encoded item (this prevents + * sources from being bumped up to index 0 too early) */ -static inline void -lzms_consider_lz_explicit_offset_matches(const struct lzms_compressor *c, - struct lzms_mc_pos_data *cur_optimum_ptr, - const struct lz_match matches[], - u32 num_matches) -{ - u32 len; - u32 i; - u32 base_cost; - u32 position_cost; - u32 cost; - - base_cost = cur_optimum_ptr->cost; - - base_cost += lzms_rc_bit_cost(&c->main_range_encoder, - cur_optimum_ptr->state.main_state, 1); - - base_cost += lzms_rc_bit_cost(&c->match_range_encoder, - cur_optimum_ptr->state.match_state, 0); +static void +lzms_update_lru_queues(struct lzms_adaptive_state *state) +{ + if (state->prev_lz_offset != 0) { + for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--) + state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i]; + state->recent_lz_offsets[0] = state->prev_lz_offset; + } + state->prev_lz_offset = state->upcoming_lz_offset; - base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder, - cur_optimum_ptr->state.lz_match_state, 0); - len = 2; - i = num_matches - 1; - do { - position_cost = base_cost + lzms_lz_offset_cost(c, matches[i].offset); - do { - cost = position_cost + lzms_fast_length_cost(c, len); - if (cost < (cur_optimum_ptr + len)->cost) { - (cur_optimum_ptr + len)->mc_item_data = - ((u64)(matches[i].offset + LZMS_OFFSET_OFFSET) - << MC_OFFSET_SHIFT) | len; - (cur_optimum_ptr + len)->cost = cost; - } - } while (++len <= matches[i].length); - } while (i--); + if (state->prev_delta_pair != 0) { + for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--) + state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i]; + state->recent_delta_pairs[0] = state->prev_delta_pair; + } + state->prev_delta_pair = state->upcoming_delta_pair; } -static void -lzms_init_adaptive_state(struct lzms_adaptive_state *state) +static inline void +lzms_update_state(u8 *state_p, int bit, unsigned num_states) { - unsigned i; - - lzms_init_lz_lru_queue(&state->lru); - state->main_state = 0; - state->match_state = 0; - state->lz_match_state = 0; - for (i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++) - state->lz_repeat_match_state[i] = 0; + *state_p = ((*state_p << 1) | bit) % num_states; } static inline void lzms_update_main_state(struct lzms_adaptive_state *state, int is_match) { - state->main_state = ((state->main_state << 1) | is_match) % LZMS_NUM_MAIN_STATES; + lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS); } static inline void lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta) { - state->match_state = ((state->match_state << 1) | is_delta) % LZMS_NUM_MATCH_STATES; + lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS); +} + +static inline void +lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep) +{ + lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS); } static inline void -lzms_update_lz_match_state(struct lzms_adaptive_state *state, int is_repeat_offset) +lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx) { - state->lz_match_state = ((state->lz_match_state << 1) | is_repeat_offset) % LZMS_NUM_LZ_MATCH_STATES; + for (int i = 0; i < rep_idx; i++) + lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS); + + if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) + 
lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS); } static inline void -lzms_update_lz_repeat_match_state(struct lzms_adaptive_state *state, int rep_idx) +lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep) +{ + lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS); +} + +static inline void +lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx) +{ + for (int i = 0; i < rep_idx; i++) + lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS); + + if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) + lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS); +} + +/****************************************************************************** + * Matchfinding * + ******************************************************************************/ + +/* Note: this code just handles finding delta matches. The code for finding LZ + * matches is elsewhere. */ + + +/* Initialize the delta matchfinder for a new input buffer. */ +static void +lzms_init_delta_matchfinder(struct lzms_compressor *c) { - int i; + /* Set all entries to use an invalid power, which will never match. */ + BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT))); + memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table)); + + /* Initialize the next hash code for each power. We can just use zeroes + * initially; it doesn't really matter. */ + for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++) + c->next_delta_hashes[i] = 0; +} - for (i = 0; i < rep_idx; i++) - state->lz_repeat_match_state[i] = - ((state->lz_repeat_match_state[i] << 1) | 1) % - LZMS_NUM_LZ_REPEAT_MATCH_STATES; +/* + * Compute a DELTA_HASH_ORDER-bit hash code for the first + * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a + * delta context with the specified @span. + */ +static inline u32 +lzms_delta_hash(const u8 *p, u32 span) +{ + /* A delta match has a certain span and an offset that is a multiple of + * that span. To reduce wasted space we use a single combined hash + * table for all spans and positions, but to minimize collisions we + * include in the hash code computation the span and the low-order bits + * of the current position. */ + + BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3); + u8 d0 = *(p + 0) - *(p + 0 - span); + u8 d1 = *(p + 1) - *(p + 1 - span); + u8 d2 = *(p + 2) - *(p + 2 - span); + u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) | + ((u32)d2 << 16) | ((u32)d1 << 8) | d0; + return lz_hash(v, DELTA_HASH_ORDER); +} - if (i < LZMS_NUM_RECENT_OFFSETS - 1) - state->lz_repeat_match_state[i] = - ((state->lz_repeat_match_state[i] << 1) | 0) % - LZMS_NUM_LZ_REPEAT_MATCH_STATES; +/* + * Given a match between @in_next and @matchptr in a delta context with the + * specified @span and having the initial @len, extend the match as far as + * possible, up to a limit of @max_len. + */ +static inline u32 +lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr, + u32 len, u32 max_len, u32 span) +{ + while (len < max_len && + (u8)(*(in_next + len) - *(in_next + len - span)) == + (u8)(*(matchptr + len) - *(matchptr + len - span))) + { + len++; + } + return len; } +static void +lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c, + const u8 *in_next, u32 count) +{ + u32 pos = in_next - c->in_buffer; + if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1)) + return; + do { + /* Update the hash table for each power. 
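+		 * Each power corresponds to span 1 << power; e.g. if
+		 * NUM_POWERS_TO_CONSIDER were 8, the spans 1, 2, 4, ..., 128
+		 * would each maintain their own rolling hash of the upcoming
+		 * position.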
*/ + for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { + const u32 span = (u32)1 << power; + if (unlikely(pos < span)) + continue; + const u32 next_hash = lzms_delta_hash(in_next + 1, span); + const u32 hash = c->next_delta_hashes[power]; + c->delta_hash_table[hash] = + (power << DELTA_SOURCE_POWER_SHIFT) | pos; + c->next_delta_hashes[power] = next_hash; + prefetch(&c->delta_hash_table[next_hash]); + } + } while (in_next++, pos++, --count); +} + +/* + * Skip the next @count bytes (don't search for matches at them). @in_next + * points to the first byte to skip. The return value is @in_next + count. + */ +static const u8 * +lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next) +{ + lcpit_matchfinder_skip_bytes(&c->mf, count); + if (c->use_delta_matches) + lzms_delta_matchfinder_skip_bytes(c, in_next, count); + return in_next + count; +} + +/****************************************************************************** + * "Near-optimal" parsing * + ******************************************************************************/ + /* * The main near-optimal parsing routine. * @@ -981,252 +1287,692 @@ lzms_update_lz_repeat_match_state(struct lzms_adaptive_state *state, int rep_idx * compute the lowest-cost path in pieces, where each piece is terminated when * there are no choices to be made. * - * Notes: - * - * - This does not output any delta matches. - * - * - The costs of literals and matches are estimated using the range encoder - * states and the semi-adaptive Huffman codes. Except for range encoding - * states, costs are assumed to be constant throughout a single run of the - * parsing algorithm, which can parse up to @optim_array_length bytes of data. - * This introduces a source of inaccuracy because the probabilities and - * Huffman codes can change over this part of the data. + * The costs of literals and matches are estimated using the range encoder + * states and the semi-adaptive Huffman codes. Except for range encoding + * states, costs are assumed to be constant throughout a single run of the + * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data. This + * introduces a source of non-optimality because the probabilities and Huffman + * codes can change over this part of the data. And of course, there are + * various other reasons why the result isn't optimal in terms of compression + * ratio. */ static void lzms_near_optimal_parse(struct lzms_compressor *c) { - const u8 *window_ptr; - const u8 *window_end; - struct lzms_mc_pos_data *cur_optimum_ptr; - struct lzms_mc_pos_data *end_optimum_ptr; - u32 num_matches; - u32 longest_len; - u32 rep_max_len; - unsigned rep_max_idx; - unsigned literal; - unsigned i; - u32 cost; - u32 len; - u32 offset_data; + const u8 *in_next = c->in_buffer; + const u8 * const in_end = &c->in_buffer[c->in_nbytes]; + struct lzms_optimum_node *cur_node; + struct lzms_optimum_node *end_node; - window_ptr = c->cur_window; - window_end = window_ptr + c->cur_window_size; + /* Set initial length costs for lengths <= MAX_FAST_LENGTH. */ + lzms_update_fast_length_costs(c); - lzms_init_adaptive_state(&c->optimum[0].state); + /* Set up the initial adaptive state. */ + lzms_init_adaptive_state(&c->optimum_nodes[0].state); begin: /* Start building a new list of items, which will correspond to the next * piece of the overall minimum-cost path. 
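	 * A piece ends when a long match is taken greedily (the "early out"
	 * cases below), when no path extends past the current node, or when
	 * the node array fills up; the piece's items are then encoded and
	 * parsing restarts from this label.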
	 */
-	cur_optimum_ptr = c->optimum;
-	cur_optimum_ptr->cost = 0;
-	end_optimum_ptr = cur_optimum_ptr;
+	cur_node = c->optimum_nodes;
+	cur_node->cost = 0;
+	end_node = cur_node;
 
-	/* States should currently be consistent with the encoders. */
-	LZMS_ASSERT(cur_optimum_ptr->state.main_state == c->main_range_encoder.state);
-	LZMS_ASSERT(cur_optimum_ptr->state.match_state == c->match_range_encoder.state);
-	LZMS_ASSERT(cur_optimum_ptr->state.lz_match_state == c->lz_match_range_encoder.state);
-	for (i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++)
-		LZMS_ASSERT(cur_optimum_ptr->state.lz_repeat_match_state[i] ==
-			    c->lz_repeat_match_range_encoders[i].state);
-
-	if (window_ptr == window_end)
+	if (in_next == in_end)
 		return;
 
-	/* The following loop runs once for each per byte in the window, except
-	 * in a couple shortcut cases. */
+	/* The following loop runs once per byte in the input buffer, except
+	 * in a few shortcut cases. */
 	for (;;) {
+		u32 num_matches;
 
-		/* Find explicit offset matches with the current position. */
-		num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
-		if (num_matches) {
-			/*
-			 * Find the longest repeat offset match with the current
-			 * position.
-			 *
-			 * Heuristics:
-			 *
-			 * - Only search for repeat offset matches if the
-			 *   match-finder already found at least one match.
-			 *
-			 * - Only consider the longest repeat offset match. It
-			 *   seems to be rare for the optimal parse to include a
-			 *   repeat offset match that doesn't have the longest
-			 *   length (allowing for the possibility that not all
-			 *   of that length is actually used).
-			 */
-			if (likely(window_ptr - c->cur_window >= LZMS_MAX_INIT_RECENT_OFFSET)) {
-				BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
-				rep_max_len = lz_repsearch3(window_ptr,
-							    window_end - window_ptr,
-							    cur_optimum_ptr->state.lru.recent_offsets,
-							    &rep_max_idx);
-			} else {
-				rep_max_len = 0;
-			}
+		/* Repeat offset LZ matches */
+		if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
+			   in_end - in_next >= 2))
+		{
+			for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
 
-			if (rep_max_len) {
-				/* If there's a very long repeat offset match,
-				 * choose it immediately. */
-				if (rep_max_len >= c->params.nice_match_length) {
+				/* Looking for a repeat offset LZ match at queue
+				 * index @rep_idx */
 
-					lcpit_matchfinder_skip_bytes(&c->mf, rep_max_len - 1);
-					window_ptr += rep_max_len;
+				const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
+				const u8 * const matchptr = in_next - offset;
 
-					if (cur_optimum_ptr != c->optimum)
-						lzms_encode_item_list(c, cur_optimum_ptr);
+				/* Check the first 2 bytes before entering the extension loop. */
+				if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
+					continue;
 
-					lzms_encode_lz_repeat_offset_match(c, rep_max_len,
-									   rep_max_idx);
+				/* Extend the match to its full length.
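+				 * lz_extend() returns the total match length
+				 * given that the first 2 bytes are already
+				 * known to match; e.g. a match continuing for
+				 * 7 more bytes yields rep_len == 9.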
*/ + const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next); - c->optimum[0].state = cur_optimum_ptr->state; + /* Early out for long repeat offset LZ match */ + if (rep_len >= c->mf.nice_match_len) { - lzms_update_main_state(&c->optimum[0].state, 1); - lzms_update_match_state(&c->optimum[0].state, 0); - lzms_update_lz_match_state(&c->optimum[0].state, 1); - lzms_update_lz_repeat_match_state(&c->optimum[0].state, - rep_max_idx); + in_next = lzms_skip_bytes(c, rep_len, in_next); - c->optimum[0].state.lru.upcoming_offset = - c->optimum[0].state.lru.recent_offsets[rep_max_idx]; + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, rep_len, rep_idx); - for (i = rep_max_idx; i < LZMS_NUM_RECENT_OFFSETS; i++) - c->optimum[0].state.lru.recent_offsets[i] = - c->optimum[0].state.lru.recent_offsets[i + 1]; + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; - lzms_update_lz_lru_queue(&c->optimum[0].state.lru); + cur_node->state.upcoming_lz_offset = + cur_node->state.recent_lz_offsets[rep_idx]; + cur_node->state.upcoming_delta_pair = 0; + for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++) + cur_node->state.recent_lz_offsets[i] = + cur_node->state.recent_lz_offsets[i + 1]; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 0); + lzms_update_lz_state(&cur_node->state, 1); + lzms_update_lz_rep_states(&cur_node->state, rep_idx); goto begin; } - /* If reaching any positions for the first time, - * initialize their costs to "infinity". */ - while (end_optimum_ptr < cur_optimum_ptr + rep_max_len) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; - - /* Consider coding a repeat offset match. */ - lzms_consider_lz_repeat_offset_match(c, cur_optimum_ptr, - rep_max_len, rep_max_idx); + while (end_node < cur_node + rep_len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->main_probs) + + lzms_bit_0_cost(cur_node->state.match_state, + c->match_probs) + + lzms_bit_1_cost(cur_node->state.lz_state, + c->lz_probs); + + for (int i = 0; i < rep_idx; i++) + base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i], + c->lz_rep_probs[i]); + + if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) + base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx], + c->lz_rep_probs[rep_idx]); + + u32 len = 2; + do { + u32 cost = base_cost + lzms_fast_length_cost(c, len); + if (cost < (cur_node + len)->cost) { + (cur_node + len)->cost = cost; + (cur_node + len)->item = (struct lzms_item) { + .length = len, + .source = rep_idx, + }; + (cur_node + len)->num_extra_items = 0; + } + } while (++len <= rep_len); + + + /* try LZ-rep + lit + LZ-rep0 */ + if (c->try_lzrep_lit_lzrep0 && + in_end - (in_next + rep_len) >= 3 && + load_u16_unaligned(in_next + rep_len + 1) == + load_u16_unaligned(matchptr + rep_len + 1)) + { + const u32 rep0_len = lz_extend(in_next + rep_len + 1, + matchptr + rep_len + 1, + 2, + min(c->mf.nice_match_len, + in_end - (in_next + rep_len + 1))); + + unsigned main_state = cur_node->state.main_state; + unsigned match_state = cur_node->state.match_state; + unsigned lz_state = cur_node->state.lz_state; + unsigned lz_rep0_state = cur_node->state.lz_rep_states[0]; + + /* update states for LZ-rep */ + main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS; + match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS; + lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS; + lz_rep0_state = ((lz_rep0_state << 1) | 
(rep_idx > 0)) % + LZMS_NUM_LZ_REP_PROBS; + + /* LZ-rep cost */ + u32 cost = base_cost + lzms_fast_length_cost(c, rep_len); + + /* add literal cost */ + cost += lzms_literal_cost(c, main_state, *(in_next + rep_len)); + + /* update state for literal */ + main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; + + /* add LZ-rep0 cost */ + cost += lzms_bit_1_cost(main_state, c->main_probs) + + lzms_bit_0_cost(match_state, c->match_probs) + + lzms_bit_1_cost(lz_state, c->lz_probs) + + lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) + + lzms_fast_length_cost(c, rep0_len); + + const u32 total_len = rep_len + 1 + rep0_len; + + while (end_node < cur_node + total_len) + (++end_node)->cost = INFINITE_COST; + + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = (struct lzms_item) { + .length = rep0_len, + .source = 0, + }; + (cur_node + total_len)->extra_items[0] = (struct lzms_item) { + .length = 1, + .source = *(in_next + rep_len), + }; + (cur_node + total_len)->extra_items[1] = (struct lzms_item) { + .length = rep_len, + .source = rep_idx, + }; + (cur_node + total_len)->num_extra_items = 2; + } + } } + } - longest_len = c->matches[0].length; + /* Repeat offset delta matches */ + if (c->use_delta_matches && + likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 && + (in_end - in_next >= 2))) + { + for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) { + + /* Looking for a repeat offset delta match at + * queue index @rep_idx */ + + const u32 pair = cur_node->state.recent_delta_pairs[rep_idx]; + const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT; + const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK; + const u32 span = (u32)1 << power; + const u32 offset = raw_offset << power; + const u8 * const matchptr = in_next - offset; + + /* Check the first 2 bytes before entering the + * extension loop. */ + if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != + (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || + ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != + (u8)(*(matchptr + 1) - *(matchptr + 1 - span)))) + continue; + + /* Extend the match to its full length. */ + const u32 rep_len = lzms_extend_delta_match(in_next, matchptr, + 2, in_end - in_next, + span); + + /* Early out for long repeat offset delta match */ + if (rep_len >= c->mf.nice_match_len) { + + in_next = lzms_skip_bytes(c, rep_len, in_next); + + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx); + + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; + + cur_node->state.upcoming_delta_pair = pair; + cur_node->state.upcoming_lz_offset = 0; + for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++) + cur_node->state.recent_delta_pairs[i] = + cur_node->state.recent_delta_pairs[i + 1]; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 1); + lzms_update_delta_state(&cur_node->state, 1); + lzms_update_delta_rep_states(&cur_node->state, rep_idx); + goto begin; + } - /* If there's a very long explicit offset match, choose - * it immediately. 
*/ - if (longest_len >= c->params.nice_match_length) { + while (end_node < cur_node + rep_len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->main_probs) + + lzms_bit_1_cost(cur_node->state.match_state, + c->match_probs) + + lzms_bit_1_cost(cur_node->state.delta_state, + c->delta_probs); + + for (int i = 0; i < rep_idx; i++) + base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i], + c->delta_rep_probs[i]); + + if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) + base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx], + c->delta_rep_probs[rep_idx]); + + u32 len = 2; + do { + u32 cost = base_cost + lzms_fast_length_cost(c, len); + if (cost < (cur_node + len)->cost) { + (cur_node + len)->cost = cost; + (cur_node + len)->item = (struct lzms_item) { + .length = len, + .source = DELTA_SOURCE_TAG | rep_idx, + }; + (cur_node + len)->num_extra_items = 0; + } + } while (++len <= rep_len); + } + } - u32 offset = c->matches[0].offset; + /* Explicit offset LZ matches */ + num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches); + if (num_matches) { - /* Extend the match as far as possible. (The - * LCP-interval tree matchfinder only reports up - * to the "nice" length.) */ - longest_len = lz_extend(window_ptr, - window_ptr - offset, - longest_len, - window_end - window_ptr); + u32 best_len = c->matches[0].length; - lcpit_matchfinder_skip_bytes(&c->mf, longest_len - 1); - window_ptr += longest_len; + /* Early out for long explicit offset LZ match */ + if (best_len >= c->mf.nice_match_len) { - if (cur_optimum_ptr != c->optimum) - lzms_encode_item_list(c, cur_optimum_ptr); + const u32 offset = c->matches[0].offset; - lzms_encode_lz_explicit_offset_match(c, longest_len, offset); + /* Extend the match as far as possible. + * This is necessary because the LCP-interval + * tree matchfinder only reports up to + * nice_match_len bytes. */ + best_len = lz_extend(in_next, in_next - offset, + best_len, in_end - in_next); - c->optimum[0].state = cur_optimum_ptr->state; + in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1); - lzms_update_main_state(&c->optimum[0].state, 1); - lzms_update_match_state(&c->optimum[0].state, 0); - lzms_update_lz_match_state(&c->optimum[0].state, 0); + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1); - c->optimum[0].state.lru.upcoming_offset = offset; + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; - lzms_update_lz_lru_queue(&c->optimum[0].state.lru); + cur_node->state.upcoming_lz_offset = offset; + cur_node->state.upcoming_delta_pair = 0; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 0); + lzms_update_lz_state(&cur_node->state, 0); goto begin; } - /* If reaching any positions for the first time, - * initialize their costs to "infinity". 
*/ - while (end_optimum_ptr < cur_optimum_ptr + longest_len) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; + while (end_node < cur_node + best_len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->main_probs) + + lzms_bit_0_cost(cur_node->state.match_state, + c->match_probs) + + lzms_bit_0_cost(cur_node->state.lz_state, + c->lz_probs); + + if (c->try_lzmatch_lit_lzrep0 && + likely(in_end - (in_next + c->matches[0].length) >= 3)) + { + /* try LZ-match + lit + LZ-rep0 */ + + u32 l = 2; + u32 i = num_matches - 1; + do { + const u32 len = c->matches[i].length; + const u32 offset = c->matches[i].offset; + const u32 position_cost = base_cost + + lzms_lz_offset_cost(c, offset); + do { + u32 cost = position_cost + lzms_fast_length_cost(c, l); + if (cost < (cur_node + l)->cost) { + (cur_node + l)->cost = cost; + (cur_node + l)->item = (struct lzms_item) { + .length = l, + .source = offset + (LZMS_NUM_LZ_REPS - 1), + }; + (cur_node + l)->num_extra_items = 0; + } + } while (++l <= len); + + const u8 * const matchptr = in_next - offset; + if (load_u16_unaligned(matchptr + len + 1) != + load_u16_unaligned(in_next + len + 1)) + continue; + + const u32 rep0_len = lz_extend(in_next + len + 1, + matchptr + len + 1, + 2, + min(c->mf.nice_match_len, + in_end - (in_next + len + 1))); + + unsigned main_state = cur_node->state.main_state; + unsigned match_state = cur_node->state.match_state; + unsigned lz_state = cur_node->state.lz_state; + + /* update states for LZ-match */ + main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS; + match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS; + lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS; + + /* LZ-match cost */ + u32 cost = position_cost + lzms_fast_length_cost(c, len); + + /* add literal cost */ + cost += lzms_literal_cost(c, main_state, *(in_next + len)); + + /* update state for literal */ + main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; + + /* add LZ-rep0 cost */ + cost += lzms_bit_1_cost(main_state, c->main_probs) + + lzms_bit_0_cost(match_state, c->match_probs) + + lzms_bit_1_cost(lz_state, c->lz_probs) + + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], + c->lz_rep_probs[0]) + + lzms_fast_length_cost(c, rep0_len); + + const u32 total_len = len + 1 + rep0_len; + + while (end_node < cur_node + total_len) + (++end_node)->cost = INFINITE_COST; + + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = (struct lzms_item) { + .length = rep0_len, + .source = 0, + }; + (cur_node + total_len)->extra_items[0] = (struct lzms_item) { + .length = 1, + .source = *(in_next + len), + }; + (cur_node + total_len)->extra_items[1] = (struct lzms_item) { + .length = len, + .source = offset + LZMS_NUM_LZ_REPS - 1, + }; + (cur_node + total_len)->num_extra_items = 2; + } + } while (i--); + } else { + u32 l = 2; + u32 i = num_matches - 1; + do { + u32 position_cost = base_cost + + lzms_lz_offset_cost(c, c->matches[i].offset); + do { + u32 cost = position_cost + lzms_fast_length_cost(c, l); + if (cost < (cur_node + l)->cost) { + (cur_node + l)->cost = cost; + (cur_node + l)->item = (struct lzms_item) { + .length = l, + .source = c->matches[i].offset + + (LZMS_NUM_LZ_REPS - 1), + }; + (cur_node + l)->num_extra_items = 0; + } + } while (++l <= c->matches[i].length); + } while (i--); + } + } - /* Consider coding an explicit offset match. 
*/ - lzms_consider_lz_explicit_offset_matches(c, cur_optimum_ptr, - c->matches, num_matches); - } else { - /* No matches found. The only choice at this position - * is to code a literal. */ + /* Explicit offset delta matches */ + if (c->use_delta_matches && + likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1)) + { + const u32 pos = in_next - c->in_buffer; - if (end_optimum_ptr == cur_optimum_ptr) - (++end_optimum_ptr)->cost = MC_INFINITE_COST; - } + /* Consider each possible power (log2 of span) */ + BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS); + for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { - /* Consider coding a literal. + const u32 span = (u32)1 << power; - * To avoid an extra unpredictable brench, actually checking the - * preferability of coding a literal is integrated into the - * adaptive state update code below. */ - literal = *window_ptr++; - cost = cur_optimum_ptr->cost + - lzms_literal_cost(c, literal, &cur_optimum_ptr->state); + if (unlikely(pos < span)) + continue; - /* Advance to the next position. */ - cur_optimum_ptr++; + const u32 next_hash = lzms_delta_hash(in_next + 1, span); + const u32 hash = c->next_delta_hashes[power]; + const u32 cur_match = c->delta_hash_table[hash]; - /* The lowest-cost path to the current position is now known. - * Finalize the adaptive state that results from taking this - * lowest-cost path. */ + c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos; + c->next_delta_hashes[power] = next_hash; + prefetch(&c->delta_hash_table[next_hash]); - if (cost < cur_optimum_ptr->cost) { - /* Literal */ - cur_optimum_ptr->cost = cost; - cur_optimum_ptr->mc_item_data = ((u64)literal << MC_OFFSET_SHIFT) | 1; + if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT) + continue; - cur_optimum_ptr->state = (cur_optimum_ptr - 1)->state; + const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK); - lzms_update_main_state(&cur_optimum_ptr->state, 0); + /* The offset must be a multiple of span. */ + if (offset & (span - 1)) + continue; - cur_optimum_ptr->state.lru.upcoming_offset = 0; - } else { - /* LZ match */ - len = cur_optimum_ptr->mc_item_data & MC_LEN_MASK; - offset_data = cur_optimum_ptr->mc_item_data >> MC_OFFSET_SHIFT; + const u8 * const matchptr = in_next - offset; - cur_optimum_ptr->state = (cur_optimum_ptr - len)->state; + /* Check the first 3 bytes before entering the + * extension loop. */ + BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3); + if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != + (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || + ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != + (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) || + ((u8)(*(in_next + 2) - *(in_next + 2 - span)) != + (u8)(*(matchptr + 2) - *(matchptr + 2 - span)))) + continue; - lzms_update_main_state(&cur_optimum_ptr->state, 1); - lzms_update_match_state(&cur_optimum_ptr->state, 0); + /* Extend the delta match to its full length. 
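+				 * Positions "delta match" as long as their
+				 * span-distance differences agree; e.g. with
+				 * span 4, the sequences 10,11,12,13,14,... and
+				 * 70,71,72,73,74,... delta match, since both
+				 * have difference 4 at every position.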
*/ + const u32 len = lzms_extend_delta_match(in_next, + matchptr, + 3, + in_end - in_next, + span); - if (offset_data >= LZMS_NUM_RECENT_OFFSETS) { + const u32 raw_offset = offset >> power; + const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) | + raw_offset; + const u32 source = DELTA_SOURCE_TAG | + (pair + LZMS_NUM_DELTA_REPS - 1); - /* Explicit offset LZ match */ + /* Early out for long explicit offset delta match */ + if (len >= c->mf.nice_match_len) { - lzms_update_lz_match_state(&cur_optimum_ptr->state, 0); + in_next = lzms_skip_bytes(c, len - 1, in_next + 1); - cur_optimum_ptr->state.lru.upcoming_offset = - offset_data - LZMS_OFFSET_OFFSET; - } else { - /* Repeat offset LZ match */ + lzms_encode_item_list(c, cur_node); + lzms_encode_item(c, len, source); - lzms_update_lz_match_state(&cur_optimum_ptr->state, 1); - lzms_update_lz_repeat_match_state(&cur_optimum_ptr->state, - offset_data); + c->optimum_nodes[0].state = cur_node->state; + cur_node = &c->optimum_nodes[0]; - cur_optimum_ptr->state.lru.upcoming_offset = - cur_optimum_ptr->state.lru.recent_offsets[offset_data]; + cur_node->state.upcoming_lz_offset = 0; + cur_node->state.upcoming_delta_pair = pair; + lzms_update_lru_queues(&cur_node->state); + lzms_update_main_state(&cur_node->state, 1); + lzms_update_match_state(&cur_node->state, 1); + lzms_update_delta_state(&cur_node->state, 0); + goto begin; + } - for (i = offset_data; i < LZMS_NUM_RECENT_OFFSETS; i++) - cur_optimum_ptr->state.lru.recent_offsets[i] = - cur_optimum_ptr->state.lru.recent_offsets[i + 1]; + while (end_node < cur_node + len) + (++end_node)->cost = INFINITE_COST; + + u32 base_cost = cur_node->cost + + lzms_bit_1_cost(cur_node->state.main_state, + c->main_probs) + + lzms_bit_1_cost(cur_node->state.match_state, + c->match_probs) + + lzms_bit_0_cost(cur_node->state.delta_state, + c->delta_probs) + + lzms_delta_source_cost(c, power, raw_offset); + + u32 l = NBYTES_HASHED_FOR_DELTA; + do { + u32 cost = base_cost + lzms_fast_length_cost(c, l); + if (cost < (cur_node + l)->cost) { + (cur_node + l)->cost = cost; + (cur_node + l)->item = (struct lzms_item) { + .length = l, + .source = source, + }; + (cur_node + l)->num_extra_items = 0; + } + } while (++l <= len); } } - lzms_update_lz_lru_queue(&cur_optimum_ptr->state.lru); + /* Literal */ + if (end_node < cur_node + 1) + (++end_node)->cost = INFINITE_COST; + const u32 cur_and_lit_cost = cur_node->cost + + lzms_literal_cost(c, cur_node->state.main_state, + *in_next); + if (cur_and_lit_cost < (cur_node + 1)->cost) { + (cur_node + 1)->cost = cur_and_lit_cost; + (cur_node + 1)->item = (struct lzms_item) { + .length = 1, + .source = *in_next, + }; + (cur_node + 1)->num_extra_items = 0; + } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) { + /* try lit + LZ-rep0 */ + const u32 offset = + (cur_node->state.prev_lz_offset) ? 
+ cur_node->state.prev_lz_offset : + cur_node->state.recent_lz_offsets[0]; + + if (load_u16_unaligned(in_next + 1) == + load_u16_unaligned(in_next + 1 - offset)) + { + const u32 rep0_len = lz_extend(in_next + 1, + in_next + 1 - offset, + 2, + min(in_end - (in_next + 1), + c->mf.nice_match_len)); + + unsigned main_state = cur_node->state.main_state; + + /* Update main_state after literal */ + main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS; + + /* Add cost of LZ-rep0 */ + const u32 cost = cur_and_lit_cost + + lzms_bit_1_cost(main_state, c->main_probs) + + lzms_bit_0_cost(cur_node->state.match_state, + c->match_probs) + + lzms_bit_1_cost(cur_node->state.lz_state, + c->lz_probs) + + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], + c->lz_rep_probs[0]) + + lzms_fast_length_cost(c, rep0_len); + + const u32 total_len = 1 + rep0_len; + + while (end_node < cur_node + total_len) + (++end_node)->cost = INFINITE_COST; + + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = (struct lzms_item) { + .length = rep0_len, + .source = 0, + }; + (cur_node + total_len)->extra_items[0] = (struct lzms_item) { + .length = 1, + .source = *in_next, + }; + (cur_node + total_len)->num_extra_items = 1; + } + } + } + + /* Advance to the next position. */ + in_next++; + cur_node++; + + /* The lowest-cost path to the current position is now known. + * Finalize the adaptive state that results from taking this + * lowest-cost path. */ + struct lzms_item item_to_take = cur_node->item; + struct lzms_optimum_node *source_node = cur_node - (item_to_take.length); + int next_item_idx = -1; + for (unsigned i = 0; i < cur_node->num_extra_items; i++) { + item_to_take = cur_node->extra_items[i]; + source_node -= item_to_take.length; + next_item_idx++; + } + cur_node->state = source_node->state; + for (;;) { + const u32 length = item_to_take.length; + u32 source = item_to_take.source; + + cur_node->state.upcoming_lz_offset = 0; + cur_node->state.upcoming_delta_pair = 0; + if (length > 1) { + /* Match */ + + lzms_update_main_state(&cur_node->state, 1); + + if (source & DELTA_SOURCE_TAG) { + /* Delta match */ + + lzms_update_match_state(&cur_node->state, 1); + source &= ~DELTA_SOURCE_TAG; + + if (source >= LZMS_NUM_DELTA_REPS) { + /* Explicit offset delta match */ + u32 pair = source - (LZMS_NUM_DELTA_REPS - 1); + lzms_update_delta_state(&cur_node->state, 0); + cur_node->state.upcoming_delta_pair = pair; + } else { + /* Repeat offset delta match */ + int rep_idx = source; + + lzms_update_delta_state(&cur_node->state, 1); + lzms_update_delta_rep_states(&cur_node->state, rep_idx); + + cur_node->state.upcoming_delta_pair = + cur_node->state.recent_delta_pairs[rep_idx]; + + for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++) + cur_node->state.recent_delta_pairs[i] = + cur_node->state.recent_delta_pairs[i + 1]; + } + } else { + lzms_update_match_state(&cur_node->state, 0); + + if (source >= LZMS_NUM_LZ_REPS) { + /* Explicit offset LZ match */ + lzms_update_lz_state(&cur_node->state, 0); + cur_node->state.upcoming_lz_offset = + source - (LZMS_NUM_LZ_REPS - 1); + } else { + /* Repeat offset LZ match */ + int rep_idx = source; + + lzms_update_lz_state(&cur_node->state, 1); + lzms_update_lz_rep_states(&cur_node->state, rep_idx); + + cur_node->state.upcoming_lz_offset = + cur_node->state.recent_lz_offsets[rep_idx]; + + for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++) + cur_node->state.recent_lz_offsets[i] = + cur_node->state.recent_lz_offsets[i + 1]; + } + } + } else { + 
/* Literal */ + lzms_update_main_state(&cur_node->state, 0); + } + + lzms_update_lru_queues(&cur_node->state); + + if (next_item_idx < 0) + break; + if (next_item_idx == 0) + item_to_take = cur_node->item; + else + item_to_take = cur_node->extra_items[next_item_idx - 1]; + --next_item_idx; + } /* * This loop will terminate when either of the following * conditions is true: * - * (1) cur_optimum_ptr == end_optimum_ptr + * (1) cur_node == end_node * * There are no paths that extend beyond the current * position. In this case, any path to a later position @@ -1234,7 +1980,7 @@ begin: * ahead and choose the list of items that led to this * position. * - * (2) cur_optimum_ptr == c->optimum_end + * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES] * * This bounds the number of times the algorithm can step * forward before it is guaranteed to start choosing items. @@ -1242,139 +1988,99 @@ begin: * the parser will not go too long without updating the * probability tables. * - * Note: no check for end-of-window is needed because - * end-of-window will trigger condition (1). + * Note: no check for end-of-buffer is needed because + * end-of-buffer will trigger condition (1). */ - if (cur_optimum_ptr == end_optimum_ptr || - cur_optimum_ptr == c->optimum_end) + if (cur_node == end_node || + cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]) { - c->optimum[0].state = cur_optimum_ptr->state; - break; + lzms_encode_nonempty_item_list(c, cur_node); + c->optimum_nodes[0].state = cur_node->state; + goto begin; } } - - /* Output the current list of items that constitute the minimum-cost - * path to the current position. */ - lzms_encode_item_list(c, cur_optimum_ptr); - goto begin; } static void -lzms_init_range_encoder(struct lzms_range_encoder *enc, - struct lzms_range_encoder_raw *rc, u32 num_states) +lzms_init_states_and_probabilities(struct lzms_compressor *c) { - enc->rc = rc; - enc->state = 0; - LZMS_ASSERT(is_power_of_2(num_states)); - enc->mask = num_states - 1; - lzms_init_probability_entries(enc->prob_entries, num_states); + c->main_state = 0; + c->match_state = 0; + c->lz_state = 0; + for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) + c->lz_rep_states[i] = 0; + c->delta_state = 0; + for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) + c->delta_rep_states[i] = 0; + + lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS); + lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS); + lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS); + for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) + lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS); + lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS); + for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) + lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS); } static void -lzms_init_huffman_encoder(struct lzms_huffman_encoder *enc, - struct lzms_output_bitstream *os, - unsigned num_syms, - unsigned rebuild_freq) -{ - enc->os = os; - enc->num_syms_written = 0; - enc->rebuild_freq = rebuild_freq; - enc->num_syms = num_syms; - for (unsigned i = 0; i < num_syms; i++) - enc->sym_freqs[i] = 1; - - make_canonical_huffman_code(enc->num_syms, - LZMS_MAX_CODEWORD_LEN, - enc->sym_freqs, - enc->lens, - enc->codewords); -} - -/* Prepare the LZMS compressor for compressing a block of data. 
*/ -static void -lzms_prepare_compressor(struct lzms_compressor *c, const u8 *udata, u32 ulen, - le16 *cdata, u32 clen16) +lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots) { - unsigned num_offset_slots; - - /* Copy the uncompressed data into the @c->cur_window buffer. */ - memcpy(c->cur_window, udata, ulen); - c->cur_window_size = ulen; - - /* Initialize the raw range encoder (writing forwards). */ - lzms_range_encoder_raw_init(&c->rc, cdata, clen16); - - /* Initialize the output bitstream for Huffman symbols and verbatim bits - * (writing backwards). */ - lzms_output_bitstream_init(&c->os, cdata, clen16); - - /* Calculate the number of offset slots required. */ - num_offset_slots = lzms_get_offset_slot(ulen - 1) + 1; - - /* Initialize a Huffman encoder for each alphabet. */ - lzms_init_huffman_encoder(&c->literal_encoder, &c->os, - LZMS_NUM_LITERAL_SYMS, - LZMS_LITERAL_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->lz_offset_encoder, &c->os, - num_offset_slots, - LZMS_LZ_OFFSET_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->length_encoder, &c->os, - LZMS_NUM_LENGTH_SYMS, - LZMS_LENGTH_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->delta_offset_encoder, &c->os, - num_offset_slots, - LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ); - - lzms_init_huffman_encoder(&c->delta_power_encoder, &c->os, - LZMS_NUM_DELTA_POWER_SYMS, - LZMS_DELTA_POWER_CODE_REBUILD_FREQ); - - /* Initialize range encoders, all of which wrap around the same - * lzms_range_encoder_raw. */ - lzms_init_range_encoder(&c->main_range_encoder, - &c->rc, LZMS_NUM_MAIN_STATES); - - lzms_init_range_encoder(&c->match_range_encoder, - &c->rc, LZMS_NUM_MATCH_STATES); - - lzms_init_range_encoder(&c->lz_match_range_encoder, - &c->rc, LZMS_NUM_LZ_MATCH_STATES); - - for (unsigned i = 0; i < ARRAY_LEN(c->lz_repeat_match_range_encoders); i++) - lzms_init_range_encoder(&c->lz_repeat_match_range_encoders[i], - &c->rc, LZMS_NUM_LZ_REPEAT_MATCH_STATES); - - lzms_init_range_encoder(&c->delta_match_range_encoder, - &c->rc, LZMS_NUM_DELTA_MATCH_STATES); - - for (unsigned i = 0; i < ARRAY_LEN(c->delta_repeat_match_range_encoders); i++) - lzms_init_range_encoder(&c->delta_repeat_match_range_encoders[i], - &c->rc, LZMS_NUM_DELTA_REPEAT_MATCH_STATES); - - /* Set initial length costs for lengths < LZMS_NUM_FAST_LENGTHS. */ - lzms_update_fast_length_costs(c); + lzms_init_huffman_code(&c->literal_rebuild_info, + LZMS_NUM_LITERAL_SYMS, + LZMS_LITERAL_CODE_REBUILD_FREQ, + c->literal_codewords, + c->literal_lens, + c->literal_freqs); + + lzms_init_huffman_code(&c->lz_offset_rebuild_info, + num_offset_slots, + LZMS_LZ_OFFSET_CODE_REBUILD_FREQ, + c->lz_offset_codewords, + c->lz_offset_lens, + c->lz_offset_freqs); + + lzms_init_huffman_code(&c->length_rebuild_info, + LZMS_NUM_LENGTH_SYMS, + LZMS_LENGTH_CODE_REBUILD_FREQ, + c->length_codewords, + c->length_lens, + c->length_freqs); + + lzms_init_huffman_code(&c->delta_offset_rebuild_info, + num_offset_slots, + LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ, + c->delta_offset_codewords, + c->delta_offset_lens, + c->delta_offset_freqs); + + lzms_init_huffman_code(&c->delta_power_rebuild_info, + LZMS_NUM_DELTA_POWER_SYMS, + LZMS_DELTA_POWER_CODE_REBUILD_FREQ, + c->delta_power_codewords, + c->delta_power_lens, + c->delta_power_freqs); } -/* Flush the output streams, prepare the final compressed data, and return its +/* + * Flush the output streams, prepare the final compressed data, and return its * size in bytes. 
* * A return value of 0 indicates that the data could not be compressed to fit in - * the available space. */ + * the available space. + */ static size_t -lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail) +lzms_finalize(struct lzms_compressor *c) { - size_t num_forwards_bytes; - size_t num_backwards_bytes; + size_t num_forwards_units; + size_t num_backwards_units; /* Flush both the forwards and backwards streams, and make sure they * didn't cross each other and start overwriting each other's data. */ if (!lzms_output_bitstream_flush(&c->os)) return 0; - if (!lzms_range_encoder_raw_flush(&c->rc)) + if (!lzms_range_encoder_flush(&c->rc)) return 0; if (c->rc.next > c->os.next) @@ -1385,156 +2091,109 @@ lzms_finalize(struct lzms_compressor *c, u8 *cdata, size_t csize_avail) * bitstream. Move the data output by the backwards bitstream to be * adjacent to the data output by the forward bitstream, and calculate * the compressed size that this results in. */ - num_forwards_bytes = (u8*)c->rc.next - (u8*)cdata; - num_backwards_bytes = ((u8*)cdata + csize_avail) - (u8*)c->os.next; - - memmove(cdata + num_forwards_bytes, c->os.next, num_backwards_bytes); + num_forwards_units = c->rc.next - c->rc.begin; + num_backwards_units = c->rc.end - c->os.next; - return num_forwards_bytes + num_backwards_bytes; -} + memmove(c->rc.next, c->os.next, num_backwards_units * sizeof(le16)); -/* Set internal compression parameters for the specified compression level and - * maximum window size. */ -static void -lzms_build_params(unsigned int compression_level, - struct lzms_compressor_params *params) -{ - /* Allow length 2 matches if the compression level is sufficiently high. - */ - if (compression_level >= 45) - params->min_match_length = 2; - else - params->min_match_length = 3; - - /* Scale nice_match_length with the compression level. But to allow an - * optimization on length cost calculations, don't allow - * nice_match_length to exceed LZMS_NUM_FAST_LENGTH. 
*/ - params->nice_match_length = ((u64)compression_level * 32) / 50; - if (params->nice_match_length < params->min_match_length) - params->nice_match_length = params->min_match_length; - if (params->nice_match_length > LZMS_NUM_FAST_LENGTHS) - params->nice_match_length = LZMS_NUM_FAST_LENGTHS; - params->optim_array_length = 1024; + return (num_forwards_units + num_backwards_units) * sizeof(le16); } -static void -lzms_free_compressor(void *_c); - static u64 -lzms_get_needed_memory(size_t max_block_size, unsigned int compression_level) +lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level) { - struct lzms_compressor_params params; u64 size = 0; - if (max_block_size > LZMS_MAX_BUFFER_SIZE) + if (max_bufsize > LZMS_MAX_BUFFER_SIZE) return 0; - lzms_build_params(compression_level, ¶ms); - size += sizeof(struct lzms_compressor); - /* cur_window */ - size += max_block_size; + /* in_buffer */ + size += max_bufsize; /* mf */ - size += lcpit_matchfinder_get_needed_memory(max_block_size); - - /* matches */ - size += (params.nice_match_length - params.min_match_length + 1) * - sizeof(struct lz_match); - - /* optimum */ - size += (params.optim_array_length + params.nice_match_length) * - sizeof(struct lzms_mc_pos_data); + size += lcpit_matchfinder_get_needed_memory(max_bufsize); return size; } static int -lzms_create_compressor(size_t max_block_size, unsigned int compression_level, - void **ctx_ret) +lzms_create_compressor(size_t max_bufsize, unsigned compression_level, + void **c_ret) { struct lzms_compressor *c; - struct lzms_compressor_params params; + u32 nice_match_len; - if (max_block_size > LZMS_MAX_BUFFER_SIZE) + if (max_bufsize > LZMS_MAX_BUFFER_SIZE) return WIMLIB_ERR_INVALID_PARAM; - lzms_build_params(compression_level, ¶ms); - - c = CALLOC(1, sizeof(struct lzms_compressor)); + c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64); if (!c) - goto oom; - - c->params = params; + goto oom0; - c->cur_window = MALLOC(max_block_size); - if (!c->cur_window) - goto oom; + /* Scale nice_match_len with the compression level. But to allow an + * optimization for length cost calculations, don't allow nice_match_len + * to exceed MAX_FAST_LENGTH. 
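+	 * E.g., compression level 50 gives nice_match_len =
+	 * (50 * 63) / 50 = 63, before the MAX_FAST_LENGTH cap is applied.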
 static size_t
-lzms_compress(const void *uncompressed_data, size_t uncompressed_size,
-              void *compressed_data, size_t compressed_size_avail, void *_c)
+lzms_compress(const void *in, size_t in_nbytes,
+              void *out, size_t out_nbytes_avail, void *_c)
 {
         struct lzms_compressor *c = _c;
 
-        /* Don't bother compressing extremely small inputs. */
-        if (uncompressed_size < 4)
+        /* Don't bother trying to compress extremely small inputs. */
+        if (in_nbytes < 4)
                 return 0;
 
-        /* Cap the available compressed size to a 32-bit integer and round it
-         * down to the nearest multiple of 2. */
-        if (compressed_size_avail > UINT32_MAX)
-                compressed_size_avail = UINT32_MAX;
-        if (compressed_size_avail & 1)
-                compressed_size_avail--;
-
-        /* Initialize the compressor structures. */
-        lzms_prepare_compressor(c, uncompressed_data, uncompressed_size,
-                                compressed_data, compressed_size_avail / 2);
+        /* Copy the input data into the internal buffer and preprocess it. */
+        memcpy(c->in_buffer, in, in_nbytes);
+        c->in_nbytes = in_nbytes;
+        lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
 
-        /* Preprocess the uncompressed data. */
-        lzms_x86_filter(c->cur_window, c->cur_window_size,
-                        c->last_target_usages, false);
+        /* Prepare the matchfinders. */
+        lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
+        if (c->use_delta_matches)
+                lzms_init_delta_matchfinder(c);
 
-        /* Load the window into the match-finder. */
-        lcpit_matchfinder_load_buffer(&c->mf, c->cur_window, c->cur_window_size);
+        /* Initialize the encoder structures. */
+        lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16));
+        lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16));
+        lzms_init_states_and_probabilities(c);
+        lzms_init_huffman_codes(c, lzms_get_num_offset_slots(in_nbytes));
 
-        /* Compute and encode a literal/match sequence that decompresses to the
-         * preprocessed data. */
+        /* The main loop: parse and encode. */
         lzms_near_optimal_parse(c);
 
         /* Return the compressed data size or 0. */
-        return lzms_finalize(c, compressed_data, compressed_size_avail);
+        return lzms_finalize(c);
 }
 
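Taken together, lzms_create_compressor(), lzms_compress(), and
lzms_free_compressor() form wimlib's internal compressor contract.  A minimal,
hypothetical driver using only the signatures visible in this patch (normally
these are reached through lzms_compressor_ops rather than called directly, and
running this requires linking against the patched libwim objects):

    #include <stdio.h>

    /* Signatures as they appear in this patch. */
    extern int lzms_create_compressor(size_t max_bufsize,
                                      unsigned compression_level, void **c_ret);
    extern size_t lzms_compress(const void *in, size_t in_nbytes,
                                void *out, size_t out_nbytes_avail, void *_c);
    extern void lzms_free_compressor(void *_c);

    int main(void)
    {
            static const char in[4096] = "example data, hopefully compressible";
            static char out[4096];
            void *c;
            size_t csize;

            if (lzms_create_compressor(sizeof(in), 50, &c) != 0)
                    return 1;

            /* A return of 0 means "did not compress to fit" (or the input was
             * trivially small); the caller then stores the block uncompressed. */
            csize = lzms_compress(in, sizeof(in), out, sizeof(out), c);
            printf("compressed %zu => %zu bytes\n", sizeof(in), csize);

            lzms_free_compressor(c);
            return 0;
    }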
 static void
@@ -1542,13 +2201,9 @@ lzms_free_compressor(void *_c)
 {
         struct lzms_compressor *c = _c;
 
-        if (c) {
-                FREE(c->cur_window);
-                lcpit_matchfinder_destroy(&c->mf);
-                FREE(c->matches);
-                FREE(c->optimum);
-                FREE(c);
-        }
+        FREE(c->in_buffer);
+        lcpit_matchfinder_destroy(&c->mf);
+        ALIGNED_FREE(c);
 }
 
 const struct compressor_ops lzms_compressor_ops = {
diff --git a/src/lzms_decompress.c b/src/lzms_decompress.c
index b7bfaff3..a54cc804 100644
--- a/src/lzms_decompress.c
+++ b/src/lzms_decompress.c
@@ -126,10 +126,9 @@
  * references to the first 3 entries at any given time.  The queue must be
  * initialized to the offsets {1, 2, 3, 4}.
  *
- * Repeat delta matches are handled similarly, but for them there are two queues
- * updated in lock-step: one for powers and one for raw offsets.  The power
- * queue must be initialized to {0, 0, 0, 0}, and the raw offset queue must be
- * initialized to {1, 2, 3, 4}.
+ * Repeat delta matches are handled similarly, but for them the queue contains
+ * (power, raw offset) pairs.  This queue must be initialized to
+ * {(0, 1), (0, 2), (0, 3), (0, 4)}.
  *
  * Bits from the binary range decoder must be used to disambiguate item types.
  * The range decoder must hold two state variables: the range, which must
@@ -154,14 +153,14 @@
  * it.
  *
  * The probability used to range-decode each bit must be taken from a table, of
- * which one instance must exist for each distinct context in which a
- * range-decoded bit is needed.  At each call of the range decoder, the
- * appropriate probability must be obtained by indexing the appropriate
- * probability table with the last 4 (in the context disambiguating literals
- * from matches), 5 (in the context disambiguating LZ matches from delta
- * matches), or 6 (in all other contexts) bits recently range-decoded in that
- * context, ordered such that the most recently decoded bit is the low-order bit
- * of the index.
+ * which one instance must exist for each distinct context, or "binary decision
+ * class", in which a range-decoded bit is needed.  At each call of the range
+ * decoder, the appropriate probability must be obtained by indexing the
+ * appropriate probability table with the last 4 (in the context disambiguating
+ * literals from matches), 5 (in the context disambiguating LZ matches from
+ * delta matches), or 6 (in all other contexts) bits recently range-decoded in
+ * that context, ordered such that the most recently decoded bit is the
+ * low-order bit of the index.
  *
  * Furthermore, each probability entry itself is variable, as its value must be
  * maintained as n/64 where n is the number of 0 bits in the most recently
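The "last 4, 5, or 6 bits" rule above amounts to keeping, per context, a small
bit history whose newest bit occupies the low-order position; the history value
itself is the index into that context's probability table.  A standalone
illustration (the 4-bit history size is just an example):

    #include <stdio.h>
    #include <stdint.h>

    /* Fold a newly decoded bit into a context's state: the state is the last
     * few bits decoded in that context, newest in the low-order position, and
     * it directly indexes the context's probability table. */
    static uint32_t
    update_state(uint32_t state, int bit, uint32_t num_states)
    {
            return ((state << 1) | (uint32_t)bit) & (num_states - 1);
    }

    int main(void)
    {
            uint32_t state = 0;              /* each context starts in state 0 */
            const uint32_t num_states = 16;  /* example: a 4-bit history */
            const int bits[] = { 1, 0, 1, 1, 0 };

            for (unsigned i = 0; i < sizeof(bits) / sizeof(bits[0]); i++) {
                    state = update_state(state, bits[i], num_states);
                    printf("after bit %d: state = %2u\n", bits[i], state);
            }
            return 0;
    }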
This + * code must be rebuilt whenever 1024 symbols have been decoded with it. * * - The delta power code, used for decoding the powers of delta matches. Each * of the 8 symbols corresponds to a power. This code must be rebuilt @@ -305,13 +304,13 @@ struct lzms_input_bitstream { /* Bookkeeping information for an adaptive Huffman code */ struct lzms_huffman_rebuild_info { unsigned num_syms_until_rebuild; + unsigned num_syms; unsigned rebuild_freq; - u16 *decode_table; - unsigned table_bits; - u32 *freqs; u32 *codewords; u8 *lens; - unsigned num_syms; + u32 *freqs; + u16 *decode_table; + unsigned table_bits; }; struct lzms_decompressor { @@ -324,39 +323,35 @@ struct lzms_decompressor { struct lzms_range_decoder rd; struct lzms_input_bitstream is; - /* Match offset LRU queues */ - u32 recent_lz_offsets[LZMS_NUM_RECENT_OFFSETS + 1]; - u64 recent_delta_offsets[LZMS_NUM_RECENT_OFFSETS + 1]; + /* LRU queues for match sources */ + u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1]; + u64 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1]; u32 pending_lz_offset; - u64 pending_delta_offset; + u64 pending_delta_pair; const u8 *lz_offset_still_pending; - const u8 *delta_offset_still_pending; + const u8 *delta_pair_still_pending; - /* States and probabilities for range decoding */ + /* States and probability entries for item type disambiguation */ u32 main_state; - struct lzms_probability_entry main_prob_entries[ - LZMS_NUM_MAIN_STATES]; + struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS]; u32 match_state; - struct lzms_probability_entry match_prob_entries[ - LZMS_NUM_MATCH_STATES]; + struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS]; - u32 lz_match_state; - struct lzms_probability_entry lz_match_prob_entries[ - LZMS_NUM_LZ_MATCH_STATES]; + u32 lz_state; + struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS]; - u32 delta_match_state; - struct lzms_probability_entry delta_match_prob_entries[ - LZMS_NUM_DELTA_MATCH_STATES]; + u32 delta_state; + struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS]; - u32 lz_repeat_match_states[LZMS_NUM_RECENT_OFFSETS - 1]; - struct lzms_probability_entry lz_repeat_match_prob_entries[ - LZMS_NUM_RECENT_OFFSETS - 1][LZMS_NUM_LZ_REPEAT_MATCH_STATES]; + u32 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; + struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS] + [LZMS_NUM_LZ_REP_PROBS]; - u32 delta_repeat_match_states[LZMS_NUM_RECENT_OFFSETS - 1]; - struct lzms_probability_entry delta_repeat_match_prob_entries[ - LZMS_NUM_RECENT_OFFSETS - 1][LZMS_NUM_DELTA_REPEAT_MATCH_STATES]; + u32 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; + struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS] + [LZMS_NUM_DELTA_REP_PROBS]; /* Huffman decoding */ @@ -366,18 +361,18 @@ struct lzms_decompressor { u32 literal_freqs[LZMS_NUM_LITERAL_SYMS]; struct lzms_huffman_rebuild_info literal_rebuild_info; - u16 length_decode_table[(1 << LZMS_LENGTH_TABLEBITS) + - (2 * LZMS_NUM_LENGTH_SYMS)] - _aligned_attribute(DECODE_TABLE_ALIGNMENT); - u32 length_freqs[LZMS_NUM_LENGTH_SYMS]; - struct lzms_huffman_rebuild_info length_rebuild_info; - u16 lz_offset_decode_table[(1 << LZMS_LZ_OFFSET_TABLEBITS) + ( 2 * LZMS_MAX_NUM_OFFSET_SYMS)] _aligned_attribute(DECODE_TABLE_ALIGNMENT); u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS]; struct lzms_huffman_rebuild_info lz_offset_rebuild_info; + u16 length_decode_table[(1 << LZMS_LENGTH_TABLEBITS) + + (2 * LZMS_NUM_LENGTH_SYMS)] + _aligned_attribute(DECODE_TABLE_ALIGNMENT); + u32 length_freqs[LZMS_NUM_LENGTH_SYMS]; 
@@ -477,12 +472,14 @@ lzms_range_decoder_init(struct lzms_range_decoder *rd,
         rd->end = in + count;
 }
 
-/* Decode and return the next bit from the range decoder.
+/*
+ * Decode and return the next bit from the range decoder.
  *
- * @prob is the chance out of LZMS_PROBABILITY_MAX that the next bit is 0.
+ * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
+ * bit is 0 rather than 1.
  */
 static inline int
-lzms_range_decoder_decode_bit(struct lzms_range_decoder *rd, u32 prob)
+lzms_range_decode_bit(struct lzms_range_decoder *rd, u32 prob)
 {
         u32 bound;
 
@@ -510,26 +507,26 @@ lzms_range_decoder_decode_bit(struct lzms_range_decoder *rd, u32 prob)
         }
 }
 
-/* Decode and return the next bit from the range decoder.  This wraps around
- * lzms_range_decoder_decode_bit() to handle using and updating the appropriate
- * state and probability entry. */
+/*
+ * Decode a bit.  This wraps around lzms_range_decode_bit() to handle using and
+ * updating the state and its corresponding probability entry.
+ */
 static inline int
-lzms_range_decode_bit(struct lzms_range_decoder *rd,
-                      u32 *state_p, u32 num_states,
-                      struct lzms_probability_entry prob_entries[])
+lzms_decode_bit(struct lzms_range_decoder *rd, u32 *state_p, u32 num_states,
+                struct lzms_probability_entry *probs)
 {
         struct lzms_probability_entry *prob_entry;
         u32 prob;
         int bit;
 
         /* Load the probability entry corresponding to the current state. */
-        prob_entry = &prob_entries[*state_p];
+        prob_entry = &probs[*state_p];
 
         /* Get the probability that the next bit is 0. */
         prob = lzms_get_probability(prob_entry);
 
         /* Decode the next bit. */
-        bit = lzms_range_decoder_decode_bit(rd, prob);
+        bit = lzms_range_decode_bit(rd, prob);
 
         /* Update the state and probability entry based on the decoded bit. */
         *state_p = ((*state_p << 1) | bit) & (num_states - 1);
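The body of lzms_range_decode_bit() is mostly elided by the hunk above, but its
contract is the standard binary range-decoding step: return 0 with probability
prob out of LZMS_PROBABILITY_DENOMINATOR.  The following is a generic sketch of
such a step under an assumed 6-bit denominator, not wimlib's exact
implementation (renormalization and input refill are omitted):

    #include <stdint.h>

    #define PROBABILITY_BITS 6   /* assumed: denominator of 64, as documented */

    struct range_decoder {
            uint32_t range;   /* size of the current coding interval */
            uint32_t code;    /* position within the interval */
    };

    /* Split the interval in proportion prob : (64 - prob); the low part
     * represents bit 0, the high part bit 1. */
    static int
    range_decode_bit(struct range_decoder *rd, uint32_t prob)
    {
            uint32_t bound = (rd->range >> PROBABILITY_BITS) * prob;

            if (rd->code < bound) {
                    rd->range = bound;          /* took the "0" subinterval */
                    return 0;
            } else {
                    rd->range -= bound;         /* took the "1" subinterval */
                    rd->code -= bound;
                    return 1;
            }
    }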
@@ -542,95 +539,97 @@ lzms_range_decode_bit(struct lzms_range_decoder *rd,
 static int
 lzms_decode_main_bit(struct lzms_decompressor *d)
 {
-        return lzms_range_decode_bit(&d->rd, &d->main_state,
-                                     LZMS_NUM_MAIN_STATES,
-                                     d->main_prob_entries);
+        return lzms_decode_bit(&d->rd, &d->main_state,
+                               LZMS_NUM_MAIN_PROBS, d->main_probs);
 }
 
 static int
 lzms_decode_match_bit(struct lzms_decompressor *d)
 {
-        return lzms_range_decode_bit(&d->rd, &d->match_state,
-                                     LZMS_NUM_MATCH_STATES,
-                                     d->match_prob_entries);
+        return lzms_decode_bit(&d->rd, &d->match_state,
+                               LZMS_NUM_MATCH_PROBS, d->match_probs);
 }
 
 static int
 lzms_decode_lz_match_bit(struct lzms_decompressor *d)
 {
-        return lzms_range_decode_bit(&d->rd, &d->lz_match_state,
-                                     LZMS_NUM_LZ_MATCH_STATES,
-                                     d->lz_match_prob_entries);
+        return lzms_decode_bit(&d->rd, &d->lz_state,
+                               LZMS_NUM_LZ_PROBS, d->lz_probs);
 }
 
 static int
 lzms_decode_delta_match_bit(struct lzms_decompressor *d)
 {
-        return lzms_range_decode_bit(&d->rd, &d->delta_match_state,
-                                     LZMS_NUM_DELTA_MATCH_STATES,
-                                     d->delta_match_prob_entries);
+        return lzms_decode_bit(&d->rd, &d->delta_state,
+                               LZMS_NUM_DELTA_PROBS, d->delta_probs);
 }
 
 static noinline int
 lzms_decode_lz_repeat_match_bit(struct lzms_decompressor *d, int idx)
 {
-        return lzms_range_decode_bit(&d->rd, &d->lz_repeat_match_states[idx],
-                                     LZMS_NUM_LZ_REPEAT_MATCH_STATES,
-                                     d->lz_repeat_match_prob_entries[idx]);
+        return lzms_decode_bit(&d->rd, &d->lz_rep_states[idx],
+                               LZMS_NUM_LZ_REP_PROBS, d->lz_rep_probs[idx]);
 }
 
 static noinline int
 lzms_decode_delta_repeat_match_bit(struct lzms_decompressor *d, int idx)
 {
-        return lzms_range_decode_bit(&d->rd, &d->delta_repeat_match_states[idx],
-                                     LZMS_NUM_DELTA_REPEAT_MATCH_STATES,
-                                     d->delta_repeat_match_prob_entries[idx]);
+        return lzms_decode_bit(&d->rd, &d->delta_rep_states[idx],
+                               LZMS_NUM_DELTA_REP_PROBS, d->delta_rep_probs[idx]);
 }
 
 static void
-lzms_init_huffman_rebuild_info(struct lzms_huffman_rebuild_info *info,
-                               unsigned rebuild_freq,
-                               u16 *decode_table, unsigned table_bits,
-                               u32 *freqs, u32 *codewords, u8 *lens,
-                               unsigned num_syms)
+lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
 {
-        info->num_syms_until_rebuild = 1;
-        info->rebuild_freq = rebuild_freq;
-        info->decode_table = decode_table;
-        info->table_bits = table_bits;
-        info->freqs = freqs;
-        info->codewords = codewords;
-        info->lens = lens;
-        info->num_syms = num_syms;
+        make_canonical_huffman_code(rebuild_info->num_syms,
+                                    LZMS_MAX_CODEWORD_LENGTH,
+                                    rebuild_info->freqs,
+                                    rebuild_info->lens,
+                                    rebuild_info->codewords);
+
+        make_huffman_decode_table(rebuild_info->decode_table,
+                                  rebuild_info->num_syms,
+                                  rebuild_info->table_bits,
+                                  rebuild_info->lens,
+                                  LZMS_MAX_CODEWORD_LENGTH);
+
+        rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
+}
+
+static void
+lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
+                       unsigned num_syms, unsigned rebuild_freq,
+                       u32 *codewords, u8 *lens, u32 *freqs,
+                       u16 *decode_table, unsigned table_bits)
+{
+        rebuild_info->num_syms = num_syms;
+        rebuild_info->rebuild_freq = rebuild_freq;
+        rebuild_info->codewords = codewords;
+        rebuild_info->lens = lens;
+        rebuild_info->freqs = freqs;
+        rebuild_info->decode_table = decode_table;
+        rebuild_info->table_bits = table_bits;
 
         lzms_init_symbol_frequencies(freqs, num_syms);
+        lzms_build_huffman_code(rebuild_info);
 }
 
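After each rebuild the symbol frequencies are diluted rather than reset, so the
next code still reflects recent statistics while every symbol stays codeable.
The transform is visible in the removed inline loop in the next hunk; here it
is demonstrated on its own (lzms_dilute_symbol_frequencies() is assumed to do
exactly this):

    #include <stdio.h>
    #include <stdint.h>

    /* Halve each count, then add 1 so that no symbol's frequency drops to
     * zero; matches the (freqs[i] >> 1) + 1 loop removed below. */
    static void
    dilute_symbol_frequencies(uint32_t freqs[], unsigned num_syms)
    {
            for (unsigned i = 0; i < num_syms; i++)
                    freqs[i] = (freqs[i] >> 1) + 1;
    }

    int main(void)
    {
            uint32_t freqs[] = { 100, 7, 0, 1 };

            dilute_symbol_frequencies(freqs, 4);

            /* 100 -> 51, 7 -> 4, 0 -> 1, 1 -> 1: recent statistics decay, but
             * every symbol remains representable in the rebuilt code. */
            for (int i = 0; i < 4; i++)
                    printf("%u\n", freqs[i]);
            return 0;
    }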
 static noinline void
-lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *info)
+lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
 {
-        make_canonical_huffman_code(info->num_syms, LZMS_MAX_CODEWORD_LEN,
-                                    info->freqs, info->lens, info->codewords);
-        make_huffman_decode_table(info->decode_table, info->num_syms,
-                                  info->table_bits, info->lens,
-                                  LZMS_MAX_CODEWORD_LEN);
-        for (unsigned i = 0; i < info->num_syms; i++)
-                info->freqs[i] = (info->freqs[i] >> 1) + 1;
-        info->num_syms_until_rebuild = info->rebuild_freq;
+        lzms_build_huffman_code(rebuild_info);
+        lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
 }
 
 static inline unsigned
-lzms_decode_huffman_symbol(struct lzms_input_bitstream *is,
-                           u16 decode_table[], unsigned table_bits,
+lzms_decode_huffman_symbol(struct lzms_input_bitstream *is, u16 decode_table[],
+                           unsigned table_bits, u32 freqs[],
                            struct lzms_huffman_rebuild_info *rebuild_info)
 {
         unsigned key_bits;
         unsigned entry;
         unsigned sym;
 
-        if (unlikely(--rebuild_info->num_syms_until_rebuild == 0))
-                lzms_rebuild_huffman_code(rebuild_info);
-
-        lzms_ensure_bits(is, LZMS_MAX_CODEWORD_LEN);
+        lzms_ensure_bits(is, LZMS_MAX_CODEWORD_LENGTH);
 
         /* Index the decode table by the next table_bits bits of the input. */
         key_bits = lzms_peek_bits(is, table_bits);
@@ -654,8 +653,9 @@ lzms_decode_huffman_symbol(struct lzms_input_bitstream *is,
                 sym = entry;
         }
 
-        /* Tally and return the decoded symbol. */
-        rebuild_info->freqs[sym]++;
+        freqs[sym]++;
+        if (--rebuild_info->num_syms_until_rebuild == 0)
+                lzms_rebuild_huffman_code(rebuild_info);
         return sym;
 }
 
@@ -665,15 +665,29 @@ lzms_decode_literal(struct lzms_decompressor *d)
         return lzms_decode_huffman_symbol(&d->is,
                                           d->literal_decode_table,
                                           LZMS_LITERAL_TABLEBITS,
+                                          d->literal_freqs,
                                           &d->literal_rebuild_info);
 }
 
+static u32
+lzms_decode_lz_offset(struct lzms_decompressor *d)
+{
+        unsigned slot = lzms_decode_huffman_symbol(&d->is,
+                                                   d->lz_offset_decode_table,
+                                                   LZMS_LZ_OFFSET_TABLEBITS,
+                                                   d->lz_offset_freqs,
+                                                   &d->lz_offset_rebuild_info);
+        return lzms_offset_slot_base[slot] +
+               lzms_read_bits(&d->is, lzms_extra_offset_bits[slot]);
+}
+
 static u32
 lzms_decode_length(struct lzms_decompressor *d)
 {
         unsigned slot = lzms_decode_huffman_symbol(&d->is,
                                                    d->length_decode_table,
                                                    LZMS_LENGTH_TABLEBITS,
+                                                   d->length_freqs,
                                                    &d->length_rebuild_info);
         u32 length = lzms_length_slot_base[slot];
         unsigned num_extra_bits = lzms_extra_length_bits[slot];
@@ -683,23 +697,13 @@ lzms_decode_length(struct lzms_decompressor *d)
         return length;
 }
 
-static u32
-lzms_decode_lz_offset(struct lzms_decompressor *d)
-{
-        unsigned slot = lzms_decode_huffman_symbol(&d->is,
-                                                   d->lz_offset_decode_table,
-                                                   LZMS_LZ_OFFSET_TABLEBITS,
-                                                   &d->lz_offset_rebuild_info);
-        return lzms_offset_slot_base[slot] +
-               lzms_read_bits(&d->is, lzms_extra_offset_bits[slot]);
-}
-
 static u32
 lzms_decode_delta_offset(struct lzms_decompressor *d)
 {
         unsigned slot = lzms_decode_huffman_symbol(&d->is,
                                                    d->delta_offset_decode_table,
                                                    LZMS_DELTA_OFFSET_TABLEBITS,
+                                                   d->delta_offset_freqs,
                                                    &d->delta_offset_rebuild_info);
         return lzms_offset_slot_base[slot] +
                lzms_read_bits(&d->is, lzms_extra_offset_bits[slot]);
@@ -711,6 +715,7 @@ lzms_decode_delta_power(struct lzms_decompressor *d)
         return lzms_decode_huffman_symbol(&d->is,
                                           d->delta_power_decode_table,
                                           LZMS_DELTA_POWER_TABLEBITS,
+                                          d->delta_power_freqs,
                                           &d->delta_power_rebuild_info);
 }
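Offset and length decoding share one slot scheme: the Huffman symbol selects a
slot, and the slot supplies a base value plus a count of extra verbatim bits,
as in lzms_offset_slot_base[slot] + lzms_read_bits(...).  The tables in this
sketch are illustrative stand-ins, not the real LZMS tables:

    #include <assert.h>
    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative slot tables (NOT the real LZMS tables): slot i covers the
     * values [slot_base[i], slot_base[i+1]), using extra_bits[i] extra bits. */
    static const uint32_t slot_base[]  = { 1, 2, 4, 8, 16 };
    static const uint8_t  extra_bits[] = { 0, 1, 2, 3 };

    /* Mirror of the base-plus-extra-bits reconstruction above. */
    static uint32_t
    decode_from_slot(unsigned slot, uint32_t extra)
    {
            assert(extra < (1u << extra_bits[slot]));
            return slot_base[slot] + extra;
    }

    int main(void)
    {
            /* Slot 3 has base 8 and 3 extra bits, so it covers [8, 16):
             * extra bits 0b101 = 5 reconstitutes 8 + 5 = 13. */
            printf("%u\n", decode_from_slot(3, 5));
            return 0;
    }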
@@ -740,7 +745,7 @@ lzms_decode_items(struct lzms_decompressor * const restrict d,
                 if (d->pending_lz_offset != 0 &&
                     out_next != d->lz_offset_still_pending)
                 {
-                        BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
+                        BUILD_BUG_ON(LZMS_NUM_LZ_REPS != 3);
                         d->recent_lz_offsets[3] = d->recent_lz_offsets[2];
                         d->recent_lz_offsets[2] = d->recent_lz_offsets[1];
                         d->recent_lz_offsets[1] = d->recent_lz_offsets[0];
@@ -754,7 +759,7 @@
                 } else {
                         /* Repeat offset */
 
-                        BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
+                        BUILD_BUG_ON(LZMS_NUM_LZ_REPS != 3);
 
                         if (!lzms_decode_lz_repeat_match_bit(d, 0)) {
                                 offset = d->recent_lz_offsets[0];
                                 d->recent_lz_offsets[0] = d->recent_lz_offsets[1];
@@ -771,7 +776,7 @@
                 }
 
                 if (d->pending_lz_offset != 0) {
-                        BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
+                        BUILD_BUG_ON(LZMS_NUM_LZ_REPS != 3);
                         d->recent_lz_offsets[3] = d->recent_lz_offsets[2];
                         d->recent_lz_offsets[2] = d->recent_lz_offsets[1];
                         d->recent_lz_offsets[1] = d->recent_lz_offsets[0];
@@ -786,7 +791,7 @@
                 if (unlikely(offset > out_next - out))
                         return -1;
 
-                lz_copy(out_next, length, offset, out_end, LZMS_MIN_MATCH_LEN);
+                lz_copy(out_next, length, offset, out_end, LZMS_MIN_MATCH_LENGTH);
                 out_next += length;
 
                 d->lz_offset_still_pending = out_next;
@@ -799,18 +804,18 @@
                 u32 raw_offset;
                 u32 span;
                 u32 offset;
-                const u8 *B, *C, *D;
+                const u8 *matchptr;
                 u32 length;
 
-                if (d->pending_delta_offset != 0 &&
-                    out_next != d->delta_offset_still_pending)
+                if (d->pending_delta_pair != 0 &&
+                    out_next != d->delta_pair_still_pending)
                 {
-                        BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
-                        d->recent_delta_offsets[3] = d->recent_delta_offsets[2];
-                        d->recent_delta_offsets[2] = d->recent_delta_offsets[1];
-                        d->recent_delta_offsets[1] = d->recent_delta_offsets[0];
-                        d->recent_delta_offsets[0] = d->pending_delta_offset;
-                        d->pending_delta_offset = 0;
+                        BUILD_BUG_ON(LZMS_NUM_DELTA_REPS != 3);
+                        d->recent_delta_pairs[3] = d->recent_delta_pairs[2];
+                        d->recent_delta_pairs[2] = d->recent_delta_pairs[1];
+                        d->recent_delta_pairs[1] = d->recent_delta_pairs[0];
+                        d->recent_delta_pairs[0] = d->pending_delta_pair;
+                        d->pending_delta_pair = 0;
                 }
 
                 if (!lzms_decode_delta_match_bit(d)) {
@@ -821,32 +826,32 @@
                         /* Repeat offset */
                         u64 val;
 
-                        BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
+                        BUILD_BUG_ON(LZMS_NUM_DELTA_REPS != 3);
 
                         if (!lzms_decode_delta_repeat_match_bit(d, 0)) {
-                                val = d->recent_delta_offsets[0];
-                                d->recent_delta_offsets[0] = d->recent_delta_offsets[1];
-                                d->recent_delta_offsets[1] = d->recent_delta_offsets[2];
-                                d->recent_delta_offsets[2] = d->recent_delta_offsets[3];
+                                val = d->recent_delta_pairs[0];
+                                d->recent_delta_pairs[0] = d->recent_delta_pairs[1];
+                                d->recent_delta_pairs[1] = d->recent_delta_pairs[2];
+                                d->recent_delta_pairs[2] = d->recent_delta_pairs[3];
                         } else if (!lzms_decode_delta_repeat_match_bit(d, 1)) {
-                                val = d->recent_delta_offsets[1];
-                                d->recent_delta_offsets[1] = d->recent_delta_offsets[2];
-                                d->recent_delta_offsets[2] = d->recent_delta_offsets[3];
+                                val = d->recent_delta_pairs[1];
+                                d->recent_delta_pairs[1] = d->recent_delta_pairs[2];
+                                d->recent_delta_pairs[2] = d->recent_delta_pairs[3];
                         } else {
-                                val = d->recent_delta_offsets[2];
-                                d->recent_delta_offsets[2] = d->recent_delta_offsets[3];
+                                val = d->recent_delta_pairs[2];
+                                d->recent_delta_pairs[2] = d->recent_delta_pairs[3];
                         }
 
                         power = val >> 32;
                         raw_offset = (u32)val;
                 }
 
-                if (d->pending_delta_offset != 0) {
-                        BUILD_BUG_ON(LZMS_NUM_RECENT_OFFSETS != 3);
-                        d->recent_delta_offsets[3] = d->recent_delta_offsets[2];
-                        d->recent_delta_offsets[2] = d->recent_delta_offsets[1];
-                        d->recent_delta_offsets[1] = d->recent_delta_offsets[0];
-                        d->recent_delta_offsets[0] = d->pending_delta_offset;
+                if (d->pending_delta_pair != 0) {
+                        BUILD_BUG_ON(LZMS_NUM_DELTA_REPS != 3);
+                        d->recent_delta_pairs[3] = d->recent_delta_pairs[2];
+                        d->recent_delta_pairs[2] = d->recent_delta_pairs[1];
+                        d->recent_delta_pairs[1] = d->recent_delta_pairs[0];
+                        d->recent_delta_pairs[0] = d->pending_delta_pair;
                 }
 
-                d->pending_delta_offset = raw_offset | ((u64)power << 32);
+                d->pending_delta_pair = raw_offset | ((u64)power << 32);
 
                 length = lzms_decode_length(d);
 
@@ -869,15 +874,15 @@
                 if (unlikely(length > out_end - out_next))
                         return -1;
 
-                B = out_next - span;
-                C = out_next - offset;
-                D = C - span;
-
+                matchptr = out_next - offset;
                 do {
-                        *out_next++ = *B++ + *C++ - *D++;
+                        *out_next = *matchptr + *(out_next - span) -
+                                    *(matchptr - span);
+                        out_next++;
+                        matchptr++;
                 } while (--length);
 
-                d->delta_offset_still_pending = out_next;
+                d->delta_pair_still_pending = out_next;
                 }
         }
         return 0;
 }
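The delta-match loop above reconstructs each byte as *matchptr +
*(out_next - span) - *(matchptr - span), i.e., it reapplies the change observed
span bytes earlier.  A standalone demonstration on a made-up stride-4 sequence;
here both span and offset are 4 (corresponding to power 2 and raw offset 1
under the usual span = 1 << power scaling, an assumption since that computation
is elided above):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* A sequence with stride-4 structure: each group of 4 bytes is
             * the previous group plus 1. */
            uint8_t buf[16];
            for (int i = 0; i < 8; i++)
                    buf[i] = (uint8_t)(i % 4 + i / 4);

            /* Decode a delta match of length 8 with offset 4 and span 4,
             * using the same per-byte formula as the loop above. */
            uint32_t span = 4, offset = 4, length = 8;
            uint8_t *out_next = &buf[8];
            const uint8_t *matchptr = out_next - offset;
            do {
                    *out_next = (uint8_t)(*matchptr + *(out_next - span) -
                                          *(matchptr - span));
                    out_next++;
                    matchptr++;
            } while (--length);

            /* The "+1 per group of 4" pattern continues for 8 more bytes. */
            for (int i = 8; i < 16; i++)
                    assert(buf[i] == (uint8_t)(i % 4 + i / 4));
            return 0;
    }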
@@ -888,87 +893,89 @@ lzms_init_decompressor(struct lzms_decompressor *d, const void *in,
                        size_t in_nbytes, unsigned num_offset_slots)
 {
         /* Match offset LRU queues */
-        for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++) {
+        for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
                 d->recent_lz_offsets[i] = i + 1;
-                d->recent_delta_offsets[i] = i + 1;
-        }
+        for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
+                d->recent_delta_pairs[i] = i + 1;
 
         d->pending_lz_offset = 0;
-        d->pending_delta_offset = 0;
+        d->pending_delta_pair = 0;
 
         /* Range decoding */
 
         lzms_range_decoder_init(&d->rd, in, in_nbytes / sizeof(le16));
 
         d->main_state = 0;
-        lzms_init_probability_entries(d->main_prob_entries, LZMS_NUM_MAIN_STATES);
+        lzms_init_probability_entries(d->main_probs, LZMS_NUM_MAIN_PROBS);
 
         d->match_state = 0;
-        lzms_init_probability_entries(d->match_prob_entries, LZMS_NUM_MATCH_STATES);
+        lzms_init_probability_entries(d->match_probs, LZMS_NUM_MATCH_PROBS);
 
-        d->lz_match_state = 0;
-        lzms_init_probability_entries(d->lz_match_prob_entries, LZMS_NUM_LZ_MATCH_STATES);
+        d->lz_state = 0;
+        lzms_init_probability_entries(d->lz_probs, LZMS_NUM_LZ_PROBS);
 
-        d->delta_match_state = 0;
-        lzms_init_probability_entries(d->delta_match_prob_entries, LZMS_NUM_DELTA_MATCH_STATES);
+        for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) {
+                d->lz_rep_states[i] = 0;
+                lzms_init_probability_entries(d->lz_rep_probs[i],
+                                              LZMS_NUM_LZ_REP_PROBS);
+        }
 
-        for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++) {
-                d->lz_repeat_match_states[i] = 0;
-                lzms_init_probability_entries(d->lz_repeat_match_prob_entries[i],
-                                              LZMS_NUM_LZ_REPEAT_MATCH_STATES);
+        d->delta_state = 0;
+        lzms_init_probability_entries(d->delta_probs, LZMS_NUM_DELTA_PROBS);
 
-                d->delta_repeat_match_states[i] = 0;
-                lzms_init_probability_entries(d->delta_repeat_match_prob_entries[i],
-                                              LZMS_NUM_DELTA_REPEAT_MATCH_STATES);
+        for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) {
+                d->delta_rep_states[i] = 0;
+                lzms_init_probability_entries(d->delta_rep_probs[i],
+                                              LZMS_NUM_DELTA_REP_PROBS);
         }
 
         /* Huffman decoding */
 
         lzms_input_bitstream_init(&d->is, in, in_nbytes / sizeof(le16));
 
-        lzms_init_huffman_rebuild_info(&d->literal_rebuild_info,
-                                       LZMS_LITERAL_CODE_REBUILD_FREQ,
-                                       d->literal_decode_table,
-                                       LZMS_LITERAL_TABLEBITS,
-                                       d->literal_freqs,
-                                       d->codewords,
-                                       d->lens,
-                                       LZMS_NUM_LITERAL_SYMS);
-
-        lzms_init_huffman_rebuild_info(&d->length_rebuild_info,
-                                       LZMS_LENGTH_CODE_REBUILD_FREQ,
-                                       d->length_decode_table,
-                                       LZMS_LENGTH_TABLEBITS,
-                                       d->length_freqs,
-                                       d->codewords,
-                                       d->lens,
-                                       LZMS_NUM_LENGTH_SYMS);
-
-        lzms_init_huffman_rebuild_info(&d->lz_offset_rebuild_info,
-                                       LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
-                                       d->lz_offset_decode_table,
-                                       LZMS_LZ_OFFSET_TABLEBITS,
-                                       d->lz_offset_freqs,
-                                       d->codewords,
-                                       d->lens,
-                                       num_offset_slots);
-
-        lzms_init_huffman_rebuild_info(&d->delta_offset_rebuild_info,
-                                       LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
-                                       d->delta_offset_decode_table,
-                                       LZMS_DELTA_OFFSET_TABLEBITS,
-                                       d->delta_offset_freqs,
-                                       d->codewords,
-                                       d->lens,
-                                       num_offset_slots);
-
-        lzms_init_huffman_rebuild_info(&d->delta_power_rebuild_info,
-                                       LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
-                                       d->delta_power_decode_table,
-                                       LZMS_DELTA_POWER_TABLEBITS,
-                                       d->delta_power_freqs,
-                                       d->codewords,
-                                       d->lens,
-                                       LZMS_NUM_DELTA_POWER_SYMS);
+        lzms_init_huffman_code(&d->literal_rebuild_info,
+                               LZMS_NUM_LITERAL_SYMS,
+                               LZMS_LITERAL_CODE_REBUILD_FREQ,
+                               d->codewords,
+                               d->lens,
+                               d->literal_freqs,
+                               d->literal_decode_table,
+                               LZMS_LITERAL_TABLEBITS);
+
+        lzms_init_huffman_code(&d->lz_offset_rebuild_info,
+                               num_offset_slots,
+                               LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
+                               d->codewords,
+                               d->lens,
+                               d->lz_offset_freqs,
+                               d->lz_offset_decode_table,
+                               LZMS_LZ_OFFSET_TABLEBITS);
+
+        lzms_init_huffman_code(&d->length_rebuild_info,
+                               LZMS_NUM_LENGTH_SYMS,
+                               LZMS_LENGTH_CODE_REBUILD_FREQ,
+                               d->codewords,
+                               d->lens,
+                               d->length_freqs,
+                               d->length_decode_table,
+                               LZMS_LENGTH_TABLEBITS);
+
+        lzms_init_huffman_code(&d->delta_offset_rebuild_info,
+                               num_offset_slots,
+                               LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
+                               d->codewords,
+                               d->lens,
+                               d->delta_offset_freqs,
+                               d->delta_offset_decode_table,
+                               LZMS_DELTA_OFFSET_TABLEBITS);
+
+        lzms_init_huffman_code(&d->delta_power_rebuild_info,
+                               LZMS_NUM_DELTA_POWER_SYMS,
+                               LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
+                               d->codewords,
+                               d->lens,
+                               d->delta_power_freqs,
+                               d->delta_power_decode_table,
+                               LZMS_DELTA_POWER_TABLEBITS);
 }
 
 static int
@@ -988,9 +995,11 @@ lzms_create_decompressor(size_t max_bufsize, void **d_ret)
         return 0;
 }
 
-/* Decompress @in_nbytes bytes of LZMS-compressed data at @in and write the
+/*
+ * Decompress @in_nbytes bytes of LZMS-compressed data at @in and write the
  * uncompressed data, which had original size @out_nbytes, to @out.  Return 0 if
- * successful or -1 if the compressed data is invalid. */
+ * successful or -1 if the compressed data is invalid.
+ */
 static int
 lzms_decompress(const void *in, size_t in_nbytes, void *out, size_t out_nbytes,
                 void *_d)