X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fxpress-compress.c;h=f39d54758969f2884f37ee2e8d50b3164152b7c4;hp=4b2469498871b682b9357112249363a5df06fd04;hb=93110bb18090d4d2c00294a56f819c7edeef318f;hpb=157d002da341c9109c5c065893ae82c6dbf5d4e8 diff --git a/src/xpress-compress.c b/src/xpress-compress.c index 4b246949..f39d5475 100644 --- a/src/xpress-compress.c +++ b/src/xpress-compress.c @@ -1,13 +1,12 @@ /* * xpress-compress.c * - * XPRESS compression routines. - * - * See the comments in xpress-decompress.c about the XPRESS format. + * A compressor that produces output compatible with the XPRESS (Huffman + * version) compression format. */ /* - * Copyright (C) 2012, 2013 Eric Biggers + * Copyright (C) 2012, 2013, 2014 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -29,240 +28,1227 @@ # include "config.h" #endif -#include "wimlib.h" -#include "wimlib/assert.h" -#include "wimlib/compress.h" +#include "wimlib/compress_common.h" +#include "wimlib/compressor_ops.h" +#include "wimlib/endianness.h" #include "wimlib/error.h" +#include "wimlib/lz_mf.h" #include "wimlib/util.h" #include "wimlib/xpress.h" -#ifdef HAVE_ALLOCA_H -# include -#endif - #include +#include + +#define XPRESS_CACHE_PER_POS 8 +#define XPRESS_OPTIM_ARRAY_LENGTH 4096 + +struct xpress_compressor; +struct xpress_item; +struct xpress_mc_pos_data; + +/* Internal compression parameters */ +struct xpress_compressor_params { + + /* See xpress_choose_items() */ + u32 (*choose_items_func)(struct xpress_compressor *); + + /* For near-optimal parsing only */ + u32 num_optim_passes; + + /* Match-finding algorithm and parameters */ + enum lz_mf_algo mf_algo; + u32 max_search_depth; + u32 nice_match_length; +}; + +/* State of the XPRESS compressor */ +struct xpress_compressor { + + /* Internal compression parameters */ + struct xpress_compressor_params params; + + /* Data currently being compressed */ + const u8 *cur_window; + u32 cur_window_size; + + /* Lempel-Ziv match-finder */ + struct lz_mf *mf; + + /* Optimal parsing data */ + unsigned (*get_matches_func)(struct xpress_compressor *, + const struct lz_match **); + void (*skip_bytes_func)(struct xpress_compressor *, unsigned n); + struct lz_match *cached_matches; + struct lz_match *cache_ptr; + struct lz_match *cache_limit; + struct xpress_mc_pos_data *optimum; + u8 costs[XPRESS_NUM_SYMBOLS]; + + /* The selected sequence of matches/literals */ + struct xpress_item *chosen_items; + + /* Symbol frequency counters */ + u32 freqs[XPRESS_NUM_SYMBOLS]; + + /* The current Huffman code */ + u32 codewords[XPRESS_NUM_SYMBOLS]; + u8 lens[XPRESS_NUM_SYMBOLS]; +}; + +/* Intermediate XPRESS match/literal format */ +struct xpress_item { + + /* Bits 0 - 8: Symbol + * Bits 9 - 24: Length - XPRESS_MIN_MATCH_LEN + * Bits 25 - 28: Number of extra offset bits + * Bits 29+ : Extra offset bits */ -/* Intermediate XPRESS match/literal representation. */ -struct xpress_match { - u16 adjusted_len; /* Match length minus XPRESS_MIN_MATCH_LEN */ - u16 offset; /* Match offset */ - /* For literals, offset == 0 and adjusted_len is the literal. */ + u64 data; }; /* - * Writes @match, which is a match given in the intermediate representation for - * XPRESS matches, to the output stream @ostream. + * Match chooser position data: * - * @codewords and @lens provide the Huffman code that is being used. + * An array of these structures is used during the near-optimal match-choosing + * algorithm. They correspond to consecutive positions in the window and are + * used to keep track of the cost to reach each position, and the match/literal + * choices that need to be chosen to reach that position. + */ +struct xpress_mc_pos_data { + + /* The cost, in bits, of the lowest-cost path that has been found to + * reach this position. This can change as progressively lower cost + * paths are found to reach this position. */ + u32 cost; +#define MC_INFINITE_COST UINT32_MAX + + /* The match or literal that was taken to reach this position. This can + * change as progressively lower cost paths are found to reach this + * position. + * + * This variable is divided into two bitfields. + * + * Literals: + * Low bits are 1, high bits are the literal. + * + * Matches: + * Low bits are the match length, high bits are the offset. + */ + u32 mc_item_data; +#define MC_OFFSET_SHIFT 16 +#define MC_LEN_MASK ((1 << MC_OFFSET_SHIFT) - 1) +}; + + +/* + * Structure to keep track of the current state of sending data to the + * compressed output buffer. + * + * The XPRESS bitstream is encoded as a sequence of little endian 16-bit coding + * units interwoven with literal bytes. + */ +struct xpress_output_bitstream { + + /* Bits that haven't yet been written to the output buffer. */ + u32 bitbuf; + + /* Number of bits currently held in @bitbuf. */ + u32 bitcount; + + /* Pointer to the start of the output buffer. */ + u8 *start; + + /* Pointer to the location in the ouput buffer at which to write the + * next 16 bits. */ + u8 *next_bits; + + /* Pointer to the location in the output buffer at which to write the + * next 16 bits, after @next_bits. */ + u8 *next_bits2; + + /* Pointer to the location in the output buffer at which to write the + * next literal byte. */ + u8 *next_byte; + + /* Pointer to the end of the output buffer. */ + u8 *end; +}; + +/* + * Initialize the output bitstream. + * + * @os + * The output bitstream structure to initialize. + * @buffer + * The buffer to write to. + * @size + * Size of @buffer, in bytes. Must be at least 4. */ static void -xpress_write_match(struct xpress_match match, - struct output_bitstream *restrict ostream, - const u16 codewords[restrict], - const u8 lens[restrict]) +xpress_init_output(struct xpress_output_bitstream *os, void *buffer, u32 size) { - u8 len_hdr = min(match.adjusted_len, 0xf); - u8 offset_bsr = bsr32(match.offset); - unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + os->bitbuf = 0; + os->bitcount = 0; + os->start = buffer; + os->next_bits = os->start; + os->next_bits2 = os->start + 2; + os->next_byte = os->start + 4; + os->end = os->start + size; +} - bitstream_put_bits(ostream, codewords[sym], lens[sym]); +/* + * Write some bits to the output bitstream. + * + * The bits are given by the low-order @num_bits bits of @bits. Higher-order + * bits in @bits cannot be set. At most 16 bits can be written at once. + * + * If the output buffer space is exhausted, then the bits will be ignored, and + * xpress_flush_output() will return 0 when it gets called. + */ +static inline void +xpress_write_bits(struct xpress_output_bitstream *os, + const u32 bits, const unsigned int num_bits) +{ + /* This code is optimized for XPRESS, which never needs to write more + * than 16 bits at once. */ - if (match.adjusted_len >= 0xf) { - u8 byte1 = min(match.adjusted_len - 0xf, 0xff); - bitstream_put_byte(ostream, byte1); - if (byte1 == 0xff) { - bitstream_put_byte(ostream, match.adjusted_len & 0xff); - bitstream_put_byte(ostream, match.adjusted_len >> 8); + os->bitcount += num_bits; + os->bitbuf = (os->bitbuf << num_bits) | bits; + + if (os->bitcount > 16) { + os->bitcount -= 16; + if (os->end - os->next_byte >= 2) { + *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf >> os->bitcount); + os->next_bits = os->next_bits2; + os->next_bits2 = os->next_byte; + os->next_byte += 2; } } - bitstream_put_bits(ostream, match.offset ^ (1U << offset_bsr), offset_bsr); } -static void -xpress_write_matches_and_literals(struct output_bitstream *ostream, - const struct xpress_match matches[restrict], - input_idx_t num_matches, - const u16 codewords[restrict], - const u8 lens[restrict]) -{ - for (input_idx_t i = 0; i < num_matches; i++) { - if (matches[i].offset) { - /* Real match */ - xpress_write_match(matches[i], ostream, codewords, lens); - } else { - /* Literal byte */ - u8 lit = matches[i].adjusted_len; - bitstream_put_bits(ostream, codewords[lit], lens[lit]); +/* + * Interweave a literal byte into the output bitstream. + */ +static inline void +xpress_write_byte(struct xpress_output_bitstream *os, u8 byte) +{ + if (os->next_byte < os->end) + *os->next_byte++ = byte; +} + +/* + * Flush the last coding unit to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. + */ +static u32 +xpress_flush_output(struct xpress_output_bitstream *os) +{ + if (unlikely(os->end - os->next_byte < 2)) + return 0; + + *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); + *(le16 *)os->next_bits2 = cpu_to_le16(0); + + return os->next_byte - os->start; +} + +/* Output a match or literal. */ +static inline void +xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os, + const u32 codewords[], const u8 lens[]) +{ + u64 data = item.data; + unsigned symbol; + unsigned adjusted_len; + unsigned num_extra_bits; + unsigned extra_bits; + + symbol = data & 0x1FF; + + xpress_write_bits(os, codewords[symbol], lens[symbol]); + + if (symbol < XPRESS_NUM_CHARS) /* Literal? */ + return; + + adjusted_len = (data >> 9) & 0xFFFF; + + /* If length >= 18, one extra length byte. + * If length >= 273, three (total) extra length bytes. */ + if (adjusted_len >= 0xf) { + u8 byte1 = min(adjusted_len - 0xf, 0xff); + xpress_write_byte(os, byte1); + if (byte1 == 0xff) { + xpress_write_byte(os, adjusted_len & 0xff); + xpress_write_byte(os, adjusted_len >> 8); } } - bitstream_put_bits(ostream, codewords[XPRESS_END_OF_DATA], lens[XPRESS_END_OF_DATA]); -} -struct xpress_record_ctx { - freq_t freqs[XPRESS_NUM_SYMBOLS]; - struct xpress_match *matches; -}; + num_extra_bits = (data >> 25) & 0xF; + extra_bits = data >> 29; + + xpress_write_bits(os, extra_bits, num_extra_bits); +} +/* Output a sequence of XPRESS matches and literals. */ static void -xpress_record_literal(u8 lit, void *_ctx) +xpress_write_items(struct xpress_output_bitstream *os, + const struct xpress_item items[], u32 num_items, + const u32 codewords[], const u8 lens[]) { - struct xpress_record_ctx *ctx = _ctx; - ctx->freqs[lit]++; - *(ctx->matches++) = (struct xpress_match) { .offset = 0, .adjusted_len = lit }; + for (u32 i = 0; i < num_items; i++) + xpress_write_item(items[i], os, codewords, lens); + + /* End-of-data symbol (required for MS compatibility) */ + xpress_write_bits(os, codewords[XPRESS_END_OF_DATA], lens[XPRESS_END_OF_DATA]); } +/* Make the Huffman code for XPRESS. + * + * Takes as input c->freqs and produces as output c->lens and c->codewords. */ static void -xpress_record_match(unsigned len, unsigned offset, void *_ctx) +xpress_make_huffman_code(struct xpress_compressor *c) { - struct xpress_record_ctx *ctx = _ctx; + make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN, + c->freqs, c->lens, c->codewords); +} - XPRESS_ASSERT(len >= XPRESS_MIN_MATCH_LEN); - XPRESS_ASSERT(len <= XPRESS_MAX_MATCH_LEN); - XPRESS_ASSERT(offset >= XPRESS_MIN_OFFSET); - XPRESS_ASSERT(offset <= XPRESS_MAX_OFFSET); +/* Tally, and optionally record, the specified literal byte. */ +static inline void +xpress_declare_literal(struct xpress_compressor *c, unsigned literal, + struct xpress_item **next_chosen_item) +{ + c->freqs[literal]++; + if (next_chosen_item) { + *(*next_chosen_item)++ = (struct xpress_item) { + .data = literal, + }; + } +} + +/* Tally, and optionally record, the specified match. */ +static inline void +xpress_declare_match(struct xpress_compressor *c, + unsigned len, unsigned offset, + struct xpress_item **next_chosen_item) +{ unsigned adjusted_len = len - XPRESS_MIN_MATCH_LEN; unsigned len_hdr = min(adjusted_len, 0xf); - unsigned sym = XPRESS_NUM_CHARS + ((bsr32(offset) << 4) | len_hdr); + unsigned offset_bsr = bsr32(offset); + unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); - XPRESS_ASSERT(sym >= XPRESS_NUM_CHARS); - XPRESS_ASSERT(sym < XPRESS_NUM_SYMBOLS); + c->freqs[sym]++; - ctx->freqs[sym]++; - *(ctx->matches++) = (struct xpress_match) { .offset = offset, - .adjusted_len = adjusted_len }; + if (next_chosen_item) { + *(*next_chosen_item)++ = (struct xpress_item) { + .data = (u64)sym | + ((u64)adjusted_len << 9) | + ((u64)offset_bsr << 25) | + ((u64)(offset ^ (1U << offset_bsr)) << 29), + }; + } } -static const struct lz_params xpress_lz_params = { - .min_match = XPRESS_MIN_MATCH_LEN, - .max_match = XPRESS_MAX_MATCH_LEN, - .max_offset = XPRESS_MAX_OFFSET, - .good_match = 16, - .nice_match = 32, - .max_chain_len = 16, - .max_lazy_match = 16, - .too_far = 4096, -}; +/* Tally, and optionally record, the specified match or literal. */ +static inline void +xpress_declare_item(struct xpress_compressor *c, u32 mc_item_data, + struct xpress_item **next_chosen_item) +{ + unsigned len = mc_item_data & MC_LEN_MASK; + unsigned offset_data = mc_item_data >> MC_OFFSET_SHIFT; -/* API function documented in wimlib.h */ -WIMLIBAPI unsigned -wimlib_xpress_compress(const void * restrict uncompressed_data, - unsigned uncompressed_len, - void * restrict compressed_data) + if (len == 1) + xpress_declare_literal(c, offset_data, next_chosen_item); + else + xpress_declare_match(c, len, offset_data, next_chosen_item); +} + +static inline void +xpress_record_item_list(struct xpress_compressor *c, + struct xpress_mc_pos_data *cur_optimum_ptr, + struct xpress_item **next_chosen_item) { - u8 *cptr = compressed_data; - struct output_bitstream ostream; + struct xpress_mc_pos_data *end_optimum_ptr; + u32 saved_item; + u32 item; - struct xpress_record_ctx record_ctx; + /* The list is currently in reverse order (last item to first item). + * Reverse it. */ + end_optimum_ptr = cur_optimum_ptr; + saved_item = cur_optimum_ptr->mc_item_data; + do { + item = saved_item; + cur_optimum_ptr -= item & MC_LEN_MASK; + saved_item = cur_optimum_ptr->mc_item_data; + cur_optimum_ptr->mc_item_data = item; + } while (cur_optimum_ptr != c->optimum); - struct xpress_match *matches; - input_idx_t *prev_tab; - u8 *udata; + /* Walk the list of items from beginning to end, tallying and recording + * each item. */ + do { + xpress_declare_item(c, cur_optimum_ptr->mc_item_data, next_chosen_item); + cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK; + } while (cur_optimum_ptr != end_optimum_ptr); +} - u16 codewords[XPRESS_NUM_SYMBOLS]; - u8 lens[XPRESS_NUM_SYMBOLS]; - input_idx_t num_matches; - input_idx_t compressed_len; - input_idx_t i; - const size_t stack_max = 65536; +static inline void +xpress_tally_item_list(struct xpress_compressor *c, + struct xpress_mc_pos_data *cur_optimum_ptr) +{ + /* Since we're just tallying the items, we don't need to reverse the + * list. Processing the items in reverse order is fine. */ + do { + xpress_declare_item(c, cur_optimum_ptr->mc_item_data, NULL); + cur_optimum_ptr -= (cur_optimum_ptr->mc_item_data & MC_LEN_MASK); + } while (cur_optimum_ptr != c->optimum); +} - /* XPRESS requires 256 bytes of overhead for the Huffman code, so it's - * impossible to compress 256 bytes or less of data to less than the - * input size. - * - * +1 to take into account that the buffer for compressed data is 1 byte - * smaller than the buffer for uncompressed data. +/* Tally, and optionally (if next_chosen_item != NULL) record, in order, all + * items in the current list of items found by the match-chooser. */ +static void +xpress_declare_item_list(struct xpress_compressor *c, + struct xpress_mc_pos_data *cur_optimum_ptr, + struct xpress_item **next_chosen_item) +{ + if (next_chosen_item) + xpress_record_item_list(c, cur_optimum_ptr, next_chosen_item); + else + xpress_tally_item_list(c, cur_optimum_ptr); +} + +static unsigned +xpress_get_matches_fillcache(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + struct lz_match *cache_ptr; + struct lz_match *matches; + unsigned num_matches; + + cache_ptr = c->cache_ptr; + matches = cache_ptr + 1; + if (likely(cache_ptr <= c->cache_limit)) { + num_matches = lz_mf_get_matches(c->mf, matches); + cache_ptr->len = num_matches; + c->cache_ptr = matches + num_matches; + } else { + num_matches = 0; + } + *matches_ret = matches; + return num_matches; +} + +static unsigned +xpress_get_matches_usecache(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + struct lz_match *cache_ptr; + struct lz_match *matches; + unsigned num_matches; + + cache_ptr = c->cache_ptr; + matches = cache_ptr + 1; + if (cache_ptr <= c->cache_limit) { + num_matches = cache_ptr->len; + c->cache_ptr = matches + num_matches; + } else { + num_matches = 0; + } + *matches_ret = matches; + return num_matches; +} + +static unsigned +xpress_get_matches_usecache_nocheck(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + struct lz_match *cache_ptr; + struct lz_match *matches; + unsigned num_matches; + + cache_ptr = c->cache_ptr; + matches = cache_ptr + 1; + num_matches = cache_ptr->len; + c->cache_ptr = matches + num_matches; + *matches_ret = matches; + return num_matches; +} + +static unsigned +xpress_get_matches_noncaching(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + *matches_ret = c->cached_matches; + return lz_mf_get_matches(c->mf, c->cached_matches); +} + +/* + * Find matches at the next position in the window. + * + * This uses a wrapper function around the underlying match-finder. + * + * Returns the number of matches found and sets *matches_ret to point to the + * matches array. The matches will be sorted by strictly increasing length and + * offset. + */ +static inline unsigned +xpress_get_matches(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + return (*c->get_matches_func)(c, matches_ret); +} + +static void +xpress_skip_bytes_fillcache(struct xpress_compressor *c, unsigned n) +{ + struct lz_match *cache_ptr; + + cache_ptr = c->cache_ptr; + lz_mf_skip_positions(c->mf, n); + if (cache_ptr <= c->cache_limit) { + do { + cache_ptr->len = 0; + cache_ptr += 1; + } while (--n && likely(cache_ptr <= c->cache_limit)); + } + c->cache_ptr = cache_ptr; +} + +static void +xpress_skip_bytes_usecache(struct xpress_compressor *c, unsigned n) +{ + struct lz_match *cache_ptr; + + cache_ptr = c->cache_ptr; + if (likely(cache_ptr <= c->cache_limit)) { + do { + cache_ptr += 1 + cache_ptr->len; + } while (--n && likely(cache_ptr <= c->cache_limit)); + } + c->cache_ptr = cache_ptr; +} + +static void +xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, unsigned n) +{ + struct lz_match *cache_ptr; + + cache_ptr = c->cache_ptr; + do { + cache_ptr += 1 + cache_ptr->len; + } while (--n); + c->cache_ptr = cache_ptr; +} + +static void +xpress_skip_bytes_noncaching(struct xpress_compressor *c, unsigned n) +{ + lz_mf_skip_positions(c->mf, n); +} + +/* + * Skip the specified number of positions in the window (don't search for + * matches at them). + * + * This uses a wrapper function around the underlying match-finder. + */ +static inline void +xpress_skip_bytes(struct xpress_compressor *c, unsigned n) +{ + return (*c->skip_bytes_func)(c, n); +} + +/* Set default XPRESS Huffman symbol costs to bootstrap the iterative + * optimization algorithm. */ +static void +xpress_set_default_costs(u8 costs[]) +{ + unsigned i; + + /* Literal symbols */ + for (i = 0; i < XPRESS_NUM_CHARS; i++) + costs[i] = 8; + + /* Match symbols */ + for (; i < XPRESS_NUM_SYMBOLS; i++) + costs[i] = 10; +} + +/* Copy the Huffman codeword lengths array @lens to the Huffman symbol costs + * array @costs, but also assign a default cost to each 0-length (unused) + * codeword. */ +static void +xpress_set_costs(u8 costs[], const u8 lens[]) +{ + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++) + costs[i] = lens[i] ? lens[i] : XPRESS_MAX_CODEWORD_LEN; +} + +/* + * Consider coding each match in @matches. + * + * @matches must be sorted by strictly increasing length and strictly + * increasing offset. This is guaranteed by the match-finder. + * + * We consider each length from the minimum (2) to the longest + * (matches[num_matches - 1].len). For each length, we consider only + * the smallest offset for which that length is available. Although + * this is not guaranteed to be optimal due to the possibility of a + * larger offset costing less than a smaller offset to code, this is a + * very useful heuristic. + */ +static inline void +xpress_consider_matches(struct xpress_compressor *c, + struct xpress_mc_pos_data *cur_optimum_ptr, + const struct lz_match matches[], + unsigned num_matches) +{ + unsigned i = 0; + unsigned len = XPRESS_MIN_MATCH_LEN; + u32 cost; + u32 position_cost; + unsigned offset; + unsigned offset_bsr; + unsigned adjusted_len; + unsigned len_hdr; + unsigned sym; + + if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) { + /* All lengths are small. Optimize accordingly. */ + do { + offset = matches[i].offset; + offset_bsr = bsr32(offset); + len_hdr = len - XPRESS_MIN_MATCH_LEN; + sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + + position_cost = cur_optimum_ptr->cost + offset_bsr; + do { + cost = position_cost + c->costs[sym]; + if (cost < (cur_optimum_ptr + len)->cost) { + (cur_optimum_ptr + len)->cost = cost; + (cur_optimum_ptr + len)->mc_item_data = + (offset << MC_OFFSET_SHIFT) | len; + } + sym++; + } while (++len <= matches[i].len); + } while (++i != num_matches); + } else { + /* Some lengths are big. */ + do { + offset = matches[i].offset; + offset_bsr = bsr32(offset); + position_cost = cur_optimum_ptr->cost + offset_bsr; + do { + adjusted_len = len - XPRESS_MIN_MATCH_LEN; + len_hdr = min(adjusted_len, 0xf); + sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + + cost = position_cost + c->costs[sym]; + if (adjusted_len >= 0xf) { + cost += 8; + if (adjusted_len - 0xf >= 0xff) + cost += 16; + } + + if (cost < (cur_optimum_ptr + len)->cost) { + (cur_optimum_ptr + len)->cost = cost; + (cur_optimum_ptr + len)->mc_item_data = + (offset << MC_OFFSET_SHIFT) | len; + } + } while (++len <= matches[i].len); + } while (++i != num_matches); + } +} + +/* + * The main near-optimal parsing routine. + * + * Briefly, the algorithm does an approximate minimum-cost path search to find a + * "near-optimal" sequence of matches and literals to output, based on the + * current cost model. The algorithm steps forward, position by position (byte + * by byte), and updates the minimum cost path to reach each later position that + * can be reached using a match or literal from the current position. This is + * essentially Dijkstra's algorithm in disguise: the graph nodes are positions, + * the graph edges are possible matches/literals to code, and the cost of each + * edge is the estimated number of bits that will be required to output the + * corresponding match or literal. But one difference is that we actually + * compute the lowest-cost path in pieces, where each piece is terminated when + * there are no choices to be made. + * + * If next_chosen_item != NULL, then all items chosen will be recorded (saved in + * the chosen_items array). Otherwise, all items chosen will only be tallied + * (symbol frequencies tallied in c->freqs). + */ +static void +xpress_optim_pass(struct xpress_compressor *c, + struct xpress_item **next_chosen_item) +{ + const u8 *window_end; + const u8 *window_ptr; + struct xpress_mc_pos_data *cur_optimum_ptr; + struct xpress_mc_pos_data *end_optimum_ptr; + const struct lz_match *matches; + unsigned num_matches; + unsigned longest_len; + unsigned literal; + u32 cost; + + window_ptr = c->cur_window; + window_end = &c->cur_window[c->cur_window_size]; + +begin: + /* Start building a new list of items, which will correspond to the next + * piece of the overall minimum-cost path. */ + + if (window_ptr == window_end) + return; + + cur_optimum_ptr = c->optimum; + cur_optimum_ptr->cost = 0; + end_optimum_ptr = cur_optimum_ptr; + + /* The following loop runs once for each per byte in the window, except + * in a couple shortcut cases. */ + for (;;) { + + /* Find matches with the current position. */ + num_matches = xpress_get_matches(c, &matches); + + if (num_matches) { + + longest_len = matches[num_matches - 1].len; + + /* If there's a very long match, choose it immediately. + */ + if (longest_len >= c->params.nice_match_length) { + + xpress_skip_bytes(c, longest_len - 1); + window_ptr += longest_len; + + if (cur_optimum_ptr != c->optimum) + xpress_declare_item_list(c, cur_optimum_ptr, + next_chosen_item); + + xpress_declare_match(c, longest_len, + matches[num_matches - 1].offset, + next_chosen_item); + goto begin; + } + + /* If reaching any positions for the first time, + * initialize their costs to "infinity". */ + while (end_optimum_ptr < cur_optimum_ptr + longest_len) + (++end_optimum_ptr)->cost = MC_INFINITE_COST; + + /* Consider coding a match. */ + xpress_consider_matches(c, cur_optimum_ptr, + matches, num_matches); + } else { + /* No matches found. The only choice at this position + * is to code a literal. */ + + if (end_optimum_ptr == cur_optimum_ptr) { + #if 1 + /* Optimization for single literals. */ + if (likely(cur_optimum_ptr == c->optimum)) { + xpress_declare_literal(c, *window_ptr++, + next_chosen_item); + if (window_ptr == window_end) + return; + continue; + } + #endif + (++end_optimum_ptr)->cost = MC_INFINITE_COST; + } + } + + /* Consider coding a literal. */ + literal = *window_ptr++; + cost = cur_optimum_ptr->cost + c->costs[literal]; + if (cost < (cur_optimum_ptr + 1)->cost) { + (cur_optimum_ptr + 1)->cost = cost; + (cur_optimum_ptr + 1)->mc_item_data = + ((u32)literal << MC_OFFSET_SHIFT) | 1; + } + + /* Advance to the next position. */ + cur_optimum_ptr++; + + /* + * This loop will terminate when either of the following + * conditions is true: + * + * (1) cur_optimum_ptr == end_optimum_ptr + * + * There are no paths that extend beyond the current + * position. In this case, any path to a later position + * must pass through the current position, so we can go + * ahead and choose the list of items that led to this + * position. + * + * (2) cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH] + * + * This bounds the number of times the algorithm can step + * forward before it is guaranteed to start choosing items. + * This limits the memory usage. But + * XPRESS_OPTIM_ARRAY_LENGTH is high enough that on most + * inputs this limit is never reached. + * + * Note: no check for end-of-block is needed because + * end-of-block will trigger condition (1). + */ + if (cur_optimum_ptr == end_optimum_ptr || + cur_optimum_ptr == &c->optimum[XPRESS_OPTIM_ARRAY_LENGTH]) + break; + } + + /* Choose the current list of items that constitute the minimum-cost + * path to the current position. */ + xpress_declare_item_list(c, cur_optimum_ptr, next_chosen_item); + goto begin; +} + +/* Near-optimal parsing */ +static u32 +xpress_choose_near_optimal_items(struct xpress_compressor *c) +{ + u32 num_passes_remaining = c->params.num_optim_passes; + struct xpress_item *next_chosen_item; + struct xpress_item **next_chosen_item_ptr; + + /* Choose appropriate match-finder wrapper functions. */ + if (c->params.num_optim_passes > 1) { + c->get_matches_func = xpress_get_matches_fillcache; + c->skip_bytes_func = xpress_skip_bytes_fillcache; + } else { + c->get_matches_func = xpress_get_matches_noncaching; + c->skip_bytes_func = xpress_skip_bytes_noncaching; + } + + /* The first optimization pass will use a default cost model. Each + * additional optimization pass will use a cost model computed from the + * previous pass. * - * +4 to take into account that init_output_bitstream() requires at - * least 4 bytes of data. */ - if (uncompressed_len < XPRESS_NUM_SYMBOLS / 2 + 1 + 4) + * To improve performance, we only generate the array containing the + * matches and literals in intermediate form on the final pass. For + * earlier passes, tallying symbol frequencies is sufficient. */ + xpress_set_default_costs(c->costs); + + next_chosen_item_ptr = NULL; + do { + /* Reset the match-finder wrapper. */ + c->cache_ptr = c->cached_matches; + + if (num_passes_remaining == 1) { + /* Last pass: actually generate the items. */ + next_chosen_item = c->chosen_items; + next_chosen_item_ptr = &next_chosen_item; + } + + /* Choose the items. */ + xpress_optim_pass(c, next_chosen_item_ptr); + + if (num_passes_remaining > 1) { + /* This isn't the last pass. */ + + /* Make the Huffman code from the symbol frequencies. */ + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); + + /* Reset symbol frequencies. */ + memset(c->freqs, 0, sizeof(c->freqs)); + + /* Update symbol costs. */ + xpress_set_costs(c->costs, c->lens); + + /* Choose appopriate match-finder wrapper functions. */ + if (c->cache_ptr <= c->cache_limit) { + c->get_matches_func = xpress_get_matches_usecache_nocheck; + c->skip_bytes_func = xpress_skip_bytes_usecache_nocheck; + } else { + c->get_matches_func = xpress_get_matches_usecache; + c->skip_bytes_func = xpress_skip_bytes_usecache; + } + } + } while (--num_passes_remaining); + + /* Return the number of items chosen. */ + return next_chosen_item - c->chosen_items; +} + +/* Lazy parsing */ +static u32 +xpress_choose_lazy_items(struct xpress_compressor *c) +{ + const u8 *window_ptr = c->cur_window; + const u8 *window_end = &c->cur_window[c->cur_window_size]; + struct xpress_item *next_chosen_item = c->chosen_items; + u32 len_3_too_far; + struct lz_mf *mf = c->mf; + struct lz_match *matches = c->cached_matches; + unsigned num_matches; + struct lz_match prev_match; + + if (c->cur_window_size <= 8192) + len_3_too_far = 2048; + else + len_3_too_far = 4096; + + do { + /* Don't have match at previous position */ + + num_matches = lz_mf_get_matches(mf, matches); + window_ptr++; + + if (num_matches == 0 || + (matches[num_matches - 1].len == 3 && + matches[num_matches - 1].offset >= len_3_too_far)) + { + /* No matches found => output literal */ + xpress_declare_literal(c, *(window_ptr - 1), + &next_chosen_item); + continue; + } + + prev_match = matches[num_matches - 1]; + + have_prev_match: + /* Have match at previous position */ + + if (prev_match.len >= c->params.nice_match_length) { + /* Very long match found => output immediately */ + xpress_declare_match(c, prev_match.len, + prev_match.offset, + &next_chosen_item); + lz_mf_skip_positions(mf, prev_match.len - 1); + window_ptr += prev_match.len - 1; + continue; + } + + num_matches = lz_mf_get_matches(mf, matches); + window_ptr++; + + if (num_matches == 0 || + (matches[num_matches - 1].len <= prev_match.len)) + { + /* Next match is not longer => output previous match */ + xpress_declare_match(c, prev_match.len, + prev_match.offset, + &next_chosen_item); + lz_mf_skip_positions(mf, prev_match.len - 2); + window_ptr += prev_match.len - 2; + continue; + } + + /* Next match is longer => output literal */ + + xpress_declare_literal(c, *(window_ptr - 2), &next_chosen_item); + + prev_match = matches[num_matches - 1]; + + goto have_prev_match; + + } while (window_ptr != window_end); + + return next_chosen_item - c->chosen_items; +} + +/* Greedy parsing */ +static u32 +xpress_choose_greedy_items(struct xpress_compressor *c) +{ + const u8 *window_ptr = c->cur_window; + const u8 *window_end = &c->cur_window[c->cur_window_size]; + struct xpress_item *next_chosen_item = c->chosen_items; + u32 len_3_too_far; + struct lz_mf *mf = c->mf; + struct lz_match *matches = c->cached_matches; + unsigned num_matches; + + if (c->cur_window_size <= 8192) + len_3_too_far = 2048; + else + len_3_too_far = 4096; + + do { + /* Get longest match at the current position. */ + num_matches = lz_mf_get_matches(mf, matches); + + if (num_matches == 0 || + (matches[num_matches - 1].len == 3 && + matches[num_matches - 1].offset >= len_3_too_far)) + { + /* No match, or length 3 match with large offset. + * Choose a literal. */ + xpress_declare_literal(c, *window_ptr, &next_chosen_item); + window_ptr += 1; + } else { + /* Match found. Choose it. */ + unsigned len = matches[num_matches - 1].len; + unsigned offset = matches[num_matches - 1].offset; + + xpress_declare_match(c, len, offset, &next_chosen_item); + lz_mf_skip_positions(mf, len - 1); + window_ptr += len; + } + } while (window_ptr != window_end); + + return next_chosen_item - c->chosen_items; +} + +/* Literals-only parsing */ +static u32 +xpress_choose_literals(struct xpress_compressor *c) +{ + const u8 *window_ptr = c->cur_window; + const u8 *window_end = &c->cur_window[c->cur_window_size]; + struct xpress_item *next_chosen_item = c->chosen_items; + + do { + xpress_declare_literal(c, *window_ptr++, &next_chosen_item); + } while (window_ptr != window_end); + + return next_chosen_item - c->chosen_items; +} + +/* + * 'choose_items_func' is provided a data buffer c->cur_window of length + * c->cur_window_size bytes. This data buffer will have already been loaded + * into the match-finder c->mf. 'choose_items_func' must choose the + * match/literal sequence to output to represent this data buffer. The + * intermediate representation of this match/literal sequence must be recorded + * in c->chosen_items, and the Huffman symbols used must be tallied in c->freqs. + * The return value must be the number of items written to c->chosen_items. + */ +static u32 +xpress_choose_items(struct xpress_compressor *c) +{ + return (*c->params.choose_items_func)(c); +} + +/* Set internal compression parameters for the specified compression level and + * maximum window size. */ +static void +xpress_build_params(unsigned int compression_level, u32 max_window_size, + struct xpress_compressor_params *xpress_params) +{ + memset(xpress_params, 0, sizeof(*xpress_params)); + xpress_params->num_optim_passes = 1; + + if (compression_level == 1) { + + /* Literal-only parsing */ + xpress_params->choose_items_func = xpress_choose_literals; + xpress_params->mf_algo = LZ_MF_NULL; + + } else if (compression_level < 30) { + + /* Greedy parsing */ + xpress_params->choose_items_func = xpress_choose_greedy_items; + xpress_params->mf_algo = LZ_MF_HASH_CHAINS; + xpress_params->nice_match_length = compression_level; + xpress_params->max_search_depth = compression_level / 2; + + } else if (compression_level < 60) { + + /* Lazy parsing */ + xpress_params->choose_items_func = xpress_choose_lazy_items; + xpress_params->mf_algo = LZ_MF_HASH_CHAINS; + xpress_params->nice_match_length = compression_level; + xpress_params->max_search_depth = compression_level / 2; + + } else { + + /* Near-optimal parsing */ + xpress_params->choose_items_func = xpress_choose_near_optimal_items; + if (max_window_size >= 16384) + xpress_params->mf_algo = LZ_MF_BINARY_TREES; + else + xpress_params->mf_algo = LZ_MF_HASH_CHAINS; + xpress_params->num_optim_passes = compression_level / 40; + xpress_params->nice_match_length = min(compression_level / 2, + XPRESS_MAX_MATCH_LEN); + xpress_params->max_search_depth = min(compression_level, + XPRESS_MAX_MATCH_LEN); + } +} + +/* Given the internal compression parameters and maximum window size, build the + * Lempel-Ziv match-finder parameters. */ +static void +xpress_build_mf_params(const struct xpress_compressor_params *xpress_params, + u32 max_window_size, struct lz_mf_params *mf_params) +{ + memset(mf_params, 0, sizeof(*mf_params)); + + mf_params->algorithm = xpress_params->mf_algo; + mf_params->max_window_size = max_window_size; + mf_params->min_match_len = XPRESS_MIN_MATCH_LEN; + mf_params->max_match_len = XPRESS_MAX_MATCH_LEN; + mf_params->max_search_depth = xpress_params->max_search_depth; + mf_params->nice_match_len = xpress_params->nice_match_length; +} + +static void +xpress_free_compressor(void *_c); + +static u64 +xpress_get_needed_memory(size_t max_window_size, unsigned int compression_level) +{ + u64 size = 0; + struct xpress_compressor_params params; + + if (max_window_size > XPRESS_MAX_OFFSET + 1) return 0; - if (uncompressed_len <= stack_max) { - matches = alloca(uncompressed_len * sizeof(matches[0])); - udata = alloca(uncompressed_len + 8); - prev_tab = alloca(uncompressed_len * sizeof(prev_tab[0])); + xpress_build_params(compression_level, max_window_size, ¶ms); + + size += sizeof(struct xpress_compressor); + + /* mf */ + size += lz_mf_get_needed_memory(params.mf_algo, max_window_size); + + /* optimum */ + if (params.choose_items_func == xpress_choose_near_optimal_items) { + size += (XPRESS_OPTIM_ARRAY_LENGTH + params.nice_match_length) * + sizeof(struct xpress_mc_pos_data); + } + + /* cached_matches */ + if (params.num_optim_passes > 1) { + size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS, + params.max_search_depth + 1); + size += cache_len * sizeof(struct lz_match); } else { - matches = MALLOC(uncompressed_len * sizeof(matches[0])); - udata = MALLOC(uncompressed_len + 8); - prev_tab = MALLOC(uncompressed_len * sizeof(prev_tab[0])); - if (matches == NULL || udata == NULL || prev_tab == NULL) { - WARNING("Failed to allocate memory for compression..."); - compressed_len = 0; - goto out_free; - } + size += params.max_search_depth * sizeof(struct lz_match); } - /* Copy the data to a temporary buffer, but only to avoid - * inconsequential accesses of uninitialized memory in - * lz_analyze_block(). */ - memcpy(udata, uncompressed_data, uncompressed_len); - memset(udata + uncompressed_len, 0, 8); - - /* Determine match/literal sequence to divide the data into. */ - ZERO_ARRAY(record_ctx.freqs); - record_ctx.matches = matches; - lz_analyze_block(udata, - uncompressed_len, - xpress_record_match, - xpress_record_literal, - &record_ctx, - &xpress_lz_params, - prev_tab); - - num_matches = (record_ctx.matches - matches); - - /* Account for end of data symbol. */ - record_ctx.freqs[XPRESS_END_OF_DATA]++; - - /* Build the Huffman code. */ - make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN, - record_ctx.freqs, lens, codewords); + /* chosen_items */ + size += max_window_size * sizeof(struct xpress_item); + + return size; +} + +static int +xpress_create_compressor(size_t max_window_size, unsigned int compression_level, + void **c_ret) +{ + struct xpress_compressor *c; + struct xpress_compressor_params params; + struct lz_mf_params mf_params; + + if (max_window_size > XPRESS_MAX_OFFSET + 1) + return WIMLIB_ERR_INVALID_PARAM; + + xpress_build_params(compression_level, max_window_size, ¶ms); + xpress_build_mf_params(¶ms, max_window_size, &mf_params); + + c = CALLOC(1, sizeof(struct xpress_compressor)); + if (!c) + goto oom; + + c->params = params; + + c->mf = lz_mf_alloc(&mf_params); + if (!c->mf) + goto oom; + + if (params.choose_items_func == xpress_choose_near_optimal_items) { + c->optimum = MALLOC((XPRESS_OPTIM_ARRAY_LENGTH + + params.nice_match_length) * + sizeof(struct xpress_mc_pos_data)); + if (!c->optimum) + goto oom; + } + + if (params.num_optim_passes > 1) { + size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS, + params.max_search_depth + 1); + c->cached_matches = MALLOC(cache_len * sizeof(struct lz_match)); + if (!c->cached_matches) + goto oom; + c->cache_limit = c->cached_matches + cache_len - + (params.max_search_depth + 1); + } else { + c->cached_matches = MALLOC(params.max_search_depth * + sizeof(struct lz_match)); + if (!c->cached_matches) + goto oom; + } + + c->chosen_items = MALLOC(max_window_size * sizeof(struct xpress_item)); + if (!c->chosen_items) + goto oom; + + *c_ret = c; + return 0; + +oom: + xpress_free_compressor(c); + return WIMLIB_ERR_NOMEM; +} + +static size_t +xpress_compress(const void *uncompressed_data, size_t uncompressed_size, + void *compressed_data, size_t compressed_size_avail, void *_c) +{ + struct xpress_compressor *c = _c; + u32 num_chosen_items; + u8 *cptr; + struct xpress_output_bitstream os; + u32 compressed_size; + + /* XPRESS requires 256 bytes of overhead for the Huffman code, so it's + * impossible to compress 256 bytes or less of data to less than the + * input size. */ + if (compressed_size_avail < XPRESS_NUM_SYMBOLS / 2 + 50) + return 0; + + /* Determine match/literal sequence. */ + c->cur_window = uncompressed_data; + c->cur_window_size = uncompressed_size; + lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size); + memset(c->freqs, 0, sizeof(c->freqs)); + num_chosen_items = xpress_choose_items(c); + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); /* Output the Huffman code as a series of 512 4-bit lengths. */ - for (i = 0; i < XPRESS_NUM_SYMBOLS; i += 2) - *cptr++ = (lens[i] & 0xf) | (lens[i + 1] << 4); + cptr = compressed_data; + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i += 2) + *cptr++ = (c->lens[i] & 0xf) | (c->lens[i + 1] << 4); /* Output the encoded matches/literals. */ - init_output_bitstream(&ostream, cptr, - uncompressed_len - XPRESS_NUM_SYMBOLS / 2 - 1); - xpress_write_matches_and_literals(&ostream, matches, - num_matches, codewords, lens); + xpress_init_output(&os, cptr, + compressed_size_avail - XPRESS_NUM_SYMBOLS / 2); + xpress_write_items(&os, c->chosen_items, num_chosen_items, + c->codewords, c->lens); /* Flush any pending data and get the length of the compressed data. */ - compressed_len = flush_output_bitstream(&ostream); - if (compressed_len == ~(input_idx_t)0) { - compressed_len = 0; - goto out_free; - } - compressed_len += XPRESS_NUM_SYMBOLS / 2; - -#if defined(ENABLE_XPRESS_DEBUG) || defined(ENABLE_VERIFY_COMPRESSION) - /* Verify that we really get the same thing back when decompressing. */ - if (wimlib_xpress_decompress(compressed_data, compressed_len, - udata, uncompressed_len)) - { - ERROR("Failed to decompress data we " - "compressed using XPRESS algorithm"); - wimlib_assert(0); - compressed_len = 0; - goto out_free; - } + compressed_size = xpress_flush_output(&os); + if (compressed_size == 0) + return 0; - if (memcmp(uncompressed_data, udata, uncompressed_len)) { - ERROR("Data we compressed using XPRESS algorithm " - "didn't decompress to original"); - wimlib_assert(0); - compressed_len = 0; - goto out_free; - } -#endif + /* Return the length of the compressed data. */ + return compressed_size + XPRESS_NUM_SYMBOLS / 2; +} -out_free: - if (uncompressed_len > stack_max) { - FREE(matches); - FREE(udata); - FREE(prev_tab); +static void +xpress_free_compressor(void *_c) +{ + struct xpress_compressor *c = _c; + + if (c) { + lz_mf_free(c->mf); + FREE(c->cached_matches); + FREE(c->optimum); + FREE(c->chosen_items); + FREE(c); } - return compressed_len; } + +const struct compressor_ops xpress_compressor_ops = { + .get_needed_memory = xpress_get_needed_memory, + .create_compressor = xpress_create_compressor, + .compress = xpress_compress, + .free_compressor = xpress_free_compressor, +};