X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fxpress-compress.c;h=e1884978377f0ac6d1555282600e775aa8141ab2;hp=21118e7652b2e17d2287c085cc7e571c4bf88c3c;hb=4dae2eef895b95d1c2bc1bf5fc17413af8cc7952;hpb=fc8276d0a3efb3df5f7512b3fa9499eb1b3449eb diff --git a/src/xpress-compress.c b/src/xpress-compress.c index 21118e76..e1884978 100644 --- a/src/xpress-compress.c +++ b/src/xpress-compress.c @@ -1,13 +1,12 @@ /* * xpress-compress.c * - * XPRESS compression routines. - * - * See the comments in xpress-decompress.c about the XPRESS format. + * A compressor that produces output compatible with the XPRESS (Huffman + * version) compression format. */ /* - * Copyright (C) 2012, 2013 Eric Biggers + * Copyright (C) 2012, 2013, 2014 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -25,238 +24,1207 @@ * along with wimlib; if not, see http://www.gnu.org/licenses/. */ -#include "xpress.h" -#include "wimlib.h" -#include "compress.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "wimlib/compressor_ops.h" +#include "wimlib/compress_common.h" +#include "wimlib/endianness.h" +#include "wimlib/error.h" +#include "wimlib/lz_mf.h" +#include "wimlib/util.h" +#include "wimlib/xpress.h" -#include #include +#define XPRESS_CACHE_PER_POS 8 +#define XPRESS_OPTIM_ARRAY_LENGTH 4096 + +struct xpress_compressor; +struct xpress_item; +struct xpress_mc_pos_data; + +struct xpress_compressor_params { + + /* Only used when choose_items_func == xpress_choose_items_near_optimal */ + u32 num_optim_passes; + + /* Given the data to compress (c->cur_window, c->cur_window_size), + * 'choose_items_func' fills in c->chosen_items with the intermediate + * representation of the match/literal sequence to output. Also fills + * in c->codewords and c->lens to provide the Huffman code with which + * these items should be output. + * + * Returns the number of items written to c->chosen_items. This can be + * at most c->cur_window_size. (The worst case is all literals, no + * matches.) */ + u32 (*choose_items_func)(struct xpress_compressor *c); + + /* Match-finding algorithm and parameters */ + enum lz_mf_algo mf_algo; + u32 max_search_depth; + u32 nice_match_length; +}; + +/* XPRESS compressor state. */ +struct xpress_compressor { + + /* Parameters determined based on the compression level. */ + struct xpress_compressor_params params; + + /* Lempel-Ziv match-finder */ + struct lz_mf *mf; + + /* Optimal parsing data */ + unsigned (*get_matches_func)(struct xpress_compressor *, + const struct lz_match **); + void (*skip_bytes_func)(struct xpress_compressor *, u32 n); + const u8 *cur_window_ptr; + struct lz_match *cached_matches; + struct lz_match *cache_ptr; + struct lz_match *cache_limit; + struct xpress_mc_pos_data *optimum; + unsigned optimum_cur_idx; + unsigned optimum_end_idx; + u8 costs[XPRESS_NUM_SYMBOLS]; + + /* The selected sequence of matches/literals */ + struct xpress_item *chosen_items; + + /* Data currently being compressed */ + const u8 *cur_window; + u32 cur_window_size; + + /* Symbol frequency counters */ + u32 freqs[XPRESS_NUM_SYMBOLS]; + + /* The current Huffman code */ + u32 codewords[XPRESS_NUM_SYMBOLS]; + u8 lens[XPRESS_NUM_SYMBOLS]; +}; + +/* Match-chooser position data. + * See corresponding declaration in lzx-compress.c for more information. */ +struct xpress_mc_pos_data { + u32 cost; +#define MC_INFINITE_COST ((u32)~0UL) + + union { + struct { + u32 link; + u32 match_offset; + } prev; + struct { + u32 link; + u32 match_offset; + } next; + }; +}; + +/* Intermediate XPRESS match/literal representation. */ +struct xpress_item { + u16 adjusted_len; /* Match length minus XPRESS_MIN_MATCH_LEN */ + u16 offset; /* Match offset */ + /* For literals, offset == 0 and adjusted_len is the literal byte. */ +}; + /* - * Writes @match, which is a match given in the intermediate representation for - * XPRESS matches, to the output stream @ostream. + * Structure to keep track of the current state of sending data to the + * compressed output buffer. * - * @codewords and @lens provide the Huffman code that is being used. + * The XPRESS bitstream is encoded as a sequence of little endian 16-bit coding + * units interwoven with literal bytes. */ -static int -xpress_write_match(struct output_bitstream *ostream, u32 match, - const u16 codewords[], const u8 lens[]) -{ - u32 adjusted_match_len = match & 0xffff; - u32 match_offset = match >> 16; - u32 len_hdr = min(adjusted_match_len, 0xf); - u32 offset_bsr = bsr32(match_offset); - u32 sym = len_hdr | (offset_bsr << 4) | XPRESS_NUM_CHARS; - int ret; - - ret = bitstream_put_bits(ostream, codewords[sym], lens[sym]); - if (ret != 0) - return ret; - - if (adjusted_match_len >= 0xf) { - u8 byte1 = min(adjusted_match_len - 0xf, 0xff); - ret = bitstream_put_byte(ostream, byte1); - if (ret != 0) - return ret; +struct xpress_output_bitstream { + + /* Bits that haven't yet been written to the output buffer. */ + u32 bitbuf; + + /* Number of bits currently held in @bitbuf. */ + u32 bitcount; + + /* Pointer to the start of the output buffer. */ + u8 *start; + + /* Pointer to the location in the ouput buffer at which to write the + * next 16 bits. */ + u8 *next_bits; + + /* Pointer to the location in the output buffer at which to write the + * next 16 bits, after @next_bits. */ + u8 *next_bits2; + + /* Pointer to the location in the output buffer at which to write the + * next literal byte. */ + u8 *next_byte; + + /* Pointer to the end of the output buffer. */ + u8 *end; +}; + +/* + * Initialize the output bitstream. + * + * @os + * The output bitstream structure to initialize. + * @buffer + * The buffer to write to. + * @size + * Size of @buffer, in bytes. Must be at least 4. + */ +static void +xpress_init_output(struct xpress_output_bitstream *os, void *buffer, u32 size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->start = buffer; + os->next_bits = os->start; + os->next_bits2 = os->start + 2; + os->next_byte = os->start + 4; + os->end = os->start + size; +} + +/* + * Write some bits to the output bitstream. + * + * The bits are given by the low-order @num_bits bits of @bits. Higher-order + * bits in @bits cannot be set. At most 16 bits can be written at once. + * + * If the output buffer space is exhausted, then the bits will be ignored, and + * xpress_flush_output() will return 0 when it gets called. + */ +static _always_inline_attribute void +xpress_write_bits(struct xpress_output_bitstream *os, + const u32 bits, const unsigned int num_bits) +{ + /* This code is optimized for XPRESS, which never needs to write more + * than 16 bits at once. */ + + os->bitcount += num_bits; + os->bitbuf = (os->bitbuf << num_bits) | bits; + + if (os->bitcount > 16) { + os->bitcount -= 16; + if (os->end - os->next_byte >= 2) { + *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf >> os->bitcount); + os->next_bits = os->next_bits2; + os->next_bits2 = os->next_byte; + os->next_byte += 2; + } + } +} + +/* + * Interweave a literal byte into the output bitstream. + */ +static _always_inline_attribute void +xpress_write_byte(struct xpress_output_bitstream *os, u8 byte) +{ + if (os->next_byte < os->end) + *os->next_byte++ = byte; +} + +/* + * Flush the last coding unit to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. + */ +static u32 +xpress_flush_output(struct xpress_output_bitstream *os) +{ + if (unlikely(os->end - os->next_byte < 2)) + return 0; + + *(le16 *)os->next_bits = cpu_to_le16(os->bitbuf << (16 - os->bitcount)); + *(le16 *)os->next_bits2 = cpu_to_le16(0); + + return os->next_byte - os->start; +} + +/* Output an XPRESS match. */ +static void +xpress_write_match(struct xpress_item match, struct xpress_output_bitstream *os, + const u32 codewords[], const u8 lens[]) +{ + unsigned len_hdr = min(match.adjusted_len, 0xf); + unsigned offset_bsr = bsr32(match.offset); + unsigned sym = XPRESS_NUM_CHARS + ((offset_bsr << 4) | len_hdr); + + /* Huffman symbol */ + xpress_write_bits(os, codewords[sym], lens[sym]); + + /* If length >= 18, one extra length byte. + * If length >= 273, three (total) extra length bytes. */ + if (match.adjusted_len >= 0xf) { + u8 byte1 = min(match.adjusted_len - 0xf, 0xff); + xpress_write_byte(os, byte1); if (byte1 == 0xff) { - ret = bitstream_put_two_bytes(ostream, adjusted_match_len); - if (ret != 0) - return ret; + xpress_write_byte(os, match.adjusted_len & 0xff); + xpress_write_byte(os, match.adjusted_len >> 8); } } - return bitstream_put_bits(ostream, - match_offset ^ (1 << offset_bsr), offset_bsr); + + /* Offset bits */ + xpress_write_bits(os, match.offset ^ (1U << offset_bsr), offset_bsr); } -static int -xpress_write_compressed_literals(struct output_bitstream *ostream, - const u32 match_tab[], - unsigned num_matches, - const u16 codewords[], - const u8 lens[]) -{ - for (unsigned i = 0; i < num_matches; i++) { - int ret; - u32 match = match_tab[i]; - - if (match >= XPRESS_NUM_CHARS) /* match */ - ret = xpress_write_match(ostream, match, codewords, - lens); - else /* literal byte */ - ret = bitstream_put_bits(ostream, codewords[match], - lens[match]); - if (ret != 0) - return ret; +/* Output a sequence of XPRESS matches and literals. */ +static void +xpress_write_items(struct xpress_output_bitstream *os, + const struct xpress_item items[], u32 num_items, + const u32 codewords[], const u8 lens[]) +{ + for (u32 i = 0; i < num_items; i++) { + if (items[i].offset) { + /* Match */ + xpress_write_match(items[i], os, codewords, lens); + } else { + /* Literal */ + unsigned lit = items[i].adjusted_len; + xpress_write_bits(os, codewords[lit], lens[lit]); + } } - return bitstream_put_bits(ostream, codewords[XPRESS_END_OF_DATA], - lens[XPRESS_END_OF_DATA]); + /* End-of-data symbol (required for MS compatibility) */ + xpress_write_bits(os, codewords[XPRESS_END_OF_DATA], lens[XPRESS_END_OF_DATA]); } -static u32 -xpress_record_literal(u8 literal, void *__freq_tab) +/* Make the Huffman code for XPRESS. + * + * Takes as input c->freqs and produces as output c->lens and c->codewords. */ +static void +xpress_make_huffman_code(struct xpress_compressor *c) { - freq_t *freq_tab = __freq_tab; - freq_tab[literal]++; - return literal; + make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN, + c->freqs, c->lens, c->codewords); } -static u32 -xpress_record_match(unsigned match_offset, unsigned match_len, - void *freq_tab, void *ignore) +/* Account for the Huffman symbol that would be produced by outputting the + * specified literal. Returns the intermediate representation of the literal. + */ +static inline struct xpress_item +xpress_tally_literal(u8 lit, u32 freqs[]) { - wimlib_assert(match_len >= XPRESS_MIN_MATCH && - match_len <= XPRESS_MAX_MATCH); - wimlib_assert(match_offset >= XPRESS_MIN_OFFSET && - match_offset <= XPRESS_MAX_OFFSET); + freqs[lit]++; + return (struct xpress_item) { .offset = 0, .adjusted_len = lit }; +} - /* - * The intermediate representation of XPRESS matches is as follows: - * - * bits description - * ---- ----------------------------------------------------------- - * - * 16-31 match offset (XPRESS_MIN_OFFSET < x < XPRESS_MAX_OFFSET) - * - * 0-15 adjusted match length (0 <= x <= XPRESS_MAX_MATCH - XPRESS_MIN_MATCH) - * - * Literals are simply represented as themselves and can be - * distinguished from matches by the fact that only literals will have - * the upper three bytes completely clear. */ - - u32 adjusted_match_len = match_len - XPRESS_MIN_MATCH; - u32 len_hdr = min(adjusted_match_len, 0xf); - u32 offset_bsr = bsr32(match_offset); - u32 sym = len_hdr | (offset_bsr << 4) | XPRESS_NUM_CHARS; - ((freq_t*)freq_tab)[sym]++; - return adjusted_match_len | (match_offset << 16); -} - -static const struct lz_params xpress_lz_params = { - .min_match = XPRESS_MIN_MATCH, - .max_match = XPRESS_MAX_MATCH, - .good_match = 16, - .nice_match = 32, - .max_chain_len = 16, - .max_lazy_match = 16, - .too_far = 4096, -}; +/* Account for the Huffman symbol that would be produced by outputting the + * specified match. Returns the intermediate representation of the match. */ +static inline struct xpress_item +xpress_tally_match(u32 len, u32 offset, u32 freqs[]) +{ + u32 adjusted_len = len - XPRESS_MIN_MATCH_LEN; + unsigned len_hdr = min(adjusted_len, 0xf); + unsigned sym = XPRESS_NUM_CHARS + ((bsr32(offset) << 4) | len_hdr); + + freqs[sym]++; + return (struct xpress_item) { .offset = offset, + .adjusted_len = adjusted_len }; +} -/* Documented in wimlib.h */ -WIMLIBAPI unsigned -wimlib_xpress_compress(const void *__uncompressed_data, - unsigned uncompressed_len, void *__compressed_data) +static unsigned +xpress_get_matches_fillcache(struct xpress_compressor *c, + const struct lz_match **matches_ret) { - u8 *compressed_data = __compressed_data; - struct output_bitstream ostream; - u32 match_tab[uncompressed_len]; - freq_t freq_tab[XPRESS_NUM_SYMBOLS]; - u16 codewords[XPRESS_NUM_SYMBOLS]; - u8 lens[XPRESS_NUM_SYMBOLS]; + struct lz_match *cache_ptr; + struct lz_match *matches; + unsigned num_matches; + + cache_ptr = c->cache_ptr; + matches = cache_ptr + 1; + if (likely(cache_ptr <= c->cache_limit)) { + num_matches = lz_mf_get_matches(c->mf, matches); + cache_ptr->len = num_matches; + c->cache_ptr = matches + num_matches; + } else { + num_matches = 0; + } + c->cur_window_ptr++; + *matches_ret = matches; + return num_matches; +} + +static unsigned +xpress_get_matches_usecache(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + struct lz_match *cache_ptr; + struct lz_match *matches; unsigned num_matches; - unsigned compressed_len; + + cache_ptr = c->cache_ptr; + matches = cache_ptr + 1; + if (likely(cache_ptr <= c->cache_limit)) { + num_matches = cache_ptr->len; + c->cache_ptr = matches + num_matches; + } else { + num_matches = 0; + } + c->cur_window_ptr++; + *matches_ret = matches; + return num_matches; +} + +static unsigned +xpress_get_matches_usecache_nocheck(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + struct lz_match *cache_ptr; + struct lz_match *matches; + unsigned num_matches; + + cache_ptr = c->cache_ptr; + matches = cache_ptr + 1; + num_matches = cache_ptr->len; + c->cache_ptr = matches + num_matches; + c->cur_window_ptr++; + *matches_ret = matches; + return num_matches; +} + +static unsigned +xpress_get_matches_noncaching(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + c->cur_window_ptr++; + *matches_ret = c->cached_matches; + return lz_mf_get_matches(c->mf, c->cached_matches); +} + +/* + * Find matches at the next position in the window. + * + * Returns the number of matches found and sets *matches_ret to point to the + * matches array. The matches will be sorted by strictly increasing length and + * offset. + */ +static inline unsigned +xpress_get_matches(struct xpress_compressor *c, + const struct lz_match **matches_ret) +{ + return (*c->get_matches_func)(c, matches_ret); +} + +static void +xpress_skip_bytes_fillcache(struct xpress_compressor *c, u32 n) +{ + struct lz_match *cache_ptr; + + c->cur_window_ptr += n; + cache_ptr = c->cache_ptr; + lz_mf_skip_positions(c->mf, n); + if (likely(cache_ptr <= c->cache_limit)) { + do { + cache_ptr->len = 0; + cache_ptr += 1; + } while (--n && likely(cache_ptr <= c->cache_limit)); + } + c->cache_ptr = cache_ptr; +} + +static void +xpress_skip_bytes_usecache(struct xpress_compressor *c, u32 n) +{ + struct lz_match *cache_ptr; + + c->cur_window_ptr += n; + cache_ptr = c->cache_ptr; + if (likely(cache_ptr <= c->cache_limit)) { + do { + cache_ptr += 1 + cache_ptr->len; + } while (--n && likely(cache_ptr <= c->cache_limit)); + } + c->cache_ptr = cache_ptr; +} + +static void +xpress_skip_bytes_usecache_nocheck(struct xpress_compressor *c, u32 n) +{ + struct lz_match *cache_ptr; + + c->cur_window_ptr += n; + cache_ptr = c->cache_ptr; + do { + cache_ptr += 1 + cache_ptr->len; + } while (--n); + c->cache_ptr = cache_ptr; +} + +static void +xpress_skip_bytes_noncaching(struct xpress_compressor *c, u32 n) +{ + c->cur_window_ptr += n; + lz_mf_skip_positions(c->mf, n); +} + +/* + * Skip the specified number of positions in the window (don't search for + * matches at them). + */ +static inline void +xpress_skip_bytes(struct xpress_compressor *c, u32 n) +{ + return (*c->skip_bytes_func)(c, n); +} + +/* + * Returns the cost, in bits, required to output the literal from the previous + * window position (the position at which matches were last searched). + */ +static inline u32 +xpress_prev_literal_cost(const struct xpress_compressor *c) +{ + return c->costs[*(c->cur_window_ptr - 1)]; +} + +/* + * Reverse the linked list of near-optimal matches so that they can be returned + * in forwards order. + * + * Returns the first match in the list. + */ +static struct lz_match +xpress_match_chooser_reverse_list(struct xpress_compressor *c, unsigned cur_pos) +{ + unsigned prev_link, saved_prev_link; + u32 prev_match_offset, saved_prev_match_offset; + + c->optimum_end_idx = cur_pos; + + saved_prev_link = c->optimum[cur_pos].prev.link; + saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset; + + do { + prev_link = saved_prev_link; + prev_match_offset = saved_prev_match_offset; + + saved_prev_link = c->optimum[prev_link].prev.link; + saved_prev_match_offset = c->optimum[prev_link].prev.match_offset; + + c->optimum[prev_link].next.link = cur_pos; + c->optimum[prev_link].next.match_offset = prev_match_offset; + + cur_pos = prev_link; + } while (cur_pos != 0); + + c->optimum_cur_idx = c->optimum[0].next.link; + + return (struct lz_match) + { .len = c->optimum_cur_idx, + .offset = c->optimum[0].next.match_offset, + }; +} + +/* + * Near-optimal parsing. + * + * This does a forward lowest-cost path search. The search is terminated when a + * sufficiently long match is found, when the search reaches a position with no + * alternatives, or when the temporary 'optimum' array fills up. After + * termination of the search, matches/literals will be returned one by one by + * successive calls to this function. Once all the matches/literals are used + * up, the next call to this function will begin a new search. + */ +static struct lz_match +xpress_choose_near_optimal_item(struct xpress_compressor *c) +{ + const struct lz_match *matches; + unsigned num_matches; + struct lz_match match; + unsigned cur_pos; + unsigned end_pos; + struct xpress_mc_pos_data * const optimum = c->optimum; + + if (c->optimum_cur_idx != c->optimum_end_idx) { + /* Return previously computed match or literal. */ + match.len = optimum[c->optimum_cur_idx].next.link - + c->optimum_cur_idx; + match.offset = optimum[c->optimum_cur_idx].next.match_offset; + + c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link; + return match; + } + + c->optimum_cur_idx = 0; + c->optimum_end_idx = 0; + + num_matches = xpress_get_matches(c, &matches); + + if (num_matches == 0) + return (struct lz_match) {}; + + if (matches[num_matches - 1].len >= c->params.nice_match_length) { + /* Take the long match immediately. */ + xpress_skip_bytes(c, matches[num_matches - 1].len - 1); + return matches[num_matches - 1]; + } + + /* Consider coding a literal. */ + optimum[1].cost = xpress_prev_literal_cost(c); + optimum[1].prev.link = 0; + + optimum[2].cost = MC_INFINITE_COST; + + { + /* Consider coding a match. Cost evaluation is hand-inlined so + * that we can do some performance hacks. */ + + unsigned i = 0; + unsigned len = 3; + struct xpress_mc_pos_data *optimum_ptr = &optimum[len]; + + if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) { + do { + u32 offset = matches[i].offset; + u32 offset_bsr = bsr32(offset); + unsigned len_hdr = len - XPRESS_MIN_MATCH_LEN; + unsigned sym = XPRESS_NUM_CHARS + + ((offset_bsr << 4) | len_hdr); + do { + optimum_ptr->prev.link = 0; + optimum_ptr->prev.match_offset = offset; + optimum_ptr->cost = offset_bsr + c->costs[sym]; + sym++; + optimum_ptr++; + } while (++len <= matches[i].len); + } while (++i != num_matches); + } else { + do { + u32 offset = matches[i].offset; + u32 offset_bsr = bsr32(offset); + do { + u32 adjusted_len = len - XPRESS_MIN_MATCH_LEN; + unsigned len_hdr = min(adjusted_len, 0xf); + unsigned sym = XPRESS_NUM_CHARS + + ((offset_bsr << 4) | len_hdr); + u32 cost = offset_bsr + c->costs[sym]; + if (adjusted_len >= 0xf) { + cost += 8; + if (adjusted_len - 0xf >= 0xff) + cost += 16; + } + + optimum_ptr->prev.link = 0; + optimum_ptr->prev.match_offset = offset; + optimum_ptr->cost = cost; + optimum_ptr++; + } while (++len <= matches[i].len); + } while (++i != num_matches); + } + } + + end_pos = matches[num_matches - 1].len; + cur_pos = 1; + do { + u32 cost; + u32 longest_len; + + num_matches = xpress_get_matches(c, &matches); + + if (num_matches) { + longest_len = matches[num_matches - 1].len; + if (longest_len >= c->params.nice_match_length) { + /* Take the long match immediately. */ + match = xpress_match_chooser_reverse_list(c, cur_pos); + + optimum[cur_pos].next.match_offset = + matches[num_matches - 1].offset; + optimum[cur_pos].next.link = cur_pos + longest_len; + c->optimum_end_idx = cur_pos + longest_len; + + xpress_skip_bytes(c, longest_len - 1); + + return match; + } + } else { + longest_len = 1; + } + + while (end_pos < cur_pos + longest_len) + optimum[++end_pos].cost = MC_INFINITE_COST; + + /* Consider coding a literal. */ + cost = optimum[cur_pos].cost + xpress_prev_literal_cost(c); + if (cost < optimum[cur_pos + 1].cost) { + optimum[cur_pos + 1].cost = cost; + optimum[cur_pos + 1].prev.link = cur_pos; + } + + if (num_matches) { + /* Consider coding a match. Cost evaluation is + * hand-inlined so that we can do some performance + * hacks. */ + unsigned i = 0; + unsigned len = 3; + struct xpress_mc_pos_data *optimum_ptr = &optimum[cur_pos + 3]; + u32 cur_cost = optimum[cur_pos].cost; + + if (matches[num_matches - 1].len < 0xf + XPRESS_MIN_MATCH_LEN) { + do { + u32 offset = matches[i].offset; + u32 offset_bsr = bsr32(offset); + unsigned len_hdr = len - XPRESS_MIN_MATCH_LEN; + unsigned sym = XPRESS_NUM_CHARS + + ((offset_bsr << 4) | len_hdr); + + u32 base_cost = cur_cost + offset_bsr; + do { + cost = base_cost + c->costs[sym]; + if (cost < optimum_ptr->cost) { + optimum_ptr->prev.link = cur_pos; + optimum_ptr->prev.match_offset = offset; + optimum_ptr->cost = cost; + } + sym++; + optimum_ptr++; + } while (++len <= matches[i].len); + } while (++i != num_matches); + } else { + do { + u32 offset = matches[i].offset; + u32 offset_bsr = bsr32(offset); + + u32 base_cost = cur_cost + offset_bsr; + do { + u32 adjusted_len = len - XPRESS_MIN_MATCH_LEN; + unsigned len_hdr = min(adjusted_len, 0xf); + unsigned sym = XPRESS_NUM_CHARS + + ((offset_bsr << 4) | len_hdr); + + cost = base_cost + c->costs[sym]; + if (adjusted_len >= 0xf) { + cost += 8; + if (adjusted_len - 0xf >= 0xff) + cost += 16; + } + + if (cost < optimum_ptr->cost) { + optimum_ptr->prev.link = cur_pos; + optimum_ptr->prev.match_offset = offset; + optimum_ptr->cost = cost; + } + optimum_ptr++; + } while (++len <= matches[i].len); + } while (++i != num_matches); + } + } + + cur_pos++; + + } while (cur_pos != end_pos && cur_pos != XPRESS_OPTIM_ARRAY_LENGTH); + + return xpress_match_chooser_reverse_list(c, cur_pos); +} + +/* Set default XPRESS Huffman symbol costs to kick-start the iterative + * optimization algorithm. */ +static void +xpress_set_default_costs(u8 costs[]) +{ unsigned i; - int ret; - u8 uncompressed_data[uncompressed_len + 8]; - memcpy(uncompressed_data, __uncompressed_data, uncompressed_len); - memset(uncompressed_data + uncompressed_len, 0, 8); + for (i = 0; i < XPRESS_NUM_CHARS; i++) + costs[i] = 8; - wimlib_assert(uncompressed_len <= 32768); + for (; i < XPRESS_NUM_SYMBOLS; i++) + costs[i] = 10; +} - /* XPRESS requires 256 bytes of overhead for the Huffman tables, so it's - * impossible cannot compress 256 bytes or less of data to less than the - * input size. - * - * +1 to take into account that the buffer for compressed data is 1 byte - * smaller than the buffer for uncompressed data. - * - * +4 to take into account that init_output_bitstream() requires at - * least 4 bytes of data. */ - if (uncompressed_len < XPRESS_NUM_SYMBOLS / 2 + 1 + 4) - return 0; +/* Copy the Huffman codeword lengths array @lens to the Huffman symbol costs + * array @costs, but also assign a default cost to each 0-length (unused) + * codeword. */ +static void +xpress_set_costs(u8 costs[], const u8 lens[]) +{ + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++) + costs[i] = lens[i] ? lens[i] : XPRESS_MAX_CODEWORD_LEN; +} - ZERO_ARRAY(freq_tab); - num_matches = lz_analyze_block(uncompressed_data, uncompressed_len, - match_tab, xpress_record_match, - xpress_record_literal, freq_tab, - NULL, freq_tab, - &xpress_lz_params); +/* Near-optimal parsing */ +static u32 +xpress_choose_items_near_optimal(struct xpress_compressor *c) +{ + u32 num_passes_remaining = c->params.num_optim_passes; + const u8 *window_ptr; + const u8 *window_end; + struct xpress_item *next_chosen_item; + struct lz_match raw_item; + struct xpress_item xpress_item; - freq_tab[XPRESS_END_OF_DATA]++; + xpress_set_default_costs(c->costs); + c->optimum_cur_idx = 0; + c->optimum_end_idx = 0; - make_canonical_huffman_code(XPRESS_NUM_SYMBOLS, XPRESS_MAX_CODEWORD_LEN, - freq_tab, lens, codewords); + if (c->params.num_optim_passes > 1) { + c->get_matches_func = xpress_get_matches_fillcache; + c->skip_bytes_func = xpress_skip_bytes_fillcache; + } else { + c->get_matches_func = xpress_get_matches_noncaching; + c->skip_bytes_func = xpress_skip_bytes_noncaching; + } - /* IMPORTANT NOTE: - * - * It's tempting to output the 512 Huffman codeword lengths using the - * bitstream_put_bits() function. However, this is NOT correct because - * bitstream_put_bits() will output 2 bytes at a time in little-endian - * order, which is the order that is needed for the compressed literals. - * However, the bytes in the lengths table are in order, so they need to - * be written one at a time without using bitstream_put_bits(). - * - * Because of this, init_output_bitstream() is not called until after - * the lengths table is output. - */ - for (i = 0; i < XPRESS_NUM_SYMBOLS; i += 2) - *compressed_data++ = (lens[i] & 0xf) | (lens[i + 1] << 4); - - init_output_bitstream(&ostream, compressed_data, - uncompressed_len - XPRESS_NUM_SYMBOLS / 2 - 1); - - ret = xpress_write_compressed_literals(&ostream, match_tab, - num_matches, codewords, lens); - if (ret) - return 0; + lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size); - /* Flush any bits that are buffered. */ - ret = flush_output_bitstream(&ostream); - if (ret) - return 0; + while (--num_passes_remaining) { + c->cur_window_ptr = c->cur_window; + window_ptr = c->cur_window; + window_end = window_ptr + c->cur_window_size; + c->cache_ptr = c->cached_matches; + memset(c->freqs, 0, sizeof(c->freqs)); - /* Assert that there are no output bytes between the ostream.output - * pointer and the ostream.next_bit_output pointer. This can only - * happen if bytes had been written at the ostream.output pointer before - * the last bit word was written to the stream. But, this does not - * occur since xpress_write_match() always finishes by writing some bits - * (a Huffman symbol), and the bitstream was just flushed. */ - wimlib_assert(ostream.output - ostream.next_bit_output == 2); - - /* The length of the compressed data is supposed to be the value of the - * ostream.output pointer before flushing, which is now the - * output.next_bit_output pointer after flushing. - * - * There will be an extra 2 bytes at the ostream.bit_output pointer, - * which is zeroed out. (These 2 bytes may be either the last bytes in - * the compressed data, in which case they are actually unnecessary, or - * they may precede a number of bytes embedded into the bitstream.) */ - if (ostream.bit_output > - (const u8*)__compressed_data + uncompressed_len - 3) + while (window_ptr != window_end) { + raw_item = xpress_choose_near_optimal_item(c); + if (raw_item.len >= XPRESS_MIN_MATCH_LEN) { + xpress_tally_match(raw_item.len, + raw_item.offset, c->freqs); + window_ptr += raw_item.len; + } else { + xpress_tally_literal(*window_ptr, c->freqs); + window_ptr += 1; + } + } + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); + xpress_set_costs(c->costs, c->lens); + if (c->cache_ptr <= c->cache_limit) { + c->get_matches_func = xpress_get_matches_usecache_nocheck; + c->skip_bytes_func = xpress_skip_bytes_usecache_nocheck; + } else { + c->get_matches_func = xpress_get_matches_usecache; + c->skip_bytes_func = xpress_skip_bytes_usecache; + } + } + + c->cur_window_ptr = c->cur_window; + window_ptr = c->cur_window; + window_end = window_ptr + c->cur_window_size; + c->cache_ptr = c->cached_matches; + memset(c->freqs, 0, sizeof(c->freqs)); + next_chosen_item = c->chosen_items; + + u32 unseen_cost = 9; + while (window_ptr != window_end) { + raw_item = xpress_choose_near_optimal_item(c); + if (raw_item.len >= XPRESS_MIN_MATCH_LEN) { + xpress_item = xpress_tally_match(raw_item.len, + raw_item.offset, + c->freqs); + window_ptr += raw_item.len; + } else { + xpress_item = xpress_tally_literal(*window_ptr, + c->freqs); + window_ptr += 1; + } + *next_chosen_item++ = xpress_item; + + /* When doing one-pass near-optimal parsing, rebuild the Huffman + * code occasionally. */ + if (unlikely((next_chosen_item - c->chosen_items) % 2048 == 0) && + c->cur_window_size >= 16384 && + c->params.num_optim_passes == 1) + { + xpress_make_huffman_code(c); + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i++) + c->costs[i] = c->lens[i] ? c->lens[i] : unseen_cost; + if (unseen_cost < 15) + unseen_cost++; + } + } + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); + return next_chosen_item - c->chosen_items; +} + +/* Lazy parsing */ +static u32 +xpress_choose_items_lazy(struct xpress_compressor *c) +{ + struct lz_mf *mf; + u32 len_3_too_far; + const u8 *window_ptr; + const u8 *window_end; + u32 num_matches; + struct lz_match matches[min(c->params.nice_match_length, c->params.max_search_depth)]; + struct xpress_item *next_chosen_item; + struct lz_match prev_match; + + mf = c->mf; + + lz_mf_load_window(mf, c->cur_window, c->cur_window_size); + + if (c->cur_window_size <= 8192) + len_3_too_far = 2048; + else + len_3_too_far = 4096; + + memset(c->freqs, 0, sizeof(c->freqs)); + + window_ptr = c->cur_window; + window_end = c->cur_window + c->cur_window_size; + next_chosen_item = c->chosen_items; + + for (;;) { + + /* Don't have match at previous position */ + + num_matches = lz_mf_get_matches(mf, matches); + window_ptr++; + + if (num_matches == 0 || + (matches[num_matches - 1].len == 3 && + matches[num_matches - 1].offset >= len_3_too_far)) + { + /* No matches found => output literal */ + *next_chosen_item++ = xpress_tally_literal(*(window_ptr - 1), + c->freqs); + if (window_ptr == window_end) + break; + continue; + } + + prev_match = matches[num_matches - 1]; + + have_prev_match: + /* Have match at previous position */ + + if (prev_match.len >= c->params.nice_match_length) { + /* Very long match found => output immediately */ + *next_chosen_item++ = xpress_tally_match(prev_match.len, + prev_match.offset, + c->freqs); + lz_mf_skip_positions(mf, prev_match.len - 1); + window_ptr += prev_match.len - 1; + if (window_ptr == window_end) + break; + continue; + } + + num_matches = lz_mf_get_matches(mf, matches); + window_ptr++; + + if (num_matches == 0 || + (matches[num_matches - 1].len <= prev_match.len)) + { + /* Next match is not longer => output previous match */ + *next_chosen_item++ = xpress_tally_match(prev_match.len, + prev_match.offset, + c->freqs); + lz_mf_skip_positions(mf, prev_match.len - 2); + window_ptr += prev_match.len - 2; + if (window_ptr == window_end) + break; + continue; + } + + /* Next match is longer => output literal */ + + *next_chosen_item++ = xpress_tally_literal(*(window_ptr - 2), + c->freqs); + + prev_match = matches[num_matches - 1]; + + goto have_prev_match; + } + + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); + return next_chosen_item - c->chosen_items; +} + +/* Greedy parsing */ +static u32 +xpress_choose_items_greedy(struct xpress_compressor *c) +{ + struct lz_mf *mf; + u32 len_3_too_far; + const u8 *window_ptr; + const u8 *window_end; + struct lz_match matches[min(c->params.nice_match_length, c->params.max_search_depth)]; + u32 num_matches; + struct xpress_item *next_chosen_item; + + mf = c->mf; + + lz_mf_load_window(mf, c->cur_window, c->cur_window_size); + + if (c->cur_window_size <= 8192) + len_3_too_far = 2048; + else + len_3_too_far = 4096; + + memset(c->freqs, 0, sizeof(c->freqs)); + + window_ptr = c->cur_window; + window_end = c->cur_window + c->cur_window_size; + next_chosen_item = c->chosen_items; + + do { + /* Get longest match at the current position. */ + num_matches = lz_mf_get_matches(mf, matches); + + if (num_matches == 0 || + (matches[num_matches - 1].len == 3 && + matches[num_matches - 1].offset >= len_3_too_far)) + { + *next_chosen_item++ = xpress_tally_literal(*window_ptr, c->freqs); + window_ptr += 1; + } else { + u32 len = matches[num_matches - 1].len; + u32 offset = matches[num_matches - 1].offset; + + *next_chosen_item++ = xpress_tally_match(len, offset, c->freqs); + lz_mf_skip_positions(mf, len - 1); + window_ptr += len; + } + } while (window_ptr != window_end); + + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); + return next_chosen_item - c->chosen_items; +} + +/* Huffman-only parsing */ +static u32 +xpress_choose_items_huffonly(struct xpress_compressor *c) +{ + const u8 *window_ptr; + const u8 *window_end; + struct xpress_item *next_chosen_item; + + memset(c->freqs, 0, sizeof(c->freqs)); + + window_ptr = c->cur_window; + window_end = c->cur_window + c->cur_window_size; + next_chosen_item = c->chosen_items; + + do { + *next_chosen_item++ = xpress_tally_literal(*window_ptr++, c->freqs); + } while (window_ptr != window_end); + + c->freqs[XPRESS_END_OF_DATA]++; + xpress_make_huffman_code(c); + return next_chosen_item - c->chosen_items; +} + +/* Given the specified compression level and maximum window size, build the + * parameters to use for XPRESS compression. */ +static void +xpress_build_params(unsigned int compression_level, u32 max_window_size, + struct xpress_compressor_params *xpress_params) +{ + memset(xpress_params, 0, sizeof(*xpress_params)); + + if (compression_level == 1) { + + /* Huffman only (no Lempel-Ziv matches) */ + xpress_params->mf_algo = LZ_MF_NULL; + xpress_params->choose_items_func = xpress_choose_items_huffonly; + + } else if (compression_level < 30) { + + /* Greedy parsing */ + xpress_params->mf_algo = LZ_MF_HASH_CHAINS; + xpress_params->choose_items_func = xpress_choose_items_greedy; + xpress_params->nice_match_length = compression_level; + xpress_params->max_search_depth = compression_level / 2; + + } else if (compression_level < 60) { + + /* Lazy parsing */ + xpress_params->mf_algo = LZ_MF_HASH_CHAINS; + xpress_params->choose_items_func = xpress_choose_items_lazy; + xpress_params->nice_match_length = compression_level; + xpress_params->max_search_depth = compression_level / 2; + + } else { + + /* Near-optimal parsing */ + xpress_params->choose_items_func = xpress_choose_items_near_optimal; + if (max_window_size >= 32768) + xpress_params->mf_algo = LZ_MF_BINARY_TREES; + else + xpress_params->mf_algo = LZ_MF_HASH_CHAINS; + xpress_params->num_optim_passes = compression_level / 40; + xpress_params->nice_match_length = min(compression_level / 2, + XPRESS_MAX_MATCH_LEN); + xpress_params->max_search_depth = min(compression_level, + XPRESS_MAX_MATCH_LEN); + } +} + +/* Given the specified XPRESS parameters and maximum window size, build the + * parameters to use for match-finding. */ +static void +xpress_build_mf_params(const struct xpress_compressor_params *xpress_params, + u32 max_window_size, struct lz_mf_params *mf_params) +{ + memset(mf_params, 0, sizeof(*mf_params)); + + mf_params->algorithm = xpress_params->mf_algo; + mf_params->max_window_size = max_window_size; + mf_params->min_match_len = XPRESS_MIN_MATCH_LEN; + mf_params->max_match_len = XPRESS_MAX_MATCH_LEN; + mf_params->max_search_depth = xpress_params->max_search_depth; + mf_params->nice_match_len = xpress_params->nice_match_length; +} + +static inline bool +xpress_window_size_valid(size_t window_size) +{ + return (window_size > 0 && window_size <= XPRESS_MAX_OFFSET + 1); +} + +static void +xpress_free_compressor(void *_c); + +static u64 +xpress_get_needed_memory(size_t max_window_size, unsigned int compression_level) +{ + u64 size = 0; + struct xpress_compressor_params params; + + if (!xpress_window_size_valid(max_window_size)) return 0; - *(u16*)ostream.bit_output = cpu_to_le16(0); - compressed_len = ostream.next_bit_output - (const u8*)__compressed_data; - - wimlib_assert(compressed_len <= uncompressed_len - 1); - -#ifdef ENABLE_VERIFY_COMPRESSION - /* Verify that we really get the same thing back when decompressing. */ - u8 buf[uncompressed_len]; - ret = wimlib_xpress_decompress(__compressed_data, compressed_len, - buf, uncompressed_len); - if (ret) { - ERROR("xpress_compress(): Failed to decompress data we " - "compressed"); - abort(); + + xpress_build_params(compression_level, max_window_size, ¶ms); + + size += sizeof(struct xpress_compressor); + + size += lz_mf_get_needed_memory(params.mf_algo, max_window_size); + + if (params.choose_items_func == xpress_choose_items_near_optimal) { + size += (XPRESS_OPTIM_ARRAY_LENGTH + params.nice_match_length) * + sizeof(struct xpress_mc_pos_data); + if (params.num_optim_passes > 1) { + size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS, + params.max_search_depth + 1); + size += cache_len * sizeof(struct lz_match); + } else { + size += params.max_search_depth * sizeof(struct lz_match); + } } - for (i = 0; i < uncompressed_len; i++) { - if (buf[i] != uncompressed_data[i]) { - ERROR("xpress_compress(): Data we compressed didn't " - "decompress to the original data (difference at " - "byte %u of %u)", i + 1, uncompressed_len); - abort(); + + size += max_window_size * sizeof(struct xpress_item); + + return size; +} + +static int +xpress_create_compressor(size_t max_window_size, unsigned int compression_level, + void **c_ret) +{ + struct xpress_compressor *c; + struct xpress_compressor_params params; + struct lz_mf_params mf_params; + + if (!xpress_window_size_valid(max_window_size)) + return WIMLIB_ERR_INVALID_PARAM; + + xpress_build_params(compression_level, max_window_size, ¶ms); + xpress_build_mf_params(¶ms, max_window_size, &mf_params); + + c = CALLOC(1, sizeof(struct xpress_compressor)); + if (!c) + goto oom; + + c->params = params; + + c->mf = lz_mf_alloc(&mf_params); + if (!c->mf) + goto oom; + + if (params.choose_items_func == xpress_choose_items_near_optimal) { + c->optimum = MALLOC((XPRESS_OPTIM_ARRAY_LENGTH + + params.nice_match_length) * + sizeof(struct xpress_mc_pos_data)); + if (!c->optimum) + goto oom; + if (params.num_optim_passes > 1) { + size_t cache_len = max(max_window_size * XPRESS_CACHE_PER_POS, + params.max_search_depth + 1); + c->cached_matches = MALLOC(cache_len * sizeof(struct lz_match)); + if (!c->cached_matches) + goto oom; + c->cache_limit = c->cached_matches + cache_len - + (params.max_search_depth + 1); + } else { + c->cached_matches = MALLOC(params.max_search_depth * + sizeof(struct lz_match)); + if (!c->cached_matches) + goto oom; } } -#endif - return compressed_len; + + c->chosen_items = MALLOC(max_window_size * sizeof(struct xpress_item)); + if (!c->chosen_items) + goto oom; + + *c_ret = c; + return 0; + +oom: + xpress_free_compressor(c); + return WIMLIB_ERR_NOMEM; +} + +static size_t +xpress_compress(const void *uncompressed_data, size_t uncompressed_size, + void *compressed_data, size_t compressed_size_avail, void *_c) +{ + struct xpress_compressor *c = _c; + u32 num_chosen_items; + u8 *cptr; + struct xpress_output_bitstream os; + u32 compressed_size; + + /* XPRESS requires 256 bytes of overhead for the Huffman code, so it's + * impossible to compress 256 bytes or less of data to less than the + * input size. */ + if (compressed_size_avail < XPRESS_NUM_SYMBOLS / 2 + 50) + return 0; + + /* Determine match/literal sequence to divide the data into. */ + c->cur_window = uncompressed_data; + c->cur_window_size = uncompressed_size; + num_chosen_items = (*c->params.choose_items_func)(c); + + /* Output the Huffman code as a series of 512 4-bit lengths. */ + cptr = compressed_data; + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS; i += 2) + *cptr++ = (c->lens[i] & 0xf) | (c->lens[i + 1] << 4); + + /* Output the encoded matches/literals. */ + xpress_init_output(&os, cptr, + compressed_size_avail - XPRESS_NUM_SYMBOLS / 2); + xpress_write_items(&os, c->chosen_items, num_chosen_items, + c->codewords, c->lens); + + /* Flush any pending data and get the length of the compressed data. */ + compressed_size = xpress_flush_output(&os); + if (compressed_size == 0) + return 0; + + /* Return the length of the compressed data. */ + return compressed_size + XPRESS_NUM_SYMBOLS / 2; +} + +static void +xpress_free_compressor(void *_c) +{ + struct xpress_compressor *c = _c; + + if (c) { + lz_mf_free(c->mf); + FREE(c->cached_matches); + FREE(c->optimum); + FREE(c->chosen_items); + FREE(c); + } } + +const struct compressor_ops xpress_compressor_ops = { + .get_needed_memory = xpress_get_needed_memory, + .create_compressor = xpress_create_compressor, + .compress = xpress_compress, + .free_compressor = xpress_free_compressor, +};