X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fxpress_compress.c;h=1b430912de7521d5b2abb622f6b1a694f7ab6281;hp=65c4b026627bdeaf912b29e04cdba435779f7cef;hb=4a20aae0dd8469a352517a0b107416ffa99ccc55;hpb=3e8aa757aaa63297f0d54007adf46411778fb6a8 diff --git a/src/xpress_compress.c b/src/xpress_compress.c index 65c4b026..1b430912 100644 --- a/src/xpress_compress.c +++ b/src/xpress_compress.c @@ -46,10 +46,10 @@ #define MIN_LEVEL_FOR_NEAR_OPTIMAL 60 /* - * The maximum window order for the matchfinder. This must be the base 2 - * logarithm of the maximum buffer size. + * Matchfinder definitions. For XPRESS, only a 16-bit matchfinder is needed. */ -#define MATCHFINDER_MAX_WINDOW_ORDER 16 +#define mf_pos_t u16 +#define MF_SUFFIX /* * Note: although XPRESS can potentially use a sliding window, it isn't well @@ -59,8 +59,6 @@ * optimizations. */ -#include - #include "wimlib/bitops.h" #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" @@ -214,7 +212,7 @@ struct xpress_output_bitstream { /* Pointer to the start of the output buffer. */ u8 *start; - /* Pointer to the location in the ouput buffer at which to write the + /* Pointer to the location in the output buffer at which to write the * next 16 bits. */ u8 *next_bits; @@ -281,7 +279,7 @@ xpress_init_output(struct xpress_output_bitstream *os, void *buffer, size_t size * If the output buffer space is exhausted, then the bits will be ignored, and * xpress_flush_output() will return 0 when it gets called. */ -static inline void +static forceinline void xpress_write_bits(struct xpress_output_bitstream *os, const u32 bits, const unsigned num_bits) { @@ -294,7 +292,7 @@ xpress_write_bits(struct xpress_output_bitstream *os, if (os->bitcount > 16) { os->bitcount -= 16; if (os->end - os->next_byte >= 2) { - put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next_bits); + put_unaligned_le16(os->bitbuf >> os->bitcount, os->next_bits); os->next_bits = os->next_bits2; os->next_bits2 = os->next_byte; os->next_byte += 2; @@ -305,7 +303,7 @@ xpress_write_bits(struct xpress_output_bitstream *os, /* * Interweave a literal byte into the output bitstream. */ -static inline void +static forceinline void xpress_write_byte(struct xpress_output_bitstream *os, u8 byte) { if (os->next_byte < os->end) @@ -315,11 +313,11 @@ xpress_write_byte(struct xpress_output_bitstream *os, u8 byte) /* * Interweave two literal bytes into the output bitstream. */ -static inline void +static forceinline void xpress_write_u16(struct xpress_output_bitstream *os, u16 v) { if (os->end - os->next_byte >= 2) { - put_unaligned_u16_le(v, os->next_byte); + put_unaligned_le16(v, os->next_byte); os->next_byte += 2; } } @@ -334,13 +332,13 @@ xpress_flush_output(struct xpress_output_bitstream *os) if (os->end - os->next_byte < 2) return 0; - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next_bits); - put_unaligned_u16_le(0, os->next_bits2); + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next_bits); + put_unaligned_le16(0, os->next_bits2); return os->next_byte - os->start; } -static inline void +static forceinline void xpress_write_extra_length_bytes(struct xpress_output_bitstream *os, unsigned adjusted_len) { @@ -355,7 +353,7 @@ xpress_write_extra_length_bytes(struct xpress_output_bitstream *os, } /* Output a match or literal. */ -static inline void +static forceinline void xpress_write_item(struct xpress_item item, struct xpress_output_bitstream *os, const u32 codewords[], const u8 lens[]) { @@ -396,11 +394,11 @@ xpress_write_item_list(struct xpress_output_bitstream *os, struct xpress_optimum_node *optimum_nodes, size_t count, const u32 codewords[], const u8 lens[]) { - struct xpress_optimum_node *cur_optimum_ptr = optimum_nodes; - struct xpress_optimum_node *end_optimum_ptr = optimum_nodes + count; + struct xpress_optimum_node *cur_node = optimum_nodes; + struct xpress_optimum_node *end_node = optimum_nodes + count; do { - unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT; + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; if (length == 1) { /* Literal */ @@ -410,22 +408,22 @@ xpress_write_item_list(struct xpress_output_bitstream *os, } else { /* Match */ unsigned adjusted_len; - unsigned offset_high_bit; + unsigned log2_offset; unsigned len_hdr; unsigned sym; adjusted_len = length - XPRESS_MIN_MATCH_LEN; - offset_high_bit = fls32(offset); + log2_offset = bsr32(offset); len_hdr = min(0xF, adjusted_len); - sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); + sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr); xpress_write_bits(os, codewords[sym], lens[sym]); xpress_write_extra_length_bytes(os, adjusted_len); - xpress_write_bits(os, offset - (1U << offset_high_bit), - offset_high_bit); + xpress_write_bits(os, offset - (1U << log2_offset), + log2_offset); } - cur_optimum_ptr += length; - } while (cur_optimum_ptr != end_optimum_ptr); + cur_node += length; + } while (cur_node != end_node); } #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -486,7 +484,7 @@ xpress_write(struct xpress_compressor *c, void *out, size_t out_nbytes_avail, /* Tally the Huffman symbol for a literal and return the intermediate * representation of that literal. */ -static inline struct xpress_item +static forceinline struct xpress_item xpress_record_literal(struct xpress_compressor *c, unsigned literal) { c->freqs[literal]++; @@ -498,21 +496,21 @@ xpress_record_literal(struct xpress_compressor *c, unsigned literal) /* Tally the Huffman symbol for a match and return the intermediate * representation of that match. */ -static inline struct xpress_item +static forceinline struct xpress_item xpress_record_match(struct xpress_compressor *c, unsigned length, unsigned offset) { unsigned adjusted_len = length - XPRESS_MIN_MATCH_LEN; unsigned len_hdr = min(adjusted_len, 0xF); - unsigned offset_high_bit = fls32(offset); - unsigned sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); + unsigned log2_offset = bsr32(offset); + unsigned sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr); c->freqs[sym]++; return (struct xpress_item) { .data = (u64)sym | ((u64)adjusted_len << 9) | - ((u64)offset_high_bit << 25) | - ((u64)(offset ^ (1U << offset_high_bit)) << 29), + ((u64)log2_offset << 25) | + ((u64)(offset ^ (1U << log2_offset)) << 29), }; } @@ -531,6 +529,7 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, const u8 * const in_end = in_begin + in_nbytes; struct xpress_item *next_chosen_item = c->chosen_items; unsigned len_3_too_far; + u32 next_hashes[2] = {}; if (in_nbytes <= 8192) len_3_too_far = 2048; @@ -545,11 +544,12 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, length = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, XPRESS_MIN_MATCH_LEN - 1, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth, + next_hashes, &offset); if (length >= XPRESS_MIN_MATCH_LEN && !(length == XPRESS_MIN_MATCH_LEN && offset >= len_3_too_far)) @@ -560,9 +560,10 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, in_next += 1; hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - length - 1); + in_next - in_begin, + in_end - in_begin, + length - 1, + next_hashes); in_next += length - 1; } else { /* No match found */ @@ -591,6 +592,7 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, const u8 * const in_end = in_begin + in_nbytes; struct xpress_item *next_chosen_item = c->chosen_items; unsigned len_3_too_far; + u32 next_hashes[2] = {}; if (in_nbytes <= 8192) len_3_too_far = 2048; @@ -608,11 +610,12 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, /* Find the longest match at the current position. */ cur_len = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, XPRESS_MIN_MATCH_LEN - 1, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth, + next_hashes, &cur_offset); in_next += 1; @@ -637,9 +640,10 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - cur_len - 1); + in_next - in_begin, + in_end - in_begin, + cur_len - 1, + next_hashes); in_next += cur_len - 1; continue; } @@ -662,11 +666,12 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, */ next_len = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, cur_len, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth / 2, + next_hashes, &next_offset); in_next += 1; @@ -685,9 +690,10 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, xpress_record_match(c, cur_len, cur_offset); hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - cur_len - 2); + in_next - in_begin, + in_end - in_begin, + cur_len - 2, + next_hashes); in_next += cur_len - 2; continue; } @@ -728,13 +734,13 @@ xpress_update_costs(struct xpress_compressor *c) */ static void xpress_tally_item_list(struct xpress_compressor *c, - struct xpress_optimum_node *end_optimum_ptr) + struct xpress_optimum_node *end_node) { - struct xpress_optimum_node *cur_optimum_ptr = c->optimum_nodes; + struct xpress_optimum_node *cur_node = c->optimum_nodes; do { - unsigned length = cur_optimum_ptr->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_optimum_ptr->item >> OPTIMUM_OFFSET_SHIFT; + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; if (length == 1) { /* Literal */ @@ -744,19 +750,19 @@ xpress_tally_item_list(struct xpress_compressor *c, } else { /* Match */ unsigned adjusted_len; - unsigned offset_high_bit; + unsigned log2_offset; unsigned len_hdr; unsigned sym; adjusted_len = length - XPRESS_MIN_MATCH_LEN; - offset_high_bit = fls32(offset); + log2_offset = bsr32(offset); len_hdr = min(0xF, adjusted_len); - sym = XPRESS_NUM_CHARS + ((offset_high_bit << 4) | len_hdr); + sym = XPRESS_NUM_CHARS + ((log2_offset << 4) | len_hdr); c->freqs[sym]++; } - cur_optimum_ptr += length; - } while (cur_optimum_ptr != end_optimum_ptr); + cur_node += length; + } while (cur_node != end_node); } /* @@ -775,10 +781,10 @@ static void xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, struct lz_match *end_cache_ptr) { - struct xpress_optimum_node *cur_optimum_ptr = c->optimum_nodes + in_nbytes; + struct xpress_optimum_node *cur_node = c->optimum_nodes + in_nbytes; struct lz_match *cache_ptr = end_cache_ptr; - cur_optimum_ptr->cost_to_end = 0; + cur_node->cost_to_end = 0; do { unsigned literal; u32 best_item; @@ -787,7 +793,7 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, struct lz_match *match; unsigned len; - cur_optimum_ptr--; + cur_node--; cache_ptr--; literal = cache_ptr->offset; @@ -795,14 +801,14 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, /* Consider coding a literal. */ best_item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; best_cost_to_end = c->costs[literal] + - (cur_optimum_ptr + 1)->cost_to_end; + (cur_node + 1)->cost_to_end; num_matches = cache_ptr->length; if (num_matches == 0) { /* No matches; the only choice is the literal. */ - cur_optimum_ptr->cost_to_end = best_cost_to_end; - cur_optimum_ptr->item = best_item; + cur_node->cost_to_end = best_cost_to_end; + cur_node->item = best_item; continue; } @@ -821,12 +827,12 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, /* All lengths are small. Optimize accordingly. */ do { unsigned offset; - unsigned offset_high_bit; + unsigned log2_offset; u32 offset_cost; offset = match->offset; - offset_high_bit = fls32(offset); - offset_cost = offset_high_bit; + log2_offset = bsr32(offset); + offset_cost = log2_offset; do { unsigned len_hdr; unsigned sym; @@ -834,10 +840,10 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, len_hdr = len - XPRESS_MIN_MATCH_LEN; sym = XPRESS_NUM_CHARS + - ((offset_high_bit << 4) | len_hdr); + ((log2_offset << 4) | len_hdr); cost_to_end = offset_cost + c->costs[sym] + - (cur_optimum_ptr + len)->cost_to_end; + (cur_node + len)->cost_to_end; if (cost_to_end < best_cost_to_end) { best_cost_to_end = cost_to_end; best_item = @@ -850,12 +856,12 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, /* Some lengths are big. */ do { unsigned offset; - unsigned offset_high_bit; + unsigned log2_offset; u32 offset_cost; offset = match->offset; - offset_high_bit = fls32(offset); - offset_cost = offset_high_bit; + log2_offset = bsr32(offset); + offset_cost = log2_offset; do { unsigned adjusted_len; unsigned len_hdr; @@ -865,10 +871,10 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, adjusted_len = len - XPRESS_MIN_MATCH_LEN; len_hdr = min(adjusted_len, 0xF); sym = XPRESS_NUM_CHARS + - ((offset_high_bit << 4) | len_hdr); + ((log2_offset << 4) | len_hdr); cost_to_end = offset_cost + c->costs[sym] + - (cur_optimum_ptr + len)->cost_to_end; + (cur_node + len)->cost_to_end; if (adjusted_len >= 0xF) { cost_to_end += 8; if (adjusted_len - 0xF >= 0xFF) @@ -884,9 +890,9 @@ xpress_find_min_cost_path(struct xpress_compressor *c, size_t in_nbytes, } while (++match != cache_ptr); } cache_ptr -= num_matches; - cur_optimum_ptr->cost_to_end = best_cost_to_end; - cur_optimum_ptr->item = best_item; - } while (cur_optimum_ptr != c->optimum_nodes); + cur_node->cost_to_end = best_cost_to_end; + cur_node->item = best_item; + } while (cur_node != c->optimum_nodes); } /* @@ -900,28 +906,23 @@ xpress_find_matches(struct xpress_compressor * restrict c, { const u8 * const in_begin = in; const u8 *in_next = in_begin; - const u8 * const in_end = in_begin + in_nbytes; struct lz_match *cache_ptr = c->match_cache; - u32 next_hash; + u32 next_hashes[2] = {}; + u32 max_len = in_nbytes; + u32 nice_len = min(max_len, c->nice_match_length); bt_matchfinder_init(&c->bt_mf); - next_hash = bt_matchfinder_hash_3_bytes(in_next); - do { + for (;;) { struct lz_match *matches; - unsigned best_len; + u32 best_len; /* If we've found so many matches that the cache might overflow * if we keep finding more, then stop finding matches. This * case is very unlikely. */ - if (unlikely(cache_ptr >= c->cache_overflow_mark)) { - do { - cache_ptr->length = 0; - cache_ptr->offset = *in_next++; - cache_ptr++; - } while (in_next != in_end); - return cache_ptr; - } + if (unlikely(cache_ptr >= c->cache_overflow_mark || + max_len < BT_MATCHFINDER_REQUIRED_NBYTES)) + break; matches = cache_ptr; @@ -931,18 +932,18 @@ xpress_find_matches(struct xpress_compressor * restrict c, cache_ptr = bt_matchfinder_get_matches(&c->bt_mf, in_begin, - in_next, - XPRESS_MIN_MATCH_LEN, - in_end - in_next, - min(in_end - in_next, c->nice_match_length), + in_next - in_begin, + max_len, + nice_len, c->max_search_depth, - &next_hash, + next_hashes, &best_len, cache_ptr); cache_ptr->length = cache_ptr - matches; - cache_ptr->offset = *in_next; - in_next++; + cache_ptr->offset = *in_next++; cache_ptr++; + max_len--; + nice_len = min(nice_len, max_len); /* * If there was a very long match found, then don't cache any @@ -954,24 +955,32 @@ xpress_find_matches(struct xpress_compressor * restrict c, * very much. If there's a long match, then the data must be * highly compressible, so it doesn't matter as much what we do. */ - if (best_len >= c->nice_match_length) { + if (best_len >= nice_len) { + if (unlikely(best_len + + BT_MATCHFINDER_REQUIRED_NBYTES >= max_len)) + break; --best_len; do { bt_matchfinder_skip_position(&c->bt_mf, in_begin, - in_next, - in_end, - min(in_end - in_next, - c->nice_match_length), + in_next - in_begin, + nice_len, c->max_search_depth, - &next_hash); - + next_hashes); cache_ptr->length = 0; cache_ptr->offset = *in_next++; cache_ptr++; + max_len--; + nice_len = min(nice_len, max_len); } while (--best_len); } - } while (in_next != in_end); + } + + while (max_len--) { + cache_ptr->length = 0; + cache_ptr->offset = *in_next++; + cache_ptr++; + } return cache_ptr; } @@ -1031,7 +1040,8 @@ xpress_get_compressor_size(size_t max_bufsize, unsigned compression_level) } static u64 -xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level) +xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level, + bool destructive) { u64 size = 0; @@ -1060,15 +1070,14 @@ xpress_get_needed_memory(size_t max_bufsize, unsigned compression_level) static int xpress_create_compressor(size_t max_bufsize, unsigned compression_level, - void **c_ret) + bool destructive, void **c_ret) { struct xpress_compressor *c; if (max_bufsize > XPRESS_MAX_BUFSIZE) return WIMLIB_ERR_INVALID_PARAM; - c = ALIGNED_MALLOC(xpress_get_compressor_size(max_bufsize, compression_level), - MATCHFINDER_ALIGNMENT); + c = MALLOC(xpress_get_compressor_size(max_bufsize, compression_level)); if (!c) goto oom0; @@ -1082,12 +1091,19 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level, if (compression_level < 30) { c->impl = xpress_compress_greedy; - c->max_search_depth = (compression_level * 24) / 16; - c->nice_match_length = (compression_level * 48) / 16; + c->max_search_depth = (compression_level * 30) / 16; + c->nice_match_length = (compression_level * 60) / 16; } else { c->impl = xpress_compress_lazy; - c->max_search_depth = (compression_level * 24) / 32; - c->nice_match_length = (compression_level * 48) / 32; + c->max_search_depth = (compression_level * 30) / 32; + c->nice_match_length = (compression_level * 60) / 32; + + /* xpress_compress_lazy() needs max_search_depth >= 2 + * because it halves the max_search_depth when + * attempting a lazy match, and max_search_depth cannot + * be 0. */ + if (c->max_search_depth < 2) + c->max_search_depth = 2; } } #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -1107,24 +1123,28 @@ xpress_create_compressor(size_t max_bufsize, unsigned compression_level, &c->match_cache[max_bufsize * CACHE_RESERVE_PER_POS]; c->impl = xpress_compress_near_optimal; - c->max_search_depth = (compression_level * 32) / 100; - c->nice_match_length = (compression_level * 50) / 100; + c->max_search_depth = (compression_level * 28) / 100; + c->nice_match_length = (compression_level * 56) / 100; c->num_optim_passes = compression_level / 40; } #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + /* max_search_depth == 0 is invalid. */ + if (c->max_search_depth < 1) + c->max_search_depth = 1; + *c_ret = c; return 0; oom1: - ALIGNED_FREE(c); + FREE(c); oom0: return WIMLIB_ERR_NOMEM; } static size_t -xpress_compress(const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, void *_c) +xpress_compress(const void *restrict in, size_t in_nbytes, + void *restrict out, size_t out_nbytes_avail, void *restrict _c) { struct xpress_compressor *c = _c; @@ -1152,7 +1172,7 @@ xpress_free_compressor(void *_c) } else #endif FREE(c->chosen_items); - ALIGNED_FREE(c); + FREE(c); } const struct compressor_ops xpress_compressor_ops = {