X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzx_compress.c;h=5e1be485f3bc9ff0590db093edc8f09599c3f0b5;hp=45bb019bb94341c1a6cf146239fffe8ff15ea943;hb=80c2fe3e6463cfd0eca5bead23a08731b6db9576;hpb=3e8aa757aaa63297f0d54007adf46411778fb6a8 diff --git a/src/lzx_compress.c b/src/lzx_compress.c index 45bb019b..5e1be485 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -79,17 +79,33 @@ * LZX_CACHE_PER_POS is the number of lz_match structures to reserve in the * match cache for each byte position. This value should be high enough so that * nearly the time, all matches found in a given block can fit in the match - * cache. However, fallback behavior on cache overflow is still required. + * cache. However, fallback behavior (immediately terminating the block) on + * cache overflow is still required. */ -#define LZX_CACHE_PER_POS 6 +#define LZX_CACHE_PER_POS 7 -#define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1)) +/* + * LZX_CACHE_LENGTH is the number of lz_match structures in the match cache, + * excluding the extra "overflow" entries. The per-position multiplier is '1 + + * LZX_CACHE_PER_POS' instead of 'LZX_CACHE_PER_POS' because there is an + * overhead of one lz_match per position, used to hold the match count at that + * position. + */ +#define LZX_CACHE_LENGTH (LZX_DIV_BLOCK_SIZE * (1 + LZX_CACHE_PER_POS)) +/* + * LZX_MAX_MATCHES_PER_POS is an upper bound on the number of matches that can + * ever be saved in the match cache for a single position. Since each match we + * save for a single position has a distinct length, we can use the number of + * possible match lengths in LZX as this bound. This bound is guaranteed to be + * valid in all cases, although if 'nice_match_length < LZX_MAX_MATCH_LEN', then + * it will never actually be reached. + */ #define LZX_MAX_MATCHES_PER_POS LZX_NUM_LENS /* * LZX_BIT_COST is a scaling factor that represents the cost to output one bit. - * THis makes it possible to consider fractional bit costs. + * This makes it possible to consider fractional bit costs. * * Note: this is only useful as a statistical trick for when the true costs are * unknown. In reality, each token in LZX requires a whole number of bits to @@ -104,14 +120,15 @@ #define LZX_CONSIDER_ALIGNED_COSTS 0 /* - * The maximum compression level at which we use the faster algorithm. + * LZX_MAX_FAST_LEVEL is the maximum compression level at which we use the + * faster algorithm. */ #define LZX_MAX_FAST_LEVEL 34 /* * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table * for finding length 2 matches. This can be as high as 16 (in which case the - * hash function is trivial), but using a smaller hash table actually speeds up + * hash function is trivial), but using a smaller hash table speeds up * compression due to reduced cache pressure. */ #define LZX_HASH2_ORDER 12 @@ -129,7 +146,6 @@ #include "wimlib/bt_matchfinder.h" #include "wimlib/compress_common.h" #include "wimlib/compressor_ops.h" -#include "wimlib/endianness.h" #include "wimlib/error.h" #include "wimlib/hc_matchfinder.h" #include "wimlib/lz_extend.h" @@ -352,6 +368,10 @@ struct lzx_compressor { /* Pointer to the compress() implementation chosen at allocation time */ void (*impl)(struct lzx_compressor *, struct lzx_output_bitstream *); + /* If true, the compressor need not preserve the input buffer if it + * compresses the data successfully. */ + bool destructive; + /* The Huffman symbol frequency counters for the current block. 
*/ struct lzx_freqs freqs; @@ -361,9 +381,26 @@ struct lzx_compressor { struct lzx_codes codes[2]; unsigned codes_index; - /* The match/literal sequence the algorithm chose for the current block. + /* + * The match/literal sequence the algorithm chose for the current block. + * + * Notes on how large this array actually needs to be: + * + * - In lzx_compress_near_optimal(), the maximum block size is + * 'LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN - 1' bytes. This occurs if + * a match of the maximum length is found on the last byte. Although + * it is impossible for this particular case to actually result in a + * parse of all literals, we reserve this many spaces anyway. + * + * - The worst case for lzx_compress_lazy() is a block of almost all + * literals that ends with a series of matches of increasing scores, + * causing a sequence of literals to be chosen before the last match + * is finally chosen. The number of items actually chosen in this + * scenario is limited by the number of distinct match scores that + * exist for matches shorter than 'nice_match_length'. Having + * 'LZX_MAX_MATCH_LEN - 1' extra spaces is plenty for now. */ - struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN + 1]; + struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE + LZX_MAX_MATCH_LEN - 1]; /* Table mapping match offset => offset slot for small offsets */ #define LZX_NUM_FAST_OFFSETS 32768 @@ -378,21 +415,58 @@ struct lzx_compressor { /* Data for near-optimal parsing */ struct { - /* The graph nodes for the current block */ + /* + * The graph nodes for the current block. + * + * We need at least 'LZX_DIV_BLOCK_SIZE + + * LZX_MAX_MATCH_LEN - 1' nodes because that is the + * maximum block size that may be used. Add 1 because + * we need a node to represent end-of-block. + * + * It is possible that nodes past end-of-block are + * accessed during match consideration, but this can + * only occur if the block was truncated at + * LZX_DIV_BLOCK_SIZE. So the same bound still applies. + * Note that since nodes past the end of the block will + * never actually have an effect on the items that are + * chosen for the block, it makes no difference what + * their costs are initialized to (if anything). + */ struct lzx_optimum_node optimum_nodes[LZX_DIV_BLOCK_SIZE + - LZX_MAX_MATCH_LEN + 1]; + LZX_MAX_MATCH_LEN - 1 + 1]; /* The cost model for the current block */ struct lzx_costs costs; - /* Cached matches for the current block */ - struct lz_match match_cache[LZX_CACHE_LEN + 1 + - LZX_MAX_MATCHES_PER_POS]; - struct lz_match *cache_overflow_mark; + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a special 'struct lz_match' whose 'length' field + * contains the number of matches that were found at + * that position; this is followed by the matches + * themselves, if any, sorted by strictly increasing + * length. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. LZX_CACHE_LENGTH is the length at which we + * actually check for overflow. 
The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at + * &match_cache[LZX_CACHE_LENGTH - 1], we write the + * match count header, then write + * LZX_MAX_MATCHES_PER_POS matches, then skip searching + * for matches at 'LZX_MAX_MATCH_LEN - 1' positions and + * write the match count header for each. + */ + struct lz_match match_cache[LZX_CACHE_LENGTH + + LZX_MAX_MATCHES_PER_POS + + LZX_MAX_MATCH_LEN - 1]; /* Hash table for finding length 2 matches */ - pos_t hash2_tab[LZX_HASH2_LENGTH] - _aligned_attribute(MATCHFINDER_ALIGNMENT); + pos_t hash2_tab[LZX_HASH2_LENGTH]; /* Binary trees matchfinder (MUST BE LAST!!!) */ struct bt_matchfinder bt_mf; @@ -400,17 +474,6 @@ struct lzx_compressor { }; }; -/* Compute a hash value for the next 2 bytes of uncompressed data. */ -static inline u32 -lz_hash_2_bytes(const u8 *in_next) -{ - u16 next_2_bytes = load_u16_unaligned(in_next); - if (LZX_HASH2_ORDER == 16) - return next_2_bytes; - else - return lz_hash(next_2_bytes, LZX_HASH2_ORDER); -} - /* * Structure to keep track of the current state of sending bits to the * compressed output buffer. @@ -426,14 +489,15 @@ struct lzx_output_bitstream { u32 bitcount; /* Pointer to the start of the output buffer. */ - le16 *start; + u8 *start; /* Pointer to the position in the output buffer at which the next coding * unit should be written. */ - le16 *next; + u8 *next; - /* Pointer past the end of the output buffer. */ - le16 *end; + /* Pointer just past the end of the output buffer, rounded down to a + * 2-byte boundary. */ + u8 *end; }; /* @@ -453,7 +517,7 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size) os->bitcount = 0; os->start = buffer; os->next = os->start; - os->end = os->start + size / sizeof(le16); + os->end = os->start + (size & ~1); } /* @@ -463,9 +527,9 @@ lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size) * bits in @bits cannot be set. At most 17 bits can be written at once. * * @max_num_bits is a compile-time constant that specifies the maximum number of - * bits that can ever be written at the call site. Currently, it is used to - * optimize away the conditional code for writing a second 16-bit coding unit - * when writing fewer than 17 bits. + * bits that can ever be written at the call site. It is used to optimize away + * the conditional code for writing a second 16-bit coding unit when writing + * fewer than 17 bits. * * If the output buffer space is exhausted, then the bits will be ignored, and * lzx_flush_output() will return 0 when it gets called. @@ -493,16 +557,20 @@ lzx_write_varbits(struct lzx_output_bitstream *os, os->bitcount -= 16; /* Write a coding unit, unless it would overflow the buffer. */ - if (os->next != os->end) - put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++); + if (os->next != os->end) { + put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next); + os->next += 2; + } /* If writing 17 bits, a second coding unit might need to be * written. But because 'max_num_bits' is a compile-time * constant, the compiler will optimize away this code at most * call sites. */ if (max_num_bits == 17 && os->bitcount == 16) { - if (os->next != os->end) - put_unaligned_u16_le(os->bitbuf, os->next++); + if (os->next != os->end) { + put_unaligned_u16_le(os->bitbuf, os->next); + os->next += 2; + } os->bitcount = 0; } } @@ -511,8 +579,7 @@ lzx_write_varbits(struct lzx_output_bitstream *os, /* Use when @num_bits is a compile-time constant. Otherwise use * lzx_write_varbits(). 
*/ static inline void -lzx_write_bits(struct lzx_output_bitstream *os, - const u32 bits, const unsigned num_bits) +lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits) { lzx_write_varbits(os, bits, num_bits, num_bits); } @@ -527,10 +594,12 @@ lzx_flush_output(struct lzx_output_bitstream *os) if (os->next == os->end) return 0; - if (os->bitcount != 0) - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++); + if (os->bitcount != 0) { + put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next); + os->next += 2; + } - return (const u8 *)os->next - (const u8 *)os->start; + return os->next - os->start; } /* Build the main, length, and aligned offset Huffman codes used in LZX. @@ -1051,8 +1120,8 @@ lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 of extra_bits = (offset + LZX_OFFSET_ADJUSTMENT) - lzx_offset_slot_base[offset_slot]; - BUILD_BUG_ON(LZX_MAINCODE_MAX_NUM_SYMBOLS > (1 << 10)); - BUILD_BUG_ON(LZX_LENCODE_NUM_SYMBOLS > (1 << 8)); + STATIC_ASSERT(LZX_MAINCODE_MAX_NUM_SYMBOLS <= (1 << 10)); + STATIC_ASSERT(LZX_LENCODE_NUM_SYMBOLS <= (1 << 8)); *(*next_chosen_item)++ = (struct lzx_item) { .data = (u64)main_symbol | ((u64)len_symbol << 10) | @@ -1144,7 +1213,7 @@ lzx_tally_item_list(struct lzx_compressor *c, struct lzx_optimum_node *cur_node) * Also, note that because of the presence of the recent offsets queue (which is * a type of adaptive state), the algorithm cannot work backwards and compute * "cost to end" instead of "cost to beginning". Furthermore, the way the - * algorithm handles this adaptive state in the "minimum-cost" parse is actually + * algorithm handles this adaptive state in the "minimum cost" parse is actually * only an approximation. It's possible for the globally optimal, minimum cost * path to contain a prefix, ending at a position, where that path prefix is * *not* the minimum cost path to that position. This can happen if such a path @@ -1172,7 +1241,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, * it is no longer needed. */ struct lzx_lru_queue queues[512]; - BUILD_BUG_ON(ARRAY_LEN(queues) < LZX_MAX_MATCH_LEN + 1); + STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1); #define QUEUE(in) (queues[(uintptr_t)(in) % ARRAY_LEN(queues)]) /* Initially, the cost to reach each node is "infinity". */ @@ -1224,7 +1293,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, matchptr = in_next - lzx_lru_queue_R0(QUEUE(in_next)); if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next)) goto R0_done; - BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2); + STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2); do { u32 cost = cur_node->cost + c->costs.match_cost[0][ @@ -1365,7 +1434,7 @@ lzx_find_min_cost_path(struct lzx_compressor * const restrict c, } } while (cur_node != end_node); - /* Return the match offset queue at the end of the minimum-cost path. */ + /* Return the match offset queue at the end of the minimum cost path. */ return QUEUE(block_end); } @@ -1409,7 +1478,7 @@ lzx_set_default_costs(struct lzx_compressor *c, const u8 *block, u32 block_size) unsigned num_used_bytes; /* The costs below are hard coded to use a scaling factor of 16. 
*/ - BUILD_BUG_ON(LZX_BIT_COST != 16); + STATIC_ASSERT(LZX_BIT_COST == 16); /* * Heuristics: @@ -1530,7 +1599,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c, struct lzx_lru_queue queue; bt_matchfinder_init(&c->bt_mf); - matchfinder_init(c->hash2_tab, LZX_HASH2_LENGTH); + memset(c->hash2_tab, 0, sizeof(c->hash2_tab)); next_hash = bt_matchfinder_hash_3_bytes(in_next); lzx_lru_queue_init(&queue); @@ -1554,16 +1623,12 @@ lzx_compress_near_optimal(struct lzx_compressor *c, max_len = in_end - in_next; nice_len = min(max_len, nice_len); - /* This extra check is needed to ensure that - * reading the next 3 bytes when looking for a - * length 2 match is valid. In addition, we - * cannot allow ourselves to find a length 2 - * match of the very last two bytes with the - * very first two bytes, since such a match has - * an offset too large to be represented. */ - if (unlikely(max_len < - max(LZ_HASH_REQUIRED_NBYTES, 3))) - { + /* This extra check is needed to ensure that we + * never output a length 2 match of the very + * last two bytes with the very first two bytes, + * since such a match has an offset too large to + * be represented. */ + if (unlikely(max_len < 3)) { in_next++; cache_ptr->length = 0; cache_ptr++; @@ -1574,14 +1639,13 @@ lzx_compress_near_optimal(struct lzx_compressor *c, lz_matchptr = cache_ptr + 1; /* Check for a length 2 match. */ - hash2 = lz_hash_2_bytes(in_next); + hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER); cur_match = c->hash2_tab[hash2]; c->hash2_tab[hash2] = in_next - in_begin; - if (matchfinder_node_valid(cur_match) && + if (cur_match != 0 && (LZX_HASH2_ORDER == 16 || load_u16_unaligned(&in_begin[cur_match]) == - load_u16_unaligned(in_next)) && - in_begin[cur_match + 2] != in_next[2]) + load_u16_unaligned(in_next))) { lz_matchptr->length = 2; lz_matchptr->offset = in_next - &in_begin[cur_match]; @@ -1621,16 +1685,14 @@ lzx_compress_near_optimal(struct lzx_compressor *c, if (unlikely(max_len > in_end - in_next)) { max_len = in_end - in_next; nice_len = min(max_len, nice_len); - if (unlikely(max_len < - max(LZ_HASH_REQUIRED_NBYTES, 3))) - { + if (unlikely(max_len < 3)) { in_next++; cache_ptr->length = 0; cache_ptr++; continue; } } - c->hash2_tab[lz_hash_2_bytes(in_next)] = + c->hash2_tab[lz_hash_2_bytes(in_next, LZX_HASH2_ORDER)] = in_next - in_begin; bt_matchfinder_skip_position(&c->bt_mf, in_begin, @@ -1645,7 +1707,7 @@ lzx_compress_near_optimal(struct lzx_compressor *c, } while (--best_len); } } while (in_next < in_block_end && - likely(cache_ptr < c->cache_overflow_mark)); + likely(cache_ptr < &c->match_cache[LZX_CACHE_LENGTH])); /* We've finished running the block through the matchfinder. * Now choose a match/literal sequence and write the block. 
*/ @@ -1671,7 +1733,7 @@ lzx_find_longest_repeat_offset_match(const u8 * const in_next, struct lzx_lru_queue queue, unsigned *rep_max_idx_ret) { - BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3); + STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); LZX_ASSERT(bytes_remaining >= 2); const unsigned max_len = min(bytes_remaining, LZX_MAX_MATCH_LEN); @@ -1949,7 +2011,8 @@ lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level) } static u64 -lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level) +lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level, + bool destructive) { u64 size = 0; @@ -1957,13 +2020,14 @@ lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level) return 0; size += lzx_get_compressor_size(max_bufsize, compression_level); - size += max_bufsize; /* in_buffer */ + if (!destructive) + size += max_bufsize; /* in_buffer */ return size; } static int lzx_create_compressor(size_t max_bufsize, unsigned compression_level, - void **c_ret) + bool destructive, void **c_ret) { unsigned window_order; struct lzx_compressor *c; @@ -1972,18 +2036,20 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, if (window_order == 0) return WIMLIB_ERR_INVALID_PARAM; - c = ALIGNED_MALLOC(lzx_get_compressor_size(max_bufsize, - compression_level), - MATCHFINDER_ALIGNMENT); + c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level)); if (!c) goto oom0; + c->destructive = destructive; + c->num_main_syms = lzx_get_num_main_syms(window_order); c->window_order = window_order; - c->in_buffer = MALLOC(max_bufsize); - if (!c->in_buffer) - goto oom1; + if (!c->destructive) { + c->in_buffer = MALLOC(max_bufsize); + if (!c->in_buffer) + goto oom1; + } if (compression_level <= LZX_MAX_FAST_LEVEL) { @@ -1991,9 +2057,13 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, c->impl = lzx_compress_lazy; c->max_search_depth = (36 * compression_level) / 20; - c->nice_match_length = min((72 * compression_level) / 20, - LZX_MAX_MATCH_LEN); + c->nice_match_length = (72 * compression_level) / 20; + /* lzx_compress_lazy() needs max_search_depth >= 2 because it + * halves the max_search_depth when attempting a lazy match, and + * max_search_depth cannot be 0. */ + if (c->max_search_depth < 2) + c->max_search_depth = 2; } else { /* Normal / high compression: Use near-optimal parsing. */ @@ -2003,8 +2073,7 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, /* Scale nice_match_length and max_search_depth with the * compression level. */ c->max_search_depth = (24 * compression_level) / 50; - c->nice_match_length = min((32 * compression_level) / 50, - LZX_MAX_MATCH_LEN); + c->nice_match_length = (32 * compression_level) / 50; /* Set a number of optimization passes appropriate for the * compression level. */ @@ -2028,33 +2097,42 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, if (compression_level >= 300) c->num_optim_passes++; } - - c->cache_overflow_mark = &c->match_cache[LZX_CACHE_LEN]; } + /* max_search_depth == 0 is invalid. 
*/ + if (c->max_search_depth < 1) + c->max_search_depth = 1; + + if (c->nice_match_length > LZX_MAX_MATCH_LEN) + c->nice_match_length = LZX_MAX_MATCH_LEN; + lzx_init_offset_slot_fast(c); *c_ret = c; return 0; oom1: - ALIGNED_FREE(c); + FREE(c); oom0: return WIMLIB_ERR_NOMEM; } static size_t -lzx_compress(const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, void *_c) +lzx_compress(const void *restrict in, size_t in_nbytes, + void *restrict out, size_t out_nbytes_avail, void *restrict _c) { struct lzx_compressor *c = _c; struct lzx_output_bitstream os; + size_t result; /* Don't bother trying to compress very small inputs. */ if (in_nbytes < 100) return 0; /* Copy the input data into the internal buffer and preprocess it. */ - memcpy(c->in_buffer, in, in_nbytes); + if (c->destructive) + c->in_buffer = (void *)in; + else + memcpy(c->in_buffer, in, in_nbytes); c->in_nbytes = in_nbytes; lzx_do_e8_preprocessing(c->in_buffer, in_nbytes); @@ -2069,7 +2147,10 @@ lzx_compress(const void *in, size_t in_nbytes, (*c->impl)(c, &os); /* Flush the output bitstream and return the compressed size or 0. */ - return lzx_flush_output(&os); + result = lzx_flush_output(&os); + if (!result && c->destructive) + lzx_undo_e8_preprocessing(c->in_buffer, c->in_nbytes); + return result; } static void @@ -2077,8 +2158,9 @@ lzx_free_compressor(void *_c) { struct lzx_compressor *c = _c; - FREE(c->in_buffer); - ALIGNED_FREE(c); + if (!c->destructive) + FREE(c->in_buffer); + FREE(c); } const struct compressor_ops lzx_compressor_ops = {
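
The match cache layout documented in the comments above can be illustrated with a small consumer loop. The following is a minimal, self-contained sketch under stated assumptions, not wimlib code: 'struct lz_match_sketch' merely mirrors the 'length' and 'offset' fields of wimlib's 'struct lz_match' as used elsewhere in this diff, and visit_match() is a hypothetical callback. It walks the cache exactly as the layout comment describes: one count header per position, followed by that position's matches in strictly increasing length order.

	/*
	 * Minimal sketch (not wimlib code) of walking the per-position match
	 * cache layout described above: each position has one header entry
	 * whose 'length' field holds the number of matches found there,
	 * followed by that many matches sorted by strictly increasing length.
	 * visit_match() is a hypothetical callback used only for illustration.
	 */
	#include <stdint.h>

	struct lz_match_sketch {
		uint32_t length;
		uint32_t offset;
	};

	static void
	walk_match_cache(const struct lz_match_sketch *cache_ptr,
			 const struct lz_match_sketch *cache_end,
			 void (*visit_match)(uint32_t pos, uint32_t len,
					     uint32_t offset))
	{
		uint32_t pos = 0;

		while (cache_ptr < cache_end) {
			/* Count header for this position */
			uint32_t num_matches = cache_ptr->length;
			const struct lz_match_sketch *m = cache_ptr + 1;

			for (uint32_t i = 0; i < num_matches; i++)
				visit_match(pos, m[i].length, m[i].offset);

			/* Advance past this position's header and matches */
			cache_ptr = m + num_matches;
			pos++;
		}
	}

This is also how the overflow bound above can be read: the writer only checks for overflow at LZX_CACHE_LENGTH, so the extra LZX_MAX_MATCHES_PER_POS + LZX_MAX_MATCH_LEN - 1 slots absorb the worst case of one final full match list plus the count headers written for the skipped positions.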