X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzms_compress.c;h=f42a46ee2fe8d82292265dd5d9685c7bf9f5bbdf;hp=abf39f5e89f194d5ff0fb087b28a3c4c73b57100;hb=0096c56e9a0ee15f708523075d67ea7a1d77b456;hpb=a141cf94ae7562406ea1bc32c026ac85b37751a6 diff --git a/src/lzms_compress.c b/src/lzms_compress.c index abf39f5e..f42a46ee 100644 --- a/src/lzms_compress.c +++ b/src/lzms_compress.c @@ -173,7 +173,7 @@ struct lzms_item { static inline void check_that_powers_fit_in_bitfield(void) { - BUILD_BUG_ON(LZMS_NUM_DELTA_POWER_SYMS > (1 << (31 - DELTA_SOURCE_POWER_SHIFT))); + STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT))); } /* A stripped-down version of the adaptive state in LZMS which excludes the @@ -249,7 +249,7 @@ struct lzms_optimum_node { * * Note: this adaptive state structure also does not include the * probability entries or current Huffman codewords. Those aren't - * maintained per-position and are only updated occassionally. + * maintained per-position and are only updated occasionally. */ struct lzms_adaptive_state state; } _aligned_attribute(64); @@ -287,6 +287,10 @@ struct lzms_compressor { */ bool use_delta_matches; + /* If true, the compressor need not preserve the input buffer if it + * compresses the data successfully. */ + bool destructive; + /* 'last_target_usages' is a large array that is only needed for * preprocessing, so it is in union with fields that don't need to be * initialized until after preprocessing. */ @@ -303,7 +307,8 @@ struct lzms_compressor { u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER]; /* The per-byte graph nodes for near-optimal parsing */ - struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH]; + struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH + + 1 + MAX_FAST_LENGTH]; /* Table: length => current cost for small match lengths */ u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1]; @@ -323,14 +328,7 @@ struct lzms_compressor { unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS]; unsigned delta_state; unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS]; - struct lzms_probability_entry main_probs[LZMS_NUM_MAIN_PROBS]; - struct lzms_probability_entry match_probs[LZMS_NUM_MATCH_PROBS]; - struct lzms_probability_entry lz_probs[LZMS_NUM_LZ_PROBS]; - struct lzms_probability_entry lz_rep_probs[LZMS_NUM_LZ_REP_DECISIONS] - [LZMS_NUM_LZ_REP_PROBS]; - struct lzms_probability_entry delta_probs[LZMS_NUM_DELTA_PROBS]; - struct lzms_probability_entry delta_rep_probs[LZMS_NUM_DELTA_REP_DECISIONS] - [LZMS_NUM_DELTA_REP_PROBS]; + struct lzms_probabilites probs; /* Huffman codes */ @@ -500,7 +498,7 @@ lzms_range_encoder_shift_low(struct lzms_range_encoder *rc) do { if (likely(rc->next >= rc->begin)) { if (rc->next != rc->end) { - put_unaligned_u16_le(rc->cache + + put_unaligned_le16(rc->cache + (u16)(rc->lower_bound >> 32), rc->next++); } @@ -582,42 +580,42 @@ static void lzms_encode_main_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS, - c->main_probs, &c->rc); + c->probs.main, &c->rc); } static void lzms_encode_match_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS, - c->match_probs, &c->rc); + c->probs.match, &c->rc); } static void lzms_encode_lz_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS, - c->lz_probs, &c->rc); + c->probs.lz, &c->rc); } static void lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx) { lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS, - c->lz_rep_probs[idx], &c->rc); + c->probs.lz_rep[idx], &c->rc); } static void lzms_encode_delta_bit(struct lzms_compressor *c, int bit) { lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS, - c->delta_probs, &c->rc); + c->probs.delta, &c->rc); } static void lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx) { lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS, - c->delta_rep_probs[idx], &c->rc); + c->probs.delta_rep[idx], &c->rc); } /****************************************************************************** @@ -660,7 +658,7 @@ lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits, /* Write a coding unit, unless it would underflow the buffer. */ if (os->next != os->begin) - put_unaligned_u16_le(os->bitbuf >> os->bitcount, --os->next); + put_unaligned_le16(os->bitbuf >> os->bitcount, --os->next); /* Optimization for call sites that never write more than 16 * bits at once. */ @@ -681,7 +679,7 @@ lzms_output_bitstream_flush(struct lzms_output_bitstream *os) return false; if (os->bitcount != 0) - put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), --os->next); + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), --os->next); return true; } @@ -942,7 +940,7 @@ lzms_encode_item_list(struct lzms_compressor *c, } /****************************************************************************** - * Cost evalution * + * Cost evaluation * ******************************************************************************/ /* @@ -977,7 +975,7 @@ static inline void check_cost_shift(void) { /* lzms_bit_costs is hard-coded to the current COST_SHIFT. */ - BUILD_BUG_ON(COST_SHIFT != 6); + STATIC_ASSERT(COST_SHIFT == 6); } #if 0 @@ -1018,7 +1016,7 @@ lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs) static inline u32 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal) { - return lzms_bit_0_cost(main_state, c->main_probs) + + return lzms_bit_0_cost(main_state, c->probs.main) + ((u32)c->literal_lens[literal] << COST_SHIFT); } @@ -1122,7 +1120,7 @@ lzms_update_lru_queues(struct lzms_adaptive_state *state) static inline void lzms_update_state(u8 *state_p, int bit, unsigned num_states) { - *state_p = ((*state_p << 1) | bit) % num_states; + *state_p = ((*state_p << 1) | bit) & (num_states - 1); } static inline void @@ -1182,7 +1180,7 @@ static void lzms_init_delta_matchfinder(struct lzms_compressor *c) { /* Set all entries to use an invalid power, which will never match. */ - BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER >= (1 << (32 - DELTA_SOURCE_POWER_SHIFT))); + STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT))); memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table)); /* Initialize the next hash code for each power. We can just use zeroes @@ -1197,7 +1195,7 @@ lzms_init_delta_matchfinder(struct lzms_compressor *c) * delta context with the specified @span. */ static inline u32 -lzms_delta_hash(const u8 *p, u32 span) +lzms_delta_hash(const u8 *p, const u32 pos, u32 span) { /* A delta match has a certain span and an offset that is a multiple of * that span. To reduce wasted space we use a single combined hash @@ -1205,11 +1203,11 @@ lzms_delta_hash(const u8 *p, u32 span) * include in the hash code computation the span and the low-order bits * of the current position. */ - BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3); + STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3); u8 d0 = *(p + 0) - *(p + 0 - span); u8 d1 = *(p + 1) - *(p + 1 - span); u8 d2 = *(p + 2) - *(p + 2 - span); - u32 v = ((span + ((u32)(uintptr_t)p & (span - 1))) << 24) | + u32 v = ((span + (pos & (span - 1))) << 24) | ((u32)d2 << 16) | ((u32)d1 << 8) | d0; return lz_hash(v, DELTA_HASH_ORDER); } @@ -1245,12 +1243,12 @@ lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c, const u32 span = (u32)1 << power; if (unlikely(pos < span)) continue; - const u32 next_hash = lzms_delta_hash(in_next + 1, span); + const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span); const u32 hash = c->next_delta_hashes[power]; c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos; c->next_delta_hashes[power] = next_hash; - prefetch(&c->delta_hash_table[next_hash]); + prefetchw(&c->delta_hash_table[next_hash]); } } while (in_next++, pos++, --count); } @@ -1282,10 +1280,10 @@ lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next) * can be reached using a match or literal from the current position. This is * essentially Dijkstra's algorithm in disguise: the graph nodes are positions, * the graph edges are possible matches/literals to code, and the cost of each - * edge is the estimated number of bits that will be required to output the - * corresponding match or literal. But one difference is that we actually - * compute the lowest-cost path in pieces, where each piece is terminated when - * there are no choices to be made. + * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be + * required to output the corresponding match or literal. But one difference is + * that we actually compute the lowest-cost path in pieces, where each piece is + * terminated when there are no choices to be made. * * The costs of literals and matches are estimated using the range encoder * states and the semi-adaptive Huffman codes. Except for range encoding @@ -1375,19 +1373,19 @@ begin: u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, - c->main_probs) + + c->probs.main) + lzms_bit_0_cost(cur_node->state.match_state, - c->match_probs) + + c->probs.match) + lzms_bit_1_cost(cur_node->state.lz_state, - c->lz_probs); + c->probs.lz); for (int i = 0; i < rep_idx; i++) base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i], - c->lz_rep_probs[i]); + c->probs.lz_rep[i]); if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS) base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx], - c->lz_rep_probs[rep_idx]); + c->probs.lz_rep[rep_idx]); u32 len = 2; do { @@ -1437,10 +1435,10 @@ begin: main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; /* add LZ-rep0 cost */ - cost += lzms_bit_1_cost(main_state, c->main_probs) + - lzms_bit_0_cost(match_state, c->match_probs) + - lzms_bit_1_cost(lz_state, c->lz_probs) + - lzms_bit_0_cost(lz_rep0_state, c->lz_rep_probs[0]) + + cost += lzms_bit_1_cost(main_state, c->probs.main) + + lzms_bit_0_cost(match_state, c->probs.match) + + lzms_bit_1_cost(lz_state, c->probs.lz) + + lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) + lzms_fast_length_cost(c, rep0_len); const u32 total_len = rep_len + 1 + rep0_len; @@ -1527,19 +1525,19 @@ begin: u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, - c->main_probs) + + c->probs.main) + lzms_bit_1_cost(cur_node->state.match_state, - c->match_probs) + + c->probs.match) + lzms_bit_1_cost(cur_node->state.delta_state, - c->delta_probs); + c->probs.delta); for (int i = 0; i < rep_idx; i++) base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i], - c->delta_rep_probs[i]); + c->probs.delta_rep[i]); if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS) base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx], - c->delta_rep_probs[rep_idx]); + c->probs.delta_rep[rep_idx]); u32 len = 2; do { @@ -1596,11 +1594,11 @@ begin: u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, - c->main_probs) + + c->probs.main) + lzms_bit_0_cost(cur_node->state.match_state, - c->match_probs) + + c->probs.match) + lzms_bit_0_cost(cur_node->state.lz_state, - c->lz_probs); + c->probs.lz); if (c->try_lzmatch_lit_lzrep0 && likely(in_end - (in_next + c->matches[0].length) >= 3)) @@ -1656,11 +1654,11 @@ begin: main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS; /* add LZ-rep0 cost */ - cost += lzms_bit_1_cost(main_state, c->main_probs) + - lzms_bit_0_cost(match_state, c->match_probs) + - lzms_bit_1_cost(lz_state, c->lz_probs) + + cost += lzms_bit_1_cost(main_state, c->probs.main) + + lzms_bit_0_cost(match_state, c->probs.match) + + lzms_bit_1_cost(lz_state, c->probs.lz) + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], - c->lz_rep_probs[0]) + + c->probs.lz_rep[0]) + lzms_fast_length_cost(c, rep0_len); const u32 total_len = len + 1 + rep0_len; @@ -1714,7 +1712,7 @@ begin: const u32 pos = in_next - c->in_buffer; /* Consider each possible power (log2 of span) */ - BUILD_BUG_ON(NUM_POWERS_TO_CONSIDER > LZMS_NUM_DELTA_POWER_SYMS); + STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS); for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) { const u32 span = (u32)1 << power; @@ -1722,13 +1720,13 @@ begin: if (unlikely(pos < span)) continue; - const u32 next_hash = lzms_delta_hash(in_next + 1, span); + const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span); const u32 hash = c->next_delta_hashes[power]; const u32 cur_match = c->delta_hash_table[hash]; c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos; c->next_delta_hashes[power] = next_hash; - prefetch(&c->delta_hash_table[next_hash]); + prefetchw(&c->delta_hash_table[next_hash]); if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT) continue; @@ -1743,7 +1741,7 @@ begin: /* Check the first 3 bytes before entering the * extension loop. */ - BUILD_BUG_ON(NBYTES_HASHED_FOR_DELTA != 3); + STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3); if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) != (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) || ((u8)(*(in_next + 1) - *(in_next + 1 - span)) != @@ -1755,11 +1753,16 @@ begin: /* Extend the delta match to its full length. */ const u32 len = lzms_extend_delta_match(in_next, matchptr, - 3, + NBYTES_HASHED_FOR_DELTA, in_end - in_next, span); const u32 raw_offset = offset >> power; + + if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK - + (LZMS_NUM_DELTA_REPS - 1))) + continue; + const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) | raw_offset; const u32 source = DELTA_SOURCE_TAG | @@ -1790,11 +1793,11 @@ begin: u32 base_cost = cur_node->cost + lzms_bit_1_cost(cur_node->state.main_state, - c->main_probs) + + c->probs.main) + lzms_bit_1_cost(cur_node->state.match_state, - c->match_probs) + + c->probs.match) + lzms_bit_0_cost(cur_node->state.delta_state, - c->delta_probs) + + c->probs.delta) + lzms_delta_source_cost(c, power, raw_offset); u32 l = NBYTES_HASHED_FOR_DELTA; @@ -1848,13 +1851,13 @@ begin: /* Add cost of LZ-rep0 */ const u32 cost = cur_and_lit_cost + - lzms_bit_1_cost(main_state, c->main_probs) + + lzms_bit_1_cost(main_state, c->probs.main) + lzms_bit_0_cost(cur_node->state.match_state, - c->match_probs) + + c->probs.match) + lzms_bit_1_cost(cur_node->state.lz_state, - c->lz_probs) + + c->probs.lz) + lzms_bit_0_cost(cur_node->state.lz_rep_states[0], - c->lz_rep_probs[0]) + + c->probs.lz_rep[0]) + lzms_fast_length_cost(c, rep0_len); const u32 total_len = 1 + rep0_len; @@ -1885,7 +1888,7 @@ begin: * Finalize the adaptive state that results from taking this * lowest-cost path. */ struct lzms_item item_to_take = cur_node->item; - struct lzms_optimum_node *source_node = cur_node - (item_to_take.length); + struct lzms_optimum_node *source_node = cur_node - item_to_take.length; int next_item_idx = -1; for (unsigned i = 0; i < cur_node->num_extra_items; i++) { item_to_take = cur_node->extra_items[i]; @@ -1912,9 +1915,9 @@ begin: if (source >= LZMS_NUM_DELTA_REPS) { /* Explicit offset delta match */ - u32 pair = source - (LZMS_NUM_DELTA_REPS - 1); lzms_update_delta_state(&cur_node->state, 0); - cur_node->state.upcoming_delta_pair = pair; + cur_node->state.upcoming_delta_pair = + source - (LZMS_NUM_DELTA_REPS - 1); } else { /* Repeat offset delta match */ int rep_idx = source; @@ -2013,14 +2016,7 @@ lzms_init_states_and_probabilities(struct lzms_compressor *c) for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) c->delta_rep_states[i] = 0; - lzms_init_probability_entries(c->main_probs, LZMS_NUM_MAIN_PROBS); - lzms_init_probability_entries(c->match_probs, LZMS_NUM_MATCH_PROBS); - lzms_init_probability_entries(c->lz_probs, LZMS_NUM_LZ_PROBS); - for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++) - lzms_init_probability_entries(c->lz_rep_probs[i], LZMS_NUM_LZ_REP_PROBS); - lzms_init_probability_entries(c->delta_probs, LZMS_NUM_DELTA_PROBS); - for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++) - lzms_init_probability_entries(c->delta_rep_probs[i], LZMS_NUM_DELTA_REP_PROBS); + lzms_init_probabilities(&c->probs); } static void @@ -2100,7 +2096,8 @@ lzms_finalize(struct lzms_compressor *c) } static u64 -lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level) +lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level, + bool destructive) { u64 size = 0; @@ -2109,8 +2106,8 @@ lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level) size += sizeof(struct lzms_compressor); - /* in_buffer */ - size += max_bufsize; + if (!destructive) + size += max_bufsize; /* in_buffer */ /* mf */ size += lcpit_matchfinder_get_needed_memory(max_bufsize); @@ -2120,7 +2117,7 @@ lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level) static int lzms_create_compressor(size_t max_bufsize, unsigned compression_level, - void **c_ret) + bool destructive, void **c_ret) { struct lzms_compressor *c; u32 nice_match_len; @@ -2132,6 +2129,8 @@ lzms_create_compressor(size_t max_bufsize, unsigned compression_level, if (!c) goto oom0; + c->destructive = destructive; + /* Scale nice_match_len with the compression level. But to allow an * optimization for length cost calculations, don't allow nice_match_len * to exceed MAX_FAST_LENGTH. */ @@ -2142,9 +2141,11 @@ lzms_create_compressor(size_t max_bufsize, unsigned compression_level, c->try_lit_lzrep0 = (compression_level >= 60); c->try_lzrep_lit_lzrep0 = (compression_level >= 60); - c->in_buffer = MALLOC(max_bufsize); - if (!c->in_buffer) - goto oom1; + if (!c->destructive) { + c->in_buffer = MALLOC(max_bufsize); + if (!c->in_buffer) + goto oom1; + } if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len)) goto oom2; @@ -2156,7 +2157,8 @@ lzms_create_compressor(size_t max_bufsize, unsigned compression_level, return 0; oom2: - FREE(c->in_buffer); + if (!c->destructive) + FREE(c->in_buffer); oom1: ALIGNED_FREE(c); oom0: @@ -2164,17 +2166,21 @@ oom0: } static size_t -lzms_compress(const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, void *_c) +lzms_compress(const void *restrict in, size_t in_nbytes, + void *restrict out, size_t out_nbytes_avail, void *restrict _c) { struct lzms_compressor *c = _c; + size_t result; /* Don't bother trying to compress extremely small inputs. */ if (in_nbytes < 4) return 0; /* Copy the input data into the internal buffer and preprocess it. */ - memcpy(c->in_buffer, in, in_nbytes); + if (c->destructive) + c->in_buffer = (void *)in; + else + memcpy(c->in_buffer, in, in_nbytes); c->in_nbytes = in_nbytes; lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false); @@ -2187,13 +2193,16 @@ lzms_compress(const void *in, size_t in_nbytes, lzms_range_encoder_init(&c->rc, out, out_nbytes_avail / sizeof(le16)); lzms_output_bitstream_init(&c->os, out, out_nbytes_avail / sizeof(le16)); lzms_init_states_and_probabilities(c); - lzms_init_huffman_codes(c, lzms_get_num_offset_slots(in_nbytes)); + lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes)); /* The main loop: parse and encode. */ lzms_near_optimal_parse(c); /* Return the compressed data size or 0. */ - return lzms_finalize(c); + result = lzms_finalize(c); + if (!result && c->destructive) + lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true); + return result; } static void @@ -2201,7 +2210,8 @@ lzms_free_compressor(void *_c) { struct lzms_compressor *c = _c; - FREE(c->in_buffer); + if (!c->destructive) + FREE(c->in_buffer); lcpit_matchfinder_destroy(&c->mf); ALIGNED_FREE(c); }