From 7bbec7e3377223c617d23fcfcb3de2f5adb38c38 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 1 Jan 2014 11:34:14 -0600 Subject: [PATCH] lz_optimal.h: Cleanup, better comments --- Makefile.am | 2 +- include/wimlib/lz_optimal.h | 296 ++++++++++++++++++++++++------------ src/lzms-compress.c | 14 +- src/lzx-compress.c | 4 +- 4 files changed, 212 insertions(+), 104 deletions(-) diff --git a/Makefile.am b/Makefile.am index 06400403..157e5709 100644 --- a/Makefile.am +++ b/Makefile.am @@ -93,7 +93,7 @@ libwim_la_SOURCES = \ include/wimlib/lookup_table.h \ include/wimlib/lz.h \ include/wimlib/lz_hash.h \ - include/wimlib/lz_optimal \ + include/wimlib/lz_optimal.h \ include/wimlib/lz_sarray.h \ include/wimlib/lzms.h \ include/wimlib/lzx.h \ diff --git a/include/wimlib/lz_optimal.h b/include/wimlib/lz_optimal.h index 98eec103..22129d64 100644 --- a/include/wimlib/lz_optimal.h +++ b/include/wimlib/lz_optimal.h @@ -2,9 +2,10 @@ * lz_optimal.h * * Near-optimal LZ (Lempel-Ziv) parsing, or "match choosing". + * See lz_get_near_optimal_match() for details of the algorithm. * - * This is based on the algorithm used in 7-Zip's DEFLATE encoder, written by - * Igor Pavlov. + * This code is not concerned with actually *finding* LZ matches, as it relies + * on an underlying match-finder implementation that can do so. */ /* @@ -26,17 +27,22 @@ * along with wimlib; if not, see http://www.gnu.org/licenses/. */ -/* Define the following structures before including this: +/* Define the following structures before including this header: * * LZ_COMPRESSOR - * LZ_FORMAT_STATE */ + * LZ_ADAPTIVE_STATE + * + * Also, the type lz_mc_cost_t can be optionally overridden by providing an + * appropriate typedef and defining LZ_MC_COST_T_DEFINED. */ #ifndef _LZ_OPTIMAL_H #define _LZ_OPTIMAL_H #include "wimlib/lz.h" -typedef input_idx_t lz_mc_cost_t; +#ifndef LZ_MC_COST_T_DEFINED + typedef input_idx_t lz_mc_cost_t; +#endif #define LZ_MC_INFINITE_COST (~(lz_mc_cost_t)0) @@ -82,25 +88,26 @@ struct lz_mc_pos_data { } next; }; - /* Format-dependent state that exists after an approximate minimum-cost - * path to reach this position is taken. For example, for LZX this is - * the list of recently used match offsets. This could be 0 bytes if - * the format does not have any state that affects match costs. */ - LZ_FORMAT_STATE state; + /* Format-dependent adaptive state that exists after an approximate + * minimum-cost path to reach this position is taken. For example, for + * LZX this is the list of recently used match offsets. If the format + * does not have any adaptive state that affects match costs, + * LZ_ADAPTIVE_STATE could be set to a dummy structure of size 0. */ + LZ_ADAPTIVE_STATE state; }; struct lz_match_chooser { /* Temporary space used for the match-choosing algorithm. The size of - * this array must be at least one more than greedy_len but otherwise is - * arbitrary. More space simply allows the match-choosing algorithm to - * potentially find better matches (depending on the input, as always). - */ + * this array must be at least one more than @nice_len but otherwise is + * arbitrary. More space decreases the frequency at which the algorithm + * is forced to terminate early. 4096 spaces seems sufficient for most + * real data. */ struct lz_mc_pos_data *optimum; input_idx_t array_space; - /* When a match greater than this length is found, choose it immediately - * without further consideration. */ - input_idx_t greedy_len; + /* When a match with length greater than or equal to this length is + * found, choose it immediately without further consideration. */ + input_idx_t nice_len; /* When matches have been chosen, optimum_cur_idx is set to the position * in the window of the next match/literal to return and optimum_end_idx @@ -117,25 +124,27 @@ struct lz_match_chooser { static bool lz_match_chooser_init(struct lz_match_chooser *mc, input_idx_t array_space, - input_idx_t greedy_len, input_idx_t max_match_len) + input_idx_t nice_len, input_idx_t max_match_len) { - input_idx_t extra_len = min(greedy_len, max_match_len); + input_idx_t extra_len = min(nice_len, max_match_len); LZ_ASSERT(array_space > 0); mc->optimum = MALLOC((array_space + extra_len) * sizeof(mc->optimum[0])); if (mc->optimum == NULL) return false; mc->array_space = array_space; - mc->greedy_len = greedy_len; + mc->nice_len = nice_len; return true; } +/* Free memory allocated in lz_match_chooser_init(). */ static void lz_match_chooser_destroy(struct lz_match_chooser *mc) { FREE(mc->optimum); } +/* Call this before starting to parse each new input string. */ static void lz_match_chooser_begin(struct lz_match_chooser *mc) { @@ -186,61 +195,107 @@ lz_match_chooser_reverse_list(struct lz_match_chooser *mc, input_idx_t cur_pos) /* Get the list of possible matches at the next position. The return value must * be the number of matches found (which may be 0) and a pointer to the returned * matches must be written into @matches_ret. Matches must be of distinct - * lengths and sorted in decreasing order by length. */ + * lengths and sorted in decreasing order by length. Furthermore, match lengths + * may not exceed the @max_match_len passed to lz_match_chooser_init(). */ typedef u32 (*lz_get_matches_t)(LZ_COMPRESSOR *ctx, - const LZ_FORMAT_STATE *state, + const LZ_ADAPTIVE_STATE *state, struct raw_match **matches_ret); -/* Skip the specified number of bytes (don't search for matches at them). */ +/* Skip the specified number of bytes (don't search for matches at them). This + * is expected to be faster than simply getting the matches at each position, + * but the exact performance difference will be dependent on the match-finder + * implementation. */ typedef void (*lz_skip_bytes_t)(LZ_COMPRESSOR *ctx, input_idx_t n); /* Get the cost of the literal located at the position at which matches have * most recently been searched. This can optionally update the @state to take * into account format-dependent state that affects match costs, such as repeat * offsets. */ -typedef u32 (lz_get_prev_literal_cost_t)(LZ_COMPRESSOR *ctx, - LZ_FORMAT_STATE *state); +typedef lz_mc_cost_t (lz_get_prev_literal_cost_t)(LZ_COMPRESSOR *ctx, + LZ_ADAPTIVE_STATE *state); /* Get the cost of a match. This can optionally update the @state to take into * account format-dependent state that affects match costs, such as repeat * offsets. */ -typedef u32 (lz_get_match_cost_t)(LZ_COMPRESSOR *ctx, LZ_FORMAT_STATE *state, - input_idx_t length, - input_idx_t offset); +typedef lz_mc_cost_t (lz_get_match_cost_t)(LZ_COMPRESSOR *ctx, + LZ_ADAPTIVE_STATE *state, + input_idx_t length, + input_idx_t offset); /* * lz_get_near_optimal_match() - * - * Choose the optimal match or literal to use at the next position in the input. + * Choose an approximately optimal match or literal to use at the next position + * in the string, or "window", being LZ-encoded. * - * Unlike a greedy parser that always takes the longest match, or even a + * This is based on the algorithm used in 7-Zip's DEFLATE encoder, written by + * Igor Pavlov. However it also attempts to account for adaptive state, such as + * a LRU queue of recent match offsets. + * + * Unlike a greedy parser that always takes the longest match, or even a "lazy" * parser with one match/literal look-ahead like zlib, the algorithm used here - * may look ahead many matches/literals to determine the optimal match/literal to - * output next. The motivation is that the compression ratio is improved if the - * compressor can do things like use a shorter-than-possible match in order to - * allow a longer match later, and also take into account the Huffman code cost - * model rather than simply assuming that longer is better. + * may look ahead many matches/literals to determine the approximately optimal + * match/literal to code next. The motivation is that the compression ratio is + * improved if the compressor can do things like use a shorter-than-possible + * match in order to allow a longer match later, and also take into account the + * estimated real cost of coding each match/literal based on the underlying + * entropy encoding. + * + * Still, this is not a true optimal parser for several reasons: * - * Still, this is not truly an optimal parser because very long matches are - * taken immediately, and the raw match-finder takes some shortcuts. This is - * done to avoid considering many different alternatives that are unlikely to - * be significantly better. + * - Very long matches (at least @nice_len) are taken immediately. This is + * because locations with long matches are likely to have many possible + * alternatives that would cause slow optimal parsing, but also such locations + * are already highly compressible so it is not too harmful to just grab the + * longest match. * - * This algorithm is based on that used in 7-Zip's DEFLATE encoder. + * - Not all possible matches at each location are considered. Users of this + * code are expected to provide a @get_matches() function that returns a list + * of potentially good matches at the current position, but no more than one + * per length. It therefore must use some sort of heuristic (e.g. smallest or + * repeat offset) to choose a good match to consider for a given length, if + * multiple exist. Furthermore, the @get_matches() implementation may limit + * the total number of matches returned and/or the number of computational + * steps spent searching for matches at each position. + * + * - This function relies on the user-provided @get_match_cost() and + * @get_prev_literal_cost() functions to evaluate match and literal costs, + * respectively, but real compression formats use entropy encoding of the + * literal/match sequence, so the real cost of coding each match or literal is + * unknown until the parse is fully determined. It can be approximated based + * on iterative parses, but the end result is not guaranteed to be globally + * optimal. + * + * - Although this function allows @get_match_cost() and + * @get_prev_literal_cost() to take into account adaptive state, coding + * decisions made with respect to the adaptive state will be locally optimal + * but will not necessarily be globally optimal. This is because the + * algorithm only keeps the least-costly path to get to a given location and + * does not take into account that a slightly more costly path could result in + * a different adaptive state that ultimately results in a lower global cost. + * + * - The array space used by this function is bounded, so in degenerate cases it + * is forced to start returning matches/literals before the algorithm has + * really finished. * * Each call to this function does one of two things: * - * 1. Build a near-optimal sequence of matches/literals, up to some point, that + * 1. Build a sequence of near-optimal matches/literals, up to some point, that * will be returned by subsequent calls to this function, then return the * first one. * * OR * * 2. Return the next match/literal previously computed by a call to this - * function; + * function. * * The return value is a (length, offset) pair specifying the match or literal * chosen. For literals, the length is 0 or 1 and the offset is meaningless. + * + * NOTE: this code has been factored out of the LZX compressor so that it can be + * shared by other formats such as LZMS. It is inlined so there is no loss of + * performance, especially with the different implementations of match-finding, + * cost evaluation, and adaptive state. */ static _always_inline_attribute struct raw_match lz_get_near_optimal_match(struct lz_match_chooser *mc, @@ -249,7 +304,7 @@ lz_get_near_optimal_match(struct lz_match_chooser *mc, lz_get_prev_literal_cost_t get_prev_literal_cost, lz_get_match_cost_t get_match_cost, LZ_COMPRESSOR *ctx, - const LZ_FORMAT_STATE *initial_state) + const LZ_ADAPTIVE_STATE *initial_state) { u32 num_possible_matches; struct raw_match *possible_matches; @@ -285,30 +340,31 @@ lz_get_near_optimal_match(struct lz_match_chooser *mc, longest_match_len = possible_matches[0].len; /* Greedy heuristic: if the longest match that was found is greater - * than the number of fast bytes, return it immediately; don't both - * doing more work. */ - if (longest_match_len > mc->greedy_len) { + * than nice_len, return it immediately; don't both doing more work. */ + if (longest_match_len >= mc->nice_len) { (*skip_bytes)(ctx, longest_match_len - 1); return possible_matches[0]; } - /* Calculate the cost to reach the next position by outputting a - * literal. */ - mc->optimum[0].state = *initial_state; - mc->optimum[1].state = mc->optimum[0].state; + /* Calculate the cost to reach the next position by coding a literal. + */ + mc->optimum[1].state = *initial_state; mc->optimum[1].cost = (*get_prev_literal_cost)(ctx, &mc->optimum[1].state); mc->optimum[1].prev.link = 0; /* Calculate the cost to reach any position up to and including that - * reached by the longest match, using the shortest (i.e. closest) match - * that reaches each position. */ + * reached by the longest match. Use the shortest available match that + * reaches each position, assuming that @get_matches() only returned + * shorter matches because their estimated costs were less than that of + * the longest match. */ for (input_idx_t len = 2, match_idx = num_possible_matches - 1; len <= longest_match_len; len++) { LZ_ASSERT(match_idx < num_possible_matches); + LZ_ASSERT(len <= possible_matches[match_idx].len); - mc->optimum[len].state = mc->optimum[0].state; + mc->optimum[len].state = *initial_state; mc->optimum[len].prev.link = 0; mc->optimum[len].prev.match_offset = possible_matches[match_idx].offset; mc->optimum[len].cost = (*get_match_cost)(ctx, @@ -319,79 +375,131 @@ lz_get_near_optimal_match(struct lz_match_chooser *mc, match_idx--; } + /* Step forward, calculating the estimated minimum cost to reach each + * position. The algorithm may find multiple paths to reach each + * position; only the lowest-cost path is saved. + * + * The progress of the parse is tracked in the @mc->optimum array, which + * for each position contains the minimum cost to reach that position, + * the index of the start of the match/literal taken to reach that + * position through the minimum-cost path, the offset of the match taken + * (not relevant for literals), and the adaptive state that will exist + * at that position after the minimum-cost path is taken. The @cur_pos + * variable stores the position at which the algorithm is currently + * considering coding choices, and the @len_end variable stores the + * greatest offset at which the costs of coding choices have been saved. + * (The algorithm guarantees that all positions before @len_end are + * reachable by at least one path and therefore have costs computed.) + * + * The loop terminates when any one of the following conditions occurs: + * + * 1. A match greater than @nice_len is found. When this is found, the + * algorithm chooses this match unconditionally, and consequently the + * near-optimal match/literal sequence up to and including that match + * is fully determined. + * + * 2. @cur_pos reaches a position not overlapped by a preceding match. + * In such cases, the near-optimal match/literal sequence up to + * @cur_pos is fully determined. + * + * 3. Failing either of the above in a degenerate case, the loop + * terminates when space in the @mc->optimum array is exhausted. + * This terminates the algorithm and forces it to start returning + * matches/literals even though they may not be globally optimal. + * + * Upon loop termination, a nonempty list of matches/literals has been + * produced and stored in the @optimum array. They are linked in + * reverse order, so the last thing this function does is reverse the + * links and return the first match/literal, leaving the rest to be + * returned immediately by subsequent calls to this function. + */ input_idx_t cur_pos = 0; - - /* len_end: greatest index forward at which costs have been calculated - * so far */ input_idx_t len_end = longest_match_len; - for (;;) { /* Advance to next position. */ cur_pos++; + /* Check termination conditions (2) and (3) noted above. */ if (cur_pos == len_end || cur_pos == mc->array_space) return lz_match_chooser_reverse_list(mc, cur_pos); - /* retrieve the number of matches available at this position */ + /* Retrieve a (possibly empty) list of potentially useful + * matches available at this position. */ num_possible_matches = (*get_matches)(ctx, &mc->optimum[cur_pos].state, &possible_matches); - input_idx_t new_len = 0; - - if (num_possible_matches != 0) { - new_len = possible_matches[0].len; - - /* Greedy heuristic: if we found a match greater than - * the number of fast bytes, stop immediately. */ - if (new_len > mc->greedy_len) { + if (num_possible_matches == 0) + longest_match_len = 0; + else + longest_match_len = possible_matches[0].len; + + /* Greedy heuristic and termination condition (1) noted above: + * if we found a match greater than @nice_len, choose it + * unconditionally and begin returning matches/literals. */ + if (longest_match_len >= mc->nice_len) { + /* Build the list of matches to return and get + * the first one. */ + match = lz_match_chooser_reverse_list(mc, cur_pos); + + /* Append the long match to the end of the list. */ + mc->optimum[cur_pos].next.match_offset = + possible_matches[0].offset; + mc->optimum[cur_pos].next.link = cur_pos + longest_match_len; + mc->optimum_end_idx = cur_pos + longest_match_len; + + /* Skip over the remaining bytes of the long match. */ + (*skip_bytes)(ctx, longest_match_len - 1); + + /* Return first match in the list. */ + return match; + } - /* Build the list of matches to return and get - * the first one. */ - match = lz_match_chooser_reverse_list(mc, cur_pos); + /* Load minimum cost to reach the current position. */ + input_idx_t cur_cost = mc->optimum[cur_pos].cost; - /* Append the long match to the end of the list. */ - mc->optimum[cur_pos].next.match_offset = - possible_matches[0].offset; - mc->optimum[cur_pos].next.link = cur_pos + new_len; - mc->optimum_end_idx = cur_pos + new_len; + /* Consider proceeding with a literal byte. */ + { + LZ_ADAPTIVE_STATE state; + lz_mc_cost_t cost; - /* Skip over the remaining bytes of the long match. */ - (*skip_bytes)(ctx, new_len - 1); + state = mc->optimum[cur_pos].state; + cost = cur_cost + (*get_prev_literal_cost)(ctx, &state); - /* Return first match in the list */ - return match; + if (cost < mc->optimum[cur_pos + 1].cost) { + mc->optimum[cur_pos + 1].cost = cost; + mc->optimum[cur_pos + 1].prev.link = cur_pos; + mc->optimum[cur_pos + 1].state = state; } } - /* Consider proceeding with a literal byte. */ - lz_mc_cost_t cur_cost = mc->optimum[cur_pos].cost; - LZ_FORMAT_STATE cur_plus_literal_state = mc->optimum[cur_pos].state; - lz_mc_cost_t cur_plus_literal_cost = cur_cost + - (*get_prev_literal_cost)(ctx, - &cur_plus_literal_state); - if (cur_plus_literal_cost < mc->optimum[cur_pos + 1].cost) { - mc->optimum[cur_pos + 1].cost = cur_plus_literal_cost; - mc->optimum[cur_pos + 1].prev.link = cur_pos; - mc->optimum[cur_pos + 1].state = cur_plus_literal_state; - } + /* If no matches were found, continue to the next position. + * Otherwise, consider proceeding with a match. */ if (num_possible_matches == 0) continue; - /* Consider proceeding with a match. */ - - while (len_end < cur_pos + new_len) + /* Initialize any uninitialized costs up to the length of the + * longest match found. */ + while (len_end < cur_pos + longest_match_len) mc->optimum[++len_end].cost = LZ_MC_INFINITE_COST; + /* Calculate the minimum cost to reach any position up to and + * including that reached by the longest match. Use the + * shortest available match that reaches each position, assuming + * that @get_matches() only returned shorter matches because + * their estimated costs were less than that of the longest + * match. */ for (input_idx_t len = 2, match_idx = num_possible_matches - 1; - len <= new_len; len++) + len <= longest_match_len; len++) { LZ_ASSERT(match_idx < num_possible_matches); + LZ_ASSERT(len <= possible_matches[match_idx].len); - LZ_FORMAT_STATE state = mc->optimum[cur_pos].state; + LZ_ADAPTIVE_STATE state; lz_mc_cost_t cost; + state = mc->optimum[cur_pos].state; cost = cur_cost + (*get_match_cost)(ctx, &state, len, diff --git a/src/lzms-compress.c b/src/lzms-compress.c index eb993b3f..b98ee760 100644 --- a/src/lzms-compress.c +++ b/src/lzms-compress.c @@ -50,14 +50,14 @@ #define LZMS_OPTIM_ARRAY_SIZE 1024 struct lzms_compressor; -struct lzms_cost_state { +struct lzms_adaptive_state { struct lzms_lz_lru_queues lru; u8 main_state; u8 match_state; u8 lz_match_state; }; -#define LZ_FORMAT_STATE struct lzms_cost_state -#define LZ_COMPRESSOR struct lzms_compressor +#define LZ_ADAPTIVE_STATE struct lzms_adaptive_state +#define LZ_COMPRESSOR struct lzms_compressor #include "wimlib/lz_optimal.h" /* Stucture used for writing raw bits to the end of the LZMS-compressed data as @@ -693,7 +693,7 @@ lzms_value_cost(const struct lzms_huffman_encoder *enc, u32 value) static u32 lzms_get_matches(struct lzms_compressor *ctx, - const struct lzms_cost_state *cost_state, + const struct lzms_adaptive_state *cost_state, struct raw_match **matches_ret) { u32 num_matches; @@ -733,7 +733,7 @@ lzms_skip_bytes(struct lzms_compressor *ctx, input_idx_t n) static u32 lzms_get_prev_literal_cost(struct lzms_compressor *ctx, - struct lzms_cost_state *cost_state) + struct lzms_adaptive_state *cost_state) { u8 literal = ctx->window[lz_sarray_get_pos(&ctx->lz_sarray) - 1]; u32 cost = 0; @@ -750,7 +750,7 @@ lzms_get_prev_literal_cost(struct lzms_compressor *ctx, static u32 lzms_get_match_cost(struct lzms_compressor *ctx, - struct lzms_cost_state *cost_state, + struct lzms_adaptive_state *cost_state, input_idx_t length, input_idx_t offset) { u32 cost = 0; @@ -802,7 +802,7 @@ lzms_get_match_cost(struct lzms_compressor *ctx, static struct raw_match lzms_get_near_optimal_match(struct lzms_compressor *ctx) { - struct lzms_cost_state initial_state = { + struct lzms_adaptive_state initial_state = { .lru = ctx->lru.lz, .main_state = ctx->main_range_encoder.state, .match_state = ctx->match_range_encoder.state, diff --git a/src/lzx-compress.c b/src/lzx-compress.c index a4749d21..ae4bfa72 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -265,8 +265,8 @@ struct lzx_block_spec { }; /* Include template for the match-choosing algorithm. */ -#define LZ_COMPRESSOR struct lzx_compressor -#define LZ_FORMAT_STATE struct lzx_lru_queue +#define LZ_COMPRESSOR struct lzx_compressor +#define LZ_ADAPTIVE_STATE struct lzx_lru_queue struct lzx_compressor; #include "wimlib/lz_optimal.h" -- 2.43.0