4 * Near-optimal LZ (Lempel-Ziv) parsing, or "match choosing".
5 * See lz_get_near_optimal_match() for details of the algorithm.
7 * This code is not concerned with actually *finding* LZ matches, as it relies
8 * on an underlying match-finder implementation that can do so.
12 * Copyright (C) 2013 Eric Biggers
14 * This file is part of wimlib, a library for working with WIM files.
16 * wimlib is free software; you can redistribute it and/or modify it under the
17 * terms of the GNU General Public License as published by the Free
18 * Software Foundation; either version 3 of the License, or (at your option)
21 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
22 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
23 * A PARTICULAR PURPOSE. See the GNU General Public License for more
26 * You should have received a copy of the GNU General Public License
27 * along with wimlib; if not, see http://www.gnu.org/licenses/.
30 /* Define the following structures before including this header:
35 * Also, the type lz_mc_cost_t can be optionally overridden by providing an
36 * appropriate typedef and defining LZ_MC_COST_T_DEFINED. */
41 #include "wimlib/lz.h"
43 #ifndef LZ_MC_COST_T_DEFINED
44 typedef input_idx_t lz_mc_cost_t;
47 #define LZ_MC_INFINITE_COST (~(lz_mc_cost_t)0)
50 * Match chooser position data:
52 * An array of these structures is used during the match-choosing algorithm.
53 * They correspond to consecutive positions in the window and are used to keep
54 * track of the cost to reach each position, and the match/literal choices that
55 * need to be chosen to reach that position.
57 struct lz_mc_pos_data {
58 /* The approximate minimum cost, in bits, to reach this position in the
59 * window which has been found so far. */
62 /* The union here is just for clarity, since the fields are used in two
63 * slightly different ways. Initially, the @prev structure is filled in
64 * first, and links go from later in the window to earlier in the
65 * window. Later, @next structure is filled in and links go from
66 * earlier in the window to later in the window. */
69 /* Position of the start of the match or literal that
70 * was taken to get to this position in the approximate
71 * minimum-cost parse. */
74 /* Offset (as in an LZ (length, offset) pair) of the
75 * match or literal that was taken to get to this
76 * position in the approximate minimum-cost parse. */
77 input_idx_t match_offset;
80 /* Position at which the match or literal starting at
81 * this position ends in the minimum-cost parse. */
84 /* Offset (as in an LZ (length, offset) pair) of the
85 * match or literal starting at this position in the
86 * approximate minimum-cost parse. */
87 input_idx_t match_offset;
91 /* Format-dependent adaptive state that exists after an approximate
92 * minimum-cost path to reach this position is taken. For example, for
93 * LZX this is the list of recently used match offsets. If the format
94 * does not have any adaptive state that affects match costs,
95 * LZ_ADAPTIVE_STATE could be set to a dummy structure of size 0. */
96 LZ_ADAPTIVE_STATE state;
99 struct lz_match_chooser {
100 /* Temporary space used for the match-choosing algorithm. The size of
101 * this array must be at least one more than @nice_len but otherwise is
102 * arbitrary. More space decreases the frequency at which the algorithm
103 * is forced to terminate early. 4096 spaces seems sufficient for most
105 struct lz_mc_pos_data *optimum;
106 input_idx_t array_space;
108 /* When a match with length greater than or equal to this length is
109 * found, choose it immediately without further consideration. */
110 input_idx_t nice_len;
112 /* When matches have been chosen, optimum_cur_idx is set to the position
113 * in the window of the next match/literal to return and optimum_end_idx
114 * is set to the position in the window at the end of the last
115 * match/literal to return. */
116 input_idx_t optimum_cur_idx;
117 input_idx_t optimum_end_idx;
120 /* Initialize the match-chooser.
122 * After calling this, multiple data buffers can be scanned with it if each is
123 * preceded with a call to lz_match_chooser_begin(). */
125 lz_match_chooser_init(struct lz_match_chooser *mc,
126 input_idx_t array_space,
127 input_idx_t nice_len, input_idx_t max_match_len)
129 input_idx_t extra_len = min(nice_len, max_match_len);
131 LZ_ASSERT(array_space > 0);
132 mc->optimum = MALLOC((array_space + extra_len) * sizeof(mc->optimum[0]));
133 if (mc->optimum == NULL)
135 mc->array_space = array_space;
136 mc->nice_len = nice_len;
140 /* Free memory allocated in lz_match_chooser_init(). */
142 lz_match_chooser_destroy(struct lz_match_chooser *mc)
147 /* Call this before starting to parse each new input string. */
149 lz_match_chooser_begin(struct lz_match_chooser *mc)
151 mc->optimum_cur_idx = 0;
152 mc->optimum_end_idx = 0;
156 * Reverse the linked list of near-optimal matches so that they can be returned
159 * Returns the first match in the list.
161 static _always_inline_attribute struct raw_match
162 lz_match_chooser_reverse_list(struct lz_match_chooser *mc, input_idx_t cur_pos)
164 unsigned prev_link, saved_prev_link;
165 unsigned prev_match_offset, saved_prev_match_offset;
167 mc->optimum_end_idx = cur_pos;
169 saved_prev_link = mc->optimum[cur_pos].prev.link;
170 saved_prev_match_offset = mc->optimum[cur_pos].prev.match_offset;
173 prev_link = saved_prev_link;
174 prev_match_offset = saved_prev_match_offset;
176 saved_prev_link = mc->optimum[prev_link].prev.link;
177 saved_prev_match_offset = mc->optimum[prev_link].prev.match_offset;
179 mc->optimum[prev_link].next.link = cur_pos;
180 mc->optimum[prev_link].next.match_offset = prev_match_offset;
183 } while (cur_pos != 0);
185 mc->optimum_cur_idx = mc->optimum[0].next.link;
187 return (struct raw_match)
188 { .len = mc->optimum_cur_idx,
189 .offset = mc->optimum[0].next.match_offset,
193 /* Format-specific functions inlined into lz_get_near_optimal_match(). */
195 /* Get the list of possible matches at the next position. The return value must
196 * be the number of matches found (which may be 0) and a pointer to the returned
197 * matches must be written into @matches_ret. Matches must be of distinct
198 * lengths and sorted in decreasing order by length. Furthermore, match lengths
199 * may not exceed the @max_match_len passed to lz_match_chooser_init(). */
200 typedef u32 (*lz_get_matches_t)(LZ_COMPRESSOR *ctx,
201 const LZ_ADAPTIVE_STATE *state,
202 struct raw_match **matches_ret);
204 /* Skip the specified number of bytes (don't search for matches at them). This
205 * is expected to be faster than simply getting the matches at each position,
206 * but the exact performance difference will be dependent on the match-finder
208 typedef void (*lz_skip_bytes_t)(LZ_COMPRESSOR *ctx, input_idx_t n);
210 /* Get the cost of the literal located at the position at which matches have
211 * most recently been searched. This can optionally update the @state to take
212 * into account format-dependent state that affects match costs, such as repeat
214 typedef lz_mc_cost_t (lz_get_prev_literal_cost_t)(LZ_COMPRESSOR *ctx,
215 LZ_ADAPTIVE_STATE *state);
217 /* Get the cost of a match. This can optionally update the @state to take into
218 * account format-dependent state that affects match costs, such as repeat
220 typedef lz_mc_cost_t (lz_get_match_cost_t)(LZ_COMPRESSOR *ctx,
221 LZ_ADAPTIVE_STATE *state,
226 * lz_get_near_optimal_match() -
228 * Choose an approximately optimal match or literal to use at the next position
229 * in the string, or "window", being LZ-encoded.
231 * This is based on the algorithm used in 7-Zip's DEFLATE encoder, written by
232 * Igor Pavlov. However it also attempts to account for adaptive state, such as
233 * a LRU queue of recent match offsets.
235 * Unlike a greedy parser that always takes the longest match, or even a "lazy"
236 * parser with one match/literal look-ahead like zlib, the algorithm used here
237 * may look ahead many matches/literals to determine the approximately optimal
238 * match/literal to code next. The motivation is that the compression ratio is
239 * improved if the compressor can do things like use a shorter-than-possible
240 * match in order to allow a longer match later, and also take into account the
241 * estimated real cost of coding each match/literal based on the underlying
244 * Still, this is not a true optimal parser for several reasons:
246 * - Very long matches (at least @nice_len) are taken immediately. This is
247 * because locations with long matches are likely to have many possible
248 * alternatives that would cause slow optimal parsing, but also such locations
249 * are already highly compressible so it is not too harmful to just grab the
252 * - Not all possible matches at each location are considered. Users of this
253 * code are expected to provide a @get_matches() function that returns a list
254 * of potentially good matches at the current position, but no more than one
255 * per length. It therefore must use some sort of heuristic (e.g. smallest or
256 * repeat offset) to choose a good match to consider for a given length, if
257 * multiple exist. Furthermore, the @get_matches() implementation may limit
258 * the total number of matches returned and/or the number of computational
259 * steps spent searching for matches at each position.
261 * - This function relies on the user-provided @get_match_cost() and
262 * @get_prev_literal_cost() functions to evaluate match and literal costs,
263 * respectively, but real compression formats use entropy encoding of the
264 * literal/match sequence, so the real cost of coding each match or literal is
265 * unknown until the parse is fully determined. It can be approximated based
266 * on iterative parses, but the end result is not guaranteed to be globally
269 * - Although this function allows @get_match_cost() and
270 * @get_prev_literal_cost() to take into account adaptive state, coding
271 * decisions made with respect to the adaptive state will be locally optimal
272 * but will not necessarily be globally optimal. This is because the
273 * algorithm only keeps the least-costly path to get to a given location and
274 * does not take into account that a slightly more costly path could result in
275 * a different adaptive state that ultimately results in a lower global cost.
277 * - The array space used by this function is bounded, so in degenerate cases it
278 * is forced to start returning matches/literals before the algorithm has
281 * Each call to this function does one of two things:
283 * 1. Build a sequence of near-optimal matches/literals, up to some point, that
284 * will be returned by subsequent calls to this function, then return the
289 * 2. Return the next match/literal previously computed by a call to this
292 * The return value is a (length, offset) pair specifying the match or literal
293 * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
295 * NOTE: this code has been factored out of the LZX compressor so that it can be
296 * shared by other formats such as LZMS. It is inlined so there is no loss of
297 * performance, especially with the different implementations of match-finding,
298 * cost evaluation, and adaptive state.
300 static _always_inline_attribute struct raw_match
301 lz_get_near_optimal_match(struct lz_match_chooser *mc,
302 lz_get_matches_t get_matches,
303 lz_skip_bytes_t skip_bytes,
304 lz_get_prev_literal_cost_t get_prev_literal_cost,
305 lz_get_match_cost_t get_match_cost,
307 const LZ_ADAPTIVE_STATE *initial_state)
309 u32 num_possible_matches;
310 struct raw_match *possible_matches;
311 struct raw_match match;
312 input_idx_t longest_match_len;
314 if (mc->optimum_cur_idx != mc->optimum_end_idx) {
315 /* Case 2: Return the next match/literal already found. */
316 match.len = mc->optimum[mc->optimum_cur_idx].next.link -
318 match.offset = mc->optimum[mc->optimum_cur_idx].next.match_offset;
320 mc->optimum_cur_idx = mc->optimum[mc->optimum_cur_idx].next.link;
324 /* Case 1: Compute a new list of matches/literals to return. */
326 mc->optimum_cur_idx = 0;
327 mc->optimum_end_idx = 0;
329 /* Get matches at this position. */
330 num_possible_matches = (*get_matches)(ctx,
334 /* If no matches found, return literal. */
335 if (num_possible_matches == 0)
336 return (struct raw_match){ .len = 0 };
338 /* The matches that were found are sorted in decreasing order by length.
339 * Get the length of the longest one. */
340 longest_match_len = possible_matches[0].len;
342 /* Greedy heuristic: if the longest match that was found is greater
343 * than nice_len, return it immediately; don't both doing more work. */
344 if (longest_match_len >= mc->nice_len) {
345 (*skip_bytes)(ctx, longest_match_len - 1);
346 return possible_matches[0];
349 /* Calculate the cost to reach the next position by coding a literal.
351 mc->optimum[1].state = *initial_state;
352 mc->optimum[1].cost = (*get_prev_literal_cost)(ctx, &mc->optimum[1].state);
353 mc->optimum[1].prev.link = 0;
355 /* Calculate the cost to reach any position up to and including that
356 * reached by the longest match. Use the shortest available match that
357 * reaches each position, assuming that @get_matches() only returned
358 * shorter matches because their estimated costs were less than that of
359 * the longest match. */
360 for (input_idx_t len = 2, match_idx = num_possible_matches - 1;
361 len <= longest_match_len; len++)
364 LZ_ASSERT(match_idx < num_possible_matches);
365 LZ_ASSERT(len <= possible_matches[match_idx].len);
367 mc->optimum[len].state = *initial_state;
368 mc->optimum[len].prev.link = 0;
369 mc->optimum[len].prev.match_offset = possible_matches[match_idx].offset;
370 mc->optimum[len].cost = (*get_match_cost)(ctx,
371 &mc->optimum[len].state,
373 possible_matches[match_idx].offset);
374 if (len == possible_matches[match_idx].len)
378 /* Step forward, calculating the estimated minimum cost to reach each
379 * position. The algorithm may find multiple paths to reach each
380 * position; only the lowest-cost path is saved.
382 * The progress of the parse is tracked in the @mc->optimum array, which
383 * for each position contains the minimum cost to reach that position,
384 * the index of the start of the match/literal taken to reach that
385 * position through the minimum-cost path, the offset of the match taken
386 * (not relevant for literals), and the adaptive state that will exist
387 * at that position after the minimum-cost path is taken. The @cur_pos
388 * variable stores the position at which the algorithm is currently
389 * considering coding choices, and the @len_end variable stores the
390 * greatest offset at which the costs of coding choices have been saved.
391 * (The algorithm guarantees that all positions before @len_end are
392 * reachable by at least one path and therefore have costs computed.)
394 * The loop terminates when any one of the following conditions occurs:
396 * 1. A match greater than @nice_len is found. When this is found, the
397 * algorithm chooses this match unconditionally, and consequently the
398 * near-optimal match/literal sequence up to and including that match
399 * is fully determined.
401 * 2. @cur_pos reaches a position not overlapped by a preceding match.
402 * In such cases, the near-optimal match/literal sequence up to
403 * @cur_pos is fully determined.
405 * 3. Failing either of the above in a degenerate case, the loop
406 * terminates when space in the @mc->optimum array is exhausted.
407 * This terminates the algorithm and forces it to start returning
408 * matches/literals even though they may not be globally optimal.
410 * Upon loop termination, a nonempty list of matches/literals has been
411 * produced and stored in the @optimum array. They are linked in
412 * reverse order, so the last thing this function does is reverse the
413 * links and return the first match/literal, leaving the rest to be
414 * returned immediately by subsequent calls to this function.
416 input_idx_t cur_pos = 0;
417 input_idx_t len_end = longest_match_len;
419 /* Advance to next position. */
422 /* Check termination conditions (2) and (3) noted above. */
423 if (cur_pos == len_end || cur_pos == mc->array_space)
424 return lz_match_chooser_reverse_list(mc, cur_pos);
426 /* Retrieve a (possibly empty) list of potentially useful
427 * matches available at this position. */
428 num_possible_matches = (*get_matches)(ctx,
429 &mc->optimum[cur_pos].state,
432 if (num_possible_matches == 0)
433 longest_match_len = 0;
435 longest_match_len = possible_matches[0].len;
437 /* Greedy heuristic and termination condition (1) noted above:
438 * if we found a match greater than @nice_len, choose it
439 * unconditionally and begin returning matches/literals. */
440 if (longest_match_len >= mc->nice_len) {
441 /* Build the list of matches to return and get
443 match = lz_match_chooser_reverse_list(mc, cur_pos);
445 /* Append the long match to the end of the list. */
446 mc->optimum[cur_pos].next.match_offset =
447 possible_matches[0].offset;
448 mc->optimum[cur_pos].next.link = cur_pos + longest_match_len;
449 mc->optimum_end_idx = cur_pos + longest_match_len;
451 /* Skip over the remaining bytes of the long match. */
452 (*skip_bytes)(ctx, longest_match_len - 1);
454 /* Return first match in the list. */
458 /* Load minimum cost to reach the current position. */
459 input_idx_t cur_cost = mc->optimum[cur_pos].cost;
461 /* Consider proceeding with a literal byte. */
463 LZ_ADAPTIVE_STATE state;
466 state = mc->optimum[cur_pos].state;
467 cost = cur_cost + (*get_prev_literal_cost)(ctx, &state);
469 if (cost < mc->optimum[cur_pos + 1].cost) {
470 mc->optimum[cur_pos + 1].cost = cost;
471 mc->optimum[cur_pos + 1].prev.link = cur_pos;
472 mc->optimum[cur_pos + 1].state = state;
476 /* If no matches were found, continue to the next position.
477 * Otherwise, consider proceeding with a match. */
479 if (num_possible_matches == 0)
482 /* Initialize any uninitialized costs up to the length of the
483 * longest match found. */
484 while (len_end < cur_pos + longest_match_len)
485 mc->optimum[++len_end].cost = LZ_MC_INFINITE_COST;
487 /* Calculate the minimum cost to reach any position up to and
488 * including that reached by the longest match. Use the
489 * shortest available match that reaches each position, assuming
490 * that @get_matches() only returned shorter matches because
491 * their estimated costs were less than that of the longest
493 for (input_idx_t len = 2, match_idx = num_possible_matches - 1;
494 len <= longest_match_len; len++)
496 LZ_ASSERT(match_idx < num_possible_matches);
497 LZ_ASSERT(len <= possible_matches[match_idx].len);
499 LZ_ADAPTIVE_STATE state;
502 state = mc->optimum[cur_pos].state;
503 cost = cur_cost + (*get_match_cost)(ctx,
506 possible_matches[match_idx].offset);
508 if (cost < mc->optimum[cur_pos + len].cost) {
509 mc->optimum[cur_pos + len].cost = cost;
510 mc->optimum[cur_pos + len].prev.link = cur_pos;
511 mc->optimum[cur_pos + len].prev.match_offset =
512 possible_matches[match_idx].offset;
513 mc->optimum[cur_pos + len].state = state;
516 if (len == possible_matches[match_idx].len)
522 #endif /* _LZ_OPTIMAL_H */