4 * Near-optimal LZ (Lempel-Ziv) parsing, or "match choosing".
6 * This is based on the algorithm used in 7-Zip's DEFLATE encoder, written by
11 * Copyright (C) 2013 Eric Biggers
13 * This file is part of wimlib, a library for working with WIM files.
15 * wimlib is free software; you can redistribute it and/or modify it under the
16 * terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 3 of the License, or (at your option)
20 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
21 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 * A PARTICULAR PURPOSE. See the GNU General Public License for more
25 * You should have received a copy of the GNU General Public License
26 * along with wimlib; if not, see http://www.gnu.org/licenses/.
29 /* Define the following structures before including this:
37 #include "wimlib/lz.h"
39 typedef input_idx_t lz_mc_cost_t;
41 #define LZ_MC_INFINITE_COST (~(lz_mc_cost_t)0)
44 * Match chooser position data:
46 * An array of these structures is used during the match-choosing algorithm.
47 * They correspond to consecutive positions in the window and are used to keep
48 * track of the cost to reach each position, and the match/literal choices that
49 * need to be chosen to reach that position.
51 struct lz_mc_pos_data {
52 /* The approximate minimum cost, in bits, to reach this position in the
53 * window which has been found so far. */
56 /* The union here is just for clarity, since the fields are used in two
57 * slightly different ways. Initially, the @prev structure is filled in
58 * first, and links go from later in the window to earlier in the
59 * window. Later, @next structure is filled in and links go from
60 * earlier in the window to later in the window. */
63 /* Position of the start of the match or literal that
64 * was taken to get to this position in the approximate
65 * minimum-cost parse. */
68 /* Offset (as in an LZ (length, offset) pair) of the
69 * match or literal that was taken to get to this
70 * position in the approximate minimum-cost parse. */
71 input_idx_t match_offset;
74 /* Position at which the match or literal starting at
75 * this position ends in the minimum-cost parse. */
78 /* Offset (as in an LZ (length, offset) pair) of the
79 * match or literal starting at this position in the
80 * approximate minimum-cost parse. */
81 input_idx_t match_offset;
85 /* Format-dependent state that exists after an approximate minimum-cost
86 * path to reach this position is taken. For example, for LZX this is
87 * the list of recently used match offsets. This could be 0 bytes if
88 * the format does not have any state that affects match costs. */
89 LZ_FORMAT_STATE state;
92 struct lz_match_chooser {
93 /* Temporary space used for the match-choosing algorithm. The size of
94 * this array must be at least one more than greedy_len but otherwise is
95 * arbitrary. More space simply allows the match-choosing algorithm to
96 * potentially find better matches (depending on the input, as always).
98 struct lz_mc_pos_data *optimum;
99 input_idx_t array_space;
101 /* When a match greater than this length is found, choose it immediately
102 * without further consideration. */
103 input_idx_t greedy_len;
105 /* When matches have been chosen, optimum_cur_idx is set to the position
106 * in the window of the next match/literal to return and optimum_end_idx
107 * is set to the position in the window at the end of the last
108 * match/literal to return. */
109 input_idx_t optimum_cur_idx;
110 input_idx_t optimum_end_idx;
113 /* Initialize the match-chooser.
115 * After calling this, multiple data buffers can be scanned with it if each is
116 * preceded with a call to lz_match_chooser_begin(). */
118 lz_match_chooser_init(struct lz_match_chooser *mc,
119 input_idx_t array_space,
120 input_idx_t greedy_len, input_idx_t max_match_len)
122 input_idx_t extra_len = min(greedy_len, max_match_len);
124 LZ_ASSERT(array_space > 0);
125 mc->optimum = MALLOC((array_space + extra_len) * sizeof(mc->optimum[0]));
126 if (mc->optimum == NULL)
128 mc->array_space = array_space;
129 mc->greedy_len = greedy_len;
134 lz_match_chooser_destroy(struct lz_match_chooser *mc)
140 lz_match_chooser_begin(struct lz_match_chooser *mc)
142 mc->optimum_cur_idx = 0;
143 mc->optimum_end_idx = 0;
147 * Reverse the linked list of near-optimal matches so that they can be returned
150 * Returns the first match in the list.
152 static _always_inline_attribute struct raw_match
153 lz_match_chooser_reverse_list(struct lz_match_chooser *mc, input_idx_t cur_pos)
155 unsigned prev_link, saved_prev_link;
156 unsigned prev_match_offset, saved_prev_match_offset;
158 mc->optimum_end_idx = cur_pos;
160 saved_prev_link = mc->optimum[cur_pos].prev.link;
161 saved_prev_match_offset = mc->optimum[cur_pos].prev.match_offset;
164 prev_link = saved_prev_link;
165 prev_match_offset = saved_prev_match_offset;
167 saved_prev_link = mc->optimum[prev_link].prev.link;
168 saved_prev_match_offset = mc->optimum[prev_link].prev.match_offset;
170 mc->optimum[prev_link].next.link = cur_pos;
171 mc->optimum[prev_link].next.match_offset = prev_match_offset;
174 } while (cur_pos != 0);
176 mc->optimum_cur_idx = mc->optimum[0].next.link;
178 return (struct raw_match)
179 { .len = mc->optimum_cur_idx,
180 .offset = mc->optimum[0].next.match_offset,
184 /* Format-specific functions inlined into lz_get_near_optimal_match(). */
186 /* Get the list of possible matches at the next position. The return value must
187 * be the number of matches found (which may be 0) and a pointer to the returned
188 * matches must be written into @matches_ret. Matches must be of distinct
189 * lengths and sorted in decreasing order by length. */
190 typedef u32 (*lz_get_matches_t)(LZ_COMPRESSOR *ctx,
191 const LZ_FORMAT_STATE *state,
192 struct raw_match **matches_ret);
194 /* Skip the specified number of bytes (don't search for matches at them). */
195 typedef void (*lz_skip_bytes_t)(LZ_COMPRESSOR *ctx, input_idx_t n);
197 /* Get the cost of the literal located at the position at which matches have
198 * most recently been searched. This can optionally update the @state to take
199 * into account format-dependent state that affects match costs, such as repeat
201 typedef u32 (lz_get_prev_literal_cost_t)(LZ_COMPRESSOR *ctx,
202 LZ_FORMAT_STATE *state);
204 /* Get the cost of a match. This can optionally update the @state to take into
205 * account format-dependent state that affects match costs, such as repeat
207 typedef u32 (lz_get_match_cost_t)(LZ_COMPRESSOR *ctx, LZ_FORMAT_STATE *state,
212 * lz_get_near_optimal_match() -
214 * Choose the optimal match or literal to use at the next position in the input.
216 * Unlike a greedy parser that always takes the longest match, or even a
217 * parser with one match/literal look-ahead like zlib, the algorithm used here
218 * may look ahead many matches/literals to determine the optimal match/literal to
219 * output next. The motivation is that the compression ratio is improved if the
220 * compressor can do things like use a shorter-than-possible match in order to
221 * allow a longer match later, and also take into account the Huffman code cost
222 * model rather than simply assuming that longer is better.
224 * Still, this is not truly an optimal parser because very long matches are
225 * taken immediately, and the raw match-finder takes some shortcuts. This is
226 * done to avoid considering many different alternatives that are unlikely to
227 * be significantly better.
229 * This algorithm is based on that used in 7-Zip's DEFLATE encoder.
231 * Each call to this function does one of two things:
233 * 1. Build a near-optimal sequence of matches/literals, up to some point, that
234 * will be returned by subsequent calls to this function, then return the
239 * 2. Return the next match/literal previously computed by a call to this
242 * The return value is a (length, offset) pair specifying the match or literal
243 * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
245 static _always_inline_attribute struct raw_match
246 lz_get_near_optimal_match(struct lz_match_chooser *mc,
247 lz_get_matches_t get_matches,
248 lz_skip_bytes_t skip_bytes,
249 lz_get_prev_literal_cost_t get_prev_literal_cost,
250 lz_get_match_cost_t get_match_cost,
252 const LZ_FORMAT_STATE *initial_state)
254 u32 num_possible_matches;
255 struct raw_match *possible_matches;
256 struct raw_match match;
257 input_idx_t longest_match_len;
259 if (mc->optimum_cur_idx != mc->optimum_end_idx) {
260 /* Case 2: Return the next match/literal already found. */
261 match.len = mc->optimum[mc->optimum_cur_idx].next.link -
263 match.offset = mc->optimum[mc->optimum_cur_idx].next.match_offset;
265 mc->optimum_cur_idx = mc->optimum[mc->optimum_cur_idx].next.link;
269 /* Case 1: Compute a new list of matches/literals to return. */
271 mc->optimum_cur_idx = 0;
272 mc->optimum_end_idx = 0;
274 /* Get matches at this position. */
275 num_possible_matches = (*get_matches)(ctx,
279 /* If no matches found, return literal. */
280 if (num_possible_matches == 0)
281 return (struct raw_match){ .len = 0 };
283 /* The matches that were found are sorted in decreasing order by length.
284 * Get the length of the longest one. */
285 longest_match_len = possible_matches[0].len;
287 /* Greedy heuristic: if the longest match that was found is greater
288 * than the number of fast bytes, return it immediately; don't both
289 * doing more work. */
290 if (longest_match_len > mc->greedy_len) {
291 (*skip_bytes)(ctx, longest_match_len - 1);
292 return possible_matches[0];
295 /* Calculate the cost to reach the next position by outputting a
297 mc->optimum[0].state = *initial_state;
298 mc->optimum[1].state = mc->optimum[0].state;
299 mc->optimum[1].cost = (*get_prev_literal_cost)(ctx, &mc->optimum[1].state);
300 mc->optimum[1].prev.link = 0;
302 /* Calculate the cost to reach any position up to and including that
303 * reached by the longest match, using the shortest (i.e. closest) match
304 * that reaches each position. */
305 for (input_idx_t len = 2, match_idx = num_possible_matches - 1;
306 len <= longest_match_len; len++)
309 LZ_ASSERT(match_idx < num_possible_matches);
311 mc->optimum[len].state = mc->optimum[0].state;
312 mc->optimum[len].prev.link = 0;
313 mc->optimum[len].prev.match_offset = possible_matches[match_idx].offset;
314 mc->optimum[len].cost = (*get_match_cost)(ctx,
315 &mc->optimum[len].state,
317 possible_matches[match_idx].offset);
318 if (len == possible_matches[match_idx].len)
322 input_idx_t cur_pos = 0;
324 /* len_end: greatest index forward at which costs have been calculated
326 input_idx_t len_end = longest_match_len;
329 /* Advance to next position. */
332 if (cur_pos == len_end || cur_pos == mc->array_space)
333 return lz_match_chooser_reverse_list(mc, cur_pos);
335 /* retrieve the number of matches available at this position */
336 num_possible_matches = (*get_matches)(ctx,
337 &mc->optimum[cur_pos].state,
340 input_idx_t new_len = 0;
342 if (num_possible_matches != 0) {
343 new_len = possible_matches[0].len;
345 /* Greedy heuristic: if we found a match greater than
346 * the number of fast bytes, stop immediately. */
347 if (new_len > mc->greedy_len) {
349 /* Build the list of matches to return and get
351 match = lz_match_chooser_reverse_list(mc, cur_pos);
353 /* Append the long match to the end of the list. */
354 mc->optimum[cur_pos].next.match_offset =
355 possible_matches[0].offset;
356 mc->optimum[cur_pos].next.link = cur_pos + new_len;
357 mc->optimum_end_idx = cur_pos + new_len;
359 /* Skip over the remaining bytes of the long match. */
360 (*skip_bytes)(ctx, new_len - 1);
362 /* Return first match in the list */
367 /* Consider proceeding with a literal byte. */
368 lz_mc_cost_t cur_cost = mc->optimum[cur_pos].cost;
369 LZ_FORMAT_STATE cur_plus_literal_state = mc->optimum[cur_pos].state;
370 lz_mc_cost_t cur_plus_literal_cost = cur_cost +
371 (*get_prev_literal_cost)(ctx,
372 &cur_plus_literal_state);
373 if (cur_plus_literal_cost < mc->optimum[cur_pos + 1].cost) {
374 mc->optimum[cur_pos + 1].cost = cur_plus_literal_cost;
375 mc->optimum[cur_pos + 1].prev.link = cur_pos;
376 mc->optimum[cur_pos + 1].state = cur_plus_literal_state;
379 if (num_possible_matches == 0)
382 /* Consider proceeding with a match. */
384 while (len_end < cur_pos + new_len)
385 mc->optimum[++len_end].cost = LZ_MC_INFINITE_COST;
387 for (input_idx_t len = 2, match_idx = num_possible_matches - 1;
388 len <= new_len; len++)
390 LZ_ASSERT(match_idx < num_possible_matches);
392 LZ_FORMAT_STATE state = mc->optimum[cur_pos].state;
395 cost = cur_cost + (*get_match_cost)(ctx,
398 possible_matches[match_idx].offset);
400 if (cost < mc->optimum[cur_pos + len].cost) {
401 mc->optimum[cur_pos + len].cost = cost;
402 mc->optimum[cur_pos + len].prev.link = cur_pos;
403 mc->optimum[cur_pos + len].prev.match_offset =
404 possible_matches[match_idx].offset;
405 mc->optimum[cur_pos + len].state = state;
408 if (len == possible_matches[match_idx].len)
414 #endif /* _LZ_OPTIMAL_H */