4 * Near-optimal LZ (Lempel-Ziv) parsing, or "match choosing".
6 * This is based on the algorithm used in 7-Zip's DEFLATE encoder, written by
11 * Copyright (C) 2013 Eric Biggers
13 * This file is part of wimlib, a library for working with WIM files.
15 * wimlib is free software; you can redistribute it and/or modify it under the
16 * terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 3 of the License, or (at your option)
20 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
21 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
22 * A PARTICULAR PURPOSE. See the GNU General Public License for more
25 * You should have received a copy of the GNU General Public License
26 * along with wimlib; if not, see http://www.gnu.org/licenses/.
29 /* Define the following structures before including this:
37 #include "wimlib/lz.h"
39 typedef input_idx_t lz_mc_cost_t;
41 #define LZ_MC_INFINITE_COST (~(lz_mc_cost_t)0)
44 * Match chooser position data:
46 * An array of these structures is used during the match-choosing algorithm.
47 * They correspond to consecutive positions in the window and are used to keep
48 * track of the cost to reach each position, and the match/literal choices that
49 * need to be chosen to reach that position.
51 struct lz_mc_pos_data {
52 /* The approximate minimum cost, in bits, to reach this position in the
53 * window which has been found so far. */
56 /* The union here is just for clarity, since the fields are used in two
57 * slightly different ways. Initially, the @prev structure is filled in
58 * first, and links go from later in the window to earlier in the
59 * window. Later, @next structure is filled in and links go from
60 * earlier in the window to later in the window. */
63 /* Position of the start of the match or literal that
64 * was taken to get to this position in the approximate
65 * minimum-cost parse. */
68 /* Offset (as in an LZ (length, offset) pair) of the
69 * match or literal that was taken to get to this
70 * position in the approximate minimum-cost parse. */
71 input_idx_t match_offset;
74 /* Position at which the match or literal starting at
75 * this position ends in the minimum-cost parse. */
78 /* Offset (as in an LZ (length, offset) pair) of the
79 * match or literal starting at this position in the
80 * approximate minimum-cost parse. */
81 input_idx_t match_offset;
85 /* Format-dependent state that exists after an approximate minimum-cost
86 * path to reach this position is taken. For example, for LZX this is
87 * the list of recently used match offsets. This could be 0 bytes if
88 * the format does not have any state that affects match costs. */
89 LZ_FORMAT_STATE state;
92 struct lz_match_chooser {
93 /* Temporary space used for the match-choosing algorithm. The size of
94 * this array must be at least one more than greedy_len but otherwise is
95 * arbitrary. More space simply allows the match-choosing algorithm to
96 * potentially find better matches (depending on the input, as always).
98 struct lz_mc_pos_data *optimum;
99 input_idx_t array_space;
101 /* When a match greater than this length is found, choose it immediately
102 * without further consideration. */
103 input_idx_t greedy_len;
105 /* When matches have been chosen, optimum_cur_idx is set to the position
106 * in the window of the next match/literal to return and optimum_end_idx
107 * is set to the position in the window at the end of the last
108 * match/literal to return. */
109 input_idx_t optimum_cur_idx;
110 input_idx_t optimum_end_idx;
113 /* Initialize the match-chooser.
115 * After calling this, multiple data buffers can be scanned with it if each is
116 * preceded with a call to lz_match_chooser_begin(). */
118 lz_match_chooser_init(struct lz_match_chooser *mc,
119 input_idx_t array_space,
120 input_idx_t greedy_len, input_idx_t max_match_len)
122 input_idx_t extra_len = min(greedy_len, max_match_len);
124 LZ_ASSERT(array_space > 0);
125 mc->optimum = MALLOC((array_space + extra_len) * sizeof(mc->optimum[0]));
126 if (mc->optimum == NULL)
128 mc->array_space = array_space;
129 mc->greedy_len = greedy_len;
134 lz_match_chooser_destroy(struct lz_match_chooser *mc)
140 lz_match_chooser_begin(struct lz_match_chooser *mc)
142 mc->optimum_cur_idx = 0;
143 mc->optimum_end_idx = 0;
147 * Reverse the linked list of near-optimal matches so that they can be returned
150 * Returns the first match in the list.
152 static _always_inline_attribute struct raw_match
153 lz_match_chooser_reverse_list(struct lz_match_chooser *mc, input_idx_t cur_pos)
155 unsigned prev_link, saved_prev_link;
156 unsigned prev_match_offset, saved_prev_match_offset;
158 mc->optimum_end_idx = cur_pos;
160 saved_prev_link = mc->optimum[cur_pos].prev.link;
161 saved_prev_match_offset = mc->optimum[cur_pos].prev.match_offset;
164 prev_link = saved_prev_link;
165 prev_match_offset = saved_prev_match_offset;
167 saved_prev_link = mc->optimum[prev_link].prev.link;
168 saved_prev_match_offset = mc->optimum[prev_link].prev.match_offset;
170 mc->optimum[prev_link].next.link = cur_pos;
171 mc->optimum[prev_link].next.match_offset = prev_match_offset;
174 } while (cur_pos != 0);
176 mc->optimum_cur_idx = mc->optimum[0].next.link;
178 return (struct raw_match)
179 { .len = mc->optimum_cur_idx,
180 .offset = mc->optimum[0].next.match_offset,
184 /* Format-specific functions inlined into lz_get_near_optimal_match(). */
186 /* Get the list of possible matches at the next position. The return value must
187 * be the number of matches found (which may be 0) and a pointer to the returned
188 * matches must be written into @matches_ret. Matches must be of distinct
189 * lengths and sorted in decreasing order by length. */
190 typedef u32 (*lz_get_matches_t)(LZ_COMPRESSOR *ctx,
191 const LZ_FORMAT_STATE *state,
192 struct raw_match **matches_ret);
194 /* Skip the specified number of bytes (don't search for matches at them). */
195 typedef void (*lz_skip_bytes_t)(LZ_COMPRESSOR *ctx, input_idx_t n);
197 /* Get the cost of the literal located at the position at which matches have
198 * most recently been searched. Note: this currently assumes that literals do
199 * not affect the LZ_FORMAT_STATE. */
200 typedef u32 (lz_get_prev_literal_cost_t)(LZ_COMPRESSOR *ctx);
202 /* Get the cost of a match. This can optionally update the @state to take into
203 * account format-dependent state that affects match costs, such as repeat
205 typedef u32 (lz_get_match_cost_t)(LZ_COMPRESSOR *ctx, LZ_FORMAT_STATE *state,
210 * lz_get_near_optimal_match() -
212 * Choose the optimal match or literal to use at the next position in the input.
214 * Unlike a greedy parser that always takes the longest match, or even a
215 * parser with one match/literal look-ahead like zlib, the algorithm used here
216 * may look ahead many matches/literals to determine the optimal match/literal to
217 * output next. The motivation is that the compression ratio is improved if the
218 * compressor can do things like use a shorter-than-possible match in order to
219 * allow a longer match later, and also take into account the Huffman code cost
220 * model rather than simply assuming that longer is better.
222 * Still, this is not truly an optimal parser because very long matches are
223 * taken immediately, and the raw match-finder takes some shortcuts. This is
224 * done to avoid considering many different alternatives that are unlikely to
225 * be significantly better.
227 * This algorithm is based on that used in 7-Zip's DEFLATE encoder.
229 * Each call to this function does one of two things:
231 * 1. Build a near-optimal sequence of matches/literals, up to some point, that
232 * will be returned by subsequent calls to this function, then return the
237 * 2. Return the next match/literal previously computed by a call to this
240 * The return value is a (length, offset) pair specifying the match or literal
241 * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
243 static _always_inline_attribute struct raw_match
244 lz_get_near_optimal_match(struct lz_match_chooser *mc,
245 lz_get_matches_t get_matches,
246 lz_skip_bytes_t skip_bytes,
247 lz_get_prev_literal_cost_t get_prev_literal_cost,
248 lz_get_match_cost_t get_match_cost,
250 const LZ_FORMAT_STATE *initial_state)
252 u32 num_possible_matches;
253 struct raw_match *possible_matches;
254 struct raw_match match;
255 input_idx_t longest_match_len;
257 if (mc->optimum_cur_idx != mc->optimum_end_idx) {
258 /* Case 2: Return the next match/literal already found. */
259 match.len = mc->optimum[mc->optimum_cur_idx].next.link -
261 match.offset = mc->optimum[mc->optimum_cur_idx].next.match_offset;
263 mc->optimum_cur_idx = mc->optimum[mc->optimum_cur_idx].next.link;
267 /* Case 1: Compute a new list of matches/literals to return. */
269 mc->optimum_cur_idx = 0;
270 mc->optimum_end_idx = 0;
272 /* Get matches at this position. */
273 num_possible_matches = (*get_matches)(ctx,
277 /* If no matches found, return literal. */
278 if (num_possible_matches == 0)
279 return (struct raw_match){ .len = 0 };
281 /* The matches that were found are sorted in decreasing order by length.
282 * Get the length of the longest one. */
283 longest_match_len = possible_matches[0].len;
285 /* Greedy heuristic: if the longest match that was found is greater
286 * than the number of fast bytes, return it immediately; don't both
287 * doing more work. */
288 if (longest_match_len > mc->greedy_len) {
289 (*skip_bytes)(ctx, longest_match_len - 1);
290 return possible_matches[0];
293 /* Calculate the cost to reach the next position by outputting a
295 mc->optimum[0].state = *initial_state;
296 mc->optimum[1].state = mc->optimum[0].state;
297 mc->optimum[1].cost = (*get_prev_literal_cost)(ctx);
298 mc->optimum[1].prev.link = 0;
300 /* Calculate the cost to reach any position up to and including that
301 * reached by the longest match, using the shortest (i.e. closest) match
302 * that reaches each position. */
303 for (input_idx_t len = 2, match_idx = num_possible_matches - 1;
304 len <= longest_match_len; len++)
307 LZ_ASSERT(match_idx < num_possible_matches);
309 mc->optimum[len].state = mc->optimum[0].state;
310 mc->optimum[len].prev.link = 0;
311 mc->optimum[len].prev.match_offset = possible_matches[match_idx].offset;
312 mc->optimum[len].cost = (*get_match_cost)(ctx,
313 &mc->optimum[len].state,
315 possible_matches[match_idx].offset);
316 if (len == possible_matches[match_idx].len)
320 input_idx_t cur_pos = 0;
322 /* len_end: greatest index forward at which costs have been calculated
324 input_idx_t len_end = longest_match_len;
327 /* Advance to next position. */
330 if (cur_pos == len_end || cur_pos == mc->array_space)
331 return lz_match_chooser_reverse_list(mc, cur_pos);
333 /* retrieve the number of matches available at this position */
334 num_possible_matches = (*get_matches)(ctx,
335 &mc->optimum[cur_pos].state,
338 input_idx_t new_len = 0;
340 if (num_possible_matches != 0) {
341 new_len = possible_matches[0].len;
343 /* Greedy heuristic: if we found a match greater than
344 * the number of fast bytes, stop immediately. */
345 if (new_len > mc->greedy_len) {
347 /* Build the list of matches to return and get
349 match = lz_match_chooser_reverse_list(mc, cur_pos);
351 /* Append the long match to the end of the list. */
352 mc->optimum[cur_pos].next.match_offset =
353 possible_matches[0].offset;
354 mc->optimum[cur_pos].next.link = cur_pos + new_len;
355 mc->optimum_end_idx = cur_pos + new_len;
357 /* Skip over the remaining bytes of the long match. */
358 (*skip_bytes)(ctx, new_len - 1);
360 /* Return first match in the list */
365 /* Consider proceeding with a literal byte. */
366 lz_mc_cost_t cur_cost = mc->optimum[cur_pos].cost;
367 lz_mc_cost_t cur_plus_literal_cost = cur_cost +
368 (*get_prev_literal_cost)(ctx);
369 if (cur_plus_literal_cost < mc->optimum[cur_pos + 1].cost) {
370 mc->optimum[cur_pos + 1].cost = cur_plus_literal_cost;
371 mc->optimum[cur_pos + 1].prev.link = cur_pos;
372 mc->optimum[cur_pos + 1].state = mc->optimum[cur_pos].state;
375 if (num_possible_matches == 0)
378 /* Consider proceeding with a match. */
380 while (len_end < cur_pos + new_len)
381 mc->optimum[++len_end].cost = LZ_MC_INFINITE_COST;
383 for (input_idx_t len = 2, match_idx = num_possible_matches - 1;
384 len <= new_len; len++)
386 LZX_ASSERT(match_idx < num_possible_matches);
388 LZ_FORMAT_STATE state = mc->optimum[cur_pos].state;
391 cost = cur_cost + (*get_match_cost)(ctx,
394 possible_matches[match_idx].offset);
396 if (cost < mc->optimum[cur_pos + len].cost) {
397 mc->optimum[cur_pos + len].cost = cost;
398 mc->optimum[cur_pos + len].prev.link = cur_pos;
399 mc->optimum[cur_pos + len].prev.match_offset =
400 possible_matches[match_idx].offset;
401 mc->optimum[cur_pos + len].state = state;
404 if (len == possible_matches[match_idx].len)
410 #endif /* _LZ_OPTIMAL_H */