4 * Binary tree match-finder for Lempel-Ziv compression.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * Note: the binary tree search/update algorithm is based on LzFind.c from
34 * 7-Zip, which was written by Igor Pavlov and released into the public domain.
41 #include "wimlib/lz_extend.h"
42 #include "wimlib/lz_hash3.h"
43 #include "wimlib/lz_mf.h"
44 #include "wimlib/util.h"
48 /* log2 of the number of buckets in the hash table. This can be changed. */
49 #define LZ_BT_HASH_ORDER 16
51 #define LZ_BT_HASH_LEN (1 << LZ_BT_HASH_ORDER)
53 /* Number of entries in the digram table.
55 * Note: You rarely get length-2 matches if you use length-3 hashing. But
56 * since binary trees are typically used for higher compression ratios than hash
57 * chains, it is helpful for this match-finder to find length-2 matches as well.
58 * Therefore this match-finder also uses a digram table to find length-2 matches
59 * when the minimum match length is 2. */
60 #define LZ_BT_DIGRAM_TAB_LEN (256 * 256)
72 lz_bt_hash(const u8 *p)
74 return lz_hash(p, LZ_BT_HASH_ORDER);
78 lz_bt_set_default_params(struct lz_mf_params *params)
80 if (params->min_match_len == 0)
81 params->min_match_len = 2;
83 if (params->max_match_len == 0)
84 params->max_match_len = UINT32_MAX;
86 if (params->max_search_depth == 0)
87 params->max_search_depth = 50;
89 if (params->nice_match_len == 0)
90 params->nice_match_len = 24;
92 if (params->nice_match_len < params->min_match_len)
93 params->nice_match_len = params->min_match_len;
95 if (params->nice_match_len > params->max_match_len)
96 params->nice_match_len = params->max_match_len;
100 lz_bt_params_valid(const struct lz_mf_params *params)
106 lz_bt_get_needed_memory(u32 max_window_size)
110 len += LZ_BT_HASH_LEN; /* hash_tab */
111 len += LZ_BT_DIGRAM_TAB_LEN; /* digram_tab */
112 len += 2 * (u64)max_window_size; /* child_tab */
114 return len * sizeof(u32);
118 lz_bt_init(struct lz_mf *_mf)
120 struct lz_bt *mf = (struct lz_bt *)_mf;
121 struct lz_mf_params *params = &mf->base.params;
124 lz_bt_set_default_params(params);
126 /* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'. */
128 len += LZ_BT_HASH_LEN;
129 if (params->min_match_len == 2)
130 len += LZ_BT_DIGRAM_TAB_LEN;
131 len += 2 * params->max_window_size;
133 mf->hash_tab = MALLOC(len * sizeof(u32));
137 if (params->min_match_len == 2) {
138 mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN;
139 mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN;
141 mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN;
148 lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size)
150 struct lz_bt *mf = (struct lz_bt *)_mf;
153 /* Clear hash_tab and digram_tab.
154 * Note: child_tab need not be cleared. */
155 clear_len = LZ_BT_HASH_LEN;
157 clear_len += LZ_BT_DIGRAM_TAB_LEN;
158 memset(mf->hash_tab, 0, clear_len * sizeof(u32));
162 * Search the binary tree of the current hash code for matches. At the same
163 * time, update this tree to add the current position in the window.
166 * The window being searched.
168 * The current position in the window.
170 * Table of child pointers for the binary tree. The children of the node
171 * for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 +
172 * 1]. Zero is reserved for the 'null' value (no child). Consequently, we
173 * don't recognize matches beginning at position 0. In fact, the node for
174 * position 0 in the window will not be used at all, which is just as well
175 * because we use 0-based indices which don't work for position 0.
177 * The position in the window at which the binary tree for the current hash
178 * code is rooted. This can be 0, which indicates that the binary tree for
179 * the current hash code is empty.
181 * Ignore matches shorter than this length. This must be at least 1.
183 * Stop searching if a match of this length or longer is found. This must
184 * be less than or equal to @max_len.
186 * Maximum length of matches to return. This can be longer than @nice_len,
187 * in which case a match of length @nice_len will still cause the search to
188 * be terminated, but the match will be extended up to @max_len bytes
191 * Stop if we reach this depth in the binary tree.
193 * The array in which to produce the matches. The matches will be produced
194 * in order of increasing length and increasing offset. No more than one
195 * match shall have any given length, nor shall any match be shorter than
196 * @min_len, nor shall any match be longer than @max_len, nor shall any two
197 * matches have the same offset.
199 * Returns a pointer to the next free slot in @matches.
201 static struct lz_match *
202 do_search(const u8 window[restrict],
204 u32 child_tab[restrict],
209 const u32 max_search_depth,
210 struct lz_match *lz_matchptr)
213 * Here's my explanation of how this code actually works. Beware: this
214 * algorithm is a *lot* trickier than searching for matches via hash
215 * chains. But it can be significantly better, especially when doing
216 * "optimal" parsing, which is why it gets used, e.g. in LZMA as well as
219 * ---------------------------------------------------------------------
223 * Basically, there is not just one binary tree, but rather one binary
224 * tree per hash code. For a given hash code, the binary tree indexes
225 * previous positions in the window that have that same hash code. The
226 * key for each node is the "string", or byte sequence, beginning at the
227 * corresponding position in the window.
229 * Each tree maintains the invariant that if node C is a child of node
230 * P, then the window position represented by node C is smaller than
231 * ("left of") the window position represented by node P. Equivalently,
232 * while descending into a tree, the match distances ("offsets") from
233 * the current position are non-decreasing --- actually strictly
234 * increasing, because each node represents a unique position.
236 * In addition, not all previous positions sharing the same hash code
237 * will necessarily be represented in each binary tree; see the
238 * "Updating" section.
240 * ---------------------------------------------------------------------
244 * Suppose we want to search for LZ77-style matches with the string
245 * beginning at the current window position and extending for @max_len
246 * bytes. To do this, we can search for this string in the binary tree
247 * for this string's hash code. Each node visited during the search is
248 * a potential match. This method will find the matches efficiently
249 * because they will converge on the current string, due to the nature
250 * of the binary search.
252 * Naively, when visiting a node that represents a match of length N, we
253 * must compare N + 1 bytes in order to determine the length of that
254 * match and the lexicographic ordering of that match relative to the
255 * current string (which determines whether we need to step left or
256 * right into the next level of the tree, as per the standard binary
257 * tree search algorithm). However, as an optimization, we need not
258 * explicitly examine the full length of the match at each node. To see
259 * that this is true, suppose that we examine a node during the search,
260 * and we find that the corresponding match is less (alt. greater) than
261 * the current string. Then, because of how binary tree search
262 * operates, the match must be lexicographically greater (alt. lesser)
263 * than any ancestor node that corresponded to a match lexicographically
264 * lesser (alt. greater) than the current string. Therefore, the match
265 * must be at least as long as the match for any such ancestor node.
266 * Therefore, the lengths of lexicographically-lesser (alt. greater)
267 * matches must be non-decreasing as they are encountered by the tree
270 * Using this observation, we can maintain two variables, 'best_lt_len'
271 * and 'best_gt_len', that represent the length of the longest
272 * lexicographically lesser and greater, respectively, match that has
273 * been examined so far. Then, when examining a new match, we need
274 * only start comparing at the index min(best_lt_len, best_gt_len) byte.
275 * Note that we cannot know beforehand whether the match will be
276 * lexicographically lesser or greater, hence the need for taking the
277 * minimum of these two lengths.
279 * As noted earlier, as we descend into the tree, the potential matches
280 * will have strictly increasing offsets. To make things faster for
281 * higher-level parsing / match-choosing code, we do not want to return
282 * a shorter match that has a larger offset than a longer match. This
283 * is because a longer match can always be truncated to a shorter match
284 * if needed, and smaller offsets usually (depending on the compression
285 * format) take fewer bits to encode than larger offsets.
286 * Consequently, we keep a potential match only if it is longer than the
287 * previous longest match that has been found. This has the added
288 * advantage of producing the array of matches sorted by strictly
289 * increasing lengths as well as strictly decreasing offsets.
291 * In degenerate cases, the binary tree might become severely
292 * unbalanced. To prevent excessive running times, we stop immediately
293 * (and return any matches that happen to have been found so far) if the
294 * current depth exceeds @max_search_depth. Note that this cutoff can
295 * occur before the longest match has been found, which is usually bad
296 * for the compression ratio.
298 * ---------------------------------------------------------------------
302 * I've explained how to find matches by searching the binary tree of
303 * the current hash code. But how do we get the binary tree in the
304 * first place? Since the tree is built incrementally, the real
305 * question is how do we update the tree to "add" the current window
308 * The tree maintains the invariant that a node's parent always has a
309 * larger position (a.k.a. smaller match offset) than itself.
310 * Therefore, the root node must always have the largest position; and
311 * since the current position is larger than any previous position, the
312 * current position must become the root of the tree.
314 * A correct, but silly, approach is to simply add the previous root as
315 * a child of the new root, using either the left or right child pointer
316 * depending on the lexicographic ordering of the strings. This works,
317 * but it really just produces a linked list, so it's not sufficient.
319 * Instead, we can initially mark the new root's left child pointer as
320 * "pending (less than)" and its right child pointer as "pending
321 * (greater than)". Then, during the search, when we examine a match
322 * that is lexicographically less than the current string, we link the
323 * "pending (less than)" pointer to the node of that match, then set the
324 * right child pointer of *that* node as "pending (less than)".
325 * Similarly, when we examine a match that is lexicographically greater
326 * than the current string, we link the "pending (greater than)" pointer
327 * to the node of that match, then set the left child pointer of *that*
328 * node as "pending (greater than)".
330 * If the search terminates before the current string is found (up to a
331 * precision of @nice_len bytes), then we set "pending (less than)" and
332 * "pending (greater than)" to point to nothing. Alternatively, if the
333 * search terminates due to finding the current string (up to a
334 * precision of @nice_len bytes), then we set "pending (less than)" and
335 * "pending (greater than)" to point to the appropriate children of that
338 * Why does this work? Well, we can think of it this way: the "pending
339 * (less than)" pointer is reserved for the next match we find that is
340 * lexicographically *less than* the current string, and the "pending
341 * (greater than)" pointer is reserved for the next match we find that
342 * is lexicographically *greater than* the current string. This
343 * explains why when we find a match that is lexicographically less than
344 * the current string, we set the "pending (less than)" pointer to point
345 * to that match. And the reason we change "pending (less than)" to the
346 * right pointer of the match in that case is because we're walking down
347 * into that subtree, and the next match lexicographically *less than*
348 * the current string is guaranteed to be lexicographically *greater
349 * than* that match, so it should be set as the right subtree of that
350 * match. But the next match in that subtree that is lexicographically
351 * *greater than* the current string will need to be moved to the
352 * "pending (greater than)" pointer farther up the tree.
354 * It's complicated, but it should make sense if you think about it.
355 * The algorithm basically just moves subtrees into the correct
356 * locations as it walks down the tree for the search. But also, if the
357 * algorithm actually finds a match of length @nice_len with the current
358 * string, it no longer needs that match node and can discard it. The
359 * algorithm also will discard nodes if the search terminates due to the
360 * depth limit. For these reasons, the binary tree might not, in fact,
361 * contain all valid positions.
366 u32 best_len = min_len - 1;
367 u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0];
368 u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1];
369 const u8 * const strptr = &window[cur_pos];
370 u32 depth_remaining = max_search_depth;
376 if (cur_match == 0 || depth_remaining-- == 0) {
382 matchptr = &window[cur_match];
383 len = min(best_lt_len, best_gt_len);
385 if (matchptr[len] == strptr[len]) {
387 len = lz_extend(strptr, matchptr, len + 1, max_len);
389 if (len > best_len) {
392 *lz_matchptr++ = (struct lz_match) {
394 .offset = strptr - matchptr,
397 if (len >= nice_len) {
398 *pending_lt_ptr = child_tab[cur_match * 2 + 0];
399 *pending_gt_ptr = child_tab[cur_match * 2 + 1];
405 if (matchptr[len] < strptr[len]) {
406 *pending_lt_ptr = cur_match;
407 pending_lt_ptr = &child_tab[cur_match * 2 + 1];
408 cur_match = *pending_lt_ptr;
411 *pending_gt_ptr = cur_match;
412 pending_gt_ptr = &child_tab[cur_match * 2 + 0];
413 cur_match = *pending_gt_ptr;
420 lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[])
422 struct lz_bt *mf = (struct lz_bt *)_mf;
423 const u8 * const window = mf->base.cur_window;
424 const u32 cur_pos = mf->base.cur_window_pos++;
425 const u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
427 const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len);
428 const u32 nice_len = min(max_len, mf->base.params.nice_match_len);
431 struct lz_match *lz_matchptr = matches;
433 if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1))
436 if (mf->digram_tab) {
437 /* Search the digram table for a length 2 match. */
439 const u16 digram = mf->next_digram;
440 mf->next_digram = load_u16_unaligned(&window[cur_pos + 1]);
441 prefetch(&mf->digram_tab[mf->next_digram]);
442 cur_match = mf->digram_tab[digram];
443 mf->digram_tab[digram] = cur_pos;
445 /* We're only interested in matches of length exactly 2, since
446 * those won't be found during the binary tree search.
448 * Note: it's possible to extend this match as much as possible,
449 * then use its length plus 1 as min_len for the binary tree
450 * search. However I found this actually *reduced* performance
451 * slightly, evidently because the binary tree still needs to be
452 * searched/updated starting from the root in either case. */
453 if (cur_match != 0 && window[cur_match + 2] != window[cur_pos + 2]) {
454 *lz_matchptr++ = (struct lz_match) {
456 .offset = cur_pos - cur_match,
461 min_len = mf->base.params.min_match_len;
464 hash = mf->next_hash;
465 mf->next_hash = lz_bt_hash(&window[cur_pos + 1]);
466 prefetch(&mf->hash_tab[mf->next_hash]);
467 cur_match = mf->hash_tab[hash];
468 mf->hash_tab[hash] = cur_pos;
470 /* Search the binary tree of 'hash' for matches while re-rooting it at
471 * the current position. */
472 lz_matchptr = do_search(window,
479 mf->base.params.max_search_depth,
482 /* Return the number of matches found. */
483 return lz_matchptr - matches;
486 /* This is very similar to do_search(), but it does not save any matches.
487 * See do_search() for explanatory comments. */
489 do_skip(const u8 window[restrict],
491 u32 child_tab[restrict],
494 const u32 max_search_depth)
498 u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0];
499 u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1];
500 const u8 * const strptr = &window[cur_pos];
501 u32 depth_remaining = max_search_depth;
506 if (cur_match == 0 || depth_remaining-- == 0) {
512 matchptr = &window[cur_match];
513 len = min(best_lt_len, best_gt_len);
515 if (matchptr[len] == strptr[len]) {
516 len = lz_extend(strptr, matchptr, len + 1, nice_len);
517 if (len == nice_len) {
518 *pending_lt_ptr = child_tab[cur_match * 2 + 0];
519 *pending_gt_ptr = child_tab[cur_match * 2 + 1];
523 if (matchptr[len] < strptr[len]) {
524 *pending_lt_ptr = cur_match;
525 pending_lt_ptr = &child_tab[cur_match * 2 + 1];
526 cur_match = *pending_lt_ptr;
529 *pending_gt_ptr = cur_match;
530 pending_gt_ptr = &child_tab[cur_match * 2 + 0];
531 cur_match = *pending_gt_ptr;
538 lz_bt_skip_positions(struct lz_mf *_mf, u32 n)
540 struct lz_bt *mf = (struct lz_bt *)_mf;
541 const u8 * const window = mf->base.cur_window;
542 u32 cur_pos = mf->base.cur_window_pos;
543 u32 end_pos = cur_pos + n;
544 u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
551 mf->base.cur_window_pos = end_pos;
553 if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) {
554 /* Nearing end of window. */
555 if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1)))
558 end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1;
561 next_hash = mf->next_hash;
562 next_digram = mf->next_digram;
564 if (mf->digram_tab) {
565 digram = next_digram;
566 next_digram = load_u16_unaligned(&window[cur_pos + 1]);
567 mf->digram_tab[digram] = cur_pos;
571 next_hash = lz_bt_hash(&window[cur_pos + 1]);
572 cur_match = mf->hash_tab[hash];
573 mf->hash_tab[hash] = cur_pos;
575 /* Update the binary tree for the appropriate hash code. */
580 min(bytes_remaining, mf->base.params.nice_match_len),
581 mf->base.params.max_search_depth);
584 } while (++cur_pos != end_pos);
586 if (mf->digram_tab) {
587 prefetch(&mf->digram_tab[next_digram]);
588 mf->next_digram = next_digram;
591 prefetch(&mf->hash_tab[next_hash]);
592 mf->next_hash = next_hash;
596 lz_bt_destroy(struct lz_mf *_mf)
598 struct lz_bt *mf = (struct lz_bt *)_mf;
601 /* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */
604 const struct lz_mf_ops lz_binary_trees_ops = {
605 .params_valid = lz_bt_params_valid,
606 .get_needed_memory = lz_bt_get_needed_memory,
608 .load_window = lz_bt_load_window,
609 .get_matches = lz_bt_get_matches,
610 .skip_positions = lz_bt_skip_positions,
611 .destroy = lz_bt_destroy,
612 .struct_size = sizeof(struct lz_bt),