4 * Binary tree match-finder for Lempel-Ziv compression.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * Note: the binary tree search/update algorithm is based on code from the
34 * public domain LZMA SDK (authors: Igor Pavlov, Lasse Collin).
#include "wimlib/lz_mf.h"
#include "wimlib/util.h"

#include <pthread.h>
#include <string.h>
/* Number of hash buckets.  This can be changed, but it should be a power of 2
 * so that the correct hash bucket can be selected using a fast bitwise AND.  */
#define LZ_BT_HASH_LEN		(1 << 16)

/* Number of bytes from which the hash code is computed at each position.  This
 * can be changed, provided that lz_bt_hash() is updated as well.  */
#define LZ_BT_HASH_BYTES	3

/* Number of entries in the digram table.
 *
 * Note: You rarely get length-2 matches if you use length-3 hashing.  But
 * since binary trees are typically used for higher compression ratios than
 * hash chains, it is helpful for this match-finder to find length-2 matches as
 * well.  Therefore this match-finder also uses a digram table to find length-2
 * matches when the minimum match length is 2.  */
#define LZ_BT_DIGRAM_TAB_LEN	(256 * 256)
/* CRC32 table used by lz_bt_hash(), plus a pthread_once_t so the table is
 * initialized exactly once even if multiple threads create match-finders
 * concurrently.  */
static u32 crc32_table[256];
static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT;

/* Fill in crc32_table: one entry per byte value, using the standard reflected
 * CRC-32 polynomial 0xEDB88320.  The polynomial must be XOR'ed in only when
 * the low bit is set; otherwise the register is simply shifted.  */
static void
crc32_init(void)
{
	for (u32 b = 0; b < 256; b++) {
		u32 r = b;

		for (int i = 0; i < 8; i++) {
			if (r & 1)
				r = (r >> 1) ^ 0xEDB88320;
			else
				r >>= 1;
		}
		crc32_table[b] = r;
	}
}

/* This hash function is taken from the LZMA SDK.  It seems to work well.
 * Hashes the LZ_BT_HASH_BYTES (3) bytes beginning at @p into a bucket index
 * in [0, LZ_BT_HASH_LEN).  All three bytes must contribute, otherwise
 * positions differing only in the second byte collide systematically.
 *
 * TODO: Maybe use the SSE4.2 CRC32 instruction when available?  */
static inline u32
lz_bt_hash(const u8 *p)
{
	u32 hash = 0;

	hash ^= crc32_table[p[0]];
	hash ^= p[1];
	hash ^= (u32)p[2] << 8;

	return hash % LZ_BT_HASH_LEN;
}
/* Fill in any zero-valued (i.e. unspecified) parameters with defaults, then
 * clamp nice_match_len into [min_match_len, max_match_len].  */
static void
lz_bt_set_default_params(struct lz_mf_params *params)
{
	if (params->min_match_len == 0)
		params->min_match_len = 2;

	if (params->max_match_len == 0)
		params->max_match_len = params->max_window_size;

	if (params->max_search_depth == 0)
		params->max_search_depth = 50;

	if (params->nice_match_len == 0)
		params->nice_match_len = 24;

	if (params->nice_match_len < params->min_match_len)
		params->nice_match_len = params->min_match_len;

	if (params->nice_match_len > params->max_match_len)
		params->nice_match_len = params->max_match_len;
}
/* Validate the (already-defaulted) parameters.
 *
 * NOTE(review): the original body is not visible in this extraction.  This
 * match-finder imposes no restrictions beyond the generic lz_mf ones, so all
 * parameter sets are accepted here — confirm against upstream.  */
static bool
lz_bt_params_valid(const struct lz_mf_params *params)
{
	return true;
}
/* Return the maximum number of bytes of auxiliary memory this match-finder
 * needs for a window of @max_window_size bytes.  This is the worst case
 * (min_match_len == 2, so the digram table is allocated too).  */
static u64
lz_bt_get_needed_memory(u32 max_window_size)
{
	u64 len = 0;

	len += LZ_BT_HASH_LEN;			/* hash_tab */
	len += LZ_BT_DIGRAM_TAB_LEN;		/* digram_tab */
	len += 2 * (u64)max_window_size;	/* child_tab */

	return len * sizeof(u32);
}
145 lz_bt_init(struct lz_mf *_mf)
147 struct lz_bt *mf = (struct lz_bt *)_mf;
148 struct lz_mf_params *params = &mf->base.params;
151 lz_bt_set_default_params(params);
153 /* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'. */
155 len += LZ_BT_HASH_LEN;
156 if (params->min_match_len == 2)
157 len += LZ_BT_DIGRAM_TAB_LEN;
158 len += 2 * params->max_window_size;
160 mf->hash_tab = MALLOC(len * sizeof(u32));
164 if (params->min_match_len == 2) {
165 mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN;
166 mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN;
168 mf->digram_tab = NULL;
169 mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN;
172 /* Fill in the CRC32 table if not done already. */
173 pthread_once(&crc32_table_filled, crc32_init);
179 lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size)
181 struct lz_bt *mf = (struct lz_bt *)_mf;
184 /* Clear hash_tab and digram_tab.
185 * Note: child_tab need not be cleared. */
186 clear_len = LZ_BT_HASH_LEN;
188 clear_len += LZ_BT_DIGRAM_TAB_LEN;
189 memset(mf->hash_tab, 0, clear_len * sizeof(u32));
191 if (size >= LZ_BT_HASH_BYTES)
192 mf->next_hash = lz_bt_hash(window);
/*
 * do_search() - Search the binary tree of the current hash code for matches,
 * while simultaneously re-rooting that tree at the current position.
 *
 * @window
 *	The window being searched.
 * @cur_window_pos
 *	The current position in the window.
 * @child_tab
 *	Table of child pointers for the binary trees.  The children of the
 *	node for position 'i' in the window are child_tab[i * 2] and
 *	child_tab[i * 2 + 1].  Zero is reserved for the 'null' value (no
 *	child).  Consequently, we don't recognize matches beginning at
 *	position 0; the node for position 0 is never used.
 * @cur_match
 *	The position in the window at which the binary tree for the current
 *	hash code is rooted, or 0 if that tree is empty.
 * @min_len
 *	Ignore matches shorter than this length.  Must be at least 1.
 * @max_len
 *	Don't produce any matches longer than this length.  If we find a match
 *	this long, terminate the search and return.
 * @max_search_depth
 *	Stop if we reach this depth in the binary tree.
 * @matches
 *	The array in which to produce the matches.  The matches will be
 *	produced in order of increasing length and increasing offset; no two
 *	matches share a length or an offset, and all lengths are in
 *	[@min_len, @max_len].
 *
 * Returns the number of matches found and written to @matches.
 *
 * Algorithm notes (based on the binary-tree match-finder in the LZMA SDK):
 *
 * - There is one binary tree per hash code.  Each node's key is the string
 *   beginning at that node's window position.  A child always has a smaller
 *   window position than its parent, so match offsets strictly increase as
 *   the tree is descended.
 *
 * - We need not compare from byte 0 at every node: a match found to be
 *   lexicographically lesser (resp. greater) than the current string is at
 *   least as long as every previously examined lesser (resp. greater) match.
 *   Hence comparison can start at min(longest_lt_match_len,
 *   longest_gt_match_len).
 *
 * - A match is kept only if it is longer than the longest match found so
 *   far; shorter-but-farther matches are useless to the parser.
 *
 * - Re-rooting: the current position becomes the new root.  Its left/right
 *   child pointers act as "pending (less than)" / "pending (greater than)"
 *   slots.  When a lexicographically lesser node is visited, it is linked
 *   into the pending-lt slot and its *right* child pointer becomes the new
 *   pending-lt slot (symmetrically for greater nodes).  If the search ends by
 *   finding the current string to @max_len precision, the pending slots take
 *   that node's children and the node is discarded; if it ends at a null
 *   child or the depth limit, the pending slots are zeroed, discarding any
 *   unvisited subtree.  So the trees need not contain every prior position.
 */
static u32
do_search(const u8 window[restrict],
	  const u32 cur_window_pos,
	  u32 child_tab[restrict],
	  u32 cur_match,
	  const u32 min_len,
	  const u32 max_len,
	  const u32 max_search_depth,
	  struct lz_match matches[restrict])
{
	u32 num_matches = 0;
	u32 longest_lt_match_len = 0;
	u32 longest_gt_match_len = 0;
	u32 longest_match_len = min_len - 1;
	u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
	u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
	const u8 *strptr = &window[cur_window_pos];
	u32 depth_remaining = max_search_depth;

	for (;;) {
		const u8 *matchptr;
		u32 len;

		if (depth_remaining-- == 0 || cur_match == 0) {
			/* Tree exhausted or depth limit hit: discard any
			 * unvisited subtrees.  */
			*pending_lt_ptr = 0;
			*pending_gt_ptr = 0;
			return num_matches;
		}

		matchptr = &window[cur_match];

		/* Start comparing at the already-proven common prefix.  */
		len = min(longest_lt_match_len, longest_gt_match_len);

		if (matchptr[len] == strptr[len]) {

			/* Extend the match as far as possible, up to
			 * @max_len bytes.  */
			if (++len != max_len && matchptr[len] == strptr[len])
				while (++len != max_len)
					if (matchptr[len] != strptr[len])
						break;

			/* Record the match only if it beats the longest
			 * found so far.  */
			if (len > longest_match_len) {
				longest_match_len = len;

				matches[num_matches++] = (struct lz_match) {
					.len = len,
					.offset = strptr - matchptr,
				};
			}

			if (len == max_len) {
				/* Current string found to @max_len precision:
				 * replace this node with its children.  */
				*pending_lt_ptr = child_tab[cur_match * 2 + 0];
				*pending_gt_ptr = child_tab[cur_match * 2 + 1];
				return num_matches;
			}
		}

		if (matchptr[len] < strptr[len]) {
			/* Lexicographically lesser: fill the pending-lt slot,
			 * then descend into the right subtree.  */
			*pending_lt_ptr = cur_match;
			pending_lt_ptr = &child_tab[cur_match * 2 + 1];
			cur_match = *pending_lt_ptr;
			longest_lt_match_len = len;
		} else {
			/* Lexicographically greater: fill the pending-gt
			 * slot, then descend into the left subtree.  */
			*pending_gt_ptr = cur_match;
			pending_gt_ptr = &child_tab[cur_match * 2 + 0];
			cur_match = *pending_gt_ptr;
			longest_gt_match_len = len;
		}
	}
}
452 lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[])
454 struct lz_bt *mf = (struct lz_bt *)_mf;
455 const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
461 if (unlikely(bytes_remaining <= mf->base.params.min_match_len + 1))
464 if (mf->digram_tab) {
465 /* Search the digram table for a length 2 match. */
467 const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base);
468 cur_match = mf->digram_tab[digram];
469 mf->digram_tab[digram] = mf->base.cur_window_pos;
471 /* We're only interested in matches of length exactly 2, since
472 * those won't be found during the binary tree search.
474 * Note: it's possible to extend this match as much as possible,
475 * then use its length plus 1 as min_len for the binary tree
476 * search. However I found this actually *reduced* performance
477 * slightly, evidently because the binary tree still needs to be
478 * searched/updated starting from the root in either case. */
479 if (cur_match != 0 &&
480 (mf->base.cur_window[cur_match + 2] !=
481 mf->base.cur_window[mf->base.cur_window_pos + 2]))
483 matches[num_matches++] = (struct lz_match) {
485 .offset = mf->base.cur_window_pos - cur_match,
490 min_len = mf->base.params.min_match_len;
493 hash = mf->next_hash;
494 mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1);
495 prefetch(&mf->hash_tab[mf->next_hash]);
496 cur_match = mf->hash_tab[hash];
497 mf->hash_tab[hash] = mf->base.cur_window_pos;
499 /* Search the binary tree of 'hash' for matches while re-rooting it at
500 * the current position. */
501 num_matches += do_search(mf->base.cur_window,
502 mf->base.cur_window_pos,
506 min(bytes_remaining, mf->base.params.nice_match_len),
507 mf->base.params.max_search_depth,
508 &matches[num_matches]);
510 /* If the longest match is @nice_match_len in length, it may have been
511 * truncated. Try extending it up to the maximum match length. */
512 if (num_matches != 0 &&
513 matches[num_matches - 1].len == mf->base.params.nice_match_len)
515 const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
516 const u8 * const matchptr = strptr - matches[num_matches - 1].offset;
517 const u32 len_limit = min(bytes_remaining, mf->base.params.max_match_len);
520 len = matches[num_matches - 1].len;
521 while (len < len_limit && strptr[len] == matchptr[len])
523 matches[num_matches - 1].len = len;
527 /* Advance to the next position. */
528 mf->base.cur_window_pos++;
530 /* Return the number of matches found. */
/* This is the same as do_search(), but it does not save any matches.
 * See do_search() for explanatory comments.  */
static void
do_skip(const u8 window[restrict],
	const u32 cur_window_pos,
	u32 child_tab[restrict],
	u32 cur_match,
	const u32 max_len,
	const u32 max_search_depth)
{
	u32 longest_lt_match_len = 0;
	u32 longest_gt_match_len = 0;
	u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
	u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
	const u8 * const strptr = &window[cur_window_pos];
	u32 depth_remaining = max_search_depth;

	for (;;) {
		const u8 *matchptr;
		u32 len;

		if (depth_remaining-- == 0 || cur_match == 0) {
			*pending_lt_ptr = 0;
			*pending_gt_ptr = 0;
			return;
		}

		matchptr = &window[cur_match];
		len = min(longest_lt_match_len, longest_gt_match_len);

		if (matchptr[len] == strptr[len]) {
			do {
				if (++len == max_len) {
					*pending_lt_ptr = child_tab[cur_match * 2 + 0];
					*pending_gt_ptr = child_tab[cur_match * 2 + 1];
					return;
				}
			} while (matchptr[len] == strptr[len]);
		}

		if (matchptr[len] < strptr[len]) {
			*pending_lt_ptr = cur_match;
			pending_lt_ptr = &child_tab[cur_match * 2 + 1];
			cur_match = *pending_lt_ptr;
			longest_lt_match_len = len;
		} else {
			*pending_gt_ptr = cur_match;
			pending_gt_ptr = &child_tab[cur_match * 2 + 0];
			cur_match = *pending_gt_ptr;
			longest_gt_match_len = len;
		}
	}
}
587 lz_bt_skip_position(struct lz_bt *mf)
589 const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
593 if (unlikely(bytes_remaining <= mf->base.params.min_match_len + 1))
596 /* Update the digram table. */
597 if (mf->digram_tab) {
598 const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base);
599 mf->digram_tab[digram] = mf->base.cur_window_pos;
602 /* Update the hash table. */
603 hash = mf->next_hash;
604 mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1);
605 prefetch(&mf->hash_tab[mf->next_hash]);
606 cur_match = mf->hash_tab[hash];
607 mf->hash_tab[hash] = mf->base.cur_window_pos;
609 /* Update the binary tree for the appropriate hash code. */
610 do_skip(mf->base.cur_window,
611 mf->base.cur_window_pos,
614 min(bytes_remaining, mf->base.params.nice_match_len),
615 mf->base.params.max_search_depth);
618 /* Advance to the next position. */
619 mf->base.cur_window_pos++;
623 lz_bt_skip_positions(struct lz_mf *_mf, u32 n)
625 struct lz_bt *mf = (struct lz_bt *)_mf;
628 lz_bt_skip_position(mf);
633 lz_bt_destroy(struct lz_mf *_mf)
635 struct lz_bt *mf = (struct lz_bt *)_mf;
638 /* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */
641 const struct lz_mf_ops lz_binary_trees_ops = {
642 .params_valid = lz_bt_params_valid,
643 .get_needed_memory = lz_bt_get_needed_memory,
645 .load_window = lz_bt_load_window,
646 .get_matches = lz_bt_get_matches,
647 .skip_positions = lz_bt_skip_positions,
648 .destroy = lz_bt_destroy,
649 .struct_size = sizeof(struct lz_bt),