4 * Binary tree match-finder for Lempel-Ziv compression.
9 * The author dedicates this file to the public domain.
10 * You can do whatever you want with this file.
14 * Note: the binary tree search/update algorithm is based on code from the
15 * public domain LZMA SDK (authors: Igor Pavlov, Lasse Collin).
22 #include "wimlib/lz.h"
23 #include "wimlib/lz_bt.h"
24 #include "wimlib/util.h"
28 #define LZ_BT_HASH_BITS 16
29 #define LZ_BT_HASH_SIZE (1 << LZ_BT_HASH_BITS)
30 #define LZ_BT_HASH_MASK (LZ_BT_HASH_SIZE - 1)
31 #define LZ_BT_DIGRAM_TAB_SIZE (256 * 256)
33 static u32 crc32_table[256];
34 static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT;
39 for (u32 b = 0; b < 256; b++) {
41 for (int i = 0; i < 8; i++) {
43 r = (r >> 1) ^ 0xEDB88320;
52 * Compute the hash code for the next 3-byte sequence in the window.
55 * A pointer to the next 3-byte sequence in the window.
57 * Returns the resulting hash code.
60 lz_bt_hash(const u8 *p)
64 hash ^= crc32_table[p[0]];
66 hash ^= (u32)p[2] << 8;
68 return hash & LZ_BT_HASH_MASK;
72 * Compute the number of bytes of memory that would be needed to initialize a
73 * binary tree match-finder with the specified maximum window size.
76 * The maximum window size, in bytes, to query.
78 * Returns the number of bytes that would be allocated by lz_bt_init(),
79 * excluding the size of the 'struct lz_bt' itself.
82 lz_bt_get_needed_memory(lz_bt_pos_t max_window_size)
86 len = LZ_BT_HASH_SIZE + LZ_BT_DIGRAM_TAB_SIZE;
87 len += 2 * (u64)max_window_size;
89 return len * sizeof(lz_bt_pos_t);
93 * Initialize a binary tree match-finder.
96 * The match-finder structure to initialize.
98 * The maximum window size that shall be supported by subsequent calls to
99 * lz_bt_load_window().
101 * The minimum length of matches that shall be produced by subsequent calls
102 * to lz_bt_get_matches(). This must be at least 2.
104 * The maximum length of matches that shall be produced by subsequent calls
105 * to lz_bt_get_matches(). This must be at least @min_match_len.
107 * The maximum length of matches that shall be produced just using the
108 * binary tree search algorithm. If the longest match has this length,
109 * then lz_bt_get_matches() will extend it up to @max_match_len. This must
110 * be at least @min_match_len and no more than @max_match_len.
112 * The maximum depth to descend into the binary search tree before halting
115 * Returns %true if successful; %false if out of memory.
118 lz_bt_init(struct lz_bt *mf,
119 lz_bt_pos_t max_window_size,
120 lz_bt_len_t min_match_len,
121 lz_bt_len_t max_match_len,
122 lz_bt_len_t num_fast_bytes,
123 u32 max_search_depth)
127 /* Check and set parameters. */
128 LZ_ASSERT(min_match_len >= 2);
129 LZ_ASSERT(max_match_len >= min_match_len);
130 LZ_ASSERT(num_fast_bytes >= min_match_len);
131 LZ_ASSERT(num_fast_bytes <= max_match_len);
133 mf->max_window_size = max_window_size;
134 mf->min_match_len = min_match_len;
135 mf->max_match_len = max_match_len;
136 mf->num_fast_bytes = num_fast_bytes;
137 mf->max_search_depth = max_search_depth;
139 /* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'. */
140 len = LZ_BT_HASH_SIZE + (2 * (u64)max_window_size);
141 if (mf->min_match_len <= 2)
142 len += LZ_BT_DIGRAM_TAB_SIZE;
143 len *= sizeof(lz_bt_pos_t);
144 if ((size_t)len != len || !(mf->hash_tab = MALLOC(len)))
146 if (mf->min_match_len <= 2) {
147 mf->digram_tab = mf->hash_tab + LZ_BT_HASH_SIZE;
148 mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_SIZE;
150 mf->child_tab = mf->hash_tab + LZ_BT_HASH_SIZE;
153 /* Fill in the CRC32 table if not done already. */
154 pthread_once(&crc32_table_filled, crc32_init);
160 * Destroy a binary tree match-finder.
163 * The match-finder structure to destroy.
166 lz_bt_destroy(struct lz_bt *mf)
169 /* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */
173 * Load a window into a binary tree match-finder.
176 * The match-finder structure into which to load the window.
178 * Pointer to the window to load. This memory must remain available,
179 * unmodified, while the match-finder is being used.
181 * The size of the window, in bytes. This can't be larger than the
182 * @max_window_size with which lz_bt_init() was called.
185 lz_bt_load_window(struct lz_bt *mf, const u8 *window, lz_bt_pos_t window_size)
187 LZ_ASSERT(window_size <= mf->max_window_size);
190 mf->cur_window = window;
191 mf->cur_window_pos = 0;
192 mf->cur_window_size = window_size;
194 /* Clear the hash and digram tables.
195 * Note: The child table need not be cleared. */
196 clear_len = LZ_BT_HASH_SIZE;
197 if (mf->min_match_len <= 2)
198 clear_len += LZ_BT_DIGRAM_TAB_SIZE;
199 memset(mf->hash_tab, 0, clear_len * sizeof(lz_bt_pos_t));
203 * Search the binary tree of the current hash code for matches. At the same
204 * time, update this tree to add the current position in the window.
207 * The window being searched.
209 * The current position in the window.
211 * Ignore matches shorter than this length. This must be at least 1.
213 * Don't produce any matches longer than this length. If we find a match
214 * this long, terminate the search and return.
216 * Stop if we reach this depth in the binary tree.
218 * Table of child pointers for the binary tree. The children of the node
219 * for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 +
220 * 1]. Zero is reserved for the 'null' value (no child). Consequently, we
221 * don't recognize matches beginning at position 0. In fact, the node for
222 * position 0 in the window will not be used at all, which is just as well
223 * because we use 0-based indices which don't work for position 0.
225 * The position in the window at which the binary tree for the current hash
226 * code is rooted. This can be 0, which indicates that the binary tree for
227 * the current hash code is empty.
229 * The array in which to produce the matches. The matches will be produced
230 * in order of increasing length and increasing offset. No more than one
231 * match shall have any given length, nor shall any match be shorter than
232 * @min_len, nor shall any match be longer than @max_len, nor shall any two
233 * matches have the same offset.
235 * Returns the number of matches found and written to @matches.
238 do_search(const u8 window[restrict],
239 const lz_bt_pos_t cur_window_pos,
240 const lz_bt_len_t min_len,
241 const lz_bt_len_t max_len,
243 lz_bt_pos_t child_tab[restrict],
244 lz_bt_pos_t cur_match,
245 struct raw_match matches[restrict])
248 * Here's my explanation of how this code actually works. Beware: this
249 * algorithm is a *lot* trickier than searching for matches via hash
250 * chains. But it can be significantly better, especially when doing
251 * "optimal" parsing, which is why it gets used, e.g. in LZMA as well as
254 * ---------------------------------------------------------------------
258 * Basically, there is not just one binary tree, but rather one binary
259 * tree per hash code. For a given hash code, the binary tree indexes
260 * previous positions in the window that have that same hash code. The
261 * key for each node is the "string", or byte sequence, beginning at the
262 * corresponding position in the window.
264 * Each tree maintains the invariant that if node C is a child of node
265 * P, then the window position represented by node C is smaller than
266 * ("left of") the window position represented by node P. Equivalently,
267 * while descending into a tree, the match distances ("offsets") from
268 * the current position are non-decreasing --- actually strictly
269 * increasing, because each node represents a unique position.
271 * In addition, not all previous positions sharing the same hash code
272 * will necessarily be represented in each binary tree; see the
273 * "Updating" section.
275 * ---------------------------------------------------------------------
279 * Suppose we want to search for LZ77-style matches with the string
280 * beginning at the current window position and extending for @max_len
281 * bytes. To do this, we can search for this string in the binary tree
282 * for this string's hash code. Each node visited during the search is
283 * a potential match. This method will find the matches efficiently
284 * because they will converge on the current string, due to the nature
285 * of the binary search.
287 * Naively, when visiting a node that represents a match of length N, we
288 * must compare N + 1 bytes in order to determine the length of that
289 * match and the lexicographic ordering of that match relative to the
290 * current string (which determines whether we need to step left or
291 * right into the next level of the tree, as per the standard binary
292 * tree search algorithm). However, as an optimization, we need not
293 * explicitly examine the full length of the match at each node. To see
294 * that this is true, suppose that we examine a node during the search,
295 * and we find that the corresponding match is less (alt. greater) than
296 * the current string. Then, because of how binary tree search
297 * operates, the match must be lexicographically greater (alt. lesser)
298 * than any ancestor node that corresponded to a match lexicographically
299 * lesser (alt. greater) than the current string. Therefore, the match
300 * must be at least as long as the match for any such ancestor node.
301 * Therefore, the lengths of lexicographically-lesser (alt. greater)
302 * matches must be non-decreasing as they are encountered by the tree
305 * Using this observation, we can maintain two variables,
306 * 'longest_lt_match_len' and 'longest_gt_match_len', that represent the
307 * length of the longest lexicographically lesser and greater,
308 * respectively, match that has been examined so far. Then, when
309 * examining a new match, we need only start comparing at the index
310 * min(longest_lt_match_len, longest_gt_match_len) byte. Note that we
311 * cannot know beforehand whether the match will be lexicographically
312 * lesser or greater, hence the need for taking the minimum of these two
315 * As noted earlier, as we descend into the tree, the potential matches
316 * will have strictly increasing offsets. To make things faster for
317 * higher-level parsing / match-choosing code, we do not want to return
318 * a shorter match that has a larger offset than a longer match. This
319 * is because a longer match can always be truncated to a shorter match
320 * if needed, and smaller offsets usually (depending on the compression
321 * format) take fewer bits to encode than larger offsets.
322 * Consequently, we keep a potential match only if it is longer than the
323 * previous longest match that has been found. This has the added
324 * advantage of producing the array of matches sorted by strictly
325 * increasing lengths as well as strictly decreasing offsets.
327 * In degenerate cases, the binary tree might become severely
328 * unbalanced. To prevent excessive running times, we stop immediately
329 * (and return any matches that happen to have been found so far) if the
330 * current depth exceeds @max_depth. Note that this cutoff can occur
331 * before the longest match has been found, which is usually bad for the
334 * ---------------------------------------------------------------------
338 * I've explained how to find matches by searching the binary tree of
339 * the current hash code. But how do we get the binary tree in the
340 * first place? Since the tree is built incrementally, the real
341 * question is how do we update the tree to "add" the current window
344 * The tree maintains the invariant that a node's parent always has a
345 * larger position (a.k.a. smaller match offset) than itself.
346 * Therefore, the root node must always have the largest position; and
347 * since the current position is larger than any previous position, the
348 * current position must become the root of the tree.
350 * A correct, but silly, approach is to simply add the previous root as
351 * a child of the new root, using either the left or right child pointer
352 * depending on the lexicographic ordering of the strings. This works,
353 * but it really just produces a linked list, so it's not sufficient.
355 * Instead, we can initially mark the new root's left child pointer as
356 * "pending (less than)" and its right child pointer as "pending
357 * (greater than)". Then, during the search, when we examine a match
358 * that is lexicographically less than the current string, we link the
359 * "pending (less than)" pointer to the node of that match, then set the
360 * right child pointer of *that* node as "pending (less than)".
361 * Similarly, when we examine a match that is lexicographically greater
362 * than the current string, we link the "pending (greater than)" pointer
363 * to the node of that match, then set the left child pointer of *that*
364 * node as "pending (greater than)".
366 * If the search terminates before the current string is found (up to a
367 * precision of @max_len bytes), then we set "pending (less than)" and
368 * "pending (greater than)" to point to nothing. Alternatively, if the
369 * search terminates due to finding the current string (up to a
370 * precision of @max_len bytes), then we set "pending (less than)" and
371 * "pending (greater than)" to point to the appropriate children of that
374 * Why does this work? Well, we can think of it this way: the "pending
375 * (less than)" pointer is reserved for the next match we find that is
376 * lexicographically *less than* the current string, and the "pending
377 * (greater than)" pointer is reserved for the next match we find that
378 * is lexicographically *greater than* the current string. This
379 * explains why when we find a match that is lexicographically less than
380 * the current string, we set the "pending (less than)" pointer to point
381 * to that match. And the reason we change "pending (less than)" to the
382 * right pointer of the match in that case is because we're walking down
383 * into that subtree, and the next match lexicographically *less than*
384 * the current string is guaranteed to be lexicographically *greater
385 * than* that match, so it should be set as the right subtree of that
386 * match. But the next match in that subtree that is lexicographically
387 * *greater than* the current string will need to be moved to the
388 * "pending (greater than)" pointer farther up the tree.
390 * It's complicated, but it should make sense if you think about it.
391 * The algorithm basically just moves subtrees into the correct
392 * locations as it walks down the tree for the search. But also, if the
393 * algorithm actually finds a match of length @max_len with the current
394 * string, it no longer needs that match node and can discard it. The
395 * algorithm also will discard nodes if the search terminates due to the
396 * depth limit. For these reasons, the binary tree might not, in fact,
397 * contain all valid positions.
400 lz_bt_len_t num_matches = 0;
401 lz_bt_len_t longest_lt_match_len = 0;
402 lz_bt_len_t longest_gt_match_len = 0;
403 lz_bt_len_t longest_match_len = min_len - 1;
404 lz_bt_pos_t *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
405 lz_bt_pos_t *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
406 const u8 *strptr = &window[cur_window_pos];
407 u32 depth_remaining = max_depth;
412 if (depth_remaining-- == 0 || cur_match == 0) {
418 matchptr = &window[cur_match];
419 len = min(longest_lt_match_len, longest_gt_match_len);
421 if (matchptr[len] == strptr[len]) {
423 while (++len != max_len)
424 if (matchptr[len] != strptr[len])
427 if (len > longest_match_len) {
428 longest_match_len = len;
430 matches[num_matches++] = (struct raw_match) {
432 .offset = cur_window_pos - cur_match,
435 if (len == max_len) {
436 *pending_lt_ptr = child_tab[cur_match * 2 + 0];
437 *pending_gt_ptr = child_tab[cur_match * 2 + 1];
443 if (matchptr[len] < strptr[len]) {
444 *pending_lt_ptr = cur_match;
445 pending_lt_ptr = &child_tab[cur_match * 2 + 1];
446 cur_match = *pending_lt_ptr;
447 longest_lt_match_len = len;
449 *pending_gt_ptr = cur_match;
450 pending_gt_ptr = &child_tab[cur_match * 2 + 0];
451 cur_match = *pending_gt_ptr;
452 longest_gt_match_len = len;
458 * Retrieve a list of matches at the next position in the window.
461 * The binary tree match-finder structure into which a window has been
462 * loaded using lz_bt_load_window().
464 * The array into which the matches will be returned. The length of this
465 * array must be at least (@mf->num_fast_bytes - @mf->min_match_len + 1).
467 * The return value is the number of matches that were found and stored in the
468 * 'matches' array. The matches will be ordered by strictly increasing length
469 * and strictly increasing offset. No match shall have length less than
470 * @min_match_len, and no match shall have length greater than @max_match_len.
471 * The return value may be 0, which indicates that no matches were found.
473 * On completion, the binary tree match-finder is advanced to the next position
477 lz_bt_get_matches(struct lz_bt *mf, struct raw_match matches[])
479 lz_bt_pos_t bytes_remaining;
480 lz_bt_len_t num_matches;
481 lz_bt_pos_t cur_match;
484 LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size);
486 bytes_remaining = lz_bt_get_remaining_size(mf);
488 /* If there are fewer than 3 bytes remaining, we can't even compute a
489 * hash to look up a binary tree root. If there are exactly 2 bytes
490 * remaining we could still search for a length-2 match using the digram
491 * table, but it's not worth bothering. (Note: this is also useful for
492 * LZX, since this excludes the length 2 match having the maximum
493 * offset, which isn't allowed.) */
494 if (bytes_remaining < 3) {
495 mf->cur_window_pos++;
501 /* Search the digram table for a length 2 match. */
502 if (mf->min_match_len <= 2) {
506 c1 = mf->cur_window[mf->cur_window_pos];
507 c2 = mf->cur_window[mf->cur_window_pos + 1];
508 digram = (u16)c1 | ((u16)c2 << 8);
509 cur_match = mf->digram_tab[digram];
510 mf->digram_tab[digram] = mf->cur_window_pos;
512 /* We're only interested in matches of length exactly 2, since
513 * those won't be found during the binary tree search. */
514 if (cur_match != 0 && mf->cur_window[cur_match + 2] !=
515 mf->cur_window[mf->cur_window_pos + 2])
517 matches[num_matches++] = (struct raw_match) {
519 .offset = mf->cur_window_pos - cur_match,
524 /* Hash the length-3 byte sequence beginning at the current position in
526 hash = lz_bt_hash(&mf->cur_window[mf->cur_window_pos]);
528 /* The corresponding hash bucket in 'hash_tab' contains the root of the
529 * binary tree of previous window positions that have the same hash
531 cur_match = mf->hash_tab[hash];
533 /* Update the hash bucket to point to the binary tree rooted at the
534 * current position, which we will construct in do_search(). */
535 mf->hash_tab[hash] = mf->cur_window_pos;
537 /* Search the binary tree for matches. At the same time, build the
538 * binary tree rooted at the current position, which replaces the one we
540 num_matches += do_search(mf->cur_window,
542 max(3, mf->min_match_len),
543 min(bytes_remaining, mf->num_fast_bytes),
544 mf->max_search_depth,
547 &matches[num_matches]);
549 /* If the longest match is @num_fast_bytes in length, it may have been
550 * truncated. Try extending it up to the maximum match length. */
551 if (num_matches != 0 && matches[num_matches - 1].len == mf->num_fast_bytes) {
553 const u8 *strptr, *matchptr;
556 limit = min(bytes_remaining, mf->max_match_len);
557 strptr = &mf->cur_window[mf->cur_window_pos];
558 matchptr = strptr - matches[num_matches - 1].offset;
559 len = matches[num_matches - 1].len;
560 while (len < limit && strptr[len] == matchptr[len])
562 matches[num_matches - 1].len = len;
565 #ifdef ENABLE_LZ_DEBUG
566 /* Check the matches. */
567 for (lz_bt_len_t i = 0; i < num_matches; i++) {
568 const u8 *matchptr, *strptr;
571 LZ_ASSERT(matches[i].len >= mf->min_match_len);
572 LZ_ASSERT(matches[i].len <= min(mf->max_match_len, bytes_remaining));
575 LZ_ASSERT(matches[i].offset >= 1);
576 LZ_ASSERT(matches[i].offset <= lz_bt_get_position(mf));
578 /* Lengths and offsets strictly increasing? */
580 LZ_ASSERT(matches[i].len > matches[i - 1].len);
581 LZ_ASSERT(matches[i].offset > matches[i - 1].offset);
584 /* Actually a match? */
585 strptr = lz_bt_get_window_ptr(mf);
586 matchptr = strptr - matches[i].offset;
587 LZ_ASSERT(!memcmp(strptr, matchptr, matches[i].len));
589 /* Match can't be extended further? */
590 LZ_ASSERT(matches[i].len == min(mf->max_match_len, bytes_remaining) ||
591 strptr[matches[i].len] != matchptr[matches[i].len]);
593 #endif /* ENABLE_LZ_DEBUG */
595 /* Advance to the next position in the window. */
596 mf->cur_window_pos++;
598 /* Return the number of matches found. */
602 /* This is the same as do_search(), but it does not save any matches.
603 * See do_search() for explanatory comments. */
605 do_skip(const u8 window[restrict],
606 const lz_bt_pos_t cur_window_pos,
607 const lz_bt_len_t max_len,
609 lz_bt_pos_t child_tab[restrict],
610 lz_bt_pos_t cur_match)
612 lz_bt_len_t longest_lt_match_len = 0;
613 lz_bt_len_t longest_gt_match_len = 0;
614 lz_bt_pos_t *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0];
615 lz_bt_pos_t *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1];
616 const u8 * const strptr = &window[cur_window_pos];
621 if (depth_remaining-- == 0 || cur_match == 0) {
627 matchptr = &window[cur_match];
628 len = min(longest_lt_match_len, longest_gt_match_len);
630 if (matchptr[len] == strptr[len]) {
632 if (++len == max_len) {
633 *pending_lt_ptr = child_tab[cur_match * 2 + 0];
634 *pending_gt_ptr = child_tab[cur_match * 2 + 1];
637 } while (matchptr[len] == strptr[len]);
639 if (matchptr[len] < strptr[len]) {
640 *pending_lt_ptr = cur_match;
641 pending_lt_ptr = &child_tab[cur_match * 2 + 1];
642 cur_match = *pending_lt_ptr;
643 longest_lt_match_len = len;
645 *pending_gt_ptr = cur_match;
646 pending_gt_ptr = &child_tab[cur_match * 2 + 0];
647 cur_match = *pending_gt_ptr;
648 longest_gt_match_len = len;
653 /* Skip the current position in the binary tree match-finder. */
655 lz_bt_skip_position(struct lz_bt *mf)
657 lz_bt_pos_t bytes_remaining;
659 lz_bt_pos_t cur_match;
661 LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size);
663 bytes_remaining = lz_bt_get_remaining_size(mf);
665 /* As explained in lz_bt_get_matches(), we don't search for matches if
666 * there are fewer than 3 bytes remaining in the window. */
667 if (bytes_remaining < 3) {
668 mf->cur_window_pos++;
672 /* Update the digram table. */
673 if (mf->min_match_len <= 2) {
677 c1 = mf->cur_window[mf->cur_window_pos];
678 c2 = mf->cur_window[mf->cur_window_pos + 1];
679 digram = (u16)c1 | ((u16)c2 << 8);
680 mf->digram_tab[digram] = mf->cur_window_pos;
683 /* Update the hash table. */
684 hash = lz_bt_hash(&mf->cur_window[mf->cur_window_pos]);
685 cur_match = mf->hash_tab[hash];
686 mf->hash_tab[hash] = mf->cur_window_pos;
688 /* Update the binary tree for the appropriate hash code. */
689 do_skip(mf->cur_window,
691 min(bytes_remaining, mf->num_fast_bytes),
692 mf->max_search_depth,
696 /* Advance to the next position. */
697 mf->cur_window_pos++;
700 /* Skip 'n' positions in the binary tree match-finder. */
702 lz_bt_skip_positions(struct lz_bt *mf, unsigned n)
705 lz_bt_skip_position(mf);