/* * lz_binary_trees.c * * Binary tree match-finder for Lempel-Ziv compression. * * Copyright (c) 2014 Eric Biggers. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Note: the binary tree search/update algorithm is based on LzFind.c from * 7-Zip, which was written by Igor Pavlov and released into the public domain. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif #include "wimlib/lz_extend.h" #include "wimlib/lz_hash3.h" #include "wimlib/lz_mf.h" #include "wimlib/util.h" #include /* log2 of the number of buckets in the hash table. This can be changed. */ #define LZ_BT_HASH_ORDER 16 #define LZ_BT_HASH_LEN (1 << LZ_BT_HASH_ORDER) /* Number of entries in the digram table. * * Note: You rarely get length-2 matches if you use length-3 hashing. But * since binary trees are typically used for higher compression ratios than hash * chains, it is helpful for this match-finder to find length-2 matches as well. * Therefore this match-finder also uses a digram table to find length-2 matches * when the minimum match length is 2. */ #define LZ_BT_DIGRAM_TAB_LEN (256 * 256) struct lz_bt { struct lz_mf base; u32 *hash_tab; u32 *digram_tab; u32 *child_tab; u32 next_hash; u16 next_digram; }; static inline u32 lz_bt_hash(const u8 *p) { return lz_hash(p, LZ_BT_HASH_ORDER); } static void lz_bt_set_default_params(struct lz_mf_params *params) { if (params->min_match_len == 0) params->min_match_len = 2; if (params->max_match_len == 0) params->max_match_len = UINT32_MAX; if (params->max_search_depth == 0) params->max_search_depth = 50; if (params->nice_match_len == 0) params->nice_match_len = 24; if (params->nice_match_len < params->min_match_len) params->nice_match_len = params->min_match_len; if (params->nice_match_len > params->max_match_len) params->nice_match_len = params->max_match_len; } static bool lz_bt_params_valid(const struct lz_mf_params *params) { return true; } static u64 lz_bt_get_needed_memory(u32 max_window_size) { u64 len = 0; len += LZ_BT_HASH_LEN; /* hash_tab */ len += LZ_BT_DIGRAM_TAB_LEN; /* digram_tab */ len += 2 * (u64)max_window_size; /* child_tab */ return len * sizeof(u32); } static bool lz_bt_init(struct lz_mf *_mf) { struct lz_bt *mf = (struct lz_bt *)_mf; struct lz_mf_params *params = &mf->base.params; size_t len = 0; lz_bt_set_default_params(params); /* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'. */ len += LZ_BT_HASH_LEN; if (params->min_match_len == 2) len += LZ_BT_DIGRAM_TAB_LEN; len += 2 * params->max_window_size; mf->hash_tab = MALLOC(len * sizeof(u32)); if (!mf->hash_tab) return false; if (params->min_match_len == 2) { mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN; mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN; } else { mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN; } return true; } static void lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) { struct lz_bt *mf = (struct lz_bt *)_mf; size_t clear_len; /* Clear hash_tab and digram_tab. * Note: child_tab need not be cleared. */ clear_len = LZ_BT_HASH_LEN; if (mf->digram_tab) clear_len += LZ_BT_DIGRAM_TAB_LEN; memset(mf->hash_tab, 0, clear_len * sizeof(u32)); } /* * Search the binary tree of the current hash code for matches. At the same * time, update this tree to add the current position in the window. * * @window * The window being searched. * @cur_pos * The current position in the window. * @child_tab * Table of child pointers for the binary tree. The children of the node * for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 + * 1]. Zero is reserved for the 'null' value (no child). Consequently, we * don't recognize matches beginning at position 0. In fact, the node for * position 0 in the window will not be used at all, which is just as well * because we use 0-based indices which don't work for position 0. * @cur_match * The position in the window at which the binary tree for the current hash * code is rooted. This can be 0, which indicates that the binary tree for * the current hash code is empty. * @min_len * Ignore matches shorter than this length. This must be at least 1. * @nice_len * Stop searching if a match of this length or longer is found. This must * be less than or equal to @max_len. * @max_len * Maximum length of matches to return. This can be longer than @nice_len, * in which case a match of length @nice_len will still cause the search to * be terminated, but the match will be extended up to @max_len bytes * first. * @max_search_depth * Stop if we reach this depth in the binary tree. * @matches * The array in which to produce the matches. The matches will be produced * in order of increasing length and increasing offset. No more than one * match shall have any given length, nor shall any match be shorter than * @min_len, nor shall any match be longer than @max_len, nor shall any two * matches have the same offset. * * Returns a pointer to the next free slot in @matches. */ static struct lz_match * do_search(const u8 window[restrict], const u32 cur_pos, u32 child_tab[restrict], u32 cur_match, const u32 min_len, const u32 nice_len, const u32 max_len, const u32 max_search_depth, struct lz_match *lz_matchptr) { /* * Here's my explanation of how this code actually works. Beware: this * algorithm is a *lot* trickier than searching for matches via hash * chains. But it can be significantly better, especially when doing * "optimal" parsing, which is why it gets used, e.g. in LZMA as well as * here. * * --------------------------------------------------------------------- * * Data structure * * Basically, there is not just one binary tree, but rather one binary * tree per hash code. For a given hash code, the binary tree indexes * previous positions in the window that have that same hash code. The * key for each node is the "string", or byte sequence, beginning at the * corresponding position in the window. * * Each tree maintains the invariant that if node C is a child of node * P, then the window position represented by node C is smaller than * ("left of") the window position represented by node P. Equivalently, * while descending into a tree, the match distances ("offsets") from * the current position are non-decreasing --- actually strictly * increasing, because each node represents a unique position. * * In addition, not all previous positions sharing the same hash code * will necessarily be represented in each binary tree; see the * "Updating" section. * * --------------------------------------------------------------------- * * Searching * * Suppose we want to search for LZ77-style matches with the string * beginning at the current window position and extending for @max_len * bytes. To do this, we can search for this string in the binary tree * for this string's hash code. Each node visited during the search is * a potential match. This method will find the matches efficiently * because they will converge on the current string, due to the nature * of the binary search. * * Naively, when visiting a node that represents a match of length N, we * must compare N + 1 bytes in order to determine the length of that * match and the lexicographic ordering of that match relative to the * current string (which determines whether we need to step left or * right into the next level of the tree, as per the standard binary * tree search algorithm). However, as an optimization, we need not * explicitly examine the full length of the match at each node. To see * that this is true, suppose that we examine a node during the search, * and we find that the corresponding match is less (alt. greater) than * the current string. Then, because of how binary tree search * operates, the match must be lexicographically greater (alt. lesser) * than any ancestor node that corresponded to a match lexicographically * lesser (alt. greater) than the current string. Therefore, the match * must be at least as long as the match for any such ancestor node. * Therefore, the lengths of lexicographically-lesser (alt. greater) * matches must be non-decreasing as they are encountered by the tree * search. * * Using this observation, we can maintain two variables, 'best_lt_len' * and 'best_gt_len', that represent the length of the longest * lexicographically lesser and greater, respectively, match that has * been examined so far. Then, when examining a new match, we need * only start comparing at the index min(best_lt_len, best_gt_len) byte. * Note that we cannot know beforehand whether the match will be * lexicographically lesser or greater, hence the need for taking the * minimum of these two lengths. * * As noted earlier, as we descend into the tree, the potential matches * will have strictly increasing offsets. To make things faster for * higher-level parsing / match-choosing code, we do not want to return * a shorter match that has a larger offset than a longer match. This * is because a longer match can always be truncated to a shorter match * if needed, and smaller offsets usually (depending on the compression * format) take fewer bits to encode than larger offsets. * Consequently, we keep a potential match only if it is longer than the * previous longest match that has been found. This has the added * advantage of producing the array of matches sorted by strictly * increasing lengths as well as strictly decreasing offsets. * * In degenerate cases, the binary tree might become severely * unbalanced. To prevent excessive running times, we stop immediately * (and return any matches that happen to have been found so far) if the * current depth exceeds @max_search_depth. Note that this cutoff can * occur before the longest match has been found, which is usually bad * for the compression ratio. * * --------------------------------------------------------------------- * * Updating * * I've explained how to find matches by searching the binary tree of * the current hash code. But how do we get the binary tree in the * first place? Since the tree is built incrementally, the real * question is how do we update the tree to "add" the current window * position. * * The tree maintains the invariant that a node's parent always has a * larger position (a.k.a. smaller match offset) than itself. * Therefore, the root node must always have the largest position; and * since the current position is larger than any previous position, the * current position must become the root of the tree. * * A correct, but silly, approach is to simply add the previous root as * a child of the new root, using either the left or right child pointer * depending on the lexicographic ordering of the strings. This works, * but it really just produces a linked list, so it's not sufficient. * * Instead, we can initially mark the new root's left child pointer as * "pending (less than)" and its right child pointer as "pending * (greater than)". Then, during the search, when we examine a match * that is lexicographically less than the current string, we link the * "pending (less than)" pointer to the node of that match, then set the * right child pointer of *that* node as "pending (less than)". * Similarly, when we examine a match that is lexicographically greater * than the current string, we link the "pending (greater than)" pointer * to the node of that match, then set the left child pointer of *that* * node as "pending (greater than)". * * If the search terminates before the current string is found (up to a * precision of @nice_len bytes), then we set "pending (less than)" and * "pending (greater than)" to point to nothing. Alternatively, if the * search terminates due to finding the current string (up to a * precision of @nice_len bytes), then we set "pending (less than)" and * "pending (greater than)" to point to the appropriate children of that * match. * * Why does this work? Well, we can think of it this way: the "pending * (less than)" pointer is reserved for the next match we find that is * lexicographically *less than* the current string, and the "pending * (greater than)" pointer is reserved for the next match we find that * is lexicographically *greater than* the current string. This * explains why when we find a match that is lexicographically less than * the current string, we set the "pending (less than)" pointer to point * to that match. And the reason we change "pending (less than)" to the * right pointer of the match in that case is because we're walking down * into that subtree, and the next match lexicographically *less than* * the current string is guaranteed to be lexicographically *greater * than* that match, so it should be set as the right subtree of that * match. But the next match in that subtree that is lexicographically * *greater than* the current string will need to be moved to the * "pending (greater than)" pointer farther up the tree. * * It's complicated, but it should make sense if you think about it. * The algorithm basically just moves subtrees into the correct * locations as it walks down the tree for the search. But also, if the * algorithm actually finds a match of length @nice_len with the current * string, it no longer needs that match node and can discard it. The * algorithm also will discard nodes if the search terminates due to the * depth limit. For these reasons, the binary tree might not, in fact, * contain all valid positions. */ u32 best_lt_len = 0; u32 best_gt_len = 0; u32 best_len = min_len - 1; u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; const u8 * const strptr = &window[cur_pos]; u32 depth_remaining = max_search_depth; for (;;) { const u8 *matchptr; u32 len; if (cur_match == 0 || depth_remaining-- == 0) { *pending_lt_ptr = 0; *pending_gt_ptr = 0; return lz_matchptr; } matchptr = &window[cur_match]; len = min(best_lt_len, best_gt_len); if (matchptr[len] == strptr[len]) { len = lz_extend(strptr, matchptr, len + 1, max_len); if (len > best_len) { best_len = len; *lz_matchptr++ = (struct lz_match) { .len = len, .offset = strptr - matchptr, }; if (len >= nice_len) { *pending_lt_ptr = child_tab[cur_match * 2 + 0]; *pending_gt_ptr = child_tab[cur_match * 2 + 1]; return lz_matchptr; } } } if (matchptr[len] < strptr[len]) { *pending_lt_ptr = cur_match; pending_lt_ptr = &child_tab[cur_match * 2 + 1]; cur_match = *pending_lt_ptr; best_lt_len = len; } else { *pending_gt_ptr = cur_match; pending_gt_ptr = &child_tab[cur_match * 2 + 0]; cur_match = *pending_gt_ptr; best_gt_len = len; } } } static u32 lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) { struct lz_bt *mf = (struct lz_bt *)_mf; const u8 * const window = mf->base.cur_window; const u32 cur_pos = mf->base.cur_window_pos++; const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; u32 min_len; const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len); const u32 nice_len = min(max_len, mf->base.params.nice_match_len); u32 hash; u32 cur_match; struct lz_match *lz_matchptr = matches; if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) return 0; if (mf->digram_tab) { /* Search the digram table for a length 2 match. */ const u16 digram = mf->next_digram; mf->next_digram = load_u16_unaligned(&window[cur_pos + 1]); prefetch(&mf->digram_tab[mf->next_digram]); cur_match = mf->digram_tab[digram]; mf->digram_tab[digram] = cur_pos; /* We're only interested in matches of length exactly 2, since * those won't be found during the binary tree search. * * Note: it's possible to extend this match as much as possible, * then use its length plus 1 as min_len for the binary tree * search. However I found this actually *reduced* performance * slightly, evidently because the binary tree still needs to be * searched/updated starting from the root in either case. */ if (cur_match != 0 && window[cur_match + 2] != window[cur_pos + 2]) { *lz_matchptr++ = (struct lz_match) { .len = 2, .offset = cur_pos - cur_match, }; } min_len = 3; } else { min_len = mf->base.params.min_match_len; } hash = mf->next_hash; mf->next_hash = lz_bt_hash(&window[cur_pos + 1]); prefetch(&mf->hash_tab[mf->next_hash]); cur_match = mf->hash_tab[hash]; mf->hash_tab[hash] = cur_pos; /* Search the binary tree of 'hash' for matches while re-rooting it at * the current position. */ lz_matchptr = do_search(window, cur_pos, mf->child_tab, cur_match, min_len, nice_len, max_len, mf->base.params.max_search_depth, lz_matchptr); /* Return the number of matches found. */ return lz_matchptr - matches; } /* This is very similar to do_search(), but it does not save any matches. * See do_search() for explanatory comments. */ static void do_skip(const u8 window[restrict], const u32 cur_pos, u32 child_tab[restrict], u32 cur_match, const u32 nice_len, const u32 max_search_depth) { u32 best_lt_len = 0; u32 best_gt_len = 0; u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; const u8 * const strptr = &window[cur_pos]; u32 depth_remaining = max_search_depth; for (;;) { const u8 *matchptr; u32 len; if (cur_match == 0 || depth_remaining-- == 0) { *pending_lt_ptr = 0; *pending_gt_ptr = 0; return; } matchptr = &window[cur_match]; len = min(best_lt_len, best_gt_len); if (matchptr[len] == strptr[len]) { len = lz_extend(strptr, matchptr, len + 1, nice_len); if (len == nice_len) { *pending_lt_ptr = child_tab[cur_match * 2 + 0]; *pending_gt_ptr = child_tab[cur_match * 2 + 1]; return; } } if (matchptr[len] < strptr[len]) { *pending_lt_ptr = cur_match; pending_lt_ptr = &child_tab[cur_match * 2 + 1]; cur_match = *pending_lt_ptr; best_lt_len = len; } else { *pending_gt_ptr = cur_match; pending_gt_ptr = &child_tab[cur_match * 2 + 0]; cur_match = *pending_gt_ptr; best_gt_len = len; } } } static void lz_bt_skip_positions(struct lz_mf *_mf, u32 n) { struct lz_bt *mf = (struct lz_bt *)_mf; const u8 * const window = mf->base.cur_window; u32 cur_pos = mf->base.cur_window_pos; u32 end_pos = cur_pos + n; u32 bytes_remaining = mf->base.cur_window_size - cur_pos; u32 hash; u32 next_hash; u32 cur_match; u16 digram; u16 next_digram; mf->base.cur_window_pos = end_pos; if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) { /* Nearing end of window. */ if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1))) return; end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1; } next_hash = mf->next_hash; next_digram = mf->next_digram; do { if (mf->digram_tab) { digram = next_digram; next_digram = load_u16_unaligned(&window[cur_pos + 1]); mf->digram_tab[digram] = cur_pos; } hash = next_hash; next_hash = lz_bt_hash(&window[cur_pos + 1]); cur_match = mf->hash_tab[hash]; mf->hash_tab[hash] = cur_pos; /* Update the binary tree for the appropriate hash code. */ do_skip(window, cur_pos, mf->child_tab, cur_match, min(bytes_remaining, mf->base.params.nice_match_len), mf->base.params.max_search_depth); bytes_remaining--; } while (++cur_pos != end_pos); if (mf->digram_tab) { prefetch(&mf->digram_tab[next_digram]); mf->next_digram = next_digram; } prefetch(&mf->hash_tab[next_hash]); mf->next_hash = next_hash; } static void lz_bt_destroy(struct lz_mf *_mf) { struct lz_bt *mf = (struct lz_bt *)_mf; FREE(mf->hash_tab); /* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */ } const struct lz_mf_ops lz_binary_trees_ops = { .params_valid = lz_bt_params_valid, .get_needed_memory = lz_bt_get_needed_memory, .init = lz_bt_init, .load_window = lz_bt_load_window, .get_matches = lz_bt_get_matches, .skip_positions = lz_bt_skip_positions, .destroy = lz_bt_destroy, .struct_size = sizeof(struct lz_bt), };