X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz_binary_trees.c;h=4a0537cb4df42ef9ebedd880185fc0c1a6acc734;hb=9ca5c20853d3be06378fb985aa75c75df280d1e2;hp=98924780fffba1b46c54f4af543fcf3aad253250;hpb=6929ca50c6d64ac208e7ccc5f1c2c08dada71061;p=wimlib diff --git a/src/lz_binary_trees.c b/src/lz_binary_trees.c index 98924780..4a0537cb 100644 --- a/src/lz_binary_trees.c +++ b/src/lz_binary_trees.c @@ -30,26 +30,25 @@ */ /* - * Note: the binary tree search/update algorithm is based on code from the - * public domain LZMA SDK (authors: Igor Pavlov, Lasse Collin). + * Note: the binary tree search/update algorithm is based on LzFind.c from + * 7-Zip, which was written by Igor Pavlov and released into the public domain. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif +#include "wimlib/lz_extend.h" +#include "wimlib/lz_hash3.h" #include "wimlib/lz_mf.h" #include "wimlib/util.h" -#include + #include -/* Number of hash buckets. This can be changed, but it should be a power of 2 - * so that the correct hash bucket can be selected using a fast bitwise AND. */ -#define LZ_BT_HASH_LEN (1 << 16) +/* log2 of the number of buckets in the hash table. This can be changed. */ +#define LZ_BT_HASH_ORDER 16 -/* Number of bytes from which the hash code is computed at each position. This - * can be changed, provided that lz_bt_hash() is updated as well. */ -#define LZ_BT_HASH_BYTES 3 +#define LZ_BT_HASH_LEN (1 << LZ_BT_HASH_ORDER) /* Number of entries in the digram table. * @@ -66,39 +65,13 @@ struct lz_bt { u32 *digram_tab; u32 *child_tab; u32 next_hash; + u16 next_digram; }; -static u32 crc32_table[256]; -static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT; - -static void -crc32_init(void) -{ - for (u32 b = 0; b < 256; b++) { - u32 r = b; - for (int i = 0; i < 8; i++) { - if (r & 1) - r = (r >> 1) ^ 0xEDB88320; - else - r >>= 1; - } - crc32_table[b] = r; - } -} - -/* This hash function is taken from the LZMA SDK. It seems to work well. - - * TODO: Maybe use the SSE4.2 CRC32 instruction when available? */ static inline u32 lz_bt_hash(const u8 *p) { - u32 hash = 0; - - hash ^= crc32_table[p[0]]; - hash ^= p[1]; - hash ^= (u32)p[2] << 8; - - return hash % LZ_BT_HASH_LEN; + return lz_hash(p, LZ_BT_HASH_ORDER); } static void @@ -108,7 +81,7 @@ lz_bt_set_default_params(struct lz_mf_params *params) params->min_match_len = 2; if (params->max_match_len == 0) - params->max_match_len = params->max_window_size; + params->max_match_len = UINT32_MAX; if (params->max_search_depth == 0) params->max_search_depth = 50; @@ -165,13 +138,9 @@ lz_bt_init(struct lz_mf *_mf) mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN; mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN; } else { - mf->digram_tab = NULL; mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN; } - /* Fill in the CRC32 table if not done already. */ - pthread_once(&crc32_table_filled, crc32_init); - return true; } @@ -187,9 +156,6 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) if (mf->digram_tab) clear_len += LZ_BT_DIGRAM_TAB_LEN; memset(mf->hash_tab, 0, clear_len * sizeof(u32)); - - if (size >= LZ_BT_HASH_BYTES) - mf->next_hash = lz_bt_hash(window); } /* @@ -198,7 +164,7 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) * * @window * The window being searched. - * @cur_window_pos + * @cur_pos * The current position in the window. * @child_tab * Table of child pointers for the binary tree. The children of the node @@ -213,9 +179,14 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) * the current hash code is empty. * @min_len * Ignore matches shorter than this length. This must be at least 1. + * @nice_len + * Stop searching if a match of this length or longer is found. This must + * be less than or equal to @max_len. * @max_len - * Don't produce any matches longer than this length. If we find a match - * this long, terminate the search and return. + * Maximum length of matches to return. This can be longer than @nice_len, + * in which case a match of length @nice_len will still cause the search to + * be terminated, but the match will be extended up to @max_len bytes + * first. * @max_search_depth * Stop if we reach this depth in the binary tree. * @matches @@ -225,17 +196,18 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) * @min_len, nor shall any match be longer than @max_len, nor shall any two * matches have the same offset. * - * Returns the number of matches found and written to @matches. + * Returns a pointer to the next free slot in @matches. */ -static u32 +static struct lz_match * do_search(const u8 window[restrict], - const u32 cur_window_pos, + const u32 cur_pos, u32 child_tab[restrict], u32 cur_match, const u32 min_len, + const u32 nice_len, const u32 max_len, const u32 max_search_depth, - struct lz_match matches[restrict]) + struct lz_match *lz_matchptr) { /* * Here's my explanation of how this code actually works. Beware: this @@ -295,15 +267,14 @@ do_search(const u8 window[restrict], * matches must be non-decreasing as they are encountered by the tree * search. * - * Using this observation, we can maintain two variables, - * 'longest_lt_match_len' and 'longest_gt_match_len', that represent the - * length of the longest lexicographically lesser and greater, - * respectively, match that has been examined so far. Then, when - * examining a new match, we need only start comparing at the index - * min(longest_lt_match_len, longest_gt_match_len) byte. Note that we - * cannot know beforehand whether the match will be lexicographically - * lesser or greater, hence the need for taking the minimum of these two - * lengths. + * Using this observation, we can maintain two variables, 'best_lt_len' + * and 'best_gt_len', that represent the length of the longest + * lexicographically lesser and greater, respectively, match that has + * been examined so far. Then, when examining a new match, we need + * only start comparing at the index min(best_lt_len, best_gt_len) byte. + * Note that we cannot know beforehand whether the match will be + * lexicographically lesser or greater, hence the need for taking the + * minimum of these two lengths. * * As noted earlier, as we descend into the tree, the potential matches * will have strictly increasing offsets. To make things faster for @@ -357,10 +328,10 @@ do_search(const u8 window[restrict], * node as "pending (greater than)". * * If the search terminates before the current string is found (up to a - * precision of @max_len bytes), then we set "pending (less than)" and + * precision of @nice_len bytes), then we set "pending (less than)" and * "pending (greater than)" to point to nothing. Alternatively, if the * search terminates due to finding the current string (up to a - * precision of @max_len bytes), then we set "pending (less than)" and + * precision of @nice_len bytes), then we set "pending (less than)" and * "pending (greater than)" to point to the appropriate children of that * match. * @@ -383,53 +354,50 @@ do_search(const u8 window[restrict], * It's complicated, but it should make sense if you think about it. * The algorithm basically just moves subtrees into the correct * locations as it walks down the tree for the search. But also, if the - * algorithm actually finds a match of length @max_len with the current + * algorithm actually finds a match of length @nice_len with the current * string, it no longer needs that match node and can discard it. The * algorithm also will discard nodes if the search terminates due to the * depth limit. For these reasons, the binary tree might not, in fact, * contain all valid positions. */ - u32 num_matches = 0; - u32 longest_lt_match_len = 0; - u32 longest_gt_match_len = 0; - u32 longest_match_len = min_len - 1; - u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0]; - u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1]; - const u8 *strptr = &window[cur_window_pos]; + u32 best_lt_len = 0; + u32 best_gt_len = 0; + u32 best_len = min_len - 1; + u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; + u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; + const u8 * const strptr = &window[cur_pos]; u32 depth_remaining = max_search_depth; + for (;;) { const u8 *matchptr; u32 len; - if (depth_remaining-- == 0 || cur_match == 0) { + if (cur_match == 0 || depth_remaining-- == 0) { *pending_lt_ptr = 0; *pending_gt_ptr = 0; - return num_matches; + return lz_matchptr; } matchptr = &window[cur_match]; - len = min(longest_lt_match_len, longest_gt_match_len); + len = min(best_lt_len, best_gt_len); if (matchptr[len] == strptr[len]) { - if (++len != max_len && matchptr[len] == strptr[len]) - while (++len != max_len) - if (matchptr[len] != strptr[len]) - break; + len = lz_extend(strptr, matchptr, len + 1, max_len); - if (len > longest_match_len) { - longest_match_len = len; + if (len > best_len) { + best_len = len; - matches[num_matches++] = (struct lz_match) { + *lz_matchptr++ = (struct lz_match) { .len = len, .offset = strptr - matchptr, }; - if (len == max_len) { + if (len >= nice_len) { *pending_lt_ptr = child_tab[cur_match * 2 + 0]; *pending_gt_ptr = child_tab[cur_match * 2 + 1]; - return num_matches; + return lz_matchptr; } } } @@ -438,12 +406,12 @@ do_search(const u8 window[restrict], *pending_lt_ptr = cur_match; pending_lt_ptr = &child_tab[cur_match * 2 + 1]; cur_match = *pending_lt_ptr; - longest_lt_match_len = len; + best_lt_len = len; } else { *pending_gt_ptr = cur_match; pending_gt_ptr = &child_tab[cur_match * 2 + 0]; cur_match = *pending_gt_ptr; - longest_gt_match_len = len; + best_gt_len = len; } } } @@ -452,21 +420,27 @@ static u32 lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) { struct lz_bt *mf = (struct lz_bt *)_mf; - const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base); + const u8 * const window = mf->base.cur_window; + const u32 cur_pos = mf->base.cur_window_pos++; + const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; + u32 min_len; + const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len); + const u32 nice_len = min(max_len, mf->base.params.nice_match_len); u32 hash; u32 cur_match; - u32 min_len; - u32 num_matches = 0; + struct lz_match *lz_matchptr = matches; - if (bytes_remaining <= LZ_BT_HASH_BYTES) - goto out; + if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) + return 0; if (mf->digram_tab) { /* Search the digram table for a length 2 match. */ - const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base); + const u16 digram = mf->next_digram; + mf->next_digram = *(const u16 *)(&window[cur_pos + 1]); + prefetch(&mf->digram_tab[mf->next_digram]); cur_match = mf->digram_tab[digram]; - mf->digram_tab[digram] = mf->base.cur_window_pos; + mf->digram_tab[digram] = cur_pos; /* We're only interested in matches of length exactly 2, since * those won't be found during the binary tree search. @@ -476,13 +450,10 @@ lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) * search. However I found this actually *reduced* performance * slightly, evidently because the binary tree still needs to be * searched/updated starting from the root in either case. */ - if (cur_match != 0 && - (mf->base.cur_window[cur_match + 2] != - mf->base.cur_window[mf->base.cur_window_pos + 2])) - { - matches[num_matches++] = (struct lz_match) { + if (cur_match != 0 && window[cur_match + 2] != window[cur_pos + 2]) { + *lz_matchptr++ = (struct lz_match) { .len = 2, - .offset = mf->base.cur_window_pos - cur_match, + .offset = cur_pos - cur_match, }; } min_len = 3; @@ -491,142 +462,134 @@ lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) } hash = mf->next_hash; - mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1); + mf->next_hash = lz_bt_hash(&window[cur_pos + 1]); prefetch(&mf->hash_tab[mf->next_hash]); cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = mf->base.cur_window_pos; + mf->hash_tab[hash] = cur_pos; /* Search the binary tree of 'hash' for matches while re-rooting it at * the current position. */ - num_matches += do_search(mf->base.cur_window, - mf->base.cur_window_pos, - mf->child_tab, - cur_match, - min_len, - min(bytes_remaining, mf->base.params.nice_match_len), - mf->base.params.max_search_depth, - &matches[num_matches]); - - /* If the longest match is @nice_match_len in length, it may have been - * truncated. Try extending it up to the maximum match length. */ - if (num_matches != 0 && - matches[num_matches - 1].len == mf->base.params.nice_match_len) - { - const u8 * const strptr = lz_mf_get_window_ptr(&mf->base); - const u8 * const matchptr = strptr - matches[num_matches - 1].offset; - const u32 len_limit = min(bytes_remaining, mf->base.params.max_match_len); - u32 len; - - len = matches[num_matches - 1].len; - while (len < len_limit && strptr[len] == matchptr[len]) - len++; - matches[num_matches - 1].len = len; - } - -out: - /* Advance to the next position. */ - mf->base.cur_window_pos++; + lz_matchptr = do_search(window, + cur_pos, + mf->child_tab, + cur_match, + min_len, + nice_len, + max_len, + mf->base.params.max_search_depth, + lz_matchptr); /* Return the number of matches found. */ - return num_matches; + return lz_matchptr - matches; } -/* This is the same as do_search(), but it does not save any matches. +/* This is very similar to do_search(), but it does not save any matches. * See do_search() for explanatory comments. */ static void do_skip(const u8 window[restrict], - const u32 cur_window_pos, + const u32 cur_pos, u32 child_tab[restrict], u32 cur_match, - const u32 max_len, + const u32 nice_len, const u32 max_search_depth) { - u32 longest_lt_match_len = 0; - u32 longest_gt_match_len = 0; - u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0]; - u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1]; - const u8 * const strptr = &window[cur_window_pos]; + u32 best_lt_len = 0; + u32 best_gt_len = 0; + u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; + u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; + const u8 * const strptr = &window[cur_pos]; u32 depth_remaining = max_search_depth; for (;;) { const u8 *matchptr; u32 len; - if (depth_remaining-- == 0 || cur_match == 0) { + if (cur_match == 0 || depth_remaining-- == 0) { *pending_lt_ptr = 0; *pending_gt_ptr = 0; return; } matchptr = &window[cur_match]; - len = min(longest_lt_match_len, longest_gt_match_len); + len = min(best_lt_len, best_gt_len); if (matchptr[len] == strptr[len]) { - do { - if (++len == max_len) { - *pending_lt_ptr = child_tab[cur_match * 2 + 0]; - *pending_gt_ptr = child_tab[cur_match * 2 + 1]; - return; - } - } while (matchptr[len] == strptr[len]); + len = lz_extend(strptr, matchptr, len + 1, nice_len); + if (len == nice_len) { + *pending_lt_ptr = child_tab[cur_match * 2 + 0]; + *pending_gt_ptr = child_tab[cur_match * 2 + 1]; + return; + } } if (matchptr[len] < strptr[len]) { *pending_lt_ptr = cur_match; pending_lt_ptr = &child_tab[cur_match * 2 + 1]; cur_match = *pending_lt_ptr; - longest_lt_match_len = len; + best_lt_len = len; } else { *pending_gt_ptr = cur_match; pending_gt_ptr = &child_tab[cur_match * 2 + 0]; cur_match = *pending_gt_ptr; - longest_gt_match_len = len; + best_gt_len = len; } } } static void -lz_bt_skip_position(struct lz_bt *mf) +lz_bt_skip_positions(struct lz_mf *_mf, u32 n) { - const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base); + struct lz_bt *mf = (struct lz_bt *)_mf; + const u8 * const window = mf->base.cur_window; + u32 cur_pos = mf->base.cur_window_pos; + u32 end_pos = cur_pos + n; + u32 bytes_remaining = mf->base.cur_window_size - cur_pos; u32 hash; + u32 next_hash; u32 cur_match; + u16 digram; + u16 next_digram; - if (bytes_remaining <= LZ_BT_HASH_BYTES) - goto out; + mf->base.cur_window_pos = end_pos; - /* Update the digram table. */ - if (mf->digram_tab) { - const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base); - mf->digram_tab[digram] = mf->base.cur_window_pos; + if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) { + /* Nearing end of window. */ + if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1))) + return; + + end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1; } - /* Update the hash table. */ - hash = mf->next_hash; - mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1); - prefetch(&mf->hash_tab[mf->next_hash]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = mf->base.cur_window_pos; - - /* Update the binary tree for the appropriate hash code. */ - do_skip(mf->base.cur_window, - mf->base.cur_window_pos, - mf->child_tab, - cur_match, - min(bytes_remaining, mf->base.params.nice_match_len), - mf->base.params.max_search_depth); - -out: - /* Advance to the next position. */ - mf->base.cur_window_pos++; -} + next_hash = mf->next_hash; + next_digram = mf->next_digram; + do { + if (mf->digram_tab) { + digram = next_digram; + next_digram = *(const u16 *)(&window[cur_pos + 1]); + mf->digram_tab[digram] = cur_pos; + } -static void -lz_bt_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; + hash = next_hash; + next_hash = lz_bt_hash(&window[cur_pos + 1]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = cur_pos; - do { - lz_bt_skip_position(mf); - } while (--n); + /* Update the binary tree for the appropriate hash code. */ + do_skip(window, + cur_pos, + mf->child_tab, + cur_match, + min(bytes_remaining, mf->base.params.nice_match_len), + mf->base.params.max_search_depth); + + bytes_remaining--; + } while (++cur_pos != end_pos); + + if (mf->digram_tab) { + prefetch(&mf->digram_tab[next_digram]); + mf->next_digram = next_digram; + } + + prefetch(&mf->hash_tab[next_hash]); + mf->next_hash = next_hash; } static void