From 7e3f3761e0a9fc93341ed0b9c69f6056d9a97af9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 13 Aug 2014 23:04:58 -0500 Subject: [PATCH] More lz_hash_chains, lz_binary_trees performance improvements - Use a multiplicative hash function. In practice it's as good as the CRC32-based hash function previously used, but it's faster to compute and requires no static data. - Use slightly different logic that avoids the need to special-case the extension of matches from 'nice_len' to 'max_len'. - Faster skip_positions() by avoiding unnecessary calculations. - Fast match length extension on x86 using unaligned word accesses and trailing zero count. --- Makefile.am | 2 + include/wimlib/lz_extend.h | 74 +++++++++ include/wimlib/lz_hash3.h | 48 ++++++ src/lz_binary_trees.c | 325 ++++++++++++++++--------------------- src/lz_hash_chains.c | 214 +++++++++++------------- 5 files changed, 367 insertions(+), 296 deletions(-) create mode 100644 include/wimlib/lz_extend.h create mode 100644 include/wimlib/lz_hash3.h diff --git a/Makefile.am b/Makefile.am index c6efa2ce..1ce745a0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -102,6 +102,8 @@ libwim_la_SOURCES = \ include/wimlib/integrity.h \ include/wimlib/list.h \ include/wimlib/lookup_table.h \ + include/wimlib/lz_extend.h \ + include/wimlib/lz_hash3.h \ include/wimlib/lz_mf.h \ include/wimlib/lz_mf_ops.h \ include/wimlib/lz_suffix_array_utils.h \ diff --git a/include/wimlib/lz_extend.h b/include/wimlib/lz_extend.h new file mode 100644 index 00000000..7d7ed8b8 --- /dev/null +++ b/include/wimlib/lz_extend.h @@ -0,0 +1,74 @@ +/* + * lz_extend.h + * + * Fast match extension for Lempel-Ziv matchfinding. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _WIMLIB_LZ_EXTEND_H +#define _WIMLIB_LZ_EXTEND_H + +#include "wimlib/types.h" + +#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) +# define HAVE_FAST_LZ_EXTEND 1 +#else +# define HAVE_FAST_LZ_EXTEND 0 +#endif + +/* Return the number of bytes at @matchptr that match the bytes at @strptr, up + * to a maximum of @max_len. Initially, @start_len bytes are matched. */ +static inline u32 +lz_extend(const u8 * const strptr, const u8 * const matchptr, + const u32 start_len, const u32 max_len) +{ + u32 len = start_len; + +#if HAVE_FAST_LZ_EXTEND + + while (len + sizeof(unsigned long) <= max_len) { + unsigned long x; + + x = *(const unsigned long *)&matchptr[len] ^ + *(const unsigned long *)&strptr[len]; + if (x != 0) + return len + (__builtin_ctzl(x) >> 3); + len += sizeof(unsigned long); + } + + if (sizeof(unsigned int) < sizeof(unsigned long) && + len + sizeof(unsigned int) <= max_len) + { + unsigned int x; + + x = *(const unsigned int *)&matchptr[len] ^ + *(const unsigned int *)&strptr[len]; + if (x != 0) + return len + (__builtin_ctz(x) >> 3); + len += sizeof(unsigned int); + } + + if (sizeof(unsigned int) == 4) { + if (len < max_len && matchptr[len] == strptr[len]) { + len++; + if (len < max_len && matchptr[len] == strptr[len]) { + len++; + if (len < max_len && matchptr[len] == strptr[len]) { + len++; + } + } + } + return len; + } + +#endif /* HAVE_FAST_LZ_EXTEND */ + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + + return len; +} + +#endif /* _WIMLIB_LZ_EXTEND_H */ diff --git a/include/wimlib/lz_hash3.h b/include/wimlib/lz_hash3.h new file mode 100644 index 00000000..feb2d9c3 --- /dev/null +++ b/include/wimlib/lz_hash3.h @@ -0,0 +1,48 @@ +/* + * lz_hash3.h + * + * 3-byte hashing for Lempel-Ziv matchfinding. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _WIMLIB_LZ_HASH3_H +#define _WIMLIB_LZ_HASH3_H + +#include "wimlib/types.h" + +/* Constant for the multiplicative hash function. */ +#define LZ_HASH_MULTIPLIER 0x1E35A7BD + +/* Hash the next 3-byte sequence in the window, producing a hash of length + * 'num_bits' bits. 4 bytes must be available, since 32-bit unaligned load is + * faster on some architectures. */ +static inline u32 +lz_hash(const u8 *p, unsigned int num_bits) +{ + u32 str; + u32 hash; + +#if defined(__i386__) || defined(__x86_64__) + /* Unaligned access allowed, and little endian CPU. + * Callers ensure that at least 4 (not 3) bytes are remaining. */ + str = *(const u32 *)p & 0xFFFFFF; +#else + str = ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); +#endif + + hash = str * LZ_HASH_MULTIPLIER; + + /* High bits are more random than the low bits. */ + return hash >> (32 - num_bits); +} + +/* The number of bytes being hashed. */ +#define LZ_HASH_NBYTES 3 + +/* Number of bytes the hash function actually requires be available, due to the + * possibility of an unaligned load. */ +#define LZ_HASH_REQUIRED_NBYTES 4 + +#endif /* _WIMLIB_LZ_HASH3_H */ diff --git a/src/lz_binary_trees.c b/src/lz_binary_trees.c index be818d72..aef7b890 100644 --- a/src/lz_binary_trees.c +++ b/src/lz_binary_trees.c @@ -30,26 +30,25 @@ */ /* - * Note: the binary tree search/update algorithm is based on code from the - * public domain LZMA SDK (authors: Igor Pavlov, Lasse Collin). + * Note: the binary tree search/update algorithm is based on LzFind.c from + * 7-Zip, which was written by Igor Pavlov and released into the public domain. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif +#include "wimlib/lz_extend.h" +#include "wimlib/lz_hash3.h" #include "wimlib/lz_mf.h" #include "wimlib/util.h" -#include + #include -/* Number of hash buckets. This can be changed, but it should be a power of 2 - * so that the correct hash bucket can be selected using a fast bitwise AND. */ -#define LZ_BT_HASH_LEN (1 << 16) +/* log2 of the number of buckets in the hash table. This can be changed. */ +#define LZ_BT_HASH_ORDER 16 -/* Number of bytes from which the hash code is computed at each position. This - * can be changed, provided that lz_bt_hash() is updated as well. */ -#define LZ_BT_HASH_BYTES 3 +#define LZ_BT_HASH_LEN (1 << LZ_BT_HASH_ORDER) /* Number of entries in the digram table. * @@ -66,39 +65,13 @@ struct lz_bt { u32 *digram_tab; u32 *child_tab; u32 next_hash; + u16 next_digram; }; -static u32 crc32_table[256]; -static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT; - -static void -crc32_init(void) -{ - for (u32 b = 0; b < 256; b++) { - u32 r = b; - for (int i = 0; i < 8; i++) { - if (r & 1) - r = (r >> 1) ^ 0xEDB88320; - else - r >>= 1; - } - crc32_table[b] = r; - } -} - -/* This hash function is taken from the LZMA SDK. It seems to work well. - - * TODO: Maybe use the SSE4.2 CRC32 instruction when available? */ static inline u32 lz_bt_hash(const u8 *p) { - u32 hash = 0; - - hash ^= crc32_table[p[0]]; - hash ^= p[1]; - hash ^= (u32)p[2] << 8; - - return hash % LZ_BT_HASH_LEN; + return lz_hash(p, LZ_BT_HASH_ORDER); } static void @@ -165,13 +138,9 @@ lz_bt_init(struct lz_mf *_mf) mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN; mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN; } else { - mf->digram_tab = NULL; mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN; } - /* Fill in the CRC32 table if not done already. */ - pthread_once(&crc32_table_filled, crc32_init); - return true; } @@ -187,9 +156,6 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) if (mf->digram_tab) clear_len += LZ_BT_DIGRAM_TAB_LEN; memset(mf->hash_tab, 0, clear_len * sizeof(u32)); - - if (size >= LZ_BT_HASH_BYTES) - mf->next_hash = lz_bt_hash(window); } /* @@ -198,7 +164,7 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) * * @window * The window being searched. - * @cur_window_pos + * @cur_pos * The current position in the window. * @child_tab * Table of child pointers for the binary tree. The children of the node @@ -213,9 +179,14 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) * the current hash code is empty. * @min_len * Ignore matches shorter than this length. This must be at least 1. + * @nice_len + * Stop searching if a match of this length or longer is found. This must + * be less than or equal to @max_len. * @max_len - * Don't produce any matches longer than this length. If we find a match - * this long, terminate the search and return. + * Maximum length of matches to return. This can be longer than @nice_len, + * in which case a match of length @nice_len will still cause the search to + * be terminated, but the match will be extended up to @max_len bytes + * first. * @max_search_depth * Stop if we reach this depth in the binary tree. * @matches @@ -225,17 +196,18 @@ lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size) * @min_len, nor shall any match be longer than @max_len, nor shall any two * matches have the same offset. * - * Returns the number of matches found and written to @matches. + * Returns a pointer to the next free slot in @matches. */ -static u32 +static struct lz_match * do_search(const u8 window[restrict], - const u32 cur_window_pos, + const u32 cur_pos, u32 child_tab[restrict], u32 cur_match, const u32 min_len, + const u32 nice_len, const u32 max_len, const u32 max_search_depth, - struct lz_match matches[restrict]) + struct lz_match *lz_matchptr) { /* * Here's my explanation of how this code actually works. Beware: this @@ -295,15 +267,14 @@ do_search(const u8 window[restrict], * matches must be non-decreasing as they are encountered by the tree * search. * - * Using this observation, we can maintain two variables, - * 'longest_lt_match_len' and 'longest_gt_match_len', that represent the - * length of the longest lexicographically lesser and greater, - * respectively, match that has been examined so far. Then, when - * examining a new match, we need only start comparing at the index - * min(longest_lt_match_len, longest_gt_match_len) byte. Note that we - * cannot know beforehand whether the match will be lexicographically - * lesser or greater, hence the need for taking the minimum of these two - * lengths. + * Using this observation, we can maintain two variables, 'best_lt_len' + * and 'best_gt_len', that represent the length of the longest + * lexicographically lesser and greater, respectively, match that has + * been examined so far. Then, when examining a new match, we need + * only start comparing at the index min(best_lt_len, best_gt_len) byte. + * Note that we cannot know beforehand whether the match will be + * lexicographically lesser or greater, hence the need for taking the + * minimum of these two lengths. * * As noted earlier, as we descend into the tree, the potential matches * will have strictly increasing offsets. To make things faster for @@ -357,10 +328,10 @@ do_search(const u8 window[restrict], * node as "pending (greater than)". * * If the search terminates before the current string is found (up to a - * precision of @max_len bytes), then we set "pending (less than)" and + * precision of @nice_len bytes), then we set "pending (less than)" and * "pending (greater than)" to point to nothing. Alternatively, if the * search terminates due to finding the current string (up to a - * precision of @max_len bytes), then we set "pending (less than)" and + * precision of @nice_len bytes), then we set "pending (less than)" and * "pending (greater than)" to point to the appropriate children of that * match. * @@ -383,53 +354,50 @@ do_search(const u8 window[restrict], * It's complicated, but it should make sense if you think about it. * The algorithm basically just moves subtrees into the correct * locations as it walks down the tree for the search. But also, if the - * algorithm actually finds a match of length @max_len with the current + * algorithm actually finds a match of length @nice_len with the current * string, it no longer needs that match node and can discard it. The * algorithm also will discard nodes if the search terminates due to the * depth limit. For these reasons, the binary tree might not, in fact, * contain all valid positions. */ - u32 num_matches = 0; - u32 longest_lt_match_len = 0; - u32 longest_gt_match_len = 0; - u32 longest_match_len = min_len - 1; - u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0]; - u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1]; - const u8 *strptr = &window[cur_window_pos]; + u32 best_lt_len = 0; + u32 best_gt_len = 0; + u32 best_len = min_len - 1; + u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; + u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; + const u8 * const strptr = &window[cur_pos]; u32 depth_remaining = max_search_depth; + for (;;) { const u8 *matchptr; u32 len; - if (depth_remaining-- == 0 || cur_match == 0) { + if (cur_match == 0 || depth_remaining-- == 0) { *pending_lt_ptr = 0; *pending_gt_ptr = 0; - return num_matches; + return lz_matchptr; } matchptr = &window[cur_match]; - len = min(longest_lt_match_len, longest_gt_match_len); + len = min(best_lt_len, best_gt_len); if (matchptr[len] == strptr[len]) { - if (++len != max_len && matchptr[len] == strptr[len]) - while (++len != max_len) - if (matchptr[len] != strptr[len]) - break; + len = lz_extend(strptr, matchptr, len + 1, max_len); - if (len > longest_match_len) { - longest_match_len = len; + if (len > best_len) { + best_len = len; - matches[num_matches++] = (struct lz_match) { + *lz_matchptr++ = (struct lz_match) { .len = len, .offset = strptr - matchptr, }; - if (len == max_len) { + if (len >= nice_len) { *pending_lt_ptr = child_tab[cur_match * 2 + 0]; *pending_gt_ptr = child_tab[cur_match * 2 + 1]; - return num_matches; + return lz_matchptr; } } } @@ -438,12 +406,12 @@ do_search(const u8 window[restrict], *pending_lt_ptr = cur_match; pending_lt_ptr = &child_tab[cur_match * 2 + 1]; cur_match = *pending_lt_ptr; - longest_lt_match_len = len; + best_lt_len = len; } else { *pending_gt_ptr = cur_match; pending_gt_ptr = &child_tab[cur_match * 2 + 0]; cur_match = *pending_gt_ptr; - longest_gt_match_len = len; + best_gt_len = len; } } } @@ -452,21 +420,27 @@ static u32 lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) { struct lz_bt *mf = (struct lz_bt *)_mf; - const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base); + const u8 * const window = mf->base.cur_window; + const u32 cur_pos = mf->base.cur_window_pos++; + const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; + u32 min_len; + const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len); + const u32 nice_len = min(max_len, mf->base.params.nice_match_len); u32 hash; u32 cur_match; - u32 min_len; - u32 num_matches = 0; + struct lz_match *lz_matchptr = matches; - if (unlikely(bytes_remaining <= mf->base.params.min_match_len + 1)) - goto out; + if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) + return 0; if (mf->digram_tab) { /* Search the digram table for a length 2 match. */ - const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base); + const u16 digram = mf->next_digram; + mf->next_digram = *(const u16 *)(&window[cur_pos + 1]); + prefetch(&mf->digram_tab[mf->next_digram]); cur_match = mf->digram_tab[digram]; - mf->digram_tab[digram] = mf->base.cur_window_pos; + mf->digram_tab[digram] = cur_pos; /* We're only interested in matches of length exactly 2, since * those won't be found during the binary tree search. @@ -476,13 +450,10 @@ lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) * search. However I found this actually *reduced* performance * slightly, evidently because the binary tree still needs to be * searched/updated starting from the root in either case. */ - if (cur_match != 0 && - (mf->base.cur_window[cur_match + 2] != - mf->base.cur_window[mf->base.cur_window_pos + 2])) - { - matches[num_matches++] = (struct lz_match) { + if (cur_match != 0 && window[cur_match + 2] != window[cur_pos + 2]) { + *lz_matchptr++ = (struct lz_match) { .len = 2, - .offset = mf->base.cur_window_pos - cur_match, + .offset = cur_pos - cur_match, }; } min_len = 3; @@ -491,142 +462,134 @@ lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[]) } hash = mf->next_hash; - mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1); + mf->next_hash = lz_bt_hash(&window[cur_pos + 1]); prefetch(&mf->hash_tab[mf->next_hash]); cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = mf->base.cur_window_pos; + mf->hash_tab[hash] = cur_pos; /* Search the binary tree of 'hash' for matches while re-rooting it at * the current position. */ - num_matches += do_search(mf->base.cur_window, - mf->base.cur_window_pos, - mf->child_tab, - cur_match, - min_len, - min(bytes_remaining, mf->base.params.nice_match_len), - mf->base.params.max_search_depth, - &matches[num_matches]); - - /* If the longest match is @nice_match_len in length, it may have been - * truncated. Try extending it up to the maximum match length. */ - if (num_matches != 0 && - matches[num_matches - 1].len == mf->base.params.nice_match_len) - { - const u8 * const strptr = lz_mf_get_window_ptr(&mf->base); - const u8 * const matchptr = strptr - matches[num_matches - 1].offset; - const u32 len_limit = min(bytes_remaining, mf->base.params.max_match_len); - u32 len; - - len = matches[num_matches - 1].len; - while (len < len_limit && strptr[len] == matchptr[len]) - len++; - matches[num_matches - 1].len = len; - } - -out: - /* Advance to the next position. */ - mf->base.cur_window_pos++; + lz_matchptr = do_search(window, + cur_pos, + mf->child_tab, + cur_match, + min_len, + nice_len, + max_len, + mf->base.params.max_search_depth, + lz_matchptr); /* Return the number of matches found. */ - return num_matches; + return lz_matchptr - matches; } -/* This is the same as do_search(), but it does not save any matches. +/* This is very similar to do_search(), but it does not save any matches. * See do_search() for explanatory comments. */ static void do_skip(const u8 window[restrict], - const u32 cur_window_pos, + const u32 cur_pos, u32 child_tab[restrict], u32 cur_match, - const u32 max_len, + const u32 nice_len, const u32 max_search_depth) { - u32 longest_lt_match_len = 0; - u32 longest_gt_match_len = 0; - u32 *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0]; - u32 *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1]; - const u8 * const strptr = &window[cur_window_pos]; + u32 best_lt_len = 0; + u32 best_gt_len = 0; + u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0]; + u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1]; + const u8 * const strptr = &window[cur_pos]; u32 depth_remaining = max_search_depth; for (;;) { const u8 *matchptr; u32 len; - if (depth_remaining-- == 0 || cur_match == 0) { + if (cur_match == 0 || depth_remaining-- == 0) { *pending_lt_ptr = 0; *pending_gt_ptr = 0; return; } matchptr = &window[cur_match]; - len = min(longest_lt_match_len, longest_gt_match_len); + len = min(best_lt_len, best_gt_len); if (matchptr[len] == strptr[len]) { - do { - if (++len == max_len) { - *pending_lt_ptr = child_tab[cur_match * 2 + 0]; - *pending_gt_ptr = child_tab[cur_match * 2 + 1]; - return; - } - } while (matchptr[len] == strptr[len]); + len = lz_extend(strptr, matchptr, len + 1, nice_len); + if (len == nice_len) { + *pending_lt_ptr = child_tab[cur_match * 2 + 0]; + *pending_gt_ptr = child_tab[cur_match * 2 + 1]; + return; + } } if (matchptr[len] < strptr[len]) { *pending_lt_ptr = cur_match; pending_lt_ptr = &child_tab[cur_match * 2 + 1]; cur_match = *pending_lt_ptr; - longest_lt_match_len = len; + best_lt_len = len; } else { *pending_gt_ptr = cur_match; pending_gt_ptr = &child_tab[cur_match * 2 + 0]; cur_match = *pending_gt_ptr; - longest_gt_match_len = len; + best_gt_len = len; } } } static void -lz_bt_skip_position(struct lz_bt *mf) +lz_bt_skip_positions(struct lz_mf *_mf, u32 n) { - const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base); + struct lz_bt *mf = (struct lz_bt *)_mf; + const u8 * const window = mf->base.cur_window; + u32 cur_pos = mf->base.cur_window_pos; + u32 end_pos = cur_pos + n; + u32 bytes_remaining = mf->base.cur_window_size - cur_pos; u32 hash; + u32 next_hash; u32 cur_match; + u16 digram; + u16 next_digram; - if (unlikely(bytes_remaining <= mf->base.params.min_match_len + 1)) - goto out; + mf->base.cur_window_pos = end_pos; - /* Update the digram table. */ - if (mf->digram_tab) { - const u16 digram = *(const u16 *)lz_mf_get_window_ptr(&mf->base); - mf->digram_tab[digram] = mf->base.cur_window_pos; + if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) { + /* Nearing end of window. */ + if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1))) + return; + + end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1; } - /* Update the hash table. */ - hash = mf->next_hash; - mf->next_hash = lz_bt_hash(lz_mf_get_window_ptr(&mf->base) + 1); - prefetch(&mf->hash_tab[mf->next_hash]); - cur_match = mf->hash_tab[hash]; - mf->hash_tab[hash] = mf->base.cur_window_pos; - - /* Update the binary tree for the appropriate hash code. */ - do_skip(mf->base.cur_window, - mf->base.cur_window_pos, - mf->child_tab, - cur_match, - min(bytes_remaining, mf->base.params.nice_match_len), - mf->base.params.max_search_depth); - -out: - /* Advance to the next position. */ - mf->base.cur_window_pos++; -} + next_hash = mf->next_hash; + next_digram = mf->next_digram; + do { + if (mf->digram_tab) { + digram = next_digram; + next_digram = *(const u16 *)(&window[cur_pos + 1]); + mf->digram_tab[digram] = cur_pos; + } -static void -lz_bt_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_bt *mf = (struct lz_bt *)_mf; + hash = next_hash; + next_hash = lz_bt_hash(&window[cur_pos + 1]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = cur_pos; - do { - lz_bt_skip_position(mf); - } while (--n); + /* Update the binary tree for the appropriate hash code. */ + do_skip(window, + cur_pos, + mf->child_tab, + cur_match, + min(bytes_remaining, mf->base.params.nice_match_len), + mf->base.params.max_search_depth); + + bytes_remaining--; + } while (++cur_pos != end_pos); + + if (mf->digram_tab) { + prefetch(&mf->digram_tab[next_digram]); + mf->next_digram = next_digram; + } + + prefetch(&mf->hash_tab[next_hash]); + mf->next_hash = next_hash; } static void diff --git a/src/lz_hash_chains.c b/src/lz_hash_chains.c index 46ea8130..2e6d6543 100644 --- a/src/lz_hash_chains.c +++ b/src/lz_hash_chains.c @@ -33,64 +33,35 @@ # include "config.h" #endif +#include "wimlib/lz_extend.h" +#include "wimlib/lz_hash3.h" #include "wimlib/lz_mf.h" #include "wimlib/util.h" -#include + #include -/* Number of hash buckets. This can be changed, but should be a power of 2 so - * that the correct hash bucket can be selected using a fast bitwise AND. */ -#define LZ_HC_HASH_LEN (1 << 15) +/* log2 of the number of buckets in the hash table. This can be changed. */ +#define LZ_HC_HASH_ORDER 15 -/* Number of bytes from which the hash code is computed at each position. This - * can be changed, provided that lz_hc_hash() is updated as well. */ -#define LZ_HC_HASH_BYTES 3 +#define LZ_HC_HASH_LEN (1 << LZ_HC_HASH_ORDER) struct lz_hc { struct lz_mf base; - u32 *hash_tab; - u32 *prev_tab; + u32 *hash_tab; /* followed by 'prev_tab' in memory */ u32 next_hash; }; -static u32 crc32_table[256]; -static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT; - -static void -crc32_init(void) -{ - for (u32 b = 0; b < 256; b++) { - u32 r = b; - for (int i = 0; i < 8; i++) { - if (r & 1) - r = (r >> 1) ^ 0xEDB88320; - else - r >>= 1; - } - crc32_table[b] = r; - } -} - -/* This hash function is taken from the LZMA SDK. It seems to work well. - * - * TODO: Maybe use the SSE4.2 CRC32 instruction when available? */ static inline u32 lz_hc_hash(const u8 *p) { - u32 hash = 0; - - hash ^= crc32_table[p[0]]; - hash ^= p[1]; - hash ^= (u32)p[2] << 8; - - return hash % LZ_HC_HASH_LEN; + return lz_hash(p, LZ_HC_HASH_ORDER); } static void lz_hc_set_default_params(struct lz_mf_params *params) { - if (params->min_match_len < LZ_HC_HASH_BYTES) - params->min_match_len = LZ_HC_HASH_BYTES; + if (params->min_match_len < LZ_HASH_NBYTES) + params->min_match_len = LZ_HASH_NBYTES; if (params->max_match_len == 0) params->max_match_len = params->max_window_size; @@ -115,7 +86,6 @@ lz_hc_params_valid(const struct lz_mf_params *_params) lz_hc_set_default_params(¶ms); - /* Avoid edge case where min_match_len = 3, max_match_len = 2 */ return (params.min_match_len <= params.max_match_len); } @@ -137,17 +107,10 @@ lz_hc_init(struct lz_mf *_mf) lz_hc_set_default_params(&mf->base.params); - /* Allocate space for 'hash_tab' and 'prev_tab'. */ - mf->hash_tab = MALLOC(lz_hc_get_needed_memory(mf->base.params.max_window_size)); if (!mf->hash_tab) return false; - mf->prev_tab = mf->hash_tab + LZ_HC_HASH_LEN; - - /* Fill in the CRC32 table if not done already. */ - pthread_once(&crc32_table_filled, crc32_init); - return true; } @@ -157,9 +120,6 @@ lz_hc_load_window(struct lz_mf *_mf, const u8 window[], u32 size) struct lz_hc *mf = (struct lz_hc *)_mf; memset(mf->hash_tab, 0, LZ_HC_HASH_LEN * sizeof(u32)); - - if (size >= LZ_HC_HASH_BYTES) - mf->next_hash = lz_hc_hash(window); } static u32 @@ -167,19 +127,20 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) { struct lz_hc *mf = (struct lz_hc *)_mf; const u8 * const window = mf->base.cur_window; - const u32 cur_pos = mf->base.cur_window_pos; + const u32 cur_pos = mf->base.cur_window_pos++; const u8 * const strptr = &window[cur_pos]; const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; - u32 * const prev_tab = mf->prev_tab; - const u32 nice_len = min(bytes_remaining, mf->base.params.nice_match_len); + u32 * const prev_tab = mf->hash_tab + LZ_HC_HASH_LEN; + const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len); + const u32 nice_len = min(max_len, mf->base.params.nice_match_len); u32 best_len = mf->base.params.min_match_len - 1; u32 depth_remaining = mf->base.params.max_search_depth; - u32 num_matches = 0; + struct lz_match *lz_matchptr = matches; u32 hash; u32 cur_match; - if (unlikely(bytes_remaining <= mf->base.params.min_match_len)) - goto out; + if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1)) + return 0; /* Insert the current position into the appropriate hash chain and set * 'cur_match' to the previous head. @@ -194,96 +155,119 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[]) mf->hash_tab[hash] = cur_pos; prev_tab[cur_pos] = cur_match; + /* Ensure we can find a match of at least the requested length. */ + if (unlikely(best_len >= max_len)) + return 0; + + /* Search the appropriate hash chain for matches. */ for (; cur_match && depth_remaining--; cur_match = prev_tab[cur_match]) { const u8 * const matchptr = &window[cur_match]; u32 len; - /* Considering a match at 'matchptr'. */ - - /* The bytes at index 'best_len' are the most likely to differ, - * so check them first. + /* Considering the potential match at 'matchptr': is it longer + * than 'best_len'? * - * The bytes at indices 'best_len - 1' and '0' are less + * The bytes at index 'best_len' are the most likely to differ, + * so check them first. */ + if (matchptr[best_len] != strptr[best_len]) + goto next_match; + + #if HAVE_FAST_LZ_EXTEND + if ((*(const u32 *)strptr & 0xFFFFFF) != + (*(const u32 *)matchptr & 0xFFFFFF)) + goto next_match; + + len = lz_extend(strptr, matchptr, 3, max_len); + + if (len > best_len) { + best_len = len; + + *lz_matchptr++ = (struct lz_match) { + .len = best_len, + .offset = strptr - matchptr, + }; + + if (best_len >= nice_len) + break; + } + + #else /* HAVE_FAST_LZ_EXTEND */ + + /* The bytes at indices 'best_len - 1' and '0' are less * important to check separately. But doing so still gives a - * slight performance improvement, probably because they create - * separate branches for the CPU to predict independently of the - * branches in the main comparison loops. */ - if (matchptr[best_len] != strptr[best_len] || - matchptr[best_len - 1] != strptr[best_len - 1] || - matchptr[0] != strptr[0]) + * slight performance improvement, at least on x86_64, probably + * because they create separate branches for the CPU to predict + * independently of the branches in the main comparison loops. + */ + if (matchptr[best_len - 1] != strptr[best_len - 1] || + matchptr[0] != strptr[0]) goto next_match; for (len = 1; len < best_len - 1; len++) if (matchptr[len] != strptr[len]) goto next_match; - /* We now know the match length is at least 'best_len + 1'. */ - - len = best_len; - - do { - if (++len == nice_len) { - /* 'nice_len' reached; don't waste time - * searching for longer matches. Extend the - * match as far as possible, record it, and - * return. */ - const u32 max_len = min(bytes_remaining, - mf->base.params.max_match_len); - while (len < max_len && strptr[len] == matchptr[len]) - len++; - matches[num_matches++] = (struct lz_match) { - .len = len, - .offset = strptr - matchptr, - }; - goto out; - } - } while (matchptr[len] == strptr[len]); - - /* Found a longer match, but 'nice_len' not yet reached. */ - best_len = len; - matches[num_matches++] = (struct lz_match) { - .len = len, + /* The match is the longest found so far --- + * at least 'best_len' + 1 bytes. Continue extending it. */ + + if (++best_len != max_len && strptr[best_len] == matchptr[best_len]) + while (++best_len != max_len) + if (strptr[best_len] != matchptr[best_len]) + break; + + /* Record the match. */ + *lz_matchptr++ = (struct lz_match) { + .len = best_len, .offset = strptr - matchptr, }; + /* Terminate the search if 'nice_len' was reached. */ + if (best_len >= nice_len) + break; + #endif /* !HAVE_FAST_LZ_EXTEND */ + next_match: /* Continue to next match in the chain. */ ; } -out: - mf->base.cur_window_pos++; - return num_matches; + return lz_matchptr - matches; } static void -lz_hc_skip_position(struct lz_hc *mf) +lz_hc_skip_positions(struct lz_mf *_mf, u32 n) { - const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base); + struct lz_hc *mf = (struct lz_hc *)_mf; + u32 * const hash_tab = mf->hash_tab; + u32 * const prev_tab = hash_tab + LZ_HC_HASH_LEN; + const u8 * const window = mf->base.cur_window; + u32 cur_pos = mf->base.cur_window_pos; + u32 end_pos = cur_pos + n; + const u32 bytes_remaining = mf->base.cur_window_size - cur_pos; u32 hash; + u32 next_hash; - if (unlikely(bytes_remaining <= mf->base.params.min_match_len)) - goto out; + mf->base.cur_window_pos = end_pos; - hash = mf->next_hash; - mf->next_hash = lz_hc_hash(lz_mf_get_window_ptr(&mf->base) + 1); - prefetch(&mf->hash_tab[mf->next_hash]); - mf->prev_tab[mf->base.cur_window_pos] = mf->hash_tab[hash]; - mf->hash_tab[hash] = mf->base.cur_window_pos; - -out: - mf->base.cur_window_pos++; -} + if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) { + /* Nearing end of window. */ + if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1))) + return; -static void -lz_hc_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_hc *mf = (struct lz_hc *)_mf; + end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1; + } + next_hash = mf->next_hash; do { - lz_hc_skip_position(mf); - } while (--n); + hash = next_hash; + next_hash = lz_hc_hash(&window[cur_pos + 1]); + prev_tab[cur_pos] = hash_tab[hash]; + hash_tab[hash] = cur_pos; + } while (++cur_pos != end_pos); + + prefetch(&hash_tab[next_hash]); + mf->next_hash = next_hash; } static void -- 2.43.0