From: Eric Biggers Date: Sat, 19 Sep 2015 18:56:09 +0000 (-0500) Subject: bt_matchfinder optimizations X-Git-Tag: v1.8.3~92 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=d4d24ae7d5595b98ee7cdef0bf58a1ddbb5c707c bt_matchfinder optimizations --- diff --git a/include/wimlib/bt_matchfinder.h b/include/wimlib/bt_matchfinder.h index 459fedd3..70e4f143 100644 --- a/include/wimlib/bt_matchfinder.h +++ b/include/wimlib/bt_matchfinder.h @@ -11,8 +11,8 @@ * * This is a Binary Trees (bt) based matchfinder. * - * The data structure is a hash table where each hash bucket contains a binary - * tree of sequences whose first 3 bytes share the same hash code. Each + * The main data structure is a hash table where each hash bucket contains a + * binary tree of sequences whose first 3 bytes share the same hash code. Each * sequence is identified by its starting position in the input buffer. Each * binary tree is always sorted such that each left child represents a sequence * lexicographically lesser than its parent and each right child represents a @@ -51,7 +51,7 @@ #include "wimlib/lz_extend.h" #include "wimlib/lz_hash.h" -#define BT_MATCHFINDER_HASH_ORDER 16 +#define BT_MATCHFINDER_HASH3_ORDER 16 /* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */ #undef TEMPLATED @@ -72,16 +72,22 @@ struct lz_match { u32 offset; }; -static inline u32 -bt_matchfinder_hash_3_bytes(const u8 *in_next) -{ - return lz_hash_3_bytes(in_next, BT_MATCHFINDER_HASH_ORDER); -} - #endif /* _WIMLIB_BT_MATCHFINDER_H */ struct TEMPLATED(bt_matchfinder) { - pos_t hash_tab[1UL << BT_MATCHFINDER_HASH_ORDER]; + + /* The hash table for finding length 2 matches, if enabled */ +#ifdef BT_MATCHFINDER_HASH2_ORDER + pos_t hash2_tab[1UL << BT_MATCHFINDER_HASH2_ORDER]; +#endif + + /* The hash table which contains the roots of the binary trees for + * finding length 3 matches */ + pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER]; + + /* The child node references for the binary trees. The left and right + * children of the node for the sequence with position 'pos' are + * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ pos_t child_tab[]; }; @@ -102,97 +108,74 @@ TEMPLATED(bt_matchfinder_init)(struct TEMPLATED(bt_matchfinder) *mf) } static inline pos_t * -TEMPLATED(bt_child)(struct TEMPLATED(bt_matchfinder) *mf, pos_t node, int offset) +TEMPLATED(bt_left_child)(struct TEMPLATED(bt_matchfinder) *mf, u32 node) { - return &mf->child_tab[(node << 1) + offset]; + return &mf->child_tab[(node << 1) + 0]; } static inline pos_t * -TEMPLATED(bt_left_child)(struct TEMPLATED(bt_matchfinder) *mf, pos_t node) +TEMPLATED(bt_right_child)(struct TEMPLATED(bt_matchfinder) *mf, u32 node) { - return TEMPLATED(bt_child)(mf, node, 0); + return &mf->child_tab[(node << 1) + 1]; } -static inline pos_t * -TEMPLATED(bt_right_child)(struct TEMPLATED(bt_matchfinder) *mf, pos_t node) -{ - return TEMPLATED(bt_child)(mf, node, 1); -} - -/* - * Retrieve a list of matches with the current position. - * - * @mf - * The matchfinder structure. - * @in_begin - * Pointer to the beginning of the input buffer. - * @in_next - * Pointer to the next byte in the input buffer to process. This is the - * pointer to the sequence being matched against. - * @min_len - * Only record matches that are at least this long. - * @max_len - * The maximum permissible match length at this position. - * @nice_len - * Stop searching if a match of at least this length is found. - * Must be <= @max_len. - * @max_search_depth - * Limit on the number of potential matches to consider. Must be >= 1. - * @next_hash - * Pointer to the hash code for the current sequence, which was computed - * one position in advance so that the binary tree root could be - * prefetched. This is an input/output parameter. - * @best_len_ret - * The length of the longest match found is written here. (This is - * actually redundant with the 'struct lz_match' array, but this is easier - * for the compiler to optimize when inlined and the caller immediately - * does a check against 'best_len'.) - * @lz_matchptr - * An array in which this function will record the matches. The recorded - * matches will be sorted by strictly increasing length and strictly - * increasing offset. The maximum number of matches that may be found is - * 'min(nice_len, max_len) - 3 + 1'. - * - * The return value is a pointer to the next available slot in the @lz_matchptr - * array. (If no matches were found, this will be the same as @lz_matchptr.) - */ +/* Advance the binary tree matchfinder by one byte, optionally recording + * matches. @record_matches should be a compile-time constant. */ static inline struct lz_match * -TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) * const restrict mf, - const u8 * const in_begin, - const u8 * const in_next, - const unsigned min_len, - const unsigned max_len, - const unsigned nice_len, - const unsigned max_search_depth, - u32 * restrict next_hash, - unsigned * restrict best_len_ret, - struct lz_match * restrict lz_matchptr) +TEMPLATED(bt_matchfinder_advance_one_byte)(struct TEMPLATED(bt_matchfinder) * const restrict mf, + const u8 * const restrict in_begin, + const ptrdiff_t cur_pos, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const restrict next_hash, + u32 * const restrict best_len_ret, + struct lz_match * restrict lz_matchptr, + const bool record_matches) { - unsigned depth_remaining = max_search_depth; - u32 hash; - pos_t cur_node; + const u8 *in_next = in_begin + cur_pos; + u32 depth_remaining = max_search_depth; +#ifdef BT_MATCHFINDER_HASH2_ORDER + u16 seq2; + u32 hash2; +#endif + u32 hash3; + u32 cur_node; const u8 *matchptr; pos_t *pending_lt_ptr, *pending_gt_ptr; - unsigned best_lt_len, best_gt_len; - unsigned len; - unsigned best_len = min_len - 1; + u32 best_lt_len, best_gt_len; + u32 len; + u32 best_len = 2; if (unlikely(max_len < LZ_HASH3_REQUIRED_NBYTES + 1)) { *best_len_ret = best_len; return lz_matchptr; } - hash = *next_hash; - *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); - cur_node = mf->hash_tab[hash]; - mf->hash_tab[hash] = in_next - in_begin; - prefetchw(&mf->hash_tab[*next_hash]); + hash3 = *next_hash; + *next_hash = lz_hash(load_u24_unaligned(in_next + 1), BT_MATCHFINDER_HASH3_ORDER); + prefetchw(&mf->hash3_tab[*next_hash]); + +#ifdef BT_MATCHFINDER_HASH2_ORDER + seq2 = load_u16_unaligned(in_next); + hash2 = lz_hash(seq2, BT_MATCHFINDER_HASH2_ORDER); + cur_node = mf->hash2_tab[hash2]; + mf->hash2_tab[hash2] = cur_pos; + if (record_matches && + seq2 == load_u16_unaligned(&in_begin[cur_node]) && + likely(in_next != in_begin)) + { + lz_matchptr->length = 2; + lz_matchptr->offset = in_next - &in_begin[cur_node]; + lz_matchptr++; + } +#endif - pending_lt_ptr = TEMPLATED(bt_left_child)(mf, in_next - in_begin); - pending_gt_ptr = TEMPLATED(bt_right_child)(mf, in_next - in_begin); - best_lt_len = 0; - best_gt_len = 0; - len = 0; + cur_node = mf->hash3_tab[hash3]; + mf->hash3_tab[hash3] = cur_pos; + + pending_lt_ptr = TEMPLATED(bt_left_child)(mf, cur_pos); + pending_gt_ptr = TEMPLATED(bt_right_child)(mf, cur_pos); if (!cur_node) { *pending_lt_ptr = 0; @@ -201,16 +184,23 @@ TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) * const r return lz_matchptr; } + best_lt_len = 0; + best_gt_len = 0; + len = 0; + for (;;) { matchptr = &in_begin[cur_node]; if (matchptr[len] == in_next[len]) { - len = lz_extend(in_next, matchptr, len + 1, max_len); - if (len > best_len) { - best_len = len; - lz_matchptr->length = len; - lz_matchptr->offset = in_next - matchptr; - lz_matchptr++; + len = lz_extend(in_next, matchptr, len + 1, + (record_matches ? max_len : nice_len)); + if (!record_matches || len > best_len) { + if (record_matches) { + best_len = len; + lz_matchptr->length = len; + lz_matchptr->offset = in_next - matchptr; + lz_matchptr++; + } if (len >= nice_len) { *pending_lt_ptr = *TEMPLATED(bt_left_child)(mf, cur_node); *pending_gt_ptr = *TEMPLATED(bt_right_child)(mf, cur_node); @@ -245,6 +235,66 @@ TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) * const r } } +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @cur_pos + * The current position in the input buffer (the position of the sequence + * being matched against). + * @max_len + * The maximum permissible match length at this position. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hash + * Pointer to the hash code for the current sequence, which was computed + * one position in advance so that the binary tree root could be + * prefetched. This is an input/output parameter. + * @best_len_ret + * If a match of length >= 3 was found, then the length of the longest such + * match is written here; otherwise 2 is written here. (Note: this is + * redundant with the 'struct lz_match' array, but this is easier for the + * compiler to optimize when inlined and the caller immediately does a + * check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and increasing + * offset. The maximum number of matches that may be found is + * 'min(nice_len, max_len) - 2 + 1', or one less if length 2 matches are + * disabled. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static inline struct lz_match * +TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) *mf, + const u8 *in_begin, + ptrdiff_t cur_pos, + u32 max_len, + u32 nice_len, + u32 max_search_depth, + u32 *next_hash, + u32 *best_len_ret, + struct lz_match *lz_matchptr) +{ + return TEMPLATED(bt_matchfinder_advance_one_byte)(mf, + in_begin, + cur_pos, + max_len, + nice_len, + max_search_depth, + next_hash, + best_len_ret, + lz_matchptr, + true); +} + /* * Advance the matchfinder, but don't record any matches. * @@ -252,10 +302,10 @@ TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) * const r * The matchfinder structure. * @in_begin * Pointer to the beginning of the input buffer. - * @in_next - * Pointer to the next byte in the input buffer to process. - * @in_end - * Pointer to the end of the input buffer. + * @cur_pos + * The current position in the input buffer. + * @max_len + * The maximum permissible match length at this position. * @nice_len * Stop searching if a match of at least this length is found. * @max_search_depth @@ -270,76 +320,23 @@ TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) * const r * actually record any matches. */ static inline void -TEMPLATED(bt_matchfinder_skip_position)(struct TEMPLATED(bt_matchfinder) * const restrict mf, - const u8 * const in_begin, - const u8 * const in_next, - const u8 * const in_end, - const unsigned nice_len, - const unsigned max_search_depth, - u32 * restrict next_hash) +TEMPLATED(bt_matchfinder_skip_position)(struct TEMPLATED(bt_matchfinder) *mf, + const u8 *in_begin, + ptrdiff_t cur_pos, + u32 max_len, + u32 nice_len, + u32 max_search_depth, + u32 *next_hash) { - unsigned depth_remaining = max_search_depth; - u32 hash; - pos_t cur_node; - const u8 *matchptr; - pos_t *pending_lt_ptr, *pending_gt_ptr; - unsigned best_lt_len, best_gt_len; - unsigned len; - - if (unlikely(in_end - in_next < LZ_HASH3_REQUIRED_NBYTES + 1)) - return; - - hash = *next_hash; - *next_hash = bt_matchfinder_hash_3_bytes(in_next + 1); - cur_node = mf->hash_tab[hash]; - mf->hash_tab[hash] = in_next - in_begin; - prefetchw(&mf->hash_tab[*next_hash]); - - depth_remaining = max_search_depth; - pending_lt_ptr = TEMPLATED(bt_left_child)(mf, in_next - in_begin); - pending_gt_ptr = TEMPLATED(bt_right_child)(mf, in_next - in_begin); - best_lt_len = 0; - best_gt_len = 0; - len = 0; - - if (!cur_node) { - *pending_lt_ptr = 0; - *pending_gt_ptr = 0; - return; - } - - for (;;) { - matchptr = &in_begin[cur_node]; - - if (matchptr[len] == in_next[len]) { - len = lz_extend(in_next, matchptr, len + 1, nice_len); - if (len == nice_len) { - *pending_lt_ptr = *TEMPLATED(bt_left_child)(mf, cur_node); - *pending_gt_ptr = *TEMPLATED(bt_right_child)(mf, cur_node); - return; - } - } - - if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_node; - pending_lt_ptr = TEMPLATED(bt_right_child)(mf, cur_node); - cur_node = *pending_lt_ptr; - best_lt_len = len; - if (best_gt_len < len) - len = best_gt_len; - } else { - *pending_gt_ptr = cur_node; - pending_gt_ptr = TEMPLATED(bt_left_child)(mf, cur_node); - cur_node = *pending_gt_ptr; - best_gt_len = len; - if (best_lt_len < len) - len = best_lt_len; - } - - if (!cur_node || !--depth_remaining) { - *pending_lt_ptr = 0; - *pending_gt_ptr = 0; - return; - } - } + u32 best_len; + TEMPLATED(bt_matchfinder_advance_one_byte)(mf, + in_begin, + cur_pos, + max_len, + nice_len, + max_search_depth, + next_hash, + &best_len, + NULL, + false); } diff --git a/src/lzx_compress.c b/src/lzx_compress.c index ac65c5dc..863bb5e5 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -126,13 +126,12 @@ #define LZX_MAX_FAST_LEVEL 34 /* - * LZX_HASH2_ORDER is the log base 2 of the number of entries in the hash table - * for finding length 2 matches. This can be as high as 16 (in which case the - * hash function is trivial), but using a smaller hash table speeds up - * compression due to reduced cache pressure. + * BT_MATCHFINDER_HASH2_ORDER is the log base 2 of the number of entries in the + * hash table for finding length 2 matches. This could be as high as 16, but + * using a smaller hash table speeds up compression due to reduced cache + * pressure. */ -#define LZX_HASH2_ORDER 12 -#define LZX_HASH2_LENGTH (1UL << LZX_HASH2_ORDER) +#define BT_MATCHFINDER_HASH2_ORDER 12 /* * These are the compressor-side limits on the codeword lengths for each Huffman @@ -485,9 +484,6 @@ struct lzx_compressor { LZX_MAX_MATCHES_PER_POS + LZX_MAX_MATCH_LEN - 1]; - /* Hash table for finding length 2 matches */ - u32 hash2_tab[LZX_HASH2_LENGTH]; - /* Binary trees matchfinder (MUST BE LAST!!!) */ union { struct bt_matchfinder_16 bt_mf_16; @@ -1787,12 +1783,10 @@ lzx_compress_near_optimal(struct lzx_compressor *c, const u8 * const in_end = in_begin + c->in_nbytes; unsigned max_len = LZX_MAX_MATCH_LEN; unsigned nice_len = min(c->nice_match_length, max_len); - u32 next_hash; + u32 next_hash = 0; struct lzx_lru_queue queue; CALL_BT_MF(is_16_bit, c, bt_matchfinder_init); - memset(c->hash2_tab, 0, sizeof(c->hash2_tab)); - next_hash = bt_matchfinder_hash_3_bytes(in_next); lzx_lru_queue_init(&queue); do { @@ -1805,8 +1799,6 @@ lzx_compress_near_optimal(struct lzx_compressor *c, struct lz_match *cache_ptr = c->match_cache; do { struct lz_match *lz_matchptr; - u32 hash2; - pos_t cur_match; unsigned best_len; /* If approaching the end of the input buffer, adjust @@ -1828,33 +1820,16 @@ lzx_compress_near_optimal(struct lzx_compressor *c, } } - lz_matchptr = cache_ptr + 1; - - /* Check for a length 2 match. */ - hash2 = lz_hash_2_bytes(in_next, LZX_HASH2_ORDER); - cur_match = c->hash2_tab[hash2]; - c->hash2_tab[hash2] = in_next - in_begin; - if (cur_match != 0 && - (LZX_HASH2_ORDER == 16 || - load_u16_unaligned(&in_begin[cur_match]) == - load_u16_unaligned(in_next))) - { - lz_matchptr->length = 2; - lz_matchptr->offset = in_next - &in_begin[cur_match]; - lz_matchptr++; - } - - /* Check for matches of length >= 3. */ + /* Check for matches. */ lz_matchptr = CALL_BT_MF(is_16_bit, c, bt_matchfinder_get_matches, in_begin, - in_next, - 3, + in_next - in_begin, max_len, nice_len, c->max_search_depth, &next_hash, &best_len, - lz_matchptr); + cache_ptr + 1); in_next++; cache_ptr->length = lz_matchptr - (cache_ptr + 1); cache_ptr = lz_matchptr; @@ -1884,12 +1859,10 @@ lzx_compress_near_optimal(struct lzx_compressor *c, continue; } } - c->hash2_tab[lz_hash_2_bytes(in_next, LZX_HASH2_ORDER)] = - in_next - in_begin; CALL_BT_MF(is_16_bit, c, bt_matchfinder_skip_position, in_begin, - in_next, - in_end, + in_next - in_begin, + max_len, nice_len, c->max_search_depth, &next_hash); diff --git a/src/xpress_compress.c b/src/xpress_compress.c index 7020a88b..734bfc02 100644 --- a/src/xpress_compress.c +++ b/src/xpress_compress.c @@ -908,10 +908,9 @@ xpress_find_matches(struct xpress_compressor * restrict c, const u8 *in_next = in_begin; const u8 * const in_end = in_begin + in_nbytes; struct lz_match *cache_ptr = c->match_cache; - u32 next_hash; + u32 next_hash = 0; bt_matchfinder_init(&c->bt_mf); - next_hash = bt_matchfinder_hash_3_bytes(in_next); do { struct lz_match *matches; @@ -937,8 +936,7 @@ xpress_find_matches(struct xpress_compressor * restrict c, cache_ptr = bt_matchfinder_get_matches(&c->bt_mf, in_begin, - in_next, - XPRESS_MIN_MATCH_LEN, + in_next - in_begin, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth, @@ -965,8 +963,8 @@ xpress_find_matches(struct xpress_compressor * restrict c, do { bt_matchfinder_skip_position(&c->bt_mf, in_begin, - in_next, - in_end, + in_next - in_begin, + in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth,