From: Eric Biggers Date: Sat, 19 Sep 2015 18:56:01 +0000 (-0500) Subject: hc_matchfinder optimizations X-Git-Tag: v1.8.3~97 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=64b6dba86d307632b4c6b36ea15b993b5bac760c hc_matchfinder optimizations --- diff --git a/include/wimlib/hc_matchfinder.h b/include/wimlib/hc_matchfinder.h index eb509d9a..8382a3cd 100644 --- a/include/wimlib/hc_matchfinder.h +++ b/include/wimlib/hc_matchfinder.h @@ -13,12 +13,13 @@ * * This is a Hash Chains (hc) based matchfinder. * - * The data structure is a hash table where each hash bucket contains a linked - * list (or "chain") of sequences whose first 3 bytes share the same hash code. - * Each sequence is identified by its starting position in the input buffer. + * The main data structure is a hash table where each hash bucket contains a + * linked list (or "chain") of sequences whose first 4 bytes share the same hash + * code. Each sequence is identified by its starting position in the input + * buffer. * * The algorithm processes the input buffer sequentially. At each byte - * position, the hash code of the first 3 bytes of the sequence beginning at + * position, the hash code of the first 4 bytes of the sequence beginning at * that position (the sequence being matched against) is computed. This * identifies the hash bucket to use for that position. Then, this hash * bucket's linked list is searched for matches. Then, a new linked list node @@ -61,6 +62,13 @@ * * Optimizations * + * The main hash table and chains handle length 4+ matches. Length 3 matches + * are handled by a separate hash table with no chains. This works well for + * typical "greedy" or "lazy"-style compressors, where length 3 matches are + * often only helpful if they have small offsets. Instead of searching a full + * chain for length 3+ matches, the algorithm just checks for one close length 3 + * match, then focuses on finding length 4+ matches. + * * The longest_match() and skip_positions() functions are inlined into the * compressors that use them. This isn't just about saving the overhead of a * function call. These functions are intended to be called from the inner @@ -105,20 +113,26 @@ #include "wimlib/lz_hash.h" #include "wimlib/unaligned.h" -#if MATCHFINDER_MAX_WINDOW_ORDER < 14 -# define HC_MATCHFINDER_HASH_ORDER 14 -#else -# define HC_MATCHFINDER_HASH_ORDER 15 -#endif - #if MATCHFINDER_MAX_WINDOW_ORDER <= 16 typedef u16 pos_t; #else typedef u32 pos_t; #endif +#define HC_MATCHFINDER_HASH3_ORDER 14 +#define HC_MATCHFINDER_HASH4_ORDER 15 + struct hc_matchfinder { - pos_t hash_tab[1UL << HC_MATCHFINDER_HASH_ORDER]; + + /* The hash table for finding length 3 matches */ + pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; + + /* The hash table which contains the first nodes of the linked lists for + * finding length 4+ matches */ + pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; + + /* The "next node" references for the linked lists. The "next node" of + * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ pos_t next_tab[]; }; @@ -144,9 +158,9 @@ hc_matchfinder_init(struct hc_matchfinder *mf) * The matchfinder structure. * @in_begin * Pointer to the beginning of the input buffer. - * @in_next - * Pointer to the next byte in the input buffer to process. This is the - * pointer to the sequence being matched against. + * @cur_pos + * The current position in the input buffer (the position of the sequence + * being matched against). * @best_len * Require a match longer than this length. * @max_len @@ -156,77 +170,120 @@ hc_matchfinder_init(struct hc_matchfinder *mf) * Must be <= @max_len. * @max_search_depth * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. * @offset_ret * If a match is found, its offset is returned in this location. * * Return the length of the match found, or 'best_len' if no match longer than * 'best_len' was found. */ -static inline unsigned +static inline u32 hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, - const u8 * const in_begin, - const u8 * const in_next, - unsigned best_len, - const unsigned max_len, - const unsigned nice_len, - const unsigned max_search_depth, - unsigned *offset_ret) + const u8 * const restrict in_begin, + const ptrdiff_t cur_pos, + u32 best_len, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 next_hashes[const restrict static 2], + u32 * const restrict offset_ret) { - unsigned depth_remaining = max_search_depth; + const u8 *in_next = in_begin + cur_pos; + u32 depth_remaining = max_search_depth; const u8 *best_matchptr = best_matchptr; /* uninitialized */ + pos_t cur_node3, cur_node4; + u32 hash3, hash4; + u32 next_seq3, next_seq4; + u32 seq4; const u8 *matchptr; - unsigned len; - u32 first_3_bytes; - u32 hash; - pos_t cur_node; + u32 len; - /* Insert the current sequence into the appropriate linked list. */ - if (unlikely(max_len < LOAD_U24_REQUIRED_NBYTES)) + if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ goto out; - first_3_bytes = load_u24_unaligned(in_next); - hash = lz_hash(first_3_bytes, HC_MATCHFINDER_HASH_ORDER); - cur_node = mf->hash_tab[hash]; - mf->next_tab[in_next - in_begin] = cur_node; - mf->hash_tab[hash] = in_next - in_begin; - if (unlikely(best_len >= max_len)) - goto out; + /* Get the precomputed hash codes. */ + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; - /* Search the appropriate linked list for matches. */ + /* From the hash buckets, get the first node of each linked list. */ + cur_node3 = mf->hash3_tab[hash3]; + cur_node4 = mf->hash4_tab[hash4]; - if (!cur_node) - goto out; + /* Update for length 3 matches. This replaces the singleton node in the + * 'hash3' bucket with the node for the current sequence. */ + mf->hash3_tab[hash3] = cur_pos; + + /* Update for length 4 matches. This prepends the node for the current + * sequence to the linked list in the 'hash4' bucket. */ + mf->hash4_tab[hash4] = cur_pos; + mf->next_tab[cur_pos] = cur_node4; + + /* Compute the next hash codes. */ + next_seq4 = load_u32_unaligned(in_next + 1); + next_seq3 = loaded_u32_to_u24(next_seq4); + next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + if (best_len < 4) { /* No match of length >= 4 found yet? */ + + /* Check for a length 3 match if needed. */ + + if (!cur_node3) + goto out; + + seq4 = load_u32_unaligned(in_next); + + if (best_len < 3) { + matchptr = &in_begin[cur_node3]; + if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { + best_len = 3; + best_matchptr = matchptr; + } + } + + /* Check for a length 4 match. */ + + if (!cur_node4) + goto out; - if (best_len < 3) { for (;;) { - /* No length 3 match found yet. - * Check the first 3 bytes. */ - matchptr = &in_begin[cur_node]; + /* No length 4 match found yet. Check the first 4 bytes. */ + matchptr = &in_begin[cur_node4]; - if (load_u24_unaligned(matchptr) == first_3_bytes) + if (load_u32_unaligned(matchptr) == seq4) break; - /* The first 3 bytes did not match. Keep trying. */ - cur_node = mf->next_tab[cur_node]; - if (!cur_node || !--depth_remaining) + /* The first 4 bytes did not match. Keep trying. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!cur_node4 || !--depth_remaining) goto out; } - /* Found a match of length >= 3. Extend it to its full length. */ + /* Found a match of length >= 4. Extend it to its full length. */ best_matchptr = matchptr; - best_len = lz_extend(in_next, best_matchptr, 3, max_len); + best_len = lz_extend(in_next, best_matchptr, 4, max_len); if (best_len >= nice_len) goto out; - cur_node = mf->next_tab[cur_node]; - if (!cur_node || !--depth_remaining) + cur_node4 = mf->next_tab[cur_node4]; + if (!cur_node4 || !--depth_remaining) + goto out; + } else { + if (!cur_node4 || best_len >= nice_len) goto out; } + /* Check for matches of length >= 5. */ + for (;;) { for (;;) { - matchptr = &in_begin[cur_node]; + matchptr = &in_begin[cur_node4]; - /* Already found a length 3 match. Try for a longer + /* Already found a length 4 match. Try for a longer * match; start by checking either the last 4 bytes and * the first 4 bytes, or the last byte. (The last byte, * the one which would extend the match length by 1, is @@ -241,8 +298,9 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, #endif break; - cur_node = mf->next_tab[cur_node]; - if (!cur_node || !--depth_remaining) + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!cur_node4 || !--depth_remaining) goto out; } @@ -253,13 +311,16 @@ hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, #endif len = lz_extend(in_next, matchptr, len, max_len); if (len > best_len) { + /* This is the new longest match. */ best_len = len; best_matchptr = matchptr; if (best_len >= nice_len) goto out; } - cur_node = mf->next_tab[cur_node]; - if (!cur_node || !--depth_remaining) + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!cur_node4 || !--depth_remaining) goto out; } out: @@ -274,31 +335,56 @@ out: * The matchfinder structure. * @in_begin * Pointer to the beginning of the input buffer. - * @in_next - * Pointer to the next byte in the input buffer to process. - * @in_end - * Pointer to the end of the input buffer. + * @cur_pos + * The current position in the input buffer (the position of the sequence + * being matched against). + * @end_pos + * The length of the input buffer. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + @count. * @count * The number of bytes to advance. Must be > 0. + * + * Returns @in_next + @count. */ -static inline void -hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf, - const u8 *in_begin, - const u8 *in_next, - const u8 *in_end, - unsigned count) +static inline const u8 * +hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf, + const u8 * const restrict in_begin, + const ptrdiff_t cur_pos, + const ptrdiff_t end_pos, + const u32 count, + u32 next_hashes[const restrict static 2]) { - u32 hash; + const u8 *in_next = in_begin + cur_pos; + const u8 * const stop_ptr = in_next + count; + + if (likely(count + 5 <= end_pos - cur_pos)) { + u32 hash3, hash4; + u32 next_seq3, next_seq4; - if (unlikely(in_next + count >= in_end - LZ_HASH3_REQUIRED_NBYTES)) - return; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + mf->hash3_tab[hash3] = in_next - in_begin; + mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = in_next - in_begin; + + next_seq4 = load_u32_unaligned(++in_next); + next_seq3 = loaded_u32_to_u24(next_seq4); + hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER); + + } while (in_next != stop_ptr); + + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; + } - do { - hash = lz_hash_3_bytes(in_next, HC_MATCHFINDER_HASH_ORDER); - mf->next_tab[in_next - in_begin] = mf->hash_tab[hash]; - mf->hash_tab[hash] = in_next - in_begin; - in_next++; - } while (--count); + return stop_ptr; } #endif /* _HC_MATCHFINDER_H */ diff --git a/src/lzx_compress.c b/src/lzx_compress.c index 5e1be485..fbc05988 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -1803,6 +1803,7 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) unsigned max_len = LZX_MAX_MATCH_LEN; unsigned nice_len = min(c->nice_match_length, max_len); struct lzx_lru_queue queue; + u32 next_hashes[2] = {}; hc_matchfinder_init(&c->hc_mf); lzx_lru_queue_init(&queue); @@ -1839,11 +1840,12 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) cur_len = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, 2, max_len, nice_len, c->max_search_depth, + next_hashes, &cur_offset); if (cur_len < 3 || (cur_len == 3 && @@ -1905,11 +1907,12 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) next_len = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, cur_len - 2, max_len, nice_len, c->max_search_depth / 2, + next_hashes, &next_offset); if (next_len <= cur_len - 2) { @@ -1973,9 +1976,10 @@ lzx_compress_lazy(struct lzx_compressor *c, struct lzx_output_bitstream *os) hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - skip_len); + in_next - in_begin, + in_end - in_begin, + skip_len, + next_hashes); in_next += skip_len; } while (in_next < in_block_end); diff --git a/src/xpress_compress.c b/src/xpress_compress.c index 2f3a35e9..5298af64 100644 --- a/src/xpress_compress.c +++ b/src/xpress_compress.c @@ -531,6 +531,7 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, const u8 * const in_end = in_begin + in_nbytes; struct xpress_item *next_chosen_item = c->chosen_items; unsigned len_3_too_far; + u32 next_hashes[2] = {}; if (in_nbytes <= 8192) len_3_too_far = 2048; @@ -545,11 +546,12 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, length = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, XPRESS_MIN_MATCH_LEN - 1, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth, + next_hashes, &offset); if (length >= XPRESS_MIN_MATCH_LEN && !(length == XPRESS_MIN_MATCH_LEN && offset >= len_3_too_far)) @@ -560,9 +562,10 @@ xpress_compress_greedy(struct xpress_compressor * restrict c, in_next += 1; hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - length - 1); + in_next - in_begin, + in_end - in_begin, + length - 1, + next_hashes); in_next += length - 1; } else { /* No match found */ @@ -591,6 +594,7 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, const u8 * const in_end = in_begin + in_nbytes; struct xpress_item *next_chosen_item = c->chosen_items; unsigned len_3_too_far; + u32 next_hashes[2] = {}; if (in_nbytes <= 8192) len_3_too_far = 2048; @@ -608,11 +612,12 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, /* Find the longest match at the current position. */ cur_len = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, XPRESS_MIN_MATCH_LEN - 1, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth, + next_hashes, &cur_offset); in_next += 1; @@ -637,9 +642,10 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - cur_len - 1); + in_next - in_begin, + in_end - in_begin, + cur_len - 1, + next_hashes); in_next += cur_len - 1; continue; } @@ -662,11 +668,12 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, */ next_len = hc_matchfinder_longest_match(&c->hc_mf, in_begin, - in_next, + in_next - in_begin, cur_len, in_end - in_next, min(in_end - in_next, c->nice_match_length), c->max_search_depth / 2, + next_hashes, &next_offset); in_next += 1; @@ -685,9 +692,10 @@ xpress_compress_lazy(struct xpress_compressor * restrict c, xpress_record_match(c, cur_len, cur_offset); hc_matchfinder_skip_positions(&c->hc_mf, in_begin, - in_next, - in_end, - cur_len - 2); + in_next - in_begin, + in_end - in_begin, + cur_len - 2, + next_hashes); in_next += cur_len - 2; continue; }