7 * The author dedicates this file to the public domain.
8 * You can do whatever you want with this file.
10 * ---------------------------------------------------------------------------
14 * This is a Hash Chains (hc) based matchfinder.
16 * The main data structure is a hash table where each hash bucket contains a
17 * linked list (or "chain") of sequences whose first 4 bytes share the same hash
18 * code. Each sequence is identified by its starting position in the input
21 * The algorithm processes the input buffer sequentially. At each byte
22 * position, the hash code of the first 4 bytes of the sequence beginning at
23 * that position (the sequence being matched against) is computed. This
24 * identifies the hash bucket to use for that position. Then, this hash
25 * bucket's linked list is searched for matches. Then, a new linked list node
26 * is created to represent the current sequence and is prepended to the list.
28 * This algorithm has several useful properties:
30 * - It only finds true Lempel-Ziv matches; i.e., those where the matching
31 * sequence occurs prior to the sequence being matched against.
33 * - The sequences in each linked list are always sorted by decreasing starting
34 * position. Therefore, the closest (smallest offset) matches are found
35 * first, which in many compression formats tend to be the cheapest to encode.
37 * - Although fast running time is not guaranteed due to the possibility of the
38 * lists getting very long, the worst degenerate behavior can be easily
39 * prevented by capping the number of nodes searched at each position.
41 * - If the compressor decides not to search for matches at a certain position,
42 * then that position can be quickly inserted without searching the list.
44 * - The algorithm is adaptable to sliding windows: just store the positions
45 * relative to a "base" value that is updated from time to time, and stop
46 * searching each list when the sequences get too far away.
48 * ---------------------------------------------------------------------------
52 * You must define MATCHFINDER_MAX_WINDOW_ORDER before including this header
53 * because that determines which integer type to use for positions. Since
54 * 16-bit integers are faster than 32-bit integers due to reduced memory usage
55 * (and therefore reduced cache pressure), the code only uses 32-bit integers if
56 * they are needed to represent all possible positions.
58 * The number of bytes that must be allocated for a given 'struct
59 * hc_matchfinder' must be gotten by calling hc_matchfinder_size().
61 * ----------------------------------------------------------------------------
65 * The main hash table and chains handle length 4+ matches. Length 3 matches
66 * are handled by a separate hash table with no chains. This works well for
67 * typical "greedy" or "lazy"-style compressors, where length 3 matches are
68 * often only helpful if they have small offsets. Instead of searching a full
69 * chain for length 3+ matches, the algorithm just checks for one close length 3
70 * match, then focuses on finding length 4+ matches.
72 * The longest_match() and skip_positions() functions are inlined into the
73 * compressors that use them. This isn't just about saving the overhead of a
74 * function call. These functions are intended to be called from the inner
75 * loops of compressors, where giving the compiler more control over register
76 * allocation is very helpful. There is also significant benefit to be gained
77 * from allowing the CPU to predict branches independently at each call site.
78 * For example, "lazy"-style compressors can be written with two calls to
79 * longest_match(), each of which starts with a different 'best_len' and
80 * therefore has significantly different performance characteristics.
82 * Although any hash function can be used, a multiplicative hash is fast and
85 * On some processors, it is significantly faster to extend matches by whole
86 * words (32 or 64 bits) instead of by individual bytes. For this to be the
87 * case, the processor must implement unaligned memory accesses efficiently and
88 * must have either a fast "find first set bit" instruction or a fast "find last
89 * set bit" instruction, depending on the processor's endianness.
91 * The code uses one loop for finding the first match and one loop for finding a
92 * longer match. Each of these loops is tuned for its respective task and in
93 * combination are faster than a single generalized loop that handles both
96 * The code also uses a tight inner loop that only compares the last and first
97 * bytes of a potential match. It is only when these bytes match that a full
98 * match extension is attempted.
100 * ----------------------------------------------------------------------------
103 #ifndef _HC_MATCHFINDER_H
104 #define _HC_MATCHFINDER_H
106 #ifndef MATCHFINDER_MAX_WINDOW_ORDER
107 # error "MATCHFINDER_MAX_WINDOW_ORDER must be defined!"
112 #include "wimlib/lz_extend.h"
113 #include "wimlib/lz_hash.h"
114 #include "wimlib/unaligned.h"
116 #if MATCHFINDER_MAX_WINDOW_ORDER <= 16
122 #define HC_MATCHFINDER_HASH3_ORDER 14
123 #define HC_MATCHFINDER_HASH4_ORDER 15
125 struct hc_matchfinder {
127 /* The hash table for finding length 3 matches */
128 pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
130 /* The hash table which contains the first nodes of the linked lists for
131 * finding length 4+ matches */
132 pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
134 /* The "next node" references for the linked lists. The "next node" of
135 * the node for the sequence with position 'pos' is 'next_tab[pos]'. */
139 /* Return the number of bytes that must be allocated for a 'hc_matchfinder' that
140 * can work with buffers up to the specified size. */
142 hc_matchfinder_size(size_t max_bufsize)
144 return sizeof(struct hc_matchfinder) + (max_bufsize * sizeof(pos_t));
147 /* Prepare the matchfinder for a new input buffer. */
149 hc_matchfinder_init(struct hc_matchfinder *mf)
151 memset(mf, 0, sizeof(*mf));
155 * Find the longest match longer than 'best_len' bytes.
158 * The matchfinder structure.
160 * Pointer to the beginning of the input buffer.
162 * The current position in the input buffer (the position of the sequence
163 * being matched against).
165 * Require a match longer than this length.
167 * The maximum permissible match length at this position.
169 * Stop searching if a match of at least this length is found.
170 * Must be <= @max_len.
172 * Limit on the number of potential matches to consider. Must be >= 1.
174 * The precomputed hash codes for the sequence beginning at @in_next.
175 * These will be used and then updated with the precomputed hashcodes for
176 * the sequence beginning at @in_next + 1.
178 * If a match is found, its offset is returned in this location.
180 * Return the length of the match found, or 'best_len' if no match longer than
181 * 'best_len' was found.
184 hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
185 const u8 * const restrict in_begin,
186 const ptrdiff_t cur_pos,
190 const u32 max_search_depth,
191 u32 next_hashes[const restrict static 2],
192 u32 * const restrict offset_ret)
194 const u8 *in_next = in_begin + cur_pos;
195 u32 depth_remaining = max_search_depth;
196 const u8 *best_matchptr = best_matchptr; /* uninitialized */
197 pos_t cur_node3, cur_node4;
199 u32 next_seq3, next_seq4;
204 if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
207 /* Get the precomputed hash codes. */
208 hash3 = next_hashes[0];
209 hash4 = next_hashes[1];
211 /* From the hash buckets, get the first node of each linked list. */
212 cur_node3 = mf->hash3_tab[hash3];
213 cur_node4 = mf->hash4_tab[hash4];
215 /* Update for length 3 matches. This replaces the singleton node in the
216 * 'hash3' bucket with the node for the current sequence. */
217 mf->hash3_tab[hash3] = cur_pos;
219 /* Update for length 4 matches. This prepends the node for the current
220 * sequence to the linked list in the 'hash4' bucket. */
221 mf->hash4_tab[hash4] = cur_pos;
222 mf->next_tab[cur_pos] = cur_node4;
224 /* Compute the next hash codes. */
225 next_seq4 = load_u32_unaligned(in_next + 1);
226 next_seq3 = loaded_u32_to_u24(next_seq4);
227 next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
228 next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
229 prefetchw(&mf->hash3_tab[next_hashes[0]]);
230 prefetchw(&mf->hash4_tab[next_hashes[1]]);
232 if (best_len < 4) { /* No match of length >= 4 found yet? */
234 /* Check for a length 3 match if needed. */
239 seq4 = load_u32_unaligned(in_next);
242 matchptr = &in_begin[cur_node3];
243 if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
245 best_matchptr = matchptr;
249 /* Check for a length 4 match. */
255 /* No length 4 match found yet. Check the first 4 bytes. */
256 matchptr = &in_begin[cur_node4];
258 if (load_u32_unaligned(matchptr) == seq4)
261 /* The first 4 bytes did not match. Keep trying. */
262 cur_node4 = mf->next_tab[cur_node4];
263 if (!cur_node4 || !--depth_remaining)
267 /* Found a match of length >= 4. Extend it to its full length. */
268 best_matchptr = matchptr;
269 best_len = lz_extend(in_next, best_matchptr, 4, max_len);
270 if (best_len >= nice_len)
272 cur_node4 = mf->next_tab[cur_node4];
273 if (!cur_node4 || !--depth_remaining)
276 if (!cur_node4 || best_len >= nice_len)
280 /* Check for matches of length >= 5. */
284 matchptr = &in_begin[cur_node4];
286 /* Already found a length 4 match. Try for a longer
287 * match; start by checking either the last 4 bytes and
288 * the first 4 bytes, or the last byte. (The last byte,
289 * the one which would extend the match length by 1, is
290 * the most important.) */
291 #if UNALIGNED_ACCESS_IS_FAST
292 if ((load_u32_unaligned(matchptr + best_len - 3) ==
293 load_u32_unaligned(in_next + best_len - 3)) &&
294 (load_u32_unaligned(matchptr) ==
295 load_u32_unaligned(in_next)))
297 if (matchptr[best_len] == in_next[best_len])
301 /* Continue to the next node in the list. */
302 cur_node4 = mf->next_tab[cur_node4];
303 if (!cur_node4 || !--depth_remaining)
307 #if UNALIGNED_ACCESS_IS_FAST
312 len = lz_extend(in_next, matchptr, len, max_len);
313 if (len > best_len) {
314 /* This is the new longest match. */
316 best_matchptr = matchptr;
317 if (best_len >= nice_len)
321 /* Continue to the next node in the list. */
322 cur_node4 = mf->next_tab[cur_node4];
323 if (!cur_node4 || !--depth_remaining)
327 *offset_ret = in_next - best_matchptr;
332 * Advance the matchfinder, but don't search for matches.
335 * The matchfinder structure.
337 * Pointer to the beginning of the input buffer.
339 * The current position in the input buffer (the position of the sequence
340 * being matched against).
342 * The length of the input buffer.
344 * The precomputed hash codes for the sequence beginning at @in_next.
345 * These will be used and then updated with the precomputed hashcodes for
346 * the sequence beginning at @in_next + @count.
348 * The number of bytes to advance. Must be > 0.
350 * Returns @in_next + @count.
352 static inline const u8 *
353 hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
354 const u8 * const restrict in_begin,
355 const ptrdiff_t cur_pos,
356 const ptrdiff_t end_pos,
358 u32 next_hashes[const restrict static 2])
360 const u8 *in_next = in_begin + cur_pos;
361 const u8 * const stop_ptr = in_next + count;
363 if (likely(count + 5 <= end_pos - cur_pos)) {
365 u32 next_seq3, next_seq4;
367 hash3 = next_hashes[0];
368 hash4 = next_hashes[1];
370 mf->hash3_tab[hash3] = in_next - in_begin;
371 mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4];
372 mf->hash4_tab[hash4] = in_next - in_begin;
374 next_seq4 = load_u32_unaligned(++in_next);
375 next_seq3 = loaded_u32_to_u24(next_seq4);
376 hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
377 hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
379 } while (in_next != stop_ptr);
381 prefetchw(&mf->hash3_tab[hash3]);
382 prefetchw(&mf->hash4_tab[hash4]);
383 next_hashes[0] = hash3;
384 next_hashes[1] = hash4;
390 #endif /* _HC_MATCHFINDER_H */