7 * The author dedicates this file to the public domain.
8 * You can do whatever you want with this file.
10 * ---------------------------------------------------------------------------
14 * This is a Hash Chains (hc) based matchfinder.
16 * The main data structure is a hash table where each hash bucket contains a
17 * linked list (or "chain") of sequences whose first 4 bytes share the same hash
18 * code. Each sequence is identified by its starting position in the input
21 * The algorithm processes the input buffer sequentially. At each byte
22 * position, the hash code of the first 4 bytes of the sequence beginning at
23 * that position (the sequence being matched against) is computed. This
24 * identifies the hash bucket to use for that position. Then, this hash
25 * bucket's linked list is searched for matches. Then, a new linked list node
26 * is created to represent the current sequence and is prepended to the list.
28 * This algorithm has several useful properties:
30 * - It only finds true Lempel-Ziv matches; i.e., those where the matching
31 * sequence occurs prior to the sequence being matched against.
33 * - The sequences in each linked list are always sorted by decreasing starting
34 * position. Therefore, the closest (smallest offset) matches are found
35 * first, which in many compression formats tend to be the cheapest to encode.
37 * - Although fast running time is not guaranteed due to the possibility of the
38 * lists getting very long, the worst degenerate behavior can be easily
39 * prevented by capping the number of nodes searched at each position.
41 * - If the compressor decides not to search for matches at a certain position,
42 * then that position can be quickly inserted without searching the list.
44 * - The algorithm is adaptable to sliding windows: just store the positions
45 * relative to a "base" value that is updated from time to time, and stop
46 * searching each list when the sequences get too far away.
48 * ---------------------------------------------------------------------------
52 * Before including this header, you must define 'pos_t' to an integer type that
53 * can represent all possible positions. This can be a 16-bit or 32-bit
54 * unsigned integer. When possible, the former should be used due to the
55 * reduced cache pressure. This header can be included multiple times in a
56 * single .c file with different 'pos_t' definitions; however, you must define a
57 * different MF_SUFFIX each time to generate different names for the matchfinder
58 * structure and functions.
60 * The number of bytes that must be allocated for a given 'struct
61 * hc_matchfinder' must be gotten by calling hc_matchfinder_size().
63 * ----------------------------------------------------------------------------
67 * The main hash table and chains handle length 4+ matches. Length 3 matches
68 * are handled by a separate hash table with no chains. This works well for
69 * typical "greedy" or "lazy"-style compressors, where length 3 matches are
70 * often only helpful if they have small offsets. Instead of searching a full
71 * chain for length 3+ matches, the algorithm just checks for one close length 3
72 * match, then focuses on finding length 4+ matches.
74 * The longest_match() and skip_positions() functions are inlined into the
75 * compressors that use them. This isn't just about saving the overhead of a
76 * function call. These functions are intended to be called from the inner
77 * loops of compressors, where giving the compiler more control over register
78 * allocation is very helpful. There is also significant benefit to be gained
79 * from allowing the CPU to predict branches independently at each call site.
80 * For example, "lazy"-style compressors can be written with two calls to
81 * longest_match(), each of which starts with a different 'best_len' and
82 * therefore has significantly different performance characteristics.
84 * Although any hash function can be used, a multiplicative hash is fast and
87 * On some processors, it is significantly faster to extend matches by whole
88 * words (32 or 64 bits) instead of by individual bytes. For this to be the
89 * case, the processor must implement unaligned memory accesses efficiently and
90 * must have either a fast "find first set bit" instruction or a fast "find last
91 * set bit" instruction, depending on the processor's endianness.
93 * The code uses one loop for finding the first match and one loop for finding a
94 * longer match. Each of these loops is tuned for its respective task and in
95 * combination are faster than a single generalized loop that handles both
98 * The code also uses a tight inner loop that only compares the last and first
99 * bytes of a potential match. It is only when these bytes match that a full
100 * match extension is attempted.
102 * ----------------------------------------------------------------------------
107 #include "wimlib/lz_extend.h"
108 #include "wimlib/lz_hash.h"
109 #include "wimlib/unaligned.h"
111 #define HC_MATCHFINDER_HASH3_ORDER 14
112 #define HC_MATCHFINDER_HASH4_ORDER 15
114 /* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */
116 #define TEMPLATED(name) CONCAT(name, MF_SUFFIX)
118 struct TEMPLATED(hc_matchfinder) {
120 /* The hash table for finding length 3 matches */
121 pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
123 /* The hash table which contains the first nodes of the linked lists for
124 * finding length 4+ matches */
125 pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
127 /* The "next node" references for the linked lists. The "next node" of
128 * the node for the sequence with position 'pos' is 'next_tab[pos]'. */
132 /* Return the number of bytes that must be allocated for a 'hc_matchfinder' that
133 * can work with buffers up to the specified size. */
135 TEMPLATED(hc_matchfinder_size)(size_t max_bufsize)
137 return sizeof(struct TEMPLATED(hc_matchfinder)) +
138 (max_bufsize * sizeof(pos_t));
141 /* Prepare the matchfinder for a new input buffer. */
143 TEMPLATED(hc_matchfinder_init)(struct TEMPLATED(hc_matchfinder) *mf)
145 memset(mf, 0, sizeof(*mf));
149 * Find the longest match longer than 'best_len' bytes.
152 * The matchfinder structure.
154 * Pointer to the beginning of the input buffer.
156 * The current position in the input buffer (the position of the sequence
157 * being matched against).
159 * Require a match longer than this length.
161 * The maximum permissible match length at this position.
163 * Stop searching if a match of at least this length is found.
164 * Must be <= @max_len.
166 * Limit on the number of potential matches to consider. Must be >= 1.
168 * The precomputed hash codes for the sequence beginning at @in_next.
169 * These will be used and then updated with the precomputed hashcodes for
170 * the sequence beginning at @in_next + 1.
172 * If a match is found, its offset is returned in this location.
174 * Return the length of the match found, or 'best_len' if no match longer than
175 * 'best_len' was found.
178 TEMPLATED(hc_matchfinder_longest_match)(struct TEMPLATED(hc_matchfinder) * const restrict mf,
179 const u8 * const restrict in_begin,
180 const ptrdiff_t cur_pos,
184 const u32 max_search_depth,
185 u32 next_hashes[const restrict static 2],
186 u32 * const restrict offset_ret)
188 const u8 *in_next = in_begin + cur_pos;
189 u32 depth_remaining = max_search_depth;
190 const u8 *best_matchptr = best_matchptr; /* uninitialized */
191 pos_t cur_node3, cur_node4;
193 u32 next_seq3, next_seq4;
198 if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
201 /* Get the precomputed hash codes. */
202 hash3 = next_hashes[0];
203 hash4 = next_hashes[1];
205 /* From the hash buckets, get the first node of each linked list. */
206 cur_node3 = mf->hash3_tab[hash3];
207 cur_node4 = mf->hash4_tab[hash4];
209 /* Update for length 3 matches. This replaces the singleton node in the
210 * 'hash3' bucket with the node for the current sequence. */
211 mf->hash3_tab[hash3] = cur_pos;
213 /* Update for length 4 matches. This prepends the node for the current
214 * sequence to the linked list in the 'hash4' bucket. */
215 mf->hash4_tab[hash4] = cur_pos;
216 mf->next_tab[cur_pos] = cur_node4;
218 /* Compute the next hash codes. */
219 next_seq4 = load_u32_unaligned(in_next + 1);
220 next_seq3 = loaded_u32_to_u24(next_seq4);
221 next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
222 next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
223 prefetchw(&mf->hash3_tab[next_hashes[0]]);
224 prefetchw(&mf->hash4_tab[next_hashes[1]]);
226 if (best_len < 4) { /* No match of length >= 4 found yet? */
228 /* Check for a length 3 match if needed. */
233 seq4 = load_u32_unaligned(in_next);
236 matchptr = &in_begin[cur_node3];
237 if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
239 best_matchptr = matchptr;
243 /* Check for a length 4 match. */
249 /* No length 4 match found yet. Check the first 4 bytes. */
250 matchptr = &in_begin[cur_node4];
252 if (load_u32_unaligned(matchptr) == seq4)
255 /* The first 4 bytes did not match. Keep trying. */
256 cur_node4 = mf->next_tab[cur_node4];
257 if (!cur_node4 || !--depth_remaining)
261 /* Found a match of length >= 4. Extend it to its full length. */
262 best_matchptr = matchptr;
263 best_len = lz_extend(in_next, best_matchptr, 4, max_len);
264 if (best_len >= nice_len)
266 cur_node4 = mf->next_tab[cur_node4];
267 if (!cur_node4 || !--depth_remaining)
270 if (!cur_node4 || best_len >= nice_len)
274 /* Check for matches of length >= 5. */
278 matchptr = &in_begin[cur_node4];
280 /* Already found a length 4 match. Try for a longer
281 * match; start by checking either the last 4 bytes and
282 * the first 4 bytes, or the last byte. (The last byte,
283 * the one which would extend the match length by 1, is
284 * the most important.) */
285 #if UNALIGNED_ACCESS_IS_FAST
286 if ((load_u32_unaligned(matchptr + best_len - 3) ==
287 load_u32_unaligned(in_next + best_len - 3)) &&
288 (load_u32_unaligned(matchptr) ==
289 load_u32_unaligned(in_next)))
291 if (matchptr[best_len] == in_next[best_len])
295 /* Continue to the next node in the list. */
296 cur_node4 = mf->next_tab[cur_node4];
297 if (!cur_node4 || !--depth_remaining)
301 #if UNALIGNED_ACCESS_IS_FAST
306 len = lz_extend(in_next, matchptr, len, max_len);
307 if (len > best_len) {
308 /* This is the new longest match. */
310 best_matchptr = matchptr;
311 if (best_len >= nice_len)
315 /* Continue to the next node in the list. */
316 cur_node4 = mf->next_tab[cur_node4];
317 if (!cur_node4 || !--depth_remaining)
321 *offset_ret = in_next - best_matchptr;
326 * Advance the matchfinder, but don't search for matches.
329 * The matchfinder structure.
331 * Pointer to the beginning of the input buffer.
333 * The current position in the input buffer (the position of the sequence
334 * being matched against).
336 * The length of the input buffer.
338 * The precomputed hash codes for the sequence beginning at @in_next.
339 * These will be used and then updated with the precomputed hashcodes for
340 * the sequence beginning at @in_next + @count.
342 * The number of bytes to advance. Must be > 0.
344 * Returns @in_next + @count.
346 static inline const u8 *
347 TEMPLATED(hc_matchfinder_skip_positions)(struct TEMPLATED(hc_matchfinder) * const restrict mf,
348 const u8 * const restrict in_begin,
349 const ptrdiff_t cur_pos,
350 const ptrdiff_t end_pos,
352 u32 next_hashes[const restrict static 2])
354 const u8 *in_next = in_begin + cur_pos;
355 const u8 * const stop_ptr = in_next + count;
357 if (likely(count + 5 <= end_pos - cur_pos)) {
359 u32 next_seq3, next_seq4;
361 hash3 = next_hashes[0];
362 hash4 = next_hashes[1];
364 mf->hash3_tab[hash3] = in_next - in_begin;
365 mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4];
366 mf->hash4_tab[hash4] = in_next - in_begin;
368 next_seq4 = load_u32_unaligned(++in_next);
369 next_seq3 = loaded_u32_to_u24(next_seq4);
370 hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
371 hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
373 } while (in_next != stop_ptr);
375 prefetchw(&mf->hash3_tab[hash3]);
376 prefetchw(&mf->hash4_tab[hash4]);
377 next_hashes[0] = hash3;
378 next_hashes[1] = hash4;