2 * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
4 * The following copying information applies to this specific source code file:
6 * Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
8 * To the extent possible under law, the author(s) have dedicated all copyright
9 * and related and neighboring rights to this software to the public domain
10 * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
11 * Dedication (the "CC0").
13 * This software is distributed in the hope that it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15 * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
17 * You should have received a copy of the CC0 along with this software; if not
18 * see <http://creativecommons.org/publicdomain/zero/1.0/>.
20 * ---------------------------------------------------------------------------
24 * This is a Hash Chains (hc) based matchfinder.
26 * The main data structure is a hash table where each hash bucket contains a
27 * linked list (or "chain") of sequences whose first 4 bytes share the same hash
28 * code. Each sequence is identified by its starting position in the input
31 * The algorithm processes the input buffer sequentially. At each byte
32 * position, the hash code of the first 4 bytes of the sequence beginning at
33 * that position (the sequence being matched against) is computed. This
34 * identifies the hash bucket to use for that position. Then, this hash
35 * bucket's linked list is searched for matches. Then, a new linked list node
36 * is created to represent the current sequence and is prepended to the list.
38 * This algorithm has several useful properties:
40 * - It only finds true Lempel-Ziv matches; i.e., those where the matching
41 * sequence occurs prior to the sequence being matched against.
43 * - The sequences in each linked list are always sorted by decreasing starting
44 * position. Therefore, the closest (smallest offset) matches are found
45 * first, which in many compression formats tend to be the cheapest to encode.
47 * - Although fast running time is not guaranteed due to the possibility of the
48 * lists getting very long, the worst degenerate behavior can be easily
49 * prevented by capping the number of nodes searched at each position.
51 * - If the compressor decides not to search for matches at a certain position,
52 * then that position can be quickly inserted without searching the list.
54 * - The algorithm is adaptable to sliding windows: just store the positions
55 * relative to a "base" value that is updated from time to time, and stop
56 * searching each list when the sequences get too far away.
58 * ---------------------------------------------------------------------------
62 * Before including this header, you must define 'mf_pos_t' to an integer type
63 * that can represent all possible positions. This can be a 16-bit or 32-bit
64 * unsigned integer. When possible, the former should be used due to the
65 * reduced cache pressure. This header can be included multiple times in a
66 * single .c file with different 'mf_pos_t' definitions; however, you must
67 * define a different MF_SUFFIX each time to generate different names for the
68 * matchfinder structure and functions.
70 * The number of bytes that must be allocated for a given 'struct
71 * hc_matchfinder' must be gotten by calling hc_matchfinder_size().
73 * ----------------------------------------------------------------------------
77 * The main hash table and chains handle length 4+ matches. Length 3 matches
78 * are handled by a separate hash table with no chains. This works well for
79 * typical "greedy" or "lazy"-style compressors, where length 3 matches are
80 * often only helpful if they have small offsets. Instead of searching a full
81 * chain for length 3+ matches, the algorithm just checks for one close length 3
82 * match, then focuses on finding length 4+ matches.
84 * The longest_match() and skip_positions() functions are inlined into the
85 * compressors that use them. This isn't just about saving the overhead of a
86 * function call. These functions are intended to be called from the inner
87 * loops of compressors, where giving the compiler more control over register
88 * allocation is very helpful. There is also significant benefit to be gained
89 * from allowing the CPU to predict branches independently at each call site.
90 * For example, "lazy"-style compressors can be written with two calls to
91 * longest_match(), each of which starts with a different 'best_len' and
92 * therefore has significantly different performance characteristics.
94 * Although any hash function can be used, a multiplicative hash is fast and
97 * On some processors, it is significantly faster to extend matches by whole
98 * words (32 or 64 bits) instead of by individual bytes. For this to be the
99 * case, the processor must implement unaligned memory accesses efficiently and
100 * must have either a fast "find first set bit" instruction or a fast "find last
101 * set bit" instruction, depending on the processor's endianness.
103 * The code uses one loop for finding the first match and one loop for finding a
104 * longer match. Each of these loops is tuned for its respective task and in
105 * combination are faster than a single generalized loop that handles both
108 * The code also uses a tight inner loop that only compares the last and first
109 * bytes of a potential match. It is only when these bytes match that a full
110 * match extension is attempted.
112 * ----------------------------------------------------------------------------
117 #include "wimlib/lz_extend.h"
118 #include "wimlib/lz_hash.h"
119 #include "wimlib/unaligned.h"
121 #define HC_MATCHFINDER_HASH3_ORDER 14
122 #define HC_MATCHFINDER_HASH4_ORDER 15
124 /* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */
126 #define TEMPLATED(name) CONCAT(name, MF_SUFFIX)
128 struct TEMPLATED(hc_matchfinder) {
130 /* The hash table for finding length 3 matches */
131 mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
133 /* The hash table which contains the first nodes of the linked lists for
134 * finding length 4+ matches */
135 mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
137 /* The "next node" references for the linked lists. The "next node" of
138 * the node for the sequence with position 'pos' is 'next_tab[pos]'. */
142 /* Return the number of bytes that must be allocated for a 'hc_matchfinder' that
143 * can work with buffers up to the specified size. */
144 static forceinline size_t
145 TEMPLATED(hc_matchfinder_size)(size_t max_bufsize)
147 return sizeof(struct TEMPLATED(hc_matchfinder)) +
148 (max_bufsize * sizeof(mf_pos_t));
151 /* Prepare the matchfinder for a new input buffer. */
152 static forceinline void
153 TEMPLATED(hc_matchfinder_init)(struct TEMPLATED(hc_matchfinder) *mf)
155 memset(mf, 0, sizeof(*mf));
159 * Find the longest match longer than 'best_len' bytes.
162 * The matchfinder structure.
164 * Pointer to the beginning of the input buffer.
166 * The current position in the input buffer (the position of the sequence
167 * being matched against).
169 * Require a match longer than this length.
171 * The maximum permissible match length at this position.
173 * Stop searching if a match of at least this length is found.
174 * Must be <= @max_len.
176 * Limit on the number of potential matches to consider. Must be >= 1.
178 * The precomputed hash codes for the sequence beginning at @in_next.
179 * These will be used and then updated with the precomputed hashcodes for
180 * the sequence beginning at @in_next + 1.
182 * If a match is found, its offset is returned in this location.
184 * Return the length of the match found, or 'best_len' if no match longer than
185 * 'best_len' was found.
187 static forceinline u32
188 TEMPLATED(hc_matchfinder_longest_match)(struct TEMPLATED(hc_matchfinder) * const restrict mf,
189 const u8 * const restrict in_begin,
190 const ptrdiff_t cur_pos,
194 const u32 max_search_depth,
195 u32 next_hashes[const restrict static 2],
196 u32 * const restrict offset_ret)
198 const u8 *in_next = in_begin + cur_pos;
199 u32 depth_remaining = max_search_depth;
200 const u8 *best_matchptr = best_matchptr; /* uninitialized */
201 mf_pos_t cur_node3, cur_node4;
203 u32 next_seq3, next_seq4;
208 if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
211 /* Get the precomputed hash codes. */
212 hash3 = next_hashes[0];
213 hash4 = next_hashes[1];
215 /* From the hash buckets, get the first node of each linked list. */
216 cur_node3 = mf->hash3_tab[hash3];
217 cur_node4 = mf->hash4_tab[hash4];
219 /* Update for length 3 matches. This replaces the singleton node in the
220 * 'hash3' bucket with the node for the current sequence. */
221 mf->hash3_tab[hash3] = cur_pos;
223 /* Update for length 4 matches. This prepends the node for the current
224 * sequence to the linked list in the 'hash4' bucket. */
225 mf->hash4_tab[hash4] = cur_pos;
226 mf->next_tab[cur_pos] = cur_node4;
228 /* Compute the next hash codes. */
229 next_seq4 = load_u32_unaligned(in_next + 1);
230 next_seq3 = loaded_u32_to_u24(next_seq4);
231 next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
232 next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
233 prefetchw(&mf->hash3_tab[next_hashes[0]]);
234 prefetchw(&mf->hash4_tab[next_hashes[1]]);
236 if (best_len < 4) { /* No match of length >= 4 found yet? */
238 /* Check for a length 3 match if needed. */
243 seq4 = load_u32_unaligned(in_next);
246 matchptr = &in_begin[cur_node3];
247 if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
249 best_matchptr = matchptr;
253 /* Check for a length 4 match. */
259 /* No length 4 match found yet. Check the first 4 bytes. */
260 matchptr = &in_begin[cur_node4];
262 if (load_u32_unaligned(matchptr) == seq4)
265 /* The first 4 bytes did not match. Keep trying. */
266 cur_node4 = mf->next_tab[cur_node4];
267 if (!cur_node4 || !--depth_remaining)
271 /* Found a match of length >= 4. Extend it to its full length. */
272 best_matchptr = matchptr;
273 best_len = lz_extend(in_next, best_matchptr, 4, max_len);
274 if (best_len >= nice_len)
276 cur_node4 = mf->next_tab[cur_node4];
277 if (!cur_node4 || !--depth_remaining)
280 if (!cur_node4 || best_len >= nice_len)
284 /* Check for matches of length >= 5. */
288 matchptr = &in_begin[cur_node4];
290 /* Already found a length 4 match. Try for a longer
291 * match; start by checking either the last 4 bytes and
292 * the first 4 bytes, or the last byte. (The last byte,
293 * the one which would extend the match length by 1, is
294 * the most important.) */
295 #if UNALIGNED_ACCESS_IS_FAST
296 if ((load_u32_unaligned(matchptr + best_len - 3) ==
297 load_u32_unaligned(in_next + best_len - 3)) &&
298 (load_u32_unaligned(matchptr) ==
299 load_u32_unaligned(in_next)))
301 if (matchptr[best_len] == in_next[best_len])
305 /* Continue to the next node in the list. */
306 cur_node4 = mf->next_tab[cur_node4];
307 if (!cur_node4 || !--depth_remaining)
311 #if UNALIGNED_ACCESS_IS_FAST
316 len = lz_extend(in_next, matchptr, len, max_len);
317 if (len > best_len) {
318 /* This is the new longest match. */
320 best_matchptr = matchptr;
321 if (best_len >= nice_len)
325 /* Continue to the next node in the list. */
326 cur_node4 = mf->next_tab[cur_node4];
327 if (!cur_node4 || !--depth_remaining)
331 *offset_ret = in_next - best_matchptr;
336 * Advance the matchfinder, but don't search for matches.
339 * The matchfinder structure.
341 * Pointer to the beginning of the input buffer.
343 * The current position in the input buffer (the position of the sequence
344 * being matched against).
346 * The length of the input buffer.
348 * The precomputed hash codes for the sequence beginning at @in_next.
349 * These will be used and then updated with the precomputed hashcodes for
350 * the sequence beginning at @in_next + @count.
352 * The number of bytes to advance. Must be > 0.
354 * Returns @in_next + @count.
356 static forceinline const u8 *
357 TEMPLATED(hc_matchfinder_skip_positions)(struct TEMPLATED(hc_matchfinder) * const restrict mf,
358 const u8 * const restrict in_begin,
359 const ptrdiff_t cur_pos,
360 const ptrdiff_t end_pos,
362 u32 next_hashes[const restrict static 2])
364 const u8 *in_next = in_begin + cur_pos;
365 const u8 * const stop_ptr = in_next + count;
367 if (likely(count + 5 <= end_pos - cur_pos)) {
369 u32 next_seq3, next_seq4;
371 hash3 = next_hashes[0];
372 hash4 = next_hashes[1];
374 mf->hash3_tab[hash3] = in_next - in_begin;
375 mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4];
376 mf->hash4_tab[hash4] = in_next - in_begin;
378 next_seq4 = load_u32_unaligned(++in_next);
379 next_seq3 = loaded_u32_to_u24(next_seq4);
380 hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
381 hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
383 } while (in_next != stop_ptr);
385 prefetchw(&mf->hash3_tab[hash3]);
386 prefetchw(&mf->hash4_tab[hash4]);
387 next_hashes[0] = hash3;
388 next_hashes[1] = hash4;