4 * Suffix array match-finder for Lempel-Ziv compression.
8 * Copyright (c) 2013, 2014 Eric Biggers. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #ifndef _WIMLIB_LZ_SARRAY_H
35 #define _WIMLIB_LZ_SARRAY_H
37 #include "wimlib/compiler.h" /* must define '_always_inline_attribute',
38 'likely()', and 'prefetch()'. */
39 #include "wimlib/lz.h" /* must define 'struct raw_match' and LZ_ASSERT() */
40 #include "wimlib/types.h" /* must define 'bool', 'u8', 'u16, and 'u32' */
44 /* Position type --- must be an unsigned type large enough to hold the length of
45 * longest window for which the suffix array match-finder will be used. */
46 typedef u32 lz_sarray_pos_t;
48 /* Length type --- must be an unsigned type large enough to hold the maximum
50 typedef u16 lz_sarray_len_t;
52 /* Cost type, for the user-provided match cost evaluation function. */
53 typedef lz_sarray_pos_t lz_sarray_cost_t;
55 /* Type of distances in suffix array links. A larger type would allow skipping
56 * irrelevant suffixes more quickly, which is especially helpful towards the
57 * start of the window. However, even a single byte allows skipping 255 at a
58 * time, which where it matters is already a big improvement over the
59 * alternative of searching the suffixes consecutively. */
60 typedef u8 lz_sarray_delta_t;
62 #define LZ_SARRAY_LEN_MAX ((lz_sarray_len_t)~0UL)
63 #define LZ_SARRAY_POS_MAX ((lz_sarray_pos_t)~0UL)
64 #define LZ_SARRAY_DELTA_MAX ((lz_sarray_delta_t)~0UL)
65 #define LZ_SARRAY_INFINITE_COST ((lz_sarray_cost_t)~0UL)
67 /* State of the suffix array LZ (Lempel-Ziv) match-finder.
69 * This is defined here for benefit of the inlined code. It's not intended for
70 * code outside the match-finder itself to read or write members from this
73 /* Allocated window size for the match-finder.
75 * Note: this match-finder does not store the window itself, as the
76 * suffix array (@SA) and associated arrays (@ISA, @LCP, @salink) are
77 * sufficient to find matches. This number is the maximum length of
78 * those arrays, or also the maximum window (block) size that can be
79 * passed to lz_sarray_load_window(). */
80 lz_sarray_pos_t max_window_size;
82 /* Minimum length of matches to return. */
83 lz_sarray_len_t min_match_len;
85 /* Maximum length of matches to return. */
86 lz_sarray_len_t max_match_len;
88 /* Maximum matches to consider at each position (max search depth). */
89 u32 max_matches_to_consider;
91 /* Maximum number of matches to return at each position. */
92 u32 max_matches_to_return;
94 /* Current position in the window. */
95 lz_sarray_pos_t cur_pos;
97 /* Current window size. */
98 lz_sarray_pos_t window_size;
100 /* Suffix array for the current window.
101 * This is a mapping from suffix rank to suffix position. */
104 /* Inverse suffix array for the current window.
105 * This is a mapping from suffix position to suffix rank.
106 * If 0 <= r < window_size, then ISA[SA[r]] == r. */
107 lz_sarray_pos_t *ISA;
109 /* Suffix array links.
111 * During a linear scan of the input string to find matches, this array
112 * used to keep track of which rank suffixes in the suffix array appear
113 * before the current position. Instead of searching in the original
114 * suffix array, scans for matches at a given position traverse a linked
115 * list containing (usually) only suffixes that appear before that
117 struct salink *salink;
120 /* Suffix array link. An array of these structures, one per suffix rank, is
121 * used as a replacement for the raw LCP (Longest Common Prefix) array to allow
122 * skipping over suffixes that appear later in the window and hence cannot be
123 * used as LZ77 matches. */
126 /* Temporary fields used while this structure is being
129 * Note: we want the entire `struct salink' to be only 6 bytes,
130 * even though this makes "next_initial" unaligned. */
132 lz_sarray_pos_t next_initial;
133 lz_sarray_len_t lcpnext_initial;
137 /* Intially, the length, in bytes, of the longest common
138 * prefix (LCP) between the suffix having this rank and
139 * the suffix with the the smallest larger rank that
140 * starts earlier in the window than the suffix having
141 * this rank. If no such suffix exists, this will be 0.
143 * Later, during match-finding, after the corresponding
144 * suffix has entered the LZ77 dictionary, this value
145 * may be updated by lz_sarray_update_salink() to refer
146 * instead to a lexicographically closer (but still
147 * larger) suffix that begins at a later position that
148 * has entered the LZ77 dictionary. */
149 lz_sarray_len_t lcpnext;
151 /* Initially, the length, in bytes, of the longest
152 * common prefix (LCP) between the suffix having this
153 * rank and the suffix with the the largest smaller rank
154 * that starts earlier in the window than the suffix
155 * having this rank. If no such suffix exists, this
158 * Later, during match-finding, after the corresponding
159 * suffix has entered the LZ77 dictionary, this value
160 * may be updated by lz_sarray_update_salink() to refer
161 * instead to a lexicographically closer (but still
162 * smaller) suffix that begins at a later position that
163 * has entered the LZ77 dictionary. */
164 lz_sarray_len_t lcpprev;
166 /* Distance to the suffix referred to in the description
167 * of "lcpnext" above, but capped to a maximum value to
168 * save memory; or, 0 if no such suffix exists. If the
169 * true distance was truncated, this will give the
170 * distance to the rank of a suffix that is
171 * lexicographically closer to the current suffix than
172 * the desired suffix, but appears *later* in the window
173 * and hence cannot be used as the basis for a LZ77
175 lz_sarray_delta_t dist_to_next;
177 /* Distance to the suffix referred to in the description
178 * of "lcpprev" above, but capped to a maximum value to
179 * save memory; or, 0 if no such suffix exists. If the
180 * true distance was truncated, this will give the
181 * distance to the rank of a suffix that is
182 * lexicographically closer to the current suffix than
183 * the desired suffix, but appears *later* in the window
184 * and hence cannot be used as the basis for a LZ77
186 lz_sarray_delta_t dist_to_prev;
192 /*-----------------------------------*/
193 /* Functions defined in lz_sarray.c */
194 /*-----------------------------------*/
197 lz_sarray_init(struct lz_sarray *mf,
198 lz_sarray_pos_t max_window_size,
199 lz_sarray_len_t min_match_len,
200 lz_sarray_len_t max_match_len,
201 u32 max_matches_to_consider,
202 u32 max_matches_to_return);
205 lz_sarray_get_needed_memory(lz_sarray_pos_t max_window_size);
208 lz_sarray_destroy(struct lz_sarray *mf);
211 lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], lz_sarray_pos_t n);
213 /*-------------------*/
214 /* Inline functions */
215 /*-------------------*/
217 static _always_inline_attribute lz_sarray_pos_t
218 lz_sarray_get_pos(const struct lz_sarray *mf)
223 /* Advance the suffix array match-finder to the next position. */
224 static _always_inline_attribute void
225 lz_sarray_update_salink(const lz_sarray_pos_t r, struct salink link[])
227 const lz_sarray_pos_t next = r + link[r].dist_to_next;
228 const lz_sarray_pos_t prev = r - link[r].dist_to_prev;
230 if (next != r && link[r].dist_to_next < link[next].dist_to_prev) {
231 link[next].dist_to_prev = link[r].dist_to_next;
232 link[next].lcpprev = link[r].lcpnext;
235 if (prev != r && link[r].dist_to_prev < link[prev].dist_to_next) {
236 link[prev].dist_to_next = link[r].dist_to_prev;
237 link[prev].lcpnext = link[r].lcpprev;
241 /* Skip the current position in the suffix array match-finder. */
242 static _always_inline_attribute void
243 lz_sarray_skip_position(struct lz_sarray *mf)
245 LZ_ASSERT(mf->cur_pos < mf->window_size);
246 lz_sarray_update_salink(mf->ISA[mf->cur_pos++], mf->salink);
250 * Use the suffix array match-finder to retrieve a list of matches at the
253 * Returns the number of matches written into @matches. The matches are
254 * returned in decreasing order by length, and each will be of unique length
255 * between the minimum and maximum match lengths (inclusively) passed to
256 * lz_sarray_init(). Up to @max_matches_to_return (passed to lz_sarray_init())
257 * matches will be returned.
259 * @eval_match_cost is a function for evaluating the cost of a match when
260 * deciding which ones to return. It needs to be fast, and need not be exact;
261 * an implementation might simply rank matches by their offset, for example,
262 * although implementations may choose to take into account additional
263 * information such as repeat offsets.
265 static _always_inline_attribute u32
266 lz_sarray_get_matches(struct lz_sarray *mf,
267 struct raw_match matches[],
268 lz_sarray_cost_t (*eval_match_cost)
269 (lz_sarray_pos_t length,
270 lz_sarray_pos_t offset,
272 const void *eval_match_cost_ctx)
274 LZ_ASSERT(mf->cur_pos < mf->window_size);
275 const lz_sarray_pos_t i = mf->cur_pos++;
277 const lz_sarray_pos_t * const restrict SA = mf->SA;
278 const lz_sarray_pos_t * const restrict ISA = mf->ISA;
279 struct salink * const restrict link = mf->salink;
280 const u32 max_matches_to_consider = mf->max_matches_to_consider;
281 const u32 max_matches_to_return = mf->max_matches_to_return;
283 /* r = Rank of the suffix at the current position. */
284 const lz_sarray_pos_t r = ISA[i];
286 /* Prepare for searching the current position. */
287 lz_sarray_update_salink(r, link);
290 /* Prefetch next position in SA and link.
292 * This can improve performance on large windows since the locations in
293 * SA and link at which each successive search begins are in general
294 * randomly distributed. */
295 if (likely(i + 1 < mf->window_size)) {
296 const lz_sarray_pos_t next_r = ISA[i + 1];
297 prefetch(&SA[next_r]);
298 prefetch(&link[next_r]);
302 /* L = rank of next suffix to the left;
303 * R = rank of next suffix to the right;
304 * lenL = length of match between current position and the suffix with rank L;
305 * lenR = length of match between current position and the suffix with rank R.
307 * This is left and right relative to the rank of the current suffix.
308 * Since the suffixes in the suffix array are sorted, the longest
309 * matches are immediately to the left and right (using the linked list
310 * to ignore all suffixes that occur later in the window). The match
311 * length decreases the farther left and right we go. We shall keep the
312 * length on both sides in sync in order to choose the lowest-cost match
315 lz_sarray_pos_t L = r - link[r].dist_to_prev;
316 lz_sarray_pos_t R = r + link[r].dist_to_next;
317 lz_sarray_pos_t lenL = link[r].lcpprev;
318 lz_sarray_pos_t lenR = link[r].lcpnext;
320 /* nmatches = number of matches found so far. */
323 /* best_cost = cost of lowest-cost match found so far.
325 * Shorter matches that do not have a lower cost than this are
326 * discarded, since presumably it would be cheaper to output the bytes
327 * from the longer match instead. */
328 lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST;
330 /* count_remaining = maximum number of possible matches remaining to be
332 u32 count_remaining = max_matches_to_consider;
334 /* pending_offset = offset of lowest-cost match found for the current
335 * length, or 0 if none found yet. */
336 lz_sarray_pos_t pending_offset = 0;
338 /* Note: some 'goto' statements are used in the remainder of this
339 * function to remove unnecessary checks and create branches that the
340 * CPU may predict better. (This function is performance critical.) */
342 if (lenL != 0 && lenL >= lenR)
350 /* Search suffixes on the left until the match length has decreased
351 * below the next match length on the right or to below the minimum
354 lz_sarray_pos_t offset;
355 lz_sarray_cost_t cost;
356 lz_sarray_pos_t old_L;
357 lz_sarray_pos_t old_lenL;
359 /* Check for hard cutoff on amount of work done. */
360 if (count_remaining-- == 0) {
361 if (pending_offset != 0) {
362 /* Save pending match. */
363 matches[nmatches++] = (struct raw_match){
365 .offset = pending_offset,
372 /* Suffix is in LZ77 dictionary. (Check was needed
373 * because the salink array caps distances to save
378 /* Save match offset if it results in lower cost. */
379 cost = (*eval_match_cost)(lenL, offset,
380 eval_match_cost_ctx);
381 if (cost < best_cost) {
383 pending_offset = offset;
387 /* Advance left to previous suffix. */
392 L -= link[L].dist_to_prev;
394 if (link[old_L].lcpprev < old_lenL) {
395 /* Match length decreased. */
397 lenL = link[old_L].lcpprev;
399 if (old_lenL > lenR) {
400 /* Neither the right side nor the left size has
401 * any more matches of length @old_lenL. If a
402 * pending match exists, save it. */
403 if (pending_offset != 0) {
404 matches[nmatches++] = (struct raw_match){
406 .offset = pending_offset,
408 if (nmatches == max_matches_to_return)
415 /* New match length on left is still at
416 * least as large as the next match
417 * length on the right: Keep extending
418 * left, unless the minimum match length
419 * would be underrun. */
426 /* Here we have lenL < lenR. Extend right.
427 * (No check for whether the minimum match length has
428 * been underrun is needed, provided that such lengths
429 * are marked as 0.) */
435 /* Search suffixes on the right until the match length has decreased to
436 * the next match length on the left or to below the minimum match
439 lz_sarray_pos_t offset;
440 lz_sarray_cost_t cost;
441 lz_sarray_pos_t old_R;
442 lz_sarray_pos_t old_lenR;
444 /* Check for hard cutoff on amount of work done. */
445 if (count_remaining-- == 0) {
446 if (pending_offset != 0) {
447 /* Save pending match. */
448 matches[nmatches++] = (struct raw_match){
450 .offset = pending_offset,
457 /* Suffix is in LZ77 dictionary. (Check was needed
458 * because the salink array caps distances to save
463 /* Save match offset if it results in lower cost. */
464 cost = (*eval_match_cost)(lenR,
466 eval_match_cost_ctx);
467 if (cost < best_cost) {
469 pending_offset = offset;
473 /* Advance right to next suffix. */
478 R += link[R].dist_to_next;
480 if (link[old_R].lcpnext < lenR) {
481 /* Match length decreased. */
483 lenR = link[old_R].lcpnext;
485 /* Neither the right side nor the left size has any more
486 * matches of length @old_lenR. If a pending match
487 * exists, save it. */
488 if (pending_offset != 0) {
489 matches[nmatches++] = (struct raw_match){
491 .offset = pending_offset,
493 if (nmatches == max_matches_to_return)
500 /* lenL >= lenR: Extend left, unless the
501 * minimum match length would be underrun, in
502 * which case we are done. */
508 /* lenR > lenL: Keep extending right.
509 * (No check for whether the minimum match length has
510 * been underrun is needed, provided that such lengths
511 * are marked as 0.) */
516 #endif /* _WIMLIB_LZ_SARRAY_H */