4 * Suffix array match-finder for Lempel-Ziv compression.
8 * Copyright (c) 2013 Eric Biggers. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #ifndef _WIMLIB_LZ_SARRAY_H
35 #define _WIMLIB_LZ_SARRAY_H
37 #include "wimlib/compiler.h" /* must define '_always_inline_attribute' */
38 #include "wimlib/lz.h" /* must define 'struct raw_match', 'input_idx_t',
40 #include "wimlib/types.h" /* must define 'bool', 'u8', 'u32' */
44 /* State of the suffix array LZ (Lempel-Ziv) match-finder.
46 * This is defined here for benefit of the inlined code. It's not intended for
47 * code outside the match-finder itself to read or write members from this
50 /* Allocated window size for the match-finder.
52 * Note: this match-finder does not store the window itself, as the
53 * suffix array (@SA) and associated arrays (@ISA, @LCP, @salink) are
54 * sufficient to find matches. This number is the maximum length of
55 * those arrays, or also the maximum window (block) size that can be
56 * passed to lz_sarray_load_window(). */
57 input_idx_t max_window_size;
59 /* Minimum match length to return. */
60 input_idx_t min_match_len;
62 /* Maximum match length to return. */
63 input_idx_t max_match_len;
65 /* Maximum matches to consider at each position (max search depth). */
66 u32 max_matches_to_consider;
68 /* Maximum number of matches to return at each position. */
69 u32 max_matches_to_return;
71 /* Current position in the window. */
74 /* Current window size. */
75 input_idx_t window_size;
77 /* Suffix array for the current window.
78 * This is a mapping from suffix rank to suffix position. */
81 /* Inverse suffix array for the current window.
82 * This is a mapping from suffix position to suffix rank.
83 * If 0 <= r < window_size, then ISA[SA[r]] == r. */
86 /* Suffix array links.
88 * During a linear scan of the input string to find matches, this array
89 * used to keep track of which rank suffixes in the suffix array appear
90 * before the current position. Instead of searching in the original
91 * suffix array, scans for matches at a given position traverse a linked
92 * list containing only suffixes that appear before that position. */
93 struct salink *salink;
96 /* Suffix array link; one of these exists for each position in the suffix array.
99 /* Rank of highest ranked suffix that has rank lower than the suffix
100 * corresponding to this structure and either has a lower position
101 * (initially) or has a position lower than the highest position at
102 * which matches have been searched for so far, or ~(input_idx_t)0 if
103 * there is no such suffix.
105 * Think of this as a pointer to the closest position in the suffix
106 * array to the left that corresponds to a suffix that begins at a
107 * position in the current dictionary (i.e. before the current position
111 /* Rank of lowest ranked suffix that has rank greater than the suffix
112 * corresponding to this structure and either has a lower position
113 * (intially) or has a position lower than the highest position at which
114 * matches have been searched for so far, or ~(input_idx_t)0 if there is
117 * Think of this as a pointer to the closest position in the suffix
118 * array to the right that corresponds to a suffix that begins at a
119 * position in the current dictionary (i.e. before the current position
123 /* Length of longest common prefix between the suffix corresponding to
124 * this structure and the suffix with rank @prev, or 0 if @prev is
125 * ~(input_idx_t)0. */
128 /* Length of longest common prefix between the suffix corresponding to
129 * this structure and the suffix with rank @next, or 0 if @next is
130 * ~(input_idx_t)0. */
134 /*-----------------------------------*/
135 /* Functions defined in lz_sarray.c */
136 /*-----------------------------------*/
139 lz_sarray_init(struct lz_sarray *mf,
140 input_idx_t max_window_size,
141 input_idx_t min_match_len,
142 input_idx_t max_match_len,
143 u32 max_matches_to_consider,
144 u32 max_matches_to_return);
147 lz_sarray_destroy(struct lz_sarray *mf);
150 lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], input_idx_t n);
152 /*-------------------*/
153 /* Inline functions */
154 /*-------------------*/
156 static _always_inline_attribute input_idx_t
157 lz_sarray_get_pos(const struct lz_sarray *mf)
162 /* Advance the suffix array match-finder to the next position. */
163 static _always_inline_attribute void
164 lz_sarray_update_salink(const input_idx_t i,
165 const input_idx_t SA[const restrict],
166 const input_idx_t ISA[const restrict],
167 struct salink link[const restrict])
169 /* r = Rank of the suffix at the current position. */
170 const input_idx_t r = ISA[i];
172 /* next = rank of LOWEST ranked suffix that is ranked HIGHER than the
173 * current suffix AND has a LOWER position, or ~(input_idx_t)0 if none
175 const input_idx_t next = link[r].next;
177 /* prev = rank of HIGHEST ranked suffix that is ranked LOWER than the
178 * current suffix AND has a LOWER position, or ~(input_idx_t)0 if none
180 const input_idx_t prev = link[r].prev;
182 /* Link the suffix at the current position into the linked list that
183 * contains all suffixes referenced by the suffix array that appear at
184 * or before the current position, sorted by rank. */
185 if (next != ~(input_idx_t)0) {
187 link[next].lcpprev = link[r].lcpnext;
190 if (prev != ~(input_idx_t)0) {
192 link[prev].lcpnext = link[r].lcpprev;
196 /* Skip the current position in the suffix array match-finder. */
197 static _always_inline_attribute void
198 lz_sarray_skip_position(struct lz_sarray *mf)
200 LZ_ASSERT(mf->cur_pos < mf->window_size);
201 lz_sarray_update_salink(mf->cur_pos++, mf->SA, mf->ISA, mf->salink);
204 typedef input_idx_t lz_sarray_cost_t;
205 #define LZ_SARRAY_INFINITE_COST (~(lz_sarray_cost_t)0)
208 * Use the suffix array match-finder to retrieve a list of matches at the
211 * Returns the number of matches written into @matches. The matches are
212 * returned in decreasing order by length, and each will be of unique length
213 * between the minimum and maximum match lengths (inclusively) passed to
214 * lz_sarray_init(). Up to @max_matches_to_return (passed to lz_sarray_init())
215 * matches will be returned.
217 * @eval_match_cost is a function for evaluating the cost of a match when
218 * deciding which ones to return. It needs to be fast, and need not be exact;
219 * an implementation might simply rank matches by their offset, for example,
220 * although implementations may choose to take into account additional
221 * information such as repeat offsets.
223 static _always_inline_attribute u32
224 lz_sarray_get_matches(struct lz_sarray *mf,
225 struct raw_match matches[],
226 lz_sarray_cost_t (*eval_match_cost)
230 const void *eval_match_cost_ctx)
232 LZ_ASSERT(mf->cur_pos < mf->window_size);
233 const input_idx_t i = mf->cur_pos++;
235 const input_idx_t * const restrict SA = mf->SA;
236 const input_idx_t * const restrict ISA = mf->ISA;
237 struct salink * const restrict link = mf->salink;
238 const input_idx_t min_match_len = mf->min_match_len;
239 const u32 max_matches_to_consider = mf->max_matches_to_consider;
240 const u32 max_matches_to_return = mf->max_matches_to_return;
242 /* r = Rank of the suffix at the current position. */
243 const input_idx_t r = ISA[i];
245 /* Prepare for searching the current position. */
246 lz_sarray_update_salink(i, SA, ISA, link);
248 /* L = rank of next suffix to the left;
249 * R = rank of next suffix to the right;
250 * lenL = length of match between current position and the suffix with rank L;
251 * lenR = length of match between current position and the suffix with rank R.
253 * This is left and right relative to the rank of the current suffix.
254 * Since the suffixes in the suffix array are sorted, the longest
255 * matches are immediately to the left and right (using the linked list
256 * to ignore all suffixes that occur later in the window). The match
257 * length decreases the farther left and right we go. We shall keep the
258 * length on both sides in sync in order to choose the lowest-cost match
261 input_idx_t L = link[r].prev;
262 input_idx_t R = link[r].next;
263 input_idx_t lenL = link[r].lcpprev;
264 input_idx_t lenR = link[r].lcpnext;
266 /* nmatches = number of matches found so far. */
269 /* best_cost = cost of lowest-cost match found so far.
271 * We keep track of this so that we can ignore shorter matches that do
272 * not have lower costs than longer matches already found. */
273 lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST;
275 /* count_remaining = maximum number of possible matches remaining to be
277 u32 count_remaining = max_matches_to_consider;
279 /* pending = match currently being considered for a specific length. */
280 struct raw_match pending;
281 lz_sarray_cost_t pending_cost;
283 while (lenL >= min_match_len || lenR >= min_match_len)
286 pending_cost = LZ_SARRAY_INFINITE_COST;
287 lz_sarray_cost_t cost;
290 if (lenL >= min_match_len && lenL >= lenR) {
293 if (--count_remaining == 0)
294 goto out_save_pending;
296 input_idx_t offset = i - SA[L];
298 /* Save match if it has smaller cost. */
299 cost = (*eval_match_cost)(lenL, offset,
300 eval_match_cost_ctx);
301 if (cost < pending_cost) {
302 pending.offset = offset;
306 if (link[L].lcpprev < lenL) {
307 /* Match length decreased. */
309 lenL = link[L].lcpprev;
311 /* Save the pending match unless the
312 * right side still may have matches of
313 * this length to be scanned, or if a
314 * previous (longer) match had lower
316 if (pending.len > lenR) {
317 if (pending_cost < best_cost) {
318 best_cost = pending_cost;
319 matches[nmatches++] = pending;
320 if (nmatches == max_matches_to_return)
324 pending_cost = LZ_SARRAY_INFINITE_COST;
326 if (lenL < min_match_len || lenL < lenR)
336 if (lenR >= min_match_len && lenR > lenL) {
339 if (--count_remaining == 0)
340 goto out_save_pending;
342 input_idx_t offset = i - SA[R];
344 /* Save match if it has smaller cost. */
345 cost = (*eval_match_cost)(lenR,
347 eval_match_cost_ctx);
348 if (cost < pending_cost) {
349 pending.offset = offset;
353 if (link[R].lcpnext < lenR) {
354 /* Match length decreased. */
356 lenR = link[R].lcpnext;
358 /* Save the pending match unless a
359 * previous (longer) match had lower
361 if (pending_cost < best_cost) {
362 matches[nmatches++] = pending;
363 best_cost = pending_cost;
364 if (nmatches == max_matches_to_return)
368 if (lenR < min_match_len || lenR <= lenL)
372 pending_cost = LZ_SARRAY_INFINITE_COST;
381 if (pending_cost != LZ_SARRAY_INFINITE_COST)
382 matches[nmatches++] = pending;
388 #endif /* _WIMLIB_LZ_SARRAY_H */