4 * Suffix array match-finder for Lempel-Ziv compression.
8 * Copyright (c) 2013 Eric Biggers. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #ifndef _WIMLIB_LZ_SARRAY_H
35 #define _WIMLIB_LZ_SARRAY_H
37 #include "wimlib/compiler.h" /* must define '_always_inline_attribute' */
38 #include "wimlib/lz.h" /* must define 'struct raw_match', 'input_idx_t',
40 #include "wimlib/types.h" /* must define 'bool', 'u8', 'u32' */
44 /* State of the suffix array LZ (Lempel-Ziv) match-finder.
46 * This is defined here for benefit of the inlined code. It's not intended for
47 * code outside the match-finder itself to read or write members from this
50 /* Allocated window size for the match-finder.
52 * Note: this match-finder does not store the window itself, as the
53 * suffix array (@SA) and associated arrays (@ISA, @LCP, @salink) are
54 * sufficient to find matches. This number is the maximum length of
55 * those arrays, or also the maximum window (block) size that can be
56 * passed to lz_sarray_load_window(). */
57 input_idx_t max_window_size;
59 /* Minimum match length to return. */
60 input_idx_t min_match_len;
62 /* Maximum match length to return. */
63 input_idx_t max_match_len;
65 /* Maximum matches to consider at each position (max search depth). */
66 u32 max_matches_to_consider;
68 /* Maximum number of matches to return at each position. */
69 u32 max_matches_to_return;
71 /* Current position in the window. */
74 /* Current window size. */
75 input_idx_t window_size;
77 /* Suffix array for the current window.
78 * This is a mapping from suffix rank to suffix position. */
81 /* Inverse suffix array for the current window.
82 * This is a mapping from suffix position to suffix rank.
83 * If 0 <= r < window_size, then ISA[SA[r]] == r. */
86 /* Suffix array links.
88 * During a linear scan of the input string to find matches, this array
89 * used to keep track of which rank suffixes in the suffix array appear
90 * before the current position. Instead of searching in the original
91 * suffix array, scans for matches at a given position traverse a linked
92 * list containing only suffixes that appear before that position. */
93 struct salink *salink;
96 /* Suffix array link; one of these exists for each position in the suffix array.
99 /* Rank of highest ranked suffix that has rank lower than the suffix
100 * corresponding to this structure and either has a lower position
101 * (initially) or has a position lower than the highest position at
102 * which matches have been searched for so far, or ~(input_idx_t)0 if
103 * there is no such suffix.
105 * Think of this as a pointer to the closest position in the suffix
106 * array to the left that corresponds to a suffix that begins at a
107 * position in the current dictionary (i.e. before the current position
111 /* Rank of lowest ranked suffix that has rank greater than the suffix
112 * corresponding to this structure and either has a lower position
113 * (intially) or has a position lower than the highest position at which
114 * matches have been searched for so far, or ~(input_idx_t)0 if there is
117 * Think of this as a pointer to the closest position in the suffix
118 * array to the right that corresponds to a suffix that begins at a
119 * position in the current dictionary (i.e. before the current position
123 /* Length of longest common prefix between the suffix corresponding to
124 * this structure and the suffix with rank @prev, or 0 if @prev is
125 * ~(input_idx_t)0. */
128 /* Length of longest common prefix between the suffix corresponding to
129 * this structure and the suffix with rank @next, or 0 if @next is
130 * ~(input_idx_t)0. */
134 /*-----------------------------------*/
135 /* Functions defined in lz_sarray.c */
136 /*-----------------------------------*/
139 lz_sarray_init(struct lz_sarray *mf,
140 input_idx_t max_window_size,
141 input_idx_t min_match_len,
142 input_idx_t max_match_len,
143 u32 max_matches_to_consider,
144 u32 max_matches_to_return);
147 lz_sarray_get_needed_memory(input_idx_t max_window_size);
150 lz_sarray_destroy(struct lz_sarray *mf);
153 lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], input_idx_t n);
155 /*-------------------*/
156 /* Inline functions */
157 /*-------------------*/
159 static _always_inline_attribute input_idx_t
160 lz_sarray_get_pos(const struct lz_sarray *mf)
165 /* Advance the suffix array match-finder to the next position. */
166 static _always_inline_attribute void
167 lz_sarray_update_salink(const input_idx_t i,
168 const input_idx_t SA[const restrict],
169 const input_idx_t ISA[const restrict],
170 struct salink link[const restrict])
172 /* r = Rank of the suffix at the current position. */
173 const input_idx_t r = ISA[i];
175 /* next = rank of LOWEST ranked suffix that is ranked HIGHER than the
176 * current suffix AND has a LOWER position, or ~(input_idx_t)0 if none
178 const input_idx_t next = link[r].next;
180 /* prev = rank of HIGHEST ranked suffix that is ranked LOWER than the
181 * current suffix AND has a LOWER position, or ~(input_idx_t)0 if none
183 const input_idx_t prev = link[r].prev;
185 /* Link the suffix at the current position into the linked list that
186 * contains all suffixes referenced by the suffix array that appear at
187 * or before the current position, sorted by rank. */
188 if (next != ~(input_idx_t)0) {
190 link[next].lcpprev = link[r].lcpnext;
193 if (prev != ~(input_idx_t)0) {
195 link[prev].lcpnext = link[r].lcpprev;
199 /* Skip the current position in the suffix array match-finder. */
200 static _always_inline_attribute void
201 lz_sarray_skip_position(struct lz_sarray *mf)
203 LZ_ASSERT(mf->cur_pos < mf->window_size);
204 lz_sarray_update_salink(mf->cur_pos++, mf->SA, mf->ISA, mf->salink);
207 typedef input_idx_t lz_sarray_cost_t;
208 #define LZ_SARRAY_INFINITE_COST (~(lz_sarray_cost_t)0)
211 * Use the suffix array match-finder to retrieve a list of matches at the
214 * Returns the number of matches written into @matches. The matches are
215 * returned in decreasing order by length, and each will be of unique length
216 * between the minimum and maximum match lengths (inclusively) passed to
217 * lz_sarray_init(). Up to @max_matches_to_return (passed to lz_sarray_init())
218 * matches will be returned.
220 * @eval_match_cost is a function for evaluating the cost of a match when
221 * deciding which ones to return. It needs to be fast, and need not be exact;
222 * an implementation might simply rank matches by their offset, for example,
223 * although implementations may choose to take into account additional
224 * information such as repeat offsets.
226 static _always_inline_attribute u32
227 lz_sarray_get_matches(struct lz_sarray *mf,
228 struct raw_match matches[],
229 lz_sarray_cost_t (*eval_match_cost)
233 const void *eval_match_cost_ctx)
235 LZ_ASSERT(mf->cur_pos < mf->window_size);
236 const input_idx_t i = mf->cur_pos++;
238 const input_idx_t * const restrict SA = mf->SA;
239 const input_idx_t * const restrict ISA = mf->ISA;
240 struct salink * const restrict link = mf->salink;
241 const input_idx_t min_match_len = mf->min_match_len;
242 const u32 max_matches_to_consider = mf->max_matches_to_consider;
243 const u32 max_matches_to_return = mf->max_matches_to_return;
245 /* r = Rank of the suffix at the current position. */
246 const input_idx_t r = ISA[i];
248 /* Prepare for searching the current position. */
249 lz_sarray_update_salink(i, SA, ISA, link);
251 /* L = rank of next suffix to the left;
252 * R = rank of next suffix to the right;
253 * lenL = length of match between current position and the suffix with rank L;
254 * lenR = length of match between current position and the suffix with rank R.
256 * This is left and right relative to the rank of the current suffix.
257 * Since the suffixes in the suffix array are sorted, the longest
258 * matches are immediately to the left and right (using the linked list
259 * to ignore all suffixes that occur later in the window). The match
260 * length decreases the farther left and right we go. We shall keep the
261 * length on both sides in sync in order to choose the lowest-cost match
264 input_idx_t L = link[r].prev;
265 input_idx_t R = link[r].next;
266 input_idx_t lenL = link[r].lcpprev;
267 input_idx_t lenR = link[r].lcpnext;
269 /* nmatches = number of matches found so far. */
272 /* best_cost = cost of lowest-cost match found so far.
274 * We keep track of this so that we can ignore shorter matches that do
275 * not have lower costs than longer matches already found. */
276 lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST;
278 /* count_remaining = maximum number of possible matches remaining to be
280 u32 count_remaining = max_matches_to_consider;
282 /* pending = match currently being considered for a specific length. */
283 struct raw_match pending;
284 lz_sarray_cost_t pending_cost;
286 while (lenL >= min_match_len || lenR >= min_match_len)
289 pending_cost = LZ_SARRAY_INFINITE_COST;
290 lz_sarray_cost_t cost;
293 if (lenL >= min_match_len && lenL >= lenR) {
296 if (--count_remaining == 0)
297 goto out_save_pending;
299 input_idx_t offset = i - SA[L];
301 /* Save match if it has smaller cost. */
302 cost = (*eval_match_cost)(lenL, offset,
303 eval_match_cost_ctx);
304 if (cost < pending_cost) {
305 pending.offset = offset;
309 if (link[L].lcpprev < lenL) {
310 /* Match length decreased. */
312 lenL = link[L].lcpprev;
314 /* Save the pending match unless the
315 * right side still may have matches of
316 * this length to be scanned, or if a
317 * previous (longer) match had lower
319 if (pending.len > lenR) {
320 if (pending_cost < best_cost) {
321 best_cost = pending_cost;
322 matches[nmatches++] = pending;
323 if (nmatches == max_matches_to_return)
327 pending_cost = LZ_SARRAY_INFINITE_COST;
329 if (lenL < min_match_len || lenL < lenR)
339 if (lenR >= min_match_len && lenR > lenL) {
342 if (--count_remaining == 0)
343 goto out_save_pending;
345 input_idx_t offset = i - SA[R];
347 /* Save match if it has smaller cost. */
348 cost = (*eval_match_cost)(lenR,
350 eval_match_cost_ctx);
351 if (cost < pending_cost) {
352 pending.offset = offset;
356 if (link[R].lcpnext < lenR) {
357 /* Match length decreased. */
359 lenR = link[R].lcpnext;
361 /* Save the pending match unless a
362 * previous (longer) match had lower
364 if (pending_cost < best_cost) {
365 matches[nmatches++] = pending;
366 best_cost = pending_cost;
367 if (nmatches == max_matches_to_return)
371 if (lenR < min_match_len || lenR <= lenL)
375 pending_cost = LZ_SARRAY_INFINITE_COST;
384 if (pending_cost != LZ_SARRAY_INFINITE_COST)
385 matches[nmatches++] = pending;
391 #endif /* _WIMLIB_LZ_SARRAY_H */