4 * Suffix array match-finder for Lempel-Ziv compression.
8 * Copyright (c) 2013 Eric Biggers. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #ifndef _WIMLIB_LZ_SARRAY_H
35 #define _WIMLIB_LZ_SARRAY_H
37 #include "wimlib/compiler.h" /* must define '_always_inline_attribute' */
38 #include "wimlib/lz.h" /* must define 'struct raw_match', 'input_idx_t',
40 #include "wimlib/types.h" /* must define 'bool', 'u8', 'u32' */
44 /* State of the suffix array LZ (Lempel-Ziv) match-finder.
46 * This is defined here for benefit of the inlined code. It's not intended for
47 * code outside the match-finder itself to read or write members from this
50 /* Allocated window size for the match-finder.
52 * Note: this match-finder does not store the window itself, as the
53 * suffix array (@SA) and associated arrays (@ISA, @LCP, @salink) are
54 * sufficient to find matches. This number is the maximum length of
55 * those arrays, or also the maximum window (block) size that can be
56 * passed to lz_sarray_load_window(). */
57 input_idx_t max_window_size;
59 /* Minimum match length to return. */
60 input_idx_t min_match_len;
62 /* Maximum match length to return. */
63 input_idx_t max_match_len;
65 /* Maximum matches to consider at each position (max search depth). */
66 u32 max_matches_to_consider;
68 /* Maximum number of matches to return at each position. */
69 u32 max_matches_to_return;
71 /* Current position in the window. */
74 /* Current window size. */
75 input_idx_t window_size;
77 /* Suffix array for the current window.
78 * This is a mapping from suffix rank to suffix position. */
81 /* Inverse suffix array for the current window.
82 * This is a mapping from suffix position to suffix rank.
83 * If 0 <= r < window_size, then ISA[SA[r]] == r. */
86 /* Suffix array links.
88 * During a linear scan of the input string to find matches, this array
89 * used to keep track of which rank suffixes in the suffix array appear
90 * before the current position. Instead of searching in the original
91 * suffix array, scans for matches at a given position traverse a linked
92 * list containing only suffixes that appear before that position. */
93 struct salink *salink;
96 /* Suffix array link; one of these exists for each position in the suffix array.
99 /* Rank of highest ranked suffix that has rank lower than the suffix
100 * corresponding to this structure and either has a lower position
101 * (initially) or has a position lower than the highest position at
102 * which matches have been searched for so far, or ~(input_idx_t)0 if
103 * there is no such suffix.
105 * Think of this as a pointer to the closest position in the suffix
106 * array to the left that corresponds to a suffix that begins at a
107 * position in the current dictionary (i.e. before the current position
111 /* Rank of lowest ranked suffix that has rank greater than the suffix
112 * corresponding to this structure and either has a lower position
113 * (intially) or has a position lower than the highest position at which
114 * matches have been searched for so far, or ~(input_idx_t)0 if there is
117 * Think of this as a pointer to the closest position in the suffix
118 * array to the right that corresponds to a suffix that begins at a
119 * position in the current dictionary (i.e. before the current position
123 /* Length of longest common prefix between the suffix corresponding to
124 * this structure and the suffix with rank @prev, or 0 if @prev is
125 * ~(input_idx_t)0. */
128 /* Length of longest common prefix between the suffix corresponding to
129 * this structure and the suffix with rank @next, or 0 if @next is
130 * ~(input_idx_t)0. */
134 /*-----------------------------------*/
135 /* Functions defined in lz_sarray.c */
136 /*-----------------------------------*/
139 lz_sarray_init(struct lz_sarray *mf,
140 input_idx_t max_window_size,
141 input_idx_t min_match_len,
142 input_idx_t max_match_len,
143 u32 max_matches_to_consider,
144 u32 max_matches_to_return);
147 lz_sarray_get_needed_memory(input_idx_t max_window_size);
150 lz_sarray_destroy(struct lz_sarray *mf);
153 lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], input_idx_t n);
155 /*-------------------*/
156 /* Inline functions */
157 /*-------------------*/
159 static _always_inline_attribute input_idx_t
160 lz_sarray_get_pos(const struct lz_sarray *mf)
165 /* Advance the suffix array match-finder to the next position. */
166 static _always_inline_attribute void
167 lz_sarray_update_salink(const input_idx_t r, struct salink link[])
169 /* next = rank of LOWEST ranked suffix that is ranked HIGHER than the
170 * current suffix AND has a LOWER position, or ~(input_idx_t)0 if none
172 const input_idx_t next = link[r].next;
174 /* prev = rank of HIGHEST ranked suffix that is ranked LOWER than the
175 * current suffix AND has a LOWER position, or ~(input_idx_t)0 if none
177 const input_idx_t prev = link[r].prev;
179 /* Link the suffix at the current position into the linked list that
180 * contains all suffixes referenced by the suffix array that appear at
181 * or before the current position, sorted by rank. */
182 if (next != ~(input_idx_t)0) {
184 link[next].lcpprev = link[r].lcpnext;
187 if (prev != ~(input_idx_t)0) {
189 link[prev].lcpnext = link[r].lcpprev;
193 /* Skip the current position in the suffix array match-finder. */
194 static _always_inline_attribute void
195 lz_sarray_skip_position(struct lz_sarray *mf)
197 LZ_ASSERT(mf->cur_pos < mf->window_size);
198 lz_sarray_update_salink(mf->ISA[mf->cur_pos++], mf->salink);
201 typedef input_idx_t lz_sarray_cost_t;
202 #define LZ_SARRAY_INFINITE_COST (~(lz_sarray_cost_t)0)
205 * Use the suffix array match-finder to retrieve a list of matches at the
208 * Returns the number of matches written into @matches. The matches are
209 * returned in decreasing order by length, and each will be of unique length
210 * between the minimum and maximum match lengths (inclusively) passed to
211 * lz_sarray_init(). Up to @max_matches_to_return (passed to lz_sarray_init())
212 * matches will be returned.
214 * @eval_match_cost is a function for evaluating the cost of a match when
215 * deciding which ones to return. It needs to be fast, and need not be exact;
216 * an implementation might simply rank matches by their offset, for example,
217 * although implementations may choose to take into account additional
218 * information such as repeat offsets.
220 static _always_inline_attribute u32
221 lz_sarray_get_matches(struct lz_sarray *mf,
222 struct raw_match matches[],
223 lz_sarray_cost_t (*eval_match_cost)
227 const void *eval_match_cost_ctx)
229 LZ_ASSERT(mf->cur_pos < mf->window_size);
230 const input_idx_t i = mf->cur_pos++;
232 const input_idx_t * const restrict SA = mf->SA;
233 const input_idx_t * const restrict ISA = mf->ISA;
234 struct salink * const restrict link = mf->salink;
235 const input_idx_t min_match_len = mf->min_match_len;
236 const u32 max_matches_to_consider = mf->max_matches_to_consider;
237 const u32 max_matches_to_return = mf->max_matches_to_return;
239 /* r = Rank of the suffix at the current position. */
240 const input_idx_t r = ISA[i];
242 /* Prepare for searching the current position. */
243 lz_sarray_update_salink(r, link);
245 /* L = rank of next suffix to the left;
246 * R = rank of next suffix to the right;
247 * lenL = length of match between current position and the suffix with rank L;
248 * lenR = length of match between current position and the suffix with rank R.
250 * This is left and right relative to the rank of the current suffix.
251 * Since the suffixes in the suffix array are sorted, the longest
252 * matches are immediately to the left and right (using the linked list
253 * to ignore all suffixes that occur later in the window). The match
254 * length decreases the farther left and right we go. We shall keep the
255 * length on both sides in sync in order to choose the lowest-cost match
258 input_idx_t L = link[r].prev;
259 input_idx_t R = link[r].next;
260 input_idx_t lenL = link[r].lcpprev;
261 input_idx_t lenR = link[r].lcpnext;
263 /* nmatches = number of matches found so far. */
266 /* best_cost = cost of lowest-cost match found so far.
268 * We keep track of this so that we can ignore shorter matches that do
269 * not have lower costs than longer matches already found. */
270 lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST;
272 /* count_remaining = maximum number of possible matches remaining to be
274 u32 count_remaining = max_matches_to_consider;
276 /* pending = match currently being considered for a specific length. */
277 struct raw_match pending;
278 lz_sarray_cost_t pending_cost;
280 while (lenL >= min_match_len || lenR >= min_match_len)
283 pending_cost = LZ_SARRAY_INFINITE_COST;
284 lz_sarray_cost_t cost;
287 if (lenL >= min_match_len && lenL >= lenR) {
290 if (--count_remaining == 0)
291 goto out_save_pending;
293 input_idx_t offset = i - SA[L];
295 /* Save match if it has smaller cost. */
296 cost = (*eval_match_cost)(lenL, offset,
297 eval_match_cost_ctx);
298 if (cost < pending_cost) {
299 pending.offset = offset;
303 if (link[L].lcpprev < lenL) {
304 /* Match length decreased. */
306 lenL = link[L].lcpprev;
308 /* Save the pending match unless the
309 * right side still may have matches of
310 * this length to be scanned, or if a
311 * previous (longer) match had lower
313 if (pending.len > lenR) {
314 if (pending_cost < best_cost) {
315 best_cost = pending_cost;
316 matches[nmatches++] = pending;
317 if (nmatches == max_matches_to_return)
321 pending_cost = LZ_SARRAY_INFINITE_COST;
323 if (lenL < min_match_len || lenL < lenR)
333 if (lenR >= min_match_len && lenR > lenL) {
336 if (--count_remaining == 0)
337 goto out_save_pending;
339 input_idx_t offset = i - SA[R];
341 /* Save match if it has smaller cost. */
342 cost = (*eval_match_cost)(lenR,
344 eval_match_cost_ctx);
345 if (cost < pending_cost) {
346 pending.offset = offset;
350 if (link[R].lcpnext < lenR) {
351 /* Match length decreased. */
353 lenR = link[R].lcpnext;
355 /* Save the pending match unless a
356 * previous (longer) match had lower
358 if (pending_cost < best_cost) {
359 matches[nmatches++] = pending;
360 best_cost = pending_cost;
361 if (nmatches == max_matches_to_return)
365 if (lenR < min_match_len || lenR <= lenL)
369 pending_cost = LZ_SARRAY_INFINITE_COST;
378 if (pending_cost != LZ_SARRAY_INFINITE_COST)
379 matches[nmatches++] = pending;
385 #endif /* _WIMLIB_LZ_SARRAY_H */