4 * Suffix array match-finder for Lempel-Ziv compression.
8 * Copyright (c) 2013 Eric Biggers. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 #include "wimlib/lz_sarray.h"
39 #include "wimlib/util.h"
40 #include "divsufsort/divsufsort.h"
43 /* If ENABLE_LZ_DEBUG is defined, verify that the suffix array satisfies its
46 * @SA The constructed suffix array.
47 * @T The original data.
48 * @found Temporary 'bool' array of length @n.
49 * @n Length of the data (length of @SA, @T, and @found arrays).
51 * WARNING: this is for debug use only as it does not necessarily run in linear
54 verify_suffix_array(const input_idx_t SA[restrict],
59 #ifdef ENABLE_LZ_DEBUG
60 /* Ensure the SA contains exactly one of each i in [0, n - 1]. */
61 for (input_idx_t i = 0; i < n; i++)
63 for (input_idx_t r = 0; r < n; r++) {
64 input_idx_t i = SA[r];
70 /* Ensure the suffix with rank r is lexicographically lesser than the
71 * suffix with rank (r + 1) for all r in [0, n - 2]. */
72 for (input_idx_t r = 0; r < n - 1; r++) {
74 input_idx_t i1 = SA[r];
75 input_idx_t i2 = SA[r + 1];
77 input_idx_t n1 = n - i1;
78 input_idx_t n2 = n - i2;
80 int res = memcmp(&T[i1], &T[i2], min(n1, n2));
81 LZ_ASSERT(res < 0 || (res == 0 && n1 < n2));
83 #endif /* ENABLE_LZ_DEBUG */
86 /* Compute the inverse suffix array @ISA from the suffix array @SA in linear
89 * Whereas the suffix array is a mapping from suffix rank to suffix position,
90 * the inverse suffix array is a mapping from suffix position to suffix rank.
93 compute_inverse_suffix_array(input_idx_t ISA[restrict],
94 const input_idx_t SA[restrict],
99 for (i = 0; i < n; i++)
104 /* Compute the LCP (Longest Common Prefix) array in linear time.
106 * LCP[r] will be the length of the longest common prefix between the suffixes
107 * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
109 * Algorithm adapted from Kasai et al. 2001: "Linear-Time Longest-Common-Prefix
110 * Computation in Suffix Arrays and Its Applications". Modified slightly to
111 * take into account that with bytes in the real world, there is no unique
112 * symbol at the end of the string. */
114 compute_lcp_array(input_idx_t LCP[restrict],
115 const input_idx_t SA[restrict],
116 const input_idx_t ISA[restrict],
117 const u8 T[restrict],
120 input_idx_t h, i, r, j, lim;
123 for (i = 0; i < n; i++) {
127 lim = min(n - i, n - j);
129 while (h < lim && T[i + h] == T[j + h])
138 /* If ENABLE_LZ_DEBUG is defined, verify that the LCP (Longest Common Prefix)
139 * array satisfies its definition.
141 * WARNING: this is for debug use only as it does not necessarily run in linear
144 verify_lcp_array(input_idx_t LCP[restrict],
145 const input_idx_t SA[restrict],
146 const u8 T[restrict],
149 #ifdef ENABLE_LZ_DEBUG
150 for (input_idx_t r = 0; r < n - 1; r++) {
151 input_idx_t i1 = SA[r];
152 input_idx_t i2 = SA[r + 1];
153 input_idx_t lcp = LCP[r + 1];
155 input_idx_t n1 = n - i1;
156 input_idx_t n2 = n - i2;
158 LZ_ASSERT(lcp <= min(n1, n2));
160 LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
161 if (lcp < min(n1, n2))
162 LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
164 #endif /* ENABLE_LZ_DEBUG */
167 /* Initialize the SA link array in linear time.
169 * This is similar to computing the LPF (Longest Previous Factor) array, which
170 * is addressed in several papers. In particular the algorithms below are based
171 * on Crochemore et al. 2009: "LPF computation revisited". However, this
172 * match-finder does not actually compute or use the LPF array per se. Rather,
173 * this function sets up some information necessary to compute the LPF array,
174 * but later lz_sarray_get_matches() actually uses this information to search
175 * the suffix array directly and can keep searching beyond the first (longest)
176 * match whose length would be placed in the LPF array. This difference from
177 * the theoretical work is necessary because in many real compression formats
178 * matches take variable numbers of bits to encode, so a decent parser needs to
179 * consider more than just the longest match with unspecified offset.
181 * Note: We cap the lcpprev and lcpnext values to the maximum match length so
182 * that the match-finder need not worry about it later, in the inner loop.
185 init_salink(struct salink link[restrict],
186 const input_idx_t LCP[restrict],
187 const input_idx_t SA[restrict],
188 const u8 T[restrict],
190 input_idx_t max_match_len)
192 /* Compute salink.next and salink.lcpnext. */
193 link[n - 1].next = ~(input_idx_t)0;
194 link[n - 1].lcpnext = 0;
195 for (input_idx_t r = n - 2; r != ~(input_idx_t)0; r--) {
196 input_idx_t t = r + 1;
197 input_idx_t l = LCP[t];
198 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
199 l = min(l, link[t].lcpnext);
203 link[r].lcpnext = min(l, max_match_len);
206 /* Compute salink.prev and salink.lcpprev. */
207 link[0].prev = ~(input_idx_t)0;
209 for (input_idx_t r = 1; r < n; r++) {
210 input_idx_t t = r - 1;
211 input_idx_t l = LCP[r];
212 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
213 l = min(l, link[t].lcpprev);
217 link[r].lcpprev = min(l, max_match_len);
221 /* If ENABLE_LZ_DEBUG is defined, verify the values computed by init_salink().
223 * WARNING: this is for debug use only as it does not necessarily run in linear
226 verify_salink(const struct salink link[],
227 const input_idx_t SA[],
230 input_idx_t max_match_len)
232 #ifdef ENABLE_LZ_DEBUG
233 for (input_idx_t r = 0; r < n; r++) {
234 for (input_idx_t prev = r; ; ) {
236 LZ_ASSERT(link[r].prev == ~(input_idx_t)0);
237 LZ_ASSERT(link[r].lcpprev == 0);
243 if (SA[prev] < SA[r]) {
244 LZ_ASSERT(link[r].prev == prev);
245 LZ_ASSERT(link[r].lcpprev <= n - SA[prev]);
246 LZ_ASSERT(link[r].lcpprev <= n - SA[r]);
247 LZ_ASSERT(link[r].lcpprev <= max_match_len);
248 LZ_ASSERT(0 == memcmp(&T[SA[prev]],
251 if (link[r].lcpprev < n - SA[prev] &&
252 link[r].lcpprev < n - SA[r] &&
253 link[r].lcpprev < max_match_len)
255 LZ_ASSERT(T[SA[prev] + link[r].lcpprev] !=
256 T[SA[r] + link[r].lcpprev]);
262 for (input_idx_t next = r; ; ) {
264 LZ_ASSERT(link[r].next == ~(input_idx_t)0);
265 LZ_ASSERT(link[r].lcpnext == 0);
271 if (SA[next] < SA[r]) {
272 LZ_ASSERT(link[r].next == next);
273 LZ_ASSERT(link[r].lcpnext <= n - SA[next]);
274 LZ_ASSERT(link[r].lcpnext <= n - SA[r]);
275 LZ_ASSERT(link[r].lcpnext <= max_match_len);
276 LZ_ASSERT(0 == memcmp(&T[SA[next]],
279 if (link[r].lcpnext < n - SA[next] &&
280 link[r].lcpnext < n - SA[r] &&
281 link[r].lcpnext < max_match_len)
283 LZ_ASSERT(T[SA[next] + link[r].lcpnext] !=
284 T[SA[r] + link[r].lcpnext]);
295 * Initialize the suffix array match-finder.
298 * The suffix array match-finder structure to initialize. This structure
299 * is expected to be zeroed before this function is called. In the case
300 * that this function fails, lz_sarray_destroy() should be called to free
301 * any memory that may have been allocated.
304 * The maximum window size to support.
306 * In the current implementation, the memory needed will be
307 * (6 * sizeof(input_idx_t) * @max_window_size) bytes.
308 * For (sizeof(input_idx_t) == 4) that's 24 times the window size.
310 * Memory is saved by saving neither the original window nor the LCP
311 * (Longest Common Prefix) array; otherwise 29 times the window size would
312 * be required. (In practice the compressor will likely keep the original
313 * window around anyway, although based on this property of the
314 * match-finder it theoretically it could overwrite it with the compressed
318 * The minimum length of each match to be found.
321 * The maximum length of each match to be found.
323 * @max_matches_to_consider
324 * The maximum number of matches to consider at each position. This should
325 * be greater than @max_matches_to_return because @max_matches_to_consider
326 * counts all the returned matches as well as matches of equal length to
327 * returned matches that were not returned. This parameter bounds the
328 * amount of work the match-finder does at any one position. This could be
329 * anywhere from 1 to 100+ depending on the compression ratio and
330 * performance desired.
332 * @max_matches_to_return
333 * Maximum number of matches to return at each position. Because of the
334 * suffix array search algorithm, the order in which matches are returned
335 * will be from longest to shortest, so cut-offs due to this parameter will
336 * only result in shorter matches being discarded. This parameter could be
337 * anywhere from 1 to (@max_match_len - @min_match_len + 1) depending on
338 * the compression performance desired. However, making it even moderately
339 * large (say, greater than 3) may not be very helpful due to the property
340 * that the matches are returned from longest to shortest. But the main
341 * thing to keep in mind is that if the compressor decides to output a
342 * shorter-than-possible match, ideally it would be best to choose the best
343 * match of the desired length rather than truncate a longer match to that
346 * After initialization, the suffix-array match-finder can be used for any
347 * number of input strings (windows) of length less than or equal to
348 * @max_window_size by successive calls to lz_sarray_load_window().
350 * Returns %true on success, or %false if sufficient memory could not be
351 * allocated. See the note for @max_window_size above regarding the needed
355 lz_sarray_init(struct lz_sarray *mf,
356 input_idx_t max_window_size,
357 input_idx_t min_match_len,
358 input_idx_t max_match_len,
359 u32 max_matches_to_consider,
360 u32 max_matches_to_return)
362 mf->max_window_size = max_window_size;
363 mf->min_match_len = min_match_len;
364 mf->max_match_len = max_match_len;
365 mf->max_matches_to_consider = max_matches_to_consider;
366 mf->max_matches_to_return = max_matches_to_return;
368 /* SA and ISA will share the same storage block. */
369 mf->SA = MALLOC(2 * max_window_size * sizeof(mf->SA[0]));
373 mf->salink = MALLOC(max_window_size * sizeof(mf->salink[0]));
374 if (mf->salink == NULL)
381 * Prepare the suffix array match-finder to scan the specified window for
384 * @mf Suffix array match-finder previously initialized with lz_sarray_init().
386 * @T Window, or "block", in which to find matches.
388 * @n Size of window in bytes. This must be positive and less than or equal
389 * to the @max_window_size passed to lz_sarray_init().
391 * This function runs in linear time (relative to @n).
394 lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], input_idx_t n)
396 input_idx_t *ISA, *LCP;
398 LZ_ASSERT(n > 0 && n <= mf->max_window_size);
400 /* Compute SA (Suffix Array).
402 * divsufsort() needs temporary space --- one array with 256 spaces and
403 * one array with 65536 spaces. The implementation has been modified
404 * from the original to use the provided temporary space instead of
405 * allocating its own.
407 * We also check at build-time that divsufsort() uses the same integer
408 * size expected by this code. Unfortunately, divsufsort breaks if
409 * 'sa_idx_t' is defined to be a 16-bit integer; however, that would
410 * limit blocks to only 65536 bytes anyway. */
411 LZ_ASSERT(mf->max_window_size * sizeof(mf->SA[0])
412 >= 256 * sizeof(saidx_t));
413 LZ_ASSERT(mf->max_window_size * sizeof(mf->salink[0])
414 >= 256 * 256 * sizeof(saidx_t));
415 BUILD_BUG_ON(sizeof(input_idx_t) != sizeof(saidx_t));
417 divsufsort(T, mf->SA, n, (saidx_t*)&mf->SA[n], (saidx_t*)mf->salink);
419 BUILD_BUG_ON(sizeof(bool) > sizeof(mf->salink[0]));
420 verify_suffix_array(mf->SA, T, (bool*)mf->salink, n);
422 /* Compute ISA (Inverse Suffix Array) in a preliminary position.
424 * This is just a trick to save memory. Since LCP is unneeded after
425 * this function, it can be computed in any available space. The
426 * storage for the ISA is the best choice because the ISA can be built
427 * quickly in salink for now, then re-built in its real location at the
428 * end. This is probably worth it because computing the ISA from the SA
429 * is very fast, and since this match-finder is memory-hungry we'd like
430 * to save as much memory as possible. */
431 ISA = (input_idx_t*)mf->salink;
432 compute_inverse_suffix_array(ISA, mf->SA, n);
434 /* Compute LCP (Longest Common Prefix) array. */
436 compute_lcp_array(LCP, mf->SA, ISA, T, n);
437 verify_lcp_array(LCP, mf->SA, T, n);
439 /* Initialize suffix array links. */
440 init_salink(mf->salink, LCP, mf->SA, T, n, mf->max_match_len);
441 verify_salink(mf->salink, mf->SA, T, n, mf->max_match_len);
443 /* Compute ISA (Inverse Suffix Array) in its final position. */
445 compute_inverse_suffix_array(ISA, mf->SA, n);
447 /* Save new variables and return. */
453 /* Free memory allocated for the suffix array match-finder. */
455 lz_sarray_destroy(struct lz_sarray *mf)