4 * Suffix array match-finder for LZ (Lempel-Ziv) compression.
8 * Copyright (c) 2013 Eric Biggers. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
28 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
29 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
31 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 #include "wimlib/lz_sarray.h"
39 #include "wimlib/util.h"
40 #include "divsufsort/divsufsort.h"
43 /* Initialize the suffix array match-finder with the specified parameters.
45 * After initialization, it can be used for any number of input strings of
46 * length less than or equal to @max_window_size. */
48 lz_sarray_init(struct lz_sarray *mf,
49 input_idx_t max_window_size,
50 input_idx_t min_match_len,
51 input_idx_t max_match_len,
52 u32 max_matches_to_consider,
53 u32 max_matches_to_return)
55 mf->max_window_size = max_window_size;
56 mf->min_match_len = min_match_len;
57 mf->max_match_len = max_match_len;
58 mf->max_matches_to_consider = max_matches_to_consider;
59 mf->max_matches_to_return = max_matches_to_return;
61 mf->SA = MALLOC(3U * max_window_size * sizeof(mf->SA[0]));
65 mf->salink = MALLOC(max_window_size * sizeof(mf->salink[0]));
66 if (mf->salink == NULL)
72 /* Free memory allocated for the suffix array match-finder. */
74 lz_sarray_destroy(struct lz_sarray *mf)
80 /* Initialize the suffix array match-finder for the specified input. */
82 lz_sarray_load_window(struct lz_sarray *mf, const u8 window[],
83 input_idx_t window_size)
86 const u8 * const restrict T = window;
87 const input_idx_t n = window_size;
88 const input_idx_t max_match_len = mf->max_match_len;
89 input_idx_t * const restrict SA = mf->SA;
90 input_idx_t * const restrict ISA = mf->ISA = SA + window_size;
91 input_idx_t * const restrict LCP = mf->LCP = ISA + window_size;
92 struct salink * const restrict link = mf->salink;
94 /* Compute SA (Suffix Array). */
96 /* ISA and link are used as temporary space. */
97 LZ_ASSERT(mf->max_window_size * sizeof(ISA[0]) >= 256 * sizeof(saidx_t));
98 LZ_ASSERT(mf->max_window_size * 2 * sizeof(link[0]) >= 256 * 256 * sizeof(saidx_t));
100 if (sizeof(input_idx_t) == sizeof(saidx_t)) {
101 divsufsort(T, SA, n, (saidx_t*)ISA, (saidx_t*)link);
104 divsufsort(T, sa, n, (saidx_t*)ISA, (saidx_t*)link);
105 for (input_idx_t i = 0; i < n; i++)
110 #ifdef ENABLE_LZ_DEBUG
114 /* Verify suffix array. */
118 for (input_idx_t r = 0; r < n; r++) {
119 input_idx_t i = SA[r];
121 LZ_ASSERT(!found[i]);
126 for (input_idx_t r = 0; r < n - 1; r++) {
128 input_idx_t i1 = SA[r];
129 input_idx_t i2 = SA[r + 1];
131 input_idx_t n1 = n - i1;
132 input_idx_t n2 = n - i2;
134 LZ_ASSERT(memcmp(&T[i1], &T[i2], min(n1, n2)) <= 0);
136 LZ_DEBUG("Verified SA (len %u)", n);
137 #endif /* ENABLE_LZ_DEBUG */
139 /* Compute ISA (Inverse Suffix Array) */
140 for (input_idx_t r = 0; r < n; r++)
143 /* Compute LCP (longest common prefix) array.
145 * Algorithm adapted from Kasai et al. 2001: "Linear-Time
146 * Longest-Common-Prefix Computation in Suffix Arrays and Its
150 for (input_idx_t i = 0; i < n; i++) {
151 input_idx_t r = ISA[i];
153 input_idx_t j = SA[r - 1];
155 input_idx_t lim = min(n - i, n - j);
157 while (h < lim && T[i + h] == T[j + h])
166 #ifdef ENABLE_LZ_DEBUG
167 /* Verify LCP array. */
168 for (input_idx_t r = 0; r < n - 1; r++) {
169 LZ_ASSERT(ISA[SA[r]] == r);
170 LZ_ASSERT(ISA[SA[r + 1]] == r + 1);
172 input_idx_t i1 = SA[r];
173 input_idx_t i2 = SA[r + 1];
174 input_idx_t lcp = LCP[r + 1];
176 input_idx_t n1 = n - i1;
177 input_idx_t n2 = n - i2;
179 LZ_ASSERT(lcp <= min(n1, n2));
181 LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
182 if (lcp < min(n1, n2))
183 LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
185 #endif /* ENABLE_LZ_DEBUG */
187 /* Compute salink.next and salink.lcpnext.
189 * Algorithm adapted from Crochemore et al. 2009:
190 * "LPF computation revisited".
192 * Note: we cap lcpnext to the maximum match length so that the
193 * match-finder need not worry about it later. */
194 link[n - 1].next = ~(input_idx_t)0;
195 link[n - 1].prev = ~(input_idx_t)0;
196 link[n - 1].lcpnext = 0;
197 link[n - 1].lcpprev = 0;
198 for (input_idx_t r = n - 2; r != ~(input_idx_t)0; r--) {
199 input_idx_t t = r + 1;
200 input_idx_t l = LCP[t];
201 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
202 l = min(l, link[t].lcpnext);
206 link[r].lcpnext = min(l, max_match_len);
207 LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
208 LZ_ASSERT(l <= n - SA[r]);
209 if (t == ~(input_idx_t)0)
212 LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
215 /* Compute salink.prev and salink.lcpprev.
217 * Algorithm adapted from Crochemore et al. 2009:
218 * "LPF computation revisited".
220 * Note: we cap lcpprev to the maximum match length so that the
221 * match-finder need not worry about it later. */
222 link[0].prev = ~(input_idx_t)0;
223 link[0].next = ~(input_idx_t)0;
226 for (input_idx_t r = 1; r < n; r++) {
227 input_idx_t t = r - 1;
228 input_idx_t l = LCP[r];
229 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
230 l = min(l, link[t].lcpprev);
234 link[r].lcpprev = min(l, max_match_len);
235 LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
236 LZ_ASSERT(l <= n - SA[r]);
237 if (t == ~(input_idx_t)0)
240 LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);