4 * Suffix array match-finder for LZ (Lempel-Ziv) compression.
8 * Copyright (C) 2013 Eric Biggers
10 * This file is part of wimlib, a library for working with WIM files.
12 * wimlib is free software; you can redistribute it and/or modify it under the
13 * terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your option)
17 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19 * A PARTICULAR PURPOSE. See the GNU General Public License for more
22 * You should have received a copy of the GNU General Public License
23 * along with wimlib; if not, see http://www.gnu.org/licenses/.
30 #include "wimlib/lz_sarray.h"
31 #include "wimlib/util.h"
32 #include "divsufsort/divsufsort.h"
35 /* Initialize the suffix array match-finder with the specified parameters.
37 * After initialization, it can be used for any number of input strings of
38 * length less than or equal to @max_window_size. */
40 lz_sarray_init(struct lz_sarray *mf,
41 input_idx_t max_window_size,
42 input_idx_t min_match_len,
43 input_idx_t max_match_len,
44 u32 max_matches_to_consider,
45 u32 max_matches_to_return)
47 mf->max_window_size = max_window_size;
48 mf->min_match_len = min_match_len;
49 mf->max_match_len = max_match_len;
50 mf->max_matches_to_consider = max_matches_to_consider;
51 mf->max_matches_to_return = max_matches_to_return;
53 mf->SA = MALLOC(3U * max_window_size * sizeof(mf->SA[0]));
57 mf->salink = MALLOC(max_window_size * sizeof(mf->salink[0]));
58 if (mf->salink == NULL)
64 /* Free memory allocated for the suffix array match-finder. */
66 lz_sarray_destroy(struct lz_sarray *mf)
72 /* Initialize the suffix array match-finder for the specified input. */
74 lz_sarray_load_window(struct lz_sarray *mf, const u8 window[],
75 input_idx_t window_size)
78 const u8 * const restrict T = window;
79 const input_idx_t n = window_size;
80 const input_idx_t max_match_len = mf->max_match_len;
81 input_idx_t * const restrict SA = mf->SA;
82 input_idx_t * const restrict ISA = mf->ISA = SA + window_size;
83 input_idx_t * const restrict LCP = mf->LCP = ISA + window_size;
84 struct salink * const restrict link = mf->salink;
86 /* Compute SA (Suffix Array). */
88 /* ISA and link are used as temporary space. */
89 LZ_ASSERT(mf->max_window_size * sizeof(ISA[0]) >= 256 * sizeof(saidx_t));
90 LZ_ASSERT(mf->max_window_size * 2 * sizeof(link[0]) >= 256 * 256 * sizeof(saidx_t));
92 if (sizeof(input_idx_t) == sizeof(saidx_t)) {
93 divsufsort(T, SA, n, (saidx_t*)ISA, (saidx_t*)link);
96 divsufsort(T, sa, n, (saidx_t*)ISA, (saidx_t*)link);
97 for (input_idx_t i = 0; i < n; i++)
102 #ifdef ENABLE_LZ_DEBUG
106 /* Verify suffix array. */
110 for (input_idx_t r = 0; r < n; r++) {
111 input_idx_t i = SA[r];
113 LZ_ASSERT(!found[i]);
118 for (input_idx_t r = 0; r < n - 1; r++) {
120 input_idx_t i1 = SA[r];
121 input_idx_t i2 = SA[r + 1];
123 input_idx_t n1 = n - i1;
124 input_idx_t n2 = n - i2;
126 LZ_ASSERT(memcmp(&T[i1], &T[i2], min(n1, n2)) <= 0);
128 LZ_DEBUG("Verified SA (len %u)", n);
129 #endif /* ENABLE_LZ_DEBUG */
131 /* Compute ISA (Inverse Suffix Array) */
132 for (input_idx_t r = 0; r < n; r++)
135 /* Compute LCP (longest common prefix) array.
137 * Algorithm adapted from Kasai et al. 2001: "Linear-Time
138 * Longest-Common-Prefix Computation in Suffix Arrays and Its
142 for (input_idx_t i = 0; i < n; i++) {
143 input_idx_t r = ISA[i];
145 input_idx_t j = SA[r - 1];
147 input_idx_t lim = min(n - i, n - j);
149 while (h < lim && T[i + h] == T[j + h])
158 #ifdef ENABLE_LZ_DEBUG
159 /* Verify LCP array. */
160 for (input_idx_t r = 0; r < n - 1; r++) {
161 LZ_ASSERT(ISA[SA[r]] == r);
162 LZ_ASSERT(ISA[SA[r + 1]] == r + 1);
164 input_idx_t i1 = SA[r];
165 input_idx_t i2 = SA[r + 1];
166 input_idx_t lcp = LCP[r + 1];
168 input_idx_t n1 = n - i1;
169 input_idx_t n2 = n - i2;
171 LZ_ASSERT(lcp <= min(n1, n2));
173 LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
174 if (lcp < min(n1, n2))
175 LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
177 #endif /* ENABLE_LZ_DEBUG */
179 /* Compute salink.next and salink.lcpnext.
181 * Algorithm adapted from Crochemore et al. 2009:
182 * "LPF computation revisited".
184 * Note: we cap lcpnext to the maximum match length so that the
185 * match-finder need not worry about it later. */
186 link[n - 1].next = ~(input_idx_t)0;
187 link[n - 1].prev = ~(input_idx_t)0;
188 link[n - 1].lcpnext = 0;
189 link[n - 1].lcpprev = 0;
190 for (input_idx_t r = n - 2; r != ~(input_idx_t)0; r--) {
191 input_idx_t t = r + 1;
192 input_idx_t l = LCP[t];
193 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
194 l = min(l, link[t].lcpnext);
198 link[r].lcpnext = min(l, max_match_len);
199 LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
200 LZ_ASSERT(l <= n - SA[r]);
201 if (t == ~(input_idx_t)0)
204 LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
207 /* Compute salink.prev and salink.lcpprev.
209 * Algorithm adapted from Crochemore et al. 2009:
210 * "LPF computation revisited".
212 * Note: we cap lcpprev to the maximum match length so that the
213 * match-finder need not worry about it later. */
214 link[0].prev = ~(input_idx_t)0;
215 link[0].next = ~(input_idx_t)0;
218 for (input_idx_t r = 1; r < n; r++) {
219 input_idx_t t = r - 1;
220 input_idx_t l = LCP[r];
221 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
222 l = min(l, link[t].lcpprev);
226 link[r].lcpprev = min(l, max_match_len);
227 LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
228 LZ_ASSERT(l <= n - SA[r]);
229 if (t == ~(input_idx_t)0)
232 LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);