2 * lz_suffix_array_utils.c
4 * Common utilities for suffix-array based Lempel-Ziv match-finding algorithms.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "wimlib/divsufsort.h"
37 #include "wimlib/lz_mf.h"
38 #include "wimlib/lz_suffix_array_utils.h"
39 #include "wimlib/util.h"
41 /* If ENABLE_LZ_DEBUG is defined, verify that the suffix array satisfies its
44 * WARNING: this is for debug use only as it does not necessarily run in linear
47 verify_SA(const u32 *SA, const u8 *T, u32 n, u32 *tmp)
49 #ifdef ENABLE_LZ_DEBUG
50 /* Ensure the SA contains exactly one of each i in [0, n - 1]. */
51 for (u32 i = 0; i < n; i++)
53 for (u32 r = 0; r < n; r++) {
60 /* Ensure the suffix with rank r is lexicographically less than the
61 * suffix with rank (r + 1) for all r in [0, n - 2]. */
62 for (u32 r = 0; r < n - 1; r++) {
70 int res = memcmp(&T[i1], &T[i2], min(n1, n2));
71 LZ_ASSERT(res < 0 || (res == 0 && n1 < n2));
73 #endif /* ENABLE_LZ_DEBUG */
77 * Build the suffix array (SA) for the specified "text".
79 * The SA is a sorted array of the text's suffixes, represented by indices into
80 * the text. It can equivalently be viewed as a mapping from suffix rank to
83 * To build the SA, we currently rely on libdivsufsort, which uses an
84 * induced-sorting-based algorithm. In practice, this seems to be the fastest
85 * suffix array construction algorithm currently available.
89 * Y. Mori. libdivsufsort, a lightweight suffix-sorting library.
90 * https://code.google.com/p/libdivsufsort/.
92 * G. Nong, S. Zhang, and W.H. Chan. 2009. Linear Suffix Array
93 * Construction by Almost Pure Induced-Sorting. Data Compression
94 * Conference, 2009. DCC '09. pp. 193 - 202.
96 * S.J. Puglisi, W.F. Smyth, and A. Turpin. 2007. A Taxonomy of Suffix
97 * Array Construction Algorithms. ACM Computing Surveys (CSUR) Volume 39
98 * Issue 2, 2007 Article No. 4.
101 build_SA(u32 *SA, const u8 *T, u32 n, u32 *tmp)
103 BUILD_BUG_ON(BUILD_SA_MIN_TMP_LEN !=
104 DIVSUFSORT_TMP1_LEN + DIVSUFSORT_TMP2_LEN);
106 /* Note: divsufsort() needs temporary space --- one array with 256
107 * spaces and one array with 65536 spaces. The implementation of
108 * divsufsort() has been modified from the original to use the provided
109 * temporary space instead of allocating its own, since we don't want to
110 * have to deal with malloc() failures here. */
111 divsufsort(T, SA, n, tmp, tmp + DIVSUFSORT_TMP1_LEN);
113 verify_SA(SA, T, n, tmp);
117 /* Build the inverse suffix array @ISA from the suffix array @SA in linear time.
119 * Whereas the suffix array is a mapping from suffix rank to suffix position,
120 * the inverse suffix array is a mapping from suffix position to suffix rank.
123 build_ISA(u32 * restrict ISA, const u32 * restrict SA, u32 n)
125 for (u32 r = 0; r < n; r++)
129 /* If ENABLE_LZ_DEBUG is defined, verify that the LCP (Longest Common Prefix)
130 * array satisfies its definition.
132 * WARNING: this is for debug use only as it does not necessarily run in linear
135 verify_LCP(const u32 *LCP, const u32 *SA, const u8 *T, u32 n)
137 #ifdef ENABLE_LZ_DEBUG
138 for (u32 r = 0; r < n - 1; r++) {
141 u32 lcp = LCP[r + 1];
146 LZ_ASSERT(lcp <= min(n1, n2));
148 LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
149 if (lcp < min(n1, n2))
150 LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
152 #endif /* ENABLE_LZ_DEBUG */
156 * Build the LCP (Longest Common Prefix) array in linear time.
158 * LCP[r] will be the length of the longest common prefix between the suffixes
159 * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
161 * Algorithm taken from Kasai et al. (2001), but modified slightly to take into
162 * account that with bytes in the real world, there is no unique symbol at the
167 * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in
168 * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th
169 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
172 build_LCP(u32 * restrict LCP, const u32 * restrict SA,
173 const u32 * restrict ISA, const u8 * restrict T, u32 n)
178 for (i = 0; i < n; i++) {
182 lim = min(n - i, n - j);
184 while (h < lim && T[i + h] == T[j + h])
192 verify_LCP(LCP, SA, T, n);