4 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
5 * traversal of the Longest Common Prefix (LCP) interval tree.
10 * The author dedicates this file to the public domain.
11 * You can do whatever you want with this file.
17 #include "wimlib/divsufsort.h"
18 #include "wimlib/lcpit_matchfinder.h"
19 #include "wimlib/util.h"
22 #define LCP_MASK ((1 << LCP_BITS) - 1)
23 #define LCP_MAX LCP_MASK
24 #define NORMAL_UNVISITED_TAG ((u32)1 << 31)
25 #define MAX_NORMAL_BUFSIZE ((u32)1 << (31 - LCP_BITS))
26 #define HUGE_UNVISITED_TAG ((u64)1 << 63)
27 #define SA_and_LCP_LCP_SHIFT (32 - LCP_BITS)
28 #define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
31 * Include the template header to define the functions build_LCP(),
32 * build_LCPIT(), and lcpit_advance_one_byte(). There are "normal" and "huge"
33 * versions of each function. The normal versions assume that a buffer position
34 * and LCP value can be packed into a 32-bit integer, whereas the huge versions
35 * assume that 64 bits is needed.
37 * Both versions cap LCP values to 6 bits. This limits the depth of the
38 * lcp-interval tree without hurting the compression ratio too much. Matches of
39 * length 63 are sufficiently long that the compression ratio doesn't change
40 * significantly if we choose one such match over another.
43 #include "lcpit_matchfinder_templates.h"
47 #include "lcpit_matchfinder_templates.h"
51 * Calculate the number of bytes of memory needed for the LCP-interval tree
54 * @max_bufsize - maximum buffer size to support
56 * Returns the number of bytes required.
59 lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
63 /* pos_data (+1 is for prefetch) */
64 size += ((u64)max_bufsize + 1) * sizeof(u32);
66 /* intervals or intervals64 */
67 size += max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) *
68 (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
71 size += (u64)max_bufsize * sizeof(u32);
77 * Initialize the LCP-interval tree matchfinder.
79 * @mf - the matchfinder structure to initialize
80 * @max_bufsize - maximum buffer size to support
81 * @min_match_len - minimum match length in bytes
82 * @nice_match_len - only consider this many bytes of each match
84 * Returns true if successfully initialized; false if out of memory.
87 lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
88 u32 min_match_len, u32 nice_match_len)
90 if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
93 mf->pos_data = MALLOC((max_bufsize + 1) * sizeof(u32));
94 mf->intervals = MALLOC(max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) *
95 (max_bufsize <= MAX_NORMAL_BUFSIZE ?
96 sizeof(u32) : sizeof(u64)));
97 mf->SA = MALLOC(max_bufsize * sizeof(u32));
99 if (!mf->pos_data || !mf->intervals || !mf->SA) {
100 lcpit_matchfinder_destroy(mf);
104 mf->min_match_len = min_match_len;
105 mf->nice_match_len = min(nice_match_len, LCP_MAX);
110 * Build the suffix array SA for the specified byte array T of length n.
112 * The suffix array is a sorted array of the byte array's suffixes, represented
113 * by indices into the byte array. It can equivalently be viewed as a mapping
114 * from suffix rank to suffix position.
116 * To build the suffix array, we use libdivsufsort, which uses an
117 * induced-sorting-based algorithm. In practice, this seems to be the fastest
118 * suffix array construction algorithm currently available.
122 * Y. Mori. libdivsufsort, a lightweight suffix-sorting library.
123 * https://code.google.com/p/libdivsufsort/.
125 * G. Nong, S. Zhang, and W.H. Chan. 2009. Linear Suffix Array
126 * Construction by Almost Pure Induced-Sorting. Data Compression
127 * Conference, 2009. DCC '09. pp. 193 - 202.
129 * S.J. Puglisi, W.F. Smyth, and A. Turpin. 2007. A Taxonomy of Suffix
130 * Array Construction Algorithms. ACM Computing Surveys (CSUR) Volume 39
131 * Issue 2, 2007 Article No. 4.
134 build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
136 /* Note: divsufsort() needs temporary space --- one array with 256
137 * spaces and one array with 65536 spaces. The implementation of
138 * divsufsort() has been modified from the original to use the provided
139 * temporary space instead of allocating its own, since we don't want to
140 * have to deal with malloc() failures here. */
141 divsufsort(T, SA, n, tmp);
145 * Build the inverse suffix array ISA from the suffix array SA.
147 * Whereas the suffix array is a mapping from suffix rank to suffix position,
148 * the inverse suffix array is a mapping from suffix position to suffix rank.
151 build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
153 for (u32 r = 0; r < n; r++)
158 * Prepare the LCP-interval tree matchfinder for a new input buffer.
160 * @mf - the initialized matchfinder structure
161 * @T - the input buffer
162 * @n - size of the input buffer in bytes. This may be at most the max_bufsize
163 * with which lcpit_matchfinder_init() was called.
166 lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
171 build_SA(mf->SA, T, n, mf->intervals);
172 build_ISA(mf->pos_data, mf->SA, n);
173 if (n <= MAX_NORMAL_BUFSIZE) {
174 /* "Normal" sized buffer */
176 /* Build LCP, packing it into ->SA */
177 build_LCP_normal(mf->SA, mf->pos_data, T, n,
178 mf->min_match_len, mf->nice_match_len);
179 /* Prepare ->intervals and ->pos_data */
180 build_LCPIT_normal(mf->SA, mf->intervals, mf->pos_data, n);
181 mf->huge_mode = false;
183 /* "Huge" sized buffer */
185 /* Build LCP in the second half of ->intervals64. It may be
186 * partially overwritten in build_LCPIT_huge(), but this is okay
187 * since each LCP entry is guaranteed to be consumed before it
188 * can possibly be overwritten. */
189 build_LCP_huge(mf->intervals + n, mf->SA, mf->pos_data, T, n,
190 mf->min_match_len, mf->nice_match_len);
191 /* Prepare ->intervals64 and ->pos_data */
192 build_LCPIT_huge(mf->SA, mf->intervals + n, mf->intervals64,
194 mf->huge_mode = true;
196 mf->cur_pos = 0; /* starting at beginning of input buffer */
197 mf->pos_data[n] = 0; /* safety entry for prefetch() overrun */
201 * Retrieve a list of matches with the next position.
203 * The matches will be recorded in the @matches array, ordered by strictly
204 * decreasing length and strictly decreasing offset.
206 * The return value is the number of matches found and written to @matches.
207 * This can be any value in [0, nice_match_len - min_match_len + 1].
209 * If the caller attempts to advance beyond the end of the input buffer, the
210 * behavior is undefined.
213 lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
214 struct lz_match *matches)
217 return lcpit_advance_one_byte_huge(mf, matches, true);
219 return lcpit_advance_one_byte_normal(mf, matches, true);
223 * Skip the next @count bytes (don't search for matches at them). @count is
226 * If the caller attempts to advance beyond the end of the input buffer, the
227 * behavior is undefined.
230 lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
234 lcpit_advance_one_byte_huge(mf, NULL, false);
238 lcpit_advance_one_byte_normal(mf, NULL, false);
244 * Destroy an LCP-interval tree matchfinder that was previously initialized with
245 * lcpit_matchfinder_init().
247 * If the struct has been zeroed out, this has no effect.
250 lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)
255 memset(mf, 0, sizeof(*mf));