2 * lz_lcp_interval_tree.c
4 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
5 * traversal of the Longest Common Prefix (LCP) interval tree.
7 * Copyright (c) 2014 Eric Biggers. All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
27 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 #include "wimlib/lz_mf.h"
38 #include "wimlib/lz_suffix_array_utils.h"
39 #include "wimlib/util.h"
42 * To save space, we pack lcp (longest common prefix) and position values into
43 * 32-bit integers. Therefore, we must divide the 32 bits into lcp and position
44 * bits. 6 lcp bits seems to be a good value, since matches of length 64 are
45 * sufficiently long so that the compression ratio isn't hurt much by choosing
46 * one such match over another. We also use 1 bit to mark intervals as "not yet
47 * visited". This leaves 25 bits, which when used for position results in a
48 * maximum window size of 33554432 bytes.
50 #define LZ_LCPIT_LCP_BITS 6
51 #define LZ_LCPIT_LCP_MASK ((1 << LZ_LCPIT_LCP_BITS) - 1)
52 #define LZ_LCPIT_LCP_MAX LZ_LCPIT_LCP_MASK
53 #define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS)
54 #define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS)
59 /* Each of the arrays has length equal to the window size. This
60 * therefore results in an additional memory usage of 12 bytes per
61 * position. (That's compared to about 8 for binary trees or 4 for hash
62 * chains, for example.)
64 * We allocate these arrays in one contiguous block. 'SA' is first,
65 * 'intervals' is second, and 'pos_data' is third. */
67 /* Pointer to the suffix array */
70 /* Mapping: lcp-interval index => lcp-interval data
72 * Initially, the lcp-interval data for an lcp-interval contains that
73 * interval's lcp and superinterval index.
75 * After a lcp-interval is visited during match-finding, its
76 * lcp-interval data contains that interval's lcp and the position of
77 * the next suffix to consider as a match when matching against that
81 /* Mapping: suffix index ("window position") => lcp-interval index */
86 * Use the suffix array accompanied with the longest-common-prefix array --- in
87 * other words, the "enhanced suffix array" --- to simulate a bottom-up
88 * traversal of the corresponding suffix tree, or equivalently the "lcp-interval
89 * tree", as described in Abouelhoda et al. (2004).
91 * While doing the traversal, create a table 'intervals' that contains data for
92 * each lcp-interval, specifically the lcp value of that interval, and the index
93 * of the superinterval.
95 * Also while doing the traversal, create a table 'pos_data' that contains a
96 * mapping from suffix index to the deepest lcp-interval containing it.
98 * The result is that we will later be able to do match-finding at a given
99 * position by looking up that position in 'pos_data' to get the deepest
100 * lcp-interval containing the corresponding suffix, then proceeding to the
101 * superintervals. See lz_lcpit_get_matches() for more details.
103 * Note: We limit the depth of the lcp-interval tree by capping the lcp at
104 * LZ_LCPIT_LCP_MAX. This can cause a sub-tree of intervals with lcp greater
105 * than LZ_LCPIT_LCP_MAX to be collapsed into a single interval with lcp
106 * LZ_LCPIT_LCP_MAX. This avoids degenerate cases and does not hurt
107 * match-finding very much, since if we find a match of length LZ_LCPIT_LCP_MAX
108 * and extend it as far as possible, that's usually good enough because that
109 * region of the input must already be highly compressible.
113 * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees
114 * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2
115 * Issue 1, March 2004, pp. 53-86.
117 * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization
118 * Using Less Time & Space. Mathematics in Computer Science June 2008,
119 * Volume 1, Issue 4, pp. 605-623.
121 * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
122 * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th
123 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
126 build_LCPIT(const u32 SA[restrict], u32 LCP[restrict],
127 u32 pos_data[restrict], const u32 lcp_limit, const u32 n)
129 u32 *intervals = LCP;
130 u32 next_interval_idx = 0;
131 u32 open_intervals[LZ_LCPIT_LCP_MAX + 1];
132 u32 *top = open_intervals;
133 u32 prev_pos = SA[0];
135 /* The interval with lcp=0 covers the entire array. It remains open
137 *top = next_interval_idx;
138 intervals[next_interval_idx] = 0;
141 for (u32 r = 1; r < n; r++) {
142 u32 next_pos = SA[r];
143 u32 next_lcp = min(LCP[r], lcp_limit);
144 u32 top_lcp = intervals[*top];
146 if (next_lcp == top_lcp) {
147 /* continuing the deepest open interval */
148 pos_data[prev_pos] = *top;
149 } else if (next_lcp > top_lcp) {
150 /* opening a new interval */
151 intervals[next_interval_idx] = next_lcp;
152 *++top = next_interval_idx;
153 pos_data[prev_pos] = next_interval_idx;
156 /* closing the deepest open interval */
157 pos_data[prev_pos] = *top;
159 u32 closed_interval_idx = *top;
160 u32 superinterval_idx = *--top;
161 u32 superinterval_lcp = intervals[superinterval_idx];
163 if (next_lcp == superinterval_lcp) {
164 /* continuing the superinterval */
165 intervals[closed_interval_idx] |=
166 (superinterval_idx << LZ_LCPIT_LCP_BITS) |
169 } else if (next_lcp > superinterval_lcp) {
170 /* creating a new interval that is a
171 * superinterval of the one being
172 * closed, but still a subinterval of
173 * its superinterval */
174 intervals[next_interval_idx] = next_lcp;
175 *++top = next_interval_idx;
176 intervals[closed_interval_idx] |=
177 (next_interval_idx << LZ_LCPIT_LCP_BITS) |
182 /* also closing the superinterval */
183 intervals[closed_interval_idx] |=
184 (superinterval_idx << LZ_LCPIT_LCP_BITS) |
192 /* close any still-open intervals */
193 pos_data[prev_pos] = *top;
194 while (top > open_intervals) {
195 u32 closed_interval_idx = *top;
196 u32 superinterval_idx = *--top;
197 intervals[closed_interval_idx] |=
198 (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000;
203 lz_lcpit_set_default_params(struct lz_mf_params *params)
205 if (params->min_match_len == 0)
206 params->min_match_len = 2;
208 if (params->max_match_len == 0)
209 params->max_match_len = UINT32_MAX;
211 if (params->max_search_depth == 0)
212 params->max_search_depth = 32;
214 params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 8);
216 if (params->nice_match_len == 0)
217 params->nice_match_len = LZ_LCPIT_LCP_MAX;
219 if (params->nice_match_len < params->min_match_len)
220 params->nice_match_len = params->min_match_len;
222 if (params->nice_match_len > params->max_match_len)
223 params->nice_match_len = params->max_match_len;
225 if (params->nice_match_len > LZ_LCPIT_LCP_MAX)
226 params->nice_match_len = LZ_LCPIT_LCP_MAX;
230 lz_lcpit_params_valid(const struct lz_mf_params *params)
232 return params->max_window_size <= LZ_LCPIT_MAX_WINDOW_SIZE;
236 lz_lcpit_get_needed_memory(u32 max_window_size)
238 return sizeof(u32) * (max_window_size +
239 max(BUILD_SA_MIN_TMP_LEN,
240 2 * (u64)max_window_size));
244 lz_lcpit_init(struct lz_mf *_mf)
246 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
248 lz_lcpit_set_default_params(&mf->base.params);
250 mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
258 lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
260 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
263 build_SA(&mem[0 * n], T, n, &mem[1 * n]);
264 build_ISA(&mem[2 * n], &mem[0 * n], n);
265 build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n);
266 build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n],
267 mf->base.params.nice_match_len, n);
268 mf->SA = &mem[0 * n];
269 mf->intervals = &mem[1 * n];
270 mf->pos_data = &mem[2 * n];
274 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
276 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
277 const u32 min_match_len = mf->base.params.min_match_len;
278 const u32 cur_pos = mf->base.cur_window_pos;
279 u32 * const pos_data = mf->pos_data;
280 u32 * const intervals = mf->intervals;
283 u32 interval, next_interval;
284 u32 cur_match, next_match;
286 /* Look up the deepest lcp-interval containing the current suffix. */
287 interval = pos_data[cur_pos];
289 /* Since the current position is greater than any position previously
290 * searched, set the "lcp interval of the next match" for this suffix to
291 * 0. This is the index of the root interval, and this indicates that
292 * there is no next match. */
293 pos_data[cur_pos] = 0;
295 /* Ascend the lcp-interval tree until we reach an lcp-interval that has
296 * already been visited. */
298 while (intervals[interval] & 0x80000000) {
300 /* Visiting this lcp-interval for the first time. Therefore,
301 * there are no Lempel-Ziv matches with length equal to the lcp
302 * of this lcp-interval. */
304 /* Extract the LCP and superinterval reference. */
306 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
308 next_interval = (intervals[interval] & ~0x80000000)
309 >> LZ_LCPIT_LCP_BITS;
311 /* If the LCP is shorter than the minimum length of matches to
312 * be produced, we're done, since the LCP will only ever get
313 * shorter from here. This also prevents ascending above the
314 * root of the lcp-interval tree, since the root is guaranteed
315 * to be a 0-interval, and min_match_len is guaranteed to be at
317 if (lcp < min_match_len)
320 /* Set the position of the most-recently-seen suffix within this
321 * lcp-interval. Since this is the first visitation of this
322 * lcp-interval, this is simply the current suffix.
324 * Note that this overwrites the superinterval reference which
325 * was previously included in this lcp-interval data slot.
326 * Further visitations of this lcp-interval will detect that it
327 * is already visited and will follow the chain of
328 * most-recently-seen suffixes rather than ascend the tree
330 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
332 /* Ascend to the superinterval of this lcp-interval. */
333 interval = next_interval;
336 /* We've already visited the current lcp-interval. */
338 /* Extract the LCP of this lcp-interval. */
339 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
341 /* Extract the current match for this lcp-interval. This usually is the
342 * most-recently-seen suffix within this lcp-interval, but it may be
344 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
347 /* If the LCP is shorter than the minimum length of matches to
348 * be produced, we're done, since the LCP will only ever get
349 * shorter from here. This also prevents ascending above the
350 * root of the lcp-interval tree, since the root is guaranteed
351 * to be a 0-interval, and min_match_len is guaranteed to be at
353 if (lcp < min_match_len)
356 /* Advance the current match until the lcp of the *next* match
357 * is lower than the current lcp. When this is true we know
358 * that the current match is up to date (lowest offset /
359 * greatest position for that lcp). */
361 next_match = cur_match;
363 next_interval = pos_data[next_match];
364 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
365 cur_match = next_match;
366 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
367 } while (next_lcp >= lcp);
369 /* Link the current position into the match chain, discarding
370 * any skipped matches. */
371 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
372 pos_data[cur_match] = interval;
374 /* Record the match. */
375 matches[num_matches++] = (struct lz_match) {
377 .offset = cur_pos - cur_match,
380 /* Bound the number of matches per position. */
381 if (num_matches >= mf->base.params.max_search_depth)
384 /* Advance to the next match. */
385 interval = next_interval;
387 cur_match = next_match;
390 /* If the length of the longest match is equal to the lcp limit, it may
391 * have been truncated. Try extending it up to the maximum match
393 if (num_matches && matches[0].len == mf->base.params.nice_match_len) {
394 const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
395 const u8 * const matchptr = strptr - matches[0].offset;
396 const u32 len_limit = min(lz_mf_get_bytes_remaining(&mf->base),
397 mf->base.params.max_match_len);
400 len = matches[0].len;
401 while (len < len_limit && strptr[len] == matchptr[len])
403 matches[0].len = len;
406 for (u32 i = 0; i < num_matches / 2; i++)
407 swap(matches[i], matches[num_matches - 1 - i]);
409 mf->base.cur_window_pos++;
413 /* Slightly simplified version of lz_lcpit_get_matches() for updating the data
414 * structures when we don't actually need matches at the current position. See
415 * lz_lcpit_get_matches() for explanatory comments. */
417 lz_lcpit_skip_position(struct lz_lcpit *mf)
419 const u32 min_match_len = mf->base.params.min_match_len;
420 const u32 cur_pos = mf->base.cur_window_pos++;
421 u32 * const pos_data = mf->pos_data;
422 u32 * const intervals = mf->intervals;
424 u32 interval, next_interval;
425 u32 cur_match, next_match;
427 interval = pos_data[cur_pos];
428 pos_data[cur_pos] = 0;
429 while (intervals[interval] & 0x80000000) {
430 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
431 next_interval = (intervals[interval] & ~0x80000000)
432 >> LZ_LCPIT_LCP_BITS;
433 if (lcp < min_match_len)
435 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
436 interval = next_interval;
438 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
439 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
440 while (lcp >= min_match_len) {
441 next_match = cur_match;
443 next_interval = pos_data[next_match];
444 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
445 cur_match = next_match;
446 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
447 } while (next_lcp >= lcp);
448 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
449 pos_data[cur_match] = interval;
450 interval = next_interval;
452 cur_match = next_match;
457 lz_lcpit_skip_positions(struct lz_mf *_mf, u32 n)
459 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
462 lz_lcpit_skip_position(mf);
467 lz_lcpit_destroy(struct lz_mf *_mf)
469 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
474 const struct lz_mf_ops lz_lcp_interval_tree_ops = {
475 .params_valid = lz_lcpit_params_valid,
476 .get_needed_memory = lz_lcpit_get_needed_memory,
477 .init = lz_lcpit_init,
478 .load_window = lz_lcpit_load_window,
479 .get_matches = lz_lcpit_get_matches,
480 .skip_positions = lz_lcpit_skip_positions,
481 .destroy = lz_lcpit_destroy,
482 .struct_size = sizeof(struct lz_lcpit),