2 * lz_lcp_interval_tree.c
4 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
5 * traversal of the Longest Common Prefix (LCP) interval tree.
7 * Copyright (c) 2014 Eric Biggers. All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
27 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 #include "wimlib/lz_mf.h"
38 #include "wimlib/lz_suffix_array_utils.h"
39 #include "wimlib/util.h"
42 * To save space, we pack lcp (longest common prefix) and position values into
43 * 32-bit integers. Therefore, we must divide the 32 bits into lcp and position
44 * bits. 6 lcp bits seems to be a good value, since matches of length 64 are
45 * sufficiently long so that the compression ratio isn't hurt much by choosing
46 * one such match over another. We also use 1 bit to mark intervals as "not yet
47 * visited". This leaves 25 bits, which when used for position results in a
48 * maximum window size of 33554432 bytes.
50 #define LZ_LCPIT_LCP_BITS 6
51 #define LZ_LCPIT_LCP_MASK ((1 << LZ_LCPIT_LCP_BITS) - 1)
52 #define LZ_LCPIT_LCP_MAX LZ_LCPIT_LCP_MASK
53 #define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS)
54 #define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS)
59 /* Each of the arrays has length equal to the window size. This
60 * therefore results in an additional memory usage of 12 bytes per
61 * position. (That's compared to about 8 for binary trees or 4 for hash
62 * chains, for example.)
64 * We allocate these arrays in one contiguous block. 'SA' is first,
65 * 'intervals' is second, and 'pos_data' is third. */
67 /* Pointer to the suffix array */
70 /* Mapping: lcp-interval index => lcp-interval data
72 * Initially, the lcp-interval data for an lcp-interval contains that
73 * interval's lcp and superinterval index.
75 * After a lcp-interval is visited during match-finding, its
76 * lcp-interval data contains that interval's lcp and the position of
77 * the next suffix to consider as a match when matching against that
81 /* Mapping: suffix index ("window position") => lcp-interval index */
86 * Use the suffix array accompanied with the longest-common-prefix array --- in
87 * other words, the "enhanced suffix array" --- to simulate a bottom-up
88 * traversal of the corresponding suffix tree, or equivalently the "lcp-interval
89 * tree", as described in Abouelhoda et al. (2004).
91 * While doing the traversal, create a table 'intervals' that contains data for
92 * each lcp-interval, specifically the lcp value of that interval, and the index
93 * of the superinterval.
95 * Also while doing the traversal, create a table 'pos_data' that contains a
96 * mapping from suffix index to the deepest lcp-interval containing it.
98 * The result is that we will later be able to do match-finding at a given
99 * position by looking up that position in 'pos_data' to get the deepest
100 * lcp-interval containing the corresponding suffix, then proceeding to the
101 * superintervals. See lz_lcpit_get_matches() for more details.
103 * Note: We limit the depth of the lcp-interval tree by capping the lcp at
104 * LZ_LCPIT_LCP_MAX. This can cause a sub-tree of intervals with lcp greater
105 * than LZ_LCPIT_LCP_MAX to be collapsed into a single interval with lcp
106 * LZ_LCPIT_LCP_MAX. This avoids degenerate cases and does not hurt
107 * match-finding very much, since if we find a match of length LZ_LCPIT_LCP_MAX
108 * and extend it as far as possible, that's usually good enough because that
109 * region of the input must already be highly compressible.
113 * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees
114 * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2
115 * Issue 1, March 2004, pp. 53-86.
117 * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization
118 * Using Less Time & Space. Mathematics in Computer Science June 2008,
119 * Volume 1, Issue 4, pp. 605-623.
121 * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
122 * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th
123 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
126 build_LCPIT(const u32 SA[restrict], u32 LCP[restrict],
127 u32 pos_data[restrict], const u32 lcp_limit, const u32 n)
129 u32 *intervals = LCP;
131 u32 incomplete_intervals[lcp_limit + 1];
135 /* As we determine lcp-intervals, we assign each one an entry in
136 * 'intervals', overwriting LCP in the process. Each such entry will
137 * contain the index in 'intervals' of the superinterval, along with the
138 * longest common prefix length that the suffixes in that interval
141 * Note: since we don't need its memory for anything, we don't overwrite
142 * the suffix array, even though this could potentially be done since
143 * it's not actually used during match-finding. */
145 /* Process rank 0 as special case. This creates the lcp-interval
146 * containing every suffix in the window. */
149 pos_data[prev_pos] = 0;
150 cur_interval = incomplete_intervals;
154 /* Iterate through each suffix array rank. */
155 for (u32 r = 1; r < n; r++) {
157 /* Get the longest common prefix (lcp) between the suffixes with
158 * ranks r and r - 1. But cap it to the lcp limit. */
159 const u32 lcp = min(LCP[r], lcp_limit);
161 /* Convert rank => position using the suffix array. */
162 const u32 pos = SA[r];
164 /* cur_interval points to the index of the deepest (highest lcp
165 * value) incomplete lcp-interval. */
168 * There are three cases:
170 * (1) The lcp stayed the same as the previous value. Place the
171 * current suffix in cur_interval. (This placement is
172 * tentative, because if LCP increases at the next rank, this
173 * suffix could still be placed in the resulting new LCP
174 * interval instead.) cur_interval remains unchanged.
176 * (2) The lcp increased from the previous value. This signals
177 * the beginning of a new lcp-interval. Create it and push it
178 * onto the stack of incomplete intervals. But since lcp is
179 * defined in terms of the longest prefix between this suffix
180 * and the *previous* ranked suffix, the new lcp-interval
181 * actually should have begun at the *previous* ranked suffix.
182 * Therefore, we need to set both pos_data[pos] and
183 * pos_data[prev_pos] to refer to the new interval.
185 * (3) The lcp decreased from the previous value. This signals
186 * the termination of at least one lcp-interval. Pop the stack,
187 * finalizing the lcp-intervals, until the current lcp is at
188 * least as large as the lcp associated with cur_interval.
189 * Then, if the current lcp is equal to the lcp associated with
190 * cur_interval, place the current suffix in cur_interval,
191 * similar to case (1). Else, create a new lcp-interval,
192 * similar to case (2).
195 if (lcp == (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
199 pos_data[pos] = *cur_interval;
201 } else if (lcp > (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
205 intervals[next_interval] = lcp | 0x80000000;
206 pos_data[prev_pos] = next_interval;
207 pos_data[pos] = next_interval;
208 *++cur_interval = next_interval++;
218 /* Examine the deepest incomplete lcp-interval
219 * and its superinterval. */
221 interval = *cur_interval;
222 superinterval = *--cur_interval;
224 if (lcp >= (intervals[superinterval] &
228 /* The current suffix can't go in either of
229 * them. Therefore we're visiting 'interval'
230 * for the last time and finalizing its
231 * membership in 'superinterval'. */
233 intervals[interval] |=
234 (superinterval << LZ_LCPIT_LCP_BITS);
237 /* The current suffix can't go in 'interval', but it can
238 * go in 'superinterval'. */
240 if (lcp > (intervals[superinterval] & LZ_LCPIT_LCP_MASK)) {
241 /* Creating a new lcp-interval that is a
242 * superinterval of 'interval' but a subinterval
243 * of 'superinterval'.
245 * Example: with the LCP arrayl
249 * then we will execute this case when
250 * processing the LCP value 3. The LCP
259 * Note that the 3-interval (the one being
260 * created by this code) is a superinterval of
261 * the 4-interval (which already existed)! But
262 * we don't need to re-assign pos_data values in
263 * the 4-interval because they point to the
264 * deepest interval which contains them, which
265 * is the 4-interval. */
267 intervals[next_interval] = lcp | 0x80000000;
268 intervals[interval] |=
269 (next_interval << LZ_LCPIT_LCP_BITS);
270 pos_data[pos] = next_interval;
271 *++cur_interval = next_interval++;
273 /* Finishing 'interval', continuing with
274 * 'superinterval'. */
276 intervals[interval] |=
277 (superinterval << LZ_LCPIT_LCP_BITS);
278 pos_data[pos] = superinterval;
282 /* Remember this suffix index when processing the next-ranked
287 /* Finalize any still-incomplete lcp-intervals. */
288 while (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK) {
289 intervals[*cur_interval] |=
290 (*(cur_interval - 1) << LZ_LCPIT_LCP_BITS);
296 lz_lcpit_set_default_params(struct lz_mf_params *params)
298 if (params->min_match_len == 0)
299 params->min_match_len = 2;
301 if (params->max_match_len == 0)
302 params->max_match_len = params->max_window_size;
304 if (params->max_search_depth == 0)
305 params->max_search_depth = 32;
307 params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 8);
309 if (params->nice_match_len == 0)
310 params->nice_match_len = LZ_LCPIT_LCP_MAX;
312 if (params->nice_match_len < params->min_match_len)
313 params->nice_match_len = params->min_match_len;
315 if (params->nice_match_len > params->max_match_len)
316 params->nice_match_len = params->max_match_len;
318 if (params->nice_match_len > LZ_LCPIT_LCP_MAX)
319 params->nice_match_len = LZ_LCPIT_LCP_MAX;
323 lz_lcpit_params_valid(const struct lz_mf_params *params)
325 return params->max_window_size <= LZ_LCPIT_MAX_WINDOW_SIZE;
329 lz_lcpit_get_needed_memory(u32 max_window_size)
331 return sizeof(u32) * (max_window_size +
332 max(BUILD_SA_MIN_TMP_LEN,
333 2 * (u64)max_window_size));
337 lz_lcpit_init(struct lz_mf *_mf)
339 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
341 lz_lcpit_set_default_params(&mf->base.params);
343 mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
351 lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
353 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
356 build_SA(&mem[0 * n], T, n, &mem[1 * n]);
357 build_ISA(&mem[2 * n], &mem[0 * n], n);
358 build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n);
359 build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n],
360 mf->base.params.nice_match_len, n);
361 mf->SA = &mem[0 * n];
362 mf->intervals = &mem[1 * n];
363 mf->pos_data = &mem[2 * n];
367 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
369 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
370 const u32 min_match_len = mf->base.params.min_match_len;
371 const u32 cur_pos = mf->base.cur_window_pos;
372 u32 * const pos_data = mf->pos_data;
373 u32 * const intervals = mf->intervals;
376 u32 interval, next_interval;
377 u32 cur_match, next_match;
379 /* Look up the deepest lcp-interval containing the current suffix. */
380 interval = pos_data[cur_pos];
382 /* Since the current position is greater than any position previously
383 * searched, set the "lcp interval of the next match" for this suffix to
384 * 0. This is the index of the root interval, and this indicates that
385 * there is no next match. */
386 pos_data[cur_pos] = 0;
388 /* Ascend the lcp-interval tree until we reach an lcp-interval that has
389 * already been visited. */
391 while (intervals[interval] & 0x80000000) {
393 /* Visiting this lcp-interval for the first time. Therefore,
394 * there are no Lempel-Ziv matches with length equal to the lcp
395 * of this lcp-interval. */
397 /* Extract the LCP and superinterval reference. */
399 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
401 next_interval = (intervals[interval] & ~0x80000000)
402 >> LZ_LCPIT_LCP_BITS;
404 /* If the LCP is shorter than the minimum length of matches to
405 * be produced, we're done, since the LCP will only ever get
406 * shorter from here. This also prevents ascending above the
407 * root of the lcp-interval tree, since the root is guaranteed
408 * to be a 0-interval, and min_match_len is guaranteed to be at
410 if (lcp < min_match_len)
413 /* Set the position of the most-recently-seen suffix within this
414 * lcp-interval. Since this is the first visitation of this
415 * lcp-interval, this is simply the current suffix.
417 * Note that this overwrites the superinterval reference which
418 * was previously included in this lcp-interval data slot.
419 * Further visitations of this lcp-interval will detect that it
420 * is already visited and will follow the chain of
421 * most-recently-seen suffixes rather than ascend the tree
423 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
425 /* Ascend to the superinterval of this lcp-interval. */
426 interval = next_interval;
429 /* We've already visited the current lcp-interval. */
431 /* Extract the LCP of this lcp-interval. */
432 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
434 /* Extract the current match for this lcp-interval. This usually is the
435 * most-recently-seen suffix within this lcp-interval, but it may be
437 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
440 /* If the LCP is shorter than the minimum length of matches to
441 * be produced, we're done, since the LCP will only ever get
442 * shorter from here. This also prevents ascending above the
443 * root of the lcp-interval tree, since the root is guaranteed
444 * to be a 0-interval, and min_match_len is guaranteed to be at
446 if (lcp < min_match_len)
449 /* Advance the current match until the lcp of the *next* match
450 * is lower than the current lcp. When this is true we know
451 * that the current match is up to date (lowest offset /
452 * greatest position for that lcp). */
454 next_match = cur_match;
456 next_interval = pos_data[next_match];
457 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
458 cur_match = next_match;
459 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
460 } while (next_lcp >= lcp);
462 /* Link the current position into the match chain, discarding
463 * any skipped matches. */
464 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
465 pos_data[cur_match] = interval;
467 /* Record the match. */
468 matches[num_matches++] = (struct lz_match) {
470 .offset = cur_pos - cur_match,
473 /* Bound the number of matches per position. */
474 if (num_matches >= mf->base.params.max_search_depth)
477 /* Advance to the next match. */
478 interval = next_interval;
480 cur_match = next_match;
483 /* If the length of the longest match is equal to the lcp limit, it may
484 * have been truncated. Try extending it up to the maximum match
486 if (num_matches && matches[0].len == mf->base.params.nice_match_len) {
487 const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
488 const u8 * const matchptr = strptr - matches[0].offset;
489 const u32 len_limit = min(lz_mf_get_bytes_remaining(&mf->base),
490 mf->base.params.max_match_len);
493 len = matches[0].len;
494 while (len < len_limit && strptr[len] == matchptr[len])
496 matches[0].len = len;
499 for (u32 i = 0; i < num_matches / 2; i++)
500 swap(matches[i], matches[num_matches - 1 - i]);
502 mf->base.cur_window_pos++;
506 /* Slightly simplified version of lz_lcpit_get_matches() for updating the data
507 * structures when we don't actually need matches at the current position. See
508 * lz_lcpit_get_matches() for explanatory comments. */
510 lz_lcpit_skip_position(struct lz_lcpit *mf)
512 const u32 min_match_len = mf->base.params.min_match_len;
513 const u32 cur_pos = mf->base.cur_window_pos++;
514 u32 * const pos_data = mf->pos_data;
515 u32 * const intervals = mf->intervals;
517 u32 interval, next_interval;
518 u32 cur_match, next_match;
520 interval = pos_data[cur_pos];
521 pos_data[cur_pos] = 0;
522 while (intervals[interval] & 0x80000000) {
523 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
524 next_interval = (intervals[interval] & ~0x80000000)
525 >> LZ_LCPIT_LCP_BITS;
526 if (lcp < min_match_len)
528 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
529 interval = next_interval;
531 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
532 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
533 while (lcp >= min_match_len) {
534 next_match = cur_match;
536 next_interval = pos_data[next_match];
537 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
538 cur_match = next_match;
539 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
540 } while (next_lcp >= lcp);
541 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
542 pos_data[cur_match] = interval;
543 interval = next_interval;
545 cur_match = next_match;
550 lz_lcpit_skip_positions(struct lz_mf *_mf, u32 n)
552 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
555 lz_lcpit_skip_position(mf);
560 lz_lcpit_destroy(struct lz_mf *_mf)
562 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
567 const struct lz_mf_ops lz_lcp_interval_tree_ops = {
568 .params_valid = lz_lcpit_params_valid,
569 .get_needed_memory = lz_lcpit_get_needed_memory,
570 .init = lz_lcpit_init,
571 .load_window = lz_lcpit_load_window,
572 .get_matches = lz_lcpit_get_matches,
573 .skip_positions = lz_lcpit_skip_positions,
574 .destroy = lz_lcpit_destroy,
575 .struct_size = sizeof(struct lz_lcpit),