2 * lz_lcp_interval_tree.c
4 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
5 * traversal of the Longest Common Prefix (LCP) interval tree.
7 * Copyright (c) 2014 Eric Biggers. All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
27 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 #include "wimlib/lz_mf.h"
38 #include "wimlib/lz_suffix_array_utils.h"
39 #include "wimlib/util.h"
42 * To save space, we pack lcp (longest common prefix) and position values into
43 * 32-bit integers. Therefore, we must divide the 32 bits into lcp and position
44 * bits. 6 lcp bits seems to be a good value, since matches of length 64 are
45 * sufficiently long so that the compression ratio isn't hurt much by choosing
46 * one such match over another. We also use 1 bit to mark intervals as "not yet
47 * visited". This leaves 25 bits, which when used for position results in a
48 * maximum window size of 33554432 bytes.
50 #define LZ_LCPIT_LCP_BITS 6
51 #define LZ_LCPIT_LCP_MASK ((1 << LZ_LCPIT_LCP_BITS) - 1)
52 #define LZ_LCPIT_LCP_MAX LZ_LCPIT_LCP_MASK
53 #define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS)
54 #define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS)
56 #define SA_and_LCP_LCP_SHIFT (32 - LZ_LCPIT_LCP_BITS)
57 #define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
64 /* Mapping: lcp-interval index => lcp-interval data
66 * Initially, the lcp-interval data for an lcp-interval contains that
67 * interval's lcp and superinterval index.
69 * After a lcp-interval is visited during match-finding, its
70 * lcp-interval data contains that interval's lcp and the position of
71 * the next suffix to consider as a match when matching against that
75 /* Mapping: suffix index ("window position") => lcp-interval index */
80 * Build the LCP (Longest Common Prefix) array in linear time.
82 * LCP[r] will be the length of the longest common prefix between the suffixes
83 * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
85 * Algorithm taken from Kasai et al. (2001), but modified slightly:
87 * - For decreased memory usage and improved memory locality, pack the two
88 * logically distinct SA and LCP arrays into a single array SA_and_LCP.
90 * - With bytes there is no realistic way to reserve a unique symbol for
91 * end-of-buffer, so use explicit checks for end-of-buffer.
93 * - If a LCP value is less than the minimum match length, then store 0. This
94 * avoids having to do comparisons against the minimum match length later.
96 * - If a LCP value is greater than the "nice match length", then store the
97 * "nice match length". This caps the number of bits needed to store each
98 * LCP value, and this caps the depth of the LCP-interval tree, without
99 * usually hurting the compression ratio too much.
103 * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in
104 * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th
105 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
108 build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA,
109 const u8 * const restrict T, const u32 n,
110 const u32 min_lcp, const u32 max_lcp)
112 u32 h, i, r, j, lim, stored_lcp;
115 for (i = 0; i < n; i++) {
118 j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK;
119 lim = min(n - i, n - j);
120 while (h < lim && T[i + h] == T[j + h])
123 if (stored_lcp < min_lcp)
125 else if (stored_lcp > max_lcp)
126 stored_lcp = max_lcp;
127 SA_and_LCP[r] |= stored_lcp << SA_and_LCP_LCP_SHIFT;
135 * Use the suffix array accompanied with the longest-common-prefix array --- in
136 * other words, the "enhanced suffix array" --- to simulate a bottom-up
137 * traversal of the corresponding suffix tree, or equivalently the "lcp-interval
138 * tree", as described in Abouelhoda et al. (2004).
140 * While doing the traversal, create a table 'intervals' that contains data for
141 * each lcp-interval, specifically the lcp value of that interval, and the index
142 * of the superinterval.
144 * Also while doing the traversal, create a table 'pos_data' that contains a
145 * mapping from suffix index to the deepest lcp-interval containing it.
147 * The result is that we will later be able to do match-finding at a given
148 * position by looking up that position in 'pos_data' to get the deepest
149 * lcp-interval containing the corresponding suffix, then proceeding to the
150 * superintervals. See lz_lcpit_get_matches() for more details.
152 * Note: We limit the depth of the lcp-interval tree by capping the lcp at
153 * LZ_LCPIT_LCP_MAX. This can cause a sub-tree of intervals with lcp greater
154 * than LZ_LCPIT_LCP_MAX to be collapsed into a single interval with lcp
155 * LZ_LCPIT_LCP_MAX. This avoids degenerate cases and does not hurt
156 * match-finding very much, since if we find a match of length LZ_LCPIT_LCP_MAX
157 * and extend it as far as possible, that's usually good enough because that
158 * region of the input must already be highly compressible.
162 * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees
163 * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2
164 * Issue 1, March 2004, pp. 53-86.
166 * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization
167 * Using Less Time & Space. Mathematics in Computer Science June 2008,
168 * Volume 1, Issue 4, pp. 605-623.
170 * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
171 * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th
172 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
175 build_LCPIT(const u32 * const restrict SA_and_LCP,
176 u32 * const restrict intervals, u32 * const restrict pos_data,
179 u32 next_interval_idx = 0;
180 u32 open_intervals[LZ_LCPIT_LCP_MAX + 1];
181 u32 *top = open_intervals;
182 u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK;
184 /* The interval with lcp=0 covers the entire array. It remains open
186 *top = next_interval_idx;
187 intervals[next_interval_idx] = 0;
190 for (u32 r = 1; r < n; r++) {
191 u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK;
192 u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT;
193 u32 top_lcp = intervals[*top];
195 if (next_lcp == top_lcp) {
196 /* continuing the deepest open interval */
197 pos_data[prev_pos] = *top;
198 } else if (next_lcp > top_lcp) {
199 /* opening a new interval */
200 intervals[next_interval_idx] = next_lcp;
201 *++top = next_interval_idx;
202 pos_data[prev_pos] = next_interval_idx;
205 /* closing the deepest open interval */
206 pos_data[prev_pos] = *top;
208 u32 closed_interval_idx = *top;
209 u32 superinterval_idx = *--top;
210 u32 superinterval_lcp = intervals[superinterval_idx];
212 if (next_lcp == superinterval_lcp) {
213 /* continuing the superinterval */
214 intervals[closed_interval_idx] |=
215 (superinterval_idx << LZ_LCPIT_LCP_BITS) |
218 } else if (next_lcp > superinterval_lcp) {
219 /* creating a new interval that is a
220 * superinterval of the one being
221 * closed, but still a subinterval of
222 * its superinterval */
223 intervals[next_interval_idx] = next_lcp;
224 *++top = next_interval_idx;
225 intervals[closed_interval_idx] |=
226 (next_interval_idx << LZ_LCPIT_LCP_BITS) |
231 /* also closing the superinterval */
232 intervals[closed_interval_idx] |=
233 (superinterval_idx << LZ_LCPIT_LCP_BITS) |
241 /* close any still-open intervals */
242 pos_data[prev_pos] = *top;
243 while (top > open_intervals) {
244 u32 closed_interval_idx = *top;
245 u32 superinterval_idx = *--top;
246 intervals[closed_interval_idx] |=
247 (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000;
252 lz_lcpit_set_default_params(struct lz_mf_params *params)
254 if (params->min_match_len == 0)
255 params->min_match_len = 2;
257 if (params->max_match_len == 0)
258 params->max_match_len = UINT32_MAX;
260 if (params->max_search_depth == 0)
261 params->max_search_depth = 32;
263 params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 8);
265 if (params->nice_match_len == 0)
266 params->nice_match_len = LZ_LCPIT_LCP_MAX;
268 if (params->nice_match_len < params->min_match_len)
269 params->nice_match_len = params->min_match_len;
271 if (params->nice_match_len > params->max_match_len)
272 params->nice_match_len = params->max_match_len;
274 if (params->nice_match_len > LZ_LCPIT_LCP_MAX)
275 params->nice_match_len = LZ_LCPIT_LCP_MAX;
279 lz_lcpit_params_valid(const struct lz_mf_params *params)
281 return params->max_window_size <= LZ_LCPIT_MAX_WINDOW_SIZE;
285 lz_lcpit_get_needed_memory(u32 max_window_size)
287 return sizeof(u32) * (max_window_size +
288 max(BUILD_SA_MIN_TMP_LEN,
289 2 * (u64)max_window_size));
293 lz_lcpit_init(struct lz_mf *_mf)
295 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
297 lz_lcpit_set_default_params(&mf->base.params);
299 mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
300 return (mf->mem != NULL);
304 lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
306 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
308 build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]);
309 build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n);
310 build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n,
311 mf->base.params.min_match_len,
312 mf->base.params.nice_match_len);
313 build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n);
314 mf->intervals = &mf->mem[1 * n];
315 mf->pos_data = &mf->mem[2 * n];
319 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
321 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
322 const u32 cur_pos = mf->base.cur_window_pos;
323 u32 * const pos_data = mf->pos_data;
324 u32 * const intervals = mf->intervals;
327 u32 interval, next_interval;
328 u32 cur_match, next_match;
330 /* Look up the deepest lcp-interval containing the current suffix. */
331 interval = pos_data[cur_pos];
333 /* Since the current position is greater than any position previously
334 * searched, set the "lcp interval of the next match" for this suffix to
335 * 0. This is the index of the root interval, and this indicates that
336 * there is no next match. */
337 pos_data[cur_pos] = 0;
339 /* Ascend the lcp-interval tree until we reach an lcp-interval that has
340 * already been visited. */
342 while (intervals[interval] & 0x80000000) {
344 /* Visiting this lcp-interval for the first time. Therefore,
345 * there are no Lempel-Ziv matches with length equal to the lcp
346 * of this lcp-interval. */
348 /* Extract the LCP and superinterval reference. */
350 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
352 next_interval = (intervals[interval] & ~0x80000000)
353 >> LZ_LCPIT_LCP_BITS;
355 /* If the LCP is shorter than the minimum length of matches to
356 * be produced, we're done, since the LCP will only ever get
357 * shorter from here. This also prevents ascending above the
358 * root of the lcp-interval tree, since the root is guaranteed
359 * to be a 0-interval. */
363 /* Set the position of the most-recently-seen suffix within this
364 * lcp-interval. Since this is the first visitation of this
365 * lcp-interval, this is simply the current suffix.
367 * Note that this overwrites the superinterval reference which
368 * was previously included in this lcp-interval data slot.
369 * Further visitations of this lcp-interval will detect that it
370 * is already visited and will follow the chain of
371 * most-recently-seen suffixes rather than ascend the tree
373 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
375 /* Ascend to the superinterval of this lcp-interval. */
376 interval = next_interval;
379 /* We've already visited the current lcp-interval. */
381 /* Extract the LCP of this lcp-interval. */
382 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
384 /* Extract the current match for this lcp-interval. This usually is the
385 * most-recently-seen suffix within this lcp-interval, but it may be
387 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
390 /* If the LCP is shorter than the minimum length of matches to
391 * be produced, we're done, since the LCP will only ever get
392 * shorter from here. This also prevents ascending above the
393 * root of the lcp-interval tree, since the root is guaranteed
394 * to be a 0-interval. */
398 /* Advance the current match until the lcp of the *next* match
399 * is lower than the current lcp. When this is true we know
400 * that the current match is up to date (lowest offset /
401 * greatest position for that lcp). */
403 next_match = cur_match;
405 next_interval = pos_data[next_match];
406 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
407 cur_match = next_match;
408 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
409 } while (next_lcp >= lcp);
411 /* Link the current position into the match chain, discarding
412 * any skipped matches. */
413 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
414 pos_data[cur_match] = interval;
416 /* Record the match. */
417 matches[num_matches++] = (struct lz_match) {
419 .offset = cur_pos - cur_match,
422 /* Bound the number of matches per position. */
423 if (num_matches >= mf->base.params.max_search_depth)
426 /* Advance to the next match. */
427 interval = next_interval;
429 cur_match = next_match;
432 /* If the length of the longest match is equal to the lcp limit, it may
433 * have been truncated. Try extending it up to the maximum match
435 if (num_matches && matches[0].len == mf->base.params.nice_match_len) {
436 const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
437 const u8 * const matchptr = strptr - matches[0].offset;
438 const u32 len_limit = min(lz_mf_get_bytes_remaining(&mf->base),
439 mf->base.params.max_match_len);
442 len = matches[0].len;
443 while (len < len_limit && strptr[len] == matchptr[len])
445 matches[0].len = len;
448 for (u32 i = 0; i < num_matches / 2; i++)
449 swap(matches[i], matches[num_matches - 1 - i]);
451 mf->base.cur_window_pos++;
455 /* Slightly simplified version of lz_lcpit_get_matches() for updating the data
456 * structures when we don't actually need matches at the current position. See
457 * lz_lcpit_get_matches() for explanatory comments. */
459 lz_lcpit_skip_position(struct lz_lcpit *mf)
461 const u32 cur_pos = mf->base.cur_window_pos++;
462 u32 * const pos_data = mf->pos_data;
463 u32 * const intervals = mf->intervals;
465 u32 interval, next_interval;
466 u32 cur_match, next_match;
468 interval = pos_data[cur_pos];
469 pos_data[cur_pos] = 0;
470 while (intervals[interval] & 0x80000000) {
471 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
472 next_interval = (intervals[interval] & ~0x80000000)
473 >> LZ_LCPIT_LCP_BITS;
476 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
477 interval = next_interval;
479 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
480 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
482 next_match = cur_match;
484 next_interval = pos_data[next_match];
485 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
486 cur_match = next_match;
487 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
488 } while (next_lcp >= lcp);
489 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
490 pos_data[cur_match] = interval;
491 interval = next_interval;
493 cur_match = next_match;
498 lz_lcpit_skip_positions(struct lz_mf *_mf, u32 n)
500 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
503 lz_lcpit_skip_position(mf);
508 lz_lcpit_destroy(struct lz_mf *_mf)
510 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
515 const struct lz_mf_ops lz_lcp_interval_tree_ops = {
516 .params_valid = lz_lcpit_params_valid,
517 .get_needed_memory = lz_lcpit_get_needed_memory,
518 .init = lz_lcpit_init,
519 .load_window = lz_lcpit_load_window,
520 .get_matches = lz_lcpit_get_matches,
521 .skip_positions = lz_lcpit_skip_positions,
522 .destroy = lz_lcpit_destroy,
523 .struct_size = sizeof(struct lz_lcpit),