2 * lz_lcp_interval_tree.c
4 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
5 * traversal of the Longest Common Prefix (LCP) interval tree.
7 * Copyright (c) 2014 Eric Biggers. All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
27 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 #include "wimlib/lz_mf.h"
38 #include "wimlib/lz_suffix_array_utils.h"
39 #include "wimlib/util.h"
42 * To save space, we pack lcp (longest common prefix) and position values into
43 * 32-bit integers. Therefore, we must divide the 32 bits into lcp and position
44 * bits. 6 lcp bits seems to be a good value, since matches of length 64 are
45 * sufficiently long so that the compression ratio isn't hurt much by choosing
46 * one such match over another. We also use 1 bit to mark intervals as "not yet
47 * visited". This leaves 25 bits, which when used for position results in a
48 * maximum window size of 33554432 bytes.
50 #define LZ_LCPIT_LCP_BITS 6
51 #define LZ_LCPIT_LCP_MASK ((1 << LZ_LCPIT_LCP_BITS) - 1)
52 #define LZ_LCPIT_LCP_MAX LZ_LCPIT_LCP_MASK
53 #define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS)
54 #define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS)
56 #define SA_and_LCP_LCP_SHIFT (32 - LZ_LCPIT_LCP_BITS)
57 #define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
64 /* Mapping: lcp-interval index => lcp-interval data
66 * Initially, the lcp-interval data for an lcp-interval contains that
67 * interval's lcp and superinterval index.
69 * After a lcp-interval is visited during match-finding, its
70 * lcp-interval data contains that interval's lcp and the position of
71 * the next suffix to consider as a match when matching against that
75 /* Mapping: suffix index ("window position") => lcp-interval index */
80 * Build the LCP (Longest Common Prefix) array in linear time.
82 * LCP[r] will be the length of the longest common prefix between the suffixes
83 * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
85 * Algorithm taken from Kasai et al. (2001), but modified slightly:
87 * - For decreased memory usage and improved memory locality, pack the two
88 * logically distinct SA and LCP arrays into a single array SA_and_LCP.
89 * - Cap the LCP values to the specified limit.
90 * - With bytes there is no realistic way to reserve a unique symbol for
91 * end-of-buffer, so use explicit checks for end-of-buffer.
95 * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in
96 * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th
97 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
100 build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA,
101 const u8 * const restrict T, const u32 n, const u32 lcp_limit)
106 for (i = 0; i < n; i++) {
109 j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK;
110 lim = min(n - i, n - j);
111 while (h < lim && T[i + h] == T[j + h])
113 SA_and_LCP[r] |= min(h, lcp_limit) << SA_and_LCP_LCP_SHIFT;
121 * Use the suffix array accompanied with the longest-common-prefix array --- in
122 * other words, the "enhanced suffix array" --- to simulate a bottom-up
123 * traversal of the corresponding suffix tree, or equivalently the "lcp-interval
124 * tree", as described in Abouelhoda et al. (2004).
126 * While doing the traversal, create a table 'intervals' that contains data for
127 * each lcp-interval, specifically the lcp value of that interval, and the index
128 * of the superinterval.
130 * Also while doing the traversal, create a table 'pos_data' that contains a
131 * mapping from suffix index to the deepest lcp-interval containing it.
133 * The result is that we will later be able to do match-finding at a given
134 * position by looking up that position in 'pos_data' to get the deepest
135 * lcp-interval containing the corresponding suffix, then proceeding to the
136 * superintervals. See lz_lcpit_get_matches() for more details.
138 * Note: We limit the depth of the lcp-interval tree by capping the lcp at
139 * LZ_LCPIT_LCP_MAX. This can cause a sub-tree of intervals with lcp greater
140 * than LZ_LCPIT_LCP_MAX to be collapsed into a single interval with lcp
141 * LZ_LCPIT_LCP_MAX. This avoids degenerate cases and does not hurt
142 * match-finding very much, since if we find a match of length LZ_LCPIT_LCP_MAX
143 * and extend it as far as possible, that's usually good enough because that
144 * region of the input must already be highly compressible.
148 * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees
149 * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2
150 * Issue 1, March 2004, pp. 53-86.
152 * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization
153 * Using Less Time & Space. Mathematics in Computer Science June 2008,
154 * Volume 1, Issue 4, pp. 605-623.
156 * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
157 * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th
158 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
161 build_LCPIT(const u32 * const restrict SA_and_LCP,
162 u32 * const restrict intervals, u32 * const restrict pos_data,
165 u32 next_interval_idx = 0;
166 u32 open_intervals[LZ_LCPIT_LCP_MAX + 1];
167 u32 *top = open_intervals;
168 u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK;
170 /* The interval with lcp=0 covers the entire array. It remains open
172 *top = next_interval_idx;
173 intervals[next_interval_idx] = 0;
176 for (u32 r = 1; r < n; r++) {
177 u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK;
178 u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT;
179 u32 top_lcp = intervals[*top];
181 if (next_lcp == top_lcp) {
182 /* continuing the deepest open interval */
183 pos_data[prev_pos] = *top;
184 } else if (next_lcp > top_lcp) {
185 /* opening a new interval */
186 intervals[next_interval_idx] = next_lcp;
187 *++top = next_interval_idx;
188 pos_data[prev_pos] = next_interval_idx;
191 /* closing the deepest open interval */
192 pos_data[prev_pos] = *top;
194 u32 closed_interval_idx = *top;
195 u32 superinterval_idx = *--top;
196 u32 superinterval_lcp = intervals[superinterval_idx];
198 if (next_lcp == superinterval_lcp) {
199 /* continuing the superinterval */
200 intervals[closed_interval_idx] |=
201 (superinterval_idx << LZ_LCPIT_LCP_BITS) |
204 } else if (next_lcp > superinterval_lcp) {
205 /* creating a new interval that is a
206 * superinterval of the one being
207 * closed, but still a subinterval of
208 * its superinterval */
209 intervals[next_interval_idx] = next_lcp;
210 *++top = next_interval_idx;
211 intervals[closed_interval_idx] |=
212 (next_interval_idx << LZ_LCPIT_LCP_BITS) |
217 /* also closing the superinterval */
218 intervals[closed_interval_idx] |=
219 (superinterval_idx << LZ_LCPIT_LCP_BITS) |
227 /* close any still-open intervals */
228 pos_data[prev_pos] = *top;
229 while (top > open_intervals) {
230 u32 closed_interval_idx = *top;
231 u32 superinterval_idx = *--top;
232 intervals[closed_interval_idx] |=
233 (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000;
238 lz_lcpit_set_default_params(struct lz_mf_params *params)
240 if (params->min_match_len == 0)
241 params->min_match_len = 2;
243 if (params->max_match_len == 0)
244 params->max_match_len = UINT32_MAX;
246 if (params->max_search_depth == 0)
247 params->max_search_depth = 32;
249 params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 8);
251 if (params->nice_match_len == 0)
252 params->nice_match_len = LZ_LCPIT_LCP_MAX;
254 if (params->nice_match_len < params->min_match_len)
255 params->nice_match_len = params->min_match_len;
257 if (params->nice_match_len > params->max_match_len)
258 params->nice_match_len = params->max_match_len;
260 if (params->nice_match_len > LZ_LCPIT_LCP_MAX)
261 params->nice_match_len = LZ_LCPIT_LCP_MAX;
265 lz_lcpit_params_valid(const struct lz_mf_params *params)
267 return params->max_window_size <= LZ_LCPIT_MAX_WINDOW_SIZE;
271 lz_lcpit_get_needed_memory(u32 max_window_size)
273 return sizeof(u32) * (max_window_size +
274 max(BUILD_SA_MIN_TMP_LEN,
275 2 * (u64)max_window_size));
279 lz_lcpit_init(struct lz_mf *_mf)
281 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
283 lz_lcpit_set_default_params(&mf->base.params);
285 mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
286 return (mf->mem != NULL);
290 lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
292 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
294 build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]);
295 build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n);
296 build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n,
297 mf->base.params.nice_match_len);
298 build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n);
299 mf->intervals = &mf->mem[1 * n];
300 mf->pos_data = &mf->mem[2 * n];
304 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
306 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
307 const u32 min_match_len = mf->base.params.min_match_len;
308 const u32 cur_pos = mf->base.cur_window_pos;
309 u32 * const pos_data = mf->pos_data;
310 u32 * const intervals = mf->intervals;
313 u32 interval, next_interval;
314 u32 cur_match, next_match;
316 /* Look up the deepest lcp-interval containing the current suffix. */
317 interval = pos_data[cur_pos];
319 /* Since the current position is greater than any position previously
320 * searched, set the "lcp interval of the next match" for this suffix to
321 * 0. This is the index of the root interval, and this indicates that
322 * there is no next match. */
323 pos_data[cur_pos] = 0;
325 /* Ascend the lcp-interval tree until we reach an lcp-interval that has
326 * already been visited. */
328 while (intervals[interval] & 0x80000000) {
330 /* Visiting this lcp-interval for the first time. Therefore,
331 * there are no Lempel-Ziv matches with length equal to the lcp
332 * of this lcp-interval. */
334 /* Extract the LCP and superinterval reference. */
336 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
338 next_interval = (intervals[interval] & ~0x80000000)
339 >> LZ_LCPIT_LCP_BITS;
341 /* If the LCP is shorter than the minimum length of matches to
342 * be produced, we're done, since the LCP will only ever get
343 * shorter from here. This also prevents ascending above the
344 * root of the lcp-interval tree, since the root is guaranteed
345 * to be a 0-interval, and min_match_len is guaranteed to be at
347 if (lcp < min_match_len)
350 /* Set the position of the most-recently-seen suffix within this
351 * lcp-interval. Since this is the first visitation of this
352 * lcp-interval, this is simply the current suffix.
354 * Note that this overwrites the superinterval reference which
355 * was previously included in this lcp-interval data slot.
356 * Further visitations of this lcp-interval will detect that it
357 * is already visited and will follow the chain of
358 * most-recently-seen suffixes rather than ascend the tree
360 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
362 /* Ascend to the superinterval of this lcp-interval. */
363 interval = next_interval;
366 /* We've already visited the current lcp-interval. */
368 /* Extract the LCP of this lcp-interval. */
369 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
371 /* Extract the current match for this lcp-interval. This usually is the
372 * most-recently-seen suffix within this lcp-interval, but it may be
374 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
377 /* If the LCP is shorter than the minimum length of matches to
378 * be produced, we're done, since the LCP will only ever get
379 * shorter from here. This also prevents ascending above the
380 * root of the lcp-interval tree, since the root is guaranteed
381 * to be a 0-interval, and min_match_len is guaranteed to be at
383 if (lcp < min_match_len)
386 /* Advance the current match until the lcp of the *next* match
387 * is lower than the current lcp. When this is true we know
388 * that the current match is up to date (lowest offset /
389 * greatest position for that lcp). */
391 next_match = cur_match;
393 next_interval = pos_data[next_match];
394 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
395 cur_match = next_match;
396 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
397 } while (next_lcp >= lcp);
399 /* Link the current position into the match chain, discarding
400 * any skipped matches. */
401 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
402 pos_data[cur_match] = interval;
404 /* Record the match. */
405 matches[num_matches++] = (struct lz_match) {
407 .offset = cur_pos - cur_match,
410 /* Bound the number of matches per position. */
411 if (num_matches >= mf->base.params.max_search_depth)
414 /* Advance to the next match. */
415 interval = next_interval;
417 cur_match = next_match;
420 /* If the length of the longest match is equal to the lcp limit, it may
421 * have been truncated. Try extending it up to the maximum match
423 if (num_matches && matches[0].len == mf->base.params.nice_match_len) {
424 const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
425 const u8 * const matchptr = strptr - matches[0].offset;
426 const u32 len_limit = min(lz_mf_get_bytes_remaining(&mf->base),
427 mf->base.params.max_match_len);
430 len = matches[0].len;
431 while (len < len_limit && strptr[len] == matchptr[len])
433 matches[0].len = len;
436 for (u32 i = 0; i < num_matches / 2; i++)
437 swap(matches[i], matches[num_matches - 1 - i]);
439 mf->base.cur_window_pos++;
443 /* Slightly simplified version of lz_lcpit_get_matches() for updating the data
444 * structures when we don't actually need matches at the current position. See
445 * lz_lcpit_get_matches() for explanatory comments. */
447 lz_lcpit_skip_position(struct lz_lcpit *mf)
449 const u32 min_match_len = mf->base.params.min_match_len;
450 const u32 cur_pos = mf->base.cur_window_pos++;
451 u32 * const pos_data = mf->pos_data;
452 u32 * const intervals = mf->intervals;
454 u32 interval, next_interval;
455 u32 cur_match, next_match;
457 interval = pos_data[cur_pos];
458 pos_data[cur_pos] = 0;
459 while (intervals[interval] & 0x80000000) {
460 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
461 next_interval = (intervals[interval] & ~0x80000000)
462 >> LZ_LCPIT_LCP_BITS;
463 if (lcp < min_match_len)
465 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
466 interval = next_interval;
468 lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
469 cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
470 while (lcp >= min_match_len) {
471 next_match = cur_match;
473 next_interval = pos_data[next_match];
474 next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK;
475 cur_match = next_match;
476 next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS;
477 } while (next_lcp >= lcp);
478 intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
479 pos_data[cur_match] = interval;
480 interval = next_interval;
482 cur_match = next_match;
487 lz_lcpit_skip_positions(struct lz_mf *_mf, u32 n)
489 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
492 lz_lcpit_skip_position(mf);
497 lz_lcpit_destroy(struct lz_mf *_mf)
499 struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
504 const struct lz_mf_ops lz_lcp_interval_tree_ops = {
505 .params_valid = lz_lcpit_params_valid,
506 .get_needed_memory = lz_lcpit_get_needed_memory,
507 .init = lz_lcpit_init,
508 .load_window = lz_lcpit_load_window,
509 .get_matches = lz_lcpit_get_matches,
510 .skip_positions = lz_lcpit_skip_positions,
511 .destroy = lz_lcpit_destroy,
512 .struct_size = sizeof(struct lz_lcpit),