/*
 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
 * traversal of the Longest Common Prefix (LCP) interval tree.
 *
 * The author dedicates this file to the public domain.
 * You can do whatever you want with this file.
 */

#include "wimlib/divsufsort.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/util.h"

/* Number of bits reserved for each LCP value in a packed entry */
#define LCP_BITS		6

#define LCP_MAX			(((u32)1 << LCP_BITS) - 1)
#define POS_MASK		(((u32)1 << (32 - LCP_BITS)) - 1)
#define LCP_SHIFT		(32 - LCP_BITS)
#define LCP_MASK		(LCP_MAX << LCP_SHIFT)
#define MAX_NORMAL_BUFSIZE	(POS_MASK + 1)

#define HUGE_LCP_BITS		7
#define HUGE_LCP_MAX		(((u32)1 << HUGE_LCP_BITS) - 1)
#define HUGE_LCP_SHIFT		(64 - HUGE_LCP_BITS)
#define HUGE_POS_MASK		0xFFFFFFFF
#define MAX_HUGE_BUFSIZE	((u64)HUGE_POS_MASK + 1)
#define HUGE_UNVISITED_TAG	0x100000000

#define PREFETCH_SAFETY		5
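
/*
 * Illustrative helpers (a sketch, not part of the matchfinder itself): how an
 * (lcp, position) pair is packed into one 32-bit entry in the normal mode.
 * For example, with LCP_BITS == 6 we have LCP_SHIFT == 26, so lcp == 3 and
 * pos == 100 pack to (3 << 26) | 100 == 0x0C000064.
 */
static inline u32
pack_lcp_and_pos(u32 lcp, u32 pos)
{
	return (lcp << LCP_SHIFT) | pos;
}

static inline u32
unpack_lcp(u32 entry)
{
	return entry >> LCP_SHIFT;
}

static inline u32
unpack_pos(u32 entry)
{
	return entry & POS_MASK;
}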

/*
 * Build the LCP (Longest Common Prefix) array in linear time.
 *
 * LCP[r] will be the length of the longest common prefix between the suffixes
 * with positions SA[r - 1] and SA[r].  LCP[0] will be undefined.
 *
 * Algorithm taken from Kasai et al. (2001), but modified slightly:
 *
 *  - With bytes there is no realistic way to reserve a unique symbol for
 *    end-of-buffer, so use explicit checks for end-of-buffer.
 *
 *  - For decreased memory usage and improved memory locality, pack the two
 *    logically distinct SA and LCP arrays into a single array SA_and_LCP.
 *
 *  - Since SA_and_LCP is accessed randomly, improve the cache behavior by
 *    reading several entries ahead in ISA and prefetching the upcoming
 *    entries of SA_and_LCP.
 *
 *  - If an LCP value is less than the minimum match length, then store 0.  This
 *    avoids having to do comparisons against the minimum match length later.
 *
 *  - If an LCP value is greater than the "nice match length", then store the
 *    "nice match length".  This caps the number of bits needed to store each
 *    LCP value, and this caps the depth of the LCP-interval tree, without
 *    usually hurting the compression ratio too much.
 *
 * References:
 *
 *	Kasai et al.  2001.  Linear-Time Longest-Common-Prefix Computation in
 *	Suffix Arrays and Its Applications.  CPM '01 Proceedings of the 12th
 *	Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
 */
static void
build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
	  const u8 T[restrict], const u32 n,
	  const u32 min_lcp, const u32 max_lcp)
{
	u32 h = 0;

	for (u32 i = 0; i < n; i++) {
		const u32 r = ISA[i];
		prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
		if (r > 0) {
			const u32 j = SA_and_LCP[r - 1] & POS_MASK;
			const u32 lim = min(n - i, n - j);
			while (h < lim && T[i + h] == T[j + h])
				h++;
			u32 stored_lcp = h;
			if (stored_lcp < min_lcp)
				stored_lcp = 0;
			else if (stored_lcp > max_lcp)
				stored_lcp = max_lcp;
			SA_and_LCP[r] |= stored_lcp << LCP_SHIFT;
			if (h > 0)
				h--; /* Kasai invariant: h decreases by at most 1 */
		}
	}
}
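
/*
 * Worked example (illustrative, not from the original file): for T = "abab",
 * n == 4, the sorted suffixes are "ab" (2), "abab" (0), "b" (3), "bab" (1),
 * so SA = [2, 0, 3, 1] and the raw LCP values are [-, 2, 0, 1].  With
 * min_lcp == 2, the final lcp of 1 is stored as 0, giving:
 *
 *	SA_and_LCP[0] = (0 << LCP_SHIFT) | 2
 *	SA_and_LCP[1] = (2 << LCP_SHIFT) | 0
 *	SA_and_LCP[2] = (0 << LCP_SHIFT) | 3
 *	SA_and_LCP[3] = (0 << LCP_SHIFT) | 1
 */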

/*
 * Use the suffix array accompanied by the longest-common-prefix array ---
 * which in combination can be called the "enhanced suffix array" --- to
 * simulate a bottom-up traversal of the corresponding suffix tree, or
 * equivalently the lcp-interval tree.  Do so in suffix rank order, but save the
 * superinterval references needed for later bottom-up traversal of the tree in
 * suffix position order.
 *
 * To enumerate the lcp-intervals, this algorithm scans the suffix array and its
 * corresponding LCP array linearly.  While doing so, it maintains a stack of
 * lcp-intervals that are currently open, meaning that their left boundaries
 * have been seen but their right boundaries have not.  The bottom of the stack
 * is the interval which covers the entire suffix array (this has lcp=0), and
 * the top of the stack is the deepest interval that is currently open (this has
 * the greatest lcp of any interval on the stack).  When this algorithm opens an
 * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto
 * the stack.  When this algorithm closes an interval, it pops it from the stack
 * and sets the intervals[] entry of that interval to the index and lcp of that
 * interval's superinterval, which is the new top of the stack.
 *
 * This algorithm also sets pos_data[pos] for each suffix position 'pos' to the
 * index and lcp of the deepest lcp-interval containing it.  Alternatively, we
 * can interpret each suffix as being associated with a singleton lcp-interval,
 * or leaf of the suffix tree.  With this interpretation, an entry in pos_data[]
 * is the superinterval reference for one of these singleton lcp-intervals and
 * therefore is not fundamentally different from an entry in intervals[].
 *
 * To reduce memory usage, this algorithm re-uses the suffix array's memory to
 * store the generated intervals[] array.  This is possible because SA and LCP
 * are accessed linearly, and no more than one interval is generated per suffix.
 *
 * The techniques used in this algorithm are described in various published
 * papers.  The generation of lcp-intervals from the suffix array (SA) and the
 * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in
 * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in
 * Abouelhoda et al. (2004).  Both these papers note the equivalence between
 * lcp-intervals (including the singleton lcp-interval for each suffix) and
 * nodes of the suffix tree.  Abouelhoda et al. (2004) furthermore applies
 * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as
 * does Chen et al. (2008).  Algorithm CPS1b of Chen et al. (2008) dynamically
 * re-uses the suffix array during bottom-up traversal of the lcp-interval tree.
 *
 * References:
 *
 *	Kasai et al.  Linear-Time Longest-Common-Prefix Computation in Suffix
 *	Arrays and Its Applications.  2001.  CPM '01 Proceedings of the 12th
 *	Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
 *
 *	M.I. Abouelhoda, S. Kurtz, E. Ohlebusch.  2004.  Replacing Suffix Trees
 *	With Enhanced Suffix Arrays.  Journal of Discrete Algorithms Volume 2
 *	Issue 1, March 2004, pp. 53-86.
 *
 *	G. Chen, S.J. Puglisi, W.F. Smyth.  2008.  Lempel-Ziv Factorization
 *	Using Less Time & Space.  Mathematics in Computer Science June 2008,
 *	Volume 1, Issue 4, pp. 605-623.
 */
static void
build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
{
	u32 * const SA_and_LCP = intervals;
	u32 next_interval_idx;
	u32 open_intervals[LCP_MAX + 1];
	u32 *top = open_intervals;
	u32 prev_pos = SA_and_LCP[0] & POS_MASK;

	/* The root interval (lcp=0, index 0) covers the entire suffix array
	 * and stays open until the very end. */
	*top = 0;
	intervals[0] = 0;
	next_interval_idx = 1;

	for (u32 r = 1; r < n; r++) {
		const u32 next_pos = SA_and_LCP[r] & POS_MASK;
		const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT;
		const u32 top_lcp = *top >> LCP_SHIFT;

		if (next_lcp == top_lcp) {
			/* Continuing the deepest open interval */
			pos_data[prev_pos] = *top;
		} else if (next_lcp > top_lcp) {
			/* Opening a new interval */
			*++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
			pos_data[prev_pos] = *top;
		} else {
			/* Closing the deepest open interval */
			pos_data[prev_pos] = *top;
			for (;;) {
				const u32 closed_interval_idx = *top-- & POS_MASK;
				const u32 superinterval_lcp = *top >> LCP_SHIFT;

				if (next_lcp == superinterval_lcp) {
					/* Continuing the superinterval */
					intervals[closed_interval_idx] = *top;
					break;
				} else if (next_lcp > superinterval_lcp) {
					/* Creating a new interval that is a
					 * superinterval of the one being
					 * closed, but still a subinterval of
					 * its superinterval */
					*++top = (next_lcp << LCP_SHIFT) |
						 next_interval_idx++;
					intervals[closed_interval_idx] = *top;
					break;
				} else {
					/* Also closing the superinterval */
					intervals[closed_interval_idx] = *top;
				}
			}
		}
		prev_pos = next_pos;
	}

	/* Close any still-open intervals. */
	pos_data[prev_pos] = *top;
	for (; top > open_intervals; top--)
		intervals[*top & POS_MASK] = *(top - 1);
}
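
/*
 * Continuing the "abab" worked example (illustrative, not from the original
 * file): build_LCPIT() opens interval 1 (lcp 2, ranks 0-1, holding suffixes
 * 2 and 0) inside the root interval 0 (lcp 0).  It produces
 *
 *	intervals[1] = (0 << LCP_SHIFT) | 0	(superinterval ref: root)
 *	pos_data[0]  = (2 << LCP_SHIFT) | 1
 *	pos_data[1]  = 0			(deepest interval: root)
 *	pos_data[2]  = (2 << LCP_SHIFT) | 1
 *	pos_data[3]  = 0
 */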

/*
 * Advance the LCP-interval tree matchfinder by one byte.
 *
 * If @record_matches is true, then matches are written to the @matches array
 * sorted by strictly decreasing length and strictly decreasing offset, and the
 * return value is the number of matches found.  Otherwise, @matches is ignored
 * and the return value is always 0.
 *
 * How this algorithm works:
 *
 * 'cur_pos' is the position of the current suffix, which is the suffix being
 * matched against.  'cur_pos' starts at 0 and is incremented each time this
 * function is called.  This function finds each suffix with position less than
 * 'cur_pos' that shares a prefix with the current position, but for each
 * distinct prefix length it finds only the suffix with the greatest position
 * (i.e. the most recently seen in the linear traversal by position).  This
 * function accomplishes this using the lcp-interval tree data structure that
 * was built by build_LCPIT() and is updated dynamically by this function.
 *
 * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
 * value of the deepest lcp-interval containing the current suffix --- or,
 * equivalently, the parent of the conceptual singleton lcp-interval that
 * contains the current suffix.
 *
 * The second step is to ascend the lcp-interval tree until reaching an interval
 * that has not yet been visited, and link the intervals to the current suffix
 * along the way.  An lcp-interval has been visited if and only if it has been
 * linked to a suffix.  Initially, no lcp-intervals are linked to suffixes.
 *
 * The third step is to continue ascending the lcp-interval tree, but indirectly
 * through suffix links rather than through the original superinterval
 * references, and continuing to form links with the current suffix along the
 * way.  Each suffix visited during this step, except in a special case to
 * handle outdated suffixes, is a match which can be written to matches[].  Each
 * intervals[] entry contains the position of the next suffix to visit, which we
 * shall call 'match_pos'; this is the most recently seen suffix that belongs to
 * that lcp-interval.  'pos_data[match_pos]' then contains the lcp and interval
 * index of the next lcp-interval that should be visited.
 *
 * We can view these arrays as describing a new set of links that gets overlaid
 * on top of the original superinterval references of the lcp-interval tree.
 * Each such link can connect a node of the lcp-interval tree to an ancestor
 * more than one generation removed.
 *
 * For each one-byte advance, the current position becomes the most recently
 * seen suffix for a continuous sequence of lcp-intervals from a leaf interval
 * to the root interval.  Conceptually, this algorithm needs to update all these
 * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null"
 * link.  But actually, if some of these nodes have the same most recently seen
 * suffix, then this algorithm just visits the pos_data[] entry for that suffix
 * and skips over all these nodes in one step.  Updating the extra nodes is
 * accomplished by creating a redirection from the previous suffix to the
 * current suffix.
 *
 * Using this shortcutting scheme, it's possible for a suffix to become out of
 * date, which means that it is no longer the most recently seen suffix for the
 * lcp-interval under consideration.  This case is detected by noticing when the
 * next lcp-interval link actually points deeper in the tree, and it is worked
 * around by just continuing until we get to a link that actually takes us
 * higher in the tree.  This can be described as a lazy-update scheme.
 */
static inline u32
lcpit_advance_one_byte(const u32 cur_pos,
		       u32 pos_data[restrict],
		       u32 intervals[restrict],
		       struct lz_match matches[restrict],
		       const bool record_matches)
{
	u32 lcp;
	u32 interval_idx;
	u32 match_pos;
	struct lz_match *matchptr;

	/* Get the deepest lcp-interval containing the current suffix. */
	lcp = pos_data[cur_pos] >> LCP_SHIFT;
	interval_idx = pos_data[cur_pos] & POS_MASK;
	prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
	pos_data[cur_pos] = 0;

	/* Ascend until we reach a visited interval, linking the unvisited
	 * intervals to the current suffix as we go. */
	while (intervals[interval_idx] & LCP_MASK) {
		const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT;
		const u32 superinterval_idx = intervals[interval_idx] & POS_MASK;
		intervals[interval_idx] = cur_pos;
		lcp = superinterval_lcp;
		interval_idx = superinterval_idx;
	}

	match_pos = intervals[interval_idx] & POS_MASK;
	if (match_pos == 0) {
		/* Ambiguous case; just don't allow matches with position 0. */
		if (interval_idx != 0)
			intervals[interval_idx] = cur_pos;
		return 0;
	}

	matchptr = matches;
	/* Ascend indirectly via pos_data[] links. */
	for (;;) {
		u32 next_interval_idx;
		u32 next_lcp;

		for (;;) {
			next_lcp = pos_data[match_pos] >> LCP_SHIFT;
			next_interval_idx = pos_data[match_pos] & POS_MASK;
			if (next_lcp < lcp)
				break;
			/* Suffix was out of date. */
			match_pos = intervals[next_interval_idx];
		}
		intervals[interval_idx] = cur_pos;
		pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx;
		if (record_matches) {
			matchptr->length = lcp;
			matchptr->offset = cur_pos - match_pos;
			matchptr++;
		}
		if (next_interval_idx == 0)
			break;
		match_pos = intervals[next_interval_idx];
		interval_idx = next_interval_idx;
		lcp = next_lcp;
	}
	return matchptr - matches;
}

/* Expand SA from 32 bits to 64 bits. */
static void
expand_SA(u32 *p, u32 n)
{
	typedef u32 _may_alias_attribute aliased_u32_t;
	typedef u64 _may_alias_attribute aliased_u64_t;

	aliased_u32_t *SA = (aliased_u32_t *)p;
	aliased_u64_t *SA64 = (aliased_u64_t *)p;

	/* Expand in place, going backwards so that no entry is overwritten
	 * before it has been read. */
	u32 r = n - 1;
	do {
		SA64[r] = SA[r];
	} while (r--);
}
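
/*
 * For example (illustrative): with n == 3 and SA occupying three u32 words
 * {4, 1, 0}, the loop writes SA64[2] (words 4-5), then SA64[1] (words 2-3),
 * then SA64[0] (words 0-1).  Since 2*r >= r, the write of SA64[r] never
 * clobbers a not-yet-read SA entry.
 */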

/* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE. */
static void
build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
	       const u8 T[restrict], const u32 n,
	       const u32 min_lcp, const u32 max_lcp)
{
	u32 h = 0;

	for (u32 i = 0; i < n; i++) {
		const u32 r = ISA[i];
		prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
		if (r > 0) {
			const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
			const u32 lim = min(n - i, n - j);
			while (h < lim && T[i + h] == T[j + h])
				h++;
			u32 stored_lcp = h;
			if (stored_lcp < min_lcp)
				stored_lcp = 0;
			else if (stored_lcp > max_lcp)
				stored_lcp = max_lcp;
			SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT;
			if (h > 0)
				h--;
		}
	}
}

/*
 * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE.
 *
 * This "huge" version is also slightly different in that the lcp value stored
 * in each intervals[] entry is the lcp value for that interval, not its
 * superinterval.  This lcp value stays put in intervals[] and doesn't get moved
 * to pos_data[] during lcpit_advance_one_byte_huge().  One consequence of this
 * is that we have to use a special flag to distinguish visited from unvisited
 * intervals.  But overall, this scheme keeps the memory usage at 12n instead of
 * 16n.  (The non-huge version is 8n.)
 */
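
/*
 * Worked arithmetic for the memory-usage claim above (illustrative): in huge
 * mode, intervals64[] costs 8 bytes per position and pos_data[] costs 4, for
 * 12n total.  If the lcp values moved into pos_data[] as in the normal mode,
 * each pos_data[] entry would need 7 lcp bits plus 32 position bits and thus
 * 8 bytes, giving 8n + 8n = 16n.  The normal mode packs both fields into u32
 * entries in both arrays: 4n + 4n = 8n.
 */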

static void
build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
{
	u64 * const SA_and_LCP64 = intervals64;
	u32 next_interval_idx;
	u32 open_intervals[HUGE_LCP_MAX + 1];
	u32 *top = open_intervals;
	u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK;

	/* The root interval (lcp=0, index 0) stays open until the very end. */
	*top = 0;
	intervals64[0] = 0;
	next_interval_idx = 1;

	for (u32 r = 1; r < n; r++) {
		const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
		const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT;
		const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;

		if (next_lcp == top_lcp) {
			/* Continuing the deepest open interval */
			pos_data[prev_pos] = *top;
		} else if (next_lcp > top_lcp) {
			/* Opening a new interval */
			intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT;
			pos_data[prev_pos] = next_interval_idx;
			*++top = next_interval_idx++;
		} else {
			/* Closing the deepest open interval */
			pos_data[prev_pos] = *top;
			for (;;) {
				const u32 closed_interval_idx = *top--;
				const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;

				if (next_lcp == superinterval_lcp) {
					/* Continuing the superinterval */
					intervals64[closed_interval_idx] |=
						HUGE_UNVISITED_TAG | *top;
					break;
				} else if (next_lcp > superinterval_lcp) {
					/* Creating a new interval that is a
					 * superinterval of the one being
					 * closed, but still a subinterval of
					 * its superinterval */
					intervals64[next_interval_idx] =
						(u64)next_lcp << HUGE_LCP_SHIFT;
					intervals64[closed_interval_idx] |=
						HUGE_UNVISITED_TAG | next_interval_idx;
					*++top = next_interval_idx++;
					break;
				} else {
					/* Also closing the superinterval */
					intervals64[closed_interval_idx] |=
						HUGE_UNVISITED_TAG | *top;
				}
			}
		}
		prev_pos = next_pos;
	}

	/* Close any still-open intervals. */
	pos_data[prev_pos] = *top;
	for (; top > open_intervals; top--)
		intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1);
}

/* Like lcpit_advance_one_byte(), but for buffers larger than
 * MAX_NORMAL_BUFSIZE. */
static inline u32
lcpit_advance_one_byte_huge(const u32 cur_pos,
			    u32 pos_data[restrict],
			    u64 intervals64[restrict],
			    struct lz_match matches[restrict],
			    const bool record_matches)
{
	u32 interval_idx;
	u32 lcp;
	u32 match_pos;
	struct lz_match *matchptr;

	interval_idx = pos_data[cur_pos];
	prefetch(&intervals64[pos_data[cur_pos + 1]]);
	pos_data[cur_pos] = 0;

	/* Ascend until we reach a visited interval, linking the unvisited
	 * intervals to the current suffix as we go. */
	while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) {
		lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
		const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK;
		intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
		interval_idx = superinterval_idx;
	}

	match_pos = intervals64[interval_idx] & HUGE_POS_MASK;
	lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;

	matchptr = matches;
	while (lcp != 0) {
		u32 next_interval_idx;
		u32 next_lcp;

		/* Ascend indirectly via pos_data[] links, skipping over any
		 * out-of-date suffixes. */
		for (;;) {
			next_interval_idx = pos_data[match_pos];
			next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT;
			if (next_lcp < lcp)
				break;
			/* Suffix was out of date. */
			match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
		}
		intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
		pos_data[match_pos] = interval_idx;
		if (record_matches) {
			matchptr->length = lcp;
			matchptr->offset = cur_pos - match_pos;
			matchptr++;
		}
		match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
		interval_idx = next_interval_idx;
		lcp = next_lcp;
	}
	return matchptr - matches;
}

static u64
get_pos_data_size(size_t max_bufsize)
{
	return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
			DIVSUFSORT_TMP_LEN) * sizeof(u32);
}

static u64
get_intervals_size(size_t max_bufsize)
{
	return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ?
				   sizeof(u32) : sizeof(u64));
}

/*
 * Calculate the number of bytes of memory needed for the LCP-interval tree
 * matchfinder.
 *
 * @max_bufsize - maximum buffer size to support
 *
 * Returns the number of bytes required.
 */
u64
lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
{
	return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize);
}
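
/*
 * Example (illustrative sizes): for max_bufsize = 1 << 25 (32 MiB), which is
 * within MAX_NORMAL_BUFSIZE = 1 << 26, this is about
 * ((1 << 25) + PREFETCH_SAFETY) * 4 bytes for pos_data[] plus (1 << 25) * 4
 * bytes for intervals[], i.e. roughly 256 MiB, or 8 bytes per input byte.
 */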

/*
 * Initialize the LCP-interval tree matchfinder.
 *
 * @mf - the matchfinder structure to initialize
 * @max_bufsize - maximum buffer size to support
 * @min_match_len - minimum match length in bytes
 * @nice_match_len - only consider this many bytes of each match
 *
 * Returns true if successfully initialized; false if out of memory.
 */
bool
lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
		       u32 min_match_len, u32 nice_match_len)
{
	if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
		return false;
	if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY)
		return false;

	mf->pos_data = MALLOC(get_pos_data_size(max_bufsize));
	mf->intervals = MALLOC(get_intervals_size(max_bufsize));
	if (!mf->pos_data || !mf->intervals) {
		lcpit_matchfinder_destroy(mf);
		return false;
	}

	mf->min_match_len = min_match_len;
	mf->nice_match_len = min(nice_match_len,
				 (max_bufsize <= MAX_NORMAL_BUFSIZE) ?
				 LCP_MAX : HUGE_LCP_MAX);
	/* Zero the prefetch safety margin past the end of pos_data[]. */
	for (u32 i = 0; i < PREFETCH_SAFETY; i++)
		mf->pos_data[max_bufsize + i] = 0;
	return true;
}

/*
 * Build the suffix array SA for the specified byte array T of length n.
 *
 * The suffix array is a sorted array of the byte array's suffixes, represented
 * by indices into the byte array.  It can equivalently be viewed as a mapping
 * from suffix rank to suffix position.
 *
 * To build the suffix array, we use libdivsufsort, which uses an
 * induced-sorting-based algorithm.  In practice, this seems to be the fastest
 * suffix array construction algorithm currently available.
 *
 * References:
 *
 *	Y. Mori.  libdivsufsort, a lightweight suffix-sorting library.
 *	https://code.google.com/p/libdivsufsort/.
 *
 *	G. Nong, S. Zhang, and W.H. Chan.  2009.  Linear Suffix Array
 *	Construction by Almost Pure Induced-Sorting.  Data Compression
 *	Conference, 2009.  DCC '09.  pp. 193 - 202.
 *
 *	S.J. Puglisi, W.F. Smyth, and A. Turpin.  2007.  A Taxonomy of Suffix
 *	Array Construction Algorithms.  ACM Computing Surveys (CSUR) Volume 39
 *	Issue 2, 2007 Article No. 4.
 */
static void
build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
{
	/* Note: divsufsort() needs temporary space --- one array with 256
	 * spaces and one array with 65536 spaces.  The implementation of
	 * divsufsort() has been modified from the original to use the provided
	 * temporary space instead of allocating its own, since we don't want to
	 * have to deal with malloc() failures here. */
	divsufsort(T, SA, n, tmp);
}

/*
 * Build the inverse suffix array ISA from the suffix array SA.
 *
 * Whereas the suffix array is a mapping from suffix rank to suffix position,
 * the inverse suffix array is a mapping from suffix position to suffix rank.
 */
static void
build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
{
	for (u32 r = 0; r < n; r++)
		ISA[SA[r]] = r;
}
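
/*
 * For example (continuing the "abab" illustration): SA = [2, 0, 3, 1] gives
 * ISA = [1, 3, 0, 2], since e.g. suffix position 0 has rank 1 (SA[1] == 0).
 */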

/*
 * Prepare the LCP-interval tree matchfinder for a new input buffer.
 *
 * @mf - the initialized matchfinder structure
 * @T - the input buffer
 * @n - size of the input buffer in bytes.  This must be nonzero and can be at
 *	most the max_bufsize with which lcpit_matchfinder_init() was called.
 */
void
lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
{
	/* intervals[] temporarily stores SA and LCP packed together.
	 * pos_data[] temporarily stores ISA.
	 * pos_data[] is also used as the temporary space for divsufsort(). */

	build_SA(mf->intervals, T, n, mf->pos_data);
	build_ISA(mf->pos_data, mf->intervals, n);
	if (n <= MAX_NORMAL_BUFSIZE) {
		build_LCP(mf->intervals, mf->pos_data, T, n,
			  mf->min_match_len, mf->nice_match_len);
		build_LCPIT(mf->intervals, mf->pos_data, n);
		mf->huge_mode = false;
	} else {
		expand_SA(mf->intervals, n);
		build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
			       mf->min_match_len, mf->nice_match_len);
		build_LCPIT_huge(mf->intervals64, mf->pos_data, n);
		mf->huge_mode = true;
	}
	mf->cur_pos = 0; /* starting at beginning of input buffer */
}

/*
 * Retrieve a list of matches with the next position.
 *
 * The matches will be recorded in the @matches array, ordered by strictly
 * decreasing length and strictly decreasing offset.
 *
 * The return value is the number of matches found and written to @matches.
 * This can be any value in [0, nice_match_len - min_match_len + 1].
 */
u32
lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
			      struct lz_match *matches)
{
	if (mf->huge_mode)
		return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
						   mf->intervals64, matches, true);
	else
		return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
					      mf->intervals, matches, true);
}
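
/*
 * For example (illustrative values): with min_match_len == 2 and
 * nice_match_len == 8, one call might return the three matches
 * {length 8, offset 100}, {length 5, offset 12}, {length 2, offset 3} ---
 * both fields strictly decreasing, since each successively shorter match
 * comes from a more recently seen (closer) position.
 */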

/*
 * Skip the next @count bytes (don't search for matches at them).  @count is
 * assumed to be greater than 0.
 */
void
lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
{
	if (mf->huge_mode) {
		do {
			lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
						    mf->intervals64, NULL, false);
		} while (--count);
	} else {
		do {
			lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
					       mf->intervals, NULL, false);
		} while (--count);
	}
}
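
/*
 * Illustrative usage sketch (not part of the original matchfinder; the greedy
 * parse and the parameter values are hypothetical): drive the matchfinder over
 * a buffer, taking the longest match at each position or emitting a literal.
 * matches[] must have room for nice_match_len - min_match_len + 1 entries.
 */
static inline void
lcpit_matchfinder_usage_sketch(const u8 *buffer, u32 size)
{
	struct lcpit_matchfinder mf;
	struct lz_match matches[47 + 1];	/* enough for min=2, nice=48 */

	if (!lcpit_matchfinder_init(&mf, size, 2, 48))
		return;
	lcpit_matchfinder_load_buffer(&mf, buffer, size);
	for (u32 i = 0; i < size; ) {
		u32 num_matches = lcpit_matchfinder_get_matches(&mf, matches);
		if (num_matches > 0) {
			/* matches[0] is the longest match; "encode" it, then
			 * skip the remaining positions it covers.  len >= 2
			 * here, so len - 1 > 0 as skip_bytes() requires. */
			u32 len = matches[0].length;
			lcpit_matchfinder_skip_bytes(&mf, len - 1);
			i += len;
		} else {
			/* No match; "encode" a literal byte. */
			i++;
		}
	}
	lcpit_matchfinder_destroy(&mf);
}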

/*
 * Destroy an LCP-interval tree matchfinder that was previously initialized with
 * lcpit_matchfinder_init().
 */
void
lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)
{
	FREE(mf->pos_data);
	FREE(mf->intervals);
}