2 * lz_linked_suffix_array.c
4 * Linked suffix array match-finder for Lempel-Ziv compression.
6 * Copyright (c) 2013, 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 #include "wimlib/lz_mf.h"
33 #include "wimlib/lz_suffix_array_utils.h"
34 #include "wimlib/util.h"
38 /* Length type --- must be an unsigned type large enough to hold the maximum
40 typedef u16 lz_lsa_len_t;
42 /* Type of distances in suffix array links. A larger type would allow skipping
43 * irrelevant suffixes more quickly, which is especially helpful towards the
44 * start of the window. However, even a single byte allows skipping 255 at a
45 * time, which where it matters is already a big improvement over the
46 * alternative of searching the suffixes consecutively. */
47 typedef u8 lz_lsa_delta_t;
49 #define LZ_LSA_LEN_MAX ((lz_lsa_len_t)~0UL)
50 #define LZ_LSA_POS_MAX ((u32)~0UL)
51 #define LZ_LSA_DELTA_MAX ((lz_lsa_delta_t)~0UL)
53 /* State of the linked suffix array match-finder. */
58 /* Suffix array for the current window.
59 * This is a mapping from suffix rank to suffix position. */
62 /* Inverse suffix array for the current window.
63 * This is a mapping from suffix position to suffix rank.
64 * If 0 <= r < window_size, then ISA[SA[r]] == r. */
67 /* Suffix array links.
69 * During a linear scan of the input string to find matches, this array
70 * used to keep track of which rank suffixes in the suffix array appear
71 * before the current position. Instead of searching in the original
72 * suffix array, scans for matches at a given position traverse a linked
73 * list containing (usually) only suffixes that appear before that
75 struct salink *salink;
78 /* Suffix array link. An array of these structures, one per suffix rank, is
79 * used as a replacement for the raw LCP (Longest Common Prefix) array to allow
80 * skipping over suffixes that appear later in the window and hence cannot be
81 * used as LZ77 matches. */
84 /* Temporary fields used while this structure is being
87 * Note: we want the entire `struct salink' to be only 6 bytes,
88 * even though this makes "next_initial" unaligned. */
91 lz_lsa_len_t lcpnext_initial;
95 /* Intially, the length, in bytes, of the longest common
96 * prefix (LCP) between the suffix having this rank and
97 * the suffix with the smallest larger rank that
98 * starts earlier in the window than the suffix having
99 * this rank. If no such suffix exists, this will be 0.
101 * Later, during match-finding, after the corresponding
102 * suffix has entered the LZ77 dictionary, this value
103 * may be updated by lz_lsa_update_salink() to refer
104 * instead to a lexicographically closer (but still
105 * larger) suffix that begins at a later position that
106 * has entered the LZ77 dictionary. */
107 lz_lsa_len_t lcpnext;
109 /* Initially, the length, in bytes, of the longest
110 * common prefix (LCP) between the suffix having this
111 * rank and the suffix with the largest smaller rank
112 * that starts earlier in the window than the suffix
113 * having this rank. If no such suffix exists, this
116 * Later, during match-finding, after the corresponding
117 * suffix has entered the LZ77 dictionary, this value
118 * may be updated by lz_lsa_update_salink() to refer
119 * instead to a lexicographically closer (but still
120 * smaller) suffix that begins at a later position that
121 * has entered the LZ77 dictionary. */
122 lz_lsa_len_t lcpprev;
124 /* Distance to the suffix referred to in the description
125 * of "lcpnext" above, but capped to a maximum value to
126 * save memory; or, 0 if no such suffix exists. If the
127 * true distance was truncated, this will give the
128 * distance to the rank of a suffix that is
129 * lexicographically closer to the current suffix than
130 * the desired suffix, but appears *later* in the window
131 * and hence cannot be used as the basis for an LZ77
133 lz_lsa_delta_t dist_to_next;
135 /* Distance to the suffix referred to in the description
136 * of "lcpprev" above, but capped to a maximum value to
137 * save memory; or, 0 if no such suffix exists. If the
138 * true distance was truncated, this will give the
139 * distance to the rank of a suffix that is
140 * lexicographically closer to the current suffix than
141 * the desired suffix, but appears *later* in the window
142 * and hence cannot be used as the basis for an LZ77
144 lz_lsa_delta_t dist_to_prev;
149 /* Initialize the SA link array in linear time.
151 * This is similar to computing the LPF (Longest Previous Factor) array, which
152 * is addressed in several papers. In particular the algorithms below are based
153 * on Crochemore et al. 2009: "LPF computation revisited". However, this
154 * match-finder does not actually compute or use the LPF array per se. Rather,
155 * this function sets up some information necessary to compute the LPF array,
156 * but later lz_lsa_get_matches() actually uses this information to search
157 * the suffix array directly and can keep searching beyond the first (longest)
158 * match whose length would be placed in the LPF array. This difference from
159 * the theoretical work is necessary because in many real compression formats
160 * matches take variable numbers of bits to encode, so a decent parser needs to
161 * consider more than just the longest match with unspecified offset.
163 * Note: We cap the lcpprev and lcpnext values to the maximum match length so
164 * that the match-finder need not worry about it later, in the inner loop.
166 * Note: the LCP array is one of the inputs to this function, but it is used as
167 * temporary space and therefore will be invalidated.
170 init_salink(struct salink link[restrict], u32 LCP[restrict],
171 const u32 SA[restrict], const u8 T[restrict], u32 n,
172 lz_lsa_len_t min_match_len, lz_lsa_len_t max_match_len)
174 /* Calculate salink.dist_to_next and salink.lcpnext.
176 * Pass 1 calculates, for each suffix rank, the corresponding
177 * "next_initial" value which is the smallest larger rank that
178 * corresponds to a suffix starting earlier in the string. It also
179 * calculates "lcpnext_initial", which is the longest common prefix with
180 * that suffix, although to eliminate checks in lz_lsa_get_matches(),
181 * "lcpnext_initial" is set to 0 if it's less than the minimum match
182 * length or set to the maximum match length if it's greater than the
183 * maximum match length.
185 * Pass 2 translates each absolute "next_initial", a 4-byte value, into
186 * a relative "dist_to_next", a 1-byte value. This is done to save
187 * memory. In the case that the exact relative distance cannot be
188 * encoded in 1 byte, it is capped to 255. This is valid as long as
189 * lz_lsa_get_matches() validates each position before using it.
190 * Note that "lcpnext" need not be updated in this case because it will
191 * not be used until the actual next rank has been found anyway.
193 link[n - 1].next_initial = LZ_LSA_POS_MAX;
194 link[n - 1].lcpnext_initial = 0;
195 for (u32 r = n - 2; r != LZ_LSA_POS_MAX; r--) {
198 while (t != LZ_LSA_POS_MAX && SA[t] > SA[r]) {
199 l = min(l, link[t].lcpnext_initial);
200 t = link[t].next_initial;
202 link[r].next_initial = t;
204 if (l < min_match_len)
206 else if (l > max_match_len)
208 link[r].lcpnext_initial = l;
210 for (u32 r = 0; r < n; r++) {
213 lz_lsa_delta_t dist_to_next;
215 next = link[r].next_initial;
216 l = link[r].lcpnext_initial;
218 if (next == LZ_LSA_POS_MAX)
220 else if (next - r <= LZ_LSA_DELTA_MAX)
221 dist_to_next = next - r;
223 dist_to_next = LZ_LSA_DELTA_MAX;
226 link[r].dist_to_next = dist_to_next;
229 /* Calculate salink.dist_to_prev and salink.lcpprev.
231 * This is analgous to dist_to_next and lcpnext as described above, but
232 * in the other direction. That is, here we're interested in, for each
233 * rank, the largest smaller rank that corresponds to a suffix starting
234 * earlier in the string.
236 * To save memory we don't have a "prev_initial" field, but rather store
237 * those values in the LCP array. */
238 LCP[0] = LZ_LSA_POS_MAX;
240 for (u32 r = 1; r < n; r++) {
243 while (t != LZ_LSA_POS_MAX && SA[t] > SA[r]) {
244 l = min(l, link[t].lcpprev);
249 if (l < min_match_len)
251 else if (l > max_match_len)
256 for (u32 r = 0; r < n; r++) {
260 if (prev == LZ_LSA_POS_MAX)
261 link[r].dist_to_prev = 0;
262 else if (r - prev <= LZ_LSA_DELTA_MAX)
263 link[r].dist_to_prev = r - prev;
265 link[r].dist_to_prev = LZ_LSA_DELTA_MAX;
269 /* If ENABLE_LZ_DEBUG is defined, verify the values computed by init_salink().
271 * WARNING: this is for debug use only as it does not necessarily run in linear
274 verify_salink(const struct salink link[], const u32 SA[], const u8 T[], u32 n,
275 lz_lsa_len_t min_match_len, lz_lsa_len_t max_match_len)
277 #ifdef ENABLE_LZ_DEBUG
278 for (u32 r = 0; r < n; r++) {
279 for (u32 prev = r; ; ) {
281 LZ_ASSERT(link[r].dist_to_prev == 0);
282 LZ_ASSERT(link[r].lcpprev == 0);
288 if (SA[prev] < SA[r]) {
289 LZ_ASSERT(link[r].dist_to_prev == min(r - prev, LZ_LSA_DELTA_MAX));
293 lcpprev < min(n - SA[prev], n - SA[r]) &&
294 T[SA[prev] + lcpprev] == T[SA[r] + lcpprev];
297 if (lcpprev < min_match_len)
299 else if (lcpprev > max_match_len)
300 lcpprev = max_match_len;
302 LZ_ASSERT(lcpprev == link[r].lcpprev);
307 for (u32 next = r; ; ) {
309 LZ_ASSERT(link[r].dist_to_next == 0);
310 LZ_ASSERT(link[r].lcpnext == 0);
316 if (SA[next] < SA[r]) {
317 LZ_ASSERT(link[r].dist_to_next == min(next - r, LZ_LSA_DELTA_MAX));
321 lcpnext < min(n - SA[next], n - SA[r]) &&
322 T[SA[next] + lcpnext] == T[SA[r] + lcpnext];
325 if (lcpnext < min_match_len)
327 else if (lcpnext > max_match_len)
328 lcpnext = max_match_len;
330 LZ_ASSERT(lcpnext == link[r].lcpnext);
339 lz_lsa_update_salink(const u32 r, struct salink link[])
341 const u32 next = r + link[r].dist_to_next;
342 const u32 prev = r - link[r].dist_to_prev;
344 if (next != r && link[r].dist_to_next < link[next].dist_to_prev) {
345 link[next].dist_to_prev = link[r].dist_to_next;
346 link[next].lcpprev = link[r].lcpnext;
349 if (prev != r && link[r].dist_to_prev < link[prev].dist_to_next) {
350 link[prev].dist_to_next = link[r].dist_to_prev;
351 link[prev].lcpnext = link[r].lcpprev;
356 lz_lsa_set_default_params(struct lz_mf_params *params)
358 if (params->min_match_len == 0)
359 params->min_match_len = 2;
361 if (params->max_match_len == 0)
362 params->max_match_len = UINT32_MAX;
364 if (params->max_match_len > LZ_LSA_LEN_MAX)
365 params->max_match_len = LZ_LSA_LEN_MAX;
367 if (params->max_search_depth == 0)
368 params->max_search_depth = 32;
370 /* Scale max_search_depth down since this algorithm finds the longest
372 params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 5);
376 lz_lsa_get_needed_memory(u32 max_window_size)
381 size += (u64)max_window_size * sizeof(u32);
384 size += (u64)max_window_size * sizeof(u32);
386 /* salink and minimum temporary space for divsufsort */
387 size += max(BUILD_SA_MIN_TMP_LEN * sizeof(u32),
388 (u64)max_window_size * sizeof(struct salink));
394 lz_lsa_params_valid(const struct lz_mf_params *params)
400 lz_lsa_init(struct lz_mf *_mf)
402 struct lz_lsa *mf = (struct lz_lsa *)_mf;
403 const u32 max_window_size = mf->base.params.max_window_size;
405 lz_lsa_set_default_params(&mf->base.params);
407 /* SA and ISA will share the same allocation. */
408 mf->SA = MALLOC(max_window_size * 2 * sizeof(u32));
412 mf->salink = MALLOC(max(BUILD_SA_MIN_TMP_LEN * sizeof(u32),
413 max_window_size * sizeof(struct salink)));
423 lz_lsa_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
425 struct lz_lsa *mf = (struct lz_lsa *)_mf;
428 build_SA(mf->SA, T, n, (u32 *)mf->salink);
430 /* Compute ISA (Inverse Suffix Array) in a preliminary position.
432 * This is just a trick to save memory. Since LCP is unneeded after
433 * this function, it can be computed in any available space. The
434 * storage for the ISA is the best choice because the ISA can be built
435 * quickly in salink for now, then re-built in its real location at the
436 * end. This is probably worth it because computing the ISA from the SA
437 * is very fast, and since this match-finder is memory-hungry we'd like
438 * to save as much memory as possible. */
439 BUILD_BUG_ON(sizeof(mf->salink[0]) < sizeof(mf->ISA[0]));
440 ISA = (u32 *)mf->salink;
441 build_ISA(ISA, mf->SA, n);
443 /* Compute LCP (Longest Common Prefix) array. */
445 build_LCP(LCP, mf->SA, ISA, T, n);
447 /* Initialize suffix array links. */
448 init_salink(mf->salink, LCP, mf->SA, T, n,
449 mf->base.params.min_match_len,
450 mf->base.params.max_match_len);
451 verify_salink(mf->salink, mf->SA, T, n,
452 mf->base.params.min_match_len,
453 mf->base.params.max_match_len);
455 /* Compute ISA (Inverse Suffix Array) in its final position. */
457 build_ISA(ISA, mf->SA, n);
459 /* Save new variables and return. */
464 lz_lsa_get_matches(struct lz_mf *_mf, struct lz_match matches[])
466 struct lz_lsa *mf = (struct lz_lsa *)_mf;
467 const u32 i = mf->base.cur_window_pos++;
469 const u32 * const restrict SA = mf->SA;
470 const u32 * const restrict ISA = mf->ISA;
471 struct salink * const restrict link = mf->salink;
473 /* r = Rank of the suffix at the current position. */
474 const u32 r = ISA[i];
476 /* Prepare for searching the current position. */
477 lz_lsa_update_salink(r, link);
479 /* Prefetch next position in SA and link.
481 * This can improve performance on large windows since the locations in
482 * SA and link at which each successive search begins are in general
483 * randomly distributed. */
484 if (likely(i + 1 < mf->base.cur_window_size)) {
485 const u32 next_r = ISA[i + 1];
486 prefetch(&SA[next_r]);
487 prefetch(&link[next_r]);
490 /* L = rank of next suffix to the left;
491 * R = rank of next suffix to the right;
492 * lenL = length of match between current position and the suffix with rank L;
493 * lenR = length of match between current position and the suffix with rank R.
495 * This is left and right relative to the rank of the current suffix.
496 * Since the suffixes in the suffix array are sorted, the longest
497 * matches are immediately to the left and right (using the linked list
498 * to ignore all suffixes that occur later in the window). The match
499 * length decreases the farther left and right we go. We shall keep the
500 * length on both sides in sync in order to choose the lowest-cost match
503 u32 L = r - link[r].dist_to_prev;
504 u32 R = r + link[r].dist_to_next;
505 u32 lenL = link[r].lcpprev;
506 u32 lenR = link[r].lcpnext;
508 /* num_matches = number of matches found so far. */
511 /* best_offset = offset of lowest-cost match found so far.
513 * Shorter matches that do not have a lower offset than this are
514 * discarded, since presumably it would be cheaper to output the bytes
515 * from the longer match instead. */
516 u32 best_offset = LZ_LSA_POS_MAX;
518 /* count_remaining = maximum number of possible matches remaining to be
520 u32 count_remaining = mf->base.params.max_search_depth;
522 /* pending_offset = offset of lowest-cost match found for the current
523 * length, or 0 if none found yet. */
524 u32 pending_offset = 0;
526 /* Note: some 'goto' statements are used in the remainder of this
527 * function to remove unnecessary checks and create branches that the
528 * CPU may predict better. (This function is performance critical.) */
530 if (lenL != 0 && lenL >= lenR)
538 /* Search suffixes on the left until the match length has decreased
539 * below the next match length on the right or to below the minimum
546 /* Check for hard cutoff on amount of work done. */
547 if (count_remaining-- == 0) {
548 if (pending_offset != 0) {
549 /* Save pending match. */
550 matches[num_matches++] = (struct lz_match) {
552 .offset = pending_offset,
559 /* Suffix is in LZ77 dictionary. (Check was needed
560 * because the salink array caps distances to save
565 /* Save match offset if it results in lower cost. */
566 if (offset < best_offset) {
567 best_offset = offset;
568 pending_offset = offset;
572 /* Advance left to previous suffix. */
577 L -= link[L].dist_to_prev;
579 if (link[old_L].lcpprev < old_lenL) {
580 /* Match length decreased. */
582 lenL = link[old_L].lcpprev;
584 if (old_lenL > lenR) {
585 /* Neither the right side nor the left size has
586 * any more matches of length @old_lenL. If a
587 * pending match exists, save it. */
588 if (pending_offset != 0) {
589 matches[num_matches++] = (struct lz_match) {
591 .offset = pending_offset,
597 /* New match length on left is still at
598 * least as large as the next match
599 * length on the right: Keep extending
600 * left, unless the minimum match length
601 * would be underrun. */
608 /* Here we have lenL < lenR. Extend right.
609 * (No check for whether the minimum match length has
610 * been underrun is needed, provided that such lengths
611 * are marked as 0.) */
617 /* Search suffixes on the right until the match length has decreased to
618 * the next match length on the left or to below the minimum match
625 /* Check for hard cutoff on amount of work done. */
626 if (count_remaining-- == 0) {
627 if (pending_offset != 0) {
628 /* Save pending match. */
629 matches[num_matches++] = (struct lz_match) {
631 .offset = pending_offset,
638 /* Suffix is in LZ77 dictionary. (Check was needed
639 * because the salink array caps distances to save
644 if (offset < best_offset) {
645 best_offset = offset;
646 pending_offset = offset;
650 /* Advance right to next suffix. */
655 R += link[R].dist_to_next;
657 if (link[old_R].lcpnext < lenR) {
658 /* Match length decreased. */
660 lenR = link[old_R].lcpnext;
662 /* Neither the right side nor the left size has any more
663 * matches of length @old_lenR. If a pending match
664 * exists, save it. */
665 if (pending_offset != 0) {
666 matches[num_matches++] = (struct lz_match) {
668 .offset = pending_offset,
674 /* lenL >= lenR: Extend left, unless the
675 * minimum match length would be underrun, in
676 * which case we are done. */
682 /* lenR > lenL: Keep extending right.
683 * (No check for whether the minimum match length has
684 * been underrun is needed, provided that such lengths
685 * are marked as 0.) */
690 for (u32 i = 0; i < num_matches / 2; i++)
691 swap(matches[i], matches[num_matches - 1 - i]);
696 lz_lsa_skip_positions(struct lz_mf *_mf, u32 n)
698 struct lz_lsa *mf = (struct lz_lsa *)_mf;
700 lz_lsa_update_salink(mf->ISA[mf->base.cur_window_pos++], mf->salink);
705 lz_lsa_destroy(struct lz_mf *_mf)
707 struct lz_lsa *mf = (struct lz_lsa *)_mf;
713 const struct lz_mf_ops lz_linked_suffix_array_ops = {
714 .params_valid = lz_lsa_params_valid,
715 .get_needed_memory = lz_lsa_get_needed_memory,
717 .load_window = lz_lsa_load_window,
718 .get_matches = lz_lsa_get_matches,
719 .skip_positions = lz_lsa_skip_positions,
720 .destroy = lz_lsa_destroy,
721 .struct_size = sizeof(struct lz_lsa),