2 * lz_linked_suffix_array.c
4 * Linked suffix array match-finder for Lempel-Ziv compression.
6 * Copyright (c) 2013, 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "wimlib/lz_mf.h"
37 #include "wimlib/lz_suffix_array_utils.h"
38 #include "wimlib/util.h"
42 /* Length type --- must be an unsigned type large enough to hold the maximum
44 typedef u16 lz_lsa_len_t;
46 /* Type of distances in suffix array links. A larger type would allow skipping
47 * irrelevant suffixes more quickly, which is especially helpful towards the
48 * start of the window. However, even a single byte allows skipping 255 at a
49 * time, which where it matters is already a big improvement over the
50 * alternative of searching the suffixes consecutively. */
51 typedef u8 lz_lsa_delta_t;
53 #define LZ_LSA_LEN_MAX ((lz_lsa_len_t)~0UL)
54 #define LZ_LSA_POS_MAX ((u32)~0UL)
55 #define LZ_LSA_DELTA_MAX ((lz_lsa_delta_t)~0UL)
57 /* State of the linked suffix array match-finder. */
62 /* Suffix array for the current window.
63 * This is a mapping from suffix rank to suffix position. */
66 /* Inverse suffix array for the current window.
67 * This is a mapping from suffix position to suffix rank.
68 * If 0 <= r < window_size, then ISA[SA[r]] == r. */
71 /* Suffix array links.
73 * During a linear scan of the input string to find matches, this array
74 * used to keep track of which rank suffixes in the suffix array appear
75 * before the current position. Instead of searching in the original
76 * suffix array, scans for matches at a given position traverse a linked
77 * list containing (usually) only suffixes that appear before that
79 struct salink *salink;
82 /* Suffix array link. An array of these structures, one per suffix rank, is
83 * used as a replacement for the raw LCP (Longest Common Prefix) array to allow
84 * skipping over suffixes that appear later in the window and hence cannot be
85 * used as LZ77 matches. */
88 /* Temporary fields used while this structure is being
91 * Note: we want the entire `struct salink' to be only 6 bytes,
92 * even though this makes "next_initial" unaligned. */
95 lz_lsa_len_t lcpnext_initial;
99 /* Intially, the length, in bytes, of the longest common
100 * prefix (LCP) between the suffix having this rank and
101 * the suffix with the smallest larger rank that
102 * starts earlier in the window than the suffix having
103 * this rank. If no such suffix exists, this will be 0.
105 * Later, during match-finding, after the corresponding
106 * suffix has entered the LZ77 dictionary, this value
107 * may be updated by lz_lsa_update_salink() to refer
108 * instead to a lexicographically closer (but still
109 * larger) suffix that begins at a later position that
110 * has entered the LZ77 dictionary. */
111 lz_lsa_len_t lcpnext;
113 /* Initially, the length, in bytes, of the longest
114 * common prefix (LCP) between the suffix having this
115 * rank and the suffix with the largest smaller rank
116 * that starts earlier in the window than the suffix
117 * having this rank. If no such suffix exists, this
120 * Later, during match-finding, after the corresponding
121 * suffix has entered the LZ77 dictionary, this value
122 * may be updated by lz_lsa_update_salink() to refer
123 * instead to a lexicographically closer (but still
124 * smaller) suffix that begins at a later position that
125 * has entered the LZ77 dictionary. */
126 lz_lsa_len_t lcpprev;
128 /* Distance to the suffix referred to in the description
129 * of "lcpnext" above, but capped to a maximum value to
130 * save memory; or, 0 if no such suffix exists. If the
131 * true distance was truncated, this will give the
132 * distance to the rank of a suffix that is
133 * lexicographically closer to the current suffix than
134 * the desired suffix, but appears *later* in the window
135 * and hence cannot be used as the basis for an LZ77
137 lz_lsa_delta_t dist_to_next;
139 /* Distance to the suffix referred to in the description
140 * of "lcpprev" above, but capped to a maximum value to
141 * save memory; or, 0 if no such suffix exists. If the
142 * true distance was truncated, this will give the
143 * distance to the rank of a suffix that is
144 * lexicographically closer to the current suffix than
145 * the desired suffix, but appears *later* in the window
146 * and hence cannot be used as the basis for an LZ77
148 lz_lsa_delta_t dist_to_prev;
153 /* Initialize the SA link array in linear time.
155 * This is similar to computing the LPF (Longest Previous Factor) array, which
156 * is addressed in several papers. In particular the algorithms below are based
157 * on Crochemore et al. 2009: "LPF computation revisited". However, this
158 * match-finder does not actually compute or use the LPF array per se. Rather,
159 * this function sets up some information necessary to compute the LPF array,
160 * but later lz_lsa_get_matches() actually uses this information to search
161 * the suffix array directly and can keep searching beyond the first (longest)
162 * match whose length would be placed in the LPF array. This difference from
163 * the theoretical work is necessary because in many real compression formats
164 * matches take variable numbers of bits to encode, so a decent parser needs to
165 * consider more than just the longest match with unspecified offset.
167 * Note: We cap the lcpprev and lcpnext values to the maximum match length so
168 * that the match-finder need not worry about it later, in the inner loop.
170 * Note: the LCP array is one of the inputs to this function, but it is used as
171 * temporary space and therefore will be invalidated.
174 init_salink(struct salink link[restrict], u32 LCP[restrict],
175 const u32 SA[restrict], const u8 T[restrict], u32 n,
176 lz_lsa_len_t min_match_len, lz_lsa_len_t max_match_len)
178 /* Calculate salink.dist_to_next and salink.lcpnext.
180 * Pass 1 calculates, for each suffix rank, the corresponding
181 * "next_initial" value which is the smallest larger rank that
182 * corresponds to a suffix starting earlier in the string. It also
183 * calculates "lcpnext_initial", which is the longest common prefix with
184 * that suffix, although to eliminate checks in lz_lsa_get_matches(),
185 * "lcpnext_initial" is set to 0 if it's less than the minimum match
186 * length or set to the maximum match length if it's greater than the
187 * maximum match length.
189 * Pass 2 translates each absolute "next_initial", a 4-byte value, into
190 * a relative "dist_to_next", a 1-byte value. This is done to save
191 * memory. In the case that the exact relative distance cannot be
192 * encoded in 1 byte, it is capped to 255. This is valid as long as
193 * lz_lsa_get_matches() validates each position before using it.
194 * Note that "lcpnext" need not be updated in this case because it will
195 * not be used until the actual next rank has been found anyway.
197 link[n - 1].next_initial = LZ_LSA_POS_MAX;
198 link[n - 1].lcpnext_initial = 0;
199 for (u32 r = n - 2; r != LZ_LSA_POS_MAX; r--) {
202 while (t != LZ_LSA_POS_MAX && SA[t] > SA[r]) {
203 l = min(l, link[t].lcpnext_initial);
204 t = link[t].next_initial;
206 link[r].next_initial = t;
208 if (l < min_match_len)
210 else if (l > max_match_len)
212 link[r].lcpnext_initial = l;
214 for (u32 r = 0; r < n; r++) {
217 lz_lsa_delta_t dist_to_next;
219 next = link[r].next_initial;
220 l = link[r].lcpnext_initial;
222 if (next == LZ_LSA_POS_MAX)
224 else if (next - r <= LZ_LSA_DELTA_MAX)
225 dist_to_next = next - r;
227 dist_to_next = LZ_LSA_DELTA_MAX;
230 link[r].dist_to_next = dist_to_next;
233 /* Calculate salink.dist_to_prev and salink.lcpprev.
235 * This is analgous to dist_to_next and lcpnext as described above, but
236 * in the other direction. That is, here we're interested in, for each
237 * rank, the largest smaller rank that corresponds to a suffix starting
238 * earlier in the string.
240 * To save memory we don't have a "prev_initial" field, but rather store
241 * those values in the LCP array. */
242 LCP[0] = LZ_LSA_POS_MAX;
244 for (u32 r = 1; r < n; r++) {
247 while (t != LZ_LSA_POS_MAX && SA[t] > SA[r]) {
248 l = min(l, link[t].lcpprev);
253 if (l < min_match_len)
255 else if (l > max_match_len)
260 for (u32 r = 0; r < n; r++) {
264 if (prev == LZ_LSA_POS_MAX)
265 link[r].dist_to_prev = 0;
266 else if (r - prev <= LZ_LSA_DELTA_MAX)
267 link[r].dist_to_prev = r - prev;
269 link[r].dist_to_prev = LZ_LSA_DELTA_MAX;
273 /* If ENABLE_LZ_DEBUG is defined, verify the values computed by init_salink().
275 * WARNING: this is for debug use only as it does not necessarily run in linear
278 verify_salink(const struct salink link[], const u32 SA[], const u8 T[], u32 n,
279 lz_lsa_len_t min_match_len, lz_lsa_len_t max_match_len)
281 #ifdef ENABLE_LZ_DEBUG
282 for (u32 r = 0; r < n; r++) {
283 for (u32 prev = r; ; ) {
285 LZ_ASSERT(link[r].dist_to_prev == 0);
286 LZ_ASSERT(link[r].lcpprev == 0);
292 if (SA[prev] < SA[r]) {
293 LZ_ASSERT(link[r].dist_to_prev == min(r - prev, LZ_LSA_DELTA_MAX));
297 lcpprev < min(n - SA[prev], n - SA[r]) &&
298 T[SA[prev] + lcpprev] == T[SA[r] + lcpprev];
301 if (lcpprev < min_match_len)
303 else if (lcpprev > max_match_len)
304 lcpprev = max_match_len;
306 LZ_ASSERT(lcpprev == link[r].lcpprev);
311 for (u32 next = r; ; ) {
313 LZ_ASSERT(link[r].dist_to_next == 0);
314 LZ_ASSERT(link[r].lcpnext == 0);
320 if (SA[next] < SA[r]) {
321 LZ_ASSERT(link[r].dist_to_next == min(next - r, LZ_LSA_DELTA_MAX));
325 lcpnext < min(n - SA[next], n - SA[r]) &&
326 T[SA[next] + lcpnext] == T[SA[r] + lcpnext];
329 if (lcpnext < min_match_len)
331 else if (lcpnext > max_match_len)
332 lcpnext = max_match_len;
334 LZ_ASSERT(lcpnext == link[r].lcpnext);
343 lz_lsa_update_salink(const u32 r, struct salink link[])
345 const u32 next = r + link[r].dist_to_next;
346 const u32 prev = r - link[r].dist_to_prev;
348 if (next != r && link[r].dist_to_next < link[next].dist_to_prev) {
349 link[next].dist_to_prev = link[r].dist_to_next;
350 link[next].lcpprev = link[r].lcpnext;
353 if (prev != r && link[r].dist_to_prev < link[prev].dist_to_next) {
354 link[prev].dist_to_next = link[r].dist_to_prev;
355 link[prev].lcpnext = link[r].lcpprev;
360 lz_lsa_set_default_params(struct lz_mf_params *params)
362 if (params->min_match_len == 0)
363 params->min_match_len = 2;
365 if (params->max_match_len == 0)
366 params->max_match_len = UINT32_MAX;
368 if (params->max_match_len > LZ_LSA_LEN_MAX)
369 params->max_match_len = LZ_LSA_LEN_MAX;
371 if (params->max_search_depth == 0)
372 params->max_search_depth = 32;
374 /* Scale max_search_depth down since this algorithm finds the longest
376 params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 5);
380 lz_lsa_get_needed_memory(u32 max_window_size)
385 size += (u64)max_window_size * sizeof(u32);
388 size += (u64)max_window_size * sizeof(u32);
390 /* salink and minimum temporary space for divsufsort */
391 size += max(BUILD_SA_MIN_TMP_LEN * sizeof(u32),
392 (u64)max_window_size * sizeof(struct salink));
398 lz_lsa_params_valid(const struct lz_mf_params *params)
404 lz_lsa_init(struct lz_mf *_mf)
406 struct lz_lsa *mf = (struct lz_lsa *)_mf;
407 const u32 max_window_size = mf->base.params.max_window_size;
409 lz_lsa_set_default_params(&mf->base.params);
411 /* SA and ISA will share the same allocation. */
412 mf->SA = MALLOC(max_window_size * 2 * sizeof(u32));
416 mf->salink = MALLOC(max(BUILD_SA_MIN_TMP_LEN * sizeof(u32),
417 max_window_size * sizeof(struct salink)));
427 lz_lsa_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
429 struct lz_lsa *mf = (struct lz_lsa *)_mf;
432 build_SA(mf->SA, T, n, (u32 *)mf->salink);
434 /* Compute ISA (Inverse Suffix Array) in a preliminary position.
436 * This is just a trick to save memory. Since LCP is unneeded after
437 * this function, it can be computed in any available space. The
438 * storage for the ISA is the best choice because the ISA can be built
439 * quickly in salink for now, then re-built in its real location at the
440 * end. This is probably worth it because computing the ISA from the SA
441 * is very fast, and since this match-finder is memory-hungry we'd like
442 * to save as much memory as possible. */
443 BUILD_BUG_ON(sizeof(mf->salink[0]) < sizeof(mf->ISA[0]));
444 ISA = (u32 *)mf->salink;
445 build_ISA(ISA, mf->SA, n);
447 /* Compute LCP (Longest Common Prefix) array. */
449 build_LCP(LCP, mf->SA, ISA, T, n);
451 /* Initialize suffix array links. */
452 init_salink(mf->salink, LCP, mf->SA, T, n,
453 mf->base.params.min_match_len,
454 mf->base.params.max_match_len);
455 verify_salink(mf->salink, mf->SA, T, n,
456 mf->base.params.min_match_len,
457 mf->base.params.max_match_len);
459 /* Compute ISA (Inverse Suffix Array) in its final position. */
461 build_ISA(ISA, mf->SA, n);
463 /* Save new variables and return. */
468 lz_lsa_get_matches(struct lz_mf *_mf, struct lz_match matches[])
470 struct lz_lsa *mf = (struct lz_lsa *)_mf;
471 const u32 i = mf->base.cur_window_pos++;
473 const u32 * const restrict SA = mf->SA;
474 const u32 * const restrict ISA = mf->ISA;
475 struct salink * const restrict link = mf->salink;
477 /* r = Rank of the suffix at the current position. */
478 const u32 r = ISA[i];
480 /* Prepare for searching the current position. */
481 lz_lsa_update_salink(r, link);
483 /* Prefetch next position in SA and link.
485 * This can improve performance on large windows since the locations in
486 * SA and link at which each successive search begins are in general
487 * randomly distributed. */
488 if (likely(i + 1 < mf->base.cur_window_size)) {
489 const u32 next_r = ISA[i + 1];
490 prefetch(&SA[next_r]);
491 prefetch(&link[next_r]);
494 /* L = rank of next suffix to the left;
495 * R = rank of next suffix to the right;
496 * lenL = length of match between current position and the suffix with rank L;
497 * lenR = length of match between current position and the suffix with rank R.
499 * This is left and right relative to the rank of the current suffix.
500 * Since the suffixes in the suffix array are sorted, the longest
501 * matches are immediately to the left and right (using the linked list
502 * to ignore all suffixes that occur later in the window). The match
503 * length decreases the farther left and right we go. We shall keep the
504 * length on both sides in sync in order to choose the lowest-cost match
507 u32 L = r - link[r].dist_to_prev;
508 u32 R = r + link[r].dist_to_next;
509 u32 lenL = link[r].lcpprev;
510 u32 lenR = link[r].lcpnext;
512 /* num_matches = number of matches found so far. */
515 /* best_offset = offset of lowest-cost match found so far.
517 * Shorter matches that do not have a lower offset than this are
518 * discarded, since presumably it would be cheaper to output the bytes
519 * from the longer match instead. */
520 u32 best_offset = LZ_LSA_POS_MAX;
522 /* count_remaining = maximum number of possible matches remaining to be
524 u32 count_remaining = mf->base.params.max_search_depth;
526 /* pending_offset = offset of lowest-cost match found for the current
527 * length, or 0 if none found yet. */
528 u32 pending_offset = 0;
530 /* Note: some 'goto' statements are used in the remainder of this
531 * function to remove unnecessary checks and create branches that the
532 * CPU may predict better. (This function is performance critical.) */
534 if (lenL != 0 && lenL >= lenR)
542 /* Search suffixes on the left until the match length has decreased
543 * below the next match length on the right or to below the minimum
550 /* Check for hard cutoff on amount of work done. */
551 if (count_remaining-- == 0) {
552 if (pending_offset != 0) {
553 /* Save pending match. */
554 matches[num_matches++] = (struct lz_match) {
556 .offset = pending_offset,
563 /* Suffix is in LZ77 dictionary. (Check was needed
564 * because the salink array caps distances to save
569 /* Save match offset if it results in lower cost. */
570 if (offset < best_offset) {
571 best_offset = offset;
572 pending_offset = offset;
576 /* Advance left to previous suffix. */
581 L -= link[L].dist_to_prev;
583 if (link[old_L].lcpprev < old_lenL) {
584 /* Match length decreased. */
586 lenL = link[old_L].lcpprev;
588 if (old_lenL > lenR) {
589 /* Neither the right side nor the left size has
590 * any more matches of length @old_lenL. If a
591 * pending match exists, save it. */
592 if (pending_offset != 0) {
593 matches[num_matches++] = (struct lz_match) {
595 .offset = pending_offset,
601 /* New match length on left is still at
602 * least as large as the next match
603 * length on the right: Keep extending
604 * left, unless the minimum match length
605 * would be underrun. */
612 /* Here we have lenL < lenR. Extend right.
613 * (No check for whether the minimum match length has
614 * been underrun is needed, provided that such lengths
615 * are marked as 0.) */
621 /* Search suffixes on the right until the match length has decreased to
622 * the next match length on the left or to below the minimum match
629 /* Check for hard cutoff on amount of work done. */
630 if (count_remaining-- == 0) {
631 if (pending_offset != 0) {
632 /* Save pending match. */
633 matches[num_matches++] = (struct lz_match) {
635 .offset = pending_offset,
642 /* Suffix is in LZ77 dictionary. (Check was needed
643 * because the salink array caps distances to save
648 if (offset < best_offset) {
649 best_offset = offset;
650 pending_offset = offset;
654 /* Advance right to next suffix. */
659 R += link[R].dist_to_next;
661 if (link[old_R].lcpnext < lenR) {
662 /* Match length decreased. */
664 lenR = link[old_R].lcpnext;
666 /* Neither the right side nor the left size has any more
667 * matches of length @old_lenR. If a pending match
668 * exists, save it. */
669 if (pending_offset != 0) {
670 matches[num_matches++] = (struct lz_match) {
672 .offset = pending_offset,
678 /* lenL >= lenR: Extend left, unless the
679 * minimum match length would be underrun, in
680 * which case we are done. */
686 /* lenR > lenL: Keep extending right.
687 * (No check for whether the minimum match length has
688 * been underrun is needed, provided that such lengths
689 * are marked as 0.) */
694 for (u32 i = 0; i < num_matches / 2; i++)
695 swap(matches[i], matches[num_matches - 1 - i]);
700 lz_lsa_skip_positions(struct lz_mf *_mf, u32 n)
702 struct lz_lsa *mf = (struct lz_lsa *)_mf;
704 lz_lsa_update_salink(mf->ISA[mf->base.cur_window_pos++], mf->salink);
709 lz_lsa_destroy(struct lz_mf *_mf)
711 struct lz_lsa *mf = (struct lz_lsa *)_mf;
717 const struct lz_mf_ops lz_linked_suffix_array_ops = {
718 .params_valid = lz_lsa_params_valid,
719 .get_needed_memory = lz_lsa_get_needed_memory,
721 .load_window = lz_lsa_load_window,
722 .get_matches = lz_lsa_get_matches,
723 .skip_positions = lz_lsa_skip_positions,
724 .destroy = lz_lsa_destroy,
725 .struct_size = sizeof(struct lz_lsa),