From acb5c41d2c9be655eca863a0315a22d1f54889e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Feb 2015 01:13:15 -0600 Subject: [PATCH] LCP-interval tree matchfinder improvements - Adjust the algorithm to remove the need to have an "unvisited" flag in normal mode. - Prefetch upcoming SA_and_LCP entries during LCP construction. - Get rid of lcpit_matchfinder_templates.h, since the "huge mode" algorithm isn't quite the same as the regular algorithm anymore. - Improve documentation. --- Makefile.am | 1 - include/wimlib/lcpit_matchfinder.h | 24 +- src/lcpit_matchfinder.c | 614 +++++++++++++++++++++++++---- src/lcpit_matchfinder_templates.h | 343 ---------------- 4 files changed, 533 insertions(+), 449 deletions(-) delete mode 100644 src/lcpit_matchfinder_templates.h diff --git a/Makefile.am b/Makefile.am index 10511f78..27dad522 100644 --- a/Makefile.am +++ b/Makefile.am @@ -59,7 +59,6 @@ libwim_la_SOURCES = \ src/iterate_dir.c \ src/join.c \ src/lcpit_matchfinder.c \ - src/lcpit_matchfinder_templates.h \ src/lookup_table.c \ src/lzms_common.c \ src/lzms_compress.c \ diff --git a/include/wimlib/lcpit_matchfinder.h b/include/wimlib/lcpit_matchfinder.h index ffcd426b..dfedf371 100644 --- a/include/wimlib/lcpit_matchfinder.h +++ b/include/wimlib/lcpit_matchfinder.h @@ -17,31 +17,13 @@ #include "wimlib/types.h" struct lcpit_matchfinder { - bool huge_mode; - u32 cur_pos; - - /* Mapping: suffix index ("window position") => lcp-interval index */ u32 *pos_data; - - /* Mapping: lcp-interval index => lcp-interval data - * - * Initially, the lcp-interval data for an lcp-interval contains that - * interval's lcp and superinterval index. - * - * After a lcp-interval is visited during match-finding, its - * lcp-interval data contains that interval's lcp and the position of - * the next suffix to consider as a match when matching against that - * lcp-interval. 
*/ union { u32 *intervals; u64 *intervals64; }; - - /* The suffix array */ - u32 *SA; - u32 min_match_len; u32 nice_match_len; }; @@ -58,9 +40,6 @@ extern bool lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize, u32 min_match_len, u32 nice_match_len); -extern void -lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf); - extern void lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n); @@ -71,4 +50,7 @@ lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, extern void lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count); +extern void +lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf); + #endif /* _LCPIT_MATCHFINDER_H */ diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c index dd00979e..9d22ad07 100644 --- a/src/lcpit_matchfinder.c +++ b/src/lcpit_matchfinder.c @@ -1,5 +1,5 @@ /* - * lcpit_matchfinder.h + * lcpit_matchfinder.c * * A match-finder for Lempel-Ziv compression based on bottom-up construction and * traversal of the Longest Common Prefix (LCP) interval tree. 
@@ -12,40 +12,513 @@ */ #include -#include #include "wimlib/divsufsort.h" #include "wimlib/lcpit_matchfinder.h" #include "wimlib/util.h" #define LCP_BITS 6 -#define LCP_MASK ((1 << LCP_BITS) - 1) -#define LCP_MAX LCP_MASK -#define NORMAL_UNVISITED_TAG ((u32)1 << 31) -#define MAX_NORMAL_BUFSIZE ((u32)1 << (31 - LCP_BITS)) -#define HUGE_UNVISITED_TAG ((u64)1 << 63) -#define SA_and_LCP_LCP_SHIFT (32 - LCP_BITS) -#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1) +#define LCP_MAX (((u32)1 << LCP_BITS) - 1) +#define POS_MASK (((u32)1 << (32 - LCP_BITS)) - 1) +#define LCP_SHIFT (32 - LCP_BITS) +#define LCP_MASK (LCP_MAX << LCP_SHIFT) +#define MAX_NORMAL_BUFSIZE (POS_MASK + 1) + +#define HUGE_LCP_BITS 7 +#define HUGE_LCP_MAX (((u32)1 << HUGE_LCP_BITS) - 1) +#define HUGE_LCP_SHIFT (64 - HUGE_LCP_BITS) +#define HUGE_POS_MASK 0xFFFFFFFF +#define MAX_HUGE_BUFSIZE ((u64)HUGE_POS_MASK + 1) +#define HUGE_UNVISITED_TAG 0x100000000 + +#define PREFETCH_SAFETY 5 /* - * Include the template header to define the functions build_LCP(), - * build_LCPIT(), and lcpit_advance_one_byte(). There are "normal" and "huge" - * versions of each function. The normal versions assume that a buffer position - * and LCP value can be packed into a 32-bit integer, whereas the huge versions - * assume that 64 bits is needed. - * - * Both versions cap LCP values to 6 bits. This limits the depth of the - * lcp-interval tree without hurting the compression ratio too much. Matches of - * length 63 are sufficiently long that the compression ratio doesn't change - * significantly if we choose one such match over another. + * Build the LCP (Longest Common Prefix) array in linear time. + * + * LCP[r] will be the length of the longest common prefix between the suffixes + * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. + * + * Algorithm taken from Kasai et al. 
(2001), but modified slightly: + * + * - With bytes there is no realistic way to reserve a unique symbol for + * end-of-buffer, so use explicit checks for end-of-buffer. + * + * - For decreased memory usage and improved memory locality, pack the two + * logically distinct SA and LCP arrays into a single array SA_and_LCP. + * + * - Since SA_and_LCP is accessed randomly, improve the cache behavior by + * reading several entries ahead in ISA and prefetching the upcoming + * SA_and_LCP entry. + * + * - If an LCP value is less than the minimum match length, then store 0. This + * avoids having to do comparisons against the minimum match length later. + * + * - If an LCP value is greater than the "nice match length", then store the + * "nice match length". This caps the number of bits needed to store each + * LCP value, and this caps the depth of the LCP-interval tree, without + * usually hurting the compression ratio too much. + * + * References: + * + * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in + * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. 
+ */ +static void +build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict], + const u8 T[restrict], const u32 n, + const u32 min_lcp, const u32 max_lcp) +{ + u32 h = 0; + for (u32 i = 0; i < n; i++) { + const u32 r = ISA[i]; + prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]); + if (r > 0) { + const u32 j = SA_and_LCP[r - 1] & POS_MASK; + const u32 lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + u32 stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SA_and_LCP[r] |= stored_lcp << LCP_SHIFT; + if (h > 0) + h--; + } + } +} + +/* + * Use the suffix array accompanied with the longest-common-prefix array --- + * which in combination can be called the "enhanced suffix array" --- to + * simulate a bottom-up traversal of the corresponding suffix tree, or + * equivalently the lcp-interval tree. Do so in suffix rank order, but save the + * superinterval references needed for later bottom-up traversal of the tree in + * suffix position order. + * + * To enumerate the lcp-intervals, this algorithm scans the suffix array and its + * corresponding LCP array linearly. While doing so, it maintains a stack of + * lcp-intervals that are currently open, meaning that their left boundaries + * have been seen but their right boundaries have not. The bottom of the stack + * is the interval which covers the entire suffix array (this has lcp=0), and + * the top of the stack is the deepest interval that is currently open (this has + * the greatest lcp of any interval on the stack). When this algorithm opens an + * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto + * the stack. When this algorithm closes an interval, it pops it from the stack + * and sets the intervals[] entry of that interval to the index and lcp of that + * interval's superinterval, which is the new top of the stack. 
+ * + * This algorithm also sets pos_data[pos] for each suffix position 'pos' to the + * index and lcp of the deepest lcp-interval containing it. Alternatively, we + * can interpret each suffix as being associated with a singleton lcp-interval, + * or leaf of the suffix tree. With this interpretation, an entry in pos_data[] + * is the superinterval reference for one of these singleton lcp-intervals and + * therefore is not fundamentally different from an entry in intervals[]. + * + * To reduce memory usage, this algorithm re-uses the suffix array's memory to + * store the generated intervals[] array. This is possible because SA and LCP + * are accessed linearly, and no more than one interval is generated per suffix. + * + * The techniques used in this algorithm are described in various published + * papers. The generation of lcp-intervals from the suffix array (SA) and the + * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in + * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in + * Abouelhoda et al. (2004). Both these papers note the equivalence between + * lcp-intervals (including the singleton lcp-interval for each suffix) and + * nodes of the suffix tree. Abouelhoda et al. (2004) furthermore applies + * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as + * does Chen et al. (2008). Algorithm CPS1b of Chen et al. (2008) dynamically + * re-uses the suffix array during bottom-up traversal of the lcp-interval tree. + * + * References: + * + * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix + * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. + * + * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees + * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2 + * Issue 1, March 2004, pp. 53-86. + * + * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. 
Lempel-Ziv Factorization + * Using Less Time & Space. Mathematics in Computer Science June 2008, + * Volume 1, Issue 4, pp. 605-623. */ -#define HUGE_MODE 1 -#include "lcpit_matchfinder_templates.h" -#undef HUGE_MODE +static void +build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n) +{ + u32 * const SA_and_LCP = intervals; + u32 next_interval_idx; + u32 open_intervals[LCP_MAX + 1]; + u32 *top = open_intervals; + u32 prev_pos = SA_and_LCP[0] & POS_MASK; + + *top = 0; + intervals[0] = 0; + next_interval_idx = 1; + + for (u32 r = 1; r < n; r++) { + const u32 next_pos = SA_and_LCP[r] & POS_MASK; + const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT; + const u32 top_lcp = *top >> LCP_SHIFT; + + if (next_lcp == top_lcp) { + /* Continuing the deepest open interval */ + pos_data[prev_pos] = *top; + } else if (next_lcp > top_lcp) { + /* Opening a new interval */ + *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++; + pos_data[prev_pos] = *top; + } else { + /* Closing the deepest open interval */ + pos_data[prev_pos] = *top; + for (;;) { + const u32 closed_interval_idx = *top-- & POS_MASK; + const u32 superinterval_lcp = *top >> LCP_SHIFT; + + if (next_lcp == superinterval_lcp) { + /* Continuing the superinterval */ + intervals[closed_interval_idx] = *top; + break; + } else if (next_lcp > superinterval_lcp) { + /* Creating a new interval that is a + * superinterval of the one being + * closed, but still a subinterval of + * its superinterval */ + *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++; + intervals[closed_interval_idx] = *top; + break; + } else { + /* Also closing the superinterval */ + intervals[closed_interval_idx] = *top; + } + } + } + prev_pos = next_pos; + } -#define HUGE_MODE 0 -#include "lcpit_matchfinder_templates.h" -#undef HUGE_MODE + /* Close any still-open intervals. 
*/ + pos_data[prev_pos] = *top; + for (; top > open_intervals; top--) + intervals[*top & POS_MASK] = *(top - 1); +} + +/* + * Advance the LCP-interval tree matchfinder by one byte. + * + * If @record_matches is true, then matches are written to the @matches array + * sorted by strictly decreasing length and strictly decreasing offset, and the + * return value is the number of matches found. Otherwise, @matches is ignored + * and the return value is always 0. + * + * How this algorithm works: + * + * 'cur_pos' is the position of the current suffix, which is the suffix being + * matched against. 'cur_pos' starts at 0 and is incremented each time this + * function is called. This function finds each suffix with position less than + * 'cur_pos' that shares a prefix with the current position, but for each + * distinct prefix length it finds only the suffix with the greatest position + * (i.e. the most recently seen in the linear traversal by position). This + * function accomplishes this using the lcp-interval tree data structure that + * was built by build_LCPIT() and is updated dynamically by this function. + * + * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp + * value of the deepest lcp-interval containing the current suffix --- or, + * equivalently, the parent of the conceptual singleton lcp-interval that + * contains the current suffix. + * + * The second step is to ascend the lcp-interval tree until reaching an interval + * that has not yet been visited, and link the intervals to the current suffix + * along the way. An lcp-interval has been visited if and only if it has been + * linked to a suffix. Initially, no lcp-intervals are linked to suffixes. + * + * The third step is to continue ascending the lcp-interval tree, but indirectly + * through suffix links rather than through the original superinterval + * references, and continuing to form links with the current suffix along the + * way. 
Each suffix visited during this step, except in a special case to + * handle outdated suffixes, is a match which can be written to matches[]. Each + * intervals[] entry contains the position of the next suffix to visit, which we + * shall call 'match_pos'; this is the most recently seen suffix that belongs to + * that lcp-interval. 'pos_data[match_pos]' then contains the lcp and interval + * index of the next lcp-interval that should be visited. + * + * We can view these arrays as describing a new set of links that gets overlaid + * on top of the original superinterval references of the lcp-interval tree. + * Each such link can connect a node of the lcp-interval tree to an ancestor + * more than one generation removed. + * + * For each one-byte advance, the current position becomes the most recently + * seen suffix for a continuous sequence of lcp-intervals from a leaf interval + * to the root interval. Conceptually, this algorithm needs to update all these + * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null" + * link. But actually, if some of these nodes have the same most recently seen + * suffix, then this algorithm just visits the pos_data[] entry for that suffix + * and skips over all these nodes in one step. Updating the extra nodes is + * accomplished by creating a redirection from the previous suffix to the + * current suffix. + * + * Using this shortcutting scheme, it's possible for a suffix to become out of + * date, which means that it is no longer the most recently seen suffix for the + * lcp-interval under consideration. This case is detected by noticing when the + * next lcp-interval link actually points deeper in the tree, and it is worked + * around by just continuing until we get to a link that actually takes us + * higher in the tree. This can be described as a lazy-update scheme. 
+ */ +static inline u32 +lcpit_advance_one_byte(const u32 cur_pos, + u32 pos_data[restrict], + u32 intervals[restrict], + struct lz_match matches[restrict], + const bool record_matches) +{ + u32 lcp; + u32 interval_idx; + u32 match_pos; + struct lz_match *matchptr; + + /* Get the deepest lcp-interval containing the current suffix. */ + lcp = pos_data[cur_pos] >> LCP_SHIFT; + interval_idx = pos_data[cur_pos] & POS_MASK; + prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]); + pos_data[cur_pos] = 0; + + /* Ascend until we reach a visited interval, linking the unvisited + * intervals to the current suffix as we go. */ + while (intervals[interval_idx] & LCP_MASK) { + const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT; + const u32 superinterval_idx = intervals[interval_idx] & POS_MASK; + intervals[interval_idx] = cur_pos; + lcp = superinterval_lcp; + interval_idx = superinterval_idx; + } + + match_pos = intervals[interval_idx] & POS_MASK; + if (match_pos == 0) { + /* Ambiguous case; just don't allow matches with position 0. */ + if (interval_idx != 0) + intervals[interval_idx] = cur_pos; + return 0; + } + matchptr = matches; + /* Ascend indirectly via pos_data[] links. */ + for (;;) { + u32 next_lcp; + u32 next_interval_idx; + + for (;;) { + next_lcp = pos_data[match_pos] >> LCP_SHIFT; + next_interval_idx = pos_data[match_pos] & POS_MASK; + if (next_lcp < lcp) + break; + /* Suffix was out of date. */ + match_pos = intervals[next_interval_idx]; + } + intervals[interval_idx] = cur_pos; + pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx; + if (record_matches) { + matchptr->length = lcp; + matchptr->offset = cur_pos - match_pos; + matchptr++; + } + if (next_interval_idx == 0) + break; + match_pos = intervals[next_interval_idx]; + interval_idx = next_interval_idx; + lcp = next_lcp; + } + return matchptr - matches; +} + +/* Expand SA from 32 bits to 64 bits. 
*/ +static void +expand_SA(u32 *p, u32 n) +{ + typedef u32 _may_alias_attribute aliased_u32_t; + typedef u64 _may_alias_attribute aliased_u64_t; + + aliased_u32_t *SA = (aliased_u32_t *)p; + aliased_u64_t *SA64 = (aliased_u64_t *)p; + + u32 r = n - 1; + do { + SA64[r] = SA[r]; + } while (r--); +} + +/* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE. */ +static void +build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict], + const u8 T[restrict], const u32 n, + const u32 min_lcp, const u32 max_lcp) +{ + u32 h = 0; + for (u32 i = 0; i < n; i++) { + const u32 r = ISA[i]; + prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]); + if (r > 0) { + const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK; + const u32 lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + u32 stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT; + if (h > 0) + h--; + } + } +} + +/* + * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE. + * + * This "huge" version is also slightly different in that the lcp value stored + * in each intervals[] entry is the lcp value for that interval, not its + * superinterval. This lcp value stays put in intervals[] and doesn't get moved + * to pos_data[] during lcpit_advance_one_byte_huge(). One consequence of this + * is that we have to use a special flag to distinguish visited from unvisited + * intervals. But overall, this scheme keeps the memory usage at 12n instead of + * 16n. (The non-huge version is 8n.) 
+ */ +static void +build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n) +{ + u64 * const SA_and_LCP64 = intervals64; + u32 next_interval_idx; + u32 open_intervals[HUGE_LCP_MAX + 1]; + u32 *top = open_intervals; + u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK; + + *top = 0; + intervals64[0] = 0; + next_interval_idx = 1; + + for (u32 r = 1; r < n; r++) { + const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK; + const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT; + const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT; + + if (next_lcp == top_lcp) { + /* Continuing the deepest open interval */ + pos_data[prev_pos] = *top; + } else if (next_lcp > top_lcp) { + /* Opening a new interval */ + intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT; + pos_data[prev_pos] = next_interval_idx; + *++top = next_interval_idx++; + } else { + /* Closing the deepest open interval */ + pos_data[prev_pos] = *top; + for (;;) { + const u32 closed_interval_idx = *top--; + const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT; + + if (next_lcp == superinterval_lcp) { + /* Continuing the superinterval */ + intervals64[closed_interval_idx] |= + HUGE_UNVISITED_TAG | *top; + break; + } else if (next_lcp > superinterval_lcp) { + /* Creating a new interval that is a + * superinterval of the one being + * closed, but still a subinterval of + * its superinterval */ + intervals64[next_interval_idx] = + (u64)next_lcp << HUGE_LCP_SHIFT; + intervals64[closed_interval_idx] |= + HUGE_UNVISITED_TAG | next_interval_idx; + *++top = next_interval_idx++; + break; + } else { + /* Also closing the superinterval */ + intervals64[closed_interval_idx] |= + HUGE_UNVISITED_TAG | *top; + } + } + } + prev_pos = next_pos; + } + + /* Close any still-open intervals. 
*/ + pos_data[prev_pos] = *top; + for (; top > open_intervals; top--) + intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1); +} + +/* Like lcpit_advance_one_byte(), but for buffers larger than + * MAX_NORMAL_BUFSIZE. */ +static inline u32 +lcpit_advance_one_byte_huge(const u32 cur_pos, + u32 pos_data[restrict], + u64 intervals64[restrict], + struct lz_match matches[restrict], + const bool record_matches) +{ + u32 interval_idx; + u32 lcp; + u32 match_pos; + struct lz_match *matchptr; + + interval_idx = pos_data[cur_pos]; + prefetch(&intervals64[pos_data[cur_pos + 1]]); + pos_data[cur_pos] = 0; + + while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) { + lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT; + if (lcp == 0) + return 0; + const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK; + intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos; + interval_idx = superinterval_idx; + } + + match_pos = intervals64[interval_idx] & HUGE_POS_MASK; + lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT; + matchptr = matches; + while (lcp != 0) { + u32 next_interval_idx; + u32 next_lcp; + + for (;;) { + next_interval_idx = pos_data[match_pos]; + next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT; + if (next_lcp < lcp) + break; + match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK; + } + intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos; + pos_data[match_pos] = interval_idx; + if (record_matches) { + matchptr->length = lcp; + matchptr->offset = cur_pos - match_pos; + matchptr++; + } + match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK; + interval_idx = next_interval_idx; + lcp = next_lcp; + } + return matchptr - matches; +} + +static inline u64 +get_pos_data_size(size_t max_bufsize) +{ + return (u64)max((u64)max_bufsize + PREFETCH_SAFETY, + DIVSUFSORT_TMP_LEN) * sizeof(u32); +} + +static inline u64 +get_intervals_size(size_t max_bufsize) +{ + return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ? 
+ sizeof(u32) : sizeof(u64)); +} /* * Calculate the number of bytes of memory needed for the LCP-interval tree @@ -58,19 +531,7 @@ u64 lcpit_matchfinder_get_needed_memory(size_t max_bufsize) { - u64 size = 0; - - /* pos_data (+1 is for prefetch) */ - size += ((u64)max_bufsize + 1) * sizeof(u32); - - /* intervals or intervals64 */ - size += max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) * - (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64)); - - /* SA */ - size += (u64)max_bufsize * sizeof(u32); - - return size; + return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize); } /* @@ -89,20 +550,22 @@ lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize, { if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX) return false; + if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY) + return false; - mf->pos_data = MALLOC((max_bufsize + 1) * sizeof(u32)); - mf->intervals = MALLOC(max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) * - (max_bufsize <= MAX_NORMAL_BUFSIZE ? - sizeof(u32) : sizeof(u64))); - mf->SA = MALLOC(max_bufsize * sizeof(u32)); - - if (!mf->pos_data || !mf->intervals || !mf->SA) { + mf->pos_data = MALLOC(get_pos_data_size(max_bufsize)); + mf->intervals = MALLOC(get_intervals_size(max_bufsize)); + if (!mf->pos_data || !mf->intervals) { lcpit_matchfinder_destroy(mf); return false; } mf->min_match_len = min_match_len; - mf->nice_match_len = min(nice_match_len, LCP_MAX); + mf->nice_match_len = min(nice_match_len, + (max_bufsize <= MAX_NORMAL_BUFSIZE) ? + LCP_MAX : HUGE_LCP_MAX); + for (u32 i = 0; i < PREFETCH_SAFETY; i++) + mf->pos_data[max_bufsize + i] = 0; return true; } @@ -159,42 +622,31 @@ build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n) * * @mf - the initialized matchfinder structure * @T - the input buffer - * @n - size of the input buffer in bytes. This may be at most the max_bufsize - * with which lcpit_matchfinder_init() was called. + * @n - size of the input buffer in bytes. 
This must be nonzero and can be at + * most the max_bufsize with which lcpit_matchfinder_init() was called. */ void lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n) { - if (n == 0) - return; + /* intervals[] temporarily stores SA and LCP packed together. + * pos_data[] temporarily stores ISA. + * pos_data[] is also used as the temporary space for divsufsort(). */ - build_SA(mf->SA, T, n, mf->intervals); - build_ISA(mf->pos_data, mf->SA, n); + build_SA(mf->intervals, T, n, mf->pos_data); + build_ISA(mf->pos_data, mf->intervals, n); if (n <= MAX_NORMAL_BUFSIZE) { - /* "Normal" sized buffer */ - - /* Build LCP, packing it into ->SA */ - build_LCP_normal(mf->SA, mf->pos_data, T, n, - mf->min_match_len, mf->nice_match_len); - /* Prepare ->intervals and ->pos_data */ - build_LCPIT_normal(mf->SA, mf->intervals, mf->pos_data, n); + build_LCP(mf->intervals, mf->pos_data, T, n, + mf->min_match_len, mf->nice_match_len); + build_LCPIT(mf->intervals, mf->pos_data, n); mf->huge_mode = false; } else { - /* "Huge" sized buffer */ - - /* Build LCP in the second half of ->intervals64. It may be - * partially overwritten in build_LCPIT_huge(), but this is okay - * since each LCP entry is guaranteed to be consumed before it - * can possibly be overwritten. */ - build_LCP_huge(mf->intervals + n, mf->SA, mf->pos_data, T, n, + expand_SA(mf->intervals, n); + build_LCP_huge(mf->intervals64, mf->pos_data, T, n, mf->min_match_len, mf->nice_match_len); - /* Prepare ->intervals64 and ->pos_data */ - build_LCPIT_huge(mf->SA, mf->intervals + n, mf->intervals64, - mf->pos_data, n); + build_LCPIT_huge(mf->intervals64, mf->pos_data, n); mf->huge_mode = true; } mf->cur_pos = 0; /* starting at beginning of input buffer */ - mf->pos_data[n] = 0; /* safety entry for prefetch() overrun */ } /* @@ -205,37 +657,35 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n) * * The return value is the number of matches found and written to @matches. 
* This can be any value in [0, nice_match_len - min_match_len + 1]. - * - * If the caller attempts to advance beyond the end of the input buffer, the - * behavior is undefined. */ u32 lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, struct lz_match *matches) { if (mf->huge_mode) - return lcpit_advance_one_byte_huge(mf, matches, true); + return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data, + mf->intervals64, matches, true); else - return lcpit_advance_one_byte_normal(mf, matches, true); + return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data, + mf->intervals, matches, true); } /* * Skip the next @count bytes (don't search for matches at them). @count is * assumed to be > 0. - * - * If the caller attempts to advance beyond the end of the input buffer, the - * behavior is undefined. */ void lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count) { if (mf->huge_mode) { do { - lcpit_advance_one_byte_huge(mf, NULL, false); + lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data, + mf->intervals64, NULL, false); } while (--count); } else { do { - lcpit_advance_one_byte_normal(mf, NULL, false); + lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data, + mf->intervals, NULL, false); } while (--count); } } @@ -243,14 +693,10 @@ lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count) /* * Destroy an LCP-interval tree matchfinder that was previously initialized with * lcpit_matchfinder_init(). - * - * If the struct has been zeroed out, this has no effect. */ void lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf) { FREE(mf->pos_data); FREE(mf->intervals); - FREE(mf->SA); - memset(mf, 0, sizeof(*mf)); } diff --git a/src/lcpit_matchfinder_templates.h b/src/lcpit_matchfinder_templates.h deleted file mode 100644 index ac9c8883..00000000 --- a/src/lcpit_matchfinder_templates.h +++ /dev/null @@ -1,343 +0,0 @@ -/* - * lcpit_matchfinder_templates.h - * - * This file is included by lcpit_matchfinder.c. 
- * - * Author: Eric Biggers - * Year: 2014, 2015 - * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. - */ - -/* - * In normal mode, we can pack a buffer position and a LCP value into a 32-bit - * number. In huge mode, we can't. - */ -#if HUGE_MODE -# define GET_SA_ENTRY(r) (SA[r]) -# define GET_LCP_ENTRY(r) (LCP[r]) -# define SET_LCP_ENTRY(r, val) (LCP[r] = (val)) -# define UNVISITED_TAG HUGE_UNVISITED_TAG -#else -# define GET_SA_ENTRY(r) (SA_and_LCP[r] & SA_and_LCP_POS_MASK) -# define GET_LCP_ENTRY(r) (SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT) -# define SET_LCP_ENTRY(r, val) (SA_and_LCP[r] |= (val) << SA_and_LCP_LCP_SHIFT) -# define UNVISITED_TAG NORMAL_UNVISITED_TAG -#endif - -/* - * Build the LCP (Longest Common Prefix) array in linear time. - * - * LCP[r] will be the length of the longest common prefix between the suffixes - * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. - * - * Algorithm taken from Kasai et al. (2001), but modified slightly: - * - * - For decreased memory usage and improved memory locality, pack the two - * logically distinct SA and LCP arrays into a single array SA_and_LCP. - * - * - With bytes there is no realistic way to reserve a unique symbol for - * end-of-buffer, so use explicit checks for end-of-buffer. - * - * - If a LCP value is less than the minimum match length, then store 0. This - * avoids having to do comparisons against the minimum match length later. - * - * - If a LCP value is greater than the "nice match length", then store the - * "nice match length". This caps the number of bits needed to store each - * LCP value, and this caps the depth of the LCP-interval tree, without - * usually hurting the compression ratio too much. - * - * References: - * - * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in - * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 
181-192. - */ -#if HUGE_MODE -static void -build_LCP_huge(u32 LCP[restrict], const u32 SA[restrict], const u32 ISA[restrict], - const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp) -#else -static void -build_LCP_normal(u32 SA_and_LCP[restrict], const u32 ISA[restrict], - const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp) -#endif -{ - u32 h = 0; - for (u32 i = 0; i < n; i++) { - u32 r = ISA[i]; - if (r > 0) { - u32 j = GET_SA_ENTRY(r - 1); - u32 lim = min(n - i, n - j); - while (h < lim && T[i + h] == T[j + h]) - h++; - u32 stored_lcp = h; - if (stored_lcp < min_lcp) - stored_lcp = 0; - else if (stored_lcp > max_lcp) - stored_lcp = max_lcp; - SET_LCP_ENTRY(r, stored_lcp); - if (h > 0) - h--; - } - } -} - -/* - * Use the suffix array accompanied with the longest-common-prefix array --- in - * other words, the "enhanced suffix array" --- to simulate a bottom-up - * traversal of the corresponding suffix tree, or equivalently the "lcp-interval - * tree", as described in Abouelhoda et al. (2004). - * - * While doing the traversal, create a table 'intervals' that contains data for - * each lcp-interval, specifically the lcp value of that interval, and the index - * of the superinterval. - * - * Also while doing the traversal, create a table 'pos_data' that contains a - * mapping from suffix index to the deepest lcp-interval containing it. - * - * The result is that we will later be able to do match-finding at a given - * position by looking up that position in 'pos_data' to get the deepest - * lcp-interval containing the corresponding suffix, then proceeding to the - * superintervals. See lcpit_advance_one_byte() for more details. - * - * Note: We limit the depth of the lcp-interval tree by capping the lcp at - * LCP_MAX. This can cause a sub-tree of intervals with lcp greater than - * LCP_MAX to be collapsed into a single interval with lcp LCP_MAX. 
This avoids - * degenerate cases and does not hurt match-finding very much, since if we find - * a match of length LCP_MAX and extend it as far as possible, that's usually - * good enough because that region of the input must already be highly - * compressible. - * - * References: - * - * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees - * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2 - * Issue 1, March 2004, pp. 53-86. - * - * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization - * Using Less Time & Space. Mathematics in Computer Science June 2008, - * Volume 1, Issue 4, pp. 605-623. - * - * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix - * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. - */ -#if HUGE_MODE -static void -build_LCPIT_huge(const u32 SA[restrict], u32 LCP[], u64 intervals[], - u32 pos_data[restrict], u32 n) -#else -static void -build_LCPIT_normal(const u32 SA_and_LCP[restrict], u32 intervals[restrict], - u32 pos_data[restrict], u32 n) -#endif -{ - u32 next_interval_idx = 0; - u32 open_intervals[LCP_MAX + 1]; - u32 *top = open_intervals; - u32 prev_pos = GET_SA_ENTRY(0); - - /* The interval with lcp=0 covers the entire array. It remains open - * until the end. 
*/ - *top = next_interval_idx; - intervals[next_interval_idx] = 0; - next_interval_idx++; - - for (u32 r = 1; r < n; r++) { - u32 next_pos = GET_SA_ENTRY(r); - u32 next_lcp = GET_LCP_ENTRY(r); - u32 top_lcp = intervals[*top]; - - if (next_lcp == top_lcp) { - /* continuing the deepest open interval */ - pos_data[prev_pos] = *top; - } else if (next_lcp > top_lcp) { - /* opening a new interval */ - intervals[next_interval_idx] = next_lcp; - *++top = next_interval_idx; - pos_data[prev_pos] = next_interval_idx; - next_interval_idx++; - } else { - /* closing the deepest open interval */ - pos_data[prev_pos] = *top; - for (;;) { - u32 closed_interval_idx = *top; - u32 superinterval_idx = *--top; - u32 superinterval_lcp = intervals[superinterval_idx]; - - if (next_lcp == superinterval_lcp) { - /* continuing the superinterval */ - intervals[closed_interval_idx] |= - (superinterval_idx << LCP_BITS) | - UNVISITED_TAG; - break; - } else if (next_lcp > superinterval_lcp) { - /* creating a new interval that is a - * superinterval of the one being - * closed, but still a subinterval of - * its superinterval */ - intervals[next_interval_idx] = next_lcp; - *++top = next_interval_idx; - intervals[closed_interval_idx] |= - (next_interval_idx << LCP_BITS) | - UNVISITED_TAG; - next_interval_idx++; - break; - } else { - /* also closing the superinterval */ - intervals[closed_interval_idx] |= - (superinterval_idx << LCP_BITS) | - UNVISITED_TAG; - } - } - } - prev_pos = next_pos; - } - - /* close any still-open intervals */ - pos_data[prev_pos] = *top; - while (top > open_intervals) { - u32 closed_interval_idx = *top; - u32 superinterval_idx = *--top; - intervals[closed_interval_idx] |= - (superinterval_idx << LCP_BITS) | UNVISITED_TAG; - } -} - -/* - * Advance the LCP-interval tree matchfinder by one byte. - * - * If @record_matches is true, then matches are recorded in the @matches array, - * and the return value is the number of matches found. 
Otherwise, @matches is - * ignored and the return value is always 0. - */ -static inline u32 -#if HUGE_MODE -lcpit_advance_one_byte_huge -#else -lcpit_advance_one_byte_normal -#endif -(struct lcpit_matchfinder *mf, struct lz_match * restrict matches, - bool record_matches) -{ - const u32 cur_pos = mf->cur_pos++; - u32 * const pos_data = mf->pos_data; -#if HUGE_MODE - u64 * const intervals = mf->intervals64; -#else - u32 * const intervals = mf->intervals; -#endif - u32 num_matches = 0; - u32 lcp, next_lcp; - u32 interval, next_interval; - u32 cur_match, next_match; - - /* Look up the deepest lcp-interval containing the current suffix. */ - interval = pos_data[cur_pos]; - - /* Prefetch the deepest lcp-interval containing the next suffix. */ - prefetch(&intervals[pos_data[cur_pos + 1]]); - - /* Since the current position is greater than any position previously - * searched, set the "lcp interval of the next match" for this suffix to - * 0. This is the index of the root interval, and this indicates that - * there is no next match. */ - pos_data[cur_pos] = 0; - - /* Ascend the lcp-interval tree until we reach an lcp-interval that has - * already been visited. */ - - while (intervals[interval] & UNVISITED_TAG) { - - /* Visiting this lcp-interval for the first time. Therefore, - * there are no matches with length equal to the lcp of this - * lcp-interval. */ - - /* Extract the LCP and superinterval reference. */ - - lcp = intervals[interval] & LCP_MASK; - - /* If the LCP is shorter than the minimum length of matches to - * be produced, we're done, since the LCP will only ever get - * shorter from here. This also prevents ascending above the - * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval. */ - if (lcp == 0) - return 0; - - next_interval = (intervals[interval] & ~UNVISITED_TAG) >> LCP_BITS; - - /* Set the position of the most-recently-seen suffix within this - * lcp-interval. 
Since this is the first visitation of this - * lcp-interval, this is simply the current suffix. - * - * Note that this overwrites the superinterval reference which - * was previously included in this lcp-interval data slot. - * Further visitations of this lcp-interval will detect that it - * is already visited and will follow the chain of - * most-recently-seen suffixes rather than ascend the tree - * directly. */ - intervals[interval] = (cur_pos << LCP_BITS) | lcp; - - /* Ascend to the superinterval of this lcp-interval. */ - interval = next_interval; - } - - /* We've already visited the current lcp-interval. */ - - /* Extract the LCP of this lcp-interval. */ - lcp = intervals[interval] & LCP_MASK; - - /* Extract the current match for this lcp-interval. This usually is the - * most-recently-seen suffix within this lcp-interval, but it may be - * outdated. */ - cur_match = intervals[interval] >> LCP_BITS; - - for (;;) { - /* If the LCP is shorter than the minimum length of matches to - * be produced, we're done, since the LCP will only ever get - * shorter from here. This also prevents ascending above the - * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval. */ - if (lcp == 0) - break; - - /* Advance the current match until the lcp of the *next* match - * is lower than the current lcp. When this is true we know - * that the current match is up to date (lowest offset / - * greatest position for that lcp). */ - - next_match = cur_match; - do { - next_interval = pos_data[next_match]; - next_lcp = intervals[next_interval] & LCP_MASK; - cur_match = next_match; - next_match = intervals[next_interval] >> LCP_BITS; - } while (next_lcp >= lcp); - - /* Link the current position into the match chain, discarding - * any skipped matches. */ - intervals[interval] = (cur_pos << LCP_BITS) | lcp; - pos_data[cur_match] = interval; - - if (record_matches) { - /* Record the match. 
*/ - matches[num_matches].length = lcp; - matches[num_matches].offset = cur_pos - cur_match; - num_matches++; - } - - /* Advance to the next match. */ - interval = next_interval; - lcp = next_lcp; - cur_match = next_match; - } - return num_matches; -} - -#undef GET_SA_ENTRY -#undef GET_LCP_ENTRY -#undef SET_LCP_ENTRY -#undef UNVISITED_TAG -- 2.43.0