X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz_lcp_interval_tree.c;h=5240a7f326d47c32950a867dc88d23570e4ed315;hb=958cc5438f3ade80baafcfbf15da370e819a6b90;hp=7c0135a2701995fc61a616c491c1bca29560659d;hpb=91b0f909393476f39ed5865dfe4b27c244ba3c48;p=wimlib diff --git a/src/lz_lcp_interval_tree.c b/src/lz_lcp_interval_tree.c index 7c0135a2..5240a7f3 100644 --- a/src/lz_lcp_interval_tree.c +++ b/src/lz_lcp_interval_tree.c @@ -53,19 +53,13 @@ #define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS) #define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS) +#define SA_and_LCP_LCP_SHIFT (32 - LZ_LCPIT_LCP_BITS) +#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1) + struct lz_lcpit { struct lz_mf base; - /* Each of the arrays has length equal to the window size. This - * therefore results in an additional memory usage of 12 bytes per - * position. (That's compared to about 8 for binary trees or 4 for hash - * chains, for example.) - * - * We allocate these arrays in one contiguous block. 'SA' is first, - * 'intervals' is second, and 'pos_data' is third. */ - - /* Pointer to the suffix array */ - u32 *SA; + u32 *mem; /* Mapping: lcp-interval index => lcp-interval data * @@ -82,6 +76,61 @@ struct lz_lcpit { u32 *pos_data; }; +/* + * Build the LCP (Longest Common Prefix) array in linear time. + * + * LCP[r] will be the length of the longest common prefix between the suffixes + * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. + * + * Algorithm taken from Kasai et al. (2001), but modified slightly: + * + * - For decreased memory usage and improved memory locality, pack the two + * logically distinct SA and LCP arrays into a single array SA_and_LCP. + * + * - With bytes there is no realistic way to reserve a unique symbol for + * end-of-buffer, so use explicit checks for end-of-buffer. + * + * - If a LCP value is less than the minimum match length, then store 0. This + * avoids having to do comparisons against the minimum match length later. + * + * - If a LCP value is greater than the "nice match length", then store the + * "nice match length". This caps the number of bits needed to store each + * LCP value, and this caps the depth of the LCP-interval tree, without + * usually hurting the compression ratio too much. + * + * References: + * + * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in + * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. + */ +static void +build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA, + const u8 * const restrict T, const u32 n, + const u32 min_lcp, const u32 max_lcp) +{ + u32 h, i, r, j, lim, stored_lcp; + + h = 0; + for (i = 0; i < n; i++) { + r = ISA[i]; + if (r > 0) { + j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK; + lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SA_and_LCP[r] |= stored_lcp << SA_and_LCP_LCP_SHIFT; + if (h > 0) + h--; + } + } +} + /* * Use the suffix array accompanied with the longest-common-prefix array --- in * other words, the "enhanced suffix array" --- to simulate a bottom-up @@ -123,172 +172,79 @@ struct lz_lcpit { * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. */ static void -build_LCPIT(const u32 SA[restrict], u32 LCP[restrict], - u32 pos_data[restrict], const u32 lcp_limit, const u32 n) +build_LCPIT(const u32 * const restrict SA_and_LCP, + u32 * const restrict intervals, u32 * const restrict pos_data, + const u32 n) { - u32 *intervals = LCP; - u32 next_interval; - u32 incomplete_intervals[lcp_limit + 1]; - u32 *cur_interval; - u32 prev_pos; - - /* As we determine lcp-intervals, we assign each one an entry in - * 'intervals', overwriting LCP in the process. Each such entry will - * contain the index in 'intervals' of the superinterval, along with the - * longest common prefix length that the suffixes in that interval - * share. - * - * Note: since we don't need its memory for anything, we don't overwrite - * the suffix array, even though this could potentially be done since - * it's not actually used during match-finding. */ - - /* Process rank 0 as special case. This creates the lcp-interval - * containing every suffix in the window. */ - prev_pos = SA[0]; - intervals[0] = 0; - pos_data[prev_pos] = 0; - cur_interval = incomplete_intervals; - *cur_interval = 0; - next_interval = 1; - - /* Iterate through each suffix array rank. */ - for (u32 r = 1; r < n; r++) { - - /* Get the longest common prefix (lcp) between the suffixes with - * ranks r and r - 1. But cap it to the lcp limit. */ - const u32 lcp = min(LCP[r], lcp_limit); - - /* Convert rank => position using the suffix array. */ - const u32 pos = SA[r]; + u32 next_interval_idx = 0; + u32 open_intervals[LZ_LCPIT_LCP_MAX + 1]; + u32 *top = open_intervals; + u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK; - /* cur_interval points to the index of the deepest (highest lcp - * value) incomplete lcp-interval. */ - - /* - * There are three cases: - * - * (1) The lcp stayed the same as the previous value. Place the - * current suffix in cur_interval. (This placement is - * tentative, because if LCP increases at the next rank, this - * suffix could still be placed in the resulting new LCP - * interval instead.) cur_interval remains unchanged. - * - * (2) The lcp increased from the previous value. This signals - * the beginning of a new lcp-interval. Create it and push it - * onto the stack of incomplete intervals. But since lcp is - * defined in terms of the longest prefix between this suffix - * and the *previous* ranked suffix, the new lcp-interval - * actually should have begun at the *previous* ranked suffix. - * Therefore, we need to set both pos_data[pos] and - * pos_data[prev_pos] to refer to the new interval. - * - * (3) The lcp decreased from the previous value. This signals - * the termination of at least one lcp-interval. Pop the stack, - * finalizing the lcp-intervals, until the current lcp is at - * least as large as the lcp associated with cur_interval. - * Then, if the current lcp is equal to the lcp associated with - * cur_interval, place the current suffix in cur_interval, - * similar to case (1). Else, create a new lcp-interval, - * similar to case (2). - */ - - if (lcp == (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) { - - /* Case (1) */ - - pos_data[pos] = *cur_interval; - - } else if (lcp > (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) { - - /* Case (2) */ - - intervals[next_interval] = lcp | 0x80000000; - pos_data[prev_pos] = next_interval; - pos_data[pos] = next_interval; - *++cur_interval = next_interval++; + /* The interval with lcp=0 covers the entire array. It remains open + * until the end. */ + *top = next_interval_idx; + intervals[next_interval_idx] = 0; + next_interval_idx++; + for (u32 r = 1; r < n; r++) { + u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK; + u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT; + u32 top_lcp = intervals[*top]; + + if (next_lcp == top_lcp) { + /* continuing the deepest open interval */ + pos_data[prev_pos] = *top; + } else if (next_lcp > top_lcp) { + /* opening a new interval */ + intervals[next_interval_idx] = next_lcp; + *++top = next_interval_idx; + pos_data[prev_pos] = next_interval_idx; + next_interval_idx++; } else { - - /* Case (3) */ - - u32 interval; - u32 superinterval; - + /* closing the deepest open interval */ + pos_data[prev_pos] = *top; for (;;) { - /* Examine the deepest incomplete lcp-interval - * and its superinterval. */ - - interval = *cur_interval; - superinterval = *--cur_interval; - - if (lcp >= (intervals[superinterval] & - LZ_LCPIT_LCP_MASK)) + u32 closed_interval_idx = *top; + u32 superinterval_idx = *--top; + u32 superinterval_lcp = intervals[superinterval_idx]; + + if (next_lcp == superinterval_lcp) { + /* continuing the superinterval */ + intervals[closed_interval_idx] |= + (superinterval_idx << LZ_LCPIT_LCP_BITS) | + 0x80000000; break; - - /* The current suffix can't go in either of - * them. Therefore we're visiting 'interval' - * for the last time and finalizing its - * membership in 'superinterval'. */ - - intervals[interval] |= - (superinterval << LZ_LCPIT_LCP_BITS); - } - - /* The current suffix can't go in 'interval', but it can - * go in 'superinterval'. */ - - if (lcp > (intervals[superinterval] & LZ_LCPIT_LCP_MASK)) { - /* Creating a new lcp-interval that is a - * superinterval of 'interval' but a subinterval - * of 'superinterval'. - * - * Example: with the LCP arrayl - * - * 2 2 2 4 4 3 - * - * then we will execute this case when - * processing the LCP value 3. The LCP - * intervals will be: - * - * 2 2 2 4 4 3 - * (lcp=0): | | - * (lcp=2): | | - * (lcp=3): | | - * (lcp=4): | | - * - * Note that the 3-interval (the one being - * created by this code) is a superinterval of - * the 4-interval (which already existed)! But - * we don't need to re-assign pos_data values in - * the 4-interval because they point to the - * deepest interval which contains them, which - * is the 4-interval. */ - - intervals[next_interval] = lcp | 0x80000000; - intervals[interval] |= - (next_interval << LZ_LCPIT_LCP_BITS); - pos_data[pos] = next_interval; - *++cur_interval = next_interval++; - } else { - /* Finishing 'interval', continuing with - * 'superinterval'. */ - - intervals[interval] |= - (superinterval << LZ_LCPIT_LCP_BITS); - pos_data[pos] = superinterval; + } else if (next_lcp > superinterval_lcp) { + /* creating a new interval that is a + * superinterval of the one being + * closed, but still a subinterval of + * its superinterval */ + intervals[next_interval_idx] = next_lcp; + *++top = next_interval_idx; + intervals[closed_interval_idx] |= + (next_interval_idx << LZ_LCPIT_LCP_BITS) | + 0x80000000; + next_interval_idx++; + break; + } else { + /* also closing the superinterval */ + intervals[closed_interval_idx] |= + (superinterval_idx << LZ_LCPIT_LCP_BITS) | + 0x80000000; + } } } - - /* Remember this suffix index when processing the next-ranked - * suffix. */ - prev_pos = pos; + prev_pos = next_pos; } - /* Finalize any still-incomplete lcp-intervals. */ - while (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK) { - intervals[*cur_interval] |= - (*(cur_interval - 1) << LZ_LCPIT_LCP_BITS); - cur_interval--; + /* close any still-open intervals */ + pos_data[prev_pos] = *top; + while (top > open_intervals) { + u32 closed_interval_idx = *top; + u32 superinterval_idx = *--top; + intervals[closed_interval_idx] |= + (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000; } } @@ -299,7 +255,7 @@ lz_lcpit_set_default_params(struct lz_mf_params *params) params->min_match_len = 2; if (params->max_match_len == 0) - params->max_match_len = params->max_window_size; + params->max_match_len = UINT32_MAX; if (params->max_search_depth == 0) params->max_search_depth = 32; @@ -340,34 +296,29 @@ lz_lcpit_init(struct lz_mf *_mf) lz_lcpit_set_default_params(&mf->base.params); - mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size)); - if (!mf->SA) - return false; - - return true; + mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size)); + return (mf->mem != NULL); } static void lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n) { struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - u32 *mem = mf->SA; - - build_SA(&mem[0 * n], T, n, &mem[1 * n]); - build_ISA(&mem[2 * n], &mem[0 * n], n); - build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n); - build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n], - mf->base.params.nice_match_len, n); - mf->SA = &mem[0 * n]; - mf->intervals = &mem[1 * n]; - mf->pos_data = &mem[2 * n]; + + build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]); + build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n); + build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n, + mf->base.params.min_match_len, + mf->base.params.nice_match_len); + build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n); + mf->intervals = &mf->mem[1 * n]; + mf->pos_data = &mf->mem[2 * n]; } static u32 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) { struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - const u32 min_match_len = mf->base.params.min_match_len; const u32 cur_pos = mf->base.cur_window_pos; u32 * const pos_data = mf->pos_data; u32 * const intervals = mf->intervals; @@ -405,9 +356,8 @@ lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) * be produced, we're done, since the LCP will only ever get * shorter from here. This also prevents ascending above the * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval, and min_match_len is guaranteed to be at - * least 2. */ - if (lcp < min_match_len) + * to be a 0-interval. */ + if (lcp == 0) goto out; /* Set the position of the most-recently-seen suffix within this @@ -441,9 +391,8 @@ lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) * be produced, we're done, since the LCP will only ever get * shorter from here. This also prevents ascending above the * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval, and min_match_len is guaranteed to be at - * least 2. */ - if (lcp < min_match_len) + * to be a 0-interval. */ + if (lcp == 0) break; /* Advance the current match until the lcp of the *next* match @@ -509,7 +458,6 @@ out: static void lz_lcpit_skip_position(struct lz_lcpit *mf) { - const u32 min_match_len = mf->base.params.min_match_len; const u32 cur_pos = mf->base.cur_window_pos++; u32 * const pos_data = mf->pos_data; u32 * const intervals = mf->intervals; @@ -523,14 +471,14 @@ lz_lcpit_skip_position(struct lz_lcpit *mf) lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; next_interval = (intervals[interval] & ~0x80000000) >> LZ_LCPIT_LCP_BITS; - if (lcp < min_match_len) + if (lcp == 0) return; intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp; interval = next_interval; } lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS; - while (lcp >= min_match_len) { + while (lcp != 0) { next_match = cur_match; do { next_interval = pos_data[next_match]; @@ -561,7 +509,7 @@ lz_lcpit_destroy(struct lz_mf *_mf) { struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - FREE(mf->SA); + FREE(mf->mem); } const struct lz_mf_ops lz_lcp_interval_tree_ops = {