X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz_lcp_interval_tree.c;h=5240a7f326d47c32950a867dc88d23570e4ed315;hb=1afd794e3feb32e2984cc709213ad75232736003;hp=68f766ab6897d5d53d41a8577cc7c786b5885ebf;hpb=376790d04c3985dc771c6dbdc49d88e44570beb4;p=wimlib diff --git a/src/lz_lcp_interval_tree.c b/src/lz_lcp_interval_tree.c index 68f766ab..5240a7f3 100644 --- a/src/lz_lcp_interval_tree.c +++ b/src/lz_lcp_interval_tree.c @@ -53,19 +53,13 @@ #define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS) #define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS) +#define SA_and_LCP_LCP_SHIFT (32 - LZ_LCPIT_LCP_BITS) +#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1) + struct lz_lcpit { struct lz_mf base; - /* Each of the arrays has length equal to the window size. This - * therefore results in an additional memory usage of 12 bytes per - * position. (That's compared to about 8 for binary trees or 4 for hash - * chains, for example.) - * - * We allocate these arrays in one contiguous block. 'SA' is first, - * 'intervals' is second, and 'pos_data' is third. */ - - /* Pointer to the suffix array */ - u32 *SA; + u32 *mem; /* Mapping: lcp-interval index => lcp-interval data * @@ -82,6 +76,61 @@ struct lz_lcpit { u32 *pos_data; }; +/* + * Build the LCP (Longest Common Prefix) array in linear time. + * + * LCP[r] will be the length of the longest common prefix between the suffixes + * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. + * + * Algorithm taken from Kasai et al. (2001), but modified slightly: + * + * - For decreased memory usage and improved memory locality, pack the two + * logically distinct SA and LCP arrays into a single array SA_and_LCP. + * + * - With bytes there is no realistic way to reserve a unique symbol for + * end-of-buffer, so use explicit checks for end-of-buffer. + * + * - If a LCP value is less than the minimum match length, then store 0. This + * avoids having to do comparisons against the minimum match length later. + * + * - If a LCP value is greater than the "nice match length", then store the + * "nice match length". This caps the number of bits needed to store each + * LCP value, and this caps the depth of the LCP-interval tree, without + * usually hurting the compression ratio too much. + * + * References: + * + * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in + * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. + */ +static void +build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA, + const u8 * const restrict T, const u32 n, + const u32 min_lcp, const u32 max_lcp) +{ + u32 h, i, r, j, lim, stored_lcp; + + h = 0; + for (i = 0; i < n; i++) { + r = ISA[i]; + if (r > 0) { + j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK; + lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SA_and_LCP[r] |= stored_lcp << SA_and_LCP_LCP_SHIFT; + if (h > 0) + h--; + } + } +} + /* * Use the suffix array accompanied with the longest-common-prefix array --- in * other words, the "enhanced suffix array" --- to simulate a bottom-up @@ -123,14 +172,14 @@ struct lz_lcpit { * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. */ static void -build_LCPIT(const u32 SA[restrict], u32 LCP[restrict], - u32 pos_data[restrict], const u32 lcp_limit, const u32 n) +build_LCPIT(const u32 * const restrict SA_and_LCP, + u32 * const restrict intervals, u32 * const restrict pos_data, + const u32 n) { - u32 *intervals = LCP; u32 next_interval_idx = 0; u32 open_intervals[LZ_LCPIT_LCP_MAX + 1]; u32 *top = open_intervals; - u32 prev_pos = SA[0]; + u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK; /* The interval with lcp=0 covers the entire array. It remains open * until the end. */ @@ -139,8 +188,8 @@ build_LCPIT(const u32 SA[restrict], u32 LCP[restrict], next_interval_idx++; for (u32 r = 1; r < n; r++) { - u32 next_pos = SA[r]; - u32 next_lcp = min(LCP[r], lcp_limit); + u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK; + u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT; u32 top_lcp = intervals[*top]; if (next_lcp == top_lcp) { @@ -247,34 +296,29 @@ lz_lcpit_init(struct lz_mf *_mf) lz_lcpit_set_default_params(&mf->base.params); - mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size)); - if (!mf->SA) - return false; - - return true; + mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size)); + return (mf->mem != NULL); } static void lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n) { struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - u32 *mem = mf->SA; - - build_SA(&mem[0 * n], T, n, &mem[1 * n]); - build_ISA(&mem[2 * n], &mem[0 * n], n); - build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n); - build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n], - mf->base.params.nice_match_len, n); - mf->SA = &mem[0 * n]; - mf->intervals = &mem[1 * n]; - mf->pos_data = &mem[2 * n]; + + build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]); + build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n); + build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n, + mf->base.params.min_match_len, + mf->base.params.nice_match_len); + build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n); + mf->intervals = &mf->mem[1 * n]; + mf->pos_data = &mf->mem[2 * n]; } static u32 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) { struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - const u32 min_match_len = mf->base.params.min_match_len; const u32 cur_pos = mf->base.cur_window_pos; u32 * const pos_data = mf->pos_data; u32 * const intervals = mf->intervals; @@ -312,9 +356,8 @@ lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) * be produced, we're done, since the LCP will only ever get * shorter from here. This also prevents ascending above the * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval, and min_match_len is guaranteed to be at - * least 2. */ - if (lcp < min_match_len) + * to be a 0-interval. */ + if (lcp == 0) goto out; /* Set the position of the most-recently-seen suffix within this @@ -348,9 +391,8 @@ lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) * be produced, we're done, since the LCP will only ever get * shorter from here. This also prevents ascending above the * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval, and min_match_len is guaranteed to be at - * least 2. */ - if (lcp < min_match_len) + * to be a 0-interval. */ + if (lcp == 0) break; /* Advance the current match until the lcp of the *next* match @@ -416,7 +458,6 @@ out: static void lz_lcpit_skip_position(struct lz_lcpit *mf) { - const u32 min_match_len = mf->base.params.min_match_len; const u32 cur_pos = mf->base.cur_window_pos++; u32 * const pos_data = mf->pos_data; u32 * const intervals = mf->intervals; @@ -430,14 +471,14 @@ lz_lcpit_skip_position(struct lz_lcpit *mf) lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; next_interval = (intervals[interval] & ~0x80000000) >> LZ_LCPIT_LCP_BITS; - if (lcp < min_match_len) + if (lcp == 0) return; intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp; interval = next_interval; } lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS; - while (lcp >= min_match_len) { + while (lcp != 0) { next_match = cur_match; do { next_interval = pos_data[next_match]; @@ -468,7 +509,7 @@ lz_lcpit_destroy(struct lz_mf *_mf) { struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - FREE(mf->SA); + FREE(mf->mem); } const struct lz_mf_ops lz_lcp_interval_tree_ops = {