#define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS)
#define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS)
+#define SA_and_LCP_LCP_SHIFT (32 - LZ_LCPIT_LCP_BITS)
+#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
+
struct lz_lcpit {
struct lz_mf base;
- /* Each of the arrays has length equal to the window size. This
- * therefore results in an additional memory usage of 12 bytes per
- * position. (That's compared to about 8 for binary trees or 4 for hash
- * chains, for example.)
- *
- * We allocate these arrays in one contiguous block. 'SA' is first,
- * 'intervals' is second, and 'pos_data' is third. */
-
- /* Pointer to the suffix array */
- u32 *SA;
+ u32 *mem;
/* Mapping: lcp-interval index => lcp-interval data
*
u32 *pos_data;
};
+/*
+ * Build the LCP (Longest Common Prefix) array in linear time.
+ *
+ * LCP[r] will be the length of the longest common prefix between the suffixes
+ * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
+ *
+ * Algorithm taken from Kasai et al. (2001), but modified slightly:
+ *
+ * - For decreased memory usage and improved memory locality, pack the two
+ * logically distinct SA and LCP arrays into a single array SA_and_LCP.
+ *
+ * - With bytes there is no realistic way to reserve a unique symbol for
+ * end-of-buffer, so use explicit checks for end-of-buffer.
+ *
+ * - If a LCP value is less than the minimum match length, then store 0. This
+ * avoids having to do comparisons against the minimum match length later.
+ *
+ * - If a LCP value is greater than the "nice match length", then store the
+ * "nice match length". This caps the number of bits needed to store each
+ * LCP value, and this caps the depth of the LCP-interval tree, without
+ * usually hurting the compression ratio too much.
+ *
+ * References:
+ *
+ * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in
+ * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th
+ * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
+ */
+static void
+build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA,
+ const u8 * const restrict T, const u32 n,
+ const u32 min_lcp, const u32 max_lcp)
+{
+ u32 h, i, r, j, lim, stored_lcp;
+
+ h = 0;
+ for (i = 0; i < n; i++) {
+ r = ISA[i];
+ if (r > 0) {
+ j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK;
+ lim = min(n - i, n - j);
+ while (h < lim && T[i + h] == T[j + h])
+ h++;
+ stored_lcp = h;
+ if (stored_lcp < min_lcp)
+ stored_lcp = 0;
+ else if (stored_lcp > max_lcp)
+ stored_lcp = max_lcp;
+ SA_and_LCP[r] |= stored_lcp << SA_and_LCP_LCP_SHIFT;
+ if (h > 0)
+ h--;
+ }
+ }
+}
+
/*
* Use the suffix array accompanied with the longest-common-prefix array --- in
* other words, the "enhanced suffix array" --- to simulate a bottom-up
* Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
*/
static void
-build_LCPIT(const u32 SA[restrict], u32 LCP[restrict],
- u32 pos_data[restrict], const u32 lcp_limit, const u32 n)
+build_LCPIT(const u32 * const restrict SA_and_LCP,
+ u32 * const restrict intervals, u32 * const restrict pos_data,
+ const u32 n)
{
- u32 *intervals = LCP;
- u32 next_interval;
- u32 incomplete_intervals[lcp_limit + 1];
- u32 *cur_interval;
- u32 prev_pos;
-
- /* As we determine lcp-intervals, we assign each one an entry in
- * 'intervals', overwriting LCP in the process. Each such entry will
- * contain the index in 'intervals' of the superinterval, along with the
- * longest common prefix length that the suffixes in that interval
- * share.
- *
- * Note: since we don't need its memory for anything, we don't overwrite
- * the suffix array, even though this could potentially be done since
- * it's not actually used during match-finding. */
-
- /* Process rank 0 as special case. This creates the lcp-interval
- * containing every suffix in the window. */
- prev_pos = SA[0];
- intervals[0] = 0;
- pos_data[prev_pos] = 0;
- cur_interval = incomplete_intervals;
- *cur_interval = 0;
- next_interval = 1;
-
- /* Iterate through each suffix array rank. */
- for (u32 r = 1; r < n; r++) {
-
- /* Get the longest common prefix (lcp) between the suffixes with
- * ranks r and r - 1. But cap it to the lcp limit. */
- const u32 lcp = min(LCP[r], lcp_limit);
-
- /* Convert rank => position using the suffix array. */
- const u32 pos = SA[r];
+ u32 next_interval_idx = 0;
+ u32 open_intervals[LZ_LCPIT_LCP_MAX + 1];
+ u32 *top = open_intervals;
+ u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK;
- /* cur_interval points to the index of the deepest (highest lcp
- * value) incomplete lcp-interval. */
-
- /*
- * There are three cases:
- *
- * (1) The lcp stayed the same as the previous value. Place the
- * current suffix in cur_interval. (This placement is
- * tentative, because if LCP increases at the next rank, this
- * suffix could still be placed in the resulting new LCP
- * interval instead.) cur_interval remains unchanged.
- *
- * (2) The lcp increased from the previous value. This signals
- * the beginning of a new lcp-interval. Create it and push it
- * onto the stack of incomplete intervals. But since lcp is
- * defined in terms of the longest prefix between this suffix
- * and the *previous* ranked suffix, the new lcp-interval
- * actually should have begun at the *previous* ranked suffix.
- * Therefore, we need to set both pos_data[pos] and
- * pos_data[prev_pos] to refer to the new interval.
- *
- * (3) The lcp decreased from the previous value. This signals
- * the termination of at least one lcp-interval. Pop the stack,
- * finalizing the lcp-intervals, until the current lcp is at
- * least as large as the lcp associated with cur_interval.
- * Then, if the current lcp is equal to the lcp associated with
- * cur_interval, place the current suffix in cur_interval,
- * similar to case (1). Else, create a new lcp-interval,
- * similar to case (2).
- */
-
- if (lcp == (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
-
- /* Case (1) */
-
- pos_data[pos] = *cur_interval;
-
- } else if (lcp > (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
-
- /* Case (2) */
-
- intervals[next_interval] = lcp | 0x80000000;
- pos_data[prev_pos] = next_interval;
- pos_data[pos] = next_interval;
- *++cur_interval = next_interval++;
+ /* The interval with lcp=0 covers the entire array. It remains open
+ * until the end. */
+ *top = next_interval_idx;
+ intervals[next_interval_idx] = 0;
+ next_interval_idx++;
+ for (u32 r = 1; r < n; r++) {
+ u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK;
+ u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT;
+ u32 top_lcp = intervals[*top];
+
+ if (next_lcp == top_lcp) {
+ /* continuing the deepest open interval */
+ pos_data[prev_pos] = *top;
+ } else if (next_lcp > top_lcp) {
+ /* opening a new interval */
+ intervals[next_interval_idx] = next_lcp;
+ *++top = next_interval_idx;
+ pos_data[prev_pos] = next_interval_idx;
+ next_interval_idx++;
} else {
-
- /* Case (3) */
-
- u32 interval;
- u32 superinterval;
-
+ /* closing the deepest open interval */
+ pos_data[prev_pos] = *top;
for (;;) {
- /* Examine the deepest incomplete lcp-interval
- * and its superinterval. */
-
- interval = *cur_interval;
- superinterval = *--cur_interval;
-
- if (lcp >= (intervals[superinterval] &
- LZ_LCPIT_LCP_MASK))
+ u32 closed_interval_idx = *top;
+ u32 superinterval_idx = *--top;
+ u32 superinterval_lcp = intervals[superinterval_idx];
+
+ if (next_lcp == superinterval_lcp) {
+ /* continuing the superinterval */
+ intervals[closed_interval_idx] |=
+ (superinterval_idx << LZ_LCPIT_LCP_BITS) |
+ 0x80000000;
break;
-
- /* The current suffix can't go in either of
- * them. Therefore we're visiting 'interval'
- * for the last time and finalizing its
- * membership in 'superinterval'. */
-
- intervals[interval] |=
- (superinterval << LZ_LCPIT_LCP_BITS);
- }
-
- /* The current suffix can't go in 'interval', but it can
- * go in 'superinterval'. */
-
- if (lcp > (intervals[superinterval] & LZ_LCPIT_LCP_MASK)) {
- /* Creating a new lcp-interval that is a
- * superinterval of 'interval' but a subinterval
- * of 'superinterval'.
- *
- * Example: with the LCP arrayl
- *
- * 2 2 2 4 4 3
- *
- * then we will execute this case when
- * processing the LCP value 3. The LCP
- * intervals will be:
- *
- * 2 2 2 4 4 3
- * (lcp=0): | |
- * (lcp=2): | |
- * (lcp=3): | |
- * (lcp=4): | |
- *
- * Note that the 3-interval (the one being
- * created by this code) is a superinterval of
- * the 4-interval (which already existed)! But
- * we don't need to re-assign pos_data values in
- * the 4-interval because they point to the
- * deepest interval which contains them, which
- * is the 4-interval. */
-
- intervals[next_interval] = lcp | 0x80000000;
- intervals[interval] |=
- (next_interval << LZ_LCPIT_LCP_BITS);
- pos_data[pos] = next_interval;
- *++cur_interval = next_interval++;
- } else {
- /* Finishing 'interval', continuing with
- * 'superinterval'. */
-
- intervals[interval] |=
- (superinterval << LZ_LCPIT_LCP_BITS);
- pos_data[pos] = superinterval;
+ } else if (next_lcp > superinterval_lcp) {
+ /* creating a new interval that is a
+ * superinterval of the one being
+ * closed, but still a subinterval of
+ * its superinterval */
+ intervals[next_interval_idx] = next_lcp;
+ *++top = next_interval_idx;
+ intervals[closed_interval_idx] |=
+ (next_interval_idx << LZ_LCPIT_LCP_BITS) |
+ 0x80000000;
+ next_interval_idx++;
+ break;
+ } else {
+ /* also closing the superinterval */
+ intervals[closed_interval_idx] |=
+ (superinterval_idx << LZ_LCPIT_LCP_BITS) |
+ 0x80000000;
+ }
}
}
-
- /* Remember this suffix index when processing the next-ranked
- * suffix. */
- prev_pos = pos;
+ prev_pos = next_pos;
}
- /* Finalize any still-incomplete lcp-intervals. */
- while (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK) {
- intervals[*cur_interval] |=
- (*(cur_interval - 1) << LZ_LCPIT_LCP_BITS);
- cur_interval--;
+ /* close any still-open intervals */
+ pos_data[prev_pos] = *top;
+ while (top > open_intervals) {
+ u32 closed_interval_idx = *top;
+ u32 superinterval_idx = *--top;
+ intervals[closed_interval_idx] |=
+ (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000;
}
}
lz_lcpit_set_default_params(&mf->base.params);
- mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
- if (!mf->SA)
- return false;
-
- return true;
+ mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
+ return (mf->mem != NULL);
}
static void
lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
{
struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
- u32 *mem = mf->SA;
-
- build_SA(&mem[0 * n], T, n, &mem[1 * n]);
- build_ISA(&mem[2 * n], &mem[0 * n], n);
- build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n);
- build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n],
- mf->base.params.nice_match_len, n);
- mf->SA = &mem[0 * n];
- mf->intervals = &mem[1 * n];
- mf->pos_data = &mem[2 * n];
+
+ build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]);
+ build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n);
+ build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n,
+ mf->base.params.min_match_len,
+ mf->base.params.nice_match_len);
+ build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n);
+ mf->intervals = &mf->mem[1 * n];
+ mf->pos_data = &mf->mem[2 * n];
}
static u32
lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
{
struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
- const u32 min_match_len = mf->base.params.min_match_len;
const u32 cur_pos = mf->base.cur_window_pos;
u32 * const pos_data = mf->pos_data;
u32 * const intervals = mf->intervals;
* be produced, we're done, since the LCP will only ever get
* shorter from here. This also prevents ascending above the
* root of the lcp-interval tree, since the root is guaranteed
- * to be a 0-interval, and min_match_len is guaranteed to be at
- * least 2. */
- if (lcp < min_match_len)
+ * to be a 0-interval. */
+ if (lcp == 0)
goto out;
/* Set the position of the most-recently-seen suffix within this
* be produced, we're done, since the LCP will only ever get
* shorter from here. This also prevents ascending above the
* root of the lcp-interval tree, since the root is guaranteed
- * to be a 0-interval, and min_match_len is guaranteed to be at
- * least 2. */
- if (lcp < min_match_len)
+ * to be a 0-interval. */
+ if (lcp == 0)
break;
/* Advance the current match until the lcp of the *next* match
static void
lz_lcpit_skip_position(struct lz_lcpit *mf)
{
- const u32 min_match_len = mf->base.params.min_match_len;
const u32 cur_pos = mf->base.cur_window_pos++;
u32 * const pos_data = mf->pos_data;
u32 * const intervals = mf->intervals;
lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
next_interval = (intervals[interval] & ~0x80000000)
>> LZ_LCPIT_LCP_BITS;
- if (lcp < min_match_len)
+ if (lcp == 0)
return;
intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
interval = next_interval;
}
lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
- while (lcp >= min_match_len) {
+ while (lcp != 0) {
next_match = cur_match;
do {
next_interval = pos_data[next_match];
{
struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
- FREE(mf->SA);
+ FREE(mf->mem);
}
const struct lz_mf_ops lz_lcp_interval_tree_ops = {