X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz_lcp_interval_tree.c;h=5240a7f326d47c32950a867dc88d23570e4ed315;hb=958cc5438f3ade80baafcfbf15da370e819a6b90;hp=7c0135a2701995fc61a616c491c1bca29560659d;hpb=91b0f909393476f39ed5865dfe4b27c244ba3c48;p=wimlib

diff --git a/src/lz_lcp_interval_tree.c b/src/lz_lcp_interval_tree.c
index 7c0135a2..5240a7f3 100644
--- a/src/lz_lcp_interval_tree.c
+++ b/src/lz_lcp_interval_tree.c
@@ -53,19 +53,13 @@
 #define LZ_LCPIT_POS_BITS		(32 - 1 - LZ_LCPIT_LCP_BITS)
 #define LZ_LCPIT_MAX_WINDOW_SIZE	(1UL << LZ_LCPIT_POS_BITS)
 
+#define SA_and_LCP_LCP_SHIFT		(32 - LZ_LCPIT_LCP_BITS)
+#define SA_and_LCP_POS_MASK		(((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
+
 struct lz_lcpit {
 	struct lz_mf base;
 
-	/* Each of the arrays has length equal to the window size.  This
-	 * therefore results in an additional memory usage of 12 bytes per
-	 * position.  (That's compared to about 8 for binary trees or 4 for hash
-	 * chains, for example.)
-	 *
-	 * We allocate these arrays in one contiguous block.  'SA' is first,
-	 * 'intervals' is second, and 'pos_data' is third.  */
-
-	/* Pointer to the suffix array  */
-	u32 *SA;
+	u32 *mem;
 
 	/* Mapping: lcp-interval index => lcp-interval data
 	 *
@@ -82,6 +76,61 @@ struct lz_lcpit {
 	u32 *pos_data;
 };
 
+/*
+ * Build the LCP (Longest Common Prefix) array in linear time.
+ *
+ * LCP[r] will be the length of the longest common prefix between the suffixes
+ * with positions SA[r - 1] and  SA[r].  LCP[0] will be undefined.
+ *
+ * Algorithm taken from Kasai et al. (2001), but modified slightly:
+ *
+ *  - For decreased memory usage and improved memory locality, pack the two
+ *    logically distinct SA and LCP arrays into a single array SA_and_LCP.
+ *
+ *  - With bytes there is no realistic way to reserve a unique symbol for
+ *    end-of-buffer, so use explicit checks for end-of-buffer.
+ *
+ *  - If a LCP value is less than the minimum match length, then store 0.  This
+ *    avoids having to do comparisons against the minimum match length later.
+ *
+ *  - If a LCP value is greater than the "nice match length", then store the
+ *    "nice match length".  This caps the number of bits needed to store each
+ *    LCP value, and this caps the depth of the LCP-interval tree, without
+ *    usually hurting the compression ratio too much.
+ *
+ * References:
+ *
+ *	Kasai et al.  2001.  Linear-Time Longest-Common-Prefix Computation in
+ *	Suffix Arrays and Its Applications.  CPM '01 Proceedings of the 12th
+ *	Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
+ */
+static void
+build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA,
+		 const u8 * const restrict T, const u32 n,
+		 const u32 min_lcp, const u32 max_lcp)
+{
+	u32 h, i, r, j, lim, stored_lcp;
+
+	h = 0;
+	for (i = 0; i < n; i++) {
+		r = ISA[i];
+		if (r > 0) {
+			j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK;
+			lim = min(n - i, n - j);
+			while (h < lim && T[i + h] == T[j + h])
+				h++;
+			stored_lcp = h;
+			if (stored_lcp < min_lcp)
+				stored_lcp = 0;
+			else if (stored_lcp > max_lcp)
+				stored_lcp = max_lcp;
+			SA_and_LCP[r] |= stored_lcp << SA_and_LCP_LCP_SHIFT;
+			if (h > 0)
+				h--;
+		}
+	}
+}
+
 /*
  * Use the suffix array accompanied with the longest-common-prefix array --- in
  * other words, the "enhanced suffix array" --- to simulate a bottom-up
@@ -123,172 +172,79 @@ struct lz_lcpit {
  *	Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
  */
 static void
-build_LCPIT(const u32 SA[restrict], u32 LCP[restrict],
-	    u32 pos_data[restrict], const u32 lcp_limit, const u32 n)
+build_LCPIT(const u32 * const restrict SA_and_LCP,
+	    u32 * const restrict intervals, u32 * const restrict pos_data,
+	    const u32 n)
 {
-	u32 *intervals = LCP;
-	u32 next_interval;
-	u32 incomplete_intervals[lcp_limit + 1];
-	u32 *cur_interval;
-	u32 prev_pos;
-
-	/* As we determine lcp-intervals, we assign each one an entry in
-	 * 'intervals', overwriting LCP in the process.  Each such entry will
-	 * contain the index in 'intervals' of the superinterval, along with the
-	 * longest common prefix length that the suffixes in that interval
-	 * share.
-	 *
-	 * Note: since we don't need its memory for anything, we don't overwrite
-	 * the suffix array, even though this could potentially be done since
-	 * it's not actually used during match-finding.  */
-
-	/* Process rank 0 as special case.  This creates the lcp-interval
-	 * containing every suffix in the window.  */
-	prev_pos = SA[0];
-	intervals[0] = 0;
-	pos_data[prev_pos] = 0;
-	cur_interval = incomplete_intervals;
-	*cur_interval = 0;
-	next_interval = 1;
-
-	/* Iterate through each suffix array rank.  */
-	for (u32 r = 1; r < n; r++) {
-
-		/* Get the longest common prefix (lcp) between the suffixes with
-		 * ranks r and r - 1.  But cap it to the lcp limit.  */
-		const u32 lcp = min(LCP[r], lcp_limit);
-
-		/* Convert rank => position using the suffix array.  */
-		const u32 pos = SA[r];
+	u32 next_interval_idx = 0;
+	u32 open_intervals[LZ_LCPIT_LCP_MAX + 1];
+	u32 *top = open_intervals;
+	u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK;
 
-		/* cur_interval points to the index of the deepest (highest lcp
-		 * value) incomplete lcp-interval.  */
-
-		/*
-		 * There are three cases:
-		 *
-		 * (1) The lcp stayed the same as the previous value.  Place the
-		 * current suffix in cur_interval.  (This placement is
-		 * tentative, because if LCP increases at the next rank, this
-		 * suffix could still be placed in the resulting new LCP
-		 * interval instead.)  cur_interval remains unchanged.
-		 *
-		 * (2) The lcp increased from the previous value.  This signals
-		 * the beginning of a new lcp-interval.  Create it and push it
-		 * onto the stack of incomplete intervals.  But since lcp is
-		 * defined in terms of the longest prefix between this suffix
-		 * and the *previous* ranked suffix, the new lcp-interval
-		 * actually should have begun at the *previous* ranked suffix.
-		 * Therefore, we need to set both pos_data[pos] and
-		 * pos_data[prev_pos] to refer to the new interval.
-		 *
-		 * (3) The lcp decreased from the previous value.  This signals
-		 * the termination of at least one lcp-interval.  Pop the stack,
-		 * finalizing the lcp-intervals, until the current lcp is at
-		 * least as large as the lcp associated with cur_interval.
-		 * Then, if the current lcp is equal to the lcp associated with
-		 * cur_interval, place the current suffix in cur_interval,
-		 * similar to case (1).  Else, create a new lcp-interval,
-		 * similar to case (2).
-		 */
-
-		if (lcp == (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
-
-			/* Case (1) */
-
-			pos_data[pos] = *cur_interval;
-
-		} else if (lcp > (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
-
-			/* Case (2) */
-
-			intervals[next_interval] = lcp | 0x80000000;
-			pos_data[prev_pos] = next_interval;
-			pos_data[pos] = next_interval;
-			*++cur_interval = next_interval++;
+	/* The interval with lcp=0 covers the entire array.  It remains open
+	 * until the end.  */
+	*top = next_interval_idx;
+	intervals[next_interval_idx] = 0;
+	next_interval_idx++;
 
+	for (u32 r = 1; r < n; r++) {
+		u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK;
+		u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT;
+		u32 top_lcp = intervals[*top];
+
+		if (next_lcp == top_lcp) {
+			/* continuing the deepest open interval  */
+			pos_data[prev_pos] = *top;
+		} else if (next_lcp > top_lcp) {
+			/* opening a new interval  */
+			intervals[next_interval_idx] = next_lcp;
+			*++top = next_interval_idx;
+			pos_data[prev_pos] = next_interval_idx;
+			next_interval_idx++;
 		} else {
-
-			/* Case (3) */
-
-			u32 interval;
-			u32 superinterval;
-
+			/* closing the deepest open interval  */
+			pos_data[prev_pos] = *top;
 			for (;;) {
-				/* Examine the deepest incomplete lcp-interval
-				 * and its superinterval.  */
-
-				interval = *cur_interval;
-				superinterval = *--cur_interval;
-
-				if (lcp >= (intervals[superinterval] &
-					    LZ_LCPIT_LCP_MASK))
+				u32 closed_interval_idx = *top;
+				u32 superinterval_idx = *--top;
+				u32 superinterval_lcp = intervals[superinterval_idx];
+
+				if (next_lcp == superinterval_lcp) {
+					/* continuing the superinterval */
+					intervals[closed_interval_idx] |=
+						(superinterval_idx << LZ_LCPIT_LCP_BITS) |
+							0x80000000;
 					break;
-
-				/* The current suffix can't go in either of
-				 * them.  Therefore we're visiting 'interval'
-				 * for the last time and finalizing its
-				 * membership in 'superinterval'.  */
-
-				intervals[interval] |=
-					(superinterval << LZ_LCPIT_LCP_BITS);
-			}
-
-			/* The current suffix can't go in 'interval', but it can
-			 * go in 'superinterval'.  */
-
-			if (lcp > (intervals[superinterval] & LZ_LCPIT_LCP_MASK)) {
-				/* Creating a new lcp-interval that is a
-				 * superinterval of 'interval' but a subinterval
-				 * of 'superinterval'.
-				 *
-				 * Example: with the LCP arrayl
-				 *
-				 *            2  2  2  4  4  3
-				 *
-				 * then we will execute this case when
-				 * processing the LCP value 3.  The LCP
-				 * intervals will be:
-				 *
-				 *            2  2  2  4  4  3
-				 * (lcp=0):  |                |
-				 * (lcp=2):  |                |
-				 * (lcp=3):        |          |
-				 * (lcp=4):        |       |
-				 *
-				 * Note that the 3-interval (the one being
-				 * created by this code) is a superinterval of
-				 * the 4-interval (which already existed)!  But
-				 * we don't need to re-assign pos_data values in
-				 * the 4-interval because they point to the
-				 * deepest interval which contains them, which
-				 * is the 4-interval.  */
-
-				intervals[next_interval] = lcp | 0x80000000;
-				intervals[interval] |=
-					(next_interval << LZ_LCPIT_LCP_BITS);
-				pos_data[pos] = next_interval;
-				*++cur_interval = next_interval++;
-			} else {
-				/* Finishing 'interval', continuing with
-				 * 'superinterval'.  */
-
-				intervals[interval] |=
-					(superinterval << LZ_LCPIT_LCP_BITS);
-				pos_data[pos] = superinterval;
+				} else if (next_lcp > superinterval_lcp) {
+					/* creating a new interval that is a
+					 * superinterval of the one being
+					 * closed, but still a subinterval of
+					 * its superinterval  */
+					intervals[next_interval_idx] = next_lcp;
+					*++top = next_interval_idx;
+					intervals[closed_interval_idx] |=
+						(next_interval_idx << LZ_LCPIT_LCP_BITS) |
+							0x80000000;
+					next_interval_idx++;
+					break;
+				} else {
+					/* also closing the superinterval  */
+					intervals[closed_interval_idx] |=
+						(superinterval_idx << LZ_LCPIT_LCP_BITS) |
+							0x80000000;
+				}
 			}
 		}
-
-		/* Remember this suffix index when processing the next-ranked
-		 * suffix.  */
-		prev_pos = pos;
+		prev_pos = next_pos;
 	}
 
-	/* Finalize any still-incomplete lcp-intervals.  */
-	while (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK) {
-		intervals[*cur_interval] |=
-			(*(cur_interval - 1) << LZ_LCPIT_LCP_BITS);
-		cur_interval--;
+	/* close any still-open intervals  */
+	pos_data[prev_pos] = *top;
+	while (top > open_intervals) {
+		u32 closed_interval_idx = *top;
+		u32 superinterval_idx = *--top;
+		intervals[closed_interval_idx] |=
+			(superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000;
 	}
 }
 
@@ -299,7 +255,7 @@ lz_lcpit_set_default_params(struct lz_mf_params *params)
 		params->min_match_len = 2;
 
 	if (params->max_match_len == 0)
-		params->max_match_len = params->max_window_size;
+		params->max_match_len = UINT32_MAX;
 
 	if (params->max_search_depth == 0)
 		params->max_search_depth = 32;
@@ -340,34 +296,29 @@ lz_lcpit_init(struct lz_mf *_mf)
 
 	lz_lcpit_set_default_params(&mf->base.params);
 
-	mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
-	if (!mf->SA)
-		return false;
-
-	return true;
+	mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
+	return (mf->mem != NULL);
 }
 
 static void
 lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
 {
 	struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
-	u32 *mem = mf->SA;
-
-	build_SA(&mem[0 * n], T, n, &mem[1 * n]);
-	build_ISA(&mem[2 * n], &mem[0 * n], n);
-	build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n);
-	build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n],
-		    mf->base.params.nice_match_len, n);
-	mf->SA = &mem[0 * n];
-	mf->intervals = &mem[1 * n];
-	mf->pos_data = &mem[2 * n];
+
+	build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]);
+	build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n);
+	build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n,
+			 mf->base.params.min_match_len,
+			 mf->base.params.nice_match_len);
+	build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n);
+	mf->intervals = &mf->mem[1 * n];
+	mf->pos_data = &mf->mem[2 * n];
 }
 
 static u32
 lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
 {
 	struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
-	const u32 min_match_len = mf->base.params.min_match_len;
 	const u32 cur_pos = mf->base.cur_window_pos;
 	u32 * const pos_data = mf->pos_data;
 	u32 * const intervals = mf->intervals;
@@ -405,9 +356,8 @@ lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
 		 * be produced, we're done, since the LCP will only ever get
 		 * shorter from here.  This also prevents ascending above the
 		 * root of the lcp-interval tree, since the root is guaranteed
-		 * to be a 0-interval, and min_match_len is guaranteed to be at
-		 * least 2.  */
-		if (lcp < min_match_len)
+		 * to be a 0-interval.  */
+		if (lcp == 0)
 			goto out;
 
 		/* Set the position of the most-recently-seen suffix within this
@@ -441,9 +391,8 @@ lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[])
 		 * be produced, we're done, since the LCP will only ever get
 		 * shorter from here.  This also prevents ascending above the
 		 * root of the lcp-interval tree, since the root is guaranteed
-		 * to be a 0-interval, and min_match_len is guaranteed to be at
-		 * least 2.  */
-		if (lcp < min_match_len)
+		 * to be a 0-interval.  */
+		if (lcp == 0)
 			break;
 
 		/* Advance the current match until the lcp of the *next* match
@@ -509,7 +458,6 @@ out:
 static void
 lz_lcpit_skip_position(struct lz_lcpit *mf)
 {
-	const u32 min_match_len = mf->base.params.min_match_len;
 	const u32 cur_pos = mf->base.cur_window_pos++;
 	u32 * const pos_data = mf->pos_data;
 	u32 * const intervals = mf->intervals;
@@ -523,14 +471,14 @@ lz_lcpit_skip_position(struct lz_lcpit *mf)
 		lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
 		next_interval = (intervals[interval] & ~0x80000000)
 					>> LZ_LCPIT_LCP_BITS;
-		if (lcp < min_match_len)
+		if (lcp == 0)
 			return;
 		intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp;
 		interval = next_interval;
 	}
 	lcp = intervals[interval] & LZ_LCPIT_LCP_MASK;
 	cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS;
-	while (lcp >= min_match_len) {
+	while (lcp != 0) {
 		next_match = cur_match;
 		do {
 			next_interval = pos_data[next_match];
@@ -561,7 +509,7 @@ lz_lcpit_destroy(struct lz_mf *_mf)
 {
 	struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
 
-	FREE(mf->SA);
+	FREE(mf->mem);
 }
 
 const struct lz_mf_ops lz_lcp_interval_tree_ops = {