From acb5c41d2c9be655eca863a0315a22d1f54889e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Feb 2015 01:13:15 -0600 Subject: [PATCH] LCP-interval tree matchfinder improvements - Adjust the algorithm to remove the need to have an "unvisited" flag in normal mode. - Prefetch upcoming SA_and_LCP entries during LCP construction. - Get rid of lcpit_matchfinder_templates.h, since the "huge mode" algorithm isn't quite the same as the regular algorithm anymore. - Improve documentation. --- Makefile.am | 1 - include/wimlib/lcpit_matchfinder.h | 24 +- src/lcpit_matchfinder.c | 614 +++++++++++++++++++++++++---- src/lcpit_matchfinder_templates.h | 343 ---------------- 4 files changed, 533 insertions(+), 449 deletions(-) delete mode 100644 src/lcpit_matchfinder_templates.h diff --git a/Makefile.am b/Makefile.am index 10511f78..27dad522 100644 --- a/Makefile.am +++ b/Makefile.am @@ -59,7 +59,6 @@ libwim_la_SOURCES = \ src/iterate_dir.c \ src/join.c \ src/lcpit_matchfinder.c \ - src/lcpit_matchfinder_templates.h \ src/lookup_table.c \ src/lzms_common.c \ src/lzms_compress.c \ diff --git a/include/wimlib/lcpit_matchfinder.h b/include/wimlib/lcpit_matchfinder.h index ffcd426b..dfedf371 100644 --- a/include/wimlib/lcpit_matchfinder.h +++ b/include/wimlib/lcpit_matchfinder.h @@ -17,31 +17,13 @@ #include "wimlib/types.h" struct lcpit_matchfinder { - bool huge_mode; - u32 cur_pos; - - /* Mapping: suffix index ("window position") => lcp-interval index */ u32 *pos_data; - - /* Mapping: lcp-interval index => lcp-interval data - * - * Initially, the lcp-interval data for an lcp-interval contains that - * interval's lcp and superinterval index. - * - * After a lcp-interval is visited during match-finding, its - * lcp-interval data contains that interval's lcp and the position of - * the next suffix to consider as a match when matching against that - * lcp-interval. 
*/ union { u32 *intervals; u64 *intervals64; }; - - /* The suffix array */ - u32 *SA; - u32 min_match_len; u32 nice_match_len; }; @@ -58,9 +40,6 @@ extern bool lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize, u32 min_match_len, u32 nice_match_len); -extern void -lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf); - extern void lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n); @@ -71,4 +50,7 @@ lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, extern void lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count); +extern void +lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf); + #endif /* _LCPIT_MATCHFINDER_H */ diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c index dd00979e..9d22ad07 100644 --- a/src/lcpit_matchfinder.c +++ b/src/lcpit_matchfinder.c @@ -1,5 +1,5 @@ /* - * lcpit_matchfinder.h + * lcpit_matchfinder.c * * A match-finder for Lempel-Ziv compression based on bottom-up construction and * traversal of the Longest Common Prefix (LCP) interval tree. 
@@ -12,40 +12,513 @@ */ #include -#include #include "wimlib/divsufsort.h" #include "wimlib/lcpit_matchfinder.h" #include "wimlib/util.h" #define LCP_BITS 6 -#define LCP_MASK ((1 << LCP_BITS) - 1) -#define LCP_MAX LCP_MASK -#define NORMAL_UNVISITED_TAG ((u32)1 << 31) -#define MAX_NORMAL_BUFSIZE ((u32)1 << (31 - LCP_BITS)) -#define HUGE_UNVISITED_TAG ((u64)1 << 63) -#define SA_and_LCP_LCP_SHIFT (32 - LCP_BITS) -#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1) +#define LCP_MAX (((u32)1 << LCP_BITS) - 1) +#define POS_MASK (((u32)1 << (32 - LCP_BITS)) - 1) +#define LCP_SHIFT (32 - LCP_BITS) +#define LCP_MASK (LCP_MAX << LCP_SHIFT) +#define MAX_NORMAL_BUFSIZE (POS_MASK + 1) + +#define HUGE_LCP_BITS 7 +#define HUGE_LCP_MAX (((u32)1 << HUGE_LCP_BITS) - 1) +#define HUGE_LCP_SHIFT (64 - HUGE_LCP_BITS) +#define HUGE_POS_MASK 0xFFFFFFFF +#define MAX_HUGE_BUFSIZE ((u64)HUGE_POS_MASK + 1) +#define HUGE_UNVISITED_TAG 0x100000000 + +#define PREFETCH_SAFETY 5 /* - * Include the template header to define the functions build_LCP(), - * build_LCPIT(), and lcpit_advance_one_byte(). There are "normal" and "huge" - * versions of each function. The normal versions assume that a buffer position - * and LCP value can be packed into a 32-bit integer, whereas the huge versions - * assume that 64 bits is needed. - * - * Both versions cap LCP values to 6 bits. This limits the depth of the - * lcp-interval tree without hurting the compression ratio too much. Matches of - * length 63 are sufficiently long that the compression ratio doesn't change - * significantly if we choose one such match over another. + * Build the LCP (Longest Common Prefix) array in linear time. + * + * LCP[r] will be the length of the longest common prefix between the suffixes + * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. + * + * Algorithm taken from Kasai et al. 
(2001), but modified slightly: + * + * - With bytes there is no realistic way to reserve a unique symbol for + * end-of-buffer, so use explicit checks for end-of-buffer. + * + * - For decreased memory usage and improved memory locality, pack the two + * logically distinct SA and LCP arrays into a single array SA_and_LCP. + * + * - Since SA_and_LCP is accessed randomly, improve the cache behavior by + * reading several entries ahead in ISA and prefetching the upcoming + * SA_and_LCP entry. + * + * - If an LCP value is less than the minimum match length, then store 0. This + * avoids having to do comparisons against the minimum match length later. + * + * - If an LCP value is greater than the "nice match length", then store the + * "nice match length". This caps the number of bits needed to store each + * LCP value, and this caps the depth of the LCP-interval tree, without + * usually hurting the compression ratio too much. + * + * References: + * + * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in + * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. 
+ */ +static void +build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict], + const u8 T[restrict], const u32 n, + const u32 min_lcp, const u32 max_lcp) +{ + u32 h = 0; + for (u32 i = 0; i < n; i++) { + const u32 r = ISA[i]; + prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]); + if (r > 0) { + const u32 j = SA_and_LCP[r - 1] & POS_MASK; + const u32 lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + u32 stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SA_and_LCP[r] |= stored_lcp << LCP_SHIFT; + if (h > 0) + h--; + } + } +} + +/* + * Use the suffix array accompanied with the longest-common-prefix array --- + * which in combination can be called the "enhanced suffix array" --- to + * simulate a bottom-up traversal of the corresponding suffix tree, or + * equivalently the lcp-interval tree. Do so in suffix rank order, but save the + * superinterval references needed for later bottom-up traversal of the tree in + * suffix position order. + * + * To enumerate the lcp-intervals, this algorithm scans the suffix array and its + * corresponding LCP array linearly. While doing so, it maintains a stack of + * lcp-intervals that are currently open, meaning that their left boundaries + * have been seen but their right boundaries have not. The bottom of the stack + * is the interval which covers the entire suffix array (this has lcp=0), and + * the top of the stack is the deepest interval that is currently open (this has + * the greatest lcp of any interval on the stack). When this algorithm opens an + * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto + * the stack. When this algorithm closes an interval, it pops it from the stack + * and sets the intervals[] entry of that interval to the index and lcp of that + * interval's superinterval, which is the new top of the stack. 
+ * + * This algorithm also sets pos_data[pos] for each suffix position 'pos' to the + * index and lcp of the deepest lcp-interval containing it. Alternatively, we + * can interpret each suffix as being associated with a singleton lcp-interval, + * or leaf of the suffix tree. With this interpretation, an entry in pos_data[] + * is the superinterval reference for one of these singleton lcp-intervals and + * therefore is not fundamentally different from an entry in intervals[]. + * + * To reduce memory usage, this algorithm re-uses the suffix array's memory to + * store the generated intervals[] array. This is possible because SA and LCP + * are accessed linearly, and no more than one interval is generated per suffix. + * + * The techniques used in this algorithm are described in various published + * papers. The generation of lcp-intervals from the suffix array (SA) and the + * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in + * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in + * Abouelhoda et al. (2004). Both these papers note the equivalence between + * lcp-intervals (including the singleton lcp-interval for each suffix) and + * nodes of the suffix tree. Abouelhoda et al. (2004) furthermore applies + * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as + * does Chen et al. (2008). Algorithm CPS1b of Chen et al. (2008) dynamically + * re-uses the suffix array during bottom-up traversal of the lcp-interval tree. + * + * References: + * + * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix + * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. + * + * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees + * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2 + * Issue 1, March 2004, pp. 53-86. + * + * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. 
Lempel-Ziv Factorization + * Using Less Time & Space. Mathematics in Computer Science June 2008, + * Volume 1, Issue 4, pp. 605-623. */ -#define HUGE_MODE 1 -#include "lcpit_matchfinder_templates.h" -#undef HUGE_MODE +static void +build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n) +{ + u32 * const SA_and_LCP = intervals; + u32 next_interval_idx; + u32 open_intervals[LCP_MAX + 1]; + u32 *top = open_intervals; + u32 prev_pos = SA_and_LCP[0] & POS_MASK; + + *top = 0; + intervals[0] = 0; + next_interval_idx = 1; + + for (u32 r = 1; r < n; r++) { + const u32 next_pos = SA_and_LCP[r] & POS_MASK; + const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT; + const u32 top_lcp = *top >> LCP_SHIFT; + + if (next_lcp == top_lcp) { + /* Continuing the deepest open interval */ + pos_data[prev_pos] = *top; + } else if (next_lcp > top_lcp) { + /* Opening a new interval */ + *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++; + pos_data[prev_pos] = *top; + } else { + /* Closing the deepest open interval */ + pos_data[prev_pos] = *top; + for (;;) { + const u32 closed_interval_idx = *top-- & POS_MASK; + const u32 superinterval_lcp = *top >> LCP_SHIFT; + + if (next_lcp == superinterval_lcp) { + /* Continuing the superinterval */ + intervals[closed_interval_idx] = *top; + break; + } else if (next_lcp > superinterval_lcp) { + /* Creating a new interval that is a + * superinterval of the one being + * closed, but still a subinterval of + * its superinterval */ + *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++; + intervals[closed_interval_idx] = *top; + break; + } else { + /* Also closing the superinterval */ + intervals[closed_interval_idx] = *top; + } + } + } + prev_pos = next_pos; + } -#define HUGE_MODE 0 -#include "lcpit_matchfinder_templates.h" -#undef HUGE_MODE + /* Close any still-open intervals. 
*/ + pos_data[prev_pos] = *top; + for (; top > open_intervals; top--) + intervals[*top & POS_MASK] = *(top - 1); +} + +/* + * Advance the LCP-interval tree matchfinder by one byte. + * + * If @record_matches is true, then matches are written to the @matches array + * sorted by strictly decreasing length and strictly decreasing offset, and the + * return value is the number of matches found. Otherwise, @matches is ignored + * and the return value is always 0. + * + * How this algorithm works: + * + * 'cur_pos' is the position of the current suffix, which is the suffix being + * matched against. 'cur_pos' starts at 0 and is incremented each time this + * function is called. This function finds each suffix with position less than + * 'cur_pos' that shares a prefix with the current position, but for each + * distinct prefix length it finds only the suffix with the greatest position + * (i.e. the most recently seen in the linear traversal by position). This + * function accomplishes this using the lcp-interval tree data structure that + * was built by build_LCPIT() and is updated dynamically by this function. + * + * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp + * value of the deepest lcp-interval containing the current suffix --- or, + * equivalently, the parent of the conceptual singleton lcp-interval that + * contains the current suffix. + * + * The second step is to ascend the lcp-interval tree until reaching an interval + * that has not yet been visited, and link the intervals to the current suffix + * along the way. An lcp-interval has been visited if and only if it has been + * linked to a suffix. Initially, no lcp-intervals are linked to suffixes. + * + * The third step is to continue ascending the lcp-interval tree, but indirectly + * through suffix links rather than through the original superinterval + * references, and continuing to form links with the current suffix along the + * way. 
Each suffix visited during this step, except in a special case to + * handle outdated suffixes, is a match which can be written to matches[]. Each + * intervals[] entry contains the position of the next suffix to visit, which we + * shall call 'match_pos'; this is the most recently seen suffix that belongs to + * that lcp-interval. 'pos_data[match_pos]' then contains the lcp and interval + * index of the next lcp-interval that should be visited. + * + * We can view these arrays as describing a new set of links that gets overlaid + * on top of the original superinterval references of the lcp-interval tree. + * Each such link can connect a node of the lcp-interval tree to an ancestor + * more than one generation removed. + * + * For each one-byte advance, the current position becomes the most recently + * seen suffix for a continuous sequence of lcp-intervals from a leaf interval + * to the root interval. Conceptually, this algorithm needs to update all these + * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null" + * link. But actually, if some of these nodes have the same most recently seen + * suffix, then this algorithm just visits the pos_data[] entry for that suffix + * and skips over all these nodes in one step. Updating the extra nodes is + * accomplished by creating a redirection from the previous suffix to the + * current suffix. + * + * Using this shortcutting scheme, it's possible for a suffix to become out of + * date, which means that it is no longer the most recently seen suffix for the + * lcp-interval under consideration. This case is detected by noticing when the + * next lcp-interval link actually points deeper in the tree, and it is worked + * around by just continuing until we get to a link that actually takes us + * higher in the tree. This can be described as a lazy-update scheme. 
+ */ +static inline u32 +lcpit_advance_one_byte(const u32 cur_pos, + u32 pos_data[restrict], + u32 intervals[restrict], + struct lz_match matches[restrict], + const bool record_matches) +{ + u32 lcp; + u32 interval_idx; + u32 match_pos; + struct lz_match *matchptr; + + /* Get the deepest lcp-interval containing the current suffix. */ + lcp = pos_data[cur_pos] >> LCP_SHIFT; + interval_idx = pos_data[cur_pos] & POS_MASK; + prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]); + pos_data[cur_pos] = 0; + + /* Ascend until we reach a visited interval, linking the unvisited + * intervals to the current suffix as we go. */ + while (intervals[interval_idx] & LCP_MASK) { + const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT; + const u32 superinterval_idx = intervals[interval_idx] & POS_MASK; + intervals[interval_idx] = cur_pos; + lcp = superinterval_lcp; + interval_idx = superinterval_idx; + } + + match_pos = intervals[interval_idx] & POS_MASK; + if (match_pos == 0) { + /* Ambiguous case; just don't allow matches with position 0. */ + if (interval_idx != 0) + intervals[interval_idx] = cur_pos; + return 0; + } + matchptr = matches; + /* Ascend indirectly via pos_data[] links. */ + for (;;) { + u32 next_lcp; + u32 next_interval_idx; + + for (;;) { + next_lcp = pos_data[match_pos] >> LCP_SHIFT; + next_interval_idx = pos_data[match_pos] & POS_MASK; + if (next_lcp < lcp) + break; + /* Suffix was out of date. */ + match_pos = intervals[next_interval_idx]; + } + intervals[interval_idx] = cur_pos; + pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx; + if (record_matches) { + matchptr->length = lcp; + matchptr->offset = cur_pos - match_pos; + matchptr++; + } + if (next_interval_idx == 0) + break; + match_pos = intervals[next_interval_idx]; + interval_idx = next_interval_idx; + lcp = next_lcp; + } + return matchptr - matches; +} + +/* Expand SA from 32 bits to 64 bits. 
*/ +static void +expand_SA(u32 *p, u32 n) +{ + typedef u32 _may_alias_attribute aliased_u32_t; + typedef u64 _may_alias_attribute aliased_u64_t; + + aliased_u32_t *SA = (aliased_u32_t *)p; + aliased_u64_t *SA64 = (aliased_u64_t *)p; + + u32 r = n - 1; + do { + SA64[r] = SA[r]; + } while (r--); +} + +/* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE. */ +static void +build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict], + const u8 T[restrict], const u32 n, + const u32 min_lcp, const u32 max_lcp) +{ + u32 h = 0; + for (u32 i = 0; i < n; i++) { + const u32 r = ISA[i]; + prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]); + if (r > 0) { + const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK; + const u32 lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + u32 stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT; + if (h > 0) + h--; + } + } +} + +/* + * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE. + * + * This "huge" version is also slightly different in that the lcp value stored + * in each intervals[] entry is the lcp value for that interval, not its + * superinterval. This lcp value stays put in intervals[] and doesn't get moved + * to pos_data[] during lcpit_advance_one_byte_huge(). One consequence of this + * is that we have to use a special flag to distinguish visited from unvisited + * intervals. But overall, this scheme keeps the memory usage at 12n instead of + * 16n. (The non-huge version is 8n.) 
+ */ +static void +build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n) +{ + u64 * const SA_and_LCP64 = intervals64; + u32 next_interval_idx; + u32 open_intervals[HUGE_LCP_MAX + 1]; + u32 *top = open_intervals; + u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK; + + *top = 0; + intervals64[0] = 0; + next_interval_idx = 1; + + for (u32 r = 1; r < n; r++) { + const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK; + const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT; + const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT; + + if (next_lcp == top_lcp) { + /* Continuing the deepest open interval */ + pos_data[prev_pos] = *top; + } else if (next_lcp > top_lcp) { + /* Opening a new interval */ + intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT; + pos_data[prev_pos] = next_interval_idx; + *++top = next_interval_idx++; + } else { + /* Closing the deepest open interval */ + pos_data[prev_pos] = *top; + for (;;) { + const u32 closed_interval_idx = *top--; + const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT; + + if (next_lcp == superinterval_lcp) { + /* Continuing the superinterval */ + intervals64[closed_interval_idx] |= + HUGE_UNVISITED_TAG | *top; + break; + } else if (next_lcp > superinterval_lcp) { + /* Creating a new interval that is a + * superinterval of the one being + * closed, but still a subinterval of + * its superinterval */ + intervals64[next_interval_idx] = + (u64)next_lcp << HUGE_LCP_SHIFT; + intervals64[closed_interval_idx] |= + HUGE_UNVISITED_TAG | next_interval_idx; + *++top = next_interval_idx++; + break; + } else { + /* Also closing the superinterval */ + intervals64[closed_interval_idx] |= + HUGE_UNVISITED_TAG | *top; + } + } + } + prev_pos = next_pos; + } + + /* Close any still-open intervals. 
*/ + pos_data[prev_pos] = *top; + for (; top > open_intervals; top--) + intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1); +} + +/* Like lcpit_advance_one_byte(), but for buffers larger than + * MAX_NORMAL_BUFSIZE. */ +static inline u32 +lcpit_advance_one_byte_huge(const u32 cur_pos, + u32 pos_data[restrict], + u64 intervals64[restrict], + struct lz_match matches[restrict], + const bool record_matches) +{ + u32 interval_idx; + u32 lcp; + u32 match_pos; + struct lz_match *matchptr; + + interval_idx = pos_data[cur_pos]; + prefetch(&intervals64[pos_data[cur_pos + 1]]); + pos_data[cur_pos] = 0; + + while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) { + lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT; + if (lcp == 0) + return 0; + const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK; + intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos; + interval_idx = superinterval_idx; + } + + match_pos = intervals64[interval_idx] & HUGE_POS_MASK; + lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT; + matchptr = matches; + while (lcp != 0) { + u32 next_interval_idx; + u32 next_lcp; + + for (;;) { + next_interval_idx = pos_data[match_pos]; + next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT; + if (next_lcp < lcp) + break; + match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK; + } + intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos; + pos_data[match_pos] = interval_idx; + if (record_matches) { + matchptr->length = lcp; + matchptr->offset = cur_pos - match_pos; + matchptr++; + } + match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK; + interval_idx = next_interval_idx; + lcp = next_lcp; + } + return matchptr - matches; +} + +static inline u64 +get_pos_data_size(size_t max_bufsize) +{ + return (u64)max((u64)max_bufsize + PREFETCH_SAFETY, + DIVSUFSORT_TMP_LEN) * sizeof(u32); +} + +static inline u64 +get_intervals_size(size_t max_bufsize) +{ + return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ? 
+ sizeof(u32) : sizeof(u64)); +} /* * Calculate the number of bytes of memory needed for the LCP-interval tree @@ -58,19 +531,7 @@ u64 lcpit_matchfinder_get_needed_memory(size_t max_bufsize) { - u64 size = 0; - - /* pos_data (+1 is for prefetch) */ - size += ((u64)max_bufsize + 1) * sizeof(u32); - - /* intervals or intervals64 */ - size += max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) * - (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64)); - - /* SA */ - size += (u64)max_bufsize * sizeof(u32); - - return size; + return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize); } /* @@ -89,20 +550,22 @@ lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize, { if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX) return false; + if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY) + return false; - mf->pos_data = MALLOC((max_bufsize + 1) * sizeof(u32)); - mf->intervals = MALLOC(max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) * - (max_bufsize <= MAX_NORMAL_BUFSIZE ? - sizeof(u32) : sizeof(u64))); - mf->SA = MALLOC(max_bufsize * sizeof(u32)); - - if (!mf->pos_data || !mf->intervals || !mf->SA) { + mf->pos_data = MALLOC(get_pos_data_size(max_bufsize)); + mf->intervals = MALLOC(get_intervals_size(max_bufsize)); + if (!mf->pos_data || !mf->intervals) { lcpit_matchfinder_destroy(mf); return false; } mf->min_match_len = min_match_len; - mf->nice_match_len = min(nice_match_len, LCP_MAX); + mf->nice_match_len = min(nice_match_len, + (max_bufsize <= MAX_NORMAL_BUFSIZE) ? + LCP_MAX : HUGE_LCP_MAX); + for (u32 i = 0; i < PREFETCH_SAFETY; i++) + mf->pos_data[max_bufsize + i] = 0; return true; } @@ -159,42 +622,31 @@ build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n) * * @mf - the initialized matchfinder structure * @T - the input buffer - * @n - size of the input buffer in bytes. This may be at most the max_bufsize - * with which lcpit_matchfinder_init() was called. + * @n - size of the input buffer in bytes. 
This must be nonzero and can be at + * most the max_bufsize with which lcpit_matchfinder_init() was called. */ void lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n) { - if (n == 0) - return; + /* intervals[] temporarily stores SA and LCP packed together. + * pos_data[] temporarily stores ISA. + * pos_data[] is also used as the temporary space for divsufsort(). */ - build_SA(mf->SA, T, n, mf->intervals); - build_ISA(mf->pos_data, mf->SA, n); + build_SA(mf->intervals, T, n, mf->pos_data); + build_ISA(mf->pos_data, mf->intervals, n); if (n <= MAX_NORMAL_BUFSIZE) { - /* "Normal" sized buffer */ - - /* Build LCP, packing it into ->SA */ - build_LCP_normal(mf->SA, mf->pos_data, T, n, - mf->min_match_len, mf->nice_match_len); - /* Prepare ->intervals and ->pos_data */ - build_LCPIT_normal(mf->SA, mf->intervals, mf->pos_data, n); + build_LCP(mf->intervals, mf->pos_data, T, n, + mf->min_match_len, mf->nice_match_len); + build_LCPIT(mf->intervals, mf->pos_data, n); mf->huge_mode = false; } else { - /* "Huge" sized buffer */ - - /* Build LCP in the second half of ->intervals64. It may be - * partially overwritten in build_LCPIT_huge(), but this is okay - * since each LCP entry is guaranteed to be consumed before it - * can possibly be overwritten. */ - build_LCP_huge(mf->intervals + n, mf->SA, mf->pos_data, T, n, + expand_SA(mf->intervals, n); + build_LCP_huge(mf->intervals64, mf->pos_data, T, n, mf->min_match_len, mf->nice_match_len); - /* Prepare ->intervals64 and ->pos_data */ - build_LCPIT_huge(mf->SA, mf->intervals + n, mf->intervals64, - mf->pos_data, n); + build_LCPIT_huge(mf->intervals64, mf->pos_data, n); mf->huge_mode = true; } mf->cur_pos = 0; /* starting at beginning of input buffer */ - mf->pos_data[n] = 0; /* safety entry for prefetch() overrun */ } /* @@ -205,37 +657,35 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n) * * The return value is the number of matches found and written to @matches. 
* This can be any value in [0, nice_match_len - min_match_len + 1]. - * - * If the caller attempts to advance beyond the end of the input buffer, the - * behavior is undefined. */ u32 lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, struct lz_match *matches) { if (mf->huge_mode) - return lcpit_advance_one_byte_huge(mf, matches, true); + return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data, + mf->intervals64, matches, true); else - return lcpit_advance_one_byte_normal(mf, matches, true); + return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data, + mf->intervals, matches, true); } /* * Skip the next @count bytes (don't search for matches at them). @count is * assumed to be > 0. - * - * If the caller attempts to advance beyond the end of the input buffer, the - * behavior is undefined. */ void lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count) { if (mf->huge_mode) { do { - lcpit_advance_one_byte_huge(mf, NULL, false); + lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data, + mf->intervals64, NULL, false); } while (--count); } else { do { - lcpit_advance_one_byte_normal(mf, NULL, false); + lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data, + mf->intervals, NULL, false); } while (--count); } } @@ -243,14 +693,10 @@ lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count) /* * Destroy an LCP-interval tree matchfinder that was previously initialized with * lcpit_matchfinder_init(). - * - * If the struct has been zeroed out, this has no effect. */ void lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf) { FREE(mf->pos_data); FREE(mf->intervals); - FREE(mf->SA); - memset(mf, 0, sizeof(*mf)); } diff --git a/src/lcpit_matchfinder_templates.h b/src/lcpit_matchfinder_templates.h deleted file mode 100644 index ac9c8883..00000000 --- a/src/lcpit_matchfinder_templates.h +++ /dev/null @@ -1,343 +0,0 @@ -/* - * lcpit_matchfinder_templates.h - * - * This file is included by lcpit_matchfinder.c. 
- * - * Author: Eric Biggers - * Year: 2014, 2015 - * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. - */ - -/* - * In normal mode, we can pack a buffer position and a LCP value into a 32-bit - * number. In huge mode, we can't. - */ -#if HUGE_MODE -# define GET_SA_ENTRY(r) (SA[r]) -# define GET_LCP_ENTRY(r) (LCP[r]) -# define SET_LCP_ENTRY(r, val) (LCP[r] = (val)) -# define UNVISITED_TAG HUGE_UNVISITED_TAG -#else -# define GET_SA_ENTRY(r) (SA_and_LCP[r] & SA_and_LCP_POS_MASK) -# define GET_LCP_ENTRY(r) (SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT) -# define SET_LCP_ENTRY(r, val) (SA_and_LCP[r] |= (val) << SA_and_LCP_LCP_SHIFT) -# define UNVISITED_TAG NORMAL_UNVISITED_TAG -#endif - -/* - * Build the LCP (Longest Common Prefix) array in linear time. - * - * LCP[r] will be the length of the longest common prefix between the suffixes - * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. - * - * Algorithm taken from Kasai et al. (2001), but modified slightly: - * - * - For decreased memory usage and improved memory locality, pack the two - * logically distinct SA and LCP arrays into a single array SA_and_LCP. - * - * - With bytes there is no realistic way to reserve a unique symbol for - * end-of-buffer, so use explicit checks for end-of-buffer. - * - * - If a LCP value is less than the minimum match length, then store 0. This - * avoids having to do comparisons against the minimum match length later. - * - * - If a LCP value is greater than the "nice match length", then store the - * "nice match length". This caps the number of bits needed to store each - * LCP value, and this caps the depth of the LCP-interval tree, without - * usually hurting the compression ratio too much. - * - * References: - * - * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in - * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 
181-192. - */ -#if HUGE_MODE -static void -build_LCP_huge(u32 LCP[restrict], const u32 SA[restrict], const u32 ISA[restrict], - const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp) -#else -static void -build_LCP_normal(u32 SA_and_LCP[restrict], const u32 ISA[restrict], - const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp) -#endif -{ - u32 h = 0; - for (u32 i = 0; i < n; i++) { - u32 r = ISA[i]; - if (r > 0) { - u32 j = GET_SA_ENTRY(r - 1); - u32 lim = min(n - i, n - j); - while (h < lim && T[i + h] == T[j + h]) - h++; - u32 stored_lcp = h; - if (stored_lcp < min_lcp) - stored_lcp = 0; - else if (stored_lcp > max_lcp) - stored_lcp = max_lcp; - SET_LCP_ENTRY(r, stored_lcp); - if (h > 0) - h--; - } - } -} - -/* - * Use the suffix array accompanied with the longest-common-prefix array --- in - * other words, the "enhanced suffix array" --- to simulate a bottom-up - * traversal of the corresponding suffix tree, or equivalently the "lcp-interval - * tree", as described in Abouelhoda et al. (2004). - * - * While doing the traversal, create a table 'intervals' that contains data for - * each lcp-interval, specifically the lcp value of that interval, and the index - * of the superinterval. - * - * Also while doing the traversal, create a table 'pos_data' that contains a - * mapping from suffix index to the deepest lcp-interval containing it. - * - * The result is that we will later be able to do match-finding at a given - * position by looking up that position in 'pos_data' to get the deepest - * lcp-interval containing the corresponding suffix, then proceeding to the - * superintervals. See lcpit_advance_one_byte() for more details. - * - * Note: We limit the depth of the lcp-interval tree by capping the lcp at - * LCP_MAX. This can cause a sub-tree of intervals with lcp greater than - * LCP_MAX to be collapsed into a single interval with lcp LCP_MAX. 
This avoids - * degenerate cases and does not hurt match-finding very much, since if we find - * a match of length LCP_MAX and extend it as far as possible, that's usually - * good enough because that region of the input must already be highly - * compressible. - * - * References: - * - * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees - * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2 - * Issue 1, March 2004, pp. 53-86. - * - * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization - * Using Less Time & Space. Mathematics in Computer Science June 2008, - * Volume 1, Issue 4, pp. 605-623. - * - * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix - * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. - */ -#if HUGE_MODE -static void -build_LCPIT_huge(const u32 SA[restrict], u32 LCP[], u64 intervals[], - u32 pos_data[restrict], u32 n) -#else -static void -build_LCPIT_normal(const u32 SA_and_LCP[restrict], u32 intervals[restrict], - u32 pos_data[restrict], u32 n) -#endif -{ - u32 next_interval_idx = 0; - u32 open_intervals[LCP_MAX + 1]; - u32 *top = open_intervals; - u32 prev_pos = GET_SA_ENTRY(0); - - /* The interval with lcp=0 covers the entire array. It remains open - * until the end. 
*/ - *top = next_interval_idx; - intervals[next_interval_idx] = 0; - next_interval_idx++; - - for (u32 r = 1; r < n; r++) { - u32 next_pos = GET_SA_ENTRY(r); - u32 next_lcp = GET_LCP_ENTRY(r); - u32 top_lcp = intervals[*top]; - - if (next_lcp == top_lcp) { - /* continuing the deepest open interval */ - pos_data[prev_pos] = *top; - } else if (next_lcp > top_lcp) { - /* opening a new interval */ - intervals[next_interval_idx] = next_lcp; - *++top = next_interval_idx; - pos_data[prev_pos] = next_interval_idx; - next_interval_idx++; - } else { - /* closing the deepest open interval */ - pos_data[prev_pos] = *top; - for (;;) { - u32 closed_interval_idx = *top; - u32 superinterval_idx = *--top; - u32 superinterval_lcp = intervals[superinterval_idx]; - - if (next_lcp == superinterval_lcp) { - /* continuing the superinterval */ - intervals[closed_interval_idx] |= - (superinterval_idx << LCP_BITS) | - UNVISITED_TAG; - break; - } else if (next_lcp > superinterval_lcp) { - /* creating a new interval that is a - * superinterval of the one being - * closed, but still a subinterval of - * its superinterval */ - intervals[next_interval_idx] = next_lcp; - *++top = next_interval_idx; - intervals[closed_interval_idx] |= - (next_interval_idx << LCP_BITS) | - UNVISITED_TAG; - next_interval_idx++; - break; - } else { - /* also closing the superinterval */ - intervals[closed_interval_idx] |= - (superinterval_idx << LCP_BITS) | - UNVISITED_TAG; - } - } - } - prev_pos = next_pos; - } - - /* close any still-open intervals */ - pos_data[prev_pos] = *top; - while (top > open_intervals) { - u32 closed_interval_idx = *top; - u32 superinterval_idx = *--top; - intervals[closed_interval_idx] |= - (superinterval_idx << LCP_BITS) | UNVISITED_TAG; - } -} - -/* - * Advance the LCP-interval tree matchfinder by one byte. - * - * If @record_matches is true, then matches are recorded in the @matches array, - * and the return value is the number of matches found. 
Otherwise, @matches is - * ignored and the return value is always 0. - */ -static inline u32 -#if HUGE_MODE -lcpit_advance_one_byte_huge -#else -lcpit_advance_one_byte_normal -#endif -(struct lcpit_matchfinder *mf, struct lz_match * restrict matches, - bool record_matches) -{ - const u32 cur_pos = mf->cur_pos++; - u32 * const pos_data = mf->pos_data; -#if HUGE_MODE - u64 * const intervals = mf->intervals64; -#else - u32 * const intervals = mf->intervals; -#endif - u32 num_matches = 0; - u32 lcp, next_lcp; - u32 interval, next_interval; - u32 cur_match, next_match; - - /* Look up the deepest lcp-interval containing the current suffix. */ - interval = pos_data[cur_pos]; - - /* Prefetch the deepest lcp-interval containing the next suffix. */ - prefetch(&intervals[pos_data[cur_pos + 1]]); - - /* Since the current position is greater than any position previously - * searched, set the "lcp interval of the next match" for this suffix to - * 0. This is the index of the root interval, and this indicates that - * there is no next match. */ - pos_data[cur_pos] = 0; - - /* Ascend the lcp-interval tree until we reach an lcp-interval that has - * already been visited. */ - - while (intervals[interval] & UNVISITED_TAG) { - - /* Visiting this lcp-interval for the first time. Therefore, - * there are no matches with length equal to the lcp of this - * lcp-interval. */ - - /* Extract the LCP and superinterval reference. */ - - lcp = intervals[interval] & LCP_MASK; - - /* If the LCP is shorter than the minimum length of matches to - * be produced, we're done, since the LCP will only ever get - * shorter from here. This also prevents ascending above the - * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval. */ - if (lcp == 0) - return 0; - - next_interval = (intervals[interval] & ~UNVISITED_TAG) >> LCP_BITS; - - /* Set the position of the most-recently-seen suffix within this - * lcp-interval. 
Since this is the first visitation of this - * lcp-interval, this is simply the current suffix. - * - * Note that this overwrites the superinterval reference which - * was previously included in this lcp-interval data slot. - * Further visitations of this lcp-interval will detect that it - * is already visited and will follow the chain of - * most-recently-seen suffixes rather than ascend the tree - * directly. */ - intervals[interval] = (cur_pos << LCP_BITS) | lcp; - - /* Ascend to the superinterval of this lcp-interval. */ - interval = next_interval; - } - - /* We've already visited the current lcp-interval. */ - - /* Extract the LCP of this lcp-interval. */ - lcp = intervals[interval] & LCP_MASK; - - /* Extract the current match for this lcp-interval. This usually is the - * most-recently-seen suffix within this lcp-interval, but it may be - * outdated. */ - cur_match = intervals[interval] >> LCP_BITS; - - for (;;) { - /* If the LCP is shorter than the minimum length of matches to - * be produced, we're done, since the LCP will only ever get - * shorter from here. This also prevents ascending above the - * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval. */ - if (lcp == 0) - break; - - /* Advance the current match until the lcp of the *next* match - * is lower than the current lcp. When this is true we know - * that the current match is up to date (lowest offset / - * greatest position for that lcp). */ - - next_match = cur_match; - do { - next_interval = pos_data[next_match]; - next_lcp = intervals[next_interval] & LCP_MASK; - cur_match = next_match; - next_match = intervals[next_interval] >> LCP_BITS; - } while (next_lcp >= lcp); - - /* Link the current position into the match chain, discarding - * any skipped matches. */ - intervals[interval] = (cur_pos << LCP_BITS) | lcp; - pos_data[cur_match] = interval; - - if (record_matches) { - /* Record the match. 
*/ - matches[num_matches].length = lcp; - matches[num_matches].offset = cur_pos - cur_match; - num_matches++; - } - - /* Advance to the next match. */ - interval = next_interval; - lcp = next_lcp; - cur_match = next_match; - } - return num_matches; -} - -#undef GET_SA_ENTRY -#undef GET_LCP_ENTRY -#undef SET_LCP_ENTRY -#undef UNVISITED_TAG -- 2.43.0