Stop force-inlining everything marked 'inline'

[wimlib] / src / lcpit_matchfinder.c
diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c

index dd00979e619df1f5ce8d80e6eb59904f34a896ae..8b9ffd9d882fa5e9e1559b3a20222561378f0dd3 100644 (file)
--- a/src/lcpit_matchfinder.c
+++ b/src/lcpit_matchfinder.c
@@ -1,51 +1,556 @@
  /*
- * lcpit_matchfinder.h
+ * lcpit_matchfinder.c
   *
   * A match-finder for Lempel-Ziv compression based on bottom-up construction and
   * traversal of the Longest Common Prefix (LCP) interval tree.
   *
- * Author:     Eric Biggers
- * Year:       2014, 2015
+ * The following copying information applies to this specific source code file:
   *
- * The author dedicates this file to the public domain.
- * You can do whatever you want with this file.
+ * Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
   */
  
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
  #include <limits.h>
-#include <string.h>
  
  #include "wimlib/divsufsort.h"
  #include "wimlib/lcpit_matchfinder.h"
  #include "wimlib/util.h"
  
  #define LCP_BITS               6
-#define LCP_MASK               ((1 << LCP_BITS) - 1)
-#define LCP_MAX                        LCP_MASK
-#define NORMAL_UNVISITED_TAG   ((u32)1 << 31)
-#define MAX_NORMAL_BUFSIZE     ((u32)1 << (31 - LCP_BITS))
-#define HUGE_UNVISITED_TAG     ((u64)1 << 63)
-#define SA_and_LCP_LCP_SHIFT   (32 - LCP_BITS)
-#define SA_and_LCP_POS_MASK    (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
+#define LCP_MAX                        (((u32)1 << LCP_BITS) - 1)
+#define LCP_SHIFT              (32 - LCP_BITS)
+#define LCP_MASK               (LCP_MAX << LCP_SHIFT)
+#define POS_MASK               (((u32)1 << (32 - LCP_BITS)) - 1)
+#define MAX_NORMAL_BUFSIZE     (POS_MASK + 1)
+
+#define HUGE_LCP_BITS          7
+#define HUGE_LCP_MAX           (((u32)1 << HUGE_LCP_BITS) - 1)
+#define HUGE_LCP_SHIFT         (64 - HUGE_LCP_BITS)
+#define HUGE_LCP_MASK          ((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
+#define HUGE_POS_MASK          0xFFFFFFFF
+#define MAX_HUGE_BUFSIZE       ((u64)HUGE_POS_MASK + 1)
+#define HUGE_UNVISITED_TAG     0x100000000
+
+#define PREFETCH_SAFETY                5
  
  /*
- * Include the template header to define the functions build_LCP(),
- * build_LCPIT(), and lcpit_advance_one_byte().  There are "normal" and "huge"
- * versions of each function.  The normal versions assume that a buffer position
- * and LCP value can be packed into a 32-bit integer, whereas the huge versions
- * assume that 64 bits is needed.
- *
- * Both versions cap LCP values to 6 bits. This limits the depth of the
- * lcp-interval tree without hurting the compression ratio too much.  Matches of
- * length 63 are sufficiently long that the compression ratio doesn't change
- * significantly if we choose one such match over another.
+ * Build the LCP (Longest Common Prefix) array in linear time.
+ *
+ * LCP[r] will be the length of the longest common prefix between the suffixes
+ * with positions SA[r - 1] and SA[r].  LCP[0] will be undefined.
+ *
+ * Algorithm taken from Kasai et al. (2001), but modified slightly:
+ *
+ *  - With bytes there is no realistic way to reserve a unique symbol for
+ *    end-of-buffer, so use explicit checks for end-of-buffer.
+ *
+ *  - For decreased memory usage and improved memory locality, pack the two
+ *    logically distinct SA and LCP arrays into a single array SA_and_LCP.
+ *
+ *  - Since SA_and_LCP is accessed randomly, improve the cache behavior by
+ *    reading several entries ahead in ISA and prefetching the upcoming
+ *    SA_and_LCP entry.
+ *
+ *  - If an LCP value is less than the minimum match length, then store 0.  This
+ *    avoids having to do comparisons against the minimum match length later.
+ *
+ *  - If an LCP value is greater than the "nice match length", then store the
+ *    "nice match length".  This caps the number of bits needed to store each
+ *    LCP value, and this caps the depth of the LCP-interval tree, without
+ *    usually hurting the compression ratio too much.
+ *
+ * References:
+ *
+ *     Kasai et al.  2001.  Linear-Time Longest-Common-Prefix Computation in
+ *     Suffix Arrays and Its Applications.  CPM '01 Proceedings of the 12th
+ *     Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
   */
-#define HUGE_MODE 1
-#include "lcpit_matchfinder_templates.h"
-#undef HUGE_MODE
+static void
+build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
+         const u8 T[restrict], const u32 n,
+         const u32 min_lcp, const u32 max_lcp)
+{
+       u32 h = 0;
+       for (u32 i = 0; i < n; i++) {
+               const u32 r = ISA[i];
+               prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
+               if (r > 0) {
+                       const u32 j = SA_and_LCP[r - 1] & POS_MASK;
+                       const u32 lim = min(n - i, n - j);
+                       while (h < lim && T[i + h] == T[j + h])
+                               h++;
+                       u32 stored_lcp = h;
+                       if (stored_lcp < min_lcp)
+                               stored_lcp = 0;
+                       else if (stored_lcp > max_lcp)
+                               stored_lcp = max_lcp;
+                       SA_and_LCP[r] |= stored_lcp << LCP_SHIFT;
+                       if (h > 0)
+                               h--;
+               }
+       }
+}
  
-#define HUGE_MODE 0
-#include "lcpit_matchfinder_templates.h"
-#undef HUGE_MODE
+/*
+ * Use the suffix array accompanied with the longest-common-prefix array ---
+ * which in combination can be called the "enhanced suffix array" --- to
+ * simulate a bottom-up traversal of the corresponding suffix tree, or
+ * equivalently the lcp-interval tree.  Do so in suffix rank order, but save the
+ * superinterval references needed for later bottom-up traversal of the tree in
+ * suffix position order.
+ *
+ * To enumerate the lcp-intervals, this algorithm scans the suffix array and its
+ * corresponding LCP array linearly.  While doing so, it maintains a stack of
+ * lcp-intervals that are currently open, meaning that their left boundaries
+ * have been seen but their right boundaries have not.  The bottom of the stack
+ * is the interval which covers the entire suffix array (this has lcp=0), and
+ * the top of the stack is the deepest interval that is currently open (this has
+ * the greatest lcp of any interval on the stack).  When this algorithm opens an
+ * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto
+ * the stack.  When this algorithm closes an interval, it pops it from the stack
+ * and sets the intervals[] entry of that interval to the index and lcp of that
+ * interval's superinterval, which is the new top of the stack.
+ *
+ * This algorithm also set pos_data[pos] for each suffix position 'pos' to the
+ * index and lcp of the deepest lcp-interval containing it.  Alternatively, we
+ * can interpret each suffix as being associated with a singleton lcp-interval,
+ * or leaf of the suffix tree.  With this interpretation, an entry in pos_data[]
+ * is the superinterval reference for one of these singleton lcp-intervals and
+ * therefore is not fundamentally different from an entry in intervals[].
+ *
+ * To reduce memory usage, this algorithm re-uses the suffix array's memory to
+ * store the generated intervals[] array.  This is possible because SA and LCP
+ * are accessed linearly, and no more than one interval is generated per suffix.
+ *
+ * The techniques used in this algorithm are described in various published
+ * papers.  The generation of lcp-intervals from the suffix array (SA) and the
+ * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in
+ * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in
+ * Abouelhoda et al. (2004).  Both these papers note the equivalence between
+ * lcp-intervals (including the singleton lcp-interval for each suffix) and
+ * nodes of the suffix tree.  Abouelhoda et al. (2004) furthermore applies
+ * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as
+ * does Chen at al. (2008).  Algorithm CPS1b of Chen et al. (2008) dynamically
+ * re-uses the suffix array during bottom-up traversal of the lcp-interval tree.
+ *
+ * References:
+ *
+ *     Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
+ *     Arrays and Its Applications.  2001.  CPM '01 Proceedings of the 12th
+ *     Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
+ *
+ *     M.I. Abouelhoda, S. Kurtz, E. Ohlebusch.  2004.  Replacing Suffix Trees
+ *     With Enhanced Suffix Arrays.  Journal of Discrete Algorithms Volume 2
+ *     Issue 1, March 2004, pp. 53-86.
+ *
+ *     G. Chen, S.J. Puglisi, W.F. Smyth.  2008.  Lempel-Ziv Factorization
+ *     Using Less Time & Space.  Mathematics in Computer Science June 2008,
+ *     Volume 1, Issue 4, pp. 605-623.
+ */
+static void
+build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
+{
+       u32 * const SA_and_LCP = intervals;
+       u32 next_interval_idx;
+       u32 open_intervals[LCP_MAX + 1];
+       u32 *top = open_intervals;
+       u32 prev_pos = SA_and_LCP[0] & POS_MASK;
+
+       *top = 0;
+       intervals[0] = 0;
+       next_interval_idx = 1;
+
+       for (u32 r = 1; r < n; r++) {
+               const u32 next_pos = SA_and_LCP[r] & POS_MASK;
+               const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
+               const u32 top_lcp = *top & LCP_MASK;
+
+               prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
+
+               if (next_lcp == top_lcp) {
+                       /* Continuing the deepest open interval  */
+                       pos_data[prev_pos] = *top;
+               } else if (next_lcp > top_lcp) {
+                       /* Opening a new interval  */
+                       *++top = next_lcp | next_interval_idx++;
+                       pos_data[prev_pos] = *top;
+               } else {
+                       /* Closing the deepest open interval  */
+                       pos_data[prev_pos] = *top;
+                       for (;;) {
+                               const u32 closed_interval_idx = *top-- & POS_MASK;
+                               const u32 superinterval_lcp = *top & LCP_MASK;
+
+                               if (next_lcp == superinterval_lcp) {
+                                       /* Continuing the superinterval */
+                                       intervals[closed_interval_idx] = *top;
+                                       break;
+                               } else if (next_lcp > superinterval_lcp) {
+                                       /* Creating a new interval that is a
+                                        * superinterval of the one being
+                                        * closed, but still a subinterval of
+                                        * its superinterval  */
+                                       *++top = next_lcp | next_interval_idx++;
+                                       intervals[closed_interval_idx] = *top;
+                                       break;
+                               } else {
+                                       /* Also closing the superinterval  */
+                                       intervals[closed_interval_idx] = *top;
+                               }
+                       }
+               }
+               prev_pos = next_pos;
+       }
+
+       /* Close any still-open intervals.  */
+       pos_data[prev_pos] = *top;
+       for (; top > open_intervals; top--)
+               intervals[*top & POS_MASK] = *(top - 1);
+}
+
+/*
+ * Advance the LCP-interval tree matchfinder by one byte.
+ *
+ * If @record_matches is true, then matches are written to the @matches array
+ * sorted by strictly decreasing length and strictly decreasing offset, and the
+ * return value is the number of matches found.  Otherwise, @matches is ignored
+ * and the return value is always 0.
+ *
+ * How this algorithm works:
+ *
+ * 'cur_pos' is the position of the current suffix, which is the suffix being
+ * matched against.  'cur_pos' starts at 0 and is incremented each time this
+ * function is called.  This function finds each suffix with position less than
+ * 'cur_pos' that shares a prefix with the current suffix, but for each distinct
+ * prefix length it finds only the suffix with the greatest position (i.e. the
+ * most recently seen in the linear traversal by position).  This function
+ * accomplishes this using the lcp-interval tree data structure that was built
+ * by build_LCPIT() and is updated dynamically by this function.
+ *
+ * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
+ * value of the deepest lcp-interval containing the current suffix --- or,
+ * equivalently, the parent of the conceptual singleton lcp-interval that
+ * contains the current suffix.
+ *
+ * The second step is to ascend the lcp-interval tree until reaching an interval
+ * that has not yet been visited, and link the intervals to the current suffix
+ * along the way.   An lcp-interval has been visited if and only if it has been
+ * linked to a suffix.  Initially, no lcp-intervals are linked to suffixes.
+ *
+ * The third step is to continue ascending the lcp-interval tree, but indirectly
+ * through suffix links rather than through the original superinterval
+ * references, and continuing to form links with the current suffix along the
+ * way.  Each suffix visited during this step, except in a special case to
+ * handle outdated suffixes, is a match which can be written to matches[].  Each
+ * intervals[] entry contains the position of the next suffix to visit, which we
+ * shall call 'match_pos'; this is the most recently seen suffix that belongs to
+ * that lcp-interval.  'pos_data[match_pos]' then contains the lcp and interval
+ * index of the next lcp-interval that should be visited.
+ *
+ * We can view these arrays as describing a new set of links that gets overlaid
+ * on top of the original superinterval references of the lcp-interval tree.
+ * Each such link can connect a node of the lcp-interval tree to an ancestor
+ * more than one generation removed.
+ *
+ * For each one-byte advance, the current position becomes the most recently
+ * seen suffix for a continuous sequence of lcp-intervals from a leaf interval
+ * to the root interval.  Conceptually, this algorithm needs to update all these
+ * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null"
+ * link.  But actually, if some of these nodes have the same most recently seen
+ * suffix, then this algorithm just visits the pos_data[] entry for that suffix
+ * and skips over all these nodes in one step.  Updating the extra nodes is
+ * accomplished by creating a redirection from the previous suffix to the
+ * current suffix.
+ *
+ * Using this shortcutting scheme, it's possible for a suffix to become out of
+ * date, which means that it is no longer the most recently seen suffix for the
+ * lcp-interval under consideration.  This case is detected by noticing when the
+ * next lcp-interval link actually points deeper in the tree, and it is worked
+ * around by just continuing until we get to a link that actually takes us
+ * higher in the tree.  This can be described as a lazy-update scheme.
+ */
+static forceinline u32
+lcpit_advance_one_byte(const u32 cur_pos,
+                      u32 pos_data[restrict],
+                      u32 intervals[restrict],
+                      u32 next[restrict],
+                      struct lz_match matches[restrict],
+                      const bool record_matches)
+{
+       u32 ref;
+       u32 super_ref;
+       u32 match_pos;
+       struct lz_match *matchptr;
+
+       /* Get the deepest lcp-interval containing the current suffix. */
+       ref = pos_data[cur_pos];
+
+       /* Prefetch upcoming data, up to 3 positions ahead.  Assume the
+        * intervals are already visited.  */
+
+       /* Prefetch the superinterval via a suffix link for the deepest
+        * lcp-interval containing the suffix starting 1 position from now.  */
+       prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
+
+       /* Prefetch suffix link for the deepest lcp-interval containing the
+        * suffix starting 2 positions from now.  */
+       next[0] = intervals[next[1]] & POS_MASK;
+       prefetchw(&pos_data[next[0]]);
+
+       /* Prefetch the deepest lcp-interval containing the suffix starting 3
+        * positions from now.  */
+       next[1] = pos_data[cur_pos + 3] & POS_MASK;
+       prefetchw(&intervals[next[1]]);
+
+       /* There is no "next suffix" after the current one.  */
+       pos_data[cur_pos] = 0;
+
+       /* Ascend until we reach a visited interval, the root, or a child of the
+        * root.  Link unvisited intervals to the current suffix as we go.  */
+       while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+               intervals[ref & POS_MASK] = cur_pos;
+               ref = super_ref;
+       }
+
+       if (super_ref == 0) {
+               /* In this case, the current interval may be any of:
+                * (1) the root;
+                * (2) an unvisited child of the root;
+                * (3) an interval last visited by suffix 0
+                *
+                * We could avoid the ambiguity with (3) by using an lcp
+                * placeholder value other than 0 to represent "visited", but
+                * it's fastest to use 0.  So we just don't allow matches with
+                * position 0.  */
+
+               if (ref != 0)  /* Not the root?  */
+                       intervals[ref & POS_MASK] = cur_pos;
+               return 0;
+       }
+
+       /* Ascend indirectly via pos_data[] links.  */
+       match_pos = super_ref;
+       matchptr = matches;
+       for (;;) {
+               while ((super_ref = pos_data[match_pos]) > ref)
+                       match_pos = intervals[super_ref & POS_MASK];
+               intervals[ref & POS_MASK] = cur_pos;
+               pos_data[match_pos] = ref;
+               if (record_matches) {
+                       matchptr->length = ref >> LCP_SHIFT;
+                       matchptr->offset = cur_pos - match_pos;
+                       matchptr++;
+               }
+               if (super_ref == 0)
+                       break;
+               ref = super_ref;
+               match_pos = intervals[ref & POS_MASK];
+       }
+       return matchptr - matches;
+}
+
+/* Expand SA from 32 bits to 64 bits.  */
+static void
+expand_SA(void *p, u32 n)
+{
+       typedef u32 _may_alias_attribute aliased_u32_t;
+       typedef u64 _may_alias_attribute aliased_u64_t;
+
+       aliased_u32_t *SA = p;
+       aliased_u64_t *SA64 = p;
+
+       u32 r = n - 1;
+       do {
+               SA64[r] = SA[r];
+       } while (r--);
+}
+
+/* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE.  */
+static void
+build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
+              const u8 T[restrict], const u32 n,
+              const u32 min_lcp, const u32 max_lcp)
+{
+       u32 h = 0;
+       for (u32 i = 0; i < n; i++) {
+               const u32 r = ISA[i];
+               prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
+               if (r > 0) {
+                       const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
+                       const u32 lim = min(n - i, n - j);
+                       while (h < lim && T[i + h] == T[j + h])
+                               h++;
+                       u32 stored_lcp = h;
+                       if (stored_lcp < min_lcp)
+                               stored_lcp = 0;
+                       else if (stored_lcp > max_lcp)
+                               stored_lcp = max_lcp;
+                       SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT;
+                       if (h > 0)
+                               h--;
+               }
+       }
+}
+
+/*
+ * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE.
+ *
+ * This "huge" version is also slightly different in that the lcp value stored
+ * in each intervals[] entry is the lcp value for that interval, not its
+ * superinterval.  This lcp value stays put in intervals[] and doesn't get moved
+ * to pos_data[] during lcpit_advance_one_byte_huge().  One consequence of this
+ * is that we have to use a special flag to distinguish visited from unvisited
+ * intervals.  But overall, this scheme keeps the memory usage at 12n instead of
+ * 16n.  (The non-huge version is 8n.)
+ */
+static void
+build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
+{
+       u64 * const SA_and_LCP64 = intervals64;
+       u32 next_interval_idx;
+       u32 open_intervals[HUGE_LCP_MAX + 1];
+       u32 *top = open_intervals;
+       u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK;
+
+       *top = 0;
+       intervals64[0] = 0;
+       next_interval_idx = 1;
+
+       for (u32 r = 1; r < n; r++) {
+               const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
+               const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
+               const u64 top_lcp = intervals64[*top];
+
+               prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
+
+               if (next_lcp == top_lcp) {
+                       /* Continuing the deepest open interval  */
+                       pos_data[prev_pos] = *top;
+               } else if (next_lcp > top_lcp) {
+                       /* Opening a new interval  */
+                       intervals64[next_interval_idx] = next_lcp;
+                       pos_data[prev_pos] = next_interval_idx;
+                       *++top = next_interval_idx++;
+               } else {
+                       /* Closing the deepest open interval  */
+                       pos_data[prev_pos] = *top;
+                       for (;;) {
+                               const u32 closed_interval_idx = *top--;
+                               const u64 superinterval_lcp = intervals64[*top];
+
+                               if (next_lcp == superinterval_lcp) {
+                                       /* Continuing the superinterval */
+                                       intervals64[closed_interval_idx] |=
+                                               HUGE_UNVISITED_TAG | *top;
+                                       break;
+                               } else if (next_lcp > superinterval_lcp) {
+                                       /* Creating a new interval that is a
+                                        * superinterval of the one being
+                                        * closed, but still a subinterval of
+                                        * its superinterval  */
+                                       intervals64[next_interval_idx] = next_lcp;
+                                       intervals64[closed_interval_idx] |=
+                                               HUGE_UNVISITED_TAG | next_interval_idx;
+                                       *++top = next_interval_idx++;
+                                       break;
+                               } else {
+                                       /* Also closing the superinterval  */
+                                       intervals64[closed_interval_idx] |=
+                                               HUGE_UNVISITED_TAG | *top;
+                               }
+                       }
+               }
+               prev_pos = next_pos;
+       }
+
+       /* Close any still-open intervals.  */
+       pos_data[prev_pos] = *top;
+       for (; top > open_intervals; top--)
+               intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1);
+}
+
+/* Like lcpit_advance_one_byte(), but for buffers larger than
+ * MAX_NORMAL_BUFSIZE.  */
+static forceinline u32
+lcpit_advance_one_byte_huge(const u32 cur_pos,
+                           u32 pos_data[restrict],
+                           u64 intervals64[restrict],
+                           u32 prefetch_next[restrict],
+                           struct lz_match matches[restrict],
+                           const bool record_matches)
+{
+       u32 interval_idx;
+       u32 next_interval_idx;
+       u64 cur;
+       u64 next;
+       u32 match_pos;
+       struct lz_match *matchptr;
+
+       interval_idx = pos_data[cur_pos];
+
+       prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
+
+       prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
+       prefetchw(&pos_data[prefetch_next[0]]);
+
+       prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
+       prefetchw(&intervals64[prefetch_next[1]]);
+
+       pos_data[cur_pos] = 0;
+
+       while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
+               intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
+               interval_idx = next & HUGE_POS_MASK;
+       }
+
+       matchptr = matches;
+       while (next & HUGE_LCP_MASK) {
+               cur = next;
+               do {
+                       match_pos = next & HUGE_POS_MASK;
+                       next_interval_idx = pos_data[match_pos];
+                       next = intervals64[next_interval_idx];
+               } while (next > cur);
+               intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
+               pos_data[match_pos] = interval_idx;
+               if (record_matches) {
+                       matchptr->length = cur >> HUGE_LCP_SHIFT;
+                       matchptr->offset = cur_pos - match_pos;
+                       matchptr++;
+               }
+               interval_idx = next_interval_idx;
+       }
+       return matchptr - matches;
+}
+
+static forceinline u64
+get_pos_data_size(size_t max_bufsize)
+{
+       return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
+                       DIVSUFSORT_TMP_LEN) * sizeof(u32);
+}
+
+static forceinline u64
+get_intervals_size(size_t max_bufsize)
+{
+       return ((u64)max_bufsize + PREFETCH_SAFETY) *
+               (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
+}
  
  /*
   * Calculate the number of bytes of memory needed for the LCP-interval tree
@@ -58,19 +563,7 @@
  u64
  lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
  {
-       u64 size = 0;
-
-       /* pos_data (+1 is for prefetch) */
-       size += ((u64)max_bufsize + 1) * sizeof(u32);
-
-       /* intervals or intervals64  */
-       size += max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) *
-               (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
-
-       /* SA */
-       size += (u64)max_bufsize * sizeof(u32);
-
-       return size;
+       return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize);
  }
  
  /*
@@ -89,20 +582,20 @@ lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
  {
         if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
                 return false;
+       if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY)
+               return false;
  
-       mf->pos_data = MALLOC((max_bufsize + 1) * sizeof(u32));
-       mf->intervals = MALLOC(max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) *
-                              (max_bufsize <= MAX_NORMAL_BUFSIZE ?
-                               sizeof(u32) : sizeof(u64)));
-       mf->SA = MALLOC(max_bufsize * sizeof(u32));
-
-       if (!mf->pos_data || !mf->intervals || !mf->SA) {
+       mf->pos_data = MALLOC(get_pos_data_size(max_bufsize));
+       mf->intervals = MALLOC(get_intervals_size(max_bufsize));
+       if (!mf->pos_data || !mf->intervals) {
                 lcpit_matchfinder_destroy(mf);
                 return false;
         }
  
         mf->min_match_len = min_match_len;
-       mf->nice_match_len = min(nice_match_len, LCP_MAX);
+       mf->nice_match_len = min(nice_match_len,
+                                (max_bufsize <= MAX_NORMAL_BUFSIZE) ?
+                                LCP_MAX : HUGE_LCP_MAX);
         return true;
  }
  
@@ -133,11 +626,10 @@ lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
  static void
  build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
  {
-       /* Note: divsufsort() needs temporary space --- one array with 256
-        * spaces and one array with 65536 spaces.  The implementation of
-        * divsufsort() has been modified from the original to use the provided
-        * temporary space instead of allocating its own, since we don't want to
-        * have to deal with malloc() failures here.  */
+       /* Note: divsufsort() requires a fixed amount of temporary space.  The
+        * implementation of divsufsort() has been modified from the original to
+        * use the provided temporary space instead of allocating its own, since
+        * we don't want to have to deal with malloc() failures here.  */
         divsufsort(T, SA, n, tmp);
  }
  
@@ -159,42 +651,41 @@ build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
   *
   * @mf - the initialized matchfinder structure
   * @T - the input buffer
- * @n - size of the input buffer in bytes.  This may be at most the max_bufsize
- *     with which lcpit_matchfinder_init() was called.
+ * @n - size of the input buffer in bytes.  This must be nonzero and can be at
+ *     most the max_bufsize with which lcpit_matchfinder_init() was called.
   */
  void
  lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
  {
-       if (n == 0)
-               return;
+       /* intervals[] temporarily stores SA and LCP packed together.
+        * pos_data[] temporarily stores ISA.
+        * pos_data[] is also used as the temporary space for divsufsort().  */
  
-       build_SA(mf->SA, T, n, mf->intervals);
-       build_ISA(mf->pos_data, mf->SA, n);
+       build_SA(mf->intervals, T, n, mf->pos_data);
+       build_ISA(mf->pos_data, mf->intervals, n);
         if (n <= MAX_NORMAL_BUFSIZE) {
-               /* "Normal" sized buffer  */
-
-               /* Build LCP, packing it into ->SA  */
-               build_LCP_normal(mf->SA, mf->pos_data, T, n,
-                                mf->min_match_len, mf->nice_match_len);
-               /* Prepare ->intervals and ->pos_data  */
-               build_LCPIT_normal(mf->SA, mf->intervals, mf->pos_data, n);
+               for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+                       mf->intervals[n + i] = 0;
+                       mf->pos_data[n + i] = 0;
+               }
+               build_LCP(mf->intervals, mf->pos_data, T, n,
+                         mf->min_match_len, mf->nice_match_len);
+               build_LCPIT(mf->intervals, mf->pos_data, n);
                 mf->huge_mode = false;
         } else {
-               /* "Huge" sized buffer  */
-
-               /* Build LCP in the second half of ->intervals64.  It may be
-                * partially overwritten in build_LCPIT_huge(), but this is okay
-                * since each LCP entry is guaranteed to be consumed before it
-                * can possibly be overwritten.  */
-               build_LCP_huge(mf->intervals + n, mf->SA, mf->pos_data, T, n,
+               for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+                       mf->intervals64[n + i] = 0;
+                       mf->pos_data[n + i] = 0;
+               }
+               expand_SA(mf->intervals, n);
+               build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
                                mf->min_match_len, mf->nice_match_len);
-               /* Prepare ->intervals64 and ->pos_data  */
-               build_LCPIT_huge(mf->SA, mf->intervals + n, mf->intervals64,
-                                mf->pos_data, n);
+               build_LCPIT_huge(mf->intervals64, mf->pos_data, n);
                 mf->huge_mode = true;
         }
         mf->cur_pos = 0; /* starting at beginning of input buffer  */
-       mf->pos_data[n] = 0; /* safety entry for prefetch() overrun  */
+       for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
+               mf->next[i] = 0;
  }
  
  /*
@@ -205,37 +696,39 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
   *
   * The return value is the number of matches found and written to @matches.
   * This can be any value in [0, nice_match_len - min_match_len + 1].
- *
- * If the caller attempts to advance beyond the end of the input buffer, the
- * behavior is undefined.
   */
  u32
  lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
                               struct lz_match *matches)
  {
         if (mf->huge_mode)
-               return lcpit_advance_one_byte_huge(mf, matches, true);
+               return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
+                                                  mf->intervals64, mf->next,
+                                                  matches, true);
         else
-               return lcpit_advance_one_byte_normal(mf, matches, true);
+               return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
+                                             mf->intervals, mf->next,
+                                             matches, true);
  }
  
  /*
   * Skip the next @count bytes (don't search for matches at them).  @count is
   * assumed to be > 0.
- *
- * If the caller attempts to advance beyond the end of the input buffer, the
- * behavior is undefined.
   */
  void
  lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
  {
         if (mf->huge_mode) {
                 do {
-                       lcpit_advance_one_byte_huge(mf, NULL, false);
+                       lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
+                                                   mf->intervals64, mf->next,
+                                                   NULL, false);
                 } while (--count);
         } else {
                 do {
-                       lcpit_advance_one_byte_normal(mf, NULL, false);
+                       lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
+                                              mf->intervals, mf->next,
+                                              NULL, false);
                 } while (--count);
         }
  }
@@ -243,14 +736,10 @@ lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
  /*
   * Destroy an LCP-interval tree matchfinder that was previously initialized with
   * lcpit_matchfinder_init().
- *
- * If the struct has been zeroed out, this has no effect.
   */
  void
  lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)
  {
         FREE(mf->pos_data);
         FREE(mf->intervals);
-       FREE(mf->SA);
-       memset(mf, 0, sizeof(*mf));
  }