lz_lcpit: pack SA and LCP into one array

[wimlib] / src / lz_lcp_interval_tree.c
diff --git a/src/lz_lcp_interval_tree.c b/src/lz_lcp_interval_tree.c

index 9584070f3d402e729bd52d15409ae43a396eee77..d48fe373176ff66ebed7b55d40f1ec22ebd1e2d6 100644 (file)
--- a/src/lz_lcp_interval_tree.c
+++ b/src/lz_lcp_interval_tree.c
@@ -53,19 +53,13 @@
  #define LZ_LCPIT_POS_BITS              (32 - 1 - LZ_LCPIT_LCP_BITS)
  #define LZ_LCPIT_MAX_WINDOW_SIZE       (1UL << LZ_LCPIT_POS_BITS)
  
+#define SA_and_LCP_LCP_SHIFT           (32 - LZ_LCPIT_LCP_BITS)
+#define SA_and_LCP_POS_MASK            (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
+
  struct lz_lcpit {
         struct lz_mf base;
  
-       /* Each of the arrays has length equal to the window size.  This
-        * therefore results in an additional memory usage of 12 bytes per
-        * position.  (That's compared to about 8 for binary trees or 4 for hash
-        * chains, for example.)
-        *
-        * We allocate these arrays in one contiguous block.  'SA' is first,
-        * 'intervals' is second, and 'pos_data' is third.  */
-
-       /* Pointer to the suffix array  */
-       u32 *SA;
+       u32 *mem;
  
         /* Mapping: lcp-interval index => lcp-interval data
          *
@@ -82,6 +76,47 @@ struct lz_lcpit {
         u32 *pos_data;
  };
  
+/*
+ * Build the LCP (Longest Common Prefix) array in linear time.
+ *
+ * LCP[r] will be the length of the longest common prefix between the suffixes
+ * with positions SA[r - 1] and  SA[r].  LCP[0] will be undefined.
+ *
+ * Algorithm taken from Kasai et al. (2001), but modified slightly:
+ *
+ *  - For decreased memory usage and improved memory locality, pack the two
+ *    logically distinct SA and LCP arrays into a single array SA_and_LCP.
+ *  - Cap the LCP values to the specified limit.
+ *  - With bytes there is no realistic way to reserve a unique symbol for
+ *    end-of-buffer, so use explicit checks for end-of-buffer.
+ *
+ * References:
+ *
+ *     Kasai et al.  2001.  Linear-Time Longest-Common-Prefix Computation in
+ *     Suffix Arrays and Its Applications.  CPM '01 Proceedings of the 12th
+ *     Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
+ */
+static void
+build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA,
+                const u8 * const restrict T, const u32 n, const u32 lcp_limit)
+{
+       u32 h, i, r, j, lim;
+
+       h = 0;
+       for (i = 0; i < n; i++) {
+               r = ISA[i];
+               if (r > 0) {
+                       j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK;
+                       lim = min(n - i, n - j);
+                       while (h < lim && T[i + h] == T[j + h])
+                               h++;
+                       SA_and_LCP[r] |= min(h, lcp_limit) << SA_and_LCP_LCP_SHIFT;
+                       if (h > 0)
+                               h--;
+               }
+       }
+}
+
  /*
   * Use the suffix array accompanied with the longest-common-prefix array --- in
   * other words, the "enhanced suffix array" --- to simulate a bottom-up
@@ -123,172 +158,79 @@ struct lz_lcpit {
   *     Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
   */
  static void
-build_LCPIT(const u32 SA[restrict], u32 LCP[restrict],
-           u32 pos_data[restrict], const u32 lcp_limit, const u32 n)
+build_LCPIT(const u32 * const restrict SA_and_LCP,
+           u32 * const restrict intervals, u32 * const restrict pos_data,
+           const u32 n)
  {
-       u32 *intervals = LCP;
-       u32 next_interval;
-       u32 incomplete_intervals[lcp_limit + 1];
-       u32 *cur_interval;
-       u32 prev_pos;
-
-       /* As we determine lcp-intervals, we assign each one an entry in
-        * 'intervals', overwriting LCP in the process.  Each such entry will
-        * contain the index in 'intervals' of the superinterval, along with the
-        * longest common prefix length that the suffixes in that interval
-        * share.
-        *
-        * Note: since we don't need its memory for anything, we don't overwrite
-        * the suffix array, even though this could potentially be done since
-        * it's not actually used during match-finding.  */
-
-       /* Process rank 0 as special case.  This creates the lcp-interval
-        * containing every suffix in the window.  */
-       prev_pos = SA[0];
-       intervals[0] = 0;
-       pos_data[prev_pos] = 0;
-       cur_interval = incomplete_intervals;
-       *cur_interval = 0;
-       next_interval = 1;
-
-       /* Iterate through each suffix array rank.  */
-       for (u32 r = 1; r < n; r++) {
-
-               /* Get the longest common prefix (lcp) between the suffixes with
-                * ranks r and r - 1.  But cap it to the lcp limit.  */
-               const u32 lcp = min(LCP[r], lcp_limit);
-
-               /* Convert rank => position using the suffix array.  */
-               const u32 pos = SA[r];
-
-               /* cur_interval points to the index of the deepest (highest lcp
-                * value) incomplete lcp-interval.  */
+       u32 next_interval_idx = 0;
+       u32 open_intervals[LZ_LCPIT_LCP_MAX + 1];
+       u32 *top = open_intervals;
+       u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK;
  
-               /*
-                * There are three cases:
-                *
-                * (1) The lcp stayed the same as the previous value.  Place the
-                * current suffix in cur_interval.  (This placement is
-                * tentative, because if LCP increases at the next rank, this
-                * suffix could still be placed in the resulting new LCP
-                * interval instead.)  cur_interval remains unchanged.
-                *
-                * (2) The lcp increased from the previous value.  This signals
-                * the beginning of a new lcp-interval.  Create it and push it
-                * onto the stack of incomplete intervals.  But since lcp is
-                * defined in terms of the longest prefix between this suffix
-                * and the *previous* ranked suffix, the new lcp-interval
-                * actually should have begun at the *previous* ranked suffix.
-                * Therefore, we need to set both pos_data[pos] and
-                * pos_data[prev_pos] to refer to the new interval.
-                *
-                * (3) The lcp decreased from the previous value.  This signals
-                * the termination of at least one lcp-interval.  Pop the stack,
-                * finalizing the lcp-intervals, until the current lcp is at
-                * least as large as the lcp associated with cur_interval.
-                * Then, if the current lcp is equal to the lcp associated with
-                * cur_interval, place the current suffix in cur_interval,
-                * similar to case (1).  Else, create a new lcp-interval,
-                * similar to case (2).
-                */
-
-               if (lcp == (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
-
-                       /* Case (1) */
-
-                       pos_data[pos] = *cur_interval;
-
-               } else if (lcp > (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK)) {
-
-                       /* Case (2) */
-
-                       intervals[next_interval] = lcp | 0x80000000;
-                       pos_data[prev_pos] = next_interval;
-                       pos_data[pos] = next_interval;
-                       *++cur_interval = next_interval++;
+       /* The interval with lcp=0 covers the entire array.  It remains open
+        * until the end.  */
+       *top = next_interval_idx;
+       intervals[next_interval_idx] = 0;
+       next_interval_idx++;
  
+       for (u32 r = 1; r < n; r++) {
+               u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK;
+               u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT;
+               u32 top_lcp = intervals[*top];
+
+               if (next_lcp == top_lcp) {
+                       /* continuing the deepest open interval  */
+                       pos_data[prev_pos] = *top;
+               } else if (next_lcp > top_lcp) {
+                       /* opening a new interval  */
+                       intervals[next_interval_idx] = next_lcp;
+                       *++top = next_interval_idx;
+                       pos_data[prev_pos] = next_interval_idx;
+                       next_interval_idx++;
                 } else {
-
-                       /* Case (3) */
-
-                       u32 interval;
-                       u32 superinterval;
-
+                       /* closing the deepest open interval  */
+                       pos_data[prev_pos] = *top;
                         for (;;) {
-                               /* Examine the deepest incomplete lcp-interval
-                                * and its superinterval.  */
-
-                               interval = *cur_interval;
-                               superinterval = *--cur_interval;
-
-                               if (lcp >= (intervals[superinterval] &
-                                           LZ_LCPIT_LCP_MASK))
+                               u32 closed_interval_idx = *top;
+                               u32 superinterval_idx = *--top;
+                               u32 superinterval_lcp = intervals[superinterval_idx];
+
+                               if (next_lcp == superinterval_lcp) {
+                                       /* continuing the superinterval */
+                                       intervals[closed_interval_idx] |=
+                                               (superinterval_idx << LZ_LCPIT_LCP_BITS) |
+                                                       0x80000000;
                                         break;
-
-                               /* The current suffix can't go in either of
-                                * them.  Therefore we're visiting 'interval'
-                                * for the last time and finalizing its
-                                * membership in 'superinterval'.  */
-
-                               intervals[interval] |=
-                                       (superinterval << LZ_LCPIT_LCP_BITS);
-                       }
-
-                       /* The current suffix can't go in 'interval', but it can
-                        * go in 'superinterval'.  */
-
-                       if (lcp > (intervals[superinterval] & LZ_LCPIT_LCP_MASK)) {
-                               /* Creating a new lcp-interval that is a
-                                * superinterval of 'interval' but a subinterval
-                                * of 'superinterval'.
-                                *
-                                * Example: with the LCP arrayl
-                                *
-                                *            2  2  2  4  4  3
-                                *
-                                * then we will execute this case when
-                                * processing the LCP value 3.  The LCP
-                                * intervals will be:
-                                *
-                                *            2  2  2  4  4  3
-                                * (lcp=0):  |                |
-                                * (lcp=2):  |                |
-                                * (lcp=3):        |          |
-                                * (lcp=4):        |       |
-                                *
-                                * Note that the 3-interval (the one being
-                                * created by this code) is a superinterval of
-                                * the 4-interval (which already existed)!  But
-                                * we don't need to re-assign pos_data values in
-                                * the 4-interval because they point to the
-                                * deepest interval which contains them, which
-                                * is the 4-interval.  */
-
-                               intervals[next_interval] = lcp | 0x80000000;
-                               intervals[interval] |=
-                                       (next_interval << LZ_LCPIT_LCP_BITS);
-                               pos_data[pos] = next_interval;
-                               *++cur_interval = next_interval++;
-                       } else {
-                               /* Finishing 'interval', continuing with
-                                * 'superinterval'.  */
-
-                               intervals[interval] |=
-                                       (superinterval << LZ_LCPIT_LCP_BITS);
-                               pos_data[pos] = superinterval;
+                               } else if (next_lcp > superinterval_lcp) {
+                                       /* creating a new interval that is a
+                                        * superinterval of the one being
+                                        * closed, but still a subinterval of
+                                        * its superinterval  */
+                                       intervals[next_interval_idx] = next_lcp;
+                                       *++top = next_interval_idx;
+                                       intervals[closed_interval_idx] |=
+                                               (next_interval_idx << LZ_LCPIT_LCP_BITS) |
+                                                       0x80000000;
+                                       next_interval_idx++;
+                                       break;
+                               } else {
+                                       /* also closing the superinterval  */
+                                       intervals[closed_interval_idx] |=
+                                               (superinterval_idx << LZ_LCPIT_LCP_BITS) |
+                                                       0x80000000;
+                               }
                         }
                 }
-
-               /* Remember this suffix index when processing the next-ranked
-                * suffix.  */
-               prev_pos = pos;
+               prev_pos = next_pos;
         }
  
-       /* Finalize any still-incomplete lcp-intervals.  */
-       while (intervals[*cur_interval] & LZ_LCPIT_LCP_MASK) {
-               intervals[*cur_interval] |=
-                       (*(cur_interval - 1) << LZ_LCPIT_LCP_BITS);
-               cur_interval--;
+       /* close any still-open intervals  */
+       pos_data[prev_pos] = *top;
+       while (top > open_intervals) {
+               u32 closed_interval_idx = *top;
+               u32 superinterval_idx = *--top;
+               intervals[closed_interval_idx] |=
+                       (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000;
         }
  }
  
@@ -340,27 +282,22 @@ lz_lcpit_init(struct lz_mf *_mf)
  
         lz_lcpit_set_default_params(&mf->base.params);
  
-       mf->SA = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
-       if (!mf->SA)
-               return false;
-
-       return true;
+       mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size));
+       return (mf->mem != NULL);
  }
  
  static void
  lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n)
  {
         struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
-       u32 *mem = mf->SA;
-
-       build_SA(&mem[0 * n], T, n, &mem[1 * n]);
-       build_ISA(&mem[2 * n], &mem[0 * n], n);
-       build_LCP(&mem[1 * n], &mem[0 * n], &mem[2 * n], T, n);
-       build_LCPIT(&mem[0 * n], &mem[1 * n], &mem[2 * n],
-                   mf->base.params.nice_match_len, n);
-       mf->SA = &mem[0 * n];
-       mf->intervals = &mem[1 * n];
-       mf->pos_data = &mem[2 * n];
+
+       build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]);
+       build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n);
+       build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n,
+                        mf->base.params.nice_match_len);
+       build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n);
+       mf->intervals = &mf->mem[1 * n];
+       mf->pos_data = &mf->mem[2 * n];
  }
  
  static u32
@@ -561,7 +498,7 @@ lz_lcpit_destroy(struct lz_mf *_mf)
  {
         struct lz_lcpit *mf = (struct lz_lcpit *)_mf;
  
-       FREE(mf->SA);
+       FREE(mf->mem);
  }
  
  const struct lz_mf_ops lz_lcp_interval_tree_ops = {