]> wimlib.net Git - wimlib/blobdiff - src/lcpit_matchfinder.c
Stop force-inlining everything marked 'inline'
[wimlib] / src / lcpit_matchfinder.c
index b8144559147dcf78a8fe84a26c112edc6b8307b1..8b9ffd9d882fa5e9e1559b3a20222561378f0dd3 100644 (file)
@@ -4,11 +4,21 @@
  * A match-finder for Lempel-Ziv compression based on bottom-up construction and
  * traversal of the Longest Common Prefix (LCP) interval tree.
  *
- * Author:     Eric Biggers
- * Year:       2014, 2015
+ * The following copying information applies to this specific source code file:
  *
- * The author dedicates this file to the public domain.
- * You can do whatever you want with this file.
+ * Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 
 #ifdef HAVE_CONFIG_H
 
 #define LCP_BITS               6
 #define LCP_MAX                        (((u32)1 << LCP_BITS) - 1)
-#define POS_MASK               (((u32)1 << (32 - LCP_BITS)) - 1)
 #define LCP_SHIFT              (32 - LCP_BITS)
 #define LCP_MASK               (LCP_MAX << LCP_SHIFT)
+#define POS_MASK               (((u32)1 << (32 - LCP_BITS)) - 1)
 #define MAX_NORMAL_BUFSIZE     (POS_MASK + 1)
 
 #define HUGE_LCP_BITS          7
 #define HUGE_LCP_MAX           (((u32)1 << HUGE_LCP_BITS) - 1)
 #define HUGE_LCP_SHIFT         (64 - HUGE_LCP_BITS)
+#define HUGE_LCP_MASK          ((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
 #define HUGE_POS_MASK          0xFFFFFFFF
 #define MAX_HUGE_BUFSIZE       ((u64)HUGE_POS_MASK + 1)
 #define HUGE_UNVISITED_TAG     0x100000000
@@ -77,7 +88,7 @@ build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
        u32 h = 0;
        for (u32 i = 0; i < n; i++) {
                const u32 r = ISA[i];
-               prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
+               prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
                if (r > 0) {
                        const u32 j = SA_and_LCP[r - 1] & POS_MASK;
                        const u32 lim = min(n - i, n - j);
@@ -166,22 +177,24 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
 
        for (u32 r = 1; r < n; r++) {
                const u32 next_pos = SA_and_LCP[r] & POS_MASK;
-               const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT;
-               const u32 top_lcp = *top >> LCP_SHIFT;
+               const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
+               const u32 top_lcp = *top & LCP_MASK;
+
+               prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
 
                if (next_lcp == top_lcp) {
                        /* Continuing the deepest open interval  */
                        pos_data[prev_pos] = *top;
                } else if (next_lcp > top_lcp) {
                        /* Opening a new interval  */
-                       *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
+                       *++top = next_lcp | next_interval_idx++;
                        pos_data[prev_pos] = *top;
                } else {
                        /* Closing the deepest open interval  */
                        pos_data[prev_pos] = *top;
                        for (;;) {
                                const u32 closed_interval_idx = *top-- & POS_MASK;
-                               const u32 superinterval_lcp = *top >> LCP_SHIFT;
+                               const u32 superinterval_lcp = *top & LCP_MASK;
 
                                if (next_lcp == superinterval_lcp) {
                                        /* Continuing the superinterval */
@@ -192,7 +205,7 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
                                         * superinterval of the one being
                                         * closed, but still a subinterval of
                                         * its superinterval  */
-                                       *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
+                                       *++top = next_lcp | next_interval_idx++;
                                        intervals[closed_interval_idx] = *top;
                                        break;
                                } else {
@@ -271,67 +284,82 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
  * around by just continuing until we get to a link that actually takes us
  * higher in the tree.  This can be described as a lazy-update scheme.
  */
-static inline u32
+static forceinline u32
 lcpit_advance_one_byte(const u32 cur_pos,
                       u32 pos_data[restrict],
                       u32 intervals[restrict],
+                      u32 next[restrict],
                       struct lz_match matches[restrict],
                       const bool record_matches)
 {
-       u32 lcp;
-       u32 interval_idx;
+       u32 ref;
+       u32 super_ref;
        u32 match_pos;
        struct lz_match *matchptr;
 
        /* Get the deepest lcp-interval containing the current suffix. */
-       lcp = pos_data[cur_pos] >> LCP_SHIFT;
-       interval_idx = pos_data[cur_pos] & POS_MASK;
-       prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
+       ref = pos_data[cur_pos];
+
+       /* Prefetch upcoming data, up to 3 positions ahead.  Assume the
+        * intervals are already visited.  */
+
+       /* Prefetch the superinterval via a suffix link for the deepest
+        * lcp-interval containing the suffix starting 1 position from now.  */
+       prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
+
+       /* Prefetch suffix link for the deepest lcp-interval containing the
+        * suffix starting 2 positions from now.  */
+       next[0] = intervals[next[1]] & POS_MASK;
+       prefetchw(&pos_data[next[0]]);
+
+       /* Prefetch the deepest lcp-interval containing the suffix starting 3
+        * positions from now.  */
+       next[1] = pos_data[cur_pos + 3] & POS_MASK;
+       prefetchw(&intervals[next[1]]);
+
+       /* There is no "next suffix" after the current one.  */
        pos_data[cur_pos] = 0;
 
-       /* Ascend until we reach a visited interval, linking the unvisited
-        * intervals to the current suffix as we go.  */
-       while (intervals[interval_idx] & LCP_MASK) {
-               const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT;
-               const u32 superinterval_idx = intervals[interval_idx] & POS_MASK;
-               intervals[interval_idx] = cur_pos;
-               lcp = superinterval_lcp;
-               interval_idx = superinterval_idx;
+       /* Ascend until we reach a visited interval, the root, or a child of the
+        * root.  Link unvisited intervals to the current suffix as we go.  */
+       while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+               intervals[ref & POS_MASK] = cur_pos;
+               ref = super_ref;
        }
 
-       match_pos = intervals[interval_idx] & POS_MASK;
-       if (match_pos == 0) {
-               /* Ambiguous case; just don't allow matches with position 0. */
-               if (interval_idx != 0)
-                       intervals[interval_idx] = cur_pos;
+       if (super_ref == 0) {
+               /* In this case, the current interval may be any of:
+                * (1) the root;
+                * (2) an unvisited child of the root;
+                * (3) an interval last visited by suffix 0
+                *
+                * We could avoid the ambiguity with (3) by using an lcp
+                * placeholder value other than 0 to represent "visited", but
+                * it's fastest to use 0.  So we just don't allow matches with
+                * position 0.  */
+
+               if (ref != 0)  /* Not the root?  */
+                       intervals[ref & POS_MASK] = cur_pos;
                return 0;
        }
-       matchptr = matches;
+
        /* Ascend indirectly via pos_data[] links.  */
+       match_pos = super_ref;
+       matchptr = matches;
        for (;;) {
-               u32 next_lcp;
-               u32 next_interval_idx;
-
-               for (;;) {
-                       next_lcp = pos_data[match_pos] >> LCP_SHIFT;
-                       next_interval_idx = pos_data[match_pos] & POS_MASK;
-                       if (next_lcp < lcp)
-                               break;
-                       /* Suffix was out of date.  */
-                       match_pos = intervals[next_interval_idx];
-               }
-               intervals[interval_idx] = cur_pos;
-               pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx;
+               while ((super_ref = pos_data[match_pos]) > ref)
+                       match_pos = intervals[super_ref & POS_MASK];
+               intervals[ref & POS_MASK] = cur_pos;
+               pos_data[match_pos] = ref;
                if (record_matches) {
-                       matchptr->length = lcp;
+                       matchptr->length = ref >> LCP_SHIFT;
                        matchptr->offset = cur_pos - match_pos;
                        matchptr++;
                }
-               if (next_interval_idx == 0)
+               if (super_ref == 0)
                        break;
-               match_pos = intervals[next_interval_idx];
-               interval_idx = next_interval_idx;
-               lcp = next_lcp;
+               ref = super_ref;
+               match_pos = intervals[ref & POS_MASK];
        }
        return matchptr - matches;
 }
@@ -361,7 +389,7 @@ build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
        u32 h = 0;
        for (u32 i = 0; i < n; i++) {
                const u32 r = ISA[i];
-               prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
+               prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
                if (r > 0) {
                        const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
                        const u32 lim = min(n - i, n - j);
@@ -405,15 +433,17 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 
        for (u32 r = 1; r < n; r++) {
                const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
-               const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT;
-               const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;
+               const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
+               const u64 top_lcp = intervals64[*top];
+
+               prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
 
                if (next_lcp == top_lcp) {
                        /* Continuing the deepest open interval  */
                        pos_data[prev_pos] = *top;
                } else if (next_lcp > top_lcp) {
                        /* Opening a new interval  */
-                       intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT;
+                       intervals64[next_interval_idx] = next_lcp;
                        pos_data[prev_pos] = next_interval_idx;
                        *++top = next_interval_idx++;
                } else {
@@ -421,7 +451,7 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
                        pos_data[prev_pos] = *top;
                        for (;;) {
                                const u32 closed_interval_idx = *top--;
-                               const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;
+                               const u64 superinterval_lcp = intervals64[*top];
 
                                if (next_lcp == superinterval_lcp) {
                                        /* Continuing the superinterval */
@@ -433,8 +463,7 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
                                         * superinterval of the one being
                                         * closed, but still a subinterval of
                                         * its superinterval  */
-                                       intervals64[next_interval_idx] =
-                                               (u64)next_lcp << HUGE_LCP_SHIFT;
+                                       intervals64[next_interval_idx] = next_lcp;
                                        intervals64[closed_interval_idx] |=
                                                HUGE_UNVISITED_TAG | next_interval_idx;
                                        *++top = next_interval_idx++;
@@ -457,71 +486,70 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 
 /* Like lcpit_advance_one_byte(), but for buffers larger than
  * MAX_NORMAL_BUFSIZE.  */
-static inline u32
+static forceinline u32
 lcpit_advance_one_byte_huge(const u32 cur_pos,
                            u32 pos_data[restrict],
                            u64 intervals64[restrict],
+                           u32 prefetch_next[restrict],
                            struct lz_match matches[restrict],
                            const bool record_matches)
 {
        u32 interval_idx;
-       u32 lcp;
+       u32 next_interval_idx;
+       u64 cur;
+       u64 next;
        u32 match_pos;
        struct lz_match *matchptr;
 
        interval_idx = pos_data[cur_pos];
-       prefetch(&intervals64[pos_data[cur_pos + 1]]);
+
+       prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
+
+       prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
+       prefetchw(&pos_data[prefetch_next[0]]);
+
+       prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
+       prefetchw(&intervals64[prefetch_next[1]]);
+
        pos_data[cur_pos] = 0;
 
-       while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) {
-               lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
-               if (lcp == 0)
-                       return 0;
-               const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK;
-               intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
-               interval_idx = superinterval_idx;
+       while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
+               intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
+               interval_idx = next & HUGE_POS_MASK;
        }
 
-       match_pos = intervals64[interval_idx] & HUGE_POS_MASK;
-       lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
        matchptr = matches;
-       while (lcp != 0) {
-               u32 next_interval_idx;
-               u32 next_lcp;
-
-               for (;;) {
+       while (next & HUGE_LCP_MASK) {
+               cur = next;
+               do {
+                       match_pos = next & HUGE_POS_MASK;
                        next_interval_idx = pos_data[match_pos];
-                       next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT;
-                       if (next_lcp < lcp)
-                               break;
-                       match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
-               }
-               intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
+                       next = intervals64[next_interval_idx];
+               } while (next > cur);
+               intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
                pos_data[match_pos] = interval_idx;
                if (record_matches) {
-                       matchptr->length = lcp;
+                       matchptr->length = cur >> HUGE_LCP_SHIFT;
                        matchptr->offset = cur_pos - match_pos;
                        matchptr++;
                }
-               match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
                interval_idx = next_interval_idx;
-               lcp = next_lcp;
        }
        return matchptr - matches;
 }
 
-static inline u64
+static forceinline u64
 get_pos_data_size(size_t max_bufsize)
 {
        return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
                        DIVSUFSORT_TMP_LEN) * sizeof(u32);
 }
 
-static inline u64
+static forceinline u64
 get_intervals_size(size_t max_bufsize)
 {
-       return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ?
-                                  sizeof(u32) : sizeof(u64));
+       return ((u64)max_bufsize + PREFETCH_SAFETY) *
+               (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
 }
 
 /*
@@ -568,8 +596,6 @@ lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
        mf->nice_match_len = min(nice_match_len,
                                 (max_bufsize <= MAX_NORMAL_BUFSIZE) ?
                                 LCP_MAX : HUGE_LCP_MAX);
-       for (u32 i = 0; i < PREFETCH_SAFETY; i++)
-               mf->pos_data[max_bufsize + i] = 0;
        return true;
 }
 
@@ -638,11 +664,19 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
        build_SA(mf->intervals, T, n, mf->pos_data);
        build_ISA(mf->pos_data, mf->intervals, n);
        if (n <= MAX_NORMAL_BUFSIZE) {
+               for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+                       mf->intervals[n + i] = 0;
+                       mf->pos_data[n + i] = 0;
+               }
                build_LCP(mf->intervals, mf->pos_data, T, n,
                          mf->min_match_len, mf->nice_match_len);
                build_LCPIT(mf->intervals, mf->pos_data, n);
                mf->huge_mode = false;
        } else {
+               for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+                       mf->intervals64[n + i] = 0;
+                       mf->pos_data[n + i] = 0;
+               }
                expand_SA(mf->intervals, n);
                build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
                               mf->min_match_len, mf->nice_match_len);
@@ -650,6 +684,8 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
                mf->huge_mode = true;
        }
        mf->cur_pos = 0; /* starting at beginning of input buffer  */
+       for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
+               mf->next[i] = 0;
 }
 
 /*
@@ -667,10 +703,12 @@ lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
 {
        if (mf->huge_mode)
                return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
-                                                  mf->intervals64, matches, true);
+                                                  mf->intervals64, mf->next,
+                                                  matches, true);
        else
                return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
-                                             mf->intervals, matches, true);
+                                             mf->intervals, mf->next,
+                                             matches, true);
 }
 
 /*
@@ -683,12 +721,14 @@ lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
        if (mf->huge_mode) {
                do {
                        lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
-                                                   mf->intervals64, NULL, false);
+                                                   mf->intervals64, mf->next,
+                                                   NULL, false);
                } while (--count);
        } else {
                do {
                        lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
-                                              mf->intervals, NULL, false);
+                                              mf->intervals, mf->next,
+                                              NULL, false);
                } while (--count);
        }
 }