+ prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
+
+ if (next_lcp == top_lcp) {
+ /* Continuing the deepest open interval */
+ pos_data[prev_pos] = *top;
+ } else if (next_lcp > top_lcp) {
+ /* Opening a new interval */
+ *++top = next_lcp | next_interval_idx++;
+ pos_data[prev_pos] = *top;
+ } else {
+ /* Closing the deepest open interval */
+ pos_data[prev_pos] = *top;
+ for (;;) {
+ const u32 closed_interval_idx = *top-- & POS_MASK;
+ const u32 superinterval_lcp = *top & LCP_MASK;
+
+ if (next_lcp == superinterval_lcp) {
+ /* Continuing the superinterval */
+ intervals[closed_interval_idx] = *top;
+ break;
+ } else if (next_lcp > superinterval_lcp) {
+ /* Creating a new interval that is a
+ * superinterval of the one being
+ * closed, but still a subinterval of
+ * its superinterval */
+ *++top = next_lcp | next_interval_idx++;
+ intervals[closed_interval_idx] = *top;
+ break;
+ } else {
+ /* Also closing the superinterval */
+ intervals[closed_interval_idx] = *top;
+ }
+ }
+ }
+ prev_pos = next_pos;
+ }
+
+ /* Close any still-open intervals. */
+ pos_data[prev_pos] = *top;
+ for (; top > open_intervals; top--)
+ intervals[*top & POS_MASK] = *(top - 1);
+}
+
+/*
+ * Advance the LCP-interval tree matchfinder by one byte.
+ *
+ * If @record_matches is true, then matches are written to the @matches array
+ * sorted by strictly decreasing length and strictly decreasing offset, and the
+ * return value is the number of matches found. Otherwise, @matches is ignored
+ * and the return value is always 0.
+ *
+ * How this algorithm works:
+ *
+ * 'cur_pos' is the position of the current suffix, which is the suffix being
+ * matched against. 'cur_pos' starts at 0 and is incremented each time this
+ * function is called. This function finds each suffix with position less than
+ * 'cur_pos' that shares a prefix with the current suffix, but for each distinct
+ * prefix length it finds only the suffix with the greatest position (i.e. the
+ * most recently seen in the linear traversal by position). This function
+ * accomplishes this using the lcp-interval tree data structure that was built
+ * by build_LCPIT() and is updated dynamically by this function.
+ *
+ * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
+ * value of the deepest lcp-interval containing the current suffix --- or,
+ * equivalently, the parent of the conceptual singleton lcp-interval that
+ * contains the current suffix.
+ *
+ * The second step is to ascend the lcp-interval tree until reaching an interval
+ * that has not yet been visited, and link the intervals to the current suffix
+ * along the way. An lcp-interval has been visited if and only if it has been
+ * linked to a suffix. Initially, no lcp-intervals are linked to suffixes.
+ *
+ * The third step is to continue ascending the lcp-interval tree, but indirectly
+ * through suffix links rather than through the original superinterval
+ * references, and continuing to form links with the current suffix along the
+ * way. Each suffix visited during this step, except in a special case to
+ * handle outdated suffixes, is a match which can be written to matches[]. Each
+ * intervals[] entry contains the position of the next suffix to visit, which we
+ * shall call 'match_pos'; this is the most recently seen suffix that belongs to
+ * that lcp-interval. 'pos_data[match_pos]' then contains the lcp and interval
+ * index of the next lcp-interval that should be visited.
+ *
+ * We can view these arrays as describing a new set of links that gets overlaid
+ * on top of the original superinterval references of the lcp-interval tree.
+ * Each such link can connect a node of the lcp-interval tree to an ancestor
+ * more than one generation removed.
+ *
+ * For each one-byte advance, the current position becomes the most recently
+ * seen suffix for a continuous sequence of lcp-intervals from a leaf interval
+ * to the root interval. Conceptually, this algorithm needs to update all these
+ * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null"
+ * link. But actually, if some of these nodes have the same most recently seen
+ * suffix, then this algorithm just visits the pos_data[] entry for that suffix
+ * and skips over all these nodes in one step. Updating the extra nodes is
+ * accomplished by creating a redirection from the previous suffix to the
+ * current suffix.
+ *
+ * Using this shortcutting scheme, it's possible for a suffix to become out of
+ * date, which means that it is no longer the most recently seen suffix for the
+ * lcp-interval under consideration. This case is detected by noticing when the
+ * next lcp-interval link actually points deeper in the tree, and it is worked
+ * around by just continuing until we get to a link that actually takes us
+ * higher in the tree. This can be described as a lazy-update scheme.
+ */
+static inline u32
+lcpit_advance_one_byte(const u32 cur_pos,
+		       u32 pos_data[restrict],
+		       u32 intervals[restrict],
+		       struct lz_match matches[restrict],
+		       const bool record_matches)
+{
+	u32 ref;	/* packed (lcp << LCP_SHIFT) | interval index */
+	u32 super_ref;	/* packed reference to the next shallower interval */
+	u32 match_pos;	/* position of the previous suffix being matched */
+	struct lz_match *matchptr;
+
+	/* Get the deepest lcp-interval containing the current suffix. */
+	ref = pos_data[cur_pos];
+
+	/* Prefetch the deepest lcp-interval containing the *next* suffix. */
+	prefetchw(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
+
+	/* There is no "next suffix" after the current one. */
+	pos_data[cur_pos] = 0;
+
+	/* Ascend until we reach a visited interval, the root, or a child of the
+	 * root.  Link unvisited intervals to the current suffix as we go.
+	 * (A nonzero lcp field means the entry still holds its superinterval
+	 * reference, i.e. the interval has not yet been visited.) */
+	while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+		intervals[ref & POS_MASK] = cur_pos;
+		ref = super_ref;
+	}
+
+	if (super_ref == 0) {
+		/* In this case, the current interval may be any of:
+		 * (1) the root;
+		 * (2) an unvisited child of the root;
+		 * (3) an interval last visited by suffix 0
+		 *
+		 * We could avoid the ambiguity with (3) by using an lcp
+		 * placeholder value other than 0 to represent "visited", but
+		 * it's fastest to use 0.  So we just don't allow matches with
+		 * position 0. */
+
+		if (ref != 0) /* Not the root? */
+			intervals[ref & POS_MASK] = cur_pos;
+		return 0;
+	}
+
+	/* Ascend indirectly via pos_data[] links.  Each iteration visits one
+	 * lcp-interval (packed in 'ref') and reports the match against its
+	 * most recently seen suffix 'match_pos'. */
+	match_pos = super_ref;
+	matchptr = matches;
+	for (;;) {
+		/* Skip "outdated" suffixes whose links point deeper in the
+		 * tree than the interval being visited (the lazy-update scheme
+		 * described in the comment above this function). */
+		while ((super_ref = pos_data[match_pos]) > ref)
+			match_pos = intervals[super_ref & POS_MASK];
+		/* Link both the visited interval and the previous suffix to
+		 * the current suffix. */
+		intervals[ref & POS_MASK] = cur_pos;
+		pos_data[match_pos] = ref;
+		if (record_matches) {
+			matchptr->length = ref >> LCP_SHIFT;
+			matchptr->offset = cur_pos - match_pos;
+			matchptr++;
+		}
+		if (super_ref == 0)
+			break;	/* Null link: no shallower interval to visit */
+		ref = super_ref;
+		match_pos = intervals[ref & POS_MASK];
+	}
+	return matchptr - matches;
+}
+
+/*
+ * Expand SA from 32-bit to 64-bit entries, in place.
+ *
+ * Each u64 destination slot overlaps its own u32 source slot plus the next
+ * one, so the copy must proceed from the highest index downward to avoid
+ * clobbering entries before they are read.  Assumes n >= 1.
+ */
+static void
+expand_SA(void *p, u32 n)
+{
+	/* The same buffer is accessed as both u32 and u64, so the aliasing
+	 * must be made explicit to the compiler. */
+	typedef u32 _may_alias_attribute aliased_u32_t;
+	typedef u64 _may_alias_attribute aliased_u64_t;
+
+	aliased_u32_t *src32 = p;
+	aliased_u64_t *dst64 = p;
+
+	for (u32 i = n; i-- > 0; )
+		dst64[i] = src32[i];
+}
+
+/* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE.
+ *
+ * For each rank r > 0, computes the length of the longest common prefix
+ * between the suffix ranked r and the suffix ranked r - 1, and ORs it into
+ * the high bits (at HUGE_LCP_SHIFT) of SA_and_LCP64[r], whose low bits
+ * (HUGE_POS_MASK) already hold the suffix position.  This iterates suffixes
+ * in text order and carries 'h' over between iterations — the lcp can drop by
+ * at most 1 per text position (the Kasai et al. technique), so the total work
+ * is O(n).
+ *
+ * lcp values less than @min_lcp are stored as 0; values greater than @max_lcp
+ * are clamped to @max_lcp.
+ *
+ * NOTE(review): ISA[i + PREFETCH_SAFETY] reads past index n - 1; this relies
+ * on the arrays being allocated with PREFETCH_SAFETY slack entries (see
+ * get_pos_data_size() / get_intervals_size()). */
+static void
+build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
+	       const u8 T[restrict], const u32 n,
+	       const u32 min_lcp, const u32 max_lcp)
+{
+	u32 h = 0;	/* lcp carried over from the previous text position */
+	for (u32 i = 0; i < n; i++) {
+		/* r = rank of the suffix beginning at text position i */
+		const u32 r = ISA[i];
+		prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
+		if (r > 0) {
+			/* j = position of the preceding suffix in SA order */
+			const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
+			/* Don't compare past the end of either suffix. */
+			const u32 lim = min(n - i, n - j);
+			while (h < lim && T[i + h] == T[j + h])
+				h++;
+			u32 stored_lcp = h;
+			if (stored_lcp < min_lcp)
+				stored_lcp = 0;
+			else if (stored_lcp > max_lcp)
+				stored_lcp = max_lcp;
+			SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT;
+			/* The lcp at text position i + 1 is at least h - 1. */
+			if (h > 0)
+				h--;
+		}
+	}
+}
+
+/*
+ * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE.
+ *
+ * This "huge" version is also slightly different in that the lcp value stored
+ * in each intervals[] entry is the lcp value for that interval, not its
+ * superinterval. This lcp value stays put in intervals[] and doesn't get moved
+ * to pos_data[] during lcpit_advance_one_byte_huge(). One consequence of this
+ * is that we have to use a special flag to distinguish visited from unvisited
+ * intervals. But overall, this scheme keeps the memory usage at 12n instead of
+ * 16n. (The non-huge version is 8n.)
+ */
+static void
+build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
+{
+	/* The SA+LCP data is consumed in place: entry r is read before entry
+	 * slot next_interval_idx (<= r) is overwritten with interval data. */
+	u64 * const SA_and_LCP64 = intervals64;
+	u32 next_interval_idx;
+	/* Stack of the indices of the currently open intervals, at most one
+	 * per possible lcp value; *top is the deepest open interval. */
+	u32 open_intervals[HUGE_LCP_MAX + 1];
+	u32 *top = open_intervals;
+	u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK;
+
+	/* Interval 0 is the root, with lcp 0. */
+	*top = 0;
+	intervals64[0] = 0;
+	next_interval_idx = 1;
+
+	for (u32 r = 1; r < n; r++) {
+		const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
+		/* Note: lcp values are kept in their shifted encoding, so all
+		 * comparisons below are between like-encoded values. */
+		const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
+		/* An open interval's entry holds only its lcp so far; the
+		 * link and visited-tag bits are filled in when it closes. */
+		const u64 top_lcp = intervals64[*top];
+
+		prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
+
+		if (next_lcp == top_lcp) {
+			/* Continuing the deepest open interval */
+			pos_data[prev_pos] = *top;
+		} else if (next_lcp > top_lcp) {
+			/* Opening a new interval */
+			intervals64[next_interval_idx] = next_lcp;
+			pos_data[prev_pos] = next_interval_idx;
+			*++top = next_interval_idx++;
+		} else {
+			/* Closing the deepest open interval */
+			pos_data[prev_pos] = *top;
+			for (;;) {
+				const u32 closed_interval_idx = *top--;
+				const u64 superinterval_lcp = intervals64[*top];
+
+				/* A closed interval's entry gains its
+				 * superinterval index plus the
+				 * HUGE_UNVISITED_TAG, which marks it as not
+				 * yet visited by the matchfinder. */
+				if (next_lcp == superinterval_lcp) {
+					/* Continuing the superinterval */
+					intervals64[closed_interval_idx] |=
+						HUGE_UNVISITED_TAG | *top;
+					break;
+				} else if (next_lcp > superinterval_lcp) {
+					/* Creating a new interval that is a
+					 * superinterval of the one being
+					 * closed, but still a subinterval of
+					 * its superinterval */
+					intervals64[next_interval_idx] = next_lcp;
+					intervals64[closed_interval_idx] |=
+						HUGE_UNVISITED_TAG | next_interval_idx;
+					*++top = next_interval_idx++;
+					break;
+				} else {
+					/* Also closing the superinterval */
+					intervals64[closed_interval_idx] |=
+						HUGE_UNVISITED_TAG | *top;
+				}
+			}
+		}
+		prev_pos = next_pos;
+	}
+
+	/* Close any still-open intervals. */
+	pos_data[prev_pos] = *top;
+	for (; top > open_intervals; top--)
+		intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1);
+}
+
+/* Like lcpit_advance_one_byte(), but for buffers larger than
+ * MAX_NORMAL_BUFSIZE.  Here each intervals64[] entry keeps its own lcp value
+ * permanently (see the comment above build_LCPIT_huge()), and the
+ * HUGE_UNVISITED_TAG bit distinguishes unvisited intervals from visited
+ * ones. */
+static inline u32
+lcpit_advance_one_byte_huge(const u32 cur_pos,
+			    u32 pos_data[restrict],
+			    u64 intervals64[restrict],
+			    struct lz_match matches[restrict],
+			    const bool record_matches)
+{
+	u32 interval_idx;	/* index of the interval being visited */
+	u32 next_interval_idx;	/* index of the next interval to visit */
+	u64 cur;		/* entry of the interval being visited */
+	u64 next;		/* entry of the next interval to visit */
+	u32 match_pos;		/* position of the matched (earlier) suffix */
+	struct lz_match *matchptr;
+
+	/* Get the deepest lcp-interval containing the current suffix. */
+	interval_idx = pos_data[cur_pos];
+	/* Prefetch for the *next* suffix: both the pos_data[] entry its
+	 * interval leads to, and the interval entry two suffixes ahead. */
+	prefetchw(&pos_data[intervals64[pos_data[cur_pos + 1]] & HUGE_POS_MASK]);
+	prefetchw(&intervals64[pos_data[cur_pos + 2]]);
+	/* There is no "next suffix" after the current one. */
+	pos_data[cur_pos] = 0;
+
+	/* Ascend through unvisited intervals (tagged and still holding their
+	 * superinterval index), marking each visited by linking it to the
+	 * current suffix while keeping its lcp value in place. */
+	while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
+		intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
+		interval_idx = next & HUGE_POS_MASK;
+	}
+
+	/* Ascend indirectly via pos_data[] links, reporting one match per
+	 * interval visited; an lcp of 0 means the root has been reached. */
+	matchptr = matches;
+	while (next & HUGE_LCP_MASK) {
+		cur = next;
+		/* Skip outdated suffixes whose links point deeper in the tree
+		 * (lazy-update scheme; see lcpit_advance_one_byte()). */
+		do {
+			match_pos = next & HUGE_POS_MASK;
+			next_interval_idx = pos_data[match_pos];
+			next = intervals64[next_interval_idx];
+		} while (next > cur);
+		/* Link the visited interval and the previous suffix to the
+		 * current suffix. */
+		intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
+		pos_data[match_pos] = interval_idx;
+		if (record_matches) {
+			matchptr->length = cur >> HUGE_LCP_SHIFT;
+			matchptr->offset = cur_pos - match_pos;
+			matchptr++;
+		}
+		interval_idx = next_interval_idx;
+	}
+	return matchptr - matches;
+}
+
+/*
+ * Number of bytes to allocate for the pos_data[] array when supporting
+ * buffers of up to @max_bufsize bytes: one u32 per position plus
+ * PREFETCH_SAFETY slack entries, and never fewer than DIVSUFSORT_TMP_LEN
+ * entries — presumably because pos_data[] doubles as divsufsort temporary
+ * space; confirm against the allocation/usage code.
+ */
+static inline u64
+get_pos_data_size(size_t max_bufsize)
+{
+	u64 num_entries = (u64)max_bufsize + PREFETCH_SAFETY;
+
+	if (num_entries < DIVSUFSORT_TMP_LEN)
+		num_entries = DIVSUFSORT_TMP_LEN;
+	return num_entries * sizeof(u32);
+}
+
+/*
+ * Number of bytes to allocate for the intervals[] array when supporting
+ * buffers of up to @max_bufsize bytes: one entry per position plus
+ * PREFETCH_SAFETY slack entries.  Normal buffers use 32-bit entries; "huge"
+ * buffers (larger than MAX_NORMAL_BUFSIZE) use 64-bit entries.
+ */
+static inline u64
+get_intervals_size(size_t max_bufsize)
+{
+	const size_t entry_size =
+		(max_bufsize <= MAX_NORMAL_BUFSIZE) ? sizeof(u32) : sizeof(u64);
+
+	return ((u64)max_bufsize + PREFETCH_SAFETY) * entry_size;
+}