* A match-finder for Lempel-Ziv compression based on bottom-up construction and
* traversal of the Longest Common Prefix (LCP) interval tree.
*
- * Author: Eric Biggers
- * Year: 2014, 2015
+ * The following copying information applies to this specific source code file:
*
- * The author dedicates this file to the public domain.
- * You can do whatever you want with this file.
+ * Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not,
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#ifdef HAVE_CONFIG_H
#define LCP_BITS 6
#define LCP_MAX (((u32)1 << LCP_BITS) - 1)
-#define POS_MASK (((u32)1 << (32 - LCP_BITS)) - 1)
#define LCP_SHIFT (32 - LCP_BITS)
#define LCP_MASK (LCP_MAX << LCP_SHIFT)
+#define POS_MASK (((u32)1 << (32 - LCP_BITS)) - 1)
#define MAX_NORMAL_BUFSIZE (POS_MASK + 1)
#define HUGE_LCP_BITS 7
#define HUGE_LCP_MAX (((u32)1 << HUGE_LCP_BITS) - 1)
#define HUGE_LCP_SHIFT (64 - HUGE_LCP_BITS)
+#define HUGE_LCP_MASK ((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
#define HUGE_POS_MASK 0xFFFFFFFF
#define MAX_HUGE_BUFSIZE ((u64)HUGE_POS_MASK + 1)
#define HUGE_UNVISITED_TAG 0x100000000
u32 h = 0;
for (u32 i = 0; i < n; i++) {
const u32 r = ISA[i];
- prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
+ prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
if (r > 0) {
const u32 j = SA_and_LCP[r - 1] & POS_MASK;
const u32 lim = min(n - i, n - j);
for (u32 r = 1; r < n; r++) {
const u32 next_pos = SA_and_LCP[r] & POS_MASK;
- const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT;
- const u32 top_lcp = *top >> LCP_SHIFT;
+ const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
+ const u32 top_lcp = *top & LCP_MASK;
+
+ prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
if (next_lcp == top_lcp) {
/* Continuing the deepest open interval */
pos_data[prev_pos] = *top;
} else if (next_lcp > top_lcp) {
/* Opening a new interval */
- *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
+ *++top = next_lcp | next_interval_idx++;
pos_data[prev_pos] = *top;
} else {
/* Closing the deepest open interval */
pos_data[prev_pos] = *top;
for (;;) {
const u32 closed_interval_idx = *top-- & POS_MASK;
- const u32 superinterval_lcp = *top >> LCP_SHIFT;
+ const u32 superinterval_lcp = *top & LCP_MASK;
if (next_lcp == superinterval_lcp) {
/* Continuing the superinterval */
* superinterval of the one being
* closed, but still a subinterval of
* its superinterval */
- *++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
+ *++top = next_lcp | next_interval_idx++;
intervals[closed_interval_idx] = *top;
break;
} else {
* around by just continuing until we get to a link that actually takes us
* higher in the tree. This can be described as a lazy-update scheme.
*/
-static inline u32
+static forceinline u32
lcpit_advance_one_byte(const u32 cur_pos,
u32 pos_data[restrict],
u32 intervals[restrict],
+ u32 next[restrict],
struct lz_match matches[restrict],
const bool record_matches)
{
- u32 lcp;
- u32 interval_idx;
+ u32 ref;
+ u32 super_ref;
u32 match_pos;
struct lz_match *matchptr;
/* Get the deepest lcp-interval containing the current suffix. */
- lcp = pos_data[cur_pos] >> LCP_SHIFT;
- interval_idx = pos_data[cur_pos] & POS_MASK;
- prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
+ ref = pos_data[cur_pos];
+
+ /* Prefetch upcoming data, up to 3 positions ahead. Assume the
+ * intervals are already visited. */
+
+ /* Prefetch the superinterval via a suffix link for the deepest
+ * lcp-interval containing the suffix starting 1 position from now. */
+ prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
+
+ /* Prefetch suffix link for the deepest lcp-interval containing the
+ * suffix starting 2 positions from now. */
+ next[0] = intervals[next[1]] & POS_MASK;
+ prefetchw(&pos_data[next[0]]);
+
+ /* Prefetch the deepest lcp-interval containing the suffix starting 3
+ * positions from now. */
+ next[1] = pos_data[cur_pos + 3] & POS_MASK;
+ prefetchw(&intervals[next[1]]);
+
+ /* There is no "next suffix" after the current one. */
pos_data[cur_pos] = 0;
- /* Ascend until we reach a visited interval, linking the unvisited
- * intervals to the current suffix as we go. */
- while (intervals[interval_idx] & LCP_MASK) {
- const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT;
- const u32 superinterval_idx = intervals[interval_idx] & POS_MASK;
- intervals[interval_idx] = cur_pos;
- lcp = superinterval_lcp;
- interval_idx = superinterval_idx;
+ /* Ascend until we reach a visited interval, the root, or a child of the
+ * root. Link unvisited intervals to the current suffix as we go. */
+ while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+ intervals[ref & POS_MASK] = cur_pos;
+ ref = super_ref;
}
- match_pos = intervals[interval_idx] & POS_MASK;
- if (match_pos == 0) {
- /* Ambiguous case; just don't allow matches with position 0. */
- if (interval_idx != 0)
- intervals[interval_idx] = cur_pos;
+ if (super_ref == 0) {
+ /* In this case, the current interval may be any of:
+ * (1) the root;
+ * (2) an unvisited child of the root;
+ * (3) an interval last visited by suffix 0
+ *
+ * We could avoid the ambiguity with (3) by using an lcp
+ * placeholder value other than 0 to represent "visited", but
+ * it's fastest to use 0. So we just don't allow matches with
+ * position 0. */
+
+ if (ref != 0) /* Not the root? */
+ intervals[ref & POS_MASK] = cur_pos;
return 0;
}
- matchptr = matches;
+
/* Ascend indirectly via pos_data[] links. */
+ match_pos = super_ref;
+ matchptr = matches;
for (;;) {
- u32 next_lcp;
- u32 next_interval_idx;
-
- for (;;) {
- next_lcp = pos_data[match_pos] >> LCP_SHIFT;
- next_interval_idx = pos_data[match_pos] & POS_MASK;
- if (next_lcp < lcp)
- break;
- /* Suffix was out of date. */
- match_pos = intervals[next_interval_idx];
- }
- intervals[interval_idx] = cur_pos;
- pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx;
+ while ((super_ref = pos_data[match_pos]) > ref)
+ match_pos = intervals[super_ref & POS_MASK];
+ intervals[ref & POS_MASK] = cur_pos;
+ pos_data[match_pos] = ref;
if (record_matches) {
- matchptr->length = lcp;
+ matchptr->length = ref >> LCP_SHIFT;
matchptr->offset = cur_pos - match_pos;
matchptr++;
}
- if (next_interval_idx == 0)
+ if (super_ref == 0)
break;
- match_pos = intervals[next_interval_idx];
- interval_idx = next_interval_idx;
- lcp = next_lcp;
+ ref = super_ref;
+ match_pos = intervals[ref & POS_MASK];
}
return matchptr - matches;
}
u32 h = 0;
for (u32 i = 0; i < n; i++) {
const u32 r = ISA[i];
- prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
+ prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
if (r > 0) {
const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
const u32 lim = min(n - i, n - j);
for (u32 r = 1; r < n; r++) {
const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
- const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT;
- const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;
+ const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
+ const u64 top_lcp = intervals64[*top];
+
+ prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
if (next_lcp == top_lcp) {
/* Continuing the deepest open interval */
pos_data[prev_pos] = *top;
} else if (next_lcp > top_lcp) {
/* Opening a new interval */
- intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT;
+ intervals64[next_interval_idx] = next_lcp;
pos_data[prev_pos] = next_interval_idx;
*++top = next_interval_idx++;
} else {
pos_data[prev_pos] = *top;
for (;;) {
const u32 closed_interval_idx = *top--;
- const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;
+ const u64 superinterval_lcp = intervals64[*top];
if (next_lcp == superinterval_lcp) {
/* Continuing the superinterval */
* superinterval of the one being
* closed, but still a subinterval of
* its superinterval */
- intervals64[next_interval_idx] =
- (u64)next_lcp << HUGE_LCP_SHIFT;
+ intervals64[next_interval_idx] = next_lcp;
intervals64[closed_interval_idx] |=
HUGE_UNVISITED_TAG | next_interval_idx;
*++top = next_interval_idx++;
/* Like lcpit_advance_one_byte(), but for buffers larger than
* MAX_NORMAL_BUFSIZE. */
-static inline u32
+static forceinline u32
lcpit_advance_one_byte_huge(const u32 cur_pos,
u32 pos_data[restrict],
u64 intervals64[restrict],
+ u32 prefetch_next[restrict],
struct lz_match matches[restrict],
const bool record_matches)
{
u32 interval_idx;
- u32 lcp;
+ u32 next_interval_idx;
+ u64 cur;
+ u64 next;
u32 match_pos;
struct lz_match *matchptr;
interval_idx = pos_data[cur_pos];
- prefetch(&intervals64[pos_data[cur_pos + 1]]);
+
+ prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
+
+ prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
+ prefetchw(&pos_data[prefetch_next[0]]);
+
+ prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
+ prefetchw(&intervals64[prefetch_next[1]]);
+
pos_data[cur_pos] = 0;
- while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) {
- lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
- if (lcp == 0)
- return 0;
- const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK;
- intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
- interval_idx = superinterval_idx;
+ while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
+ intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
+ interval_idx = next & HUGE_POS_MASK;
}
- match_pos = intervals64[interval_idx] & HUGE_POS_MASK;
- lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
matchptr = matches;
- while (lcp != 0) {
- u32 next_interval_idx;
- u32 next_lcp;
-
- for (;;) {
+ while (next & HUGE_LCP_MASK) {
+ cur = next;
+ do {
+ match_pos = next & HUGE_POS_MASK;
next_interval_idx = pos_data[match_pos];
- next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT;
- if (next_lcp < lcp)
- break;
- match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
- }
- intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
+ next = intervals64[next_interval_idx];
+ } while (next > cur);
+ intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
pos_data[match_pos] = interval_idx;
if (record_matches) {
- matchptr->length = lcp;
+ matchptr->length = cur >> HUGE_LCP_SHIFT;
matchptr->offset = cur_pos - match_pos;
matchptr++;
}
- match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
interval_idx = next_interval_idx;
- lcp = next_lcp;
}
return matchptr - matches;
}
-static inline u64
+static forceinline u64
get_pos_data_size(size_t max_bufsize)
{
return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
DIVSUFSORT_TMP_LEN) * sizeof(u32);
}
-static inline u64
+static forceinline u64
get_intervals_size(size_t max_bufsize)
{
- return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ?
- sizeof(u32) : sizeof(u64));
+ return ((u64)max_bufsize + PREFETCH_SAFETY) *
+ (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
}
/*
mf->nice_match_len = min(nice_match_len,
(max_bufsize <= MAX_NORMAL_BUFSIZE) ?
LCP_MAX : HUGE_LCP_MAX);
- for (u32 i = 0; i < PREFETCH_SAFETY; i++)
- mf->pos_data[max_bufsize + i] = 0;
return true;
}
* References:
*
* Y. Mori. libdivsufsort, a lightweight suffix-sorting library.
- * https://code.google.com/p/libdivsufsort/.
+ * https://github.com/y-256/libdivsufsort
*
* G. Nong, S. Zhang, and W.H. Chan. 2009. Linear Suffix Array
* Construction by Almost Pure Induced-Sorting. Data Compression
build_SA(mf->intervals, T, n, mf->pos_data);
build_ISA(mf->pos_data, mf->intervals, n);
if (n <= MAX_NORMAL_BUFSIZE) {
+ for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+ mf->intervals[n + i] = 0;
+ mf->pos_data[n + i] = 0;
+ }
build_LCP(mf->intervals, mf->pos_data, T, n,
mf->min_match_len, mf->nice_match_len);
build_LCPIT(mf->intervals, mf->pos_data, n);
mf->huge_mode = false;
} else {
+ for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+ mf->intervals64[n + i] = 0;
+ mf->pos_data[n + i] = 0;
+ }
expand_SA(mf->intervals, n);
build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
mf->min_match_len, mf->nice_match_len);
mf->huge_mode = true;
}
mf->cur_pos = 0; /* starting at beginning of input buffer */
+ for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
+ mf->next[i] = 0;
}
/*
{
if (mf->huge_mode)
return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
- mf->intervals64, matches, true);
+ mf->intervals64, mf->next,
+ matches, true);
else
return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
- mf->intervals, matches, true);
+ mf->intervals, mf->next,
+ matches, true);
}
/*
if (mf->huge_mode) {
do {
lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
- mf->intervals64, NULL, false);
+ mf->intervals64, mf->next,
+ NULL, false);
} while (--count);
} else {
do {
lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
- mf->intervals, NULL, false);
+ mf->intervals, mf->next,
+ NULL, false);
} while (--count);
}
}