u32 h = 0;
for (u32 i = 0; i < n; i++) {
const u32 r = ISA[i];
- prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
+ prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
if (r > 0) {
const u32 j = SA_and_LCP[r - 1] & POS_MASK;
const u32 lim = min(n - i, n - j);
const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
const u32 top_lcp = *top & LCP_MASK;
- prefetch(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
+ prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
if (next_lcp == top_lcp) {
/* Continuing the deepest open interval */
lcpit_advance_one_byte(const u32 cur_pos,
u32 pos_data[restrict],
u32 intervals[restrict],
+ u32 next[restrict],
struct lz_match matches[restrict],
const bool record_matches)
{
/* Get the deepest lcp-interval containing the current suffix. */
ref = pos_data[cur_pos];
- /* Prefetch the deepest lcp-interval containing the *next* suffix. */
- prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
+ /* Prefetch upcoming data, up to 3 positions ahead. Assume the
+ * intervals are already visited. */
+
+ /* Prefetch the superinterval via a suffix link for the deepest
+ * lcp-interval containing the suffix starting 1 position from now. */
+ prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
+
+ /* Prefetch suffix link for the deepest lcp-interval containing the
+ * suffix starting 2 positions from now. */
+ next[0] = intervals[next[1]] & POS_MASK;
+ prefetchw(&pos_data[next[0]]);
+
+ /* Prefetch the deepest lcp-interval containing the suffix starting 3
+ * positions from now. */
+ next[1] = pos_data[cur_pos + 3] & POS_MASK;
+ prefetchw(&intervals[next[1]]);
/* There is no "next suffix" after the current one. */
pos_data[cur_pos] = 0;
u32 h = 0;
for (u32 i = 0; i < n; i++) {
const u32 r = ISA[i];
- prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
+ prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
if (r > 0) {
const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
const u32 lim = min(n - i, n - j);
const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
const u64 top_lcp = intervals64[*top];
- prefetch(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
+ prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
if (next_lcp == top_lcp) {
/* Continuing the deepest open interval */
lcpit_advance_one_byte_huge(const u32 cur_pos,
u32 pos_data[restrict],
u64 intervals64[restrict],
+ u32 prefetch_next[restrict],
struct lz_match matches[restrict],
const bool record_matches)
{
struct lz_match *matchptr;
interval_idx = pos_data[cur_pos];
- prefetch(&pos_data[intervals64[pos_data[cur_pos + 1]] & HUGE_POS_MASK]);
- prefetch(&intervals64[pos_data[cur_pos + 2]]);
+
+ prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
+
+ prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
+ prefetchw(&pos_data[prefetch_next[0]]);
+
+ prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
+ prefetchw(&intervals64[prefetch_next[1]]);
+
pos_data[cur_pos] = 0;
while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
mf->huge_mode = true;
}
mf->cur_pos = 0; /* starting at beginning of input buffer */
+ for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
+ mf->next[i] = 0;
}
/*
{
if (mf->huge_mode)
return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
- mf->intervals64, matches, true);
+ mf->intervals64, mf->next,
+ matches, true);
else
return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
- mf->intervals, matches, true);
+ mf->intervals, mf->next,
+ matches, true);
}
/*
if (mf->huge_mode) {
do {
lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
- mf->intervals64, NULL, false);
+ mf->intervals64, mf->next,
+ NULL, false);
} while (--count);
} else {
do {
lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
- mf->intervals, NULL, false);
+ mf->intervals, mf->next,
+ NULL, false);
} while (--count);
}
}