X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flcpit_matchfinder.c;h=8b9ffd9d882fa5e9e1559b3a20222561378f0dd3;hp=65e391d91e36a7732b8ebb4c2b8bd66146d53b97;hb=4a20aae0dd8469a352517a0b107416ffa99ccc55;hpb=76b9892785bcdca5d43324f8896fa5ca3c427b1b diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c index 65e391d9..8b9ffd9d 100644 --- a/src/lcpit_matchfinder.c +++ b/src/lcpit_matchfinder.c @@ -4,11 +4,21 @@ * A match-finder for Lempel-Ziv compression based on bottom-up construction and * traversal of the Longest Common Prefix (LCP) interval tree. * - * Author: Eric Biggers - * Year: 2014, 2015 + * The following copying information applies to this specific source code file: * - * The author dedicates this file to the public domain. - * You can do whatever you want with this file. + * Written in 2014-2015 by Eric Biggers + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide via the Creative Commons Zero 1.0 Universal Public Domain + * Dedication (the "CC0"). + * + * This software is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the CC0 for more details. + * + * You should have received a copy of the CC0 along with this software; if not + * see . */ #ifdef HAVE_CONFIG_H @@ -78,7 +88,7 @@ build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict], u32 h = 0; for (u32 i = 0; i < n; i++) { const u32 r = ISA[i]; - prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]); + prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]); if (r > 0) { const u32 j = SA_and_LCP[r - 1] & POS_MASK; const u32 lim = min(n - i, n - j); @@ -170,7 +180,7 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n) const u32 next_lcp = SA_and_LCP[r] & LCP_MASK; const u32 top_lcp = *top & LCP_MASK; - prefetch(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]); + prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]); if (next_lcp == top_lcp) { /* Continuing the deepest open interval */ @@ -274,10 +284,11 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n) * around by just continuing until we get to a link that actually takes us * higher in the tree. This can be described as a lazy-update scheme. */ -static inline u32 +static forceinline u32 lcpit_advance_one_byte(const u32 cur_pos, u32 pos_data[restrict], u32 intervals[restrict], + u32 next[restrict], struct lz_match matches[restrict], const bool record_matches) { @@ -289,8 +300,22 @@ lcpit_advance_one_byte(const u32 cur_pos, /* Get the deepest lcp-interval containing the current suffix. */ ref = pos_data[cur_pos]; - /* Prefetch the deepest lcp-interval containing the *next* suffix. */ - prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]); + /* Prefetch upcoming data, up to 3 positions ahead. Assume the + * intervals are already visited. */ + + /* Prefetch the superinterval via a suffix link for the deepest + * lcp-interval containing the suffix starting 1 position from now. */ + prefetchw(&intervals[pos_data[next[0]] & POS_MASK]); + + /* Prefetch suffix link for the deepest lcp-interval containing the + * suffix starting 2 positions from now. */ + next[0] = intervals[next[1]] & POS_MASK; + prefetchw(&pos_data[next[0]]); + + /* Prefetch the deepest lcp-interval containing the suffix starting 3 + * positions from now. */ + next[1] = pos_data[cur_pos + 3] & POS_MASK; + prefetchw(&intervals[next[1]]); /* There is no "next suffix" after the current one. */ pos_data[cur_pos] = 0; @@ -364,7 +389,7 @@ build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict], u32 h = 0; for (u32 i = 0; i < n; i++) { const u32 r = ISA[i]; - prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]); + prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]); if (r > 0) { const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK; const u32 lim = min(n - i, n - j); @@ -411,7 +436,7 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n) const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK; const u64 top_lcp = intervals64[*top]; - prefetch(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]); + prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]); if (next_lcp == top_lcp) { /* Continuing the deepest open interval */ @@ -461,10 +486,11 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n) /* Like lcpit_advance_one_byte(), but for buffers larger than * MAX_NORMAL_BUFSIZE. */ -static inline u32 +static forceinline u32 lcpit_advance_one_byte_huge(const u32 cur_pos, u32 pos_data[restrict], u64 intervals64[restrict], + u32 prefetch_next[restrict], struct lz_match matches[restrict], const bool record_matches) { @@ -476,8 +502,15 @@ lcpit_advance_one_byte_huge(const u32 cur_pos, struct lz_match *matchptr; interval_idx = pos_data[cur_pos]; - prefetch(&pos_data[intervals64[pos_data[cur_pos + 1]] & HUGE_POS_MASK]); - prefetch(&intervals64[pos_data[cur_pos + 2]]); + + prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]); + + prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK; + prefetchw(&pos_data[prefetch_next[0]]); + + prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK; + prefetchw(&intervals64[prefetch_next[1]]); + pos_data[cur_pos] = 0; while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) { @@ -505,14 +538,14 @@ lcpit_advance_one_byte_huge(const u32 cur_pos, return matchptr - matches; } -static inline u64 +static forceinline u64 get_pos_data_size(size_t max_bufsize) { return (u64)max((u64)max_bufsize + PREFETCH_SAFETY, DIVSUFSORT_TMP_LEN) * sizeof(u32); } -static inline u64 +static forceinline u64 get_intervals_size(size_t max_bufsize) { return ((u64)max_bufsize + PREFETCH_SAFETY) * @@ -651,6 +684,8 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n) mf->huge_mode = true; } mf->cur_pos = 0; /* starting at beginning of input buffer */ + for (u32 i = 0; i < ARRAY_LEN(mf->next); i++) + mf->next[i] = 0; } /* @@ -668,10 +703,12 @@ lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, { if (mf->huge_mode) return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data, - mf->intervals64, matches, true); + mf->intervals64, mf->next, + matches, true); else return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data, - mf->intervals, matches, true); + mf->intervals, mf->next, + matches, true); } /* @@ -684,12 +721,14 @@ lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count) if (mf->huge_mode) { do { lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data, - mf->intervals64, NULL, false); + mf->intervals64, mf->next, + NULL, false); } while (--count); } else { do { lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data, - mf->intervals, NULL, false); + mf->intervals, mf->next, + NULL, false); } while (--count); } }