X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flcpit_matchfinder.c;h=8b9ffd9d882fa5e9e1559b3a20222561378f0dd3;hp=9d22ad076e6bc900a99fb2739fcaf275651527a5;hb=4a20aae0dd8469a352517a0b107416ffa99ccc55;hpb=acb5c41d2c9be655eca863a0315a22d1f54889e2

diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c
index 9d22ad07..8b9ffd9d 100644
--- a/src/lcpit_matchfinder.c
+++ b/src/lcpit_matchfinder.c
@@ -4,13 +4,27 @@
  * A match-finder for Lempel-Ziv compression based on bottom-up construction and
  * traversal of the Longest Common Prefix (LCP) interval tree.
  *
- * Author:  Eric Biggers
- * Year:    2014, 2015
+ * The following copying information applies to this specific source code file:
  *
- * The author dedicates this file to the public domain.
- * You can do whatever you want with this file.
+ * Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
  */
 
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
 #include <limits.h>
 
 #include "wimlib/divsufsort.h"
@@ -19,14 +33,15 @@
 
 #define LCP_BITS		6
 #define LCP_MAX			(((u32)1 << LCP_BITS) - 1)
-#define POS_MASK		(((u32)1 << (32 - LCP_BITS)) - 1)
 #define LCP_SHIFT		(32 - LCP_BITS)
 #define LCP_MASK		(LCP_MAX << LCP_SHIFT)
+#define POS_MASK		(((u32)1 << (32 - LCP_BITS)) - 1)
 #define MAX_NORMAL_BUFSIZE	(POS_MASK + 1)
 
 #define HUGE_LCP_BITS		7
 #define HUGE_LCP_MAX		(((u32)1 << HUGE_LCP_BITS) - 1)
 #define HUGE_LCP_SHIFT		(64 - HUGE_LCP_BITS)
+#define HUGE_LCP_MASK		((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
 #define HUGE_POS_MASK		0xFFFFFFFF
 #define MAX_HUGE_BUFSIZE	((u64)HUGE_POS_MASK + 1)
 #define HUGE_UNVISITED_TAG	0x100000000
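A minimal standalone sketch of the 32-bit entry layout these constants define (the pack() helper is illustrative, not from wimlib): the lcp value sits in the high 6 bits and a position or interval index in the low 26. Because the lcp field stays pre-shifted in the high bits, the rewritten hunks below can compare or extract it with a single mask instead of a shift.

#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;

#define LCP_BITS	6
#define LCP_SHIFT	(32 - LCP_BITS)
#define LCP_MAX		(((u32)1 << LCP_BITS) - 1)
#define LCP_MASK	(LCP_MAX << LCP_SHIFT)
#define POS_MASK	(((u32)1 << (32 - LCP_BITS)) - 1)

static u32 pack(u32 lcp, u32 pos) { return (lcp << LCP_SHIFT) | pos; }

int main(void)
{
	u32 entry = pack(5, 1000);			/* lcp 5, position 1000 */
	assert((entry & LCP_MASK) == pack(5, 0));	/* lcp field, still shifted */
	assert((entry >> LCP_SHIFT) == 5);		/* lcp field, as a number */
	assert((entry & POS_MASK) == 1000);		/* position field */
	return 0;
}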
@@ -37,7 +52,7 @@
  * Build the LCP (Longest Common Prefix) array in linear time.
  *
  * LCP[r] will be the length of the longest common prefix between the suffixes
- * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
+ * with positions SA[r - 1] and SA[r].  LCP[0] will be undefined.
  *
  * Algorithm taken from Kasai et al. (2001), but modified slightly:
  *
@@ -73,7 +88,7 @@ build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
 	u32 h = 0;
 	for (u32 i = 0; i < n; i++) {
 		const u32 r = ISA[i];
-		prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
+		prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
 		if (r > 0) {
 			const u32 j = SA_and_LCP[r - 1] & POS_MASK;
 			const u32 lim = min(n - i, n - j);
@@ -162,22 +177,24 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
 
 	for (u32 r = 1; r < n; r++) {
 		const u32 next_pos = SA_and_LCP[r] & POS_MASK;
-		const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT;
-		const u32 top_lcp = *top >> LCP_SHIFT;
+		const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
+		const u32 top_lcp = *top & LCP_MASK;
+
+		prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
 
 		if (next_lcp == top_lcp) {
 			/* Continuing the deepest open interval */
 			pos_data[prev_pos] = *top;
 		} else if (next_lcp > top_lcp) {
 			/* Opening a new interval */
-			*++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
+			*++top = next_lcp | next_interval_idx++;
 			pos_data[prev_pos] = *top;
 		} else {
 			/* Closing the deepest open interval */
 			pos_data[prev_pos] = *top;
 			for (;;) {
 				const u32 closed_interval_idx = *top-- & POS_MASK;
-				const u32 superinterval_lcp = *top >> LCP_SHIFT;
+				const u32 superinterval_lcp = *top & LCP_MASK;
 
 				if (next_lcp == superinterval_lcp) {
 					/* Continuing the superinterval */
@@ -188,7 +205,7 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
 					 * superinterval of the one being
 					 * closed, but still a subinterval of
 					 * its superinterval */
-					*++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
+					*++top = next_lcp | next_interval_idx++;
 					intervals[closed_interval_idx] = *top;
 					break;
 				} else {
@@ -219,11 +236,11 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
  * 'cur_pos' is the position of the current suffix, which is the suffix being
  * matched against.  'cur_pos' starts at 0 and is incremented each time this
  * function is called.  This function finds each suffix with position less than
- * 'cur_pos' that shares a prefix with the current position, but for each
- * distinct prefix length it finds only the suffix with the greatest position
- * (i.e. the most recently seen in the linear traversal by position).  This
- * function accomplishes this using the lcp-interval tree data structure that
- * was built by build_LCPIT() and is updated dynamically by this function.
+ * 'cur_pos' that shares a prefix with the current suffix, but for each distinct
+ * prefix length it finds only the suffix with the greatest position (i.e. the
+ * most recently seen in the linear traversal by position).  This function
+ * accomplishes this using the lcp-interval tree data structure that was built
+ * by build_LCPIT() and is updated dynamically by this function.
  *
  * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
  * value of the deepest lcp-interval containing the current suffix --- or,
@@ -267,80 +284,95 @@ build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
  * around by just continuing until we get to a link that actually takes us
  * higher in the tree.  This can be described as a lazy-update scheme.
 */
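A hand-worked example of the behavior described above: for the 4-byte buffer "abab", the suffix array is [2, 0, 3, 1] and the LCP values between sorted neighbors are [-, 2, 0, 1]. Advancing position by position, nothing is reported at positions 0 and 1; at position 2 (suffix "ab") the deepest lcp-interval containing it also contains suffix 0 ("abab"), so the single match (length 2, offset 2) is reported; at position 3 (suffix "b") the lcp-1 interval yields (length 1, offset 2), suffix 1 being the most recent earlier occurrence of "b". Each distinct length appears once, paired with the greatest earlier position (min_match_len is ignored here for illustration).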
-static inline u32
+static forceinline u32
 lcpit_advance_one_byte(const u32 cur_pos,
 		       u32 pos_data[restrict],
 		       u32 intervals[restrict],
+		       u32 next[restrict],
 		       struct lz_match matches[restrict],
 		       const bool record_matches)
 {
-	u32 lcp;
-	u32 interval_idx;
+	u32 ref;
+	u32 super_ref;
 	u32 match_pos;
 	struct lz_match *matchptr;
 
 	/* Get the deepest lcp-interval containing the current suffix. */
-	lcp = pos_data[cur_pos] >> LCP_SHIFT;
-	interval_idx = pos_data[cur_pos] & POS_MASK;
-	prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
+	ref = pos_data[cur_pos];
+
+	/* Prefetch upcoming data, up to 3 positions ahead.  Assume the
+	 * intervals are already visited. */
+
+	/* Prefetch the superinterval via a suffix link for the deepest
+	 * lcp-interval containing the suffix starting 1 position from now. */
+	prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
+
+	/* Prefetch suffix link for the deepest lcp-interval containing the
+	 * suffix starting 2 positions from now. */
+	next[0] = intervals[next[1]] & POS_MASK;
+	prefetchw(&pos_data[next[0]]);
+
+	/* Prefetch the deepest lcp-interval containing the suffix starting 3
+	 * positions from now. */
+	next[1] = pos_data[cur_pos + 3] & POS_MASK;
+	prefetchw(&intervals[next[1]]);
+
+	/* There is no "next suffix" after the current one. */
 	pos_data[cur_pos] = 0;
 
-	/* Ascend until we reach a visited interval, linking the unvisited
-	 * intervals to the current suffix as we go.  */
-	while (intervals[interval_idx] & LCP_MASK) {
-		const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT;
-		const u32 superinterval_idx = intervals[interval_idx] & POS_MASK;
-		intervals[interval_idx] = cur_pos;
-		lcp = superinterval_lcp;
-		interval_idx = superinterval_idx;
+	/* Ascend until we reach a visited interval, the root, or a child of the
+	 * root.  Link unvisited intervals to the current suffix as we go. */
+	while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+		intervals[ref & POS_MASK] = cur_pos;
+		ref = super_ref;
 	}
 
-	match_pos = intervals[interval_idx] & POS_MASK;
-	if (match_pos == 0) {
-		/* Ambiguous case; just don't allow matches with position 0. */
-		if (interval_idx != 0)
-			intervals[interval_idx] = cur_pos;
+	if (super_ref == 0) {
+		/* In this case, the current interval may be any of:
+		 * (1) the root;
+		 * (2) an unvisited child of the root;
+		 * (3) an interval last visited by suffix 0
+		 *
+		 * We could avoid the ambiguity with (3) by using an lcp
+		 * placeholder value other than 0 to represent "visited", but
+		 * it's fastest to use 0.  So we just don't allow matches with
+		 * position 0. */
+
+		if (ref != 0)  /* Not the root? */
+			intervals[ref & POS_MASK] = cur_pos;
 		return 0;
 	}
 
-	matchptr = matches;
+	/* Ascend indirectly via pos_data[] links. */
+	match_pos = super_ref;
+	matchptr = matches;
 	for (;;) {
-		u32 next_lcp;
-		u32 next_interval_idx;
-
-		for (;;) {
-			next_lcp = pos_data[match_pos] >> LCP_SHIFT;
-			next_interval_idx = pos_data[match_pos] & POS_MASK;
-			if (next_lcp < lcp)
-				break;
-			/* Suffix was out of date. */
-			match_pos = intervals[next_interval_idx];
-		}
-		intervals[interval_idx] = cur_pos;
-		pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx;
+		while ((super_ref = pos_data[match_pos]) > ref)
+			match_pos = intervals[super_ref & POS_MASK];
+		intervals[ref & POS_MASK] = cur_pos;
+		pos_data[match_pos] = ref;
 
 		if (record_matches) {
-			matchptr->length = lcp;
+			matchptr->length = ref >> LCP_SHIFT;
 			matchptr->offset = cur_pos - match_pos;
 			matchptr++;
 		}
 
-		if (next_interval_idx == 0)
+		if (super_ref == 0)
 			break;
-		match_pos = intervals[next_interval_idx];
-		interval_idx = next_interval_idx;
-		lcp = next_lcp;
+		ref = super_ref;
+		match_pos = intervals[ref & POS_MASK];
 	}
 	return matchptr - matches;
 }
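One detail worth spelling out: the rewritten inner loop replaces the old explicit lcp comparison with the single unsigned comparison pos_data[match_pos] > ref. That is valid because the lcp field occupies the high bits of every packed entry, so plain integer order is lcp-major. A minimal self-contained check (values hypothetical):

#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;

#define LCP_SHIFT	(32 - 6)

int main(void)
{
	u32 deep    = ((u32)5 << LCP_SHIFT) | 900;	/* lcp 5, index 900 */
	u32 shallow = ((u32)3 << LCP_SHIFT) | 100;	/* lcp 3, index 100 */

	/* The higher lcp dominates the comparison regardless of the index
	 * bits, so "ascend while the linked entry is at least as deep"
	 * becomes a single unsigned compare. */
	assert(deep > shallow);
	return 0;
}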
 
 /* Expand SA from 32 bits to 64 bits. */
 static void
-expand_SA(u32 *p, u32 n)
+expand_SA(void *p, u32 n)
 {
 	typedef u32 _may_alias_attribute aliased_u32_t;
 	typedef u64 _may_alias_attribute aliased_u64_t;
-	aliased_u32_t *SA = (aliased_u32_t *)p;
-	aliased_u64_t *SA64 = (aliased_u64_t *)p;
+	aliased_u32_t *SA = p;
+	aliased_u64_t *SA64 = p;
 
 	u32 r = n - 1;
 	do {
@@ -357,7 +389,7 @@ build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
 	u32 h = 0;
 	for (u32 i = 0; i < n; i++) {
 		const u32 r = ISA[i];
-		prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
+		prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
 		if (r > 0) {
 			const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
 			const u32 lim = min(n - i, n - j);
@@ -401,15 +433,17 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 
 	for (u32 r = 1; r < n; r++) {
 		const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
-		const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT;
-		const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;
+		const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
+		const u64 top_lcp = intervals64[*top];
+
+		prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
 
 		if (next_lcp == top_lcp) {
 			/* Continuing the deepest open interval */
 			pos_data[prev_pos] = *top;
 		} else if (next_lcp > top_lcp) {
 			/* Opening a new interval */
-			intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT;
+			intervals64[next_interval_idx] = next_lcp;
 			pos_data[prev_pos] = next_interval_idx;
 			*++top = next_interval_idx++;
 		} else {
@@ -417,7 +451,7 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 			pos_data[prev_pos] = *top;
 			for (;;) {
 				const u32 closed_interval_idx = *top--;
-				const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;
+				const u64 superinterval_lcp = intervals64[*top];
 
 				if (next_lcp == superinterval_lcp) {
 					/* Continuing the superinterval */
@@ -429,8 +463,7 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 					 * superinterval of the one being
 					 * closed, but still a subinterval of
 					 * its superinterval */
-					intervals64[next_interval_idx] =
-						(u64)next_lcp << HUGE_LCP_SHIFT;
+					intervals64[next_interval_idx] = next_lcp;
 					intervals64[closed_interval_idx] |=
 						HUGE_UNVISITED_TAG | next_interval_idx;
 					*++top = next_interval_idx++;
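For the huge (64-bit) entries used below, the layout implied by the constants and by build_LCPIT_huge() is: the lcp in the top 7 bits, the "unvisited" tag at bit 32, and, in the low 32 bits, a superinterval index while unvisited or the most recent position once visited. A minimal self-contained sketch of the visit transition:

#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

#define HUGE_LCP_BITS		7
#define HUGE_LCP_SHIFT		(64 - HUGE_LCP_BITS)
#define HUGE_LCP_MAX		(((u32)1 << HUGE_LCP_BITS) - 1)
#define HUGE_LCP_MASK		((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
#define HUGE_POS_MASK		0xFFFFFFFF
#define HUGE_UNVISITED_TAG	0x100000000

int main(void)
{
	/* An unvisited interval: lcp 9, superinterval index 7. */
	u64 entry = ((u64)9 << HUGE_LCP_SHIFT) | HUGE_UNVISITED_TAG | 7;
	assert(entry & HUGE_UNVISITED_TAG);

	/* Visiting it at position 12345 keeps the lcp, clears the tag, and
	 * records the position, as lcpit_advance_one_byte_huge() does with
	 * (entry & HUGE_LCP_MASK) | cur_pos. */
	entry = (entry & HUGE_LCP_MASK) | 12345;
	assert(!(entry & HUGE_UNVISITED_TAG));
	assert((entry >> HUGE_LCP_SHIFT) == 9);
	assert((entry & HUGE_POS_MASK) == 12345);
	return 0;
}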
@@ -453,71 +486,70 @@ build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 
 /* Like lcpit_advance_one_byte(), but for buffers larger than
  * MAX_NORMAL_BUFSIZE. */
-static inline u32
+static forceinline u32
 lcpit_advance_one_byte_huge(const u32 cur_pos,
 			    u32 pos_data[restrict],
 			    u64 intervals64[restrict],
+			    u32 prefetch_next[restrict],
 			    struct lz_match matches[restrict],
 			    const bool record_matches)
 {
 	u32 interval_idx;
-	u32 lcp;
+	u32 next_interval_idx;
+	u64 cur;
+	u64 next;
 	u32 match_pos;
 	struct lz_match *matchptr;
 
 	interval_idx = pos_data[cur_pos];
-	prefetch(&intervals64[pos_data[cur_pos + 1]]);
+
+	prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
+
+	prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
+	prefetchw(&pos_data[prefetch_next[0]]);
+
+	prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
+	prefetchw(&intervals64[prefetch_next[1]]);
+
 	pos_data[cur_pos] = 0;
 
-	while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) {
-		lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
-		if (lcp == 0)
-			return 0;
-		const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK;
-		intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
-		interval_idx = superinterval_idx;
+	while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
+		intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
+		interval_idx = next & HUGE_POS_MASK;
 	}
 
-	match_pos = intervals64[interval_idx] & HUGE_POS_MASK;
-	lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
 	matchptr = matches;
-	while (lcp != 0) {
-		u32 next_interval_idx;
-		u32 next_lcp;
-
-		for (;;) {
+	while (next & HUGE_LCP_MASK) {
+		cur = next;
+		do {
+			match_pos = next & HUGE_POS_MASK;
 			next_interval_idx = pos_data[match_pos];
-			next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT;
-			if (next_lcp < lcp)
-				break;
-			match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
-		}
-		intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
+			next = intervals64[next_interval_idx];
+		} while (next > cur);
+		intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
 		pos_data[match_pos] = interval_idx;
 
 		if (record_matches) {
-			matchptr->length = lcp;
+			matchptr->length = cur >> HUGE_LCP_SHIFT;
 			matchptr->offset = cur_pos - match_pos;
 			matchptr++;
 		}
 
-		match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
 		interval_idx = next_interval_idx;
-		lcp = next_lcp;
 	}
 	return matchptr - matches;
 }
 
-static inline u64
+static forceinline u64
 get_pos_data_size(size_t max_bufsize)
 {
 	return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
 			DIVSUFSORT_TMP_LEN) * sizeof(u32);
 }
 
-static inline u64
+static forceinline u64
 get_intervals_size(size_t max_bufsize)
 {
-	return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ?
-				   sizeof(u32) : sizeof(u64));
+	return ((u64)max_bufsize + PREFETCH_SAFETY) *
+	       (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
 }
 
 /*
@@ -564,8 +596,6 @@ lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
 	mf->nice_match_len = min(nice_match_len,
 				 (max_bufsize <= MAX_NORMAL_BUFSIZE) ?
 				 LCP_MAX : HUGE_LCP_MAX);
-	for (u32 i = 0; i < PREFETCH_SAFETY; i++)
-		mf->pos_data[max_bufsize + i] = 0;
 	return true;
 }
 
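A rough worked example of the sizing above, for the largest normal-mode buffer (2^26 bytes): each array gets 4 bytes per entry plus PREFETCH_SAFETY extra slots, about 256 MiB apiece, i.e. roughly 8 bytes of matchfinder metadata per input byte. PREFETCH_SAFETY and DIVSUFSORT_TMP_LEN are defined elsewhere in this file; the value used below is only an assumption for illustration.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define MAX_NORMAL_BUFSIZE	((u64)1 << 26)	/* POS_MASK + 1, as defined above */
#define PREFETCH_SAFETY		4		/* assumed; defined elsewhere in the file */

int main(void)
{
	u64 max_bufsize = MAX_NORMAL_BUFSIZE;

	/* Mirrors get_pos_data_size()/get_intervals_size() for this case
	 * (the DIVSUFSORT_TMP_LEN lower bound is not the maximum here). */
	u64 pos_data_bytes  = (max_bufsize + PREFETCH_SAFETY) * sizeof(uint32_t);
	u64 intervals_bytes = (max_bufsize + PREFETCH_SAFETY) * sizeof(uint32_t);

	printf("%llu + %llu bytes\n",		/* ~256 MiB + ~256 MiB */
	       (unsigned long long)pos_data_bytes,
	       (unsigned long long)intervals_bytes);
	return 0;
}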
@@ -596,11 +626,10 @@
 static void
 build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
 {
-	/* Note: divsufsort() needs temporary space --- one array with 256
-	 * spaces and one array with 65536 spaces.  The implementation of
-	 * divsufsort() has been modified from the original to use the provided
-	 * temporary space instead of allocating its own, since we don't want to
-	 * have to deal with malloc() failures here. */
+	/* Note: divsufsort() requires a fixed amount of temporary space.  The
+	 * implementation of divsufsort() has been modified from the original to
+	 * use the provided temporary space instead of allocating its own, since
+	 * we don't want to have to deal with malloc() failures here. */
 	divsufsort(T, SA, n, tmp);
 }
 
@@ -635,11 +664,19 @@ lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
 	build_SA(mf->intervals, T, n, mf->pos_data);
 	build_ISA(mf->pos_data, mf->intervals, n);
 	if (n <= MAX_NORMAL_BUFSIZE) {
+		for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+			mf->intervals[n + i] = 0;
+			mf->pos_data[n + i] = 0;
+		}
 		build_LCP(mf->intervals, mf->pos_data, T, n,
 			  mf->min_match_len, mf->nice_match_len);
 		build_LCPIT(mf->intervals, mf->pos_data, n);
 		mf->huge_mode = false;
 	} else {
+		for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
+			mf->intervals64[n + i] = 0;
+			mf->pos_data[n + i] = 0;
+		}
 		expand_SA(mf->intervals, n);
 		build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
 			       mf->min_match_len, mf->nice_match_len);
@@ -647,6 +684,8 @@
 		mf->huge_mode = true;
 	}
 	mf->cur_pos = 0; /* starting at beginning of input buffer */
+	for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
+		mf->next[i] = 0;
 }
 
 /*
@@ -664,10 +703,12 @@ lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
 {
 	if (mf->huge_mode)
 		return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
-						   mf->intervals64, matches, true);
+						   mf->intervals64, mf->next,
+						   matches, true);
 	else
 		return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
-					      mf->intervals, matches, true);
+					      mf->intervals, mf->next,
+					      matches, true);
 }
 
 /*
@@ -680,12 +721,14 @@ lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
 	if (mf->huge_mode) {
 		do {
 			lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
-						    mf->intervals64, NULL, false);
+						    mf->intervals64, mf->next,
+						    NULL, false);
 		} while (--count);
 	} else {
 		do {
 			lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
-					       mf->intervals, NULL, false);
+					       mf->intervals, mf->next,
+					       NULL, false);
 		} while (--count);
 	}
}
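Finally, a hedged usage sketch of the public entry points this diff touches. lcpit_matchfinder_destroy() and the u8/u32 types come from wimlib's headers and are assumed here, as is the parameter order of lcpit_matchfinder_init(); the min_match_len/nice_match_len values are illustrative only.

#include "wimlib/lcpit_matchfinder.h"

/* Report every match at every position of 'buf'.  nice_match_len is capped
 * at LCP_MAX (63) in normal mode, so 64 match slots per position suffice. */
static void
example_scan(const u8 *buf, u32 size)
{
	struct lcpit_matchfinder mf;
	struct lz_match matches[64];

	if (!lcpit_matchfinder_init(&mf, size, 2, 48))	/* illustrative params */
		return;					/* allocation failed */
	lcpit_matchfinder_load_buffer(&mf, buf, size);
	for (u32 pos = 0; pos < size; pos++) {
		u32 num_matches = lcpit_matchfinder_get_matches(&mf, matches);
		/* matches[0..num_matches-1] run from longest to shortest;
		 * consume them here, or call lcpit_matchfinder_skip_bytes()
		 * instead to advance without recording matches. */
		(void)num_matches;
	}
	lcpit_matchfinder_destroy(&mf);
}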