From c3e031c6dfe0a2851b709e2ee54cd44dbd393285 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 9 Jul 2016 10:01:22 -0500 Subject: [PATCH] decompress_common: introduce fast path for lz_copy() --- include/wimlib/decompress_common.h | 132 ++++++++++++++++------------- src/lzms_decompress.c | 6 +- src/lzx_decompress.c | 13 +-- src/xpress_decompress.c | 10 +-- 4 files changed, 79 insertions(+), 82 deletions(-) diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h index d3961207..945a5e9f 100644 --- a/include/wimlib/decompress_common.h +++ b/include/wimlib/decompress_common.h @@ -431,70 +431,86 @@ repeat_byte(u8 b) } /* - * Copy an LZ77 match at (dst - offset) to dst. + * Copy an LZ77 match of 'length' bytes from the match source at 'out_next - + * offset' to the match destination at 'out_next'. The source and destination + * may overlap. * - * The length and offset must be already validated --- that is, (dst - offset) - * can't underrun the output buffer, and (dst + length) can't overrun the output - * buffer. Also, the length cannot be 0. + * This handles validating the length and offset. It is validated that the + * beginning of the match source is '>= out_begin' and that end of the match + * destination is '<= out_end'. The return value is 0 if the match was valid + * (and was copied), otherwise -1. * - * @winend points to the byte past the end of the output buffer. - * This function won't write any data beyond this position. + * 'min_length' is a hint which specifies the minimum possible match length. + * This should be a compile-time constant. */ -static inline void -lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length) +static inline int +lz_copy(u32 length, u32 offset, u8 *out_begin, u8 *out_next, u8 *out_end, + u32 min_length) { - const u8 *src = dst - offset; - const u8 * const end = dst + length; + const u8 *src; + u8 *end; + + /* Validate the offset. */ + if (unlikely(offset > out_next - out_begin)) + return -1; + + /* + * Fast path: copy a match which is no longer than a few words, is not + * overlapped such that copying a word at a time would produce incorrect + * results, and is not too close to the end of the buffer. Note that + * this might copy more than the length of the match, but that's okay in + * this scenario. + */ + src = out_next - offset; + if (UNALIGNED_ACCESS_IS_FAST && length <= 3 * WORDBYTES && + offset >= WORDBYTES && out_end - out_next >= 3 * WORDBYTES) + { + copy_word_unaligned(src + WORDBYTES*0, out_next + WORDBYTES*0); + copy_word_unaligned(src + WORDBYTES*1, out_next + WORDBYTES*1); + copy_word_unaligned(src + WORDBYTES*2, out_next + WORDBYTES*2); + return 0; + } + + /* Validate the length. This isn't needed in the fast path above, due + * to the additional conditions tested, but we do need it here. */ + if (unlikely(length > out_end - out_next)) + return -1; + end = out_next + length; /* - * Try to copy one machine word at a time. On i386 and x86_64 this is - * faster than copying one byte at a time, unless the data is - * near-random and all the matches have very short lengths. Note that - * since this requires unaligned memory accesses, it won't necessarily - * be faster on every architecture. + * Try to copy one word at a time. On i386 and x86_64 this is faster + * than copying one byte at a time, unless the data is near-random and + * all the matches have very short lengths. Note that since this + * requires unaligned memory accesses, it won't necessarily be faster on + * every architecture. * * Also note that we might copy more than the length of the match. For * example, if a word is 8 bytes and the match is of length 5, then * we'll simply copy 8 bytes. This is okay as long as we don't write - * beyond the end of the output buffer, hence the check for (winend - + * beyond the end of the output buffer, hence the check for (out_end - * end >= WORDBYTES - 1). */ - if (UNALIGNED_ACCESS_IS_FAST && likely(winend - end >= WORDBYTES - 1)) { - + if (UNALIGNED_ACCESS_IS_FAST && likely(out_end - end >= WORDBYTES - 1)) + { if (offset >= WORDBYTES) { - /* The source and destination words don't overlap. */ - - /* To improve branch prediction, one iteration of this - * loop is unrolled. Most matches are short and will - * fail the first check. But if that check passes, then - * it becomes increasing likely that the match is long - * and we'll need to continue copying. */ - - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - - if (dst < end) { - do { - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - } - return; + /* The source and destination words don't overlap. */ + do { + copy_word_unaligned(src, out_next); + src += WORDBYTES; + out_next += WORDBYTES; + } while (out_next < end); + return 0; } else if (offset == 1) { - /* Offset 1 matches are equivalent to run-length * encoding of the previous byte. This case is common - * if the data contains many repeated bytes. */ - - machine_word_t v = repeat_byte(*(dst - 1)); + * if the data contains many repeated bytes. */ + machine_word_t v = repeat_byte(*(out_next - 1)); do { - store_word_unaligned(v, dst); + store_word_unaligned(v, out_next); src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - return; + out_next += WORDBYTES; + } while (out_next < end); + return 0; } /* * We don't bother with special cases for other 'offset < @@ -508,22 +524,16 @@ lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length) } /* Fall back to a bytewise copy. */ - - if (min_length >= 2) { - *dst++ = *src++; - length--; - } - if (min_length >= 3) { - *dst++ = *src++; - length--; - } - if (min_length >= 4) { - *dst++ = *src++; - length--; - } + if (min_length >= 2) + *out_next++ = *src++; + if (min_length >= 3) + *out_next++ = *src++; + if (min_length >= 4) + *out_next++ = *src++; do { - *dst++ = *src++; - } while (--length); + *out_next++ = *src++; + } while (out_next != end); + return 0; } #endif /* _WIMLIB_DECOMPRESS_COMMON_H */ diff --git a/src/lzms_decompress.c b/src/lzms_decompress.c index 4791054a..d6de2990 100644 --- a/src/lzms_decompress.c +++ b/src/lzms_decompress.c @@ -822,12 +822,10 @@ lzms_decompress(const void * const restrict in, const size_t in_nbytes, length = lzms_decode_length(d, &is); - if (unlikely(length > out_end - out_next)) - return -1; - if (unlikely(offset > out_next - (u8 *)out)) + if (unlikely(lz_copy(length, offset, out, out_next, out_end, + LZMS_MIN_MATCH_LENGTH))) return -1; - lz_copy(out_next, length, offset, out_end, LZMS_MIN_MATCH_LENGTH); out_next += length; } else { /* Delta match */ diff --git a/src/lzx_decompress.c b/src/lzx_decompress.c index 9f93fcf7..7b02dbf6 100644 --- a/src/lzx_decompress.c +++ b/src/lzx_decompress.c @@ -409,18 +409,11 @@ lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is, } recent_offsets[0] = offset; - /* Validate the match, then copy it to the current position. */ - - if (unlikely(length > block_end - out_next)) - return -1; - - if (unlikely(offset > out_next - out_begin)) + /* Validate the match and copy it to the current position. */ + if (unlikely(lz_copy(length, offset, out_begin, + out_next, block_end, LZX_MIN_MATCH_LEN))) return -1; - - lz_copy(out_next, length, offset, block_end, LZX_MIN_MATCH_LEN); - out_next += length; - } while (out_next != block_end); return 0; diff --git a/src/xpress_decompress.c b/src/xpress_decompress.c index 9fd5ac5e..6a0abe2a 100644 --- a/src/xpress_decompress.c +++ b/src/xpress_decompress.c @@ -138,15 +138,11 @@ xpress_decompress(const void *restrict compressed_data, size_t compressed_size, } length += XPRESS_MIN_MATCH_LEN; - if (unlikely(offset > out_next - out_begin)) + if (unlikely(lz_copy(length, offset, + out_begin, out_next, out_end, + XPRESS_MIN_MATCH_LEN))) return -1; - if (unlikely(length > out_end - out_next)) - return -1; - - lz_copy(out_next, length, offset, out_end, - XPRESS_MIN_MATCH_LEN); - out_next += length; } } -- 2.43.0