From: Eric Biggers Date: Sat, 14 Jun 2014 20:44:47 +0000 (-0500) Subject: Speed up LZ77 match copying X-Git-Tag: v1.7.0~8 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=543d8a6b89049aff65fa7eabf5f4b376a196c8d2 Speed up LZ77 match copying --- diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h index c25d5e84..ef7ebe95 100644 --- a/include/wimlib/decompress_common.h +++ b/include/wimlib/decompress_common.h @@ -199,4 +199,56 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms, unsigned num_bits, const u8 lens[], unsigned max_codeword_len); + +/* + * Copy a LZ77 match at (dst - offset) to dst. + * + * The length and offset must be already validated --- that is, (dst - offset) + * can't underrun the output buffer, and (dst + length) can't overrun the output + * buffer. Also, the length cannot be 0. + * + * @winend points to the byte past the end of the output buffer. + * This function won't write any data beyond this position. + */ +static inline void +lz_copy(u8 *dst, unsigned length, unsigned offset, const u8 *winend) +{ + const u8 *src = dst - offset; +#if defined(__x86_64__) || defined(__i386__) + /* Copy one 'unsigned long' at a time. On i386 and x86_64 this is + * faster than copying one byte at a time, unless the data is + * near-random and all the matches have very short lengths. Note that + * since this requires unaligned memory accesses, it won't necessarily + * be faster on every architecture. + * + * Also note that we might copy more than the length of the match. For + * example, if an 'unsigned long' is 8 bytes and the match is of length + * 5, then we'll simply copy 8 bytes. This is okay as long as we don't + * write beyond the end of the output buffer, hence the check for + * (winend - (dst + length) >= sizeof(unsigned long) - 1). */ + if (offset >= sizeof(unsigned long) && + winend - (dst + length) >= sizeof(unsigned long) - 1) + { + /* Access memory through a packed struct. This tricks the + * compiler into allowing unaligned memory accesses. */ + struct ulong_wrapper { + unsigned long v; + } _packed_attribute; + + const u8 *end = dst + length; + do { + unsigned long v = ((struct ulong_wrapper *)src)->v; + ((struct ulong_wrapper *)dst)->v = v; + dst += sizeof(unsigned long); + src += sizeof(unsigned long); + } while (dst < end); + + return; + } +#endif + do { + *dst++ = *src++; + } while (--length); +} + #endif /* _WIMLIB_DECOMPRESS_COMMON_H */ diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c index 1ddc3f19..532dfb59 100644 --- a/src/lzms-decompress.c +++ b/src/lzms-decompress.c @@ -664,7 +664,6 @@ static int lzms_copy_lz_match(struct lzms_decompressor *ctx, u32 length, u32 offset) { u8 *out_next; - u8 *matchptr; if (length > ctx->out_end - ctx->out_next) { LZMS_DEBUG("Match overrun!"); @@ -676,11 +675,10 @@ lzms_copy_lz_match(struct lzms_decompressor *ctx, u32 length, u32 offset) } out_next = ctx->out_next; - matchptr = out_next - offset; - while (length--) - *out_next++ = *matchptr++; - ctx->out_next = out_next; + lz_copy(out_next, length, offset, ctx->out_end); + ctx->out_next = out_next + length; + return 0; } diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c index d259eeed..10c37115 100644 --- a/src/lzx-decompress.c +++ b/src/lzx-decompress.c @@ -523,9 +523,6 @@ lzx_decode_match(unsigned main_element, int block_type, unsigned num_extra_bits; u32 verbatim_bits; u32 aligned_bits; - unsigned i; - u8 *match_dest; - u8 *match_src; /* The main element is offset by 256 because values under 256 indicate a * literal value. */ @@ -621,24 +618,8 @@ lzx_decode_match(unsigned main_element, int block_type, return -1; } - match_dest = window + window_pos; - match_src = match_dest - match_offset; - -#if 0 - printf("Match: src %u, dst %u, len %u\n", match_src - window, - match_dest - window, - match_len); - putchar('|'); - for (i = 0; i < match_len; i++) { - match_dest[i] = match_src[i]; - putchar(match_src[i]); - } - putchar('|'); - putchar('\n'); -#else - for (i = 0; i < match_len; i++) - match_dest[i] = match_src[i]; -#endif + lz_copy(&window[window_pos], match_len, match_offset, + &window[window_pos + bytes_remaining]); return match_len; } diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c index 855f2cb3..431563b6 100644 --- a/src/xpress-decompress.c +++ b/src/xpress-decompress.c @@ -89,12 +89,8 @@ xpress_decode_match(unsigned sym, input_idx_t window_pos, input_idx_t window_len, u8 window[restrict], struct input_bitstream * restrict istream) { - unsigned len_hdr; unsigned offset_bsr; - u8 *match_dest; - u8 *match_src; - unsigned i; unsigned match_len; unsigned match_offset; @@ -119,21 +115,14 @@ xpress_decode_match(unsigned sym, input_idx_t window_pos, } match_len += XPRESS_MIN_MATCH_LEN; - - /* Verify the match is in bounds, then copy its data to the current - * position. */ - - if (window_pos + match_len > window_len) + if (unlikely(match_len > window_len - window_pos)) return -1; - if (match_offset > window_pos) + if (unlikely(match_offset > window_pos)) return -1; - match_dest = window + window_pos; - match_src = match_dest - match_offset; - - for (i = 0; i < match_len; i++) - match_dest[i] = match_src[i]; + lz_copy(&window[window_pos], match_len, match_offset, + &window[window_len]); return match_len; }