From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 14 Jun 2014 20:44:47 +0000 (-0500)
Subject: Speed up LZ77 match copying
X-Git-Tag: v1.7.0~8
X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=543d8a6b89049aff65fa7eabf5f4b376a196c8d2

Speed up LZ77 match copying
---

diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h
index c25d5e84..ef7ebe95 100644
--- a/include/wimlib/decompress_common.h
+++ b/include/wimlib/decompress_common.h
@@ -199,4 +199,56 @@ make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
 			  unsigned num_bits, const u8 lens[],
 			  unsigned max_codeword_len);
 
+
+/*
+ * Copy a LZ77 match at (dst - offset) to dst.
+ *
+ * The length and offset must be already validated --- that is, (dst - offset)
+ * can't underrun the output buffer, and (dst + length) can't overrun the output
+ * buffer.  Also, the length cannot be 0.
+ *
+ * @winend points to the byte past the end of the output buffer.
+ * This function won't write any data beyond this position.
+ */
+static inline void
+lz_copy(u8 *dst, unsigned length, unsigned offset, const u8 *winend)
+{
+	const u8 *src = dst - offset;
+#if defined(__x86_64__) || defined(__i386__)
+	/* Copy one 'unsigned long' at a time.  On i386 and x86_64 this is
+	 * faster than copying one byte at a time, unless the data is
+	 * near-random and all the matches have very short lengths.  Note that
+	 * since this requires unaligned memory accesses, it won't necessarily
+	 * be faster on every architecture.
+	 *
+	 * Also note that we might copy more than the length of the match.  For
+	 * example, if an 'unsigned long' is 8 bytes and the match is of length
+	 * 5, then we'll simply copy 8 bytes.  This is okay as long as we don't
+	 * write beyond the end of the output buffer, hence the check for
+	 * (winend - (dst + length) >= sizeof(unsigned long) - 1).  */
+	if (offset >= sizeof(unsigned long) &&
+			winend - (dst + length) >= sizeof(unsigned long) - 1)
+	{
+		/* Access memory through a packed struct.  This tricks the
+		 * compiler into allowing unaligned memory accesses.  */
+		struct ulong_wrapper {
+			unsigned long v;
+		} _packed_attribute;
+
+		const u8 *end = dst + length;
+		do {
+			unsigned long v = ((struct ulong_wrapper *)src)->v;
+			((struct ulong_wrapper *)dst)->v = v;
+			dst += sizeof(unsigned long);
+			src += sizeof(unsigned long);
+		} while (dst < end);
+
+		return;
+	}
+#endif
+	do {
+		*dst++ = *src++;
+	} while (--length);
+}
+
 #endif /* _WIMLIB_DECOMPRESS_COMMON_H */
diff --git a/src/lzms-decompress.c b/src/lzms-decompress.c
index 1ddc3f19..532dfb59 100644
--- a/src/lzms-decompress.c
+++ b/src/lzms-decompress.c
@@ -664,7 +664,6 @@ static int
 lzms_copy_lz_match(struct lzms_decompressor *ctx, u32 length, u32 offset)
 {
 	u8 *out_next;
-	u8 *matchptr;
 
 	if (length > ctx->out_end - ctx->out_next) {
 		LZMS_DEBUG("Match overrun!");
@@ -676,11 +675,10 @@ lzms_copy_lz_match(struct lzms_decompressor *ctx, u32 length, u32 offset)
 	}
 
 	out_next = ctx->out_next;
-	matchptr = out_next - offset;
-	while (length--)
-		*out_next++ = *matchptr++;
 
-	ctx->out_next = out_next;
+	lz_copy(out_next, length, offset, ctx->out_end);
+	ctx->out_next = out_next + length;
+
 	return 0;
 }
 
diff --git a/src/lzx-decompress.c b/src/lzx-decompress.c
index d259eeed..10c37115 100644
--- a/src/lzx-decompress.c
+++ b/src/lzx-decompress.c
@@ -523,9 +523,6 @@ lzx_decode_match(unsigned main_element, int block_type,
 	unsigned num_extra_bits;
 	u32 verbatim_bits;
 	u32 aligned_bits;
-	unsigned i;
-	u8 *match_dest;
-	u8 *match_src;
 
 	/* The main element is offset by 256 because values under 256 indicate a
 	 * literal value. */
@@ -621,24 +618,8 @@ lzx_decode_match(unsigned main_element, int block_type,
 		return -1;
 	}
 
-	match_dest = window + window_pos;
-	match_src = match_dest - match_offset;
-
-#if 0
-	printf("Match: src %u, dst %u, len %u\n", match_src - window,
-						match_dest - window,
-						match_len);
-	putchar('|');
-	for (i = 0; i < match_len; i++) {
-		match_dest[i] = match_src[i];
-		putchar(match_src[i]);
-	}
-	putchar('|');
-	putchar('\n');
-#else
-	for (i = 0; i < match_len; i++)
-		match_dest[i] = match_src[i];
-#endif
+	lz_copy(&window[window_pos], match_len, match_offset,
+		&window[window_pos + bytes_remaining]);
 
 	return match_len;
 }
diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c
index 855f2cb3..431563b6 100644
--- a/src/xpress-decompress.c
+++ b/src/xpress-decompress.c
@@ -89,12 +89,8 @@ xpress_decode_match(unsigned sym, input_idx_t window_pos,
 		    input_idx_t window_len, u8 window[restrict],
 		    struct input_bitstream * restrict istream)
 {
-
 	unsigned len_hdr;
 	unsigned offset_bsr;
-	u8 *match_dest;
-	u8 *match_src;
-	unsigned i;
 	unsigned match_len;
 	unsigned match_offset;
 
@@ -119,21 +115,14 @@ xpress_decode_match(unsigned sym, input_idx_t window_pos,
 	}
 	match_len += XPRESS_MIN_MATCH_LEN;
 
-
-	/* Verify the match is in bounds, then copy its data to the current
-	 * position.  */
-
-	if (window_pos + match_len > window_len)
+	if (unlikely(match_len > window_len - window_pos))
 		return -1;
 
-	if (match_offset > window_pos)
+	if (unlikely(match_offset > window_pos))
 		return -1;
 
-	match_dest = window + window_pos;
-	match_src = match_dest - match_offset;
-
-	for (i = 0; i < match_len; i++)
-		match_dest[i] = match_src[i];
+	lz_copy(&window[window_pos], match_len, match_offset,
+		&window[window_len]);
 
 	return match_len;
 }