lz_extend: simplify lz_extend() slightly
author Eric Biggers <ebiggers3@gmail.com>
Wed, 22 Jun 2016 01:01:57 +0000 (20:01 -0500)
committer Eric Biggers <ebiggers3@gmail.com>
Sat, 2 Jul 2016 14:58:24 +0000 (09:58 -0500)
Unrolling the first four word copies does not seem to give noticeably better
performance anymore, and on a recent Intel processor actually appears to
decrease the performance slightly.

include/wimlib/lz_extend.h

index 858cb9a..2fb76bc 100644 (file)
@@ -3,7 +3,7 @@
  *
  * The following copying information applies to this specific source code file:
  *
- * Written in 2014-2015 by Eric Biggers <ebiggers3@gmail.com>
+ * Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com>
  *
  * To the extent possible under law, the author(s) have dedicated all copyright
  * and related and neighboring rights to this software to the public domain
 #include "wimlib/bitops.h"
 #include "wimlib/unaligned.h"
 
-/* Return the number of bytes at @matchptr that match the bytes at @strptr, up
- * to a maximum of @max_len.  Initially, @start_len bytes are matched.  */
+/*
+ * Return the number of bytes at @matchptr that match the bytes at @strptr, up
+ * to a maximum of @max_len.  Initially, @len bytes are matched.
+ */
 static inline u32
 lz_extend(const u8 * const strptr, const u8 * const matchptr,
-         const u32 start_len, const u32 max_len)
+         u32 len, const u32 max_len)
 {
-       u32 len = start_len;
-       machine_word_t v_word;
-
-       if (UNALIGNED_ACCESS_IS_FAST) {
-
-               if (likely(max_len - len >= 4 * WORDSIZE)) {
-
-               #define COMPARE_WORD_STEP                                       \
-                       v_word = load_word_unaligned(&matchptr[len]) ^          \
-                                load_word_unaligned(&strptr[len]);             \
-                       if (v_word != 0)                                        \
-                               goto word_differs;                              \
-                       len += WORDSIZE;                                        \
-
-                       COMPARE_WORD_STEP
-                       COMPARE_WORD_STEP
-                       COMPARE_WORD_STEP
-                       COMPARE_WORD_STEP
-               #undef COMPARE_WORD_STEP
-               }
-
-               while (len + WORDSIZE <= max_len) {
-                       v_word = load_word_unaligned(&matchptr[len]) ^
-                                load_word_unaligned(&strptr[len]);
-                       if (v_word != 0)
-                               goto word_differs;
-                       len += WORDSIZE;
+       while (UNALIGNED_ACCESS_IS_FAST && len + WORDSIZE <= max_len) {
+               machine_word_t v = load_word_unaligned(matchptr + len) ^
+                                  load_word_unaligned(strptr + len);
+               if (v != 0) {
+                       if (CPU_IS_LITTLE_ENDIAN)
+                               len += ffsw(v) >> 3;
+                       else
+                               len += (8 * WORDSIZE - 1 - flsw(v)) >> 3;
+                       return len;
                }
+               len += WORDSIZE;
        }
 
        while (len < max_len && matchptr[len] == strptr[len])
                len++;
        return len;
-
-word_differs:
-       if (CPU_IS_LITTLE_ENDIAN)
-               len += (ffsw(v_word) >> 3);
-       else
-               len += (8 * WORDSIZE - 1 - flsw(v_word)) >> 3;
-       return len;
 }
 
 #endif /* _WIMLIB_LZ_EXTEND_H */