wimlib.net Git - wimlib/commitdiff
lz extend pcmpestri
author Eric Biggers <ebiggers3@gmail.com>
Fri, 3 Jun 2016 07:22:16 +0000 (02:22 -0500)
committer Eric Biggers <ebiggers3@gmail.com>
Sat, 4 Jun 2016 23:30:16 +0000 (18:30 -0500)
include/wimlib/lz_extend.h
src/lzx_compress.c

index 858cb9a3b1f791edccbb6f2663f71fc8a3eb3983..ad16847d9a769ab17175a45dcc4f352366f3e1c4 100644 (file)
 #include "wimlib/bitops.h"
 #include "wimlib/unaligned.h"
 
+#include <smmintrin.h>
+
 /* Return the number of bytes at @matchptr that match the bytes at @strptr, up
  * to a maximum of @max_len.  Initially, @start_len bytes are matched.  */
 static inline u32
 lz_extend(const u8 * const strptr, const u8 * const matchptr,
          const u32 start_len, const u32 max_len)
 {
+#if 0
        u32 len = start_len;
        machine_word_t v_word;
 
-       if (UNALIGNED_ACCESS_IS_FAST) {
+       for (;;) {
+               v_word = load_word_unaligned(&matchptr[len]) ^
+                        load_word_unaligned(&strptr[len]);
+               if (v_word != 0 || len >= max_len)
+                       break;
+               len += WORDSIZE;
+       }
+
+       return min(max_len, len + (ffsw(v_word) >> 3));
+#else
 
-               if (likely(max_len - len >= 4 * WORDSIZE)) {
+       const u8 *p1 = strptr + start_len;
+       const u8 *p2 = matchptr + start_len;
 
-               #define COMPARE_WORD_STEP                                       \
-                       v_word = load_word_unaligned(&matchptr[len]) ^          \
-                                load_word_unaligned(&strptr[len]);             \
-                       if (v_word != 0)                                        \
-                               goto word_differs;                              \
-                       len += WORDSIZE;                                        \
+       u8 saved = strptr[max_len];
+       ((u8 *)strptr)[max_len] = matchptr[max_len] + 1;
 
-                       COMPARE_WORD_STEP
-                       COMPARE_WORD_STEP
-                       COMPARE_WORD_STEP
-                       COMPARE_WORD_STEP
-               #undef COMPARE_WORD_STEP
-               }
+       __asm__(
+               "  movdqu (%[p1]), %%xmm0                    \n"
+               "  pcmpestri $0x18, (%[p2]), %%xmm0          \n"
+               "  jc 2f                                     \n"
+               "1:                                          \n"
+               "  add $0x10, %[p1]                          \n"
+               "  add $0x10, %[p2]                          \n"
+               "  movdqu (%[p1]), %%xmm0                    \n"
+               "  pcmpestri $0x18, (%[p2]), %%xmm0          \n"
+               "  jnc 1b                                    \n"
+               "2:                                          \n"
+               "  add %%rcx, %[p1]                          \n"
+               "  add %%rcx, %[p2]                          \n"
+               : [p1] "+r" (p1), [p2] "+r" (p2)
+               : "a" (16), "d" (16)
+               : "rcx", "cc", "xmm0", "memory"
+              );
 
-               while (len + WORDSIZE <= max_len) {
-                       v_word = load_word_unaligned(&matchptr[len]) ^
-                                load_word_unaligned(&strptr[len]);
-                       if (v_word != 0)
-                               goto word_differs;
-                       len += WORDSIZE;
-               }
-       }
 
-       while (len < max_len && matchptr[len] == strptr[len])
-               len++;
-       return len;
+       ((u8 *)strptr)[max_len] = saved;
 
-word_differs:
-       if (CPU_IS_LITTLE_ENDIAN)
-               len += (ffsw(v_word) >> 3);
-       else
-               len += (8 * WORDSIZE - 1 - flsw(v_word)) >> 3;
-       return len;
+       return p1 - strptr;
+#endif
 }
 
 #endif /* _WIMLIB_LZ_EXTEND_H */
index 0a2e88dfdf3e26a6a9af10af928f843061d3a561..e46680420b7c894861fcf8fd1f7f1f92cc547718 100644 (file)
@@ -2472,12 +2472,14 @@ lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
 {
        u64 size = 0;
 
+       destructive = false;
+
        if (max_bufsize > LZX_MAX_WINDOW_SIZE)
                return 0;
 
        size += lzx_get_compressor_size(max_bufsize, compression_level);
        if (!destructive)
-               size += max_bufsize; /* in_buffer */
+               size += max_bufsize + LZX_MAX_MATCH_LEN; /* in_buffer */
        return size;
 }
 
@@ -2488,6 +2490,8 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
        unsigned window_order;
        struct lzx_compressor *c;
 
+       destructive = false;
+
        window_order = lzx_get_window_order(max_bufsize);
        if (window_order == 0)
                return WIMLIB_ERR_INVALID_PARAM;
@@ -2502,9 +2506,10 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
        c->window_order = window_order;
 
        if (!c->destructive) {
-               c->in_buffer = MALLOC(max_bufsize);
+               c->in_buffer = MALLOC(max_bufsize + LZX_MAX_MATCH_LEN);
                if (!c->in_buffer)
                        goto oom1;
+               randomize_byte_array(&c->in_buffer[max_bufsize], LZX_MAX_MATCH_LEN);
        }
 
        if (compression_level <= LZX_MAX_FAST_LEVEL) {