From e9ab3bf39bf6689a62a94fc45ccc2b3f78e183c8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 3 Jun 2016 02:22:16 -0500 Subject: [PATCH] lz extend pcmpestri --- include/wimlib/lz_extend.h | 68 +++++++++++++++++++++----------------- src/lzx_compress.c | 9 +++-- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/include/wimlib/lz_extend.h b/include/wimlib/lz_extend.h index 858cb9a3..ad16847d 100644 --- a/include/wimlib/lz_extend.h +++ b/include/wimlib/lz_extend.h @@ -24,52 +24,58 @@ #include "wimlib/bitops.h" #include "wimlib/unaligned.h" +#include + /* Return the number of bytes at @matchptr that match the bytes at @strptr, up * to a maximum of @max_len. Initially, @start_len bytes are matched. */ static inline u32 lz_extend(const u8 * const strptr, const u8 * const matchptr, const u32 start_len, const u32 max_len) { +#if 0 /* compiled out: only the #else branch further down is built */ u32 len = start_len; machine_word_t v_word; - if (UNALIGNED_ACCESS_IS_FAST) { + for (;;) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0 || len >= max_len) + break; + len += WORDSIZE; + } + + return min(max_len, len + (ffsw(v_word) >> 3)); +#else /* active variant: inline-asm pcmpestri loop stepping 16 bytes at a time */ - if (likely(max_len - len >= 4 * WORDSIZE)) { + const u8 *p1 = strptr + start_len; + const u8 *p2 = matchptr + start_len; /* p1 and p2 are cursors into the two buffers, starting past the start_len bytes that are already matched */ - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDSIZE; \ + u8 saved = strptr[max_len]; /* keep the original byte so it can be put back before returning */ + ((u8 *)strptr)[max_len] = matchptr[max_len] + 1; /* sentinel: forces strptr[max_len] != matchptr[max_len]; note that this writes through a cast of the const-qualified strptr and accesses index max_len in both buffers */ - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - #undef COMPARE_WORD_STEP - } + __asm__( /* the loop is left via the carry flag tested by jc and jnc; NOTE(review): imm8 0x18 looks like the equal-each / negative-polarity mode of pcmpestri (index of first mismatching byte in ecx) -- confirm against the Intel SDM */ + " movdqu (%[p1]), %%xmm0 \n" + " pcmpestri $0x18, (%[p2]), %%xmm0 \n" + " jc 2f \n" + "1: \n" + " add $0x10, %[p1] \n" + " add $0x10, %[p2] \n" + " movdqu (%[p1]), %%xmm0 \n" + " pcmpestri $0x18, (%[p2]), %%xmm0 \n" + " jnc 1b \n" + "2: \n" + " add %%rcx, %[p1] \n" + " add %%rcx, %[p2] \n" + : [p1] "+r" (p1), [p2] "+r" (p2) 
+ : "a" (16), "d" (16) /* inputs: the a and d registers are both loaded with 16 */ + : "rcx", "cc", "xmm0", "memory" /* clobber list; rcx is the register added to both cursors at label 2 */ + ); - while (len + WORDSIZE <= max_len) { - v_word = load_word_unaligned(&matchptr[len]) ^ - load_word_unaligned(&strptr[len]); - if (v_word != 0) - goto word_differs; - len += WORDSIZE; - } - } - while (len < max_len && matchptr[len] == strptr[len]) - len++; - return len; + ((u8 *)strptr)[max_len] = saved; /* undo the sentinel write */ -word_differs: - if (CPU_IS_LITTLE_ENDIAN) - len += (ffsw(v_word) >> 3); - else - len += (8 * WORDSIZE - 1 - flsw(v_word)) >> 3; - return len; + return p1 - strptr; /* offset of p1 from strptr: start_len plus however far the asm advanced the cursor */ +#endif } #endif /* _WIMLIB_LZ_EXTEND_H */ diff --git a/src/lzx_compress.c b/src/lzx_compress.c index 0a2e88df..e4668042 100644 --- a/src/lzx_compress.c +++ b/src/lzx_compress.c @@ -2472,12 +2472,14 @@ lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level, { u64 size = 0; + destructive = false; /* unconditionally cleared, so the !destructive branch below always runs */ + if (max_bufsize > LZX_MAX_WINDOW_SIZE) return 0; size += lzx_get_compressor_size(max_bufsize, compression_level); if (!destructive) - size += max_bufsize; /* in_buffer */ + size += max_bufsize + LZX_MAX_MATCH_LEN; /* in_buffer, including LZX_MAX_MATCH_LEN extra bytes */ return size; } @@ -2488,6 +2490,8 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, unsigned window_order; struct lzx_compressor *c; + destructive = false; /* unconditionally cleared */ + window_order = lzx_get_window_order(max_bufsize); if (window_order == 0) return WIMLIB_ERR_INVALID_PARAM; @@ -2502,9 +2506,10 @@ lzx_create_compressor(size_t max_bufsize, unsigned compression_level, c->window_order = window_order; if (!c->destructive) { - c->in_buffer = MALLOC(max_bufsize); + c->in_buffer = MALLOC(max_bufsize + LZX_MAX_MATCH_LEN); /* allocates LZX_MAX_MATCH_LEN bytes beyond max_bufsize */ if (!c->in_buffer) goto oom1; + randomize_byte_array(&c->in_buffer[max_bufsize], LZX_MAX_MATCH_LEN); /* hands the extra tail region to randomize_byte_array() */ } if (compression_level <= LZX_MAX_FAST_LEVEL) { -- 2.43.0