]> wimlib.net Git - wimlib/blob - include/wimlib/matchfinder_avx2.h
Faster XPRESS compression
[wimlib] / include / wimlib / matchfinder_avx2.h
1 /*
2  * matchfinder_avx2.h
3  *
4  * Matchfinding routines optimized for Intel AVX2 (Advanced Vector Extensions).
5  */
6
7 #include <immintrin.h>
8
9 static inline bool
10 matchfinder_init_avx2(pos_t *data, size_t size)
11 {
12         __m256i v, *p;
13         size_t n;
14
15         if (size % sizeof(__m256i) * 4)
16                 return false;
17
18         if (sizeof(pos_t) == 2)
19                 v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
20         else if (sizeof(pos_t) == 4)
21                 v = _mm256_set1_epi32(MATCHFINDER_INITVAL);
22         else
23                 return false;
24
25         p = (__m256i *)data;
26         n = size / (sizeof(__m256i) * 4);
27         do {
28                 p[0] = v;
29                 p[1] = v;
30                 p[2] = v;
31                 p[3] = v;
32                 p += 4;
33         } while (--n);
34         return true;
35 }
36
37 static inline bool
38 matchfinder_rebase_avx2(pos_t *data, size_t size)
39 {
40         __m256i v, *p;
41         size_t n;
42
43         if ((size % sizeof(__m256i) * 4 != 0))
44                 return false;
45
46         if (sizeof(pos_t) == 2)
47                 v = _mm256_set1_epi16((pos_t)-MATCHFINDER_WINDOW_SIZE);
48         else if (sizeof(pos_t) == 4)
49                 v = _mm256_set1_epi32((pos_t)-MATCHFINDER_WINDOW_SIZE);
50         else
51                 return false;
52
53         p = (__m256i *)data;
54         n = size / (sizeof(__m256i) * 4);
55         do {
56                 /* PADDSW: Add Packed Signed Integers With Signed Saturation  */
57                 p[0] = _mm256_adds_epi16(p[0], v);
58                 p[1] = _mm256_adds_epi16(p[1], v);
59                 p[2] = _mm256_adds_epi16(p[2], v);
60                 p[3] = _mm256_adds_epi16(p[3], v);
61                 p += 4;
62         } while (--n);
63         return true;
64 }