]> wimlib.net Git - wimlib/blob - src/lzms_common.c
lzms_common.c: support x32 ABI in find_next_opcode_sse4_2()
[wimlib] / src / lzms_common.c
1 /*
2  * lzms_common.c - Common code for LZMS compression and decompression
3  */
4
5 /*
6  * Copyright (C) 2013-2016 Eric Biggers
7  *
8  * This file is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU Lesser General Public License as published by the Free
10  * Software Foundation; either version 3 of the License, or (at your option) any
11  * later version.
12  *
13  * This file is distributed in the hope that it will be useful, but WITHOUT
14  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
16  * details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this file; if not, see http://www.gnu.org/licenses/.
20  */
21
22 #ifdef HAVE_CONFIG_H
23 #  include "config.h"
24 #endif
25
26 #include "wimlib/lzms_common.h"
27 #include "wimlib/unaligned.h"
28 #include "wimlib/x86_cpu_features.h"
29
30 #ifdef __x86_64__
31 #  include <emmintrin.h>
32 #endif
33
34 /* Table: offset slot => offset slot base value  */
35 const u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1] = {
36         0x00000001, 0x00000002, 0x00000003, 0x00000004,
37         0x00000005, 0x00000006, 0x00000007, 0x00000008,
38         0x00000009, 0x0000000d, 0x00000011, 0x00000015,
39         0x00000019, 0x0000001d, 0x00000021, 0x00000025,
40         0x00000029, 0x0000002d, 0x00000035, 0x0000003d,
41         0x00000045, 0x0000004d, 0x00000055, 0x0000005d,
42         0x00000065, 0x00000075, 0x00000085, 0x00000095,
43         0x000000a5, 0x000000b5, 0x000000c5, 0x000000d5,
44         0x000000e5, 0x000000f5, 0x00000105, 0x00000125,
45         0x00000145, 0x00000165, 0x00000185, 0x000001a5,
46         0x000001c5, 0x000001e5, 0x00000205, 0x00000225,
47         0x00000245, 0x00000265, 0x00000285, 0x000002a5,
48         0x000002c5, 0x000002e5, 0x00000325, 0x00000365,
49         0x000003a5, 0x000003e5, 0x00000425, 0x00000465,
50         0x000004a5, 0x000004e5, 0x00000525, 0x00000565,
51         0x000005a5, 0x000005e5, 0x00000625, 0x00000665,
52         0x000006a5, 0x00000725, 0x000007a5, 0x00000825,
53         0x000008a5, 0x00000925, 0x000009a5, 0x00000a25,
54         0x00000aa5, 0x00000b25, 0x00000ba5, 0x00000c25,
55         0x00000ca5, 0x00000d25, 0x00000da5, 0x00000e25,
56         0x00000ea5, 0x00000f25, 0x00000fa5, 0x00001025,
57         0x000010a5, 0x000011a5, 0x000012a5, 0x000013a5,
58         0x000014a5, 0x000015a5, 0x000016a5, 0x000017a5,
59         0x000018a5, 0x000019a5, 0x00001aa5, 0x00001ba5,
60         0x00001ca5, 0x00001da5, 0x00001ea5, 0x00001fa5,
61         0x000020a5, 0x000021a5, 0x000022a5, 0x000023a5,
62         0x000024a5, 0x000026a5, 0x000028a5, 0x00002aa5,
63         0x00002ca5, 0x00002ea5, 0x000030a5, 0x000032a5,
64         0x000034a5, 0x000036a5, 0x000038a5, 0x00003aa5,
65         0x00003ca5, 0x00003ea5, 0x000040a5, 0x000042a5,
66         0x000044a5, 0x000046a5, 0x000048a5, 0x00004aa5,
67         0x00004ca5, 0x00004ea5, 0x000050a5, 0x000052a5,
68         0x000054a5, 0x000056a5, 0x000058a5, 0x00005aa5,
69         0x00005ca5, 0x00005ea5, 0x000060a5, 0x000064a5,
70         0x000068a5, 0x00006ca5, 0x000070a5, 0x000074a5,
71         0x000078a5, 0x00007ca5, 0x000080a5, 0x000084a5,
72         0x000088a5, 0x00008ca5, 0x000090a5, 0x000094a5,
73         0x000098a5, 0x00009ca5, 0x0000a0a5, 0x0000a4a5,
74         0x0000a8a5, 0x0000aca5, 0x0000b0a5, 0x0000b4a5,
75         0x0000b8a5, 0x0000bca5, 0x0000c0a5, 0x0000c4a5,
76         0x0000c8a5, 0x0000cca5, 0x0000d0a5, 0x0000d4a5,
77         0x0000d8a5, 0x0000dca5, 0x0000e0a5, 0x0000e4a5,
78         0x0000eca5, 0x0000f4a5, 0x0000fca5, 0x000104a5,
79         0x00010ca5, 0x000114a5, 0x00011ca5, 0x000124a5,
80         0x00012ca5, 0x000134a5, 0x00013ca5, 0x000144a5,
81         0x00014ca5, 0x000154a5, 0x00015ca5, 0x000164a5,
82         0x00016ca5, 0x000174a5, 0x00017ca5, 0x000184a5,
83         0x00018ca5, 0x000194a5, 0x00019ca5, 0x0001a4a5,
84         0x0001aca5, 0x0001b4a5, 0x0001bca5, 0x0001c4a5,
85         0x0001cca5, 0x0001d4a5, 0x0001dca5, 0x0001e4a5,
86         0x0001eca5, 0x0001f4a5, 0x0001fca5, 0x000204a5,
87         0x00020ca5, 0x000214a5, 0x00021ca5, 0x000224a5,
88         0x000234a5, 0x000244a5, 0x000254a5, 0x000264a5,
89         0x000274a5, 0x000284a5, 0x000294a5, 0x0002a4a5,
90         0x0002b4a5, 0x0002c4a5, 0x0002d4a5, 0x0002e4a5,
91         0x0002f4a5, 0x000304a5, 0x000314a5, 0x000324a5,
92         0x000334a5, 0x000344a5, 0x000354a5, 0x000364a5,
93         0x000374a5, 0x000384a5, 0x000394a5, 0x0003a4a5,
94         0x0003b4a5, 0x0003c4a5, 0x0003d4a5, 0x0003e4a5,
95         0x0003f4a5, 0x000404a5, 0x000414a5, 0x000424a5,
96         0x000434a5, 0x000444a5, 0x000454a5, 0x000464a5,
97         0x000474a5, 0x000484a5, 0x000494a5, 0x0004a4a5,
98         0x0004b4a5, 0x0004c4a5, 0x0004e4a5, 0x000504a5,
99         0x000524a5, 0x000544a5, 0x000564a5, 0x000584a5,
100         0x0005a4a5, 0x0005c4a5, 0x0005e4a5, 0x000604a5,
101         0x000624a5, 0x000644a5, 0x000664a5, 0x000684a5,
102         0x0006a4a5, 0x0006c4a5, 0x0006e4a5, 0x000704a5,
103         0x000724a5, 0x000744a5, 0x000764a5, 0x000784a5,
104         0x0007a4a5, 0x0007c4a5, 0x0007e4a5, 0x000804a5,
105         0x000824a5, 0x000844a5, 0x000864a5, 0x000884a5,
106         0x0008a4a5, 0x0008c4a5, 0x0008e4a5, 0x000904a5,
107         0x000924a5, 0x000944a5, 0x000964a5, 0x000984a5,
108         0x0009a4a5, 0x0009c4a5, 0x0009e4a5, 0x000a04a5,
109         0x000a24a5, 0x000a44a5, 0x000a64a5, 0x000aa4a5,
110         0x000ae4a5, 0x000b24a5, 0x000b64a5, 0x000ba4a5,
111         0x000be4a5, 0x000c24a5, 0x000c64a5, 0x000ca4a5,
112         0x000ce4a5, 0x000d24a5, 0x000d64a5, 0x000da4a5,
113         0x000de4a5, 0x000e24a5, 0x000e64a5, 0x000ea4a5,
114         0x000ee4a5, 0x000f24a5, 0x000f64a5, 0x000fa4a5,
115         0x000fe4a5, 0x001024a5, 0x001064a5, 0x0010a4a5,
116         0x0010e4a5, 0x001124a5, 0x001164a5, 0x0011a4a5,
117         0x0011e4a5, 0x001224a5, 0x001264a5, 0x0012a4a5,
118         0x0012e4a5, 0x001324a5, 0x001364a5, 0x0013a4a5,
119         0x0013e4a5, 0x001424a5, 0x001464a5, 0x0014a4a5,
120         0x0014e4a5, 0x001524a5, 0x001564a5, 0x0015a4a5,
121         0x0015e4a5, 0x001624a5, 0x001664a5, 0x0016a4a5,
122         0x0016e4a5, 0x001724a5, 0x001764a5, 0x0017a4a5,
123         0x0017e4a5, 0x001824a5, 0x001864a5, 0x0018a4a5,
124         0x0018e4a5, 0x001924a5, 0x001964a5, 0x0019e4a5,
125         0x001a64a5, 0x001ae4a5, 0x001b64a5, 0x001be4a5,
126         0x001c64a5, 0x001ce4a5, 0x001d64a5, 0x001de4a5,
127         0x001e64a5, 0x001ee4a5, 0x001f64a5, 0x001fe4a5,
128         0x002064a5, 0x0020e4a5, 0x002164a5, 0x0021e4a5,
129         0x002264a5, 0x0022e4a5, 0x002364a5, 0x0023e4a5,
130         0x002464a5, 0x0024e4a5, 0x002564a5, 0x0025e4a5,
131         0x002664a5, 0x0026e4a5, 0x002764a5, 0x0027e4a5,
132         0x002864a5, 0x0028e4a5, 0x002964a5, 0x0029e4a5,
133         0x002a64a5, 0x002ae4a5, 0x002b64a5, 0x002be4a5,
134         0x002c64a5, 0x002ce4a5, 0x002d64a5, 0x002de4a5,
135         0x002e64a5, 0x002ee4a5, 0x002f64a5, 0x002fe4a5,
136         0x003064a5, 0x0030e4a5, 0x003164a5, 0x0031e4a5,
137         0x003264a5, 0x0032e4a5, 0x003364a5, 0x0033e4a5,
138         0x003464a5, 0x0034e4a5, 0x003564a5, 0x0035e4a5,
139         0x003664a5, 0x0036e4a5, 0x003764a5, 0x0037e4a5,
140         0x003864a5, 0x0038e4a5, 0x003964a5, 0x0039e4a5,
141         0x003a64a5, 0x003ae4a5, 0x003b64a5, 0x003be4a5,
142         0x003c64a5, 0x003ce4a5, 0x003d64a5, 0x003de4a5,
143         0x003ee4a5, 0x003fe4a5, 0x0040e4a5, 0x0041e4a5,
144         0x0042e4a5, 0x0043e4a5, 0x0044e4a5, 0x0045e4a5,
145         0x0046e4a5, 0x0047e4a5, 0x0048e4a5, 0x0049e4a5,
146         0x004ae4a5, 0x004be4a5, 0x004ce4a5, 0x004de4a5,
147         0x004ee4a5, 0x004fe4a5, 0x0050e4a5, 0x0051e4a5,
148         0x0052e4a5, 0x0053e4a5, 0x0054e4a5, 0x0055e4a5,
149         0x0056e4a5, 0x0057e4a5, 0x0058e4a5, 0x0059e4a5,
150         0x005ae4a5, 0x005be4a5, 0x005ce4a5, 0x005de4a5,
151         0x005ee4a5, 0x005fe4a5, 0x0060e4a5, 0x0061e4a5,
152         0x0062e4a5, 0x0063e4a5, 0x0064e4a5, 0x0065e4a5,
153         0x0066e4a5, 0x0067e4a5, 0x0068e4a5, 0x0069e4a5,
154         0x006ae4a5, 0x006be4a5, 0x006ce4a5, 0x006de4a5,
155         0x006ee4a5, 0x006fe4a5, 0x0070e4a5, 0x0071e4a5,
156         0x0072e4a5, 0x0073e4a5, 0x0074e4a5, 0x0075e4a5,
157         0x0076e4a5, 0x0077e4a5, 0x0078e4a5, 0x0079e4a5,
158         0x007ae4a5, 0x007be4a5, 0x007ce4a5, 0x007de4a5,
159         0x007ee4a5, 0x007fe4a5, 0x0080e4a5, 0x0081e4a5,
160         0x0082e4a5, 0x0083e4a5, 0x0084e4a5, 0x0085e4a5,
161         0x0086e4a5, 0x0087e4a5, 0x0088e4a5, 0x0089e4a5,
162         0x008ae4a5, 0x008be4a5, 0x008ce4a5, 0x008de4a5,
163         0x008fe4a5, 0x0091e4a5, 0x0093e4a5, 0x0095e4a5,
164         0x0097e4a5, 0x0099e4a5, 0x009be4a5, 0x009de4a5,
165         0x009fe4a5, 0x00a1e4a5, 0x00a3e4a5, 0x00a5e4a5,
166         0x00a7e4a5, 0x00a9e4a5, 0x00abe4a5, 0x00ade4a5,
167         0x00afe4a5, 0x00b1e4a5, 0x00b3e4a5, 0x00b5e4a5,
168         0x00b7e4a5, 0x00b9e4a5, 0x00bbe4a5, 0x00bde4a5,
169         0x00bfe4a5, 0x00c1e4a5, 0x00c3e4a5, 0x00c5e4a5,
170         0x00c7e4a5, 0x00c9e4a5, 0x00cbe4a5, 0x00cde4a5,
171         0x00cfe4a5, 0x00d1e4a5, 0x00d3e4a5, 0x00d5e4a5,
172         0x00d7e4a5, 0x00d9e4a5, 0x00dbe4a5, 0x00dde4a5,
173         0x00dfe4a5, 0x00e1e4a5, 0x00e3e4a5, 0x00e5e4a5,
174         0x00e7e4a5, 0x00e9e4a5, 0x00ebe4a5, 0x00ede4a5,
175         0x00efe4a5, 0x00f1e4a5, 0x00f3e4a5, 0x00f5e4a5,
176         0x00f7e4a5, 0x00f9e4a5, 0x00fbe4a5, 0x00fde4a5,
177         0x00ffe4a5, 0x0101e4a5, 0x0103e4a5, 0x0105e4a5,
178         0x0107e4a5, 0x0109e4a5, 0x010be4a5, 0x010de4a5,
179         0x010fe4a5, 0x0111e4a5, 0x0113e4a5, 0x0115e4a5,
180         0x0117e4a5, 0x0119e4a5, 0x011be4a5, 0x011de4a5,
181         0x011fe4a5, 0x0121e4a5, 0x0123e4a5, 0x0125e4a5,
182         0x0127e4a5, 0x0129e4a5, 0x012be4a5, 0x012de4a5,
183         0x012fe4a5, 0x0131e4a5, 0x0133e4a5, 0x0135e4a5,
184         0x0137e4a5, 0x013be4a5, 0x013fe4a5, 0x0143e4a5,
185         0x0147e4a5, 0x014be4a5, 0x014fe4a5, 0x0153e4a5,
186         0x0157e4a5, 0x015be4a5, 0x015fe4a5, 0x0163e4a5,
187         0x0167e4a5, 0x016be4a5, 0x016fe4a5, 0x0173e4a5,
188         0x0177e4a5, 0x017be4a5, 0x017fe4a5, 0x0183e4a5,
189         0x0187e4a5, 0x018be4a5, 0x018fe4a5, 0x0193e4a5,
190         0x0197e4a5, 0x019be4a5, 0x019fe4a5, 0x01a3e4a5,
191         0x01a7e4a5, 0x01abe4a5, 0x01afe4a5, 0x01b3e4a5,
192         0x01b7e4a5, 0x01bbe4a5, 0x01bfe4a5, 0x01c3e4a5,
193         0x01c7e4a5, 0x01cbe4a5, 0x01cfe4a5, 0x01d3e4a5,
194         0x01d7e4a5, 0x01dbe4a5, 0x01dfe4a5, 0x01e3e4a5,
195         0x01e7e4a5, 0x01ebe4a5, 0x01efe4a5, 0x01f3e4a5,
196         0x01f7e4a5, 0x01fbe4a5, 0x01ffe4a5, 0x0203e4a5,
197         0x0207e4a5, 0x020be4a5, 0x020fe4a5, 0x0213e4a5,
198         0x0217e4a5, 0x021be4a5, 0x021fe4a5, 0x0223e4a5,
199         0x0227e4a5, 0x022be4a5, 0x022fe4a5, 0x0233e4a5,
200         0x0237e4a5, 0x023be4a5, 0x023fe4a5, 0x0243e4a5,
201         0x0247e4a5, 0x024be4a5, 0x024fe4a5, 0x0253e4a5,
202         0x0257e4a5, 0x025be4a5, 0x025fe4a5, 0x0263e4a5,
203         0x0267e4a5, 0x026be4a5, 0x026fe4a5, 0x0273e4a5,
204         0x0277e4a5, 0x027be4a5, 0x027fe4a5, 0x0283e4a5,
205         0x0287e4a5, 0x028be4a5, 0x028fe4a5, 0x0293e4a5,
206         0x0297e4a5, 0x029be4a5, 0x029fe4a5, 0x02a3e4a5,
207         0x02a7e4a5, 0x02abe4a5, 0x02afe4a5, 0x02b3e4a5,
208         0x02bbe4a5, 0x02c3e4a5, 0x02cbe4a5, 0x02d3e4a5,
209         0x02dbe4a5, 0x02e3e4a5, 0x02ebe4a5, 0x02f3e4a5,
210         0x02fbe4a5, 0x0303e4a5, 0x030be4a5, 0x0313e4a5,
211         0x031be4a5, 0x0323e4a5, 0x032be4a5, 0x0333e4a5,
212         0x033be4a5, 0x0343e4a5, 0x034be4a5, 0x0353e4a5,
213         0x035be4a5, 0x0363e4a5, 0x036be4a5, 0x0373e4a5,
214         0x037be4a5, 0x0383e4a5, 0x038be4a5, 0x0393e4a5,
215         0x039be4a5, 0x03a3e4a5, 0x03abe4a5, 0x03b3e4a5,
216         0x03bbe4a5, 0x03c3e4a5, 0x03cbe4a5, 0x03d3e4a5,
217         0x03dbe4a5, 0x03e3e4a5, 0x03ebe4a5, 0x03f3e4a5,
218         0x03fbe4a5, 0x0403e4a5, 0x040be4a5, 0x0413e4a5,
219         0x041be4a5, 0x0423e4a5, 0x042be4a5, 0x0433e4a5,
220         0x043be4a5, 0x0443e4a5, 0x044be4a5, 0x0453e4a5,
221         0x045be4a5, 0x0463e4a5, 0x046be4a5, 0x0473e4a5,
222         0x047be4a5, 0x0483e4a5, 0x048be4a5, 0x0493e4a5,
223         0x049be4a5, 0x04a3e4a5, 0x04abe4a5, 0x04b3e4a5,
224         0x04bbe4a5, 0x04c3e4a5, 0x04cbe4a5, 0x04d3e4a5,
225         0x04dbe4a5, 0x04e3e4a5, 0x04ebe4a5, 0x04f3e4a5,
226         0x04fbe4a5, 0x0503e4a5, 0x050be4a5, 0x0513e4a5,
227         0x051be4a5, 0x0523e4a5, 0x052be4a5, 0x0533e4a5,
228         0x053be4a5, 0x0543e4a5, 0x054be4a5, 0x0553e4a5,
229         0x055be4a5, 0x0563e4a5, 0x056be4a5, 0x0573e4a5,
230         0x057be4a5, 0x0583e4a5, 0x058be4a5, 0x0593e4a5,
231         0x059be4a5, 0x05a3e4a5, 0x05abe4a5, 0x05b3e4a5,
232         0x05bbe4a5, 0x05c3e4a5, 0x05cbe4a5, 0x05d3e4a5,
233         0x05dbe4a5, 0x05e3e4a5, 0x05ebe4a5, 0x05f3e4a5,
234         0x05fbe4a5, 0x060be4a5, 0x061be4a5, 0x062be4a5,
235         0x063be4a5, 0x064be4a5, 0x065be4a5, 0x465be4a5,
236         /* The last entry is extra; it is equal to LZMS_MAX_MATCH_OFFSET + 1 and
237          * is here to aid binary search.  */
238 };
239
240 /* Table: offset slot => number of extra offset bits  */
241 const u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS] = {
242         0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2,
243         2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4,
244         4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5,
245         5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6,
246         7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7,
247         7 , 7 , 7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8,
248         8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9,
249         9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9,
250         9 , 9 , 9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
251         10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
252         10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
253         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
254         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
255         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
256         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
257         12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13,
258         13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
259         13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
260         13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
261         14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
262         14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
263         14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
264         14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
265         15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
266         15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
267         15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
268         15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
269         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
270         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
271         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
272         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
273         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17,
274         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
275         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
276         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
277         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
278         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
279         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
280         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
281         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
282         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
283         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
284         18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
285         19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
286         19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
287         19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
288         19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
289         19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
290         19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
291         19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 30,
292 };
293
294 /* Table: length slot => length slot base value  */
295 const u32 lzms_length_slot_base[LZMS_NUM_LENGTH_SYMS + 1] = {
296         0x00000001, 0x00000002, 0x00000003, 0x00000004,
297         0x00000005, 0x00000006, 0x00000007, 0x00000008,
298         0x00000009, 0x0000000a, 0x0000000b, 0x0000000c,
299         0x0000000d, 0x0000000e, 0x0000000f, 0x00000010,
300         0x00000011, 0x00000012, 0x00000013, 0x00000014,
301         0x00000015, 0x00000016, 0x00000017, 0x00000018,
302         0x00000019, 0x0000001a, 0x0000001b, 0x0000001d,
303         0x0000001f, 0x00000021, 0x00000023, 0x00000027,
304         0x0000002b, 0x0000002f, 0x00000033, 0x00000037,
305         0x0000003b, 0x00000043, 0x0000004b, 0x00000053,
306         0x0000005b, 0x0000006b, 0x0000007b, 0x0000008b,
307         0x0000009b, 0x000000ab, 0x000000cb, 0x000000eb,
308         0x0000012b, 0x000001ab, 0x000002ab, 0x000004ab,
309         0x000008ab, 0x000108ab, 0x400108ab,
310         /* The last entry is extra; it is equal to LZMS_MAX_MATCH_LENGTH + 1 and
311          * is here to aid binary search.  */
312 };
313
314 /* Table: length slot => number of extra length bits  */
315 const u8 lzms_extra_length_bits[LZMS_NUM_LENGTH_SYMS] = {
316         0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
317         0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
318         0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
319         0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 ,
320         2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ,
321         4 , 4 , 4 , 4 , 4 , 5 , 5 , 6 ,
322         7 , 8 , 9 , 10, 16, 30,
323 };
324
325 unsigned
326 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
327 {
328         unsigned l = 0;
329         unsigned r = num_slots - 1;
330         for (;;) {
331                 unsigned slot = (l + r) / 2;
332                 if (value >= slot_base_tab[slot]) {
333                         if (value < slot_base_tab[slot + 1])
334                                 return slot;
335                         else
336                                 l = slot + 1;
337                 } else {
338                         r = slot - 1;
339                 }
340         }
341 }
342
343 /* Return the number of offset slots used when processing a buffer having the
344  * specified uncompressed size.  */
345 unsigned
346 lzms_get_num_offset_slots(size_t uncompressed_size)
347 {
348         if (uncompressed_size < 2)
349                 return 0;
350         return 1 + lzms_get_offset_slot(uncompressed_size - 1);
351 }
352
353 void
354 lzms_init_probabilities(struct lzms_probabilites *probs)
355 {
356         struct lzms_probability_entry *entries =
357                 (struct lzms_probability_entry *)probs;
358         size_t num_entries = sizeof(struct lzms_probabilites) /
359                              sizeof(struct lzms_probability_entry);
360         for (size_t i = 0; i < num_entries; i++) {
361                 entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
362                 entries[i].recent_bits = LZMS_INITIAL_RECENT_BITS;
363         }
364 }
365
366 void
367 lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms)
368 {
369         for (unsigned sym = 0; sym < num_syms; sym++)
370                 freqs[sym] = 1;
371 }
372
373 void
374 lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms)
375 {
376         for (unsigned sym = 0; sym < num_syms; sym++)
377                 freqs[sym] = (freqs[sym] >> 1) + 1;
378 }
379
380
381 #ifdef __x86_64__
382 static forceinline u8 *
383 find_next_opcode_sse4_2(u8 *p)
384 {
385         const __v16qi potential_opcodes = (__v16qi) {0x48, 0x4C, 0xE8, 0xE9, 0xF0, 0xFF};
386         __asm__(
387                 "  pcmpestri $0x0, (%[p]), %[potential_opcodes]      \n"
388                 "  jc 2f                                             \n"
389                 "1:                                                  \n"
390                 "  add $0x10, %[p]                                   \n"
391                 "  pcmpestri $0x0, (%[p]), %[potential_opcodes]      \n"
392                 "  jnc 1b                                            \n"
393                 "2:                                                  \n"
394         #ifdef __ILP32__ /* x32 ABI (x86_64 with 32-bit pointers) */
395                 "  add %%ecx, %[p]                                   \n"
396         #else
397                 "  add %%rcx, %[p]                                   \n"
398         #endif
399                 : [p] "+r" (p)
400                 : [potential_opcodes] "x" (potential_opcodes), "a" (6), "d" (16)
401                 : "rcx", "cc"
402                );
403
404         return p;
405 }
406 #endif /* __x86_64__ */
407
408 static forceinline u8 *
409 find_next_opcode_default(u8 *p)
410 {
411         /*
412          * The following table is used to accelerate the common case where the
413          * byte has nothing to do with x86 translation and must simply be
414          * skipped.  This was faster than the following alternatives:
415          *      - Jump table with 256 entries
416          *      - Switch statement with default
417          */
418         static const u8 is_potential_opcode[256] = {
419                 [0x48] = 1, [0x4C] = 1, [0xE8] = 1,
420                 [0xE9] = 1, [0xF0] = 1, [0xFF] = 1,
421         };
422
423         for (;;) {
424                 if (is_potential_opcode[*p])
425                         break;
426                 p++;
427                 if (is_potential_opcode[*p])
428                         break;
429                 p++;
430                 if (is_potential_opcode[*p])
431                         break;
432                 p++;
433                 if (is_potential_opcode[*p])
434                         break;
435                 p++;
436         }
437         return p;
438 }
439
440 static forceinline u8 *
441 translate_if_needed(u8 *data, u8 *p, s32 *last_x86_pos,
442                     s32 last_target_usages[], bool undo)
443 {
444         s32 max_trans_offset;
445         s32 opcode_nbytes;
446         u16 target16;
447         s32 i;
448
449         max_trans_offset = LZMS_X86_MAX_TRANSLATION_OFFSET;
450
451         /*
452          * p[0] has one of the following values:
453          *      0x48 0x4C 0xE8 0xE9 0xF0 0xFF
454          */
455
456         if (p[0] >= 0xF0) {
457                 if (p[0] & 0x0F) {
458                         /* 0xFF (instruction group)  */
459                         if (p[1] == 0x15) {
460                                 /* Call indirect relative  */
461                                 opcode_nbytes = 2;
462                                 goto have_opcode;
463                         }
464                 } else {
465                         /* 0xF0 (lock prefix)  */
466                         if (p[1] == 0x83 && p[2] == 0x05) {
467                                 /* Lock add relative  */
468                                 opcode_nbytes = 3;
469                                 goto have_opcode;
470                         }
471                 }
472         } else if (p[0] <= 0x4C) {
473
474                 /* 0x48 or 0x4C.  In 64-bit code this is a REX prefix byte with
475                  * W=1, R=[01], X=0, and B=0, and it will be followed by the
476                  * actual opcode, then additional bytes depending on the opcode.
477                  * We are most interested in several common instructions that
478                  * access data relative to the instruction pointer.  These use a
479                  * 1-byte opcode, followed by a ModR/M byte, followed by a
480                  * 4-byte displacement.  */
481
482                 /* Test: does the ModR/M byte indicate RIP-relative addressing?
483                  * Note: there seems to be a mistake in the format here; the
484                  * mask really should be 0xC7 instead of 0x07 so that both the
485                  * MOD and R/M fields of ModR/M are tested, not just R/M.  */
486                 if ((p[2] & 0x07) == 0x05) {
487                         /* Check for the LEA (load effective address) or MOV
488                          * (move) opcodes.  For MOV there are additional
489                          * restrictions, although it seems they are only helpful
490                          * due to the overly lax ModR/M test.  */
491                         if (p[1] == 0x8D ||
492                             (p[1] == 0x8B && !(p[0] & 0x04) && !(p[2] & 0xF0)))
493                         {
494                                 opcode_nbytes = 3;
495                                 goto have_opcode;
496                         }
497                 }
498         } else {
499                 if (p[0] & 0x01) {
500                         /* 0xE9: Jump relative.  Theoretically this would be
501                          * useful to translate, but in fact it's explicitly
502                          * excluded.  Most likely it creates too many false
503                          * positives for the detection algorithm.  */
504                         p += 4;
505                 } else {
506                         /* 0xE8: Call relative.  This is a common case, so it
507                          * uses a reduced max_trans_offset.  In other words, we
508                          * have to be more confident that the data actually is
509                          * x86 machine code before we'll do the translation.  */
510                         opcode_nbytes = 1;
511                         max_trans_offset >>= 1;
512                         goto have_opcode;
513                 }
514         }
515
516         return p + 1;
517
518 have_opcode:
519         i = p - data;
520         p += opcode_nbytes;
521         if (undo) {
522                 if (i - *last_x86_pos <= max_trans_offset) {
523                         u32 n = get_unaligned_le32(p);
524                         put_unaligned_le32(n - i, p);
525                 }
526                 target16 = i + get_unaligned_le16(p);
527         } else {
528                 target16 = i + get_unaligned_le16(p);
529                 if (i - *last_x86_pos <= max_trans_offset) {
530                         u32 n = get_unaligned_le32(p);
531                         put_unaligned_le32(n + i, p);
532                 }
533         }
534
535         i += opcode_nbytes + sizeof(le32) - 1;
536
537         if (i - last_target_usages[target16] <= LZMS_X86_ID_WINDOW_SIZE)
538                 *last_x86_pos = i;
539
540         last_target_usages[target16] = i;
541
542         return p + sizeof(le32);
543 }
544
545 /*
546  * Translate relative addresses embedded in x86 instructions into absolute
547  * addresses (@undo == %false), or undo this translation (@undo == %true).
548  *
549  * Absolute addresses are usually more compressible by LZ factorization.
550  *
551  * @last_target_usages must be a temporary array of length >= 65536.
552  */
553 void
554 lzms_x86_filter(u8 data[restrict], s32 size,
555                 s32 last_target_usages[restrict], bool undo)
556 {
557         /*
558          * Note: this filter runs unconditionally and uses a custom algorithm to
559          * detect data regions that probably contain x86 code.
560          *
561          * 'last_x86_pos' tracks the most recent position that has a good chance
562          * of being the start of an x86 instruction.  When the filter detects a
563          * likely x86 instruction, it updates this variable and considers the
564          * next LZMS_X86_MAX_TRANSLATION_OFFSET bytes of data as valid for x86
565          * translations.
566          *
567          * If part of the data does not, in fact, contain x86 machine code, then
568          * 'last_x86_pos' will, very likely, eventually fall more than
569          * LZMS_X86_MAX_TRANSLATION_OFFSET bytes behind the current position.
570          * This results in x86 translations being disabled until the next likely
571          * x86 instruction is detected.
572          *
573          * To identify "likely x86 instructions", the algorithm attempts to
574          * track the position of the most recent potential relative-addressing
575          * instruction that referenced each possible memory address.  If it
576          * finds two references to the same memory address within an
577          * LZMS_X86_ID_WINDOW_SIZE-byte sized window, then the second reference
578          * is flagged as a likely x86 instruction.  Since the instructions
579          * considered for translation necessarily use relative addressing, the
580          * algorithm does a tentative translation into absolute addresses.  In
581          * addition, so that memory addresses can be looked up in an array of
582          * reasonable size (in this code, 'last_target_usages'), only the
583          * low-order 2 bytes of each address are considered significant.
584          */
585
586         u8 *p;
587         u8 *tail_ptr;
588         s32 last_x86_pos = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
589
590         if (size <= 17)
591                 return;
592
593         for (s32 i = 0; i < 65536; i++)
594                 last_target_usages[i] = -(s32)LZMS_X86_ID_WINDOW_SIZE - 1;
595
596         /*
597          * Optimization: only check for end-of-buffer when we already have a
598          * byte that is a potential opcode for x86 translation.  To do this,
599          * overwrite one of the bytes near the end of the buffer, and restore it
600          * later.  The correctness of this optimization relies on two
601          * characteristics of compressed format:
602          *
603          *  1. No translation can follow an opcode beginning in the last 16
604          *     bytes.
605          *  2. A translation following an opcode starting at the last possible
606          *     position (17 bytes from the end) never extends more than 7 bytes.
607          *     Consequently, we can overwrite any of the bytes starting at
608          *     data[(size - 16) + 7] and have no effect on the result, as long
609          *     as we restore those bytes later.
610          */
611
612         /* Note: the very first byte must be ignored completely!  */
613         p = data + 1;
614         tail_ptr = &data[size - 16];
615
616 #ifdef __x86_64__
617         if (x86_have_cpu_feature(X86_CPU_FEATURE_SSE4_2)) {
618                 u8 saved_byte = *tail_ptr;
619                 *tail_ptr = 0xE8;
620                 for (;;) {
621                         u8 *new_p = find_next_opcode_sse4_2(p);
622                         if (new_p >= tail_ptr - 8)
623                                 break;
624                         p = new_p;
625                         p = translate_if_needed(data, p, &last_x86_pos,
626                                                 last_target_usages, undo);
627                 }
628                 *tail_ptr = saved_byte;
629         }
630 #endif
631         {
632                 u8 saved_byte = *(tail_ptr + 8);
633                 *(tail_ptr + 8) = 0xE8;
634                 for (;;) {
635                         p = find_next_opcode_default(p);
636                         if (p >= tail_ptr)
637                                 break;
638                         p = translate_if_needed(data, p, &last_x86_pos,
639                                                 last_target_usages, undo);
640                 }
641                 *(tail_ptr + 8) = saved_byte;
642         }
643 }