2 * lzms_common.c - Common code for LZMS compression and decompression
6 * Copyright (C) 2013-2016 Eric Biggers
8 * This file is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU Lesser General Public License as published by the Free
10 * Software Foundation; either version 3 of the License, or (at your option) any
13 * This file is distributed in the hope that it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this file; if not, see https://www.gnu.org/licenses/.
26 #include "wimlib/cpu_features.h"
27 #include "wimlib/lzms_common.h"
28 #include "wimlib/unaligned.h"
31 # include <emmintrin.h>
34 /* Table: offset slot => offset slot base value */
35 const u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1] = {
36 0x00000001, 0x00000002, 0x00000003, 0x00000004,
37 0x00000005, 0x00000006, 0x00000007, 0x00000008,
38 0x00000009, 0x0000000d, 0x00000011, 0x00000015,
39 0x00000019, 0x0000001d, 0x00000021, 0x00000025,
40 0x00000029, 0x0000002d, 0x00000035, 0x0000003d,
41 0x00000045, 0x0000004d, 0x00000055, 0x0000005d,
42 0x00000065, 0x00000075, 0x00000085, 0x00000095,
43 0x000000a5, 0x000000b5, 0x000000c5, 0x000000d5,
44 0x000000e5, 0x000000f5, 0x00000105, 0x00000125,
45 0x00000145, 0x00000165, 0x00000185, 0x000001a5,
46 0x000001c5, 0x000001e5, 0x00000205, 0x00000225,
47 0x00000245, 0x00000265, 0x00000285, 0x000002a5,
48 0x000002c5, 0x000002e5, 0x00000325, 0x00000365,
49 0x000003a5, 0x000003e5, 0x00000425, 0x00000465,
50 0x000004a5, 0x000004e5, 0x00000525, 0x00000565,
51 0x000005a5, 0x000005e5, 0x00000625, 0x00000665,
52 0x000006a5, 0x00000725, 0x000007a5, 0x00000825,
53 0x000008a5, 0x00000925, 0x000009a5, 0x00000a25,
54 0x00000aa5, 0x00000b25, 0x00000ba5, 0x00000c25,
55 0x00000ca5, 0x00000d25, 0x00000da5, 0x00000e25,
56 0x00000ea5, 0x00000f25, 0x00000fa5, 0x00001025,
57 0x000010a5, 0x000011a5, 0x000012a5, 0x000013a5,
58 0x000014a5, 0x000015a5, 0x000016a5, 0x000017a5,
59 0x000018a5, 0x000019a5, 0x00001aa5, 0x00001ba5,
60 0x00001ca5, 0x00001da5, 0x00001ea5, 0x00001fa5,
61 0x000020a5, 0x000021a5, 0x000022a5, 0x000023a5,
62 0x000024a5, 0x000026a5, 0x000028a5, 0x00002aa5,
63 0x00002ca5, 0x00002ea5, 0x000030a5, 0x000032a5,
64 0x000034a5, 0x000036a5, 0x000038a5, 0x00003aa5,
65 0x00003ca5, 0x00003ea5, 0x000040a5, 0x000042a5,
66 0x000044a5, 0x000046a5, 0x000048a5, 0x00004aa5,
67 0x00004ca5, 0x00004ea5, 0x000050a5, 0x000052a5,
68 0x000054a5, 0x000056a5, 0x000058a5, 0x00005aa5,
69 0x00005ca5, 0x00005ea5, 0x000060a5, 0x000064a5,
70 0x000068a5, 0x00006ca5, 0x000070a5, 0x000074a5,
71 0x000078a5, 0x00007ca5, 0x000080a5, 0x000084a5,
72 0x000088a5, 0x00008ca5, 0x000090a5, 0x000094a5,
73 0x000098a5, 0x00009ca5, 0x0000a0a5, 0x0000a4a5,
74 0x0000a8a5, 0x0000aca5, 0x0000b0a5, 0x0000b4a5,
75 0x0000b8a5, 0x0000bca5, 0x0000c0a5, 0x0000c4a5,
76 0x0000c8a5, 0x0000cca5, 0x0000d0a5, 0x0000d4a5,
77 0x0000d8a5, 0x0000dca5, 0x0000e0a5, 0x0000e4a5,
78 0x0000eca5, 0x0000f4a5, 0x0000fca5, 0x000104a5,
79 0x00010ca5, 0x000114a5, 0x00011ca5, 0x000124a5,
80 0x00012ca5, 0x000134a5, 0x00013ca5, 0x000144a5,
81 0x00014ca5, 0x000154a5, 0x00015ca5, 0x000164a5,
82 0x00016ca5, 0x000174a5, 0x00017ca5, 0x000184a5,
83 0x00018ca5, 0x000194a5, 0x00019ca5, 0x0001a4a5,
84 0x0001aca5, 0x0001b4a5, 0x0001bca5, 0x0001c4a5,
85 0x0001cca5, 0x0001d4a5, 0x0001dca5, 0x0001e4a5,
86 0x0001eca5, 0x0001f4a5, 0x0001fca5, 0x000204a5,
87 0x00020ca5, 0x000214a5, 0x00021ca5, 0x000224a5,
88 0x000234a5, 0x000244a5, 0x000254a5, 0x000264a5,
89 0x000274a5, 0x000284a5, 0x000294a5, 0x0002a4a5,
90 0x0002b4a5, 0x0002c4a5, 0x0002d4a5, 0x0002e4a5,
91 0x0002f4a5, 0x000304a5, 0x000314a5, 0x000324a5,
92 0x000334a5, 0x000344a5, 0x000354a5, 0x000364a5,
93 0x000374a5, 0x000384a5, 0x000394a5, 0x0003a4a5,
94 0x0003b4a5, 0x0003c4a5, 0x0003d4a5, 0x0003e4a5,
95 0x0003f4a5, 0x000404a5, 0x000414a5, 0x000424a5,
96 0x000434a5, 0x000444a5, 0x000454a5, 0x000464a5,
97 0x000474a5, 0x000484a5, 0x000494a5, 0x0004a4a5,
98 0x0004b4a5, 0x0004c4a5, 0x0004e4a5, 0x000504a5,
99 0x000524a5, 0x000544a5, 0x000564a5, 0x000584a5,
100 0x0005a4a5, 0x0005c4a5, 0x0005e4a5, 0x000604a5,
101 0x000624a5, 0x000644a5, 0x000664a5, 0x000684a5,
102 0x0006a4a5, 0x0006c4a5, 0x0006e4a5, 0x000704a5,
103 0x000724a5, 0x000744a5, 0x000764a5, 0x000784a5,
104 0x0007a4a5, 0x0007c4a5, 0x0007e4a5, 0x000804a5,
105 0x000824a5, 0x000844a5, 0x000864a5, 0x000884a5,
106 0x0008a4a5, 0x0008c4a5, 0x0008e4a5, 0x000904a5,
107 0x000924a5, 0x000944a5, 0x000964a5, 0x000984a5,
108 0x0009a4a5, 0x0009c4a5, 0x0009e4a5, 0x000a04a5,
109 0x000a24a5, 0x000a44a5, 0x000a64a5, 0x000aa4a5,
110 0x000ae4a5, 0x000b24a5, 0x000b64a5, 0x000ba4a5,
111 0x000be4a5, 0x000c24a5, 0x000c64a5, 0x000ca4a5,
112 0x000ce4a5, 0x000d24a5, 0x000d64a5, 0x000da4a5,
113 0x000de4a5, 0x000e24a5, 0x000e64a5, 0x000ea4a5,
114 0x000ee4a5, 0x000f24a5, 0x000f64a5, 0x000fa4a5,
115 0x000fe4a5, 0x001024a5, 0x001064a5, 0x0010a4a5,
116 0x0010e4a5, 0x001124a5, 0x001164a5, 0x0011a4a5,
117 0x0011e4a5, 0x001224a5, 0x001264a5, 0x0012a4a5,
118 0x0012e4a5, 0x001324a5, 0x001364a5, 0x0013a4a5,
119 0x0013e4a5, 0x001424a5, 0x001464a5, 0x0014a4a5,
120 0x0014e4a5, 0x001524a5, 0x001564a5, 0x0015a4a5,
121 0x0015e4a5, 0x001624a5, 0x001664a5, 0x0016a4a5,
122 0x0016e4a5, 0x001724a5, 0x001764a5, 0x0017a4a5,
123 0x0017e4a5, 0x001824a5, 0x001864a5, 0x0018a4a5,
124 0x0018e4a5, 0x001924a5, 0x001964a5, 0x0019e4a5,
125 0x001a64a5, 0x001ae4a5, 0x001b64a5, 0x001be4a5,
126 0x001c64a5, 0x001ce4a5, 0x001d64a5, 0x001de4a5,
127 0x001e64a5, 0x001ee4a5, 0x001f64a5, 0x001fe4a5,
128 0x002064a5, 0x0020e4a5, 0x002164a5, 0x0021e4a5,
129 0x002264a5, 0x0022e4a5, 0x002364a5, 0x0023e4a5,
130 0x002464a5, 0x0024e4a5, 0x002564a5, 0x0025e4a5,
131 0x002664a5, 0x0026e4a5, 0x002764a5, 0x0027e4a5,
132 0x002864a5, 0x0028e4a5, 0x002964a5, 0x0029e4a5,
133 0x002a64a5, 0x002ae4a5, 0x002b64a5, 0x002be4a5,
134 0x002c64a5, 0x002ce4a5, 0x002d64a5, 0x002de4a5,
135 0x002e64a5, 0x002ee4a5, 0x002f64a5, 0x002fe4a5,
136 0x003064a5, 0x0030e4a5, 0x003164a5, 0x0031e4a5,
137 0x003264a5, 0x0032e4a5, 0x003364a5, 0x0033e4a5,
138 0x003464a5, 0x0034e4a5, 0x003564a5, 0x0035e4a5,
139 0x003664a5, 0x0036e4a5, 0x003764a5, 0x0037e4a5,
140 0x003864a5, 0x0038e4a5, 0x003964a5, 0x0039e4a5,
141 0x003a64a5, 0x003ae4a5, 0x003b64a5, 0x003be4a5,
142 0x003c64a5, 0x003ce4a5, 0x003d64a5, 0x003de4a5,
143 0x003ee4a5, 0x003fe4a5, 0x0040e4a5, 0x0041e4a5,
144 0x0042e4a5, 0x0043e4a5, 0x0044e4a5, 0x0045e4a5,
145 0x0046e4a5, 0x0047e4a5, 0x0048e4a5, 0x0049e4a5,
146 0x004ae4a5, 0x004be4a5, 0x004ce4a5, 0x004de4a5,
147 0x004ee4a5, 0x004fe4a5, 0x0050e4a5, 0x0051e4a5,
148 0x0052e4a5, 0x0053e4a5, 0x0054e4a5, 0x0055e4a5,
149 0x0056e4a5, 0x0057e4a5, 0x0058e4a5, 0x0059e4a5,
150 0x005ae4a5, 0x005be4a5, 0x005ce4a5, 0x005de4a5,
151 0x005ee4a5, 0x005fe4a5, 0x0060e4a5, 0x0061e4a5,
152 0x0062e4a5, 0x0063e4a5, 0x0064e4a5, 0x0065e4a5,
153 0x0066e4a5, 0x0067e4a5, 0x0068e4a5, 0x0069e4a5,
154 0x006ae4a5, 0x006be4a5, 0x006ce4a5, 0x006de4a5,
155 0x006ee4a5, 0x006fe4a5, 0x0070e4a5, 0x0071e4a5,
156 0x0072e4a5, 0x0073e4a5, 0x0074e4a5, 0x0075e4a5,
157 0x0076e4a5, 0x0077e4a5, 0x0078e4a5, 0x0079e4a5,
158 0x007ae4a5, 0x007be4a5, 0x007ce4a5, 0x007de4a5,
159 0x007ee4a5, 0x007fe4a5, 0x0080e4a5, 0x0081e4a5,
160 0x0082e4a5, 0x0083e4a5, 0x0084e4a5, 0x0085e4a5,
161 0x0086e4a5, 0x0087e4a5, 0x0088e4a5, 0x0089e4a5,
162 0x008ae4a5, 0x008be4a5, 0x008ce4a5, 0x008de4a5,
163 0x008fe4a5, 0x0091e4a5, 0x0093e4a5, 0x0095e4a5,
164 0x0097e4a5, 0x0099e4a5, 0x009be4a5, 0x009de4a5,
165 0x009fe4a5, 0x00a1e4a5, 0x00a3e4a5, 0x00a5e4a5,
166 0x00a7e4a5, 0x00a9e4a5, 0x00abe4a5, 0x00ade4a5,
167 0x00afe4a5, 0x00b1e4a5, 0x00b3e4a5, 0x00b5e4a5,
168 0x00b7e4a5, 0x00b9e4a5, 0x00bbe4a5, 0x00bde4a5,
169 0x00bfe4a5, 0x00c1e4a5, 0x00c3e4a5, 0x00c5e4a5,
170 0x00c7e4a5, 0x00c9e4a5, 0x00cbe4a5, 0x00cde4a5,
171 0x00cfe4a5, 0x00d1e4a5, 0x00d3e4a5, 0x00d5e4a5,
172 0x00d7e4a5, 0x00d9e4a5, 0x00dbe4a5, 0x00dde4a5,
173 0x00dfe4a5, 0x00e1e4a5, 0x00e3e4a5, 0x00e5e4a5,
174 0x00e7e4a5, 0x00e9e4a5, 0x00ebe4a5, 0x00ede4a5,
175 0x00efe4a5, 0x00f1e4a5, 0x00f3e4a5, 0x00f5e4a5,
176 0x00f7e4a5, 0x00f9e4a5, 0x00fbe4a5, 0x00fde4a5,
177 0x00ffe4a5, 0x0101e4a5, 0x0103e4a5, 0x0105e4a5,
178 0x0107e4a5, 0x0109e4a5, 0x010be4a5, 0x010de4a5,
179 0x010fe4a5, 0x0111e4a5, 0x0113e4a5, 0x0115e4a5,
180 0x0117e4a5, 0x0119e4a5, 0x011be4a5, 0x011de4a5,
181 0x011fe4a5, 0x0121e4a5, 0x0123e4a5, 0x0125e4a5,
182 0x0127e4a5, 0x0129e4a5, 0x012be4a5, 0x012de4a5,
183 0x012fe4a5, 0x0131e4a5, 0x0133e4a5, 0x0135e4a5,
184 0x0137e4a5, 0x013be4a5, 0x013fe4a5, 0x0143e4a5,
185 0x0147e4a5, 0x014be4a5, 0x014fe4a5, 0x0153e4a5,
186 0x0157e4a5, 0x015be4a5, 0x015fe4a5, 0x0163e4a5,
187 0x0167e4a5, 0x016be4a5, 0x016fe4a5, 0x0173e4a5,
188 0x0177e4a5, 0x017be4a5, 0x017fe4a5, 0x0183e4a5,
189 0x0187e4a5, 0x018be4a5, 0x018fe4a5, 0x0193e4a5,
190 0x0197e4a5, 0x019be4a5, 0x019fe4a5, 0x01a3e4a5,
191 0x01a7e4a5, 0x01abe4a5, 0x01afe4a5, 0x01b3e4a5,
192 0x01b7e4a5, 0x01bbe4a5, 0x01bfe4a5, 0x01c3e4a5,
193 0x01c7e4a5, 0x01cbe4a5, 0x01cfe4a5, 0x01d3e4a5,
194 0x01d7e4a5, 0x01dbe4a5, 0x01dfe4a5, 0x01e3e4a5,
195 0x01e7e4a5, 0x01ebe4a5, 0x01efe4a5, 0x01f3e4a5,
196 0x01f7e4a5, 0x01fbe4a5, 0x01ffe4a5, 0x0203e4a5,
197 0x0207e4a5, 0x020be4a5, 0x020fe4a5, 0x0213e4a5,
198 0x0217e4a5, 0x021be4a5, 0x021fe4a5, 0x0223e4a5,
199 0x0227e4a5, 0x022be4a5, 0x022fe4a5, 0x0233e4a5,
200 0x0237e4a5, 0x023be4a5, 0x023fe4a5, 0x0243e4a5,
201 0x0247e4a5, 0x024be4a5, 0x024fe4a5, 0x0253e4a5,
202 0x0257e4a5, 0x025be4a5, 0x025fe4a5, 0x0263e4a5,
203 0x0267e4a5, 0x026be4a5, 0x026fe4a5, 0x0273e4a5,
204 0x0277e4a5, 0x027be4a5, 0x027fe4a5, 0x0283e4a5,
205 0x0287e4a5, 0x028be4a5, 0x028fe4a5, 0x0293e4a5,
206 0x0297e4a5, 0x029be4a5, 0x029fe4a5, 0x02a3e4a5,
207 0x02a7e4a5, 0x02abe4a5, 0x02afe4a5, 0x02b3e4a5,
208 0x02bbe4a5, 0x02c3e4a5, 0x02cbe4a5, 0x02d3e4a5,
209 0x02dbe4a5, 0x02e3e4a5, 0x02ebe4a5, 0x02f3e4a5,
210 0x02fbe4a5, 0x0303e4a5, 0x030be4a5, 0x0313e4a5,
211 0x031be4a5, 0x0323e4a5, 0x032be4a5, 0x0333e4a5,
212 0x033be4a5, 0x0343e4a5, 0x034be4a5, 0x0353e4a5,
213 0x035be4a5, 0x0363e4a5, 0x036be4a5, 0x0373e4a5,
214 0x037be4a5, 0x0383e4a5, 0x038be4a5, 0x0393e4a5,
215 0x039be4a5, 0x03a3e4a5, 0x03abe4a5, 0x03b3e4a5,
216 0x03bbe4a5, 0x03c3e4a5, 0x03cbe4a5, 0x03d3e4a5,
217 0x03dbe4a5, 0x03e3e4a5, 0x03ebe4a5, 0x03f3e4a5,
218 0x03fbe4a5, 0x0403e4a5, 0x040be4a5, 0x0413e4a5,
219 0x041be4a5, 0x0423e4a5, 0x042be4a5, 0x0433e4a5,
220 0x043be4a5, 0x0443e4a5, 0x044be4a5, 0x0453e4a5,
221 0x045be4a5, 0x0463e4a5, 0x046be4a5, 0x0473e4a5,
222 0x047be4a5, 0x0483e4a5, 0x048be4a5, 0x0493e4a5,
223 0x049be4a5, 0x04a3e4a5, 0x04abe4a5, 0x04b3e4a5,
224 0x04bbe4a5, 0x04c3e4a5, 0x04cbe4a5, 0x04d3e4a5,
225 0x04dbe4a5, 0x04e3e4a5, 0x04ebe4a5, 0x04f3e4a5,
226 0x04fbe4a5, 0x0503e4a5, 0x050be4a5, 0x0513e4a5,
227 0x051be4a5, 0x0523e4a5, 0x052be4a5, 0x0533e4a5,
228 0x053be4a5, 0x0543e4a5, 0x054be4a5, 0x0553e4a5,
229 0x055be4a5, 0x0563e4a5, 0x056be4a5, 0x0573e4a5,
230 0x057be4a5, 0x0583e4a5, 0x058be4a5, 0x0593e4a5,
231 0x059be4a5, 0x05a3e4a5, 0x05abe4a5, 0x05b3e4a5,
232 0x05bbe4a5, 0x05c3e4a5, 0x05cbe4a5, 0x05d3e4a5,
233 0x05dbe4a5, 0x05e3e4a5, 0x05ebe4a5, 0x05f3e4a5,
234 0x05fbe4a5, 0x060be4a5, 0x061be4a5, 0x062be4a5,
235 0x063be4a5, 0x064be4a5, 0x065be4a5, 0x465be4a5,
236 /* The last entry is extra; it is equal to LZMS_MAX_MATCH_OFFSET + 1 and
237 * is here to aid binary search. */
240 /* Table: offset slot => number of extra offset bits */
241 const u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS] = {
242 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2,
243 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4,
244 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5,
245 5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6,
246 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7,
247 7 , 7 , 7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8,
248 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9,
249 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9,
250 9 , 9 , 9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
251 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
252 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
253 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
254 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
255 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
256 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
257 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13,
258 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
259 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
260 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
261 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
262 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
263 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
264 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
265 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
266 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
267 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
268 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
269 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
270 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
271 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
272 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
273 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17,
274 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
275 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
276 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
277 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
278 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
279 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
280 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
281 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
282 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
283 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
284 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
285 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
286 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
287 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
288 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
289 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
290 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
291 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 30,
294 /* Table: length slot => length slot base value */
295 const u32 lzms_length_slot_base[LZMS_NUM_LENGTH_SYMS + 1] = {
296 0x00000001, 0x00000002, 0x00000003, 0x00000004,
297 0x00000005, 0x00000006, 0x00000007, 0x00000008,
298 0x00000009, 0x0000000a, 0x0000000b, 0x0000000c,
299 0x0000000d, 0x0000000e, 0x0000000f, 0x00000010,
300 0x00000011, 0x00000012, 0x00000013, 0x00000014,
301 0x00000015, 0x00000016, 0x00000017, 0x00000018,
302 0x00000019, 0x0000001a, 0x0000001b, 0x0000001d,
303 0x0000001f, 0x00000021, 0x00000023, 0x00000027,
304 0x0000002b, 0x0000002f, 0x00000033, 0x00000037,
305 0x0000003b, 0x00000043, 0x0000004b, 0x00000053,
306 0x0000005b, 0x0000006b, 0x0000007b, 0x0000008b,
307 0x0000009b, 0x000000ab, 0x000000cb, 0x000000eb,
308 0x0000012b, 0x000001ab, 0x000002ab, 0x000004ab,
309 0x000008ab, 0x000108ab, 0x400108ab,
310 /* The last entry is extra; it is equal to LZMS_MAX_MATCH_LENGTH + 1 and
311 * is here to aid binary search. */
314 /* Table: length slot => number of extra length bits */
315 const u8 lzms_extra_length_bits[LZMS_NUM_LENGTH_SYMS] = {
316 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
317 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
318 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
319 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 ,
320 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ,
321 4 , 4 , 4 , 4 , 4 , 5 , 5 , 6 ,
322 7 , 8 , 9 , 10, 16, 30,
326 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
329 unsigned r = num_slots - 1;
331 unsigned slot = (l + r) / 2;
332 if (value >= slot_base_tab[slot]) {
333 if (value < slot_base_tab[slot + 1])
343 /* Return the number of offset slots used when processing a buffer having the
344 * specified uncompressed size. */
346 lzms_get_num_offset_slots(size_t uncompressed_size)
348 if (uncompressed_size < 2)
350 return 1 + lzms_get_offset_slot(uncompressed_size - 1);
354 lzms_init_probabilities(struct lzms_probabilites *probs)
356 struct lzms_probability_entry *entries =
357 (struct lzms_probability_entry *)probs;
358 size_t num_entries = sizeof(struct lzms_probabilites) /
359 sizeof(struct lzms_probability_entry);
360 for (size_t i = 0; i < num_entries; i++) {
361 entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
362 entries[i].recent_bits = LZMS_INITIAL_RECENT_BITS;
367 lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms)
369 for (unsigned sym = 0; sym < num_syms; sym++)
374 lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms)
376 for (unsigned sym = 0; sym < num_syms; sym++)
377 freqs[sym] = (freqs[sym] >> 1) + 1;
382 static forceinline u8 *
383 find_next_opcode_sse4_2(u8 *p)
385 const __v16qi potential_opcodes = (__v16qi) {0x48, 0x4C, 0xE8, 0xE9, 0xF0, 0xFF};
387 " pcmpestri $0x0, (%[p]), %[potential_opcodes] \n"
390 " add $0x10, %[p] \n"
391 " pcmpestri $0x0, (%[p]), %[potential_opcodes] \n"
394 #ifdef __ILP32__ /* x32 ABI (x86_64 with 32-bit pointers) */
395 " add %%ecx, %[p] \n"
397 " add %%rcx, %[p] \n"
400 : [potential_opcodes] "x" (potential_opcodes), "a" (6), "d" (16)
406 #endif /* __x86_64__ */
408 static forceinline u8 *
409 find_next_opcode_default(u8 *p)
412 * The following table is used to accelerate the common case where the
413 * byte has nothing to do with x86 translation and must simply be
414 * skipped. This was faster than the following alternatives:
415 * - Jump table with 256 entries
416 * - Switch statement with default
418 static const u8 is_potential_opcode[256] = {
419 [0x48] = 1, [0x4C] = 1, [0xE8] = 1,
420 [0xE9] = 1, [0xF0] = 1, [0xFF] = 1,
424 if (is_potential_opcode[*p])
427 if (is_potential_opcode[*p])
430 if (is_potential_opcode[*p])
433 if (is_potential_opcode[*p])
440 static forceinline u8 *
441 translate_if_needed(u8 *data, u8 *p, s32 *last_x86_pos,
442 s32 last_target_usages[], bool undo)
444 s32 max_trans_offset;
449 max_trans_offset = LZMS_X86_MAX_TRANSLATION_OFFSET;
452 * p[0] has one of the following values:
453 * 0x48 0x4C 0xE8 0xE9 0xF0 0xFF
458 /* 0xFF (instruction group) */
460 /* Call indirect relative */
465 /* 0xF0 (lock prefix) */
466 if (p[1] == 0x83 && p[2] == 0x05) {
467 /* Lock add relative */
472 } else if (p[0] <= 0x4C) {
474 /* 0x48 or 0x4C. In 64-bit code this is a REX prefix byte with
475 * W=1, R=[01], X=0, and B=0, and it will be followed by the
476 * actual opcode, then additional bytes depending on the opcode.
477 * We are most interested in several common instructions that
478 * access data relative to the instruction pointer. These use a
479 * 1-byte opcode, followed by a ModR/M byte, followed by a
480 * 4-byte displacement. */
482 /* Test: does the ModR/M byte indicate RIP-relative addressing?
483 * Note: there seems to be a mistake in the format here; the
484 * mask really should be 0xC7 instead of 0x07 so that both the
485 * MOD and R/M fields of ModR/M are tested, not just R/M. */
486 if ((p[2] & 0x07) == 0x05) {
487 /* Check for the LEA (load effective address) or MOV
488 * (move) opcodes. For MOV there are additional
489 * restrictions, although it seems they are only helpful
490 * due to the overly lax ModR/M test. */
492 (p[1] == 0x8B && !(p[0] & 0x04) && !(p[2] & 0xF0)))
500 /* 0xE9: Jump relative. Theoretically this would be
501 * useful to translate, but in fact it's explicitly
502 * excluded. Most likely it creates too many false
503 * positives for the detection algorithm. */
506 /* 0xE8: Call relative. This is a common case, so it
507 * uses a reduced max_trans_offset. In other words, we
508 * have to be more confident that the data actually is
509 * x86 machine code before we'll do the translation. */
511 max_trans_offset >>= 1;
522 if (i - *last_x86_pos <= max_trans_offset) {
523 u32 n = get_unaligned_le32(p);
524 put_unaligned_le32(n - i, p);
526 target16 = i + get_unaligned_le16(p);
528 target16 = i + get_unaligned_le16(p);
529 if (i - *last_x86_pos <= max_trans_offset) {
530 u32 n = get_unaligned_le32(p);
531 put_unaligned_le32(n + i, p);
535 i += opcode_nbytes + sizeof(le32) - 1;
537 if (i - last_target_usages[target16] <= LZMS_X86_ID_WINDOW_SIZE)
540 last_target_usages[target16] = i;
542 return p + sizeof(le32);
546 * Translate relative addresses embedded in x86 instructions into absolute
547 * addresses (@undo == %false), or undo this translation (@undo == %true).
549 * Absolute addresses are usually more compressible by LZ factorization.
551 * @last_target_usages must be a temporary array of length >= 65536.
554 lzms_x86_filter(u8 data[restrict], s32 size,
555 s32 last_target_usages[restrict], bool undo)
558 * Note: this filter runs unconditionally and uses a custom algorithm to
559 * detect data regions that probably contain x86 code.
561 * 'last_x86_pos' tracks the most recent position that has a good chance
562 * of being the start of an x86 instruction. When the filter detects a
563 * likely x86 instruction, it updates this variable and considers the
564 * next LZMS_X86_MAX_TRANSLATION_OFFSET bytes of data as valid for x86
567 * If part of the data does not, in fact, contain x86 machine code, then
568 * 'last_x86_pos' will, very likely, eventually fall more than
569 * LZMS_X86_MAX_TRANSLATION_OFFSET bytes behind the current position.
570 * This results in x86 translations being disabled until the next likely
571 * x86 instruction is detected.
573 * To identify "likely x86 instructions", the algorithm attempts to
574 * track the position of the most recent potential relative-addressing
575 * instruction that referenced each possible memory address. If it
576 * finds two references to the same memory address within an
577 * LZMS_X86_ID_WINDOW_SIZE-byte sized window, then the second reference
578 * is flagged as a likely x86 instruction. Since the instructions
579 * considered for translation necessarily use relative addressing, the
580 * algorithm does a tentative translation into absolute addresses. In
581 * addition, so that memory addresses can be looked up in an array of
582 * reasonable size (in this code, 'last_target_usages'), only the
583 * low-order 2 bytes of each address are considered significant.
588 s32 last_x86_pos = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
593 for (s32 i = 0; i < 65536; i++)
594 last_target_usages[i] = -(s32)LZMS_X86_ID_WINDOW_SIZE - 1;
597 * Optimization: only check for end-of-buffer when we already have a
598 * byte that is a potential opcode for x86 translation. To do this,
599 * overwrite one of the bytes near the end of the buffer, and restore it
600 * later. The correctness of this optimization relies on two
601 * characteristics of compressed format:
603 * 1. No translation can follow an opcode beginning in the last 16
605 * 2. A translation following an opcode starting at the last possible
606 * position (17 bytes from the end) never extends more than 7 bytes.
607 * Consequently, we can overwrite any of the bytes starting at
608 * data[(size - 16) + 7] and have no effect on the result, as long
609 * as we restore those bytes later.
612 /* Note: the very first byte must be ignored completely! */
614 tail_ptr = &data[size - 16];
617 if (cpu_features & X86_CPU_FEATURE_SSE4_2) {
618 u8 saved_byte = *tail_ptr;
621 u8 *new_p = find_next_opcode_sse4_2(p);
622 if (new_p >= tail_ptr - 8)
625 p = translate_if_needed(data, p, &last_x86_pos,
626 last_target_usages, undo);
628 *tail_ptr = saved_byte;
632 u8 saved_byte = *(tail_ptr + 8);
633 *(tail_ptr + 8) = 0xE8;
635 p = find_next_opcode_default(p);
638 p = translate_if_needed(data, p, &last_x86_pos,
639 last_target_usages, undo);
641 *(tail_ptr + 8) = saved_byte;