a34fb245265bc5dab3013535ec16198aed593cbf
[wimlib] / src / lzms-common.c
1 /*
2  * lzms-common.c
3  *
4  * Code shared between the compressor and decompressor for the LZMS compression
5  * format.
6  */
7
8 /*
9  * Copyright (C) 2013 Eric Biggers
10  *
11  * This file is part of wimlib, a library for working with WIM files.
12  *
13  * wimlib is free software; you can redistribute it and/or modify it under the
14  * terms of the GNU General Public License as published by the Free
15  * Software Foundation; either version 3 of the License, or (at your option)
16  * any later version.
17  *
18  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
19  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
20  * A PARTICULAR PURPOSE. See the GNU General Public License for more
21  * details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with wimlib; if not, see http://www.gnu.org/licenses/.
25  */
26
27 #ifdef HAVE_CONFIG_H
28 #  include "config.h"
29 #endif
30
31 #include "wimlib/endianness.h"
32 #include "wimlib/lzms.h"
33 #include "wimlib/util.h"
34
35 #include <pthread.h>
36
37 /***************************************************************
38  * Constant tables initialized by lzms_compute_slots():        *
39  ***************************************************************/
40
41 /* Table: position slot => position slot base value  */
42 u32 lzms_position_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
43
44 /* Table: position slot => number of extra position bits  */
45 u8 lzms_extra_position_bits[LZMS_MAX_NUM_OFFSET_SYMS];
46
47 /* Table: log2(position) => [lower bound, upper bound] on position slot  */
48 u16 lzms_order_to_position_slot_bounds[30][2];
49
50 /* Table: length slot => length slot base value  */
51 u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
52
53 /* Table: length slot => number of extra length bits  */
54 u8 lzms_extra_length_bits[LZMS_NUM_LEN_SYMS];
55
56 /* Table: length (< LZMS_NUM_FAST_LENGTHS only) => length slot  */
57 u8 lzms_length_slot_fast[LZMS_NUM_FAST_LENGTHS];
58
59 u32
60 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
61 {
62         u32 l = 0;
63         u32 r = num_slots - 1;
64         for (;;) {
65                 LZMS_ASSERT(r >= l);
66                 u32 slot = (l + r) / 2;
67                 if (value >= slot_base_tab[slot]) {
68                         if (value < slot_base_tab[slot + 1])
69                                 return slot;
70                         else
71                                 l = slot + 1;
72                 } else {
73                         r = slot - 1;
74                 }
75         }
76 }
77
78 static void
79 lzms_decode_delta_rle_slot_bases(u32 slot_bases[],
80                                  u8 extra_bits[],
81                                  const u8 delta_run_lens[],
82                                  u32 num_run_lens,
83                                  u32 final,
84                                  u32 expected_num_slots)
85 {
86         u32 order = 0;
87         u32 delta = 1;
88         u32 base = 0;
89         u32 slot = 0;
90         for (u32 i = 0; i < num_run_lens; i++) {
91                 u8 run_len = delta_run_lens[i];
92                 while (run_len--) {
93                         base += delta;
94                         if (slot > 0)
95                                 extra_bits[slot - 1] = order;
96                         slot_bases[slot] = base;
97                         slot++;
98                 }
99                 delta <<= 1;
100                 order++;
101         }
102         LZMS_ASSERT(slot == expected_num_slots);
103
104         slot_bases[slot] = final;
105         extra_bits[slot - 1] = bsr32(slot_bases[slot] - slot_bases[slot - 1]);
106 }
107
108 /* Initialize the global position and length slot tables.  */
109 static void
110 lzms_compute_slots(void)
111 {
112         /* If an explicit formula that maps LZMS position and length slots to
113          * slot bases exists, then it could be used here.  But until one is
114          * found, the following code fills in the slots using the observation
115          * that the increase from one slot base to the next is an increasing
116          * power of 2.  Therefore, run-length encoding of the delta of adjacent
117          * entries can be used.  */
118         static const u8 position_slot_delta_run_lens[] = {
119                 9,   0,   9,   7,   10,  15,  15,  20,
120                 20,  30,  33,  40,  42,  45,  60,  73,
121                 80,  85,  95,  105, 6,
122         };
123
124         static const u8 length_slot_delta_run_lens[] = {
125                 27,  4,   6,   4,   5,   2,   1,   1,
126                 1,   1,   1,   0,   0,   0,   0,   0,
127                 1,
128         };
129
130         /* Position slots  */
131         lzms_decode_delta_rle_slot_bases(lzms_position_slot_base,
132                                          lzms_extra_position_bits,
133                                          position_slot_delta_run_lens,
134                                          ARRAY_LEN(position_slot_delta_run_lens),
135                                          0x7fffffff,
136                                          LZMS_MAX_NUM_OFFSET_SYMS);
137
138         for (u32 order = 0; order < 30; order++) {
139                 lzms_order_to_position_slot_bounds[order][0] =
140                         lzms_get_slot(1U << order, lzms_position_slot_base,
141                                       LZMS_MAX_NUM_OFFSET_SYMS);
142                 lzms_order_to_position_slot_bounds[order][1] =
143                         lzms_get_slot((1U << (order + 1)) - 1, lzms_position_slot_base,
144                                       LZMS_MAX_NUM_OFFSET_SYMS);
145         }
146
147         /* Length slots  */
148         lzms_decode_delta_rle_slot_bases(lzms_length_slot_base,
149                                          lzms_extra_length_bits,
150                                          length_slot_delta_run_lens,
151                                          ARRAY_LEN(length_slot_delta_run_lens),
152                                          0x400108ab,
153                                          LZMS_NUM_LEN_SYMS);
154
155         /* Create table mapping short lengths to length slots.  */
156         for (u32 slot = 0, i = 0; i < LZMS_NUM_FAST_LENGTHS; i++) {
157                 if (i >= lzms_length_slot_base[slot + 1])
158                         slot++;
159                 lzms_length_slot_fast[i] = slot;
160         }
161 }
162
163 /* Initialize the global position and length slot tables if not done so already.
164  * */
165 void
166 lzms_init_slots(void)
167 {
168         static bool done = false;
169         static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
170
171         if (unlikely(!done)) {
172                 pthread_mutex_lock(&mutex);
173                 if (!done) {
174                         lzms_compute_slots();
175                         done = true;
176                 }
177                 pthread_mutex_unlock(&mutex);
178         }
179 }
180
181 static s32
182 lzms_maybe_do_x86_translation(u8 data[restrict], s32 i, s32 num_op_bytes,
183                               s32 * restrict closest_target_usage_p,
184                               s32 last_target_usages[restrict],
185                               s32 max_trans_offset, bool undo)
186 {
187         u16 pos;
188
189         if (undo) {
190                 if (i - *closest_target_usage_p <= max_trans_offset) {
191                         LZMS_DEBUG("Undid x86 translation at position %d "
192                                    "(opcode 0x%02x)", i, data[i]);
193                         le32 *p32 = (le32*)&data[i + num_op_bytes];
194                         u32 n = le32_to_cpu(*p32);
195                         *p32 = cpu_to_le32(n - i);
196                 }
197                 pos = i + le16_to_cpu(*(const le16*)&data[i + num_op_bytes]);
198         } else {
199                 pos = i + le16_to_cpu(*(const le16*)&data[i + num_op_bytes]);
200
201                 if (i - *closest_target_usage_p <= max_trans_offset) {
202                         LZMS_DEBUG("Did x86 translation at position %d "
203                                    "(opcode 0x%02x)", i, data[i]);
204                         le32 *p32 = (le32*)&data[i + num_op_bytes];
205                         u32 n = le32_to_cpu(*p32);
206                         *p32 = cpu_to_le32(n + i);
207                 }
208         }
209
210         i += num_op_bytes + sizeof(le32) - 1;
211
212         if (i - last_target_usages[pos] <= LZMS_X86_MAX_GOOD_TARGET_OFFSET)
213                 *closest_target_usage_p = i;
214
215         last_target_usages[pos] = i;
216
217         return i + 1;
218 }
219
220 static s32
221 lzms_may_x86_translate(const u8 p[restrict],
222                        s32 *restrict max_offset_ret)
223 {
224         /* Switch on first byte of the opcode, assuming it is really an x86
225          * instruction.  */
226         *max_offset_ret = LZMS_X86_MAX_TRANSLATION_OFFSET;
227         switch (p[0]) {
228         case 0x48:
229                 if (p[1] == 0x8b) {
230                         if (p[2] == 0x5 || p[2] == 0xd) {
231                                 /* Load relative (x86_64)  */
232                                 return 3;
233                         }
234                 } else if (p[1] == 0x8d) {
235                         if ((p[2] & 0x7) == 0x5) {
236                                 /* Load effective address relative (x86_64)  */
237                                 return 3;
238                         }
239                 }
240                 break;
241
242         case 0x4c:
243                 if (p[1] == 0x8d) {
244                         if ((p[2] & 0x7) == 0x5) {
245                                 /* Load effective address relative (x86_64)  */
246                                 return 3;
247                         }
248                 }
249                 break;
250
251         case 0xe8:
252                 /* Call relative  */
253                 *max_offset_ret = LZMS_X86_MAX_TRANSLATION_OFFSET / 2;
254                 return 1;
255
256         case 0xe9:
257                 /* Jump relative  */
258                 *max_offset_ret = 0;
259                 return 5;
260
261         case 0xf0:
262                 if (p[1] == 0x83 && p[2] == 0x05) {
263                         /* Lock add relative  */
264                         return 3;
265                 }
266                 break;
267
268         case 0xff:
269                 if (p[1] == 0x15) {
270                         /* Call indirect  */
271                         return 2;
272                 }
273                 break;
274         }
275         *max_offset_ret = 0;
276         return 1;
277 }
278
279 /*
280  * Translate relative addresses embedded in x86 instructions into absolute
281  * addresses (@undo == %false), or undo this translation (@undo == %true).
282  *
283  * @last_target_usages is a temporary array of length >= 65536.
284  */
285 void
286 lzms_x86_filter(u8 data[restrict],
287                 s32 size,
288                 s32 last_target_usages[restrict],
289                 bool undo)
290 {
291         s32 closest_target_usage = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
292
293         for (s32 i = 0; i < 65536; i++)
294                 last_target_usages[i] = -LZMS_X86_MAX_GOOD_TARGET_OFFSET - 1;
295
296         for (s32 i = 0; i < size - 16; ) {
297                 s32 max_trans_offset;
298                 s32 n;
299
300                 n = lzms_may_x86_translate(data + i, &max_trans_offset);
301                 if (max_trans_offset) {
302                         i = lzms_maybe_do_x86_translation(data, i, n,
303                                                           &closest_target_usage,
304                                                           last_target_usages,
305                                                           max_trans_offset,
306                                                           undo);
307                 } else {
308                         i += n;
309                 }
310         }
311 }
312
313 static void
314 lzms_init_lz_lru_queues(struct lzms_lz_lru_queues *lz)
315 {
316         /* Recent offsets for LZ matches  */
317         for (u32 i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++)
318                 lz->recent_offsets[i] = i + 1;
319
320         lz->prev_offset = 0;
321         lz->upcoming_offset = 0;
322 }
323
324 static void
325 lzms_init_delta_lru_queues(struct lzms_delta_lru_queues *delta)
326 {
327         /* Recent offsets and powers for LZ matches  */
328         for (u32 i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++) {
329                 delta->recent_offsets[i] = i + 1;
330                 delta->recent_powers[i] = 0;
331         }
332         delta->prev_offset = 0;
333         delta->prev_power = 0;
334         delta->upcoming_offset = 0;
335         delta->upcoming_power = 0;
336 }
337
338
339 void
340 lzms_init_lru_queues(struct lzms_lru_queues *lru)
341 {
342         lzms_init_lz_lru_queues(&lru->lz);
343         lzms_init_delta_lru_queues(&lru->delta);
344 }
345
346 void
347 lzms_update_lz_lru_queues(struct lzms_lz_lru_queues *lz)
348 {
349         if (lz->prev_offset != 0) {
350                 for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--)
351                         lz->recent_offsets[i + 1] = lz->recent_offsets[i];
352                 lz->recent_offsets[0] = lz->prev_offset;
353         }
354         lz->prev_offset = lz->upcoming_offset;
355 }
356
357 void
358 lzms_update_delta_lru_queues(struct lzms_delta_lru_queues *delta)
359 {
360         if (delta->prev_offset != 0) {
361                 for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--) {
362                         delta->recent_offsets[i + 1] = delta->recent_offsets[i];
363                         delta->recent_powers[i + 1] = delta->recent_powers[i];
364                 }
365                 delta->recent_offsets[0] = delta->prev_offset;
366                 delta->recent_powers[0] = delta->prev_power;
367         }
368
369         delta->prev_offset = delta->upcoming_offset;
370         delta->prev_power = delta->upcoming_power;
371 }
372
373 void
374 lzms_update_lru_queues(struct lzms_lru_queues *lru)
375 {
376         lzms_update_lz_lru_queues(&lru->lz);
377         lzms_update_delta_lru_queues(&lru->delta);
378 }