/*
 * lzms-common.c, lzms-compress.c: Use pthread_once()
 * [wimlib] / src / lzms-common.c
 */
1 /*
2  * lzms-common.c
3  *
4  * Code shared between the compressor and decompressor for the LZMS compression
5  * format.
6  */
7
8 /*
9  * Copyright (C) 2013 Eric Biggers
10  *
11  * This file is part of wimlib, a library for working with WIM files.
12  *
13  * wimlib is free software; you can redistribute it and/or modify it under the
14  * terms of the GNU General Public License as published by the Free
15  * Software Foundation; either version 3 of the License, or (at your option)
16  * any later version.
17  *
18  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
19  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
20  * A PARTICULAR PURPOSE. See the GNU General Public License for more
21  * details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with wimlib; if not, see http://www.gnu.org/licenses/.
25  */
26
27 #ifdef HAVE_CONFIG_H
28 #  include "config.h"
29 #endif
30
31 #include "wimlib/endianness.h"
32 #include "wimlib/lzms.h"
33 #include "wimlib/util.h"
34
35 #include <pthread.h>
36
37 /***************************************************************
38  * Constant tables initialized by lzms_compute_slots():        *
39  ***************************************************************/
40
41 /* Table: position slot => position slot base value  */
42 u32 lzms_position_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
43
44 /* Table: position slot => number of extra position bits  */
45 u8 lzms_extra_position_bits[LZMS_MAX_NUM_OFFSET_SYMS];
46
47 /* Table: log2(position) => [lower bound, upper bound] on position slot  */
48 u16 lzms_order_to_position_slot_bounds[30][2];
49
50 /* Table: length slot => length slot base value  */
51 u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
52
53 /* Table: length slot => number of extra length bits  */
54 u8 lzms_extra_length_bits[LZMS_NUM_LEN_SYMS];
55
56 /* Table: length (< LZMS_NUM_FAST_LENGTHS only) => length slot  */
57 u8 lzms_length_slot_fast[LZMS_NUM_FAST_LENGTHS];
58
59 u32
60 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
61 {
62         u32 l = 0;
63         u32 r = num_slots - 1;
64         for (;;) {
65                 LZMS_ASSERT(r >= l);
66                 u32 slot = (l + r) / 2;
67                 if (value >= slot_base_tab[slot]) {
68                         if (value < slot_base_tab[slot + 1])
69                                 return slot;
70                         else
71                                 l = slot + 1;
72                 } else {
73                         r = slot - 1;
74                 }
75         }
76 }
77
78 static void
79 lzms_decode_delta_rle_slot_bases(u32 slot_bases[],
80                                  u8 extra_bits[],
81                                  const u8 delta_run_lens[],
82                                  u32 num_run_lens,
83                                  u32 final,
84                                  u32 expected_num_slots)
85 {
86         u32 order = 0;
87         u32 delta = 1;
88         u32 base = 0;
89         u32 slot = 0;
90         for (u32 i = 0; i < num_run_lens; i++) {
91                 u8 run_len = delta_run_lens[i];
92                 while (run_len--) {
93                         base += delta;
94                         if (slot > 0)
95                                 extra_bits[slot - 1] = order;
96                         slot_bases[slot] = base;
97                         slot++;
98                 }
99                 delta <<= 1;
100                 order++;
101         }
102         LZMS_ASSERT(slot == expected_num_slots);
103
104         slot_bases[slot] = final;
105         extra_bits[slot - 1] = bsr32(slot_bases[slot] - slot_bases[slot - 1]);
106 }
107
108 /* Initialize the global position and length slot tables.  */
109 static void
110 lzms_compute_slots(void)
111 {
112         /* If an explicit formula that maps LZMS position and length slots to
113          * slot bases exists, then it could be used here.  But until one is
114          * found, the following code fills in the slots using the observation
115          * that the increase from one slot base to the next is an increasing
116          * power of 2.  Therefore, run-length encoding of the delta of adjacent
117          * entries can be used.  */
118         static const u8 position_slot_delta_run_lens[] = {
119                 9,   0,   9,   7,   10,  15,  15,  20,
120                 20,  30,  33,  40,  42,  45,  60,  73,
121                 80,  85,  95,  105, 6,
122         };
123
124         static const u8 length_slot_delta_run_lens[] = {
125                 27,  4,   6,   4,   5,   2,   1,   1,
126                 1,   1,   1,   0,   0,   0,   0,   0,
127                 1,
128         };
129
130         /* Position slots  */
131         lzms_decode_delta_rle_slot_bases(lzms_position_slot_base,
132                                          lzms_extra_position_bits,
133                                          position_slot_delta_run_lens,
134                                          ARRAY_LEN(position_slot_delta_run_lens),
135                                          0x7fffffff,
136                                          LZMS_MAX_NUM_OFFSET_SYMS);
137
138         for (u32 order = 0; order < 30; order++) {
139                 lzms_order_to_position_slot_bounds[order][0] =
140                         lzms_get_slot(1U << order, lzms_position_slot_base,
141                                       LZMS_MAX_NUM_OFFSET_SYMS);
142                 lzms_order_to_position_slot_bounds[order][1] =
143                         lzms_get_slot((1U << (order + 1)) - 1, lzms_position_slot_base,
144                                       LZMS_MAX_NUM_OFFSET_SYMS);
145         }
146
147         /* Length slots  */
148         lzms_decode_delta_rle_slot_bases(lzms_length_slot_base,
149                                          lzms_extra_length_bits,
150                                          length_slot_delta_run_lens,
151                                          ARRAY_LEN(length_slot_delta_run_lens),
152                                          0x400108ab,
153                                          LZMS_NUM_LEN_SYMS);
154
155         /* Create table mapping short lengths to length slots.  */
156         for (u32 slot = 0, i = 0; i < LZMS_NUM_FAST_LENGTHS; i++) {
157                 if (i >= lzms_length_slot_base[slot + 1])
158                         slot++;
159                 lzms_length_slot_fast[i] = slot;
160         }
161 }
162
/* Initialize the global position and length slot tables if not done so
 * already.  Thread-safe: pthread_once() guarantees lzms_compute_slots() runs
 * exactly once even when multiple threads call this concurrently.  */
void
lzms_init_slots(void)
{
        static pthread_once_t once = PTHREAD_ONCE_INIT;

        pthread_once(&once, lzms_compute_slots);
}
172
/*
 * Possibly translate (or untranslate, if @undo) the 32-bit relative operand
 * of the x86 instruction whose opcode begins at data[i].
 *
 * The patch is only applied when the last "good" target usage was at most
 * @max_trans_offset bytes before position @i; otherwise only the usage-
 * tracking state is updated.  Returns the position at which scanning should
 * resume (just past this instruction's operand).
 *
 * NOTE(review): the le16/le32 pointer casts into @data assume unaligned
 * little-endian access is acceptable on the target platform — presumably
 * guaranteed elsewhere by wimlib's endianness helpers; confirm.
 */
static s32
lzms_maybe_do_x86_translation(u8 data[restrict], s32 i, s32 num_op_bytes,
                              s32 * restrict closest_target_usage_p,
                              s32 last_target_usages[restrict],
                              s32 max_trans_offset, bool undo)
{
        u16 pos;

        if (undo) {
                /* Undo first, then read @pos from the restored bytes: this
                 * mirrors the forward path, where @pos is read from the
                 * untranslated bytes before patching.  */
                if (i - *closest_target_usage_p <= max_trans_offset) {
                        LZMS_DEBUG("Undid x86 translation at position %d "
                                   "(opcode 0x%02x)", i, data[i]);
                        le32 *p32 = (le32*)&data[i + num_op_bytes];
                        u32 n = le32_to_cpu(*p32);
                        /* Absolute -> relative: subtract the instruction
                         * position.  */
                        *p32 = cpu_to_le32(n - i);
                }
                pos = i + le16_to_cpu(*(const le16*)&data[i + num_op_bytes]);
        } else {
                /* Low 16 bits of the (absolute) target; the deliberate u16
                 * wrap indexes the 65536-entry last_target_usages table.  */
                pos = i + le16_to_cpu(*(const le16*)&data[i + num_op_bytes]);

                if (i - *closest_target_usage_p <= max_trans_offset) {
                        LZMS_DEBUG("Did x86 translation at position %d "
                                   "(opcode 0x%02x)", i, data[i]);
                        le32 *p32 = (le32*)&data[i + num_op_bytes];
                        u32 n = le32_to_cpu(*p32);
                        /* Relative -> absolute: add the instruction
                         * position.  */
                        *p32 = cpu_to_le32(n + i);
                }
        }

        /* Advance to the last byte of the 32-bit operand.  */
        i += num_op_bytes + sizeof(le32) - 1;

        /* Only targets re-used within LZMS_X86_MAX_GOOD_TARGET_OFFSET bytes
         * count as "good" and enable future translations.  */
        if (i - last_target_usages[pos] <= LZMS_X86_MAX_GOOD_TARGET_OFFSET)
                *closest_target_usage_p = i;

        last_target_usages[pos] = i;

        return i + 1;
}
211
212 static s32
213 lzms_may_x86_translate(const u8 p[restrict],
214                        s32 *restrict max_offset_ret)
215 {
216         /* Switch on first byte of the opcode, assuming it is really an x86
217          * instruction.  */
218         *max_offset_ret = LZMS_X86_MAX_TRANSLATION_OFFSET;
219         switch (p[0]) {
220         case 0x48:
221                 if (p[1] == 0x8b) {
222                         if (p[2] == 0x5 || p[2] == 0xd) {
223                                 /* Load relative (x86_64)  */
224                                 return 3;
225                         }
226                 } else if (p[1] == 0x8d) {
227                         if ((p[2] & 0x7) == 0x5) {
228                                 /* Load effective address relative (x86_64)  */
229                                 return 3;
230                         }
231                 }
232                 break;
233
234         case 0x4c:
235                 if (p[1] == 0x8d) {
236                         if ((p[2] & 0x7) == 0x5) {
237                                 /* Load effective address relative (x86_64)  */
238                                 return 3;
239                         }
240                 }
241                 break;
242
243         case 0xe8:
244                 /* Call relative  */
245                 *max_offset_ret = LZMS_X86_MAX_TRANSLATION_OFFSET / 2;
246                 return 1;
247
248         case 0xe9:
249                 /* Jump relative  */
250                 *max_offset_ret = 0;
251                 return 5;
252
253         case 0xf0:
254                 if (p[1] == 0x83 && p[2] == 0x05) {
255                         /* Lock add relative  */
256                         return 3;
257                 }
258                 break;
259
260         case 0xff:
261                 if (p[1] == 0x15) {
262                         /* Call indirect  */
263                         return 2;
264                 }
265                 break;
266         }
267         *max_offset_ret = 0;
268         return 1;
269 }
270
271 /*
272  * Translate relative addresses embedded in x86 instructions into absolute
273  * addresses (@undo == %false), or undo this translation (@undo == %true).
274  *
275  * @last_target_usages is a temporary array of length >= 65536.
276  */
277 void
278 lzms_x86_filter(u8 data[restrict],
279                 s32 size,
280                 s32 last_target_usages[restrict],
281                 bool undo)
282 {
283         s32 closest_target_usage = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
284
285         for (s32 i = 0; i < 65536; i++)
286                 last_target_usages[i] = -LZMS_X86_MAX_GOOD_TARGET_OFFSET - 1;
287
288         for (s32 i = 0; i < size - 16; ) {
289                 s32 max_trans_offset;
290                 s32 n;
291
292                 n = lzms_may_x86_translate(data + i, &max_trans_offset);
293                 if (max_trans_offset) {
294                         i = lzms_maybe_do_x86_translation(data, i, n,
295                                                           &closest_target_usage,
296                                                           last_target_usages,
297                                                           max_trans_offset,
298                                                           undo);
299                 } else {
300                         i += n;
301                 }
302         }
303 }
304
305 static void
306 lzms_init_lz_lru_queues(struct lzms_lz_lru_queues *lz)
307 {
308         /* Recent offsets for LZ matches  */
309         for (u32 i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++)
310                 lz->recent_offsets[i] = i + 1;
311
312         lz->prev_offset = 0;
313         lz->upcoming_offset = 0;
314 }
315
316 static void
317 lzms_init_delta_lru_queues(struct lzms_delta_lru_queues *delta)
318 {
319         /* Recent offsets and powers for LZ matches  */
320         for (u32 i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++) {
321                 delta->recent_offsets[i] = i + 1;
322                 delta->recent_powers[i] = 0;
323         }
324         delta->prev_offset = 0;
325         delta->prev_power = 0;
326         delta->upcoming_offset = 0;
327         delta->upcoming_power = 0;
328 }
329
330
331 void
332 lzms_init_lru_queues(struct lzms_lru_queues *lru)
333 {
334         lzms_init_lz_lru_queues(&lru->lz);
335         lzms_init_delta_lru_queues(&lru->delta);
336 }
337
338 void
339 lzms_update_lz_lru_queues(struct lzms_lz_lru_queues *lz)
340 {
341         if (lz->prev_offset != 0) {
342                 for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--)
343                         lz->recent_offsets[i + 1] = lz->recent_offsets[i];
344                 lz->recent_offsets[0] = lz->prev_offset;
345         }
346         lz->prev_offset = lz->upcoming_offset;
347 }
348
349 void
350 lzms_update_delta_lru_queues(struct lzms_delta_lru_queues *delta)
351 {
352         if (delta->prev_offset != 0) {
353                 for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--) {
354                         delta->recent_offsets[i + 1] = delta->recent_offsets[i];
355                         delta->recent_powers[i + 1] = delta->recent_powers[i];
356                 }
357                 delta->recent_offsets[0] = delta->prev_offset;
358                 delta->recent_powers[0] = delta->prev_power;
359         }
360
361         delta->prev_offset = delta->upcoming_offset;
362         delta->prev_power = delta->upcoming_power;
363 }
364
365 void
366 lzms_update_lru_queues(struct lzms_lru_queues *lru)
367 {
368         lzms_update_lz_lru_queues(&lru->lz);
369         lzms_update_delta_lru_queues(&lru->delta);
370 }