2 * lzms-common.c - Common code for LZMS compression and decompression
6 * Copyright (C) 2013, 2014 Eric Biggers
8 * This file is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU Lesser General Public License as published by the Free
10 * Software Foundation; either version 3 of the License, or (at your option) any
13 * This file is distributed in the hope that it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this file; if not, see http://www.gnu.org/licenses/.
26 #include "wimlib/bitops.h"
27 #include "wimlib/endianness.h"
28 #include "wimlib/lzms.h"
29 #include "wimlib/unaligned.h"
30 #include "wimlib/util.h"
34 /***************************************************************
35 * Constant tables initialized by lzms_compute_slots(): *
36 ***************************************************************/
38 /* Table: offset slot => offset slot base value */
39 u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1];
41 /* Table: offset slot => number of extra offset bits */
42 u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS];
44 /* Table: length slot => length slot base value */
45 u32 lzms_length_slot_base[LZMS_NUM_LEN_SYMS + 1];
47 /* Table: length slot => number of extra length bits */
48 u8 lzms_extra_length_bits[LZMS_NUM_LEN_SYMS];
/*
 * Map @value to its slot: binary-search @slot_base_tab for the slot whose
 * half-open interval [slot_base_tab[slot], slot_base_tab[slot + 1]) contains
 * @value.  Requires @slot_base_tab to have @num_slots + 1 entries (the last
 * one a sentinel) and to be sorted ascending.
 *
 * NOTE(review): this listing is incomplete — the return type, the lower-bound
 * initialization, the loop construct, and the remaining branches/return are
 * missing from the captured lines below.
 */
51 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
/* 'r' is the inclusive upper bound of the search range. */
54 unsigned r = num_slots - 1;
/* Probe the midpoint of the current search range. */
57 unsigned slot = (l + r) / 2;
/* A hit: value lies at or above this slot's base ... */
58 if (value >= slot_base_tab[slot]) {
/* ... and below the next slot's base, so 'slot' is the answer. */
59 if (value < slot_base_tab[slot + 1])
/*
 * Fill in @slot_bases[] (slot => base value) and the matching extra-bits
 * table from a run-length-encoded description of the deltas between adjacent
 * slot bases: delta_run_lens[i] is the number of consecutive slots whose base
 * increases by the same power-of-2 delta (per the comment in
 * lzms_compute_slots()).
 *
 * NOTE(review): several lines are missing from this listing — notably the
 * extra_bits[] parameter declaration, the initialization of 'slot'/'base'/
 * 'order', and most of the inner loop body.
 */
70 lzms_decode_delta_rle_slot_bases(u32 slot_bases[],
72 const u8 delta_run_lens[],
73 unsigned num_run_lens,
75 unsigned expected_num_slots)
/* Expand each run: every slot in the run records the current delta order
 * and the running base value. */
81 for (unsigned i = 0; i < num_run_lens; i++) {
82 unsigned run_len = delta_run_lens[i];
86 extra_bits[slot - 1] = order;
87 slot_bases[slot] = base;
/* Sanity check: the runs must expand to exactly the expected slot count. */
93 LZMS_ASSERT(slot == expected_num_slots);
/* Write the final sentinel base; the last slot's extra-bit count is derived
 * from the log2 of the gap up to that sentinel. */
95 slot_bases[slot] = final;
96 extra_bits[slot - 1] = fls32(slot_bases[slot] - slot_bases[slot - 1]);
99 /* Initialize the global offset and length slot tables. */
/* NOTE(review): the return type line, the tails of both RLE tables, and some
 * call arguments are missing from this listing. */
101 lzms_compute_slots(void)
103 /* If an explicit formula that maps LZMS offset and length slots to slot
104 * bases exists, then it could be used here. But until one is found,
105 * the following code fills in the slots using the observation that the
106 * increase from one slot base to the next is an increasing power of 2.
107 * Therefore, run-length encoding of the delta of adjacent entries can
/* Run lengths for the offset-slot base deltas (table visibly truncated in
 * this listing). */
109 static const u8 offset_slot_delta_run_lens[] = {
110 9, 0, 9, 7, 10, 15, 15, 20,
111 20, 30, 33, 40, 42, 45, 60, 73,
/* Run lengths for the length-slot base deltas (also truncated here). */
115 static const u8 length_slot_delta_run_lens[] = {
116 27, 4, 6, 4, 5, 2, 1, 1,
117 1, 1, 1, 0, 0, 0, 0, 0,
/* Expand the offset RLE data into the global offset slot tables. */
122 lzms_decode_delta_rle_slot_bases(lzms_offset_slot_base,
123 lzms_extra_offset_bits,
124 offset_slot_delta_run_lens,
125 ARRAY_LEN(offset_slot_delta_run_lens),
127 LZMS_MAX_NUM_OFFSET_SYMS);
/* Likewise for the length slot tables. */
130 lzms_decode_delta_rle_slot_bases(lzms_length_slot_base,
131 lzms_extra_length_bits,
132 length_slot_delta_run_lens,
133 ARRAY_LEN(length_slot_delta_run_lens),
138 /* Initialize the global offset and length slot tables if not already done. */
140 lzms_init_slots(void)
/* pthread_once() guarantees lzms_compute_slots() runs exactly once even if
 * multiple threads call lzms_init_slots() concurrently. */
142 static pthread_once_t once = PTHREAD_ONCE_INIT;
144 pthread_once(&once, lzms_compute_slots);
148 * Translate relative addresses embedded in x86 instructions into absolute
149 * addresses (@undo == %false), or undo this translation (@undo == %true).
151 * Absolute addresses are usually more compressible by LZ factorization.
153 * @last_target_usages must be a temporary array of length >= 65536.
/* NOTE(review): large portions of this function are missing from this
 * listing (variable declarations, braces, switch/case structure, and many
 * statements); the surviving lines are annotated below but the control flow
 * cannot be fully verified from this view. */
156 lzms_x86_filter(u8 data[restrict], s32 size,
157 s32 last_target_usages[restrict], bool undo)
160 * Note: this filter runs unconditionally and uses a custom algorithm to
161 * detect data regions that probably contain x86 code.
163 * 'last_x86_pos' tracks the most recent position that has a good chance
164 * of being the start of an x86 instruction. When the filter detects a
165 * likely x86 instruction, it updates this variable and considers the
166 * next LZMS_X86_MAX_TRANSLATION_OFFSET bytes of data as valid for x86
169 * If part of the data does not, in fact, contain x86 machine code, then
170 * 'last_x86_pos' will, very likely, eventually fall more than
171 * LZMS_X86_MAX_TRANSLATION_OFFSET bytes behind the current position.
172 * This results in x86 translations being disabled until the next likely
173 * x86 instruction is detected.
175 * To identify "likely x86 instructions", the algorithm attempts to
176 * track the position of the most recent potential relative-addressing
177 * instruction that referenced each possible memory address. If it
178 * finds two references to the same memory address within an
179 * LZMS_X86_ID_WINDOW_SIZE-byte sized window, then the second reference
180 * is flagged as a likely x86 instruction. Since the instructions
181 * considered for translation necessarily use relative addressing, the
182 * algorithm does a tentative translation into absolute addresses. In
183 * addition, so that memory addresses can be looked up in an array of
184 * reasonable size (in this code, 'last_target_usages'), only the
185 * low-order 2 bytes of each address are considered significant.
/* Seed every 16-bit target with a position so far in the past that it can
 * never fall inside the ID window on first use. */
196 for (i = 0; i < 65536; i++)
197 last_target_usages[i] = -(s32)LZMS_X86_ID_WINDOW_SIZE - 1;
200 * Optimization: only check for end-of-buffer when we already have a
201 * byte that is a potential opcode for x86 translation. To do this,
202 * overwrite one of the bytes near the end of the buffer, and restore it
203 * later. The correctness of this optimization relies on two
204 * characteristics of compressed format:
206 * 1. No translation can follow an opcode beginning in the last 16
208 * 2. A translation following an opcode starting at the last possible
209 * position (17 bytes from the end) never extends more than 7 bytes.
210 * Consequently, we can overwrite any of the bytes starting at
211 * data[(size - 16) + 7] and have no effect on the result, as long
212 * as we restore those bytes later.
/* Plant a sentinel opcode byte (0xE8 = CALL rel32) near the end so the hot
 * scanning loop below needs no explicit bounds check; restored at the end. */
214 tail_idx = size - 16;
215 saved_byte = data[tail_idx + 8];
216 data[tail_idx + 8] = 0xE8;
/* Start with translation disabled: 'last_x86_pos' is out of range. */
217 last_x86_pos = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
219 /* Note: the very first byte must be ignored completely! */
222 s32 max_trans_offset;
227 * The following table is used to accelerate the common case
228 * where the byte has nothing to do with x86 translation and
229 * must simply be skipped. This is the fastest (at least on
230 * x86_64) of the implementations I tested. The other
231 * implementations I tested were:
232 * - Jump table with 256 entries
233 * - Switch statement with default
/* Opcodes of interest: 0x48/0x4C (REX prefixes), 0xE8 (CALL rel32),
 * 0xE9 (JMP rel32), 0xF0 (LOCK prefix), 0xFF (indirect call group). */
235 static const u8 is_potential_opcode[256] = {
236 [0x48] = 1, [0x4C] = 1, [0xE8] = 1,
237 [0xE9] = 1, [0xF0] = 1, [0xFF] = 1,
/* 4x-unrolled skip loop; the planted sentinel guarantees termination
 * without a per-byte bounds check. */
241 if (is_potential_opcode[data[++i]])
243 if (is_potential_opcode[data[++i]])
245 if (is_potential_opcode[data[++i]])
247 if (is_potential_opcode[data[++i]])
254 max_trans_offset = LZMS_X86_MAX_TRANSLATION_OFFSET;
/* Decode the bytes after the opcode to classify the instruction. */
257 if (data[i + 1] == 0x8B) {
258 if (data[i + 2] == 0x5 || data[i + 2] == 0xD) {
259 /* Load relative (x86_64) */
263 } else if (data[i + 1] == 0x8D) {
264 if ((data[i + 2] & 0x7) == 0x5) {
265 /* Load effective address relative (x86_64) */
272 if (data[i + 1] == 0x8D) {
273 if ((data[i + 2] & 0x7) == 0x5) {
274 /* Load effective address relative (x86_64) */
281 /* Call relative. Note: 'max_trans_offset' must be
282 * halved for this instruction. This means that we must
283 * be more confident that we are in a region of x86
284 * machine code before we will do a translation for this
285 * particular instruction. */
287 max_trans_offset /= 2;
294 if (data[i + 1] == 0x83 && data[i + 2] == 0x05) {
295 /* Lock add relative */
301 if (data[i + 1] == 0x15) {
/* Undo path: subtract the position to turn the stored absolute
 * address back into a relative displacement. */
313 if (i - last_x86_pos <= max_trans_offset) {
314 LZMS_DEBUG("Undid x86 translation at position %d "
315 "(opcode 0x%02x)", i, data[i]);
316 void *p32 = &data[i + opcode_nbytes];
317 u32 n = get_unaligned_u32_le(p32);
318 put_unaligned_u32_le(n - i, p32);
/* Only the low-order 16 bits of the target participate in the
 * last_target_usages[] lookup (see comment block above). */
320 target16 = i + get_unaligned_u16_le(&data[i + opcode_nbytes]);
322 target16 = i + get_unaligned_u16_le(&data[i + opcode_nbytes]);
/* Forward path: add the position to make the displacement absolute. */
323 if (i - last_x86_pos <= max_trans_offset) {
324 LZMS_DEBUG("Did x86 translation at position %d "
325 "(opcode 0x%02x)", i, data[i]);
326 void *p32 = &data[i + opcode_nbytes];
327 u32 n = get_unaligned_u32_le(p32);
328 put_unaligned_u32_le(n + i, p32);
/* Skip past the opcode and its 32-bit displacement field. */
332 i += opcode_nbytes + sizeof(le32) - 1;
/* Second reference to the same 16-bit target within the ID window
 * flags this position as likely x86 code. */
334 if (i - last_target_usages[target16] <= LZMS_X86_ID_WINDOW_SIZE)
337 last_target_usages[target16] = i;
/* Restore the byte clobbered by the sentinel at the top. */
342 data[tail_idx + 8] = saved_byte;
/* Reset an LZ LRU queue to its initial state.
 * NOTE(review): this listing skips at least one line here (likely the reset
 * of another queue field) — confirm against the full source. */
346 lzms_init_lz_lru_queues(struct lzms_lz_lru_queues *lz)
348 /* Recent offsets for LZ matches */
/* Initial queue contents are the offsets 1..LZMS_NUM_RECENT_OFFSETS+1. */
349 for (u32 i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++)
350 lz->recent_offsets[i] = i + 1;
/* 0 means "no pending offset" (see lzms_update_lz_lru_queue()). */
353 lz->upcoming_offset = 0;
/* Reset a delta LRU queue (recent offsets plus their powers) to its initial
 * state. */
357 lzms_init_delta_lru_queues(struct lzms_delta_lru_queues *delta)
359 /* Recent offsets and powers for LZ matches */
/* Offsets start as 1..LZMS_NUM_RECENT_OFFSETS+1; powers start at 0. */
360 for (u32 i = 0; i < LZMS_NUM_RECENT_OFFSETS + 1; i++) {
361 delta->recent_offsets[i] = i + 1;
362 delta->recent_powers[i] = 0;
/* 0 in prev/upcoming means "nothing pending" (see
 * lzms_update_delta_lru_queues()). */
364 delta->prev_offset = 0;
365 delta->prev_power = 0;
366 delta->upcoming_offset = 0;
367 delta->upcoming_power = 0;
/* Reset both the LZ and delta LRU queues in one call. */
372 lzms_init_lru_queues(struct lzms_lru_queues *lru)
374 lzms_init_lz_lru_queues(&lru->lz);
375 lzms_init_delta_lru_queues(&lru->delta);
/* Advance the LZ LRU queue by one step: if an offset was used on the
 * previous step, push it onto the front of the recent-offsets queue, then
 * promote the upcoming offset to "previous".
 * NOTE(review): the tail of this function (e.g. clearing upcoming_offset) is
 * missing from this listing. */
379 lzms_update_lz_lru_queue(struct lzms_lz_lru_queues *lz)
381 if (lz->prev_offset != 0) {
/* Shift the queue down to make room at index 0. */
382 for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--)
383 lz->recent_offsets[i + 1] = lz->recent_offsets[i];
384 lz->recent_offsets[0] = lz->prev_offset;
/* The offset chosen this step becomes "previous" for the next step. */
386 lz->prev_offset = lz->upcoming_offset;
/* Advance the delta LRU queue by one step, keeping the offset and power
 * queues in lockstep: push the previous (offset, power) pair onto the front
 * if one was used, then promote the upcoming pair to "previous". */
390 lzms_update_delta_lru_queues(struct lzms_delta_lru_queues *delta)
392 if (delta->prev_offset != 0) {
/* Shift both parallel queues down to make room at index 0. */
393 for (int i = LZMS_NUM_RECENT_OFFSETS - 1; i >= 0; i--) {
394 delta->recent_offsets[i + 1] = delta->recent_offsets[i];
395 delta->recent_powers[i + 1] = delta->recent_powers[i];
397 delta->recent_offsets[0] = delta->prev_offset;
398 delta->recent_powers[0] = delta->prev_power;
/* Promote the upcoming pair for the next step. */
401 delta->prev_offset = delta->upcoming_offset;
402 delta->prev_power = delta->upcoming_power;
/* Advance both the LZ and delta LRU queues by one step. */
406 lzms_update_lru_queues(struct lzms_lru_queues *lru)
408 lzms_update_lz_lru_queue(&lru->lz);
409 lzms_update_delta_lru_queues(&lru->delta);