/*
 * lzx_common.c - Common code for LZX compression and decompression.
 */

/*
 * Copyright (C) 2012-2016 Eric Biggers
*
* This file is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
#include <string.h>
+#ifdef __SSE2__
+# include <emmintrin.h>
+#endif
+
+#ifdef __AVX2__
+# include <immintrin.h>
+#endif
+
#include "wimlib/bitops.h"
#include "wimlib/endianness.h"
#include "wimlib/lzx_common.h"
#include "wimlib/unaligned.h"
#include "wimlib/util.h"
-#ifdef __SSE2__
-# include <emmintrin.h>
-#endif
-
/* Mapping: offset slot => first match offset that uses that offset slot.
- */
-const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = {
- 0 , 1 , 2 , 3 , 4 , /* 0 --- 4 */
- 6 , 8 , 12 , 16 , 24 , /* 5 --- 9 */
- 32 , 48 , 64 , 96 , 128 , /* 10 --- 14 */
- 192 , 256 , 384 , 512 , 768 , /* 15 --- 19 */
- 1024 , 1536 , 2048 , 3072 , 4096 , /* 20 --- 24 */
- 6144 , 8192 , 12288 , 16384 , 24576 , /* 25 --- 29 */
- 32768 , 49152 , 65536 , 98304 , 131072 , /* 30 --- 34 */
- 196608 , 262144 , 393216 , 524288 , 655360 , /* 35 --- 39 */
- 786432 , 917504 , 1048576, 1179648, 1310720, /* 40 --- 44 */
- 1441792, 1572864, 1703936, 1835008, 1966080, /* 45 --- 49 */
- 2097152 /* 50 */
+ * The offset slots for repeat offsets map to "fake" offsets < 1. */
+const s32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = {
+ -2 , -1 , 0 , 1 , 2 , /* 0 --- 4 */
+ 4 , 6 , 10 , 14 , 22 , /* 5 --- 9 */
+ 30 , 46 , 62 , 94 , 126 , /* 10 --- 14 */
+ 190 , 254 , 382 , 510 , 766 , /* 15 --- 19 */
+ 1022 , 1534 , 2046 , 3070 , 4094 , /* 20 --- 24 */
+ 6142 , 8190 , 12286 , 16382 , 24574 , /* 25 --- 29 */
+ 32766 , 49150 , 65534 , 98302 , 131070 , /* 30 --- 34 */
+ 196606 , 262142 , 393214 , 524286 , 655358 , /* 35 --- 39 */
+ 786430 , 917502 , 1048574, 1179646, 1310718, /* 40 --- 44 */
+ 1441790, 1572862, 1703934, 1835006, 1966078, /* 45 --- 49 */
+ 2097150 /* extra */
};
/* Mapping: offset slot => how many extra bits must be read and added to the
16, 17, 17, 17, 17,
17, 17, 17, 17, 17,
17, 17, 17, 17, 17,
- 17
};
-/* Round the specified compression block size (not LZX block size) up to the
- * next valid LZX window size, and return its order (log2). Or, if the block
- * size is 0 or greater than the largest valid LZX window size, return 0. */
+/* Round the specified buffer size up to the next valid LZX window size, and
+ * return its order (log2). Or, if the buffer size is 0 or greater than the
+ * largest valid LZX window size, return 0. */
unsigned
-lzx_get_window_order(size_t max_block_size)
+lzx_get_window_order(size_t max_bufsize)
{
- unsigned order;
-
- if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE)
+ if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE)
return 0;
- order = fls32(max_block_size);
-
- if (((u32)1 << order) != max_block_size)
- order++;
-
- return max(order, LZX_MIN_WINDOW_ORDER);
+ return max(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER);
}
/* Given a valid LZX window order, return the number of symbols that will exist
unsigned
lzx_get_num_main_syms(unsigned window_order)
{
+ /* Note: one would expect that the maximum match offset would be
+ * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two
+ * bytes were to match the last two bytes. However, the format
+ * disallows this case. This reduces the number of needed offset slots
+ * by 1. */
u32 window_size = (u32)1 << window_order;
+ u32 max_offset = window_size - LZX_MIN_MATCH_LEN - 1;
+ unsigned num_offset_slots = 30;
+ while (max_offset >= lzx_offset_slot_base[num_offset_slots])
+ num_offset_slots++;
- /* NOTE: the calculation *should* be as follows:
- *
- * u32 max_offset = window_size - LZX_MIN_MATCH_LEN;
- * u32 max_adjusted_offset = max_offset + LZX_OFFSET_OFFSET;
- * u32 num_offset_slots = 1 + lzx_get_offset_slot_raw(max_adjusted_offset);
- *
- * However since LZX_MIN_MATCH_LEN == LZX_OFFSET_OFFSET, we would get
- * max_adjusted_offset == window_size, which would bump the number of
- * offset slots up by 1 since every valid LZX window size is equal to a
- * offset slot base value. The format doesn't do this, and instead
- * disallows matches with minimum length and maximum offset. This sets
- * max_adjusted_offset = window_size - 1, so instead we must calculate:
- *
- * num_offset_slots = 1 + lzx_get_offset_slot_raw(window_size - 1);
- *
- * ... which is the same as
- *
- * num_offset_slots = lzx_get_offset_slot_raw(window_size);
- *
- * ... since every valid window size is equal to an offset base value.
- */
- unsigned num_offset_slots = lzx_get_offset_slot_raw(window_size);
-
- /* Now calculate the number of main symbols as LZX_NUM_CHARS literal
- * symbols, plus 8 symbols per offset slot (since there are 8 possible
- * length headers, and we need all (offset slot, length header)
- * combinations). */
- return LZX_NUM_CHARS + (num_offset_slots << 3);
+ return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS);
}
static void
{
s32 abs_offset, rel_offset;
- rel_offset = get_unaligned_u32_le(target);
+ rel_offset = get_unaligned_le32(target);
if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) {
if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) {
/* "good translation" */
/* "compensating translation" */
abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE;
}
- put_unaligned_u32_le(abs_offset, target);
+ put_unaligned_le32(abs_offset, target);
}
}
{
s32 abs_offset, rel_offset;
- abs_offset = get_unaligned_u32_le(target);
+ abs_offset = get_unaligned_le32(target);
if (abs_offset >= 0) {
if (abs_offset < LZX_WIM_MAGIC_FILESIZE) {
/* "good translation" */
rel_offset = abs_offset - input_pos;
- put_unaligned_u32_le(rel_offset, target);
+ put_unaligned_le32(rel_offset, target);
}
} else {
if (abs_offset >= -input_pos) {
/* "compensating translation" */
rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE;
- put_unaligned_u32_le(rel_offset, target);
+ put_unaligned_le32(rel_offset, target);
}
}
}
for (;;) {
u32 e8_mask;
u8 *orig_p = p;
- #ifdef __SSE2__
+ #ifdef __AVX2__
+ const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
+ for (;;) {
+ __m256i bytes = *(const __m256i *)p;
+ __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
+ e8_mask = _mm256_movemask_epi8(cmpresult);
+ if (e8_mask)
+ break;
+ p += 32;
+ }
+ #else
const __m128i e8_bytes = _mm_set1_epi8(0xE8);
for (;;) {
/* Read the next 32 bytes of data and test them
}
p += 32;
}
- #else
- /* AVX-2 */
- const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
- for (;;) {
- __m256i bytes = *(const __m256i *)p;
- __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
- e8_mask = _mm256_movemask_epi8(cmpresult);
- if (e8_mask)
- break;
- p += 32;
- }
#endif
/* Did we pass over data with no E8 bytes? */
* 'valid_mask' ensures we never process an E8 byte that
* was itself part of a translation target. */
while ((e8_mask &= valid_mask)) {
- unsigned bit = ffs32(e8_mask);
+ unsigned bit = bsf32(e8_mask);
(*process_target)(p + bit + 1, p + bit - data);
valid_mask &= ~((u64)0x1F << bit);
}
}
void
-lzx_do_e8_preprocessing(u8 *data, u32 size)
+lzx_preprocess(u8 *data, u32 size)
{
lzx_e8_filter(data, size, do_translate_target);
}
void
-lzx_undo_e8_preprocessing(u8 *data, u32 size)
+lzx_postprocess(u8 *data, u32 size)
{
lzx_e8_filter(data, size, undo_translate_target);
}