X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flzx_common.c;h=5a685720c011f223d021d886361c50e1de4fe690;hb=refs%2Fheads%2Flzx_decompress_word_bitstream;hp=76c73baea447aaae7d4bcfeb87c12cfe79ca7bd5;hpb=40a690416a3951361ec77d33a723dd4497fb7585;p=wimlib diff --git a/src/lzx_common.c b/src/lzx_common.c index 76c73bae..5a685720 100644 --- a/src/lzx_common.c +++ b/src/lzx_common.c @@ -1,9 +1,9 @@ /* - * lzx-common.c - Common code for LZX compression and decompression. + * lzx_common.c - Common code for LZX compression and decompression. */ /* - * Copyright (C) 2012, 2013, 2014 Eric Biggers + * Copyright (C) 2012-2016 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -25,30 +25,33 @@ #include +#ifdef __SSE2__ +# include +#endif + +#ifdef __AVX2__ +# include +#endif + #include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/lzx_common.h" #include "wimlib/unaligned.h" #include "wimlib/util.h" -#ifdef __SSE2__ -# include -#endif - -/* Mapping: offset slot => first match offset that uses that offset slot. - */ -const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = { - 0 , 1 , 2 , 3 , 4 , /* 0 --- 4 */ - 6 , 8 , 12 , 16 , 24 , /* 5 --- 9 */ - 32 , 48 , 64 , 96 , 128 , /* 10 --- 14 */ - 192 , 256 , 384 , 512 , 768 , /* 15 --- 19 */ - 1024 , 1536 , 2048 , 3072 , 4096 , /* 20 --- 24 */ - 6144 , 8192 , 12288 , 16384 , 24576 , /* 25 --- 29 */ - 32768 , 49152 , 65536 , 98304 , 131072 , /* 30 --- 34 */ - 196608 , 262144 , 393216 , 524288 , 655360 , /* 35 --- 39 */ - 786432 , 917504 , 1048576, 1179648, 1310720, /* 40 --- 44 */ - 1441792, 1572864, 1703936, 1835008, 1966080, /* 45 --- 49 */ - 2097152 /* 50 */ +/* Mapping: offset slot => first match offset that uses that offset slot. */ +const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = { + -2 , -1 , 0 , 1 , 2 , /* 0 --- 2 */ + 4 , 6 , 10 , 14 , 22 , /* 5 --- 9 */ + 30 , 46 , 62 , 94 , 126 , /* 10 --- 14 */ + 190 , 254 , 382 , 510 , 766 , /* 15 --- 19 */ + 1022 , 1534 , 2046 , 3070 , 4094 , /* 20 --- 24 */ + 6142 , 8190 , 12286 , 16382 , 24574 , /* 25 --- 29 */ + 32766 , 49150 , 65534 , 98302 , 131070 , /* 30 --- 34 */ + 196606 , 262142 , 393214 , 524286 , 655358 , /* 35 --- 39 */ + 786430 , 917502 , 1048574, 1179646, 1310718, /* 40 --- 44 */ + 1441790, 1572862, 1703934, 1835006, 1966078, /* 45 --- 49 */ + 2097150 /* extra */ }; /* Mapping: offset slot => how many extra bits must be read and added to the @@ -64,26 +67,18 @@ const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17 }; -/* Round the specified compression block size (not LZX block size) up to the - * next valid LZX window size, and return its order (log2). Or, if the block - * size is 0 or greater than the largest valid LZX window size, return 0. */ +/* Round the specified buffer size up to the next valid LZX window size, and + * return its order (log2). Or, if the buffer size is 0 or greater than the + * largest valid LZX window size, return 0. */ unsigned -lzx_get_window_order(size_t max_block_size) +lzx_get_window_order(size_t max_bufsize) { - unsigned order; - - if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE) + if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE) return 0; - order = fls32(max_block_size); - - if (((u32)1 << order) != max_block_size) - order++; - - return max(order, LZX_MIN_WINDOW_ORDER); + return max(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER); } /* Given a valid LZX window order, return the number of symbols that will exist @@ -91,36 +86,18 @@ lzx_get_window_order(size_t max_block_size) unsigned lzx_get_num_main_syms(unsigned window_order) { + /* Note: one would expect that the maximum match offset would be + * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two + * bytes were to match the last two bytes. However, the format + * disallows this case. This reduces the number of needed offset slots + * by 1. */ u32 window_size = (u32)1 << window_order; + u32 max_offset = window_size - LZX_MIN_MATCH_LEN - 1; + unsigned num_offset_slots = 30; + while (max_offset >= lzx_offset_slot_base[num_offset_slots]) + num_offset_slots++; - /* NOTE: the calculation *should* be as follows: - * - * u32 max_offset = window_size - LZX_MIN_MATCH_LEN; - * u32 max_adjusted_offset = max_offset + LZX_OFFSET_OFFSET; - * u32 num_offset_slots = 1 + lzx_get_offset_slot_raw(max_adjusted_offset); - * - * However since LZX_MIN_MATCH_LEN == LZX_OFFSET_OFFSET, we would get - * max_adjusted_offset == window_size, which would bump the number of - * offset slots up by 1 since every valid LZX window size is equal to a - * offset slot base value. The format doesn't do this, and instead - * disallows matches with minimum length and maximum offset. This sets - * max_adjusted_offset = window_size - 1, so instead we must calculate: - * - * num_offset_slots = 1 + lzx_get_offset_slot_raw(window_size - 1); - * - * ... which is the same as - * - * num_offset_slots = lzx_get_offset_slot_raw(window_size); - * - * ... since every valid window size is equal to an offset base value. - */ - unsigned num_offset_slots = lzx_get_offset_slot_raw(window_size); - - /* Now calculate the number of main symbols as LZX_NUM_CHARS literal - * symbols, plus 8 symbols per offset slot (since there are 8 possible - * length headers, and we need all (offset slot, length header) - * combinations). */ - return LZX_NUM_CHARS + (num_offset_slots << 3); + return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS); } static void @@ -128,7 +105,7 @@ do_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - rel_offset = get_unaligned_u32_le(target); + rel_offset = get_unaligned_le32(target); if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) { if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) { /* "good translation" */ @@ -137,7 +114,7 @@ do_translate_target(void *target, s32 input_pos) /* "compensating translation" */ abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE; } - put_unaligned_u32_le(abs_offset, target); + put_unaligned_le32(abs_offset, target); } } @@ -146,18 +123,18 @@ undo_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - abs_offset = get_unaligned_u32_le(target); + abs_offset = get_unaligned_le32(target); if (abs_offset >= 0) { if (abs_offset < LZX_WIM_MAGIC_FILESIZE) { /* "good translation" */ rel_offset = abs_offset - input_pos; - put_unaligned_u32_le(rel_offset, target); + put_unaligned_le32(rel_offset, target); } } else { if (abs_offset >= -input_pos) { /* "compensating translation" */ rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE; - put_unaligned_u32_le(rel_offset, target); + put_unaligned_le32(rel_offset, target); } } } @@ -266,7 +243,17 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) for (;;) { u32 e8_mask; u8 *orig_p = p; - #ifdef __SSE2__ + #ifdef __AVX2__ + const __m256i e8_bytes = _mm256_set1_epi8(0xE8); + for (;;) { + __m256i bytes = *(const __m256i *)p; + __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); + e8_mask = _mm256_movemask_epi8(cmpresult); + if (e8_mask) + break; + p += 32; + } + #else const __m128i e8_bytes = _mm_set1_epi8(0xE8); for (;;) { /* Read the next 32 bytes of data and test them @@ -286,17 +273,6 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) } p += 32; } - #else - /* AVX-2 */ - const __m256i e8_bytes = _mm256_set1_epi8(0xE8); - for (;;) { - __m256i bytes = *(const __m256i *)p; - __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); - e8_mask = _mm256_movemask_epi8(cmpresult); - if (e8_mask) - break; - p += 32; - } #endif /* Did we pass over data with no E8 bytes? */ @@ -338,13 +314,13 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) } void -lzx_do_e8_preprocessing(u8 *data, u32 size) +lzx_preprocess(u8 *data, u32 size) { lzx_e8_filter(data, size, do_translate_target); } void -lzx_undo_e8_preprocessing(u8 *data, u32 size) +lzx_postprocess(u8 *data, u32 size) { lzx_e8_filter(data, size, undo_translate_target); }