X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Flzx_common.c;h=7925f1f197b6803ce31bca7fed7564302f6220e1;hp=76c73baea447aaae7d4bcfeb87c12cfe79ca7bd5;hb=144bba695a7a57cdd179dcfd1a916264c8bd4cde;hpb=40a690416a3951361ec77d33a723dd4497fb7585 diff --git a/src/lzx_common.c b/src/lzx_common.c index 76c73bae..7925f1f1 100644 --- a/src/lzx_common.c +++ b/src/lzx_common.c @@ -1,9 +1,9 @@ /* - * lzx-common.c - Common code for LZX compression and decompression. + * lzx_common.c - Common code for LZX compression and decompression. */ /* - * Copyright (C) 2012, 2013, 2014 Eric Biggers + * Copyright (C) 2012-2016 Eric Biggers * * This file is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -25,19 +25,23 @@ #include +#ifdef __SSE2__ +# include +#endif + +#ifdef __AVX2__ +# include +#endif + #include "wimlib/bitops.h" #include "wimlib/endianness.h" #include "wimlib/lzx_common.h" #include "wimlib/unaligned.h" #include "wimlib/util.h" -#ifdef __SSE2__ -# include -#endif - /* Mapping: offset slot => first match offset that uses that offset slot. */ -const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = { +const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = { 0 , 1 , 2 , 3 , 4 , /* 0 --- 4 */ 6 , 8 , 12 , 16 , 24 , /* 5 --- 9 */ 32 , 48 , 64 , 96 , 128 , /* 10 --- 14 */ @@ -48,7 +52,7 @@ const u32 lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS] = { 196608 , 262144 , 393216 , 524288 , 655360 , /* 35 --- 39 */ 786432 , 917504 , 1048576, 1179648, 1310720, /* 40 --- 44 */ 1441792, 1572864, 1703936, 1835008, 1966080, /* 45 --- 49 */ - 2097152 /* 50 */ + 2097152 /* extra */ }; /* Mapping: offset slot => how many extra bits must be read and added to the @@ -64,26 +68,18 @@ const u8 lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17 }; -/* Round the specified compression block size (not LZX block size) up to the - * next valid LZX window size, and return its order (log2). Or, if the block - * size is 0 or greater than the largest valid LZX window size, return 0. */ +/* Round the specified buffer size up to the next valid LZX window size, and + * return its order (log2). Or, if the buffer size is 0 or greater than the + * largest valid LZX window size, return 0. */ unsigned -lzx_get_window_order(size_t max_block_size) +lzx_get_window_order(size_t max_bufsize) { - unsigned order; - - if (max_block_size == 0 || max_block_size > LZX_MAX_WINDOW_SIZE) + if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE) return 0; - order = fls32(max_block_size); - - if (((u32)1 << order) != max_block_size) - order++; - - return max(order, LZX_MIN_WINDOW_ORDER); + return max(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER); } /* Given a valid LZX window order, return the number of symbols that will exist @@ -91,36 +87,19 @@ lzx_get_window_order(size_t max_block_size) unsigned lzx_get_num_main_syms(unsigned window_order) { + /* Note: one would expect that the maximum match offset would be + * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two + * bytes were to match the last two bytes. However, the format + * disallows this case. This reduces the number of needed offset slots + * by 1. */ u32 window_size = (u32)1 << window_order; + u32 max_adjusted_offset = (window_size - LZX_MIN_MATCH_LEN - 1) + + LZX_OFFSET_ADJUSTMENT; + unsigned num_offset_slots = 30; + while (max_adjusted_offset >= lzx_offset_slot_base[num_offset_slots]) + num_offset_slots++; - /* NOTE: the calculation *should* be as follows: - * - * u32 max_offset = window_size - LZX_MIN_MATCH_LEN; - * u32 max_adjusted_offset = max_offset + LZX_OFFSET_OFFSET; - * u32 num_offset_slots = 1 + lzx_get_offset_slot_raw(max_adjusted_offset); - * - * However since LZX_MIN_MATCH_LEN == LZX_OFFSET_OFFSET, we would get - * max_adjusted_offset == window_size, which would bump the number of - * offset slots up by 1 since every valid LZX window size is equal to a - * offset slot base value. The format doesn't do this, and instead - * disallows matches with minimum length and maximum offset. This sets - * max_adjusted_offset = window_size - 1, so instead we must calculate: - * - * num_offset_slots = 1 + lzx_get_offset_slot_raw(window_size - 1); - * - * ... which is the same as - * - * num_offset_slots = lzx_get_offset_slot_raw(window_size); - * - * ... since every valid window size is equal to an offset base value. - */ - unsigned num_offset_slots = lzx_get_offset_slot_raw(window_size); - - /* Now calculate the number of main symbols as LZX_NUM_CHARS literal - * symbols, plus 8 symbols per offset slot (since there are 8 possible - * length headers, and we need all (offset slot, length header) - * combinations). */ - return LZX_NUM_CHARS + (num_offset_slots << 3); + return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS); } static void @@ -128,7 +107,7 @@ do_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - rel_offset = get_unaligned_u32_le(target); + rel_offset = get_unaligned_le32(target); if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) { if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) { /* "good translation" */ @@ -137,7 +116,7 @@ do_translate_target(void *target, s32 input_pos) /* "compensating translation" */ abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE; } - put_unaligned_u32_le(abs_offset, target); + put_unaligned_le32(abs_offset, target); } } @@ -146,18 +125,18 @@ undo_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - abs_offset = get_unaligned_u32_le(target); + abs_offset = get_unaligned_le32(target); if (abs_offset >= 0) { if (abs_offset < LZX_WIM_MAGIC_FILESIZE) { /* "good translation" */ rel_offset = abs_offset - input_pos; - put_unaligned_u32_le(rel_offset, target); + put_unaligned_le32(rel_offset, target); } } else { if (abs_offset >= -input_pos) { /* "compensating translation" */ rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE; - put_unaligned_u32_le(rel_offset, target); + put_unaligned_le32(rel_offset, target); } } } @@ -266,7 +245,17 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) for (;;) { u32 e8_mask; u8 *orig_p = p; - #ifdef __SSE2__ + #ifdef __AVX2__ + const __m256i e8_bytes = _mm256_set1_epi8(0xE8); + for (;;) { + __m256i bytes = *(const __m256i *)p; + __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); + e8_mask = _mm256_movemask_epi8(cmpresult); + if (e8_mask) + break; + p += 32; + } + #else const __m128i e8_bytes = _mm_set1_epi8(0xE8); for (;;) { /* Read the next 32 bytes of data and test them @@ -286,17 +275,6 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) } p += 32; } - #else - /* AVX-2 */ - const __m256i e8_bytes = _mm256_set1_epi8(0xE8); - for (;;) { - __m256i bytes = *(const __m256i *)p; - __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); - e8_mask = _mm256_movemask_epi8(cmpresult); - if (e8_mask) - break; - p += 32; - } #endif /* Did we pass over data with no E8 bytes? */ @@ -338,13 +316,13 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) } void -lzx_do_e8_preprocessing(u8 *data, u32 size) +lzx_preprocess(u8 *data, u32 size) { lzx_e8_filter(data, size, do_translate_target); } void -lzx_undo_e8_preprocessing(u8 *data, u32 size) +lzx_postprocess(u8 *data, u32 size) { lzx_e8_filter(data, size, undo_translate_target); }