*/
/*
- * Copyright (C) 2012, 2013 Eric Biggers
+ * Copyright (C) 2012, 2013, 2014 Eric Biggers
*
* This file is part of wimlib, a library for working with WIM files.
*
*/
/*
- * LZX is a LZ77 and Huffman-code based compression format that has many
+ * LZX is an LZ77 and Huffman-code based compression format that has many
* similarities to the DEFLATE format used in zlib. The compression ratio is as
* good or better than DEFLATE.
*
* last decompress to a fixed number of bytes, by default 32768. This is quite
* similar to the cabinet (.cab) file format, but they are not the same.
* According to the cabinet format documentation, the LZX block size is
- * independent from the CFDATA blocks, and a LZX block may span several CFDATA
+ * independent of the CFDATA blocks, and an LZX block may span several CFDATA
* blocks. However, in WIMs, LZX blocks do not appear to ever span multiple WIM
* chunks. Note that this means any WIM chunk may be decompressed or compressed
* independently from any other chunk, which allows random access.
*
- * A LZX compressed WIM chunk contains one or more LZX blocks of the aligned,
+ * An LZX compressed WIM chunk contains one or more LZX blocks of the aligned,
* verbatim, or uncompressed block types. For aligned and verbatim blocks, the
* size of the block in uncompressed bytes is specified by a bit following the 3
* bits that specify the block type, possibly followed by an additional 16 bits.
#include <string.h>
+#ifdef __SSE2__
+# include <emmintrin.h>
+#endif
+
/* Huffman decoding tables and maps from symbols to code lengths. */
struct lzx_tables {
/*
* Reads a Huffman-encoded symbol using the pre-tree.
*/
-static inline int
+static inline u16
read_huffsym_using_pretree(struct input_bitstream *istream,
const u16 pretree_decode_table[],
- const u8 pretree_lens[], unsigned *n)
+ const u8 pretree_lens[])
{
return read_huffsym(istream, pretree_decode_table, pretree_lens,
- LZX_PRECODE_NUM_SYMBOLS, LZX_PRECODE_TABLEBITS, n,
+ LZX_PRECODE_NUM_SYMBOLS, LZX_PRECODE_TABLEBITS,
LZX_MAX_PRE_CODEWORD_LEN);
}
/* Reads a Huffman-encoded symbol using the main tree. */
-static inline int
+static inline u16
read_huffsym_using_maintree(struct input_bitstream *istream,
const struct lzx_tables *tables,
- unsigned *n,
unsigned num_main_syms)
{
return read_huffsym(istream, tables->maintree_decode_table,
tables->maintree_lens, num_main_syms,
- LZX_MAINCODE_TABLEBITS, n, LZX_MAX_MAIN_CODEWORD_LEN);
+ LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
}
/* Reads a Huffman-encoded symbol using the length tree. */
-static inline int
+static inline u16
read_huffsym_using_lentree(struct input_bitstream *istream,
- const struct lzx_tables *tables,
- unsigned *n)
+ const struct lzx_tables *tables)
{
return read_huffsym(istream, tables->lentree_decode_table,
tables->lentree_lens, LZX_LENCODE_NUM_SYMBOLS,
- LZX_LENCODE_TABLEBITS, n, LZX_MAX_LEN_CODEWORD_LEN);
+ LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
}
/* Reads a Huffman-encoded symbol using the aligned offset tree. */
-static inline int
+static inline u16
read_huffsym_using_alignedtree(struct input_bitstream *istream,
- const struct lzx_tables *tables,
- unsigned *n)
+ const struct lzx_tables *tables)
{
return read_huffsym(istream, tables->alignedtree_decode_table,
tables->alignedtree_lens,
LZX_ALIGNEDCODE_NUM_SYMBOLS,
- LZX_ALIGNEDCODE_TABLEBITS, n,
+ LZX_ALIGNEDCODE_TABLEBITS,
LZX_MAX_ALIGNED_CODEWORD_LEN);
}
_aligned_attribute(DECODE_TABLE_ALIGNMENT);
u8 pretree_lens[LZX_PRECODE_NUM_SYMBOLS];
unsigned i;
- u32 len;
int ret;
/* Read the code lengths of the pretree codes. There are 20 lengths of
* 4 bits each. */
for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) {
- ret = bitstream_read_bits(istream, LZX_PRECODE_ELEMENT_SIZE,
- &len);
- if (ret)
- return ret;
- pretree_lens[i] = len;
+ pretree_lens[i] = bitstream_read_bits(istream,
+ LZX_PRECODE_ELEMENT_SIZE);
}
/* Make the decoding table for the pretree. */
u32 num_same;
signed char value;
- ret = read_huffsym_using_pretree(istream, pretree_decode_table,
- pretree_lens, &tree_code);
- if (ret)
- return ret;
+ tree_code = read_huffsym_using_pretree(istream,
+ pretree_decode_table,
+ pretree_lens);
switch (tree_code) {
case 17: /* Run of 0's */
- ret = bitstream_read_bits(istream, 4, &num_zeroes);
- if (ret)
- return ret;
+ num_zeroes = bitstream_read_bits(istream, 4);
num_zeroes += 4;
while (num_zeroes--) {
*lens = 0;
}
break;
case 18: /* Longer run of 0's */
- ret = bitstream_read_bits(istream, 5, &num_zeroes);
- if (ret)
- return ret;
+ num_zeroes = bitstream_read_bits(istream, 5);
num_zeroes += 20;
while (num_zeroes--) {
*lens = 0;
}
break;
case 19: /* Run of identical lengths */
- ret = bitstream_read_bits(istream, 1, &num_same);
- if (ret)
- return ret;
+ num_same = bitstream_read_bits(istream, 1);
num_same += 4;
- ret = read_huffsym_using_pretree(istream,
- pretree_decode_table,
- pretree_lens,
- &code);
- if (ret)
- return ret;
+ code = read_huffsym_using_pretree(istream,
+ pretree_decode_table,
+ pretree_lens);
value = (signed char)*lens - (signed char)code;
if (value < 0)
value += 17;
* in bytes, will be returned.
* @block_type_ret: A pointer to an int into which the type of the block
* (LZX_BLOCKTYPE_*) will be returned.
- * @tables: A pointer to a lzx_tables structure in which the
+ * @tables: A pointer to an lzx_tables structure in which the
* main tree, the length tree, and possibly the
* aligned offset tree will be constructed.
* @queue: A pointer to the least-recently-used queue into which
unsigned block_type;
unsigned block_size;
- ret = bitstream_ensure_bits(istream, 4);
- if (ret)
- return ret;
+ bitstream_ensure_bits(istream, 4);
/* The first three bits tell us what kind of block it is, and are one
* of the LZX_BLOCKTYPE_* values. */
- block_type = bitstream_read_bits_nocheck(istream, 3);
+ block_type = bitstream_pop_bits(istream, 3);
/* Read the block size. This mirrors the behavior
* lzx_write_compressed_block() in lzx-compress.c; see that for more
* details. */
- if (bitstream_read_bits_nocheck(istream, 1)) {
+ if (bitstream_pop_bits(istream, 1)) {
block_size = LZX_DEFAULT_BLOCK_SIZE;
} else {
u32 tmp;
block_size = 0;
- ret = bitstream_read_bits(istream, 8, &tmp);
- if (ret)
- return ret;
+ tmp = bitstream_read_bits(istream, 8);
block_size |= tmp;
-
- ret = bitstream_read_bits(istream, 8, &tmp);
- if (ret)
- return ret;
+ tmp = bitstream_read_bits(istream, 8);
block_size <<= 8;
block_size |= tmp;
if (max_window_size >= 65536) {
- ret = bitstream_read_bits(istream, 8, &tmp);
- if (ret)
- return ret;
+ tmp = bitstream_read_bits(istream, 8);
block_size <<= 8;
block_size |= tmp;
}
* then build it. */
for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
- u32 len;
-
- ret = bitstream_read_bits(istream,
- LZX_ALIGNEDCODE_ELEMENT_SIZE,
- &len);
- if (ret)
- return ret;
- tables->alignedtree_lens[i] = len;
+ tables->alignedtree_lens[i] =
+ bitstream_read_bits(istream,
+ LZX_ALIGNEDCODE_ELEMENT_SIZE);
}
LZX_DEBUG("Building the aligned tree.");
unsigned position_slot;
unsigned match_len;
unsigned match_offset;
- unsigned additional_len;
unsigned num_extra_bits;
u32 verbatim_bits;
u32 aligned_bits;
unsigned i;
- int ret;
u8 *match_dest;
u8 *match_src;
* the length tree, offset by 9 (LZX_MIN_MATCH_LEN +
* LZX_NUM_PRIMARY_LENS) */
match_len = LZX_MIN_MATCH_LEN + length_header;
- if (length_header == LZX_NUM_PRIMARY_LENS) {
- ret = read_huffsym_using_lentree(istream, tables,
- &additional_len);
- if (ret)
- return ret;
- match_len += additional_len;
- }
-
+ if (length_header == LZX_NUM_PRIMARY_LENS)
+ match_len += read_huffsym_using_lentree(istream, tables);
/* If the position_slot is 0, 1, or 2, the match offset is retrieved
* from the LRU queue. Otherwise, the match offset is not in the LRU
* equal to 3. (Note that in the case with
* num_extra_bits == 3, the assignment to verbatim_bits
* will just set it to 0. ) */
- ret = bitstream_read_bits(istream, num_extra_bits - 3,
- &verbatim_bits);
- if (ret)
- return ret;
-
+ verbatim_bits = bitstream_read_bits(istream,
+ num_extra_bits - 3);
verbatim_bits <<= 3;
-
- ret = read_huffsym_using_alignedtree(istream, tables,
- &aligned_bits);
- if (ret)
- return ret;
+ aligned_bits = read_huffsym_using_alignedtree(istream,
+ tables);
} else {
/* For non-aligned blocks, or for aligned blocks with
* less than 3 extra bits, the extra bits are added
* directly to the match offset, and the correction for
* the alignment is taken to be 0. */
- ret = bitstream_read_bits(istream, num_extra_bits,
- &verbatim_bits);
- if (ret)
- return ret;
-
+ verbatim_bits = bitstream_read_bits(istream, num_extra_bits);
aligned_bits = 0;
}
* currently in use, then copy the source of the match to the current
* position. */
- if (match_len > bytes_remaining) {
+ if (unlikely(match_len > bytes_remaining)) {
LZX_DEBUG("Match of length %u bytes overflows "
"uncompressed block size", match_len);
return -1;
}
- if (match_offset > window_pos) {
+ if (unlikely(match_offset > window_pos)) {
LZX_DEBUG("Match of length %u bytes references "
"data before window (match_offset = %u, "
"window_pos = %u)",
}
static void
-undo_call_insn_translation(u32 *call_insn_target, s32 input_pos,
- s32 file_size)
+undo_call_insn_translation(u32 *call_insn_target, s32 input_pos)
{
s32 abs_offset;
s32 rel_offset;
abs_offset = le32_to_cpu(*call_insn_target);
- if (abs_offset >= -input_pos && abs_offset < file_size) {
- if (abs_offset >= 0) {
+ if (abs_offset >= 0) {
+ if (abs_offset < LZX_WIM_MAGIC_FILESIZE) {
/* "good translation" */
rel_offset = abs_offset - input_pos;
- } else {
+
+ *call_insn_target = cpu_to_le32(rel_offset);
+ }
+ } else {
+ if (abs_offset >= -input_pos) {
/* "compensating translation" */
- rel_offset = abs_offset + file_size;
+ rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE;
+
+ *call_insn_target = cpu_to_le32(rel_offset);
}
- *call_insn_target = cpu_to_le32(rel_offset);
}
}
* as it is used in calculating the translated jump targets. But in WIM files,
* this file size is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000).*/
static void
-undo_call_insn_preprocessing(u8 *uncompressed_data, s32 uncompressed_size)
+undo_call_insn_preprocessing(u8 *uncompressed_data, size_t uncompressed_size)
{
- for (s32 i = 0; i < uncompressed_size - 10; i++) {
- if (uncompressed_data[i] == 0xe8) {
- undo_call_insn_translation((u32*)&uncompressed_data[i + 1],
- i,
- LZX_WIM_MAGIC_FILESIZE);
- i += 4;
- }
+#ifdef __SSE2__
+
+ /* SSE2 vectorized implementation for x86_64. This speeds up LZX
+ * decompression by about 5-8% overall. (Usually --- the performance
+ * actually regresses slightly in the degenerate case that the data
+ * consists entirely of 0xe8 bytes.) */
+ __m128i *p128 = (__m128i *)uncompressed_data;
+ u32 valid_mask = 0xFFFFFFFF;
+
+ if (uncompressed_size >= 32 &&
+ ((uintptr_t)uncompressed_data % 16 == 0))
+ {
+ __m128i * const end128 = p128 + uncompressed_size / 16 - 1;
+
+ /* Create a vector of all 0xe8 bytes */
+ const __m128i e8_bytes = _mm_set1_epi8(0xe8);
+
+ /* Iterate through the 16-byte vectors in the input. */
+ do {
+ /* Compare the current 16-byte vector with the vector of
+ * all 0xe8 bytes. This produces 0xff where the byte is
+ * 0xe8 and 0x00 where it is not. */
+ __m128i cmpresult = _mm_cmpeq_epi8(*p128, e8_bytes);
+
+ /* Map the comparison results into a single 16-bit
+ * number. It will contain a 1 bit when the
+ * corresponding byte in the current 16-byte vector is
+ * an e8 byte. Note: the low-order bit corresponds to
+ * the first (lowest address) byte. */
+ u32 e8_mask = _mm_movemask_epi8(cmpresult);
+
+ if (!e8_mask) {
+ /* If e8_mask is 0, then none of these 16 bytes
+ * have value 0xe8. No e8 translation is
+ * needed, and there is no restriction that
+ * carries over to the next 16 bytes. */
+ valid_mask = 0xFFFFFFFF;
+ } else {
+ /* At least one byte has value 0xe8.
+ *
+ * The AND with valid_mask accounts for the fact
+ * that we can't start an e8 translation that
+ * overlaps the previous one. */
+ while ((e8_mask &= valid_mask)) {
+
+ /* Count the number of trailing zeroes
+ * in e8_mask. This will produce the
+ * index of the byte, within the 16, at
+ * which the next e8 translation should
+ * be done. */
+ u32 bit = __builtin_ctz(e8_mask);
+
+ /* Do the e8 translation. */
+ u8 *p8 = (u8 *)p128 + bit;
+				undo_call_insn_translation((u32 *)(p8 + 1),
+							   p8 - uncompressed_data);
+
+ /* Don't start an e8 translation in the
+ * next 4 bytes. */
+ valid_mask &= ~((u32)0x1F << bit);
+ }
+ /* Moving on to the next vector. Shift and set
+ * valid_mask accordingly. */
+ valid_mask >>= 16;
+ valid_mask |= 0xFFFF0000;
+ }
+ } while (++p128 < end128);
+ }
+
+ u8 *p8 = (u8 *)p128;
+ while (!(valid_mask & 1)) {
+ p8++;
+ valid_mask >>= 1;
+ }
+#else /* __SSE2__ */
+ u8 *p8 = uncompressed_data;
+#endif /* !__SSE2__ */
+
+ if (uncompressed_size > 10) {
+ /* Finish any bytes that weren't processed by the vectorized
+ * implementation. */
+ u8 *p8_end = uncompressed_data + uncompressed_size - 10;
+ do {
+ if (*p8 == 0xe8) {
+				undo_call_insn_translation((u32 *)(p8 + 1),
+							   p8 - uncompressed_data);
+ p8 += 5;
+ } else {
+ p8++;
+ }
+ } while (p8 < p8_end);
}
}
/*
- * Decompresses a LZX-compressed block of data from which the header has already
+ * Decompresses an LZX-compressed block of data from which the header has already
* been read.
*
* @block_type: The type of the block (LZX_BLOCKTYPE_VERBATIM or
{
unsigned main_element;
unsigned end;
- int ret;
int match_len;
end = window_pos + block_size;
while (window_pos < end) {
- ret = read_huffsym_using_maintree(istream, tables,
- &main_element,
- num_main_syms);
- if (ret)
- return ret;
-
+ main_element = read_huffsym_using_maintree(istream, tables,
+ num_main_syms);
if (main_element < LZX_NUM_CHARS) {
/* literal: 0 to LZX_NUM_CHARS - 1 */
window[window_pos++] = main_element;
tables,
queue,
istream);
- if (match_len < 0)
+ if (unlikely(match_len < 0))
return match_len;
window_pos += match_len;
}