X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=include%2Fwimlib%2Fdecompress_common.h;h=d20085db61736fdb5b0dfddf1f87619215115525;hp=c3016475b2c07f812d6d2285443e51839545c83f;hb=4ee103c6e2a2988e1fb358bfa2dc38dcb621505a;hpb=b5eacd90b7f342710757755aa185e8cfb19f2030 diff --git a/include/wimlib/decompress_common.h b/include/wimlib/decompress_common.h index c3016475..d20085db 100644 --- a/include/wimlib/decompress_common.h +++ b/include/wimlib/decompress_common.h @@ -29,6 +29,10 @@ #include "wimlib/types.h" #include "wimlib/unaligned.h" +/******************************************************************************/ +/* Input bitstream for XPRESS and LZX */ +/*----------------------------------------------------------------------------*/ + /* Structure that encapsulates a block of in-memory data being interpreted as a * stream of bits, optionally with interwoven literal bytes. Bits are assumed * to be stored in little endian 16-bit coding units, with the bits ordered high @@ -192,75 +196,223 @@ bitstream_align(struct input_bitstream *is) is->bitbuf = 0; } -/* Needed alignment of decode_table parameter to make_huffman_decode_table(). - * - * Reason: We may fill the entries with SSE instructions without worrying - * about dealing with the unaligned case. */ +/******************************************************************************/ +/* Huffman decoding */ +/*----------------------------------------------------------------------------*/ + +/* + * Required alignment for the Huffman decode tables. We require this alignment + * so that we can fill the entries with vector or word instructions and not have + * to deal with misaligned buffers. + */ #define DECODE_TABLE_ALIGNMENT 16 -/* Maximum supported symbol count for make_huffman_decode_table(). +/* + * Each decode table entry is 16 bits divided into two fields: 'symbol' (high 12 + * bits) and 'length' (low 4 bits). The precise meaning of these fields depends + * on the type of entry: * - * Reason: In direct mapping entries, we store the symbol in 11 bits. */ -#define DECODE_TABLE_MAX_SYMBOLS 2048 - -/* Maximum supported table bits for make_huffman_decode_table(). + * Root table entries which are *not* subtable pointers: + * symbol: symbol to decode + * length: codeword length in bits * - * Reason: In internal binary tree nodes, offsets are encoded in 14 bits. - * But the real limit is 13, because we allocate entries past the end of - * the direct lookup part of the table for binary tree nodes. (Note: if - * needed this limit could be removed by encoding the offsets relative to - * &decode_table[1 << table_bits].) */ -#define DECODE_TABLE_MAX_TABLE_BITS 13 - -/* Maximum supported codeword length for make_huffman_decode_table(). + * Root table entries which are subtable pointers: + * symbol: index of start of subtable + * length: number of bits with which the subtable is indexed * - * Reason: In direct mapping entries, we encode the codeword length in 5 - * bits, and the top 2 bits can't both be set because that has special - * meaning. */ -#define DECODE_TABLE_MAX_CODEWORD_LEN 23 - -/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the - * input data is exhausted, the Huffman symbol is decoded as if the missing bits - * are all zeroes. + * Subtable entries: + * symbol: symbol to decode + * length: codeword length in bits, minus the number of bits with which the + * root table is indexed + */ +#define DECODE_TABLE_SYMBOL_SHIFT 4 +#define DECODE_TABLE_MAX_SYMBOL ((1 << (16 - DECODE_TABLE_SYMBOL_SHIFT)) - 1) +#define DECODE_TABLE_MAX_LENGTH ((1 << DECODE_TABLE_SYMBOL_SHIFT) - 1) +#define DECODE_TABLE_LENGTH_MASK DECODE_TABLE_MAX_LENGTH +#define MAKE_DECODE_TABLE_ENTRY(symbol, length) \ + (((symbol) << DECODE_TABLE_SYMBOL_SHIFT) | (length)) + +/* + * Read and return the next Huffman-encoded symbol from the given bitstream + * using the given decode table. + * + * If the input data is exhausted, then the Huffman symbol will be decoded as if + * the missing bits were all zeroes. * * XXX: This is mostly duplicated in lzms_decode_huffman_symbol() in - * lzms_decompress.c. */ + * lzms_decompress.c; keep them in sync! + */ static inline unsigned -read_huffsym(struct input_bitstream *istream, const u16 decode_table[], +read_huffsym(struct input_bitstream *is, const u16 decode_table[], unsigned table_bits, unsigned max_codeword_len) { unsigned entry; - unsigned key_bits; - - bitstream_ensure_bits(istream, max_codeword_len); - - /* Index the decode table by the next table_bits bits of the input. */ - key_bits = bitstream_peek_bits(istream, table_bits); - entry = decode_table[key_bits]; - if (likely(entry < 0xC000)) { - /* Fast case: The decode table directly provided the - * symbol and codeword length. The low 11 bits are the - * symbol, and the high 5 bits are the codeword length. */ - bitstream_remove_bits(istream, entry >> 11); - return entry & 0x7FF; - } else { - /* Slow case: The codeword for the symbol is longer than - * table_bits, so the symbol does not have an entry - * directly in the first (1 << table_bits) entries of the - * decode table. Traverse the appropriate binary tree - * bit-by-bit to decode the symbol. */ - bitstream_remove_bits(istream, table_bits); - do { - key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1); - } while ((entry = decode_table[key_bits]) >= 0xC000); - return entry; + unsigned symbol; + unsigned length; + + /* Preload the bitbuffer with 'max_codeword_len' bits so that we're + * guaranteed to be able to fully decode a codeword. */ + bitstream_ensure_bits(is, max_codeword_len); + + /* Index the root table by the next 'table_bits' bits of input. */ + entry = decode_table[bitstream_peek_bits(is, table_bits)]; + + /* Extract the "symbol" and "length" from the entry. */ + symbol = entry >> DECODE_TABLE_SYMBOL_SHIFT; + length = entry & DECODE_TABLE_LENGTH_MASK; + + /* If the root table is indexed by the full 'max_codeword_len' bits, + * then there cannot be any subtables, and this will be known at compile + * time. Otherwise, we must check whether the decoded symbol is really + * a subtable pointer. If so, we must discard the bits with which the + * root table was indexed, then index the subtable by the next 'length' + * bits of input to get the real entry. */ + if (max_codeword_len > table_bits && + entry >= (1U << (table_bits + DECODE_TABLE_SYMBOL_SHIFT))) + { + /* Subtable required */ + bitstream_remove_bits(is, table_bits); + entry = decode_table[symbol + bitstream_peek_bits(is, length)]; + symbol = entry >> DECODE_TABLE_SYMBOL_SHIFT; + length = entry & DECODE_TABLE_LENGTH_MASK; } + + /* Discard the bits (or the remaining bits, if a subtable was required) + * of the codeword. */ + bitstream_remove_bits(is, length); + + /* Return the decoded symbol. */ + return symbol; } +/* + * The DECODE_TABLE_ENOUGH() macro evaluates to the maximum number of decode + * table entries, including all subtable entries, that may be required for + * decoding a given Huffman code. This depends on three parameters: + * + * num_syms: the maximum number of symbols in the code + * table_bits: the number of bits with which the root table will be indexed + * max_codeword_len: the maximum allowed codeword length in the code + * + * Given these parameters, the utility program 'enough' from zlib, when passed + * the three arguments 'num_syms', 'table_bits', and 'max_codeword_len', will + * compute the maximum number of entries required. This has already been done + * for the combinations we need and incorporated into the macro below so that + * the mapping can be done at compilation time. If an unknown combination is + * used, then a compilation error will result. To fix this, use 'enough' to + * find the missing value and add it below. If that still doesn't fix the + * compilation error, then most likely a constraint would be violated by the + * requested parameters, so they cannot be used, at least without other changes + * to the decode table --- see DECODE_TABLE_SIZE(). + */ +#define DECODE_TABLE_ENOUGH(num_syms, table_bits, max_codeword_len) ( \ + ((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 15) ? 128 : \ + ((num_syms) == 8 && (table_bits) == 5 && (max_codeword_len) == 7) ? 36 : \ + ((num_syms) == 8 && (table_bits) == 6 && (max_codeword_len) == 7) ? 66 : \ + ((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 7) ? 128 : \ + ((num_syms) == 20 && (table_bits) == 5 && (max_codeword_len) == 15) ? 1062 : \ + ((num_syms) == 20 && (table_bits) == 6 && (max_codeword_len) == 15) ? 582 : \ + ((num_syms) == 20 && (table_bits) == 7 && (max_codeword_len) == 15) ? 390 : \ + ((num_syms) == 54 && (table_bits) == 9 && (max_codeword_len) == 15) ? 618 : \ + ((num_syms) == 54 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1098 : \ + ((num_syms) == 249 && (table_bits) == 9 && (max_codeword_len) == 16) ? 878 : \ + ((num_syms) == 249 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1326 : \ + ((num_syms) == 249 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2318 : \ + ((num_syms) == 256 && (table_bits) == 9 && (max_codeword_len) == 15) ? 822 : \ + ((num_syms) == 256 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1302 : \ + ((num_syms) == 256 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2310 : \ + ((num_syms) == 512 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1558 : \ + ((num_syms) == 512 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2566 : \ + ((num_syms) == 512 && (table_bits) == 12 && (max_codeword_len) == 15) ? 4606 : \ + ((num_syms) == 656 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1734 : \ + ((num_syms) == 656 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2726 : \ + ((num_syms) == 656 && (table_bits) == 12 && (max_codeword_len) == 16) ? 4758 : \ + ((num_syms) == 799 && (table_bits) == 9 && (max_codeword_len) == 15) ? 1366 : \ + ((num_syms) == 799 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1846 : \ + ((num_syms) == 799 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2854 : \ + -1) + +/* Wrapper around DECODE_TABLE_ENOUGH() that does additional compile-time + * validation. */ +#define DECODE_TABLE_SIZE(num_syms, table_bits, max_codeword_len) ( \ + \ + /* All values must be positive. */ \ + STATIC_ASSERT_ZERO((num_syms) > 0) + \ + STATIC_ASSERT_ZERO((table_bits) > 0) + \ + STATIC_ASSERT_ZERO((max_codeword_len) > 0) + \ + \ + /* There cannot be more symbols than possible codewords. */ \ + STATIC_ASSERT_ZERO((num_syms) <= 1U << (max_codeword_len)) + \ + \ + /* There is no reason for the root table to be indexed with + * more bits than the maximum codeword length. */ \ + STATIC_ASSERT_ZERO((table_bits) <= (max_codeword_len)) + \ + \ + /* The maximum symbol value must fit in the 'symbol' field. */ \ + STATIC_ASSERT_ZERO((num_syms) - 1 <= DECODE_TABLE_MAX_SYMBOL) + \ + \ + /* The maximum codeword length in the root table must fit in + * the 'length' field. */ \ + STATIC_ASSERT_ZERO((table_bits) <= DECODE_TABLE_MAX_LENGTH) + \ + \ + /* The maximum codeword length in a subtable must fit in the + * 'length' field. */ \ + STATIC_ASSERT_ZERO((max_codeword_len) - (table_bits) <= \ + DECODE_TABLE_MAX_LENGTH) + \ + \ + /* The minimum subtable index must be greater than the maximum + * symbol value. If this were not the case, then there would + * be no way to tell whether a given root table entry is a + * "subtable pointer" or not. (An alternate solution would be + * to reserve a flag bit specifically for this purpose.) */ \ + STATIC_ASSERT_ZERO((1U << table_bits) > (num_syms) - 1) + \ + \ + /* The needed 'enough' value must have been defined. */ \ + STATIC_ASSERT_ZERO(DECODE_TABLE_ENOUGH( \ + (num_syms), (table_bits), \ + (max_codeword_len)) > 0) + \ + \ + /* The maximum subtable index must fit in the 'symbol' field. */\ + STATIC_ASSERT_ZERO(DECODE_TABLE_ENOUGH( \ + (num_syms), (table_bits), \ + (max_codeword_len)) - 1 <= \ + DECODE_TABLE_MAX_SYMBOL) + \ + \ + /* Finally, make the macro evaluate to the needed maximum + * number of decode table entries. */ \ + DECODE_TABLE_ENOUGH((num_syms), (table_bits), \ + (max_codeword_len)) \ +) + +/* + * Declare the decode table for a Huffman code, given several compile-time + * constants that describe the code. See DECODE_TABLE_ENOUGH() for details. + * + * Decode tables must be aligned to a DECODE_TABLE_ALIGNMENT-byte boundary. + * This implies that if a decode table is nested inside a dynamically allocated + * structure, then the outer structure must be allocated on a + * DECODE_TABLE_ALIGNMENT-byte aligned boundary as well. + */ +#define DECODE_TABLE(name, num_syms, table_bits, max_codeword_len) \ + u16 name[DECODE_TABLE_SIZE((num_syms), (table_bits), \ + (max_codeword_len))] \ + _aligned_attribute(DECODE_TABLE_ALIGNMENT) + +/* + * Declare the temporary "working_space" array needed for building the decode + * table for a Huffman code. + */ +#define DECODE_TABLE_WORKING_SPACE(name, num_syms, max_codeword_len) \ + u16 name[2 * ((max_codeword_len) + 1) + (num_syms)]; + extern int make_huffman_decode_table(u16 decode_table[], unsigned num_syms, - unsigned num_bits, const u8 lens[], - unsigned max_codeword_len); + unsigned table_bits, const u8 lens[], + unsigned max_codeword_len, u16 working_space[]); + +/******************************************************************************/ +/* LZ match copying */ +/*----------------------------------------------------------------------------*/ static inline void copy_word_unaligned(const void *src, void *dst) @@ -269,84 +421,103 @@ copy_word_unaligned(const void *src, void *dst) } static inline machine_word_t -repeat_byte(u8 b) +repeat_u16(u16 b) { - machine_word_t v; + machine_word_t v = b; STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - - v = b; - v |= v << 8; v |= v << 16; v |= v << ((WORDBITS == 64) ? 32 : 0); return v; } +static inline machine_word_t +repeat_byte(u8 b) +{ + return repeat_u16(((u16)b << 8) | b); +} + /* - * Copy an LZ77 match at (dst - offset) to dst. + * Copy an LZ77 match of 'length' bytes from the match source at 'out_next - + * offset' to the match destination at 'out_next'. The source and destination + * may overlap. * - * The length and offset must be already validated --- that is, (dst - offset) - * can't underrun the output buffer, and (dst + length) can't overrun the output - * buffer. Also, the length cannot be 0. + * This handles validating the length and offset. It is validated that the + * beginning of the match source is '>= out_begin' and that end of the match + * destination is '<= out_end'. The return value is 0 if the match was valid + * (and was copied), otherwise -1. * - * @winend points to the byte past the end of the output buffer. - * This function won't write any data beyond this position. + * 'min_length' is a hint which specifies the minimum possible match length. + * This should be a compile-time constant. */ -static inline void -lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length) +static inline int +lz_copy(u32 length, u32 offset, u8 *out_begin, u8 *out_next, u8 *out_end, + u32 min_length) { - const u8 *src = dst - offset; - const u8 * const end = dst + length; + const u8 *src; + u8 *end; + + /* Validate the offset. */ + if (unlikely(offset > out_next - out_begin)) + return -1; /* - * Try to copy one machine word at a time. On i386 and x86_64 this is - * faster than copying one byte at a time, unless the data is - * near-random and all the matches have very short lengths. Note that - * since this requires unaligned memory accesses, it won't necessarily - * be faster on every architecture. + * Fast path: copy a match which is no longer than a few words, is not + * overlapped such that copying a word at a time would produce incorrect + * results, and is not too close to the end of the buffer. Note that + * this might copy more than the length of the match, but that's okay in + * this scenario. + */ + src = out_next - offset; + if (UNALIGNED_ACCESS_IS_FAST && length <= 3 * WORDBYTES && + offset >= WORDBYTES && out_end - out_next >= 3 * WORDBYTES) + { + copy_word_unaligned(src + WORDBYTES*0, out_next + WORDBYTES*0); + copy_word_unaligned(src + WORDBYTES*1, out_next + WORDBYTES*1); + copy_word_unaligned(src + WORDBYTES*2, out_next + WORDBYTES*2); + return 0; + } + + /* Validate the length. This isn't needed in the fast path above, due + * to the additional conditions tested, but we do need it here. */ + if (unlikely(length > out_end - out_next)) + return -1; + end = out_next + length; + + /* + * Try to copy one word at a time. On i386 and x86_64 this is faster + * than copying one byte at a time, unless the data is near-random and + * all the matches have very short lengths. Note that since this + * requires unaligned memory accesses, it won't necessarily be faster on + * every architecture. * * Also note that we might copy more than the length of the match. For * example, if a word is 8 bytes and the match is of length 5, then * we'll simply copy 8 bytes. This is okay as long as we don't write - * beyond the end of the output buffer, hence the check for (winend - + * beyond the end of the output buffer, hence the check for (out_end - * end >= WORDBYTES - 1). */ - if (UNALIGNED_ACCESS_IS_FAST && likely(winend - end >= WORDBYTES - 1)) { - + if (UNALIGNED_ACCESS_IS_FAST && likely(out_end - end >= WORDBYTES - 1)) + { if (offset >= WORDBYTES) { - /* The source and destination words don't overlap. */ - - /* To improve branch prediction, one iteration of this - * loop is unrolled. Most matches are short and will - * fail the first check. But if that check passes, then - * it becomes increasing likely that the match is long - * and we'll need to continue copying. */ - - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - - if (dst < end) { - do { - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - } - return; + /* The source and destination words don't overlap. */ + do { + copy_word_unaligned(src, out_next); + src += WORDBYTES; + out_next += WORDBYTES; + } while (out_next < end); + return 0; } else if (offset == 1) { - /* Offset 1 matches are equivalent to run-length * encoding of the previous byte. This case is common - * if the data contains many repeated bytes. */ - - machine_word_t v = repeat_byte(*(dst - 1)); + * if the data contains many repeated bytes. */ + machine_word_t v = repeat_byte(*(out_next - 1)); do { - store_word_unaligned(v, dst); + store_word_unaligned(v, out_next); src += WORDBYTES; - dst += WORDBYTES; - } while (dst < end); - return; + out_next += WORDBYTES; + } while (out_next < end); + return 0; } /* * We don't bother with special cases for other 'offset < @@ -360,22 +531,16 @@ lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length) } /* Fall back to a bytewise copy. */ - - if (min_length >= 2) { - *dst++ = *src++; - length--; - } - if (min_length >= 3) { - *dst++ = *src++; - length--; - } - if (min_length >= 4) { - *dst++ = *src++; - length--; - } + if (min_length >= 2) + *out_next++ = *src++; + if (min_length >= 3) + *out_next++ = *src++; + if (min_length >= 4) + *out_next++ = *src++; do { - *dst++ = *src++; - } while (--length); + *out_next++ = *src++; + } while (out_next != end); + return 0; } #endif /* _WIMLIB_DECOMPRESS_COMMON_H */