4 * Header for decompression code shared by multiple compression formats.
6 * The following copying information applies to this specific source code file:
8 * Written in 2012-2016 by Eric Biggers <ebiggers3@gmail.com>
10 * To the extent possible under law, the author(s) have dedicated all copyright
11 * and related and neighboring rights to this software to the public domain
12 * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
13 * Dedication (the "CC0").
15 * This software is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
19 * You should have received a copy of the CC0 along with this software; if not
20 * see <http://creativecommons.org/publicdomain/zero/1.0/>.
23 #ifndef _WIMLIB_DECOMPRESS_COMMON_H
24 #define _WIMLIB_DECOMPRESS_COMMON_H
28 #include "wimlib/compiler.h"
29 #include "wimlib/types.h"
30 #include "wimlib/unaligned.h"
32 /* Structure that encapsulates a block of in-memory data being interpreted as a
33 * stream of bits, optionally with interwoven literal bytes. Bits are assumed
34 * to be stored in little endian 16-bit coding units, with the bits ordered high
36 struct input_bitstream {
38 /* Bits that have been read from the input buffer. The bits are
39 * left-justified; the next bit is always bit 31. */
42 /* Number of bits currently held in @bitbuf. */
45 /* Pointer to the next byte to be retrieved from the input buffer. */
48 /* Pointer past the end of the input buffer. */
52 /* Initialize a bitstream to read from the specified input buffer. */
54 init_input_bitstream(struct input_bitstream *is, const void *buffer, u32 size)
59 is->end = is->next + size;
62 /* Note: for performance reasons, the following methods don't return error codes
63 * to the caller if the input buffer is overrun. Instead, they just assume that
64 * all overrun data is zeroes. This has no effect on well-formed compressed
65 * data. The only disadvantage is that bad compressed data may go undetected,
66 * but even this is irrelevant if higher level code checksums the uncompressed
69 /* Ensure the bit buffer variable for the bitstream contains at least @num_bits
70 * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits()
71 * may be called on the bitstream to peek or remove up to @num_bits bits. */
73 bitstream_ensure_bits(struct input_bitstream *is, const unsigned num_bits)
75 /* This currently works for at most 17 bits. */
77 if (is->bitsleft >= num_bits)
80 if (unlikely(is->end - is->next < 2))
83 is->bitbuf |= (u32)get_unaligned_le16(is->next) << (16 - is->bitsleft);
87 if (unlikely(num_bits == 17 && is->bitsleft == 16)) {
88 if (unlikely(is->end - is->next < 2))
91 is->bitbuf |= (u32)get_unaligned_le16(is->next);
102 /* Return the next @num_bits bits from the bitstream, without removing them.
103 * There must be at least @num_bits remaining in the buffer variable, from a
104 * previous call to bitstream_ensure_bits(). */
106 bitstream_peek_bits(const struct input_bitstream *is, const unsigned num_bits)
108 return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1);
111 /* Remove @num_bits from the bitstream. There must be at least @num_bits
112 * remaining in the buffer variable, from a previous call to
113 * bitstream_ensure_bits(). */
115 bitstream_remove_bits(struct input_bitstream *is, unsigned num_bits)
117 is->bitbuf <<= num_bits;
118 is->bitsleft -= num_bits;
121 /* Remove and return @num_bits bits from the bitstream. There must be at least
122 * @num_bits remaining in the buffer variable, from a previous call to
123 * bitstream_ensure_bits(). */
125 bitstream_pop_bits(struct input_bitstream *is, unsigned num_bits)
127 u32 bits = bitstream_peek_bits(is, num_bits);
128 bitstream_remove_bits(is, num_bits);
132 /* Read and return the next @num_bits bits from the bitstream. */
134 bitstream_read_bits(struct input_bitstream *is, unsigned num_bits)
136 bitstream_ensure_bits(is, num_bits);
137 return bitstream_pop_bits(is, num_bits);
140 /* Read and return the next literal byte embedded in the bitstream. */
142 bitstream_read_byte(struct input_bitstream *is)
144 if (unlikely(is->end == is->next))
149 /* Read and return the next 16-bit integer embedded in the bitstream. */
151 bitstream_read_u16(struct input_bitstream *is)
155 if (unlikely(is->end - is->next < 2))
157 v = get_unaligned_le16(is->next);
162 /* Read and return the next 32-bit integer embedded in the bitstream. */
164 bitstream_read_u32(struct input_bitstream *is)
168 if (unlikely(is->end - is->next < 4))
170 v = get_unaligned_le32(is->next);
175 /* Read into @dst_buffer an array of literal bytes embedded in the bitstream.
176 * Return 0 if there were enough bytes remaining in the input, otherwise -1. */
178 bitstream_read_bytes(struct input_bitstream *is, void *dst_buffer, size_t count)
180 if (unlikely(is->end - is->next < count))
182 memcpy(dst_buffer, is->next, count);
187 /* Align the input bitstream on a coding-unit boundary. */
189 bitstream_align(struct input_bitstream *is)
195 /* Needed alignment of decode_table parameter to make_huffman_decode_table().
197 * Reason: We may fill the entries with SSE instructions without worrying
198 * about dealing with the unaligned case. */
199 #define DECODE_TABLE_ALIGNMENT 16
201 #define DECODE_TABLE_SYMBOL_SHIFT 4
202 #define DECODE_TABLE_LENGTH_MASK DECODE_TABLE_MAX_LENGTH
204 #define DECODE_TABLE_MAX_NUM_SYMS ((1 << (16 - DECODE_TABLE_SYMBOL_SHIFT)) - 1)
205 #define DECODE_TABLE_MAX_LENGTH ((1 << DECODE_TABLE_SYMBOL_SHIFT) - 1)
208 * Reads and returns the next Huffman-encoded symbol from a bitstream.
210 * If the input data is exhausted, the Huffman symbol is decoded as if the
211 * missing bits are all zeroes.
213 * XXX: This is mostly duplicated in lzms_decode_huffman_symbol() in
216 static inline unsigned
217 read_huffsym(struct input_bitstream *is, const u16 decode_table[],
218 unsigned table_bits, unsigned max_codeword_len)
224 /* If the bitbuffer contains fewer bits than might be required by a
225 * single codeword, then refill it. */
226 bitstream_ensure_bits(is, max_codeword_len);
228 /* Index the root table by the next 'table_bits' bits of input. */
229 entry = decode_table[bitstream_peek_bits(is, table_bits)];
231 /* Extract the symbol and length from the entry. */
232 sym = entry >> DECODE_TABLE_SYMBOL_SHIFT;
233 len = entry & DECODE_TABLE_LENGTH_MASK;
235 /* If the root table is indexed by the full 'max_codeword_len' bits,
236 * then there cannot be any subtables. This will be known at compile
237 * time. Otherwise, we must check whether the decoded symbol is really
238 * a subtable pointer. If so, we must discard the bits with which the
239 * root table was indexed, then index the subtable by the next 'len'
240 * bits of input to get the real entry. */
241 if (max_codeword_len > table_bits &&
242 entry >= (1U << (table_bits + DECODE_TABLE_SYMBOL_SHIFT)))
244 /* Subtable required */
245 bitstream_remove_bits(is, table_bits);
246 entry = decode_table[sym + bitstream_peek_bits(is, len)];
247 sym = entry >> DECODE_TABLE_SYMBOL_SHIFT;;
248 len = entry & DECODE_TABLE_LENGTH_MASK;
251 /* Discard the bits (or the remaining bits, if a subtable was required)
252 * of the codeword. */
253 bitstream_remove_bits(is, len);
255 /* Return the decoded symbol. */
260 * The ENOUGH() macro returns the maximum number of decode table entries,
261 * including all subtable entries, that may be required for decoding a given
262 * Huffman code. This depends on three parameters:
264 * num_syms: the maximum number of symbols in the code
265 * table_bits: the number of bits with which the root table will be indexed
266 * max_codeword_len: the maximum allowed codeword length
268 * Given these parameters, the utility program 'enough' from zlib, when run as
269 * './enough num_syms table_bits max_codeword_len', will compute the maximum
270 * number of entries required. This has already been done for the combinations
271 * we need (or may need) and incorporated into the macro below so that the
272 * mapping can be done at compilation time. If an unknown combination is used,
273 * then a compilation error will result. To fix this, use 'enough' to find the
274 * missing value and add it below.
276 #define ENOUGH(num_syms, table_bits, max_codeword_len) ( \
277 ((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 15) ? 128 : \
278 ((num_syms) == 8 && (table_bits) == 5 && (max_codeword_len) == 7) ? 36 : \
279 ((num_syms) == 8 && (table_bits) == 6 && (max_codeword_len) == 7) ? 66 : \
280 ((num_syms) == 8 && (table_bits) == 7 && (max_codeword_len) == 7) ? 128 : \
281 ((num_syms) == 20 && (table_bits) == 5 && (max_codeword_len) == 15) ? 1062 : \
282 ((num_syms) == 20 && (table_bits) == 6 && (max_codeword_len) == 15) ? 582 : \
283 ((num_syms) == 20 && (table_bits) == 7 && (max_codeword_len) == 15) ? 390 : \
284 ((num_syms) == 54 && (table_bits) == 9 && (max_codeword_len) == 15) ? 618 : \
285 ((num_syms) == 54 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1098 : \
286 ((num_syms) == 249 && (table_bits) == 9 && (max_codeword_len) == 16) ? 878 : \
287 ((num_syms) == 249 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1326 : \
288 ((num_syms) == 249 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2318 : \
289 ((num_syms) == 256 && (table_bits) == 9 && (max_codeword_len) == 15) ? 822 : \
290 ((num_syms) == 256 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1302 : \
291 ((num_syms) == 256 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2310 : \
292 ((num_syms) == 512 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1558 : \
293 ((num_syms) == 512 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2566 : \
294 ((num_syms) == 512 && (table_bits) == 12 && (max_codeword_len) == 15) ? 4606 : \
295 ((num_syms) == 656 && (table_bits) == 10 && (max_codeword_len) == 16) ? 1734 : \
296 ((num_syms) == 656 && (table_bits) == 11 && (max_codeword_len) == 16) ? 2726 : \
297 ((num_syms) == 656 && (table_bits) == 12 && (max_codeword_len) == 16) ? 4758 : \
298 ((num_syms) == 799 && (table_bits) == 9 && (max_codeword_len) == 15) ? 1366 : \
299 ((num_syms) == 799 && (table_bits) == 10 && (max_codeword_len) == 15) ? 1846 : \
300 ((num_syms) == 799 && (table_bits) == 11 && (max_codeword_len) == 15) ? 2854 : \
303 /* Wrapper around ENOUGH() that does additional compile-time validation. */
304 #define DECODE_TABLE_LENGTH(num_syms, table_bits, max_codeword_len) ( \
306 /* Every possible symbol value must fit into the symbol portion \
307 * of a decode table entry. */ \
308 STATIC_ASSERT_ZERO((num_syms) <= DECODE_TABLE_MAX_NUM_SYMS) + \
310 /* There cannot be more symbols than possible codewords. */ \
311 STATIC_ASSERT_ZERO((num_syms) <= 1U << (max_codeword_len)) + \
313 /* It doesn't make sense to use a table_bits more than the \
314 * maximum codeword length. */ \
315 STATIC_ASSERT_ZERO((max_codeword_len) >= (table_bits)) + \
317 /* The maximum length in the root table must fit into the \
318 * length portion of a decode table entry. */ \
319 STATIC_ASSERT_ZERO((table_bits) <= DECODE_TABLE_MAX_LENGTH) + \
321 /* The maximum length in a subtable must fit into the length
322 * portion of a decode table entry. */ \
323 STATIC_ASSERT_ZERO((max_codeword_len) - (table_bits) <= \
324 DECODE_TABLE_MAX_LENGTH) + \
326 /* The needed 'enough' value must have been defined. */ \
327 STATIC_ASSERT_ZERO(ENOUGH((num_syms), (table_bits), \
328 (max_codeword_len)) >= 0) + \
330 /* The maximum subtable index must fit in the field which would \
331 * normally hold a symbol value. */ \
332 STATIC_ASSERT_ZERO(ENOUGH((num_syms), (table_bits), \
333 (max_codeword_len)) <= \
334 DECODE_TABLE_MAX_NUM_SYMS) + \
336 /* The minimum subtable index must be greater than the greatest \
337 * possible symbol value. */ \
338 STATIC_ASSERT_ZERO((1U << table_bits) >= num_syms) + \
340 ENOUGH(num_syms, table_bits, max_codeword_len) \
344 * Declare the decode table for a Huffman code, given several compile-time
345 * constants that describe that code (see ENOUGH() for details).
347 * Decode tables must be aligned to a DECODE_TABLE_ALIGNMENT-boundary. This
348 * implies that if a decode table is nested a dynamically allocated structure,
349 * then the outer structure must be allocated on a DECODE_TABLE_ALIGNMENT-byte
352 #define DECODE_TABLE(name, num_syms, table_bits, max_codeword_len) \
353 u16 name[DECODE_TABLE_LENGTH((num_syms), (table_bits), \
354 (max_codeword_len))] \
355 _aligned_attribute(DECODE_TABLE_ALIGNMENT)
358 make_huffman_decode_table(u16 decode_table[], unsigned num_syms,
359 unsigned table_bits, const u8 lens[],
360 unsigned max_codeword_len);
363 copy_word_unaligned(const void *src, void *dst)
365 store_word_unaligned(load_word_unaligned(src), dst);
368 static inline machine_word_t
371 machine_word_t v = b;
373 STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
375 v |= v << ((WORDBITS == 64) ? 32 : 0);
379 static inline machine_word_t
382 return repeat_u16(((u16)b << 8) | b);
386 * Copy an LZ77 match at (dst - offset) to dst.
388 * The length and offset must be already validated --- that is, (dst - offset)
389 * can't underrun the output buffer, and (dst + length) can't overrun the output
390 * buffer. Also, the length cannot be 0.
392 * @winend points to the byte past the end of the output buffer.
393 * This function won't write any data beyond this position.
396 lz_copy(u8 *dst, u32 length, u32 offset, const u8 *winend, u32 min_length)
398 const u8 *src = dst - offset;
399 const u8 * const end = dst + length;
402 * Try to copy one machine word at a time. On i386 and x86_64 this is
403 * faster than copying one byte at a time, unless the data is
404 * near-random and all the matches have very short lengths. Note that
405 * since this requires unaligned memory accesses, it won't necessarily
406 * be faster on every architecture.
408 * Also note that we might copy more than the length of the match. For
409 * example, if a word is 8 bytes and the match is of length 5, then
410 * we'll simply copy 8 bytes. This is okay as long as we don't write
411 * beyond the end of the output buffer, hence the check for (winend -
412 * end >= WORDBYTES - 1).
414 if (UNALIGNED_ACCESS_IS_FAST && likely(winend - end >= WORDBYTES - 1)) {
416 if (offset >= WORDBYTES) {
417 /* The source and destination words don't overlap. */
419 /* To improve branch prediction, one iteration of this
420 * loop is unrolled. Most matches are short and will
421 * fail the first check. But if that check passes, then
422 * it becomes increasing likely that the match is long
423 * and we'll need to continue copying. */
425 copy_word_unaligned(src, dst);
431 copy_word_unaligned(src, dst);
437 } else if (offset == 1) {
439 /* Offset 1 matches are equivalent to run-length
440 * encoding of the previous byte. This case is common
441 * if the data contains many repeated bytes. */
443 machine_word_t v = repeat_u8(*(dst - 1));
445 store_word_unaligned(v, dst);
452 * We don't bother with special cases for other 'offset <
453 * WORDBYTES', which are usually rarer than 'offset == 1'.
454 * Extra checks will just slow things down. Actually, it's
455 * possible to handle all the 'offset < WORDBYTES' cases using
456 * the same code, but it still becomes more complicated doesn't
457 * seem any faster overall; it definitely slows down the more
458 * common 'offset == 1' case.
462 /* Fall back to a bytewise copy. */
464 if (min_length >= 2) {
468 if (min_length >= 3) {
472 if (min_length >= 4) {
481 #endif /* _WIMLIB_DECOMPRESS_COMMON_H */