4 * LZX compression routines.
6 * This code was originally based on code written by Matthew T. Russotto
11 * Copyright (C) 2002 Matthew T. Russotto
12 * Copyright (C) 2012 Eric Biggers
14 * This file is part of wimlib, a library for working with WIM files.
16 * wimlib is free software; you can redistribute it and/or modify it under the
17 * terms of the GNU General Public License as published by the Free
18 * Software Foundation; either version 3 of the License, or (at your option)
21 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
22 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
23 * A PARTICULAR PURPOSE. See the GNU General Public License for more
26 * You should have received a copy of the GNU General Public License
27 * along with wimlib; if not, see http://www.gnu.org/licenses/.
33 * This file provides lzx_compress(), a function to compress an in-memory buffer
34 * of data using LZX compression, as used in the WIM file format.
36 * There is no sliding window, as for the compressed chunks in WIM resources,
37 * the window is always the length of the input.
39 * The basic algorithm should be familiar if you are familiar with Huffman trees
40 * and with other LZ77-based formats such as DEFLATE. Otherwise it can be quite
41 * tricky to understand. Basically it is the following:
43 * - Preprocess the input data (LZX-specific)
44 * - Go through the input data and determine matches. This part is based on
45 * code from zlib, and a hash table of 3-character strings is used to
46 * accelerate the process of finding matches.
47 * - Build the Huffman trees based on the frequencies of symbols determined
48 * while recording matches.
49 * - Output the block header, including the Huffman trees; then output the
50 * compressed stream of matches and literal characters.
61 /* Structure to contain the Huffman codes for the main, length, and aligned
/* NOTE(review): the opening "struct lzx_codes {" line is elided in this
 * extract.  Each alphabet is stored as a parallel pair of arrays: the
 * canonical codeword for each symbol and that codeword's bit length. */
64 u16 main_codewords[LZX_MAINTREE_NUM_SYMBOLS];
65 u8 main_lens[LZX_MAINTREE_NUM_SYMBOLS];
67 u16 len_codewords[LZX_LENTREE_NUM_SYMBOLS];
68 u8 len_lens[LZX_LENTREE_NUM_SYMBOLS];
70 u16 aligned_codewords[LZX_ALIGNEDTREE_NUM_SYMBOLS];
71 u8 aligned_lens[LZX_ALIGNEDTREE_NUM_SYMBOLS];
/* Per-alphabet symbol frequency counts, accumulated while recording matches
 * and literals; used as input to Huffman code construction.  (Closing brace
 * of this struct is elided in this extract.) */
74 struct lzx_freq_tables {
75 u32 main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
76 u32 len_freq_table[LZX_LENTREE_NUM_SYMBOLS];
77 u32 aligned_freq_table[LZX_ALIGNEDTREE_NUM_SYMBOLS];
83 /* Returns the position slot that corresponds to a given formatted offset. This
84 * means searching the lzx_position_base array to find what slot contains a
85 * position base that is less than or equal to formatted_offset, where the next
86 * slot contains a position base that is greater than or equal to
87 * formatted_offset. */
88 static uint lzx_get_position_slot(uint formatted_offset)
/* NOTE(review): several lines of this function are elided in this extract
 * (local declarations of left/right/mid, the search-loop construct, and the
 * final return of `mid`).  The visible logic is: large offsets are mapped
 * directly by a shift, and smaller offsets fall through to a binary search
 * of the lzx_position_base[] table. */
94 /* Calculate position base using binary search of table; if log2 can be
95 * done in hardware, approximation might work;
96 * trunc(log2(formatted_offset*formatted_offset)) gets either the proper
97 * position slot or the next one, except for slots 0, 1, and 39-49
99 * Slots 0-1 are handled by the R0-R1 procedures
101 * Slots 36-49 (formatted_offset >= 262144) can be found by
102 * (formatted_offset/131072) + 34 == (formatted_offset >> 17) + 34;
104 if (formatted_offset >= 262144) {
/* Slots >= 36 have a fixed 17 extra bits, so the slot is a pure
 * linear function of (offset >> 17). */
105 return (formatted_offset >> 17) + 34;
108 right = LZX_NUM_POSITION_SLOTS - 1;
110 mid = (left + right) >> 1;
/* Found the slot when its base is <= offset and the next base is above. */
111 if ((lzx_position_base[mid] <= formatted_offset) &&
112 lzx_position_base[mid + 1] > formatted_offset) {
115 if (formatted_offset > lzx_position_base[mid])
/* Records a literal byte: bumps its frequency count in the main alphabet.
 * (The return statement — presumably returning the literal itself so it can
 * be stored in the match table — is elided in this extract; TODO confirm.) */
124 static u32 lzx_record_literal(u8 literal, void *__main_freq_tab)
126 u32 *main_freq_tab = __main_freq_tab;
127 main_freq_tab[literal]++;
131 /* Constructs a match from an offset and a length, and updates the LRU queue
132 * and the frequency of symbols in the main, length, and aligned offset
133 * alphabets. The return value is a 32-bit integer that, if the high bit is
134 * set, contains the match length, the position slot, and the position footer
/* NOTE(review): some local declarations, `else` lines, closing braces, and
 * the final `return match;` are elided in this extract. */
136 static u32 lzx_record_match(uint match_offset, uint match_len,
137 void *__freq_tabs, void *__queue)
139 struct lzx_freq_tables *freq_tabs = __freq_tabs;
140 struct lru_queue *queue = __queue;
141 uint formatted_offset;
143 uint position_footer = 0;
149 wimlib_assert(match_len >= LZX_MIN_MATCH && match_len <= LZX_MAX_MATCH);
/* Repeat-offset handling: formatted offsets 0, 1, and 2 select the
 * three most-recently-used offsets R0-R2, with a move-to-front swap
 * when R1 or R2 is hit. */
152 if (match_offset == queue->R0) {
153 formatted_offset = 0;
155 } else if (match_offset == queue->R1) {
156 swap(queue->R0, queue->R1);
157 formatted_offset = 1;
159 } else if (match_offset == queue->R2) {
160 swap(queue->R0, queue->R2);
161 formatted_offset = 2;
164 /* Not a repeated offset. */
/* New offsets are biased by LZX_MIN_MATCH so they start above the
 * three reserved repeat-offset codes, and the LRU queue is shifted. */
166 formatted_offset = match_offset + LZX_MIN_MATCH;
168 queue->R2 = queue->R1;
169 queue->R1 = queue->R0;
170 queue->R0 = match_offset;
172 position_slot = lzx_get_position_slot(formatted_offset);
174 /* Just the extra bits of the formatted offset. */
175 position_footer = ((1UL << lzx_extra_bits[position_slot]) - 1) &
179 /* (match length - 2) = 8 bits */
180 /* position_slot = 6 bits */
181 /* position_footer = 17 bits */
182 /* total = 31 bits */
183 /* plus one to say whether it's a literal or not */
185 match = 0x80000000 | /* bit 31 in intelligent bit ordering */
186 (position_slot << 25) | /* bits 30-25 */
187 (position_footer << 8) | /* bits 8-24 */
188 (match_len - LZX_MIN_MATCH); /* bits 0-7 */
190 /* Update the frequency for the main tree, the length tree (only if a
191 * length symbol is to be output), and the aligned tree (only if an
192 * aligned symbol is to be output.) */
193 if (match_len < (LZX_NUM_PRIMARY_LENS + LZX_MIN_MATCH)) {
194 len_header = match_len - LZX_MIN_MATCH;
196 len_header = LZX_NUM_PRIMARY_LENS;
197 len_footer = match_len - (LZX_NUM_PRIMARY_LENS + LZX_MIN_MATCH);
198 freq_tabs->len_freq_table[len_footer]++;
200 len_pos_header = (position_slot << 3) | len_header;
202 wimlib_assert(len_pos_header < LZX_MAINTREE_NUM_SYMBOLS - LZX_NUM_CHARS);
204 freq_tabs->main_freq_table[len_pos_header + LZX_NUM_CHARS]++;
/* The low 3 bits of the footer are coded with the aligned tree only
 * when the slot has at least 3 extra bits. */
206 if (lzx_extra_bits[position_slot] >= 3)
207 freq_tabs->aligned_freq_table[position_footer & 7]++;
213 * Writes a compressed literal match to the output.
215 * @out: The output bitstream.
216 * @block_type: The type of the block (LZX_BLOCKTYPE_ALIGNED or LZX_BLOCKTYPE_VERBATIM)
217 * @match: The match, encoded as a 32-bit number.
218 * @codes: Pointer to a structure that contains the codewords for the
219 * main, length, and aligned offset Huffman codes.
/* NOTE(review): local declarations, error-check lines after each
 * bitstream_put_bits() call, closing braces, and the final return are
 * elided in this extract.  The bit layout decoded below matches the one
 * packed by lzx_record_match(). */
221 static int lzx_write_match(struct output_bitstream *out, int block_type,
222 u32 match, const struct lzx_codes *codes)
224 /* low 8 bits are the match length minus 2 */
225 uint match_len_minus_2 = match & 0xff;
226 /* Next 17 bits are the position footer */
227 uint position_footer = (match >> 8) & 0x1ffff; /* 17 bits */
228 /* Next 6 bits are the position slot. */
229 uint position_slot = (match >> 25) & 0x3f; /* 6 bits */
239 /* If the match length is less than MIN_MATCH (= 2) +
240 * NUM_PRIMARY_LENS (= 7), the length header contains
241 * the match length minus MIN_MATCH, and there is no
244 * Otherwise, the length header contains
245 * NUM_PRIMARY_LENS, and the length footer contains
246 * the match length minus NUM_PRIMARY_LENS minus
248 if (match_len_minus_2 < LZX_NUM_PRIMARY_LENS) {
249 len_header = match_len_minus_2;
250 /* No length footer-- mark it with a special
252 len_footer = (uint)(-1);
254 len_header = LZX_NUM_PRIMARY_LENS;
255 len_footer = match_len_minus_2 - LZX_NUM_PRIMARY_LENS;
258 /* Combine the position slot with the length header into
259 * a single symbol that will be encoded with the main
261 len_pos_header = (position_slot << 3) | len_header;
263 /* The actual main symbol is offset by LZX_NUM_CHARS because
264 * values under LZX_NUM_CHARS are used to indicate a literal
265 * byte rather than a match. */
266 main_symbol = len_pos_header + LZX_NUM_CHARS;
268 /* Output main symbol. */
269 ret = bitstream_put_bits(out, codes->main_codewords[main_symbol],
270 codes->main_lens[main_symbol]);
274 /* If there is a length footer, output it using the
275 * length Huffman code. */
276 if (len_footer != (uint)(-1)) {
277 ret = bitstream_put_bits(out, codes->len_codewords[len_footer],
278 codes->len_lens[len_footer]);
283 wimlib_assert(position_slot < LZX_NUM_POSITION_SLOTS);
285 num_extra_bits = lzx_extra_bits[position_slot];
287 /* For aligned offset blocks with at least 3 extra bits, output the
288 * verbatim bits literally, then the aligned bits encoded using the
289 * aligned offset tree. Otherwise, only the verbatim bits need to be
291 if ((block_type == LZX_BLOCKTYPE_ALIGNED) && (num_extra_bits >= 3)) {
293 verbatim_bits = position_footer >> 3;
294 ret = bitstream_put_bits(out, verbatim_bits,
299 aligned_bits = (position_footer & 7);
300 ret = bitstream_put_bits(out,
301 codes->aligned_codewords[aligned_bits],
302 codes->aligned_lens[aligned_bits]);
306 /* verbatim bits is the same as the position
307 * footer, in this case. */
308 ret = bitstream_put_bits(out, position_footer, num_extra_bits);
316 * Writes all compressed literals in a block, both matches and literal bytes, to
317 * the output bitstream.
319 * @out: The output bitstream.
320 * @block_type: The type of the block (LZX_BLOCKTYPE_ALIGNED or LZX_BLOCKTYPE_VERBATIM)
321 * @match_tab[]: The array of matches that will be output. It has length
322 * of @num_compressed_literals.
323 * @num_compressed_literals: Number of compressed literals to be output.
324 * @codes: Pointer to a structure that contains the codewords for the
325 * main, length, and aligned offset Huffman codes.
/* NOTE(review): local declarations, the error-return checks after each
 * write, and the final return are elided in this extract. */
327 static int lzx_write_compressed_literals(struct output_bitstream *ostream,
329 const u32 match_tab[],
330 uint num_compressed_literals,
331 const struct lzx_codes *codes)
337 for (i = 0; i < num_compressed_literals; i++) {
338 match = match_tab[i];
340 /* High bit of the match indicates whether the match is an
341 * actual match (1) or a literal uncompressed byte (0) */
342 if (match & 0x80000000) {
344 ret = lzx_write_match(ostream, block_type, match,
/* Literal byte: its main-tree symbol is the byte value itself
 * (all literals sit below LZX_NUM_CHARS in the main alphabet). */
350 wimlib_assert(match < LZX_NUM_CHARS);
351 ret = bitstream_put_bits(ostream,
352 codes->main_codewords[match],
353 codes->main_lens[match]);
362 * Writes a compressed Huffman tree to the output, preceded by the pretree for
365 * The Huffman tree is represented in the output as a series of path lengths
366 * from which the canonical Huffman code can be reconstructed. The path lengths
367 * themselves are compressed using a separate Huffman code, the pretree, which
368 * consists of LZX_PRETREE_NUM_SYMBOLS (= 20) symbols that cover all possible code
369 * lengths, plus extra codes for repeated lengths. The path lengths of the
370 * pretree precede the path lengths of the larger code and are uncompressed,
371 * consisting of 20 entries of 4 bits each.
373 * @out: The bitstream for the compressed output.
374 * @lens: The code lengths for the Huffman tree, indexed by symbol.
375 * @num_symbols: The number of symbols in the code.
/* NOTE(review): the parameter list continuation, several local declarations,
 * closing braces, `else` lines, and the final return are elided in this
 * extract. */
377 static int lzx_write_compressed_tree(struct output_bitstream *out,
381 /* Frequencies of the length symbols, including the RLE symbols (NOT the
382 * actual lengths themselves). */
383 uint pretree_freqs[LZX_PRETREE_NUM_SYMBOLS];
384 u8 pretree_lens[LZX_PRETREE_NUM_SYMBOLS];
385 u16 pretree_codewords[LZX_PRETREE_NUM_SYMBOLS];
/* NOTE(review): variable-length array on the stack, sized by the caller's
 * num_symbols; bounded here by the main-tree alphabet size, but worth
 * confirming stack headroom on constrained targets. */
386 u8 output_syms[num_symbols * 2];
387 uint output_syms_idx;
391 uint additional_bits;
395 ZERO_ARRAY(pretree_freqs);
397 /* Since the code word lengths use a form of RLE encoding, the goal here
398 * is to find each run of identical lengths when going through them in
399 * symbol order (including runs of length 1). For each run, as many
400 * lengths are encoded using RLE as possible, and the rest are output
403 * output_syms[] will be filled in with the length symbols that will be
404 * output, including RLE codes, not yet encoded using the pre-tree.
406 * cur_run_len keeps track of how many code word lengths are in the
407 * current run of identical lengths.
/* Iterate one past the end so the final run is flushed on i == num_symbols. */
411 for (i = 1; i <= num_symbols; i++) {
413 if (i != num_symbols && lens[i] == lens[i - 1]) {
414 /* Still in a run--- keep going. */
419 /* Run ended! Check if it is a run of zeroes or a run of
422 /* The symbol that was repeated in the run--- not to be confused
423 * with the length *of* the run (cur_run_len) */
424 len_in_run = lens[i - 1];
426 if (len_in_run == 0) {
427 /* A run of 0's. Encode it in as few length
428 * codes as we can. */
430 /* The magic length 18 indicates a run of 20 + n zeroes,
431 * where n is an uncompressed literal 5-bit integer that
432 * follows the magic length. */
433 while (cur_run_len >= 20) {
435 additional_bits = min(cur_run_len - 20, 0x1f);
437 output_syms[output_syms_idx++] = 18;
438 output_syms[output_syms_idx++] = additional_bits;
439 cur_run_len -= 20 + additional_bits;
442 /* The magic length 17 indicates a run of 4 + n zeroes,
443 * where n is an uncompressed literal 4-bit integer that
444 * follows the magic length. */
445 while (cur_run_len >= 4) {
446 additional_bits = min(cur_run_len - 4, 0xf);
448 output_syms[output_syms_idx++] = 17;
449 output_syms[output_syms_idx++] = additional_bits;
450 cur_run_len -= 4 + additional_bits;
455 /* A run of nonzero lengths. */
457 /* The magic length 19 indicates a run of 4 + n
458 * nonzeroes, where n is a literal bit that follows the
459 * magic length, and where the value of the lengths in
460 * the run is given by an extra length symbol, encoded
461 * with the pretree, that follows the literal bit.
463 * The extra length symbol is encoded as a difference
464 * from the length of the codeword for the first symbol
465 * in the run in the previous tree.
467 while (cur_run_len >= 4) {
468 additional_bits = (cur_run_len > 4);
/* delta starts as the negated run length; lines that fold in
 * the previous-tree length and wrap it into pretree symbol
 * range appear to be elided here — TODO confirm against the
 * full source. */
469 delta = -(char)len_in_run;
473 pretree_freqs[(unsigned char)delta]++;
474 output_syms[output_syms_idx++] = 19;
475 output_syms[output_syms_idx++] = additional_bits;
476 output_syms[output_syms_idx++] = delta;
477 cur_run_len -= 4 + additional_bits;
481 /* Any remaining lengths in the run are outputted without RLE,
482 * as a difference from the length of that codeword in the
484 while (cur_run_len--) {
485 delta = -(char)len_in_run;
489 pretree_freqs[(unsigned char)delta]++;
490 output_syms[output_syms_idx++] = delta;
496 wimlib_assert(output_syms_idx < ARRAY_LEN(output_syms));
498 /* Build the pretree from the frequencies of the length symbols. */
500 make_canonical_huffman_code(LZX_PRETREE_NUM_SYMBOLS,
501 LZX_MAX_CODEWORD_LEN,
502 pretree_freqs, pretree_lens,
505 /* Write the lengths of the pretree codes to the output. */
506 for (i = 0; i < LZX_PRETREE_NUM_SYMBOLS; i++)
507 bitstream_put_bits(out, pretree_lens[i],
508 LZX_PRETREE_ELEMENT_SIZE);
510 /* Write the length symbols, encoded with the pretree, to the output. */
513 while (i < output_syms_idx) {
514 pretree_sym = output_syms[i++];
516 bitstream_put_bits(out, pretree_codewords[pretree_sym],
517 pretree_lens[pretree_sym]);
/* RLE escapes carry extra raw bits after the pretree symbol:
 * 17 -> 4-bit zero-run extension, 18 -> 5-bit zero-run extension,
 * 19 -> 1-bit run extension followed by a pretree-coded delta. */
518 switch (pretree_sym) {
520 bitstream_put_bits(out, output_syms[i++], 4);
523 bitstream_put_bits(out, output_syms[i++], 5);
526 bitstream_put_bits(out, output_syms[i++], 1);
527 bitstream_put_bits(out,
528 pretree_codewords[output_syms[i]],
529 pretree_lens[output_syms[i]]);
539 /* Builds the canonical Huffman code for the main tree, the length tree, and the
540 * aligned offset tree. */
541 static void lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
542 struct lzx_codes *codes)
/* NOTE(review): the lens-array arguments to each call appear on lines that
 * are elided in this extract. */
544 make_canonical_huffman_code(LZX_MAINTREE_NUM_SYMBOLS,
545 LZX_MAX_CODEWORD_LEN,
546 freq_tabs->main_freq_table,
548 codes->main_codewords);
550 make_canonical_huffman_code(LZX_LENTREE_NUM_SYMBOLS,
551 LZX_MAX_CODEWORD_LEN,
552 freq_tabs->len_freq_table,
554 codes->len_codewords);
/* The aligned offset tree is limited to 8-bit codeword lengths (literal
 * `8` below), unlike the other trees which use LZX_MAX_CODEWORD_LEN. */
556 make_canonical_huffman_code(LZX_ALIGNEDTREE_NUM_SYMBOLS, 8,
557 freq_tabs->aligned_freq_table,
559 codes->aligned_codewords);
562 /* Do the 'E8' preprocessing, where the targets of x86 CALL instructions were
563 * changed from relative offsets to absolute offsets. This type of
564 * preprocessing can be used on any binary data even if it is not actually
565 * machine code. It seems to always be used in WIM files, even though there is
566 * no bit to indicate that it actually is used, unlike in the LZX compressed
567 * format as used in other file formats such as the cabinet format, where a bit
568 * is reserved for that purpose. */
569 static void do_call_insn_preprocessing(u8 uncompressed_data[],
570 uint uncompressed_data_len)
573 int file_size = LZX_MAGIC_FILESIZE;
577 /* Not enabled in the last 6 bytes, which means the 5-byte call
578 * instruction cannot start in the last *10* bytes. */
/* NOTE(review): `uncompressed_data_len - 10` is unsigned arithmetic; if the
 * length is ever < 10 this underflows to a huge bound and the loop would
 * read out of bounds.  lzx_compress() rejects inputs < 100 bytes, which
 * appears to guard this — TODO confirm no other caller exists. */
579 while (i < uncompressed_data_len - 10) {
580 if (uncompressed_data[i] != 0xe8) {
/* NOTE(review): type-punned, potentially misaligned int32_t access; relies
 * on compiler/platform tolerance (strict-aliasing and alignment) — a
 * memcpy-based load/store would be the strictly portable form. */
584 rel_offset = le32_to_cpu(*(int32_t*)(uncompressed_data + i + 1));
586 if (rel_offset >= -i && rel_offset < file_size) {
587 if (rel_offset < file_size - i) {
588 /* "good translation" */
589 abs_offset = rel_offset + i;
591 /* "compensating translation" */
592 abs_offset = rel_offset - file_size;
594 *(int32_t*)(uncompressed_data + i + 1) = cpu_to_le32(abs_offset);
/* Tuning parameters for the zlib-derived match finder (lz_analyze_block).
 * The min_match initializer (= 3 per the comment below) appears on a line
 * elided in this extract. */
601 static const struct lz_params lzx_lz_params = {
603 /* LZX_MIN_MATCH == 2, but 2-character matches are rarely useful; the
604 * minimum match for compression is set to 3 instead. */
607 .max_match = LZX_MAX_MATCH,
608 .good_match = LZX_MAX_MATCH,
609 .nice_match = LZX_MAX_MATCH,
610 .max_chain_len = LZX_MAX_MATCH,
611 .max_lazy_match = LZX_MAX_MATCH,
616 * Performs LZX compression on a block of data.
618 * @__uncompressed_data: Pointer to the data to be compressed.
619 * @uncompressed_len: Length, in bytes, of the data to be compressed.
620 * @compressed_data: Pointer to a location at least (@uncompressed_len - 1)
621 * bytes long into which the compressed data may be
623 * @compressed_len_ret: A pointer to an unsigned int into which the length of
624 * the compressed data may be returned.
626 * Returns zero if compression was successfully performed. In that case
627 * @compressed_data and @compressed_len_ret will contain the compressed data and
628 * its length. A return value of nonzero means that compressing the data did
629 * not reduce its size, and @compressed_data will not contain the full
/* NOTE(review): several lines (early-return bodies, ret checks after each
 * write step, and the closing of the function) are elided in this extract. */
632 int lzx_compress(const void *__uncompressed_data, uint uncompressed_len,
633 void *compressed_data, uint *compressed_len_ret)
635 struct output_bitstream ostream;
/* NOTE(review): these are variable-length arrays on the stack, sized by the
 * caller-supplied length.  WIM chunks are at most 32768 bytes, which keeps
 * this bounded — TODO confirm callers never pass larger inputs. */
636 u8 uncompressed_data[uncompressed_len + LZX_MAX_MATCH];
637 struct lzx_freq_tables freq_tabs;
638 struct lzx_codes codes;
639 u32 match_tab[uncompressed_len];
/* All three repeat offsets start at 1, per the LZX format. */
640 struct lru_queue queue = {.R0 = 1, .R1 = 1, .R2 = 1};
645 int block_type = LZX_BLOCKTYPE_ALIGNED;
647 LZX_DEBUG("uncompressed_len = %u", uncompressed_len);
/* Tiny inputs are not worth compressing (also guards the E8
 * preprocessing's length arithmetic). */
649 if (uncompressed_len < 100)
653 memset(&freq_tabs, 0, sizeof(freq_tabs));
655 /* The input data must be preprocessed. To avoid changing the original
656 * input, copy it to a temporary buffer. */
657 memcpy(uncompressed_data, __uncompressed_data, uncompressed_len);
660 /* Before doing any actual compression, do the call instruction (0xe8
661 * byte) translation on the uncompressed data. */
662 do_call_insn_preprocessing(uncompressed_data, uncompressed_len);
665 /* Determine the sequence of matches and literals that will be output,
666 * and in the process, keep counts of the number of times each symbol
667 * will be output, so that the Huffman trees can be made. */
669 num_matches = lz_analyze_block(uncompressed_data, uncompressed_len,
670 match_tab, lzx_record_match,
671 lzx_record_literal, &freq_tabs,
672 &queue, freq_tabs.main_freq_table,
675 LZX_DEBUG("using %u matches", num_matches);
677 lzx_make_huffman_codes(&freq_tabs, &codes);
679 /* Initialize the output bitstream. */
/* The output buffer is capped at (uncompressed_len - 1) bytes, so the
 * bitstream overflows — and this function fails — unless compression
 * actually shrinks the data. */
680 init_output_bitstream(&ostream, compressed_data, uncompressed_len - 1);
682 /* The first three bits tell us what kind of block it is, and are one
683 * of the LZX_BLOCKTYPE_* values. */
684 bitstream_put_bits(&ostream, block_type, 3);
686 /* The next bit indicates whether the block size is the default (32768),
687 * indicated by a 1 bit, or whether the block size is given by the next
688 * 16 bits, indicated by a 0 bit. */
689 if (uncompressed_len == 32768) {
690 bitstream_put_bits(&ostream, 1, 1);
692 bitstream_put_bits(&ostream, 0, 1);
693 bitstream_put_bits(&ostream, uncompressed_len, 16);
696 /* Write out the aligned offset tree. Note that some documentation
697 * incorrectly states that the aligned offset tree comes after the
698 * length tree; it actually comes before the main tree. */
699 if (block_type == LZX_BLOCKTYPE_ALIGNED)
700 for (i = 0; i < LZX_ALIGNEDTREE_NUM_SYMBOLS; i++)
701 bitstream_put_bits(&ostream, codes.aligned_lens[i],
702 LZX_ALIGNEDTREE_ELEMENT_SIZE);
704 /* Write the pre-tree and lengths for the first LZX_NUM_CHARS symbols in the
706 ret = lzx_write_compressed_tree(&ostream, codes.main_lens,
711 /* Write the pre-tree and symbols for the rest of the main tree. */
712 ret = lzx_write_compressed_tree(&ostream, codes.main_lens +
714 LZX_MAINTREE_NUM_SYMBOLS -
719 /* Write the pre-tree and symbols for the length tree. */
720 ret = lzx_write_compressed_tree(&ostream, codes.len_lens,
721 LZX_LENTREE_NUM_SYMBOLS);
725 /* Write the compressed literals. */
726 ret = lzx_write_compressed_literals(&ostream, block_type,
727 match_tab, num_matches, &codes);
/* Pad and flush any bits still buffered in the bitstream. */
731 ret = flush_output_bitstream(&ostream);
735 compressed_len = ostream.bit_output - (u8*)compressed_data;
737 LZX_DEBUG("Compressed %u => %u bytes",
738 uncompressed_len, compressed_len);
740 *compressed_len_ret = compressed_len;
742 #ifdef ENABLE_VERIFY_COMPRESSION
743 /* Verify that we really get the same thing back when decompressing. */
744 LZX_DEBUG("Verifying the compressed data.");
745 u8 buf[uncompressed_len];
746 ret = lzx_decompress(compressed_data, compressed_len, buf,
749 ERROR("lzx_compress(): Failed to decompress data we compressed");
753 for (i = 0; i < uncompressed_len; i++) {
754 if (buf[i] != *((u8*)__uncompressed_data + i)) {
755 ERROR("lzx_compress(): Data we compressed didn't "
756 "decompress to the original data (difference at "
757 "byte %u of %u)", i + 1, uncompressed_len);
761 LZX_DEBUG("Compression verified to be correct.");