/*
* lzx-comp.c
*
- * LZX compression routines.
+ * LZX compression routines.
*
* This code was originally based on code written by Matthew T. Russotto
* (liblzxcomp).
- *
+ */
+
+/*
* Copyright (C) 2002 Matthew T. Russotto
* Copyright (C) 2012 Eric Biggers
*
- * wimlib - Library for working with WIM files
+ * This file is part of wimlib, a library for working with WIM files.
*
- * This library is free software; you can redistribute it and/or modify it under
- * the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation; either version 2.1 of the License, or (at your option) any
- * later version.
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
*
- * This library is distributed in the hope that it will be useful, but WITHOUT ANY
- * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
*
- * You should have received a copy of the GNU Lesser General Public License along
- * with this library; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
*/
-/*
+/*
* This file provides lzx_compress(), a function to compress an in-memory buffer
* of data using LZX compression, as used in the WIM file format.
*
* tricky to understand. Basically it is the following:
*
* - Preprocess the input data (LZX-specific)
- * - Go through the input data and determine matches. This part is based on
+ * - Go through the input data and determine matches. This part is based on
* code from zlib, and a hash table of 3-character strings is used to
* accelerate the process of finding matches.
* - Build the Huffman trees based on the frequencies of symbols determined
#include "lzx.h"
#include "comp.h"
-#include "huffman.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
};
struct lzx_freq_tables {
- u32 main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
+ u32 main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
u32 len_freq_table[LZX_LENTREE_NUM_SYMBOLS];
u32 aligned_freq_table[LZX_ALIGNEDTREE_NUM_SYMBOLS];
};
int mid;
/* Calculate position base using binary search of table; if log2 can be
- * done in hardware, approximation might work;
+ * done in hardware, approximation might work;
* trunc(log2(formatted_offset*formatted_offset)) gets either the proper
* position slot or the next one, except for slots 0, 1, and 39-49
*
* Slots 0-1 are handled by the R0-R1 procedures
*
- * Slots 36-49 (formatted_offset >= 262144) can be found by
+ * Slots 36-49 (formatted_offset >= 262144) can be found by
* (formatted_offset/131072) + 34 == (formatted_offset >> 17) + 34;
*/
if (formatted_offset >= 262144) {
* alphabets. The return value is a 32-bit integer that, if the high bit is
* set, contains the match length, the position slot, and the position footer
* for the match. */
-static u32 lzx_record_match(uint match_offset, uint match_len,
+static u32 lzx_record_match(uint match_offset, uint match_len,
void *__freq_tabs, void *__queue)
{
struct lzx_freq_tables *freq_tabs = __freq_tabs;
return match;
}
-/*
+/*
* Writes a compressed literal match to the output.
*
* @out: The output bitstream.
main_symbol = len_pos_header + LZX_NUM_CHARS;
/* Output main symbol. */
- ret = bitstream_put_bits(out, codes->main_codewords[main_symbol],
+ ret = bitstream_put_bits(out, codes->main_codewords[main_symbol],
codes->main_lens[main_symbol]);
if (ret != 0)
return ret;
* aligned offset tree. Otherwise, only the verbatim bits need to be
* output. */
if ((block_type == LZX_BLOCKTYPE_ALIGNED) && (num_extra_bits >= 3)) {
-
+
verbatim_bits = position_footer >> 3;
- ret = bitstream_put_bits(out, verbatim_bits,
+ ret = bitstream_put_bits(out, verbatim_bits,
num_extra_bits - 3);
if (ret != 0)
return ret;
aligned_bits = (position_footer & 7);
- ret = bitstream_put_bits(out,
+ ret = bitstream_put_bits(out,
codes->aligned_codewords[aligned_bits],
codes->aligned_lens[aligned_bits]);
if (ret != 0)
return 0;
}
-/*
+/*
* Writes all compressed literals in a block, both matches and literal bytes, to
* the output bitstream.
*
* @codes: Pointer to a structure that contains the codewords for the
* main, length, and aligned offset Huffman codes.
*/
-static int lzx_write_compressed_literals(struct output_bitstream *ostream,
+static int lzx_write_compressed_literals(struct output_bitstream *ostream,
int block_type,
- const u32 match_tab[],
+ const u32 match_tab[],
uint num_compressed_literals,
const struct lzx_codes *codes)
{
* actual match (1) or a literal uncompressed byte (0) */
if (match & 0x80000000) {
/* match */
- ret = lzx_write_match(ostream, block_type, match,
+ ret = lzx_write_match(ostream, block_type, match,
codes);
if (ret != 0)
return ret;
} else {
/* literal byte */
wimlib_assert(match < LZX_NUM_CHARS);
- ret = bitstream_put_bits(ostream,
+ ret = bitstream_put_bits(ostream,
codes->main_codewords[match],
codes->main_lens[match]);
if (ret != 0)
return 0;
}
-/*
+/*
* Writes a compressed Huffman tree to the output, preceded by the pretree for
* it.
*
* @lens: The code lengths for the Huffman tree, indexed by symbol.
* @num_symbols: The number of symbols in the code.
*/
-static int lzx_write_compressed_tree(struct output_bitstream *out,
- const u8 lens[],
+static int lzx_write_compressed_tree(struct output_bitstream *out,
+ const u8 lens[],
uint num_symbols)
{
/* Frequencies of the length symbols, including the RLE symbols (NOT the
if (delta < 0)
delta += 17;
pretree_freqs[19]++;
- pretree_freqs[delta]++;
+ pretree_freqs[(unsigned char)delta]++;
output_syms[output_syms_idx++] = 19;
output_syms[output_syms_idx++] = additional_bits;
output_syms[output_syms_idx++] = delta;
if (delta < 0)
delta += 17;
- pretree_freqs[delta]++;
+ pretree_freqs[(unsigned char)delta]++;
output_syms[output_syms_idx++] = delta;
}
/* Build the pretree from the frequencies of the length symbols. */
- make_canonical_huffman_code(LZX_PRETREE_NUM_SYMBOLS,
- LZX_MAX_CODEWORD_LEN,
- pretree_freqs, pretree_lens,
+ make_canonical_huffman_code(LZX_PRETREE_NUM_SYMBOLS,
+ LZX_MAX_CODEWORD_LEN,
+ pretree_freqs, pretree_lens,
pretree_codewords);
/* Write the lengths of the pretree codes to the output. */
for (i = 0; i < LZX_PRETREE_NUM_SYMBOLS; i++)
- bitstream_put_bits(out, pretree_lens[i],
+ bitstream_put_bits(out, pretree_lens[i],
LZX_PRETREE_ELEMENT_SIZE);
/* Write the length symbols, encoded with the pretree, to the output. */
while (i < output_syms_idx) {
pretree_sym = output_syms[i++];
- bitstream_put_bits(out, pretree_codewords[pretree_sym],
+ bitstream_put_bits(out, pretree_codewords[pretree_sym],
pretree_lens[pretree_sym]);
switch (pretree_sym) {
case 17:
break;
case 19:
bitstream_put_bits(out, output_syms[i++], 1);
- bitstream_put_bits(out,
+ bitstream_put_bits(out,
pretree_codewords[output_syms[i]],
pretree_lens[output_syms[i]]);
i++;
static void lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
struct lzx_codes *codes)
{
- make_canonical_huffman_code(LZX_MAINTREE_NUM_SYMBOLS,
+ make_canonical_huffman_code(LZX_MAINTREE_NUM_SYMBOLS,
LZX_MAX_CODEWORD_LEN,
- freq_tabs->main_freq_table,
+ freq_tabs->main_freq_table,
codes->main_lens,
codes->main_codewords);
- make_canonical_huffman_code(LZX_LENTREE_NUM_SYMBOLS,
+ make_canonical_huffman_code(LZX_LENTREE_NUM_SYMBOLS,
LZX_MAX_CODEWORD_LEN,
- freq_tabs->len_freq_table,
+ freq_tabs->len_freq_table,
codes->len_lens,
codes->len_codewords);
make_canonical_huffman_code(LZX_ALIGNEDTREE_NUM_SYMBOLS, 8,
- freq_tabs->aligned_freq_table,
+ freq_tabs->aligned_freq_table,
codes->aligned_lens,
codes->aligned_codewords);
}
* no bit to indicate that it actually is used, unlike in the LZX compressed
* format as used in other file formats such as the cabinet format, where a bit
* is reserved for that purpose. */
-static void do_call_insn_preprocessing(u8 uncompressed_data[],
+static void do_call_insn_preprocessing(u8 uncompressed_data[],
uint uncompressed_data_len)
{
int i = 0;
/* Not enabled in the last 6 bytes, which means the 5-byte call
* instruction cannot start in the last *10* bytes. */
- while (i < uncompressed_data_len - 10) {
+ while (i < uncompressed_data_len - 10) {
if (uncompressed_data[i] != 0xe8) {
i++;
continue;
}
- rel_offset = to_le32(*(int32_t*)(uncompressed_data + i + 1));
+ rel_offset = le32_to_cpu(*(int32_t*)(uncompressed_data + i + 1));
if (rel_offset >= -i && rel_offset < file_size) {
if (rel_offset < file_size - i) {
/* "compensating translation" */
abs_offset = rel_offset - file_size;
}
- *(int32_t*)(uncompressed_data + i + 1) = to_le32(abs_offset);
+ *(int32_t*)(uncompressed_data + i + 1) = cpu_to_le32(abs_offset);
}
i += 5;
}
static const struct lz_params lzx_lz_params = {
+
+ /* LZX_MIN_MATCH == 2, but 2-character matches are rarely useful; the
+ * minimum match for compression is set to 3 instead. */
.min_match = 3,
+
.max_match = LZX_MAX_MATCH,
.good_match = LZX_MAX_MATCH,
.nice_match = LZX_MAX_MATCH,
.too_far = 4096,
};
-/*
+/*
* Performs LZX compression on a block of data.
*
* @__uncompressed_data: Pointer to the data to be compressed.
* @compressed_data and @compressed_len_ret will contain the compressed data and
* its length. A return value of nonzero means that compressing the data did
* not reduce its size, and @compressed_data will not contain the full
- * compressed data.
+ * compressed data.
*/
int lzx_compress(const void *__uncompressed_data, uint uncompressed_len,
void *compressed_data, uint *compressed_len_ret)
uint compressed_len;
uint i;
int ret;
+ int block_type = LZX_BLOCKTYPE_ALIGNED;
- LZX_DEBUG("uncompressed_len = %u\n", uncompressed_len);
+ LZX_DEBUG("uncompressed_len = %u", uncompressed_len);
if (uncompressed_len < 100)
return 1;
&queue, freq_tabs.main_freq_table,
&lzx_lz_params);
- LZX_DEBUG("using %u matches\n", num_matches);
-
+ LZX_DEBUG("using %u matches", num_matches);
lzx_make_huffman_codes(&freq_tabs, &codes);
/* The first three bits tell us what kind of block it is, and are one
* of the LZX_BLOCKTYPE_* values. */
- bitstream_put_bits(&ostream, LZX_BLOCKTYPE_ALIGNED, 3);
+ bitstream_put_bits(&ostream, block_type, 3);
/* The next bit indicates whether the block size is the default (32768),
* indicated by a 1 bit, or whether the block size is given by the next
/* Write out the aligned offset tree. Note that M$ lies and says that
* the aligned offset tree comes after the length tree, but that is
* wrong; it actually is before the main tree. */
- for (i = 0; i < LZX_ALIGNEDTREE_NUM_SYMBOLS; i++)
- bitstream_put_bits(&ostream, codes.aligned_lens[i],
- LZX_ALIGNEDTREE_ELEMENT_SIZE);
+ if (block_type == LZX_BLOCKTYPE_ALIGNED)
+ for (i = 0; i < LZX_ALIGNEDTREE_NUM_SYMBOLS; i++)
+ bitstream_put_bits(&ostream, codes.aligned_lens[i],
+ LZX_ALIGNEDTREE_ELEMENT_SIZE);
/* Write the pre-tree and lengths for the first LZX_NUM_CHARS symbols in the
* main tree. */
- ret = lzx_write_compressed_tree(&ostream, codes.main_lens,
+ ret = lzx_write_compressed_tree(&ostream, codes.main_lens,
LZX_NUM_CHARS);
if (ret != 0)
return ret;
/* Write the pre-tree and symbols for the rest of the main tree. */
- ret = lzx_write_compressed_tree(&ostream, codes.main_lens +
- LZX_NUM_CHARS,
- LZX_MAINTREE_NUM_SYMBOLS -
+ ret = lzx_write_compressed_tree(&ostream, codes.main_lens +
+ LZX_NUM_CHARS,
+ LZX_MAINTREE_NUM_SYMBOLS -
LZX_NUM_CHARS);
if (ret != 0)
return ret;
/* Write the pre-tree and symbols for the length tree. */
- ret = lzx_write_compressed_tree(&ostream, codes.len_lens,
+ ret = lzx_write_compressed_tree(&ostream, codes.len_lens,
LZX_LENTREE_NUM_SYMBOLS);
if (ret != 0)
return ret;
/* Write the compressed literals. */
- ret = lzx_write_compressed_literals(&ostream, LZX_BLOCKTYPE_ALIGNED,
+ ret = lzx_write_compressed_literals(&ostream, block_type,
match_tab, num_matches, &codes);
if (ret != 0)
return ret;
compressed_len = ostream.bit_output - (u8*)compressed_data;
- LZX_DEBUG("Compressed %u => %u bytes\n",
- uncompressed_len, compressed_len);
+ LZX_DEBUG("Compressed %u => %u bytes",
+ uncompressed_len, compressed_len);
*compressed_len_ret = compressed_len;
#ifdef ENABLE_VERIFY_COMPRESSION
/* Verify that we really get the same thing back when decompressing. */
+ LZX_DEBUG("Verifying the compressed data.");
u8 buf[uncompressed_len];
- ret = lzx_decompress(compressed_data, compressed_len, buf,
+ ret = lzx_decompress(compressed_data, compressed_len, buf,
uncompressed_len);
if (ret != 0) {
- ERROR("ERROR: Failed to decompress data we compressed!\n");
- exit(0);
+ ERROR("lzx_compress(): Failed to decompress data we compressed");
abort();
}
for (i = 0; i < uncompressed_len; i++) {
if (buf[i] != *((u8*)__uncompressed_data + i)) {
- ERROR("Data we compressed didn't decompress to "
- "the original data (difference at byte %u of "
- "%u)\n", i + 1, uncompressed_len);
+ ERROR("lzx_compress(): Data we compressed didn't "
+ "decompress to the original data (difference at "
+ "byte %u of %u)", i + 1, uncompressed_len);
abort();
}
}
- LZX_DEBUG("Compression verified to be correct.\n");
+ LZX_DEBUG("Compression verified to be correct.");
#endif
return 0;