/*
* Copyright (C) 2002 Matthew T. Russotto
- * Copyright (C) 2012 Eric Biggers
+ * Copyright (C) 2012, 2013 Eric Biggers
*
* This file is part of wimlib, a library for working with WIM files.
*
/*
- * This file provides lzx_compress(), a function to compress an in-memory buffer
- * of data using LZX compression, as used in the WIM file format.
+ * This file provides wimlib_lzx_compress(), a function to compress an in-memory
+ * buffer of data using LZX compression, as used in the WIM file format.
*
* Please see the comments in lzx-decompress.c for more information about this
* compression format.
* blocks from one input chunk is not yet implemented.
*/
+#include "wimlib.h"
#include "lzx.h"
#include "compress.h"
#include <stdlib.h>
};
struct lzx_freq_tables {
- u32 main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
- u32 len_freq_table[LZX_LENTREE_NUM_SYMBOLS];
- u32 aligned_freq_table[LZX_ALIGNEDTREE_NUM_SYMBOLS];
+ freq_t main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
+ freq_t len_freq_table[LZX_LENTREE_NUM_SYMBOLS];
+ freq_t aligned_freq_table[LZX_ALIGNEDTREE_NUM_SYMBOLS];
};
/* Returns the LZX position slot that corresponds to a given formatted offset.
* numbers in the lzx_position_base array to calculate the slot directly from
* the formatted offset without actually looking at the array.
*/
-static inline unsigned lzx_get_position_slot(unsigned formatted_offset)
+static inline unsigned
+lzx_get_position_slot(unsigned formatted_offset)
{
#if 0
/*
}
}
-static u32 lzx_record_literal(u8 literal, void *__main_freq_tab)
+static u32
+lzx_record_literal(u8 literal, void *__main_freq_tab)
{
- u32 *main_freq_tab = __main_freq_tab;
+ freq_t *main_freq_tab = __main_freq_tab;
main_freq_tab[literal]++;
return literal;
}
-/* Equivalent to lzx_extra_bits[position_slot] except position_slot must be
- * between 2 and 37 */
-static inline unsigned lzx_get_num_extra_bits(unsigned position_slot)
-{
-#if 0
- return lzx_extra_bits[position_slot];
-#endif
- wimlib_assert(position_slot >= 2 && position_slot <= 37);
- return (position_slot >> 1) - 1;
-}
-
/* Constructs a match from an offset and a length, and updates the LRU queue and
* the frequency of symbols in the main, length, and aligned offset alphabets.
* The return value is a 32-bit number that provides the match in an
* intermediate representation documented below. */
-static u32 lzx_record_match(unsigned match_offset, unsigned match_len,
- void *__freq_tabs, void *__queue)
+static u32
+lzx_record_match(unsigned match_offset, unsigned match_len,
+ void *__freq_tabs, void *__queue)
{
struct lzx_freq_tables *freq_tabs = __freq_tabs;
struct lru_queue *queue = __queue;
- unsigned formatted_offset;
unsigned position_slot;
unsigned position_footer = 0;
u32 match;
/* If possible, encode this offset as a repeated offset. */
if (match_offset == queue->R0) {
- formatted_offset = 0;
- position_slot = 0;
+ position_slot = 0;
} else if (match_offset == queue->R1) {
swap(queue->R0, queue->R1);
- formatted_offset = 1;
- position_slot = 1;
+ position_slot = 1;
} else if (match_offset == queue->R2) {
swap(queue->R0, queue->R2);
- formatted_offset = 2;
- position_slot = 2;
+ position_slot = 2;
} else {
/* Not a repeated offset. */
/* offsets of 0, 1, and 2 are reserved for the repeated offset
* codes, so non-repeated offsets must be encoded as 3+. The
* minimum offset is 1, so encode the offsets offset by 2. */
- formatted_offset = match_offset + LZX_MIN_MATCH;
+ unsigned formatted_offset = match_offset + LZX_MIN_MATCH;
queue->R2 = queue->R1;
queue->R1 = queue->R0;
* @codes: Pointer to a structure that contains the codewords for the
* main, length, and aligned offset Huffman codes.
*/
-static int lzx_write_match(struct output_bitstream *out, int block_type,
- u32 match, const struct lzx_codes *codes)
+static int
+lzx_write_match(struct output_bitstream *out, int block_type,
+ u32 match, const struct lzx_codes *codes)
{
/* low 8 bits are the match length minus 2 */
unsigned match_len_minus_2 = match & 0xff;
wimlib_assert(position_slot < LZX_NUM_POSITION_SLOTS);
- num_extra_bits = lzx_extra_bits[position_slot];
+ num_extra_bits = lzx_get_num_extra_bits(position_slot);
/* For aligned offset blocks with at least 3 extra bits, output the
* verbatim bits literally, then the aligned bits encoded using the
* @codes: Pointer to a structure that contains the codewords for the
* main, length, and aligned offset Huffman codes.
*/
-static int lzx_write_compressed_literals(struct output_bitstream *ostream,
- int block_type,
- const u32 match_tab[],
- unsigned num_compressed_literals,
- const struct lzx_codes *codes)
+static int
+lzx_write_compressed_literals(struct output_bitstream *ostream,
+ int block_type,
+ const u32 match_tab[],
+ unsigned num_compressed_literals,
+ const struct lzx_codes *codes)
{
unsigned i;
u32 match;
* @lens: The code lengths for the Huffman tree, indexed by symbol.
* @num_symbols: The number of symbols in the code.
*/
-static int lzx_write_compressed_tree(struct output_bitstream *out,
- const u8 lens[], unsigned num_symbols)
+static int
+lzx_write_compressed_tree(struct output_bitstream *out,
+ const u8 lens[], unsigned num_symbols)
{
/* Frequencies of the length symbols, including the RLE symbols (NOT the
* actual lengths themselves). */
- unsigned pretree_freqs[LZX_PRETREE_NUM_SYMBOLS];
+ freq_t pretree_freqs[LZX_PRETREE_NUM_SYMBOLS];
u8 pretree_lens[LZX_PRETREE_NUM_SYMBOLS];
u16 pretree_codewords[LZX_PRETREE_NUM_SYMBOLS];
u8 output_syms[num_symbols * 2];
/* Builds the canonical Huffman code for the main tree, the length tree, and the
* aligned offset tree. */
-static void lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
- struct lzx_codes *codes)
+static void
+lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
+ struct lzx_codes *codes)
{
make_canonical_huffman_code(LZX_MAINTREE_NUM_SYMBOLS,
LZX_MAX_CODEWORD_LEN,
codes->aligned_codewords);
}
-/* Do the 'E8' preprocessing, where the targets of x86 CALL instructions were
- * changed from relative offsets to absolute offsets. This type of
- * preprocessing can be used on any binary data even if it is not actually
- * machine code. It seems to always be used in WIM files, even though there is
- * no bit to indicate that it actually is used, unlike in the LZX compressed
- * format as used in other file formats such as the cabinet format, where a bit
- * is reserved for that purpose. */
-static void do_call_insn_preprocessing(u8 uncompressed_data[],
- unsigned uncompressed_data_len)
+static void
+do_call_insn_translation(u32 *call_insn_target, int input_pos,
+ int32_t file_size)
{
- int i = 0;
- int file_size = LZX_MAGIC_FILESIZE;
- int32_t rel_offset;
int32_t abs_offset;
+ int32_t rel_offset;
- /* Not enabled in the last 6 bytes, which means the 5-byte call
- * instruction cannot start in the last *10* bytes. */
- while (i < uncompressed_data_len - 10) {
- if (uncompressed_data[i] != 0xe8) {
- i++;
- continue;
+ rel_offset = le32_to_cpu(*call_insn_target);
+ if (rel_offset >= -input_pos && rel_offset < file_size) {
+ if (rel_offset < file_size - input_pos) {
+ /* "good translation" */
+ abs_offset = rel_offset + input_pos;
+ } else {
+ /* "compensating translation" */
+ abs_offset = rel_offset - file_size;
}
- rel_offset = le32_to_cpu(*(int32_t*)(uncompressed_data + i + 1));
-
- if (rel_offset >= -i && rel_offset < file_size) {
- if (rel_offset < file_size - i) {
- /* "good translation" */
- abs_offset = rel_offset + i;
- } else {
- /* "compensating translation" */
- abs_offset = rel_offset - file_size;
- }
- *(int32_t*)(uncompressed_data + i + 1) = cpu_to_le32(abs_offset);
+ *call_insn_target = cpu_to_le32(abs_offset);
+ }
+}
+
+/* This is the reverse of undo_call_insn_preprocessing() in lzx-decompress.c.
+ * See the comment above that function for more information. */
+static void
+do_call_insn_preprocessing(u8 uncompressed_data[], int uncompressed_data_len)
+{
+ for (int i = 0; i < uncompressed_data_len - 10; i++) {
+ if (uncompressed_data[i] == 0xe8) {
+ do_call_insn_translation((u32*)&uncompressed_data[i + 1],
+ i,
+ LZX_WIM_MAGIC_FILESIZE);
+ i += 4;
}
- i += 5;
}
}
.too_far = 4096,
};
-/*
- * Performs LZX compression on a block of data.
- *
- * @__uncompressed_data: Pointer to the data to be compressed.
- * @uncompressed_len: Length, in bytes, of the data to be compressed.
- * @compressed_data: Pointer to a location at least (@uncompressed_len - 1)
- * bytes long into which the compressed data may be
- * written.
- * @compressed_len_ret: A pointer to an unsigned int into which the length of
- * the compressed data may be returned.
- *
- * Returns zero if compression was successfully performed. In that case
- * @compressed_data and @compressed_len_ret will contain the compressed data and
- * its length. A return value of nonzero means that compressing the data did
- * not reduce its size, and @compressed_data will not contain the full
- * compressed data.
- */
-int lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
- void *compressed_data, unsigned *compressed_len_ret)
+/* Documented in wimlib.h */
+WIMLIBAPI unsigned
+wimlib_lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
+ void *compressed_data)
{
struct output_bitstream ostream;
- u8 uncompressed_data[uncompressed_len + LZX_MAX_MATCH];
+ u8 uncompressed_data[uncompressed_len + 8];
struct lzx_freq_tables freq_tabs;
struct lzx_codes codes;
u32 match_tab[uncompressed_len];
int ret;
int block_type = LZX_BLOCKTYPE_ALIGNED;
+ wimlib_assert(uncompressed_len <= 32768);
+
if (uncompressed_len < 100)
- return 1;
+ return 0;
memset(&freq_tabs, 0, sizeof(freq_tabs));
queue.R0 = 1;
/* The input data must be preprocessed. To avoid changing the original
* input, copy it to a temporary buffer. */
memcpy(uncompressed_data, __uncompressed_data, uncompressed_len);
+ memset(uncompressed_data + uncompressed_len, 0, 8);
/* Before doing any actual compression, do the call instruction (0xe8
* byte) translation on the uncompressed data. */
* main tree. */
ret = lzx_write_compressed_tree(&ostream, codes.main_lens,
LZX_NUM_CHARS);
- if (ret != 0)
- return ret;
+ if (ret)
+ return 0;
/* Write the pre-tree and symbols for the rest of the main tree. */
ret = lzx_write_compressed_tree(&ostream, codes.main_lens +
LZX_NUM_CHARS,
LZX_MAINTREE_NUM_SYMBOLS -
LZX_NUM_CHARS);
- if (ret != 0)
- return ret;
+ if (ret)
+ return 0;
/* Write the pre-tree and symbols for the length tree. */
ret = lzx_write_compressed_tree(&ostream, codes.len_lens,
LZX_LENTREE_NUM_SYMBOLS);
- if (ret != 0)
- return ret;
+ if (ret)
+ return 0;
/* Write the compressed literals. */
ret = lzx_write_compressed_literals(&ostream, block_type,
match_tab, num_matches, &codes);
- if (ret != 0)
- return ret;
+ if (ret)
+ return 0;
ret = flush_output_bitstream(&ostream);
- if (ret != 0)
- return ret;
+ if (ret)
+ return 0;
compressed_len = ostream.bit_output - (u8*)compressed_data;
- *compressed_len_ret = compressed_len;
-
#ifdef ENABLE_VERIFY_COMPRESSION
/* Verify that we really get the same thing back when decompressing. */
- u8 buf[uncompressed_len];
- ret = lzx_decompress(compressed_data, compressed_len, buf,
- uncompressed_len);
- if (ret != 0) {
- ERROR("lzx_compress(): Failed to decompress data we compressed");
- abort();
- }
-
- for (i = 0; i < uncompressed_len; i++) {
- if (buf[i] != *((u8*)__uncompressed_data + i)) {
- ERROR("lzx_compress(): Data we compressed didn't "
- "decompress to the original data (difference at "
- "byte %u of %u)", i + 1, uncompressed_len);
+ {
+ u8 buf[uncompressed_len];
+ ret = wimlib_lzx_decompress(compressed_data, compressed_len,
+ buf, uncompressed_len);
+ if (ret != 0) {
+ ERROR("lzx_compress(): Failed to decompress data we compressed");
abort();
}
+
+ for (i = 0; i < uncompressed_len; i++) {
+ if (buf[i] != *((u8*)__uncompressed_data + i)) {
+ ERROR("lzx_compress(): Data we compressed didn't "
+ "decompress to the original data (difference at "
+ "byte %u of %u)", i + 1, uncompressed_len);
+ abort();
+ }
+ }
}
#endif
- return 0;
+ return compressed_len;
}