Get imagex extract --to-stdout working

[wimlib] / src / lzx-compress.c
diff --git a/src/lzx-compress.c b/src/lzx-compress.c

index b18004684658addcad9575df430bb4332af71bfd..b23c2df088642ce2d9001ebeba130d8fbba504d8 100644 (file)
--- a/src/lzx-compress.c
+++ b/src/lzx-compress.c
@@ -7,7 +7,7 @@
  
  /*
   * Copyright (C) 2002 Matthew T. Russotto
- * Copyright (C) 2012 Eric Biggers
+ * Copyright (C) 2012, 2013 Eric Biggers
   *
   * This file is part of wimlib, a library for working with WIM files.
   *
@@ -27,8 +27,8 @@
  
  
  /*
- * This file provides lzx_compress(), a function to compress an in-memory buffer
- * of data using LZX compression, as used in the WIM file format.
+ * This file provides wimlib_lzx_compress(), a function to compress an in-memory
+ * buffer of data using LZX compression, as used in the WIM file format.
   *
   * Please see the comments in lzx-decompress.c for more information about this
   * compression format.
@@ -57,6 +57,7 @@
   * blocks from one input chunk is not yet implemented.
   */
  
+#include "wimlib.h"
  #include "lzx.h"
  #include "compress.h"
  #include <stdlib.h>
@@ -77,9 +78,9 @@ struct lzx_codes {
  };
  
  struct lzx_freq_tables {
-       u32 main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
-       u32 len_freq_table[LZX_LENTREE_NUM_SYMBOLS];
-       u32 aligned_freq_table[LZX_ALIGNEDTREE_NUM_SYMBOLS];
+       freq_t main_freq_table[LZX_MAINTREE_NUM_SYMBOLS];
+       freq_t len_freq_table[LZX_LENTREE_NUM_SYMBOLS];
+       freq_t aligned_freq_table[LZX_ALIGNEDTREE_NUM_SYMBOLS];
  };
  
  /* Returns the LZX position slot that corresponds to a given formatted offset.
@@ -91,7 +92,8 @@ struct lzx_freq_tables {
   * numbers in the lzx_position_base array to calculate the slot directly from
   * the formatted offset without actually looking at the array.
   */
-static inline unsigned lzx_get_position_slot(unsigned formatted_offset)
+static inline unsigned
+lzx_get_position_slot(unsigned formatted_offset)
  {
  #if 0
         /*
@@ -120,34 +122,24 @@ static inline unsigned lzx_get_position_slot(unsigned formatted_offset)
         }
  }
  
-static u32 lzx_record_literal(u8 literal, void *__main_freq_tab)
+static u32
+lzx_record_literal(u8 literal, void *__main_freq_tab)
  {
-       u32 *main_freq_tab = __main_freq_tab;
+       freq_t *main_freq_tab = __main_freq_tab;
         main_freq_tab[literal]++;
         return literal;
  }
  
-/* Equivalent to lzx_extra_bits[position_slot] except position_slot must be
- * between 2 and 37 */
-static inline unsigned lzx_get_num_extra_bits(unsigned position_slot)
-{
-#if 0
-       return lzx_extra_bits[position_slot];
-#endif
-       wimlib_assert(position_slot >= 2 && position_slot <= 37);
-       return (position_slot >> 1) - 1;
-}
-
  /* Constructs a match from an offset and a length, and updates the LRU queue and
   * the frequency of symbols in the main, length, and aligned offset alphabets.
   * The return value is a 32-bit number that provides the match in an
   * intermediate representation documented below. */
-static u32 lzx_record_match(unsigned match_offset, unsigned match_len,
-                           void *__freq_tabs, void *__queue)
+static u32
+lzx_record_match(unsigned match_offset, unsigned match_len,
+                void *__freq_tabs, void *__queue)
  {
         struct lzx_freq_tables *freq_tabs = __freq_tabs;
         struct lru_queue *queue = __queue;
-       unsigned formatted_offset;
         unsigned position_slot;
         unsigned position_footer = 0;
         u32 match;
@@ -161,23 +153,20 @@ static u32 lzx_record_match(unsigned match_offset, unsigned match_len,
  
         /* If possible, encode this offset as a repeated offset. */
         if (match_offset == queue->R0) {
-               formatted_offset = 0;
-               position_slot    = 0;
+               position_slot = 0;
         } else if (match_offset == queue->R1) {
                 swap(queue->R0, queue->R1);
-               formatted_offset = 1;
-               position_slot    = 1;
+               position_slot = 1;
         } else if (match_offset == queue->R2) {
                 swap(queue->R0, queue->R2);
-               formatted_offset = 2;
-               position_slot    = 2;
+               position_slot = 2;
         } else {
                 /* Not a repeated offset. */
  
                 /* offsets of 0, 1, and 2 are reserved for the repeated offset
                  * codes, so non-repeated offsets must be encoded as 3+.  The
                  * minimum offset is 1, so encode the offsets offset by 2. */
-               formatted_offset = match_offset + LZX_MIN_MATCH;
+               unsigned formatted_offset = match_offset + LZX_MIN_MATCH;
  
                 queue->R2 = queue->R1;
                 queue->R1 = queue->R0;
@@ -256,8 +245,9 @@ static u32 lzx_record_match(unsigned match_offset, unsigned match_len,
   * @codes:     Pointer to a structure that contains the codewords for the
   *                     main, length, and aligned offset Huffman codes.
   */
-static int lzx_write_match(struct output_bitstream *out, int block_type,
-                          u32 match, const struct lzx_codes *codes)
+static int
+lzx_write_match(struct output_bitstream *out, int block_type,
+               u32 match, const struct lzx_codes *codes)
  {
         /* low 8 bits are the match length minus 2 */
         unsigned match_len_minus_2 = match & 0xff;
@@ -320,7 +310,7 @@ static int lzx_write_match(struct output_bitstream *out, int block_type,
  
         wimlib_assert(position_slot < LZX_NUM_POSITION_SLOTS);
  
-       num_extra_bits = lzx_extra_bits[position_slot];
+       num_extra_bits = lzx_get_num_extra_bits(position_slot);
  
         /* For aligned offset blocks with at least 3 extra bits, output the
          * verbatim bits literally, then the aligned bits encoded using the
@@ -362,11 +352,12 @@ static int lzx_write_match(struct output_bitstream *out, int block_type,
   * @codes:     Pointer to a structure that contains the codewords for the
   *                     main, length, and aligned offset Huffman codes.
   */
-static int lzx_write_compressed_literals(struct output_bitstream *ostream,
-                                        int block_type,
-                                        const u32 match_tab[],
-                                        unsigned  num_compressed_literals,
-                                        const struct lzx_codes *codes)
+static int
+lzx_write_compressed_literals(struct output_bitstream *ostream,
+                             int block_type,
+                             const u32 match_tab[],
+                             unsigned  num_compressed_literals,
+                             const struct lzx_codes *codes)
  {
         unsigned i;
         u32 match;
@@ -412,12 +403,13 @@ static int lzx_write_compressed_literals(struct output_bitstream *ostream,
   * @lens:      The code lengths for the Huffman tree, indexed by symbol.
   * @num_symbols:       The number of symbols in the code.
   */
-static int lzx_write_compressed_tree(struct output_bitstream *out,
-                                    const u8 lens[], unsigned num_symbols)
+static int
+lzx_write_compressed_tree(struct output_bitstream *out,
+                         const u8 lens[], unsigned num_symbols)
  {
         /* Frequencies of the length symbols, including the RLE symbols (NOT the
          * actual lengths themselves). */
-       unsigned pretree_freqs[LZX_PRETREE_NUM_SYMBOLS];
+       freq_t pretree_freqs[LZX_PRETREE_NUM_SYMBOLS];
         u8 pretree_lens[LZX_PRETREE_NUM_SYMBOLS];
         u16 pretree_codewords[LZX_PRETREE_NUM_SYMBOLS];
         u8 output_syms[num_symbols * 2];
@@ -575,8 +567,9 @@ static int lzx_write_compressed_tree(struct output_bitstream *out,
  
  /* Builds the canonical Huffman code for the main tree, the length tree, and the
   * aligned offset tree. */
-static void lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
-                               struct lzx_codes *codes)
+static void
+lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
+                      struct lzx_codes *codes)
  {
         make_canonical_huffman_code(LZX_MAINTREE_NUM_SYMBOLS,
                                         LZX_MAX_CODEWORD_LEN,
@@ -596,41 +589,38 @@ static void lzx_make_huffman_codes(const struct lzx_freq_tables *freq_tabs,
                                         codes->aligned_codewords);
  }
  
-/* Do the 'E8' preprocessing, where the targets of x86 CALL instructions were
- * changed from relative offsets to absolute offsets.  This type of
- * preprocessing can be used on any binary data even if it is not actually
- * machine code.  It seems to always be used in WIM files, even though there is
- * no bit to indicate that it actually is used, unlike in the LZX compressed
- * format as used in other file formats such as the cabinet format, where a bit
- * is reserved for that purpose. */
-static void do_call_insn_preprocessing(u8 uncompressed_data[],
-                                      unsigned uncompressed_data_len)
+static void
+do_call_insn_translation(u32 *call_insn_target, int input_pos,
+                        int32_t file_size)
  {
-       int i = 0;
-       int file_size = LZX_MAGIC_FILESIZE;
-       int32_t rel_offset;
         int32_t abs_offset;
+       int32_t rel_offset;
  
-       /* Not enabled in the last 6 bytes, which means the 5-byte call
-        * instruction cannot start in the last *10* bytes. */
-       while (i < uncompressed_data_len - 10) {
-               if (uncompressed_data[i] != 0xe8) {
-                       i++;
-                       continue;
+       rel_offset = le32_to_cpu(*call_insn_target);
+       if (rel_offset >= -input_pos && rel_offset < file_size) {
+               if (rel_offset < file_size - input_pos) {
+                       /* "good translation" */
+                       abs_offset = rel_offset + input_pos;
+               } else {
+                       /* "compensating translation" */
+                       abs_offset = rel_offset - file_size;
                 }
-               rel_offset = le32_to_cpu(*(int32_t*)(uncompressed_data + i + 1));
-
-               if (rel_offset >= -i && rel_offset < file_size) {
-                       if (rel_offset < file_size - i) {
-                               /* "good translation" */
-                               abs_offset = rel_offset + i;
-                       } else {
-                               /* "compensating translation" */
-                               abs_offset = rel_offset - file_size;
-                       }
-                       *(int32_t*)(uncompressed_data + i + 1) = cpu_to_le32(abs_offset);
+               *call_insn_target = cpu_to_le32(abs_offset);
+       }
+}
+
+/* This is the reverse of undo_call_insn_preprocessing() in lzx-decompress.c.
+ * See the comment above that function for more information. */
+static void
+do_call_insn_preprocessing(u8 uncompressed_data[], int uncompressed_data_len)
+{
+       for (int i = 0; i < uncompressed_data_len - 10; i++) {
+               if (uncompressed_data[i] == 0xe8) {
+                       do_call_insn_translation((u32*)&uncompressed_data[i + 1],
+                                                i,
+                                                LZX_WIM_MAGIC_FILESIZE);
+                       i += 4;
                 }
-               i += 5;
         }
  }
  
@@ -649,28 +639,13 @@ static const struct lz_params lzx_lz_params = {
         .too_far        = 4096,
  };
  
-/*
- * Performs LZX compression on a block of data.
- *
- * @__uncompressed_data:  Pointer to the data to be compressed.
- * @uncompressed_len:    Length, in bytes, of the data to be compressed.
- * @compressed_data:     Pointer to a location at least (@uncompressed_len - 1)
- *                             bytes long into which the compressed data may be
- *                             written.
- * @compressed_len_ret:          A pointer to an unsigned int into which the length of
- *                             the compressed data may be returned.
- *
- * Returns zero if compression was successfully performed.  In that case
- * @compressed_data and @compressed_len_ret will contain the compressed data and
- * its length.  A return value of nonzero means that compressing the data did
- * not reduce its size, and @compressed_data will not contain the full
- * compressed data.
- */
-int lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
-                void *compressed_data, unsigned *compressed_len_ret)
+/* Documented in wimlib.h */
+WIMLIBAPI unsigned
+wimlib_lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
+                   void *compressed_data)
  {
         struct output_bitstream ostream;
-       u8 uncompressed_data[uncompressed_len + LZX_MAX_MATCH];
+       u8 uncompressed_data[uncompressed_len + 8];
         struct lzx_freq_tables freq_tabs;
         struct lzx_codes codes;
         u32 match_tab[uncompressed_len];
@@ -681,8 +656,10 @@ int lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
         int ret;
         int block_type = LZX_BLOCKTYPE_ALIGNED;
  
+       wimlib_assert(uncompressed_len <= 32768);
+
         if (uncompressed_len < 100)
-               return 1;
+               return 0;
  
         memset(&freq_tabs, 0, sizeof(freq_tabs));
         queue.R0 = 1;
@@ -692,6 +669,7 @@ int lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
         /* The input data must be preprocessed. To avoid changing the original
          * input, copy it to a temporary buffer. */
         memcpy(uncompressed_data, __uncompressed_data, uncompressed_len);
+       memset(uncompressed_data + uncompressed_len, 0, 8);
  
         /* Before doing any actual compression, do the call instruction (0xe8
          * byte) translation on the uncompressed data. */
@@ -738,55 +716,55 @@ int lzx_compress(const void *__uncompressed_data, unsigned uncompressed_len,
          * main tree. */
         ret = lzx_write_compressed_tree(&ostream, codes.main_lens,
                                         LZX_NUM_CHARS);
-       if (ret != 0)
-               return ret;
+       if (ret)
+               return 0;
  
         /* Write the pre-tree and symbols for the rest of the main tree. */
         ret = lzx_write_compressed_tree(&ostream, codes.main_lens +
                                         LZX_NUM_CHARS,
                                         LZX_MAINTREE_NUM_SYMBOLS -
                                                 LZX_NUM_CHARS);
-       if (ret != 0)
-               return ret;
+       if (ret)
+               return 0;
  
         /* Write the pre-tree and symbols for the length tree. */
         ret = lzx_write_compressed_tree(&ostream, codes.len_lens,
                                         LZX_LENTREE_NUM_SYMBOLS);
-       if (ret != 0)
-               return ret;
+       if (ret)
+               return 0;
  
         /* Write the compressed literals. */
         ret = lzx_write_compressed_literals(&ostream, block_type,
                                             match_tab, num_matches, &codes);
-       if (ret != 0)
-               return ret;
+       if (ret)
+               return 0;
  
         ret = flush_output_bitstream(&ostream);
-       if (ret != 0)
-               return ret;
+       if (ret)
+               return 0;
  
         compressed_len = ostream.bit_output - (u8*)compressed_data;
  
-       *compressed_len_ret = compressed_len;
-
  #ifdef ENABLE_VERIFY_COMPRESSION
         /* Verify that we really get the same thing back when decompressing. */
-       u8 buf[uncompressed_len];
-       ret = lzx_decompress(compressed_data, compressed_len, buf,
-                            uncompressed_len);
-       if (ret != 0) {
-               ERROR("lzx_compress(): Failed to decompress data we compressed");
-               abort();
-       }
-
-       for (i = 0; i < uncompressed_len; i++) {
-               if (buf[i] != *((u8*)__uncompressed_data + i)) {
-                       ERROR("lzx_compress(): Data we compressed didn't "
-                             "decompress to the original data (difference at "
-                             "byte %u of %u)", i + 1, uncompressed_len);
+       {
+               u8 buf[uncompressed_len];
+               ret = wimlib_lzx_decompress(compressed_data, compressed_len,
+                                           buf, uncompressed_len);
+               if (ret != 0) {
+                       ERROR("lzx_compress(): Failed to decompress data we compressed");
                         abort();
                 }
+
+               for (i = 0; i < uncompressed_len; i++) {
+                       if (buf[i] != *((u8*)__uncompressed_data + i)) {
+                               ERROR("lzx_compress(): Data we compressed didn't "
+                                     "decompress to the original data (difference at "
+                                     "byte %u of %u)", i + 1, uncompressed_len);
+                               abort();
+                       }
+               }
         }
  #endif
-       return 0;
+       return compressed_len;
  }