* - lzx_position_base is an index to the position slot bases
* - lzx_extra_bits states how many bits of offset-from-base data is needed.
*/
-const u8 lzx_extra_bits[51] = {
- 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
- 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
- 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
- 17, 17, 17
+const u8 lzx_extra_bits[LZX_NUM_POSITION_SLOTS] = {
+ 0 , 0 , 0 , 0 , 1 ,
+ 1 , 2 , 2 , 3 , 3 ,
+ 4 , 4 , 5 , 5 , 6 ,
+ 6 , 7 , 7 , 8 , 8 ,
+ 9 , 9 , 10, 10, 11,
+ 11, 12, 12, 13, 13,
+ /*14, 14, 15, 15, 16,*/
+ /*16, 17, 17, 17, 17,*/
+ /*17, 17, 17, 17, 17,*/
+ /*17, 17, 17, 17, 17,*/
+ /*17*/
};
-const u32 lzx_position_base[51] = {
- 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384,
- 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
- 32768, 49152, 65536, 98304, 131072, 196608, 262144, 393216, 524288,
- 655360, 786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864,
- 1703936, 1835008, 1966080, 2097152
+const u32 lzx_position_base[LZX_NUM_POSITION_SLOTS] = {
+ 0 , 1 , 2 , 3 , 4 ,
+ 6 , 8 , 12 , 16 , 24 ,
+ 32 , 48 , 64 , 96 , 128 ,
+ 192 , 256 , 384 , 512 , 768 ,
+ 1024 , 1536 , 2048 , 3072 , 4096 ,
+ 6144 , 8192 , 12288 , 16384 , 24576 ,
+ /*32768 , 49152 , 65536 , 98304 , 131072 ,*/
+ /*196608 , 262144 , 393216 , 524288 , 655360 ,*/
+ /*786432 , 917504 , 1048576, 1179648, 1310720,*/
+ /*1441792, 1572864, 1703936, 1835008, 1966080,*/
+ /*2097152*/
};
/*
* lzx-comp.c
*
- * LZX compression routines.
- *
- * This code was originally based on code written by Matthew T. Russotto
- * (liblzxcomp).
+ * LZX compression routines, originally based on code written by Matthew T.
+ * Russotto (liblzxcomp), but heavily modified.
*/
/*
*/
-
/*
* This file provides lzx_compress(), a function to compress an in-memory buffer
* of data using LZX compression, as used in the WIM file format.
*
- * There is no sliding window, as for the compressed chunks in WIM resources,
- * the window is always the length of the input.
+ * Please see the comments in lzx-decomp.c for more information about this
+ * compression format.
*
- * The basic algorithm should be familiar if you are familiar with Huffman trees
- * and with other LZ77-based formats such as DEFLATE. Otherwise it can be quite
- * tricky to understand. Basically it is the following:
+ * One thing to keep in mind is that there is no sliding window, since the
+ * window is always the entirety of a WIM chunk, which is at most WIM_CHUNK_SIZE
+ * ( = 32768) bytes.
+ *
+ * The basic compression algorithm used here should be familiar if you are
+ * familiar with Huffman trees and with other LZ77 and Huffman-based formats
+ * such as DEFLATE. Otherwise it can be quite tricky to understand. Basically
+ * it is the following:
*
* - Preprocess the input data (LZX-specific)
* - Go through the input data and determine matches. This part is based on
* while recording matches.
* - Output the block header, including the Huffman trees; then output the
* compressed stream of matches and literal characters.
+ *
+ * It is possible for a WIM chunk to include multiple LZX blocks, since for some
+ * input data this will produce a better compression ratio (especially since
+ * each block can include new Huffman codes). However, producing multiple LZX
+ * blocks from one input chunk is not yet implemented.
*/
-
#include "lzx.h"
#include "comp.h"
#include <math.h>
-/* Returns the position slot that corresponds to a given formatted offset. This
- * means searching the lzx_position_base array to find what slot contains a
- * position base that is less than or equal to formatted_offset, where the next
- * slot contains a position base that is greater than or equal to
- * formatted_offset. */
-static uint lzx_get_position_slot(uint formatted_offset)
+/* Returns the LZX position slot that corresponds to a given formatted offset.
+ *
+ * Logically, this returns the smallest i such that
+ * formatted_offset >= lzx_position_base[i].
+ *
+ * The actual implementation below takes advantage of the regularity of the
+ * numbers in the lzx_position_base array to calculate the slot directly from
+ * the formatted offset without actually looking at the array.
+ */
+static inline unsigned lzx_get_position_slot(unsigned formatted_offset)
{
- int left;
- int right;
- int mid;
-
- /* Calculate position base using binary search of table; if log2 can be
- * done in hardware, approximation might work;
- * trunc(log2(formatted_offset*formatted_offset)) gets either the proper
- * position slot or the next one, except for slots 0, 1, and 39-49
- *
- * Slots 0-1 are handled by the R0-R1 procedures
- *
+#if 0
+ /*
* Slots 36-49 (formatted_offset >= 262144) can be found by
* (formatted_offset/131072) + 34 == (formatted_offset >> 17) + 34;
+ * however, this check for formatted_offset >= 262144 is commented out
+ * because WIM chunks cannot be that large.
*/
if (formatted_offset >= 262144) {
return (formatted_offset >> 17) + 34;
- } else {
- left = 3;
- right = LZX_NUM_POSITION_SLOTS - 1;
- while (1) {
- mid = (left + right) >> 1;
- if ((lzx_position_base[mid] <= formatted_offset) &&
- lzx_position_base[mid + 1] > formatted_offset) {
- return mid;
- }
- if (formatted_offset > lzx_position_base[mid])
- /* too low */
- left = mid + 1;
- else /* too high */
- right = mid;
- }
+ } else
+#endif
+ {
+ /* Note: this part here only works if:
+ *
+ * 2 <= formatted_offset < 655360
+ *
+ * It is < 655360 because the frequency of the position bases
+ * increases starting at the 655360 entry, and it is >= 2
+ * because the below calculation fails if the most significant
+ * bit is lower than the 2's place. */
+ wimlib_assert(formatted_offset >= 2 && formatted_offset < 655360);
+ unsigned mssb_idx = bsr32(formatted_offset);
+ return (mssb_idx << 1) |
+ ((formatted_offset >> (mssb_idx - 1)) & 1);
}
}
return literal;
}
-/* Constructs a match from an offset and a length, and updates the LRU queue
- * and the frequency of symbols in the main, length, and aligned offset
- * alphabets. The return value is a 32-bit integer that, if the high bit is
- * set, contains the match length, the position slot, and the position footer
- * for the match. */
+/* Equivalent to lzx_extra_bits[position_slot] except position_slot must be
+ * between 2 and 37 */
+static inline unsigned lzx_get_num_extra_bits(unsigned position_slot)
+{
+#if 0
+ return lzx_extra_bits[position_slot];
+#endif
+ wimlib_assert(position_slot >= 2 && position_slot <= 37);
+ return (position_slot >> 1) - 1;
+}
+
+/* Constructs a match from an offset and a length, and updates the LRU queue and
+ * the frequency of symbols in the main, length, and aligned offset alphabets.
+ * The return value is a 32-bit number that provides the match in an
+ * intermediate representation documented below. */
static u32 lzx_record_match(uint match_offset, uint match_len,
void *__freq_tabs, void *__queue)
{
u32 len_header;
u32 len_pos_header;
uint len_footer;
+ uint adjusted_match_len;
wimlib_assert(match_len >= LZX_MIN_MATCH && match_len <= LZX_MAX_MATCH);
+ wimlib_assert(match_offset != 0);
-
+ /* If possible, encode this offset as a repeated offset. */
if (match_offset == queue->R0) {
formatted_offset = 0;
position_slot = 0;
} else {
/* Not a repeated offset. */
+ /* offsets of 0, 1, and 2 are reserved for the repeated offset
+ * codes, so non-repeated offsets must be encoded as 3+. The
+ * minimum offset is 1, so encode the offsets offset by 2. */
formatted_offset = match_offset + LZX_MIN_MATCH;
queue->R2 = queue->R1;
queue->R1 = queue->R0;
queue->R0 = match_offset;
- position_slot = lzx_get_position_slot(formatted_offset);
+ /* The (now-formatted) offset will actually be encoded as a
+ * small position slot number that maps to a certain hard-coded
+ * offset (position base), followed by a number of extra bits---
+ * the position footer--- that are added to the position base to
+ * get the original formatted offset. */
- /* Just the extra bits of the formatted offset. */
- position_footer = ((1UL << lzx_extra_bits[position_slot]) - 1) &
- formatted_offset;
+ position_slot = lzx_get_position_slot(formatted_offset);
+ position_footer = formatted_offset &
+ ((1 << lzx_get_num_extra_bits(position_slot)) - 1);
}
- /* (match length - 2) = 8 bits */
- /* position_slot = 6 bits */
- /* position_footer = 17 bits */
- /* total = 31 bits */
- /* plus one to say whether it's a literal or not */
-
- match = 0x80000000 | /* bit 31 in intelligent bit ordering */
- (position_slot << 25) | /* bits 30-25 */
- (position_footer << 8) | /* bits 8-24 */
- (match_len - LZX_MIN_MATCH); /* bits 0-7 */
-
- /* Update the frequency for the main tree, the length tree (only if a
- * length symbol is to be output), and the aligned tree (only if an
- * aligned symbol is to be output.) */
- if (match_len < (LZX_NUM_PRIMARY_LENS + LZX_MIN_MATCH)) {
- len_header = match_len - LZX_MIN_MATCH;
+ adjusted_match_len = match_len - LZX_MIN_MATCH;
+
+ /* Pack the position slot, position footer, and match length into an
+ * intermediate representation.
+ *
+ * bits description
+ * ---- -----------------------------------------------------------
+ *
+ * 31 1 if a match, 0 if a literal.
+ *
+ * 30-25 position slot. This can be at most 50, so it will fit in 6
+ * bits.
+ *
+ * 8-24 position footer. This is the offset of the real formatted
+ * offset from the position base. This can be at most 17 bits
+ * (since lzx_extra_bits[LZX_NUM_POSITION_SLOTS - 1] is 17).
+ *
+ * 0-7 length of match, offset by 2. This can be at most
+ * (LZX_MAX_MATCH - 2) == 255, so it will fit in 8 bits. */
+ match = 0x80000000 |
+ (position_slot << 25) |
+ (position_footer << 8) |
+ (adjusted_match_len);
+
+ /* The match length must be at least 2, so let the adjusted match length
+ * be the match length minus 2.
+ *
+ * If it is less than 7, the adjusted match length is encoded as a 3-bit
+ * number offset by 2. Otherwise, the 3-bit length header is all 1's
+ * and the actual adjusted length is given as a symbol encoded with the
+ * length tree, offset by 7.
+ */
+ if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) {
+ len_header = adjusted_match_len;
} else {
len_header = LZX_NUM_PRIMARY_LENS;
- len_footer = match_len - (LZX_NUM_PRIMARY_LENS + LZX_MIN_MATCH);
+ len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS;
freq_tabs->len_freq_table[len_footer]++;
}
len_pos_header = (position_slot << 3) | len_header;
freq_tabs->main_freq_table[len_pos_header + LZX_NUM_CHARS]++;
- if (lzx_extra_bits[position_slot] >= 3)
+ /* Equivalent to:
+ * if (lzx_extra_bits[position_slot] >= 3) */
+ if (position_slot >= 8)
freq_tabs->aligned_freq_table[position_footer & 7]++;
return match;
/*
* lzx-decomp.c
*
- * Routines for LZX decompression. The LZX format has many similarities to the
- * DEFLATE format used in zlib and gzip, but it's not quite the same.
- *
+ * LZX decompression routines, originally based on code taken from cabextract
+ * v0.5, which was, itself, a modified version of the lzx decompression code
+ * from unlzx.
*/
/*
*/
/*
- * This file has been modified from code taken from cabextract v0.5, which was,
- * itself, a modified version of the lzx decompression code from unlzx. The
- * code has been customized for wimlib.
+ * LZX is a LZ77 and Huffman-code based compression format that has many
+ * similarities to the DEFLATE format used in zlib. The compression ratio is as
+ * good or better than DEFLATE. However, in WIM files only up to 32768 bytes of
+ * data can ever compressed be in the same LZX block, so a .tar.gz file could
+ * potentially be smaller than a WIM file that uses LZX compression because it
+ * can use a larger LZ77 window size.
*
* Some notes on the LZX compression format as used in Windows Imaging (WIM)
* files:
*
- * A compressed WIM file resource consists of a table of chunk offsets followed
- * by compressed chunks. All compressed chunks except the last decompress to
- * WIM_CHUNK_SIZE (= 32768) bytes. This is quite similar to the cabinet (.cab)
- * file format, but they are not the same (at least based on M$'s
- * documentation). According to the documentation, in the cabinet format, the
- * LZX block size is independent from the CFDATA blocks and may span several
- * CFDATA blocks. However, for WIM file resources, I have seen no case of a LZX
- * block spanning multiple WIM chunks. This is probably done to make it easier
- * to randomly access the compressed file resources. WIMLIB in fact makes use
- * of this feature to allow semi-random access to file resources in the
- * read_resource() function.
+ * A compressed WIM resource consists of a table of chunk offsets followed by
+ * the compressed chunks themselves. All compressed chunks except possibly the
+ * last decompress to WIM_CHUNK_SIZE (= 32768) bytes. This is quite similar to
+ * the cabinet (.cab) file format, but they are not the same. According to the
+ * cabinet format documentation, the LZX block size is independent from the
+ * CFDATA blocks, and a LZX block may span several CFDATA blocks. However, in
+ * WIMs, LZX blocks do not appear to ever span multiple WIM chunks. Note that
+ * this means any WIM chunk may be decompressed or compressed independently from
+ * any other chunk, which is convenient.
*
- * Usually a WIM chunk will contain only one LZX block, but on rare occasions it
- * may contain multiple LZX block. The LZX block are usually the aligned block
- * type or verbatim block type, but can (very rarely) be the uncompressed block
- * type. The size of a LZX block is specified by 1 or 17 bits following the 3
- * bits that specify the block type. A '1' means to use the default block size
- * (equal to 32768), while a '0' means that the block size is given by the next
- * 16 bits.
+ * A LZX compressed WIM chunk contains one or more LZX blocks of the aligned,
+ * verbatim, or uncompressed block types. For aligned and verbatim blocks, the
+ * size of the block in uncompressed bytes is specified by a bit following the 3
+ * bits that specify the block type, possibly followed by an additional 16 bits.
+ * '1' means to use the default block size (equal to 32768, the size of a WIM
+ * chunk--- and this seems to only be valid for the first LZX block in a WIM
+ * chunk), while '0' means that the block size is provided by the next 16 bits.
*
- * The cabinet format, as documented, allows for the possibility that a CFDATA
- * chunk is up to 6144 bytes larger than the uncompressed data. In the WIM
- * format, however, it appears that every chunk that would be 32768 bytes or
- * more when compressed, is actually stored uncompressed. This is not
- * documented by M$.
+ * The cabinet format, as documented, allows for the possibility that a
+ * compressed CFDATA chunk is up to 6144 bytes larger than the data it
+ * uncompresses to. However, in the WIM format it appears that every chunk that
+ * would be 32768 bytes or more when compressed is actually stored fully
+ * uncompressed.
*
* The 'e8' preprocessing step that changes x86 call instructions to use
* absolute offsets instead of relative offsets relies on a filesize parameter.
* the file resource could be used for this purpose), and instead a magic file
* size of 12000000 is used. The 'e8' preprocessing is always done, and there
* is no bit to indicate whether it is done or not.
- *
*/
/*
- * Some more notes about errors in Microsoft's documentation:
+ * Some more notes about errors in Microsoft's LZX documentation:
*
* Microsoft's LZX document and their implementation of the com.ms.util.cab Java
* package do not concur.
#include "util.h"
#include "lzx.h"
-
#include "decomp.h"
-
#include <string.h>
/* Huffman decoding tables and maps from symbols to code lengths. */
# define LZX_DEBUG(format, ...)
#endif
-
-/* Constants, some defined by the LZX specification: */
+/* Constants, most of which are defined by the LZX specification: */
/* The smallest and largest allowed match lengths. */
#define LZX_MIN_MATCH 2
/* Number of values an uncompressed literal byte can represent. */
#define LZX_NUM_CHARS 256
-/* Each LZX block begins with 3 bits that determines the block type: */
+/* Each LZX block begins with 3 bits that determines the block type. Below are
+ * the valid block types. Values 0, and 4 through 7, are invalid. */
#define LZX_BLOCKTYPE_VERBATIM 1
#define LZX_BLOCKTYPE_ALIGNED 2
#define LZX_BLOCKTYPE_UNCOMPRESSED 3
-/* values 0, and 4 through 7, are invalid. */
-
#define LZX_NUM_PRIMARY_LENS 7 /* this one missing from spec! */
-/* Only valid for 32768 block size! */
+/* NOTE: There are really 51 position slots in the LZX format as a whole, but
+ * only 30 are needed to allow for the window to be up to 32768 bytes long,
+ * which is the maximum in the WIM format. */
#define LZX_NUM_POSITION_SLOTS 30
/* Read the LZX specification for information about the Huffman trees used in
#define LZX_PRETREE_TABLEBITS 6
#define LZX_PRETREE_ELEMENT_SIZE 4
-
#define LZX_ALIGNEDTREE_NUM_SYMBOLS 8
#define LZX_ALIGNEDTREE_TABLEBITS 7
#define LZX_ALIGNEDTREE_ELEMENT_SIZE 3
* different as well. */
#define LZX_MAGIC_FILESIZE 12000000
-extern const u8 lzx_extra_bits[51];
-extern const u32 lzx_position_base[51];
+extern const u8 lzx_extra_bits[LZX_NUM_POSITION_SLOTS];
+extern const u32 lzx_position_base[LZX_NUM_POSITION_SLOTS];
/* Least-recently used queue for match offsets. */
struct lru_queue {
printf("%02hhx", *field++);
}
+static inline u32 bsr32(u32 n)
+{
+#if defined(__x86__) || defined(__x86_64__)
+ asm("bsrl %0, %0;"
+ : "=r"(n)
+ : "0" (n));
+ return n;
+#else
+ u32 pow = 0;
+ while ((n >>= 1) != 0)
+ pow++;
+ return pow;
+#endif
+}
#endif /* _WIMLIB_UTIL_H */
#include <stdlib.h>
#include <string.h>
-static inline u32 bsr32(u32 n)
-{
-#if defined(__x86__) || defined(__x86_64__)
- asm("bsrl %0, %0;"
- : "=r"(n)
- : "0" (n));
- return n;
-#else
- u32 pow = 0;
- while ((n >>= 1) != 0)
- pow++;
- return pow;
-#endif
-}
-
-
/*
* Writes @match, which is a match given in the intermediate representation for
* XPRESS matches, to the output stream @ostream.
* The trickiest part is probably the fact that literal bytes for match lengths
* are encoded "separately" from the bitstream.
*
- * Also, a caveat--- according to M$'s documentation for XPRESS,
+ * Also, a caveat--- according to Microsoft's documentation for XPRESS,
*
* "Some implementation of the decompression algorithm expect an extra
* symbol to mark the end of the data. Specifically, some implementations
/* Decodes @huffsym, a value >= XPRESS_NUM_CHARS, that is the header of a match.
* */
-static int xpress_decode_match(int huffsym, uint window_pos, uint window_len,
- u8 window[], struct input_bitstream *istream)
+static int xpress_decode_match(int huffsym, unsigned window_pos,
+ unsigned window_len, u8 window[],
+ struct input_bitstream *istream)
{
- uint match_len;
- uint match_offset;
+ unsigned match_len;
+ unsigned match_offset;
u8 match_sym = (u8)huffsym;
u8 len_hdr = match_sym & 0xf;
u8 offset_bsr = match_sym >> 4;
int ret;
u8 *match_dest;
u8 *match_src;
- uint i;
+ unsigned i;
ret = bitstream_read_bits(istream, offset_bsr, &match_offset);
if (ret != 0)
match_src = match_dest - match_offset;
if (window_pos + match_len > window_len) {
- ERROR("XPRESS dedecompression error: match of length %d "
+ ERROR("XPRESS decompression error: match of length %d "
"bytes overflows window", match_len);
return -1;
}
* XPRESS-encoded data. */
static int xpress_decompress_literals(struct input_bitstream *istream,
u8 uncompressed_data[],
- uint uncompressed_len,
+ unsigned uncompressed_len,
const u8 lens[],
const u16 decode_table[])
{
- uint curpos = 0;
- uint huffsym;
+ unsigned curpos = 0;
+ unsigned huffsym;
int match_len;
int ret = 0;
}
-int xpress_decompress(const void *__compressed_data, uint compressed_len,
- void *uncompressed_data, uint uncompressed_len)
+int xpress_decompress(const void *__compressed_data, unsigned compressed_len,
+ void *uncompressed_data, unsigned uncompressed_len)
{
u8 lens[XPRESS_NUM_SYMBOLS];
u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS];
struct input_bitstream istream;
u8 *lens_p;
const u8 *compressed_data;
- uint i;
+ unsigned i;
int ret;
compressed_data = __compressed_data;