From 9ad70b9c12fb5859ff5be1730672b80197a0d9f9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 14 Dec 2012 21:02:19 -0600 Subject: [PATCH] Compression code cleanups --- src/lzx-common.c | 35 ++++++--- src/lzx-comp.c | 184 +++++++++++++++++++++++++++----------------- src/lzx-decomp.c | 64 ++++++++------- src/lzx.h | 17 ++-- src/util.h | 14 ++++ src/xpress-comp.c | 16 ---- src/xpress-decomp.c | 27 +++---- 7 files changed, 204 insertions(+), 153 deletions(-) diff --git a/src/lzx-common.c b/src/lzx-common.c index 8afe2abe..9da4ddb0 100644 --- a/src/lzx-common.c +++ b/src/lzx-common.c @@ -7,18 +7,31 @@ * - lzx_position_base is an index to the position slot bases * - lzx_extra_bits states how many bits of offset-from-base data is needed. */ -const u8 lzx_extra_bits[51] = { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, - 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, - 17, 17, 17 +const u8 lzx_extra_bits[LZX_NUM_POSITION_SLOTS] = { + 0 , 0 , 0 , 0 , 1 , + 1 , 2 , 2 , 3 , 3 , + 4 , 4 , 5 , 5 , 6 , + 6 , 7 , 7 , 8 , 8 , + 9 , 9 , 10, 10, 11, + 11, 12, 12, 13, 13, + /*14, 14, 15, 15, 16,*/ + /*16, 17, 17, 17, 17,*/ + /*17, 17, 17, 17, 17,*/ + /*17, 17, 17, 17, 17,*/ + /*17*/ }; -const u32 lzx_position_base[51] = { - 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, - 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, - 32768, 49152, 65536, 98304, 131072, 196608, 262144, 393216, 524288, - 655360, 786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864, - 1703936, 1835008, 1966080, 2097152 +const u32 lzx_position_base[LZX_NUM_POSITION_SLOTS] = { + 0 , 1 , 2 , 3 , 4 , + 6 , 8 , 12 , 16 , 24 , + 32 , 48 , 64 , 96 , 128 , + 192 , 256 , 384 , 512 , 768 , + 1024 , 1536 , 2048 , 3072 , 4096 , + 6144 , 8192 , 12288 , 16384 , 24576 , + /*32768 , 49152 , 65536 , 98304 , 131072 ,*/ + /*196608 , 262144 , 393216 , 524288 , 655360 ,*/ + /*786432 , 917504 , 1048576, 1179648, 1310720,*/ + /*1441792, 1572864, 1703936, 1835008, 1966080,*/ + /*2097152*/ }; diff --git a/src/lzx-comp.c b/src/lzx-comp.c index 899d1dad..34251e34 100644 --- a/src/lzx-comp.c +++ b/src/lzx-comp.c @@ -1,10 +1,8 @@ /* * lzx-comp.c * - * LZX compression routines. - * - * This code was originally based on code written by Matthew T. Russotto - * (liblzxcomp). + * LZX compression routines, originally based on code written by Matthew T. + * Russotto (liblzxcomp), but heavily modified. */ /* @@ -28,17 +26,21 @@ */ - /* * This file provides lzx_compress(), a function to compress an in-memory buffer * of data using LZX compression, as used in the WIM file format. * - * There is no sliding window, as for the compressed chunks in WIM resources, - * the window is always the length of the input. + * Please see the comments in lzx-decomp.c for more information about this + * compression format. * - * The basic algorithm should be familiar if you are familiar with Huffman trees - * and with other LZ77-based formats such as DEFLATE. Otherwise it can be quite - * tricky to understand. Basically it is the following: + * One thing to keep in mind is that there is no sliding window, since the + * window is always the entirety of a WIM chunk, which is at most WIM_CHUNK_SIZE + * ( = 32768) bytes. + * + * The basic compression algorithm used here should be familiar if you are + * familiar with Huffman trees and with other LZ77 and Huffman-based formats + * such as DEFLATE. Otherwise it can be quite tricky to understand. Basically + * it is the following: * * - Preprocess the input data (LZX-specific) * - Go through the input data and determine matches. This part is based on @@ -48,9 +50,13 @@ * while recording matches. * - Output the block header, including the Huffman trees; then output the * compressed stream of matches and literal characters. + * + * It is possible for a WIM chunk to include multiple LZX blocks, since for some + * input data this will produce a better compression ratio (especially since + * each block can include new Huffman codes). However, producing multiple LZX + * blocks from one input chunk is not yet implemented. */ - #include "lzx.h" #include "comp.h" #include @@ -80,44 +86,41 @@ struct lzx_freq_tables { -/* Returns the position slot that corresponds to a given formatted offset. This - * means searching the lzx_position_base array to find what slot contains a - * position base that is less than or equal to formatted_offset, where the next - * slot contains a position base that is greater than or equal to - * formatted_offset. */ -static uint lzx_get_position_slot(uint formatted_offset) +/* Returns the LZX position slot that corresponds to a given formatted offset. + * + * Logically, this returns the smallest i such that + * formatted_offset >= lzx_position_base[i]. + * + * The actual implementation below takes advantage of the regularity of the + * numbers in the lzx_position_base array to calculate the slot directly from + * the formatted offset without actually looking at the array. + */ +static inline unsigned lzx_get_position_slot(unsigned formatted_offset) { - int left; - int right; - int mid; - - /* Calculate position base using binary search of table; if log2 can be - * done in hardware, approximation might work; - * trunc(log2(formatted_offset*formatted_offset)) gets either the proper - * position slot or the next one, except for slots 0, 1, and 39-49 - * - * Slots 0-1 are handled by the R0-R1 procedures - * +#if 0 + /* * Slots 36-49 (formatted_offset >= 262144) can be found by * (formatted_offset/131072) + 34 == (formatted_offset >> 17) + 34; + * however, this check for formatted_offset >= 262144 is commented out + * because WIM chunks cannot be that large. */ if (formatted_offset >= 262144) { return (formatted_offset >> 17) + 34; - } else { - left = 3; - right = LZX_NUM_POSITION_SLOTS - 1; - while (1) { - mid = (left + right) >> 1; - if ((lzx_position_base[mid] <= formatted_offset) && - lzx_position_base[mid + 1] > formatted_offset) { - return mid; - } - if (formatted_offset > lzx_position_base[mid]) - /* too low */ - left = mid + 1; - else /* too high */ - right = mid; - } + } else +#endif + { + /* Note: this part here only works if: + * + * 2 <= formatted_offset < 655360 + * + * It is < 655360 because the frequency of the position bases + * increases starting at the 655360 entry, and it is >= 2 + * because the below calculation fails if the most significant + * bit is lower than the 2's place. */ + wimlib_assert(formatted_offset >= 2 && formatted_offset < 655360); + unsigned mssb_idx = bsr32(formatted_offset); + return (mssb_idx << 1) | + ((formatted_offset >> (mssb_idx - 1)) & 1); } } @@ -128,11 +131,21 @@ static u32 lzx_record_literal(u8 literal, void *__main_freq_tab) return literal; } -/* Constructs a match from an offset and a length, and updates the LRU queue - * and the frequency of symbols in the main, length, and aligned offset - * alphabets. The return value is a 32-bit integer that, if the high bit is - * set, contains the match length, the position slot, and the position footer - * for the match. */ +/* Equivalent to lzx_extra_bits[position_slot] except position_slot must be + * between 2 and 37 */ +static inline unsigned lzx_get_num_extra_bits(unsigned position_slot) +{ +#if 0 + return lzx_extra_bits[position_slot]; +#endif + wimlib_assert(position_slot >= 2 && position_slot <= 37); + return (position_slot >> 1) - 1; +} + +/* Constructs a match from an offset and a length, and updates the LRU queue and + * the frequency of symbols in the main, length, and aligned offset alphabets. + * The return value is a 32-bit number that provides the match in an + * intermediate representation documented below. */ static u32 lzx_record_match(uint match_offset, uint match_len, void *__freq_tabs, void *__queue) { @@ -145,10 +158,12 @@ static u32 lzx_record_match(uint match_offset, uint match_len, u32 len_header; u32 len_pos_header; uint len_footer; + uint adjusted_match_len; wimlib_assert(match_len >= LZX_MIN_MATCH && match_len <= LZX_MAX_MATCH); + wimlib_assert(match_offset != 0); - + /* If possible, encode this offset as a repeated offset. */ if (match_offset == queue->R0) { formatted_offset = 0; position_slot = 0; @@ -163,38 +178,63 @@ static u32 lzx_record_match(uint match_offset, uint match_len, } else { /* Not a repeated offset. */ + /* offsets of 0, 1, and 2 are reserved for the repeated offset + * codes, so non-repeated offsets must be encoded as 3+. The + * minimum offset is 1, so encode the offsets offset by 2. */ formatted_offset = match_offset + LZX_MIN_MATCH; queue->R2 = queue->R1; queue->R1 = queue->R0; queue->R0 = match_offset; - position_slot = lzx_get_position_slot(formatted_offset); + /* The (now-formatted) offset will actually be encoded as a + * small position slot number that maps to a certain hard-coded + * offset (position base), followed by a number of extra bits--- + * the position footer--- that are added to the position base to + * get the original formatted offset. */ - /* Just the extra bits of the formatted offset. */ - position_footer = ((1UL << lzx_extra_bits[position_slot]) - 1) & - formatted_offset; + position_slot = lzx_get_position_slot(formatted_offset); + position_footer = formatted_offset & + ((1 << lzx_get_num_extra_bits(position_slot)) - 1); } - /* (match length - 2) = 8 bits */ - /* position_slot = 6 bits */ - /* position_footer = 17 bits */ - /* total = 31 bits */ - /* plus one to say whether it's a literal or not */ - - match = 0x80000000 | /* bit 31 in intelligent bit ordering */ - (position_slot << 25) | /* bits 30-25 */ - (position_footer << 8) | /* bits 8-24 */ - (match_len - LZX_MIN_MATCH); /* bits 0-7 */ - - /* Update the frequency for the main tree, the length tree (only if a - * length symbol is to be output), and the aligned tree (only if an - * aligned symbol is to be output.) */ - if (match_len < (LZX_NUM_PRIMARY_LENS + LZX_MIN_MATCH)) { - len_header = match_len - LZX_MIN_MATCH; + adjusted_match_len = match_len - LZX_MIN_MATCH; + + /* Pack the position slot, position footer, and match length into an + * intermediate representation. + * + * bits description + * ---- ----------------------------------------------------------- + * + * 31 1 if a match, 0 if a literal. + * + * 30-25 position slot. This can be at most 50, so it will fit in 6 + * bits. + * + * 8-24 position footer. This is the offset of the real formatted + * offset from the position base. This can be at most 17 bits + * (since lzx_extra_bits[LZX_NUM_POSITION_SLOTS - 1] is 17). + * + * 0-7 length of match, offset by 2. This can be at most + * (LZX_MAX_MATCH - 2) == 255, so it will fit in 8 bits. */ + match = 0x80000000 | + (position_slot << 25) | + (position_footer << 8) | + (adjusted_match_len); + + /* The match length must be at least 2, so let the adjusted match length + * be the match length minus 2. + * + * If it is less than 7, the adjusted match length is encoded as a 3-bit + * number offset by 2. Otherwise, the 3-bit length header is all 1's + * and the actual adjusted length is given as a symbol encoded with the + * length tree, offset by 7. + */ + if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) { + len_header = adjusted_match_len; } else { len_header = LZX_NUM_PRIMARY_LENS; - len_footer = match_len - (LZX_NUM_PRIMARY_LENS + LZX_MIN_MATCH); + len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS; freq_tabs->len_freq_table[len_footer]++; } len_pos_header = (position_slot << 3) | len_header; @@ -203,7 +243,9 @@ static u32 lzx_record_match(uint match_offset, uint match_len, freq_tabs->main_freq_table[len_pos_header + LZX_NUM_CHARS]++; - if (lzx_extra_bits[position_slot] >= 3) + /* Equivalent to: + * if (lzx_extra_bits[position_slot] >= 3) */ + if (position_slot >= 8) freq_tabs->aligned_freq_table[position_footer & 7]++; return match; diff --git a/src/lzx-decomp.c b/src/lzx-decomp.c index 92bfb2e9..fb8b17e9 100644 --- a/src/lzx-decomp.c +++ b/src/lzx-decomp.c @@ -1,9 +1,9 @@ /* * lzx-decomp.c * - * Routines for LZX decompression. The LZX format has many similarities to the - * DEFLATE format used in zlib and gzip, but it's not quite the same. - * + * LZX decompression routines, originally based on code taken from cabextract + * v0.5, which was, itself, a modified version of the lzx decompression code + * from unlzx. */ /* @@ -26,38 +26,39 @@ */ /* - * This file has been modified from code taken from cabextract v0.5, which was, - * itself, a modified version of the lzx decompression code from unlzx. The - * code has been customized for wimlib. + * LZX is a LZ77 and Huffman-code based compression format that has many + * similarities to the DEFLATE format used in zlib. The compression ratio is as + * good or better than DEFLATE. However, in WIM files only up to 32768 bytes of + * data can ever compressed be in the same LZX block, so a .tar.gz file could + * potentially be smaller than a WIM file that uses LZX compression because it + * can use a larger LZ77 window size. * * Some notes on the LZX compression format as used in Windows Imaging (WIM) * files: * - * A compressed WIM file resource consists of a table of chunk offsets followed - * by compressed chunks. All compressed chunks except the last decompress to - * WIM_CHUNK_SIZE (= 32768) bytes. This is quite similar to the cabinet (.cab) - * file format, but they are not the same (at least based on M$'s - * documentation). According to the documentation, in the cabinet format, the - * LZX block size is independent from the CFDATA blocks and may span several - * CFDATA blocks. However, for WIM file resources, I have seen no case of a LZX - * block spanning multiple WIM chunks. This is probably done to make it easier - * to randomly access the compressed file resources. WIMLIB in fact makes use - * of this feature to allow semi-random access to file resources in the - * read_resource() function. + * A compressed WIM resource consists of a table of chunk offsets followed by + * the compressed chunks themselves. All compressed chunks except possibly the + * last decompress to WIM_CHUNK_SIZE (= 32768) bytes. This is quite similar to + * the cabinet (.cab) file format, but they are not the same. According to the + * cabinet format documentation, the LZX block size is independent from the + * CFDATA blocks, and a LZX block may span several CFDATA blocks. However, in + * WIMs, LZX blocks do not appear to ever span multiple WIM chunks. Note that + * this means any WIM chunk may be decompressed or compressed independently from + * any other chunk, which is convenient. * - * Usually a WIM chunk will contain only one LZX block, but on rare occasions it - * may contain multiple LZX block. The LZX block are usually the aligned block - * type or verbatim block type, but can (very rarely) be the uncompressed block - * type. The size of a LZX block is specified by 1 or 17 bits following the 3 - * bits that specify the block type. A '1' means to use the default block size - * (equal to 32768), while a '0' means that the block size is given by the next - * 16 bits. + * A LZX compressed WIM chunk contains one or more LZX blocks of the aligned, + * verbatim, or uncompressed block types. For aligned and verbatim blocks, the + * size of the block in uncompressed bytes is specified by a bit following the 3 + * bits that specify the block type, possibly followed by an additional 16 bits. + * '1' means to use the default block size (equal to 32768, the size of a WIM + * chunk--- and this seems to only be valid for the first LZX block in a WIM + * chunk), while '0' means that the block size is provided by the next 16 bits. * - * The cabinet format, as documented, allows for the possibility that a CFDATA - * chunk is up to 6144 bytes larger than the uncompressed data. In the WIM - * format, however, it appears that every chunk that would be 32768 bytes or - * more when compressed, is actually stored uncompressed. This is not - * documented by M$. + * The cabinet format, as documented, allows for the possibility that a + * compressed CFDATA chunk is up to 6144 bytes larger than the data it + * uncompresses to. However, in the WIM format it appears that every chunk that + * would be 32768 bytes or more when compressed is actually stored fully + * uncompressed. * * The 'e8' preprocessing step that changes x86 call instructions to use * absolute offsets instead of relative offsets relies on a filesize parameter. @@ -65,11 +66,10 @@ * the file resource could be used for this purpose), and instead a magic file * size of 12000000 is used. The 'e8' preprocessing is always done, and there * is no bit to indicate whether it is done or not. - * */ /* - * Some more notes about errors in Microsoft's documentation: + * Some more notes about errors in Microsoft's LZX documentation: * * Microsoft's LZX document and their implementation of the com.ms.util.cab Java * package do not concur. @@ -108,9 +108,7 @@ #include "util.h" #include "lzx.h" - #include "decomp.h" - #include /* Huffman decoding tables and maps from symbols to code lengths. */ diff --git a/src/lzx.h b/src/lzx.h index 02718697..70872abe 100644 --- a/src/lzx.h +++ b/src/lzx.h @@ -10,8 +10,7 @@ # define LZX_DEBUG(format, ...) #endif - -/* Constants, some defined by the LZX specification: */ +/* Constants, most of which are defined by the LZX specification: */ /* The smallest and largest allowed match lengths. */ #define LZX_MIN_MATCH 2 @@ -20,16 +19,17 @@ /* Number of values an uncompressed literal byte can represent. */ #define LZX_NUM_CHARS 256 -/* Each LZX block begins with 3 bits that determines the block type: */ +/* Each LZX block begins with 3 bits that determines the block type. Below are + * the valid block types. Values 0, and 4 through 7, are invalid. */ #define LZX_BLOCKTYPE_VERBATIM 1 #define LZX_BLOCKTYPE_ALIGNED 2 #define LZX_BLOCKTYPE_UNCOMPRESSED 3 -/* values 0, and 4 through 7, are invalid. */ - #define LZX_NUM_PRIMARY_LENS 7 /* this one missing from spec! */ -/* Only valid for 32768 block size! */ +/* NOTE: There are really 51 position slots in the LZX format as a whole, but + * only 30 are needed to allow for the window to be up to 32768 bytes long, + * which is the maximum in the WIM format. */ #define LZX_NUM_POSITION_SLOTS 30 /* Read the LZX specification for information about the Huffman trees used in @@ -58,7 +58,6 @@ #define LZX_PRETREE_TABLEBITS 6 #define LZX_PRETREE_ELEMENT_SIZE 4 - #define LZX_ALIGNEDTREE_NUM_SYMBOLS 8 #define LZX_ALIGNEDTREE_TABLEBITS 7 #define LZX_ALIGNEDTREE_ELEMENT_SIZE 3 @@ -73,8 +72,8 @@ * different as well. */ #define LZX_MAGIC_FILESIZE 12000000 -extern const u8 lzx_extra_bits[51]; -extern const u32 lzx_position_base[51]; +extern const u8 lzx_extra_bits[LZX_NUM_POSITION_SLOTS]; +extern const u32 lzx_position_base[LZX_NUM_POSITION_SLOTS]; /* Least-recently used queue for match offsets. */ struct lru_queue { diff --git a/src/util.h b/src/util.h index dd564f5d..28935938 100644 --- a/src/util.h +++ b/src/util.h @@ -189,5 +189,19 @@ static inline void print_byte_field(const u8 field[], size_t len) printf("%02hhx", *field++); } +static inline u32 bsr32(u32 n) +{ +#if defined(__x86__) || defined(__x86_64__) + asm("bsrl %0, %0;" + : "=r"(n) + : "0" (n)); + return n; +#else + u32 pow = 0; + while ((n >>= 1) != 0) + pow++; + return pow; +#endif +} #endif /* _WIMLIB_UTIL_H */ diff --git a/src/xpress-comp.c b/src/xpress-comp.c index aae2a4a0..b8e655f7 100644 --- a/src/xpress-comp.c +++ b/src/xpress-comp.c @@ -30,22 +30,6 @@ #include #include -static inline u32 bsr32(u32 n) -{ -#if defined(__x86__) || defined(__x86_64__) - asm("bsrl %0, %0;" - : "=r"(n) - : "0" (n)); - return n; -#else - u32 pow = 0; - while ((n >>= 1) != 0) - pow++; - return pow; -#endif -} - - /* * Writes @match, which is a match given in the intermediate representation for * XPRESS matches, to the output stream @ostream. diff --git a/src/xpress-decomp.c b/src/xpress-decomp.c index 45b52350..629b3743 100644 --- a/src/xpress-decomp.c +++ b/src/xpress-decomp.c @@ -58,7 +58,7 @@ * The trickiest part is probably the fact that literal bytes for match lengths * are encoded "separately" from the bitstream. * - * Also, a caveat--- according to M$'s documentation for XPRESS, + * Also, a caveat--- according to Microsoft's documentation for XPRESS, * * "Some implementation of the decompression algorithm expect an extra * symbol to mark the end of the data. Specifically, some implementations @@ -83,18 +83,19 @@ /* Decodes @huffsym, a value >= XPRESS_NUM_CHARS, that is the header of a match. * */ -static int xpress_decode_match(int huffsym, uint window_pos, uint window_len, - u8 window[], struct input_bitstream *istream) +static int xpress_decode_match(int huffsym, unsigned window_pos, + unsigned window_len, u8 window[], + struct input_bitstream *istream) { - uint match_len; - uint match_offset; + unsigned match_len; + unsigned match_offset; u8 match_sym = (u8)huffsym; u8 len_hdr = match_sym & 0xf; u8 offset_bsr = match_sym >> 4; int ret; u8 *match_dest; u8 *match_src; - uint i; + unsigned i; ret = bitstream_read_bits(istream, offset_bsr, &match_offset); if (ret != 0) @@ -137,7 +138,7 @@ static int xpress_decode_match(int huffsym, uint window_pos, uint window_len, match_src = match_dest - match_offset; if (window_pos + match_len > window_len) { - ERROR("XPRESS dedecompression error: match of length %d " + ERROR("XPRESS decompression error: match of length %d " "bytes overflows window", match_len); return -1; } @@ -159,12 +160,12 @@ static int xpress_decode_match(int huffsym, uint window_pos, uint window_len, * XPRESS-encoded data. */ static int xpress_decompress_literals(struct input_bitstream *istream, u8 uncompressed_data[], - uint uncompressed_len, + unsigned uncompressed_len, const u8 lens[], const u16 decode_table[]) { - uint curpos = 0; - uint huffsym; + unsigned curpos = 0; + unsigned huffsym; int match_len; int ret = 0; @@ -194,15 +195,15 @@ static int xpress_decompress_literals(struct input_bitstream *istream, } -int xpress_decompress(const void *__compressed_data, uint compressed_len, - void *uncompressed_data, uint uncompressed_len) +int xpress_decompress(const void *__compressed_data, unsigned compressed_len, + void *uncompressed_data, unsigned uncompressed_len) { u8 lens[XPRESS_NUM_SYMBOLS]; u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; struct input_bitstream istream; u8 *lens_p; const u8 *compressed_data; - uint i; + unsigned i; int ret; compressed_data = __compressed_data; -- 2.43.0