X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fxpress-decompress.c;h=855f2cb311ccb156db1ecd9e00600c86255e458a;hp=de395de96afeecd3cdf3ad83fc62cdab00e1a3f4;hb=90f461637225b29f3b0ee9fa3862cff4ecb56ad8;hpb=4c16dabbfa70bc4c019463091acae1266f600113 diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c index de395de9..855f2cb3 100644 --- a/src/xpress-decompress.c +++ b/src/xpress-decompress.c @@ -6,7 +6,7 @@ /* * - * Copyright (C) 2012 Eric Biggers + * Copyright (C) 2012, 2013 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -25,11 +25,10 @@ */ - /* - * The XPRESS compression format is a LZ77 and Huffman-code based algorithm. - * That means it is quite similar to LZX compression, but XPRESS is slightly - * simpler, so it is a little faster to compress and decompress. + * The XPRESS compression format is an LZ77 and Huffman-code based algorithm. + * That means it is fairly similar to LZX compression, but XPRESS is simpler, so + * it is a little faster to compress and decompress. * * The XPRESS compression format is mostly documented in a file called "[MS-XCA] * Xpress Compression Algorithm". In the MSDN library, it can currently be @@ -41,118 +40,97 @@ * If you are already familiar with the LZ77 algorithm and Huffman coding, the * XPRESS format is fairly simple. The compressed data begins with 256 bytes * that contain 512 4-bit integers that are the lengths of the symbols in the - * Huffman tree used for decoding compressed literals. This is the only Huffman - * tree that is used for the entirety of the compressed data, and the codeword + * Huffman code used for match/literal headers. In contrast with more + * complicated formats such as DEFLATE and LZX, this is the only Huffman code + * that is used for the entirety of the XPRESS compressed data, and the codeword * lengths are not encoded with a pretree. * * The rest of the compressed data is Huffman-encoded symbols. Values 0 through - * 255 are literal bytes. Values 256 through 511 are matches and may require - * extra bits or bytes to be read to get the match offset and match length. - * - * There is no notion of a "compressed block" in the XPRESS format, so in the - * XPRESS format, each WIM chunk (32768 bytes) will always use only one Huffman - * tree. + * 255 represent the corresponding literal bytes. Values 256 through 511 + * represent matches and may require extra bits or bytes to be read to get the + * match offset and match length. * - * The trickiest part is probably the fact that literal bytes for match lengths - * are encoded "separately" from the bitstream. + * The trickiest part is probably the way in which literal bytes for match + * lengths are interleaved in the bitstream. * * Also, a caveat--- according to Microsoft's documentation for XPRESS, * - * "Some implementation of the decompression algorithm expect an extra - * symbol to mark the end of the data. Specifically, some implementations - * fail during decompression if the Huffman symbol 256 is not found after - * the actual data." - * - * This is the case for WIM files--- in we must write this extra symbol "256" at - * the end. Otherwise Microsoft's software will fail to decompress the - * XPRESS-compressed data. + * "Some implementation of the decompression algorithm expect an extra + * symbol to mark the end of the data. Specifically, some implementations + * fail during decompression if the Huffman symbol 256 is not found after + * the actual data." * - * However, wimlib's decompressor in this file currently does not care if this - * extra symbol is there or not. + * This is the case for the implementation in WIMGAPI. However, wimlib's + * decompressor in this file currently does not care if this extra symbol is + * there or not. */ -#include "util.h" -#include "xpress.h" -#include "wimlib.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif -#define XPRESS_DECOMP -#include "decompress.h" +#include "wimlib.h" +#include "wimlib/decompressor_ops.h" +#include "wimlib/decompress_common.h" +#include "wimlib/xpress.h" /* - * Decodes a symbol @huffsym that begins an XPRESS match. + * Decodes a symbol @sym that begins an XPRESS match. * * The low 8 bits of the symbol are divided into: * * bits 0-3: length header * bits 4-7: index of high-order bit of match offset * - * Note: taking the low 8 bits of the symbol is the same as subtracting 256, the - * number of symbols reserved for literals. - * - * Returns the match length, or -1 on error. + * Returns the match length, or -1 if the data is invalid. */ -static int xpress_decode_match(unsigned huffsym, unsigned window_pos, - unsigned window_len, u8 window[], - struct input_bitstream *istream) +static int +xpress_decode_match(unsigned sym, input_idx_t window_pos, + input_idx_t window_len, u8 window[restrict], + struct input_bitstream * restrict istream) { - unsigned match_len; - unsigned match_offset; - u8 match_sym = (u8)huffsym; - u8 len_hdr = match_sym & 0xf; - u8 offset_bsr = match_sym >> 4; - int ret; + + unsigned len_hdr; + unsigned offset_bsr; u8 *match_dest; u8 *match_src; unsigned i; + unsigned match_len; + unsigned match_offset; - ret = bitstream_read_bits(istream, offset_bsr, &match_offset); - if (ret != 0) - return ret; - match_offset |= (1 << offset_bsr); + sym -= XPRESS_NUM_CHARS; + len_hdr = sym & 0xf; + offset_bsr = sym >> 4; - if (len_hdr == 0xf) { - ret = bitstream_read_byte(istream); - if (ret < 0) - return ret; - match_len = ret; - if (match_len == 0xff) { - ret = bitstream_read_byte(istream); - if (ret < 0) - return ret; - match_len = ret; + bitstream_ensure_bits(istream, 16); - ret = bitstream_read_byte(istream); - if (ret < 0) - return ret; + match_offset = (1U << offset_bsr) | bitstream_pop_bits(istream, offset_bsr); - match_len |= (ret << 8); + if (len_hdr == 0xf) { + match_len = bitstream_read_byte(istream); + if (unlikely(match_len == 0xff)) { + match_len = bitstream_read_byte(istream); + match_len |= (unsigned)bitstream_read_byte(istream) << 8; } else { match_len += 0xf; } } else { match_len = len_hdr; } - match_len += XPRESS_MIN_MATCH; + match_len += XPRESS_MIN_MATCH_LEN; - /* Verify that the match is in the bounds of the part of the window - * currently in use, then copy the source of the match to the current - * position. */ - match_dest = window + window_pos; - match_src = match_dest - match_offset; + /* Verify the match is in bounds, then copy its data to the current + * position. */ - if (window_pos + match_len > window_len) { - ERROR("XPRESS decompression error: match of length %u " - "bytes overflows window", match_len); + if (window_pos + match_len > window_len) return -1; - } - if (match_src < window) { - ERROR("XPRESS decompression error: match of length %u bytes " - "references data before window (match_offset = %u, " - "window_pos = %u)", match_len, match_offset, window_pos); + if (match_offset > window_pos) return -1; - } + + match_dest = window + window_pos; + match_src = match_dest - match_offset; for (i = 0; i < match_len; i++) match_dest[i] = match_src[i]; @@ -160,83 +138,80 @@ static int xpress_decode_match(unsigned huffsym, unsigned window_pos, return match_len; } -/* Decodes the Huffman-encoded matches and literal bytes in a block of - * XPRESS-encoded data. */ -static int xpress_decompress_block(struct input_bitstream *istream, - u8 uncompressed_data[], - unsigned uncompressed_len, - const u8 lens[], - const u16 decode_table[]) +/* Decodes the Huffman-encoded matches and literal bytes in a region of + * XPRESS-encoded data. */ +static int +xpress_lz_decode(struct input_bitstream * restrict istream, + u8 uncompressed_data[restrict], + unsigned uncompressed_len, + const u16 decode_table[restrict]) { - unsigned curpos; - unsigned huffsym; - int ret; - int match_len; - - curpos = 0; - while (curpos < uncompressed_len) { - ret = read_huffsym(istream, decode_table, lens, - XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS, - &huffsym, XPRESS_MAX_CODEWORD_LEN); - if (ret != 0) - return ret; - - if (huffsym < XPRESS_NUM_CHARS) { - uncompressed_data[curpos++] = huffsym; + input_idx_t curpos; + unsigned match_len; + + for (curpos = 0; curpos < uncompressed_len; curpos += match_len) { + unsigned sym; + int ret; + + bitstream_ensure_bits(istream, 16); + + sym = read_huffsym(istream, decode_table, + XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); + if (sym < XPRESS_NUM_CHARS) { + /* Literal */ + uncompressed_data[curpos] = sym; + match_len = 1; } else { - match_len = xpress_decode_match(huffsym, - curpos, - uncompressed_len, - uncompressed_data, - istream); - if (match_len < 0) - return match_len; - curpos += match_len; + /* Match */ + ret = xpress_decode_match(sym, + curpos, + uncompressed_len, + uncompressed_data, + istream); + if (unlikely(ret < 0)) + return -1; + match_len = ret; } } return 0; } -int xpress_decompress(const void *__compressed_data, unsigned compressed_len, - void *uncompressed_data, unsigned uncompressed_len) +static int +xpress_decompress(const void *compressed_data, size_t compressed_size, + void *uncompressed_data, size_t uncompressed_size, void *_ctx) { + const u8 *cdata = compressed_data; u8 lens[XPRESS_NUM_SYMBOLS]; - u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; - struct input_bitstream istream; u8 *lens_p; - const u8 *compressed_data; - unsigned i; - int ret; - - compressed_data = __compressed_data; - lens_p = lens; - - DEBUG2("compressed_len = %d, uncompressed_len = %d", - compressed_len, uncompressed_len); + u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS] + _aligned_attribute(DECODE_TABLE_ALIGNMENT); + struct input_bitstream istream; - /* XPRESS uses only one Huffman tree. It contains 512 symbols, and the + /* XPRESS uses only one Huffman code. It contains 512 symbols, and the * code lengths of these symbols are given literally as 4-bit integers - * in the first 256 bytes of the compressed data. - */ - if (compressed_len < XPRESS_NUM_SYMBOLS / 2) + * in the first 256 bytes of the compressed data. */ + if (compressed_size < XPRESS_NUM_SYMBOLS / 2) return -1; - for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { - *lens_p++ = compressed_data[i] & 0xf; - *lens_p++ = compressed_data[i] >> 4; + lens_p = lens; + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { + *lens_p++ = cdata[i] & 0xf; + *lens_p++ = cdata[i] >> 4; } - ret = make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS, - XPRESS_TABLEBITS, lens, - XPRESS_MAX_CODEWORD_LEN); - if (ret != 0) - return ret; + if (make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS, + XPRESS_TABLEBITS, lens, + XPRESS_MAX_CODEWORD_LEN)) + return -1; - init_input_bitstream(&istream, compressed_data + XPRESS_NUM_SYMBOLS / 2, - compressed_len - XPRESS_NUM_SYMBOLS / 2); + init_input_bitstream(&istream, cdata + XPRESS_NUM_SYMBOLS / 2, + compressed_size - XPRESS_NUM_SYMBOLS / 2); - return xpress_decompress_block(&istream, uncompressed_data, - uncompressed_len, lens, - decode_table); + return xpress_lz_decode(&istream, uncompressed_data, + uncompressed_size, decode_table); } + +const struct decompressor_ops xpress_decompressor_ops = { + .decompress = xpress_decompress, +};