X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Fxpress-decompress.c;h=bf42c1f0a69883845dab935c929b4ea7b3a80dfc;hb=acb16926b00fabdba062ff4300da5d3aef32ceeb;hp=87205543022ed76c041b6a5f9f5714ddeeaf4ee0;hpb=40beb80283a2df7af88c8359ca41adb814585e9a;p=wimlib diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c index 87205543..bf42c1f0 100644 --- a/src/xpress-decompress.c +++ b/src/xpress-decompress.c @@ -1,12 +1,12 @@ /* * xpress-decompress.c * - * XPRESS decompression routines. + * A very fast decompressor for XPRESS (Huffman version). */ /* * - * Copyright (C) 2012 Eric Biggers + * Copyright (C) 2012, 2013 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -25,11 +25,10 @@ */ - /* - * The XPRESS compression format is a LZ77 and Huffman-code based algorithm. - * That means it is quite similar to LZX compression, but XPRESS is slightly - * simpler, so it is a little faster to compress and decompress. + * The XPRESS compression format is an LZ77 and Huffman-code based algorithm. + * That means it is fairly similar to LZX compression, but XPRESS is simpler, so + * it is a little faster to compress and decompress. * * The XPRESS compression format is mostly documented in a file called "[MS-XCA] * Xpress Compression Algorithm". In the MSDN library, it can currently be @@ -41,203 +40,139 @@ * If you are already familiar with the LZ77 algorithm and Huffman coding, the * XPRESS format is fairly simple. The compressed data begins with 256 bytes * that contain 512 4-bit integers that are the lengths of the symbols in the - * Huffman tree used for decoding compressed literals. This is the only Huffman - * tree that is used for the entirety of the compressed data, and the codeword + * Huffman code used for match/literal headers. In contrast with more + * complicated formats such as DEFLATE and LZX, this is the only Huffman code + * that is used for the entirety of the XPRESS compressed data, and the codeword * lengths are not encoded with a pretree. * * The rest of the compressed data is Huffman-encoded symbols. Values 0 through - * 255 are literal bytes. Values 256 through 511 are matches and may require - * extra bits or bytes to be read to get the match offset and match length. - * - * There is no notion of a "compressed block" in the XPRESS format, so in the - * XPRESS format, each WIM chunk (32768 bytes) will always use only one Huffman - * tree. + * 255 represent the corresponding literal bytes. Values 256 through 511 + * represent matches and may require extra bits or bytes to be read to get the + * match offset and match length. * - * The trickiest part is probably the fact that literal bytes for match lengths - * are encoded "separately" from the bitstream. + * The trickiest part is probably the way in which literal bytes for match + * lengths are interleaved in the bitstream. * * Also, a caveat--- according to Microsoft's documentation for XPRESS, * - * "Some implementation of the decompression algorithm expect an extra - * symbol to mark the end of the data. Specifically, some implementations - * fail during decompression if the Huffman symbol 256 is not found after - * the actual data." - * - * This is the case for WIM files--- in we must write this extra symbol "256" at - * the end. Otherwise Microsoft's software will fail to decompress the - * XPRESS-compressed data. + * "Some implementation of the decompression algorithm expect an extra + * symbol to mark the end of the data. Specifically, some implementations + * fail during decompression if the Huffman symbol 256 is not found after + * the actual data." * - * However, wimlib's decompressor in this file currently does not care if this - * extra symbol is there or not. + * This is the case for the implementation in WIMGAPI. However, wimlib's + * decompressor in this file currently does not care if this extra symbol is + * there or not. */ -#include "util.h" -#include "xpress.h" -#include "wimlib.h" +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif -#define XPRESS_DECOMP -#include "decompress.h" +#include "wimlib/decompressor_ops.h" +#include "wimlib/decompress_common.h" +#include "wimlib/error.h" +#include "wimlib/xpress.h" -/* - * Decodes a symbol @huffsym that begins an XPRESS match. - * - * The low 8 bits of the symbol are divided into: - * - * bits 0-3: length header - * bits 4-7: index of high-order bit of match offset - * - * Note: taking the low 8 bits of the symbol is the same as subtracting 256, the - * number of symbols reserved for literals. - */ -static int xpress_decode_match(int huffsym, unsigned window_pos, - unsigned window_len, u8 window[], - struct input_bitstream *istream) +/* This value is chosen for fast decompression. */ +#define XPRESS_TABLEBITS 12 + +/* Decode the matches and literal bytes in a region of XPRESS-encoded data. */ +static int +xpress_decode_window(struct input_bitstream *istream, const u16 *decode_table, + u8 *window, unsigned window_size) { + u8 *window_ptr = window; + u8 *window_end = &window[window_size]; + unsigned sym; unsigned match_len; + unsigned offset_bsr; unsigned match_offset; - u8 match_sym = (u8)huffsym; - u8 len_hdr = match_sym & 0xf; - u8 offset_bsr = match_sym >> 4; - int ret; - u8 *match_dest; - u8 *match_src; - unsigned i; - - ret = bitstream_read_bits(istream, offset_bsr, &match_offset); - if (ret != 0) - return -1; - match_offset |= (1 << offset_bsr); - if (len_hdr == 0xf) { - ret = bitstream_read_byte(istream); - if (ret == -1) - return -1; - match_len = ret; - if (match_len == 0xff) { - ret = bitstream_read_byte(istream); - if (ret == -1) - return -1; - match_len = ret; - - ret = bitstream_read_byte(istream); - if (ret == -1) - return -1; - - match_len |= (ret << 8); - if (match_len < 0xf) - return -1; - } else { - match_len += 0xf; + while (window_ptr != window_end) { + + sym = read_huffsym(istream, decode_table, + XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); + if (sym < XPRESS_NUM_CHARS) { + /* Literal */ + *window_ptr++ = sym; + continue; } - } else { - match_len = len_hdr; - } - match_len += XPRESS_MIN_MATCH; - /* Verify that the match is in the bounds of the part of the window - * currently in use, then copy the source of the match to the current - * position. */ + /* Match */ + match_len = sym & 0xf; + offset_bsr = (sym >> 4) & 0xf; - match_dest = window + window_pos; - match_src = match_dest - match_offset; + bitstream_ensure_bits(istream, 16); - if (window_pos + match_len > window_len) { - ERROR("XPRESS decompression error: match of length %d " - "bytes overflows window", match_len); - return -1; - } + match_offset = (1 << offset_bsr) | + bitstream_pop_bits(istream, offset_bsr); - if (match_src < window) { - ERROR("XPRESS decompression error: match of length %d bytes " - "references data before window (match_offset = %d, " - "window_pos = %d)", match_len, match_offset, window_pos); - return -1; - } + if (match_len == 0xf) { + match_len += bitstream_read_byte(istream); + if (match_len == 0xf + 0xff) + match_len = bitstream_read_u16(istream); + } + match_len += XPRESS_MIN_MATCH_LEN; - for (i = 0; i < match_len; i++) - match_dest[i] = match_src[i]; + if (unlikely(match_offset > window_ptr - window)) + return -1; - return match_len; -} + if (unlikely(match_len > window_end - window_ptr)) + return -1; -/* Decodes the Huffman-encoded matches and literal bytes in a block of - * XPRESS-encoded data. */ -static int xpress_decompress_literals(struct input_bitstream *istream, - u8 uncompressed_data[], - unsigned uncompressed_len, - const u8 lens[], - const u16 decode_table[]) -{ - unsigned curpos = 0; - unsigned huffsym; - int match_len; - int ret = 0; - - while (curpos < uncompressed_len) { - ret = read_huffsym(istream, decode_table, lens, - XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS, - &huffsym, XPRESS_MAX_CODEWORD_LEN); - if (ret != 0) - break; - - if (huffsym < XPRESS_NUM_CHARS) { - uncompressed_data[curpos++] = huffsym; - } else { - match_len = xpress_decode_match(huffsym, - curpos, - uncompressed_len, - uncompressed_data, - istream); - if (match_len == -1) { - ret = 1; - break; - } - curpos += match_len; - } + lz_copy(window_ptr, match_len, match_offset, window_end); + + window_ptr += match_len; } - return ret; + return 0; } - -int xpress_decompress(const void *__compressed_data, unsigned compressed_len, - void *uncompressed_data, unsigned uncompressed_len) +static int +xpress_decompress(const void *compressed_data, size_t compressed_size, + void *uncompressed_data, size_t uncompressed_size, void *_ctx) { + const u8 *cdata = compressed_data; u8 lens[XPRESS_NUM_SYMBOLS]; - u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; - struct input_bitstream istream; u8 *lens_p; - const u8 *compressed_data; - unsigned i; - int ret; + u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS] + _aligned_attribute(DECODE_TABLE_ALIGNMENT); + struct input_bitstream istream; + + /* XPRESS uses only one Huffman code. It contains 512 symbols, and the + * code lengths of these symbols are given literally as 4-bit integers + * in the first 256 bytes of the compressed data. */ + if (compressed_size < XPRESS_NUM_SYMBOLS / 2) + return -1; - compressed_data = __compressed_data; lens_p = lens; + for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { + *lens_p++ = cdata[i] & 0xf; + *lens_p++ = cdata[i] >> 4; + } - DEBUG2("compressed_len = %d, uncompressed_len = %d", - compressed_len, uncompressed_len); + if (make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS, + XPRESS_TABLEBITS, lens, + XPRESS_MAX_CODEWORD_LEN)) + return -1; - /* XPRESS uses only one Huffman tree. It contains 512 symbols, and the - * code lengths of these symbols are given literally as 4-bit integers - * in the first 256 bytes of the compressed data. - */ - if (compressed_len < XPRESS_NUM_SYMBOLS / 2) - return WIMLIB_ERR_DECOMPRESSION; - - for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { - *lens_p++ = compressed_data[i] & 0xf; - *lens_p++ = compressed_data[i] >> 4; - } + init_input_bitstream(&istream, cdata + XPRESS_NUM_SYMBOLS / 2, + compressed_size - XPRESS_NUM_SYMBOLS / 2); - ret = make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS, - XPRESS_TABLEBITS, lens, - XPRESS_MAX_CODEWORD_LEN); - if (ret != 0) - return ret; + return xpress_decode_window(&istream, decode_table, + uncompressed_data, uncompressed_size); +} - init_input_bitstream(&istream, compressed_data + XPRESS_NUM_SYMBOLS / 2, - compressed_len - XPRESS_NUM_SYMBOLS / 2); +static int +xpress_create_decompressor(size_t max_block_size, void **dec_ret) +{ + if (max_block_size > XPRESS_MAX_OFFSET + 1) + return WIMLIB_ERR_INVALID_PARAM; - return xpress_decompress_literals(&istream, uncompressed_data, - uncompressed_len, lens, - decode_table); + return 0; } + +const struct decompressor_ops xpress_decompressor_ops = { + .create_decompressor = xpress_create_decompressor, + .decompress = xpress_decompress, +};