X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fxpress-decompress.c;h=855f2cb311ccb156db1ecd9e00600c86255e458a;hp=de395de96afeecd3cdf3ad83fc62cdab00e1a3f4;hb=90f461637225b29f3b0ee9fa3862cff4ecb56ad8;hpb=4c16dabbfa70bc4c019463091acae1266f600113

diff --git a/src/xpress-decompress.c b/src/xpress-decompress.c
index de395de9..855f2cb3 100644
--- a/src/xpress-decompress.c
+++ b/src/xpress-decompress.c
@@ -6,7 +6,7 @@
 
 /*
  *
- * Copyright (C) 2012 Eric Biggers
+ * Copyright (C) 2012, 2013 Eric Biggers
  *
  * This file is part of wimlib, a library for working with WIM files.
  *
@@ -25,11 +25,10 @@
  */
 
 
-
 /*
- * The XPRESS compression format is a LZ77 and Huffman-code based algorithm.
- * That means it is quite similar to LZX compression, but XPRESS is slightly
- * simpler, so it is a little faster to compress and decompress.
+ * The XPRESS compression format is an LZ77 and Huffman-code based algorithm.
+ * That means it is fairly similar to LZX compression, but XPRESS is simpler, so
+ * it is a little faster to compress and decompress.
  *
  * The XPRESS compression format is mostly documented in a file called "[MS-XCA]
  * Xpress Compression Algorithm".  In the MSDN library, it can currently be
@@ -41,118 +40,97 @@
  * If you are already familiar with the LZ77 algorithm and Huffman coding, the
  * XPRESS format is fairly simple.  The compressed data begins with 256 bytes
  * that contain 512 4-bit integers that are the lengths of the symbols in the
- * Huffman tree used for decoding compressed literals.  This is the only Huffman
- * tree that is used for the entirety of the compressed data, and the codeword
+ * Huffman code used for match/literal headers.  In contrast with more
+ * complicated formats such as DEFLATE and LZX, this is the only Huffman code
+ * that is used for the entirety of the XPRESS compressed data, and the codeword
  * lengths are not encoded with a pretree.
  *
  * The rest of the compressed data is Huffman-encoded symbols.  Values 0 through
- * 255 are literal bytes.  Values 256 through 511 are matches and may require
- * extra bits or bytes to be read to get the match offset and match length.
- *
- * There is no notion of a "compressed block" in the XPRESS format, so in the
- * XPRESS format, each WIM chunk (32768 bytes) will always use only one Huffman
- * tree.
+ * 255 represent the corresponding literal bytes.  Values 256 through 511
+ * represent matches and may require extra bits or bytes to be read to get the
+ * match offset and match length.
  *
- * The trickiest part is probably the fact that literal bytes for match lengths
- * are encoded "separately" from the bitstream.
+ * The trickiest part is probably the way in which literal bytes for match
+ * lengths are interleaved in the bitstream.
  *
  * Also, a caveat--- according to Microsoft's documentation for XPRESS,
  *
- * 	"Some implementation of the decompression algorithm expect an extra
- * 	symbol to mark the end of the data.  Specifically, some implementations
- * 	fail during decompression if the Huffman symbol 256 is not found after
- * 	the actual data."
- *
- * This is the case for WIM files--- in we must write this extra symbol "256" at
- * the end.  Otherwise Microsoft's software will fail to decompress the
- * XPRESS-compressed data.
+ *	"Some implementation of the decompression algorithm expect an extra
+ *	symbol to mark the end of the data.  Specifically, some implementations
+ *	fail during decompression if the Huffman symbol 256 is not found after
+ *	the actual data."
  *
- * However, wimlib's decompressor in this file currently does not care if this
- * extra symbol is there or not.
+ * This is the case for the implementation in WIMGAPI.  However, wimlib's
+ * decompressor in this file currently does not care if this extra symbol is
+ * there or not.
  */
 
-#include "util.h"
-#include "xpress.h"
-#include "wimlib.h"
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
 
-#define XPRESS_DECOMP
-#include "decompress.h"
+#include "wimlib.h"
+#include "wimlib/decompressor_ops.h"
+#include "wimlib/decompress_common.h"
+#include "wimlib/xpress.h"
 
 /*
- * Decodes a symbol @huffsym that begins an XPRESS match.
+ * Decodes a symbol @sym that begins an XPRESS match.
  *
  * The low 8 bits of the symbol are divided into:
  *
  * bits 0-3:  length header
  * bits 4-7:  index of high-order bit of match offset
  *
- * Note: taking the low 8 bits of the symbol is the same as subtracting 256, the
- * number of symbols reserved for literals.
- *
- * Returns the match length, or -1 on error.
+ * Returns the match length, or -1 if the data is invalid.
  */
-static int xpress_decode_match(unsigned huffsym, unsigned window_pos,
-			       unsigned window_len, u8 window[],
-			       struct input_bitstream *istream)
+static int
+xpress_decode_match(unsigned sym, input_idx_t window_pos,
+		    input_idx_t window_len, u8 window[restrict],
+		    struct input_bitstream * restrict istream)
 {
-	unsigned match_len;
-	unsigned match_offset;
-	u8 match_sym = (u8)huffsym;
-	u8 len_hdr = match_sym & 0xf;
-	u8 offset_bsr = match_sym >> 4;
-	int ret;
+
+	unsigned len_hdr;
+	unsigned offset_bsr;
 	u8 *match_dest;
 	u8 *match_src;
 	unsigned i;
+	unsigned match_len;
+	unsigned match_offset;
 
-	ret = bitstream_read_bits(istream, offset_bsr, &match_offset);
-	if (ret != 0)
-		return ret;
-	match_offset |= (1 << offset_bsr);
+	sym -= XPRESS_NUM_CHARS;
+	len_hdr = sym & 0xf;
+	offset_bsr = sym >> 4;
 
-	if (len_hdr == 0xf) {
-		ret = bitstream_read_byte(istream);
-		if (ret < 0)
-			return ret;
-		match_len = ret;
-		if (match_len == 0xff) {
-			ret = bitstream_read_byte(istream);
-			if (ret < 0)
-				return ret;
-			match_len = ret;
+	bitstream_ensure_bits(istream, 16);
 
-			ret = bitstream_read_byte(istream);
-			if (ret < 0)
-				return ret;
+	match_offset = (1U << offset_bsr) | bitstream_pop_bits(istream, offset_bsr);
 
-			match_len |= (ret << 8);
+	if (len_hdr == 0xf) {
+		match_len = bitstream_read_byte(istream);
+		if (unlikely(match_len == 0xff)) {
+			match_len = bitstream_read_byte(istream);
+			match_len |= (unsigned)bitstream_read_byte(istream) << 8;
 		} else {
 			match_len += 0xf;
 		}
 	} else {
 		match_len = len_hdr;
 	}
-	match_len += XPRESS_MIN_MATCH;
+	match_len += XPRESS_MIN_MATCH_LEN;
 
-	/* Verify that the match is in the bounds of the part of the window
-	 * currently in use, then copy the source of the match to the current
-	 * position. */
 
-	match_dest = window + window_pos;
-	match_src = match_dest - match_offset;
+	/* Verify the match is in bounds, then copy its data to the current
+	 * position.  */
 
-	if (window_pos + match_len > window_len) {
-		ERROR("XPRESS decompression error: match of length %u "
-		      "bytes overflows window", match_len);
+	if (window_pos + match_len > window_len)
 		return -1;
-	}
 
-	if (match_src < window) {
-		ERROR("XPRESS decompression error: match of length %u bytes "
-		      "references data before window (match_offset = %u, "
-		      "window_pos = %u)", match_len, match_offset, window_pos);
+	if (match_offset > window_pos)
 		return -1;
-	}
+
+	match_dest = window + window_pos;
+	match_src = match_dest - match_offset;
 
 	for (i = 0; i < match_len; i++)
 		match_dest[i] = match_src[i];
@@ -160,83 +138,80 @@ static int xpress_decode_match(unsigned huffsym, unsigned window_pos,
 	return match_len;
 }
 
-/* Decodes the Huffman-encoded matches and literal bytes in a block of
- * XPRESS-encoded data. */
-static int xpress_decompress_block(struct input_bitstream *istream,
-				   u8 uncompressed_data[],
-				   unsigned uncompressed_len,
-				   const u8 lens[],
-				   const u16 decode_table[])
+/* Decodes the Huffman-encoded matches and literal bytes in a region of
+ * XPRESS-encoded data.  */
+static int
+xpress_lz_decode(struct input_bitstream * restrict istream,
+		 u8 uncompressed_data[restrict],
+		 unsigned uncompressed_len,
+		 const u16 decode_table[restrict])
 {
-	unsigned curpos;
-	unsigned huffsym;
-	int ret;
-	int match_len;
-
-	curpos = 0;
-	while (curpos < uncompressed_len) {
-		ret = read_huffsym(istream, decode_table, lens,
-				   XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS,
-				   &huffsym, XPRESS_MAX_CODEWORD_LEN);
-		if (ret != 0)
-			return ret;
-
-		if (huffsym < XPRESS_NUM_CHARS) {
-			uncompressed_data[curpos++] = huffsym;
+	input_idx_t curpos;
+	unsigned match_len;
+
+	for (curpos = 0; curpos < uncompressed_len; curpos += match_len) {
+		unsigned sym;
+		int ret;
+
+		bitstream_ensure_bits(istream, 16);
+
+		sym = read_huffsym(istream, decode_table,
+				   XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN);
+		if (sym < XPRESS_NUM_CHARS) {
+			/* Literal  */
+			uncompressed_data[curpos] = sym;
+			match_len = 1;
 		} else {
-			match_len = xpress_decode_match(huffsym,
-							curpos,
-							uncompressed_len,
-							uncompressed_data,
-							istream);
-			if (match_len < 0)
-				return match_len;
-			curpos += match_len;
+			/* Match  */
+			ret = xpress_decode_match(sym,
+						  curpos,
+						  uncompressed_len,
+						  uncompressed_data,
+						  istream);
+			if (unlikely(ret < 0))
+				return -1;
+			match_len = ret;
 		}
 	}
 	return 0;
 }
 
 
-int xpress_decompress(const void *__compressed_data, unsigned compressed_len,
-		      void *uncompressed_data, unsigned uncompressed_len)
+static int
+xpress_decompress(const void *compressed_data, size_t compressed_size,
+		  void *uncompressed_data, size_t uncompressed_size, void *_ctx)
 {
+	const u8 *cdata = compressed_data;
 	u8 lens[XPRESS_NUM_SYMBOLS];
-	u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS];
-	struct input_bitstream istream;
 	u8 *lens_p;
-	const u8 *compressed_data;
-	unsigned i;
-	int ret;
-
-	compressed_data = __compressed_data;
-	lens_p = lens;
-
-	DEBUG2("compressed_len = %d, uncompressed_len = %d",
-	       compressed_len, uncompressed_len);
+	u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]
+			_aligned_attribute(DECODE_TABLE_ALIGNMENT);
+	struct input_bitstream istream;
 
-	/* XPRESS uses only one Huffman tree.  It contains 512 symbols, and the
+	/* XPRESS uses only one Huffman code.  It contains 512 symbols, and the
 	 * code lengths of these symbols are given literally as 4-bit integers
-	 * in the first 256 bytes of the compressed data.
-	 */
-	if (compressed_len < XPRESS_NUM_SYMBOLS / 2)
+	 * in the first 256 bytes of the compressed data.  */
+	if (compressed_size < XPRESS_NUM_SYMBOLS / 2)
 		return -1;
 
-	for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
-		*lens_p++ = compressed_data[i] & 0xf;
-		*lens_p++ = compressed_data[i] >> 4;
+	lens_p = lens;
+	for (unsigned i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
+		*lens_p++ = cdata[i] & 0xf;
+		*lens_p++ = cdata[i] >> 4;
 	}
 
-	ret = make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS,
-					XPRESS_TABLEBITS, lens,
-					XPRESS_MAX_CODEWORD_LEN);
-	if (ret != 0)
-		return ret;
+	if (make_huffman_decode_table(decode_table, XPRESS_NUM_SYMBOLS,
+				      XPRESS_TABLEBITS, lens,
+				      XPRESS_MAX_CODEWORD_LEN))
+		return -1;
 
-	init_input_bitstream(&istream, compressed_data + XPRESS_NUM_SYMBOLS / 2,
-			     compressed_len - XPRESS_NUM_SYMBOLS / 2);
+	init_input_bitstream(&istream, cdata + XPRESS_NUM_SYMBOLS / 2,
+			     compressed_size - XPRESS_NUM_SYMBOLS / 2);
 
-	return xpress_decompress_block(&istream, uncompressed_data,
-				       uncompressed_len, lens,
-				       decode_table);
+	return xpress_lz_decode(&istream, uncompressed_data,
+				uncompressed_size, decode_table);
 }
+
+const struct decompressor_ops xpress_decompressor_ops = {
+	.decompress = xpress_decompress,
+};