/*
 * lzx_decompress.c
 *
 * A decompressor for the LZX compression format, as used in WIM files.
 */

/*
 * Copyright (C) 2012-2016 Eric Biggers
 *
 * This file is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option) any
 * later version.
 *
 * This file is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this file; if not, see http://www.gnu.org/licenses/.
 */

/*
 * LZX is an LZ77 and Huffman-code based compression format that has many
 * similarities to DEFLATE (the format used by zlib/gzip).  The compression
 * ratio is as good or better than DEFLATE.  See lzx_compress.c for a format
 * overview, and see https://en.wikipedia.org/wiki/LZX_(algorithm) for a
 * historical overview.  Here I make some pragmatic notes.
 *
 * The old specification for LZX is the document "Microsoft LZX Data Compression
 * Format" (1997).  It defines the LZX format as used in cabinet files.  Allowed
 * window sizes are 2^n where 15 <= n <= 21.  However, this document contains
 * several errors, so don't read too much into it...
 *
 * The new specification for LZX is the document "[MS-PATCH]: LZX DELTA
 * Compression and Decompression" (2014).  It defines the LZX format as used by
 * Microsoft's binary patcher.  It corrects several errors in the 1997 document
 * and extends the format in several ways --- namely, optional reference data,
 * up to 2^25 byte windows, and longer match lengths.
 *
 * WIM files use a more restricted form of LZX.  No LZX DELTA extensions are
 * present, the window is not "sliding", E8 preprocessing is done
 * unconditionally with a fixed file size, and the maximum window size is always
 * 2^15 bytes (equal to the size of each "chunk" in a compressed WIM resource).
 * This code is primarily intended to implement this form of LZX.  But although
 * not compatible with WIMGAPI, this code also supports maximum window sizes up
 * to 2^21 bytes.
 *
 * TODO: Add support for window sizes up to 2^25 bytes.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include <string.h>

#include "wimlib/decompressor_ops.h"
#include "wimlib/decompress_common.h"
#include "wimlib/error.h"
#include "wimlib/lzx_common.h"
#include "wimlib/util.h"

/* These values are chosen for fast decompression.  */
#define LZX_MAINCODE_TABLEBITS		11
#define LZX_LENCODE_TABLEBITS		9
#define LZX_PRECODE_TABLEBITS		6
#define LZX_ALIGNEDCODE_TABLEBITS	7

#define LZX_READ_LENS_MAX_OVERRUN 50

struct lzx_decompressor {

	DECODE_TABLE(maincode_decode_table, LZX_MAINCODE_MAX_NUM_SYMBOLS,
		     LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
	u8 maincode_lens[LZX_MAINCODE_MAX_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];

	DECODE_TABLE(lencode_decode_table, LZX_LENCODE_NUM_SYMBOLS,
		     LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
	u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];

	union {
		DECODE_TABLE(alignedcode_decode_table, LZX_ALIGNEDCODE_NUM_SYMBOLS,
			     LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN);
		u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS];
	};

	union {
		DECODE_TABLE(precode_decode_table, LZX_PRECODE_NUM_SYMBOLS,
			     LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN);
		u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
		u8 extra_offset_bits[LZX_MAX_OFFSET_SLOTS];
	};

	union {
		DECODE_TABLE_WORKING_SPACE(maincode_working_space,
					   LZX_MAINCODE_MAX_NUM_SYMBOLS,
					   LZX_MAX_MAIN_CODEWORD_LEN);
		DECODE_TABLE_WORKING_SPACE(lencode_working_space,
					   LZX_LENCODE_NUM_SYMBOLS,
					   LZX_MAX_LEN_CODEWORD_LEN);
		DECODE_TABLE_WORKING_SPACE(alignedcode_working_space,
					   LZX_ALIGNEDCODE_NUM_SYMBOLS,
					   LZX_MAX_ALIGNED_CODEWORD_LEN);
		DECODE_TABLE_WORKING_SPACE(precode_working_space,
					   LZX_PRECODE_NUM_SYMBOLS,
					   LZX_MAX_PRE_CODEWORD_LEN);
	};

	unsigned window_order;
	unsigned num_main_syms;

	/* Like lzx_extra_offset_bits[], but does not include the entropy-coded
	 * bits of aligned offset blocks */
	u8 extra_offset_bits_minus_aligned[LZX_MAX_OFFSET_SLOTS];

} _aligned_attribute(DECODE_TABLE_ALIGNMENT);

/* Read a Huffman-encoded symbol using the precode. */
static forceinline unsigned
read_presym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->precode_decode_table,
			    LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN);
}

/* Read a Huffman-encoded symbol using the main code. */
static forceinline unsigned
read_mainsym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->maincode_decode_table,
			    LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
}

/* Read a Huffman-encoded symbol using the length code. */
static forceinline unsigned
read_lensym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->lencode_decode_table,
			    LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
}

/* Read a Huffman-encoded symbol using the aligned offset code. */
static forceinline unsigned
read_alignedsym(const struct lzx_decompressor *d, struct input_bitstream *is)
{
	return read_huffsym(is, d->alignedcode_decode_table,
			    LZX_ALIGNEDCODE_TABLEBITS, LZX_MAX_ALIGNED_CODEWORD_LEN);
}

/*
 * Read a precode from the compressed input bitstream, then use it to decode
 * @num_lens codeword length values and write them to @lens.
 */
static int
lzx_read_codeword_lens(struct lzx_decompressor *d, struct input_bitstream *is,
		       u8 *lens, unsigned num_lens)
{
	u8 *len_ptr = lens;
	u8 *lens_end = lens + num_lens;

	/* Read the lengths of the precode codewords.  These are stored
	 * explicitly. */
	for (int i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) {
		d->precode_lens[i] =
			bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE);
	}

	/* Build the decoding table for the precode. */
	if (make_huffman_decode_table(d->precode_decode_table,
				      LZX_PRECODE_NUM_SYMBOLS,
				      LZX_PRECODE_TABLEBITS,
				      d->precode_lens,
				      LZX_MAX_PRE_CODEWORD_LEN,
				      d->precode_working_space))
		return -1;

	/* Decode the codeword lengths.  */
	do {
		unsigned presym;
		u8 len;

		/* Read the next precode symbol.  */
		presym = read_presym(d, is);
		if (presym < 17) {
			/* Difference from old length  */
			len = *len_ptr - presym;
			if ((s8)len < 0)
				len += 17;
			*len_ptr++ = len;
		} else {
			/* Special RLE values  */

			unsigned run_len;

			if (presym == 17) {
				/* Run of 0's  */
				run_len = 4 + bitstream_read_bits(is, 4);
				len = 0;
			} else if (presym == 18) {
				/* Longer run of 0's  */
				run_len = 20 + bitstream_read_bits(is, 5);
				len = 0;
			} else {
				/* Run of identical lengths  */
				run_len = 4 + bitstream_read_bits(is, 1);
				presym = read_presym(d, is);
				if (unlikely(presym > 17))
					return -1;
				len = *len_ptr - presym;
				if ((s8)len < 0)
					len += 17;
			}

			do {
				*len_ptr++ = len;
			} while (--run_len);
			/*
			 * The worst case overrun is when presym == 18,
			 * run_len == 20 + 31, and only 1 length was remaining.
			 * So LZX_READ_LENS_MAX_OVERRUN == 50.
			 *
			 * Overrun while reading the first half of maincode_lens
			 * can corrupt the previous values in the second half.
			 * This doesn't really matter because the resulting
			 * lengths will still be in range, and data that
			 * generates overruns is invalid anyway.
			 */
		}
	} while (len_ptr < lens_end);

	return 0;
}

/*
 * Read the header of an LZX block.  For all block types, the block type and
 * size is saved in *block_type_ret and *block_size_ret, respectively.  For
 * compressed blocks, the codeword lengths are also saved.  For uncompressed
 * blocks, the recent offsets queue is also updated.
 */
static int
lzx_read_block_header(struct lzx_decompressor *d, struct input_bitstream *is,
		      u32 recent_offsets[], int *block_type_ret,
		      u32 *block_size_ret)
{
	int block_type;
	u32 block_size;

	bitstream_ensure_bits(is, 4);

	/* Read the block type. */
	block_type = bitstream_pop_bits(is, 3);

	/* Read the block size. */
	if (bitstream_pop_bits(is, 1)) {
		block_size = LZX_DEFAULT_BLOCK_SIZE;
	} else {
		block_size = bitstream_read_bits(is, 16);
		if (d->window_order >= 16) {
			block_size <<= 8;
			block_size |= bitstream_read_bits(is, 8);
		}
	}

	switch (block_type) {

	case LZX_BLOCKTYPE_ALIGNED:

		/* Read the aligned offset codeword lengths. */

		for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
			d->alignedcode_lens[i] =
				bitstream_read_bits(is,
						    LZX_ALIGNEDCODE_ELEMENT_SIZE);
		}

		/* Fall though, since the rest of the header for aligned offset
		 * blocks is the same as that for verbatim blocks.  */

	case LZX_BLOCKTYPE_VERBATIM:

		/* Read the main codeword lengths, which are divided into two
		 * parts: literal symbols and match headers. */

		if (lzx_read_codeword_lens(d, is, d->maincode_lens,
					   LZX_NUM_CHARS))
			return -1;

		if (lzx_read_codeword_lens(d, is, d->maincode_lens + LZX_NUM_CHARS,
					   d->num_main_syms - LZX_NUM_CHARS))
			return -1;


		/* Read the length codeword lengths. */

		if (lzx_read_codeword_lens(d, is, d->lencode_lens,
					   LZX_LENCODE_NUM_SYMBOLS))
			return -1;

		break;

	case LZX_BLOCKTYPE_UNCOMPRESSED:
		/*
		 * The header of an uncompressed block contains new values for
		 * the recent offsets queue, starting on the next 16-bit
		 * boundary in the bitstream.  Careful: if the stream is
		 * *already* aligned, the correct thing to do is to throw away
		 * the next 16 bits (this is probably a mistake in the format).
		 */
		bitstream_ensure_bits(is, 1);
		bitstream_align(is);
		recent_offsets[0] = bitstream_read_u32(is);
		recent_offsets[1] = bitstream_read_u32(is);
		recent_offsets[2] = bitstream_read_u32(is);

		/* Offsets of 0 are invalid.  */
		if (recent_offsets[0] == 0 || recent_offsets[1] == 0 ||
		    recent_offsets[2] == 0)
			return -1;
		break;

	default:
		/* Unrecognized block type.  */
		return -1;
	}

	*block_type_ret = block_type;
	*block_size_ret = block_size;
	return 0;
}

/* Decompress a block of LZX-compressed data. */
static int
lzx_decompress_block(struct lzx_decompressor *d, struct input_bitstream *is,
		     int block_type, u32 block_size,
		     u8 * const out_begin, u8 *out_next, u32 recent_offsets[])
{
	u8 * const block_end = out_next + block_size;
	unsigned min_aligned_offset_slot;

	/*
	 * Build the Huffman decode tables.  We always need to build the main
	 * and length decode tables.  For aligned blocks we additionally need to
	 * build the aligned offset decode table.
	 */

	if (make_huffman_decode_table(d->maincode_decode_table,
				      d->num_main_syms,
				      LZX_MAINCODE_TABLEBITS,
				      d->maincode_lens,
				      LZX_MAX_MAIN_CODEWORD_LEN,
				      d->maincode_working_space))
		return -1;

	if (make_huffman_decode_table(d->lencode_decode_table,
				      LZX_LENCODE_NUM_SYMBOLS,
				      LZX_LENCODE_TABLEBITS,
				      d->lencode_lens,
				      LZX_MAX_LEN_CODEWORD_LEN,
				      d->lencode_working_space))
		return -1;

	if (block_type == LZX_BLOCKTYPE_ALIGNED) {
		if (make_huffman_decode_table(d->alignedcode_decode_table,
					      LZX_ALIGNEDCODE_NUM_SYMBOLS,
					      LZX_ALIGNEDCODE_TABLEBITS,
					      d->alignedcode_lens,
					      LZX_MAX_ALIGNED_CODEWORD_LEN,
					      d->alignedcode_working_space))
			return -1;
		min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
		memcpy(d->extra_offset_bits, d->extra_offset_bits_minus_aligned,
		       sizeof(lzx_extra_offset_bits));
	} else {
		min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
		memcpy(d->extra_offset_bits, lzx_extra_offset_bits,
		       sizeof(lzx_extra_offset_bits));
	}

	/* Decode the literals and matches. */

	do {
		unsigned mainsym;
		unsigned length;
		u32 offset;
		unsigned offset_slot;

		mainsym = read_mainsym(d, is);
		if (mainsym < LZX_NUM_CHARS) {
			/* Literal */
			*out_next++ = mainsym;
			continue;
		}

		/* Match */

		/* Decode the length header and offset slot.  */
		STATIC_ASSERT(LZX_NUM_CHARS % LZX_NUM_LEN_HEADERS == 0);
		length = mainsym % LZX_NUM_LEN_HEADERS;
		offset_slot = (mainsym - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;

		/* If needed, read a length symbol to decode the full length. */
		if (length == LZX_NUM_PRIMARY_LENS)
			length += read_lensym(d, is);
		length += LZX_MIN_MATCH_LEN;

		if (offset_slot < LZX_NUM_RECENT_OFFSETS) {
			/* Repeat offset  */

			/* Note: This isn't a real LRU queue, since using the R2
			 * offset doesn't bump the R1 offset down to R2. */
			offset = recent_offsets[offset_slot];
			recent_offsets[offset_slot] = recent_offsets[0];
		} else {
			/* Explicit offset  */
			offset = bitstream_read_bits(is, d->extra_offset_bits[offset_slot]);
			if (offset_slot >= min_aligned_offset_slot) {
				offset = (offset << LZX_NUM_ALIGNED_OFFSET_BITS) |
					 read_alignedsym(d, is);
			}
			offset += lzx_offset_slot_base[offset_slot];

			/* Update the match offset LRU queue.  */
			STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
			recent_offsets[2] = recent_offsets[1];
			recent_offsets[1] = recent_offsets[0];
		}
		recent_offsets[0] = offset;

		/* Validate the match and copy it to the current position.  */
		if (unlikely(lz_copy(length, offset, out_begin,
				     out_next, block_end, LZX_MIN_MATCH_LEN)))
			return -1;
		out_next += length;
	} while (out_next != block_end);

	return 0;
}

static int
lzx_decompress(const void *restrict compressed_data, size_t compressed_size,
	       void *restrict uncompressed_data, size_t uncompressed_size,
	       void *restrict _d)
{
	struct lzx_decompressor *d = _d;
	u8 * const out_begin = uncompressed_data;
	u8 *out_next = out_begin;
	u8 * const out_end = out_begin + uncompressed_size;
	struct input_bitstream is;
	STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
	u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
	unsigned may_have_e8_byte = 0;

	init_input_bitstream(&is, compressed_data, compressed_size);

	/* Codeword lengths begin as all 0's for delta encoding purposes. */
	memset(d->maincode_lens, 0, d->num_main_syms);
	memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS);

	/* Decompress blocks until we have all the uncompressed data. */

	while (out_next != out_end) {
		int block_type;
		u32 block_size;

		if (lzx_read_block_header(d, &is, recent_offsets,
					  &block_type, &block_size))
			return -1;

		if (block_size < 1 || block_size > out_end - out_next)
			return -1;

		if (likely(block_type != LZX_BLOCKTYPE_UNCOMPRESSED)) {

			/* Compressed block */
			if (lzx_decompress_block(d, &is, block_type, block_size,
						 out_begin, out_next,
						 recent_offsets))
				return -1;

			/* If the first E8 byte was in this block, then it must
			 * have been encoded as a literal using mainsym E8. */
			may_have_e8_byte |= d->maincode_lens[0xE8];
		} else {

			/* Uncompressed block */
			if (bitstream_read_bytes(&is, out_next, block_size))
				return -1;

			/* Re-align the bitstream if needed. */
			if (block_size & 1)
				bitstream_read_byte(&is);

			/* There may have been an E8 byte in the block. */
			may_have_e8_byte = 1;
		}
		out_next += block_size;
	}

	/* Postprocess the data unless it cannot possibly contain E8 bytes. */
	if (may_have_e8_byte)
		lzx_postprocess(uncompressed_data, uncompressed_size);

	return 0;
}

static int
lzx_create_decompressor(size_t max_block_size, void **d_ret)
{
	unsigned window_order;
	struct lzx_decompressor *d;

	window_order = lzx_get_window_order(max_block_size);
	if (window_order == 0)
		return WIMLIB_ERR_INVALID_PARAM;

	d = ALIGNED_MALLOC(sizeof(*d), DECODE_TABLE_ALIGNMENT);
	if (!d)
		return WIMLIB_ERR_NOMEM;

	d->window_order = window_order;
	d->num_main_syms = lzx_get_num_main_syms(window_order);

	/* Initialize 'd->extra_offset_bits_minus_aligned'. */
	STATIC_ASSERT(sizeof(d->extra_offset_bits_minus_aligned) ==
		      sizeof(lzx_extra_offset_bits));
	STATIC_ASSERT(sizeof(d->extra_offset_bits) ==
		      sizeof(lzx_extra_offset_bits));
	memcpy(d->extra_offset_bits_minus_aligned, lzx_extra_offset_bits,
	       sizeof(lzx_extra_offset_bits));
	for (unsigned offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
	     offset_slot < LZX_MAX_OFFSET_SLOTS; offset_slot++)
	{
		d->extra_offset_bits_minus_aligned[offset_slot] -=
				LZX_NUM_ALIGNED_OFFSET_BITS;
	}

	*d_ret = d;
	return 0;
}

static void
lzx_free_decompressor(void *_d)
{
	ALIGNED_FREE(_d);
}

const struct decompressor_ops lzx_decompressor_ops = {
	.create_decompressor = lzx_create_decompressor,
	.decompress	     = lzx_decompress,
	.free_decompressor   = lzx_free_decompressor,
};