/*
 * lz.c
 *
 * This file provides the code to analyze a buffer of uncompressed data for
 * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
 * process.  This is based on code from zlib (v. 1.2.5).
 */

/*
 * Copyright (C) 2012 Eric Biggers
 * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
 *
 * This file is part of wimlib, a library for working with WIM files.
 *
 * wimlib is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with wimlib; if not, see http://www.gnu.org/licenses/.
 */

#include "comp.h"
#include <string.h>

#define HASH_BITS	15
#define HASH_SIZE	(1 << HASH_BITS)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_SHIFT	((HASH_BITS + 2) /  3)

/* Hash function, based on code from zlib.  This function will update and return
 * the hash value @hash for the string ending on the additional input character
 * @c.  This function must be called for each consecutive character, because it
 * uses a running hash value rather than computing it separately for each
 * 3-character string.
 *
 * The AND operation guarantees that only 3 characters will affect the hash
 * value, so every identical 3-character string will have the same hash value.
 */
static inline uint update_hash(uint hash, u8 c)
{
	return ((hash << HASH_SHIFT) ^ c) & HASH_MASK;
}


/* Insert a 3-character string at position @str_pos in @window and with hash
 * code @hash into the hash table described by @hash_tab and @prev_tab.  Based
 * on code from zlib.
 *
 * The hash table uses chains (linked lists) for the hash buckets, but there are
 * no real pointers involved.  Indexing `hash_tab' by hash value gives the index
 * within the window of the last string in the hash bucket.  To find the index
 * of the previous string in the hash chain, the `prev_tab' array is indexed by
 * the string index.  `prev_tab' can be indexed repeatedly by the string index
 * to walk through the hash chain, until the special index `0' is reached,
 * indicating the end of the hash chain.
 */
static inline uint insert_string(u16 hash_tab[], u16 prev_tab[], 
				 const u8 window[], uint str_pos, uint hash)
{
	hash = update_hash(hash, window[str_pos + 2]);
	prev_tab[str_pos] = hash_tab[hash];
	hash_tab[hash] = str_pos;
	return hash;
}


/*
 * Returns the longest match for a given input position.
 *
 * @window:		The window of uncompressed data.
 * @bytes_remaining:	The number of bytes remaining in the window.
 * @strstart:		The index of the start of the string in the window that
 * 				we are trying to find a match for.
 * @prev_tab:		The array of prev pointers for the hash table.
 * @cur_match:		The index of the head of the hash chain for matches 
 * 				having the hash value of the string beginning 
 * 				at index @strstart.
 * @prev_len:		The length of the match that was found for the string 
 * 				beginning at (@strstart - 1).
 * @match_start_ret:	A location into which the index of the start of the
 * 				match will be returned.
 * @params:		Parameters that affect how long the search will proceed
 * 				before going with the best that has been found
 * 				so far.
 *
 * Returns the length of the match that was found.
 */
static uint longest_match(const u8 window[], uint bytes_remaining,
			  uint strstart, const u16 prev_tab[], 
			  uint cur_match, uint prev_len, 
			  uint *match_start_ret, 
			  const struct lz_params *params)
{
	uint chain_len = params->max_chain_len;

	const u8 *scan = window + strstart;
	const u8 *match;
	uint len;
	uint best_len = prev_len;
	uint match_start = cur_match;

	uint nice_match = min(params->nice_match, bytes_remaining);

	const u8 *strend = scan + params->max_match;

	u8 scan_end1 = scan[best_len - 1];
	u8 scan_end = scan[best_len];


	/* Do not waste too much time if we already have a good match: */
	if (best_len >= params->good_match)
		chain_len >>= 2;

	do {
		match = &window[cur_match];


		/* Skip to next match if the match length cannot increase
		 * or if the match length is less than 2.  Note that the checks below
		 * for insufficient lookahead only occur occasionally for performance
		 * reasons.  Therefore uninitialized memory will be accessed, and
		 * conditional jumps will be made that depend on those values.
		 * However the length of the match is limited to the lookahead, so
		 * the output of deflate is not affected by the uninitialized values.
		 */

		if (match[best_len]   != scan_end  ||
			match[best_len-1] != scan_end1 ||
			*match			!= *scan	 ||
			*++match		  != scan[1])	  continue;

		/* The check at best_len-1 can be removed because it will be made
		 * again later. (This heuristic is not always a win.)
		 * It is not necessary to compare scan[2] and match[2] since they
		 * are always equal when the other bytes match, given that
		 * the hash keys are equal and that HASH_BITS >= 8.
		 */
		scan += 2, match++;

		wimlib_assert(*scan == *match);

		/* We check for insufficient lookahead only every 8th comparison;
		 * the 256th check will be made at strstart+258.  */
		do {
		} while (*++scan == *++match && *++scan == *++match &&
				 *++scan == *++match && *++scan == *++match &&
				 *++scan == *++match && *++scan == *++match &&
				 *++scan == *++match && *++scan == *++match &&
				 scan < strend);

		len = params->max_match - (int)(strend - scan);
		scan = strend - params->max_match;

		if (len > best_len) {
			match_start = cur_match;
			best_len = len;
			if (len >= nice_match) 
				break;
			scan_end1  = scan[best_len - 1];
			scan_end   = scan[best_len];
		}
		cur_match = prev_tab[cur_match];
	} while (--chain_len != 0 && cur_match != 0);
	*match_start_ret = match_start;
	return min(min(best_len, bytes_remaining), params->max_match);
}


/* 
 * Determines the sequence of matches and literals that a block of data will be
 * compressed to.
 *
 * @uncompressed_data:	The data that is to be compressed.
 * @uncompressed_len:	The length of @uncompressed_data, in bytes.
 * @match_tab:		An array for the intermediate representation of matches.
 * @record_match:	A function that will be called to produce the
 * 				intermediate representation of a match, given
 * 				the offset and length.  This function should also
 * 				update the appropriate symbol frequency counts
 * 				so that any needed Huffman codes can be made
 * 				later.
 * @record_literal:	A function that will be called to produce the
 * 				intermediate representation of a literal, given
 * 				the character of the literal.  This function
 * 				should also update the appropriate symbol 
 * 				frequency counts so that any needed Huffman 
 * 				codes can be made later.
 * @record_match_arg_1:
 * @record_match_arg_2:	Extra arguments to be passed to @record_match.
 * @record_literal_arg:	Extra arguments to be passed to @record_literal.
 * @params:		Structure that contains parameters that affect how the
 * 				analysis proceeds (mainly how good the matches
 * 				have to be).
 *
 * Returns the total number of matches and literal bytes that were found; this
 * is the number of slots in @match_tab that have been filled with the
 * intermediate representation of a match or literal byte.
 */
uint lz_analyze_block(const u8 uncompressed_data[], uint uncompressed_len,
		      u32 match_tab[], lz_record_match_t record_match,
		      lz_record_literal_t record_literal, void *record_match_arg1,
		      void *record_match_arg2, void *record_literal_arg,
		      const struct lz_params *params)
{
	uint cur_match_pos = 0;
	uint cur_input_pos = 0;
	uint hash          = 0;
	uint hash_head     = 0;
	uint prev_len      = params->min_match - 1;
	uint prev_start;
	uint match_len     = params->min_match - 1;
	uint match_start   = 0;
	bool match_available = false;
	u16 hash_tab[HASH_SIZE];
	u32 match;
	u16 prev_tab[uncompressed_len];

	ZERO_ARRAY(hash_tab);
	ZERO_ARRAY(prev_tab);

	do {
		/* If there are at least 3 characters remaining in the input,
		 * insert the 3-character string beginning at
		 * uncompressed_data[cur_input_pos] into the hash table.
		 *
		 * hash_head is set to the index of the previous string in the
		 * hash bucket, or 0 if there is no such string */
		if (uncompressed_len - cur_input_pos >= params->min_match) {
			hash = insert_string(hash_tab, prev_tab, 
						uncompressed_data, 
						cur_input_pos, hash);
			hash_head = prev_tab[cur_input_pos];
		} else {
			hash_head = 0;
		}


		/* Find the longest match, discarding those <= prev_len. */
		prev_len = match_len;
		prev_start = match_start;
		match_len = params->min_match - 1;

		if (hash_head != 0 && prev_len < params->min_match) {
			/* To simplify the code, we prevent matches with the
			 * string of window index 0 (in particular we have to
			 * avoid a match of the string with itself at the start
			 * of the input file).  */
			match_len = longest_match(uncompressed_data, 
						uncompressed_len - cur_input_pos,
						cur_input_pos, prev_tab, 
						hash_head, prev_len,
						&match_start, params);

			if (match_len == params->min_match && 
				cur_input_pos - match_start > params->too_far) 
			{
				match_len = params->min_match - 1;
			}
		}

		/* If there was a match at the previous step and the current
		 * match is not better, output the previous match:
		 */
		if (prev_len >= params->min_match && match_len <= prev_len) {

			/* Do not insert strings in hash table beyond this. */
			uint max_insert = uncompressed_len - params->min_match;

			/*DEBUG("Recording match (pos = %u, offset = %u, len = %u)\n",*/
					/*cur_input_pos - 1, */
					/*cur_input_pos - 1 - prev_start,*/
					/*prev_len);*/

			match = (*record_match)(cur_input_pos - 1 - prev_start, 
							prev_len, 
							record_match_arg1,
							record_match_arg2);

			match_tab[cur_match_pos++] = match;

			/* Insert in hash table all strings up to the end of the match.
			 * strstart-1 and strstart are already inserted. If there is not
			 * enough lookahead, the last two strings are not inserted in
			 * the hash table.
			 */
			prev_len -= 2;

			do {
				if (++cur_input_pos <= max_insert) {
					hash = insert_string(hash_tab, prev_tab,
								uncompressed_data,
								cur_input_pos,
								hash);
				}
			} while (--prev_len != 0);

			match_available = false;
			match_len = params->min_match - 1;
		} else if (match_available) {
			/* If there was no match at the previous position, output a
			 * single literal. If there was a match but the current match
			 * is longer, truncate the previous match to a single literal.
			 */

			/*DEBUG("Recording litrl (pos = %u, value = %u)\n",*/
					/*cur_input_pos - 1, */
					/*uncompressed_data[cur_input_pos - 1]);*/

			match = (*record_literal)(
					uncompressed_data[cur_input_pos - 1],
							record_literal_arg);
			match_tab[cur_match_pos++] = match;
		} else {
			/* There is no previous match to compare with, wait for
			 * the next step to decide.  */
			match_available = true;
		}
	} while (++cur_input_pos < uncompressed_len);

	if (match_available) {
		match = (*record_literal)(uncompressed_data[cur_input_pos - 1],
						record_literal_arg);
		match_tab[cur_match_pos++] = match;
	}
	return cur_match_pos;
}