/*
 * hc_matchfinder.h
 *
 * Author:	Eric Biggers
 * Year:	2014, 2015
 *
 * The author dedicates this file to the public domain.
 * You can do whatever you want with this file.
 *
 * ---------------------------------------------------------------------------
 *
 *				   Algorithm
 *
 * This is a Hash Chains (hc) based matchfinder.
 *
 * The data structure is a hash table where each hash bucket contains a linked
 * list (or "chain") of sequences whose first 3 bytes share the same hash code.
 * Each sequence is identified by its starting position in the input buffer.
 *
 * The algorithm processes the input buffer sequentially.  At each byte
 * position, the hash code of the first 3 bytes of the sequence beginning at
 * that position (the sequence being matched against) is computed.  This
 * identifies the hash bucket to use for that position.  Then, this hash
 * bucket's linked list is searched for matches.  Then, a new linked list node
 * is created to represent the current sequence and is prepended to the list.
 *
 * This algorithm has several useful properties:
 *
 * - It only finds true Lempel-Ziv matches; i.e., those where the matching
 *   sequence occurs prior to the sequence being matched against.
 *
 * - The sequences in each linked list are always sorted by decreasing starting
 *   position.  Therefore, the closest (smallest offset) matches are found
 *   first, which in many compression formats tend to be the cheapest to encode.
 *
 * - Although fast running time is not guaranteed due to the possibility of the
 *   lists getting very long, the worst degenerate behavior can be easily
 *   prevented by capping the number of nodes searched at each position.
 *
 * - If the compressor decides not to search for matches at a certain position,
 *   then that position can be quickly inserted without searching the list.
 *
 * - The algorithm is adaptable to sliding windows: just store the positions
 *   relative to a "base" value that is updated from time to time, and stop
 *   searching each list when the sequences get too far away.
 *
 * ---------------------------------------------------------------------------
 *
 *				Notes on usage
 *
 * You must define MATCHFINDER_MAX_WINDOW_ORDER before including this header
 * because that determines which integer type to use for positions.  Since
 * 16-bit integers are faster than 32-bit integers due to reduced memory usage
 * (and therefore reduced cache pressure), the code only uses 32-bit integers if
 * they are needed to represent all possible positions.
 *
 * You must allocate the 'struct hc_matchfinder' on a
 * MATCHFINDER_ALIGNMENT-aligned boundary, and its necessary allocation size
 * must be gotten by calling hc_matchfinder_size().
 *
 * ----------------------------------------------------------------------------
 *
 *				 Optimizations
 *
 * The longest_match() and skip_positions() functions are inlined into the
 * compressors that use them.  This isn't just about saving the overhead of a
 * function call.  These functions are intended to be called from the inner
 * loops of compressors, where giving the compiler more control over register
 * allocation is very helpful.  There is also significant benefit to be gained
 * from allowing the CPU to predict branches independently at each call site.
 * For example, "lazy"-style compressors can be written with two calls to
 * longest_match(), each of which starts with a different 'best_len' and
 * therefore has significantly different performance characteristics.
 *
 * Although any hash function can be used, a multiplicative hash is fast and
 * works well.
 *
 * On some processors, it is significantly faster to extend matches by whole
 * words (32 or 64 bits) instead of by individual bytes.  For this to be the
 * case, the processor must implement unaligned memory accesses efficiently and
 * must have either a fast "find first set bit" instruction or a fast "find last
 * set bit" instruction, depending on the processor's endianness.
 *
 * The code uses one loop for finding the first match and one loop for finding a
 * longer match.  Each of these loops is tuned for its respective task and in
 * combination are faster than a single generalized loop that handles both
 * tasks.
 *
 * The code also uses a tight inner loop that only compares the last and first
 * bytes of a potential match.  It is only when these bytes match that a full
 * match extension is attempted.
 *
 * ----------------------------------------------------------------------------
 */

#ifndef _HC_MATCHFINDER_H
#define _HC_MATCHFINDER_H

#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash.h"
#include "wimlib/matchfinder_common.h"
#include "wimlib/unaligned.h"

#if MATCHFINDER_MAX_WINDOW_ORDER < 14
#  define HC_MATCHFINDER_HASH_ORDER 14
#else
#  define HC_MATCHFINDER_HASH_ORDER 15
#endif

#define HC_MATCHFINDER_HASH_LENGTH	(1UL << HC_MATCHFINDER_HASH_ORDER)

struct hc_matchfinder {
	pos_t hash_tab[HC_MATCHFINDER_HASH_LENGTH];
	pos_t next_tab[];
} _aligned_attribute(MATCHFINDER_ALIGNMENT);

/* Return the number of bytes that must be allocated for a 'hc_matchfinder' that
 * can work with buffers up to the specified size.  */
static inline size_t
hc_matchfinder_size(size_t max_bufsize)
{
	return sizeof(pos_t) * (HC_MATCHFINDER_HASH_LENGTH + max_bufsize);
}

/* Prepare the matchfinder for a new input buffer.  */
static inline void
hc_matchfinder_init(struct hc_matchfinder *mf)
{
	matchfinder_init(mf->hash_tab, HC_MATCHFINDER_HASH_LENGTH);
}

/*
 * Find the longest match longer than 'best_len' bytes.
 *
 * @mf
 *	The matchfinder structure.
 * @in_begin
 *	Pointer to the beginning of the input buffer.
 * @in_next
 *	Pointer to the next byte in the input buffer to process.  This is the
 *	pointer to the sequence being matched against.
 * @best_len
 *	Require a match longer than this length.
 * @max_len
 *	The maximum permissible match length at this position.
 * @nice_len
 *	Stop searching if a match of at least this length is found.
 *	Must be <= @max_len.
 * @max_search_depth
 *	Limit on the number of potential matches to consider.  Must be >= 1.
 * @offset_ret
 *	If a match is found, its offset is returned in this location.
 *
 * Return the length of the match found, or 'best_len' if no match longer than
 * 'best_len' was found.
 */
static inline unsigned
hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
			     const u8 * const in_begin,
			     const u8 * const in_next,
			     unsigned best_len,
			     const unsigned max_len,
			     const unsigned nice_len,
			     const unsigned max_search_depth,
			     unsigned *offset_ret)
{
	unsigned depth_remaining = max_search_depth;
	const u8 *best_matchptr = best_matchptr; /* uninitialized */
	const u8 *matchptr;
	unsigned len;
	u32 first_3_bytes;
	u32 hash;
	pos_t cur_node;

	/* Insert the current sequence into the appropriate linked list.  */
	if (unlikely(max_len < LOAD_U24_REQUIRED_NBYTES))
		goto out;
	first_3_bytes = load_u24_unaligned(in_next);
	hash = lz_hash(first_3_bytes, HC_MATCHFINDER_HASH_ORDER);
	cur_node = mf->hash_tab[hash];
	mf->next_tab[in_next - in_begin] = cur_node;
	mf->hash_tab[hash] = in_next - in_begin;

	if (unlikely(best_len >= max_len))
		goto out;

	/* Search the appropriate linked list for matches.  */

	if (!(matchfinder_node_valid(cur_node)))
		goto out;

	if (best_len < 3) {
		for (;;) {
			/* No length 3 match found yet.
			 * Check the first 3 bytes.  */
			matchptr = &in_begin[cur_node];

			if (load_u24_unaligned(matchptr) == first_3_bytes)
				break;

			/* The first 3 bytes did not match.  Keep trying.  */
			cur_node = mf->next_tab[cur_node];
			if (!matchfinder_node_valid(cur_node) || !--depth_remaining)
				goto out;
		}

		/* Found a match of length >= 3.  Extend it to its full length.  */
		best_matchptr = matchptr;
		best_len = lz_extend(in_next, best_matchptr, 3, max_len);
		if (best_len >= nice_len)
			goto out;
		cur_node = mf->next_tab[cur_node];
		if (!matchfinder_node_valid(cur_node) || !--depth_remaining)
			goto out;
	}

	for (;;) {
		for (;;) {
			matchptr = &in_begin[cur_node];

			/* Already found a length 3 match.  Try for a longer match;
			 * start by checking the last 2 bytes and the first 4 bytes.  */
		#if UNALIGNED_ACCESS_IS_FAST
			if ((load_u32_unaligned(matchptr + best_len - 3) ==
			     load_u32_unaligned(in_next + best_len - 3)) &&
			    (load_u32_unaligned(matchptr) ==
			     load_u32_unaligned(in_next)))
		#else
			if (matchptr[best_len] == in_next[best_len])
		#endif
				break;

			cur_node = mf->next_tab[cur_node];
			if (!matchfinder_node_valid(cur_node) || !--depth_remaining)
				goto out;
		}

	#if UNALIGNED_ACCESS_IS_FAST
		len = 4;
	#else
		len = 0;
	#endif
		len = lz_extend(in_next, matchptr, len, max_len);
		if (len > best_len) {
			best_len = len;
			best_matchptr = matchptr;
			if (best_len >= nice_len)
				goto out;
		}
		cur_node = mf->next_tab[cur_node];
		if (!matchfinder_node_valid(cur_node) || !--depth_remaining)
			goto out;
	}
out:
	*offset_ret = in_next - best_matchptr;
	return best_len;
}

/*
 * Advance the matchfinder, but don't search for matches.
 *
 * @mf
 *	The matchfinder structure.
 * @in_begin
 *	Pointer to the beginning of the input buffer.
 * @in_next
 *	Pointer to the next byte in the input buffer to process.
 * @in_end
 *	Pointer to the end of the input buffer.
 * @count
 *	The number of bytes to advance.  Must be > 0.
 */
static inline void
hc_matchfinder_skip_positions(struct hc_matchfinder * restrict mf,
			      const u8 *in_begin,
			      const u8 *in_next,
			      const u8 *in_end,
			      unsigned count)
{
	u32 hash;

	if (unlikely(in_next + count >= in_end - LZ_HASH3_REQUIRED_NBYTES))
		return;

	do {
		hash = lz_hash_3_bytes(in_next, HC_MATCHFINDER_HASH_ORDER);
		mf->next_tab[in_next - in_begin] = mf->hash_tab[hash];
		mf->hash_tab[hash] = in_next - in_begin;
		in_next++;
	} while (--count);
}

#endif /* _HC_MATCHFINDER_H */