/*
 * lz_binary_trees.c
 *
 * Binary tree match-finder for Lempel-Ziv compression.
 *
 * Copyright (c) 2014 Eric Biggers.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Note: the binary tree search/update algorithm is based on LzFind.c from
 * 7-Zip, which was written by Igor Pavlov and released into the public domain.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "wimlib/lz_extend.h"
#include "wimlib/lz_hash3.h"
#include "wimlib/lz_mf.h"
#include "wimlib/util.h"

#include <string.h>

/* log2 of the number of buckets in the hash table.  This can be changed.  */
#define LZ_BT_HASH_ORDER 16

#define LZ_BT_HASH_LEN (1 << LZ_BT_HASH_ORDER)

/* Number of entries in the digram table.
 *
 * Note:  You rarely get length-2 matches if you use length-3 hashing.  But
 * since binary trees are typically used for higher compression ratios than hash
 * chains, it is helpful for this match-finder to find length-2 matches as well.
 * Therefore this match-finder also uses a digram table to find length-2 matches
 * when the minimum match length is 2.  */
#define LZ_BT_DIGRAM_TAB_LEN	(256 * 256)

struct lz_bt {
	struct lz_mf base;
	u32 *hash_tab;
	u32 *digram_tab;
	u32 *child_tab;
	u32 next_hash;
	u16 next_digram;
};

static inline u32
lz_bt_hash(const u8 *p)
{
	return lz_hash(p, LZ_BT_HASH_ORDER);
}

static void
lz_bt_set_default_params(struct lz_mf_params *params)
{
	if (params->min_match_len == 0)
		params->min_match_len = 2;

	if (params->max_match_len == 0)
		params->max_match_len = UINT32_MAX;

	if (params->max_search_depth == 0)
		params->max_search_depth = 50;

	if (params->nice_match_len == 0)
		params->nice_match_len = 24;

	if (params->nice_match_len < params->min_match_len)
		params->nice_match_len = params->min_match_len;

	if (params->nice_match_len > params->max_match_len)
		params->nice_match_len = params->max_match_len;
}

static bool
lz_bt_params_valid(const struct lz_mf_params *params)
{
	return true;
}

static u64
lz_bt_get_needed_memory(u32 max_window_size)
{
	u64 len = 0;

	len += LZ_BT_HASH_LEN;		 /* hash_tab */
	len += LZ_BT_DIGRAM_TAB_LEN;	 /* digram_tab */
	len += 2 * (u64)max_window_size; /* child_tab */

	return len * sizeof(u32);
}

static bool
lz_bt_init(struct lz_mf *_mf)
{
	struct lz_bt *mf = (struct lz_bt *)_mf;
	struct lz_mf_params *params = &mf->base.params;
	size_t len = 0;

	lz_bt_set_default_params(params);

	/* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'.  */

	len += LZ_BT_HASH_LEN;
	if (params->min_match_len == 2)
		len += LZ_BT_DIGRAM_TAB_LEN;
	len += 2 * params->max_window_size;

	mf->hash_tab = MALLOC(len * sizeof(u32));
	if (!mf->hash_tab)
		return false;

	if (params->min_match_len == 2) {
		mf->digram_tab = mf->hash_tab + LZ_BT_HASH_LEN;
		mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_LEN;
	} else {
		mf->child_tab = mf->hash_tab + LZ_BT_HASH_LEN;
	}

	return true;
}

static void
lz_bt_load_window(struct lz_mf *_mf, const u8 window[], u32 size)
{
	struct lz_bt *mf = (struct lz_bt *)_mf;
	size_t clear_len;

	/* Clear hash_tab and digram_tab.
	 * Note: child_tab need not be cleared.  */
	clear_len = LZ_BT_HASH_LEN;
	if (mf->digram_tab)
		clear_len += LZ_BT_DIGRAM_TAB_LEN;
	memset(mf->hash_tab, 0, clear_len * sizeof(u32));
}

/*
 * Search the binary tree of the current hash code for matches.  At the same
 * time, update this tree to add the current position in the window.
 *
 * @window
 *	The window being searched.
 * @cur_pos
 *	The current position in the window.
 * @child_tab
 *	Table of child pointers for the binary tree.  The children of the node
 *	for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 +
 *	1].  Zero is reserved for the 'null' value (no child).  Consequently, we
 *	don't recognize matches beginning at position 0.   In fact, the node for
 *	position 0 in the window will not be used at all, which is just as well
 *	because we use 0-based indices which don't work for position 0.
 * @cur_match
 *	The position in the window at which the binary tree for the current hash
 *	code is rooted.  This can be 0, which indicates that the binary tree for
 *	the current hash code is empty.
 * @min_len
 *	Ignore matches shorter than this length.  This must be at least 1.
 * @nice_len
 *	Stop searching if a match of this length or longer is found.  This must
 *	be less than or equal to @max_len.
 * @max_len
 *	Maximum length of matches to return.  This can be longer than @nice_len,
 *	in which case a match of length @nice_len will still cause the search to
 *	be terminated, but the match will be extended up to @max_len bytes
 *	first.
 * @max_search_depth
 *	Stop if we reach this depth in the binary tree.
 * @matches
 *	The array in which to produce the matches.  The matches will be produced
 *	in order of increasing length and increasing offset.  No more than one
 *	match shall have any given length, nor shall any match be shorter than
 *	@min_len, nor shall any match be longer than @max_len, nor shall any two
 *	matches have the same offset.
 *
 * Returns a pointer to the next free slot in @matches.
 */
static struct lz_match *
do_search(const u8 window[restrict],
	  const u32 cur_pos,
	  u32 child_tab[restrict],
	  u32 cur_match,
	  const u32 min_len,
	  const u32 nice_len,
	  const u32 max_len,
	  const u32 max_search_depth,
	  struct lz_match *lz_matchptr)
{
	/*
	 * Here's my explanation of how this code actually works.  Beware: this
	 * algorithm is a *lot* trickier than searching for matches via hash
	 * chains.  But it can be significantly better, especially when doing
	 * "optimal" parsing, which is why it gets used, e.g. in LZMA as well as
	 * here.
	 *
	 * ---------------------------------------------------------------------
	 *
	 *				Data structure
	 *
	 * Basically, there is not just one binary tree, but rather one binary
	 * tree per hash code.  For a given hash code, the binary tree indexes
	 * previous positions in the window that have that same hash code.  The
	 * key for each node is the "string", or byte sequence, beginning at the
	 * corresponding position in the window.
	 *
	 * Each tree maintains the invariant that if node C is a child of node
	 * P, then the window position represented by node C is smaller than
	 * ("left of") the window position represented by node P.  Equivalently,
	 * while descending into a tree, the match distances ("offsets") from
	 * the current position are non-decreasing --- actually strictly
	 * increasing, because each node represents a unique position.
	 *
	 * In addition, not all previous positions sharing the same hash code
	 * will necessarily be represented in each binary tree; see the
	 * "Updating" section.
	 *
	 * ---------------------------------------------------------------------
	 *
	 *				  Searching
	 *
	 * Suppose we want to search for LZ77-style matches with the string
	 * beginning at the current window position and extending for @max_len
	 * bytes.  To do this, we can search for this string in the binary tree
	 * for this string's hash code.  Each node visited during the search is
	 * a potential match.  This method will find the matches efficiently
	 * because they will converge on the current string, due to the nature
	 * of the binary search.
	 *
	 * Naively, when visiting a node that represents a match of length N, we
	 * must compare N + 1 bytes in order to determine the length of that
	 * match and the lexicographic ordering of that match relative to the
	 * current string (which determines whether we need to step left or
	 * right into the next level of the tree, as per the standard binary
	 * tree search algorithm).  However, as an optimization, we need not
	 * explicitly examine the full length of the match at each node.  To see
	 * that this is true, suppose that we examine a node during the search,
	 * and we find that the corresponding match is less (alt. greater) than
	 * the current string.  Then, because of how binary tree search
	 * operates, the match must be lexicographically greater (alt. lesser)
	 * than any ancestor node that corresponded to a match lexicographically
	 * lesser (alt. greater) than the current string.  Therefore, the match
	 * must be at least as long as the match for any such ancestor node.
	 * Therefore, the lengths of lexicographically-lesser (alt. greater)
	 * matches must be non-decreasing as they are encountered by the tree
	 * search.
	 *
	 * Using this observation, we can maintain two variables, 'best_lt_len'
	 * and 'best_gt_len', that represent the length of the longest
	 * lexicographically lesser and greater, respectively, match that has
	 * been examined so far.   Then, when examining a new match, we need
	 * only start comparing at the index min(best_lt_len, best_gt_len) byte.
	 * Note that we cannot know beforehand whether the match will be
	 * lexicographically lesser or greater, hence the need for taking the
	 * minimum of these two lengths.
	 *
	 * As noted earlier, as we descend into the tree, the potential matches
	 * will have strictly increasing offsets.  To make things faster for
	 * higher-level parsing / match-choosing code, we do not want to return
	 * a shorter match that has a larger offset than a longer match.  This
	 * is because a longer match can always be truncated to a shorter match
	 * if needed, and smaller offsets usually (depending on the compression
	 * format) take fewer bits to encode than larger offsets.
	 * Consequently, we keep a potential match only if it is longer than the
	 * previous longest match that has been found.  This has the added
	 * advantage of producing the array of matches sorted by strictly
	 * increasing lengths as well as strictly decreasing offsets.
	 *
	 * In degenerate cases, the binary tree might become severely
	 * unbalanced.  To prevent excessive running times, we stop immediately
	 * (and return any matches that happen to have been found so far) if the
	 * current depth exceeds @max_search_depth.  Note that this cutoff can
	 * occur before the longest match has been found, which is usually bad
	 * for the compression ratio.
	 *
	 * ---------------------------------------------------------------------
	 *
	 *				Updating
	 *
	 * I've explained how to find matches by searching the binary tree of
	 * the current hash code.  But how do we get the binary tree in the
	 * first place?  Since the tree is built incrementally, the real
	 * question is how do we update the tree to "add" the current window
	 * position.
	 *
	 * The tree maintains the invariant that a node's parent always has a
	 * larger position (a.k.a. smaller match offset) than itself.
	 * Therefore, the root node must always have the largest position; and
	 * since the current position is larger than any previous position, the
	 * current position must become the root of the tree.
	 *
	 * A correct, but silly, approach is to simply add the previous root as
	 * a child of the new root, using either the left or right child pointer
	 * depending on the lexicographic ordering of the strings.  This works,
	 * but it really just produces a linked list, so it's not sufficient.
	 *
	 * Instead, we can initially mark the new root's left child pointer as
	 * "pending (less than)" and its right child pointer as "pending
	 * (greater than)".  Then, during the search, when we examine a match
	 * that is lexicographically less than the current string, we link the
	 * "pending (less than)" pointer to the node of that match, then set the
	 * right child pointer of *that* node as "pending (less than)".
	 * Similarly, when we examine a match that is lexicographically greater
	 * than the current string, we link the "pending (greater than)" pointer
	 * to the node of that match, then set the left child pointer of *that*
	 * node as "pending (greater than)".
	 *
	 * If the search terminates before the current string is found (up to a
	 * precision of @nice_len bytes), then we set "pending (less than)" and
	 * "pending (greater than)" to point to nothing.  Alternatively, if the
	 * search terminates due to finding the current string (up to a
	 * precision of @nice_len bytes), then we set "pending (less than)" and
	 * "pending (greater than)" to point to the appropriate children of that
	 * match.
	 *
	 * Why does this work?  Well, we can think of it this way: the "pending
	 * (less than)" pointer is reserved for the next match we find that is
	 * lexicographically *less than* the current string, and the "pending
	 * (greater than)" pointer is reserved for the next match we find that
	 * is lexicographically *greater than* the current string.  This
	 * explains why when we find a match that is lexicographically less than
	 * the current string, we set the "pending (less than)" pointer to point
	 * to that match.  And the reason we change "pending (less than)" to the
	 * right pointer of the match in that case is because we're walking down
	 * into that subtree, and the next match lexicographically *less than*
	 * the current string is guaranteed to be lexicographically *greater
	 * than* that match, so it should be set as the right subtree of that
	 * match.  But the next match in that subtree that is lexicographically
	 * *greater than* the current string will need to be moved to the
	 * "pending (greater than)" pointer farther up the tree.
	 *
	 * It's complicated, but it should make sense if you think about it.
	 * The algorithm basically just moves subtrees into the correct
	 * locations as it walks down the tree for the search.  But also, if the
	 * algorithm actually finds a match of length @nice_len with the current
	 * string, it no longer needs that match node and can discard it.  The
	 * algorithm also will discard nodes if the search terminates due to the
	 * depth limit.  For these reasons, the binary tree might not, in fact,
	 * contain all valid positions.
	 */

	u32 best_lt_len = 0;
	u32 best_gt_len = 0;
	u32 best_len = min_len - 1;
	u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0];
	u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1];
	const u8 * const strptr = &window[cur_pos];
	u32 depth_remaining = max_search_depth;

	for (;;) {
		const u8 *matchptr;
		u32 len;

		if (cur_match == 0 || depth_remaining-- == 0) {
			*pending_lt_ptr = 0;
			*pending_gt_ptr = 0;
			return lz_matchptr;
		}

		matchptr = &window[cur_match];
		len = min(best_lt_len, best_gt_len);

		if (matchptr[len] == strptr[len]) {

			len = lz_extend(strptr, matchptr, len + 1, max_len);

			if (len > best_len) {
				best_len = len;

				*lz_matchptr++ = (struct lz_match) {
					.len = len,
					.offset = strptr - matchptr,
				};

				if (len >= nice_len) {
					*pending_lt_ptr = child_tab[cur_match * 2 + 0];
					*pending_gt_ptr = child_tab[cur_match * 2 + 1];
					return lz_matchptr;
				}
			}
		}

		if (matchptr[len] < strptr[len]) {
			*pending_lt_ptr = cur_match;
			pending_lt_ptr = &child_tab[cur_match * 2 + 1];
			cur_match = *pending_lt_ptr;
			best_lt_len = len;
		} else {
			*pending_gt_ptr = cur_match;
			pending_gt_ptr = &child_tab[cur_match * 2 + 0];
			cur_match = *pending_gt_ptr;
			best_gt_len = len;
		}
	}
}

static u32
lz_bt_get_matches(struct lz_mf *_mf, struct lz_match matches[])
{
	struct lz_bt *mf = (struct lz_bt *)_mf;
	const u8 * const window = mf->base.cur_window;
	const u32 cur_pos = mf->base.cur_window_pos++;
	const u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
	u32 min_len;
	const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len);
	const u32 nice_len = min(max_len, mf->base.params.nice_match_len);
	u32 hash;
	u32 cur_match;
	struct lz_match *lz_matchptr = matches;

	if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1))
		return 0;

	if (mf->digram_tab) {
		/* Search the digram table for a length 2 match.  */

		const u16 digram = mf->next_digram;
		mf->next_digram = load_u16_unaligned(&window[cur_pos + 1]);
		prefetch(&mf->digram_tab[mf->next_digram]);
		cur_match = mf->digram_tab[digram];
		mf->digram_tab[digram] = cur_pos;

		/* We're only interested in matches of length exactly 2, since
		 * those won't be found during the binary tree search.
		 *
		 * Note: it's possible to extend this match as much as possible,
		 * then use its length plus 1 as min_len for the binary tree
		 * search.  However I found this actually *reduced* performance
		 * slightly, evidently because the binary tree still needs to be
		 * searched/updated starting from the root in either case.  */
		if (cur_match != 0 && window[cur_match + 2] != window[cur_pos + 2]) {
			*lz_matchptr++ = (struct lz_match) {
				.len = 2,
				.offset = cur_pos - cur_match,
			};
		}
		min_len = 3;
	} else {
		min_len = mf->base.params.min_match_len;
	}

	hash = mf->next_hash;
	mf->next_hash = lz_bt_hash(&window[cur_pos + 1]);
	prefetch(&mf->hash_tab[mf->next_hash]);
	cur_match = mf->hash_tab[hash];
	mf->hash_tab[hash] = cur_pos;

	/* Search the binary tree of 'hash' for matches while re-rooting it at
	 * the current position.  */
	lz_matchptr = do_search(window,
				cur_pos,
				mf->child_tab,
				cur_match,
				min_len,
				nice_len,
				max_len,
				mf->base.params.max_search_depth,
				lz_matchptr);

	/* Return the number of matches found.  */
	return lz_matchptr - matches;
}

/* This is very similar to do_search(), but it does not save any matches.
 * See do_search() for explanatory comments.  */
static void
do_skip(const u8 window[restrict],
	const u32 cur_pos,
	u32 child_tab[restrict],
	u32 cur_match,
	const u32 nice_len,
	const u32 max_search_depth)
{
	u32 best_lt_len = 0;
	u32 best_gt_len = 0;
	u32 *pending_lt_ptr = &child_tab[cur_pos * 2 + 0];
	u32 *pending_gt_ptr = &child_tab[cur_pos * 2 + 1];
	const u8 * const strptr = &window[cur_pos];
	u32 depth_remaining = max_search_depth;
	for (;;) {
		const u8 *matchptr;
		u32 len;

		if (cur_match == 0 || depth_remaining-- == 0) {
			*pending_lt_ptr = 0;
			*pending_gt_ptr = 0;
			return;
		}

		matchptr = &window[cur_match];
		len = min(best_lt_len, best_gt_len);

		if (matchptr[len] == strptr[len]) {
			len = lz_extend(strptr, matchptr, len + 1, nice_len);
			if (len == nice_len) {
				*pending_lt_ptr = child_tab[cur_match * 2 + 0];
				*pending_gt_ptr = child_tab[cur_match * 2 + 1];
				return;
			}
		}
		if (matchptr[len] < strptr[len]) {
			*pending_lt_ptr = cur_match;
			pending_lt_ptr = &child_tab[cur_match * 2 + 1];
			cur_match = *pending_lt_ptr;
			best_lt_len = len;
		} else {
			*pending_gt_ptr = cur_match;
			pending_gt_ptr = &child_tab[cur_match * 2 + 0];
			cur_match = *pending_gt_ptr;
			best_gt_len = len;
		}
	}
}

static void
lz_bt_skip_positions(struct lz_mf *_mf, u32 n)
{
	struct lz_bt *mf = (struct lz_bt *)_mf;
	const u8 * const window = mf->base.cur_window;
	u32 cur_pos = mf->base.cur_window_pos;
	u32 end_pos = cur_pos + n;
	u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
	u32 hash;
	u32 next_hash;
	u32 cur_match;
	u16 digram;
	u16 next_digram;

	mf->base.cur_window_pos = end_pos;

	if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) {
	        /* Nearing end of window.  */
	        if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1)))
	                return;

	        end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1;
	}

	next_hash = mf->next_hash;
	next_digram = mf->next_digram;
	do {
		if (mf->digram_tab) {
			digram = next_digram;
			next_digram = load_u16_unaligned(&window[cur_pos + 1]);
			mf->digram_tab[digram] = cur_pos;
		}

		hash = next_hash;
		next_hash = lz_bt_hash(&window[cur_pos + 1]);
		cur_match = mf->hash_tab[hash];
		mf->hash_tab[hash] = cur_pos;

		/* Update the binary tree for the appropriate hash code.  */
		do_skip(window,
			cur_pos,
			mf->child_tab,
			cur_match,
			min(bytes_remaining, mf->base.params.nice_match_len),
			mf->base.params.max_search_depth);

		bytes_remaining--;
	} while (++cur_pos != end_pos);

	if (mf->digram_tab) {
		prefetch(&mf->digram_tab[next_digram]);
		mf->next_digram = next_digram;
	}

	prefetch(&mf->hash_tab[next_hash]);
	mf->next_hash = next_hash;
}

static void
lz_bt_destroy(struct lz_mf *_mf)
{
	struct lz_bt *mf = (struct lz_bt *)_mf;

	FREE(mf->hash_tab);
	/* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */
}

const struct lz_mf_ops lz_binary_trees_ops = {
	.params_valid      = lz_bt_params_valid,
	.get_needed_memory = lz_bt_get_needed_memory,
	.init		   = lz_bt_init,
	.load_window       = lz_bt_load_window,
	.get_matches       = lz_bt_get_matches,
	.skip_positions    = lz_bt_skip_positions,
	.destroy           = lz_bt_destroy,
	.struct_size	   = sizeof(struct lz_bt),
};