X-Git-Url: https://wimlib.net/git/?a=blobdiff_plain;f=src%2Flz_hash_chains.c;h=c116f1251331a10a6dc9e11fd04862343a6072ef;hb=2b14c2bfd6d0a7a17433dd8529cfc8b7d969c4b0;hp=807e455aaf6c46b21cae44943daabcf45cf1aad4;hpb=e3a8f6a32b258ba8ac6929f6c468b1c873a96fc6;p=wimlib

diff --git a/src/lz_hash_chains.c b/src/lz_hash_chains.c
index 807e455a..c116f125 100644
--- a/src/lz_hash_chains.c
+++ b/src/lz_hash_chains.c
@@ -33,67 +33,38 @@
 # include "config.h"
 #endif

+#include "wimlib/lz_extend.h"
+#include "wimlib/lz_hash3.h"
 #include "wimlib/lz_mf.h"
 #include "wimlib/util.h"
-#include <pthread.h>
+
 #include <string.h>

-/* Number of hash buckets. This can be changed, but should be a power of 2 so
- * that the correct hash bucket can be selected using a fast bitwise AND. */
-#define LZ_HC_HASH_LEN (1 << 15)
+/* log2 of the number of buckets in the hash table. This can be changed. */
+#define LZ_HC_HASH_ORDER 15

-/* Number of bytes from which the hash code is computed at each position. This
- * can be changed, provided that lz_hc_hash() is updated as well. */
-#define LZ_HC_HASH_BYTES 3
+#define LZ_HC_HASH_LEN (1 << LZ_HC_HASH_ORDER)

 struct lz_hc {
 	struct lz_mf base;
-	u32 *hash_tab;
-	u32 *prev_tab;
+	u32 *hash_tab; /* followed by 'prev_tab' in memory */
 	u32 next_hash;
 };

-static u32 crc32_table[256];
-static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT;
-
-static void
-crc32_init(void)
-{
-	for (u32 b = 0; b < 256; b++) {
-		u32 r = b;
-		for (int i = 0; i < 8; i++) {
-			if (r & 1)
-				r = (r >> 1) ^ 0xEDB88320;
-			else
-				r >>= 1;
-		}
-		crc32_table[b] = r;
-	}
-}
-
-/* This hash function is taken from the LZMA SDK. It seems to work well.
- *
- * TODO: Maybe use the SSE4.2 CRC32 instruction when available? */
 static inline u32
 lz_hc_hash(const u8 *p)
 {
-	u32 hash = 0;
-
-	hash ^= crc32_table[p[0]];
-	hash ^= p[1];
-	hash ^= (u32)p[2] << 8;
-
-	return hash % LZ_HC_HASH_LEN;
+	return lz_hash(p, LZ_HC_HASH_ORDER);
 }

 static void
 lz_hc_set_default_params(struct lz_mf_params *params)
 {
-	if (params->min_match_len < LZ_HC_HASH_BYTES)
-		params->min_match_len = LZ_HC_HASH_BYTES;
+	if (params->min_match_len < LZ_HASH_NBYTES)
+		params->min_match_len = LZ_HASH_NBYTES;

 	if (params->max_match_len == 0)
-		params->max_match_len = params->max_window_size;
+		params->max_match_len = UINT32_MAX;

 	if (params->max_search_depth == 0)
 		params->max_search_depth = 50;
@@ -115,7 +86,6 @@ lz_hc_params_valid(const struct lz_mf_params *_params)

 	lz_hc_set_default_params(&params);

-	/* Avoid edge case where min_match_len = 3, max_match_len = 2 */
 	return (params.min_match_len <= params.max_match_len);
 }

@@ -137,17 +107,10 @@ lz_hc_init(struct lz_mf *_mf)

 	lz_hc_set_default_params(&mf->base.params);

-	/* Allocate space for 'hash_tab' and 'prev_tab'. */
-
 	mf->hash_tab = MALLOC(lz_hc_get_needed_memory(mf->base.params.max_window_size));
 	if (!mf->hash_tab)
 		return false;

-	mf->prev_tab = mf->hash_tab + LZ_HC_HASH_LEN;
-
-	/* Fill in the CRC32 table if not done already.  */
-	pthread_once(&crc32_table_filled, crc32_init);
-
 	return true;
 }

@@ -157,9 +120,6 @@ lz_hc_load_window(struct lz_mf *_mf, const u8 window[], u32 size)
 	struct lz_hc *mf = (struct lz_hc *)_mf;

 	memset(mf->hash_tab, 0, LZ_HC_HASH_LEN * sizeof(u32));
-
-	if (size >= LZ_HC_HASH_BYTES)
-		mf->next_hash = lz_hc_hash(window);
 }

 static u32
@@ -167,19 +127,27 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[])
 {
 	struct lz_hc *mf = (struct lz_hc *)_mf;
 	const u8 * const window = mf->base.cur_window;
-	const u32 cur_pos = mf->base.cur_window_pos;
+	const u32 cur_pos = mf->base.cur_window_pos++;
 	const u8 * const strptr = &window[cur_pos];
 	const u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
-	u32 * const prev_tab = mf->prev_tab;
-	const u32 max_len = min(bytes_remaining, mf->base.params.nice_match_len);
+	u32 * const prev_tab = mf->hash_tab + LZ_HC_HASH_LEN;
+	const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len);
+	const u32 nice_len = min(max_len, mf->base.params.nice_match_len);
 	u32 best_len = mf->base.params.min_match_len - 1;
 	u32 depth_remaining = mf->base.params.max_search_depth;
-	u32 num_matches = 0;
+	struct lz_match *lz_matchptr = matches;
 	u32 hash;
 	u32 cur_match;
+	u32 sequence;

-	if (unlikely(bytes_remaining <= LZ_HC_HASH_BYTES))
-		goto out;
+	if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1))
+		return 0;
+
+	/* Insert the current position into the appropriate hash chain and set
+	 * 'cur_match' to the previous head.
+	 *
+	 * For a slight performance improvement, we do each hash calculation one
+	 * position in advance and prefetch the necessary hash table entry. */

 	hash = mf->next_hash;
 	mf->next_hash = lz_hc_hash(strptr + 1);
@@ -188,78 +156,122 @@ lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[])
 	mf->hash_tab[hash] = cur_pos;
 	prev_tab[cur_pos] = cur_match;

+	/* Ensure we can find a match of at least the requested length. */
+	if (unlikely(best_len >= max_len))
+		return 0;
+
+	if (UNALIGNED_ACCESS_IS_FAST)
+		sequence = load_u24_unaligned(strptr);
+
+	/* Search the appropriate hash chain for matches. */
 	for (; cur_match && depth_remaining--;
 	     cur_match = prev_tab[cur_match])
 	{
 		const u8 * const matchptr = &window[cur_match];
 		u32 len;

-		if (matchptr[best_len] != strptr[best_len] ||
-		    matchptr[best_len - 1] != strptr[best_len - 1] ||
-		    matchptr[0] != strptr[0])
+		/* Considering the potential match at 'matchptr': is it longer
+		 * than 'best_len'?
+		 *
+		 * The bytes at index 'best_len' are the most likely to differ,
+		 * so check them first. */
+		if (matchptr[best_len] != strptr[best_len])
 			goto next_match;

-		for (len = 1; len < best_len - 1; len++)
-			if (matchptr[len] != strptr[len])
+		if (UNALIGNED_ACCESS_IS_FAST) {
+			if (load_u24_unaligned(matchptr) != sequence)
 				goto next_match;

-		len = best_len;
+			len = lz_extend(strptr, matchptr, 3, max_len);
+
+			if (len > best_len) {
+				best_len = len;

-		do {
-			if (++len == max_len) {
-				const u32 len_limit = min(bytes_remaining,
-							  mf->base.params.max_match_len);
-				while (len < len_limit && strptr[len] == matchptr[len])
-					len++;
-				matches[num_matches++] = (struct lz_match) {
-					.len = len,
+				*lz_matchptr++ = (struct lz_match) {
+					.len = best_len,
 					.offset = strptr - matchptr,
 				};
-				goto out;
+
+				if (best_len >= nice_len)
+					break;
 			}
-		} while (matchptr[len] == strptr[len]);
+		} else {
+
+			/* The bytes at indices 'best_len - 1' and '0' are less
+			 * important to check separately.  But doing so still
+			 * gives a slight performance improvement, at least on
+			 * x86_64, probably because they create separate
+			 * branches for the CPU to predict independently of the
+			 * branches in the main comparison loops.
+			 */
+			if (matchptr[best_len - 1] != strptr[best_len - 1] ||
+			    matchptr[0] != strptr[0])
+				goto next_match;

-		best_len = len;
-		matches[num_matches++] = (struct lz_match) {
-			.len = len,
-			.offset = strptr - matchptr,
-		};
+			for (len = 1; len < best_len - 1; len++)
+				if (matchptr[len] != strptr[len])
+					goto next_match;
+
+			/* The match is the longest found so far --- at least
+			 * 'best_len' + 1 bytes.  Continue extending it. */
+
+			if (++best_len != max_len &&
+			    strptr[best_len] == matchptr[best_len])
+				while (++best_len != max_len)
+					if (strptr[best_len] != matchptr[best_len])
+						break;
+
+			/* Record the match. */
+			*lz_matchptr++ = (struct lz_match) {
+				.len = best_len,
+				.offset = strptr - matchptr,
+			};
+
+			/* Terminate the search if 'nice_len' was reached. */
+			if (best_len >= nice_len)
+				break;
+		}

 	next_match:
+		/* Continue to next match in the chain. */
 		;
 	}

-out:
-	mf->base.cur_window_pos++;
-	return num_matches;
+	return lz_matchptr - matches;
 }

 static void
-lz_hc_skip_position(struct lz_hc *mf)
+lz_hc_skip_positions(struct lz_mf *_mf, u32 n)
 {
-	const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
+	struct lz_hc *mf = (struct lz_hc *)_mf;
+	u32 * const hash_tab = mf->hash_tab;
+	u32 * const prev_tab = hash_tab + LZ_HC_HASH_LEN;
+	const u8 * const window = mf->base.cur_window;
+	u32 cur_pos = mf->base.cur_window_pos;
+	u32 end_pos = cur_pos + n;
+	const u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
 	u32 hash;
+	u32 next_hash;

-	if (bytes_remaining <= LZ_HC_HASH_BYTES)
-		goto out;
-
-	hash = mf->next_hash;
-	mf->next_hash = lz_hc_hash(lz_mf_get_window_ptr(&mf->base) + 1);
-	prefetch(&mf->hash_tab[mf->next_hash]);
-	mf->prev_tab[mf->base.cur_window_pos] = mf->hash_tab[hash];
-	mf->hash_tab[hash] = mf->base.cur_window_pos;
+	mf->base.cur_window_pos = end_pos;

-out:
-	mf->base.cur_window_pos++;
-}
+	if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) {
+		/* Nearing end of window. */
+		if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1)))
+			return;

-static void
-lz_hc_skip_positions(struct lz_mf *_mf, u32 n)
-{
-	struct lz_hc *mf = (struct lz_hc *)_mf;
+		end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1;
+	}

+	next_hash = mf->next_hash;
 	do {
-		lz_hc_skip_position(mf);
-	} while (--n);
+		hash = next_hash;
+		next_hash = lz_hc_hash(&window[cur_pos + 1]);
+		prev_tab[cur_pos] = hash_tab[hash];
+		hash_tab[hash] = cur_pos;
+	} while (++cur_pos != end_pos);
+
+	prefetch(&hash_tab[next_hash]);
+	mf->next_hash = next_hash;
 }

 static void
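
To make the structure of the new matchfinder easier to follow outside the diff, here is a minimal standalone C sketch of a hash-chain matchfinder using the memory layout the patch adopts: one allocation holding the chain heads ('hash_tab') followed immediately by the per-position links ('prev_tab'), with an insert-then-search step at each position. Everything in it is illustrative and assumed rather than wimlib's API: 'struct hc', 'hash3', and the multiplier constant stand in for wimlib's 'struct lz_hc', lz_hash() from lz_hash3.h, and lz_extend(). The sketch also leaves out the matches[] output array and the trick of comparing byte 'best_len' first.

#include <stdint.h>
#include <stdlib.h>

#define HASH_ORDER	15
#define HASH_LEN	(1 << HASH_ORDER)

/* Illustrative 3-byte multiplicative hash; wimlib's lz_hash() plays this
 * role, and the exact multiplier used here is an assumption. */
static inline uint32_t
hash3(const uint8_t *p)
{
	uint32_t seq = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);

	return (seq * 0x9E3779B1u) >> (32 - HASH_ORDER);
}

struct hc {
	uint32_t *hash_tab;	/* chain heads, indexed by hash value */
	uint32_t *prev_tab;	/* previous position with the same hash */
};

/* One zeroed allocation holds both tables; 'prev_tab' simply starts
 * HASH_LEN entries in, as in the patched lz_hc_init(). */
static int
hc_init(struct hc *mf, uint32_t max_window_size)
{
	mf->hash_tab = calloc(HASH_LEN + (size_t)max_window_size,
			      sizeof(uint32_t));
	if (!mf->hash_tab)
		return -1;
	mf->prev_tab = mf->hash_tab + HASH_LEN;
	return 0;
}

/* Insert 'pos' at the head of its chain, then walk at most 'max_depth'
 * older positions looking for the longest match.  The caller must ensure
 * pos + max_len <= window_size and that at least 3 bytes remain.  Slot
 * value 0 doubles as the "empty" marker, so a match can never start at
 * position 0; the real matchfinder shares that quirk. */
static uint32_t
hc_find_match(struct hc *mf, const uint8_t *window, uint32_t pos,
	      uint32_t max_len, uint32_t max_depth, uint32_t *offset_ret)
{
	const uint32_t hash = hash3(&window[pos]);
	uint32_t cur = mf->hash_tab[hash];
	uint32_t best_len = 0;

	mf->hash_tab[hash] = pos;	/* link the current position in */
	mf->prev_tab[pos] = cur;

	for (; cur && max_depth--; cur = mf->prev_tab[cur]) {
		uint32_t len = 0;

		while (len < max_len && window[cur + len] == window[pos + len])
			len++;
		if (len > best_len) {
			best_len = len;
			*offset_ret = pos - cur;
		}
	}
	return best_len;	/* 0 means no usable previous occurrence */
}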
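
The patched lz_hc_skip_positions() inserts a batch of positions without searching for matches, clamping its stopping point near the end of the window so the hash never reads out of bounds. Reusing 'struct hc' and 'hash3' from the sketch above (both assumed names), the same idea without wimlib's one-position-ahead hashing and prefetching looks like this; the real code keeps a 'next_hash' pipeline across calls, which is presumably why its bounds checks use LZ_HASH_REQUIRED_NBYTES + 1 rather than plain 3.

/* Insert positions [*window_pos, *window_pos + n) into the hash chains
 * without match-finding.  Assumes n >= 1.  A 3-byte hash at position p
 * reads bytes p..p+2, so the last insertable position is window_size - 3;
 * positions past that are skipped but still counted as consumed. */
static void
hc_skip_positions(struct hc *mf, const uint8_t *window,
		  uint32_t window_size, uint32_t *window_pos, uint32_t n)
{
	uint32_t cur_pos = *window_pos;
	uint32_t end_pos = cur_pos + n;
	const uint32_t bytes_remaining = window_size - cur_pos;

	*window_pos = end_pos;	/* always advance, even when clamping */

	if (bytes_remaining < n + 2) {
		/* Nearing the end of the window. */
		if (bytes_remaining < 3)
			return;
		end_pos = cur_pos + (bytes_remaining - 2);
	}

	do {
		const uint32_t hash = hash3(&window[cur_pos]);

		mf->prev_tab[cur_pos] = mf->hash_tab[hash];
		mf->hash_tab[hash] = cur_pos;
	} while (++cur_pos != end_pos);
}

Batching matters because a compressor calls this after emitting a long match: every position the match covers still has to be linked into the chains so later searches can find it, but none of those positions needs a match search of its own.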