# include "config.h"
#endif
+#include "wimlib/lz_extend.h"
+#include "wimlib/lz_hash3.h"
#include "wimlib/lz_mf.h"
#include "wimlib/util.h"
-#include <pthread.h>
+
#include <string.h>
-/* Number of hash buckets. This can be changed, but should be a power of 2 so
- * that the correct hash bucket can be selected using a fast bitwise AND. */
-#define LZ_HC_HASH_LEN (1 << 15)
+/* log2 of the number of buckets in the hash table. This can be changed. */
+#define LZ_HC_HASH_ORDER 15
-/* Number of bytes from which the hash code is computed at each position. This
- * can be changed, provided that lz_hc_hash() is updated as well. */
-#define LZ_HC_HASH_BYTES 3
+#define LZ_HC_HASH_LEN (1 << LZ_HC_HASH_ORDER)
struct lz_hc {
struct lz_mf base;
- u32 *hash_tab;
- u32 *prev_tab;
+ u32 *hash_tab; /* followed by 'prev_tab' in memory */
u32 next_hash;
};
-static u32 crc32_table[256];
-static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT;
-
-static void
-crc32_init(void)
-{
- for (u32 b = 0; b < 256; b++) {
- u32 r = b;
- for (int i = 0; i < 8; i++) {
- if (r & 1)
- r = (r >> 1) ^ 0xEDB88320;
- else
- r >>= 1;
- }
- crc32_table[b] = r;
- }
-}
-
-/* This hash function is taken from the LZMA SDK. It seems to work well.
- *
- * TODO: Maybe use the SSE4.2 CRC32 instruction when available? */
static inline u32
lz_hc_hash(const u8 *p)
{
- u32 hash = 0;
-
- hash ^= crc32_table[p[0]];
- hash ^= p[1];
- hash ^= (u32)p[2] << 8;
-
- return hash % LZ_HC_HASH_LEN;
+ return lz_hash(p, LZ_HC_HASH_ORDER);
}
static void
lz_hc_set_default_params(struct lz_mf_params *params)
{
- if (params->min_match_len < LZ_HC_HASH_BYTES)
- params->min_match_len = LZ_HC_HASH_BYTES;
+ if (params->min_match_len < LZ_HASH_NBYTES)
+ params->min_match_len = LZ_HASH_NBYTES;
if (params->max_match_len == 0)
- params->max_match_len = params->max_window_size;
+ params->max_match_len = UINT32_MAX;
if (params->max_search_depth == 0)
params->max_search_depth = 50;
lz_hc_set_default_params(¶ms);
- /* Avoid edge case where min_match_len = 3, max_match_len = 2 */
return (params.min_match_len <= params.max_match_len);
}
lz_hc_set_default_params(&mf->base.params);
- /* Allocate space for 'hash_tab' and 'prev_tab'. */
-
mf->hash_tab = MALLOC(lz_hc_get_needed_memory(mf->base.params.max_window_size));
if (!mf->hash_tab)
return false;
- mf->prev_tab = mf->hash_tab + LZ_HC_HASH_LEN;
-
- /* Fill in the CRC32 table if not done already. */
- pthread_once(&crc32_table_filled, crc32_init);
-
return true;
}
struct lz_hc *mf = (struct lz_hc *)_mf;
memset(mf->hash_tab, 0, LZ_HC_HASH_LEN * sizeof(u32));
-
- if (size >= LZ_HC_HASH_BYTES)
- mf->next_hash = lz_hc_hash(window);
}
-static inline u32
-do_search(const u8 * restrict window,
- const u32 cur_window_pos,
- u32 prev_tab[restrict],
- u32 cur_match,
- const u32 min_len,
- const u32 max_len,
- const u32 max_search_depth,
- struct lz_match matches[restrict])
+static u32
+lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[])
{
- const u8 * const strptr = &window[cur_window_pos];
- u32 best_len = min_len - 1;
- u32 depth_remaining = max_search_depth;
- u32 num_matches = 0;
+ struct lz_hc *mf = (struct lz_hc *)_mf;
+ const u8 * const window = mf->base.cur_window;
+ const u32 cur_pos = mf->base.cur_window_pos++;
+ const u8 * const strptr = &window[cur_pos];
+ const u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
+ u32 * const prev_tab = mf->hash_tab + LZ_HC_HASH_LEN;
+ const u32 max_len = min(bytes_remaining, mf->base.params.max_match_len);
+ const u32 nice_len = min(max_len, mf->base.params.nice_match_len);
+ u32 best_len = mf->base.params.min_match_len - 1;
+ u32 depth_remaining = mf->base.params.max_search_depth;
+ struct lz_match *lz_matchptr = matches;
+ u32 hash;
+ u32 cur_match;
+ u32 sequence;
+
+ if (unlikely(bytes_remaining < LZ_HASH_REQUIRED_NBYTES + 1))
+ return 0;
+
+ /* Insert the current position into the appropriate hash chain and set
+ * 'cur_match' to the previous head.
+ *
+ * For a slight performance improvement, we do each hash calculation one
+ * position in advance and prefetch the necessary hash table entry. */
+
+ hash = mf->next_hash;
+ mf->next_hash = lz_hc_hash(strptr + 1);
+ prefetch(&mf->hash_tab[mf->next_hash]);
+ cur_match = mf->hash_tab[hash];
+ mf->hash_tab[hash] = cur_pos;
+ prev_tab[cur_pos] = cur_match;
+
+ /* Ensure we can find a match of at least the requested length. */
+ if (unlikely(best_len >= max_len))
+ return 0;
+ if (UNALIGNED_ACCESS_IS_FAST)
+ sequence = load_u24_unaligned(strptr);
+
+ /* Search the appropriate hash chain for matches. */
for (; cur_match && depth_remaining--; cur_match = prev_tab[cur_match]) {
const u8 * const matchptr = &window[cur_match];
+ u32 len;
- if (matchptr[best_len] == strptr[best_len] &&
- matchptr[best_len - 1] == strptr[best_len - 1] &&
- matchptr[0] == strptr[0])
- {
- u32 len = 0;
+ /* Considering the potential match at 'matchptr': is it longer
+ * than 'best_len'?
+ *
+ * The bytes at index 'best_len' are the most likely to differ,
+ * so check them first. */
+ if (matchptr[best_len] != strptr[best_len])
+ goto next_match;
- while (++len != max_len)
- if (matchptr[len] != strptr[len])
- break;
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ if (load_u24_unaligned(matchptr) != sequence)
+ goto next_match;
+
+ len = lz_extend(strptr, matchptr, 3, max_len);
if (len > best_len) {
- matches[num_matches++] = (struct lz_match) {
- .len = len,
+ best_len = len;
+
+ *lz_matchptr++ = (struct lz_match) {
+ .len = best_len,
.offset = strptr - matchptr,
};
- best_len = len;
- if (best_len == max_len)
+
+ if (best_len >= nice_len)
break;
}
+ } else {
+
+ /* The bytes at indices 'best_len - 1' and '0' are less
+ * important to check separately. But doing so still
+ * gives a slight performance improvement, at least on
+ * x86_64, probably because they create separate
+ * branches for the CPU to predict independently of the
+ * branches in the main comparison loops.
+ */
+ if (matchptr[best_len - 1] != strptr[best_len - 1] ||
+ matchptr[0] != strptr[0])
+ goto next_match;
+
+ for (len = 1; len < best_len - 1; len++)
+ if (matchptr[len] != strptr[len])
+ goto next_match;
+
+ /* The match is the longest found so far --- at least
+ * 'best_len' + 1 bytes. Continue extending it. */
+
+ if (++best_len != max_len &&
+ strptr[best_len] == matchptr[best_len])
+ while (++best_len != max_len)
+ if (strptr[best_len] != matchptr[best_len])
+ break;
+
+ /* Record the match. */
+ *lz_matchptr++ = (struct lz_match) {
+ .len = best_len,
+ .offset = strptr - matchptr,
+ };
+
+ /* Terminate the search if 'nice_len' was reached. */
+ if (best_len >= nice_len)
+ break;
}
- }
- return num_matches;
-}
-
-static u32
-lz_hc_get_matches(struct lz_mf *_mf, struct lz_match matches[])
-{
- struct lz_hc *mf = (struct lz_hc *)_mf;
- const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
- u32 hash;
- u32 cur_match;
- u32 num_matches = 0;
-
- if (bytes_remaining <= LZ_HC_HASH_BYTES)
- goto out;
- hash = mf->next_hash;
- mf->next_hash = lz_hc_hash(lz_mf_get_window_ptr(&mf->base) + 1);
- prefetch(&mf->hash_tab[mf->next_hash]);
- cur_match = mf->hash_tab[hash];
- mf->hash_tab[hash] = mf->base.cur_window_pos;
- mf->prev_tab[mf->base.cur_window_pos] = cur_match;
-
- num_matches = do_search(mf->base.cur_window,
- mf->base.cur_window_pos,
- mf->prev_tab,
- cur_match,
- mf->base.params.min_match_len,
- min(bytes_remaining, mf->base.params.nice_match_len),
- mf->base.params.max_search_depth,
- matches);
-
- /* If the longest match is @nice_match_len in length, it may have been
- * truncated. Try extending it up to the maximum match length. */
- if (num_matches != 0 &&
- matches[num_matches - 1].len == mf->base.params.nice_match_len)
- {
- const u8 * const strptr = lz_mf_get_window_ptr(&mf->base);
- const u8 * const matchptr = strptr - matches[num_matches - 1].offset;
- const u32 len_limit = min(bytes_remaining, mf->base.params.max_match_len);
- u32 len;
-
- len = matches[num_matches - 1].len;
- while (len < len_limit && strptr[len] == matchptr[len])
- len++;
- matches[num_matches - 1].len = len;
+ next_match:
+ /* Continue to next match in the chain. */
+ ;
}
-out:
- mf->base.cur_window_pos++;
- return num_matches;
+ return lz_matchptr - matches;
}
static void
-lz_hc_skip_position(struct lz_hc *mf)
+lz_hc_skip_positions(struct lz_mf *_mf, u32 n)
{
- const u32 bytes_remaining = lz_mf_get_bytes_remaining(&mf->base);
+ struct lz_hc *mf = (struct lz_hc *)_mf;
+ u32 * const hash_tab = mf->hash_tab;
+ u32 * const prev_tab = hash_tab + LZ_HC_HASH_LEN;
+ const u8 * const window = mf->base.cur_window;
+ u32 cur_pos = mf->base.cur_window_pos;
+ u32 end_pos = cur_pos + n;
+ const u32 bytes_remaining = mf->base.cur_window_size - cur_pos;
u32 hash;
+ u32 next_hash;
- if (bytes_remaining <= LZ_HC_HASH_BYTES)
- goto out;
+ mf->base.cur_window_pos = end_pos;
- hash = mf->next_hash;
- mf->next_hash = lz_hc_hash(lz_mf_get_window_ptr(&mf->base) + 1);
- prefetch(&mf->hash_tab[mf->next_hash]);
- mf->prev_tab[mf->base.cur_window_pos] = mf->hash_tab[hash];
- mf->hash_tab[hash] = mf->base.cur_window_pos;
+ if (unlikely(bytes_remaining < n + (LZ_HASH_REQUIRED_NBYTES + 1) - 1)) {
+ /* Nearing end of window. */
+ if (unlikely(bytes_remaining < (LZ_HASH_REQUIRED_NBYTES + 1)))
+ return;
-out:
- mf->base.cur_window_pos++;
-}
-
-static void
-lz_hc_skip_positions(struct lz_mf *_mf, u32 n)
-{
- struct lz_hc *mf = (struct lz_hc *)_mf;
+ end_pos = cur_pos + bytes_remaining - (LZ_HASH_REQUIRED_NBYTES + 1) + 1;
+ }
+ next_hash = mf->next_hash;
do {
- lz_hc_skip_position(mf);
- } while (--n);
+ hash = next_hash;
+ next_hash = lz_hc_hash(&window[cur_pos + 1]);
+ prev_tab[cur_pos] = hash_tab[hash];
+ hash_tab[hash] = cur_pos;
+ } while (++cur_pos != end_pos);
+
+ prefetch(&hash_tab[next_hash]);
+ mf->next_hash = next_hash;
}
static void