wimlib.net Git - wimlib/blob - include/wimlib/hc_matchfinder.h

   1 /*
   2  * hc_matchfinder.h
   3  *
   4  * Author:      Eric Biggers
   5  * Year:        2014, 2015
   6  *
   7  * The author dedicates this file to the public domain.
   8  * You can do whatever you want with this file.
   9  *
  10  * ---------------------------------------------------------------------------
  11  *
  12  *                                 Algorithm
  13  *
  14  * This is a Hash Chains (hc) based matchfinder.
  15  *
  16  * The main data structure is a hash table where each hash bucket contains a
  17  * linked list (or "chain") of sequences whose first 4 bytes share the same hash
  18  * code.  Each sequence is identified by its starting position in the input
  19  * buffer.
  20  *
  21  * The algorithm processes the input buffer sequentially.  At each byte
  22  * position, the hash code of the first 4 bytes of the sequence beginning at
  23  * that position (the sequence being matched against) is computed.  This
  24  * identifies the hash bucket to use for that position.  Then, this hash
  25  * bucket's linked list is searched for matches.  Then, a new linked list node
  26  * is created to represent the current sequence and is prepended to the list.
  27  *
  28  * This algorithm has several useful properties:
  29  *
  30  * - It only finds true Lempel-Ziv matches; i.e., those where the matching
  31  *   sequence occurs prior to the sequence being matched against.
  32  *
  33  * - The sequences in each linked list are always sorted by decreasing starting
  34  *   position.  Therefore, the closest (smallest offset) matches are found
  35  *   first, which in many compression formats tend to be the cheapest to encode.
  36  *
  37  * - Although fast running time is not guaranteed due to the possibility of the
  38  *   lists getting very long, the worst degenerate behavior can be easily
  39  *   prevented by capping the number of nodes searched at each position.
  40  *
  41  * - If the compressor decides not to search for matches at a certain position,
  42  *   then that position can be quickly inserted without searching the list.
  43  *
  44  * - The algorithm is adaptable to sliding windows: just store the positions
  45  *   relative to a "base" value that is updated from time to time, and stop
  46  *   searching each list when the sequences get too far away.
  47  *
  48  * ---------------------------------------------------------------------------
  49  *
  50  *                              Notes on usage
  51  *
  52  * You must define MATCHFINDER_MAX_WINDOW_ORDER before including this header
  53  * because that determines which integer type to use for positions.  Since
  54  * 16-bit integers are faster than 32-bit integers due to reduced memory usage
  55  * (and therefore reduced cache pressure), the code only uses 32-bit integers if
  56  * they are needed to represent all possible positions.
  57  *
  58  * The number of bytes that must be allocated for a given 'struct
  59  * hc_matchfinder' must be gotten by calling hc_matchfinder_size().
  60  *
  61  * ----------------------------------------------------------------------------
  62  *
  63  *                               Optimizations
  64  *
  65  * The main hash table and chains handle length 4+ matches.  Length 3 matches
  66  * are handled by a separate hash table with no chains.  This works well for
  67  * typical "greedy" or "lazy"-style compressors, where length 3 matches are
  68  * often only helpful if they have small offsets.  Instead of searching a full
  69  * chain for length 3+ matches, the algorithm just checks for one close length 3
  70  * match, then focuses on finding length 4+ matches.
  71  *
  72  * The longest_match() and skip_positions() functions are inlined into the
  73  * compressors that use them.  This isn't just about saving the overhead of a
  74  * function call.  These functions are intended to be called from the inner
  75  * loops of compressors, where giving the compiler more control over register
  76  * allocation is very helpful.  There is also significant benefit to be gained
  77  * from allowing the CPU to predict branches independently at each call site.
  78  * For example, "lazy"-style compressors can be written with two calls to
  79  * longest_match(), each of which starts with a different 'best_len' and
  80  * therefore has significantly different performance characteristics.
  81  *
  82  * Although any hash function can be used, a multiplicative hash is fast and
  83  * works well.
  84  *
  85  * On some processors, it is significantly faster to extend matches by whole
  86  * words (32 or 64 bits) instead of by individual bytes.  For this to be the
  87  * case, the processor must implement unaligned memory accesses efficiently and
  88  * must have either a fast "find first set bit" instruction or a fast "find last
  89  * set bit" instruction, depending on the processor's endianness.
  90  *
  91  * The code uses one loop for finding the first match and one loop for finding a
  92  * longer match.  Each of these loops is tuned for its respective task and in
  93  * combination are faster than a single generalized loop that handles both
  94  * tasks.
  95  *
  96  * The code also uses a tight inner loop that only compares the last and first
  97  * bytes of a potential match.  It is only when these bytes match that a full
  98  * match extension is attempted.
  99  *
 100  * ----------------------------------------------------------------------------
 101  */
 102
 103 #ifndef _HC_MATCHFINDER_H
 104 #define _HC_MATCHFINDER_H
 105
 106 #ifndef MATCHFINDER_MAX_WINDOW_ORDER
 107 #  error "MATCHFINDER_MAX_WINDOW_ORDER must be defined!"
 108 #endif
 109
 110 #include <string.h>
 111
 112 #include "wimlib/lz_extend.h"
 113 #include "wimlib/lz_hash.h"
 114 #include "wimlib/unaligned.h"
 115
 116 #if MATCHFINDER_MAX_WINDOW_ORDER <= 16
 117 typedef u16 pos_t;
 118 #else
 119 typedef u32 pos_t;
 120 #endif
 121
 122 #define HC_MATCHFINDER_HASH3_ORDER      14
 123 #define HC_MATCHFINDER_HASH4_ORDER      15
 124
 125 struct hc_matchfinder {
 126
 127         /* The hash table for finding length 3 matches  */
 128         pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
 129
 130         /* The hash table which contains the first nodes of the linked lists for
 131          * finding length 4+ matches  */
 132         pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
 133
 134         /* The "next node" references for the linked lists.  The "next node" of
 135          * the node for the sequence with position 'pos' is 'next_tab[pos]'.  */
 136         pos_t next_tab[];
 137 };
 138
 139 /* Return the number of bytes that must be allocated for a 'hc_matchfinder' that
 140  * can work with buffers up to the specified size.  */
 141 static inline size_t
 142 hc_matchfinder_size(size_t max_bufsize)
 143 {
 144         return sizeof(struct hc_matchfinder) + (max_bufsize * sizeof(pos_t));
 145 }
 146
 147 /* Prepare the matchfinder for a new input buffer.  */
 148 static inline void
 149 hc_matchfinder_init(struct hc_matchfinder *mf)
 150 {
 151         memset(mf, 0, sizeof(*mf));
 152 }
 153
 154 /*
 155  * Find the longest match longer than 'best_len' bytes.
 156  *
 157  * @mf
 158  *      The matchfinder structure.
 159  * @in_begin
 160  *      Pointer to the beginning of the input buffer.
 161  * @cur_pos
 162  *      The current position in the input buffer (the position of the sequence
 163  *      being matched against).
 164  * @best_len
 165  *      Require a match longer than this length.
 166  * @max_len
 167  *      The maximum permissible match length at this position.
 168  * @nice_len
 169  *      Stop searching if a match of at least this length is found.
 170  *      Must be <= @max_len.
 171  * @max_search_depth
 172  *      Limit on the number of potential matches to consider.  Must be >= 1.
 173  * @next_hashes
 174  *      The precomputed hash codes for the sequence beginning at @in_next.
 175  *      These will be used and then updated with the precomputed hashcodes for
 176  *      the sequence beginning at @in_next + 1.
 177  * @offset_ret
 178  *      If a match is found, its offset is returned in this location.
 179  *
 180  * Return the length of the match found, or 'best_len' if no match longer than
 181  * 'best_len' was found.
 182  */
 183 static inline u32
 184 hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
 185                              const u8 * const restrict in_begin,
 186                              const ptrdiff_t cur_pos,
 187                              u32 best_len,
 188                              const u32 max_len,
 189                              const u32 nice_len,
 190                              const u32 max_search_depth,
 191                              u32 next_hashes[const restrict static 2],
 192                              u32 * const restrict offset_ret)
 193 {
 194         const u8 *in_next = in_begin + cur_pos;
 195         u32 depth_remaining = max_search_depth;
 196         const u8 *best_matchptr = best_matchptr; /* uninitialized */
 197         pos_t cur_node3, cur_node4;
 198         u32 hash3, hash4;
 199         u32 next_seq3, next_seq4;
 200         u32 seq4;
 201         const u8 *matchptr;
 202         u32 len;
 203
 204         if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
 205                 goto out;
 206
 207         /* Get the precomputed hash codes.  */
 208         hash3 = next_hashes[0];
 209         hash4 = next_hashes[1];
 210
 211         /* From the hash buckets, get the first node of each linked list.  */
 212         cur_node3 = mf->hash3_tab[hash3];
 213         cur_node4 = mf->hash4_tab[hash4];
 214
 215         /* Update for length 3 matches.  This replaces the singleton node in the
 216          * 'hash3' bucket with the node for the current sequence.  */
 217         mf->hash3_tab[hash3] = cur_pos;
 218
 219         /* Update for length 4 matches.  This prepends the node for the current
 220          * sequence to the linked list in the 'hash4' bucket.  */
 221         mf->hash4_tab[hash4] = cur_pos;
 222         mf->next_tab[cur_pos] = cur_node4;
 223
 224         /* Compute the next hash codes.  */
 225         next_seq4 = load_u32_unaligned(in_next + 1);
 226         next_seq3 = loaded_u32_to_u24(next_seq4);
 227         next_hashes[0] = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
 228         next_hashes[1] = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
 229         prefetchw(&mf->hash3_tab[next_hashes[0]]);
 230         prefetchw(&mf->hash4_tab[next_hashes[1]]);
 231
 232         if (best_len < 4) {  /* No match of length >= 4 found yet?  */
 233
 234                 /* Check for a length 3 match if needed.  */
 235
 236                 if (!cur_node3)
 237                         goto out;
 238
 239                 seq4 = load_u32_unaligned(in_next);
 240
 241                 if (best_len < 3) {
 242                         matchptr = &in_begin[cur_node3];
 243                         if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
 244                                 best_len = 3;
 245                                 best_matchptr = matchptr;
 246                         }
 247                 }
 248
 249                 /* Check for a length 4 match.  */
 250
 251                 if (!cur_node4)
 252                         goto out;
 253
 254                 for (;;) {
 255                         /* No length 4 match found yet.  Check the first 4 bytes.  */
 256                         matchptr = &in_begin[cur_node4];
 257
 258                         if (load_u32_unaligned(matchptr) == seq4)
 259                                 break;
 260
 261                         /* The first 4 bytes did not match.  Keep trying.  */
 262                         cur_node4 = mf->next_tab[cur_node4];
 263                         if (!cur_node4 || !--depth_remaining)
 264                                 goto out;
 265                 }
 266
 267                 /* Found a match of length >= 4.  Extend it to its full length.  */
 268                 best_matchptr = matchptr;
 269                 best_len = lz_extend(in_next, best_matchptr, 4, max_len);
 270                 if (best_len >= nice_len)
 271                         goto out;
 272                 cur_node4 = mf->next_tab[cur_node4];
 273                 if (!cur_node4 || !--depth_remaining)
 274                         goto out;
 275         } else {
 276                 if (!cur_node4 || best_len >= nice_len)
 277                         goto out;
 278         }
 279
 280         /* Check for matches of length >= 5.  */
 281
 282         for (;;) {
 283                 for (;;) {
 284                         matchptr = &in_begin[cur_node4];
 285
 286                         /* Already found a length 4 match.  Try for a longer
 287                          * match; start by checking either the last 4 bytes and
 288                          * the first 4 bytes, or the last byte.  (The last byte,
 289                          * the one which would extend the match length by 1, is
 290                          * the most important.)  */
 291                 #if UNALIGNED_ACCESS_IS_FAST
 292                         if ((load_u32_unaligned(matchptr + best_len - 3) ==
 293                              load_u32_unaligned(in_next + best_len - 3)) &&
 294                             (load_u32_unaligned(matchptr) ==
 295                              load_u32_unaligned(in_next)))
 296                 #else
 297                         if (matchptr[best_len] == in_next[best_len])
 298                 #endif
 299                                 break;
 300
 301                         /* Continue to the next node in the list.  */
 302                         cur_node4 = mf->next_tab[cur_node4];
 303                         if (!cur_node4 || !--depth_remaining)
 304                                 goto out;
 305                 }
 306
 307         #if UNALIGNED_ACCESS_IS_FAST
 308                 len = 4;
 309         #else
 310                 len = 0;
 311         #endif
 312                 len = lz_extend(in_next, matchptr, len, max_len);
 313                 if (len > best_len) {
 314                         /* This is the new longest match.  */
 315                         best_len = len;
 316                         best_matchptr = matchptr;
 317                         if (best_len >= nice_len)
 318                                 goto out;
 319                 }
 320
 321                 /* Continue to the next node in the list.  */
 322                 cur_node4 = mf->next_tab[cur_node4];
 323                 if (!cur_node4 || !--depth_remaining)
 324                         goto out;
 325         }
 326 out:
 327         *offset_ret = in_next - best_matchptr;
 328         return best_len;
 329 }
 330
 331 /*
 332  * Advance the matchfinder, but don't search for matches.
 333  *
 334  * @mf
 335  *      The matchfinder structure.
 336  * @in_begin
 337  *      Pointer to the beginning of the input buffer.
 338  * @cur_pos
 339  *      The current position in the input buffer (the position of the sequence
 340  *      being matched against).
 341  * @end_pos
 342  *      The length of the input buffer.
 343  * @next_hashes
 344  *      The precomputed hash codes for the sequence beginning at @in_next.
 345  *      These will be used and then updated with the precomputed hashcodes for
 346  *      the sequence beginning at @in_next + @count.
 347  * @count
 348  *      The number of bytes to advance.  Must be > 0.
 349  *
 350  * Returns @in_next + @count.
 351  */
 352 static inline const u8 *
 353 hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
 354                               const u8 * const restrict in_begin,
 355                               const ptrdiff_t cur_pos,
 356                               const ptrdiff_t end_pos,
 357                               const u32 count,
 358                               u32 next_hashes[const restrict static 2])
 359 {
 360         const u8 *in_next = in_begin + cur_pos;
 361         const u8 * const stop_ptr = in_next + count;
 362
 363         if (likely(count + 5 <= end_pos - cur_pos)) {
 364                 u32 hash3, hash4;
 365                 u32 next_seq3, next_seq4;
 366
 367                 hash3 = next_hashes[0];
 368                 hash4 = next_hashes[1];
 369                 do {
 370                         mf->hash3_tab[hash3] = in_next - in_begin;
 371                         mf->next_tab[in_next - in_begin] = mf->hash4_tab[hash4];
 372                         mf->hash4_tab[hash4] = in_next - in_begin;
 373
 374                         next_seq4 = load_u32_unaligned(++in_next);
 375                         next_seq3 = loaded_u32_to_u24(next_seq4);
 376                         hash3 = lz_hash(next_seq3, HC_MATCHFINDER_HASH3_ORDER);
 377                         hash4 = lz_hash(next_seq4, HC_MATCHFINDER_HASH4_ORDER);
 378
 379                 } while (in_next != stop_ptr);
 380
 381                 prefetchw(&mf->hash3_tab[hash3]);
 382                 prefetchw(&mf->hash4_tab[hash4]);
 383                 next_hashes[0] = hash3;
 384                 next_hashes[1] = hash4;
 385         }
 386
 387         return stop_ptr;
 388 }
 389
 390 #endif /* _HC_MATCHFINDER_H */