4 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
5 * traversal of the Longest Common Prefix (LCP) interval tree.
7 * Copyright 2022 Eric Biggers
9 * Permission is hereby granted, free of charge, to any person
10 * obtaining a copy of this software and associated documentation
11 * files (the "Software"), to deal in the Software without
12 * restriction, including without limitation the rights to use,
13 * copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following
18 * The above copyright notice and this permission notice shall be
19 * included in all copies or substantial portions of the Software.
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
23 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
25 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
26 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 * OTHER DEALINGS IN THE SOFTWARE.
37 #include "wimlib/divsufsort.h"
38 #include "wimlib/lcpit_matchfinder.h"
39 #include "wimlib/util.h"
/*
 * Layout constants for "normal" mode: each 32-bit SA_and_LCP/intervals entry
 * packs an LCP value in the top LCP_BITS bits and a position (or interval
 * index) in the low (32 - LCP_BITS) bits.
 * NOTE(review): the definition of LCP_BITS itself is not visible in this
 * excerpt — confirm against the full file.
 */
42 #define LCP_MAX (((u32)1 << LCP_BITS) - 1)
43 #define LCP_SHIFT (32 - LCP_BITS)
44 #define LCP_MASK (LCP_MAX << LCP_SHIFT)
45 #define POS_MASK (((u32)1 << (32 - LCP_BITS)) - 1)
/* Largest buffer size the 32-bit-entry ("normal") mode can address */
46 #define MAX_NORMAL_BUFSIZE (POS_MASK + 1)
/*
 * Layout constants for "huge" mode: each 64-bit entry packs a 7-bit LCP value
 * in the top bits, a 32-bit position in the low bits, and uses bit 32
 * (HUGE_UNVISITED_TAG) as the "interval not yet visited" flag.
 */
48 #define HUGE_LCP_BITS 7
49 #define HUGE_LCP_MAX (((u32)1 << HUGE_LCP_BITS) - 1)
50 #define HUGE_LCP_SHIFT (64 - HUGE_LCP_BITS)
51 #define HUGE_LCP_MASK ((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
52 #define HUGE_POS_MASK 0xFFFFFFFF
53 #define MAX_HUGE_BUFSIZE ((u64)HUGE_POS_MASK + 1)
54 #define HUGE_UNVISITED_TAG 0x100000000
/* Number of padding entries kept past the end of the arrays so the
 * read-ahead prefetches in the hot loops never index out of bounds
 * (see lcpit_matchfinder_load_buffer(), which zeroes these entries). */
56 #define PREFETCH_SAFETY 5
59 * Build the LCP (Longest Common Prefix) array in linear time.
61 * LCP[r] will be the length of the longest common prefix between the suffixes
62 * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined.
64 * Algorithm taken from Kasai et al. (2001), but modified slightly:
66 * - With bytes there is no realistic way to reserve a unique symbol for
67 * end-of-buffer, so use explicit checks for end-of-buffer.
69 * - For decreased memory usage and improved memory locality, pack the two
70 * logically distinct SA and LCP arrays into a single array SA_and_LCP.
72 * - Since SA_and_LCP is accessed randomly, improve the cache behavior by
73 * reading several entries ahead in ISA and prefetching the upcoming
76 * - If an LCP value is less than the minimum match length, then store 0. This
77 * avoids having to do comparisons against the minimum match length later.
79 * - If an LCP value is greater than the "nice match length", then store the
80 * "nice match length". This caps the number of bits needed to store each
81 * LCP value, and this caps the depth of the LCP-interval tree, without
82 * usually hurting the compression ratio too much.
86 * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in
87 * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th
88 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
/*
 * Compute the LCP value for each suffix-array rank and OR it into the high
 * LCP_BITS of the corresponding SA_and_LCP[] entry (Kasai et al. 2001 — see
 * the comment block above for the algorithm description and references).
 * LCP values below @min_lcp are stored as 0; values above @max_lcp are
 * clamped to @max_lcp.
 * NOTE(review): several interior lines (e.g. the declarations of 'r', 'h',
 * and 'stored_lcp', and the closing braces) are not visible in this excerpt.
 */
91 build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
92 const u8 T[restrict], const u32 n,
93 const u32 min_lcp, const u32 max_lcp)
96 for (u32 i = 0; i < n; i++) {
/* Read ahead in ISA and prefetch (for write) the SA_and_LCP entry that
 * will be updated PREFETCH_SAFETY iterations from now. */
98 prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
/* j = position of the suffix that precedes suffix i in rank order */
100 const u32 j = SA_and_LCP[r - 1] & POS_MASK;
/* Never compare past the end of the buffer (no sentinel symbol). */
101 const u32 lim = min(n - i, n - j);
102 while (h < lim && T[i + h] == T[j + h])
/* Clamp the stored LCP: below the minimum match length store 0,
 * above the nice match length store the nice match length. */
105 if (stored_lcp < min_lcp)
107 else if (stored_lcp > max_lcp)
108 stored_lcp = max_lcp;
/* Pack the LCP into the high bits alongside the position. */
109 SA_and_LCP[r] |= stored_lcp << LCP_SHIFT;
117 * Use the suffix array accompanied with the longest-common-prefix array ---
118 * which in combination can be called the "enhanced suffix array" --- to
119 * simulate a bottom-up traversal of the corresponding suffix tree, or
120 * equivalently the lcp-interval tree. Do so in suffix rank order, but save the
121 * superinterval references needed for later bottom-up traversal of the tree in
122 * suffix position order.
124 * To enumerate the lcp-intervals, this algorithm scans the suffix array and its
125 * corresponding LCP array linearly. While doing so, it maintains a stack of
126 * lcp-intervals that are currently open, meaning that their left boundaries
127 * have been seen but their right boundaries have not. The bottom of the stack
128 * is the interval which covers the entire suffix array (this has lcp=0), and
129 * the top of the stack is the deepest interval that is currently open (this has
130 * the greatest lcp of any interval on the stack). When this algorithm opens an
131 * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto
132 * the stack. When this algorithm closes an interval, it pops it from the stack
133 * and sets the intervals[] entry of that interval to the index and lcp of that
134 * interval's superinterval, which is the new top of the stack.
136 * This algorithm also sets pos_data[pos] for each suffix position 'pos' to the
137 * index and lcp of the deepest lcp-interval containing it. Alternatively, we
138 * can interpret each suffix as being associated with a singleton lcp-interval,
139 * or leaf of the suffix tree. With this interpretation, an entry in pos_data[]
140 * is the superinterval reference for one of these singleton lcp-intervals and
141 * therefore is not fundamentally different from an entry in intervals[].
143 * To reduce memory usage, this algorithm re-uses the suffix array's memory to
144 * store the generated intervals[] array. This is possible because SA and LCP
145 * are accessed linearly, and no more than one interval is generated per suffix.
147 * The techniques used in this algorithm are described in various published
148 * papers. The generation of lcp-intervals from the suffix array (SA) and the
149 * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in
150 * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in
151 * Abouelhoda et al. (2004). Both these papers note the equivalence between
152 * lcp-intervals (including the singleton lcp-interval for each suffix) and
153 * nodes of the suffix tree. Abouelhoda et al. (2004) furthermore applies
154 * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as
155 * does Chen et al. (2008). Algorithm CPS1b of Chen et al. (2008) dynamically
156 * re-uses the suffix array during bottom-up traversal of the lcp-interval tree.
160 * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
161 * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th
162 * Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
164 * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees
165 * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2
166 * Issue 1, March 2004, pp. 53-86.
168 * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization
169 * Using Less Time & Space. Mathematics in Computer Science June 2008,
170 * Volume 1, Issue 4, pp. 605-623.
/*
 * Build the LCP-interval tree in-place over the SA_and_LCP array, as
 * described in the comment block above: scan in rank order, maintain a stack
 * of open lcp-intervals, and store each closed interval's superinterval
 * reference (index | lcp) back into intervals[], while pos_data[pos] receives
 * the deepest interval containing each suffix position.
 * NOTE(review): some lines are not visible in this excerpt (e.g. the 'else {'
 * branches, the inner loop header around lines L134-L147, closing braces, and
 * the 'prev_pos = next_pos' update) — the visible lines are a fragment.
 */
173 build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
175 u32 * const SA_and_LCP = intervals;
176 u32 next_interval_idx;
/* Stack of currently-open intervals; depth is bounded by LCP_MAX
 * because stored LCP values were clamped in build_LCP(). */
177 u32 open_intervals[LCP_MAX + 1];
178 u32 *top = open_intervals;
179 u32 prev_pos = SA_and_LCP[0] & POS_MASK;
/* Interval 0 is the root; generated intervals start at index 1. */
183 next_interval_idx = 1;
185 for (u32 r = 1; r < n; r++) {
186 const u32 next_pos = SA_and_LCP[r] & POS_MASK;
187 const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
188 const u32 top_lcp = *top & LCP_MASK;
190 prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
192 if (next_lcp == top_lcp) {
193 /* Continuing the deepest open interval */
194 pos_data[prev_pos] = *top;
195 } else if (next_lcp > top_lcp) {
196 /* Opening a new interval */
197 *++top = next_lcp | next_interval_idx++;
198 pos_data[prev_pos] = *top;
200 /* Closing the deepest open interval */
201 pos_data[prev_pos] = *top;
203 const u32 closed_interval_idx = *top-- & POS_MASK;
204 const u32 superinterval_lcp = *top & LCP_MASK;
206 if (next_lcp == superinterval_lcp) {
207 /* Continuing the superinterval */
208 intervals[closed_interval_idx] = *top;
210 } else if (next_lcp > superinterval_lcp) {
211 /* Creating a new interval that is a
212 * superinterval of the one being
213 * closed, but still a subinterval of
214 * its superinterval */
215 *++top = next_lcp | next_interval_idx++;
216 intervals[closed_interval_idx] = *top;
219 /* Also closing the superinterval */
220 intervals[closed_interval_idx] = *top;
227 /* Close any still-open intervals. */
228 pos_data[prev_pos] = *top;
229 for (; top > open_intervals; top--)
230 intervals[*top & POS_MASK] = *(top - 1);
234 * Advance the LCP-interval tree matchfinder by one byte.
236 * If @record_matches is true, then matches are written to the @matches array
237 * sorted by strictly decreasing length and strictly decreasing offset, and the
238 * return value is the number of matches found. Otherwise, @matches is ignored
239 * and the return value is always 0.
241 * How this algorithm works:
243 * 'cur_pos' is the position of the current suffix, which is the suffix being
244 * matched against. 'cur_pos' starts at 0 and is incremented each time this
245 * function is called. This function finds each suffix with position less than
246 * 'cur_pos' that shares a prefix with the current suffix, but for each distinct
247 * prefix length it finds only the suffix with the greatest position (i.e. the
248 * most recently seen in the linear traversal by position). This function
249 * accomplishes this using the lcp-interval tree data structure that was built
250 * by build_LCPIT() and is updated dynamically by this function.
252 * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
253 * value of the deepest lcp-interval containing the current suffix --- or,
254 * equivalently, the parent of the conceptual singleton lcp-interval that
255 * contains the current suffix.
257 * The second step is to ascend the lcp-interval tree until reaching an interval
258 * that has not yet been visited, and link the intervals to the current suffix
259 * along the way. An lcp-interval has been visited if and only if it has been
260 * linked to a suffix. Initially, no lcp-intervals are linked to suffixes.
262 * The third step is to continue ascending the lcp-interval tree, but indirectly
263 * through suffix links rather than through the original superinterval
264 * references, and continuing to form links with the current suffix along the
265 * way. Each suffix visited during this step, except in a special case to
266 * handle outdated suffixes, is a match which can be written to matches[]. Each
267 * intervals[] entry contains the position of the next suffix to visit, which we
268 * shall call 'match_pos'; this is the most recently seen suffix that belongs to
269 * that lcp-interval. 'pos_data[match_pos]' then contains the lcp and interval
270 * index of the next lcp-interval that should be visited.
272 * We can view these arrays as describing a new set of links that gets overlaid
273 * on top of the original superinterval references of the lcp-interval tree.
274 * Each such link can connect a node of the lcp-interval tree to an ancestor
275 * more than one generation removed.
277 * For each one-byte advance, the current position becomes the most recently
278 * seen suffix for a continuous sequence of lcp-intervals from a leaf interval
279 * to the root interval. Conceptually, this algorithm needs to update all these
280 * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null"
281 * link. But actually, if some of these nodes have the same most recently seen
282 * suffix, then this algorithm just visits the pos_data[] entry for that suffix
283 * and skips over all these nodes in one step. Updating the extra nodes is
284 * accomplished by creating a redirection from the previous suffix to the
287 * Using this shortcutting scheme, it's possible for a suffix to become out of
288 * date, which means that it is no longer the most recently seen suffix for the
289 * lcp-interval under consideration. This case is detected by noticing when the
290 * next lcp-interval link actually points deeper in the tree, and it is worked
291 * around by just continuing until we get to a link that actually takes us
292 * higher in the tree. This can be described as a lazy-update scheme.
/*
 * Advance the matchfinder by one byte and optionally report matches, using
 * the lazy-update suffix-link scheme described in the comment block above.
 * @record_matches selects between match reporting and skip-only operation;
 * it is a compile-time constant at each inline call site.
 * NOTE(review): this excerpt omits several lines (local declarations such as
 * 'ref'/'super_ref'/'match_pos'/'next', some loop/brace structure, and the
 * 'matchptr++' updates) — the visible lines are a fragment.
 */
294 static forceinline u32
295 lcpit_advance_one_byte(const u32 cur_pos,
296 u32 pos_data[restrict],
297 u32 intervals[restrict],
299 struct lz_match matches[restrict],
300 const bool record_matches)
305 struct lz_match *matchptr;
307 /* Get the deepest lcp-interval containing the current suffix. */
308 ref = pos_data[cur_pos];
310 /* Prefetch upcoming data, up to 3 positions ahead. Assume the
311 * intervals are already visited. */
313 /* Prefetch the superinterval via a suffix link for the deepest
314 * lcp-interval containing the suffix starting 1 position from now. */
315 prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
317 /* Prefetch suffix link for the deepest lcp-interval containing the
318 * suffix starting 2 positions from now. */
319 next[0] = intervals[next[1]] & POS_MASK;
320 prefetchw(&pos_data[next[0]]);
322 /* Prefetch the deepest lcp-interval containing the suffix starting 3
323 * positions from now. */
324 next[1] = pos_data[cur_pos + 3] & POS_MASK;
325 prefetchw(&intervals[next[1]]);
327 /* There is no "next suffix" after the current one. */
328 pos_data[cur_pos] = 0;
330 /* Ascend until we reach a visited interval, the root, or a child of the
331 * root. Link unvisited intervals to the current suffix as we go. */
332 while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
/* Unvisited: mark it visited by linking it to cur_pos. */
333 intervals[ref & POS_MASK] = cur_pos;
337 if (super_ref == 0) {
338 /* In this case, the current interval may be any of:
340 * (2) an unvisited child of the root;
341 * (3) an interval last visited by suffix 0
343 * We could avoid the ambiguity with (3) by using an lcp
344 * placeholder value other than 0 to represent "visited", but
345 * it's fastest to use 0. So we just don't allow matches with
348 if (ref != 0) /* Not the root? */
349 intervals[ref & POS_MASK] = cur_pos;
353 /* Ascend indirectly via pos_data[] links. */
354 match_pos = super_ref;
/* Skip outdated suffixes whose links point deeper in the tree
 * (the lazy-update case described above). */
357 while ((super_ref = pos_data[match_pos]) > ref)
358 match_pos = intervals[super_ref & POS_MASK];
359 intervals[ref & POS_MASK] = cur_pos;
/* Redirect the previous suffix's link to the skipped nodes. */
360 pos_data[match_pos] = ref;
361 if (record_matches) {
362 matchptr->length = ref >> LCP_SHIFT;
363 matchptr->offset = cur_pos - match_pos;
369 match_pos = intervals[ref & POS_MASK];
/* Number of matches written (0 when !record_matches). */
371 return matchptr - matches;
374 /* Expand SA from 32 bits to 64 bits. */
376 expand_SA(void *p, u32 n)
/* The u32 and u64 views alias the same buffer, so the (not shown)
 * conversion must proceed in an order that avoids overwriting unread
 * entries; may-alias typedefs prevent strict-aliasing UB. */
378 typedef u32 _may_alias_attribute aliased_u32_t;
379 typedef u64 _may_alias_attribute aliased_u64_t;
381 aliased_u32_t *SA = p;
382 aliased_u64_t *SA64 = p;
/* NOTE(review): the expansion loop itself is not visible in this excerpt. */
390 /* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE. */
/*
 * Same Kasai-style computation as build_LCP(), but over 64-bit entries:
 * the position lives in the low 32 bits (HUGE_POS_MASK) and the clamped
 * LCP value in the top HUGE_LCP_BITS bits.
 * NOTE(review): interior lines (declarations of 'h'/'stored_lcp', closing
 * braces) are not visible in this excerpt.
 */
392 build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
393 const u8 T[restrict], const u32 n,
394 const u32 min_lcp, const u32 max_lcp)
397 for (u32 i = 0; i < n; i++) {
398 const u32 r = ISA[i];
/* Prefetch the entry to be written PREFETCH_SAFETY iterations ahead. */
399 prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
401 const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
/* Never compare past the end of the buffer. */
402 const u32 lim = min(n - i, n - j);
403 while (h < lim && T[i + h] == T[j + h])
/* Clamp the stored LCP into the [min_lcp, max_lcp] policy. */
406 if (stored_lcp < min_lcp)
408 else if (stored_lcp > max_lcp)
409 stored_lcp = max_lcp;
410 SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT;
418 * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE.
420 * This "huge" version is also slightly different in that the lcp value stored
421 * in each intervals[] entry is the lcp value for that interval, not its
422 * superinterval. This lcp value stays put in intervals[] and doesn't get moved
423 * to pos_data[] during lcpit_advance_one_byte_huge(). One consequence of this
424 * is that we have to use a special flag to distinguish visited from unvisited
425 * intervals. But overall, this scheme keeps the memory usage at 12n instead of
426 * 16n. (The non-huge version is 8n.)
/*
 * Like build_LCPIT(), but over 64-bit entries.  Per the comment above, each
 * intervals64[] entry keeps its own lcp value (rather than the
 * superinterval's), so HUGE_UNVISITED_TAG (bit 32) is OR'ed in to mark
 * intervals not yet visited by the traversal phase.
 * NOTE(review): some lines ('else {' branches, inner loop header, closing
 * braces, 'prev_pos = next_pos') are not visible in this excerpt.
 */
429 build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
431 u64 * const SA_and_LCP64 = intervals64;
432 u32 next_interval_idx;
/* Stack depth is bounded by HUGE_LCP_MAX due to clamping in
 * build_LCP_huge(). */
433 u32 open_intervals[HUGE_LCP_MAX + 1];
434 u32 *top = open_intervals;
435 u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK;
/* Interval 0 is the root; generated intervals start at index 1. */
439 next_interval_idx = 1;
441 for (u32 r = 1; r < n; r++) {
442 const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
443 const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
444 const u64 top_lcp = intervals64[*top];
446 prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
448 if (next_lcp == top_lcp) {
449 /* Continuing the deepest open interval */
450 pos_data[prev_pos] = *top;
451 } else if (next_lcp > top_lcp) {
452 /* Opening a new interval */
453 intervals64[next_interval_idx] = next_lcp;
454 pos_data[prev_pos] = next_interval_idx;
455 *++top = next_interval_idx++;
457 /* Closing the deepest open interval */
458 pos_data[prev_pos] = *top;
460 const u32 closed_interval_idx = *top--;
461 const u64 superinterval_lcp = intervals64[*top];
463 if (next_lcp == superinterval_lcp) {
464 /* Continuing the superinterval */
465 intervals64[closed_interval_idx] |=
466 HUGE_UNVISITED_TAG | *top;
468 } else if (next_lcp > superinterval_lcp) {
469 /* Creating a new interval that is a
470 * superinterval of the one being
471 * closed, but still a subinterval of
472 * its superinterval */
473 intervals64[next_interval_idx] = next_lcp;
474 intervals64[closed_interval_idx] |=
475 HUGE_UNVISITED_TAG | next_interval_idx;
476 *++top = next_interval_idx++;
479 /* Also closing the superinterval */
480 intervals64[closed_interval_idx] |=
481 HUGE_UNVISITED_TAG | *top;
488 /* Close any still-open intervals. */
489 pos_data[prev_pos] = *top;
490 for (; top > open_intervals; top--)
491 intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1);
494 /* Like lcpit_advance_one_byte(), but for buffers larger than
495 * MAX_NORMAL_BUFSIZE. */
/*
 * Same structure as lcpit_advance_one_byte(): prefetch ~3 positions ahead,
 * null the current suffix's link, ascend through unvisited intervals
 * (detected via HUGE_UNVISITED_TAG), then ascend via suffix links recording
 * one match per distinct lcp.
 * NOTE(review): several lines (declarations of 'next'/'cur'/'match_pos'/
 * 'interval_idx', the 'do {' opener of the inner loop, 'matchptr++',
 * closing braces) are not visible in this excerpt.
 */
496 static forceinline u32
497 lcpit_advance_one_byte_huge(const u32 cur_pos,
498 u32 pos_data[restrict],
499 u64 intervals64[restrict],
500 u32 prefetch_next[restrict],
501 struct lz_match matches[restrict],
502 const bool record_matches)
505 u32 next_interval_idx;
509 struct lz_match *matchptr;
/* Deepest lcp-interval containing the current suffix */
511 interval_idx = pos_data[cur_pos];
/* Prefetch pipeline, mirroring the non-huge version */
513 prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
515 prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
516 prefetchw(&pos_data[prefetch_next[0]]);
518 prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
519 prefetchw(&intervals64[prefetch_next[1]]);
/* There is no "next suffix" after the current one. */
521 pos_data[cur_pos] = 0;
/* Ascend through unvisited intervals, marking each as visited by
 * clearing the tag and linking it to cur_pos. */
523 while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
524 intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
525 interval_idx = next & HUGE_POS_MASK;
529 while (next & HUGE_LCP_MASK) {
532 match_pos = next & HUGE_POS_MASK;
533 next_interval_idx = pos_data[match_pos];
534 next = intervals64[next_interval_idx];
/* Loop while the link points deeper (outdated suffix). */
535 } while (next > cur);
536 intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
537 pos_data[match_pos] = interval_idx;
538 if (record_matches) {
539 matchptr->length = cur >> HUGE_LCP_SHIFT;
540 matchptr->offset = cur_pos - match_pos;
543 interval_idx = next_interval_idx;
/* Number of matches written (0 when !record_matches). */
545 return matchptr - matches;
/*
 * Size in bytes of the pos_data[] allocation: one u32 per position plus
 * PREFETCH_SAFETY padding entries, but at least DIVSUFSORT_TMP_LEN entries
 * since pos_data[] doubles as divsufsort()'s temporary space (see
 * lcpit_matchfinder_load_buffer()).  Computed in u64 to avoid overflow
 * before the caller's SIZE_MAX check.
 */
548 static forceinline u64
549 get_pos_data_size(size_t max_bufsize)
551 return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
552 DIVSUFSORT_TMP_LEN) * sizeof(u32);
/*
 * Size in bytes of the intervals[] allocation: one entry per position plus
 * PREFETCH_SAFETY padding — u32 entries in normal mode, u64 entries when
 * the buffer can exceed MAX_NORMAL_BUFSIZE (huge mode).
 */
555 static forceinline u64
556 get_intervals_size(size_t max_bufsize)
558 return ((u64)max_bufsize + PREFETCH_SAFETY) *
559 (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
563 * Calculate the number of bytes of memory needed for the LCP-interval tree
566 * @max_bufsize - maximum buffer size to support
568 * Returns the number of bytes required.
571 lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
/* Total = pos_data[] + intervals[]; both helpers compute in u64 so the
 * sum cannot overflow before lcpit_matchfinder_init()'s SIZE_MAX check. */
573 return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize);
577 * Initialize the LCP-interval tree matchfinder.
579 * @mf - the matchfinder structure to initialize
580 * @max_bufsize - maximum buffer size to support
581 * @min_match_len - minimum match length in bytes
582 * @nice_match_len - only consider this many bytes of each match
584 * Returns true if successfully initialized; false if out of memory.
587 lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
588 u32 min_match_len, u32 nice_match_len)
/* Reject sizes whose allocations would not fit in size_t, or that the
 * 32-bit huge-mode position encoding (minus prefetch padding) cannot
 * represent.  NOTE(review): the corresponding 'return false' lines are
 * not visible in this excerpt. */
590 if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
592 if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY)
595 mf->pos_data = MALLOC(get_pos_data_size(max_bufsize));
596 mf->intervals = MALLOC(get_intervals_size(max_bufsize));
/* On partial failure, free whichever allocation succeeded. */
597 if (!mf->pos_data || !mf->intervals) {
598 lcpit_matchfinder_destroy(mf);
/* nice_match_len is saved as "orig" because load_buffer() re-clamps it
 * per mode (LCP_MAX or HUGE_LCP_MAX). */
602 mf->min_match_len = min_match_len;
603 mf->orig_nice_match_len = nice_match_len;
608 * Build the suffix array SA for the specified byte array T of length n.
610 * The suffix array is a sorted array of the byte array's suffixes, represented
611 * by indices into the byte array. It can equivalently be viewed as a mapping
612 * from suffix rank to suffix position.
614 * To build the suffix array, we use libdivsufsort, which uses an
615 * induced-sorting-based algorithm. In practice, this seems to be the fastest
616 * suffix array construction algorithm currently available.
620 * Y. Mori. libdivsufsort, a lightweight suffix-sorting library.
621 * https://github.com/y-256/libdivsufsort
623 * G. Nong, S. Zhang, and W.H. Chan. 2009. Linear Suffix Array
624 * Construction by Almost Pure Induced-Sorting. Data Compression
625 * Conference, 2009. DCC '09. pp. 193 - 202.
627 * S.J. Puglisi, W.F. Smyth, and A. Turpin. 2007. A Taxonomy of Suffix
628 * Array Construction Algorithms. ACM Computing Surveys (CSUR) Volume 39
629 * Issue 2, 2007 Article No. 4.
/* Build the suffix array SA over T[0..n-1] using libdivsufsort; @tmp is the
 * caller-provided temporary space (pos_data[] — see load_buffer()). */
632 build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
634 /* Note: divsufsort() requires a fixed amount of temporary space. The
635 * implementation of divsufsort() has been modified from the original to
636 * use the provided temporary space instead of allocating its own, since
637 * we don't want to have to deal with malloc() failures here. */
638 divsufsort(T, SA, n, tmp);
642 * Build the inverse suffix array ISA from the suffix array SA.
644 * Whereas the suffix array is a mapping from suffix rank to suffix position,
645 * the inverse suffix array is a mapping from suffix position to suffix rank.
648 build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
650 for (u32 r = 0; r < n; r++)
/* NOTE(review): the loop body (presumably ISA[SA[r]] = r) is not
 * visible in this excerpt. */
655 * Prepare the LCP-interval tree matchfinder for a new input buffer.
657 * @mf - the initialized matchfinder structure
658 * @T - the input buffer
659 * @n - size of the input buffer in bytes. This must be nonzero and can be at
660 * most the max_bufsize with which lcpit_matchfinder_init() was called.
663 lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
665 /* intervals[] temporarily stores SA and LCP packed together.
666 * pos_data[] temporarily stores ISA.
667 * pos_data[] is also used as the temporary space for divsufsort(). */
669 build_SA(mf->intervals, T, n, mf->pos_data);
670 build_ISA(mf->pos_data, mf->intervals, n);
/* Choose the mode based on buffer size: 32-bit entries when positions
 * fit in POS_MASK, otherwise 64-bit "huge" entries. */
671 if (n <= MAX_NORMAL_BUFSIZE) {
672 mf->nice_match_len = min(mf->orig_nice_match_len, LCP_MAX);
/* Zero the padding entries so the PREFETCH_SAFETY read-aheads in
 * the hot loops stay in bounds and read harmless data. */
673 for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
674 mf->intervals[n + i] = 0;
675 mf->pos_data[n + i] = 0;
677 build_LCP(mf->intervals, mf->pos_data, T, n,
678 mf->min_match_len, mf->nice_match_len);
679 build_LCPIT(mf->intervals, mf->pos_data, n);
680 mf->huge_mode = false;
682 mf->nice_match_len = min(mf->orig_nice_match_len, HUGE_LCP_MAX);
683 for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
684 mf->intervals64[n + i] = 0;
685 mf->pos_data[n + i] = 0;
/* Widen the 32-bit SA in place before building the 64-bit
 * SA_and_LCP64 representation. */
687 expand_SA(mf->intervals, n);
688 build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
689 mf->min_match_len, mf->nice_match_len);
690 build_LCPIT_huge(mf->intervals64, mf->pos_data, n);
691 mf->huge_mode = true;
693 mf->cur_pos = 0; /* starting at beginning of input buffer */
/* Prime the prefetch pipeline state used by the advance functions.
 * NOTE(review): the loop body is not visible in this excerpt. */
694 for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
699 * Retrieve a list of matches with the next position.
701 * The matches will be recorded in the @matches array, ordered by strictly
702 * decreasing length and strictly decreasing offset.
704 * The return value is the number of matches found and written to @matches.
705 * This can be any value in [0, nice_match_len - min_match_len + 1].
708 lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
709 struct lz_match *matches)
/* Dispatch on the mode chosen by load_buffer(); both paths advance
 * cur_pos by one and record matches (record_matches == true). */
712 return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
713 mf->intervals64, mf->next,
716 return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
717 mf->intervals, mf->next,
722 * Skip the next @count bytes (don't search for matches at them). @count is
726 lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
/* Same per-byte advance as get_matches(), but with record_matches ==
 * false so the tree state is updated without reporting matches.
 * NOTE(review): the surrounding loop over @count is not visible in
 * this excerpt. */
730 lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
731 mf->intervals64, mf->next,
736 lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
737 mf->intervals, mf->next,
744 * Destroy an LCP-interval tree matchfinder that was previously initialized with
745 * lcpit_matchfinder_init().
748 lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)