/*
 * A match-finder for Lempel-Ziv compression based on bottom-up construction and
 * traversal of the Longest Common Prefix (LCP) interval tree.
 *
 * The author dedicates this file to the public domain.
 * You can do whatever you want with this file.
 */

#include "wimlib/divsufsort.h"
#include "wimlib/lcpit_matchfinder.h"
#include "wimlib/util.h"

/* Number of bits reserved for each LCP value in a packed entry */
#define LCP_BITS		6

#define LCP_MAX			(((u32)1 << LCP_BITS) - 1)
#define POS_MASK		(((u32)1 << (32 - LCP_BITS)) - 1)
#define LCP_SHIFT		(32 - LCP_BITS)
#define LCP_MASK		(LCP_MAX << LCP_SHIFT)
#define MAX_NORMAL_BUFSIZE	(POS_MASK + 1)

#define HUGE_LCP_BITS		7
#define HUGE_LCP_MAX		(((u32)1 << HUGE_LCP_BITS) - 1)
#define HUGE_LCP_SHIFT		(64 - HUGE_LCP_BITS)
#define HUGE_POS_MASK		0xFFFFFFFF
#define MAX_HUGE_BUFSIZE	((u64)HUGE_POS_MASK + 1)
#define HUGE_UNVISITED_TAG	0x100000000

#define PREFETCH_SAFETY		5
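
/*
 * Illustrative helpers (a sketch, not part of the matchfinder itself): how an
 * (lcp, position) pair is packed into one 32-bit entry in the normal mode.
 * For example, with LCP_BITS == 6 we have LCP_SHIFT == 26, so lcp == 3 and
 * pos == 100 pack to (3 << 26) | 100 == 0x0C000064.
 */
static inline u32
pack_lcp_and_pos(u32 lcp, u32 pos)
{
	return (lcp << LCP_SHIFT) | pos;
}

static inline u32
unpack_lcp(u32 entry)
{
	return entry >> LCP_SHIFT;
}

static inline u32
unpack_pos(u32 entry)
{
	return entry & POS_MASK;
}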

/*
 * Build the LCP (Longest Common Prefix) array in linear time.
 *
 * LCP[r] will be the length of the longest common prefix between the suffixes
 * with positions SA[r - 1] and SA[r].  LCP[0] will be undefined.
 *
 * Algorithm taken from Kasai et al. (2001), but modified slightly:
 *
 *  - With bytes there is no realistic way to reserve a unique symbol for
 *    end-of-buffer, so use explicit checks for end-of-buffer.
 *
 *  - For decreased memory usage and improved memory locality, pack the two
 *    logically distinct SA and LCP arrays into a single array SA_and_LCP.
 *
 *  - Since SA_and_LCP is accessed randomly, improve the cache behavior by
 *    reading several entries ahead in ISA and prefetching the upcoming
 *    entries of SA_and_LCP.
 *
 *  - If an LCP value is less than the minimum match length, then store 0.  This
 *    avoids having to do comparisons against the minimum match length later.
 *
 *  - If an LCP value is greater than the "nice match length", then store the
 *    "nice match length".  This caps the number of bits needed to store each
 *    LCP value, and this caps the depth of the LCP-interval tree, without
 *    usually hurting the compression ratio too much.
 *
 * References:
 *
 *	Kasai et al.  2001.  Linear-Time Longest-Common-Prefix Computation in
 *	Suffix Arrays and Its Applications.  CPM '01 Proceedings of the 12th
 *	Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
 */
static void
build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
	  const u8 T[restrict], const u32 n,
	  const u32 min_lcp, const u32 max_lcp)
{
	u32 h = 0;

	for (u32 i = 0; i < n; i++) {
		const u32 r = ISA[i];
		prefetch(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
		if (r > 0) {
			const u32 j = SA_and_LCP[r - 1] & POS_MASK;
			const u32 lim = min(n - i, n - j);
			while (h < lim && T[i + h] == T[j + h])
				h++;
			u32 stored_lcp = h;
			if (stored_lcp < min_lcp)
				stored_lcp = 0;
			else if (stored_lcp > max_lcp)
				stored_lcp = max_lcp;
			SA_and_LCP[r] |= stored_lcp << LCP_SHIFT;
			if (h > 0)
				h--; /* Kasai invariant: h decreases by at most 1 */
		}
	}
}
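
/*
 * Worked example (illustrative, not from the original file): for T = "abab",
 * n == 4, the sorted suffixes are "ab" (2), "abab" (0), "b" (3), "bab" (1),
 * so SA = [2, 0, 3, 1] and the raw LCP values are [-, 2, 0, 1].  With
 * min_lcp == 2, the final lcp of 1 is stored as 0, giving:
 *
 *	SA_and_LCP[0] = (0 << LCP_SHIFT) | 2
 *	SA_and_LCP[1] = (2 << LCP_SHIFT) | 0
 *	SA_and_LCP[2] = (0 << LCP_SHIFT) | 3
 *	SA_and_LCP[3] = (0 << LCP_SHIFT) | 1
 */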

/*
 * Use the suffix array accompanied by the longest-common-prefix array ---
 * which in combination can be called the "enhanced suffix array" --- to
 * simulate a bottom-up traversal of the corresponding suffix tree, or
 * equivalently the lcp-interval tree.  Do so in suffix rank order, but save the
 * superinterval references needed for later bottom-up traversal of the tree in
 * suffix position order.
 *
 * To enumerate the lcp-intervals, this algorithm scans the suffix array and its
 * corresponding LCP array linearly.  While doing so, it maintains a stack of
 * lcp-intervals that are currently open, meaning that their left boundaries
 * have been seen but their right boundaries have not.  The bottom of the stack
 * is the interval which covers the entire suffix array (this has lcp=0), and
 * the top of the stack is the deepest interval that is currently open (this has
 * the greatest lcp of any interval on the stack).  When this algorithm opens an
 * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto
 * the stack.  When this algorithm closes an interval, it pops it from the stack
 * and sets the intervals[] entry of that interval to the index and lcp of that
 * interval's superinterval, which is the new top of the stack.
 *
 * This algorithm also sets pos_data[pos] for each suffix position 'pos' to the
 * index and lcp of the deepest lcp-interval containing it.  Alternatively, we
 * can interpret each suffix as being associated with a singleton lcp-interval,
 * or leaf of the suffix tree.  With this interpretation, an entry in pos_data[]
 * is the superinterval reference for one of these singleton lcp-intervals and
 * therefore is not fundamentally different from an entry in intervals[].
 *
 * To reduce memory usage, this algorithm re-uses the suffix array's memory to
 * store the generated intervals[] array.  This is possible because SA and LCP
 * are accessed linearly, and no more than one interval is generated per suffix.
 *
 * The techniques used in this algorithm are described in various published
 * papers.  The generation of lcp-intervals from the suffix array (SA) and the
 * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in
 * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in
 * Abouelhoda et al. (2004).  Both these papers note the equivalence between
 * lcp-intervals (including the singleton lcp-interval for each suffix) and
 * nodes of the suffix tree.  Abouelhoda et al. (2004) furthermore applies
 * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as
 * does Chen et al. (2008).  Algorithm CPS1b of Chen et al. (2008) dynamically
 * re-uses the suffix array during bottom-up traversal of the lcp-interval tree.
 *
 * References:
 *
 *	Kasai et al.  Linear-Time Longest-Common-Prefix Computation in Suffix
 *	Arrays and Its Applications.  2001.  CPM '01 Proceedings of the 12th
 *	Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
 *
 *	M.I. Abouelhoda, S. Kurtz, E. Ohlebusch.  2004.  Replacing Suffix Trees
 *	With Enhanced Suffix Arrays.  Journal of Discrete Algorithms Volume 2
 *	Issue 1, March 2004, pp. 53-86.
 *
 *	G. Chen, S.J. Puglisi, W.F. Smyth.  2008.  Lempel-Ziv Factorization
 *	Using Less Time & Space.  Mathematics in Computer Science June 2008,
 *	Volume 1, Issue 4, pp. 605-623.
 */
static void
build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
{
	u32 * const SA_and_LCP = intervals;
	u32 next_interval_idx;
	u32 open_intervals[LCP_MAX + 1];
	u32 *top = open_intervals;
	u32 prev_pos = SA_and_LCP[0] & POS_MASK;

	/* The root interval (lcp=0, index 0) covers the entire suffix array
	 * and stays open until the very end. */
	*top = 0;
	intervals[0] = 0;
	next_interval_idx = 1;

	for (u32 r = 1; r < n; r++) {
		const u32 next_pos = SA_and_LCP[r] & POS_MASK;
		const u32 next_lcp = SA_and_LCP[r] >> LCP_SHIFT;
		const u32 top_lcp = *top >> LCP_SHIFT;

		if (next_lcp == top_lcp) {
			/* Continuing the deepest open interval */
			pos_data[prev_pos] = *top;
		} else if (next_lcp > top_lcp) {
			/* Opening a new interval */
			*++top = (next_lcp << LCP_SHIFT) | next_interval_idx++;
			pos_data[prev_pos] = *top;
		} else {
			/* Closing the deepest open interval */
			pos_data[prev_pos] = *top;
			for (;;) {
				const u32 closed_interval_idx = *top-- & POS_MASK;
				const u32 superinterval_lcp = *top >> LCP_SHIFT;

				if (next_lcp == superinterval_lcp) {
					/* Continuing the superinterval */
					intervals[closed_interval_idx] = *top;
					break;
				} else if (next_lcp > superinterval_lcp) {
					/* Creating a new interval that is a
					 * superinterval of the one being
					 * closed, but still a subinterval of
					 * its superinterval */
					*++top = (next_lcp << LCP_SHIFT) |
						 next_interval_idx++;
					intervals[closed_interval_idx] = *top;
					break;
				} else {
					/* Also closing the superinterval */
					intervals[closed_interval_idx] = *top;
				}
			}
		}
		prev_pos = next_pos;
	}

	/* Close any still-open intervals. */
	pos_data[prev_pos] = *top;
	for (; top > open_intervals; top--)
		intervals[*top & POS_MASK] = *(top - 1);
}
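
/*
 * Continuing the "abab" worked example (illustrative, not from the original
 * file): build_LCPIT() opens interval 1 (lcp 2, ranks 0-1, holding suffixes
 * 2 and 0) inside the root interval 0 (lcp 0).  It produces
 *
 *	intervals[1] = (0 << LCP_SHIFT) | 0	(superinterval ref: root)
 *	pos_data[0]  = (2 << LCP_SHIFT) | 1
 *	pos_data[1]  = 0			(deepest interval: root)
 *	pos_data[2]  = (2 << LCP_SHIFT) | 1
 *	pos_data[3]  = 0
 */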

/*
 * Advance the LCP-interval tree matchfinder by one byte.
 *
 * If @record_matches is true, then matches are written to the @matches array
 * sorted by strictly decreasing length and strictly decreasing offset, and the
 * return value is the number of matches found.  Otherwise, @matches is ignored
 * and the return value is always 0.
 *
 * How this algorithm works:
 *
 * 'cur_pos' is the position of the current suffix, which is the suffix being
 * matched against.  'cur_pos' starts at 0 and is incremented each time this
 * function is called.  This function finds each suffix with position less than
 * 'cur_pos' that shares a prefix with the current position, but for each
 * distinct prefix length it finds only the suffix with the greatest position
 * (i.e. the most recently seen in the linear traversal by position).  This
 * function accomplishes this using the lcp-interval tree data structure that
 * was built by build_LCPIT() and is updated dynamically by this function.
 *
 * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
 * value of the deepest lcp-interval containing the current suffix --- or,
 * equivalently, the parent of the conceptual singleton lcp-interval that
 * contains the current suffix.
 *
 * The second step is to ascend the lcp-interval tree until reaching an interval
 * that has not yet been visited, and link the intervals to the current suffix
 * along the way.  An lcp-interval has been visited if and only if it has been
 * linked to a suffix.  Initially, no lcp-intervals are linked to suffixes.
 *
 * The third step is to continue ascending the lcp-interval tree, but indirectly
 * through suffix links rather than through the original superinterval
 * references, and continuing to form links with the current suffix along the
 * way.  Each suffix visited during this step, except in a special case to
 * handle outdated suffixes, is a match which can be written to matches[].  Each
 * intervals[] entry contains the position of the next suffix to visit, which we
 * shall call 'match_pos'; this is the most recently seen suffix that belongs to
 * that lcp-interval.  'pos_data[match_pos]' then contains the lcp and interval
 * index of the next lcp-interval that should be visited.
 *
 * We can view these arrays as describing a new set of links that gets overlaid
 * on top of the original superinterval references of the lcp-interval tree.
 * Each such link can connect a node of the lcp-interval tree to an ancestor
 * more than one generation removed.
 *
 * For each one-byte advance, the current position becomes the most recently
 * seen suffix for a continuous sequence of lcp-intervals from a leaf interval
 * to the root interval.  Conceptually, this algorithm needs to update all these
 * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null"
 * link.  But actually, if some of these nodes have the same most recently seen
 * suffix, then this algorithm just visits the pos_data[] entry for that suffix
 * and skips over all these nodes in one step.  Updating the extra nodes is
 * accomplished by creating a redirection from the previous suffix to the
 * current suffix.
 *
 * Using this shortcutting scheme, it's possible for a suffix to become out of
 * date, which means that it is no longer the most recently seen suffix for the
 * lcp-interval under consideration.  This case is detected by noticing when the
 * next lcp-interval link actually points deeper in the tree, and it is worked
 * around by just continuing until we get to a link that actually takes us
 * higher in the tree.  This can be described as a lazy-update scheme.
 */
static inline u32
lcpit_advance_one_byte(const u32 cur_pos,
		       u32 pos_data[restrict],
		       u32 intervals[restrict],
		       struct lz_match matches[restrict],
		       const bool record_matches)
{
	u32 lcp;
	u32 interval_idx;
	u32 match_pos;
	struct lz_match *matchptr;

	/* Get the deepest lcp-interval containing the current suffix. */
	lcp = pos_data[cur_pos] >> LCP_SHIFT;
	interval_idx = pos_data[cur_pos] & POS_MASK;
	prefetch(&intervals[pos_data[cur_pos + 1] & POS_MASK]);
	pos_data[cur_pos] = 0;

	/* Ascend until we reach a visited interval, linking the unvisited
	 * intervals to the current suffix as we go. */
	while (intervals[interval_idx] & LCP_MASK) {
		const u32 superinterval_lcp = intervals[interval_idx] >> LCP_SHIFT;
		const u32 superinterval_idx = intervals[interval_idx] & POS_MASK;
		intervals[interval_idx] = cur_pos;
		lcp = superinterval_lcp;
		interval_idx = superinterval_idx;
	}

	match_pos = intervals[interval_idx] & POS_MASK;
	if (match_pos == 0) {
		/* Ambiguous case; just don't allow matches with position 0. */
		if (interval_idx != 0)
			intervals[interval_idx] = cur_pos;
		return 0;
	}

	matchptr = matches;
	/* Ascend indirectly via pos_data[] links. */
	for (;;) {
		u32 next_interval_idx;
		u32 next_lcp;

		for (;;) {
			next_lcp = pos_data[match_pos] >> LCP_SHIFT;
			next_interval_idx = pos_data[match_pos] & POS_MASK;
			if (next_lcp < lcp)
				break;
			/* Suffix was out of date. */
			match_pos = intervals[next_interval_idx];
		}
		intervals[interval_idx] = cur_pos;
		pos_data[match_pos] = (lcp << LCP_SHIFT) | interval_idx;
		if (record_matches) {
			matchptr->length = lcp;
			matchptr->offset = cur_pos - match_pos;
			matchptr++;
		}
		if (next_interval_idx == 0)
			break;
		match_pos = intervals[next_interval_idx];
		interval_idx = next_interval_idx;
		lcp = next_lcp;
	}
	return matchptr - matches;
}

/* Expand SA from 32 bits to 64 bits. */
static void
expand_SA(u32 *p, u32 n)
{
	typedef u32 _may_alias_attribute aliased_u32_t;
	typedef u64 _may_alias_attribute aliased_u64_t;

	aliased_u32_t *SA = (aliased_u32_t *)p;
	aliased_u64_t *SA64 = (aliased_u64_t *)p;

	/* Expand in place, going backwards so that no entry is overwritten
	 * before it has been read. */
	u32 r = n - 1;
	do {
		SA64[r] = SA[r];
	} while (r--);
}
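
/*
 * For example (illustrative): with n == 3 and SA occupying three u32 words
 * {4, 1, 0}, the loop writes SA64[2] (words 4-5), then SA64[1] (words 2-3),
 * then SA64[0] (words 0-1).  Since 2*r >= r, the write of SA64[r] never
 * clobbers a not-yet-read SA entry.
 */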

/* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE. */
static void
build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
	       const u8 T[restrict], const u32 n,
	       const u32 min_lcp, const u32 max_lcp)
{
	u32 h = 0;

	for (u32 i = 0; i < n; i++) {
		const u32 r = ISA[i];
		prefetch(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
		if (r > 0) {
			const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
			const u32 lim = min(n - i, n - j);
			while (h < lim && T[i + h] == T[j + h])
				h++;
			u32 stored_lcp = h;
			if (stored_lcp < min_lcp)
				stored_lcp = 0;
			else if (stored_lcp > max_lcp)
				stored_lcp = max_lcp;
			SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT;
			if (h > 0)
				h--;
		}
	}
}

/*
 * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE.
 *
 * This "huge" version is also slightly different in that the lcp value stored
 * in each intervals[] entry is the lcp value for that interval, not its
 * superinterval.  This lcp value stays put in intervals[] and doesn't get moved
 * to pos_data[] during lcpit_advance_one_byte_huge().  One consequence of this
 * is that we have to use a special flag to distinguish visited from unvisited
 * intervals.  But overall, this scheme keeps the memory usage at 12n instead of
 * 16n.  (The non-huge version is 8n.)
 */
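
/*
 * Worked arithmetic for the memory-usage claim above (illustrative): in huge
 * mode, intervals64[] costs 8 bytes per position and pos_data[] costs 4, for
 * 12n total.  If the lcp values moved into pos_data[] as in the normal mode,
 * each pos_data[] entry would need 7 lcp bits plus 32 position bits and thus
 * 8 bytes, giving 8n + 8n = 16n.  The normal mode packs both fields into u32
 * entries in both arrays: 4n + 4n = 8n.
 */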

static void
build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
{
	u64 * const SA_and_LCP64 = intervals64;
	u32 next_interval_idx;
	u32 open_intervals[HUGE_LCP_MAX + 1];
	u32 *top = open_intervals;
	u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK;

	/* The root interval (lcp=0, index 0) stays open until the very end. */
	*top = 0;
	intervals64[0] = 0;
	next_interval_idx = 1;

	for (u32 r = 1; r < n; r++) {
		const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
		const u32 next_lcp = SA_and_LCP64[r] >> HUGE_LCP_SHIFT;
		const u32 top_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;

		if (next_lcp == top_lcp) {
			/* Continuing the deepest open interval */
			pos_data[prev_pos] = *top;
		} else if (next_lcp > top_lcp) {
			/* Opening a new interval */
			intervals64[next_interval_idx] = (u64)next_lcp << HUGE_LCP_SHIFT;
			pos_data[prev_pos] = next_interval_idx;
			*++top = next_interval_idx++;
		} else {
			/* Closing the deepest open interval */
			pos_data[prev_pos] = *top;
			for (;;) {
				const u32 closed_interval_idx = *top--;
				const u32 superinterval_lcp = intervals64[*top] >> HUGE_LCP_SHIFT;

				if (next_lcp == superinterval_lcp) {
					/* Continuing the superinterval */
					intervals64[closed_interval_idx] |=
						HUGE_UNVISITED_TAG | *top;
					break;
				} else if (next_lcp > superinterval_lcp) {
					/* Creating a new interval that is a
					 * superinterval of the one being
					 * closed, but still a subinterval of
					 * its superinterval */
					intervals64[next_interval_idx] =
						(u64)next_lcp << HUGE_LCP_SHIFT;
					intervals64[closed_interval_idx] |=
						HUGE_UNVISITED_TAG | next_interval_idx;
					*++top = next_interval_idx++;
					break;
				} else {
					/* Also closing the superinterval */
					intervals64[closed_interval_idx] |=
						HUGE_UNVISITED_TAG | *top;
				}
			}
		}
		prev_pos = next_pos;
	}

	/* Close any still-open intervals. */
	pos_data[prev_pos] = *top;
	for (; top > open_intervals; top--)
		intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1);
}

/* Like lcpit_advance_one_byte(), but for buffers larger than
 * MAX_NORMAL_BUFSIZE. */
static inline u32
lcpit_advance_one_byte_huge(const u32 cur_pos,
			    u32 pos_data[restrict],
			    u64 intervals64[restrict],
			    struct lz_match matches[restrict],
			    const bool record_matches)
{
	u32 interval_idx;
	u32 lcp;
	u32 match_pos;
	struct lz_match *matchptr;

	interval_idx = pos_data[cur_pos];
	prefetch(&intervals64[pos_data[cur_pos + 1]]);
	pos_data[cur_pos] = 0;

	/* Ascend until we reach a visited interval, linking the unvisited
	 * intervals to the current suffix as we go. */
	while (intervals64[interval_idx] & HUGE_UNVISITED_TAG) {
		lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;
		const u32 superinterval_idx = intervals64[interval_idx] & HUGE_POS_MASK;
		intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
		interval_idx = superinterval_idx;
	}

	match_pos = intervals64[interval_idx] & HUGE_POS_MASK;
	lcp = intervals64[interval_idx] >> HUGE_LCP_SHIFT;

	matchptr = matches;
	while (lcp != 0) {
		u32 next_interval_idx;
		u32 next_lcp;

		/* Ascend indirectly via pos_data[] links, skipping over any
		 * out-of-date suffixes. */
		for (;;) {
			next_interval_idx = pos_data[match_pos];
			next_lcp = intervals64[next_interval_idx] >> HUGE_LCP_SHIFT;
			if (next_lcp < lcp)
				break;
			/* Suffix was out of date. */
			match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
		}
		intervals64[interval_idx] = ((u64)lcp << HUGE_LCP_SHIFT) | cur_pos;
		pos_data[match_pos] = interval_idx;
		if (record_matches) {
			matchptr->length = lcp;
			matchptr->offset = cur_pos - match_pos;
			matchptr++;
		}
		match_pos = intervals64[next_interval_idx] & HUGE_POS_MASK;
		interval_idx = next_interval_idx;
		lcp = next_lcp;
	}
	return matchptr - matches;
}

static u64
get_pos_data_size(size_t max_bufsize)
{
	return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
			DIVSUFSORT_TMP_LEN) * sizeof(u32);
}

static u64
get_intervals_size(size_t max_bufsize)
{
	return (u64)max_bufsize * (max_bufsize <= MAX_NORMAL_BUFSIZE ?
				   sizeof(u32) : sizeof(u64));
}

/*
 * Calculate the number of bytes of memory needed for the LCP-interval tree
 * matchfinder.
 *
 * @max_bufsize - maximum buffer size to support
 *
 * Returns the number of bytes required.
 */
u64
lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
{
	return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize);
}
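
/*
 * Example (illustrative sizes): for max_bufsize = 1 << 25 (32 MiB), which is
 * within MAX_NORMAL_BUFSIZE = 1 << 26, this is about
 * ((1 << 25) + PREFETCH_SAFETY) * 4 bytes for pos_data[] plus (1 << 25) * 4
 * bytes for intervals[], i.e. roughly 256 MiB, or 8 bytes per input byte.
 */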

/*
 * Initialize the LCP-interval tree matchfinder.
 *
 * @mf - the matchfinder structure to initialize
 * @max_bufsize - maximum buffer size to support
 * @min_match_len - minimum match length in bytes
 * @nice_match_len - only consider this many bytes of each match
 *
 * Returns true if successfully initialized; false if out of memory.
 */
bool
lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
		       u32 min_match_len, u32 nice_match_len)
{
	if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
		return false;
	if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY)
		return false;

	mf->pos_data = MALLOC(get_pos_data_size(max_bufsize));
	mf->intervals = MALLOC(get_intervals_size(max_bufsize));
	if (!mf->pos_data || !mf->intervals) {
		lcpit_matchfinder_destroy(mf);
		return false;
	}

	mf->min_match_len = min_match_len;
	mf->nice_match_len = min(nice_match_len,
				 (max_bufsize <= MAX_NORMAL_BUFSIZE) ?
				 LCP_MAX : HUGE_LCP_MAX);
	/* Zero the prefetch safety margin past the end of pos_data[]. */
	for (u32 i = 0; i < PREFETCH_SAFETY; i++)
		mf->pos_data[max_bufsize + i] = 0;
	return true;
}

/*
 * Build the suffix array SA for the specified byte array T of length n.
 *
 * The suffix array is a sorted array of the byte array's suffixes, represented
 * by indices into the byte array.  It can equivalently be viewed as a mapping
 * from suffix rank to suffix position.
 *
 * To build the suffix array, we use libdivsufsort, which uses an
 * induced-sorting-based algorithm.  In practice, this seems to be the fastest
 * suffix array construction algorithm currently available.
 *
 * References:
 *
 *	Y. Mori.  libdivsufsort, a lightweight suffix-sorting library.
 *	https://code.google.com/p/libdivsufsort/.
 *
 *	G. Nong, S. Zhang, and W.H. Chan.  2009.  Linear Suffix Array
 *	Construction by Almost Pure Induced-Sorting.  Data Compression
 *	Conference, 2009.  DCC '09.  pp. 193 - 202.
 *
 *	S.J. Puglisi, W.F. Smyth, and A. Turpin.  2007.  A Taxonomy of Suffix
 *	Array Construction Algorithms.  ACM Computing Surveys (CSUR) Volume 39
 *	Issue 2, 2007 Article No. 4.
 */
static void
build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
{
	/* Note: divsufsort() needs temporary space --- one array with 256
	 * spaces and one array with 65536 spaces.  The implementation of
	 * divsufsort() has been modified from the original to use the provided
	 * temporary space instead of allocating its own, since we don't want to
	 * have to deal with malloc() failures here. */
	divsufsort(T, SA, n, tmp);
}

/*
 * Build the inverse suffix array ISA from the suffix array SA.
 *
 * Whereas the suffix array is a mapping from suffix rank to suffix position,
 * the inverse suffix array is a mapping from suffix position to suffix rank.
 */
static void
build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
{
	for (u32 r = 0; r < n; r++)
		ISA[SA[r]] = r;
}
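
/*
 * For example (continuing the "abab" illustration): SA = [2, 0, 3, 1] gives
 * ISA = [1, 3, 0, 2], since e.g. suffix position 0 has rank 1 (SA[1] == 0).
 */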

/*
 * Prepare the LCP-interval tree matchfinder for a new input buffer.
 *
 * @mf - the initialized matchfinder structure
 * @T - the input buffer
 * @n - size of the input buffer in bytes.  This must be nonzero and can be at
 *	most the max_bufsize with which lcpit_matchfinder_init() was called.
 */
void
lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
{
	/* intervals[] temporarily stores SA and LCP packed together.
	 * pos_data[] temporarily stores ISA.
	 * pos_data[] is also used as the temporary space for divsufsort(). */

	build_SA(mf->intervals, T, n, mf->pos_data);
	build_ISA(mf->pos_data, mf->intervals, n);
	if (n <= MAX_NORMAL_BUFSIZE) {
		build_LCP(mf->intervals, mf->pos_data, T, n,
			  mf->min_match_len, mf->nice_match_len);
		build_LCPIT(mf->intervals, mf->pos_data, n);
		mf->huge_mode = false;
	} else {
		expand_SA(mf->intervals, n);
		build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
			       mf->min_match_len, mf->nice_match_len);
		build_LCPIT_huge(mf->intervals64, mf->pos_data, n);
		mf->huge_mode = true;
	}
	mf->cur_pos = 0; /* starting at beginning of input buffer */
}

/*
 * Retrieve a list of matches with the next position.
 *
 * The matches will be recorded in the @matches array, ordered by strictly
 * decreasing length and strictly decreasing offset.
 *
 * The return value is the number of matches found and written to @matches.
 * This can be any value in [0, nice_match_len - min_match_len + 1].
 */
u32
lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
			      struct lz_match *matches)
{
	if (mf->huge_mode)
		return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
						   mf->intervals64, matches, true);
	else
		return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
					      mf->intervals, matches, true);
}
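
/*
 * For example (illustrative values): with min_match_len == 2 and
 * nice_match_len == 8, one call might return the three matches
 * {length 8, offset 100}, {length 5, offset 12}, {length 2, offset 3} ---
 * both fields strictly decreasing, since each successively shorter match
 * comes from a more recently seen (closer) position.
 */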

/*
 * Skip the next @count bytes (don't search for matches at them).  @count is
 * assumed to be greater than 0.
 */
void
lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
{
	if (mf->huge_mode) {
		do {
			lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
						    mf->intervals64, NULL, false);
		} while (--count);
	} else {
		do {
			lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
					       mf->intervals, NULL, false);
		} while (--count);
	}
}
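
/*
 * Illustrative usage sketch (not part of the original matchfinder; the greedy
 * parse and the parameter values are hypothetical): drive the matchfinder over
 * a buffer, taking the longest match at each position or emitting a literal.
 * matches[] must have room for nice_match_len - min_match_len + 1 entries.
 */
static inline void
lcpit_matchfinder_usage_sketch(const u8 *buffer, u32 size)
{
	struct lcpit_matchfinder mf;
	struct lz_match matches[47 + 1];	/* enough for min=2, nice=48 */

	if (!lcpit_matchfinder_init(&mf, size, 2, 48))
		return;
	lcpit_matchfinder_load_buffer(&mf, buffer, size);
	for (u32 i = 0; i < size; ) {
		u32 num_matches = lcpit_matchfinder_get_matches(&mf, matches);
		if (num_matches > 0) {
			/* matches[0] is the longest match; "encode" it, then
			 * skip the remaining positions it covers.  len >= 2
			 * here, so len - 1 > 0 as skip_bytes() requires. */
			u32 len = matches[0].length;
			lcpit_matchfinder_skip_bytes(&mf, len - 1);
			i += len;
		} else {
			/* No match; "encode" a literal byte. */
			i++;
		}
	}
	lcpit_matchfinder_destroy(&mf);
}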

/*
 * Destroy an LCP-interval tree matchfinder that was previously initialized with
 * lcpit_matchfinder_init().
 */
void
lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)
{
	FREE(mf->pos_data);
	FREE(mf->intervals);
}