wimlib.net Git - wimlib/blob - src/lcpit_matchfinder.c

   1 /*
   2  * lcpit_matchfinder.c
   3  *
   4  * A match-finder for Lempel-Ziv compression based on bottom-up construction and
   5  * traversal of the Longest Common Prefix (LCP) interval tree.
   6  *
   7  * Author:      Eric Biggers
   8  * Year:        2014, 2015
   9  *
  10  * The author dedicates this file to the public domain.
  11  * You can do whatever you want with this file.
  12  */
  13
  14 #ifdef HAVE_CONFIG_H
  15 #  include "config.h"
  16 #endif
  17
  18 #include <limits.h>
  19
  20 #include "wimlib/divsufsort.h"
  21 #include "wimlib/lcpit_matchfinder.h"
  22 #include "wimlib/util.h"
  23
  24 #define LCP_BITS                6
  25 #define LCP_MAX                 (((u32)1 << LCP_BITS) - 1)
  26 #define LCP_SHIFT               (32 - LCP_BITS)
  27 #define LCP_MASK                (LCP_MAX << LCP_SHIFT)
  28 #define POS_MASK                (((u32)1 << (32 - LCP_BITS)) - 1)
  29 #define MAX_NORMAL_BUFSIZE      (POS_MASK + 1)
  30
  31 #define HUGE_LCP_BITS           7
  32 #define HUGE_LCP_MAX            (((u32)1 << HUGE_LCP_BITS) - 1)
  33 #define HUGE_LCP_SHIFT          (64 - HUGE_LCP_BITS)
  34 #define HUGE_LCP_MASK           ((u64)HUGE_LCP_MAX << HUGE_LCP_SHIFT)
  35 #define HUGE_POS_MASK           0xFFFFFFFF
  36 #define MAX_HUGE_BUFSIZE        ((u64)HUGE_POS_MASK + 1)
  37 #define HUGE_UNVISITED_TAG      0x100000000
  38
  39 #define PREFETCH_SAFETY         5
  40
  41 /*
  42  * Build the LCP (Longest Common Prefix) array in linear time.
  43  *
  44  * LCP[r] will be the length of the longest common prefix between the suffixes
  45  * with positions SA[r - 1] and SA[r].  LCP[0] will be undefined.
  46  *
  47  * Algorithm taken from Kasai et al. (2001), but modified slightly:
  48  *
  49  *  - With bytes there is no realistic way to reserve a unique symbol for
  50  *    end-of-buffer, so use explicit checks for end-of-buffer.
  51  *
  52  *  - For decreased memory usage and improved memory locality, pack the two
  53  *    logically distinct SA and LCP arrays into a single array SA_and_LCP.
  54  *
  55  *  - Since SA_and_LCP is accessed randomly, improve the cache behavior by
  56  *    reading several entries ahead in ISA and prefetching the upcoming
  57  *    SA_and_LCP entry.
  58  *
  59  *  - If an LCP value is less than the minimum match length, then store 0.  This
  60  *    avoids having to do comparisons against the minimum match length later.
  61  *
  62  *  - If an LCP value is greater than the "nice match length", then store the
  63  *    "nice match length".  This caps the number of bits needed to store each
  64  *    LCP value, and this caps the depth of the LCP-interval tree, without
  65  *    usually hurting the compression ratio too much.
  66  *
  67  * References:
  68  *
  69  *      Kasai et al.  2001.  Linear-Time Longest-Common-Prefix Computation in
  70  *      Suffix Arrays and Its Applications.  CPM '01 Proceedings of the 12th
  71  *      Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
  72  */
  73 static void
  74 build_LCP(u32 SA_and_LCP[restrict], const u32 ISA[restrict],
  75           const u8 T[restrict], const u32 n,
  76           const u32 min_lcp, const u32 max_lcp)
  77 {
  78         u32 h = 0;
  79         for (u32 i = 0; i < n; i++) {
  80                 const u32 r = ISA[i];
  81                 prefetchw(&SA_and_LCP[ISA[i + PREFETCH_SAFETY]]);
  82                 if (r > 0) {
  83                         const u32 j = SA_and_LCP[r - 1] & POS_MASK;
  84                         const u32 lim = min(n - i, n - j);
  85                         while (h < lim && T[i + h] == T[j + h])
  86                                 h++;
  87                         u32 stored_lcp = h;
  88                         if (stored_lcp < min_lcp)
  89                                 stored_lcp = 0;
  90                         else if (stored_lcp > max_lcp)
  91                                 stored_lcp = max_lcp;
  92                         SA_and_LCP[r] |= stored_lcp << LCP_SHIFT;
  93                         if (h > 0)
  94                                 h--;
  95                 }
  96         }
  97 }
  98
  99 /*
 100  * Use the suffix array accompanied with the longest-common-prefix array ---
 101  * which in combination can be called the "enhanced suffix array" --- to
 102  * simulate a bottom-up traversal of the corresponding suffix tree, or
 103  * equivalently the lcp-interval tree.  Do so in suffix rank order, but save the
 104  * superinterval references needed for later bottom-up traversal of the tree in
 105  * suffix position order.
 106  *
 107  * To enumerate the lcp-intervals, this algorithm scans the suffix array and its
 108  * corresponding LCP array linearly.  While doing so, it maintains a stack of
 109  * lcp-intervals that are currently open, meaning that their left boundaries
 110  * have been seen but their right boundaries have not.  The bottom of the stack
 111  * is the interval which covers the entire suffix array (this has lcp=0), and
 112  * the top of the stack is the deepest interval that is currently open (this has
 113  * the greatest lcp of any interval on the stack).  When this algorithm opens an
 114  * lcp-interval, it assigns it a unique index in intervals[] and pushes it onto
 115  * the stack.  When this algorithm closes an interval, it pops it from the stack
 116  * and sets the intervals[] entry of that interval to the index and lcp of that
 117  * interval's superinterval, which is the new top of the stack.
 118  *
 119  * This algorithm also set pos_data[pos] for each suffix position 'pos' to the
 120  * index and lcp of the deepest lcp-interval containing it.  Alternatively, we
 121  * can interpret each suffix as being associated with a singleton lcp-interval,
 122  * or leaf of the suffix tree.  With this interpretation, an entry in pos_data[]
 123  * is the superinterval reference for one of these singleton lcp-intervals and
 124  * therefore is not fundamentally different from an entry in intervals[].
 125  *
 126  * To reduce memory usage, this algorithm re-uses the suffix array's memory to
 127  * store the generated intervals[] array.  This is possible because SA and LCP
 128  * are accessed linearly, and no more than one interval is generated per suffix.
 129  *
 130  * The techniques used in this algorithm are described in various published
 131  * papers.  The generation of lcp-intervals from the suffix array (SA) and the
 132  * longest-common-prefix array (LCP) is given as Algorithm BottomUpTraverse in
 133  * Kasai et al. (2001) and Algorithm 4.1 ("Computation of lcp-intervals") in
 134  * Abouelhoda et al. (2004).  Both these papers note the equivalence between
 135  * lcp-intervals (including the singleton lcp-interval for each suffix) and
 136  * nodes of the suffix tree.  Abouelhoda et al. (2004) furthermore applies
 137  * bottom-up traversal of the lcp-interval tree to Lempel-Ziv factorization, as
 138  * does Chen at al. (2008).  Algorithm CPS1b of Chen et al. (2008) dynamically
 139  * re-uses the suffix array during bottom-up traversal of the lcp-interval tree.
 140  *
 141  * References:
 142  *
 143  *      Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix
 144  *      Arrays and Its Applications.  2001.  CPM '01 Proceedings of the 12th
 145  *      Annual Symposium on Combinatorial Pattern Matching pp. 181-192.
 146  *
 147  *      M.I. Abouelhoda, S. Kurtz, E. Ohlebusch.  2004.  Replacing Suffix Trees
 148  *      With Enhanced Suffix Arrays.  Journal of Discrete Algorithms Volume 2
 149  *      Issue 1, March 2004, pp. 53-86.
 150  *
 151  *      G. Chen, S.J. Puglisi, W.F. Smyth.  2008.  Lempel-Ziv Factorization
 152  *      Using Less Time & Space.  Mathematics in Computer Science June 2008,
 153  *      Volume 1, Issue 4, pp. 605-623.
 154  */
 155 static void
 156 build_LCPIT(u32 intervals[restrict], u32 pos_data[restrict], const u32 n)
 157 {
 158         u32 * const SA_and_LCP = intervals;
 159         u32 next_interval_idx;
 160         u32 open_intervals[LCP_MAX + 1];
 161         u32 *top = open_intervals;
 162         u32 prev_pos = SA_and_LCP[0] & POS_MASK;
 163
 164         *top = 0;
 165         intervals[0] = 0;
 166         next_interval_idx = 1;
 167
 168         for (u32 r = 1; r < n; r++) {
 169                 const u32 next_pos = SA_and_LCP[r] & POS_MASK;
 170                 const u32 next_lcp = SA_and_LCP[r] & LCP_MASK;
 171                 const u32 top_lcp = *top & LCP_MASK;
 172
 173                 prefetchw(&pos_data[SA_and_LCP[r + PREFETCH_SAFETY] & POS_MASK]);
 174
 175                 if (next_lcp == top_lcp) {
 176                         /* Continuing the deepest open interval  */
 177                         pos_data[prev_pos] = *top;
 178                 } else if (next_lcp > top_lcp) {
 179                         /* Opening a new interval  */
 180                         *++top = next_lcp | next_interval_idx++;
 181                         pos_data[prev_pos] = *top;
 182                 } else {
 183                         /* Closing the deepest open interval  */
 184                         pos_data[prev_pos] = *top;
 185                         for (;;) {
 186                                 const u32 closed_interval_idx = *top-- & POS_MASK;
 187                                 const u32 superinterval_lcp = *top & LCP_MASK;
 188
 189                                 if (next_lcp == superinterval_lcp) {
 190                                         /* Continuing the superinterval */
 191                                         intervals[closed_interval_idx] = *top;
 192                                         break;
 193                                 } else if (next_lcp > superinterval_lcp) {
 194                                         /* Creating a new interval that is a
 195                                          * superinterval of the one being
 196                                          * closed, but still a subinterval of
 197                                          * its superinterval  */
 198                                         *++top = next_lcp | next_interval_idx++;
 199                                         intervals[closed_interval_idx] = *top;
 200                                         break;
 201                                 } else {
 202                                         /* Also closing the superinterval  */
 203                                         intervals[closed_interval_idx] = *top;
 204                                 }
 205                         }
 206                 }
 207                 prev_pos = next_pos;
 208         }
 209
 210         /* Close any still-open intervals.  */
 211         pos_data[prev_pos] = *top;
 212         for (; top > open_intervals; top--)
 213                 intervals[*top & POS_MASK] = *(top - 1);
 214 }
 215
 216 /*
 217  * Advance the LCP-interval tree matchfinder by one byte.
 218  *
 219  * If @record_matches is true, then matches are written to the @matches array
 220  * sorted by strictly decreasing length and strictly decreasing offset, and the
 221  * return value is the number of matches found.  Otherwise, @matches is ignored
 222  * and the return value is always 0.
 223  *
 224  * How this algorithm works:
 225  *
 226  * 'cur_pos' is the position of the current suffix, which is the suffix being
 227  * matched against.  'cur_pos' starts at 0 and is incremented each time this
 228  * function is called.  This function finds each suffix with position less than
 229  * 'cur_pos' that shares a prefix with the current suffix, but for each distinct
 230  * prefix length it finds only the suffix with the greatest position (i.e. the
 231  * most recently seen in the linear traversal by position).  This function
 232  * accomplishes this using the lcp-interval tree data structure that was built
 233  * by build_LCPIT() and is updated dynamically by this function.
 234  *
 235  * The first step is to read 'pos_data[cur_pos]', which gives the index and lcp
 236  * value of the deepest lcp-interval containing the current suffix --- or,
 237  * equivalently, the parent of the conceptual singleton lcp-interval that
 238  * contains the current suffix.
 239  *
 240  * The second step is to ascend the lcp-interval tree until reaching an interval
 241  * that has not yet been visited, and link the intervals to the current suffix
 242  * along the way.   An lcp-interval has been visited if and only if it has been
 243  * linked to a suffix.  Initially, no lcp-intervals are linked to suffixes.
 244  *
 245  * The third step is to continue ascending the lcp-interval tree, but indirectly
 246  * through suffix links rather than through the original superinterval
 247  * references, and continuing to form links with the current suffix along the
 248  * way.  Each suffix visited during this step, except in a special case to
 249  * handle outdated suffixes, is a match which can be written to matches[].  Each
 250  * intervals[] entry contains the position of the next suffix to visit, which we
 251  * shall call 'match_pos'; this is the most recently seen suffix that belongs to
 252  * that lcp-interval.  'pos_data[match_pos]' then contains the lcp and interval
 253  * index of the next lcp-interval that should be visited.
 254  *
 255  * We can view these arrays as describing a new set of links that gets overlaid
 256  * on top of the original superinterval references of the lcp-interval tree.
 257  * Each such link can connect a node of the lcp-interval tree to an ancestor
 258  * more than one generation removed.
 259  *
 260  * For each one-byte advance, the current position becomes the most recently
 261  * seen suffix for a continuous sequence of lcp-intervals from a leaf interval
 262  * to the root interval.  Conceptually, this algorithm needs to update all these
 263  * nodes to link to 'cur_pos', and then update 'pos_data[cur_pos]' to a "null"
 264  * link.  But actually, if some of these nodes have the same most recently seen
 265  * suffix, then this algorithm just visits the pos_data[] entry for that suffix
 266  * and skips over all these nodes in one step.  Updating the extra nodes is
 267  * accomplished by creating a redirection from the previous suffix to the
 268  * current suffix.
 269  *
 270  * Using this shortcutting scheme, it's possible for a suffix to become out of
 271  * date, which means that it is no longer the most recently seen suffix for the
 272  * lcp-interval under consideration.  This case is detected by noticing when the
 273  * next lcp-interval link actually points deeper in the tree, and it is worked
 274  * around by just continuing until we get to a link that actually takes us
 275  * higher in the tree.  This can be described as a lazy-update scheme.
 276  */
 277 static inline u32
 278 lcpit_advance_one_byte(const u32 cur_pos,
 279                        u32 pos_data[restrict],
 280                        u32 intervals[restrict],
 281                        u32 next[restrict],
 282                        struct lz_match matches[restrict],
 283                        const bool record_matches)
 284 {
 285         u32 ref;
 286         u32 super_ref;
 287         u32 match_pos;
 288         struct lz_match *matchptr;
 289
 290         /* Get the deepest lcp-interval containing the current suffix. */
 291         ref = pos_data[cur_pos];
 292
 293         /* Prefetch upcoming data, up to 3 positions ahead.  Assume the
 294          * intervals are already visited.  */
 295
 296         /* Prefetch the superinterval via a suffix link for the deepest
 297          * lcp-interval containing the suffix starting 1 position from now.  */
 298         prefetchw(&intervals[pos_data[next[0]] & POS_MASK]);
 299
 300         /* Prefetch suffix link for the deepest lcp-interval containing the
 301          * suffix starting 2 positions from now.  */
 302         next[0] = intervals[next[1]] & POS_MASK;
 303         prefetchw(&pos_data[next[0]]);
 304
 305         /* Prefetch the deepest lcp-interval containing the suffix starting 3
 306          * positions from now.  */
 307         next[1] = pos_data[cur_pos + 3] & POS_MASK;
 308         prefetchw(&intervals[next[1]]);
 309
 310         /* There is no "next suffix" after the current one.  */
 311         pos_data[cur_pos] = 0;
 312
 313         /* Ascend until we reach a visited interval, the root, or a child of the
 314          * root.  Link unvisited intervals to the current suffix as we go.  */
 315         while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
 316                 intervals[ref & POS_MASK] = cur_pos;
 317                 ref = super_ref;
 318         }
 319
 320         if (super_ref == 0) {
 321                 /* In this case, the current interval may be any of:
 322                  * (1) the root;
 323                  * (2) an unvisited child of the root;
 324                  * (3) an interval last visited by suffix 0
 325                  *
 326                  * We could avoid the ambiguity with (3) by using an lcp
 327                  * placeholder value other than 0 to represent "visited", but
 328                  * it's fastest to use 0.  So we just don't allow matches with
 329                  * position 0.  */
 330
 331                 if (ref != 0)  /* Not the root?  */
 332                         intervals[ref & POS_MASK] = cur_pos;
 333                 return 0;
 334         }
 335
 336         /* Ascend indirectly via pos_data[] links.  */
 337         match_pos = super_ref;
 338         matchptr = matches;
 339         for (;;) {
 340                 while ((super_ref = pos_data[match_pos]) > ref)
 341                         match_pos = intervals[super_ref & POS_MASK];
 342                 intervals[ref & POS_MASK] = cur_pos;
 343                 pos_data[match_pos] = ref;
 344                 if (record_matches) {
 345                         matchptr->length = ref >> LCP_SHIFT;
 346                         matchptr->offset = cur_pos - match_pos;
 347                         matchptr++;
 348                 }
 349                 if (super_ref == 0)
 350                         break;
 351                 ref = super_ref;
 352                 match_pos = intervals[ref & POS_MASK];
 353         }
 354         return matchptr - matches;
 355 }
 356
 357 /* Expand SA from 32 bits to 64 bits.  */
 358 static void
 359 expand_SA(void *p, u32 n)
 360 {
 361         typedef u32 _may_alias_attribute aliased_u32_t;
 362         typedef u64 _may_alias_attribute aliased_u64_t;
 363
 364         aliased_u32_t *SA = p;
 365         aliased_u64_t *SA64 = p;
 366
 367         u32 r = n - 1;
 368         do {
 369                 SA64[r] = SA[r];
 370         } while (r--);
 371 }
 372
 373 /* Like build_LCP(), but for buffers larger than MAX_NORMAL_BUFSIZE.  */
 374 static void
 375 build_LCP_huge(u64 SA_and_LCP64[restrict], const u32 ISA[restrict],
 376                const u8 T[restrict], const u32 n,
 377                const u32 min_lcp, const u32 max_lcp)
 378 {
 379         u32 h = 0;
 380         for (u32 i = 0; i < n; i++) {
 381                 const u32 r = ISA[i];
 382                 prefetchw(&SA_and_LCP64[ISA[i + PREFETCH_SAFETY]]);
 383                 if (r > 0) {
 384                         const u32 j = SA_and_LCP64[r - 1] & HUGE_POS_MASK;
 385                         const u32 lim = min(n - i, n - j);
 386                         while (h < lim && T[i + h] == T[j + h])
 387                                 h++;
 388                         u32 stored_lcp = h;
 389                         if (stored_lcp < min_lcp)
 390                                 stored_lcp = 0;
 391                         else if (stored_lcp > max_lcp)
 392                                 stored_lcp = max_lcp;
 393                         SA_and_LCP64[r] |= (u64)stored_lcp << HUGE_LCP_SHIFT;
 394                         if (h > 0)
 395                                 h--;
 396                 }
 397         }
 398 }
 399
 400 /*
 401  * Like build_LCPIT(), but for buffers larger than MAX_NORMAL_BUFSIZE.
 402  *
 403  * This "huge" version is also slightly different in that the lcp value stored
 404  * in each intervals[] entry is the lcp value for that interval, not its
 405  * superinterval.  This lcp value stays put in intervals[] and doesn't get moved
 406  * to pos_data[] during lcpit_advance_one_byte_huge().  One consequence of this
 407  * is that we have to use a special flag to distinguish visited from unvisited
 408  * intervals.  But overall, this scheme keeps the memory usage at 12n instead of
 409  * 16n.  (The non-huge version is 8n.)
 410  */
 411 static void
 412 build_LCPIT_huge(u64 intervals64[restrict], u32 pos_data[restrict], const u32 n)
 413 {
 414         u64 * const SA_and_LCP64 = intervals64;
 415         u32 next_interval_idx;
 416         u32 open_intervals[HUGE_LCP_MAX + 1];
 417         u32 *top = open_intervals;
 418         u32 prev_pos = SA_and_LCP64[0] & HUGE_POS_MASK;
 419
 420         *top = 0;
 421         intervals64[0] = 0;
 422         next_interval_idx = 1;
 423
 424         for (u32 r = 1; r < n; r++) {
 425                 const u32 next_pos = SA_and_LCP64[r] & HUGE_POS_MASK;
 426                 const u64 next_lcp = SA_and_LCP64[r] & HUGE_LCP_MASK;
 427                 const u64 top_lcp = intervals64[*top];
 428
 429                 prefetchw(&pos_data[SA_and_LCP64[r + PREFETCH_SAFETY] & HUGE_POS_MASK]);
 430
 431                 if (next_lcp == top_lcp) {
 432                         /* Continuing the deepest open interval  */
 433                         pos_data[prev_pos] = *top;
 434                 } else if (next_lcp > top_lcp) {
 435                         /* Opening a new interval  */
 436                         intervals64[next_interval_idx] = next_lcp;
 437                         pos_data[prev_pos] = next_interval_idx;
 438                         *++top = next_interval_idx++;
 439                 } else {
 440                         /* Closing the deepest open interval  */
 441                         pos_data[prev_pos] = *top;
 442                         for (;;) {
 443                                 const u32 closed_interval_idx = *top--;
 444                                 const u64 superinterval_lcp = intervals64[*top];
 445
 446                                 if (next_lcp == superinterval_lcp) {
 447                                         /* Continuing the superinterval */
 448                                         intervals64[closed_interval_idx] |=
 449                                                 HUGE_UNVISITED_TAG | *top;
 450                                         break;
 451                                 } else if (next_lcp > superinterval_lcp) {
 452                                         /* Creating a new interval that is a
 453                                          * superinterval of the one being
 454                                          * closed, but still a subinterval of
 455                                          * its superinterval  */
 456                                         intervals64[next_interval_idx] = next_lcp;
 457                                         intervals64[closed_interval_idx] |=
 458                                                 HUGE_UNVISITED_TAG | next_interval_idx;
 459                                         *++top = next_interval_idx++;
 460                                         break;
 461                                 } else {
 462                                         /* Also closing the superinterval  */
 463                                         intervals64[closed_interval_idx] |=
 464                                                 HUGE_UNVISITED_TAG | *top;
 465                                 }
 466                         }
 467                 }
 468                 prev_pos = next_pos;
 469         }
 470
 471         /* Close any still-open intervals.  */
 472         pos_data[prev_pos] = *top;
 473         for (; top > open_intervals; top--)
 474                 intervals64[*top] |= HUGE_UNVISITED_TAG | *(top - 1);
 475 }
 476
 477 /* Like lcpit_advance_one_byte(), but for buffers larger than
 478  * MAX_NORMAL_BUFSIZE.  */
 479 static inline u32
 480 lcpit_advance_one_byte_huge(const u32 cur_pos,
 481                             u32 pos_data[restrict],
 482                             u64 intervals64[restrict],
 483                             u32 prefetch_next[restrict],
 484                             struct lz_match matches[restrict],
 485                             const bool record_matches)
 486 {
 487         u32 interval_idx;
 488         u32 next_interval_idx;
 489         u64 cur;
 490         u64 next;
 491         u32 match_pos;
 492         struct lz_match *matchptr;
 493
 494         interval_idx = pos_data[cur_pos];
 495
 496         prefetchw(&intervals64[pos_data[prefetch_next[0]] & HUGE_POS_MASK]);
 497
 498         prefetch_next[0] = intervals64[prefetch_next[1]] & HUGE_POS_MASK;
 499         prefetchw(&pos_data[prefetch_next[0]]);
 500
 501         prefetch_next[1] = pos_data[cur_pos + 3] & HUGE_POS_MASK;
 502         prefetchw(&intervals64[prefetch_next[1]]);
 503
 504         pos_data[cur_pos] = 0;
 505
 506         while ((next = intervals64[interval_idx]) & HUGE_UNVISITED_TAG) {
 507                 intervals64[interval_idx] = (next & HUGE_LCP_MASK) | cur_pos;
 508                 interval_idx = next & HUGE_POS_MASK;
 509         }
 510
 511         matchptr = matches;
 512         while (next & HUGE_LCP_MASK) {
 513                 cur = next;
 514                 do {
 515                         match_pos = next & HUGE_POS_MASK;
 516                         next_interval_idx = pos_data[match_pos];
 517                         next = intervals64[next_interval_idx];
 518                 } while (next > cur);
 519                 intervals64[interval_idx] = (cur & HUGE_LCP_MASK) | cur_pos;
 520                 pos_data[match_pos] = interval_idx;
 521                 if (record_matches) {
 522                         matchptr->length = cur >> HUGE_LCP_SHIFT;
 523                         matchptr->offset = cur_pos - match_pos;
 524                         matchptr++;
 525                 }
 526                 interval_idx = next_interval_idx;
 527         }
 528         return matchptr - matches;
 529 }
 530
 531 static inline u64
 532 get_pos_data_size(size_t max_bufsize)
 533 {
 534         return (u64)max((u64)max_bufsize + PREFETCH_SAFETY,
 535                         DIVSUFSORT_TMP_LEN) * sizeof(u32);
 536 }
 537
 538 static inline u64
 539 get_intervals_size(size_t max_bufsize)
 540 {
 541         return ((u64)max_bufsize + PREFETCH_SAFETY) *
 542                 (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
 543 }
 544
 545 /*
 546  * Calculate the number of bytes of memory needed for the LCP-interval tree
 547  * matchfinder.
 548  *
 549  * @max_bufsize - maximum buffer size to support
 550  *
 551  * Returns the number of bytes required.
 552  */
 553 u64
 554 lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
 555 {
 556         return get_pos_data_size(max_bufsize) + get_intervals_size(max_bufsize);
 557 }
 558
 559 /*
 560  * Initialize the LCP-interval tree matchfinder.
 561  *
 562  * @mf - the matchfinder structure to initialize
 563  * @max_bufsize - maximum buffer size to support
 564  * @min_match_len - minimum match length in bytes
 565  * @nice_match_len - only consider this many bytes of each match
 566  *
 567  * Returns true if successfully initialized; false if out of memory.
 568  */
 569 bool
 570 lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
 571                        u32 min_match_len, u32 nice_match_len)
 572 {
 573         if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
 574                 return false;
 575         if (max_bufsize > MAX_HUGE_BUFSIZE - PREFETCH_SAFETY)
 576                 return false;
 577
 578         mf->pos_data = MALLOC(get_pos_data_size(max_bufsize));
 579         mf->intervals = MALLOC(get_intervals_size(max_bufsize));
 580         if (!mf->pos_data || !mf->intervals) {
 581                 lcpit_matchfinder_destroy(mf);
 582                 return false;
 583         }
 584
 585         mf->min_match_len = min_match_len;
 586         mf->nice_match_len = min(nice_match_len,
 587                                  (max_bufsize <= MAX_NORMAL_BUFSIZE) ?
 588                                  LCP_MAX : HUGE_LCP_MAX);
 589         return true;
 590 }
 591
 592 /*
 593  * Build the suffix array SA for the specified byte array T of length n.
 594  *
 595  * The suffix array is a sorted array of the byte array's suffixes, represented
 596  * by indices into the byte array.  It can equivalently be viewed as a mapping
 597  * from suffix rank to suffix position.
 598  *
 599  * To build the suffix array, we use libdivsufsort, which uses an
 600  * induced-sorting-based algorithm.  In practice, this seems to be the fastest
 601  * suffix array construction algorithm currently available.
 602  *
 603  * References:
 604  *
 605  *      Y. Mori.  libdivsufsort, a lightweight suffix-sorting library.
 606  *      https://code.google.com/p/libdivsufsort/.
 607  *
 608  *      G. Nong, S. Zhang, and W.H. Chan.  2009.  Linear Suffix Array
 609  *      Construction by Almost Pure Induced-Sorting.  Data Compression
 610  *      Conference, 2009.  DCC '09.  pp. 193 - 202.
 611  *
 612  *      S.J. Puglisi, W.F. Smyth, and A. Turpin.  2007.  A Taxonomy of Suffix
 613  *      Array Construction Algorithms.  ACM Computing Surveys (CSUR) Volume 39
 614  *      Issue 2, 2007 Article No. 4.
 615  */
 616 static void
 617 build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
 618 {
 619         /* Note: divsufsort() requires a fixed amount of temporary space.  The
 620          * implementation of divsufsort() has been modified from the original to
 621          * use the provided temporary space instead of allocating its own, since
 622          * we don't want to have to deal with malloc() failures here.  */
 623         divsufsort(T, SA, n, tmp);
 624 }
 625
 626 /*
 627  * Build the inverse suffix array ISA from the suffix array SA.
 628  *
 629  * Whereas the suffix array is a mapping from suffix rank to suffix position,
 630  * the inverse suffix array is a mapping from suffix position to suffix rank.
 631  */
 632 static void
 633 build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
 634 {
 635         for (u32 r = 0; r < n; r++)
 636                 ISA[SA[r]] = r;
 637 }
 638
 639 /*
 640  * Prepare the LCP-interval tree matchfinder for a new input buffer.
 641  *
 642  * @mf - the initialized matchfinder structure
 643  * @T - the input buffer
 644  * @n - size of the input buffer in bytes.  This must be nonzero and can be at
 645  *      most the max_bufsize with which lcpit_matchfinder_init() was called.
 646  */
 647 void
 648 lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
 649 {
 650         /* intervals[] temporarily stores SA and LCP packed together.
 651          * pos_data[] temporarily stores ISA.
 652          * pos_data[] is also used as the temporary space for divsufsort().  */
 653
 654         build_SA(mf->intervals, T, n, mf->pos_data);
 655         build_ISA(mf->pos_data, mf->intervals, n);
 656         if (n <= MAX_NORMAL_BUFSIZE) {
 657                 for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
 658                         mf->intervals[n + i] = 0;
 659                         mf->pos_data[n + i] = 0;
 660                 }
 661                 build_LCP(mf->intervals, mf->pos_data, T, n,
 662                           mf->min_match_len, mf->nice_match_len);
 663                 build_LCPIT(mf->intervals, mf->pos_data, n);
 664                 mf->huge_mode = false;
 665         } else {
 666                 for (u32 i = 0; i < PREFETCH_SAFETY; i++) {
 667                         mf->intervals64[n + i] = 0;
 668                         mf->pos_data[n + i] = 0;
 669                 }
 670                 expand_SA(mf->intervals, n);
 671                 build_LCP_huge(mf->intervals64, mf->pos_data, T, n,
 672                                mf->min_match_len, mf->nice_match_len);
 673                 build_LCPIT_huge(mf->intervals64, mf->pos_data, n);
 674                 mf->huge_mode = true;
 675         }
 676         mf->cur_pos = 0; /* starting at beginning of input buffer  */
 677         for (u32 i = 0; i < ARRAY_LEN(mf->next); i++)
 678                 mf->next[i] = 0;
 679 }
 680
 681 /*
 682  * Retrieve a list of matches with the next position.
 683  *
 684  * The matches will be recorded in the @matches array, ordered by strictly
 685  * decreasing length and strictly decreasing offset.
 686  *
 687  * The return value is the number of matches found and written to @matches.
 688  * This can be any value in [0, nice_match_len - min_match_len + 1].
 689  */
 690 u32
 691 lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
 692                               struct lz_match *matches)
 693 {
 694         if (mf->huge_mode)
 695                 return lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
 696                                                    mf->intervals64, mf->next,
 697                                                    matches, true);
 698         else
 699                 return lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
 700                                               mf->intervals, mf->next,
 701                                               matches, true);
 702 }
 703
 704 /*
 705  * Skip the next @count bytes (don't search for matches at them).  @count is
 706  * assumed to be > 0.
 707  */
 708 void
 709 lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
 710 {
 711         if (mf->huge_mode) {
 712                 do {
 713                         lcpit_advance_one_byte_huge(mf->cur_pos++, mf->pos_data,
 714                                                     mf->intervals64, mf->next,
 715                                                     NULL, false);
 716                 } while (--count);
 717         } else {
 718                 do {
 719                         lcpit_advance_one_byte(mf->cur_pos++, mf->pos_data,
 720                                                mf->intervals, mf->next,
 721                                                NULL, false);
 722                 } while (--count);
 723         }
 724 }
 725
 726 /*
 727  * Destroy an LCP-interval tree matchfinder that was previously initialized with
 728  * lcpit_matchfinder_init().
 729  */
 730 void
 731 lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)
 732 {
 733         FREE(mf->pos_data);
 734         FREE(mf->intervals);
 735 }