wimlib.net Git - wimlib/blob - src/lz_sarray.c

   1 /*
   2  * lz_sarray.c
   3  *
   4  * Suffix array match-finder for Lempel-Ziv compression.
   5  */
   6
   7 /*
   8  * Copyright (c) 2013 Eric Biggers.  All rights reserved.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  *
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  *
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  24  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  28  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  29  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  30  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  31  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifdef HAVE_CONFIG_H
  35 #  include "config.h"
  36 #endif
  37
  38 #include "wimlib/lz_sarray.h"
  39 #include "wimlib/util.h"
  40 #include "divsufsort/divsufsort.h"
  41 #include <string.h>
  42
  43 /* If ENABLE_LZ_DEBUG is defined, verify that the suffix array satisfies its
  44  * definition.
  45  *
  46  * @SA          The constructed suffix array.
  47  * @T           The original data.
  48  * @found       Temporary 'bool' array of length @n.
  49  * @n           Length of the data (length of @SA, @T, and @found arrays).
  50  *
  51  * WARNING: this is for debug use only as it does not necessarily run in linear
  52  * time!!!  */
  53 static void
  54 verify_suffix_array(const input_idx_t SA[restrict],
  55                     const u8 T[restrict],
  56                     bool found[restrict],
  57                     input_idx_t n)
  58 {
  59 #ifdef ENABLE_LZ_DEBUG
  60         /* Ensure the SA contains exactly one of each i in [0, n - 1].  */
  61         for (input_idx_t i = 0; i < n; i++)
  62                 found[i] = false;
  63         for (input_idx_t r = 0; r < n; r++) {
  64                 input_idx_t i = SA[r];
  65                 LZ_ASSERT(i < n);
  66                 LZ_ASSERT(!found[i]);
  67                 found[i] = true;
  68         }
  69
  70         /* Ensure the suffix with rank r is lexicographically lesser than the
  71          * suffix with rank (r + 1) for all r in [0, n - 2].  */
  72         for (input_idx_t r = 0; r < n - 1; r++) {
  73
  74                 input_idx_t i1 = SA[r];
  75                 input_idx_t i2 = SA[r + 1];
  76
  77                 input_idx_t n1 = n - i1;
  78                 input_idx_t n2 = n - i2;
  79
  80                 int res = memcmp(&T[i1], &T[i2], min(n1, n2));
  81                 LZ_ASSERT(res < 0 || (res == 0 && n1 < n2));
  82         }
  83 #endif /* ENABLE_LZ_DEBUG  */
  84 }
  85
  86 /* Compute the inverse suffix array @ISA from the suffix array @SA in linear
  87  * time.
  88  *
  89  * Whereas the suffix array is a mapping from suffix rank to suffix position,
  90  * the inverse suffix array is a mapping from suffix position to suffix rank.
  91  */
  92 static void
  93 compute_inverse_suffix_array(input_idx_t ISA[restrict],
  94                              const input_idx_t SA[restrict],
  95                              input_idx_t n)
  96 {
  97         input_idx_t i;
  98
  99         for (i = 0; i < n; i++)
 100                 ISA[SA[i]] = i;
 101 }
 102
 103
 104 /* Compute the LCP (Longest Common Prefix) array in linear time.
 105  *
 106  * LCP[r] will be the length of the longest common prefix between the suffixes
 107  * with positions SA[r - 1] and  SA[r].  LCP[0] will be undefined.
 108  *
 109  * Algorithm adapted from Kasai et al. 2001: "Linear-Time Longest-Common-Prefix
 110  * Computation in Suffix Arrays and Its Applications".  Modified slightly to
 111  * take into account that with bytes in the real world, there is no unique
 112  * symbol at the end of the string.  */
 113 static void
 114 compute_lcp_array(input_idx_t LCP[restrict],
 115                   const input_idx_t SA[restrict],
 116                   const input_idx_t ISA[restrict],
 117                   const u8 T[restrict],
 118                   input_idx_t n)
 119 {
 120         input_idx_t h, i, r, j, lim;
 121
 122         h = 0;
 123         for (i = 0; i < n; i++) {
 124                 r = ISA[i];
 125                 if (r > 0) {
 126                         j = SA[r - 1];
 127                         lim = min(n - i, n - j);
 128
 129                         while (h < lim && T[i + h] == T[j + h])
 130                                 h++;
 131                         LCP[r] = h;
 132                         if (h > 0)
 133                                 h--;
 134                 }
 135         }
 136 }
 137
 138 /* If ENABLE_LZ_DEBUG is defined, verify that the LCP (Longest Common Prefix)
 139  * array satisfies its definition.
 140  *
 141  * WARNING: this is for debug use only as it does not necessarily run in linear
 142  * time!!!  */
 143 static void
 144 verify_lcp_array(input_idx_t LCP[restrict],
 145                  const input_idx_t SA[restrict],
 146                  const u8 T[restrict],
 147                  input_idx_t n)
 148 {
 149 #ifdef ENABLE_LZ_DEBUG
 150         for (input_idx_t r = 0; r < n - 1; r++) {
 151                 input_idx_t i1 = SA[r];
 152                 input_idx_t i2 = SA[r + 1];
 153                 input_idx_t lcp = LCP[r + 1];
 154
 155                 input_idx_t n1 = n - i1;
 156                 input_idx_t n2 = n - i2;
 157
 158                 LZ_ASSERT(lcp <= min(n1, n2));
 159
 160                 LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
 161                 if (lcp < min(n1, n2))
 162                         LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
 163         }
 164 #endif /* ENABLE_LZ_DEBUG */
 165 }
 166
 167 /* Initialize the SA link array in linear time.
 168  *
 169  * This is similar to computing the LPF (Longest Previous Factor) array, which
 170  * is addressed in several papers.  In particular the algorithms below are based
 171  * on Crochemore et al. 2009: "LPF computation revisited".  However, this
 172  * match-finder does not actually compute or use the LPF array per se.  Rather,
 173  * this function sets up some information necessary to compute the LPF array,
 174  * but later lz_sarray_get_matches() actually uses this information to search
 175  * the suffix array directly and can keep searching beyond the first (longest)
 176  * match whose length would be placed in the LPF array.  This difference from
 177  * the theoretical work is necessary because in many real compression formats
 178  * matches take variable numbers of bits to encode, so a decent parser needs to
 179  * consider more than just the longest match with unspecified offset.
 180  *
 181  * Note: We cap the lcpprev and lcpnext values to the maximum match length so
 182  * that the match-finder need not worry about it later, in the inner loop.
 183  */
 184 static void
 185 init_salink(struct salink link[restrict],
 186             const input_idx_t LCP[restrict],
 187             const input_idx_t SA[restrict],
 188             const u8 T[restrict],
 189             input_idx_t n,
 190             input_idx_t max_match_len)
 191 {
 192         /* Compute salink.next and salink.lcpnext.  */
 193         link[n - 1].next = ~(input_idx_t)0;
 194         link[n - 1].lcpnext = 0;
 195         for (input_idx_t r = n - 2; r != ~(input_idx_t)0; r--) {
 196                 input_idx_t t = r + 1;
 197                 input_idx_t l = LCP[t];
 198                 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
 199                         l = min(l, link[t].lcpnext);
 200                         t = link[t].next;
 201                 }
 202                 link[r].next = t;
 203                 link[r].lcpnext = min(l, max_match_len);
 204         }
 205
 206         /* Compute salink.prev and salink.lcpprev.  */
 207         link[0].prev = ~(input_idx_t)0;
 208         link[0].lcpprev = 0;
 209         for (input_idx_t r = 1; r < n; r++) {
 210                 input_idx_t t = r - 1;
 211                 input_idx_t l = LCP[r];
 212                 while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
 213                         l = min(l, link[t].lcpprev);
 214                         t = link[t].prev;
 215                 }
 216                 link[r].prev = t;
 217                 link[r].lcpprev = min(l, max_match_len);
 218         }
 219 }
 220
 221 /* If ENABLE_LZ_DEBUG is defined, verify the values computed by init_salink().
 222  *
 223  * WARNING: this is for debug use only as it does not necessarily run in linear
 224  * time!!!  */
 225 static void
 226 verify_salink(const struct salink link[],
 227               const input_idx_t SA[],
 228               const u8 T[],
 229               input_idx_t n,
 230               input_idx_t max_match_len)
 231 {
 232 #ifdef ENABLE_LZ_DEBUG
 233         for (input_idx_t r = 0; r < n; r++) {
 234                 for (input_idx_t prev = r; ; ) {
 235                         if (prev == 0) {
 236                                 LZ_ASSERT(link[r].prev == ~(input_idx_t)0);
 237                                 LZ_ASSERT(link[r].lcpprev == 0);
 238                                 break;
 239                         }
 240
 241                         prev--;
 242
 243                         if (SA[prev] < SA[r]) {
 244                                 LZ_ASSERT(link[r].prev == prev);
 245                                 LZ_ASSERT(link[r].lcpprev <= n - SA[prev]);
 246                                 LZ_ASSERT(link[r].lcpprev <= n - SA[r]);
 247                                 LZ_ASSERT(link[r].lcpprev <= max_match_len);
 248                                 LZ_ASSERT(0 == memcmp(&T[SA[prev]],
 249                                                       &T[SA[r]],
 250                                                       link[r].lcpprev));
 251                                 if (link[r].lcpprev < n - SA[prev] &&
 252                                     link[r].lcpprev < n - SA[r] &&
 253                                     link[r].lcpprev < max_match_len)
 254                                 {
 255                                         LZ_ASSERT(T[SA[prev] + link[r].lcpprev] !=
 256                                                   T[SA[r] + link[r].lcpprev]);
 257                                 }
 258                                 break;
 259                         }
 260                 }
 261
 262                 for (input_idx_t next = r; ; ) {
 263                         if (next == n - 1) {
 264                                 LZ_ASSERT(link[r].next == ~(input_idx_t)0);
 265                                 LZ_ASSERT(link[r].lcpnext == 0);
 266                                 break;
 267                         }
 268
 269                         next++;
 270
 271                         if (SA[next] < SA[r]) {
 272                                 LZ_ASSERT(link[r].next == next);
 273                                 LZ_ASSERT(link[r].lcpnext <= n - SA[next]);
 274                                 LZ_ASSERT(link[r].lcpnext <= n - SA[r]);
 275                                 LZ_ASSERT(link[r].lcpnext <= max_match_len);
 276                                 LZ_ASSERT(0 == memcmp(&T[SA[next]],
 277                                                       &T[SA[r]],
 278                                                       link[r].lcpnext));
 279                                 if (link[r].lcpnext < n - SA[next] &&
 280                                     link[r].lcpnext < n - SA[r] &&
 281                                     link[r].lcpnext < max_match_len)
 282                                 {
 283                                         LZ_ASSERT(T[SA[next] + link[r].lcpnext] !=
 284                                                   T[SA[r] + link[r].lcpnext]);
 285
 286                                 }
 287                                 break;
 288                         }
 289                 }
 290         }
 291 #endif
 292 }
 293
 294 /*
 295  * Initialize the suffix array match-finder.
 296  *
 297  * @mf
 298  *      The suffix array match-finder structure to initialize.  This structure
 299  *      is expected to be zeroed before this function is called.  In the case
 300  *      that this function fails, lz_sarray_destroy() should be called to free
 301  *      any memory that may have been allocated.
 302  *
 303  * @max_window_size
 304  *      The maximum window size to support.
 305  *
 306  *      In the current implementation, the memory needed will be
 307  *      (6 * sizeof(input_idx_t) * @max_window_size) bytes.
 308  *      For (sizeof(input_idx_t) == 4) that's 24 times the window size.
 309  *
 310  *      Memory is saved by saving neither the original window nor the LCP
 311  *      (Longest Common Prefix) array; otherwise 29 times the window size would
 312  *      be required.  (In practice the compressor will likely keep the original
 313  *      window around anyway, although based on this property of the
 314  *      match-finder it theoretically it could overwrite it with the compressed
 315  *      data.)
 316  *
 317  * @min_match_len
 318  *      The minimum length of each match to be found.
 319  *
 320  * @max_match_len
 321  *      The maximum length of each match to be found.
 322  *
 323  * @max_matches_to_consider
 324  *      The maximum number of matches to consider at each position.  This should
 325  *      be greater than @max_matches_to_return because @max_matches_to_consider
 326  *      counts all the returned matches as well as matches of equal length to
 327  *      returned matches that were not returned.  This parameter bounds the
 328  *      amount of work the match-finder does at any one position.  This could be
 329  *      anywhere from 1 to 100+ depending on the compression ratio and
 330  *      performance desired.
 331  *
 332  * @max_matches_to_return
 333  *      Maximum number of matches to return at each position.  Because of the
 334  *      suffix array search algorithm, the order in which matches are returned
 335  *      will be from longest to shortest, so cut-offs due to this parameter will
 336  *      only result in shorter matches being discarded.  This parameter could be
 337  *      anywhere from 1 to (@max_match_len - @min_match_len + 1) depending on
 338  *      the compression performance desired.  However, making it even moderately
 339  *      large (say, greater than 3) may not be very helpful due to the property
 340  *      that the matches are returned from longest to shortest.  But the main
 341  *      thing to keep in mind is that if the compressor decides to output a
 342  *      shorter-than-possible match, ideally it would be best to choose the best
 343  *      match of the desired length rather than truncate a longer match to that
 344  *      length.
 345  *
 346  * After initialization, the suffix-array match-finder can be used for any
 347  * number of input strings (windows) of length less than or equal to
 348  * @max_window_size by successive calls to lz_sarray_load_window().
 349  *
 350  * Returns %true on success, or %false if sufficient memory could not be
 351  * allocated.  See the note for @max_window_size above regarding the needed
 352  * memory size.
 353  */
 354 bool
 355 lz_sarray_init(struct lz_sarray *mf,
 356                input_idx_t max_window_size,
 357                input_idx_t min_match_len,
 358                input_idx_t max_match_len,
 359                u32 max_matches_to_consider,
 360                u32 max_matches_to_return)
 361 {
 362         mf->max_window_size = max_window_size;
 363         mf->min_match_len = min_match_len;
 364         mf->max_match_len = max_match_len;
 365         mf->max_matches_to_consider = max_matches_to_consider;
 366         mf->max_matches_to_return = max_matches_to_return;
 367
 368         /* SA and ISA will share the same storage block.  */
 369         mf->SA = MALLOC(2 * max_window_size * sizeof(mf->SA[0]));
 370         if (mf->SA == NULL)
 371                 return false;
 372
 373         mf->salink = MALLOC(max_window_size * sizeof(mf->salink[0]));
 374         if (mf->salink == NULL)
 375                 return false;
 376
 377         return true;
 378 }
 379
 380 /*
 381  * Prepare the suffix array match-finder to scan the specified window for
 382  * matches.
 383  *
 384  * @mf  Suffix array match-finder previously initialized with lz_sarray_init().
 385  *
 386  * @T   Window, or "block", in which to find matches.
 387  *
 388  * @n   Size of window in bytes.  This must be positive and less than or equal
 389  *      to the @max_window_size passed to lz_sarray_init().
 390  *
 391  * This function runs in linear time (relative to @n).
 392  */
 393 void
 394 lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], input_idx_t n)
 395 {
 396         input_idx_t *ISA, *LCP;
 397
 398         LZ_ASSERT(n > 0 && n <= mf->max_window_size);
 399
 400         /* Compute SA (Suffix Array).
 401          *
 402          * divsufsort() needs temporary space --- one array with 256 spaces and
 403          * one array with 65536 spaces.  The implementation has been modified
 404          * from the original to use the provided temporary space instead of
 405          * allocating its own.
 406          *
 407          * We also check at build-time that divsufsort() uses the same integer
 408          * size expected by this code.  Unfortunately, divsufsort breaks if
 409          * 'sa_idx_t' is defined to be a 16-bit integer; however, that would
 410          * limit blocks to only 65536 bytes anyway.  */
 411         LZ_ASSERT(mf->max_window_size * sizeof(mf->SA[0])
 412                   >= 256 * sizeof(saidx_t));
 413         LZ_ASSERT(mf->max_window_size * sizeof(mf->salink[0])
 414                   >= 256 * 256 * sizeof(saidx_t));
 415         BUILD_BUG_ON(sizeof(input_idx_t) != sizeof(saidx_t));
 416
 417         divsufsort(T, mf->SA, n, (saidx_t*)&mf->SA[n], (saidx_t*)mf->salink);
 418
 419         BUILD_BUG_ON(sizeof(bool) > sizeof(mf->salink[0]));
 420         verify_suffix_array(mf->SA, T, (bool*)mf->salink, n);
 421
 422         /* Compute ISA (Inverse Suffix Array) in a preliminary position.
 423          *
 424          * This is just a trick to save memory.  Since LCP is unneeded after
 425          * this function, it can be computed in any available space.  The
 426          * storage for the ISA is the best choice because the ISA can be built
 427          * quickly in salink for now, then re-built in its real location at the
 428          * end.  This is probably worth it because computing the ISA from the SA
 429          * is very fast, and since this match-finder is memory-hungry we'd like
 430          * to save as much memory as possible.  */
 431         ISA = (input_idx_t*)mf->salink;
 432         compute_inverse_suffix_array(ISA, mf->SA, n);
 433
 434         /* Compute LCP (Longest Common Prefix) array.  */
 435         LCP = mf->SA + n;
 436         compute_lcp_array(LCP, mf->SA, ISA, T, n);
 437         verify_lcp_array(LCP, mf->SA, T, n);
 438
 439         /* Initialize suffix array links.  */
 440         init_salink(mf->salink, LCP, mf->SA, T, n, mf->max_match_len);
 441         verify_salink(mf->salink, mf->SA, T, n, mf->max_match_len);
 442
 443         /* Compute ISA (Inverse Suffix Array) in its final position.  */
 444         ISA = mf->SA + n;
 445         compute_inverse_suffix_array(ISA, mf->SA, n);
 446
 447         /* Save new variables and return.  */
 448         mf->ISA = ISA;
 449         mf->cur_pos = 0;
 450         mf->window_size = n;
 451 }
 452
 453 /* Free memory allocated for the suffix array match-finder.  */
 454 void
 455 lz_sarray_destroy(struct lz_sarray *mf)
 456 {
 457         FREE(mf->SA);
 458         FREE(mf->salink);
 459 }