Suffix array based matchfinder updates

[wimlib] / src / lcpit_matchfinder.c
diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c

new file mode 100644 (file)

index 0000000..dd00979
--- /dev/null
+++ b/src/lcpit_matchfinder.c
@@ -0,0 +1,256 @@
+/*
+ * lcpit_matchfinder.h
+ *
+ * A match-finder for Lempel-Ziv compression based on bottom-up construction and
+ * traversal of the Longest Common Prefix (LCP) interval tree.
+ *
+ * Author:     Eric Biggers
+ * Year:       2014, 2015
+ *
+ * The author dedicates this file to the public domain.
+ * You can do whatever you want with this file.
+ */
+
+#include <limits.h>
+#include <string.h>
+
+#include "wimlib/divsufsort.h"
+#include "wimlib/lcpit_matchfinder.h"
+#include "wimlib/util.h"
+
+#define LCP_BITS               6
+#define LCP_MASK               ((1 << LCP_BITS) - 1)
+#define LCP_MAX                        LCP_MASK
+#define NORMAL_UNVISITED_TAG   ((u32)1 << 31)
+#define MAX_NORMAL_BUFSIZE     ((u32)1 << (31 - LCP_BITS))
+#define HUGE_UNVISITED_TAG     ((u64)1 << 63)
+#define SA_and_LCP_LCP_SHIFT   (32 - LCP_BITS)
+#define SA_and_LCP_POS_MASK    (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1)
+
+/*
+ * Include the template header to define the functions build_LCP(),
+ * build_LCPIT(), and lcpit_advance_one_byte().  There are "normal" and "huge"
+ * versions of each function.  The normal versions assume that a buffer position
+ * and LCP value can be packed into a 32-bit integer, whereas the huge versions
+ * assume that 64 bits is needed.
+ *
+ * Both versions cap LCP values to 6 bits. This limits the depth of the
+ * lcp-interval tree without hurting the compression ratio too much.  Matches of
+ * length 63 are sufficiently long that the compression ratio doesn't change
+ * significantly if we choose one such match over another.
+ */
+#define HUGE_MODE 1
+#include "lcpit_matchfinder_templates.h"
+#undef HUGE_MODE
+
+#define HUGE_MODE 0
+#include "lcpit_matchfinder_templates.h"
+#undef HUGE_MODE
+
+/*
+ * Calculate the number of bytes of memory needed for the LCP-interval tree
+ * matchfinder.
+ *
+ * @max_bufsize - maximum buffer size to support
+ *
+ * Returns the number of bytes required.
+ */
+u64
+lcpit_matchfinder_get_needed_memory(size_t max_bufsize)
+{
+       u64 size = 0;
+
+       /* pos_data (+1 is for prefetch) */
+       size += ((u64)max_bufsize + 1) * sizeof(u32);
+
+       /* intervals or intervals64  */
+       size += max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) *
+               (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64));
+
+       /* SA */
+       size += (u64)max_bufsize * sizeof(u32);
+
+       return size;
+}
+
+/*
+ * Initialize the LCP-interval tree matchfinder.
+ *
+ * @mf - the matchfinder structure to initialize
+ * @max_bufsize - maximum buffer size to support
+ * @min_match_len - minimum match length in bytes
+ * @nice_match_len - only consider this many bytes of each match
+ *
+ * Returns true if successfully initialized; false if out of memory.
+ */
+bool
+lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize,
+                      u32 min_match_len, u32 nice_match_len)
+{
+       if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX)
+               return false;
+
+       mf->pos_data = MALLOC((max_bufsize + 1) * sizeof(u32));
+       mf->intervals = MALLOC(max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) *
+                              (max_bufsize <= MAX_NORMAL_BUFSIZE ?
+                               sizeof(u32) : sizeof(u64)));
+       mf->SA = MALLOC(max_bufsize * sizeof(u32));
+
+       if (!mf->pos_data || !mf->intervals || !mf->SA) {
+               lcpit_matchfinder_destroy(mf);
+               return false;
+       }
+
+       mf->min_match_len = min_match_len;
+       mf->nice_match_len = min(nice_match_len, LCP_MAX);
+       return true;
+}
+
+/*
+ * Build the suffix array SA for the specified byte array T of length n.
+ *
+ * The suffix array is a sorted array of the byte array's suffixes, represented
+ * by indices into the byte array.  It can equivalently be viewed as a mapping
+ * from suffix rank to suffix position.
+ *
+ * To build the suffix array, we use libdivsufsort, which uses an
+ * induced-sorting-based algorithm.  In practice, this seems to be the fastest
+ * suffix array construction algorithm currently available.
+ *
+ * References:
+ *
+ *     Y. Mori.  libdivsufsort, a lightweight suffix-sorting library.
+ *     https://code.google.com/p/libdivsufsort/.
+ *
+ *     G. Nong, S. Zhang, and W.H. Chan.  2009.  Linear Suffix Array
+ *     Construction by Almost Pure Induced-Sorting.  Data Compression
+ *     Conference, 2009.  DCC '09.  pp. 193 - 202.
+ *
+ *     S.J. Puglisi, W.F. Smyth, and A. Turpin.  2007.  A Taxonomy of Suffix
+ *     Array Construction Algorithms.  ACM Computing Surveys (CSUR) Volume 39
+ *     Issue 2, 2007 Article No. 4.
+ */
+static void
+build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp)
+{
+       /* Note: divsufsort() needs temporary space --- one array with 256
+        * spaces and one array with 65536 spaces.  The implementation of
+        * divsufsort() has been modified from the original to use the provided
+        * temporary space instead of allocating its own, since we don't want to
+        * have to deal with malloc() failures here.  */
+       divsufsort(T, SA, n, tmp);
+}
+
+/*
+ * Build the inverse suffix array ISA from the suffix array SA.
+ *
+ * Whereas the suffix array is a mapping from suffix rank to suffix position,
+ * the inverse suffix array is a mapping from suffix position to suffix rank.
+ */
+static void
+build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n)
+{
+       for (u32 r = 0; r < n; r++)
+               ISA[SA[r]] = r;
+}
+
+/*
+ * Prepare the LCP-interval tree matchfinder for a new input buffer.
+ *
+ * @mf - the initialized matchfinder structure
+ * @T - the input buffer
+ * @n - size of the input buffer in bytes.  This may be at most the max_bufsize
+ *     with which lcpit_matchfinder_init() was called.
+ */
+void
+lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n)
+{
+       if (n == 0)
+               return;
+
+       build_SA(mf->SA, T, n, mf->intervals);
+       build_ISA(mf->pos_data, mf->SA, n);
+       if (n <= MAX_NORMAL_BUFSIZE) {
+               /* "Normal" sized buffer  */
+
+               /* Build LCP, packing it into ->SA  */
+               build_LCP_normal(mf->SA, mf->pos_data, T, n,
+                                mf->min_match_len, mf->nice_match_len);
+               /* Prepare ->intervals and ->pos_data  */
+               build_LCPIT_normal(mf->SA, mf->intervals, mf->pos_data, n);
+               mf->huge_mode = false;
+       } else {
+               /* "Huge" sized buffer  */
+
+               /* Build LCP in the second half of ->intervals64.  It may be
+                * partially overwritten in build_LCPIT_huge(), but this is okay
+                * since each LCP entry is guaranteed to be consumed before it
+                * can possibly be overwritten.  */
+               build_LCP_huge(mf->intervals + n, mf->SA, mf->pos_data, T, n,
+                              mf->min_match_len, mf->nice_match_len);
+               /* Prepare ->intervals64 and ->pos_data  */
+               build_LCPIT_huge(mf->SA, mf->intervals + n, mf->intervals64,
+                                mf->pos_data, n);
+               mf->huge_mode = true;
+       }
+       mf->cur_pos = 0; /* starting at beginning of input buffer  */
+       mf->pos_data[n] = 0; /* safety entry for prefetch() overrun  */
+}
+
+/*
+ * Retrieve a list of matches with the next position.
+ *
+ * The matches will be recorded in the @matches array, ordered by strictly
+ * decreasing length and strictly decreasing offset.
+ *
+ * The return value is the number of matches found and written to @matches.
+ * This can be any value in [0, nice_match_len - min_match_len + 1].
+ *
+ * If the caller attempts to advance beyond the end of the input buffer, the
+ * behavior is undefined.
+ */
+u32
+lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf,
+                             struct lz_match *matches)
+{
+       if (mf->huge_mode)
+               return lcpit_advance_one_byte_huge(mf, matches, true);
+       else
+               return lcpit_advance_one_byte_normal(mf, matches, true);
+}
+
+/*
+ * Skip the next @count bytes (don't search for matches at them).  @count is
+ * assumed to be > 0.
+ *
+ * If the caller attempts to advance beyond the end of the input buffer, the
+ * behavior is undefined.
+ */
+void
+lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count)
+{
+       if (mf->huge_mode) {
+               do {
+                       lcpit_advance_one_byte_huge(mf, NULL, false);
+               } while (--count);
+       } else {
+               do {
+                       lcpit_advance_one_byte_normal(mf, NULL, false);
+               } while (--count);
+       }
+}
+
+/*
+ * Destroy an LCP-interval tree matchfinder that was previously initialized with
+ * lcpit_matchfinder_init().
+ *
+ * If the struct has been zeroed out, this has no effect.
+ */
+void
+lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf)
+{
+       FREE(mf->pos_data);
+       FREE(mf->intervals);
+       FREE(mf->SA);
+       memset(mf, 0, sizeof(*mf));
+}