Separate suffix array match-finder from LZX compressor

author Eric Biggers <ebiggers3@gmail.com>

Mon, 30 Dec 2013 01:37:39 +0000 (19:37 -0600)

committer Eric Biggers <ebiggers3@gmail.com>

Wed, 1 Jan 2014 16:04:48 +0000 (10:04 -0600)
author Eric Biggers <ebiggers3@gmail.com>
Mon, 30 Dec 2013 01:37:39 +0000 (19:37 -0600)
committer Eric Biggers <ebiggers3@gmail.com>
Wed, 1 Jan 2014 16:04:48 +0000 (10:04 -0600)
diff --git a/Makefile.am b/Makefile.am

index 29071d12547a37db485f92c475cc40bb7e2b0ca8..52ba967057d5c27adc0c270d82af4de139dd7374 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -41,7 +41,8 @@ libwim_la_SOURCES =           \
         src/lzms-common.c       \
         src/lzms-compress.c     \
         src/lzms-decompress.c   \
         src/lzms-common.c       \
         src/lzms-compress.c     \
         src/lzms-decompress.c   \
-       src/lz77.c              \
+       src/lz_hash.c           \
+       src/lz_sarray.c         \
         src/divsufsort/divsufsort.c             \
         src/divsufsort/divsufsort.h             \
         src/divsufsort/divsufsort_private.h     \
         src/divsufsort/divsufsort.c             \
         src/divsufsort/divsufsort.h             \
         src/divsufsort/divsufsort_private.h     \
@@ -90,6 +91,9 @@ libwim_la_SOURCES =           \
         include/wimlib/integrity.h      \
         include/wimlib/list.h           \
         include/wimlib/lookup_table.h   \
         include/wimlib/integrity.h      \
         include/wimlib/list.h           \
         include/wimlib/lookup_table.h   \
+       include/wimlib/lz.h             \
+       include/wimlib/lz_hash.h        \
+       include/wimlib/lz_sarray.h      \
         include/wimlib/lzms.h           \
         include/wimlib/lzx.h            \
         include/wimlib/metadata.h       \
         include/wimlib/lzms.h           \
         include/wimlib/lzx.h            \
         include/wimlib/metadata.h       \
diff --git a/README b/README

index ab94e55cbcb91e6b3ffad902415234e2510f162d..abfbeafb8c3fddfcaf233ef13bc8d560f236def4 100644 (file)
--- a/README
+++ b/README
@@ -276,9 +276,9 @@ suffix array construction code from divsufsort
  (https://code.google.com/p/libdivsufsort/) and algorithms from 7-Zip as well as
  several published papers.
  
  (https://code.google.com/p/libdivsufsort/) and algorithms from 7-Zip as well as
  several published papers.
  
-lz77.c contains a hash-table-based LZ77 matchfinder that is based on code from
-zlib but has been rewritten.  This code is applicable to XPRESS, LZX, and LZMS,
-all of which are partly based on LZ77 compression.
+lz_hash.c contains a hash-table-based LZ77 matchfinder that is based on code
+from zlib but has been rewritten.  This code is applicable to XPRESS, LZX, and
+LZMS, all of which are partly based on LZ77 compression.
  
  A limited number of other free programs can handle some parts of the WIM
  file format:
  
  A limited number of other free programs can handle some parts of the WIM
  file format:
diff --git a/include/wimlib/compress_common.h b/include/wimlib/compress_common.h

index 14b32f488b488a0084e4a4ac31a72f4769932c5f..666f10dcc43a031b5a9af818c1a00edbac35364b 100644 (file)
--- a/include/wimlib/compress_common.h
+++ b/include/wimlib/compress_common.h
@@ -64,29 +64,6 @@ bitstream_put_bits(struct output_bitstream *ostream,
  extern void
  bitstream_put_byte(struct output_bitstream *ostream, u8 n);
  
  extern void
  bitstream_put_byte(struct output_bitstream *ostream, u8 n);
  
-struct lz_params {
-       unsigned min_match;
-       unsigned max_match;
-       unsigned max_offset;
-       unsigned nice_match;
-       unsigned good_match;
-       unsigned max_chain_len;
-       unsigned max_lazy_match;
-       unsigned too_far;
-};
-
-typedef void (*lz_record_match_t)(unsigned len, unsigned offset, void *ctx);
-typedef void (*lz_record_literal_t)(u8 lit, void *ctx);
-
-extern void
-lz_analyze_block(const u8 window[restrict],
-                input_idx_t window_size,
-                lz_record_match_t record_match,
-                lz_record_literal_t record_literal,
-                void *record_ctx,
-                const struct lz_params *params,
-                input_idx_t prev_tab[restrict]);
-
  extern void
  make_canonical_huffman_code(unsigned num_syms,
                             unsigned max_codeword_len,
  extern void
  make_canonical_huffman_code(unsigned num_syms,
                             unsigned max_codeword_len,
diff --git a/include/wimlib/lz.h b/include/wimlib/lz.h

new file mode 100644 (file)

index 0000000..4206f14
--- /dev/null
+++ b/include/wimlib/lz.h
@@ -0,0 +1,30 @@
+#ifndef _WIMLIB_LZ_H
+#define _WIMLIB_LZ_H
+
+#include "wimlib/compress_common.h"
+
+/* Debugging hooks shared by the LZ match-finders.
+ *
+ * Uncommenting the define below maps LZ_DEBUG() onto DEBUG() and LZ_ASSERT()
+ * onto wimlib_assert().  Otherwise both macros expand to nothing, so their
+ * arguments are never evaluated and must not carry side effects that the
+ * surrounding code depends on.  */
+//#define ENABLE_LZ_DEBUG
+#ifdef ENABLE_LZ_DEBUG
+#  define LZ_DEBUG DEBUG
+#  define LZ_ASSERT wimlib_assert
+#  include "wimlib/assert.h"
+#  include "wimlib/error.h"
+#else
+#  define LZ_DEBUG(...)
+#  define LZ_ASSERT(...)
+#endif
+
+
+/* Raw LZ match/literal format: just a length and offset.
+ *
+ * The length is the number of bytes in the match, and the offset is the
+ * distance, in bytes, from the current position back to the data being
+ * matched.
+ *
+ * This can alternatively be used to represent a literal byte if @len is less
+ * than the minimum match length.  */
+struct raw_match {
+       /* Length of the match in bytes.  */
+       input_idx_t len;
+       /* Distance in bytes back from the current position.  */
+       input_idx_t offset;
+};
+
+#endif /* _WIMLIB_LZ_H */
diff --git a/include/wimlib/lz_hash.h b/include/wimlib/lz_hash.h

new file mode 100644 (file)

index 0000000..9d097ae
--- /dev/null
+++ b/include/wimlib/lz_hash.h
@@ -0,0 +1,30 @@
+#ifndef _WIMLIB_LZ_HASH_H
+#define _WIMLIB_LZ_HASH_H
+
+#include "wimlib/compress_common.h"
+
+/* Tuning parameters handed to lz_analyze_block() through its @params argument.
+ *
+ * NOTE(review): the field names mirror the tuning knobs of zlib's deflate
+ * match-finder (the README describes this match-finder as a rewrite of zlib
+ * code); the exact meaning and units of each field are not visible from this
+ * header -- confirm against lz_hash.c before relying on them.  */
+struct lz_params {
+       unsigned min_match;
+       unsigned max_match;
+       unsigned max_offset;
+       unsigned nice_match;
+       unsigned good_match;
+       unsigned max_chain_len;
+       unsigned max_lazy_match;
+       unsigned too_far;
+};
+
+/* Callback types accepted by lz_analyze_block().  Each receives the opaque
+ * @ctx pointer; presumably this is the @record_ctx argument passed to
+ * lz_analyze_block(), and one callback is invoked per match or literal chosen
+ * -- TODO confirm against lz_hash.c.  */
+typedef void (*lz_record_match_t)(unsigned len, unsigned offset, void *ctx);
+typedef void (*lz_record_literal_t)(u8 lit, void *ctx);
+
+/* Hash-table-based LZ77 analysis of a window of @window_size bytes.
+ * Declared here; the Makefile builds the implementation from src/lz_hash.c.
+ * NOTE(review): the required size of @prev_tab is not visible from this
+ * header -- check the definition.  */
+extern void
+lz_analyze_block(const u8 window[restrict],
+                input_idx_t window_size,
+                lz_record_match_t record_match,
+                lz_record_literal_t record_literal,
+                void *record_ctx,
+                const struct lz_params *params,
+                input_idx_t prev_tab[restrict]);
+
+
+#endif /* _WIMLIB_LZ_HASH_H  */
diff --git a/include/wimlib/lz_sarray.h b/include/wimlib/lz_sarray.h

new file mode 100644 (file)

index 0000000..d2d49d3
--- /dev/null
+++ b/include/wimlib/lz_sarray.h
@@ -0,0 +1,362 @@
+/*
+ * lz_sarray.h
+ *
+ * Suffix array match-finder for LZ (Lempel-Ziv) compression.
+ */
+
+/*
+ * Copyright (C) 2013 Eric Biggers
+ *
+ * This file is part of wimlib, a library for working with WIM files.
+ *
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ */
+
+#ifndef _WIMLIB_LZ_SARRAY_H
+#define _WIMLIB_LZ_SARRAY_H
+
+#include "wimlib/compiler.h"
+#include "wimlib/lz.h"
+#include "wimlib/types.h"
+
+struct salink;
+
+/* Suffix array LZ (Lempel-Ziv) match-finder.
+ *
+ * The first five fields are configuration copied in by lz_sarray_init(); the
+ * remaining fields describe the window most recently passed to
+ * lz_sarray_load_window().  */
+struct lz_sarray {
+       /* Allocated window size for the match-finder.  */
+       input_idx_t max_window_size;
+
+       /* Minimum match length to return.  */
+       input_idx_t min_match_len;
+
+       /* Maximum match length to return.  */
+       input_idx_t max_match_len;
+
+       /* Maximum matches to consider at each position (max search depth).  */
+       u32 max_matches_to_consider;
+
+       /* Maximum number of matches to return at each position.  */
+       u32 max_matches_to_return;
+
+       /* Current position in the window.  */
+       input_idx_t cur_pos;
+
+       /* Current window size.  */
+       input_idx_t window_size;
+
+       /* Suffix array for window.
+        * This is a mapping from suffix rank to suffix position.
+        *
+        * lz_sarray_init() allocates room for 3 * max_window_size entries
+        * here; ISA and LCP below point into this same allocation and are set
+        * by lz_sarray_load_window().  */
+       input_idx_t *SA;
+
+       /* Inverse suffix array for window.
+        * This is a mapping from suffix position to suffix rank.
+        * If 0 <= r < window_size, then ISA[SA[r]] == r.  */
+       input_idx_t *ISA;
+
+       /* Longest common prefix array corresponding to the suffix array SA.
+        * LCP[i] is the length of the longest common prefix between the
+        * suffixes with positions SA[i - 1] and SA[i].  LCP[0] is undefined.
+        */
+       input_idx_t *LCP;
+
+       /* Suffix array links.
+        *
+        * During a linear scan of the input string to find matches, this array
+        * is used to keep track of which ranked suffixes in the suffix array
+        * appear before the current position.  Instead of searching in the
+        * original suffix array, scans for matches at a given position traverse
+        * a linked list containing only suffixes that appear before that
+        * position.  */
+       struct salink *salink;
+};
+
+/* Suffix array link.
+ *
+ * One of these exists per suffix rank.  Where the comments below say "-1",
+ * the stored value is ~(input_idx_t)0, since input_idx_t is used as an
+ * unsigned index type.  */
+struct salink {
+       /* Rank of highest ranked suffix that has rank lower than the suffix
+        * corresponding to this structure and either has a lower position
+        * (initially) or has a position lower than the highest position at
+        * which matches have been searched for so far, or -1 if there is no
+        * such suffix.  */
+       input_idx_t prev;
+
+       /* Rank of lowest ranked suffix that has rank greater than the suffix
+        * corresponding to this structure and either has a lower position
+        * (initially) or has a position lower than the highest position at
+        * which matches have been searched for so far, or -1 if there is no
+        * such suffix.  */
+       input_idx_t next;
+
+       /* Length of longest common prefix between the suffix corresponding to
+        * this structure and the suffix with rank @prev, or 0 if @prev is -1.
+        * lz_sarray_load_window() caps this at the maximum match length.
+        */
+       input_idx_t lcpprev;
+
+       /* Length of longest common prefix between the suffix corresponding to
+        * this structure and the suffix with rank @next, or 0 if @next is -1.
+        * lz_sarray_load_window() caps this at the maximum match length.
+        */
+       input_idx_t lcpnext;
+};
+
+/* Store the parameters in @mf and allocate its buffers for windows of up to
+ * @max_window_size bytes.  Returns false if an allocation fails, otherwise
+ * true.  */
+extern bool
+lz_sarray_init(struct lz_sarray *mf,
+               input_idx_t max_window_size,
+               input_idx_t min_match_len,
+               input_idx_t max_match_len,
+               u32 max_matches_to_consider,
+               u32 max_matches_to_return);
+
+/* Free the buffers allocated by lz_sarray_init().  */
+extern void
+lz_sarray_destroy(struct lz_sarray *mf);
+
+/* Build the suffix array data for @window (@window_size bytes) and reset the
+ * current position to 0.  */
+extern void
+lz_sarray_load_window(struct lz_sarray *mf, const u8 window[],
+                     input_idx_t window_size);
+
+/* Return the match-finder's current position within the loaded window.  */
+static inline input_idx_t
+lz_sarray_get_pos(const struct lz_sarray *mf)
+{
+       const input_idx_t pos = mf->cur_pos;
+
+       return pos;
+}
+
+/* Link the suffix at position @i into the rank-ordered list of suffixes that
+ * start at or before the current position.
+ *
+ * @ISA maps position to rank and @link holds one entry per rank.  @SA is
+ * accepted for symmetry with the callers but is not read here.  The entry for
+ * the suffix itself is only read; its two neighbours, when they exist, are
+ * updated to point back at it.  */
+static _always_inline_attribute void
+lz_sarray_update_salink(const input_idx_t i,
+                       const input_idx_t SA[const restrict],
+                       const input_idx_t ISA[const restrict],
+                       struct salink link[const restrict])
+{
+       /* Rank of the suffix starting at the current position.  */
+       const input_idx_t rank = ISA[i];
+
+       /* Nearest neighbours of that suffix in rank order among the suffixes
+        * with a lower position: `above` is the lowest-ranked one with a
+        * higher rank, `below` the highest-ranked one with a lower rank.
+        * Either is ~(input_idx_t)0 when no such suffix exists.  */
+       const input_idx_t above = link[rank].next;
+       const input_idx_t below = link[rank].prev;
+
+       /* The higher-ranked neighbour now sees this suffix on its left.  */
+       if (above != ~(input_idx_t)0) {
+               struct salink * const hi = &link[above];
+
+               hi->prev = rank;
+               hi->lcpprev = link[rank].lcpnext;
+       }
+
+       /* The lower-ranked neighbour now sees this suffix on its right.  */
+       if (below != ~(input_idx_t)0) {
+               struct salink * const lo = &link[below];
+
+               lo->next = rank;
+               lo->lcpnext = link[rank].lcpprev;
+       }
+}
+
+/* Step past the current position without searching for matches there.  The
+ * suffix at that position is still linked in, so that later positions can
+ * match against it.  */
+static _always_inline_attribute void
+lz_sarray_skip_position(struct lz_sarray *mf)
+{
+       const input_idx_t pos = mf->cur_pos;
+
+       LZ_ASSERT(pos < mf->window_size);
+       mf->cur_pos = pos + 1;
+       lz_sarray_update_salink(pos, mf->SA, mf->ISA, mf->salink);
+}
+
+/* Cost value returned by the match-cost callback of lz_sarray_get_matches();
+ * lower is better.  LZ_SARRAY_INFINITE_COST is the all-ones value, used as
+ * "no candidate costed yet".  */
+typedef input_idx_t lz_sarray_cost_t;
+#define LZ_SARRAY_INFINITE_COST (~(lz_sarray_cost_t)0)
+
+/*
+ * Use the suffix array match-finder to retrieve a list of LZ matches at the
+ * current position, then advance the current position by one.
+ *
+ * Returns the number of matches written into @matches.  The matches are
+ * returned in decreasing order by length, and each will be of unique length
+ * between the minimum and maximum match lengths passed to lz_sarray_init().  Up
+ * to @max_matches_to_return (passed to lz_sarray_init()) matches will be
+ * returned.
+ *
+ * @mf is the match-finder; a window must have been loaded and the current
+ * position must be inside it.
+ *
+ * @matches receives the results and needs room for @max_matches_to_return
+ * entries.
+ *
+ * @eval_match_cost is a function for evaluating the cost of a match when
+ * deciding which ones to return.  It is only used for comparing matches of the
+ * same length.  It needs to be fast, and need not be exact; an implementation
+ * might simply rank matches by their offset, for example, although
+ * implementations may choose to take into account additional information such
+ * as repeat offsets.
+ *
+ * @eval_match_cost_ctx is passed through unchanged as the callback's @ctx.
+ *
+ * NOTE(review): the early return on reaching @max_matches_to_return compares
+ * with '==' after incrementing, so a limit of 0 would never trigger it --
+ * callers presumably always pass at least 1; confirm.
+ */
+static _always_inline_attribute u32
+lz_sarray_get_matches(struct lz_sarray *mf,
+                     struct raw_match matches[],
+                     lz_sarray_cost_t (*eval_match_cost)
+                               (input_idx_t length,
+                                input_idx_t offset,
+                                const void *ctx),
+                     const void *eval_match_cost_ctx)
+{
+       LZ_ASSERT(mf->cur_pos < mf->window_size);
+       const input_idx_t i = mf->cur_pos++;
+
+       const input_idx_t * const restrict SA = mf->SA;
+       const input_idx_t * const restrict ISA = mf->ISA;
+       struct salink * const restrict link = mf->salink;
+       const input_idx_t min_match_len = mf->min_match_len;
+       const u32 max_matches_to_consider = mf->max_matches_to_consider;
+       const u32 max_matches_to_return = mf->max_matches_to_return;
+
+       /* r = Rank of the suffix at the current position.  */
+       const input_idx_t r = ISA[i];
+
+       /* Prepare for searching the current position.  */
+       lz_sarray_update_salink(i, SA, ISA, link);
+
+       /* L = rank of next suffix to the left;
+        * R = rank of next suffix to the right;
+        * lenL = length of match between current position and the suffix with rank L;
+        * lenR = length of match between current position and the suffix with rank R.
+        *
+        * This is left and right relative to the rank of the current suffix.
+        * Since the suffixes in the suffix array are sorted, the longest
+        * matches are immediately to the left and right (using the linked list
+        * to ignore all suffixes that occur later in the window).  The match
+        * length decreases the farther left and right we go.  We shall keep the
+        * length on both sides in sync in order to choose the lowest-cost match
+        * of each length.
+        */
+       input_idx_t L = link[r].prev;
+       input_idx_t R = link[r].next;
+       input_idx_t lenL = link[r].lcpprev;
+       input_idx_t lenR = link[r].lcpnext;
+
+       /* nmatches = number of matches found so far.  */
+       u32 nmatches = 0;
+
+       /* best_cost = cost of lowest-cost match found so far.
+        *
+        * We keep track of this so that we can ignore shorter matches that do
+        * not have lower costs than longer matches already found.
+        */
+       lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST;
+
+       /* count_remaining = maximum number of possible matches remaining to be
+        * considered.
+        *
+        * NOTE(review): the counter is decremented *before* each candidate is
+        * costed, so at most max_matches_to_consider - 1 candidates are
+        * evaluated, and a limit of 0 wraps around to a very large budget --
+        * confirm both are intended.  */
+       u32 count_remaining = max_matches_to_consider;
+
+       /* pending = match currently being considered for a specific length.
+        * pending.offset is only meaningful once pending_cost has dropped
+        * below LZ_SARRAY_INFINITE_COST.  */
+       struct raw_match pending;
+       lz_sarray_cost_t pending_cost;
+
+       /* Each iteration consumes the side(s) currently holding the longest
+        * remaining match length: first the left, then the right.  */
+       while (lenL >= min_match_len || lenR >= min_match_len)
+       {
+               pending.len = lenL;
+               pending_cost = LZ_SARRAY_INFINITE_COST;
+               lz_sarray_cost_t cost;
+
+               /* Extend left.  */
+               if (lenL >= min_match_len && lenL >= lenR) {
+                       for (;;) {
+
+                               if (--count_remaining == 0)
+                                       goto out_save_pending;
+
+                               input_idx_t offset = i - SA[L];
+
+                               /* Save match if it has smaller cost.  */
+                               cost = (*eval_match_cost)(lenL, offset,
+                                                         eval_match_cost_ctx);
+                               if (cost < pending_cost) {
+                                       pending.offset = offset;
+                                       pending_cost = cost;
+                               }
+
+                               if (link[L].lcpprev < lenL) {
+                                       /* Match length decreased.  */
+
+                                       lenL = link[L].lcpprev;
+
+                                       /* Save the pending match unless the
+                                        * right side still may have matches of
+                                        * this length to be scanned, or if a
+                                        * previous (longer) match had lower
+                                        * cost.  */
+                                       if (pending.len > lenR) {
+                                               if (pending_cost < best_cost) {
+                                                       best_cost = pending_cost;
+                                                       matches[nmatches++] = pending;
+                                                       if (nmatches == max_matches_to_return)
+                                                               return nmatches;
+                                               }
+                                               pending.len = lenL;
+                                               pending_cost = LZ_SARRAY_INFINITE_COST;
+                                       }
+                                       if (lenL < min_match_len || lenL < lenR)
+                                               break;
+                               }
+                               L = link[L].prev;
+                       }
+               }
+
+               /* Any candidate carried over from the left side has length
+                * equal to lenR at this point, so it keeps competing with the
+                * right-side candidates of the same length.  */
+               pending.len = lenR;
+
+               /* Extend right.  */
+               if (lenR >= min_match_len && lenR > lenL) {
+                       for (;;) {
+
+                               if (--count_remaining == 0)
+                                       goto out_save_pending;
+
+                               input_idx_t offset = i - SA[R];
+
+                               /* Save match if it has smaller cost.  */
+                               cost = (*eval_match_cost)(lenR,
+                                                         offset,
+                                                         eval_match_cost_ctx);
+                               if (cost < pending_cost) {
+                                       pending.offset = offset;
+                                       pending_cost = cost;
+                               }
+
+                               if (link[R].lcpnext < lenR) {
+                                       /* Match length decreased.  */
+
+                                       lenR = link[R].lcpnext;
+
+                                       /* Save the pending match unless a
+                                        * previous (longer) match had lower
+                                        * cost.  */
+                                       if (pending_cost < best_cost) {
+                                               matches[nmatches++] = pending;
+                                               best_cost = pending_cost;
+                                               if (nmatches == max_matches_to_return)
+                                                       return nmatches;
+                                       }
+
+                                       if (lenR < min_match_len || lenR <= lenL)
+                                               break;
+
+                                       pending.len = lenR;
+                                       pending_cost = LZ_SARRAY_INFINITE_COST;
+                               }
+                               R = link[R].next;
+                       }
+               }
+       }
+       goto out;
+
+out_save_pending:
+       /* Search budget exhausted: emit the candidate still pending for the
+        * current length, if one was costed.  Unlike the in-loop saves, this
+        * does not compare its cost against best_cost.  */
+       if (pending_cost != LZ_SARRAY_INFINITE_COST)
+               matches[nmatches++] = pending;
+
+out:
+       return nmatches;
+}
+
+#endif /* _WIMLIB_LZ_SARRAY_H */
diff --git a/src/lz77.c b/src/lz_hash.c

similarity index 99%

rename from src/lz77.c

rename to src/lz_hash.c

index b5495da74d1599cd59e12cc8ef65c495550eba55..ac469f202e2f3bdfa0cfbd717b49e31a333377f7 100644 (file)
--- a/src/lz77.c
+++ b/src/lz_hash.c
@@ -1,5 +1,5 @@
  /*
  /*
- * lz77.c
+ * lz_hash.c
   *
   * This file provides the code to analyze a buffer of uncompressed data for
   * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
   *
   * This file provides the code to analyze a buffer of uncompressed data for
   * matches, as per the LZ77 algorithm.  It uses a hash table to accelerate the
@@ -30,7 +30,7 @@
  #  include <config.h>
  #endif
  
  #  include <config.h>
  #endif
  
-#include "wimlib/compress_common.h"
+#include "wimlib/lz_hash.h"
  #include "wimlib/util.h"
  
  #include <string.h>
  #include "wimlib/util.h"
  
  #include <string.h>
diff --git a/src/lz_sarray.c b/src/lz_sarray.c

new file mode 100644 (file)

index 0000000..79fca69
--- /dev/null
+++ b/src/lz_sarray.c
@@ -0,0 +1,237 @@
+/*
+ * lz_sarray.c
+ *
+ * Suffix array match-finder for LZ (Lempel-Ziv) compression.
+ */
+
+/*
+ * Copyright (C) 2013 Eric Biggers
+ *
+ * This file is part of wimlib, a library for working with WIM files.
+ *
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "wimlib/lz_sarray.h"
+#include "wimlib/util.h"
+#include "divsufsort/divsufsort.h"
+#include <string.h>
+
+/* Initialize the suffix array match-finder with the specified parameters.
+ *
+ * After initialization, it can be used for any number of input strings of
+ * length less than or equal to @max_window_size.
+ *
+ * Returns true on success, or false if memory could not be allocated.  On
+ * failure nothing remains allocated and every buffer pointer in @mf is NULL,
+ * so a subsequent lz_sarray_destroy() frees nothing twice (assuming FREE(),
+ * like free(), accepts NULL).  */
+bool
+lz_sarray_init(struct lz_sarray *mf,
+              input_idx_t max_window_size,
+              input_idx_t min_match_len,
+              input_idx_t max_match_len,
+              u32 max_matches_to_consider,
+              u32 max_matches_to_return)
+{
+       mf->max_window_size = max_window_size;
+       mf->min_match_len = min_match_len;
+       mf->max_match_len = max_match_len;
+       mf->max_matches_to_consider = max_matches_to_consider;
+       mf->max_matches_to_return = max_matches_to_return;
+
+       /* Start from a known state so that no failure path leaves a buffer
+        * pointer uninitialized.  */
+       mf->SA = NULL;
+       mf->ISA = NULL;
+       mf->LCP = NULL;
+       mf->salink = NULL;
+
+       /* One allocation holds SA, ISA and LCP back to back; ISA and LCP are
+        * pointed into it by lz_sarray_load_window().  sizeof comes first so
+        * that the whole product is computed in size_t.  */
+       mf->SA = MALLOC(sizeof(mf->SA[0]) * 3 * max_window_size);
+       if (mf->SA == NULL)
+               return false;
+
+       mf->salink = MALLOC(sizeof(mf->salink[0]) * max_window_size);
+       if (mf->salink == NULL) {
+               /* Do not leak the first buffer when the second one fails.  */
+               FREE(mf->SA);
+               mf->SA = NULL;
+               return false;
+       }
+
+       return true;
+}
+
+/* Release the buffers owned by the suffix array match-finder.  ISA and LCP
+ * live inside the SA allocation, so only SA and salink are freed.  */
+void
+lz_sarray_destroy(struct lz_sarray *mf)
+{
+       FREE(mf->salink);
+       FREE(mf->SA);
+}
+
+/* Initialize the suffix array match-finder for the specified input.
+ *
+ * Builds, for the @window_size bytes at @window, the suffix array (SA), its
+ * inverse (ISA), the LCP array and the per-rank links used by the
+ * match-finding functions, then resets the current position to 0.
+ *
+ * @window_size must not exceed the @max_window_size given to
+ * lz_sarray_init().  An empty window is accepted and leaves the match-finder
+ * with nothing to search.  */
+void
+lz_sarray_load_window(struct lz_sarray *mf, const u8 window[],
+                     input_idx_t window_size)
+{
+       /* Load variables  */
+       const u8 * const restrict T = window;
+       const input_idx_t n = window_size;
+       const input_idx_t max_match_len = mf->max_match_len;
+       input_idx_t * const restrict SA = mf->SA;
+       input_idx_t * const restrict ISA = mf->ISA = SA + window_size;
+       input_idx_t * const restrict LCP = mf->LCP = ISA + window_size;
+       struct salink * const restrict link = mf->salink;
+
+       LZ_ASSERT(n <= mf->max_window_size);
+
+       /* An empty window has no suffixes.  Bail out before the link setup
+        * below, which indexes link[n - 1] and would wrap around for n == 0.
+        */
+       if (n == 0) {
+               mf->cur_pos = 0;
+               mf->window_size = 0;
+               return;
+       }
+
+       /* Compute SA (Suffix Array).  */
+       {
+               /* ISA and link are used as temporary space.
+                *
+                * NOTE(review): the second assertion counts twice the size of
+                * the salink array, although lz_sarray_init() allocates only
+                * max_window_size entries -- confirm the intended bound.  */
+               LZ_ASSERT(mf->max_window_size * sizeof(ISA[0]) >= 256 * sizeof(saidx_t));
+               LZ_ASSERT(mf->max_window_size * 2 * sizeof(link[0]) >= 256 * 256 * sizeof(saidx_t));
+
+               if (sizeof(input_idx_t) == sizeof(saidx_t)) {
+                       divsufsort(T, SA, n, (saidx_t*)ISA, (saidx_t*)link);
+               } else {
+                       /* Index types differ in size: sort into a temporary
+                        * array and convert element by element.  */
+                       saidx_t sa[n];
+                       divsufsort(T, sa, n, (saidx_t*)ISA, (saidx_t*)link);
+                       for (input_idx_t i = 0; i < n; i++)
+                               SA[i] = sa[i];
+               }
+       }
+
+#ifdef ENABLE_LZ_DEBUG
+
+       /* Verify suffix array.  */
+       {
+               bool found[n];
+               ZERO_ARRAY(found);
+               for (input_idx_t r = 0; r < n; r++) {
+                       input_idx_t i = SA[r];
+                       LZ_ASSERT(i < n);
+                       LZ_ASSERT(!found[i]);
+                       found[i] = true;
+               }
+       }
+
+       for (input_idx_t r = 0; r < n - 1; r++) {
+
+               input_idx_t i1 = SA[r];
+               input_idx_t i2 = SA[r + 1];
+
+               input_idx_t n1 = n - i1;
+               input_idx_t n2 = n - i2;
+
+               LZ_ASSERT(memcmp(&T[i1], &T[i2], min(n1, n2)) <= 0);
+       }
+       LZ_DEBUG("Verified SA (len %u)", n);
+#endif /* ENABLE_LZ_DEBUG */
+
+       /* Compute ISA (Inverse Suffix Array)  */
+       for (input_idx_t r = 0; r < n; r++)
+               ISA[SA[r]] = r;
+
+       /* Compute LCP (longest common prefix) array.
+        *
+        * Algorithm adapted from Kasai et al. 2001: "Linear-Time
+        * Longest-Common-Prefix Computation in Suffix Arrays and Its
+        * Applications".  */
+       {
+               input_idx_t h = 0;
+               for (input_idx_t i = 0; i < n; i++) {
+                       input_idx_t r = ISA[i];
+                       if (r > 0) {
+                               input_idx_t j = SA[r - 1];
+
+                               input_idx_t lim = min(n - i, n - j);
+
+                               while (h < lim && T[i + h] == T[j + h])
+                                       h++;
+                               LCP[r] = h;
+                               if (h > 0)
+                                       h--;
+                       }
+               }
+       }
+
+#ifdef ENABLE_LZ_DEBUG
+       /* Verify LCP array.  */
+       for (input_idx_t r = 0; r < n - 1; r++) {
+               LZ_ASSERT(ISA[SA[r]] == r);
+               LZ_ASSERT(ISA[SA[r + 1]] == r + 1);
+
+               input_idx_t i1 = SA[r];
+               input_idx_t i2 = SA[r + 1];
+               input_idx_t lcp = LCP[r + 1];
+
+               input_idx_t n1 = n - i1;
+               input_idx_t n2 = n - i2;
+
+               LZ_ASSERT(lcp <= min(n1, n2));
+
+               LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
+               if (lcp < min(n1, n2))
+                       LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
+       }
+#endif /* ENABLE_LZ_DEBUG */
+
+       /* Compute salink.next and salink.lcpnext.
+        *
+        * Algorithm adapted from Crochemore et al. 2009:
+        * "LPF computation revisited".
+        *
+        * Ranks are visited from highest to lowest.  For each rank r, start at
+        * rank r + 1 and follow the already-computed next links past every
+        * suffix positioned after SA[r], narrowing the common prefix length on
+        * the way.
+        *
+        * Note: we cap lcpnext to the maximum match length so that the
+        * match-finder need not worry about it later.
+        *
+        * Only the next/lcpnext fields are written in this pass; the highest
+        * rank has no successor.  */
+       link[n - 1].next = ~(input_idx_t)0;
+       link[n - 1].lcpnext = 0;
+       for (input_idx_t r = n - 2; r != ~(input_idx_t)0; r--) {
+               input_idx_t t = r + 1;
+               input_idx_t l = LCP[t];
+               while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
+                       l = min(l, link[t].lcpnext);
+                       t = link[t].next;
+               }
+               link[r].next = t;
+               link[r].lcpnext = min(l, max_match_len);
+               LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
+               LZ_ASSERT(l <= n - SA[r]);
+               if (t == ~(input_idx_t)0)
+                       LZ_ASSERT(l == 0);
+               else
+                       LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
+       }
+
+       /* Compute salink.prev and salink.lcpprev.
+        *
+        * Algorithm adapted from Crochemore et al. 2009:
+        * "LPF computation revisited".
+        *
+        * This mirrors the pass above, visiting ranks from lowest to highest.
+        *
+        * Note: we cap lcpprev to the maximum match length so that the
+        * match-finder need not worry about it later.
+        *
+        * Only the prev/lcpprev fields are written in this pass.  In
+        * particular link[0].next and link[0].lcpnext must keep the values
+        * computed above: resetting them here would hide every earlier,
+        * higher-ranked match for the lowest-ranked suffix.  */
+       link[0].prev = ~(input_idx_t)0;
+       link[0].lcpprev = 0;
+       for (input_idx_t r = 1; r < n; r++) {
+               input_idx_t t = r - 1;
+               input_idx_t l = LCP[r];
+               while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
+                       l = min(l, link[t].lcpprev);
+                       t = link[t].prev;
+               }
+               link[r].prev = t;
+               link[r].lcpprev = min(l, max_match_len);
+               LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
+               LZ_ASSERT(l <= n - SA[r]);
+               if (t == ~(input_idx_t)0)
+                       LZ_ASSERT(l == 0);
+               else
+                       LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
+       }
+
+       mf->cur_pos = 0;
+       mf->window_size = n;
+}
diff --git a/src/lzms-compress.c b/src/lzms-compress.c

index af7f85b6ffff41224684f95aa639890c983ef0a1..f7ef633ea0355829f463cf9e71d50e244f1bb88c 100644 (file)
--- a/src/lzms-compress.c
+++ b/src/lzms-compress.c
@@ -39,6 +39,7 @@
  #include "wimlib/compress_common.h"
  #include "wimlib/endianness.h"
  #include "wimlib/error.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/endianness.h"
  #include "wimlib/error.h"
+#include "wimlib/lz_hash.h"
  #include "wimlib/lzms.h"
  #include "wimlib/util.h"
  
  #include "wimlib/lzms.h"
  #include "wimlib/util.h"
  
diff --git a/src/lzx-compress.c b/src/lzx-compress.c

index 6b520ae5f05634eb35d3ab3e36a9c97e5edd20fc..58806632de377a34857d9109d9b401a032d50ea3 100644 (file)
--- a/src/lzx-compress.c
+++ b/src/lzx-compress.c
@@ -158,6 +158,8 @@
  #include "wimlib/compress_common.h"
  #include "wimlib/endianness.h"
  #include "wimlib/error.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/endianness.h"
  #include "wimlib/error.h"
+#include "wimlib/lz_hash.h"
+#include "wimlib/lz_sarray.h"
  #include "wimlib/lzx.h"
  #include "wimlib/util.h"
  #include <pthread.h>
  #include "wimlib/lzx.h"
  #include "wimlib/util.h"
  #include <pthread.h>
@@ -168,8 +170,6 @@
  #  include "wimlib/decompress_common.h"
  #endif
  
  #  include "wimlib/decompress_common.h"
  #endif
  
-#include "divsufsort/divsufsort.h"
-
  typedef u32 block_cost_t;
  #define INFINITE_BLOCK_COST    ((block_cost_t)~0U)
  
  typedef u32 block_cost_t;
  #define INFINITE_BLOCK_COST    ((block_cost_t)~0U)
  
@@ -239,18 +239,6 @@ struct lzx_match {
         u32 data;
  };
  
         u32 data;
  };
  
-/* Raw LZ match/literal format: just a length and offset.
- *
- * The length is the number of bytes of the match, and the offset is the number
- * of bytes back in the input the match is from the current position.
- *
- * If @len < LZX_MIN_MATCH_LEN, then it's really just a literal byte and @offset is
- * meaningless.  */
-struct raw_match {
-       u16 len;
-       input_idx_t offset;
-};
-
  /* Specification for an LZX block.  */
  struct lzx_block_spec {
  
  /* Specification for an LZX block.  */
  struct lzx_block_spec {
  
@@ -321,33 +309,6 @@ struct lzx_optimal {
         struct lzx_lru_queue queue;
  };
  
         struct lzx_lru_queue queue;
  };
  
-/* Suffix array link  */
-struct salink {
-       /* Rank of highest ranked suffix that has rank lower than the suffix
-        * corresponding to this structure and either has a lower position
-        * (initially) or has a position lower than the highest position at
-        * which matches have been searched for so far, or -1 if there is no
-        * such suffix.  */
-       input_idx_t prev;
-
-       /* Rank of lowest ranked suffix that has rank greater than the suffix
-        * corresponding to this structure and either has a lower position
-        * (initially) or has a position lower than the highest position at which
-        * matches have been searched for so far, or -1 if there is no such
-        * suffix.  */
-       input_idx_t next;
-
-       /* Length of longest common prefix between the suffix corresponding to
-        * this structure and the suffix with rank @prev, or 0 if @prev is -1.
-        */
-       input_idx_t lcpprev;
-
-       /* Length of longest common prefix between the suffix corresponding to
-        * this structure and the suffix with rank @next, or 0 if @next is -1.
-        */
-       input_idx_t lcpnext;
-};
-
  /* State of the LZX compressor.  */
  struct lzx_compressor {
  
  /* State of the LZX compressor.  */
  struct lzx_compressor {
  
@@ -407,29 +368,8 @@ struct lzx_compressor {
         /* Fast algorithm only:  Array of hash table links.  */
         input_idx_t *prev_tab;
  
         /* Fast algorithm only:  Array of hash table links.  */
         input_idx_t *prev_tab;
  
-       /* Suffix array for window.
-        * This is a mapping from suffix rank to suffix position.  */
-       input_idx_t *SA;
-
-       /* Inverse suffix array for window.
-        * This is a mapping from suffix position to suffix rank.
-        * If 0 <= r < window_size, then ISA[SA[r]] == r.  */
-       input_idx_t *ISA;
-
-       /* Longest common prefix array corresponding to the suffix array SA.
-        * LCP[i] is the length of the longest common prefix between the
-        * suffixes with positions SA[i - 1] and  SA[i].  LCP[0] is undefined.
-        */
-       input_idx_t *LCP;
-
-       /* Suffix array links.
-        *
-        * During a linear scan of the input string to find matches, this array
-        * used to keep track of which rank suffixes in the suffix array appear
-        * before the current position.  Instead of searching in the original
-        * suffix array, scans for matches at a given position traverse a linked
-        * list containing only suffixes that appear before that position.  */
-       struct salink *salink;
+       /* Slow algorithm only: Suffix array match-finder.  */
+       struct lz_sarray lz_sarray;
  
         /* Position in window of next match to return.  */
         input_idx_t match_window_pos;
  
         /* Position in window of next match to return.  */
         input_idx_t match_window_pos;
@@ -1178,16 +1118,17 @@ lzx_match_cost(unsigned length, unsigned offset, const struct lzx_costs *costs,
  /* Fast heuristic cost evaluation to use in the inner loop of the match-finder.
   * Unlike lzx_match_cost() which does a true cost evaluation, this simply
   * prioritizes matches based on their offset.  */
  /* Fast heuristic cost evaluation to use in the inner loop of the match-finder.
   * Unlike lzx_match_cost() which does a true cost evaluation, this simply
   * prioritizes matches based on their offset.  */
-static block_cost_t
-lzx_match_cost_fast(unsigned offset, const struct lzx_lru_queue *queue)
+static input_idx_t
+lzx_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_queue)
  {
  {
+       const struct lzx_lru_queue *queue = _queue;
+
         /* It seems well worth it to take the time to give priority to recently
          * used offsets.  */
         /* It seems well worth it to take the time to give priority to recently
          * used offsets.  */
-       for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
+       for (input_idx_t i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
                 if (offset == queue->R[i])
                         return i;
  
                 if (offset == queue->R[i])
                         return i;
  
-       BUILD_BUG_ON(LZX_MAX_WINDOW_SIZE >= (block_cost_t)~0U);
         return offset;
  }
  
         return offset;
  }
  
@@ -1228,219 +1169,6 @@ lzx_set_costs(struct lzx_compressor * ctx, const struct lzx_lens * lens)
         }
  }
  
         }
  }
  
-/* Advance the suffix array match-finder to the next position.  */
-static void
-lzx_lz_update_salink(input_idx_t i,
-                    const input_idx_t SA[restrict],
-                    const input_idx_t ISA[restrict],
-                    struct salink link[restrict])
-{
-       /* r = Rank of the suffix at the current position.  */
-       const input_idx_t r = ISA[i];
-
-       /* next = rank of LOWEST ranked suffix that is ranked HIGHER than the
-        * current suffix AND has a LOWER position, or -1 if none exists.  */
-       const input_idx_t next = link[r].next;
-
-       /* prev = rank of HIGHEST ranked suffix that is ranked LOWER than the
-        * current suffix AND has a LOWER position, or -1 if none exists.  */
-       const input_idx_t prev = link[r].prev;
-
-       /* Link the suffix at the current position into the linked list that
-        * contains all suffixes in the suffix array that appear at or
-        * before the current position, sorted by rank.
-        *
-        * Save the values of all fields we overwrite so that rollback is
-        * possible.  */
-       if (next != (input_idx_t)~0U) {
-
-               link[next].prev = r;
-               link[next].lcpprev = link[r].lcpnext;
-       }
-
-       if (prev != (input_idx_t)~0U) {
-
-               link[prev].next = r;
-               link[prev].lcpnext = link[r].lcpprev;
-       }
-}
-
-/*
- * Use the suffix array match-finder to retrieve a list of LZ matches at the
- * current position.
- *
- * [in]    @i          Current position in the window.
- * [in]    @SA         Suffix array for the window.
- * [in]    @ISA                Inverse suffix array for the window.
- * [inout] @link       Suffix array links used internally by the match-finder.
- * [out]   @matches    The (length, offset) pairs of the resulting matches will
- *                             be written here, sorted in decreasing order by
- *                             length.  All returned lengths will be unique.
- * [in]    @queue      Recently used match offsets, used when evaluating the
- *                             cost of matches.
- * [in]           @min_match_len       Minimum match length to return.
- * [in]           @max_matches_to_consider     Maximum number of matches to consider at
- *                                     the position.
- * [in]           @max_matches_to_return       Maximum number of matches to return.
- *
- * The return value is the number of matches found and written to @matches.
- */
-static unsigned
-lzx_lz_get_matches(const input_idx_t i,
-                  const input_idx_t SA[const restrict],
-                  const input_idx_t ISA[const restrict],
-                  struct salink link[const restrict],
-                  struct raw_match matches[const restrict],
-                  const struct lzx_lru_queue * const restrict queue,
-                  const unsigned min_match_len,
-                  const u32 max_matches_to_consider,
-                  const u32 max_matches_to_return)
-{
-       /* r = Rank of the suffix at the current position.  */
-       const input_idx_t r = ISA[i];
-
-       /* Prepare for searching the current position.  */
-       lzx_lz_update_salink(i, SA, ISA, link);
-
-       /* L = rank of next suffix to the left;
-        * R = rank of next suffix to the right;
-        * lenL = length of match between current position and the suffix with rank L;
-        * lenR = length of match between current position and the suffix with rank R.
-        *
-        * This is left and right relative to the rank of the current suffix.
-        * Since the suffixes in the suffix array are sorted, the longest
-        * matches are immediately to the left and right (using the linked list
-        * to ignore all suffixes that occur later in the window).  The match
-        * length decreases the farther left and right we go.  We shall keep the
-        * length on both sides in sync in order to choose the lowest-cost match
-        * of each length.
-        */
-       input_idx_t L = link[r].prev;
-       input_idx_t R = link[r].next;
-       input_idx_t lenL = link[r].lcpprev;
-       input_idx_t lenR = link[r].lcpnext;
-
-       /* nmatches = number of matches found so far.  */
-       unsigned nmatches = 0;
-
-       /* best_cost = cost of lowest-cost match found so far.
-        *
-        * We keep track of this so that we can ignore shorter matches that do
-        * not have lower costs than longer matches already found.
-        */
-       block_cost_t best_cost = INFINITE_BLOCK_COST;
-
-       /* count_remaining = maximum number of possible matches remaining to be
-        * considered.  */
-       u32 count_remaining = max_matches_to_consider;
-
-       /* pending = match currently being considered for a specific length.  */
-       struct raw_match pending;
-       block_cost_t pending_cost;
-
-       while (lenL >= min_match_len || lenR >= min_match_len)
-       {
-               pending.len = lenL;
-               pending_cost = INFINITE_BLOCK_COST;
-               block_cost_t cost;
-
-               /* Extend left.  */
-               if (lenL >= min_match_len && lenL >= lenR) {
-                       for (;;) {
-
-                               if (--count_remaining == 0)
-                                       goto out_save_pending;
-
-                               input_idx_t offset = i - SA[L];
-
-                               /* Save match if it has smaller cost.  */
-                               cost = lzx_match_cost_fast(offset, queue);
-                               if (cost < pending_cost) {
-                                       pending.offset = offset;
-                                       pending_cost = cost;
-                               }
-
-                               if (link[L].lcpprev < lenL) {
-                                       /* Match length decreased.  */
-
-                                       lenL = link[L].lcpprev;
-
-                                       /* Save the pending match unless the
-                                        * right side still may have matches of
-                                        * this length to be scanned, or if a
-                                        * previous (longer) match had lower
-                                        * cost.  */
-                                       if (pending.len > lenR) {
-                                               if (pending_cost < best_cost) {
-                                                       best_cost = pending_cost;
-                                                       matches[nmatches++] = pending;
-                                                       if (nmatches == max_matches_to_return)
-                                                               return nmatches;
-                                               }
-                                               pending.len = lenL;
-                                               pending_cost = INFINITE_BLOCK_COST;
-                                       }
-                                       if (lenL < min_match_len || lenL < lenR)
-                                               break;
-                               }
-                               L = link[L].prev;
-                       }
-               }
-
-               pending.len = lenR;
-
-               /* Extend right.  */
-               if (lenR >= min_match_len && lenR > lenL) {
-                       for (;;) {
-
-                               if (--count_remaining == 0)
-                                       goto out_save_pending;
-
-                               input_idx_t offset = i - SA[R];
-
-                               /* Save match if it has smaller cost.  */
-                               cost = lzx_match_cost_fast(offset, queue);
-                               if (cost < pending_cost) {
-                                       pending.offset = offset;
-                                       pending_cost = cost;
-                               }
-
-                               if (link[R].lcpnext < lenR) {
-                                       /* Match length decreased.  */
-
-                                       lenR = link[R].lcpnext;
-
-                                       /* Save the pending match unless a
-                                        * previous (longer) match had lower
-                                        * cost.  */
-                                       if (pending_cost < best_cost) {
-                                               matches[nmatches++] = pending;
-                                               best_cost = pending_cost;
-                                               if (nmatches == max_matches_to_return)
-                                                       return nmatches;
-                                       }
-
-                                       if (lenR < min_match_len || lenR <= lenL)
-                                               break;
-
-                                       pending.len = lenR;
-                                       pending_cost = INFINITE_BLOCK_COST;
-                               }
-                               R = link[R].next;
-                       }
-               }
-       }
-       goto out;
-
-out_save_pending:
-       if (pending_cost != INFINITE_BLOCK_COST)
-               matches[nmatches++] = pending;
-
-out:
-       return nmatches;
-}
-
-
  /* Tell the match-finder to skip the specified number of bytes (@n) in the
   * input.  */
  static void
  /* Tell the match-finder to skip the specified number of bytes (@n) in the
   * input.  */
  static void
@@ -1456,9 +1184,10 @@ lzx_lz_skip_bytes(struct lzx_compressor *ctx, unsigned n)
         } else {
                 while (n--) {
                         ctx->cached_matches[ctx->cached_matches_pos++].len = 0;
         } else {
                 while (n--) {
                         ctx->cached_matches[ctx->cached_matches_pos++].len = 0;
-                       lzx_lz_update_salink(ctx->match_window_pos++, ctx->SA,
-                                            ctx->ISA, ctx->salink);
+                       lz_sarray_skip_position(&ctx->lz_sarray);
+                       ctx->match_window_pos++;
                 }
                 }
+               LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos);
         }
  }
  
         }
  }
  
@@ -1481,24 +1210,11 @@ lzx_lz_get_matches_caching(struct lzx_compressor *ctx,
         if (ctx->matches_cached) {
                 num_matches = matches[-1].len;
         } else {
         if (ctx->matches_cached) {
                 num_matches = matches[-1].len;
         } else {
-               unsigned min_match_len = LZX_MIN_MATCH_LEN;
-               if (!ctx->params.alg_params.slow.use_len2_matches)
-                       min_match_len = max(min_match_len, 3);
-               const u32 max_search_depth = ctx->params.alg_params.slow.max_search_depth;
-               const u32 max_matches_per_pos = ctx->params.alg_params.slow.max_matches_per_pos;
-
-               if (unlikely(max_search_depth == 0 || max_matches_per_pos == 0))
-                       num_matches = 0;
-               else
-                       num_matches = lzx_lz_get_matches(ctx->match_window_pos,
-                                                        ctx->SA,
-                                                        ctx->ISA,
-                                                        ctx->salink,
-                                                        matches,
-                                                        queue,
-                                                        min_match_len,
-                                                        max_search_depth,
-                                                        max_matches_per_pos);
+               LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos);
+               num_matches = lz_sarray_get_matches(&ctx->lz_sarray,
+                                                   matches,
+                                                   lzx_match_cost_fast,
+                                                   queue);
                 matches[-1].len = num_matches;
         }
         ctx->cached_matches_pos += num_matches + 1;
                 matches[-1].len = num_matches;
         }
         ctx->cached_matches_pos += num_matches + 1;
@@ -1898,169 +1614,12 @@ lzx_optimize_blocks(struct lzx_compressor *ctx)
                 lzx_optimize_block(ctx, &ctx->block_specs[i], num_passes);
  }
  
                 lzx_optimize_block(ctx, &ctx->block_specs[i], num_passes);
  }
  
-/* Initialize the suffix array match-finder for the specified input.  */
-static void
-lzx_lz_init_matchfinder(const u8 T[const restrict],
-                       const input_idx_t n,
-                       input_idx_t SA[const restrict],
-                       input_idx_t ISA[const restrict],
-                       input_idx_t LCP[const restrict],
-                       struct salink link[const restrict],
-                       const unsigned max_match_len)
-{
-       /* Compute SA (Suffix Array).  */
-
-       {
-               /* ISA and link are used as temporary space.  */
-               BUILD_BUG_ON(LZX_MIN_WINDOW_SIZE * sizeof(ISA[0]) < 256 * sizeof(saidx_t));
-               BUILD_BUG_ON(LZX_MIN_WINDOW_SIZE * 2 * sizeof(link[0]) < 256 * 256 * sizeof(saidx_t));
-
-               if (sizeof(input_idx_t) == sizeof(saidx_t)) {
-                       divsufsort(T, SA, n, (saidx_t*)ISA, (saidx_t*)link);
-               } else {
-                       saidx_t sa[n];
-                       divsufsort(T, sa, n, (saidx_t*)ISA, (saidx_t*)link);
-                       for (input_idx_t i = 0; i < n; i++)
-                               SA[i] = sa[i];
-               }
-       }
-
-#ifdef ENABLE_LZX_DEBUG
-
-       LZX_ASSERT(n > 0);
-
-       /* Verify suffix array.  */
-       {
-               bool found[n];
-               ZERO_ARRAY(found);
-               for (input_idx_t r = 0; r < n; r++) {
-                       input_idx_t i = SA[r];
-                       LZX_ASSERT(i < n);
-                       LZX_ASSERT(!found[i]);
-                       found[i] = true;
-               }
-       }
-
-       for (input_idx_t r = 0; r < n - 1; r++) {
-
-               input_idx_t i1 = SA[r];
-               input_idx_t i2 = SA[r + 1];
-
-               input_idx_t n1 = n - i1;
-               input_idx_t n2 = n - i2;
-
-               LZX_ASSERT(memcmp(&T[i1], &T[i2], min(n1, n2)) <= 0);
-       }
-       LZX_DEBUG("Verified SA (len %u)", n);
-#endif /* ENABLE_LZX_DEBUG */
-
-       /* Compute ISA (Inverse Suffix Array)  */
-       for (input_idx_t r = 0; r < n; r++)
-               ISA[SA[r]] = r;
-
-       /* Compute LCP (longest common prefix) array.
-        *
-        * Algorithm adapted from Kasai et al. 2001: "Linear-Time
-        * Longest-Common-Prefix Computation in Suffix Arrays and Its
-        * Applications".  */
-       {
-               input_idx_t h = 0;
-               for (input_idx_t i = 0; i < n; i++) {
-                       input_idx_t r = ISA[i];
-                       if (r > 0) {
-                               input_idx_t j = SA[r - 1];
-
-                               input_idx_t lim = min(n - i, n - j);
-
-                               while (h < lim && T[i + h] == T[j + h])
-                                       h++;
-                               LCP[r] = h;
-                               if (h > 0)
-                                       h--;
-                       }
-               }
-       }
-
-#ifdef ENABLE_LZX_DEBUG
-       /* Verify LCP array.  */
-       for (input_idx_t r = 0; r < n - 1; r++) {
-               LZX_ASSERT(ISA[SA[r]] == r);
-               LZX_ASSERT(ISA[SA[r + 1]] == r + 1);
-
-               input_idx_t i1 = SA[r];
-               input_idx_t i2 = SA[r + 1];
-               input_idx_t lcp = LCP[r + 1];
-
-               input_idx_t n1 = n - i1;
-               input_idx_t n2 = n - i2;
-
-               LZX_ASSERT(lcp <= min(n1, n2));
-
-               LZX_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
-               if (lcp < min(n1, n2))
-                       LZX_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
-       }
-#endif /* ENABLE_LZX_DEBUG */
-
-       /* Compute salink.next and salink.lcpnext.
-        *
-        * Algorithm adapted from Crochemore et al. 2009:
-        * "LPF computation revisited".
-        *
-        * Note: we cap lcpnext to the maximum match length so that the
-        * match-finder need not worry about it later.  */
-       link[n - 1].next = (input_idx_t)~0U;
-       link[n - 1].prev = (input_idx_t)~0U;
-       link[n - 1].lcpnext = 0;
-       link[n - 1].lcpprev = 0;
-       for (input_idx_t r = n - 2; r != (input_idx_t)~0U; r--) {
-               input_idx_t t = r + 1;
-               input_idx_t l = LCP[t];
-               while (t != (input_idx_t)~0 && SA[t] > SA[r]) {
-                       l = min(l, link[t].lcpnext);
-                       t = link[t].next;
-               }
-               link[r].next = t;
-               link[r].lcpnext = min(l, max_match_len);
-               LZX_ASSERT(t == (input_idx_t)~0U || l <= n - SA[t]);
-               LZX_ASSERT(l <= n - SA[r]);
-               LZX_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
-       }
-
-       /* Compute salink.prev and salink.lcpprev.
-        *
-        * Algorithm adapted from Crochemore et al. 2009:
-        * "LPF computation revisited".
-        *
-        * Note: we cap lcpprev to the maximum match length so that the
-        * match-finder need not worry about it later.  */
-       link[0].prev = (input_idx_t)~0;
-       link[0].next = (input_idx_t)~0;
-       link[0].lcpprev = 0;
-       link[0].lcpnext = 0;
-       for (input_idx_t r = 1; r < n; r++) {
-               input_idx_t t = r - 1;
-               input_idx_t l = LCP[r];
-               while (t != (input_idx_t)~0 && SA[t] > SA[r]) {
-                       l = min(l, link[t].lcpprev);
-                       t = link[t].prev;
-               }
-               link[r].prev = t;
-               link[r].lcpprev = min(l, max_match_len);
-               LZX_ASSERT(t == (input_idx_t)~0 || l <= n - SA[t]);
-               LZX_ASSERT(l <= n - SA[r]);
-               LZX_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
-       }
-}
-
  /* Prepare the input window into one or more LZX blocks ready to be output.  */
  static void
  lzx_prepare_blocks(struct lzx_compressor * ctx)
  {
         /* Initialize the match-finder.  */
  /* Prepare the input window into one or more LZX blocks ready to be output.  */
  static void
  lzx_prepare_blocks(struct lzx_compressor * ctx)
  {
         /* Initialize the match-finder.  */
-       lzx_lz_init_matchfinder(ctx->window, ctx->window_size,
-                               ctx->SA, ctx->ISA, ctx->LCP, ctx->salink,
-                               LZX_MAX_MATCH_LEN);
+       lz_sarray_load_window(&ctx->lz_sarray, ctx->window, ctx->window_size);
         ctx->cached_matches_pos = 0;
         ctx->matches_cached = false;
         ctx->match_window_pos = 0;
         ctx->cached_matches_pos = 0;
         ctx->matches_cached = false;
         ctx->match_window_pos = 0;
@@ -2341,8 +1900,7 @@ lzx_free_compressor(void *_ctx)
                 FREE(ctx->chosen_matches);
                 FREE(ctx->cached_matches);
                 FREE(ctx->optimum);
                 FREE(ctx->chosen_matches);
                 FREE(ctx->cached_matches);
                 FREE(ctx->optimum);
-               FREE(ctx->salink);
-               FREE(ctx->SA);
+               lz_sarray_destroy(&ctx->lz_sarray);
                 FREE(ctx->block_specs);
                 FREE(ctx->prev_tab);
                 FREE(ctx->window);
                 FREE(ctx->block_specs);
                 FREE(ctx->prev_tab);
                 FREE(ctx->window);
@@ -2434,14 +1992,16 @@ lzx_create_compressor(size_t window_size,
                 goto oom;
  
         if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
                 goto oom;
  
         if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
-               ctx->SA = MALLOC(3U * window_size * sizeof(ctx->SA[0]));
-               if (ctx->SA == NULL)
-                       goto oom;
-               ctx->ISA = ctx->SA + window_size;
-               ctx->LCP = ctx->ISA + window_size;
+               unsigned min_match_len = LZX_MIN_MATCH_LEN;
+               if (!ctx->params.alg_params.slow.use_len2_matches)
+                       min_match_len = max(min_match_len, 3);
  
  
-               ctx->salink = MALLOC(window_size * sizeof(ctx->salink[0]));
-               if (ctx->salink == NULL)
+               if (!lz_sarray_init(&ctx->lz_sarray,
+                                   window_size,
+                                   min_match_len,
+                                   LZX_MAX_MATCH_LEN,
+                                   params->alg_params.slow.max_search_depth,
+                                   params->alg_params.slow.max_matches_per_pos))
                         goto oom;
         }
  
                         goto oom;
         }
  
diff --git a/src/xpress-compress.c b/src/xpress-compress.c

index 2637ce93cab8fa3de765b37fa6e0980431764043..b1638877f212f896e2adbad17034c8627c8af441 100644 (file)
--- a/src/xpress-compress.c
+++ b/src/xpress-compress.c
@@ -34,6 +34,7 @@
  #include "wimlib/compressor_ops.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/error.h"
  #include "wimlib/compressor_ops.h"
  #include "wimlib/compress_common.h"
  #include "wimlib/error.h"
+#include "wimlib/lz_hash.h"
  #include "wimlib/util.h"
  #include "wimlib/xpress.h"
  
  #include "wimlib/util.h"
  #include "wimlib/xpress.h"
  
diff --git a/tests/test-imagex b/tests/test-imagex

index 177cd2b3aa89a9553607108c7a88d2014f063647..4ae0540c532f6e5135320522d6694de7d59d4de0 100755 (executable)
--- a/tests/test-imagex
+++ b/tests/test-imagex
@@ -272,13 +272,13 @@ fi
  if ! diff -q -r tmp/dir tmp/myname || ! diff -q -r dir tmp/dir; then
         error "Recursive diff of applied WIM with original directory failed"
  fi
  if ! diff -q -r tmp/dir tmp/myname || ! diff -q -r dir tmp/dir; then
         error "Recursive diff of applied WIM with original directory failed"
  fi
-if test "`get_link_count tmp/dir/lz77.c`" != 1; then
+if test "`get_link_count tmp/dir/write.c`" != 1; then
         error "Incorrect link count on extracted file"
  fi
         error "Incorrect link count on extracted file"
  fi
-if test "`get_link_count tmp/myname/lz77.c`" != 1; then
+if test "`get_link_count tmp/myname/write.c`" != 1; then
         error "Incorrect link count on extracted file"
  fi
         error "Incorrect link count on extracted file"
  fi
-if test "`get_inode_number tmp/myname/lz77.c`" = "`get_inode_number tmp/dir/lz77.c`"; then
+if test "`get_inode_number tmp/myname/write.c`" = "`get_inode_number tmp/dir/write.c`"; then
         error "Incorrect inode number"
  fi
  rm -rf tmp
         error "Incorrect inode number"
  fi
  rm -rf tmp
@@ -289,13 +289,13 @@ fi
  if ! diff -q -r tmp/dir tmp/myname || ! diff -q -r dir tmp/dir; then
         error "Recursive diff of applied WIM with original directory failed"
  fi
  if ! diff -q -r tmp/dir tmp/myname || ! diff -q -r dir tmp/dir; then
         error "Recursive diff of applied WIM with original directory failed"
  fi
-if test "`get_link_count tmp/dir/lz77.c`" != 2; then
+if test "`get_link_count tmp/dir/write.c`" != 2; then
         error "Incorrect link count on extracted file"
  fi
         error "Incorrect link count on extracted file"
  fi
-if test "`get_link_count tmp/myname/lz77.c`" != 2; then
+if test "`get_link_count tmp/myname/write.c`" != 2; then
         error "Incorrect link count on extracted file"
  fi
         error "Incorrect link count on extracted file"
  fi
-if test "`get_inode_number tmp/myname/lz77.c`" != "`get_inode_number tmp/dir/lz77.c`"; then
+if test "`get_inode_number tmp/myname/write.c`" != "`get_inode_number tmp/dir/write.c`"; then
         error "Incorrect inode number"
  fi
  rm -rf tmp
         error "Incorrect inode number"
  fi
  rm -rf tmp
diff --git a/tests/test-imagex-mount b/tests/test-imagex-mount

index d5bfbcadb6912fbff212ec0887b60a3e0e675c9b..49585e8a7ec4d999ea078e5a2451ec2ed52162f0 100755 (executable)
--- a/tests/test-imagex-mount
+++ b/tests/test-imagex-mount
@@ -75,24 +75,24 @@ for flag in "--compress=none" "--compress=maximum" "--compress=fast"; do
                       "loaded, or you aren't a member of the FUSE group?"
         fi
         echo "Testing extracting file from mounted read-only WIM"
                       "loaded, or you aren't a member of the FUSE group?"
         fi
         echo "Testing extracting file from mounted read-only WIM"
-       if ! cp tmp/lz77.c lz77.c; then
+       if ! cp tmp/write.c write.c; then
                 error "Failed to extract file from read-only mounted WIM"
         fi
                 error "Failed to extract file from read-only mounted WIM"
         fi
-       if ! diff -q dir/lz77.c lz77.c; then
+       if ! diff -q dir/write.c write.c; then
                 error "Extracted file does not match copy in mounted WIM"
         fi
                 error "Extracted file does not match copy in mounted WIM"
         fi
-       if ! diff -q tmp/lz77.c dir/lz77.c; then
+       if ! diff -q tmp/write.c dir/write.c; then
                 error "Extractef file does not match original"
         fi
                 error "Extractef file does not match original"
         fi
-       rm -f lz77.c
+       rm -f write.c
         echo "Testing modifying mounted read-only WIM (should fail)"
         echo "Testing modifying mounted read-only WIM (should fail)"
-       if rm tmp/lz77.c; then
+       if rm tmp/write.c; then
                 error "Removing file from read-only mounted WIM didn't fail"
         fi
         if touch tmp/newfile; then
                 error "Creating file on read-only mounted WIM didn't fail"
         fi
                 error "Removing file from read-only mounted WIM didn't fail"
         fi
         if touch tmp/newfile; then
                 error "Creating file on read-only mounted WIM didn't fail"
         fi
-       if echo 3 > tmp/lz77.c; then
+       if echo 3 > tmp/write.c; then
                 error "Writing to file on read-only mounted WIM didn't fail"
         fi
         echo "Testing diff of mounted read-only WIM with original directory"
                 error "Writing to file on read-only mounted WIM didn't fail"
         fi
         echo "Testing diff of mounted read-only WIM with original directory"
@@ -137,10 +137,10 @@ echo "Testing removing file from mounted WIM"
  if ! imagex mountrw dir.wim dir tmp; then
         error "Failed to re-mount test WIM read-write"
  fi
  if ! imagex mountrw dir.wim dir tmp; then
         error "Failed to re-mount test WIM read-write"
  fi
-if ! rm tmp/lz77.c; then
+if ! rm tmp/write.c; then
         error "Failed to remove file from read-write mounted WIM"
  fi
         error "Failed to remove file from read-write mounted WIM"
  fi
-if test -f tmp/lz77.c; then
+if test -f tmp/write.c; then
         error "Removing file from read-write mounted WIM failed"
  fi
  echo "Testing making directory in mounted WIM"
         error "Removing file from read-write mounted WIM failed"
  fi
  echo "Testing making directory in mounted WIM"
author	Eric Biggers <ebiggers3@gmail.com>
	Mon, 30 Dec 2013 01:37:39 +0000 (19:37 -0600)
committer	Eric Biggers <ebiggers3@gmail.com>
	Wed, 1 Jan 2014 16:04:48 +0000 (10:04 -0600)
Makefile.am		patch \| blob \| history
README		patch \| blob \| history
include/wimlib/compress_common.h		patch \| blob \| history
include/wimlib/lz.h	[new file with mode: 0644]	patch \| blob
include/wimlib/lz_hash.h	[new file with mode: 0644]	patch \| blob
include/wimlib/lz_sarray.h	[new file with mode: 0644]	patch \| blob
src/lz_hash.c	[moved from src/lz77.c with 99% similarity]	patch \| blob \| history
src/lz_sarray.c	[new file with mode: 0644]	patch \| blob
src/lzms-compress.c		patch \| blob \| history
src/lzx-compress.c		patch \| blob \| history
src/xpress-compress.c		patch \| blob \| history
tests/test-imagex		patch \| blob \| history
tests/test-imagex-mount		patch \| blob \| history