src/lzms-common.c \
src/lzms-compress.c \
src/lzms-decompress.c \
- src/lz77.c \
+ src/lz_hash.c \
+ src/lz_sarray.c \
src/divsufsort/divsufsort.c \
src/divsufsort/divsufsort.h \
src/divsufsort/divsufsort_private.h \
include/wimlib/integrity.h \
include/wimlib/list.h \
include/wimlib/lookup_table.h \
+ include/wimlib/lz.h \
+ include/wimlib/lz_hash.h \
+ include/wimlib/lz_sarray.h \
include/wimlib/lzms.h \
include/wimlib/lzx.h \
include/wimlib/metadata.h \
(https://code.google.com/p/libdivsufsort/) and algorithms from 7-Zip as well as
several published papers.
-lz77.c contains a hash-table-based LZ77 matchfinder that is based on code from
-zlib but has been rewritten. This code is applicable to XPRESS, LZX, and LZMS,
-all of which are partly based on LZ77 compression.
+lz_hash.c contains a hash-table-based LZ77 matchfinder that is based on code
+from zlib but has been rewritten. This code is applicable to XPRESS, LZX, and
+LZMS, all of which are partly based on LZ77 compression.
A limited number of other free programs can handle some parts of the WIM
file format:
extern void
bitstream_put_byte(struct output_bitstream *ostream, u8 n);
-struct lz_params {
- unsigned min_match;
- unsigned max_match;
- unsigned max_offset;
- unsigned nice_match;
- unsigned good_match;
- unsigned max_chain_len;
- unsigned max_lazy_match;
- unsigned too_far;
-};
-
-typedef void (*lz_record_match_t)(unsigned len, unsigned offset, void *ctx);
-typedef void (*lz_record_literal_t)(u8 lit, void *ctx);
-
-extern void
-lz_analyze_block(const u8 window[restrict],
- input_idx_t window_size,
- lz_record_match_t record_match,
- lz_record_literal_t record_literal,
- void *record_ctx,
- const struct lz_params *params,
- input_idx_t prev_tab[restrict]);
-
extern void
make_canonical_huffman_code(unsigned num_syms,
unsigned max_codeword_len,
--- /dev/null
+#ifndef _WIMLIB_LZ_H
+#define _WIMLIB_LZ_H
+
+#include "wimlib/compress_common.h"
+
+//#define ENABLE_LZ_DEBUG
+#ifdef ENABLE_LZ_DEBUG
+# define LZ_DEBUG DEBUG
+# define LZ_ASSERT wimlib_assert
+# include "wimlib/assert.h"
+# include "wimlib/error.h"
+#else
+# define LZ_DEBUG(...)
+# define LZ_ASSERT(...)
+#endif
+
+
+/* Raw LZ match/literal format: just a length and offset.
+ *
+ * The length is the number of bytes of the match, and the offset is the number
+ * of bytes back in the input the match is from the current position.
+ *
+ * This can alternatively be used to represent a literal byte if @len is less
+ * than the minimum match length. */
+struct raw_match {
+ input_idx_t len;
+ input_idx_t offset;
+};
+
+#endif /* _WIMLIB_LZ_H */
--- /dev/null
+#ifndef _WIMLIB_LZ_HASH_H
+#define _WIMLIB_LZ_HASH_H
+
+#include "wimlib/compress_common.h"
+
+struct lz_params {
+ unsigned min_match;
+ unsigned max_match;
+ unsigned max_offset;
+ unsigned nice_match;
+ unsigned good_match;
+ unsigned max_chain_len;
+ unsigned max_lazy_match;
+ unsigned too_far;
+};
+
+typedef void (*lz_record_match_t)(unsigned len, unsigned offset, void *ctx);
+typedef void (*lz_record_literal_t)(u8 lit, void *ctx);
+
+extern void
+lz_analyze_block(const u8 window[restrict],
+ input_idx_t window_size,
+ lz_record_match_t record_match,
+ lz_record_literal_t record_literal,
+ void *record_ctx,
+ const struct lz_params *params,
+ input_idx_t prev_tab[restrict]);
+
+
+#endif /* _WIMLIB_LZ_HASH_H */
--- /dev/null
+/*
+ * lz_sarray.h
+ *
+ * Suffix array match-finder for LZ (Lempel-Ziv) compression.
+ */
+
+/*
+ * Copyright (C) 2013 Eric Biggers
+ *
+ * This file is part of wimlib, a library for working with WIM files.
+ *
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ */
+
+#ifndef _WIMLIB_LZ_SARRAY_H
+#define _WIMLIB_LZ_SARRAY_H
+
+#include "wimlib/compiler.h"
+#include "wimlib/lz.h"
+#include "wimlib/types.h"
+
+struct salink;
+
+/* Suffix array LZ (Lempel-Ziv) match-finder. */
+struct lz_sarray {
+ /* Allocated window size for the match-finder. */
+ input_idx_t max_window_size;
+
+ /* Minimum match length to return. */
+ input_idx_t min_match_len;
+
+ /* Maximum match length to return. */
+ input_idx_t max_match_len;
+
+ /* Maximum matches to consider at each position (max search depth). */
+ u32 max_matches_to_consider;
+
+ /* Maximum number of matches to return at each position. */
+ u32 max_matches_to_return;
+
+ /* Current position in the window */
+ input_idx_t cur_pos;
+
+ /* Current window size. */
+ input_idx_t window_size;
+
+ /* Suffix array for window.
+ * This is a mapping from suffix rank to suffix position. */
+ input_idx_t *SA;
+
+ /* Inverse suffix array for window.
+ * This is a mapping from suffix position to suffix rank.
+ * If 0 <= r < window_size, then ISA[SA[r]] == r. */
+ input_idx_t *ISA;
+
+ /* Longest common prefix array corresponding to the suffix array SA.
+ * LCP[i] is the length of the longest common prefix between the
+ * suffixes with positions SA[i - 1] and SA[i]. LCP[0] is undefined.
+ */
+ input_idx_t *LCP;
+
+ /* Suffix array links.
+ *
+ * During a linear scan of the input string to find matches, this array
+ * used to keep track of which rank suffixes in the suffix array appear
+ * before the current position. Instead of searching in the original
+ * suffix array, scans for matches at a given position traverse a linked
+ * list containing only suffixes that appear before that position. */
+ struct salink *salink;
+};
+
+/* Suffix array link */
+struct salink {
+ /* Rank of highest ranked suffix that has rank lower than the suffix
+ * corresponding to this structure and either has a lower position
+ * (initially) or has a position lower than the highest position at
+ * which matches have been searched for so far, or -1 if there is no
+ * such suffix. */
+ input_idx_t prev;
+
+ /* Rank of lowest ranked suffix that has rank greater than the suffix
+ * corresponding to this structure and either has a lower position
+ * (intially) or has a position lower than the highest position at which
+ * matches have been searched for so far, or -1 if there is no such
+ * suffix. */
+ input_idx_t next;
+
+ /* Length of longest common prefix between the suffix corresponding to
+ * this structure and the suffix with rank @prev, or 0 if @prev is -1.
+ */
+ input_idx_t lcpprev;
+
+ /* Length of longest common prefix between the suffix corresponding to
+ * this structure and the suffix with rank @next, or 0 if @next is -1.
+ */
+ input_idx_t lcpnext;
+};
+
+extern bool
+lz_sarray_init(struct lz_sarray *mf,
+ input_idx_t max_window_size,
+ input_idx_t min_match_len,
+ input_idx_t max_match_len,
+ u32 max_matches_to_consider,
+ u32 max_matches_to_return);
+
+extern void
+lz_sarray_destroy(struct lz_sarray *mf);
+
+extern void
+lz_sarray_load_window(struct lz_sarray *mf, const u8 window[],
+ input_idx_t window_size);
+
+static inline input_idx_t
+lz_sarray_get_pos(const struct lz_sarray *mf)
+{
+ return mf->cur_pos;
+}
+
+/* Advance the suffix array match-finder to the next position. */
+static _always_inline_attribute void
+lz_sarray_update_salink(const input_idx_t i,
+ const input_idx_t SA[const restrict],
+ const input_idx_t ISA[const restrict],
+ struct salink link[const restrict])
+{
+ /* r = Rank of the suffix at the current position. */
+ const input_idx_t r = ISA[i];
+
+ /* next = rank of LOWEST ranked suffix that is ranked HIGHER than the
+ * current suffix AND has a LOWER position, or -1 if none exists. */
+ const input_idx_t next = link[r].next;
+
+ /* prev = rank of HIGHEST ranked suffix that is ranked LOWER than the
+ * current suffix AND has a LOWER position, or -1 if none exists. */
+ const input_idx_t prev = link[r].prev;
+
+ /* Link the suffix at the current position into the linked list that
+ * contains all suffixes in the suffix array that are appear at or
+ * before the current position, sorted by rank.
+ *
+ * Save the values of all fields we overwrite so that rollback is
+ * possible. */
+ if (next != ~(input_idx_t)0) {
+
+ link[next].prev = r;
+ link[next].lcpprev = link[r].lcpnext;
+ }
+
+ if (prev != ~(input_idx_t)0) {
+
+ link[prev].next = r;
+ link[prev].lcpnext = link[r].lcpprev;
+ }
+}
+
+/* Skip the current position in the suffix array match-finder. */
+static _always_inline_attribute void
+lz_sarray_skip_position(struct lz_sarray *mf)
+{
+ LZ_ASSERT(mf->cur_pos < mf->window_size);
+ lz_sarray_update_salink(mf->cur_pos++, mf->SA, mf->ISA, mf->salink);
+}
+
+typedef input_idx_t lz_sarray_cost_t;
+#define LZ_SARRAY_INFINITE_COST (~(lz_sarray_cost_t)0)
+
+/*
+ * Use the suffix array match-finder to retrieve a list of LZ matches at the
+ * current position.
+ *
+ * Returns the number of matches written into @matches. The matches are
+ * returned in decreasing order by length, and each will be of unique length
+ * between the minimum and maximum match lengths passed to lz_sarray_init(). Up
+ * to @max_matches_to_return (passed to lz_sarray_init()) matches will be
+ * returned.
+ *
+ * @eval_match_cost is a function for evaluating the cost of a match when
+ * deciding which ones to return. It is only used for comparing matches of the
+ * same length. It needs to be fast, and need not be exact; an implementation
+ * might simply rank matches by their offset, for example, although
+ * implementations may choose to take into account additional information such
+ * as repeat offsets.
+ */
+static _always_inline_attribute u32
+lz_sarray_get_matches(struct lz_sarray *mf,
+ struct raw_match matches[],
+ lz_sarray_cost_t (*eval_match_cost)
+ (input_idx_t length,
+ input_idx_t offset,
+ const void *ctx),
+ const void *eval_match_cost_ctx)
+{
+ LZ_ASSERT(mf->cur_pos < mf->window_size);
+ const input_idx_t i = mf->cur_pos++;
+
+ const input_idx_t * const restrict SA = mf->SA;
+ const input_idx_t * const restrict ISA = mf->ISA;
+ struct salink * const restrict link = mf->salink;
+ const input_idx_t min_match_len = mf->min_match_len;
+ const u32 max_matches_to_consider = mf->max_matches_to_consider;
+ const u32 max_matches_to_return = mf->max_matches_to_return;
+
+ /* r = Rank of the suffix at the current position. */
+ const input_idx_t r = ISA[i];
+
+ /* Prepare for searching the current position. */
+ lz_sarray_update_salink(i, SA, ISA, link);
+
+ /* L = rank of next suffix to the left;
+ * R = rank of next suffix to the right;
+ * lenL = length of match between current position and the suffix with rank L;
+ * lenR = length of match between current position and the suffix with rank R.
+ *
+ * This is left and right relative to the rank of the current suffix.
+ * Since the suffixes in the suffix array are sorted, the longest
+ * matches are immediately to the left and right (using the linked list
+ * to ignore all suffixes that occur later in the window). The match
+ * length decreases the farther left and right we go. We shall keep the
+ * length on both sides in sync in order to choose the lowest-cost match
+ * of each length.
+ */
+ input_idx_t L = link[r].prev;
+ input_idx_t R = link[r].next;
+ input_idx_t lenL = link[r].lcpprev;
+ input_idx_t lenR = link[r].lcpnext;
+
+ /* nmatches = number of matches found so far. */
+ u32 nmatches = 0;
+
+ /* best_cost = cost of lowest-cost match found so far.
+ *
+ * We keep track of this so that we can ignore shorter matches that do
+ * not have lower costs than a longer matches already found.
+ */
+ lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST;
+
+ /* count_remaining = maximum number of possible matches remaining to be
+ * considered. */
+ u32 count_remaining = max_matches_to_consider;
+
+ /* pending = match currently being considered for a specific length. */
+ struct raw_match pending;
+ lz_sarray_cost_t pending_cost;
+
+ while (lenL >= min_match_len || lenR >= min_match_len)
+ {
+ pending.len = lenL;
+ pending_cost = LZ_SARRAY_INFINITE_COST;
+ lz_sarray_cost_t cost;
+
+ /* Extend left. */
+ if (lenL >= min_match_len && lenL >= lenR) {
+ for (;;) {
+
+ if (--count_remaining == 0)
+ goto out_save_pending;
+
+ input_idx_t offset = i - SA[L];
+
+ /* Save match if it has smaller cost. */
+ cost = (*eval_match_cost)(lenL, offset,
+ eval_match_cost_ctx);
+ if (cost < pending_cost) {
+ pending.offset = offset;
+ pending_cost = cost;
+ }
+
+ if (link[L].lcpprev < lenL) {
+ /* Match length decreased. */
+
+ lenL = link[L].lcpprev;
+
+ /* Save the pending match unless the
+ * right side still may have matches of
+ * this length to be scanned, or if a
+ * previous (longer) match had lower
+ * cost. */
+ if (pending.len > lenR) {
+ if (pending_cost < best_cost) {
+ best_cost = pending_cost;
+ matches[nmatches++] = pending;
+ if (nmatches == max_matches_to_return)
+ return nmatches;
+ }
+ pending.len = lenL;
+ pending_cost = LZ_SARRAY_INFINITE_COST;
+ }
+ if (lenL < min_match_len || lenL < lenR)
+ break;
+ }
+ L = link[L].prev;
+ }
+ }
+
+ pending.len = lenR;
+
+ /* Extend right. */
+ if (lenR >= min_match_len && lenR > lenL) {
+ for (;;) {
+
+ if (--count_remaining == 0)
+ goto out_save_pending;
+
+ input_idx_t offset = i - SA[R];
+
+ /* Save match if it has smaller cost. */
+ cost = (*eval_match_cost)(lenR,
+ offset,
+ eval_match_cost_ctx);
+ if (cost < pending_cost) {
+ pending.offset = offset;
+ pending_cost = cost;
+ }
+
+ if (link[R].lcpnext < lenR) {
+ /* Match length decreased. */
+
+ lenR = link[R].lcpnext;
+
+ /* Save the pending match unless a
+ * previous (longer) match had lower
+ * cost. */
+ if (pending_cost < best_cost) {
+ matches[nmatches++] = pending;
+ best_cost = pending_cost;
+ if (nmatches == max_matches_to_return)
+ return nmatches;
+ }
+
+ if (lenR < min_match_len || lenR <= lenL)
+ break;
+
+ pending.len = lenR;
+ pending_cost = LZ_SARRAY_INFINITE_COST;
+ }
+ R = link[R].next;
+ }
+ }
+ }
+ goto out;
+
+out_save_pending:
+ if (pending_cost != LZ_SARRAY_INFINITE_COST)
+ matches[nmatches++] = pending;
+
+out:
+ return nmatches;
+}
+
+#endif /* _WIMLIB_LZ_SARRAY_H */
/*
- * lz77.c
+ * lz_hash.c
*
* This file provides the code to analyze a buffer of uncompressed data for
* matches, as per the LZ77 algorithm. It uses a hash table to accelerate the
# include <config.h>
#endif
-#include "wimlib/compress_common.h"
+#include "wimlib/lz_hash.h"
#include "wimlib/util.h"
#include <string.h>
--- /dev/null
+/*
+ * lz_sarray.c
+ *
+ * Suffix array match-finder for LZ (Lempel-Ziv) compression.
+ */
+
+/*
+ * Copyright (C) 2013 Eric Biggers
+ *
+ * This file is part of wimlib, a library for working with WIM files.
+ *
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "wimlib/lz_sarray.h"
+#include "wimlib/util.h"
+#include "divsufsort/divsufsort.h"
+#include <string.h>
+
+/* Initialize the suffix array match-finder with the specified parameters.
+ *
+ * After initialization, it can be used for any number of input strings of
+ * length less than or equal to @max_window_size. */
+bool
+lz_sarray_init(struct lz_sarray *mf,
+ input_idx_t max_window_size,
+ input_idx_t min_match_len,
+ input_idx_t max_match_len,
+ u32 max_matches_to_consider,
+ u32 max_matches_to_return)
+{
+ mf->max_window_size = max_window_size;
+ mf->min_match_len = min_match_len;
+ mf->max_match_len = max_match_len;
+ mf->max_matches_to_consider = max_matches_to_consider;
+ mf->max_matches_to_return = max_matches_to_return;
+
+ mf->SA = MALLOC(3U * max_window_size * sizeof(mf->SA[0]));
+ if (mf->SA == NULL)
+ return false;
+
+ mf->salink = MALLOC(max_window_size * sizeof(mf->salink[0]));
+ if (mf->salink == NULL)
+ return false;
+
+ return true;
+}
+
+/* Free memory allocated for the suffix array match-finder. */
+void
+lz_sarray_destroy(struct lz_sarray *mf)
+{
+ FREE(mf->SA);
+ FREE(mf->salink);
+}
+
+/* Initialize the suffix array match-finder for the specified input. */
+void
+lz_sarray_load_window(struct lz_sarray *mf, const u8 window[],
+ input_idx_t window_size)
+{
+ /* Load variables */
+ const u8 * const restrict T = window;
+ const input_idx_t n = window_size;
+ const input_idx_t max_match_len = mf->max_match_len;
+ input_idx_t * const restrict SA = mf->SA;
+ input_idx_t * const restrict ISA = mf->ISA = SA + window_size;
+ input_idx_t * const restrict LCP = mf->LCP = ISA + window_size;
+ struct salink * const restrict link = mf->salink;
+
+ /* Compute SA (Suffix Array). */
+ {
+ /* ISA and link are used as temporary space. */
+ LZ_ASSERT(mf->max_window_size * sizeof(ISA[0]) >= 256 * sizeof(saidx_t));
+ LZ_ASSERT(mf->max_window_size * 2 * sizeof(link[0]) >= 256 * 256 * sizeof(saidx_t));
+
+ if (sizeof(input_idx_t) == sizeof(saidx_t)) {
+ divsufsort(T, SA, n, (saidx_t*)ISA, (saidx_t*)link);
+ } else {
+ saidx_t sa[n];
+ divsufsort(T, sa, n, (saidx_t*)ISA, (saidx_t*)link);
+ for (input_idx_t i = 0; i < n; i++)
+ SA[i] = sa[i];
+ }
+ }
+
+#ifdef ENABLE_LZ_DEBUG
+
+ LZ_ASSERT(n > 0);
+
+ /* Verify suffix array. */
+ {
+ bool found[n];
+ ZERO_ARRAY(found);
+ for (input_idx_t r = 0; r < n; r++) {
+ input_idx_t i = SA[r];
+ LZ_ASSERT(i < n);
+ LZ_ASSERT(!found[i]);
+ found[i] = true;
+ }
+ }
+
+ for (input_idx_t r = 0; r < n - 1; r++) {
+
+ input_idx_t i1 = SA[r];
+ input_idx_t i2 = SA[r + 1];
+
+ input_idx_t n1 = n - i1;
+ input_idx_t n2 = n - i2;
+
+ LZ_ASSERT(memcmp(&T[i1], &T[i2], min(n1, n2)) <= 0);
+ }
+ LZ_DEBUG("Verified SA (len %u)", n);
+#endif /* ENABLE_LZ_DEBUG */
+
+ /* Compute ISA (Inverse Suffix Array) */
+ for (input_idx_t r = 0; r < n; r++)
+ ISA[SA[r]] = r;
+
+ /* Compute LCP (longest common prefix) array.
+ *
+ * Algorithm adapted from Kasai et al. 2001: "Linear-Time
+ * Longest-Common-Prefix Computation in Suffix Arrays and Its
+ * Applications". */
+ {
+ input_idx_t h = 0;
+ for (input_idx_t i = 0; i < n; i++) {
+ input_idx_t r = ISA[i];
+ if (r > 0) {
+ input_idx_t j = SA[r - 1];
+
+ input_idx_t lim = min(n - i, n - j);
+
+ while (h < lim && T[i + h] == T[j + h])
+ h++;
+ LCP[r] = h;
+ if (h > 0)
+ h--;
+ }
+ }
+ }
+
+#ifdef ENABLE_LZ_DEBUG
+ /* Verify LCP array. */
+ for (input_idx_t r = 0; r < n - 1; r++) {
+ LZ_ASSERT(ISA[SA[r]] == r);
+ LZ_ASSERT(ISA[SA[r + 1]] == r + 1);
+
+ input_idx_t i1 = SA[r];
+ input_idx_t i2 = SA[r + 1];
+ input_idx_t lcp = LCP[r + 1];
+
+ input_idx_t n1 = n - i1;
+ input_idx_t n2 = n - i2;
+
+ LZ_ASSERT(lcp <= min(n1, n2));
+
+ LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
+ if (lcp < min(n1, n2))
+ LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
+ }
+#endif /* ENABLE_LZ_DEBUG */
+
+ /* Compute salink.next and salink.lcpnext.
+ *
+ * Algorithm adapted from Crochemore et al. 2009:
+ * "LPF computation revisited".
+ *
+ * Note: we cap lcpnext to the maximum match length so that the
+ * match-finder need not worry about it later. */
+ link[n - 1].next = ~(input_idx_t)0;
+ link[n - 1].prev = ~(input_idx_t)0;
+ link[n - 1].lcpnext = 0;
+ link[n - 1].lcpprev = 0;
+ for (input_idx_t r = n - 2; r != ~(input_idx_t)0; r--) {
+ input_idx_t t = r + 1;
+ input_idx_t l = LCP[t];
+ while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
+ l = min(l, link[t].lcpnext);
+ t = link[t].next;
+ }
+ link[r].next = t;
+ link[r].lcpnext = min(l, max_match_len);
+ LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
+ LZ_ASSERT(l <= n - SA[r]);
+ if (t == ~(input_idx_t)0)
+ LZ_ASSERT(l == 0);
+ else
+ LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
+ }
+
+ /* Compute salink.prev and salink.lcpprev.
+ *
+ * Algorithm adapted from Crochemore et al. 2009:
+ * "LPF computation revisited".
+ *
+ * Note: we cap lcpprev to the maximum match length so that the
+ * match-finder need not worry about it later. */
+ link[0].prev = ~(input_idx_t)0;
+ link[0].next = ~(input_idx_t)0;
+ link[0].lcpprev = 0;
+ link[0].lcpnext = 0;
+ for (input_idx_t r = 1; r < n; r++) {
+ input_idx_t t = r - 1;
+ input_idx_t l = LCP[r];
+ while (t != ~(input_idx_t)0 && SA[t] > SA[r]) {
+ l = min(l, link[t].lcpprev);
+ t = link[t].prev;
+ }
+ link[r].prev = t;
+ link[r].lcpprev = min(l, max_match_len);
+ LZ_ASSERT(t == ~(input_idx_t)0 || l <= n - SA[t]);
+ LZ_ASSERT(l <= n - SA[r]);
+ if (t == ~(input_idx_t)0)
+ LZ_ASSERT(l == 0);
+ else
+ LZ_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
+ }
+
+ mf->cur_pos = 0;
+ mf->window_size = n;
+}
#include "wimlib/compress_common.h"
#include "wimlib/endianness.h"
#include "wimlib/error.h"
+#include "wimlib/lz_hash.h"
#include "wimlib/lzms.h"
#include "wimlib/util.h"
#include "wimlib/compress_common.h"
#include "wimlib/endianness.h"
#include "wimlib/error.h"
+#include "wimlib/lz_hash.h"
+#include "wimlib/lz_sarray.h"
#include "wimlib/lzx.h"
#include "wimlib/util.h"
#include <pthread.h>
# include "wimlib/decompress_common.h"
#endif
-#include "divsufsort/divsufsort.h"
-
typedef u32 block_cost_t;
#define INFINITE_BLOCK_COST ((block_cost_t)~0U)
u32 data;
};
-/* Raw LZ match/literal format: just a length and offset.
- *
- * The length is the number of bytes of the match, and the offset is the number
- * of bytes back in the input the match is from the current position.
- *
- * If @len < LZX_MIN_MATCH_LEN, then it's really just a literal byte and @offset is
- * meaningless. */
-struct raw_match {
- u16 len;
- input_idx_t offset;
-};
-
/* Specification for an LZX block. */
struct lzx_block_spec {
struct lzx_lru_queue queue;
};
-/* Suffix array link */
-struct salink {
- /* Rank of highest ranked suffix that has rank lower than the suffix
- * corresponding to this structure and either has a lower position
- * (initially) or has a position lower than the highest position at
- * which matches have been searched for so far, or -1 if there is no
- * such suffix. */
- input_idx_t prev;
-
- /* Rank of lowest ranked suffix that has rank greater than the suffix
- * corresponding to this structure and either has a lower position
- * (intially) or has a position lower than the highest position at which
- * matches have been searched for so far, or -1 if there is no such
- * suffix. */
- input_idx_t next;
-
- /* Length of longest common prefix between the suffix corresponding to
- * this structure and the suffix with rank @prev, or 0 if @prev is -1.
- */
- input_idx_t lcpprev;
-
- /* Length of longest common prefix between the suffix corresponding to
- * this structure and the suffix with rank @next, or 0 if @next is -1.
- */
- input_idx_t lcpnext;
-};
-
/* State of the LZX compressor. */
struct lzx_compressor {
/* Fast algorithm only: Array of hash table links. */
input_idx_t *prev_tab;
- /* Suffix array for window.
- * This is a mapping from suffix rank to suffix position. */
- input_idx_t *SA;
-
- /* Inverse suffix array for window.
- * This is a mapping from suffix position to suffix rank.
- * If 0 <= r < window_size, then ISA[SA[r]] == r. */
- input_idx_t *ISA;
-
- /* Longest common prefix array corresponding to the suffix array SA.
- * LCP[i] is the length of the longest common prefix between the
- * suffixes with positions SA[i - 1] and SA[i]. LCP[0] is undefined.
- */
- input_idx_t *LCP;
-
- /* Suffix array links.
- *
- * During a linear scan of the input string to find matches, this array
- * used to keep track of which rank suffixes in the suffix array appear
- * before the current position. Instead of searching in the original
- * suffix array, scans for matches at a given position traverse a linked
- * list containing only suffixes that appear before that position. */
- struct salink *salink;
+ /* Slow algorithm only: Suffix array match-finder. */
+ struct lz_sarray lz_sarray;
/* Position in window of next match to return. */
input_idx_t match_window_pos;
/* Fast heuristic cost evaluation to use in the inner loop of the match-finder.
* Unlike lzx_match_cost() which does a true cost evaluation, this simply
* prioritize matches based on their offset. */
-static block_cost_t
-lzx_match_cost_fast(unsigned offset, const struct lzx_lru_queue *queue)
+static input_idx_t
+lzx_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_queue)
{
+ const struct lzx_lru_queue *queue = _queue;
+
/* It seems well worth it to take the time to give priority to recently
* used offsets. */
- for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
+ for (input_idx_t i = 0; i < LZX_NUM_RECENT_OFFSETS; i++)
if (offset == queue->R[i])
return i;
- BUILD_BUG_ON(LZX_MAX_WINDOW_SIZE >= (block_cost_t)~0U);
return offset;
}
}
}
-/* Advance the suffix array match-finder to the next position. */
-static void
-lzx_lz_update_salink(input_idx_t i,
- const input_idx_t SA[restrict],
- const input_idx_t ISA[restrict],
- struct salink link[restrict])
-{
- /* r = Rank of the suffix at the current position. */
- const input_idx_t r = ISA[i];
-
- /* next = rank of LOWEST ranked suffix that is ranked HIGHER than the
- * current suffix AND has a LOWER position, or -1 if none exists. */
- const input_idx_t next = link[r].next;
-
- /* prev = rank of HIGHEST ranked suffix that is ranked LOWER than the
- * current suffix AND has a LOWER position, or -1 if none exists. */
- const input_idx_t prev = link[r].prev;
-
- /* Link the suffix at the current position into the linked list that
- * contains all suffixes in the suffix array that are appear at or
- * before the current position, sorted by rank.
- *
- * Save the values of all fields we overwrite so that rollback is
- * possible. */
- if (next != (input_idx_t)~0U) {
-
- link[next].prev = r;
- link[next].lcpprev = link[r].lcpnext;
- }
-
- if (prev != (input_idx_t)~0U) {
-
- link[prev].next = r;
- link[prev].lcpnext = link[r].lcpprev;
- }
-}
-
-/*
- * Use the suffix array match-finder to retrieve a list of LZ matches at the
- * current position.
- *
- * [in] @i Current position in the window.
- * [in] @SA Suffix array for the window.
- * [in] @ISA Inverse suffix array for the window.
- * [inout] @link Suffix array links used internally by the match-finder.
- * [out] @matches The (length, offset) pairs of the resulting matches will
- * be written here, sorted in decreasing order by
- * length. All returned lengths will be unique.
- * [in] @queue Recently used match offsets, used when evaluating the
- * cost of matches.
- * [in] @min_match_len Minimum match length to return.
- * [in] @max_matches_to_consider Maximum number of matches to consider at
- * the position.
- * [in] @max_matches_to_return Maximum number of matches to return.
- *
- * The return value is the number of matches found and written to @matches.
- */
-static unsigned
-lzx_lz_get_matches(const input_idx_t i,
- const input_idx_t SA[const restrict],
- const input_idx_t ISA[const restrict],
- struct salink link[const restrict],
- struct raw_match matches[const restrict],
- const struct lzx_lru_queue * const restrict queue,
- const unsigned min_match_len,
- const u32 max_matches_to_consider,
- const u32 max_matches_to_return)
-{
- /* r = Rank of the suffix at the current position. */
- const input_idx_t r = ISA[i];
-
- /* Prepare for searching the current position. */
- lzx_lz_update_salink(i, SA, ISA, link);
-
- /* L = rank of next suffix to the left;
- * R = rank of next suffix to the right;
- * lenL = length of match between current position and the suffix with rank L;
- * lenR = length of match between current position and the suffix with rank R.
- *
- * This is left and right relative to the rank of the current suffix.
- * Since the suffixes in the suffix array are sorted, the longest
- * matches are immediately to the left and right (using the linked list
- * to ignore all suffixes that occur later in the window). The match
- * length decreases the farther left and right we go. We shall keep the
- * length on both sides in sync in order to choose the lowest-cost match
- * of each length.
- */
- input_idx_t L = link[r].prev;
- input_idx_t R = link[r].next;
- input_idx_t lenL = link[r].lcpprev;
- input_idx_t lenR = link[r].lcpnext;
-
- /* nmatches = number of matches found so far. */
- unsigned nmatches = 0;
-
- /* best_cost = cost of lowest-cost match found so far.
- *
- * We keep track of this so that we can ignore shorter matches that do
- * not have lower costs than a longer matches already found.
- */
- block_cost_t best_cost = INFINITE_BLOCK_COST;
-
- /* count_remaining = maximum number of possible matches remaining to be
- * considered. */
- u32 count_remaining = max_matches_to_consider;
-
- /* pending = match currently being considered for a specific length. */
- struct raw_match pending;
- block_cost_t pending_cost;
-
- while (lenL >= min_match_len || lenR >= min_match_len)
- {
- pending.len = lenL;
- pending_cost = INFINITE_BLOCK_COST;
- block_cost_t cost;
-
- /* Extend left. */
- if (lenL >= min_match_len && lenL >= lenR) {
- for (;;) {
-
- if (--count_remaining == 0)
- goto out_save_pending;
-
- input_idx_t offset = i - SA[L];
-
- /* Save match if it has smaller cost. */
- cost = lzx_match_cost_fast(offset, queue);
- if (cost < pending_cost) {
- pending.offset = offset;
- pending_cost = cost;
- }
-
- if (link[L].lcpprev < lenL) {
- /* Match length decreased. */
-
- lenL = link[L].lcpprev;
-
- /* Save the pending match unless the
- * right side still may have matches of
- * this length to be scanned, or if a
- * previous (longer) match had lower
- * cost. */
- if (pending.len > lenR) {
- if (pending_cost < best_cost) {
- best_cost = pending_cost;
- matches[nmatches++] = pending;
- if (nmatches == max_matches_to_return)
- return nmatches;
- }
- pending.len = lenL;
- pending_cost = INFINITE_BLOCK_COST;
- }
- if (lenL < min_match_len || lenL < lenR)
- break;
- }
- L = link[L].prev;
- }
- }
-
- pending.len = lenR;
-
- /* Extend right. */
- if (lenR >= min_match_len && lenR > lenL) {
- for (;;) {
-
- if (--count_remaining == 0)
- goto out_save_pending;
-
- input_idx_t offset = i - SA[R];
-
- /* Save match if it has smaller cost. */
- cost = lzx_match_cost_fast(offset, queue);
- if (cost < pending_cost) {
- pending.offset = offset;
- pending_cost = cost;
- }
-
- if (link[R].lcpnext < lenR) {
- /* Match length decreased. */
-
- lenR = link[R].lcpnext;
-
- /* Save the pending match unless a
- * previous (longer) match had lower
- * cost. */
- if (pending_cost < best_cost) {
- matches[nmatches++] = pending;
- best_cost = pending_cost;
- if (nmatches == max_matches_to_return)
- return nmatches;
- }
-
- if (lenR < min_match_len || lenR <= lenL)
- break;
-
- pending.len = lenR;
- pending_cost = INFINITE_BLOCK_COST;
- }
- R = link[R].next;
- }
- }
- }
- goto out;
-
-out_save_pending:
- if (pending_cost != INFINITE_BLOCK_COST)
- matches[nmatches++] = pending;
-
-out:
- return nmatches;
-}
-
-
/* Tell the match-finder to skip the specified number of bytes (@n) in the
* input. */
static void
} else {
while (n--) {
ctx->cached_matches[ctx->cached_matches_pos++].len = 0;
- lzx_lz_update_salink(ctx->match_window_pos++, ctx->SA,
- ctx->ISA, ctx->salink);
+ lz_sarray_skip_position(&ctx->lz_sarray);
+ ctx->match_window_pos++;
}
+ LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos);
}
}
if (ctx->matches_cached) {
num_matches = matches[-1].len;
} else {
- unsigned min_match_len = LZX_MIN_MATCH_LEN;
- if (!ctx->params.alg_params.slow.use_len2_matches)
- min_match_len = max(min_match_len, 3);
- const u32 max_search_depth = ctx->params.alg_params.slow.max_search_depth;
- const u32 max_matches_per_pos = ctx->params.alg_params.slow.max_matches_per_pos;
-
- if (unlikely(max_search_depth == 0 || max_matches_per_pos == 0))
- num_matches = 0;
- else
- num_matches = lzx_lz_get_matches(ctx->match_window_pos,
- ctx->SA,
- ctx->ISA,
- ctx->salink,
- matches,
- queue,
- min_match_len,
- max_search_depth,
- max_matches_per_pos);
+ LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos);
+ num_matches = lz_sarray_get_matches(&ctx->lz_sarray,
+ matches,
+ lzx_match_cost_fast,
+ queue);
matches[-1].len = num_matches;
}
ctx->cached_matches_pos += num_matches + 1;
lzx_optimize_block(ctx, &ctx->block_specs[i], num_passes);
}
-/* Initialize the suffix array match-finder for the specified input. */
-static void
-lzx_lz_init_matchfinder(const u8 T[const restrict],
- const input_idx_t n,
- input_idx_t SA[const restrict],
- input_idx_t ISA[const restrict],
- input_idx_t LCP[const restrict],
- struct salink link[const restrict],
- const unsigned max_match_len)
-{
- /* Compute SA (Suffix Array). */
-
- {
- /* ISA and link are used as temporary space. */
- BUILD_BUG_ON(LZX_MIN_WINDOW_SIZE * sizeof(ISA[0]) < 256 * sizeof(saidx_t));
- BUILD_BUG_ON(LZX_MIN_WINDOW_SIZE * 2 * sizeof(link[0]) < 256 * 256 * sizeof(saidx_t));
-
- if (sizeof(input_idx_t) == sizeof(saidx_t)) {
- divsufsort(T, SA, n, (saidx_t*)ISA, (saidx_t*)link);
- } else {
- saidx_t sa[n];
- divsufsort(T, sa, n, (saidx_t*)ISA, (saidx_t*)link);
- for (input_idx_t i = 0; i < n; i++)
- SA[i] = sa[i];
- }
- }
-
-#ifdef ENABLE_LZX_DEBUG
-
- LZX_ASSERT(n > 0);
-
- /* Verify suffix array. */
- {
- bool found[n];
- ZERO_ARRAY(found);
- for (input_idx_t r = 0; r < n; r++) {
- input_idx_t i = SA[r];
- LZX_ASSERT(i < n);
- LZX_ASSERT(!found[i]);
- found[i] = true;
- }
- }
-
- for (input_idx_t r = 0; r < n - 1; r++) {
-
- input_idx_t i1 = SA[r];
- input_idx_t i2 = SA[r + 1];
-
- input_idx_t n1 = n - i1;
- input_idx_t n2 = n - i2;
-
- LZX_ASSERT(memcmp(&T[i1], &T[i2], min(n1, n2)) <= 0);
- }
- LZX_DEBUG("Verified SA (len %u)", n);
-#endif /* ENABLE_LZX_DEBUG */
-
- /* Compute ISA (Inverse Suffix Array) */
- for (input_idx_t r = 0; r < n; r++)
- ISA[SA[r]] = r;
-
- /* Compute LCP (longest common prefix) array.
- *
- * Algorithm adapted from Kasai et al. 2001: "Linear-Time
- * Longest-Common-Prefix Computation in Suffix Arrays and Its
- * Applications". */
- {
- input_idx_t h = 0;
- for (input_idx_t i = 0; i < n; i++) {
- input_idx_t r = ISA[i];
- if (r > 0) {
- input_idx_t j = SA[r - 1];
-
- input_idx_t lim = min(n - i, n - j);
-
- while (h < lim && T[i + h] == T[j + h])
- h++;
- LCP[r] = h;
- if (h > 0)
- h--;
- }
- }
- }
-
-#ifdef ENABLE_LZX_DEBUG
- /* Verify LCP array. */
- for (input_idx_t r = 0; r < n - 1; r++) {
- LZX_ASSERT(ISA[SA[r]] == r);
- LZX_ASSERT(ISA[SA[r + 1]] == r + 1);
-
- input_idx_t i1 = SA[r];
- input_idx_t i2 = SA[r + 1];
- input_idx_t lcp = LCP[r + 1];
-
- input_idx_t n1 = n - i1;
- input_idx_t n2 = n - i2;
-
- LZX_ASSERT(lcp <= min(n1, n2));
-
- LZX_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0);
- if (lcp < min(n1, n2))
- LZX_ASSERT(T[i1 + lcp] != T[i2 + lcp]);
- }
-#endif /* ENABLE_LZX_DEBUG */
-
- /* Compute salink.next and salink.lcpnext.
- *
- * Algorithm adapted from Crochemore et al. 2009:
- * "LPF computation revisited".
- *
- * Note: we cap lcpnext to the maximum match length so that the
- * match-finder need not worry about it later. */
- link[n - 1].next = (input_idx_t)~0U;
- link[n - 1].prev = (input_idx_t)~0U;
- link[n - 1].lcpnext = 0;
- link[n - 1].lcpprev = 0;
- for (input_idx_t r = n - 2; r != (input_idx_t)~0U; r--) {
- input_idx_t t = r + 1;
- input_idx_t l = LCP[t];
- while (t != (input_idx_t)~0 && SA[t] > SA[r]) {
- l = min(l, link[t].lcpnext);
- t = link[t].next;
- }
- link[r].next = t;
- link[r].lcpnext = min(l, max_match_len);
- LZX_ASSERT(t == (input_idx_t)~0U || l <= n - SA[t]);
- LZX_ASSERT(l <= n - SA[r]);
- LZX_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
- }
-
- /* Compute salink.prev and salink.lcpprev.
- *
- * Algorithm adapted from Crochemore et al. 2009:
- * "LPF computation revisited".
- *
- * Note: we cap lcpprev to the maximum match length so that the
- * match-finder need not worry about it later. */
- link[0].prev = (input_idx_t)~0;
- link[0].next = (input_idx_t)~0;
- link[0].lcpprev = 0;
- link[0].lcpnext = 0;
- for (input_idx_t r = 1; r < n; r++) {
- input_idx_t t = r - 1;
- input_idx_t l = LCP[r];
- while (t != (input_idx_t)~0 && SA[t] > SA[r]) {
- l = min(l, link[t].lcpprev);
- t = link[t].prev;
- }
- link[r].prev = t;
- link[r].lcpprev = min(l, max_match_len);
- LZX_ASSERT(t == (input_idx_t)~0 || l <= n - SA[t]);
- LZX_ASSERT(l <= n - SA[r]);
- LZX_ASSERT(memcmp(&T[SA[r]], &T[SA[t]], l) == 0);
- }
-}
-
/* Prepare the input window into one or more LZX blocks ready to be output. */
static void
lzx_prepare_blocks(struct lzx_compressor * ctx)
{
/* Initialize the match-finder. */
- lzx_lz_init_matchfinder(ctx->window, ctx->window_size,
- ctx->SA, ctx->ISA, ctx->LCP, ctx->salink,
- LZX_MAX_MATCH_LEN);
+ lz_sarray_load_window(&ctx->lz_sarray, ctx->window, ctx->window_size);
ctx->cached_matches_pos = 0;
ctx->matches_cached = false;
ctx->match_window_pos = 0;
FREE(ctx->chosen_matches);
FREE(ctx->cached_matches);
FREE(ctx->optimum);
- FREE(ctx->salink);
- FREE(ctx->SA);
+ lz_sarray_destroy(&ctx->lz_sarray);
FREE(ctx->block_specs);
FREE(ctx->prev_tab);
FREE(ctx->window);
goto oom;
if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) {
- ctx->SA = MALLOC(3U * window_size * sizeof(ctx->SA[0]));
- if (ctx->SA == NULL)
- goto oom;
- ctx->ISA = ctx->SA + window_size;
- ctx->LCP = ctx->ISA + window_size;
+ unsigned min_match_len = LZX_MIN_MATCH_LEN;
+ if (!ctx->params.alg_params.slow.use_len2_matches)
+ min_match_len = max(min_match_len, 3);
- ctx->salink = MALLOC(window_size * sizeof(ctx->salink[0]));
- if (ctx->salink == NULL)
+ if (!lz_sarray_init(&ctx->lz_sarray,
+ window_size,
+ min_match_len,
+ LZX_MAX_MATCH_LEN,
+ params->alg_params.slow.max_search_depth,
+ params->alg_params.slow.max_matches_per_pos))
goto oom;
}
#include "wimlib/compressor_ops.h"
#include "wimlib/compress_common.h"
#include "wimlib/error.h"
+#include "wimlib/lz_hash.h"
#include "wimlib/util.h"
#include "wimlib/xpress.h"
if ! diff -q -r tmp/dir tmp/myname || ! diff -q -r dir tmp/dir; then
error "Recursive diff of applied WIM with original directory failed"
fi
-if test "`get_link_count tmp/dir/lz77.c`" != 1; then
+if test "`get_link_count tmp/dir/write.c`" != 1; then
error "Incorrect link count on extracted file"
fi
-if test "`get_link_count tmp/myname/lz77.c`" != 1; then
+if test "`get_link_count tmp/myname/write.c`" != 1; then
error "Incorrect link count on extracted file"
fi
-if test "`get_inode_number tmp/myname/lz77.c`" = "`get_inode_number tmp/dir/lz77.c`"; then
+if test "`get_inode_number tmp/myname/write.c`" = "`get_inode_number tmp/dir/write.c`"; then
error "Incorrect inode number"
fi
rm -rf tmp
if ! diff -q -r tmp/dir tmp/myname || ! diff -q -r dir tmp/dir; then
error "Recursive diff of applied WIM with original directory failed"
fi
-if test "`get_link_count tmp/dir/lz77.c`" != 2; then
+if test "`get_link_count tmp/dir/write.c`" != 2; then
error "Incorrect link count on extracted file"
fi
-if test "`get_link_count tmp/myname/lz77.c`" != 2; then
+if test "`get_link_count tmp/myname/write.c`" != 2; then
error "Incorrect link count on extracted file"
fi
-if test "`get_inode_number tmp/myname/lz77.c`" != "`get_inode_number tmp/dir/lz77.c`"; then
+if test "`get_inode_number tmp/myname/write.c`" != "`get_inode_number tmp/dir/write.c`"; then
error "Incorrect inode number"
fi
rm -rf tmp
"loaded, or you aren't a member of the FUSE group?"
fi
echo "Testing extracting file from mounted read-only WIM"
- if ! cp tmp/lz77.c lz77.c; then
+ if ! cp tmp/write.c write.c; then
error "Failed to extract file from read-only mounted WIM"
fi
- if ! diff -q dir/lz77.c lz77.c; then
+ if ! diff -q dir/write.c write.c; then
error "Extracted file does not match copy in mounted WIM"
fi
- if ! diff -q tmp/lz77.c dir/lz77.c; then
+ if ! diff -q tmp/write.c dir/write.c; then
error "Extractef file does not match original"
fi
- rm -f lz77.c
+ rm -f write.c
echo "Testing modifying mounted read-only WIM (should fail)"
- if rm tmp/lz77.c; then
+ if rm tmp/write.c; then
error "Removing file from read-only mounted WIM didn't fail"
fi
if touch tmp/newfile; then
error "Creating file on read-only mounted WIM didn't fail"
fi
- if echo 3 > tmp/lz77.c; then
+ if echo 3 > tmp/write.c; then
error "Writing to file on read-only mounted WIM didn't fail"
fi
echo "Testing diff of mounted read-only WIM with original directory"
if ! imagex mountrw dir.wim dir tmp; then
error "Failed to re-mount test WIM read-write"
fi
-if ! rm tmp/lz77.c; then
+if ! rm tmp/write.c; then
error "Failed to remove file from read-write mounted WIM"
fi
-if test -f tmp/lz77.c; then
+if test -f tmp/write.c; then
error "Removing file from read-write mounted WIM failed"
fi
echo "Testing making directory in mounted WIM"