From: Eric Biggers Date: Sat, 31 Jan 2015 23:51:23 +0000 (-0600) Subject: Suffix array based matchfinder updates X-Git-Tag: v1.8.0~27 X-Git-Url: https://wimlib.net/git/?p=wimlib;a=commitdiff_plain;h=1bfcf1e9daf6ebcf8ff24817f05ac88ba29b3f47 Suffix array based matchfinder updates - Move LCP-interval tree matchfinder to lcpit_matchfinder.c - Support buffer sizes > 2^25 in LCP-interval tree matchfinder - Reduce code duplication in LCP-interval tree routines - Remove linked suffix array matchfinder - Remove lz_mf matchfinder API - Update LZMS compressor to use new LCP-interval tree matchfinder routines --- diff --git a/Makefile.am b/Makefile.am index 79a91c4c..b6846759 100644 --- a/Makefile.am +++ b/Makefile.am @@ -58,12 +58,10 @@ libwim_la_SOURCES = \ src/integrity.c \ src/iterate_dir.c \ src/join.c \ + src/lcpit_matchfinder.c \ + src/lcpit_matchfinder_templates.h \ src/lookup_table.c \ - src/lz_lcp_interval_tree.c \ - src/lz_linked_suffix_array.c \ - src/lz_mf.c \ src/lz_repsearch.c \ - src/lz_suffix_array_utils.c \ src/lzms_common.c \ src/lzms_compress.c \ src/lzms_decompress.c \ @@ -122,14 +120,12 @@ libwim_la_SOURCES = \ include/wimlib/inode.h \ include/wimlib/inode_table.h \ include/wimlib/integrity.h \ + include/wimlib/lcpit_matchfinder.h include/wimlib/list.h \ include/wimlib/lookup_table.h \ include/wimlib/lz_extend.h \ include/wimlib/lz_hash.h \ - include/wimlib/lz_mf.h \ - include/wimlib/lz_mf_ops.h \ include/wimlib/lz_repsearch.h \ - include/wimlib/lz_suffix_array_utils.h \ include/wimlib/lzms_common.h \ include/wimlib/lzms_constants.h \ include/wimlib/lzx_common.h \ diff --git a/include/wimlib/divsufsort.h b/include/wimlib/divsufsort.h index 287c012d..8ec81a9f 100644 --- a/include/wimlib/divsufsort.h +++ b/include/wimlib/divsufsort.h @@ -4,9 +4,8 @@ #include "wimlib/types.h" extern void -divsufsort(const u8 *T, u32 *SA, u32 n, u32 *bucket_A, u32 *bucket_B); +divsufsort(const u8 *T, u32 *SA, u32 n, u32 *tmp); -#define DIVSUFSORT_TMP1_LEN (256) /* bucket_A */ -#define DIVSUFSORT_TMP2_LEN (256 * 256) /* bucket_B */ +#define DIVSUFSORT_TMP_LEN (256 + (256 * 256)) #endif /* _WIMLIB_DIVSUFSORT_H */ diff --git a/include/wimlib/lcpit_matchfinder.h b/include/wimlib/lcpit_matchfinder.h new file mode 100644 index 00000000..ffcd426b --- /dev/null +++ b/include/wimlib/lcpit_matchfinder.h @@ -0,0 +1,74 @@ +/* + * lcpit_matchfinder.h + * + * A match-finder for Lempel-Ziv compression based on bottom-up construction and + * traversal of the Longest Common Prefix (LCP) interval tree. + * + * Author: Eric Biggers + * Year: 2014, 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _LCPIT_MATCHFINDER_H +#define _LCPIT_MATCHFINDER_H + +#include "wimlib/types.h" + +struct lcpit_matchfinder { + + bool huge_mode; + + u32 cur_pos; + + /* Mapping: suffix index ("window position") => lcp-interval index */ + u32 *pos_data; + + /* Mapping: lcp-interval index => lcp-interval data + * + * Initially, the lcp-interval data for an lcp-interval contains that + * interval's lcp and superinterval index. + * + * After a lcp-interval is visited during match-finding, its + * lcp-interval data contains that interval's lcp and the position of + * the next suffix to consider as a match when matching against that + * lcp-interval. */ + union { + u32 *intervals; + u64 *intervals64; + }; + + /* The suffix array */ + u32 *SA; + + u32 min_match_len; + u32 nice_match_len; +}; + +struct lz_match { + u32 length; + u32 offset; +}; + +extern u64 +lcpit_matchfinder_get_needed_memory(size_t max_bufsize); + +extern bool +lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize, + u32 min_match_len, u32 nice_match_len); + +extern void +lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf); + +extern void +lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n); + +extern u32 +lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, + struct lz_match *matches); + +extern void +lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count); + +#endif /* _LCPIT_MATCHFINDER_H */ diff --git a/include/wimlib/lz_mf.h b/include/wimlib/lz_mf.h deleted file mode 100644 index 699c13c6..00000000 --- a/include/wimlib/lz_mf.h +++ /dev/null @@ -1,342 +0,0 @@ -/* - * lz_mf.h - * - * Interface for Lempel-Ziv match-finders. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Example usage of the match-finder API: - * - * ---------------------------------------------------------------------------- - * - * Fill in a 'struct lz_mf_params'. - * (Optional) Call lz_mf_params_valid() to validate the parameters. - * Call lz_mf_alloc() to allocate the match-finder. - * For each block of data to be compressed: - * Call lz_mf_load_window() to load the block into the match finder. - * While the block is not yet fully compressed: - * Call lz_mf_get_matches() to get matches at the current position. - * If matches were found: - * Output the longest match. - * Call lz_mf_skip_positions() to skip the remaining length of the match. - * Else: - * Output a literal. - * End If - * End While - * End For - * Call lz_mf_free() to free the match-finder. - * - * ---------------------------------------------------------------------------- - * - * That example did "greedy parsing" --- that is, always choosing the longest - * match at each position. However, this interface can be (and is intended to - * be) used for "optimal parsing" as well. It can also be used for in-between - * strategies such as "lazy parsing" and "flexible parsing". For the best - * performance try different match-finding algorithms and parameters to see what - * works best for your parsing strategy, and your typical data and block sizes. - */ - -/* - * TODO: this API is going to go away eventually. It has too much indirection - * and is not flexible enough. - */ - -#ifndef _WIMLIB_LZ_MF_H -#define _WIMLIB_LZ_MF_H - -#include "wimlib/types.h" - -/* When ENABLE_LZ_DEBUG is defined, we check all matches for correctness and - * perform other validations. Use for debugging only, as it slows things down - * significantly. */ - -//#define ENABLE_LZ_DEBUG -#ifdef ENABLE_LZ_DEBUG -# include -# include -# define LZ_ASSERT assert -#else -# define LZ_ASSERT(...) -#endif - -struct lz_mf; - -/* Representation of a Lempel-Ziv match. */ -struct lz_match { - - /* The number of bytes matched. */ - u32 len; - - /* The offset back from the current position that was matched. */ - u32 offset; -}; - -/* - * Specifies a match-finding algorithm. - */ -enum lz_mf_algo { - /* - * Longest Common Prefix Interval Tree match-finding algorithm. - * - * This is a suffix array-based algorithm. It works well on medium to - * large windows. However, due to an implementation detail, it is - * currently limited to a maximum window size of 33554432 bytes. - * - * The memory usage is 12 bytes per position. - */ - LZ_MF_LCP_INTERVAL_TREE, - - /* - * Linked Suffix Array match-finding algorithm. - * - * This can be used on very large windows. - * - * The memory usage is 14 bytes per position. - * - * Currently, this method usually performs slightly worse than the LCP - * interval tree algorithm. However, it can be used on windows - * exceeding the 33554432 byte limit of the LCP interval tree algorithm. - */ - LZ_MF_LINKED_SUFFIX_ARRAY, -}; - -/* Parameters for Lempel-Ziv match-finding. */ -struct lz_mf_params { - - /* - * The match-finding algorithm to use. This must be one of the 'enum - * lz_mf_algo' constants defined above. - */ - u32 algorithm; - - /* - * The maximum window size, in bytes, that shall be supported by the - * match-finder. This is the maximum size that can be passed to - * subsequent calls to lz_mf_load_window(). - * - * Note: this interface is intended to be used for block compression, so - * none of the match-finding algorithms support sliding windows. It's - * expected that the window for LZ match-finding simply be the block of - * data being compressed. - * - * Match-finders generally require an amount of memory proportional to - * this parameter. Use lz_mf_get_needed_memory() to query the needed - * memory size for a specific match-finding algorithm and maximum window - * size. - * - * This parameter cannot be 0; there is no default value. - * - * Match-finding algorithms may place additional restrictions on this - * parameter. However, currently only the LCP interval tree - * match-finding algorithm places such a restriction (it doesn't support - * windows larger than 33554432 bytes). - */ - u32 max_window_size; - - /* - * The minimum length, in bytes, of matches that can be produced by the - * match-finder (by a call to lz_mf_get_matches()). - * - * If this parameter is not 0, it must be 2 or greater. - * - * If this parameter is 0, the match-finding algorithm sets it to a - * default value. The default value will be at least 2 and at most 16. - */ - u32 min_match_len; - - /* - * The maximum length, in bytes, of matches that can be produced by the - * match-finder (by a call to lz_mf_get_matches()). - * - * If this parameter is not 0, it must be greater than or equal to - * @min_match_len, or the default value the match-finding algorithm - * selected for @min_match_len in the case that @min_match_len was - * specified as 0. - * - * If this parameter is 0, the match-finding algorithm sets it to a - * default value. In general, the caller must be prepared to handle - * arbitrarily long matches (up to the window size minus 1) in this - * case. - */ - u32 max_match_len; - - /* - * This value describes the maximum amount of work that the - * match-finding algorithm will do at each position. A typical value to - * use is 32. Higher values result in better matches and slower - * performance. - * - * If this parameter is 0, the match-finding algorithm sets it to a - * default value. - */ - u32 max_search_depth; - - /* - * This parameter defines the maximum match length to which the full - * algorithm will be applied. This can also be thought of as the length - * above which the algorithm will not try to search for additional - * matches. - * - * Usually, setting this parameter to a reasonable value (such as 24, - * 32, or 48) will speed up match-finding but will not hurt the - * compression ratio too much. This is because these settings of this - * parameter cause the match-finder to not waste too much time examining - * very long matches, which are already highly compressible. - * - * In addition, if the longest match exceeds this length, the - * match-finding algorithm will still report its full length. - * - * The linked suffix array match-finding algorithm ignores this - * parameter. - * - * If this parameter is 0, the match-finding algorithm sets it to a - * default value. - */ - u32 nice_match_len; -}; - -/* - * Lempel-Ziv match-finder operations structure. - * - * Match-finding algorithms must fill in all members. None can be left as 0 or - * NULL. - * - * Don't directly access any of the members outside of lz_mf.h and lz_mf.c. - * Instead, use the lz_mf_*() wrappers. - */ -struct lz_mf_ops { - bool (*params_valid)(const struct lz_mf_params *); - - u64 (*get_needed_memory)(u32 max_window_size); - - bool (*init)(struct lz_mf *); - - void (*load_window)(struct lz_mf *mf, const u8 *, u32); - - u32 (*get_matches)(struct lz_mf *, struct lz_match *); - - void (*skip_positions)(struct lz_mf *, u32); - - void (*destroy)(struct lz_mf *); - - size_t struct_size; -}; - -/* - * Lempel-Ziv match-finder structure. - * - * Match-finding algorithms must embed this structure inside a private - * structure. - * - * Don't directly access any of the members outside of lz_mf.h, lz_mf.c, and - * match-finding algorithms. Instead, use the lz_mf_*() wrappers. - */ -struct lz_mf { - struct lz_mf_params params; - struct lz_mf_ops ops; - const u8 *cur_window; - u32 cur_window_pos; - u32 cur_window_size; -}; - -extern bool -lz_mf_params_valid(const struct lz_mf_params *params); - -extern u64 -lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size); - -extern struct lz_mf * -lz_mf_alloc(const struct lz_mf_params *params); - -extern void -lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size); - -#ifdef ENABLE_LZ_DEBUG -extern u32 -lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches); -#else -/* See non-inline definition for comment */ -static inline u32 -lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches) -{ - return mf->ops.get_matches(mf, matches); -} -#endif - -#ifdef ENABLE_LZ_DEBUG -extern void -lz_mf_skip_positions(struct lz_mf *mf, u32 n); -#else -/* See non-inline definition for comment */ -static inline void -lz_mf_skip_positions(struct lz_mf *mf, u32 n) -{ - mf->ops.skip_positions(mf, n); -} -#endif - -extern void -lz_mf_free(struct lz_mf *mf); - -/* - * Returns the match-finder's current position in the window. - * - * The current position begins at 0. It increases by 1 when lz_mf_get_matches() - * is called, and by 'n' when lz_mf_skip_positions() is called. - * - * Note: The behavior is undefined if the match-finder is advanced beyond the - * end of the window. (If this happens in ENABLE_LZ_DEBUG mode, an assertion - * will be triggered.) - */ -static inline u32 -lz_mf_get_position(const struct lz_mf *mf) -{ - return mf->cur_window_pos; -} - -/* - * Returns the number of bytes remaining in the window. - */ -static inline u32 -lz_mf_get_bytes_remaining(const struct lz_mf *mf) -{ - return mf->cur_window_size - mf->cur_window_pos; -} - -/* - * Returns a pointer to the current window, offset by the current position. - * Equivalently, this returns a pointer to the byte sequence that the next call - * to lz_mf_get_matches() will match against. - */ -static inline const u8 * -lz_mf_get_window_ptr(const struct lz_mf *mf) -{ - return &mf->cur_window[mf->cur_window_pos]; -} - -#endif /* _WIMLIB_LZ_MF_H */ diff --git a/include/wimlib/lz_mf_ops.h b/include/wimlib/lz_mf_ops.h deleted file mode 100644 index 2186534d..00000000 --- a/include/wimlib/lz_mf_ops.h +++ /dev/null @@ -1,4 +0,0 @@ -#include "wimlib/lz_mf.h" - -extern const struct lz_mf_ops lz_lcp_interval_tree_ops; -extern const struct lz_mf_ops lz_linked_suffix_array_ops; diff --git a/include/wimlib/lz_suffix_array_utils.h b/include/wimlib/lz_suffix_array_utils.h deleted file mode 100644 index d4dcbe8c..00000000 --- a/include/wimlib/lz_suffix_array_utils.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _WIMLIB_LZ_SUFFIX_ARRAY_UTILS_H -#define _WIMLIB_LZ_SUFFIX_ARRAY_UTILS_H - -#include "wimlib/types.h" - -#define BUILD_SA_MIN_TMP_LEN (65536 + 256) - -extern void -build_SA(u32 *SA, const u8 *T, u32 n, u32 *tmp); - -extern void -build_ISA(u32 *ISA, const u32 *SA, u32 n); - -extern void -build_LCP(u32 *LCP, const u32 *SA, const u32 *ISA, const u8 *T, u32 n); - -#endif /* _WIMLIB_LZ_SUFFIX_ARRAY_UTILS_H */ diff --git a/src/divsufsort.c b/src/divsufsort.c index fe2a8ebc..67536956 100644 --- a/src/divsufsort.c +++ b/src/divsufsort.c @@ -29,9 +29,10 @@ #endif #include "wimlib/divsufsort.h" -#include "wimlib/lz_mf.h" #include "wimlib/util.h" +#define DIVSUFSORT_ASSERT(expr) + /*- Constants -*/ #define ALPHABET_SIZE 256 #define BUCKET_A_SIZE (ALPHABET_SIZE) @@ -61,26 +62,26 @@ #define STACK_PUSH(_a, _b, _c, _d)\ do {\ - LZ_ASSERT(ssize < STACK_SIZE);\ + DIVSUFSORT_ASSERT(ssize < STACK_SIZE);\ stack[ssize].a = (_a), stack[ssize].b = (_b),\ stack[ssize].c = (_c), stack[ssize++].d = (_d);\ } while(0) #define STACK_PUSH5(_a, _b, _c, _d, _e)\ do {\ - LZ_ASSERT(ssize < STACK_SIZE);\ + DIVSUFSORT_ASSERT(ssize < STACK_SIZE);\ stack[ssize].a = (_a), stack[ssize].b = (_b),\ stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ } while(0) #define STACK_POP(_a, _b, _c, _d)\ do {\ - LZ_ASSERT(0 <= ssize);\ + DIVSUFSORT_ASSERT(0 <= ssize);\ if(ssize == 0) { return; }\ (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ } while(0) #define STACK_POP5(_a, _b, _c, _d, _e)\ do {\ - LZ_ASSERT(0 <= ssize);\ + DIVSUFSORT_ASSERT(0 <= ssize);\ if(ssize == 0) { return; }\ (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ @@ -1528,9 +1529,9 @@ construct_SA(const unsigned char *T, int *SA, i <= j; --j) { if(0 < (s = *j)) { - LZ_ASSERT(T[s] == c1); - LZ_ASSERT(((s + 1) < n) && (T[s] <= T[s + 1])); - LZ_ASSERT(T[s - 1] <= T[s]); + DIVSUFSORT_ASSERT(T[s] == c1); + DIVSUFSORT_ASSERT(((s + 1) < n) && (T[s] <= T[s + 1])); + DIVSUFSORT_ASSERT(T[s - 1] <= T[s]); *j = ~s; c0 = T[--s]; if((0 < s) && (T[s - 1] > c0)) { s = ~s; } @@ -1538,10 +1539,10 @@ construct_SA(const unsigned char *T, int *SA, if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } k = SA + BUCKET_B(c2 = c0, c1); } - LZ_ASSERT(k < j); + DIVSUFSORT_ASSERT(k < j); *k-- = s; } else { - LZ_ASSERT(((s == 0) && (T[s] == c1)) || (s < 0)); + DIVSUFSORT_ASSERT(((s == 0) && (T[s] == c1)) || (s < 0)); *j = ~s; } } @@ -1555,17 +1556,17 @@ construct_SA(const unsigned char *T, int *SA, /* Scan the suffix array from left to right. */ for(i = SA, j = SA + n; i < j; ++i) { if(0 < (s = *i)) { - LZ_ASSERT(T[s - 1] >= T[s]); + DIVSUFSORT_ASSERT(T[s - 1] >= T[s]); c0 = T[--s]; if((s == 0) || (T[s - 1] < c0)) { s = ~s; } if(c0 != c2) { BUCKET_A(c2) = k - SA; k = SA + BUCKET_A(c2 = c0); } - LZ_ASSERT(i < k); + DIVSUFSORT_ASSERT(i < k); *k++ = s; } else { - LZ_ASSERT(s < 0); + DIVSUFSORT_ASSERT(s < 0); *i = ~s; } } @@ -1578,8 +1579,10 @@ construct_SA(const unsigned char *T, int *SA, /* XXX Modified from original: use provided temporary space instead of * allocating it. */ void -divsufsort(const u8 *T, u32 *SA, u32 n, u32 *bucket_A, u32 *bucket_B) +divsufsort(const u8 *T, u32 *SA, u32 n, u32 *tmp) { + u32 *bucket_A = tmp; + u32 *bucket_B = tmp + BUCKET_A_SIZE; u32 m; switch (n) { diff --git a/src/lcpit_matchfinder.c b/src/lcpit_matchfinder.c new file mode 100644 index 00000000..dd00979e --- /dev/null +++ b/src/lcpit_matchfinder.c @@ -0,0 +1,256 @@ +/* + * lcpit_matchfinder.h + * + * A match-finder for Lempel-Ziv compression based on bottom-up construction and + * traversal of the Longest Common Prefix (LCP) interval tree. + * + * Author: Eric Biggers + * Year: 2014, 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +#include +#include + +#include "wimlib/divsufsort.h" +#include "wimlib/lcpit_matchfinder.h" +#include "wimlib/util.h" + +#define LCP_BITS 6 +#define LCP_MASK ((1 << LCP_BITS) - 1) +#define LCP_MAX LCP_MASK +#define NORMAL_UNVISITED_TAG ((u32)1 << 31) +#define MAX_NORMAL_BUFSIZE ((u32)1 << (31 - LCP_BITS)) +#define HUGE_UNVISITED_TAG ((u64)1 << 63) +#define SA_and_LCP_LCP_SHIFT (32 - LCP_BITS) +#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1) + +/* + * Include the template header to define the functions build_LCP(), + * build_LCPIT(), and lcpit_advance_one_byte(). There are "normal" and "huge" + * versions of each function. The normal versions assume that a buffer position + * and LCP value can be packed into a 32-bit integer, whereas the huge versions + * assume that 64 bits is needed. + * + * Both versions cap LCP values to 6 bits. This limits the depth of the + * lcp-interval tree without hurting the compression ratio too much. Matches of + * length 63 are sufficiently long that the compression ratio doesn't change + * significantly if we choose one such match over another. + */ +#define HUGE_MODE 1 +#include "lcpit_matchfinder_templates.h" +#undef HUGE_MODE + +#define HUGE_MODE 0 +#include "lcpit_matchfinder_templates.h" +#undef HUGE_MODE + +/* + * Calculate the number of bytes of memory needed for the LCP-interval tree + * matchfinder. + * + * @max_bufsize - maximum buffer size to support + * + * Returns the number of bytes required. + */ +u64 +lcpit_matchfinder_get_needed_memory(size_t max_bufsize) +{ + u64 size = 0; + + /* pos_data (+1 is for prefetch) */ + size += ((u64)max_bufsize + 1) * sizeof(u32); + + /* intervals or intervals64 */ + size += max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) * + (max_bufsize <= MAX_NORMAL_BUFSIZE ? sizeof(u32) : sizeof(u64)); + + /* SA */ + size += (u64)max_bufsize * sizeof(u32); + + return size; +} + +/* + * Initialize the LCP-interval tree matchfinder. + * + * @mf - the matchfinder structure to initialize + * @max_bufsize - maximum buffer size to support + * @min_match_len - minimum match length in bytes + * @nice_match_len - only consider this many bytes of each match + * + * Returns true if successfully initialized; false if out of memory. + */ +bool +lcpit_matchfinder_init(struct lcpit_matchfinder *mf, size_t max_bufsize, + u32 min_match_len, u32 nice_match_len) +{ + if (lcpit_matchfinder_get_needed_memory(max_bufsize) > SIZE_MAX) + return false; + + mf->pos_data = MALLOC((max_bufsize + 1) * sizeof(u32)); + mf->intervals = MALLOC(max((u64)max_bufsize, DIVSUFSORT_TMP_LEN) * + (max_bufsize <= MAX_NORMAL_BUFSIZE ? + sizeof(u32) : sizeof(u64))); + mf->SA = MALLOC(max_bufsize * sizeof(u32)); + + if (!mf->pos_data || !mf->intervals || !mf->SA) { + lcpit_matchfinder_destroy(mf); + return false; + } + + mf->min_match_len = min_match_len; + mf->nice_match_len = min(nice_match_len, LCP_MAX); + return true; +} + +/* + * Build the suffix array SA for the specified byte array T of length n. + * + * The suffix array is a sorted array of the byte array's suffixes, represented + * by indices into the byte array. It can equivalently be viewed as a mapping + * from suffix rank to suffix position. + * + * To build the suffix array, we use libdivsufsort, which uses an + * induced-sorting-based algorithm. In practice, this seems to be the fastest + * suffix array construction algorithm currently available. + * + * References: + * + * Y. Mori. libdivsufsort, a lightweight suffix-sorting library. + * https://code.google.com/p/libdivsufsort/. + * + * G. Nong, S. Zhang, and W.H. Chan. 2009. Linear Suffix Array + * Construction by Almost Pure Induced-Sorting. Data Compression + * Conference, 2009. DCC '09. pp. 193 - 202. + * + * S.J. Puglisi, W.F. Smyth, and A. Turpin. 2007. A Taxonomy of Suffix + * Array Construction Algorithms. ACM Computing Surveys (CSUR) Volume 39 + * Issue 2, 2007 Article No. 4. + */ +static void +build_SA(u32 SA[], const u8 T[], u32 n, u32 *tmp) +{ + /* Note: divsufsort() needs temporary space --- one array with 256 + * spaces and one array with 65536 spaces. The implementation of + * divsufsort() has been modified from the original to use the provided + * temporary space instead of allocating its own, since we don't want to + * have to deal with malloc() failures here. */ + divsufsort(T, SA, n, tmp); +} + +/* + * Build the inverse suffix array ISA from the suffix array SA. + * + * Whereas the suffix array is a mapping from suffix rank to suffix position, + * the inverse suffix array is a mapping from suffix position to suffix rank. + */ +static void +build_ISA(u32 ISA[restrict], const u32 SA[restrict], u32 n) +{ + for (u32 r = 0; r < n; r++) + ISA[SA[r]] = r; +} + +/* + * Prepare the LCP-interval tree matchfinder for a new input buffer. + * + * @mf - the initialized matchfinder structure + * @T - the input buffer + * @n - size of the input buffer in bytes. This may be at most the max_bufsize + * with which lcpit_matchfinder_init() was called. + */ +void +lcpit_matchfinder_load_buffer(struct lcpit_matchfinder *mf, const u8 *T, u32 n) +{ + if (n == 0) + return; + + build_SA(mf->SA, T, n, mf->intervals); + build_ISA(mf->pos_data, mf->SA, n); + if (n <= MAX_NORMAL_BUFSIZE) { + /* "Normal" sized buffer */ + + /* Build LCP, packing it into ->SA */ + build_LCP_normal(mf->SA, mf->pos_data, T, n, + mf->min_match_len, mf->nice_match_len); + /* Prepare ->intervals and ->pos_data */ + build_LCPIT_normal(mf->SA, mf->intervals, mf->pos_data, n); + mf->huge_mode = false; + } else { + /* "Huge" sized buffer */ + + /* Build LCP in the second half of ->intervals64. It may be + * partially overwritten in build_LCPIT_huge(), but this is okay + * since each LCP entry is guaranteed to be consumed before it + * can possibly be overwritten. */ + build_LCP_huge(mf->intervals + n, mf->SA, mf->pos_data, T, n, + mf->min_match_len, mf->nice_match_len); + /* Prepare ->intervals64 and ->pos_data */ + build_LCPIT_huge(mf->SA, mf->intervals + n, mf->intervals64, + mf->pos_data, n); + mf->huge_mode = true; + } + mf->cur_pos = 0; /* starting at beginning of input buffer */ + mf->pos_data[n] = 0; /* safety entry for prefetch() overrun */ +} + +/* + * Retrieve a list of matches with the next position. + * + * The matches will be recorded in the @matches array, ordered by strictly + * decreasing length and strictly decreasing offset. + * + * The return value is the number of matches found and written to @matches. + * This can be any value in [0, nice_match_len - min_match_len + 1]. + * + * If the caller attempts to advance beyond the end of the input buffer, the + * behavior is undefined. + */ +u32 +lcpit_matchfinder_get_matches(struct lcpit_matchfinder *mf, + struct lz_match *matches) +{ + if (mf->huge_mode) + return lcpit_advance_one_byte_huge(mf, matches, true); + else + return lcpit_advance_one_byte_normal(mf, matches, true); +} + +/* + * Skip the next @count bytes (don't search for matches at them). @count is + * assumed to be > 0. + * + * If the caller attempts to advance beyond the end of the input buffer, the + * behavior is undefined. + */ +void +lcpit_matchfinder_skip_bytes(struct lcpit_matchfinder *mf, u32 count) +{ + if (mf->huge_mode) { + do { + lcpit_advance_one_byte_huge(mf, NULL, false); + } while (--count); + } else { + do { + lcpit_advance_one_byte_normal(mf, NULL, false); + } while (--count); + } +} + +/* + * Destroy an LCP-interval tree matchfinder that was previously initialized with + * lcpit_matchfinder_init(). + * + * If the struct has been zeroed out, this has no effect. + */ +void +lcpit_matchfinder_destroy(struct lcpit_matchfinder *mf) +{ + FREE(mf->pos_data); + FREE(mf->intervals); + FREE(mf->SA); + memset(mf, 0, sizeof(*mf)); +} diff --git a/src/lcpit_matchfinder_templates.h b/src/lcpit_matchfinder_templates.h new file mode 100644 index 00000000..ac9c8883 --- /dev/null +++ b/src/lcpit_matchfinder_templates.h @@ -0,0 +1,343 @@ +/* + * lcpit_matchfinder_templates.h + * + * This file is included by lcpit_matchfinder.c. + * + * Author: Eric Biggers + * Year: 2014, 2015 + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. + */ + +/* + * In normal mode, we can pack a buffer position and a LCP value into a 32-bit + * number. In huge mode, we can't. + */ +#if HUGE_MODE +# define GET_SA_ENTRY(r) (SA[r]) +# define GET_LCP_ENTRY(r) (LCP[r]) +# define SET_LCP_ENTRY(r, val) (LCP[r] = (val)) +# define UNVISITED_TAG HUGE_UNVISITED_TAG +#else +# define GET_SA_ENTRY(r) (SA_and_LCP[r] & SA_and_LCP_POS_MASK) +# define GET_LCP_ENTRY(r) (SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT) +# define SET_LCP_ENTRY(r, val) (SA_and_LCP[r] |= (val) << SA_and_LCP_LCP_SHIFT) +# define UNVISITED_TAG NORMAL_UNVISITED_TAG +#endif + +/* + * Build the LCP (Longest Common Prefix) array in linear time. + * + * LCP[r] will be the length of the longest common prefix between the suffixes + * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. + * + * Algorithm taken from Kasai et al. (2001), but modified slightly: + * + * - For decreased memory usage and improved memory locality, pack the two + * logically distinct SA and LCP arrays into a single array SA_and_LCP. + * + * - With bytes there is no realistic way to reserve a unique symbol for + * end-of-buffer, so use explicit checks for end-of-buffer. + * + * - If a LCP value is less than the minimum match length, then store 0. This + * avoids having to do comparisons against the minimum match length later. + * + * - If a LCP value is greater than the "nice match length", then store the + * "nice match length". This caps the number of bits needed to store each + * LCP value, and this caps the depth of the LCP-interval tree, without + * usually hurting the compression ratio too much. + * + * References: + * + * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in + * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. + */ +#if HUGE_MODE +static void +build_LCP_huge(u32 LCP[restrict], const u32 SA[restrict], const u32 ISA[restrict], + const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp) +#else +static void +build_LCP_normal(u32 SA_and_LCP[restrict], const u32 ISA[restrict], + const u8 T[restrict], u32 n, u32 min_lcp, u32 max_lcp) +#endif +{ + u32 h = 0; + for (u32 i = 0; i < n; i++) { + u32 r = ISA[i]; + if (r > 0) { + u32 j = GET_SA_ENTRY(r - 1); + u32 lim = min(n - i, n - j); + while (h < lim && T[i + h] == T[j + h]) + h++; + u32 stored_lcp = h; + if (stored_lcp < min_lcp) + stored_lcp = 0; + else if (stored_lcp > max_lcp) + stored_lcp = max_lcp; + SET_LCP_ENTRY(r, stored_lcp); + if (h > 0) + h--; + } + } +} + +/* + * Use the suffix array accompanied with the longest-common-prefix array --- in + * other words, the "enhanced suffix array" --- to simulate a bottom-up + * traversal of the corresponding suffix tree, or equivalently the "lcp-interval + * tree", as described in Abouelhoda et al. (2004). + * + * While doing the traversal, create a table 'intervals' that contains data for + * each lcp-interval, specifically the lcp value of that interval, and the index + * of the superinterval. + * + * Also while doing the traversal, create a table 'pos_data' that contains a + * mapping from suffix index to the deepest lcp-interval containing it. + * + * The result is that we will later be able to do match-finding at a given + * position by looking up that position in 'pos_data' to get the deepest + * lcp-interval containing the corresponding suffix, then proceeding to the + * superintervals. See lcpit_advance_one_byte() for more details. + * + * Note: We limit the depth of the lcp-interval tree by capping the lcp at + * LCP_MAX. This can cause a sub-tree of intervals with lcp greater than + * LCP_MAX to be collapsed into a single interval with lcp LCP_MAX. This avoids + * degenerate cases and does not hurt match-finding very much, since if we find + * a match of length LCP_MAX and extend it as far as possible, that's usually + * good enough because that region of the input must already be highly + * compressible. + * + * References: + * + * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees + * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2 + * Issue 1, March 2004, pp. 53-86. + * + * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization + * Using Less Time & Space. Mathematics in Computer Science June 2008, + * Volume 1, Issue 4, pp. 605-623. + * + * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix + * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th + * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. + */ +#if HUGE_MODE +static void +build_LCPIT_huge(const u32 SA[restrict], u32 LCP[], u64 intervals[], + u32 pos_data[restrict], u32 n) +#else +static void +build_LCPIT_normal(const u32 SA_and_LCP[restrict], u32 intervals[restrict], + u32 pos_data[restrict], u32 n) +#endif +{ + u32 next_interval_idx = 0; + u32 open_intervals[LCP_MAX + 1]; + u32 *top = open_intervals; + u32 prev_pos = GET_SA_ENTRY(0); + + /* The interval with lcp=0 covers the entire array. It remains open + * until the end. */ + *top = next_interval_idx; + intervals[next_interval_idx] = 0; + next_interval_idx++; + + for (u32 r = 1; r < n; r++) { + u32 next_pos = GET_SA_ENTRY(r); + u32 next_lcp = GET_LCP_ENTRY(r); + u32 top_lcp = intervals[*top]; + + if (next_lcp == top_lcp) { + /* continuing the deepest open interval */ + pos_data[prev_pos] = *top; + } else if (next_lcp > top_lcp) { + /* opening a new interval */ + intervals[next_interval_idx] = next_lcp; + *++top = next_interval_idx; + pos_data[prev_pos] = next_interval_idx; + next_interval_idx++; + } else { + /* closing the deepest open interval */ + pos_data[prev_pos] = *top; + for (;;) { + u32 closed_interval_idx = *top; + u32 superinterval_idx = *--top; + u32 superinterval_lcp = intervals[superinterval_idx]; + + if (next_lcp == superinterval_lcp) { + /* continuing the superinterval */ + intervals[closed_interval_idx] |= + (superinterval_idx << LCP_BITS) | + UNVISITED_TAG; + break; + } else if (next_lcp > superinterval_lcp) { + /* creating a new interval that is a + * superinterval of the one being + * closed, but still a subinterval of + * its superinterval */ + intervals[next_interval_idx] = next_lcp; + *++top = next_interval_idx; + intervals[closed_interval_idx] |= + (next_interval_idx << LCP_BITS) | + UNVISITED_TAG; + next_interval_idx++; + break; + } else { + /* also closing the superinterval */ + intervals[closed_interval_idx] |= + (superinterval_idx << LCP_BITS) | + UNVISITED_TAG; + } + } + } + prev_pos = next_pos; + } + + /* close any still-open intervals */ + pos_data[prev_pos] = *top; + while (top > open_intervals) { + u32 closed_interval_idx = *top; + u32 superinterval_idx = *--top; + intervals[closed_interval_idx] |= + (superinterval_idx << LCP_BITS) | UNVISITED_TAG; + } +} + +/* + * Advance the LCP-interval tree matchfinder by one byte. + * + * If @record_matches is true, then matches are recorded in the @matches array, + * and the return value is the number of matches found. Otherwise, @matches is + * ignored and the return value is always 0. + */ +static inline u32 +#if HUGE_MODE +lcpit_advance_one_byte_huge +#else +lcpit_advance_one_byte_normal +#endif +(struct lcpit_matchfinder *mf, struct lz_match * restrict matches, + bool record_matches) +{ + const u32 cur_pos = mf->cur_pos++; + u32 * const pos_data = mf->pos_data; +#if HUGE_MODE + u64 * const intervals = mf->intervals64; +#else + u32 * const intervals = mf->intervals; +#endif + u32 num_matches = 0; + u32 lcp, next_lcp; + u32 interval, next_interval; + u32 cur_match, next_match; + + /* Look up the deepest lcp-interval containing the current suffix. */ + interval = pos_data[cur_pos]; + + /* Prefetch the deepest lcp-interval containing the next suffix. */ + prefetch(&intervals[pos_data[cur_pos + 1]]); + + /* Since the current position is greater than any position previously + * searched, set the "lcp interval of the next match" for this suffix to + * 0. This is the index of the root interval, and this indicates that + * there is no next match. */ + pos_data[cur_pos] = 0; + + /* Ascend the lcp-interval tree until we reach an lcp-interval that has + * already been visited. */ + + while (intervals[interval] & UNVISITED_TAG) { + + /* Visiting this lcp-interval for the first time. Therefore, + * there are no matches with length equal to the lcp of this + * lcp-interval. */ + + /* Extract the LCP and superinterval reference. */ + + lcp = intervals[interval] & LCP_MASK; + + /* If the LCP is shorter than the minimum length of matches to + * be produced, we're done, since the LCP will only ever get + * shorter from here. This also prevents ascending above the + * root of the lcp-interval tree, since the root is guaranteed + * to be a 0-interval. */ + if (lcp == 0) + return 0; + + next_interval = (intervals[interval] & ~UNVISITED_TAG) >> LCP_BITS; + + /* Set the position of the most-recently-seen suffix within this + * lcp-interval. Since this is the first visitation of this + * lcp-interval, this is simply the current suffix. + * + * Note that this overwrites the superinterval reference which + * was previously included in this lcp-interval data slot. + * Further visitations of this lcp-interval will detect that it + * is already visited and will follow the chain of + * most-recently-seen suffixes rather than ascend the tree + * directly. */ + intervals[interval] = (cur_pos << LCP_BITS) | lcp; + + /* Ascend to the superinterval of this lcp-interval. */ + interval = next_interval; + } + + /* We've already visited the current lcp-interval. */ + + /* Extract the LCP of this lcp-interval. */ + lcp = intervals[interval] & LCP_MASK; + + /* Extract the current match for this lcp-interval. This usually is the + * most-recently-seen suffix within this lcp-interval, but it may be + * outdated. */ + cur_match = intervals[interval] >> LCP_BITS; + + for (;;) { + /* If the LCP is shorter than the minimum length of matches to + * be produced, we're done, since the LCP will only ever get + * shorter from here. This also prevents ascending above the + * root of the lcp-interval tree, since the root is guaranteed + * to be a 0-interval. */ + if (lcp == 0) + break; + + /* Advance the current match until the lcp of the *next* match + * is lower than the current lcp. When this is true we know + * that the current match is up to date (lowest offset / + * greatest position for that lcp). */ + + next_match = cur_match; + do { + next_interval = pos_data[next_match]; + next_lcp = intervals[next_interval] & LCP_MASK; + cur_match = next_match; + next_match = intervals[next_interval] >> LCP_BITS; + } while (next_lcp >= lcp); + + /* Link the current position into the match chain, discarding + * any skipped matches. */ + intervals[interval] = (cur_pos << LCP_BITS) | lcp; + pos_data[cur_match] = interval; + + if (record_matches) { + /* Record the match. */ + matches[num_matches].length = lcp; + matches[num_matches].offset = cur_pos - cur_match; + num_matches++; + } + + /* Advance to the next match. */ + interval = next_interval; + lcp = next_lcp; + cur_match = next_match; + } + return num_matches; +} + +#undef GET_SA_ENTRY +#undef GET_LCP_ENTRY +#undef SET_LCP_ENTRY +#undef UNVISITED_TAG diff --git a/src/lz_lcp_interval_tree.c b/src/lz_lcp_interval_tree.c deleted file mode 100644 index 5240a7f3..00000000 --- a/src/lz_lcp_interval_tree.c +++ /dev/null @@ -1,524 +0,0 @@ -/* - * lz_lcp_interval_tree.c - * - * A match-finder for Lempel-Ziv compression based on bottom-up construction and - * traversal of the Longest Common Prefix (LCP) interval tree. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_mf.h" -#include "wimlib/lz_suffix_array_utils.h" -#include "wimlib/util.h" - -/* - * To save space, we pack lcp (longest common prefix) and position values into - * 32-bit integers. Therefore, we must divide the 32 bits into lcp and position - * bits. 6 lcp bits seems to be a good value, since matches of length 64 are - * sufficiently long so that the compression ratio isn't hurt much by choosing - * one such match over another. We also use 1 bit to mark intervals as "not yet - * visited". This leaves 25 bits, which when used for position results in a - * maximum window size of 33554432 bytes. - */ -#define LZ_LCPIT_LCP_BITS 6 -#define LZ_LCPIT_LCP_MASK ((1 << LZ_LCPIT_LCP_BITS) - 1) -#define LZ_LCPIT_LCP_MAX LZ_LCPIT_LCP_MASK -#define LZ_LCPIT_POS_BITS (32 - 1 - LZ_LCPIT_LCP_BITS) -#define LZ_LCPIT_MAX_WINDOW_SIZE (1UL << LZ_LCPIT_POS_BITS) - -#define SA_and_LCP_LCP_SHIFT (32 - LZ_LCPIT_LCP_BITS) -#define SA_and_LCP_POS_MASK (((u32)1 << SA_and_LCP_LCP_SHIFT) - 1) - -struct lz_lcpit { - struct lz_mf base; - - u32 *mem; - - /* Mapping: lcp-interval index => lcp-interval data - * - * Initially, the lcp-interval data for an lcp-interval contains that - * interval's lcp and superinterval index. - * - * After a lcp-interval is visited during match-finding, its - * lcp-interval data contains that interval's lcp and the position of - * the next suffix to consider as a match when matching against that - * lcp-interval. */ - u32 *intervals; - - /* Mapping: suffix index ("window position") => lcp-interval index */ - u32 *pos_data; -}; - -/* - * Build the LCP (Longest Common Prefix) array in linear time. - * - * LCP[r] will be the length of the longest common prefix between the suffixes - * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. - * - * Algorithm taken from Kasai et al. (2001), but modified slightly: - * - * - For decreased memory usage and improved memory locality, pack the two - * logically distinct SA and LCP arrays into a single array SA_and_LCP. - * - * - With bytes there is no realistic way to reserve a unique symbol for - * end-of-buffer, so use explicit checks for end-of-buffer. - * - * - If a LCP value is less than the minimum match length, then store 0. This - * avoids having to do comparisons against the minimum match length later. - * - * - If a LCP value is greater than the "nice match length", then store the - * "nice match length". This caps the number of bits needed to store each - * LCP value, and this caps the depth of the LCP-interval tree, without - * usually hurting the compression ratio too much. - * - * References: - * - * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in - * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. - */ -static void -build_LCP_packed(u32 * const restrict SA_and_LCP, const u32 * const restrict ISA, - const u8 * const restrict T, const u32 n, - const u32 min_lcp, const u32 max_lcp) -{ - u32 h, i, r, j, lim, stored_lcp; - - h = 0; - for (i = 0; i < n; i++) { - r = ISA[i]; - if (r > 0) { - j = SA_and_LCP[r - 1] & SA_and_LCP_POS_MASK; - lim = min(n - i, n - j); - while (h < lim && T[i + h] == T[j + h]) - h++; - stored_lcp = h; - if (stored_lcp < min_lcp) - stored_lcp = 0; - else if (stored_lcp > max_lcp) - stored_lcp = max_lcp; - SA_and_LCP[r] |= stored_lcp << SA_and_LCP_LCP_SHIFT; - if (h > 0) - h--; - } - } -} - -/* - * Use the suffix array accompanied with the longest-common-prefix array --- in - * other words, the "enhanced suffix array" --- to simulate a bottom-up - * traversal of the corresponding suffix tree, or equivalently the "lcp-interval - * tree", as described in Abouelhoda et al. (2004). - * - * While doing the traversal, create a table 'intervals' that contains data for - * each lcp-interval, specifically the lcp value of that interval, and the index - * of the superinterval. - * - * Also while doing the traversal, create a table 'pos_data' that contains a - * mapping from suffix index to the deepest lcp-interval containing it. - * - * The result is that we will later be able to do match-finding at a given - * position by looking up that position in 'pos_data' to get the deepest - * lcp-interval containing the corresponding suffix, then proceeding to the - * superintervals. See lz_lcpit_get_matches() for more details. - * - * Note: We limit the depth of the lcp-interval tree by capping the lcp at - * LZ_LCPIT_LCP_MAX. This can cause a sub-tree of intervals with lcp greater - * than LZ_LCPIT_LCP_MAX to be collapsed into a single interval with lcp - * LZ_LCPIT_LCP_MAX. This avoids degenerate cases and does not hurt - * match-finding very much, since if we find a match of length LZ_LCPIT_LCP_MAX - * and extend it as far as possible, that's usually good enough because that - * region of the input must already be highly compressible. - * - * References: - * - * M.I. Abouelhoda, S. Kurtz, E. Ohlebusch. 2004. Replacing Suffix Trees - * With Enhanced Suffix Arrays. Journal of Discrete Algorithms Volume 2 - * Issue 1, March 2004, pp. 53-86. - * - * G. Chen, S.J. Puglisi, W.F. Smyth. 2008. Lempel-Ziv Factorization - * Using Less Time & Space. Mathematics in Computer Science June 2008, - * Volume 1, Issue 4, pp. 605-623. - * - * Kasai et al. Linear-Time Longest-Common-Prefix Computation in Suffix - * Arrays and Its Applications. 2001. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. - */ -static void -build_LCPIT(const u32 * const restrict SA_and_LCP, - u32 * const restrict intervals, u32 * const restrict pos_data, - const u32 n) -{ - u32 next_interval_idx = 0; - u32 open_intervals[LZ_LCPIT_LCP_MAX + 1]; - u32 *top = open_intervals; - u32 prev_pos = SA_and_LCP[0] & SA_and_LCP_POS_MASK; - - /* The interval with lcp=0 covers the entire array. It remains open - * until the end. */ - *top = next_interval_idx; - intervals[next_interval_idx] = 0; - next_interval_idx++; - - for (u32 r = 1; r < n; r++) { - u32 next_pos = SA_and_LCP[r] & SA_and_LCP_POS_MASK; - u32 next_lcp = SA_and_LCP[r] >> SA_and_LCP_LCP_SHIFT; - u32 top_lcp = intervals[*top]; - - if (next_lcp == top_lcp) { - /* continuing the deepest open interval */ - pos_data[prev_pos] = *top; - } else if (next_lcp > top_lcp) { - /* opening a new interval */ - intervals[next_interval_idx] = next_lcp; - *++top = next_interval_idx; - pos_data[prev_pos] = next_interval_idx; - next_interval_idx++; - } else { - /* closing the deepest open interval */ - pos_data[prev_pos] = *top; - for (;;) { - u32 closed_interval_idx = *top; - u32 superinterval_idx = *--top; - u32 superinterval_lcp = intervals[superinterval_idx]; - - if (next_lcp == superinterval_lcp) { - /* continuing the superinterval */ - intervals[closed_interval_idx] |= - (superinterval_idx << LZ_LCPIT_LCP_BITS) | - 0x80000000; - break; - } else if (next_lcp > superinterval_lcp) { - /* creating a new interval that is a - * superinterval of the one being - * closed, but still a subinterval of - * its superinterval */ - intervals[next_interval_idx] = next_lcp; - *++top = next_interval_idx; - intervals[closed_interval_idx] |= - (next_interval_idx << LZ_LCPIT_LCP_BITS) | - 0x80000000; - next_interval_idx++; - break; - } else { - /* also closing the superinterval */ - intervals[closed_interval_idx] |= - (superinterval_idx << LZ_LCPIT_LCP_BITS) | - 0x80000000; - } - } - } - prev_pos = next_pos; - } - - /* close any still-open intervals */ - pos_data[prev_pos] = *top; - while (top > open_intervals) { - u32 closed_interval_idx = *top; - u32 superinterval_idx = *--top; - intervals[closed_interval_idx] |= - (superinterval_idx << LZ_LCPIT_LCP_BITS) | 0x80000000; - } -} - -static void -lz_lcpit_set_default_params(struct lz_mf_params *params) -{ - if (params->min_match_len == 0) - params->min_match_len = 2; - - if (params->max_match_len == 0) - params->max_match_len = UINT32_MAX; - - if (params->max_search_depth == 0) - params->max_search_depth = 32; - - params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 8); - - if (params->nice_match_len == 0) - params->nice_match_len = LZ_LCPIT_LCP_MAX; - - if (params->nice_match_len < params->min_match_len) - params->nice_match_len = params->min_match_len; - - if (params->nice_match_len > params->max_match_len) - params->nice_match_len = params->max_match_len; - - if (params->nice_match_len > LZ_LCPIT_LCP_MAX) - params->nice_match_len = LZ_LCPIT_LCP_MAX; -} - -static bool -lz_lcpit_params_valid(const struct lz_mf_params *params) -{ - return params->max_window_size <= LZ_LCPIT_MAX_WINDOW_SIZE; -} - -static u64 -lz_lcpit_get_needed_memory(u32 max_window_size) -{ - return sizeof(u32) * (max_window_size + - max(BUILD_SA_MIN_TMP_LEN, - 2 * (u64)max_window_size)); -} - -static bool -lz_lcpit_init(struct lz_mf *_mf) -{ - struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - - lz_lcpit_set_default_params(&mf->base.params); - - mf->mem = MALLOC(lz_lcpit_get_needed_memory(mf->base.params.max_window_size)); - return (mf->mem != NULL); -} - -static void -lz_lcpit_load_window(struct lz_mf *_mf, const u8 T[], u32 n) -{ - struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - - build_SA(&mf->mem[0 * n], T, n, &mf->mem[1 * n]); - build_ISA(&mf->mem[2 * n], &mf->mem[0 * n], n); - build_LCP_packed(&mf->mem[0 * n], &mf->mem[2 * n], T, n, - mf->base.params.min_match_len, - mf->base.params.nice_match_len); - build_LCPIT(&mf->mem[0 * n], &mf->mem[1 * n], &mf->mem[2 * n], n); - mf->intervals = &mf->mem[1 * n]; - mf->pos_data = &mf->mem[2 * n]; -} - -static u32 -lz_lcpit_get_matches(struct lz_mf *_mf, struct lz_match matches[]) -{ - struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - const u32 cur_pos = mf->base.cur_window_pos; - u32 * const pos_data = mf->pos_data; - u32 * const intervals = mf->intervals; - u32 num_matches = 0; - u32 lcp, next_lcp; - u32 interval, next_interval; - u32 cur_match, next_match; - - /* Look up the deepest lcp-interval containing the current suffix. */ - interval = pos_data[cur_pos]; - - /* Since the current position is greater than any position previously - * searched, set the "lcp interval of the next match" for this suffix to - * 0. This is the index of the root interval, and this indicates that - * there is no next match. */ - pos_data[cur_pos] = 0; - - /* Ascend the lcp-interval tree until we reach an lcp-interval that has - * already been visited. */ - - while (intervals[interval] & 0x80000000) { - - /* Visiting this lcp-interval for the first time. Therefore, - * there are no Lempel-Ziv matches with length equal to the lcp - * of this lcp-interval. */ - - /* Extract the LCP and superinterval reference. */ - - lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; - - next_interval = (intervals[interval] & ~0x80000000) - >> LZ_LCPIT_LCP_BITS; - - /* If the LCP is shorter than the minimum length of matches to - * be produced, we're done, since the LCP will only ever get - * shorter from here. This also prevents ascending above the - * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval. */ - if (lcp == 0) - goto out; - - /* Set the position of the most-recently-seen suffix within this - * lcp-interval. Since this is the first visitation of this - * lcp-interval, this is simply the current suffix. - * - * Note that this overwrites the superinterval reference which - * was previously included in this lcp-interval data slot. - * Further visitations of this lcp-interval will detect that it - * is already visited and will follow the chain of - * most-recently-seen suffixes rather than ascend the tree - * directly. */ - intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp; - - /* Ascend to the superinterval of this lcp-interval. */ - interval = next_interval; - } - - /* We've already visited the current lcp-interval. */ - - /* Extract the LCP of this lcp-interval. */ - lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; - - /* Extract the current match for this lcp-interval. This usually is the - * most-recently-seen suffix within this lcp-interval, but it may be - * outdated. */ - cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS; - - for (;;) { - /* If the LCP is shorter than the minimum length of matches to - * be produced, we're done, since the LCP will only ever get - * shorter from here. This also prevents ascending above the - * root of the lcp-interval tree, since the root is guaranteed - * to be a 0-interval. */ - if (lcp == 0) - break; - - /* Advance the current match until the lcp of the *next* match - * is lower than the current lcp. When this is true we know - * that the current match is up to date (lowest offset / - * greatest position for that lcp). */ - - next_match = cur_match; - do { - next_interval = pos_data[next_match]; - next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK; - cur_match = next_match; - next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS; - } while (next_lcp >= lcp); - - /* Link the current position into the match chain, discarding - * any skipped matches. */ - intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp; - pos_data[cur_match] = interval; - - /* Record the match. */ - matches[num_matches++] = (struct lz_match) { - .len = lcp, - .offset = cur_pos - cur_match, - }; - - /* Bound the number of matches per position. */ - if (num_matches >= mf->base.params.max_search_depth) - break; - - /* Advance to the next match. */ - interval = next_interval; - lcp = next_lcp; - cur_match = next_match; - } - - /* If the length of the longest match is equal to the lcp limit, it may - * have been truncated. Try extending it up to the maximum match - * length. */ - if (num_matches && matches[0].len == mf->base.params.nice_match_len) { - const u8 * const strptr = lz_mf_get_window_ptr(&mf->base); - const u8 * const matchptr = strptr - matches[0].offset; - const u32 len_limit = min(lz_mf_get_bytes_remaining(&mf->base), - mf->base.params.max_match_len); - u32 len; - - len = matches[0].len; - while (len < len_limit && strptr[len] == matchptr[len]) - len++; - matches[0].len = len; - } - - for (u32 i = 0; i < num_matches / 2; i++) - swap(matches[i], matches[num_matches - 1 - i]); -out: - mf->base.cur_window_pos++; - return num_matches; -} - -/* Slightly simplified version of lz_lcpit_get_matches() for updating the data - * structures when we don't actually need matches at the current position. See - * lz_lcpit_get_matches() for explanatory comments. */ -static void -lz_lcpit_skip_position(struct lz_lcpit *mf) -{ - const u32 cur_pos = mf->base.cur_window_pos++; - u32 * const pos_data = mf->pos_data; - u32 * const intervals = mf->intervals; - u32 lcp, next_lcp; - u32 interval, next_interval; - u32 cur_match, next_match; - - interval = pos_data[cur_pos]; - pos_data[cur_pos] = 0; - while (intervals[interval] & 0x80000000) { - lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; - next_interval = (intervals[interval] & ~0x80000000) - >> LZ_LCPIT_LCP_BITS; - if (lcp == 0) - return; - intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp; - interval = next_interval; - } - lcp = intervals[interval] & LZ_LCPIT_LCP_MASK; - cur_match = intervals[interval] >> LZ_LCPIT_LCP_BITS; - while (lcp != 0) { - next_match = cur_match; - do { - next_interval = pos_data[next_match]; - next_lcp = intervals[next_interval] & LZ_LCPIT_LCP_MASK; - cur_match = next_match; - next_match = intervals[next_interval] >> LZ_LCPIT_LCP_BITS; - } while (next_lcp >= lcp); - intervals[interval] = (cur_pos << LZ_LCPIT_LCP_BITS) | lcp; - pos_data[cur_match] = interval; - interval = next_interval; - lcp = next_lcp; - cur_match = next_match; - } -} - -static void -lz_lcpit_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - - do { - lz_lcpit_skip_position(mf); - } while (--n); -} - -static void -lz_lcpit_destroy(struct lz_mf *_mf) -{ - struct lz_lcpit *mf = (struct lz_lcpit *)_mf; - - FREE(mf->mem); -} - -const struct lz_mf_ops lz_lcp_interval_tree_ops = { - .params_valid = lz_lcpit_params_valid, - .get_needed_memory = lz_lcpit_get_needed_memory, - .init = lz_lcpit_init, - .load_window = lz_lcpit_load_window, - .get_matches = lz_lcpit_get_matches, - .skip_positions = lz_lcpit_skip_positions, - .destroy = lz_lcpit_destroy, - .struct_size = sizeof(struct lz_lcpit), -}; diff --git a/src/lz_linked_suffix_array.c b/src/lz_linked_suffix_array.c deleted file mode 100644 index 2a0f3fd3..00000000 --- a/src/lz_linked_suffix_array.c +++ /dev/null @@ -1,726 +0,0 @@ -/* - * lz_linked_suffix_array.c - * - * Linked suffix array match-finder for Lempel-Ziv compression. - * - * Copyright (c) 2013, 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_mf.h" -#include "wimlib/lz_suffix_array_utils.h" -#include "wimlib/util.h" - -struct salink; - -/* Length type --- must be an unsigned type large enough to hold the maximum - * match length. */ -typedef u16 lz_lsa_len_t; - -/* Type of distances in suffix array links. A larger type would allow skipping - * irrelevant suffixes more quickly, which is especially helpful towards the - * start of the window. However, even a single byte allows skipping 255 at a - * time, which where it matters is already a big improvement over the - * alternative of searching the suffixes consecutively. */ -typedef u8 lz_lsa_delta_t; - -#define LZ_LSA_LEN_MAX ((lz_lsa_len_t)~0UL) -#define LZ_LSA_POS_MAX ((u32)~0UL) -#define LZ_LSA_DELTA_MAX ((lz_lsa_delta_t)~0UL) - -/* State of the linked suffix array match-finder. */ -struct lz_lsa { - - struct lz_mf base; - - /* Suffix array for the current window. - * This is a mapping from suffix rank to suffix position. */ - u32 *SA; - - /* Inverse suffix array for the current window. - * This is a mapping from suffix position to suffix rank. - * If 0 <= r < window_size, then ISA[SA[r]] == r. */ - u32 *ISA; - - /* Suffix array links. - * - * During a linear scan of the input string to find matches, this array - * used to keep track of which rank suffixes in the suffix array appear - * before the current position. Instead of searching in the original - * suffix array, scans for matches at a given position traverse a linked - * list containing (usually) only suffixes that appear before that - * position. */ - struct salink *salink; -}; - -/* Suffix array link. An array of these structures, one per suffix rank, is - * used as a replacement for the raw LCP (Longest Common Prefix) array to allow - * skipping over suffixes that appear later in the window and hence cannot be - * used as LZ77 matches. */ -struct salink { - union { - /* Temporary fields used while this structure is being - * initialized. - * - * Note: we want the entire `struct salink' to be only 6 bytes, - * even though this makes "next_initial" unaligned. */ - struct { - u32 next_initial; - lz_lsa_len_t lcpnext_initial; - } _packed_attribute; - - struct { - /* Intially, the length, in bytes, of the longest common - * prefix (LCP) between the suffix having this rank and - * the suffix with the smallest larger rank that - * starts earlier in the window than the suffix having - * this rank. If no such suffix exists, this will be 0. - * - * Later, during match-finding, after the corresponding - * suffix has entered the LZ77 dictionary, this value - * may be updated by lz_lsa_update_salink() to refer - * instead to a lexicographically closer (but still - * larger) suffix that begins at a later position that - * has entered the LZ77 dictionary. */ - lz_lsa_len_t lcpnext; - - /* Initially, the length, in bytes, of the longest - * common prefix (LCP) between the suffix having this - * rank and the suffix with the largest smaller rank - * that starts earlier in the window than the suffix - * having this rank. If no such suffix exists, this - * will be 0. - * - * Later, during match-finding, after the corresponding - * suffix has entered the LZ77 dictionary, this value - * may be updated by lz_lsa_update_salink() to refer - * instead to a lexicographically closer (but still - * smaller) suffix that begins at a later position that - * has entered the LZ77 dictionary. */ - lz_lsa_len_t lcpprev; - - /* Distance to the suffix referred to in the description - * of "lcpnext" above, but capped to a maximum value to - * save memory; or, 0 if no such suffix exists. If the - * true distance was truncated, this will give the - * distance to the rank of a suffix that is - * lexicographically closer to the current suffix than - * the desired suffix, but appears *later* in the window - * and hence cannot be used as the basis for an LZ77 - * match. */ - lz_lsa_delta_t dist_to_next; - - /* Distance to the suffix referred to in the description - * of "lcpprev" above, but capped to a maximum value to - * save memory; or, 0 if no such suffix exists. If the - * true distance was truncated, this will give the - * distance to the rank of a suffix that is - * lexicographically closer to the current suffix than - * the desired suffix, but appears *later* in the window - * and hence cannot be used as the basis for an LZ77 - * match. */ - lz_lsa_delta_t dist_to_prev; - }; - }; -}; - -/* Initialize the SA link array in linear time. - * - * This is similar to computing the LPF (Longest Previous Factor) array, which - * is addressed in several papers. In particular the algorithms below are based - * on Crochemore et al. 2009: "LPF computation revisited". However, this - * match-finder does not actually compute or use the LPF array per se. Rather, - * this function sets up some information necessary to compute the LPF array, - * but later lz_lsa_get_matches() actually uses this information to search - * the suffix array directly and can keep searching beyond the first (longest) - * match whose length would be placed in the LPF array. This difference from - * the theoretical work is necessary because in many real compression formats - * matches take variable numbers of bits to encode, so a decent parser needs to - * consider more than just the longest match with unspecified offset. - * - * Note: We cap the lcpprev and lcpnext values to the maximum match length so - * that the match-finder need not worry about it later, in the inner loop. - * - * Note: the LCP array is one of the inputs to this function, but it is used as - * temporary space and therefore will be invalidated. - */ -static void -init_salink(struct salink link[restrict], u32 LCP[restrict], - const u32 SA[restrict], const u8 T[restrict], u32 n, - lz_lsa_len_t min_match_len, lz_lsa_len_t max_match_len) -{ - /* Calculate salink.dist_to_next and salink.lcpnext. - * - * Pass 1 calculates, for each suffix rank, the corresponding - * "next_initial" value which is the smallest larger rank that - * corresponds to a suffix starting earlier in the string. It also - * calculates "lcpnext_initial", which is the longest common prefix with - * that suffix, although to eliminate checks in lz_lsa_get_matches(), - * "lcpnext_initial" is set to 0 if it's less than the minimum match - * length or set to the maximum match length if it's greater than the - * maximum match length. - * - * Pass 2 translates each absolute "next_initial", a 4-byte value, into - * a relative "dist_to_next", a 1-byte value. This is done to save - * memory. In the case that the exact relative distance cannot be - * encoded in 1 byte, it is capped to 255. This is valid as long as - * lz_lsa_get_matches() validates each position before using it. - * Note that "lcpnext" need not be updated in this case because it will - * not be used until the actual next rank has been found anyway. - */ - link[n - 1].next_initial = LZ_LSA_POS_MAX; - link[n - 1].lcpnext_initial = 0; - for (u32 r = n - 2; r != LZ_LSA_POS_MAX; r--) { - u32 t = r + 1; - u32 l = LCP[t]; - while (t != LZ_LSA_POS_MAX && SA[t] > SA[r]) { - l = min(l, link[t].lcpnext_initial); - t = link[t].next_initial; - } - link[r].next_initial = t; - - if (l < min_match_len) - l = 0; - else if (l > max_match_len) - l = max_match_len; - link[r].lcpnext_initial = l; - } - for (u32 r = 0; r < n; r++) { - u32 next; - lz_lsa_len_t l; - lz_lsa_delta_t dist_to_next; - - next = link[r].next_initial; - l = link[r].lcpnext_initial; - - if (next == LZ_LSA_POS_MAX) - dist_to_next = 0; - else if (next - r <= LZ_LSA_DELTA_MAX) - dist_to_next = next - r; - else - dist_to_next = LZ_LSA_DELTA_MAX; - - link[r].lcpnext = l; - link[r].dist_to_next = dist_to_next; - } - - /* Calculate salink.dist_to_prev and salink.lcpprev. - * - * This is analgous to dist_to_next and lcpnext as described above, but - * in the other direction. That is, here we're interested in, for each - * rank, the largest smaller rank that corresponds to a suffix starting - * earlier in the string. - * - * To save memory we don't have a "prev_initial" field, but rather store - * those values in the LCP array. */ - LCP[0] = LZ_LSA_POS_MAX; - link[0].lcpprev = 0; - for (u32 r = 1; r < n; r++) { - u32 t = r - 1; - u32 l = LCP[r]; - while (t != LZ_LSA_POS_MAX && SA[t] > SA[r]) { - l = min(l, link[t].lcpprev); - t = LCP[t]; - } - LCP[r] = t; - - if (l < min_match_len) - l = 0; - else if (l > max_match_len) - l = max_match_len; - - link[r].lcpprev = l; - } - for (u32 r = 0; r < n; r++) { - - u32 prev = LCP[r]; - - if (prev == LZ_LSA_POS_MAX) - link[r].dist_to_prev = 0; - else if (r - prev <= LZ_LSA_DELTA_MAX) - link[r].dist_to_prev = r - prev; - else - link[r].dist_to_prev = LZ_LSA_DELTA_MAX; - } -} - -/* If ENABLE_LZ_DEBUG is defined, verify the values computed by init_salink(). - * - * WARNING: this is for debug use only as it does not necessarily run in linear - * time!!! */ -static void -verify_salink(const struct salink link[], const u32 SA[], const u8 T[], u32 n, - lz_lsa_len_t min_match_len, lz_lsa_len_t max_match_len) -{ -#ifdef ENABLE_LZ_DEBUG - for (u32 r = 0; r < n; r++) { - for (u32 prev = r; ; ) { - if (prev == 0) { - LZ_ASSERT(link[r].dist_to_prev == 0); - LZ_ASSERT(link[r].lcpprev == 0); - break; - } - - prev--; - - if (SA[prev] < SA[r]) { - LZ_ASSERT(link[r].dist_to_prev == min(r - prev, LZ_LSA_DELTA_MAX)); - - u32 lcpprev; - for (lcpprev = 0; - lcpprev < min(n - SA[prev], n - SA[r]) && - T[SA[prev] + lcpprev] == T[SA[r] + lcpprev]; - lcpprev++) - ; - if (lcpprev < min_match_len) - lcpprev = 0; - else if (lcpprev > max_match_len) - lcpprev = max_match_len; - - LZ_ASSERT(lcpprev == link[r].lcpprev); - break; - } - } - - for (u32 next = r; ; ) { - if (next == n - 1) { - LZ_ASSERT(link[r].dist_to_next == 0); - LZ_ASSERT(link[r].lcpnext == 0); - break; - } - - next++; - - if (SA[next] < SA[r]) { - LZ_ASSERT(link[r].dist_to_next == min(next - r, LZ_LSA_DELTA_MAX)); - - u32 lcpnext; - for (lcpnext = 0; - lcpnext < min(n - SA[next], n - SA[r]) && - T[SA[next] + lcpnext] == T[SA[r] + lcpnext]; - lcpnext++) - ; - if (lcpnext < min_match_len) - lcpnext = 0; - else if (lcpnext > max_match_len) - lcpnext = max_match_len; - - LZ_ASSERT(lcpnext == link[r].lcpnext); - break; - } - } - } -#endif -} - -static inline void -lz_lsa_update_salink(const u32 r, struct salink link[]) -{ - const u32 next = r + link[r].dist_to_next; - const u32 prev = r - link[r].dist_to_prev; - - if (next != r && link[r].dist_to_next < link[next].dist_to_prev) { - link[next].dist_to_prev = link[r].dist_to_next; - link[next].lcpprev = link[r].lcpnext; - } - - if (prev != r && link[r].dist_to_prev < link[prev].dist_to_next) { - link[prev].dist_to_next = link[r].dist_to_prev; - link[prev].lcpnext = link[r].lcpprev; - } -} - -static void -lz_lsa_set_default_params(struct lz_mf_params *params) -{ - if (params->min_match_len == 0) - params->min_match_len = 2; - - if (params->max_match_len == 0) - params->max_match_len = UINT32_MAX; - - if (params->max_match_len > LZ_LSA_LEN_MAX) - params->max_match_len = LZ_LSA_LEN_MAX; - - if (params->max_search_depth == 0) - params->max_search_depth = 32; - - /* Scale max_search_depth down since this algorithm finds the longest - * matches first. */ - params->max_search_depth = DIV_ROUND_UP(params->max_search_depth, 5); -} - -static u64 -lz_lsa_get_needed_memory(u32 max_window_size) -{ - u64 size = 0; - - /* SA */ - size += (u64)max_window_size * sizeof(u32); - - /* ISA */ - size += (u64)max_window_size * sizeof(u32); - - /* salink and minimum temporary space for divsufsort */ - size += max(BUILD_SA_MIN_TMP_LEN * sizeof(u32), - (u64)max_window_size * sizeof(struct salink)); - - return size; -} - -static bool -lz_lsa_params_valid(const struct lz_mf_params *params) -{ - return true; -} - -static bool -lz_lsa_init(struct lz_mf *_mf) -{ - struct lz_lsa *mf = (struct lz_lsa *)_mf; - const u32 max_window_size = mf->base.params.max_window_size; - - lz_lsa_set_default_params(&mf->base.params); - - /* SA and ISA will share the same allocation. */ - mf->SA = MALLOC(max_window_size * 2 * sizeof(u32)); - if (!mf->SA) - return false; - - mf->salink = MALLOC(max(BUILD_SA_MIN_TMP_LEN * sizeof(u32), - max_window_size * sizeof(struct salink))); - if (!mf->salink) { - FREE(mf->SA); - return false; - } - - return true; -} - -static void -lz_lsa_load_window(struct lz_mf *_mf, const u8 T[], u32 n) -{ - struct lz_lsa *mf = (struct lz_lsa *)_mf; - u32 *ISA, *LCP; - - build_SA(mf->SA, T, n, (u32 *)mf->salink); - - /* Compute ISA (Inverse Suffix Array) in a preliminary position. - * - * This is just a trick to save memory. Since LCP is unneeded after - * this function, it can be computed in any available space. The - * storage for the ISA is the best choice because the ISA can be built - * quickly in salink for now, then re-built in its real location at the - * end. This is probably worth it because computing the ISA from the SA - * is very fast, and since this match-finder is memory-hungry we'd like - * to save as much memory as possible. */ - BUILD_BUG_ON(sizeof(mf->salink[0]) < sizeof(mf->ISA[0])); - ISA = (u32 *)mf->salink; - build_ISA(ISA, mf->SA, n); - - /* Compute LCP (Longest Common Prefix) array. */ - LCP = mf->SA + n; - build_LCP(LCP, mf->SA, ISA, T, n); - - /* Initialize suffix array links. */ - init_salink(mf->salink, LCP, mf->SA, T, n, - mf->base.params.min_match_len, - mf->base.params.max_match_len); - verify_salink(mf->salink, mf->SA, T, n, - mf->base.params.min_match_len, - mf->base.params.max_match_len); - - /* Compute ISA (Inverse Suffix Array) in its final position. */ - ISA = mf->SA + n; - build_ISA(ISA, mf->SA, n); - - /* Save new variables and return. */ - mf->ISA = ISA; -} - -static u32 -lz_lsa_get_matches(struct lz_mf *_mf, struct lz_match matches[]) -{ - struct lz_lsa *mf = (struct lz_lsa *)_mf; - const u32 i = mf->base.cur_window_pos++; - - const u32 * const restrict SA = mf->SA; - const u32 * const restrict ISA = mf->ISA; - struct salink * const restrict link = mf->salink; - - /* r = Rank of the suffix at the current position. */ - const u32 r = ISA[i]; - - /* Prepare for searching the current position. */ - lz_lsa_update_salink(r, link); - - /* Prefetch next position in SA and link. - * - * This can improve performance on large windows since the locations in - * SA and link at which each successive search begins are in general - * randomly distributed. */ - if (likely(i + 1 < mf->base.cur_window_size)) { - const u32 next_r = ISA[i + 1]; - prefetch(&SA[next_r]); - prefetch(&link[next_r]); - } - - /* L = rank of next suffix to the left; - * R = rank of next suffix to the right; - * lenL = length of match between current position and the suffix with rank L; - * lenR = length of match between current position and the suffix with rank R. - * - * This is left and right relative to the rank of the current suffix. - * Since the suffixes in the suffix array are sorted, the longest - * matches are immediately to the left and right (using the linked list - * to ignore all suffixes that occur later in the window). The match - * length decreases the farther left and right we go. We shall keep the - * length on both sides in sync in order to choose the lowest-cost match - * of each length. - */ - u32 L = r - link[r].dist_to_prev; - u32 R = r + link[r].dist_to_next; - u32 lenL = link[r].lcpprev; - u32 lenR = link[r].lcpnext; - - /* num_matches = number of matches found so far. */ - u32 num_matches = 0; - - /* best_offset = offset of lowest-cost match found so far. - * - * Shorter matches that do not have a lower offset than this are - * discarded, since presumably it would be cheaper to output the bytes - * from the longer match instead. */ - u32 best_offset = LZ_LSA_POS_MAX; - - /* count_remaining = maximum number of possible matches remaining to be - * considered. */ - u32 count_remaining = mf->base.params.max_search_depth; - - /* pending_offset = offset of lowest-cost match found for the current - * length, or 0 if none found yet. */ - u32 pending_offset = 0; - - /* Note: some 'goto' statements are used in the remainder of this - * function to remove unnecessary checks and create branches that the - * CPU may predict better. (This function is performance critical.) */ - - if (lenL != 0 && lenL >= lenR) - goto extend_left; - else if (lenR != 0) - goto extend_right; - else - return 0; - -extend_left: - /* Search suffixes on the left until the match length has decreased - * below the next match length on the right or to below the minimum - * match length. */ - for (;;) { - u32 offset; - u32 old_L; - u32 old_lenL; - - /* Check for hard cutoff on amount of work done. */ - if (count_remaining-- == 0) { - if (pending_offset != 0) { - /* Save pending match. */ - matches[num_matches++] = (struct lz_match) { - .len = lenL, - .offset = pending_offset, - }; - } - goto out; - } - - if (SA[L] < i) { - /* Suffix is in LZ77 dictionary. (Check was needed - * because the salink array caps distances to save - * memory.) */ - - offset = i - SA[L]; - - /* Save match offset if it results in lower cost. */ - if (offset < best_offset) { - best_offset = offset; - pending_offset = offset; - } - } - - /* Advance left to previous suffix. */ - - old_L = L; - old_lenL = lenL; - - L -= link[L].dist_to_prev; - - if (link[old_L].lcpprev < old_lenL) { - /* Match length decreased. */ - - lenL = link[old_L].lcpprev; - - if (old_lenL > lenR) { - /* Neither the right side nor the left size has - * any more matches of length @old_lenL. If a - * pending match exists, save it. */ - if (pending_offset != 0) { - matches[num_matches++] = (struct lz_match) { - .len = old_lenL, - .offset = pending_offset, - }; - pending_offset = 0; - } - - if (lenL >= lenR) { - /* New match length on left is still at - * least as large as the next match - * length on the right: Keep extending - * left, unless the minimum match length - * would be underrun. */ - if (lenL == 0) - goto out; - goto extend_left; - } - } - - /* Here we have lenL < lenR. Extend right. - * (No check for whether the minimum match length has - * been underrun is needed, provided that such lengths - * are marked as 0.) */ - goto extend_right; - } - } - -extend_right: - /* Search suffixes on the right until the match length has decreased to - * the next match length on the left or to below the minimum match - * length. */ - for (;;) { - u32 offset; - u32 old_R; - u32 old_lenR; - - /* Check for hard cutoff on amount of work done. */ - if (count_remaining-- == 0) { - if (pending_offset != 0) { - /* Save pending match. */ - matches[num_matches++] = (struct lz_match) { - .len = lenR, - .offset = pending_offset, - }; - } - goto out; - } - - if (SA[R] < i) { - /* Suffix is in LZ77 dictionary. (Check was needed - * because the salink array caps distances to save - * memory.) */ - - offset = i - SA[R]; - - if (offset < best_offset) { - best_offset = offset; - pending_offset = offset; - } - } - - /* Advance right to next suffix. */ - - old_R = R; - old_lenR = lenR; - - R += link[R].dist_to_next; - - if (link[old_R].lcpnext < lenR) { - /* Match length decreased. */ - - lenR = link[old_R].lcpnext; - - /* Neither the right side nor the left size has any more - * matches of length @old_lenR. If a pending match - * exists, save it. */ - if (pending_offset != 0) { - matches[num_matches++] = (struct lz_match) { - .len = old_lenR, - .offset = pending_offset, - }; - pending_offset = 0; - } - - if (lenL >= lenR) { - /* lenL >= lenR: Extend left, unless the - * minimum match length would be underrun, in - * which case we are done. */ - if (lenL == 0) - goto out; - - goto extend_left; - } - /* lenR > lenL: Keep extending right. - * (No check for whether the minimum match length has - * been underrun is needed, provided that such lengths - * are marked as 0.) */ - } - } - -out: - for (u32 i = 0; i < num_matches / 2; i++) - swap(matches[i], matches[num_matches - 1 - i]); - return num_matches; -} - -static void -lz_lsa_skip_positions(struct lz_mf *_mf, u32 n) -{ - struct lz_lsa *mf = (struct lz_lsa *)_mf; - do { - lz_lsa_update_salink(mf->ISA[mf->base.cur_window_pos++], mf->salink); - } while (--n); -} - -static void -lz_lsa_destroy(struct lz_mf *_mf) -{ - struct lz_lsa *mf = (struct lz_lsa *)_mf; - - FREE(mf->SA); - FREE(mf->salink); -} - -const struct lz_mf_ops lz_linked_suffix_array_ops = { - .params_valid = lz_lsa_params_valid, - .get_needed_memory = lz_lsa_get_needed_memory, - .init = lz_lsa_init, - .load_window = lz_lsa_load_window, - .get_matches = lz_lsa_get_matches, - .skip_positions = lz_lsa_skip_positions, - .destroy = lz_lsa_destroy, - .struct_size = sizeof(struct lz_lsa), -}; diff --git a/src/lz_mf.c b/src/lz_mf.c deleted file mode 100644 index ee7c80d8..00000000 --- a/src/lz_mf.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * lz_mf.c - * - * Interface for Lempel-Ziv match-finders. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/lz_mf.h" -#include "wimlib/lz_mf_ops.h" -#include "wimlib/util.h" - -/* Available match-finding algorithms. */ -static const struct lz_mf_ops *mf_ops[] = { - [LZ_MF_LCP_INTERVAL_TREE] = &lz_lcp_interval_tree_ops, - [LZ_MF_LINKED_SUFFIX_ARRAY] = &lz_linked_suffix_array_ops, -}; - -static const struct lz_mf_ops * -get_mf_ops(enum lz_mf_algo algorithm) -{ - if ((unsigned int)algorithm >= ARRAY_LEN(mf_ops)) - return NULL; - return mf_ops[(unsigned int)algorithm]; -} - -/* - * Returns an upper bound on the number of bytes of memory that will be consumed - * by a match-finder allocated with the specified algorithm and maximum window - * size. - * - * The returned value does not include the size of the window itself. The - * caller must account for this separately if needed. - * - * If @algorithm is invalid, returns 0. - */ -u64 -lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size) -{ - const struct lz_mf_ops *ops; - - ops = get_mf_ops(algorithm); - if (!ops) - return 0; - return ops->struct_size + ops->get_needed_memory(max_window_size); -} -/* - * Returns %true if and only if the specified parameters can be validly used to - * create a match-finder using lz_mf_alloc(). - */ -bool -lz_mf_params_valid(const struct lz_mf_params *params) -{ - const struct lz_mf_ops *ops; - - /* Require that a valid algorithm be specified. */ - ops = get_mf_ops(params->algorithm); - if (!ops) - return false; - - /* Don't allow empty windows. Otherwise, some match-finding algorithms - * might need special-case code to handle empty windows. */ - if (params->max_window_size == 0) - return false; - - /* Don't allow length-1 matches, so that match-finding algorithms don't - * need to worry about this case. Most LZ-based compression formats - * don't allow length-1 matches, since they usually aren't helpful for - * compression. Also, if a compressor really does need length-1 - * matches, it can easily maintain its own table of length 256 - * containing the most-recently-seen position for each byte value. - * - * min_match_len == 0 is valid, since that means the match-finding - * algorithm will fill in a default value. */ - if (params->min_match_len == 1) - return false; - - if (params->max_match_len != 0) { - - /* Don't allow length-1 matches (same reason as above). */ - if (params->max_match_len == 1) - return false; - - /* Don't allow the maximum match length to be shorter than the - * minimum match length. */ - if (params->max_match_len < params->min_match_len) - return false; - } - - /* Don't allow the needed memory size to overflow a 'size_t'. */ - if (sizeof(size_t) < sizeof(u64)) { - u64 needed_mem = ops->get_needed_memory(params->max_window_size); - if ((size_t)needed_mem != needed_mem) - return false; - } - - /* Call the algorithm-specific routine to finish the validation. */ - return ops->params_valid(params); -} - -/* - * Allocate a new match-finder. - * - * @params - * The parameters for the match-finder. See the declaration of 'struct - * lz_mf_params' for more information. - * - * Returns a pointer to the new match-finder, or NULL if out of memory or the - * parameters are invalid. Call lz_mf_params_valid() beforehand to test the - * parameter validity separately. - */ -struct lz_mf * -lz_mf_alloc(const struct lz_mf_params *params) -{ - struct lz_mf *mf; - const struct lz_mf_ops *ops; - - /* Validate the parameters. */ - if (!lz_mf_params_valid(params)) - return NULL; - - /* Get the match-finder operations structure. Since we just validated - * the parameters, this is guaranteed to return a valid structure. */ - ops = get_mf_ops(params->algorithm); - LZ_ASSERT(ops != NULL); - - /* Allocate memory for the match-finder structure. */ - LZ_ASSERT(ops->struct_size >= sizeof(struct lz_mf)); - mf = CALLOC(1, ops->struct_size); - if (!mf) - return NULL; - - /* Set the parameters and operations fields. */ - mf->params = *params; - mf->ops = *ops; - - /* Perform algorithm-specific initialization. Normally this is where - * most of the necessary memory is allocated. */ - if (!mf->ops.init(mf)) { - FREE(mf); - return NULL; - } - - /* The algorithm must have set min_match_len and max_match_len if either - * was 0. */ - LZ_ASSERT(mf->params.min_match_len >= 2); - LZ_ASSERT(mf->params.max_match_len >= mf->params.min_match_len); - - return mf; -} - -/* - * Load a window into the match-finder. - * - * @mf - * The match-finder into which to load the window. - * @window - * Pointer to the window to load. This memory must remain available, - * unmodified, while the match-finder is being used. - * @size - * The size of the window, in bytes. This can't be larger than the - * @max_window_size parameter. In addition, this can't be 0. - * - * Note: this interface does not support sliding windows! - */ -void -lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size) -{ - /* Can't be an empty window, and can't be larger than the maximum window - * size with which the match-finder was allocated. */ - LZ_ASSERT(size > 0); - LZ_ASSERT(size <= mf->params.max_window_size); - - /* Save the window and initialize the current position. */ - mf->cur_window = window; - mf->cur_window_size = size; - mf->cur_window_pos = 0; - - /* Call into the algorithm-specific window load code. */ - mf->ops.load_window(mf, window, size); -} - -/* - * Retrieve a list of matches at the next position in the window. - * - * @mf - * The match-finder into which a window has been loaded using - * lz_mf_load_window(). - * @matches - * The array into which the matches will be returned. The returned match - * count will not exceed the minimum of @max_search_depth and (@len_limit - - * @min_match_len + 1), where @len_limit is itself defined as - * min(@max_match_len, @nice_match_len). - * - * The return value is the number of matches that were found and stored in the - * 'matches' array. The matches will be ordered by strictly increasing length - * and strictly increasing offset. No match shall have length less than - * @min_match_len, and no match shall have length greater than @max_match_len. - * The return value may be 0, which indicates that no matches were found. - * - * On completion, the match-finder is advanced to the next position in the - * window. - * - * Note: in-non-debug mode, the inline definition of this gets used instead. - * They are the same, except that the non-inline version below validates the - * results to help debug match-finding algorithms. - */ -#ifdef ENABLE_LZ_DEBUG -u32 -lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches) -{ - LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size); - - const u32 orig_pos = mf->cur_window_pos; - const u32 len_limit = min(mf->params.max_match_len, - lz_mf_get_bytes_remaining(mf)); - const u8 * const strptr = lz_mf_get_window_ptr(mf); - - const u32 num_matches = mf->ops.get_matches(mf, matches); - - LZ_ASSERT(mf->cur_window_pos == orig_pos + 1); - -#if 0 - fprintf(stderr, "Pos %"PRIu32"/%"PRIu32": %"PRIu32" matches\n", - orig_pos, mf->cur_window_size, num_matches); - for (u32 i = 0; i < num_matches; i++) { - fprintf(stderr, "\tLen %"PRIu32" Offset %"PRIu32"\n", - matches[i].len, matches[i].offset); - } -#endif - - /* Validate the matches. */ - for (u32 i = 0; i < num_matches; i++) { - const u32 len = matches[i].len; - const u32 offset = matches[i].offset; - const u8 *matchptr; - - /* Length valid? */ - LZ_ASSERT(len >= mf->params.min_match_len); - LZ_ASSERT(len <= len_limit); - - /* Offset valid? */ - LZ_ASSERT(offset >= 1); - LZ_ASSERT(offset <= orig_pos); - - /* Lengths and offsets strictly increasing? */ - if (i > 0) { - LZ_ASSERT(len > matches[i - 1].len); - LZ_ASSERT(offset > matches[i - 1].offset); - } - - /* Actually a match? */ - matchptr = strptr - offset; - LZ_ASSERT(!memcmp(strptr, matchptr, len)); - - /* Match can't be extended further? */ - LZ_ASSERT(len == len_limit || strptr[len] != matchptr[len]); - } - - return num_matches; -} -#endif /* ENABLE_LZ_DEBUG */ - -/* - * Skip 'n' positions in the match-finder. This is a faster alternative to - * calling lz_mf_get_matches() at each position to advance the match-finder. - * - * 'n' must be greater than 0. - * - * Note: in-non-debug mode, the inline definition of this gets used instead. - * They are the same, except the non-inline version below does extra checks. - */ -#ifdef ENABLE_LZ_DEBUG -void -lz_mf_skip_positions(struct lz_mf *mf, const u32 n) -{ - LZ_ASSERT(n > 0); - LZ_ASSERT(n <= lz_mf_get_bytes_remaining(mf)); - - const u32 orig_pos = mf->cur_window_pos; - - mf->ops.skip_positions(mf, n); - - LZ_ASSERT(mf->cur_window_pos == orig_pos + n); -} -#endif - -/* - * Free the match-finder. - * - * This frees all memory that was allocated by the call to lz_mf_alloc(). - */ -void -lz_mf_free(struct lz_mf *mf) -{ - if (mf) { - mf->ops.destroy(mf); - #ifdef ENABLE_LZ_DEBUG - memset(mf, 0, mf->ops.struct_size); - #endif - FREE(mf); - } -} diff --git a/src/lz_suffix_array_utils.c b/src/lz_suffix_array_utils.c deleted file mode 100644 index f0b476c2..00000000 --- a/src/lz_suffix_array_utils.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * lz_suffix_array_utils.c - * - * Common utilities for suffix-array based Lempel-Ziv match-finding algorithms. - * - * Copyright (c) 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/divsufsort.h" -#include "wimlib/lz_mf.h" -#include "wimlib/lz_suffix_array_utils.h" -#include "wimlib/util.h" - -/* If ENABLE_LZ_DEBUG is defined, verify that the suffix array satisfies its - * definition. - * - * WARNING: this is for debug use only as it does not necessarily run in linear - * time!!! */ -static void -verify_SA(const u32 *SA, const u8 *T, u32 n, u32 *tmp) -{ -#ifdef ENABLE_LZ_DEBUG - /* Ensure the SA contains exactly one of each i in [0, n - 1]. */ - for (u32 i = 0; i < n; i++) - tmp[i] = 0; - for (u32 r = 0; r < n; r++) { - u32 i = SA[r]; - LZ_ASSERT(i < n); - LZ_ASSERT(!tmp[i]); - tmp[i] = 1; - } - - /* Ensure the suffix with rank r is lexicographically less than the - * suffix with rank (r + 1) for all r in [0, n - 2]. */ - for (u32 r = 0; r < n - 1; r++) { - - u32 i1 = SA[r]; - u32 i2 = SA[r + 1]; - - u32 n1 = n - i1; - u32 n2 = n - i2; - - int res = memcmp(&T[i1], &T[i2], min(n1, n2)); - LZ_ASSERT(res < 0 || (res == 0 && n1 < n2)); - } -#endif /* ENABLE_LZ_DEBUG */ -} - -/* - * Build the suffix array (SA) for the specified "text". - * - * The SA is a sorted array of the text's suffixes, represented by indices into - * the text. It can equivalently be viewed as a mapping from suffix rank to - * suffix position. - * - * To build the SA, we currently rely on libdivsufsort, which uses an - * induced-sorting-based algorithm. In practice, this seems to be the fastest - * suffix array construction algorithm currently available. - * - * References: - * - * Y. Mori. libdivsufsort, a lightweight suffix-sorting library. - * https://code.google.com/p/libdivsufsort/. - * - * G. Nong, S. Zhang, and W.H. Chan. 2009. Linear Suffix Array - * Construction by Almost Pure Induced-Sorting. Data Compression - * Conference, 2009. DCC '09. pp. 193 - 202. - * - * S.J. Puglisi, W.F. Smyth, and A. Turpin. 2007. A Taxonomy of Suffix - * Array Construction Algorithms. ACM Computing Surveys (CSUR) Volume 39 - * Issue 2, 2007 Article No. 4. - */ -void -build_SA(u32 *SA, const u8 *T, u32 n, u32 *tmp) -{ - BUILD_BUG_ON(BUILD_SA_MIN_TMP_LEN != - DIVSUFSORT_TMP1_LEN + DIVSUFSORT_TMP2_LEN); - - /* Note: divsufsort() needs temporary space --- one array with 256 - * spaces and one array with 65536 spaces. The implementation of - * divsufsort() has been modified from the original to use the provided - * temporary space instead of allocating its own, since we don't want to - * have to deal with malloc() failures here. */ - divsufsort(T, SA, n, tmp, tmp + DIVSUFSORT_TMP1_LEN); - - verify_SA(SA, T, n, tmp); -} - - -/* Build the inverse suffix array @ISA from the suffix array @SA in linear time. - * - * Whereas the suffix array is a mapping from suffix rank to suffix position, - * the inverse suffix array is a mapping from suffix position to suffix rank. - */ -void -build_ISA(u32 * restrict ISA, const u32 * restrict SA, u32 n) -{ - for (u32 r = 0; r < n; r++) - ISA[SA[r]] = r; -} - -/* If ENABLE_LZ_DEBUG is defined, verify that the LCP (Longest Common Prefix) - * array satisfies its definition. - * - * WARNING: this is for debug use only as it does not necessarily run in linear - * time!!! */ -static void -verify_LCP(const u32 *LCP, const u32 *SA, const u8 *T, u32 n) -{ -#ifdef ENABLE_LZ_DEBUG - for (u32 r = 0; r < n - 1; r++) { - u32 i1 = SA[r]; - u32 i2 = SA[r + 1]; - u32 lcp = LCP[r + 1]; - - u32 n1 = n - i1; - u32 n2 = n - i2; - - LZ_ASSERT(lcp <= min(n1, n2)); - - LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0); - if (lcp < min(n1, n2)) - LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]); - } -#endif /* ENABLE_LZ_DEBUG */ -} - -/* - * Build the LCP (Longest Common Prefix) array in linear time. - * - * LCP[r] will be the length of the longest common prefix between the suffixes - * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. - * - * Algorithm taken from Kasai et al. (2001), but modified slightly to take into - * account that with bytes in the real world, there is no unique symbol at the - * end of the string. - * - * References: - * - * Kasai et al. 2001. Linear-Time Longest-Common-Prefix Computation in - * Suffix Arrays and Its Applications. CPM '01 Proceedings of the 12th - * Annual Symposium on Combinatorial Pattern Matching pp. 181-192. - */ -void -build_LCP(u32 * restrict LCP, const u32 * restrict SA, - const u32 * restrict ISA, const u8 * restrict T, u32 n) -{ - u32 h, i, r, j, lim; - - h = 0; - for (i = 0; i < n; i++) { - r = ISA[i]; - if (r > 0) { - j = SA[r - 1]; - lim = min(n - i, n - j); - - while (h < lim && T[i + h] == T[j + h]) - h++; - LCP[r] = h; - if (h > 0) - h--; - } - } - - verify_LCP(LCP, SA, T, n); -} diff --git a/src/lzms_compress.c b/src/lzms_compress.c index 411d667f..1e2a4fce 100644 --- a/src/lzms_compress.c +++ b/src/lzms_compress.c @@ -33,7 +33,7 @@ #include "wimlib/compressor_ops.h" #include "wimlib/endianness.h" #include "wimlib/error.h" -#include "wimlib/lz_mf.h" +#include "wimlib/lcpit_matchfinder.h" #include "wimlib/lz_repsearch.h" #include "wimlib/lzms_common.h" #include "wimlib/unaligned.h" @@ -144,7 +144,6 @@ struct lzms_huffman_encoder { struct lzms_compressor_params { u32 min_match_length; u32 nice_match_length; - u32 max_search_depth; u32 optim_array_length; }; @@ -159,7 +158,7 @@ struct lzms_compressor { u32 cur_window_size; /* Lempel-Ziv match-finder */ - struct lz_mf *mf; + struct lcpit_matchfinder mf; /* Temporary space to store found matches */ struct lz_match *matches; @@ -873,8 +872,8 @@ lzms_consider_lz_repeat_offset_match(const struct lzms_compressor *c, /* * Consider coding each match in @matches as an explicit offset match. * - * @matches must be sorted by strictly increasing length and strictly increasing - * offset. This is guaranteed by the match-finder. + * @matches must be sorted by strictly decreasing length. This is guaranteed by + * the match-finder. * * We consider each length from the minimum (2) to the longest * (matches[num_matches - 1].len). For each length, we consider only the @@ -905,7 +904,7 @@ lzms_consider_lz_explicit_offset_matches(const struct lzms_compressor *c, base_cost += lzms_rc_bit_cost(&c->lz_match_range_encoder, cur_optimum_ptr->state.lz_match_state, 0); len = 2; - i = 0; + i = num_matches - 1; do { position_cost = base_cost + lzms_lz_offset_cost(c, matches[i].offset); do { @@ -916,8 +915,8 @@ lzms_consider_lz_explicit_offset_matches(const struct lzms_compressor *c, << MC_OFFSET_SHIFT) | len; (cur_optimum_ptr + len)->cost = cost; } - } while (++len <= matches[i].len); - } while (++i != num_matches); + } while (++len <= matches[i].length); + } while (i--); } static void @@ -1039,8 +1038,7 @@ begin: for (;;) { /* Find explicit offset matches with the current position. */ - num_matches = lz_mf_get_matches(c->mf, c->matches); - + num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches); if (num_matches) { /* * Find the longest repeat offset match with the current @@ -1072,7 +1070,7 @@ begin: * choose it immediately. */ if (rep_max_len >= c->params.nice_match_length) { - lz_mf_skip_positions(c->mf, rep_max_len - 1); + lcpit_matchfinder_skip_bytes(&c->mf, rep_max_len - 1); window_ptr += rep_max_len; if (cur_optimum_ptr != c->optimum) @@ -1110,20 +1108,29 @@ begin: rep_max_len, rep_max_idx); } - longest_len = c->matches[num_matches - 1].len; + longest_len = c->matches[0].length; /* If there's a very long explicit offset match, choose * it immediately. */ if (longest_len >= c->params.nice_match_length) { - lz_mf_skip_positions(c->mf, longest_len - 1); + u32 offset = c->matches[0].offset; + + /* Extend the match as far as possible. (The + * LCP-interval tree matchfinder only reports up + * to the "nice" length.) */ + longest_len = lz_extend(window_ptr, + window_ptr - offset, + longest_len, + window_end - window_ptr); + + lcpit_matchfinder_skip_bytes(&c->mf, longest_len - 1); window_ptr += longest_len; if (cur_optimum_ptr != c->optimum) lzms_encode_item_list(c, cur_optimum_ptr); - lzms_encode_lz_explicit_offset_match(c, longest_len, - c->matches[num_matches - 1].offset); + lzms_encode_lz_explicit_offset_match(c, longest_len, offset); c->optimum[0].state = cur_optimum_ptr->state; @@ -1131,8 +1138,7 @@ begin: lzms_update_match_state(&c->optimum[0].state, 0); lzms_update_lz_match_state(&c->optimum[0].state, 0); - c->optimum[0].state.lru.upcoming_offset = - c->matches[num_matches - 1].offset; + c->optimum[0].state.lru.upcoming_offset = offset; lzms_update_lz_lru_queue(&c->optimum[0].state.lru); goto begin; @@ -1400,39 +1406,17 @@ lzms_build_params(unsigned int compression_level, else params->min_match_length = 3; - /* Scale nice_match_length and max_search_depth with the compression - * level. But to allow an optimization on length cost calculations, - * don't allow nice_match_length to exceed LZMS_NUM_FAST_LENGTH. */ + /* Scale nice_match_length with the compression level. But to allow an + * optimization on length cost calculations, don't allow + * nice_match_length to exceed LZMS_NUM_FAST_LENGTH. */ params->nice_match_length = ((u64)compression_level * 32) / 50; if (params->nice_match_length < params->min_match_length) params->nice_match_length = params->min_match_length; if (params->nice_match_length > LZMS_NUM_FAST_LENGTHS) params->nice_match_length = LZMS_NUM_FAST_LENGTHS; - params->max_search_depth = compression_level; - params->optim_array_length = 1024; } -/* Given the internal compression parameters and maximum window size, build the - * Lempel-Ziv match-finder parameters. */ -static void -lzms_build_mf_params(const struct lzms_compressor_params *lzms_params, - u32 max_window_size, struct lz_mf_params *mf_params) -{ - memset(mf_params, 0, sizeof(*mf_params)); - - /* Choose an appropriate match-finding algorithm. */ - if (max_window_size <= 33554432) - mf_params->algorithm = LZ_MF_LCP_INTERVAL_TREE; - else - mf_params->algorithm = LZ_MF_LINKED_SUFFIX_ARRAY; - - mf_params->max_window_size = max_window_size; - mf_params->min_match_len = lzms_params->min_match_length; - mf_params->max_search_depth = lzms_params->max_search_depth; - mf_params->nice_match_len = lzms_params->nice_match_length; -} - static void lzms_free_compressor(void *_c); @@ -1440,14 +1424,12 @@ static u64 lzms_get_needed_memory(size_t max_block_size, unsigned int compression_level) { struct lzms_compressor_params params; - struct lz_mf_params mf_params; u64 size = 0; if (max_block_size > LZMS_MAX_BUFFER_SIZE) return 0; lzms_build_params(compression_level, ¶ms); - lzms_build_mf_params(¶ms, max_block_size, &mf_params); size += sizeof(struct lzms_compressor); @@ -1455,10 +1437,10 @@ lzms_get_needed_memory(size_t max_block_size, unsigned int compression_level) size += max_block_size; /* mf */ - size += lz_mf_get_needed_memory(mf_params.algorithm, max_block_size); + size += lcpit_matchfinder_get_needed_memory(max_block_size); /* matches */ - size += min(params.max_search_depth, params.nice_match_length) * + size += (params.nice_match_length - params.min_match_length + 1) * sizeof(struct lz_match); /* optimum */ @@ -1474,15 +1456,11 @@ lzms_create_compressor(size_t max_block_size, unsigned int compression_level, { struct lzms_compressor *c; struct lzms_compressor_params params; - struct lz_mf_params mf_params; if (max_block_size > LZMS_MAX_BUFFER_SIZE) return WIMLIB_ERR_INVALID_PARAM; lzms_build_params(compression_level, ¶ms); - lzms_build_mf_params(¶ms, max_block_size, &mf_params); - if (!lz_mf_params_valid(&mf_params)) - return WIMLIB_ERR_INVALID_PARAM; c = CALLOC(1, sizeof(struct lzms_compressor)); if (!c) @@ -1494,12 +1472,12 @@ lzms_create_compressor(size_t max_block_size, unsigned int compression_level, if (!c->cur_window) goto oom; - c->mf = lz_mf_alloc(&mf_params); - if (!c->mf) + if (!lcpit_matchfinder_init(&c->mf, max_block_size, + c->params.min_match_length, + c->params.nice_match_length)) goto oom; - c->matches = MALLOC(min(params.max_search_depth, - params.nice_match_length) * + c->matches = MALLOC((params.nice_match_length - params.min_match_length + 1) * sizeof(struct lz_match)); if (!c->matches) goto oom; @@ -1549,7 +1527,7 @@ lzms_compress(const void *uncompressed_data, size_t uncompressed_size, c->last_target_usages, false); /* Load the window into the match-finder. */ - lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size); + lcpit_matchfinder_load_buffer(&c->mf, c->cur_window, c->cur_window_size); /* Compute and encode a literal/match sequence that decompresses to the * preprocessed data. */ @@ -1566,7 +1544,7 @@ lzms_free_compressor(void *_c) if (c) { FREE(c->cur_window); - lz_mf_free(c->mf); + lcpit_matchfinder_destroy(&c->mf); FREE(c->matches); FREE(c->optimum); FREE(c);