From b7071062542143113ad654d89ee6b0603b23b524 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Jun 2014 22:01:14 -0500 Subject: [PATCH] Switch from suffix array match-finder to binary tree match-finder This uses less memory (8 bytes overhead per position vs. 14), is faster, requires less code (no libdivsufsort), and in some tests actually results in a better compression ratio. A binary tree match-finder was used in wimlib v1.5.2 but it didn't seem as good then, probably because it was combined with a slow block division algorithm for LZX. Repeat offsets are now handled differently. The binary tree match-finder itself doesn't have any logic for repeat offsets at all; instead, the match-choosing code (now implemented separately for LZX and LZMS) now does special checks for matches at repeat offsets. Since less memory is used for match-finding now, increase the default LZMS pack chunk size from 2^25 (33554432) to 2^26 (67108864). --- Makefile.am | 7 +- README | 21 +- doc/man1/imagex-capture.1.in | 6 +- include/wimlib.h | 24 +- include/wimlib/divsufsort.h | 49 - include/wimlib/lz_bt.h | 83 ++ include/wimlib/lz_optimal.h | 551 ------------ include/wimlib/lz_sarray.h | 516 ----------- include/wimlib/lzx.h | 2 +- programs/imagex.c | 2 - src/divsufsort.c | 1631 ---------------------------------- src/lz_bt.c | 706 +++++++++++++++ src/lz_sarray.c | 558 ------------ src/lzms-compress.c | 407 +++++++-- src/lzx-compress.c | 1078 +++++++++++++++------- src/wim.c | 4 +- 16 files changed, 1898 insertions(+), 3747 deletions(-) delete mode 100644 include/wimlib/divsufsort.h create mode 100644 include/wimlib/lz_bt.h delete mode 100644 include/wimlib/lz_optimal.h delete mode 100644 include/wimlib/lz_sarray.h delete mode 100644 src/divsufsort.c create mode 100644 src/lz_bt.c delete mode 100644 src/lz_sarray.c diff --git a/Makefile.am b/Makefile.am index a7304ce5..b3abad47 100644 --- a/Makefile.am +++ b/Makefile.am @@ -29,7 +29,6 @@ libwim_la_SOURCES = \ src/decompress_common.c \ src/delete_image.c \ src/dentry.c \ - src/divsufsort.c \ src/encoding.c \ src/export_image.c \ src/extract.c \ @@ -44,8 +43,8 @@ libwim_la_SOURCES = \ src/lzms-common.c \ src/lzms-compress.c \ src/lzms-decompress.c \ + src/lz_bt.c \ src/lz_hash.c \ - src/lz_sarray.c \ src/lzx-common.c \ src/lzx-compress.c \ src/lzx-decompress.c \ @@ -85,7 +84,6 @@ libwim_la_SOURCES = \ include/wimlib/decompressor_ops.h \ include/wimlib/decompress_common.h \ include/wimlib/dentry.h \ - include/wimlib/divsufsort.h \ include/wimlib/encoding.h \ include/wimlib/endianness.h \ include/wimlib/error.h \ @@ -98,9 +96,8 @@ libwim_la_SOURCES = \ include/wimlib/list.h \ include/wimlib/lookup_table.h \ include/wimlib/lz.h \ + include/wimlib/lz_bt.h \ include/wimlib/lz_hash.h \ - include/wimlib/lz_optimal.h \ - include/wimlib/lz_sarray.h \ include/wimlib/lzms.h \ include/wimlib/lzx.h \ include/wimlib/metadata.h \ diff --git a/README b/README index a242e7f6..5d70d962 100644 --- a/README +++ b/README @@ -330,17 +330,16 @@ used by recent versions of Windows). See http://www.tuxera.com/community/ntfs-3g-download/ for more information. The LZX decompressor (lzx-decompress.c) was originally based on code from the -cabextract project (http://www.cabextract.org.uk) but has been rewritten. +cabextract project (http://www.cabextract.org.uk). The LZX compressor +(lzx-compress.c) was originally based on code written by Matthew Russotto +(www.russotto.net/chm/). However I have since rewritten and made many +improvements to both the decompressor and compressor. 
-The LZX compressor (lzx-compress.c) was originally based on code written by -Matthew Russotto (www.russotto.net/chm/) but has been rewritten. It now uses -suffix array construction code from divsufsort -(https://code.google.com/p/libdivsufsort/) and algorithms from 7-Zip as well as -several published papers. +lz_hash.c contains LZ77 match-finding code that uses hash chains. It is based +on code from zlib but I have since rewritten it. -lz_hash.c contains a hash-table-based LZ77 matchfinder that is based on code -from zlib but has been rewritten. This code is applicable to XPRESS, LZX, and -LZMS, all of which are partly based on LZ77 compression. +lz_bt.c contains LZ77 match-finding code that uses binary trees. It is based on +code from liblzma but I have since rewritten it. A limited number of other free programs can handle some parts of the WIM file format: @@ -349,8 +348,8 @@ file format: other archive formats). However, wimlib is designed specifically to handle WIM files and provides features previously only available in Microsoft's implementation, such as the ability to mount WIMs read-write as well as - read-only, the ability to create LZX or XPRESS compressed WIMs, and the - correct handling of security descriptors and hard links. + read-only, the ability to create compressed WIMs, and the correct handling + of security descriptors and hard links. * ImagePyX (https://github.com/maxpat78/ImagePyX) is a Python program that provides similar capabilities to wimlib-imagex. One thing to note, though, is that it does not support compression and decompression by itself, but diff --git a/doc/man1/imagex-capture.1.in b/doc/man1/imagex-capture.1.in index a1b6ff2b..8ce01979 100644 --- a/doc/man1/imagex-capture.1.in +++ b/doc/man1/imagex-capture.1.in @@ -244,19 +244,19 @@ option use a different version number in their header and are only compatible with WIMGAPI Windows 8 and later, and DISM Windows 8.1 and later. .IP "" The default compression type and chunk size in packed resources is LZMS with -2^25 (33554432) byte chunks. This is independent of the WIM's main compression +2^26 (67108864) byte chunks. This is independent of the WIM's main compression type and chunk size. .TP \fB--pack-chunk-size\fR=\fISIZE\fR, \fB--solid-chunk-size\fR=\fISIZE\fR Like \fB--chunk-size\fR, but set the chunk size used in packed resources. The -default is LZMS compression with 2^25 (33554432) byte chunks. This option only +default is LZMS compression with 2^26 (67108864) byte chunks. This option only has an effect when \fB--pack-streams\fR is also specified. For maximum compatibility with the Microsoft implementation, do not use either of these options. .TP \fB--pack-compress\fR=\fITYPE\fR, \fB--solid-compress\fR=\fITYPE\fR Like \fB--compress\fR, but set the compression format used in packed resources. -The default is LZMS compression with 2^25 (33554432) byte chunks. This option +The default is LZMS compression with 2^26 (67108864) byte chunks. This option only has an effect when \fB--pack-streams\fR is also specified. For maximum compatibility with the Microsoft implementation, do not use either of these options. diff --git a/include/wimlib.h b/include/wimlib.h index da8d4262..7d39ba64 100644 --- a/include/wimlib.h +++ b/include/wimlib.h @@ -1874,7 +1874,7 @@ typedef int (*wimlib_iterate_lookup_table_callback_t)(const struct wimlib_resour * all streams recompressed in solid mode. * * Currently, new solid blocks will, by default, be written using LZMS - * compression with 32 MiB (33554432 byte) chunks. 
Use + * compression with 64 MiB (67108864 byte) chunks. Use * wimlib_set_output_pack_compression_type() and/or * wimlib_set_output_pack_chunk_size() to change this. This is independent of * the WIM's main compression type and chunk size; you can have a WIM that @@ -4189,11 +4189,11 @@ struct wimlib_lzx_compressor_params { struct wimlib_compressor_params_header hdr; /** Relatively fast LZX compression algorithm with a decent compression - * ratio; the suggested default. */ + * ratio. */ #define WIMLIB_LZX_ALGORITHM_FAST 0 /** Slower LZX compression algorithm that provides a better compression - * ratio. */ + * ratio. This is the default. */ #define WIMLIB_LZX_ALGORITHM_SLOW 1 /** Algorithm to use to perform the compression: either @@ -4212,10 +4212,10 @@ struct wimlib_lzx_compressor_params { uint32_t fast_reserved1[10]; } fast; - /** Parameters for the slow algorithm. */ + /** Parameters for the "slow" algorithm. */ struct wimlib_lzx_slow_params { /** If set to 1, the compressor can output length 2 - * matches. If set 0, the compressor only outputs + * matches. If set 0, the compressor can only output * matches of length 3 or greater. Suggested value: 1 */ uint32_t use_len2_matches : 1; @@ -4243,11 +4243,10 @@ struct wimlib_lzx_compressor_params { * position. Suggested value: 50. */ uint32_t max_search_depth; - /** Maximum number of potentially good matches to - * consider for each position. Suggested value: 3. */ - uint32_t max_matches_per_pos; + /* Note: max_matches_per_pos has been removed and no + * longer has any effect. */ - uint32_t slow_reserved2[2]; + uint32_t slow_reserved2[3]; /** Assumed cost of a main symbol with zero frequency. * Must be at least 1 and no more than 16. Suggested @@ -4294,9 +4293,10 @@ struct wimlib_lzms_compressor_params { * value: 50. */ uint32_t max_search_depth; - /** Maximum number of potentially good matches to consider at each - * position. Suggested value: 3. */ - uint32_t max_matches_per_pos; + /* Note: max_matches_per_pos has been removed and no longer has any + * effect. */ + + uint32_t reserved1; /** Length of the array for the near-optimal LZ parsing algorithm. This * must be at least 1. Suggested value: 1024. */ diff --git a/include/wimlib/divsufsort.h b/include/wimlib/divsufsort.h deleted file mode 100644 index b5c1c924..00000000 --- a/include/wimlib/divsufsort.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * divsufsort.h for libdivsufsort-lite - * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef _DIVSUFSORT_H -#define _DIVSUFSORT_H 1 - -/*- Prototypes -*/ - -/** - * Constructs the suffix array of a given string. - * @param T[0..n-1] The input string. - * @param SA[0..n-1] The output array of suffixes. - * @param n The length of the given string. - * @param bucket_A Temporary array with 256 entries - * @param bucket_B Temporary array with 65536 entries - */ -extern void -divsufsort(const unsigned char *T, int *SA, int n, - int *bucket_A, int *bucket_B); - -#define DIVSUFSORT_TMP1_SIZE (256 * sizeof(saidx_t)) /* bucket_A */ -#define DIVSUFSORT_TMP2_SIZE (256 * 256 * sizeof(saidx_t)) /* bucket_B */ - -typedef int saidx_t; - -#endif /* _DIVSUFSORT_H */ diff --git a/include/wimlib/lz_bt.h b/include/wimlib/lz_bt.h new file mode 100644 index 00000000..cc9ed5d3 --- /dev/null +++ b/include/wimlib/lz_bt.h @@ -0,0 +1,83 @@ +/* + * lz_bt.h + * + * Binary tree match-finder for Lempel-Ziv compression. + * + * Author: Eric Biggers + * Year: 2014 + * + * The author hereby releases this file into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef _WIMLIB_LZ_BT_H +#define _WIMLIB_LZ_BT_H + +#include "wimlib/types.h" + +/* Position type for the binary tree match-finder. + * This can be changed to 'u16' if no window will exceed 65536 bytes. */ +typedef u32 lz_bt_pos_t; + +/* Match length type for the binary tree match-finder. */ +typedef unsigned lz_bt_len_t; + +/* The binary tree match-finder structure. */ +struct lz_bt { + lz_bt_pos_t *hash_tab; + lz_bt_pos_t *digram_tab; + lz_bt_pos_t *child_tab; + const u8 *cur_window; + lz_bt_pos_t cur_window_pos; + lz_bt_pos_t cur_window_size; + lz_bt_pos_t max_window_size; + lz_bt_len_t min_match_len; + lz_bt_len_t max_match_len; + lz_bt_len_t num_fast_bytes; + u32 max_search_depth; +}; + +struct raw_match; + +extern u64 +lz_bt_get_needed_memory(lz_bt_pos_t max_window_size); + +extern bool +lz_bt_init(struct lz_bt *mf, + lz_bt_pos_t max_window_size, + lz_bt_len_t min_match_len, + lz_bt_len_t max_match_len, + lz_bt_len_t num_fast_bytes, + u32 max_search_depth); + +extern void +lz_bt_load_window(struct lz_bt *mf, const u8 *window, lz_bt_pos_t window_size); + +extern lz_bt_len_t +lz_bt_get_matches(struct lz_bt *mf, struct raw_match *matches); + +static inline lz_bt_pos_t +lz_bt_get_position(const struct lz_bt *mf) +{ + return mf->cur_window_pos; +} + +static inline const u8 * +lz_bt_get_window_ptr(const struct lz_bt *mf) +{ + return &mf->cur_window[mf->cur_window_pos]; +} + +static inline lz_bt_pos_t +lz_bt_get_remaining_size(const struct lz_bt *mf) +{ + return mf->cur_window_size - mf->cur_window_pos; +} + +extern void +lz_bt_skip_positions(struct lz_bt *mf, unsigned n); + +extern void +lz_bt_destroy(struct lz_bt *mf); + +#endif /* _WIMLIB_LZ_BT_H */ diff --git a/include/wimlib/lz_optimal.h b/include/wimlib/lz_optimal.h deleted file mode 100644 index 1e841986..00000000 --- a/include/wimlib/lz_optimal.h +++ /dev/null @@ -1,551 +0,0 @@ -/* - * lz_optimal.h - * - * Near-optimal LZ (Lempel-Ziv) parsing, or "match choosing". - * See lz_get_near_optimal_match() for details of the algorithm. - * - * This code is not concerned with actually *finding* LZ matches, as it relies - * on an underlying match-finder implementation that can do so. 
- */ - -/* - * Copyright (c) 2013 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Define the following structures before including this header: - * - * LZ_COMPRESSOR - * LZ_ADAPTIVE_STATE - * - * Also, the type lz_mc_cost_t can be optionally overridden by providing an - * appropriate typedef and defining LZ_MC_COST_T_DEFINED. */ - -#ifndef _LZ_OPTIMAL_H -#define _LZ_OPTIMAL_H - -#include "wimlib/lz.h" - -#ifndef LZ_MC_COST_T_DEFINED - typedef input_idx_t lz_mc_cost_t; -#endif - -#define LZ_MC_INFINITE_COST (~(lz_mc_cost_t)0) - -struct lz_mc_pos_data; - -/* State of the Lempel-Ziv match-chooser. - * - * This is defined here for benefit of the inlined code. It's not intended for - * code outside the match-chooser itself to read or write members from this - * structure. */ -struct lz_match_chooser { - /* Temporary space used for the match-choosing algorithm. The size of - * this array must be at least one more than @nice_len but otherwise is - * arbitrary. More space decreases the frequency at which the algorithm - * is forced to terminate early. 4096 spaces seems sufficient for most - * real data. */ - struct lz_mc_pos_data *optimum; - input_idx_t array_space; - - /* When a match with length greater than or equal to this length is - * found, choose it immediately without further consideration. */ - input_idx_t nice_len; - - /* When matches have been chosen, optimum_cur_idx is set to the position - * in the window of the next match/literal to return and optimum_end_idx - * is set to the position in the window at the end of the last - * match/literal to return. */ - input_idx_t optimum_cur_idx; - input_idx_t optimum_end_idx; -}; - -/* - * Match chooser position data: - * - * An array of these structures is used during the match-choosing algorithm. - * They correspond to consecutive positions in the window and are used to keep - * track of the cost to reach each position, and the match/literal choices that - * need to be chosen to reach that position. - */ -struct lz_mc_pos_data { - /* The approximate minimum cost, in bits, to reach this position in the - * window which has been found so far. 
*/ - lz_mc_cost_t cost; - - /* The union here is just for clarity, since the fields are used in two - * slightly different ways. Initially, the @prev structure is filled in - * first, and links go from later in the window to earlier in the - * window. Later, @next structure is filled in and links go from - * earlier in the window to later in the window. */ - union { - struct { - /* Position of the start of the match or literal that - * was taken to get to this position in the approximate - * minimum-cost parse. */ - input_idx_t link; - - /* Offset (as in an LZ (length, offset) pair) of the - * match or literal that was taken to get to this - * position in the approximate minimum-cost parse. */ - input_idx_t match_offset; - } prev; - struct { - /* Position at which the match or literal starting at - * this position ends in the minimum-cost parse. */ - input_idx_t link; - - /* Offset (as in an LZ (length, offset) pair) of the - * match or literal starting at this position in the - * approximate minimum-cost parse. */ - input_idx_t match_offset; - } next; - }; - - /* Format-dependent adaptive state that exists after an approximate - * minimum-cost path to reach this position is taken. For example, for - * LZX this is the list of recently used match offsets. If the format - * does not have any adaptive state that affects match costs, - * LZ_ADAPTIVE_STATE could be set to a dummy structure of size 0. */ - LZ_ADAPTIVE_STATE state; -}; - -/* Initialize the match-chooser. - * - * After calling this, multiple data buffers can be scanned with it if each is - * preceded with a call to lz_match_chooser_begin(). */ -static bool -lz_match_chooser_init(struct lz_match_chooser *mc, - input_idx_t array_space, - input_idx_t nice_len, input_idx_t max_match_len) -{ - input_idx_t extra_len = min(nice_len, max_match_len); - - LZ_ASSERT(array_space > 0); - mc->optimum = MALLOC((array_space + extra_len) * sizeof(mc->optimum[0])); - if (mc->optimum == NULL) - return false; - mc->array_space = array_space; - mc->nice_len = nice_len; - return true; -} - -static inline u64 -lz_match_chooser_get_needed_memory(input_idx_t array_space, - input_idx_t nice_len, - input_idx_t max_match_len) -{ - input_idx_t extra_len = min(nice_len, max_match_len); - return ((u64)(array_space + extra_len) * - sizeof(((struct lz_match_chooser*)0)->optimum[0])); -} - -/* Free memory allocated in lz_match_chooser_init(). */ -static void -lz_match_chooser_destroy(struct lz_match_chooser *mc) -{ - FREE(mc->optimum); -} - -/* Call this before starting to parse each new input string. */ -static void -lz_match_chooser_begin(struct lz_match_chooser *mc) -{ - mc->optimum_cur_idx = 0; - mc->optimum_end_idx = 0; -} - -/* - * Reverse the linked list of near-optimal matches so that they can be returned - * in forwards order. - * - * Returns the first match in the list. 
- */ -static _always_inline_attribute struct raw_match -lz_match_chooser_reverse_list(struct lz_match_chooser *mc, input_idx_t cur_pos) -{ - unsigned prev_link, saved_prev_link; - unsigned prev_match_offset, saved_prev_match_offset; - - mc->optimum_end_idx = cur_pos; - - saved_prev_link = mc->optimum[cur_pos].prev.link; - saved_prev_match_offset = mc->optimum[cur_pos].prev.match_offset; - - do { - prev_link = saved_prev_link; - prev_match_offset = saved_prev_match_offset; - - saved_prev_link = mc->optimum[prev_link].prev.link; - saved_prev_match_offset = mc->optimum[prev_link].prev.match_offset; - - mc->optimum[prev_link].next.link = cur_pos; - mc->optimum[prev_link].next.match_offset = prev_match_offset; - - cur_pos = prev_link; - } while (cur_pos != 0); - - mc->optimum_cur_idx = mc->optimum[0].next.link; - - return (struct raw_match) - { .len = mc->optimum_cur_idx, - .offset = mc->optimum[0].next.match_offset, - }; -} - -/* Format-specific functions inlined into lz_get_near_optimal_match(). */ - -/* Get the list of possible matches at the next position. The return value must - * be the number of matches found (which may be 0) and a pointer to the returned - * matches must be written into @matches_ret. Matches must be of distinct - * lengths and sorted in decreasing order by length. Furthermore, match lengths - * may not exceed the @max_match_len passed to lz_match_chooser_init(), and all - * match lengths must be at least 2. */ -typedef u32 (*lz_get_matches_t)(LZ_COMPRESSOR *ctx, - const LZ_ADAPTIVE_STATE *state, - struct raw_match **matches_ret); - -/* Skip the specified number of bytes (don't search for matches at them). This - * is expected to be faster than simply getting the matches at each position, - * but the exact performance difference will be dependent on the match-finder - * implementation. */ -typedef void (*lz_skip_bytes_t)(LZ_COMPRESSOR *ctx, input_idx_t n); - -/* Get the cost of the literal located at the position at which matches have - * most recently been searched. This can optionally update the @state to take - * into account format-dependent state that affects match costs, such as repeat - * offsets. */ -typedef lz_mc_cost_t (*lz_get_prev_literal_cost_t)(LZ_COMPRESSOR *ctx, - LZ_ADAPTIVE_STATE *state); - -/* Get the cost of a match. This can optionally update the @state to take into - * account format-dependent state that affects match costs, such as repeat - * offsets. */ -typedef lz_mc_cost_t (*lz_get_match_cost_t)(LZ_COMPRESSOR *ctx, - LZ_ADAPTIVE_STATE *state, - input_idx_t length, - input_idx_t offset); - -/* - * lz_get_near_optimal_match() - - * - * Choose an approximately optimal match or literal to use at the next position - * in the string, or "window", being LZ-encoded. - * - * This is based on the algorithm used in 7-Zip's DEFLATE encoder, written by - * Igor Pavlov. However it also attempts to account for adaptive state, such as - * an LRU queue of recent match offsets. - * - * Unlike a greedy parser that always takes the longest match, or even a "lazy" - * parser with one match/literal look-ahead like zlib, the algorithm used here - * may look ahead many matches/literals to determine the approximately optimal - * match/literal to code next. The motivation is that the compression ratio is - * improved if the compressor can do things like use a shorter-than-possible - * match in order to allow a longer match later, and also take into account the - * estimated real cost of coding each match/literal based on the underlying - * entropy encoding. 
- * - * Still, this is not a true optimal parser for several reasons: - * - * - Very long matches (at least @nice_len) are taken immediately. This is - * because locations with long matches are likely to have many possible - * alternatives that would cause slow optimal parsing, but also such locations - * are already highly compressible so it is not too harmful to just grab the - * longest match. - * - * - Not all possible matches at each location are considered. Users of this - * code are expected to provide a @get_matches() function that returns a list - * of potentially good matches at the current position, but no more than one - * per length. It therefore must use some sort of heuristic (e.g. smallest or - * repeat offset) to choose a good match to consider for a given length, if - * multiple exist. Furthermore, the @get_matches() implementation may limit - * the total number of matches returned and/or the number of computational - * steps spent searching for matches at each position. - * - * - This function relies on the user-provided @get_match_cost() and - * @get_prev_literal_cost() functions to evaluate match and literal costs, - * respectively, but real compression formats use entropy encoding of the - * literal/match sequence, so the real cost of coding each match or literal is - * unknown until the parse is fully determined. It can be approximated based - * on iterative parses, but the end result is not guaranteed to be globally - * optimal. - * - * - Although this function allows @get_match_cost() and - * @get_prev_literal_cost() to take into account adaptive state, coding - * decisions made with respect to the adaptive state will be locally optimal - * but will not necessarily be globally optimal. This is because the - * algorithm only keeps the least-costly path to get to a given location and - * does not take into account that a slightly more costly path could result in - * a different adaptive state that ultimately results in a lower global cost. - * - * - The array space used by this function is bounded, so in degenerate cases it - * is forced to start returning matches/literals before the algorithm has - * really finished. - * - * Each call to this function does one of two things: - * - * 1. Build a sequence of near-optimal matches/literals, up to some point, that - * will be returned by subsequent calls to this function, then return the - * first one. - * - * OR - * - * 2. Return the next match/literal previously computed by a call to this - * function. - * - * The return value is a (length, offset) pair specifying the match or literal - * chosen. For literals, the length is 0 or 1 and the offset is meaningless. - * - * NOTE: this code has been factored out of the LZX compressor so that it can be - * shared by other formats such as LZMS. It is inlined so there is no loss of - * performance, especially with the different implementations of match-finding, - * cost evaluation, and adaptive state. - */ -static _always_inline_attribute struct raw_match -lz_get_near_optimal_match(struct lz_match_chooser *mc, - lz_get_matches_t get_matches, - lz_skip_bytes_t skip_bytes, - lz_get_prev_literal_cost_t get_prev_literal_cost, - lz_get_match_cost_t get_match_cost, - LZ_COMPRESSOR *ctx, - const LZ_ADAPTIVE_STATE *initial_state) -{ - u32 num_possible_matches; - struct raw_match *possible_matches; - struct raw_match match; - input_idx_t longest_match_len; - - if (mc->optimum_cur_idx != mc->optimum_end_idx) { - /* Case 2: Return the next match/literal already found. 
*/ - match.len = mc->optimum[mc->optimum_cur_idx].next.link - - mc->optimum_cur_idx; - match.offset = mc->optimum[mc->optimum_cur_idx].next.match_offset; - - mc->optimum_cur_idx = mc->optimum[mc->optimum_cur_idx].next.link; - return match; - } - - /* Case 1: Compute a new list of matches/literals to return. */ - - mc->optimum_cur_idx = 0; - mc->optimum_end_idx = 0; - - /* Get matches at this position. */ - num_possible_matches = (*get_matches)(ctx, - initial_state, - &possible_matches); - - /* If no matches found, return literal. */ - if (num_possible_matches == 0) - return (struct raw_match){ .len = 0 }; - - /* The matches that were found are sorted in decreasing order by length. - * Get the length of the longest one. */ - longest_match_len = possible_matches[0].len; - - /* Greedy heuristic: if the longest match that was found is greater - * than nice_len, return it immediately; don't both doing more work. */ - if (longest_match_len >= mc->nice_len) { - (*skip_bytes)(ctx, longest_match_len - 1); - return possible_matches[0]; - } - - /* Calculate the cost to reach the next position by coding a literal. - */ - mc->optimum[1].state = *initial_state; - mc->optimum[1].cost = (*get_prev_literal_cost)(ctx, &mc->optimum[1].state); - mc->optimum[1].prev.link = 0; - - /* Calculate the cost to reach any position up to and including that - * reached by the longest match. Use the shortest available match that - * reaches each position, assuming that @get_matches() only returned - * shorter matches because their estimated costs were less than that of - * the longest match. */ - for (input_idx_t len = 2, match_idx = num_possible_matches - 1; - len <= longest_match_len; len++) - { - - LZ_ASSERT(match_idx < num_possible_matches); - LZ_ASSERT(len <= possible_matches[match_idx].len); - - mc->optimum[len].state = *initial_state; - mc->optimum[len].prev.link = 0; - mc->optimum[len].prev.match_offset = possible_matches[match_idx].offset; - mc->optimum[len].cost = (*get_match_cost)(ctx, - &mc->optimum[len].state, - len, - possible_matches[match_idx].offset); - if (len == possible_matches[match_idx].len) - match_idx--; - } - - /* Step forward, calculating the estimated minimum cost to reach each - * position. The algorithm may find multiple paths to reach each - * position; only the lowest-cost path is saved. - * - * The progress of the parse is tracked in the @mc->optimum array, which - * for each position contains the minimum cost to reach that position, - * the index of the start of the match/literal taken to reach that - * position through the minimum-cost path, the offset of the match taken - * (not relevant for literals), and the adaptive state that will exist - * at that position after the minimum-cost path is taken. The @cur_pos - * variable stores the position at which the algorithm is currently - * considering coding choices, and the @len_end variable stores the - * greatest position at which the costs of coding choices have been - * saved. (Actually, the algorithm guarantees that all positions up to - * and including @len_end are reachable by at least one path.) - * - * The loop terminates when any one of the following conditions occurs: - * - * 1. A match with length greater than or equal to @nice_len is found. - * When this occurs, the algorithm chooses this match - * unconditionally, and consequently the near-optimal match/literal - * sequence up to and including that match is fully determined and it - * can begin returning the match/literal list. - * - * 2. 
@cur_pos reaches a position not overlapped by a preceding match. - * In such cases, the near-optimal match/literal sequence up to - * @cur_pos is fully determined and it can begin returning the - * match/literal list. - * - * 3. Failing either of the above in a degenerate case, the loop - * terminates when space in the @mc->optimum array is exhausted. - * This terminates the algorithm and forces it to start returning - * matches/literals even though they may not be globally optimal. - * - * Upon loop termination, a nonempty list of matches/literals will have - * been produced and stored in the @optimum array. These - * matches/literals are linked in reverse order, so the last thing this - * function does is reverse this list and return the first - * match/literal, leaving the rest to be returned immediately by - * subsequent calls to this function. - */ - input_idx_t cur_pos = 0; - input_idx_t len_end = longest_match_len; - for (;;) { - /* Advance to next position. */ - cur_pos++; - - /* Check termination conditions (2) and (3) noted above. */ - if (cur_pos == len_end || cur_pos == mc->array_space) - return lz_match_chooser_reverse_list(mc, cur_pos); - - /* Retrieve a (possibly empty) list of potentially useful - * matches available at this position. */ - num_possible_matches = (*get_matches)(ctx, - &mc->optimum[cur_pos].state, - &possible_matches); - - if (num_possible_matches == 0) - longest_match_len = 0; - else - longest_match_len = possible_matches[0].len; - - /* Greedy heuristic and termination condition (1) noted above: - * if we found a match greater than @nice_len, choose it - * unconditionally and begin returning matches/literals. */ - if (longest_match_len >= mc->nice_len) { - /* Build the list of matches to return and get - * the first one. */ - match = lz_match_chooser_reverse_list(mc, cur_pos); - - /* Append the long match to the end of the list. */ - mc->optimum[cur_pos].next.match_offset = - possible_matches[0].offset; - mc->optimum[cur_pos].next.link = cur_pos + longest_match_len; - mc->optimum_end_idx = cur_pos + longest_match_len; - - /* Skip over the remaining bytes of the long match. */ - (*skip_bytes)(ctx, longest_match_len - 1); - - /* Return first match in the list. */ - return match; - } - - /* Load minimum cost to reach the current position. */ - input_idx_t cur_cost = mc->optimum[cur_pos].cost; - - /* Consider proceeding with a literal byte. */ - { - LZ_ADAPTIVE_STATE state; - lz_mc_cost_t cost; - - state = mc->optimum[cur_pos].state; - cost = cur_cost + (*get_prev_literal_cost)(ctx, &state); - - if (cost < mc->optimum[cur_pos + 1].cost) { - mc->optimum[cur_pos + 1].cost = cost; - mc->optimum[cur_pos + 1].prev.link = cur_pos; - mc->optimum[cur_pos + 1].state = state; - } - } - - /* If no matches were found, continue to the next position. - * Otherwise, consider proceeding with a match. */ - - if (num_possible_matches == 0) - continue; - - /* Initialize any uninitialized costs up to the length of the - * longest match found. */ - while (len_end < cur_pos + longest_match_len) - mc->optimum[++len_end].cost = LZ_MC_INFINITE_COST; - - /* Calculate the minimum cost to reach any position up to and - * including that reached by the longest match. Use the - * shortest available match that reaches each position, assuming - * that @get_matches() only returned shorter matches because - * their estimated costs were less than that of the longest - * match. 
*/ - for (input_idx_t len = 2, match_idx = num_possible_matches - 1; - len <= longest_match_len; len++) - { - LZ_ASSERT(match_idx < num_possible_matches); - LZ_ASSERT(len <= possible_matches[match_idx].len); - - LZ_ADAPTIVE_STATE state; - lz_mc_cost_t cost; - - state = mc->optimum[cur_pos].state; - cost = cur_cost + (*get_match_cost)(ctx, - &state, - len, - possible_matches[match_idx].offset); - - if (cost < mc->optimum[cur_pos + len].cost) { - mc->optimum[cur_pos + len].cost = cost; - mc->optimum[cur_pos + len].prev.link = cur_pos; - mc->optimum[cur_pos + len].prev.match_offset = - possible_matches[match_idx].offset; - mc->optimum[cur_pos + len].state = state; - } - - if (len == possible_matches[match_idx].len) - match_idx--; - } - } -} - -#endif /* _LZ_OPTIMAL_H */ diff --git a/include/wimlib/lz_sarray.h b/include/wimlib/lz_sarray.h deleted file mode 100644 index e609d0c2..00000000 --- a/include/wimlib/lz_sarray.h +++ /dev/null @@ -1,516 +0,0 @@ -/* - * lz_sarray.h - * - * Suffix array match-finder for Lempel-Ziv compression. - */ - -/* - * Copyright (c) 2013, 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _WIMLIB_LZ_SARRAY_H -#define _WIMLIB_LZ_SARRAY_H - -#include "wimlib/compiler.h" /* must define '_always_inline_attribute', - 'likely()', and 'prefetch()'. */ -#include "wimlib/lz.h" /* must define 'struct raw_match' and LZ_ASSERT() */ -#include "wimlib/types.h" /* must define 'bool', 'u8', 'u16, and 'u32' */ - -struct salink; - -/* Position type --- must be an unsigned type large enough to hold the length of - * longest window for which the suffix array match-finder will be used. */ -typedef u32 lz_sarray_pos_t; - -/* Length type --- must be an unsigned type large enough to hold the maximum - * match length. */ -typedef u16 lz_sarray_len_t; - -/* Cost type, for the user-provided match cost evaluation function. */ -typedef lz_sarray_pos_t lz_sarray_cost_t; - -/* Type of distances in suffix array links. A larger type would allow skipping - * irrelevant suffixes more quickly, which is especially helpful towards the - * start of the window. 
However, even a single byte allows skipping 255 at a - * time, which where it matters is already a big improvement over the - * alternative of searching the suffixes consecutively. */ -typedef u8 lz_sarray_delta_t; - -#define LZ_SARRAY_LEN_MAX ((lz_sarray_len_t)~0UL) -#define LZ_SARRAY_POS_MAX ((lz_sarray_pos_t)~0UL) -#define LZ_SARRAY_DELTA_MAX ((lz_sarray_delta_t)~0UL) -#define LZ_SARRAY_INFINITE_COST ((lz_sarray_cost_t)~0UL) - -/* State of the suffix array LZ (Lempel-Ziv) match-finder. - * - * This is defined here for benefit of the inlined code. It's not intended for - * code outside the match-finder itself to read or write members from this - * structure. */ -struct lz_sarray { - /* Allocated window size for the match-finder. - * - * Note: this match-finder does not store the window itself, as the - * suffix array (@SA) and associated arrays (@ISA, @LCP, @salink) are - * sufficient to find matches. This number is the maximum length of - * those arrays, or also the maximum window (block) size that can be - * passed to lz_sarray_load_window(). */ - lz_sarray_pos_t max_window_size; - - /* Minimum length of matches to return. */ - lz_sarray_len_t min_match_len; - - /* Maximum length of matches to return. */ - lz_sarray_len_t max_match_len; - - /* Maximum matches to consider at each position (max search depth). */ - u32 max_matches_to_consider; - - /* Maximum number of matches to return at each position. */ - u32 max_matches_to_return; - - /* Current position in the window. */ - lz_sarray_pos_t cur_pos; - - /* Current window size. */ - lz_sarray_pos_t window_size; - - /* Suffix array for the current window. - * This is a mapping from suffix rank to suffix position. */ - lz_sarray_pos_t *SA; - - /* Inverse suffix array for the current window. - * This is a mapping from suffix position to suffix rank. - * If 0 <= r < window_size, then ISA[SA[r]] == r. */ - lz_sarray_pos_t *ISA; - - /* Suffix array links. - * - * During a linear scan of the input string to find matches, this array - * used to keep track of which rank suffixes in the suffix array appear - * before the current position. Instead of searching in the original - * suffix array, scans for matches at a given position traverse a linked - * list containing (usually) only suffixes that appear before that - * position. */ - struct salink *salink; -}; - -/* Suffix array link. An array of these structures, one per suffix rank, is - * used as a replacement for the raw LCP (Longest Common Prefix) array to allow - * skipping over suffixes that appear later in the window and hence cannot be - * used as LZ77 matches. */ -struct salink { - union { - /* Temporary fields used while this structure is being - * initialized. - * - * Note: we want the entire `struct salink' to be only 6 bytes, - * even though this makes "next_initial" unaligned. */ - struct { - lz_sarray_pos_t next_initial; - lz_sarray_len_t lcpnext_initial; - } _packed_attribute; - - struct { - /* Intially, the length, in bytes, of the longest common - * prefix (LCP) between the suffix having this rank and - * the suffix with the smallest larger rank that - * starts earlier in the window than the suffix having - * this rank. If no such suffix exists, this will be 0. - * - * Later, during match-finding, after the corresponding - * suffix has entered the LZ77 dictionary, this value - * may be updated by lz_sarray_update_salink() to refer - * instead to a lexicographically closer (but still - * larger) suffix that begins at a later position that - * has entered the LZ77 dictionary. 
*/ - lz_sarray_len_t lcpnext; - - /* Initially, the length, in bytes, of the longest - * common prefix (LCP) between the suffix having this - * rank and the suffix with the largest smaller rank - * that starts earlier in the window than the suffix - * having this rank. If no such suffix exists, this - * will be 0. - * - * Later, during match-finding, after the corresponding - * suffix has entered the LZ77 dictionary, this value - * may be updated by lz_sarray_update_salink() to refer - * instead to a lexicographically closer (but still - * smaller) suffix that begins at a later position that - * has entered the LZ77 dictionary. */ - lz_sarray_len_t lcpprev; - - /* Distance to the suffix referred to in the description - * of "lcpnext" above, but capped to a maximum value to - * save memory; or, 0 if no such suffix exists. If the - * true distance was truncated, this will give the - * distance to the rank of a suffix that is - * lexicographically closer to the current suffix than - * the desired suffix, but appears *later* in the window - * and hence cannot be used as the basis for an LZ77 - * match. */ - lz_sarray_delta_t dist_to_next; - - /* Distance to the suffix referred to in the description - * of "lcpprev" above, but capped to a maximum value to - * save memory; or, 0 if no such suffix exists. If the - * true distance was truncated, this will give the - * distance to the rank of a suffix that is - * lexicographically closer to the current suffix than - * the desired suffix, but appears *later* in the window - * and hence cannot be used as the basis for an LZ77 - * match. */ - lz_sarray_delta_t dist_to_prev; - }; - }; -}; - - -/*-----------------------------------*/ -/* Functions defined in lz_sarray.c */ -/*-----------------------------------*/ - -extern bool -lz_sarray_init(struct lz_sarray *mf, - lz_sarray_pos_t max_window_size, - lz_sarray_len_t min_match_len, - lz_sarray_len_t max_match_len, - u32 max_matches_to_consider, - u32 max_matches_to_return); - -extern u64 -lz_sarray_get_needed_memory(lz_sarray_pos_t max_window_size); - -extern void -lz_sarray_destroy(struct lz_sarray *mf); - -extern void -lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], lz_sarray_pos_t n); - -/*-------------------*/ -/* Inline functions */ -/*-------------------*/ - -static _always_inline_attribute lz_sarray_pos_t -lz_sarray_get_pos(const struct lz_sarray *mf) -{ - return mf->cur_pos; -} - -/* Advance the suffix array match-finder to the next position. */ -static _always_inline_attribute void -lz_sarray_update_salink(const lz_sarray_pos_t r, struct salink link[]) -{ - const lz_sarray_pos_t next = r + link[r].dist_to_next; - const lz_sarray_pos_t prev = r - link[r].dist_to_prev; - - if (next != r && link[r].dist_to_next < link[next].dist_to_prev) { - link[next].dist_to_prev = link[r].dist_to_next; - link[next].lcpprev = link[r].lcpnext; - } - - if (prev != r && link[r].dist_to_prev < link[prev].dist_to_next) { - link[prev].dist_to_next = link[r].dist_to_prev; - link[prev].lcpnext = link[r].lcpprev; - } -} - -/* Skip the current position in the suffix array match-finder. */ -static _always_inline_attribute void -lz_sarray_skip_position(struct lz_sarray *mf) -{ - LZ_ASSERT(mf->cur_pos < mf->window_size); - lz_sarray_update_salink(mf->ISA[mf->cur_pos++], mf->salink); -} - -/* - * Use the suffix array match-finder to retrieve a list of matches at the - * current position. - * - * Returns the number of matches written into @matches. 
The matches are - * returned in decreasing order by length, and each will be of unique length - * between the minimum and maximum match lengths (inclusively) passed to - * lz_sarray_init(). Up to @max_matches_to_return (passed to lz_sarray_init()) - * matches will be returned. - * - * @eval_match_cost is a function for evaluating the cost of a match when - * deciding which ones to return. It needs to be fast, and need not be exact; - * an implementation might simply rank matches by their offset, for example, - * although implementations may choose to take into account additional - * information such as repeat offsets. - */ -static _always_inline_attribute u32 -lz_sarray_get_matches(struct lz_sarray *mf, - struct raw_match matches[], - lz_sarray_cost_t (*eval_match_cost) - (lz_sarray_pos_t length, - lz_sarray_pos_t offset, - const void *ctx), - const void *eval_match_cost_ctx) -{ - LZ_ASSERT(mf->cur_pos < mf->window_size); - const lz_sarray_pos_t i = mf->cur_pos++; - - const lz_sarray_pos_t * const restrict SA = mf->SA; - const lz_sarray_pos_t * const restrict ISA = mf->ISA; - struct salink * const restrict link = mf->salink; - const u32 max_matches_to_consider = mf->max_matches_to_consider; - const u32 max_matches_to_return = mf->max_matches_to_return; - - /* r = Rank of the suffix at the current position. */ - const lz_sarray_pos_t r = ISA[i]; - - /* Prepare for searching the current position. */ - lz_sarray_update_salink(r, link); - -#if 1 - /* Prefetch next position in SA and link. - * - * This can improve performance on large windows since the locations in - * SA and link at which each successive search begins are in general - * randomly distributed. */ - if (likely(i + 1 < mf->window_size)) { - const lz_sarray_pos_t next_r = ISA[i + 1]; - prefetch(&SA[next_r]); - prefetch(&link[next_r]); - } -#endif - - /* L = rank of next suffix to the left; - * R = rank of next suffix to the right; - * lenL = length of match between current position and the suffix with rank L; - * lenR = length of match between current position and the suffix with rank R. - * - * This is left and right relative to the rank of the current suffix. - * Since the suffixes in the suffix array are sorted, the longest - * matches are immediately to the left and right (using the linked list - * to ignore all suffixes that occur later in the window). The match - * length decreases the farther left and right we go. We shall keep the - * length on both sides in sync in order to choose the lowest-cost match - * of each length. - */ - lz_sarray_pos_t L = r - link[r].dist_to_prev; - lz_sarray_pos_t R = r + link[r].dist_to_next; - lz_sarray_pos_t lenL = link[r].lcpprev; - lz_sarray_pos_t lenR = link[r].lcpnext; - - /* nmatches = number of matches found so far. */ - u32 nmatches = 0; - - /* best_cost = cost of lowest-cost match found so far. - * - * Shorter matches that do not have a lower cost than this are - * discarded, since presumably it would be cheaper to output the bytes - * from the longer match instead. */ - lz_sarray_cost_t best_cost = LZ_SARRAY_INFINITE_COST; - - /* count_remaining = maximum number of possible matches remaining to be - * considered. */ - u32 count_remaining = max_matches_to_consider; - - /* pending_offset = offset of lowest-cost match found for the current - * length, or 0 if none found yet. */ - lz_sarray_pos_t pending_offset = 0; - - /* Note: some 'goto' statements are used in the remainder of this - * function to remove unnecessary checks and create branches that the - * CPU may predict better. 
(This function is performance critical.) */ - - if (lenL != 0 && lenL >= lenR) - goto extend_left; - else if (lenR != 0) - goto extend_right; - else - return 0; - -extend_left: - /* Search suffixes on the left until the match length has decreased - * below the next match length on the right or to below the minimum - * match length. */ - for (;;) { - lz_sarray_pos_t offset; - lz_sarray_cost_t cost; - lz_sarray_pos_t old_L; - lz_sarray_pos_t old_lenL; - - /* Check for hard cutoff on amount of work done. */ - if (count_remaining-- == 0) { - if (pending_offset != 0) { - /* Save pending match. */ - matches[nmatches++] = (struct raw_match){ - .len = lenL, - .offset = pending_offset, - }; - } - return nmatches; - } - - if (SA[L] < i) { - /* Suffix is in LZ77 dictionary. (Check was needed - * because the salink array caps distances to save - * memory.) */ - - offset = i - SA[L]; - - /* Save match offset if it results in lower cost. */ - cost = (*eval_match_cost)(lenL, offset, - eval_match_cost_ctx); - if (cost < best_cost) { - best_cost = cost; - pending_offset = offset; - } - } - - /* Advance left to previous suffix. */ - - old_L = L; - old_lenL = lenL; - - L -= link[L].dist_to_prev; - - if (link[old_L].lcpprev < old_lenL) { - /* Match length decreased. */ - - lenL = link[old_L].lcpprev; - - if (old_lenL > lenR) { - /* Neither the right side nor the left size has - * any more matches of length @old_lenL. If a - * pending match exists, save it. */ - if (pending_offset != 0) { - matches[nmatches++] = (struct raw_match){ - .len = old_lenL, - .offset = pending_offset, - }; - if (nmatches == max_matches_to_return) - return nmatches; - - pending_offset = 0; - } - - if (lenL >= lenR) { - /* New match length on left is still at - * least as large as the next match - * length on the right: Keep extending - * left, unless the minimum match length - * would be underrun. */ - if (lenL == 0) - return nmatches; - goto extend_left; - } - } - - /* Here we have lenL < lenR. Extend right. - * (No check for whether the minimum match length has - * been underrun is needed, provided that such lengths - * are marked as 0.) */ - goto extend_right; - } - } - -extend_right: - /* Search suffixes on the right until the match length has decreased to - * the next match length on the left or to below the minimum match - * length. */ - for (;;) { - lz_sarray_pos_t offset; - lz_sarray_cost_t cost; - lz_sarray_pos_t old_R; - lz_sarray_pos_t old_lenR; - - /* Check for hard cutoff on amount of work done. */ - if (count_remaining-- == 0) { - if (pending_offset != 0) { - /* Save pending match. */ - matches[nmatches++] = (struct raw_match){ - .len = lenR, - .offset = pending_offset, - }; - } - return nmatches; - } - - if (SA[R] < i) { - /* Suffix is in LZ77 dictionary. (Check was needed - * because the salink array caps distances to save - * memory.) */ - - offset = i - SA[R]; - - /* Save match offset if it results in lower cost. */ - cost = (*eval_match_cost)(lenR, - offset, - eval_match_cost_ctx); - if (cost < best_cost) { - best_cost = cost; - pending_offset = offset; - } - } - - /* Advance right to next suffix. */ - - old_R = R; - old_lenR = lenR; - - R += link[R].dist_to_next; - - if (link[old_R].lcpnext < lenR) { - /* Match length decreased. */ - - lenR = link[old_R].lcpnext; - - /* Neither the right side nor the left size has any more - * matches of length @old_lenR. If a pending match - * exists, save it. 
*/ - if (pending_offset != 0) { - matches[nmatches++] = (struct raw_match){ - .len = old_lenR, - .offset = pending_offset, - }; - if (nmatches == max_matches_to_return) - return nmatches; - - pending_offset = 0; - } - - if (lenL >= lenR) { - /* lenL >= lenR: Extend left, unless the - * minimum match length would be underrun, in - * which case we are done. */ - if (lenL == 0) - return nmatches; - - goto extend_left; - } - /* lenR > lenL: Keep extending right. - * (No check for whether the minimum match length has - * been underrun is needed, provided that such lengths - * are marked as 0.) */ - } - } -} - -#endif /* _WIMLIB_LZ_SARRAY_H */ diff --git a/include/wimlib/lzx.h b/include/wimlib/lzx.h index 7104dc14..50d80f62 100644 --- a/include/wimlib/lzx.h +++ b/include/wimlib/lzx.h @@ -122,7 +122,7 @@ extern const u32 lzx_position_base[LZX_MAX_POSITION_SLOTS]; * the formatted offset without actually looking at the array. */ static inline unsigned -lzx_get_position_slot_raw(unsigned formatted_offset) +lzx_get_position_slot_raw(u32 formatted_offset) { if (formatted_offset >= 196608) { return (formatted_offset >> 17) + 34; diff --git a/programs/imagex.c b/programs/imagex.c index 233e1f82..1f8018a3 100644 --- a/programs/imagex.c +++ b/programs/imagex.c @@ -505,7 +505,6 @@ set_compress_slow(void) .nice_match_length = 96, .num_optim_passes = 4, .max_search_depth = 100, - .max_matches_per_pos = 10, .main_nostat_cost = 15, .len_nostat_cost = 15, .aligned_nostat_cost = 7, @@ -521,7 +520,6 @@ set_compress_slow(void) .max_match_length = UINT32_MAX, .nice_match_length = 96, .max_search_depth = 100, - .max_matches_per_pos = 10, .optim_array_length = 1024, }; diff --git a/src/divsufsort.c b/src/divsufsort.c deleted file mode 100644 index 6353a958..00000000 --- a/src/divsufsort.c +++ /dev/null @@ -1,1631 +0,0 @@ -/* - * divsufsort.c for libdivsufsort-lite - * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#define assert(x) - -#include "wimlib/divsufsort.h" -#include - -/*- Constants -*/ -#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) -# undef ALPHABET_SIZE -#endif -#if !defined(ALPHABET_SIZE) -# define ALPHABET_SIZE (256) -#endif -#define BUCKET_A_SIZE (ALPHABET_SIZE) -#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) -#if defined(SS_INSERTIONSORT_THRESHOLD) -# if SS_INSERTIONSORT_THRESHOLD < 1 -# undef SS_INSERTIONSORT_THRESHOLD -# define SS_INSERTIONSORT_THRESHOLD (1) -# endif -#else -# define SS_INSERTIONSORT_THRESHOLD (8) -#endif -#if defined(SS_BLOCKSIZE) -# if SS_BLOCKSIZE < 0 -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (0) -# elif 32768 <= SS_BLOCKSIZE -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (32767) -# endif -#else -# define SS_BLOCKSIZE (1024) -#endif -/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ -#if SS_BLOCKSIZE == 0 -# define SS_MISORT_STACKSIZE (96) -#elif SS_BLOCKSIZE <= 4096 -# define SS_MISORT_STACKSIZE (16) -#else -# define SS_MISORT_STACKSIZE (24) -#endif -#define SS_SMERGE_STACKSIZE (32) -#define TR_INSERTIONSORT_THRESHOLD (8) -#define TR_STACKSIZE (64) - - -/*- Macros -*/ -#ifndef SWAP -# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) -#endif /* SWAP */ -#ifndef MIN -# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) -#endif /* MIN */ -#ifndef MAX -# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b)) -#endif /* MAX */ -#define STACK_PUSH(_a, _b, _c, _d)\ - do {\ - assert(ssize < STACK_SIZE);\ - stack[ssize].a = (_a), stack[ssize].b = (_b),\ - stack[ssize].c = (_c), stack[ssize++].d = (_d);\ - } while(0) -#define STACK_PUSH5(_a, _b, _c, _d, _e)\ - do {\ - assert(ssize < STACK_SIZE);\ - stack[ssize].a = (_a), stack[ssize].b = (_b),\ - stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ - } while(0) -#define STACK_POP(_a, _b, _c, _d)\ - do {\ - assert(0 <= ssize);\ - if(ssize == 0) { return; }\ - (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ - (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ - } while(0) -#define STACK_POP5(_a, _b, _c, _d, _e)\ - do {\ - assert(0 <= ssize);\ - if(ssize == 0) { return; }\ - (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ - (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ - } while(0) -#define BUCKET_A(_c0) bucket_A[(_c0)] -#if ALPHABET_SIZE == 256 -#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) -#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) -#else -#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) -#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) -#endif - - -/*- Private Functions -*/ - -static const int lg_table[256]= { - -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, - 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 -}; - -#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) - -static inline -int -ss_ilg(int n) { -#if SS_BLOCKSIZE == 0 - return (n & 0xffff0000) ? - ((n & 0xff000000) ? - 24 + lg_table[(n >> 24) & 0xff] : - 16 + lg_table[(n >> 16) & 0xff]) : - ((n & 0x0000ff00) ? 
- 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]); -#elif SS_BLOCKSIZE < 256 - return lg_table[n]; -#else - return (n & 0xff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]; -#endif -} - -#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ - -#if SS_BLOCKSIZE != 0 - -static const int sqq_table[256] = { - 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, - 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, - 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, -110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, -128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, -143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, -156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, -169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, -181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, -192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, -202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, -212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, -221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, -230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, -239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, -247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 -}; - -static inline -int -ss_isqrt(int x) { - int y, e; - - if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } - e = (x & 0xffff0000) ? - ((x & 0xff000000) ? - 24 + lg_table[(x >> 24) & 0xff] : - 16 + lg_table[(x >> 16) & 0xff]) : - ((x & 0x0000ff00) ? - 8 + lg_table[(x >> 8) & 0xff] : - 0 + lg_table[(x >> 0) & 0xff]); - - if(e >= 16) { - y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); - if(e >= 24) { y = (y + 1 + x / y) >> 1; } - y = (y + 1 + x / y) >> 1; - } else if(e >= 8) { - y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; - } else { - return sqq_table[x] >> 4; - } - - return (x < (y * y)) ? y - 1 : y; -} - -#endif /* SS_BLOCKSIZE != 0 */ - - -/*---------------------------------------------------------------------------*/ - -/* Compares two suffixes. */ -static inline -int -ss_compare(const unsigned char *T, - const int *p1, const int *p2, - int depth) { - const unsigned char *U1, *U2, *U1n, *U2n; - - for(U1 = T + depth + *p1, - U2 = T + depth + *p2, - U1n = T + *(p1 + 1) + 2, - U2n = T + *(p2 + 1) + 2; - (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); - ++U1, ++U2) { - } - - return U1 < U1n ? - (U2 < U2n ? *U1 - *U2 : 1) : - (U2 < U2n ? 
-1 : 0); -} - - -/*---------------------------------------------------------------------------*/ - -#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) - -/* Insertionsort for small size groups */ -static -void -ss_insertionsort(const unsigned char *T, const int *PA, - int *first, int *last, int depth) { - int *i, *j; - int t; - int r; - - for(i = last - 2; first <= i; --i) { - for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { - do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); - if(last <= j) { break; } - } - if(r == 0) { *j = ~*j; } - *(j - 1) = t; - } -} - -#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */ - - -/*---------------------------------------------------------------------------*/ - -#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) - -static inline -void -ss_fixdown(const unsigned char *Td, const int *PA, - int *SA, int i, int size) { - int j, k; - int v; - int c, d, e; - - for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { - d = Td[PA[SA[k = j++]]]; - if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } - if(d <= c) { break; } - } - SA[i] = v; -} - -/* Simple top-down heapsort. */ -static -void -ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) { - int i, m; - int t; - - m = size; - if((size % 2) == 0) { - m--; - if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } - } - - for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); } - if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); } - for(i = m - 1; 0 < i; --i) { - t = SA[0], SA[0] = SA[i]; - ss_fixdown(Td, PA, SA, 0, i); - SA[i] = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Returns the median of three elements. */ -static inline -int * -ss_median3(const unsigned char *Td, const int *PA, - int *v1, int *v2, int *v3) { - int *t; - if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } - if(Td[PA[*v2]] > Td[PA[*v3]]) { - if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } - else { return v3; } - } - return v2; -} - -/* Returns the median of five elements. */ -static inline -int * -ss_median5(const unsigned char *Td, const int *PA, - int *v1, int *v2, int *v3, int *v4, int *v5) { - int *t; - if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } - if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } - if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } - if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } - if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } - if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } - return v3; -} - -/* Returns the pivot element. */ -static inline -int * -ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) { - int *middle; - int t; - - t = last - first; - middle = first + t / 2; - - if(t <= 512) { - if(t <= 32) { - return ss_median3(Td, PA, first, middle, last - 1); - } else { - t >>= 2; - return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); - } - } - t >>= 3; - first = ss_median3(Td, PA, first, first + t, first + (t << 1)); - middle = ss_median3(Td, PA, middle - t, middle, middle + t); - last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); - return ss_median3(Td, PA, first, middle, last); -} - - -/*---------------------------------------------------------------------------*/ - -/* Binary partition for substrings. 
*/ -static inline -int * -ss_partition(const int *PA, - int *first, int *last, int depth) { - int *a, *b; - int t; - for(a = first - 1, b = last;;) { - for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } - for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } - if(b <= a) { break; } - t = ~*b; - *b = *a; - *a = t; - } - if(first < a) { *first = ~*first; } - return a; -} - -/* Multikey introsort for medium size groups. */ -static -void -ss_mintrosort(const unsigned char *T, const int *PA, - int *first, int *last, - int depth) { -#define STACK_SIZE SS_MISORT_STACKSIZE - struct { int *a, *b, c; int d; } stack[STACK_SIZE]; - const unsigned char *Td; - int *a, *b, *c, *d, *e, *f; - int s, t; - int ssize; - int limit; - int v, x = 0; - - for(ssize = 0, limit = ss_ilg(last - first);;) { - - if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { -#if 1 < SS_INSERTIONSORT_THRESHOLD - if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); } -#endif - STACK_POP(first, last, depth, limit); - continue; - } - - Td = T + depth; - if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); } - if(limit < 0) { - for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { - if((x = Td[PA[*a]]) != v) { - if(1 < (a - first)) { break; } - v = x; - first = a; - } - } - if(Td[PA[*first] - 1] < v) { - first = ss_partition(PA, first, a, depth); - } - if((a - first) <= (last - a)) { - if(1 < (a - first)) { - STACK_PUSH(a, last, depth, -1); - last = a, depth += 1, limit = ss_ilg(a - first); - } else { - first = a, limit = -1; - } - } else { - if(1 < (last - a)) { - STACK_PUSH(first, a, depth + 1, ss_ilg(a - first)); - first = a, limit = -1; - } else { - last = a, depth += 1, limit = ss_ilg(a - first); - } - } - continue; - } - - /* choose pivot */ - a = ss_pivot(Td, PA, first, last); - v = Td[PA[*a]]; - SWAP(*first, *a); - - /* partition */ - for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } - if(((a = b) < last) && (x < v)) { - for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - } - for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } - if((b < (d = c)) && (x > v)) { - for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - for(; b < c;) { - SWAP(*b, *c); - for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - - if(a <= d) { - c = b - 1; - - if((s = a - first) > (t = b - a)) { s = t; } - for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - if((s = d - c) > (t = last - d - 1)) { s = t; } - for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - - a = first + (b - a), c = last - (d - c); - b = (v <= Td[PA[*a] - 1]) ? 
a : ss_partition(PA, a, c, depth); - - if((a - first) <= (last - c)) { - if((last - c) <= (c - b)) { - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - STACK_PUSH(c, last, depth, limit); - last = a; - } else if((a - first) <= (c - b)) { - STACK_PUSH(c, last, depth, limit); - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - last = a; - } else { - STACK_PUSH(c, last, depth, limit); - STACK_PUSH(first, a, depth, limit); - first = b, last = c, depth += 1, limit = ss_ilg(c - b); - } - } else { - if((a - first) <= (c - b)) { - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - STACK_PUSH(first, a, depth, limit); - first = c; - } else if((last - c) <= (c - b)) { - STACK_PUSH(first, a, depth, limit); - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - first = c; - } else { - STACK_PUSH(first, a, depth, limit); - STACK_PUSH(c, last, depth, limit); - first = b, last = c, depth += 1, limit = ss_ilg(c - b); - } - } - } else { - limit += 1; - if(Td[PA[*first] - 1] < v) { - first = ss_partition(PA, first, last, depth); - limit = ss_ilg(last - first); - } - depth += 1; - } - } -#undef STACK_SIZE -} - -#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ - - -/*---------------------------------------------------------------------------*/ - -#if SS_BLOCKSIZE != 0 - -static inline -void -ss_blockswap(int *a, int *b, int n) { - int t; - for(; 0 < n; --n, ++a, ++b) { - t = *a, *a = *b, *b = t; - } -} - -static inline -void -ss_rotate(int *first, int *middle, int *last) { - int *a, *b, t; - int l, r; - l = middle - first, r = last - middle; - for(; (0 < l) && (0 < r);) { - if(l == r) { ss_blockswap(first, middle, l); break; } - if(l < r) { - a = last - 1, b = middle - 1; - t = *a; - do { - *a-- = *b, *b-- = *a; - if(b < first) { - *a = t; - last = a; - if((r -= l + 1) <= l) { break; } - a -= 1, b = middle - 1; - t = *a; - } - } while(1); - } else { - a = first, b = middle; - t = *a; - do { - *a++ = *b, *b++ = *a; - if(last <= b) { - *a = t; - first = a + 1; - if((l -= r + 1) <= r) { break; } - a += 1, b = middle; - t = *a; - } - } while(1); - } - } -} - - -/*---------------------------------------------------------------------------*/ - -static -void -ss_inplacemerge(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int depth) { - const int *p; - int *a, *b; - int len, half; - int q, r; - int x; - - for(;;) { - if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); } - else { x = 0; p = PA + *(last - 1); } - for(a = first, len = middle - first, half = len >> 1, r = -1; - 0 < len; - len = half, half >>= 1) { - b = a + half; - q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth); - if(q < 0) { - a = b + 1; - half -= (len & 1) ^ 1; - } else { - r = q; - } - } - if(a < middle) { - if(r == 0) { *a = ~*a; } - ss_rotate(a, middle, last); - last -= middle - a; - middle = a; - if(first == middle) { break; } - } - --last; - if(x != 0) { while(*--last < 0) { } } - if(middle == last) { break; } - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Merge-forward with internal buffer. 
*/ -static -void -ss_mergeforward(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int depth) { - int *a, *b, *c, *bufend; - int t; - int r; - - bufend = buf + (middle - first) - 1; - ss_blockswap(buf, first, middle - first); - - for(t = *(a = first), b = buf, c = middle;;) { - r = ss_compare(T, PA + *b, PA + *c, depth); - if(r < 0) { - do { - *a++ = *b; - if(bufend <= b) { *bufend = t; return; } - *b++ = *a; - } while(*b < 0); - } else if(r > 0) { - do { - *a++ = *c, *c++ = *a; - if(last <= c) { - while(b < bufend) { *a++ = *b, *b++ = *a; } - *a = *b, *b = t; - return; - } - } while(*c < 0); - } else { - *c = ~*c; - do { - *a++ = *b; - if(bufend <= b) { *bufend = t; return; } - *b++ = *a; - } while(*b < 0); - - do { - *a++ = *c, *c++ = *a; - if(last <= c) { - while(b < bufend) { *a++ = *b, *b++ = *a; } - *a = *b, *b = t; - return; - } - } while(*c < 0); - } - } -} - -/* Merge-backward with internal buffer. */ -static -void -ss_mergebackward(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int depth) { - const int *p1, *p2; - int *a, *b, *c, *bufend; - int t; - int r; - int x; - - bufend = buf + (last - middle) - 1; - ss_blockswap(buf, middle, last - middle); - - x = 0; - if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; } - else { p1 = PA + *bufend; } - if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; } - else { p2 = PA + *(middle - 1); } - for(t = *(a = last - 1), b = bufend, c = middle - 1;;) { - r = ss_compare(T, p1, p2, depth); - if(0 < r) { - if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } - *a-- = *b; - if(b <= buf) { *buf = t; break; } - *b-- = *a; - if(*b < 0) { p1 = PA + ~*b; x |= 1; } - else { p1 = PA + *b; } - } else if(r < 0) { - if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } - *a-- = *c, *c-- = *a; - if(c < first) { - while(buf < b) { *a-- = *b, *b-- = *a; } - *a = *b, *b = t; - break; - } - if(*c < 0) { p2 = PA + ~*c; x |= 2; } - else { p2 = PA + *c; } - } else { - if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } - *a-- = ~*b; - if(b <= buf) { *buf = t; break; } - *b-- = *a; - if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } - *a-- = *c, *c-- = *a; - if(c < first) { - while(buf < b) { *a-- = *b, *b-- = *a; } - *a = *b, *b = t; - break; - } - if(*b < 0) { p1 = PA + ~*b; x |= 1; } - else { p1 = PA + *b; } - if(*c < 0) { p2 = PA + ~*c; x |= 2; } - else { p2 = PA + *c; } - } - } -} - -/* D&C based merge. */ -static -void -ss_swapmerge(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int bufsize, int depth) { -#define STACK_SIZE SS_SMERGE_STACKSIZE -#define GETIDX(a) ((0 <= (a)) ? 
(a) : (~(a))) -#define MERGE_CHECK(a, b, c)\ - do {\ - if(((c) & 1) ||\ - (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\ - *(a) = ~*(a);\ - }\ - if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\ - *(b) = ~*(b);\ - }\ - } while(0) - struct { int *a, *b, *c; int d; } stack[STACK_SIZE]; - int *l, *r, *lm, *rm; - int m, len, half; - int ssize; - int check, next; - - for(check = 0, ssize = 0;;) { - if((last - middle) <= bufsize) { - if((first < middle) && (middle < last)) { - ss_mergebackward(T, PA, first, middle, last, buf, depth); - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - continue; - } - - if((middle - first) <= bufsize) { - if(first < middle) { - ss_mergeforward(T, PA, first, middle, last, buf, depth); - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - continue; - } - - for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; - 0 < len; - len = half, half >>= 1) { - if(ss_compare(T, PA + GETIDX(*(middle + m + half)), - PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { - m += half + 1; - half -= (len & 1) ^ 1; - } - } - - if(0 < m) { - lm = middle - m, rm = middle + m; - ss_blockswap(lm, middle, m); - l = r = middle, next = 0; - if(rm < last) { - if(*rm < 0) { - *rm = ~*rm; - if(first < lm) { for(; *--l < 0;) { } next |= 4; } - next |= 1; - } else if(first < lm) { - for(; *r < 0; ++r) { } - next |= 2; - } - } - - if((l - first) <= (last - r)) { - STACK_PUSH(r, rm, last, (next & 3) | (check & 4)); - middle = lm, last = l, check = (check & 3) | (next & 4); - } else { - if((next & 2) && (r == middle)) { next ^= 6; } - STACK_PUSH(first, lm, l, (check & 3) | (next & 4)); - first = r, middle = rm, check = (next & 3) | (check & 4); - } - } else { - if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) { - *middle = ~*middle; - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - } - } -#undef STACK_SIZE -} - -#endif /* SS_BLOCKSIZE != 0 */ - - -/*---------------------------------------------------------------------------*/ - -/* Substring sort */ -static -void -sssort(const unsigned char *T, const int *PA, - int *first, int *last, - int *buf, int bufsize, - int depth, int n, int lastsuffix) { - int *a; -#if SS_BLOCKSIZE != 0 - int *b, *middle, *curbuf; - int j, k, curbufsize, limit; -#endif - int i; - - if(lastsuffix != 0) { ++first; } - -#if SS_BLOCKSIZE == 0 - ss_mintrosort(T, PA, first, last, depth); -#else - if((bufsize < SS_BLOCKSIZE) && - (bufsize < (last - first)) && - (bufsize < (limit = ss_isqrt(last - first)))) { - if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } - buf = middle = last - limit, bufsize = limit; - } else { - middle = last, limit = 0; - } - for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth); -#endif - curbufsize = last - (a + SS_BLOCKSIZE); - curbuf = a + SS_BLOCKSIZE; - if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } - for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) { - ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); - } - } -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, a, middle, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, a, middle, depth); -#endif - for(k = SS_BLOCKSIZE; i != 0; 
k <<= 1, i >>= 1) { - if(i & 1) { - ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); - a -= k; - } - } - if(limit != 0) { -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, middle, last, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, middle, last, depth); -#endif - ss_inplacemerge(T, PA, first, middle, last, depth); - } -#endif - - if(lastsuffix != 0) { - /* Insert last type B* suffix. */ - int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; - for(a = first, i = *(first - 1); - (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); - ++a) { - *(a - 1) = *a; - } - *(a - 1) = i; - } -} - - -/*---------------------------------------------------------------------------*/ - -static inline -int -tr_ilg(int n) { - return (n & 0xffff0000) ? - ((n & 0xff000000) ? - 24 + lg_table[(n >> 24) & 0xff] : - 16 + lg_table[(n >> 16) & 0xff]) : - ((n & 0x0000ff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]); -} - - -/*---------------------------------------------------------------------------*/ - -/* Simple insertionsort for small size groups. */ -static -void -tr_insertionsort(const int *ISAd, int *first, int *last) { - int *a, *b; - int t, r; - - for(a = first + 1; a < last; ++a) { - for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { - do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); - if(b < first) { break; } - } - if(r == 0) { *b = ~*b; } - *(b + 1) = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -static inline -void -tr_fixdown(const int *ISAd, int *SA, int i, int size) { - int j, k; - int v; - int c, d, e; - - for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { - d = ISAd[SA[k = j++]]; - if(d < (e = ISAd[SA[j]])) { k = j; d = e; } - if(d <= c) { break; } - } - SA[i] = v; -} - -/* Simple top-down heapsort. */ -static -void -tr_heapsort(const int *ISAd, int *SA, int size) { - int i, m; - int t; - - m = size; - if((size % 2) == 0) { - m--; - if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); } - } - - for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } - if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } - for(i = m - 1; 0 < i; --i) { - t = SA[0], SA[0] = SA[i]; - tr_fixdown(ISAd, SA, 0, i); - SA[i] = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Returns the median of three elements. */ -static inline -int * -tr_median3(const int *ISAd, int *v1, int *v2, int *v3) { - int *t; - if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } - if(ISAd[*v2] > ISAd[*v3]) { - if(ISAd[*v1] > ISAd[*v3]) { return v1; } - else { return v3; } - } - return v2; -} - -/* Returns the median of five elements. */ -static inline -int * -tr_median5(const int *ISAd, - int *v1, int *v2, int *v3, int *v4, int *v5) { - int *t; - if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } - if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } - if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } - if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } - if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } - if(ISAd[*v3] > ISAd[*v4]) { return v4; } - return v3; -} - -/* Returns the pivot element. 
*/ -static inline -int * -tr_pivot(const int *ISAd, int *first, int *last) { - int *middle; - int t; - - t = last - first; - middle = first + t / 2; - - if(t <= 512) { - if(t <= 32) { - return tr_median3(ISAd, first, middle, last - 1); - } else { - t >>= 2; - return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); - } - } - t >>= 3; - first = tr_median3(ISAd, first, first + t, first + (t << 1)); - middle = tr_median3(ISAd, middle - t, middle, middle + t); - last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); - return tr_median3(ISAd, first, middle, last); -} - - -/*---------------------------------------------------------------------------*/ - -typedef struct _trbudget_t trbudget_t; -struct _trbudget_t { - int chance; - int remain; - int incval; - int count; -}; - -static inline -void -trbudget_init(trbudget_t *budget, int chance, int incval) { - budget->chance = chance; - budget->remain = budget->incval = incval; -} - -static inline -int -trbudget_check(trbudget_t *budget, int size) { - if(size <= budget->remain) { budget->remain -= size; return 1; } - if(budget->chance == 0) { budget->count += size; return 0; } - budget->remain += budget->incval - size; - budget->chance -= 1; - return 1; -} - - -/*---------------------------------------------------------------------------*/ - -static inline -void -tr_partition(const int *ISAd, - int *first, int *middle, int *last, - int **pa, int **pb, int v) { - int *a, *b, *c, *d, *e, *f; - int t, s; - int x = 0; - - for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { } - if(((a = b) < last) && (x < v)) { - for(; (++b < last) && ((x = ISAd[*b]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - } - for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } - if((b < (d = c)) && (x > v)) { - for(; (b < --c) && ((x = ISAd[*c]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - for(; b < c;) { - SWAP(*b, *c); - for(; (++b < c) && ((x = ISAd[*b]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - for(; (b < --c) && ((x = ISAd[*c]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - - if(a <= d) { - c = b - 1; - if((s = a - first) > (t = b - a)) { s = t; } - for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - if((s = d - c) > (t = last - d - 1)) { s = t; } - for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - first += (b - a), last -= (d - c); - } - *pa = first, *pb = last; -} - -static -void -tr_copy(int *ISA, const int *SA, - int *first, int *a, int *b, int *last, - int depth) { - /* sort suffixes of middle partition - by using sorted order of suffixes of left and right partition. 
*/ - int *c, *d, *e; - int s, v; - - v = b - SA - 1; - for(c = first, d = a - 1; c <= d; ++c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *++d = s; - ISA[s] = d - SA; - } - } - for(c = last - 1, e = d + 1, d = b; e < d; --c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *--d = s; - ISA[s] = d - SA; - } - } -} - -static -void -tr_partialcopy(int *ISA, const int *SA, - int *first, int *a, int *b, int *last, - int depth) { - int *c, *d, *e; - int s, v; - int rank, lastrank, newrank = -1; - - v = b - SA - 1; - lastrank = -1; - for(c = first, d = a - 1; c <= d; ++c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *++d = s; - rank = ISA[s + depth]; - if(lastrank != rank) { lastrank = rank; newrank = d - SA; } - ISA[s] = newrank; - } - } - - lastrank = -1; - for(e = d; first <= e; --e) { - rank = ISA[*e]; - if(lastrank != rank) { lastrank = rank; newrank = e - SA; } - if(newrank != rank) { ISA[*e] = newrank; } - } - - lastrank = -1; - for(c = last - 1, e = d + 1, d = b; e < d; --c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *--d = s; - rank = ISA[s + depth]; - if(lastrank != rank) { lastrank = rank; newrank = d - SA; } - ISA[s] = newrank; - } - } -} - -static -void -tr_introsort(int *ISA, const int *ISAd, - int *SA, int *first, int *last, - trbudget_t *budget) { -#define STACK_SIZE TR_STACKSIZE - struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE]; - int *a, *b, *c; - int t; - int v, x = 0; - int incr = ISAd - ISA; - int limit, next; - int ssize, trlink = -1; - - for(ssize = 0, limit = tr_ilg(last - first);;) { - - if(limit < 0) { - if(limit == -1) { - /* tandem repeat partition */ - tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); - - /* update ranks */ - if(a < last) { - for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } - } - if(b < last) { - for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } - } - - /* push */ - if(1 < (b - a)) { - STACK_PUSH5(NULL, a, b, 0, 0); - STACK_PUSH5(ISAd - incr, first, last, -2, trlink); - trlink = ssize - 2; - } - if((a - first) <= (last - b)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink); - last = a, limit = tr_ilg(a - first); - } else if(1 < (last - b)) { - first = b, limit = tr_ilg(last - b); - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } else { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink); - first = b, limit = tr_ilg(last - b); - } else if(1 < (a - first)) { - last = a, limit = tr_ilg(a - first); - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } else if(limit == -2) { - /* tandem repeat copy */ - a = stack[--ssize].b, b = stack[ssize].c; - if(stack[ssize].d == 0) { - tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA); - } - STACK_POP5(ISAd, first, last, limit, trlink); - } else { - /* sorted partition */ - if(0 <= *first) { - a = first; - do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); - first = a; - } - if(first < last) { - a = first; do { *a = ~*a; } while(*++a < 0); - next = (ISA[*a] != ISAd[*a]) ? 
tr_ilg(a - first + 1) : -1; - if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } - - /* push */ - if(trbudget_check(budget, a - first)) { - if((a - first) <= (last - a)) { - STACK_PUSH5(ISAd, a, last, -3, trlink); - ISAd += incr, last = a, limit = next; - } else { - if(1 < (last - a)) { - STACK_PUSH5(ISAd + incr, first, a, next, trlink); - first = a, limit = -3; - } else { - ISAd += incr, last = a, limit = next; - } - } - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - if(1 < (last - a)) { - first = a, limit = -3; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - continue; - } - - if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { - tr_insertionsort(ISAd, first, last); - limit = -3; - continue; - } - - if(limit-- == 0) { - tr_heapsort(ISAd, first, last - first); - for(a = last - 1; first < a; a = b) { - for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } - } - limit = -3; - continue; - } - - /* choose pivot */ - a = tr_pivot(ISAd, first, last); - SWAP(*first, *a); - v = ISAd[*first]; - - /* partition */ - tr_partition(ISAd, first, first + 1, last, &a, &b, v); - if((last - first) != (b - a)) { - next = (ISA[*a] != v) ? tr_ilg(b - a) : -1; - - /* update ranks */ - for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } - if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } - - /* push */ - if((1 < (b - a)) && (trbudget_check(budget, b - a))) { - if((a - first) <= (last - b)) { - if((last - b) <= (b - a)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - STACK_PUSH5(ISAd, b, last, limit, trlink); - last = a; - } else if(1 < (last - b)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - first = b; - } else { - ISAd += incr, first = a, last = b, limit = next; - } - } else if((a - first) <= (b - a)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, limit, trlink); - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - last = a; - } else { - STACK_PUSH5(ISAd, b, last, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - STACK_PUSH5(ISAd, b, last, limit, trlink); - STACK_PUSH5(ISAd, first, a, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - if((a - first) <= (b - a)) { - if(1 < (last - b)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - STACK_PUSH5(ISAd, first, a, limit, trlink); - first = b; - } else if(1 < (a - first)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - last = a; - } else { - ISAd += incr, first = a, last = b, limit = next; - } - } else if((last - b) <= (b - a)) { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, limit, trlink); - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - first = b; - } else { - STACK_PUSH5(ISAd, first, a, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - STACK_PUSH5(ISAd, first, a, limit, trlink); - STACK_PUSH5(ISAd, b, last, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } - } else { - if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } - if((a - first) <= (last - b)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, limit, trlink); - last = a; - } else if(1 < (last - b)) { - first = b; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } else { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, limit, trlink); - first = b; - } else if(1 < (a - first)) { - last = a; - } else { - STACK_POP5(ISAd, 
first, last, limit, trlink); - } - } - } - } else { - if(trbudget_check(budget, last - first)) { - limit = tr_ilg(last - first), ISAd += incr; - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } -#undef STACK_SIZE -} - - - -/*---------------------------------------------------------------------------*/ - -/* Tandem repeat sort */ -static -void -trsort(int *ISA, int *SA, int n, int depth) { - int *ISAd; - int *first, *last; - trbudget_t budget; - int t, skip, unsorted; - - trbudget_init(&budget, tr_ilg(n) * 2 / 3, n); -/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */ - for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { - first = SA; - skip = 0; - unsorted = 0; - do { - if((t = *first) < 0) { first -= t; skip += t; } - else { - if(skip != 0) { *(first + skip) = skip; skip = 0; } - last = SA + ISA[t] + 1; - if(1 < (last - first)) { - budget.count = 0; - tr_introsort(ISA, ISAd, SA, first, last, &budget); - if(budget.count != 0) { unsorted += budget.count; } - else { skip = first - last; } - } else if((last - first) == 1) { - skip = -1; - } - first = last; - } - } while(first < (SA + n)); - if(skip != 0) { *(first + skip) = skip; } - if(unsorted == 0) { break; } - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Sorts suffixes of type B*. */ -static -int -sort_typeBstar(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n) { - int *PAb, *ISAb, *buf; - int i, j, k, t, m, bufsize; - int c0, c1; - - /* Initialize bucket arrays. */ - for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } - for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } - - /* Count the number of occurrences of the first one or two characters of each - type A, B and B* suffix. Moreover, store the beginning position of all - type B* suffixes into the array SA. */ - for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { - /* type A suffix. */ - do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); - if(0 <= i) { - /* type B* suffix. */ - ++BUCKET_BSTAR(c0, c1); - SA[--m] = i; - /* type B suffix. */ - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { - ++BUCKET_B(c0, c1); - } - } - } - m = n - m; -/* -note: - A type B* suffix is lexicographically smaller than a type B suffix that - begins with the same first two characters. -*/ - - /* Calculate the index of start/end point of each bucket. */ - for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { - t = i + BUCKET_A(c0); - BUCKET_A(c0) = i + j; /* start point */ - i = t + BUCKET_B(c0, c0); - for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { - j += BUCKET_BSTAR(c0, c1); - BUCKET_BSTAR(c0, c1) = j; /* end point */ - i += BUCKET_B(c0, c1); - } - } - - if(0 < m) { - /* Sort the type B* suffixes by their first two characters. */ - PAb = SA + n - m; ISAb = SA + m; - for(i = m - 2; 0 <= i; --i) { - t = PAb[i], c0 = T[t], c1 = T[t + 1]; - SA[--BUCKET_BSTAR(c0, c1)] = i; - } - t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; - SA[--BUCKET_BSTAR(c0, c1)] = m - 1; - - /* Sort the type B* substrings using sssort. */ - buf = SA + m, bufsize = n - (2 * m); - for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { - for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { - i = BUCKET_BSTAR(c0, c1); - if(1 < (j - i)) { - sssort(T, PAb, SA + i, SA + j, - buf, bufsize, 2, n, *(SA + i) == (m - 1)); - } - } - } - - /* Compute ranks of type B* substrings. 
*/ - for(i = m - 1; 0 <= i; --i) { - if(0 <= SA[i]) { - j = i; - do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); - SA[i + 1] = i - j; - if(i <= 0) { break; } - } - j = i; - do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); - ISAb[SA[i]] = j; - } - - /* Construct the inverse suffix array of type B* suffixes using trsort. */ - trsort(ISAb, SA, m, 1); - - /* Set the sorted order of tyoe B* suffixes. */ - for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } - if(0 <= i) { - t = i; - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } - SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; - } - } - - /* Calculate the index of start/end point of each bucket. */ - BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ - for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { - i = BUCKET_A(c0 + 1) - 1; - for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { - t = i - BUCKET_B(c0, c1); - BUCKET_B(c0, c1) = i; /* end point */ - - /* Move all type B* suffixes to the correct position. */ - for(i = t, j = BUCKET_BSTAR(c0, c1); - j <= k; - --i, --k) { SA[i] = SA[k]; } - } - BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ - BUCKET_B(c0, c0) = i; /* end point */ - } - } - - return m; -} - -/* Constructs the suffix array by using the sorted order of type B* suffixes. */ -static -void -construct_SA(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n, int m) { - int *i, *j, *k; - int s; - int c0, c1, c2; - - if(0 < m) { - /* Construct the sorted order of type B suffixes by using - the sorted order of type B* suffixes. */ - for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { - /* Scan the suffix array from right to left. */ - for(i = SA + BUCKET_BSTAR(c1, c1 + 1), - j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; - i <= j; - --j) { - if(0 < (s = *j)) { - assert(T[s] == c1); - assert(((s + 1) < n) && (T[s] <= T[s + 1])); - assert(T[s - 1] <= T[s]); - *j = ~s; - c0 = T[--s]; - if((0 < s) && (T[s - 1] > c0)) { s = ~s; } - if(c0 != c2) { - if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } - k = SA + BUCKET_B(c2 = c0, c1); - } - assert(k < j); - *k-- = s; - } else { - assert(((s == 0) && (T[s] == c1)) || (s < 0)); - *j = ~s; - } - } - } - } - - /* Construct the suffix array by using - the sorted order of type B suffixes. */ - k = SA + BUCKET_A(c2 = T[n - 1]); - *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); - /* Scan the suffix array from left to right. */ - for(i = SA, j = SA + n; i < j; ++i) { - if(0 < (s = *i)) { - assert(T[s - 1] >= T[s]); - c0 = T[--s]; - if((s == 0) || (T[s - 1] < c0)) { s = ~s; } - if(c0 != c2) { - BUCKET_A(c2) = k - SA; - k = SA + BUCKET_A(c2 = c0); - } - assert(i < k); - *k++ = s; - } else { - assert(s < 0); - *i = ~s; - } - } -} - -/*---------------------------------------------------------------------------*/ - -/*- Function -*/ - -/* XXX Modified from original: use provided temporary space instead of - * allocating it. 
*/ -void -divsufsort(const unsigned char *T, int *SA, int n, - int *bucket_A, int *bucket_B) -{ - int m; - - switch (n) { - case 0: - break; - - case 1: - SA[0] = 0; - break; - - case 2: - m = (T[0] < T[1]); - SA[m ^ 1] = 0; - SA[m] = 1; - break; - - default: - m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); - construct_SA(T, SA, bucket_A, bucket_B, n, m); - break; - } -} diff --git a/src/lz_bt.c b/src/lz_bt.c new file mode 100644 index 00000000..938449be --- /dev/null +++ b/src/lz_bt.c @@ -0,0 +1,706 @@ +/* + * lz_bt.c + * + * Binary tree match-finder for Lempel-Ziv compression. + * + * Author: Eric Biggers + * Year: 2014 + * + * The author hereby releases this file into the public domain. + * You can do whatever you want with this file. + */ + +/* + * Note: the binary tree search/update algorithm is based on code from the + * public domain LZMA SDK (authors: Igor Pavlov, Lasse Collin). + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "wimlib/lz.h" +#include "wimlib/lz_bt.h" +#include "wimlib/util.h" +#include +#include + +#define LZ_BT_HASH_BITS 16 +#define LZ_BT_HASH_SIZE (1 << LZ_BT_HASH_BITS) +#define LZ_BT_HASH_MASK (LZ_BT_HASH_SIZE - 1) +#define LZ_BT_DIGRAM_TAB_SIZE (256 * 256) + +static u32 crc32_table[256]; +static pthread_once_t crc32_table_filled = PTHREAD_ONCE_INIT; + +static void +crc32_init(void) +{ + for (u32 b = 0; b < 256; b++) { + u32 r = b; + for (int i = 0; i < 8; i++) { + if (r & 1) + r = (r >> 1) ^ 0xEDB88320; + else + r >>= 1; + } + crc32_table[b] = r; + } +} + +/* + * Compute the hash code for the next 3-byte sequence in the window. + * + * @p + * A pointer to the next 3-byte sequence in the window. + * + * Returns the resulting hash code. + */ +static inline u32 +lz_bt_hash(const u8 *p) +{ + u32 hash = 0; + + hash ^= crc32_table[p[0]]; + hash ^= p[1]; + hash ^= (u32)p[2] << 8; + + return hash & LZ_BT_HASH_MASK; +} + +/* + * Compute the number of bytes of memory that would be needed to initialize a + * binary tree match-finder with the specified maximum window size. + * + * @max_window_size + * The maximum window size, in bytes, to query. + * + * Returns the number of bytes that would be allocated by lz_bt_init(), + * excluding the size of the 'struct lz_bt' itself. + */ +u64 +lz_bt_get_needed_memory(lz_bt_pos_t max_window_size) +{ + u64 len; + + len = LZ_BT_HASH_SIZE + LZ_BT_DIGRAM_TAB_SIZE; + len += 2 * (u64)max_window_size; + + return len * sizeof(lz_bt_pos_t); +} + +/* + * Initialize a binary tree match-finder. + * + * @mf + * The match-finder structure to initialize. + * @max_window_size + * The maximum window size that shall be supported by subsequent calls to + * lz_bt_load_window(). + * @min_match_len + * The minimum length of matches that shall be produced by subsequent calls + * to lz_bt_get_matches(). This must be at least 2. + * @max_match_len + * The maximum length of matches that shall be produced by subsequent calls + * to lz_bt_get_matches(). This must be at least @min_match_len. + * @num_fast_bytes + * The maximum length of matches that shall be produced just using the + * binary tree search algorithm. If the longest match has this length, + * then lz_bt_get_matches() will extend it up to @max_match_len. This must + * be at least @min_match_len and no more than @max_match_len. + * @max_search_depth + * The maximum depth to descend into the binary search tree before halting + * the search. + * + * Returns %true if successful; %false if out of memory. 
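+ *
+ * Typical usage, as a sketch only (the parameter values below are
+ * illustrative, and 'window' / 'window_size' are assumed to be supplied
+ * by the caller):
+ *
+ *	struct lz_bt mf;
+ *	struct raw_match matches[96 - 2 + 1];
+ *
+ *	if (!lz_bt_init(&mf, window_size, 2, 257, 96, 100))
+ *		return false;
+ *	lz_bt_load_window(&mf, window, window_size);
+ *	for (lz_bt_pos_t pos = 0; pos < window_size; pos++) {
+ *		lz_bt_len_t num = lz_bt_get_matches(&mf, matches);
+ *		(consider the 'num' matches found at position 'pos' here)
+ *	}
+ *	lz_bt_destroy(&mf);
+ *
+ * If the caller commits to a match of length 'len', the remaining
+ * 'len - 1' positions covered by that match may be skipped with
+ * lz_bt_skip_positions() instead of being searched individually.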
+ */ +bool +lz_bt_init(struct lz_bt *mf, + lz_bt_pos_t max_window_size, + lz_bt_len_t min_match_len, + lz_bt_len_t max_match_len, + lz_bt_len_t num_fast_bytes, + u32 max_search_depth) +{ + u64 len; + + /* Check and set parameters. */ + LZ_ASSERT(min_match_len >= 2); + LZ_ASSERT(max_match_len >= min_match_len); + LZ_ASSERT(num_fast_bytes >= min_match_len); + LZ_ASSERT(num_fast_bytes <= max_match_len); + + mf->max_window_size = max_window_size; + mf->min_match_len = min_match_len; + mf->max_match_len = max_match_len; + mf->num_fast_bytes = num_fast_bytes; + mf->max_search_depth = max_search_depth; + + /* Allocate space for 'hash_tab', 'digram_tab', and 'child_tab'. */ + len = LZ_BT_HASH_SIZE + (2 * (u64)max_window_size); + if (mf->min_match_len <= 2) + len += LZ_BT_DIGRAM_TAB_SIZE; + len *= sizeof(lz_bt_pos_t); + if ((size_t)len != len || !(mf->hash_tab = MALLOC(len))) + return false; + if (mf->min_match_len <= 2) { + mf->digram_tab = mf->hash_tab + LZ_BT_HASH_SIZE; + mf->child_tab = mf->digram_tab + LZ_BT_DIGRAM_TAB_SIZE; + } else { + mf->child_tab = mf->hash_tab + LZ_BT_HASH_SIZE; + } + + /* Fill in the CRC32 table if not done already. */ + pthread_once(&crc32_table_filled, crc32_init); + + return true; +} + +/* + * Destroy a binary tree match-finder. + * + * @mf + * The match-finder structure to destroy. + */ +void +lz_bt_destroy(struct lz_bt *mf) +{ + FREE(mf->hash_tab); + /* mf->hash_tab shares storage with mf->digram_tab and mf->child_tab. */ +} + +/* + * Load a window into a binary tree match-finder. + * + * @mf + * The match-finder structure into which to load the window. + * @window + * Pointer to the window to load. This memory must remain available, + * unmodified, while the match-finder is being used. + * @window_size + * The size of the window, in bytes. This can't be larger than the + * @max_window_size with which lz_bt_init() was called. + */ +void +lz_bt_load_window(struct lz_bt *mf, const u8 *window, lz_bt_pos_t window_size) +{ + LZ_ASSERT(window_size <= mf->max_window_size); + size_t clear_len; + + mf->cur_window = window; + mf->cur_window_pos = 0; + mf->cur_window_size = window_size; + + /* Clear the hash and digram tables. + * Note: The child table need not be cleared. */ + clear_len = LZ_BT_HASH_SIZE; + if (mf->min_match_len <= 2) + clear_len += LZ_BT_DIGRAM_TAB_SIZE; + memset(mf->hash_tab, 0, clear_len * sizeof(lz_bt_pos_t)); +} + +/* + * Search the binary tree of the current hash code for matches. At the same + * time, update this tree to add the current position in the window. + * + * @window + * The window being searched. + * @cur_window_pos + * The current position in the window. + * @min_len + * Ignore matches shorter than this length. This must be at least 1. + * @max_len + * Don't produce any matches longer than this length. If we find a match + * this long, terminate the search and return. + * @max_depth + * Stop if we reach this depth in the binary tree. + * @child_tab + * Table of child pointers for the binary tree. The children of the node + * for position 'i' in the window are child_tab[i * 2] and child_tab[i*2 + + * 1]. Zero is reserved for the 'null' value (no child). Consequently, we + * don't recognize matches beginning at position 0. In fact, the node for + * position 0 in the window will not be used at all, which is just as well + * because we use 0-based indices which don't work for position 0. + * @cur_match + * The position in the window at which the binary tree for the current hash + * code is rooted. 
This can be 0, which indicates that the binary tree for + * the current hash code is empty. + * @matches + * The array in which to produce the matches. The matches will be produced + * in order of increasing length and increasing offset. No more than one + * match shall have any given length, nor shall any match be shorter than + * @min_len, nor shall any match be longer than @max_len, nor shall any two + * matches have the same offset. + * + * Returns the number of matches found and written to @matches. + */ +static lz_bt_len_t +do_search(const u8 window[restrict], + const lz_bt_pos_t cur_window_pos, + const lz_bt_len_t min_len, + const lz_bt_len_t max_len, + const u32 max_depth, + lz_bt_pos_t child_tab[restrict], + lz_bt_pos_t cur_match, + struct raw_match matches[restrict]) +{ + /* + * Here's my explanation of how this code actually works. Beware: this + * algorithm is a *lot* trickier than searching for matches via hash + * chains. But it can be significantly better, especially when doing + * "optimal" parsing, which is why it gets used, e.g. in LZMA as well as + * here. + * + * --------------------------------------------------------------------- + * + * Data structure + * + * Basically, there is not just one binary tree, but rather one binary + * tree per hash code. For a given hash code, the binary tree indexes + * previous positions in the window that have that same hash code. The + * key for each node is the "string", or byte sequence, beginning at the + * corresponding position in the window. + * + * Each tree maintains the invariant that if node C is a child of node + * P, then the window position represented by node C is smaller than + * ("left of") the window position represented by node P. Equivalently, + * while descending into a tree, the match distances ("offsets") from + * the current position are non-decreasing --- actually strictly + * increasing, because each node represents a unique position. + * + * In addition, not all previous positions sharing the same hash code + * will necessarily be represented in each binary tree; see the + * "Updating" section. + * + * --------------------------------------------------------------------- + * + * Searching + * + * Suppose we want to search for LZ77-style matches with the string + * beginning at the current window position and extending for @max_len + * bytes. To do this, we can search for this string in the binary tree + * for this string's hash code. Each node visited during the search is + * a potential match. This method will find the matches efficiently + * because they will converge on the current string, due to the nature + * of the binary search. + * + * Naively, when visiting a node that represents a match of length N, we + * must compare N + 1 bytes in order to determine the length of that + * match and the lexicographic ordering of that match relative to the + * current string (which determines whether we need to step left or + * right into the next level of the tree, as per the standard binary + * tree search algorithm). However, as an optimization, we need not + * explicitly examine the full length of the match at each node. To see + * that this is true, suppose that we examine a node during the search, + * and we find that the corresponding match is less (alt. greater) than + * the current string. Then, because of how binary tree search + * operates, the match must be lexicographically greater (alt. lesser) + * than any ancestor node that corresponded to a match lexicographically + * lesser (alt. 
greater) than the current string. Therefore, the match + * must be at least as long as the match for any such ancestor node. + * Therefore, the lengths of lexicographically-lesser (alt. greater) + * matches must be non-decreasing as they are encountered by the tree + * search. + * + * Using this observation, we can maintain two variables, + * 'longest_lt_match_len' and 'longest_gt_match_len', that represent the + * length of the longest lexicographically lesser and greater, + * respectively, match that has been examined so far. Then, when + * examining a new match, we need only start comparing at the index + * min(longest_lt_match_len, longest_gt_match_len) byte. Note that we + * cannot know beforehand whether the match will be lexicographically + * lesser or greater, hence the need for taking the minimum of these two + * lengths. + * + * As noted earlier, as we descend into the tree, the potential matches + * will have strictly increasing offsets. To make things faster for + * higher-level parsing / match-choosing code, we do not want to return + * a shorter match that has a larger offset than a longer match. This + * is because a longer match can always be truncated to a shorter match + * if needed, and smaller offsets usually (depending on the compression + * format) take fewer bits to encode than larger offsets. + * Consequently, we keep a potential match only if it is longer than the + * previous longest match that has been found. This has the added + * advantage of producing the array of matches sorted by strictly + * increasing lengths as well as strictly decreasing offsets. + * + * In degenerate cases, the binary tree might become severely + * unbalanced. To prevent excessive running times, we stop immediately + * (and return any matches that happen to have been found so far) if the + * current depth exceeds @max_depth. Note that this cutoff can occur + * before the longest match has been found, which is usually bad for the + * compression ratio. + * + * --------------------------------------------------------------------- + * + * Updating + * + * I've explained how to find matches by searching the binary tree of + * the current hash code. But how do we get the binary tree in the + * first place? Since the tree is built incrementally, the real + * question is how do we update the tree to "add" the current window + * position. + * + * The tree maintains the invariant that a node's parent always has a + * larger position (a.k.a. smaller match offset) than itself. + * Therefore, the root node must always have the largest position; and + * since the current position is larger than any previous position, the + * current position must become the root of the tree. + * + * A correct, but silly, approach is to simply add the previous root as + * a child of the new root, using either the left or right child pointer + * depending on the lexicographic ordering of the strings. This works, + * but it really just produces a linked list, so it's not sufficient. + * + * Instead, we can initially mark the new root's left child pointer as + * "pending (less than)" and its right child pointer as "pending + * (greater than)". Then, during the search, when we examine a match + * that is lexicographically less than the current string, we link the + * "pending (less than)" pointer to the node of that match, then set the + * right child pointer of *that* node as "pending (less than)". 
+ * Similarly, when we examine a match that is lexicographically greater + * than the current string, we link the "pending (greater than)" pointer + * to the node of that match, then set the left child pointer of *that* + * node as "pending (greater than)". + * + * If the search terminates before the current string is found (up to a + * precision of @max_len bytes), then we set "pending (less than)" and + * "pending (greater than)" to point to nothing. Alternatively, if the + * search terminates due to finding the current string (up to a + * precision of @max_len bytes), then we set "pending (less than)" and + * "pending (greater than)" to point to the appropriate children of that + * match. + * + * Why does this work? Well, we can think of it this way: the "pending + * (less than)" pointer is reserved for the next match we find that is + * lexicographically *less than* the current string, and the "pending + * (greater than)" pointer is reserved for the next match we find that + * is lexicographically *greater than* the current string. This + * explains why when we find a match that is lexicographically less than + * the current string, we set the "pending (less than)" pointer to point + * to that match. And the reason we change "pending (less than)" to the + * right pointer of the match in that case is because we're walking down + * into that subtree, and the next match lexicographically *less than* + * the current string is guaranteed to be lexicographically *greater + * than* that match, so it should be set as the right subtree of that + * match. But the next match in that subtree that is lexicographically + * *greater than* the current string will need to be moved to the + * "pending (greater than)" pointer farther up the tree. + * + * It's complicated, but it should make sense if you think about it. + * The algorithm basically just moves subtrees into the correct + * locations as it walks down the tree for the search. But also, if the + * algorithm actually finds a match of length @max_len with the current + * string, it no longer needs that match node and can discard it. The + * algorithm also will discard nodes if the search terminates due to the + * depth limit. For these reasons, the binary tree might not, in fact, + * contain all valid positions. 
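+ *
+ * A tiny illustration of the length bookkeeping described above (example
+ * strings chosen arbitrarily): suppose the current string begins with
+ * "abcde", the longest lexicographically-lesser match examined so far
+ * begins with "abcda" (longest_lt_match_len = 4), and the longest
+ * lexicographically-greater match examined so far begins with "abd"
+ * (longest_gt_match_len = 2).  The next node visited is then compared
+ * starting at byte index min(4, 2) = 2, because its first 2 bytes are
+ * already known to equal "ab".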
+ */ + + lz_bt_len_t num_matches = 0; + lz_bt_len_t longest_lt_match_len = 0; + lz_bt_len_t longest_gt_match_len = 0; + lz_bt_len_t longest_match_len = min_len - 1; + lz_bt_pos_t *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0]; + lz_bt_pos_t *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1]; + const u8 *strptr = &window[cur_window_pos]; + u32 depth_remaining = max_depth; + for (;;) { + const u8 *matchptr; + lz_bt_len_t len; + + if (depth_remaining-- == 0 || cur_match == 0) { + *pending_lt_ptr = 0; + *pending_gt_ptr = 0; + return num_matches; + } + + matchptr = &window[cur_match]; + len = min(longest_lt_match_len, longest_gt_match_len); + + if (matchptr[len] == strptr[len]) { + + while (++len != max_len) + if (matchptr[len] != strptr[len]) + break; + + if (len > longest_match_len) { + longest_match_len = len; + + matches[num_matches++] = (struct raw_match) { + .len = len, + .offset = cur_window_pos - cur_match, + }; + + if (len == max_len) { + *pending_lt_ptr = child_tab[cur_match * 2 + 0]; + *pending_gt_ptr = child_tab[cur_match * 2 + 1]; + return num_matches; + } + } + } + + if (matchptr[len] < strptr[len]) { + *pending_lt_ptr = cur_match; + pending_lt_ptr = &child_tab[cur_match * 2 + 1]; + cur_match = *pending_lt_ptr; + longest_lt_match_len = len; + } else { + *pending_gt_ptr = cur_match; + pending_gt_ptr = &child_tab[cur_match * 2 + 0]; + cur_match = *pending_gt_ptr; + longest_gt_match_len = len; + } + } +} + +/* + * Retrieve a list of matches at the next position in the window. + * + * @mf + * The binary tree match-finder structure into which a window has been + * loaded using lz_bt_load_window(). + * @matches + * The array into which the matches will be returned. The length of this + * array must be at least (@mf->num_fast_bytes - @mf->min_match_len + 1). + * + * The return value is the number of matches that were found and stored in the + * 'matches' array. The matches will be ordered by strictly increasing length + * and strictly increasing offset. No match shall have length less than + * @min_match_len, and no match shall have length greater than @max_match_len. + * The return value may be 0, which indicates that no matches were found. + * + * On completion, the binary tree match-finder is advanced to the next position + * in the window. + */ +lz_bt_len_t +lz_bt_get_matches(struct lz_bt *mf, struct raw_match matches[]) +{ + lz_bt_pos_t bytes_remaining; + lz_bt_len_t num_matches; + lz_bt_pos_t cur_match; + u32 hash; + + LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size); + + bytes_remaining = lz_bt_get_remaining_size(mf); + + /* If there are fewer than 3 bytes remaining, we can't even compute a + * hash to look up a binary tree root. If there are exactly 2 bytes + * remaining we could still search for a length-2 match using the digram + * table, but it's not worth bothering. (Note: this is also useful for + * LZX, since this excludes the length 2 match having the maximum + * offset, which isn't allowed.) */ + if (bytes_remaining < 3) { + mf->cur_window_pos++; + return 0; + } + + num_matches = 0; + + /* Search the digram table for a length 2 match. */ + if (mf->min_match_len <= 2) { + u8 c1, c2; + u16 digram; + + c1 = mf->cur_window[mf->cur_window_pos]; + c2 = mf->cur_window[mf->cur_window_pos + 1]; + digram = (u16)c1 | ((u16)c2 << 8); + cur_match = mf->digram_tab[digram]; + mf->digram_tab[digram] = mf->cur_window_pos; + + /* We're only interested in matches of length exactly 2, since + * those won't be found during the binary tree search. 
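+	 * A match whose third bytes also agree has length >= 3 and is
+	 * left to the binary tree search below, which indexes positions
+	 * by the hash of their first three bytes.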
*/ + if (cur_match != 0 && mf->cur_window[cur_match + 2] != + mf->cur_window[mf->cur_window_pos + 2]) + { + matches[num_matches++] = (struct raw_match) { + .len = 2, + .offset = mf->cur_window_pos - cur_match, + }; + } + } + + /* Hash the length-3 byte sequence beginning at the current position in + * the window. */ + hash = lz_bt_hash(&mf->cur_window[mf->cur_window_pos]); + + /* The corresponding hash bucket in 'hash_tab' contains the root of the + * binary tree of previous window positions that have the same hash + * code. */ + cur_match = mf->hash_tab[hash]; + + /* Update the hash bucket to point to the binary tree rooted at the + * current position, which we will construct in do_search(). */ + mf->hash_tab[hash] = mf->cur_window_pos; + + /* Search the binary tree for matches. At the same time, build the + * binary tree rooted at the current position, which replaces the one we + * search. */ + num_matches += do_search(mf->cur_window, + mf->cur_window_pos, + max(3, mf->min_match_len), + min(bytes_remaining, mf->num_fast_bytes), + mf->max_search_depth, + mf->child_tab, + cur_match, + &matches[num_matches]); + + /* If the longest match is @num_fast_bytes in length, it may have been + * truncated. Try extending it up to the maximum match length. */ + if (num_matches != 0 && matches[num_matches - 1].len == mf->num_fast_bytes) { + lz_bt_pos_t limit; + const u8 *strptr, *matchptr; + lz_bt_len_t len; + + limit = min(bytes_remaining, mf->max_match_len); + strptr = &mf->cur_window[mf->cur_window_pos]; + matchptr = strptr - matches[num_matches - 1].offset; + len = matches[num_matches - 1].len; + while (len < limit && strptr[len] == matchptr[len]) + len++; + matches[num_matches - 1].len = len; + } + +#ifdef ENABLE_LZ_DEBUG + /* Check the matches. */ + for (lz_bt_len_t i = 0; i < num_matches; i++) { + const u8 *matchptr, *strptr; + + /* Length valid? */ + LZ_ASSERT(matches[i].len >= mf->min_match_len); + LZ_ASSERT(matches[i].len <= min(mf->max_match_len, bytes_remaining)); + + /* Offset valid? */ + LZ_ASSERT(matches[i].offset >= 1); + LZ_ASSERT(matches[i].offset <= lz_bt_get_position(mf)); + + /* Lengths and offsets strictly increasing? */ + if (i > 0) { + LZ_ASSERT(matches[i].len > matches[i - 1].len); + LZ_ASSERT(matches[i].offset > matches[i - 1].offset); + } + + /* Actually a match? */ + strptr = lz_bt_get_window_ptr(mf); + matchptr = strptr - matches[i].offset; + LZ_ASSERT(!memcmp(strptr, matchptr, matches[i].len)); + + /* Match can't be extended further? */ + LZ_ASSERT(matches[i].len == min(mf->max_match_len, bytes_remaining) || + strptr[matches[i].len] != matchptr[matches[i].len]); + } +#endif /* ENABLE_LZ_DEBUG */ + + /* Advance to the next position in the window. */ + mf->cur_window_pos++; + + /* Return the number of matches found. */ + return num_matches; +} + +/* This is the same as do_search(), but it does not save any matches. + * See do_search() for explanatory comments. 
*/ +static void +do_skip(const u8 window[restrict], + const lz_bt_pos_t cur_window_pos, + const lz_bt_len_t max_len, + u32 depth_remaining, + lz_bt_pos_t child_tab[restrict], + lz_bt_pos_t cur_match) +{ + lz_bt_len_t longest_lt_match_len = 0; + lz_bt_len_t longest_gt_match_len = 0; + lz_bt_pos_t *pending_lt_ptr = &child_tab[cur_window_pos * 2 + 0]; + lz_bt_pos_t *pending_gt_ptr = &child_tab[cur_window_pos * 2 + 1]; + const u8 * const strptr = &window[cur_window_pos]; + for (;;) { + const u8 *matchptr; + lz_bt_len_t len; + + if (depth_remaining-- == 0 || cur_match == 0) { + *pending_lt_ptr = 0; + *pending_gt_ptr = 0; + return; + } + + matchptr = &window[cur_match]; + len = min(longest_lt_match_len, longest_gt_match_len); + + if (matchptr[len] == strptr[len]) { + do { + if (++len == max_len) { + *pending_lt_ptr = child_tab[cur_match * 2 + 0]; + *pending_gt_ptr = child_tab[cur_match * 2 + 1]; + return; + } + } while (matchptr[len] == strptr[len]); + } + if (matchptr[len] < strptr[len]) { + *pending_lt_ptr = cur_match; + pending_lt_ptr = &child_tab[cur_match * 2 + 1]; + cur_match = *pending_lt_ptr; + longest_lt_match_len = len; + } else { + *pending_gt_ptr = cur_match; + pending_gt_ptr = &child_tab[cur_match * 2 + 0]; + cur_match = *pending_gt_ptr; + longest_gt_match_len = len; + } + } +} + +/* Skip the current position in the binary tree match-finder. */ +static void +lz_bt_skip_position(struct lz_bt *mf) +{ + lz_bt_pos_t bytes_remaining; + u32 hash; + lz_bt_pos_t cur_match; + + LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size); + + bytes_remaining = lz_bt_get_remaining_size(mf); + + /* As explained in lz_bt_get_matches(), we don't search for matches if + * there are fewer than 3 bytes remaining in the window. */ + if (bytes_remaining < 3) { + mf->cur_window_pos++; + return; + } + + /* Update the digram table. */ + if (mf->min_match_len <= 2) { + u8 c1, c2; + u16 digram; + + c1 = mf->cur_window[mf->cur_window_pos]; + c2 = mf->cur_window[mf->cur_window_pos + 1]; + digram = (u16)c1 | ((u16)c2 << 8); + mf->digram_tab[digram] = mf->cur_window_pos; + } + + /* Update the hash table. */ + hash = lz_bt_hash(&mf->cur_window[mf->cur_window_pos]); + cur_match = mf->hash_tab[hash]; + mf->hash_tab[hash] = mf->cur_window_pos; + + /* Update the binary tree for the appropriate hash code. */ + do_skip(mf->cur_window, + mf->cur_window_pos, + min(bytes_remaining, mf->num_fast_bytes), + mf->max_search_depth, + mf->child_tab, + cur_match); + + /* Advance to the next position. */ + mf->cur_window_pos++; +} + +/* Skip 'n' positions in the binary tree match-finder. */ +void +lz_bt_skip_positions(struct lz_bt *mf, unsigned n) +{ + while (n--) + lz_bt_skip_position(mf); +} diff --git a/src/lz_sarray.c b/src/lz_sarray.c deleted file mode 100644 index 82ac97b9..00000000 --- a/src/lz_sarray.c +++ /dev/null @@ -1,558 +0,0 @@ -/* - * lz_sarray.c - * - * Suffix array match-finder for Lempel-Ziv compression. - */ - -/* - * Copyright (c) 2013, 2014 Eric Biggers. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "wimlib/divsufsort.h" -#include "wimlib/lz_sarray.h" -#include "wimlib/util.h" -#include - -/* If ENABLE_LZ_DEBUG is defined, verify that the suffix array satisfies its - * definition. - * - * @SA The constructed suffix array. - * @T The original data. - * @found Temporary 'bool' array of length @n. - * @n Length of the data (length of @SA, @T, and @found arrays). - * - * WARNING: this is for debug use only as it does not necessarily run in linear - * time!!! */ -static void -verify_suffix_array(const lz_sarray_pos_t SA[restrict], - const u8 T[restrict], - bool found[restrict], - lz_sarray_pos_t n) -{ -#ifdef ENABLE_LZ_DEBUG - /* Ensure the SA contains exactly one of each i in [0, n - 1]. */ - for (lz_sarray_pos_t i = 0; i < n; i++) - found[i] = false; - for (lz_sarray_pos_t r = 0; r < n; r++) { - lz_sarray_pos_t i = SA[r]; - LZ_ASSERT(i < n); - LZ_ASSERT(!found[i]); - found[i] = true; - } - - /* Ensure the suffix with rank r is lexicographically lesser than the - * suffix with rank (r + 1) for all r in [0, n - 2]. */ - for (lz_sarray_pos_t r = 0; r < n - 1; r++) { - - lz_sarray_pos_t i1 = SA[r]; - lz_sarray_pos_t i2 = SA[r + 1]; - - lz_sarray_pos_t n1 = n - i1; - lz_sarray_pos_t n2 = n - i2; - - int res = memcmp(&T[i1], &T[i2], min(n1, n2)); - LZ_ASSERT(res < 0 || (res == 0 && n1 < n2)); - } -#endif /* ENABLE_LZ_DEBUG */ -} - -/* Compute the inverse suffix array @ISA from the suffix array @SA in linear - * time. - * - * Whereas the suffix array is a mapping from suffix rank to suffix position, - * the inverse suffix array is a mapping from suffix position to suffix rank. - */ -static void -compute_inverse_suffix_array(lz_sarray_pos_t ISA[restrict], - const lz_sarray_pos_t SA[restrict], - lz_sarray_pos_t n) -{ - lz_sarray_pos_t r; - - for (r = 0; r < n; r++) - ISA[SA[r]] = r; -} - - -/* Compute the LCP (Longest Common Prefix) array in linear time. - * - * LCP[r] will be the length of the longest common prefix between the suffixes - * with positions SA[r - 1] and SA[r]. LCP[0] will be undefined. - * - * Algorithm adapted from Kasai et al. 2001: "Linear-Time Longest-Common-Prefix - * Computation in Suffix Arrays and Its Applications". Modified slightly to - * take into account that with bytes in the real world, there is no unique - * symbol at the end of the string. 
*/ -static void -compute_lcp_array(lz_sarray_pos_t LCP[restrict], - const lz_sarray_pos_t SA[restrict], - const lz_sarray_pos_t ISA[restrict], - const u8 T[restrict], - lz_sarray_pos_t n) -{ - lz_sarray_pos_t h, i, r, j, lim; - - h = 0; - for (i = 0; i < n; i++) { - r = ISA[i]; - if (r > 0) { - j = SA[r - 1]; - lim = min(n - i, n - j); - - while (h < lim && T[i + h] == T[j + h]) - h++; - LCP[r] = h; - if (h > 0) - h--; - } - } -} - -/* If ENABLE_LZ_DEBUG is defined, verify that the LCP (Longest Common Prefix) - * array satisfies its definition. - * - * WARNING: this is for debug use only as it does not necessarily run in linear - * time!!! */ -static void -verify_lcp_array(lz_sarray_pos_t LCP[restrict], - const lz_sarray_pos_t SA[restrict], - const u8 T[restrict], - lz_sarray_pos_t n) -{ -#ifdef ENABLE_LZ_DEBUG - for (lz_sarray_pos_t r = 0; r < n - 1; r++) { - lz_sarray_pos_t i1 = SA[r]; - lz_sarray_pos_t i2 = SA[r + 1]; - lz_sarray_pos_t lcp = LCP[r + 1]; - - lz_sarray_pos_t n1 = n - i1; - lz_sarray_pos_t n2 = n - i2; - - LZ_ASSERT(lcp <= min(n1, n2)); - - LZ_ASSERT(memcmp(&T[i1], &T[i2], lcp) == 0); - if (lcp < min(n1, n2)) - LZ_ASSERT(T[i1 + lcp] != T[i2 + lcp]); - } -#endif /* ENABLE_LZ_DEBUG */ -} - -/* Initialize the SA link array in linear time. - * - * This is similar to computing the LPF (Longest Previous Factor) array, which - * is addressed in several papers. In particular the algorithms below are based - * on Crochemore et al. 2009: "LPF computation revisited". However, this - * match-finder does not actually compute or use the LPF array per se. Rather, - * this function sets up some information necessary to compute the LPF array, - * but later lz_sarray_get_matches() actually uses this information to search - * the suffix array directly and can keep searching beyond the first (longest) - * match whose length would be placed in the LPF array. This difference from - * the theoretical work is necessary because in many real compression formats - * matches take variable numbers of bits to encode, so a decent parser needs to - * consider more than just the longest match with unspecified offset. - * - * Note: We cap the lcpprev and lcpnext values to the maximum match length so - * that the match-finder need not worry about it later, in the inner loop. - * - * Note: the LCP array is one of the inputs to this function, but it is used as - * temporary space and therefore will be invalidated. - */ -static void -init_salink(struct salink link[restrict], - lz_sarray_pos_t LCP[restrict], - const lz_sarray_pos_t SA[restrict], - const u8 T[restrict], - lz_sarray_pos_t n, - lz_sarray_len_t min_match_len, - lz_sarray_len_t max_match_len) -{ - /* Calculate salink.dist_to_next and salink.lcpnext. - * - * Pass 1 calculates, for each suffix rank, the corresponding - * "next_initial" value which is the smallest larger rank that - * corresponds to a suffix starting earlier in the string. It also - * calculates "lcpnext_initial", which is the longest common prefix with - * that suffix, although to eliminate checks in lz_sarray_get_matches(), - * "lcpnext_initial" is set to 0 if it's less than the minimum match - * length or set to the maximum match length if it's greater than the - * maximum match length. - * - * Pass 2 translates each absolute "next_initial", a 4-byte value, into - * a relative "dist_to_next", a 1-byte value. This is done to save - * memory. In the case that the exact relative distance cannot be - * encoded in 1 byte, it is capped to 255. 
This is valid as long as - * lz_sarray_get_matches() validates each position before using it. - * Note that "lcpnext" need not be updated in this case because it will - * not be used until the actual next rank has been found anyway. - */ - link[n - 1].next_initial = LZ_SARRAY_POS_MAX; - link[n - 1].lcpnext_initial = 0; - for (lz_sarray_pos_t r = n - 2; r != LZ_SARRAY_POS_MAX; r--) { - lz_sarray_pos_t t = r + 1; - lz_sarray_pos_t l = LCP[t]; - while (t != LZ_SARRAY_POS_MAX && SA[t] > SA[r]) { - l = min(l, link[t].lcpnext_initial); - t = link[t].next_initial; - } - link[r].next_initial = t; - - if (l < min_match_len) - l = 0; - else if (l > max_match_len) - l = max_match_len; - link[r].lcpnext_initial = l; - } - for (lz_sarray_pos_t r = 0; r < n; r++) { - lz_sarray_pos_t next; - lz_sarray_len_t l; - lz_sarray_delta_t dist_to_next; - - next = link[r].next_initial; - l = link[r].lcpnext_initial; - - if (next == LZ_SARRAY_POS_MAX) - dist_to_next = 0; - else if (next - r <= LZ_SARRAY_DELTA_MAX) - dist_to_next = next - r; - else - dist_to_next = LZ_SARRAY_DELTA_MAX; - - link[r].lcpnext = l; - link[r].dist_to_next = dist_to_next; - } - - /* Calculate salink.dist_to_prev and salink.lcpprev. - * - * This is analgous to dist_to_next and lcpnext as described above, but - * in the other direction. That is, here we're interested in, for each - * rank, the largest smaller rank that corresponds to a suffix starting - * earlier in the string. - * - * To save memory we don't have a "prev_initial" field, but rather store - * those values in the LCP array. */ - LCP[0] = LZ_SARRAY_POS_MAX; - link[0].lcpprev = 0; - for (lz_sarray_pos_t r = 1; r < n; r++) { - lz_sarray_pos_t t = r - 1; - lz_sarray_pos_t l = LCP[r]; - while (t != LZ_SARRAY_POS_MAX && SA[t] > SA[r]) { - l = min(l, link[t].lcpprev); - t = LCP[t]; - } - LCP[r] = t; - - if (l < min_match_len) - l = 0; - else if (l > max_match_len) - l = max_match_len; - - link[r].lcpprev = l; - } - for (lz_sarray_pos_t r = 0; r < n; r++) { - - lz_sarray_pos_t prev = LCP[r]; - - if (prev == LZ_SARRAY_POS_MAX) - link[r].dist_to_prev = 0; - else if (r - prev <= LZ_SARRAY_DELTA_MAX) - link[r].dist_to_prev = r - prev; - else - link[r].dist_to_prev = LZ_SARRAY_DELTA_MAX; - } -} - -/* If ENABLE_LZ_DEBUG is defined, verify the values computed by init_salink(). - * - * WARNING: this is for debug use only as it does not necessarily run in linear - * time!!! 
*/ -static void -verify_salink(const struct salink link[], - const lz_sarray_pos_t SA[], - const u8 T[], - lz_sarray_pos_t n, - lz_sarray_len_t min_match_len, - lz_sarray_len_t max_match_len) -{ -#ifdef ENABLE_LZ_DEBUG - for (lz_sarray_pos_t r = 0; r < n; r++) { - for (lz_sarray_pos_t prev = r; ; ) { - if (prev == 0) { - LZ_ASSERT(link[r].dist_to_prev == 0); - LZ_ASSERT(link[r].lcpprev == 0); - break; - } - - prev--; - - if (SA[prev] < SA[r]) { - LZ_ASSERT(link[r].dist_to_prev == min(r - prev, LZ_SARRAY_DELTA_MAX)); - - lz_sarray_pos_t lcpprev; - for (lcpprev = 0; - lcpprev < min(n - SA[prev], n - SA[r]) && - T[SA[prev] + lcpprev] == T[SA[r] + lcpprev]; - lcpprev++) - ; - if (lcpprev < min_match_len) - lcpprev = 0; - else if (lcpprev > max_match_len) - lcpprev = max_match_len; - - LZ_ASSERT(lcpprev == link[r].lcpprev); - break; - } - } - - for (lz_sarray_pos_t next = r; ; ) { - if (next == n - 1) { - LZ_ASSERT(link[r].dist_to_next == 0); - LZ_ASSERT(link[r].lcpnext == 0); - break; - } - - next++; - - if (SA[next] < SA[r]) { - LZ_ASSERT(link[r].dist_to_next == min(next - r, LZ_SARRAY_DELTA_MAX)); - - lz_sarray_pos_t lcpnext; - for (lcpnext = 0; - lcpnext < min(n - SA[next], n - SA[r]) && - T[SA[next] + lcpnext] == T[SA[r] + lcpnext]; - lcpnext++) - ; - if (lcpnext < min_match_len) - lcpnext = 0; - else if (lcpnext > max_match_len) - lcpnext = max_match_len; - - LZ_ASSERT(lcpnext == link[r].lcpnext); - break; - } - } - } -#endif -} - -/* - * Initialize the suffix array match-finder. - * - * @mf - * The suffix array match-finder structure to initialize. This structure - * is expected to be zeroed before this function is called. In the case - * that this function fails, lz_sarray_destroy() should be called to free - * any memory that may have been allocated. - * - * @max_window_size - * The maximum window size to support. This must be greater than 0. - * - * The amount of needed memory will depend on this value; see - * lz_sarray_get_needed_memory() for details. - * - * @min_match_len - * The minimum length of each match to be found. Must be greater than 0. - * - * @max_match_len - * The maximum length of each match to be found. Must be greater than or - * equal to @min_match_len. - * - * @max_matches_to_consider - * The maximum number of matches to consider at each position. This should - * be greater than @max_matches_to_return because @max_matches_to_consider - * counts all the returned matches as well as matches of equal length to - * returned matches that were not returned. This parameter bounds the - * amount of work the match-finder does at any one position. This could be - * anywhere from 1 to 100+ depending on the compression ratio and - * performance desired. - * - * @max_matches_to_return - * Maximum number of matches to return at each position. Because of the - * suffix array search algorithm, the order in which matches are returned - * will be from longest to shortest, so cut-offs due to this parameter will - * only result in shorter matches being discarded. This parameter could be - * anywhere from 1 to (@max_match_len - @min_match_len + 1) depending on - * the compression performance desired. However, making it even moderately - * large (say, greater than 3) may not be very helpful due to the property - * that the matches are returned from longest to shortest. 
But the main - * thing to keep in mind is that if the compressor decides to output a - * shorter-than-possible match, ideally it would be best to choose the best - * match of the desired length rather than truncate a longer match to that - * length. - * - * After initialization, the suffix-array match-finder can be used for any - * number of input strings (windows) of length less than or equal to - * @max_window_size by successive calls to lz_sarray_load_window(). - * - * Returns %true on success, or %false if sufficient memory could not be - * allocated. See the note for @max_window_size above regarding the needed - * memory size. - */ -bool -lz_sarray_init(struct lz_sarray *mf, - lz_sarray_pos_t max_window_size, - lz_sarray_len_t min_match_len, - lz_sarray_len_t max_match_len, - u32 max_matches_to_consider, - u32 max_matches_to_return) -{ - LZ_ASSERT(min_match_len > 0); - LZ_ASSERT(max_window_size > 0); - LZ_ASSERT(max_match_len >= min_match_len); - - mf->max_window_size = max_window_size; - mf->min_match_len = min_match_len; - mf->max_match_len = max_match_len; - mf->max_matches_to_consider = max_matches_to_consider; - mf->max_matches_to_return = max_matches_to_return; - - /* SA and ISA will share the same storage block. */ - if ((u64)2 * max_window_size * sizeof(mf->SA[0]) != - 2 * max_window_size * sizeof(mf->SA[0])) - return false; - mf->SA = MALLOC(max_window_size * sizeof(mf->SA[0]) + - max(DIVSUFSORT_TMP1_SIZE, - max_window_size * sizeof(mf->SA[0]))); - if (mf->SA == NULL) - return false; - - if ((u64)max_window_size * sizeof(mf->salink[0]) != - max_window_size * sizeof(mf->salink[0])) - return false; - mf->salink = MALLOC(max(DIVSUFSORT_TMP2_SIZE, - max_window_size * sizeof(mf->salink[0]))); - if (mf->salink == NULL) - return false; - - return true; -} - -/* - * Return the number of bytes of memory that lz_sarray_init() would allocate for - * the specified maximum window size. - * - * This should be (14 * @max_window_size) unless the type definitions have been - * changed. - */ -u64 -lz_sarray_get_needed_memory(lz_sarray_pos_t max_window_size) -{ - u64 size = 0; - - /* SA and ISA: 8 bytes per position */ - size += (u64)max_window_size * sizeof(((struct lz_sarray*)0)->SA[0]) + - max(DIVSUFSORT_TMP1_SIZE, - (u64)max_window_size * sizeof(((struct lz_sarray*)0)->SA[0])); - - /* salink: 6 bytes per position */ - size += max(DIVSUFSORT_TMP2_SIZE, - (u64)max_window_size * sizeof(((struct lz_sarray*)0)->salink[0])); - - return size; -} - -/* - * Prepare the suffix array match-finder to scan the specified window for - * matches. - * - * @mf Suffix array match-finder previously initialized with lz_sarray_init(). - * - * @T Window, or "block", in which to find matches. - * - * @n Size of window in bytes. This must be positive and less than or equal - * to the @max_window_size passed to lz_sarray_init(). - * - * This function runs in linear time (relative to @n). - */ -void -lz_sarray_load_window(struct lz_sarray *mf, const u8 T[], lz_sarray_pos_t n) -{ - lz_sarray_pos_t *ISA, *LCP; - - LZ_ASSERT(n > 0 && n <= mf->max_window_size); - - /* Compute SA (Suffix Array). - * - * divsufsort() needs temporary space --- one array with 256 spaces and - * one array with 65536 spaces. The implementation of divsufsort() has - * been modified from the original to use the provided temporary space - * instead of allocating its own. - * - * We also check at build-time that divsufsort() uses the same integer - * size expected by this code. 
Unfortunately, divsufsort breaks if - * 'sa_idx_t' is defined to be a 16-bit integer; however, that would - * limit blocks to only 65536 bytes anyway. */ - BUILD_BUG_ON(sizeof(lz_sarray_pos_t) != sizeof(saidx_t)); - - divsufsort(T, mf->SA, n, (saidx_t*)&mf->SA[n], (saidx_t*)mf->salink); - - BUILD_BUG_ON(sizeof(bool) > sizeof(mf->salink[0])); - verify_suffix_array(mf->SA, T, (bool*)mf->salink, n); - - /* Compute ISA (Inverse Suffix Array) in a preliminary position. - * - * This is just a trick to save memory. Since LCP is unneeded after - * this function, it can be computed in any available space. The - * storage for the ISA is the best choice because the ISA can be built - * quickly in salink for now, then re-built in its real location at the - * end. This is probably worth it because computing the ISA from the SA - * is very fast, and since this match-finder is memory-hungry we'd like - * to save as much memory as possible. */ - BUILD_BUG_ON(sizeof(mf->salink[0]) < sizeof(mf->ISA[0])); - ISA = (lz_sarray_pos_t*)mf->salink; - compute_inverse_suffix_array(ISA, mf->SA, n); - - /* Compute LCP (Longest Common Prefix) array. */ - LCP = mf->SA + n; - compute_lcp_array(LCP, mf->SA, ISA, T, n); - verify_lcp_array(LCP, mf->SA, T, n); - - /* Initialize suffix array links. */ - init_salink(mf->salink, LCP, mf->SA, T, n, - mf->min_match_len, mf->max_match_len); - verify_salink(mf->salink, mf->SA, T, n, - mf->min_match_len, mf->max_match_len); - - /* Compute ISA (Inverse Suffix Array) in its final position. */ - ISA = mf->SA + n; - compute_inverse_suffix_array(ISA, mf->SA, n); - - /* Save new variables and return. */ - mf->ISA = ISA; - mf->cur_pos = 0; - mf->window_size = n; -} - -/* Free memory allocated for the suffix array match-finder. */ -void -lz_sarray_destroy(struct lz_sarray *mf) -{ - FREE(mf->SA); - FREE(mf->salink); -} diff --git a/src/lzms-compress.c b/src/lzms-compress.c index 3b0caab4..0e9f0c36 100644 --- a/src/lzms-compress.c +++ b/src/lzms-compress.c @@ -3,7 +3,7 @@ */ /* - * Copyright (C) 2013 Eric Biggers + * Copyright (C) 2013, 2014 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -24,6 +24,9 @@ /* This a compressor for the LZMS compression format. More details about this * format can be found in lzms-decompress.c. * + * Also see lzx-compress.c for general information about match-finding and + * match-choosing that also applies to this LZMS compressor. + * * NOTE: this compressor currently does not code any delta matches. */ @@ -38,7 +41,8 @@ #include "wimlib/compress_common.h" #include "wimlib/endianness.h" #include "wimlib/error.h" -#include "wimlib/lz_sarray.h" +#include "wimlib/lz.h" +#include "wimlib/lz_bt.h" #include "wimlib/lzms.h" #include "wimlib/util.h" @@ -46,18 +50,6 @@ #include #include -struct lzms_compressor; -struct lzms_adaptive_state { - struct lzms_lz_lru_queues lru; - u8 main_state; - u8 match_state; - u8 lz_match_state; - u8 lz_repeat_match_state[LZMS_NUM_RECENT_OFFSETS - 1]; -}; -#define LZ_ADAPTIVE_STATE struct lzms_adaptive_state -#define LZ_COMPRESSOR struct lzms_compressor -#include "wimlib/lz_optimal.h" - /* Stucture used for writing raw bits to the end of the LZMS-compressed data as * a series of 16-bit little endian coding units. */ struct lzms_output_bitstream { @@ -168,6 +160,8 @@ struct lzms_huffman_encoder { /* State of the LZMS compressor. */ struct lzms_compressor { + struct wimlib_lzms_compressor_params params; + /* Pointer to a buffer holding the preprocessed data to compress. 
*/ u8 *window; @@ -177,14 +171,16 @@ struct lzms_compressor { /* Size of the data in @buffer. */ u32 window_size; - /* Suffix array match-finder. */ - struct lz_sarray lz_sarray; + /* Binary tree match-finder. */ + struct lz_bt mf; /* Temporary space to store found matches. */ struct raw_match *matches; - /* Match-chooser. */ - struct lz_match_chooser mc; + /* Match-chooser data. */ + struct lzms_mc_pos_data *optimum; + unsigned optimum_cur_idx; + unsigned optimum_end_idx; /* Maximum block size this compressor instantiation allows. This is the * allocated size of @window. */ @@ -220,6 +216,28 @@ struct lzms_compressor { s32 last_target_usages[65536]; }; +struct lzms_mc_pos_data { + u32 cost; +#define MC_INFINITE_COST ((u32)~0UL) + union { + struct { + u32 link; + u32 match_offset; + } prev; + struct { + u32 link; + u32 match_offset; + } next; + }; + struct lzms_adaptive_state { + struct lzms_lz_lru_queues lru; + u8 main_state; + u8 match_state; + u8 lz_match_state; + u8 lz_repeat_match_state[LZMS_NUM_RECENT_OFFSETS - 1]; + } state; +}; + /* Initialize the output bitstream @os to write forwards to the specified * compressed data buffer @out that is @out_limit 16-bit integers long. */ static void @@ -585,21 +603,6 @@ lzms_encode_lz_match(struct lzms_compressor *ctx, u32 length, u32 offset) lzms_end_encode_item(ctx, length); } -/* Fast heuristic cost evaluation to use in the inner loop of the match-finder. - * Unlike lzms_get_lz_match_cost(), which does a true cost evaluation, this - * simply prioritize matches based on their offset. */ -static input_idx_t -lzms_lz_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_lru) -{ - const struct lzms_lz_lru_queues *lru = _lru; - - for (input_idx_t i = 0; i < LZMS_NUM_RECENT_OFFSETS; i++) - if (offset == lru->recent_offsets[i]) - return i; - - return offset; -} - #define LZMS_COST_SHIFT 5 /*#define LZMS_RC_COSTS_USE_FLOATING_POINT*/ @@ -749,29 +752,22 @@ lzms_length_cost(const struct lzms_huffman_encoder *enc, u32 length) } static u32 -lzms_get_matches(struct lzms_compressor *ctx, - const struct lzms_adaptive_state *state, - struct raw_match **matches_ret) +lzms_get_matches(struct lzms_compressor *ctx, struct raw_match **matches_ret) { *matches_ret = ctx->matches; - return lz_sarray_get_matches(&ctx->lz_sarray, - ctx->matches, - lzms_lz_match_cost_fast, - &state->lru); + return lz_bt_get_matches(&ctx->mf, ctx->matches); } static void -lzms_skip_bytes(struct lzms_compressor *ctx, input_idx_t n) +lzms_skip_bytes(struct lzms_compressor *ctx, u32 n) { - while (n--) - lz_sarray_skip_position(&ctx->lz_sarray); + lz_bt_skip_positions(&ctx->mf, n); } static u32 -lzms_get_prev_literal_cost(struct lzms_compressor *ctx, - struct lzms_adaptive_state *state) +lzms_get_literal_cost(struct lzms_compressor *ctx, + struct lzms_adaptive_state *state, u8 literal) { - u8 literal = ctx->window[lz_sarray_get_pos(&ctx->lz_sarray) - 1]; u32 cost = 0; state->lru.upcoming_offset = 0; @@ -788,7 +784,7 @@ lzms_get_prev_literal_cost(struct lzms_compressor *ctx, static u32 lzms_get_lz_match_cost(struct lzms_compressor *ctx, struct lzms_adaptive_state *state, - input_idx_t length, input_idx_t offset) + u32 length, u32 offset) { u32 cost = 0; int recent_offset_idx; @@ -839,25 +835,267 @@ lzms_get_lz_match_cost(struct lzms_compressor *ctx, return cost; } +static struct raw_match +lzms_match_chooser_reverse_list(struct lzms_compressor *ctx, unsigned cur_pos) +{ + unsigned prev_link, saved_prev_link; + unsigned prev_match_offset, saved_prev_match_offset; + + 
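+ /* Walk backwards from 'cur_pos' along the prev.link pointers, rewriting
+  * them as forward next.link pointers, so that the chosen sequence of
+  * matches/literals can then be returned in order, starting from the
+  * beginning of this parse. */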
ctx->optimum_end_idx = cur_pos; + + saved_prev_link = ctx->optimum[cur_pos].prev.link; + saved_prev_match_offset = ctx->optimum[cur_pos].prev.match_offset; + + do { + prev_link = saved_prev_link; + prev_match_offset = saved_prev_match_offset; + + saved_prev_link = ctx->optimum[prev_link].prev.link; + saved_prev_match_offset = ctx->optimum[prev_link].prev.match_offset; + + ctx->optimum[prev_link].next.link = cur_pos; + ctx->optimum[prev_link].next.match_offset = prev_match_offset; + + cur_pos = prev_link; + } while (cur_pos != 0); + + ctx->optimum_cur_idx = ctx->optimum[0].next.link; + + return (struct raw_match) + { .len = ctx->optimum_cur_idx, + .offset = ctx->optimum[0].next.match_offset, + }; +} + +/* This is similar to lzx_get_near_optimal_match() in lzx-compress.c. + * Read that one if you want to understand it. */ static struct raw_match lzms_get_near_optimal_match(struct lzms_compressor *ctx) { + u32 num_matches; + struct raw_match *matches; + struct raw_match match; + u32 longest_len; + u32 longest_rep_len; + u32 longest_rep_offset; + struct raw_match *matchptr; + unsigned cur_pos; + unsigned end_pos; struct lzms_adaptive_state initial_state; + if (ctx->optimum_cur_idx != ctx->optimum_end_idx) { + match.len = ctx->optimum[ctx->optimum_cur_idx].next.link - + ctx->optimum_cur_idx; + match.offset = ctx->optimum[ctx->optimum_cur_idx].next.match_offset; + + ctx->optimum_cur_idx = ctx->optimum[ctx->optimum_cur_idx].next.link; + return match; + } + + ctx->optimum_cur_idx = 0; + ctx->optimum_end_idx = 0; + + longest_rep_len = ctx->params.min_match_length - 1; + if (lz_bt_get_position(&ctx->mf) >= 1) { + u32 limit = min(ctx->params.max_match_length, + lz_bt_get_remaining_size(&ctx->mf)); + for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS; i++) { + u32 offset = ctx->lru.lz.recent_offsets[i]; + const u8 *strptr = lz_bt_get_window_ptr(&ctx->mf); + const u8 *matchptr = strptr - offset; + u32 len = 0; + while (len < limit && strptr[len] == matchptr[len]) + len++; + if (len > longest_rep_len) { + longest_rep_len = len; + longest_rep_offset = offset; + } + } + } + + if (longest_rep_len >= ctx->params.nice_match_length) { + lzms_skip_bytes(ctx, longest_rep_len); + return (struct raw_match) { + .len = longest_rep_len, + .offset = longest_rep_offset, + }; + } + + num_matches = lzms_get_matches(ctx, &matches); + + if (num_matches) { + longest_len = matches[num_matches - 1].len; + if (longest_len >= ctx->params.nice_match_length) { + lzms_skip_bytes(ctx, longest_len - 1); + return matches[num_matches - 1]; + } + } else { + longest_len = 1; + } + initial_state.lru = ctx->lru.lz; initial_state.main_state = ctx->main_range_encoder.state; initial_state.match_state = ctx->match_range_encoder.state; initial_state.lz_match_state = ctx->lz_match_range_encoder.state; for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS - 1; i++) - initial_state.lz_repeat_match_state[i] = - ctx->lz_repeat_match_range_encoders[i].state; - return lz_get_near_optimal_match(&ctx->mc, - lzms_get_matches, - lzms_skip_bytes, - lzms_get_prev_literal_cost, - lzms_get_lz_match_cost, - ctx, - &initial_state); + initial_state.lz_repeat_match_state[i] = ctx->lz_repeat_match_range_encoders[i].state; + + ctx->optimum[1].state = initial_state; + ctx->optimum[1].cost = lzms_get_literal_cost(ctx, + &ctx->optimum[1].state, + *(lz_bt_get_window_ptr(&ctx->mf) - 1)); + ctx->optimum[1].prev.link = 0; + + matchptr = matches; + for (u32 len = 2; len <= longest_len; len++) { + u32 offset = matchptr->offset; + + ctx->optimum[len].state = initial_state; + 
ctx->optimum[len].prev.link = 0; + ctx->optimum[len].prev.match_offset = offset; + ctx->optimum[len].cost = lzms_get_lz_match_cost(ctx, + &ctx->optimum[len].state, + len, offset); + if (len == matchptr->len) + matchptr++; + } + end_pos = longest_len; + + if (longest_rep_len >= ctx->params.min_match_length) { + struct lzms_adaptive_state state; + u32 cost; + + while (end_pos < longest_rep_len) + ctx->optimum[++end_pos].cost = MC_INFINITE_COST; + + state = initial_state; + cost = lzms_get_lz_match_cost(ctx, + &state, + longest_rep_len, + longest_rep_offset); + if (cost <= ctx->optimum[longest_rep_len].cost) { + ctx->optimum[longest_rep_len].state = state; + ctx->optimum[longest_rep_len].prev.link = 0; + ctx->optimum[longest_rep_len].prev.match_offset = longest_rep_offset; + ctx->optimum[longest_rep_len].cost = cost; + } + } + + cur_pos = 0; + for (;;) { + u32 cost; + struct lzms_adaptive_state state; + + cur_pos++; + + if (cur_pos == end_pos || cur_pos == ctx->params.optim_array_length) + return lzms_match_chooser_reverse_list(ctx, cur_pos); + + longest_rep_len = ctx->params.min_match_length - 1; + u32 limit = min(ctx->params.max_match_length, + lz_bt_get_remaining_size(&ctx->mf)); + for (int i = 0; i < LZMS_NUM_RECENT_OFFSETS; i++) { + u32 offset = ctx->optimum[cur_pos].state.lru.recent_offsets[i]; + const u8 *strptr = lz_bt_get_window_ptr(&ctx->mf); + const u8 *matchptr = strptr - offset; + u32 len = 0; + while (len < limit && strptr[len] == matchptr[len]) + len++; + if (len > longest_rep_len) { + longest_rep_len = len; + longest_rep_offset = offset; + } + } + + if (longest_rep_len >= ctx->params.nice_match_length) { + match = lzms_match_chooser_reverse_list(ctx, cur_pos); + + ctx->optimum[cur_pos].next.match_offset = longest_rep_offset; + ctx->optimum[cur_pos].next.link = cur_pos + longest_rep_len; + ctx->optimum_end_idx = cur_pos + longest_rep_len; + + lzms_skip_bytes(ctx, longest_rep_len); + + return match; + } + + num_matches = lzms_get_matches(ctx, &matches); + + if (num_matches) { + longest_len = matches[num_matches - 1].len; + if (longest_len >= ctx->params.nice_match_length) { + match = lzms_match_chooser_reverse_list(ctx, cur_pos); + + ctx->optimum[cur_pos].next.match_offset = + matches[num_matches - 1].offset; + ctx->optimum[cur_pos].next.link = cur_pos + longest_len; + ctx->optimum_end_idx = cur_pos + longest_len; + + lzms_skip_bytes(ctx, longest_len - 1); + + return match; + } + } else { + longest_len = 1; + } + + while (end_pos < cur_pos + longest_len) + ctx->optimum[++end_pos].cost = MC_INFINITE_COST; + + state = ctx->optimum[cur_pos].state; + cost = ctx->optimum[cur_pos].cost + + lzms_get_literal_cost(ctx, + &state, + *(lz_bt_get_window_ptr(&ctx->mf) - 1)); + if (cost < ctx->optimum[cur_pos + 1].cost) { + ctx->optimum[cur_pos + 1].state = state; + ctx->optimum[cur_pos + 1].cost = cost; + ctx->optimum[cur_pos + 1].prev.link = cur_pos; + } + + matchptr = matches; + for (u32 len = 2; len <= longest_len; len++) { + u32 offset; + + offset = matchptr->offset; + state = ctx->optimum[cur_pos].state; + + cost = ctx->optimum[cur_pos].cost + + lzms_get_lz_match_cost(ctx, &state, len, offset); + if (cost < ctx->optimum[cur_pos + len].cost) { + ctx->optimum[cur_pos + len].state = state; + ctx->optimum[cur_pos + len].prev.link = cur_pos; + ctx->optimum[cur_pos + len].prev.match_offset = offset; + ctx->optimum[cur_pos + len].cost = cost; + } + if (len == matchptr->len) + matchptr++; + } + + if (longest_rep_len >= ctx->params.min_match_length) { + + while (end_pos < cur_pos + 
longest_rep_len) + ctx->optimum[++end_pos].cost = MC_INFINITE_COST; + + state = ctx->optimum[cur_pos].state; + + cost = ctx->optimum[cur_pos].cost + + lzms_get_lz_match_cost(ctx, + &state, + longest_rep_len, + longest_rep_offset); + if (cost <= ctx->optimum[cur_pos + longest_rep_len].cost) { + ctx->optimum[cur_pos + longest_rep_len].state = + state; + ctx->optimum[cur_pos + longest_rep_len].prev.link = + cur_pos; + ctx->optimum[cur_pos + longest_rep_len].prev.match_offset = + longest_rep_offset; + ctx->optimum[cur_pos + longest_rep_len].cost = + cost; + } + } + } } /* @@ -865,13 +1103,9 @@ lzms_get_near_optimal_match(struct lzms_compressor *ctx) * * Notes: * - * - This uses near-optimal LZ parsing backed by a suffix-array match-finder. - * More details can be found in the corresponding files (lz_optimal.h, - * lz_sarray.{h,c}). + * - This uses near-optimal LZ parsing backed by a binary tree match-finder. * - * - This does not output any delta matches. It would take a specialized - * algorithm to find them, then more code in lz_optimal.h and here to handle - * evaluating and outputting them. + * - This does not output any delta matches. * * - The costs of literals and matches are estimated using the range encoder * states and the semi-adaptive Huffman codes. Except for range encoding @@ -886,11 +1120,12 @@ lzms_encode(struct lzms_compressor *ctx) { struct raw_match match; - /* Load window into suffix array match-finder. */ - lz_sarray_load_window(&ctx->lz_sarray, ctx->window, ctx->window_size); + /* Load window into the binary tree match-finder. */ + lz_bt_load_window(&ctx->mf, ctx->window, ctx->window_size); /* Reset the match-chooser. */ - lz_match_chooser_begin(&ctx->mc); + ctx->optimum_cur_idx = 0; + ctx->optimum_end_idx = 0; while (ctx->cur_window_pos != ctx->window_size) { match = lzms_get_near_optimal_match(ctx); @@ -1162,8 +1397,8 @@ lzms_free_compressor(void *_ctx) if (ctx) { FREE(ctx->window); FREE(ctx->matches); - lz_sarray_destroy(&ctx->lz_sarray); - lz_match_chooser_destroy(&ctx->mc); + lz_bt_destroy(&ctx->mf); + FREE(ctx->optimum); FREE(ctx); } } @@ -1176,7 +1411,6 @@ static const struct wimlib_lzms_compressor_params lzms_default = { .max_match_length = UINT32_MAX, .nice_match_length = 32, .max_search_depth = 50, - .max_matches_per_pos = 3, .optim_array_length = 1024, }; @@ -1220,22 +1454,24 @@ lzms_create_compressor(size_t max_block_size, ctx->matches = MALLOC(min(params->max_match_length - params->min_match_length + 1, - params->max_matches_per_pos) * + params->max_search_depth + 2) * sizeof(ctx->matches[0])); if (ctx->matches == NULL) goto oom; - if (!lz_sarray_init(&ctx->lz_sarray, max_block_size, - params->min_match_length, - min(params->max_match_length, LZ_SARRAY_LEN_MAX), - params->max_search_depth, - params->max_matches_per_pos)) + if (!lz_bt_init(&ctx->mf, + max_block_size, + params->min_match_length, + params->max_match_length, + params->nice_match_length, + params->max_search_depth)) goto oom; - if (!lz_match_chooser_init(&ctx->mc, - params->optim_array_length, - params->nice_match_length, - params->max_match_length)) + ctx->optimum = MALLOC((params->optim_array_length + + min(params->nice_match_length, + params->max_match_length)) * + sizeof(ctx->optimum[0])); + if (!ctx->optimum) goto oom; /* Initialize position and length slot data if not done already. 
*/ @@ -1245,6 +1481,7 @@ lzms_create_compressor(size_t max_block_size, lzms_init_rc_costs(); ctx->max_block_size = max_block_size; + memcpy(&ctx->params, params, sizeof(*params)); *ctx_ret = ctx; return 0; @@ -1264,13 +1501,13 @@ lzms_get_needed_memory(size_t max_block_size, size += max_block_size; size += sizeof(struct lzms_compressor); - size += lz_sarray_get_needed_memory(max_block_size); - size += lz_match_chooser_get_needed_memory(params->optim_array_length, - params->nice_match_length, - params->max_match_length); - size += min(params->max_match_length - - params->min_match_length + 1, - params->max_matches_per_pos) * + size += lz_bt_get_needed_memory(max_block_size); + size += (params->optim_array_length + + min(params->nice_match_length, + params->max_match_length)) * + sizeof(((struct lzms_compressor *)0)->optimum[0]); + size += min(params->max_match_length - params->min_match_length + 1, + params->max_search_depth + 2) * sizeof(((struct lzms_compressor*)0)->matches[0]); return size; } diff --git a/src/lzx-compress.c b/src/lzx-compress.c index c86b03ca..e65b6c83 100644 --- a/src/lzx-compress.c +++ b/src/lzx-compress.c @@ -1,11 +1,9 @@ /* * lzx-compress.c - * - * LZX compression routines */ /* - * Copyright (C) 2012, 2013 Eric Biggers + * Copyright (C) 2012, 2013, 2014 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * @@ -25,128 +23,208 @@ /* - * This file contains a compressor for the LZX compression format, as used in - * the WIM file format. + * This file contains a compressor for the LZX ("Lempel-Ziv eXtended"?) + * compression format, as used in the WIM (Windows IMaging) file format. This + * code may need some slight modifications to be used outside of the WIM format. + * In particular, in other situations the LZX block header might be slightly + * different, and a sliding window rather than a fixed-size window might be + * required. * - * Format - * ====== + * ---------------------------------------------------------------------------- * - * First, the primary reference for the LZX compression format is the - * specification released by Microsoft. + * Format Overview * - * Second, the comments in lzx-decompress.c provide some more information about - * the LZX compression format, including errors in the Microsoft specification. + * The primary reference for LZX is the specification released by Microsoft. + * However, the comments in lzx-decompress.c provide more information about LZX + * and note some errors in the Microsoft specification. * - * Do note that LZX shares many similarities with DEFLATE, the algorithm used by - * zlib and gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding, - * and certain other details are quite similar, such as the method for storing - * Huffman codes. However, some of the main differences are: + * LZX shares many similarities with DEFLATE, the format used by zlib and gzip. + * Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain details + * are quite similar, such as the method for storing Huffman codes. However, + * the main differences are: * * - LZX preprocesses the data to attempt to make x86 machine code slightly more * compressible before attempting to compress it further. + * * - LZX uses a "main" alphabet which combines literals and matches, with the * match symbols containing a "length header" (giving all or part of the match * length) and a "position slot" (giving, roughly speaking, the order of * magnitude of the match offset). 
- * - LZX does not have static Huffman blocks; however it does have two types of - * dynamic Huffman blocks ("aligned offset" and "verbatim"). + * + * - LZX does not have static Huffman blocks (that is, the kind with preset + * Huffman codes); however it does have two types of dynamic Huffman blocks + * ("verbatim" and "aligned"). + * * - LZX has a minimum match length of 2 rather than 3. + * * - In LZX, match offsets 0 through 2 actually represent entries in an LRU * queue of match offsets. This is very useful for certain types of files, * such as binary files that have repeating records. * - * Algorithms - * ========== + * ---------------------------------------------------------------------------- + * + * Algorithmic Overview * - * There are actually two distinct overall algorithms implemented here. We - * shall refer to them as the "slow" algorithm and the "fast" algorithm. The - * "slow" algorithm spends more time compressing to achieve a higher compression - * ratio compared to the "fast" algorithm. More details are presented below. + * At a high level, any implementation of LZX compression must operate as + * follows: * - * Slow algorithm - * -------------- + * 1. Preprocess the input data to translate the targets of 32-bit x86 call + * instructions to absolute offsets. (Actually, this is required for WIM, + * but might not be in other places LZX is used.) * - * The "slow" algorithm to generate LZX-compressed data is roughly as follows: + * 2. Find a sequence of LZ77-style matches and literal bytes that expands to + * the preprocessed data. * - * 1. Preprocess the input data to translate the targets of x86 call - * instructions to absolute offsets. + * 3. Divide the match/literal sequence into one or more LZX blocks, each of + * which may be "uncompressed", "verbatim", or "aligned". * - * 2. Build the suffix array and inverse suffix array for the input data. The - * suffix array contains the indices of all suffixes of the input data, - * sorted lexcographically by the corresponding suffixes. The "position" of - * a suffix is the index of that suffix in the original string, whereas the - * "rank" of a suffix is the index at which that suffix's position is found - * in the suffix array. + * 4. Output each LZX block. * - * 3. Build the longest common prefix array corresponding to the suffix array. + * Step (1) is fairly straightforward. It requires looking for 0xe8 bytes in + * the input data and performing a translation on the 4 bytes following each + * one. * - * 4. For each suffix, find the highest lower ranked suffix that has a lower - * position, the lowest higher ranked suffix that has a lower position, and - * the length of the common prefix shared between each. This information is - * later used to link suffix ranks into a doubly-linked list for searching - * the suffix array. + * Step (4) is complicated, but it is mostly determined by the LZX format. The + * only real choice we have is what algorithm to use to build the length-limited + * canonical Huffman codes. See lzx_write_all_blocks() for details. * - * 5. Set a default cost model for matches/literals. + * That leaves steps (2) and (3) as where all the hard stuff happens. Focusing + * on step (2), we need to do LZ77-style parsing on the input data, or "window", + * to divide it into a sequence of matches and literals. Each position in the + * window might have multiple matches associated with it, and we need to choose + * which one, if any, to actually use. 
Therefore, the problem can really be
+ * divided into two areas of concern: (a) finding matches at a given position,
+ * which we shall call "match-finding", and (b) choosing whether to use a
+ * match or a literal at a given position, and if using a match, which one (if
+ * there is more than one available). We shall call this "match-choosing". We
+ * first consider match-finding, then match-choosing.
 *
- * 6. Determine the lowest cost sequence of LZ77 matches ((offset, length)
- * pairs) and literal bytes to divide the input into. Raw match-finding is
- * done by searching the suffix array using a linked list to avoid
- * considering any suffixes that start after the current position. Each run
- * of the match-finder returns the approximate lowest-cost longest match as
- * well as any shorter matches that have even lower approximate costs. Each
- * such run also adds the suffix rank of the current position into the linked
- * list being used to search the suffix array. Parsing, or match-choosing,
- * is solved as a minimum-cost path problem using a forward "optimal parsing"
- * algorithm based on the Deflate encoder from 7-Zip. This algorithm moves
- * forward calculating the minimum cost to reach each byte until either a
- * very long match is found or until a position is found at which no matches
- * start or overlap.
+ * ----------------------------------------------------------------------------
 *
- * 7. Build the Huffman codes needed to output the matches/literals.
+ * Match-finding
 *
- * 8. Up to a certain number of iterations, use the resulting Huffman codes to
- * refine a cost model and go back to Step #6 to determine an improved
- * sequence of matches and literals.
+ * Given a position in the window, we want to find LZ77-style "matches" with
+ * that position at previous positions in the window. With LZX, the minimum
+ * match length is 2 and the maximum match length is 257. The only restriction
+ * on offsets is that LZX does not allow the last 2 bytes of the window to match
+ * the beginning of the window.
+ *
- * 9. Output the resulting block using the match/literal sequences and the
- * Huffman codes that were computed for the block.
+ * Depending on how good a compression ratio we want (see the "Match-choosing"
+ * section), we may want to find: (a) all matches, or (b) just the longest
+ * match, or (c) just some "promising" matches that we are able to find quickly,
+ * or (d) just the longest match that we're able to find quickly. Below we
+ * introduce the match-finding methods that the code currently uses or has
+ * previously used:
 *
- * Note: the algorithm does not yet attempt to split the input into multiple LZX
- * blocks; it instead uses a series of blocks of LZX_DIV_BLOCK_SIZE bytes.
+ * - Hash chains. Maintain a table that maps hash codes, computed from
+ * fixed-length byte sequences, to linked lists containing previous window
+ * positions. To search for matches, compute the hash for the current
+ * position in the window and search the appropriate hash chain. When
+ * advancing to the next position, prepend the current position to the
+ * appropriate hash list. This is a good approach for producing matches with
+ * strategy (d) and is useful for fast compression. Therefore, we provide an
+ * option to use this method for LZX compression. See lz_hash.c for the
+ * implementation.
 *
- * Fast algorithm
- * --------------
+ * - Binary trees.
Similar to hash chains, but each hash bucket contains a
+ * binary tree of previous window positions rather than a linked list. This
+ * is a good approach for producing matches with strategy (c) and is useful for
+ * achieving a good compression ratio. Therefore, we provide an option to use
+ * this method; see lz_bt.c for the implementation.
 *
- * The fast algorithm (and the only one available in wimlib v1.5.1 and earlier)
- * spends much less time on the main bottlenecks of the compression process ---
- * that is, the match finding and match choosing. Matches are found and chosen
- * with hash chains using a greedy parse with one position of look-ahead. No
- * block splitting is done; only compressing the full input into an aligned
- * offset block is considered.
+ * - Suffix arrays. This code previously used this method to produce matches
+ * with strategy (c), but I've dropped it because it was slower than the binary
+ * trees approach, used more memory, and did not improve the compression ratio
+ * enough to compensate. Download wimlib v1.6.2 if you want the code.
+ * However, the suffix array method was basically as follows. Build the
+ * suffix array for the entire window. The suffix array contains each
+ * possible window position, sorted by the lexicographic order of the strings
+ * that begin at those positions. Find the matches at a given position by
+ * searching the suffix array outwards, in both directions, from the suffix
+ * array slot for that position. This produces the longest matches first, but
+ * "matches" that actually occur at later positions in the window must be
+ * skipped. To do this skipping, use an auxiliary array with dynamically
+ * constructed linked lists. Also, use the inverse suffix array to quickly
+ * find the suffix array slot for a given position without doing a binary
+ * search.
 *
- * Acknowledgments
- * ===============
+ * ----------------------------------------------------------------------------
 *
- * Acknowledgments to several open-source projects and research papers that made
- * it possible to implement this code:
+ * Match-choosing
 *
- * - divsufsort (author: Yuta Mori), for the suffix array construction code,
- * located in a separate file (divsufsort.c).
+ * Usually, choosing the longest match is best because it encodes the most data
+ * in that one item. However, sometimes the longest match is not optimal
+ * because (a) choosing a long match now might prevent using an even longer
+ * match later, or (b) more generally, what we actually care about is the number
+ * of bits it will ultimately take to output each match or literal, which is
+ * actually dependent on the entropy encoding used by the underlying
+ * compression format. Consequently, a longer match usually, but not always,
+ * takes fewer bits to encode than multiple shorter matches or literals that
+ * cover the same data.
 *
- * - "Linear-Time Longest-Common-Prefix Computation in Suffix Arrays and Its
- * Applications" (Kasai et al. 2001), for the LCP array computation.
+ * This problem of choosing the truly best match/literal sequence is probably
+ * impossible to solve efficiently when combined with entropy encoding. If we
+ * knew how many bits it takes to output each match/literal, then we could
+ * choose the optimal sequence using shortest-path search a la Dijkstra's
+ * algorithm. However, with entropy encoding, the chosen match/literal sequence
+ * affects its own encoding.
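+ *
+ * (As a concrete sketch of that fixed-cost shortest-path parse -- using
+ * hypothetical names, not code from this file -- suppose literal_cost[] and
+ * match_cost() gave exact, fixed bit costs. Then a simple forward pass
+ * would find the minimum-cost parse:
+ *
+ *	cost[0] = 0;
+ *	for (i = 1; i <= n; i++)
+ *		cost[i] = INFINITE_COST;
+ *	for (i = 0; i < n; i++) {
+ *		// taking a literal advances one byte
+ *		if (cost[i] + literal_cost[window[i]] < cost[i + 1]) {
+ *			cost[i + 1] = cost[i] + literal_cost[window[i]];
+ *			take_len[i + 1] = 1;
+ *		}
+ *		// taking a match advances 'len' bytes
+ *		num_matches = get_matches(i, matches);
+ *		for (j = 0; j < num_matches; j++) {
+ *			len = matches[j].len;
+ *			if (cost[i] + match_cost(&matches[j]) < cost[i + len]) {
+ *				cost[i + len] = cost[i] + match_cost(&matches[j]);
+ *				take_len[i + len] = len;
+ *				take_offset[i + len] = matches[j].offset;
+ *			}
+ *		}
+ *	}
+ *	// walking take_len[]/take_offset[] backwards from 'n' yields the parse
+ *
+ * The catch is that the fixed literal_cost[] and match_cost() assumed above
+ * are exactly what entropy encoding does not give us up front.)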
+ * Therefore, we can't know how many bits it will
+ * take to actually output any one match or literal until we have actually
+ * chosen the full sequence of matches and literals.
 *
- * - "LPF computation revisited" (Crochemore et al. 2009) for the prev and next
- * array computations.
+ * Notwithstanding the entropy encoding problem, we also aren't guaranteed to
+ * choose the optimal match/literal sequence unless the match-finder (see
+ * section "Match-finding") provides the match-chooser with all possible matches
+ * at each position. However, this is not computationally efficient. For
+ * example, there might be many matches of the same length, and usually (but not
+ * always) the best choice is the one with the smallest offset. So in practice,
+ * it's fine to only consider the smallest offset for a given match length at a
+ * given position. (Actually, for LZX, it's also worth considering repeat
+ * offsets.)
 *
- * - 7-Zip (author: Igor Pavlov) for the algorithm for forward optimal parsing
- * (match-choosing).
+ * In addition, as mentioned earlier, in LZX we have the choice of using
+ * multiple blocks, each of which resets the Huffman codes. This expands the
+ * search space even further. Therefore, to simplify the problem, we currently
+ * don't attempt to actually choose the LZX blocks based on the data.
+ * Instead, we just divide the data into fixed-size blocks of LZX_DIV_BLOCK_SIZE
+ * bytes each, and always use verbatim or aligned blocks (never uncompressed).
+ * A previous version of this code recursively split the input data into
+ * equal-sized blocks, up to a maximum depth, and chose the lowest-cost block
+ * divisions. However, this made compression much slower and did not actually
+ * help very much. It remains an open question whether a sufficiently fast and
+ * useful block-splitting algorithm is possible for LZX. Essentially the same
+ * problem also applies to DEFLATE. The Microsoft LZX compressor seemingly does
+ * do block splitting, although I don't know how fast or useful it is,
+ * specifically.
 *
- * - zlib (author: Jean-loup Gailly and Mark Adler), for the hash table
- * match-finding algorithm (used in lz77.c).
+ * Now, back to the entropy encoding problem. The "solution" is to use an
+ * iterative approach to compute a good, but not necessarily optimal,
+ * match/literal sequence. Start with a fixed assignment of symbol costs and
+ * choose an "optimal" match/literal sequence based on those costs, using
+ * shortest-path search a la Dijkstra's algorithm. Then, for each iteration of
+ * the optimization, update the costs based on the entropy encoding of the
+ * current match/literal sequence, then choose a new match/literal sequence
+ * based on the updated costs. Usually, the actual cost to output the current
+ * match/literal sequence will decrease in each iteration until it converges on
+ * a fixed point. This result may not be the truly optimal match/literal
+ * sequence, but it usually is much better than one chosen by doing a "greedy"
+ * parse where we always choose the longest match.
 *
- * - lzx-compress (author: Matthew T. Russotto), on which some parts of this
- * code were originally based.
+ * An alternative to both greedy parsing and iterative, near-optimal parsing is
+ * "lazy" parsing. Briefly, "lazy" parsing considers just the longest match at
+ * each position, but it waits to choose that match until it has also examined
+ * the next position. This is actually a useful approach; it's used by zlib,
+ * for example.
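+ *
+ * (One step of such a lazy parse, sketched with hypothetical helpers rather
+ * than code from this file, looks roughly like:
+ *
+ *	cur = longest_match_at(pos);
+ *	if (cur.len < MIN_MATCH_LEN) {
+ *		output_literal(window[pos]);		// no usable match
+ *		pos += 1;
+ *	} else if (pos + 1 < window_size &&
+ *		   longest_match_at(pos + 1).len > cur.len) {
+ *		output_literal(window[pos]);		// defer to the longer
+ *		pos += 1;				// match at the next byte
+ *	} else {
+ *		output_match(cur.len, cur.offset);
+ *		pos += cur.len;
+ *	}
+ *
+ * A real implementation would of course reuse the look-ahead result from
+ * longest_match_at(pos + 1) instead of recomputing it on the next step.)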
Therefore, for fast compression we combine lazy parsing with + * the hash chain match-finder. For normal/high compression we combine + * near-optimal parsing with the binary tree match-finder. + * + * Anyway, if you've read through this comment, you should hopefully have a + * better idea of why things are done in a certain way in this LZX compressor, + * as well as in other compressors for LZ77-based formats (including third-party + * ones). In my opinion, the phrase "compression algorithm" is often mis-used + * in place of "compression format", since there can be many different + * algorithms that all generate compressed data in the same format. The + * challenge is to design an algorithm that is efficient but still gives a good + * compression ratio. */
#ifdef HAVE_CONFIG_H @@ -158,8 +236,9 @@ #include "wimlib/compress_common.h" #include "wimlib/endianness.h" #include "wimlib/error.h" +#include "wimlib/lz.h" #include "wimlib/lz_hash.h" -#include "wimlib/lz_sarray.h" +#include "wimlib/lz_bt.h" #include "wimlib/lzx.h" #include "wimlib/util.h" #include @@ -168,14 +247,17 @@ # include "wimlib/decompress_common.h" #endif -typedef u32 block_cost_t; -#define INFINITE_BLOCK_COST (~(block_cost_t)0) - #define LZX_OPTIM_ARRAY_SIZE 4096 #define LZX_DIV_BLOCK_SIZE 32768 -#define LZX_MAX_CACHE_PER_POS 10 +#define LZX_CACHE_PER_POS 10 + +#define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1)) +#define LZX_CACHE_SIZE (LZX_CACHE_LEN * sizeof(struct raw_match)) + +/* Dependent on behavior of lz_bt_get_matches(). */ +#define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) /* Codewords for the LZX main, length, and aligned offset Huffman codes */ struct lzx_codewords { @@ -250,24 +332,16 @@ struct lzx_block_spec { /* The number of bytes of uncompressed data this block represents. */ input_idx_t block_size; - /* The position in the 'chosen_matches' array in the `struct - * lzx_compressor' at which the match/literal specifications for - * this block begin. */ - input_idx_t chosen_matches_start_pos; + /* The match/literal sequence for this block. */ + struct lzx_match *chosen_matches; - /* The number of match/literal specifications for this block. */ + /* The length of the @chosen_matches sequence. */ input_idx_t num_chosen_matches; /* Huffman codes for this block. */ struct lzx_codes codes; }; -/* Include template for the match-choosing algorithm. */ -#define LZ_COMPRESSOR struct lzx_compressor -#define LZ_ADAPTIVE_STATE struct lzx_lru_queue -struct lzx_compressor; -#include "wimlib/lz_optimal.h" - /* State of the LZX compressor. */ struct lzx_compressor { @@ -327,14 +401,14 @@ struct lzx_compressor { /* Fast algorithm only: Array of hash table links. */ input_idx_t *prev_tab; - /* Slow algorithm only: Suffix array match-finder. */ - struct lz_sarray lz_sarray; + /* Slow algorithm only: Binary tree match-finder. */ + struct lz_bt mf; /* Position in window of next match to return. */ input_idx_t match_window_pos; - /* The match-finder shall ensure the length of matches does not exceed - * this position in the input. */ + /* The end-of-block position. We can't allow any matches to span this + * position. */ input_idx_t match_window_end; /* Matches found by the match-finder are cached in the following array @@ -343,23 +417,78 @@ struct lzx_compressor { * be preferred with different cost models, but seems to be a worthwhile * speedup.
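 *
 * (Cache layout, as used by lzx_get_matches() and lzx_skip_bytes() below: for
 * each window position the cache holds one 'struct raw_match' whose 'len'
 * field is the number of matches found at that position, immediately followed
 * by that many matches.  Replaying a cached position is then roughly:
 *
 *	num_matches = cache_ptr->len;
 *	matches = cache_ptr + 1;
 *	cache_ptr = matches + num_matches;
 *
 * with 'matches' handed back to the match-chooser.)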
*/ struct raw_match *cached_matches; - unsigned cached_matches_pos; + struct raw_match *cache_ptr; bool matches_cached; + struct raw_match *cache_limit; + + /* Match-chooser state. + * When matches have been chosen, optimum_cur_idx is set to the position + * in the window of the next match/literal to return and optimum_end_idx + * is set to the position in the window at the end of the last + * match/literal to return. */ + struct lzx_mc_pos_data *optimum; + unsigned optimum_cur_idx; + unsigned optimum_end_idx; +}; - /* Match chooser. */ - struct lz_match_chooser mc; +/* + * Match chooser position data: + * + * An array of these structures is used during the match-choosing algorithm. + * They correspond to consecutive positions in the window and are used to keep + * track of the cost to reach each position, and the match/literal choices that + * need to be chosen to reach that position. + */ +struct lzx_mc_pos_data { + /* The approximate minimum cost, in bits, to reach this position in the + * window which has been found so far. */ + u32 cost; +#define MC_INFINITE_COST ((u32)~0UL) + + /* The union here is just for clarity, since the fields are used in two + * slightly different ways. Initially, the @prev structure is filled in + * first, and links go from later in the window to earlier in the + * window. Later, @next structure is filled in and links go from + * earlier in the window to later in the window. */ + union { + struct { + /* Position of the start of the match or literal that + * was taken to get to this position in the approximate + * minimum-cost parse. */ + input_idx_t link; + + /* Offset (as in an LZ (length, offset) pair) of the + * match or literal that was taken to get to this + * position in the approximate minimum-cost parse. */ + input_idx_t match_offset; + } prev; + struct { + /* Position at which the match or literal starting at + * this position ends in the minimum-cost parse. */ + input_idx_t link; + + /* Offset (as in an LZ (length, offset) pair) of the + * match or literal starting at this position in the + * approximate minimum-cost parse. */ + input_idx_t match_offset; + } next; + }; + + /* Adaptive state that exists after an approximate minimum-cost path to + * reach this position is taken. */ + struct lzx_lru_queue queue; }; /* Returns the LZX position slot that corresponds to a given match offset, * taking into account the recent offset queue and updating it if the offset is * found in it. */ static unsigned -lzx_get_position_slot(unsigned offset, struct lzx_lru_queue *queue) +lzx_get_position_slot(u32 offset, struct lzx_lru_queue *queue) { unsigned position_slot; /* See if the offset was recently used. */ - for (unsigned i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { if (offset == queue->R[i]) { /* Found it. */ @@ -379,7 +508,7 @@ lzx_get_position_slot(unsigned offset, struct lzx_lru_queue *queue) position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET); /* Bring the new offset to the front of the queue. 
*/ - for (unsigned i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--) + for (int i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--) queue->R[i] = queue->R[i - 1]; queue->R[0] = offset; @@ -919,7 +1048,7 @@ lzx_write_all_blocks(struct lzx_compressor *ctx, struct output_bitstream *ostrea spec->block_size, ctx->max_window_size, ctx->num_main_syms, - &ctx->chosen_matches[spec->chosen_matches_start_pos], + spec->chosen_matches, spec->num_chosen_matches, &spec->codes, prev_codes, @@ -943,7 +1072,7 @@ lzx_tally_literal(u8 lit, struct lzx_freqs *freqs) * alphabets. The return value is a 32-bit number that provides the match in an * intermediate representation documented below. */ static u32 -lzx_tally_match(unsigned match_len, unsigned match_offset, +lzx_tally_match(unsigned match_len, u32 match_offset, struct lzx_freqs *freqs, struct lzx_lru_queue *queue) { unsigned position_slot; @@ -1033,7 +1162,7 @@ lzx_record_literal(u8 lit, void *_ctx) /* Returns the cost, in bits, to output a literal byte using the specified cost * model. */ -static unsigned +static u32 lzx_literal_cost(u8 c, const struct lzx_costs * costs) { return costs->main[c]; @@ -1044,13 +1173,14 @@ lzx_literal_cost(u8 c, const struct lzx_costs * costs) * codes, return the approximate number of bits it will take to represent this * match in the compressed output. Take into account the match offset LRU * queue and optionally update it. */ -static unsigned -lzx_match_cost(unsigned length, unsigned offset, const struct lzx_costs *costs, +static u32 +lzx_match_cost(unsigned length, u32 offset, const struct lzx_costs *costs, struct lzx_lru_queue *queue) { unsigned position_slot; unsigned len_header, main_symbol; - unsigned cost = 0; + unsigned num_extra_bits; + u32 cost = 0; position_slot = lzx_get_position_slot(offset, queue); @@ -1061,7 +1191,7 @@ lzx_match_cost(unsigned length, unsigned offset, const struct lzx_costs *costs, cost += costs->main[main_symbol]; /* Account for extra position information. */ - unsigned num_extra_bits = lzx_get_num_extra_bits(position_slot); + num_extra_bits = lzx_get_num_extra_bits(position_slot); if (num_extra_bits >= 3) { cost += num_extra_bits - 3; cost += costs->aligned[(offset + LZX_OFFSET_OFFSET) & 7]; @@ -1077,23 +1207,6 @@ lzx_match_cost(unsigned length, unsigned offset, const struct lzx_costs *costs, } -/* Fast heuristic cost evaluation to use in the inner loop of the match-finder. - * Unlike lzx_match_cost() which does a true cost evaluation, this simply - * prioritize matches based on their offset. */ -static input_idx_t -lzx_match_cost_fast(input_idx_t length, input_idx_t offset, const void *_queue) -{ - const struct lzx_lru_queue *queue = _queue; - - /* It seems well worth it to take the time to give priority to recently - * used offsets. */ - for (input_idx_t i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) - if (offset == queue->R[i]) - return i; - - return offset; -} - /* Set the cost model @ctx->costs from the Huffman codeword lengths specified in * @lens. * @@ -1131,74 +1244,61 @@ lzx_set_costs(struct lzx_compressor * ctx, const struct lzx_lens * lens) } } -/* Tell the match-finder to skip the specified number of bytes (@n) in the - * input. 
*/ -static void -lzx_lz_skip_bytes(struct lzx_compressor *ctx, input_idx_t n) -{ - LZX_ASSERT(n <= ctx->match_window_end - ctx->match_window_pos); - if (ctx->matches_cached) { - ctx->match_window_pos += n; - while (n--) { - ctx->cached_matches_pos += - ctx->cached_matches[ctx->cached_matches_pos].len + 1; - } - } else { - while (n--) { - ctx->cached_matches[ctx->cached_matches_pos++].len = 0; - lz_sarray_skip_position(&ctx->lz_sarray); - ctx->match_window_pos++; - } - LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos); - } -} - /* Retrieve a list of matches available at the next position in the input. * * A pointer to the matches array is written into @matches_ret, and the return * value is the number of matches found. */ -static u32 -lzx_lz_get_matches_caching(struct lzx_compressor *ctx, - const struct lzx_lru_queue *queue, - struct raw_match **matches_ret) +static unsigned +lzx_get_matches(struct lzx_compressor *ctx, + const struct raw_match **matches_ret) { - u32 num_matches; + struct raw_match *cache_ptr; struct raw_match *matches; + unsigned num_matches; - LZX_ASSERT(ctx->match_window_pos <= ctx->match_window_end); - - matches = &ctx->cached_matches[ctx->cached_matches_pos + 1]; + LZX_ASSERT(ctx->match_window_pos < ctx->match_window_end); + cache_ptr = ctx->cache_ptr; + matches = cache_ptr + 1; if (ctx->matches_cached) { - num_matches = matches[-1].len; + num_matches = cache_ptr->len; } else { - LZX_ASSERT(lz_sarray_get_pos(&ctx->lz_sarray) == ctx->match_window_pos); - num_matches = lz_sarray_get_matches(&ctx->lz_sarray, - matches, - lzx_match_cost_fast, - queue); - matches[-1].len = num_matches; + num_matches = lz_bt_get_matches(&ctx->mf, matches); + cache_ptr->len = num_matches; } - ctx->cached_matches_pos += num_matches + 1; - *matches_ret = matches; - /* Cap the length of returned matches to the number of bytes remaining, - * if it is not the whole window. */ - if (ctx->match_window_end < ctx->window_size) { - unsigned maxlen = ctx->match_window_end - ctx->match_window_pos; - for (u32 i = 0; i < num_matches; i++) - if (matches[i].len > maxlen) - matches[i].len = maxlen; + /* Don't allow matches to span the end of an LZX block. */ + if (ctx->match_window_end < ctx->window_size && num_matches != 0) { + unsigned limit = ctx->match_window_end - ctx->match_window_pos; + + if (limit >= LZX_MIN_MATCH_LEN) { + + unsigned i = num_matches - 1; + do { + if (matches[i].len >= limit) { + matches[i].len = limit; + + /* Truncation might produce multiple + * matches with length 'limit'. Keep at + * most 1. 
*/ + num_matches = i + 1; + } + } while (i--); + } else { + num_matches = 0; + } + cache_ptr->len = num_matches; } + #if 0 fprintf(stderr, "Pos %u/%u: %u matches\n", - ctx->match_window_pos, ctx->match_window_end, num_matches); + ctx->match_window_pos, ctx->window_size, num_matches); for (unsigned i = 0; i < num_matches; i++) fprintf(stderr, "\tLen %u Offset %u\n", matches[i].len, matches[i].offset); #endif #ifdef ENABLE_LZX_DEBUG - for (u32 i = 0; i < num_matches; i++) { + for (unsigned i = 0; i < num_matches; i++) { LZX_ASSERT(matches[i].len >= LZX_MIN_MATCH_LEN); LZX_ASSERT(matches[i].len <= LZX_MAX_MATCH_LEN); LZX_ASSERT(matches[i].len <= ctx->match_window_end - ctx->match_window_pos); @@ -1207,39 +1307,424 @@ lzx_lz_get_matches_caching(struct lzx_compressor *ctx, LZX_ASSERT(!memcmp(&ctx->window[ctx->match_window_pos], &ctx->window[ctx->match_window_pos - matches[i].offset], matches[i].len)); + if (i) { + LZX_ASSERT(matches[i].len > matches[i - 1].len); + LZX_ASSERT(matches[i].offset > matches[i - 1].offset); + } } #endif - ctx->match_window_pos++; + ctx->cache_ptr = matches + num_matches; + *matches_ret = matches; return num_matches; } -static u32 -lzx_get_prev_literal_cost(struct lzx_compressor *ctx, - struct lzx_lru_queue *queue) +static void +lzx_skip_bytes(struct lzx_compressor *ctx, unsigned n) { - return lzx_literal_cost(ctx->window[ctx->match_window_pos - 1], - &ctx->costs); + struct raw_match *cache_ptr; + + LZX_ASSERT(n <= ctx->match_window_end - ctx->match_window_pos); + + cache_ptr = ctx->cache_ptr; + ctx->match_window_pos += n; + if (ctx->matches_cached) { + while (n--) + cache_ptr += 1 + cache_ptr->len; + } else { + lz_bt_skip_positions(&ctx->mf, n); + while (n--) { + cache_ptr->len = 0; + cache_ptr += 1; + } + } + ctx->cache_ptr = cache_ptr; } -static u32 -lzx_get_match_cost(struct lzx_compressor *ctx, - struct lzx_lru_queue *queue, - input_idx_t length, input_idx_t offset) +/* + * Reverse the linked list of near-optimal matches so that they can be returned + * in forwards order. + * + * Returns the first match in the list. + */ +static struct raw_match +lzx_match_chooser_reverse_list(struct lzx_compressor *ctx, unsigned cur_pos) { - return lzx_match_cost(length, offset, &ctx->costs, queue); + unsigned prev_link, saved_prev_link; + unsigned prev_match_offset, saved_prev_match_offset; + + ctx->optimum_end_idx = cur_pos; + + saved_prev_link = ctx->optimum[cur_pos].prev.link; + saved_prev_match_offset = ctx->optimum[cur_pos].prev.match_offset; + + do { + prev_link = saved_prev_link; + prev_match_offset = saved_prev_match_offset; + + saved_prev_link = ctx->optimum[prev_link].prev.link; + saved_prev_match_offset = ctx->optimum[prev_link].prev.match_offset; + + ctx->optimum[prev_link].next.link = cur_pos; + ctx->optimum[prev_link].next.match_offset = prev_match_offset; + + cur_pos = prev_link; + } while (cur_pos != 0); + + ctx->optimum_cur_idx = ctx->optimum[0].next.link; + + return (struct raw_match) + { .len = ctx->optimum_cur_idx, + .offset = ctx->optimum[0].next.match_offset, + }; } +/* + * lzx_get_near_optimal_match() - + * + * Choose an approximately optimal match or literal to use at the next position + * in the string, or "window", being LZ-encoded. + * + * This is based on algorithms used in 7-Zip, including the DEFLATE encoder + * and the LZMA encoder, written by Igor Pavlov. 
+ * + * Unlike a greedy parser that always takes the longest match, or even a "lazy" + * parser with one match/literal look-ahead like zlib, the algorithm used here + * may look ahead many matches/literals to determine the approximately optimal + * match/literal to code next. The motivation is that the compression ratio is + * improved if the compressor can do things like use a shorter-than-possible + * match in order to allow a longer match later, and also take into account the + * estimated real cost of coding each match/literal based on the underlying + * entropy encoding.
+ * + * Still, this is not a true optimal parser for several reasons:
+ * + * - Real compression formats use entropy encoding of the literal/match + * sequence, so the real cost of coding each match or literal is unknown until + * the parse is fully determined. It can be approximated based on iterative + * parses, but the end result is not guaranteed to be globally optimal.
+ * + * - Very long matches are chosen immediately. This is because locations with + * long matches are likely to have many possible alternatives that would cause + * slow optimal parsing, but also such locations are already highly + * compressible so it is not too harmful to just grab the longest match.
+ * + * - Not all possible matches at each location are considered because the + * underlying match-finder limits the number and type of matches produced at + * each position. For example, for a given match length it's usually not + * worth it to consider matches other than the lowest-offset match, + * except in the case of a repeat offset.
+ * + * - Although we take into account the adaptive state (in LZX, the recent offset + * queue), coding decisions made with respect to the adaptive state will be + * locally optimal but will not necessarily be globally optimal. This is + * because the algorithm only keeps the least-costly path to get to a given + * location and does not take into account that a slightly more costly path + * could result in a different adaptive state that ultimately results in a + * lower global cost.
+ * + * - The array space used by this function is bounded, so in degenerate cases it + * is forced to start returning matches/literals before the algorithm has + * really finished.
+ * + * Each call to this function does one of two things:
+ * + * 1. Build a sequence of near-optimal matches/literals, up to some point, that + * will be returned by subsequent calls to this function, then return the + * first one.
+ * + * OR
+ * + * 2. Return the next match/literal previously computed by a call to this + * function.
+ * + * The return value is a (length, offset) pair specifying the match or literal + * chosen. For literals, the length is 0 or 1 and the offset is meaningless.
+ */ static struct raw_match -lzx_lz_get_near_optimal_match(struct lzx_compressor *ctx) +lzx_get_near_optimal_match(struct lzx_compressor *ctx) { - return lz_get_near_optimal_match(&ctx->mc, - lzx_lz_get_matches_caching, - lzx_lz_skip_bytes, - lzx_get_prev_literal_cost, - lzx_get_match_cost, - ctx, - &ctx->queue); + unsigned num_matches; + const struct raw_match *matches; + const struct raw_match *matchptr; + struct raw_match match; + unsigned longest_len; + unsigned longest_rep_len; + u32 longest_rep_offset; + unsigned cur_pos; + unsigned end_pos; + + if (ctx->optimum_cur_idx != ctx->optimum_end_idx) { + /* Case 2: Return the next match/literal already found.
*/ + match.len = ctx->optimum[ctx->optimum_cur_idx].next.link - + ctx->optimum_cur_idx; + match.offset = ctx->optimum[ctx->optimum_cur_idx].next.match_offset; + + ctx->optimum_cur_idx = ctx->optimum[ctx->optimum_cur_idx].next.link; + return match; + } + + /* Case 1: Compute a new list of matches/literals to return. */ + + ctx->optimum_cur_idx = 0; + ctx->optimum_end_idx = 0; + + /* Search for matches at recent offsets. Only keep the one with the + * longest match length. */ + longest_rep_len = LZX_MIN_MATCH_LEN - 1; + if (ctx->match_window_pos >= 1) { + unsigned limit = min(LZX_MAX_MATCH_LEN, + ctx->match_window_end - ctx->match_window_pos); + for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + u32 offset = ctx->queue.R[i]; + const u8 *strptr = &ctx->window[ctx->match_window_pos]; + const u8 *matchptr = strptr - offset; + unsigned len = 0; + while (len < limit && strptr[len] == matchptr[len]) + len++; + if (len > longest_rep_len) { + longest_rep_len = len; + longest_rep_offset = offset; + } + } + } + + /* If there's a long match with a recent offset, take it. */ + if (longest_rep_len >= ctx->params.alg_params.slow.nice_match_length) { + lzx_skip_bytes(ctx, longest_rep_len); + return (struct raw_match) { + .len = longest_rep_len, + .offset = longest_rep_offset, + }; + } + + /* Search other matches. */ + num_matches = lzx_get_matches(ctx, &matches); + + /* If there's a long match, take it. */ + if (num_matches) { + longest_len = matches[num_matches - 1].len; + if (longest_len >= ctx->params.alg_params.slow.nice_match_length) { + lzx_skip_bytes(ctx, longest_len - 1); + return matches[num_matches - 1]; + } + } else { + longest_len = 1; + } + + /* Calculate the cost to reach the next position by coding a literal. + */ + ctx->optimum[1].queue = ctx->queue; + ctx->optimum[1].cost = lzx_literal_cost(ctx->window[ctx->match_window_pos - 1], + &ctx->costs); + ctx->optimum[1].prev.link = 0; + + /* Calculate the cost to reach any position up to and including that + * reached by the longest match. */ + matchptr = matches; + for (unsigned len = 2; len <= longest_len; len++) { + u32 offset = matchptr->offset; + + ctx->optimum[len].queue = ctx->queue; + ctx->optimum[len].prev.link = 0; + ctx->optimum[len].prev.match_offset = offset; + ctx->optimum[len].cost = lzx_match_cost(len, offset, &ctx->costs, + &ctx->optimum[len].queue); + if (len == matchptr->len) + matchptr++; + } + end_pos = longest_len; + + if (longest_rep_len >= LZX_MIN_MATCH_LEN) { + struct lzx_lru_queue queue; + u32 cost; + + while (end_pos < longest_rep_len) + ctx->optimum[++end_pos].cost = MC_INFINITE_COST; + + queue = ctx->queue; + cost = lzx_match_cost(longest_rep_len, longest_rep_offset, + &ctx->costs, &queue); + if (cost <= ctx->optimum[longest_rep_len].cost) { + ctx->optimum[longest_rep_len].queue = queue; + ctx->optimum[longest_rep_len].prev.link = 0; + ctx->optimum[longest_rep_len].prev.match_offset = longest_rep_offset; + ctx->optimum[longest_rep_len].cost = cost; + } + } + + /* Step forward, calculating the estimated minimum cost to reach each + * position. The algorithm may find multiple paths to reach each + * position; only the lowest-cost path is saved. 
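+ *
+ * Keeping only the lowest-cost path is the usual shortest-path relaxation
+ * step; for a candidate match or literal of length 'len' starting at
+ * 'cur_pos', with 'edge_cost' standing in for the lzx_match_cost() or
+ * lzx_literal_cost() value, the loop below does, in essence:
+ *
+ *	if (optimum[cur_pos].cost + edge_cost < optimum[cur_pos + len].cost) {
+ *		optimum[cur_pos + len].cost = optimum[cur_pos].cost + edge_cost;
+ *		optimum[cur_pos + len].prev.link = cur_pos;
+ *	}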
+ * + * The progress of the parse is tracked in the @ctx->optimum array, which + * for each position contains the minimum cost to reach that position, + * the index of the start of the match/literal taken to reach that + * position through the minimum-cost path, the offset of the match taken + * (not relevant for literals), and the adaptive state that will exist + * at that position after the minimum-cost path is taken. The @cur_pos + * variable stores the position at which the algorithm is currently + * considering coding choices, and the @end_pos variable stores the + * greatest position at which the costs of coding choices have been + * saved. (Actually, the algorithm guarantees that all positions up to + * and including @end_pos are reachable by at least one path.) + * + * The loop terminates when any one of the following conditions occurs: + * + * 1. A match with length greater than or equal to @nice_match_length is + * found. When this occurs, the algorithm chooses this match + * unconditionally, and consequently the near-optimal match/literal + * sequence up to and including that match is fully determined and it + * can begin returning the match/literal list. + * + * 2. @cur_pos reaches a position not overlapped by a preceding match. + * In such cases, the near-optimal match/literal sequence up to + * @cur_pos is fully determined and it can begin returning the + * match/literal list. + * + * 3. Failing either of the above in a degenerate case, the loop + * terminates when space in the @ctx->optimum array is exhausted. + * This terminates the algorithm and forces it to start returning + * matches/literals even though they may not be globally optimal. + * + * Upon loop termination, a nonempty list of matches/literals will have + * been produced and stored in the @optimum array. These + * matches/literals are linked in reverse order, so the last thing this + * function does is reverse this list and return the first + * match/literal, leaving the rest to be returned immediately by + * subsequent calls to this function. + */ + cur_pos = 0; + for (;;) { + u32 cost; + + /* Advance to next position. */ + cur_pos++; + + /* Check termination conditions (2) and (3) noted above. */ + if (cur_pos == end_pos || cur_pos == LZX_OPTIM_ARRAY_SIZE) + return lzx_match_chooser_reverse_list(ctx, cur_pos); + + /* Search for matches at recent offsets. */ + longest_rep_len = LZX_MIN_MATCH_LEN - 1; + unsigned limit = min(LZX_MAX_MATCH_LEN, + ctx->match_window_end - ctx->match_window_pos); + for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + u32 offset = ctx->optimum[cur_pos].queue.R[i]; + const u8 *strptr = &ctx->window[ctx->match_window_pos]; + const u8 *matchptr = strptr - offset; + unsigned len = 0; + while (len < limit && strptr[len] == matchptr[len]) + len++; + if (len > longest_rep_len) { + longest_rep_len = len; + longest_rep_offset = offset; + } + } + + /* If we found a long match at a recent offset, choose it + * immediately. */ + if (longest_rep_len >= ctx->params.alg_params.slow.nice_match_length) { + /* Build the list of matches to return and get + * the first one. */ + match = lzx_match_chooser_reverse_list(ctx, cur_pos); + + /* Append the long match to the end of the list. */ + ctx->optimum[cur_pos].next.match_offset = longest_rep_offset; + ctx->optimum[cur_pos].next.link = cur_pos + longest_rep_len; + ctx->optimum_end_idx = cur_pos + longest_rep_len; + + /* Skip over the remaining bytes of the long match. */ + lzx_skip_bytes(ctx, longest_rep_len); + + /* Return first match in the list. 
*/ + return match; + } + + /* Search other matches. */ + num_matches = lzx_get_matches(ctx, &matches); + + /* If there's a long match, take it. */ + if (num_matches) { + longest_len = matches[num_matches - 1].len; + if (longest_len >= ctx->params.alg_params.slow.nice_match_length) { + /* Build the list of matches to return and get + * the first one. */ + match = lzx_match_chooser_reverse_list(ctx, cur_pos); + + /* Append the long match to the end of the list. */ + ctx->optimum[cur_pos].next.match_offset = + matches[num_matches - 1].offset; + ctx->optimum[cur_pos].next.link = cur_pos + longest_len; + ctx->optimum_end_idx = cur_pos + longest_len; + + /* Skip over the remaining bytes of the long match. */ + lzx_skip_bytes(ctx, longest_len - 1); + + /* Return first match in the list. */ + return match; + } + } else { + longest_len = 1; + } + + while (end_pos < cur_pos + longest_len) + ctx->optimum[++end_pos].cost = MC_INFINITE_COST; + + /* Consider coding a literal. */ + cost = ctx->optimum[cur_pos].cost + + lzx_literal_cost(ctx->window[ctx->match_window_pos - 1], + &ctx->costs); + if (cost < ctx->optimum[cur_pos + 1].cost) { + ctx->optimum[cur_pos + 1].queue = ctx->optimum[cur_pos].queue; + ctx->optimum[cur_pos + 1].cost = cost; + ctx->optimum[cur_pos + 1].prev.link = cur_pos; + } + + /* Consider coding a match. */ + matchptr = matches; + for (unsigned len = 2; len <= longest_len; len++) { + u32 offset; + struct lzx_lru_queue queue; + + offset = matchptr->offset; + queue = ctx->optimum[cur_pos].queue; + + cost = ctx->optimum[cur_pos].cost + + lzx_match_cost(len, offset, &ctx->costs, &queue); + if (cost < ctx->optimum[cur_pos + len].cost) { + ctx->optimum[cur_pos + len].queue = queue; + ctx->optimum[cur_pos + len].prev.link = cur_pos; + ctx->optimum[cur_pos + len].prev.match_offset = offset; + ctx->optimum[cur_pos + len].cost = cost; + } + if (len == matchptr->len) + matchptr++; + } + + if (longest_rep_len >= LZX_MIN_MATCH_LEN) { + struct lzx_lru_queue queue; + + while (end_pos < cur_pos + longest_rep_len) + ctx->optimum[++end_pos].cost = MC_INFINITE_COST; + + queue = ctx->optimum[cur_pos].queue; + + cost = ctx->optimum[cur_pos].cost + + lzx_match_cost(longest_rep_len, longest_rep_offset, + &ctx->costs, &queue); + if (cost <= ctx->optimum[cur_pos + longest_rep_len].cost) { + ctx->optimum[cur_pos + longest_rep_len].queue = + queue; + ctx->optimum[cur_pos + longest_rep_len].prev.link = + cur_pos; + ctx->optimum[cur_pos + longest_rep_len].prev.match_offset = + longest_rep_offset; + ctx->optimum[cur_pos + longest_rep_len].cost = + cost; + } + } + } } /* Set default symbol costs for the LZX Huffman codes. */ @@ -1299,132 +1784,96 @@ lzx_optimize_block(struct lzx_compressor *ctx, struct lzx_block_spec *spec, unsigned num_passes) { const struct lzx_lru_queue orig_queue = ctx->queue; - struct lzx_freqs freqs; - - unsigned orig_window_pos = spec->window_pos; - unsigned orig_cached_pos = ctx->cached_matches_pos; unsigned num_passes_remaining = num_passes; + struct lzx_freqs freqs; - LZX_ASSERT(ctx->match_window_pos == spec->window_pos); + LZX_ASSERT(num_passes >= 1); + LZX_ASSERT(lz_bt_get_position(&ctx->mf) == spec->window_pos); ctx->match_window_end = spec->window_pos + spec->block_size; - spec->chosen_matches_start_pos = spec->window_pos; - - LZX_ASSERT(num_passes >= 1); + spec->chosen_matches = &ctx->chosen_matches[spec->window_pos]; + ctx->matches_cached = false; /* The first optimal parsing pass is done using the cost model already * set in ctx->costs. 
Each later pass is done using a cost model * computed from the previous pass. */ do { - --num_passes_remaining; + const u8 *window_ptr; + const u8 *window_end; + struct lzx_match *next_chosen_match; - ctx->match_window_pos = orig_window_pos; - ctx->cached_matches_pos = orig_cached_pos; - ctx->queue = orig_queue; - spec->num_chosen_matches = 0; + --num_passes_remaining; + ctx->match_window_pos = spec->window_pos; + ctx->cache_ptr = ctx->cached_matches; memset(&freqs, 0, sizeof(freqs)); - - const u8 *window_ptr = &ctx->window[spec->window_pos]; - const u8 *window_end = &window_ptr[spec->block_size]; - struct lzx_match *next_chosen_match = - &ctx->chosen_matches[spec->chosen_matches_start_pos]; + window_ptr = &ctx->window[spec->window_pos]; + window_end = window_ptr + spec->block_size; + next_chosen_match = spec->chosen_matches; while (window_ptr != window_end) { struct raw_match raw_match; struct lzx_match lzx_match; - raw_match = lzx_lz_get_near_optimal_match(ctx); + raw_match = lzx_get_near_optimal_match(ctx); + + LZX_ASSERT(!(raw_match.len == LZX_MIN_MATCH_LEN && + raw_match.offset == ctx->max_window_size - + LZX_MIN_MATCH_LEN)); if (raw_match.len >= LZX_MIN_MATCH_LEN) { - if (unlikely(raw_match.len == LZX_MIN_MATCH_LEN && - raw_match.offset == ctx->max_window_size - - LZX_MIN_MATCH_LEN)) - { - /* Degenerate case where the parser - * generated the minimum match length - * with the maximum offset. There - * aren't actually enough position slots - * to represent this offset, as noted in - * the comments in - * lzx_get_num_main_syms(), so we cannot - * allow it. Use literals instead. - * - * Note that this case only occurs if - * the match-finder can generate matches - * to the very start of the window. The - * suffix array match-finder can, - * although typical hash chain and - * binary tree match-finders use 0 as a - * null value and therefore cannot - * generate such matches. 
*/ - BUILD_BUG_ON(LZX_MIN_MATCH_LEN != 2); - lzx_match.data = lzx_tally_literal(*window_ptr++, - &freqs); - *next_chosen_match++ = lzx_match; - lzx_match.data = lzx_tally_literal(*window_ptr++, - &freqs); - } else { - lzx_match.data = lzx_tally_match(raw_match.len, - raw_match.offset, - &freqs, - &ctx->queue); - window_ptr += raw_match.len; - } + lzx_match.data = lzx_tally_match(raw_match.len, + raw_match.offset, + &freqs, + &ctx->queue); + window_ptr += raw_match.len; } else { - lzx_match.data = lzx_tally_literal(*window_ptr++, &freqs); + lzx_match.data = lzx_tally_literal(*window_ptr, + &freqs); + window_ptr += 1; } *next_chosen_match++ = lzx_match; } - spec->num_chosen_matches = next_chosen_match - - &ctx->chosen_matches[spec->chosen_matches_start_pos]; - - lzx_make_huffman_codes(&freqs, &spec->codes, - ctx->num_main_syms); - if (num_passes_remaining) + spec->num_chosen_matches = next_chosen_match - spec->chosen_matches; + lzx_make_huffman_codes(&freqs, &spec->codes, ctx->num_main_syms); + if (num_passes_remaining) { lzx_set_costs(ctx, &spec->codes.lens); - ctx->matches_cached = true; + ctx->queue = orig_queue; + ctx->matches_cached = true; + } } while (num_passes_remaining); spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes); - ctx->matches_cached = false; -} - -static void -lzx_optimize_blocks(struct lzx_compressor *ctx) -{ - lzx_lru_queue_init(&ctx->queue); - lz_match_chooser_begin(&ctx->mc); - - const unsigned num_passes = ctx->params.alg_params.slow.num_optim_passes; - - for (unsigned i = 0; i < ctx->num_blocks; i++) - lzx_optimize_block(ctx, &ctx->block_specs[i], num_passes); } /* Prepare the input window into one or more LZX blocks ready to be output. */ static void lzx_prepare_blocks(struct lzx_compressor * ctx) { - /* Initialize the match-finder. */ - lz_sarray_load_window(&ctx->lz_sarray, ctx->window, ctx->window_size); - ctx->cached_matches_pos = 0; - ctx->matches_cached = false; - ctx->match_window_pos = 0; - /* Set up a default cost model. */ lzx_set_default_costs(&ctx->costs, ctx->num_main_syms); - /* TODO: The compression ratio could be slightly improved by performing + /* Set up the block specifications. + * TODO: The compression ratio could be slightly improved by performing * data-dependent block splitting instead of using fixed-size blocks. * Doing so well is a computationally hard problem, however. */ ctx->num_blocks = DIV_ROUND_UP(ctx->window_size, LZX_DIV_BLOCK_SIZE); for (unsigned i = 0; i < ctx->num_blocks; i++) { unsigned pos = LZX_DIV_BLOCK_SIZE * i; ctx->block_specs[i].window_pos = pos; - ctx->block_specs[i].block_size = min(ctx->window_size - pos, LZX_DIV_BLOCK_SIZE); + ctx->block_specs[i].block_size = min(ctx->window_size - pos, + LZX_DIV_BLOCK_SIZE); } + /* Load the window into the match-finder. */ + lz_bt_load_window(&ctx->mf, ctx->window, ctx->window_size); + /* Determine sequence of matches/literals to output for each block. 
*/ - lzx_optimize_blocks(ctx); + lzx_lru_queue_init(&ctx->queue); + ctx->optimum_cur_idx = 0; + ctx->optimum_end_idx = 0; + for (unsigned i = 0; i < ctx->num_blocks; i++) { + lzx_optimize_block(ctx, &ctx->block_specs[i], + ctx->params.alg_params.slow.num_optim_passes); + } } /* @@ -1485,7 +1934,7 @@ lzx_prepare_block_fast(struct lzx_compressor * ctx) spec->window_pos = 0; spec->block_size = ctx->window_size; spec->num_chosen_matches = (record_ctx.matches - ctx->chosen_matches); - spec->chosen_matches_start_pos = 0; + spec->chosen_matches = ctx->chosen_matches; lzx_make_huffman_codes(&record_ctx.freqs, &spec->codes, ctx->num_main_syms); ctx->num_blocks = 1; @@ -1607,8 +2056,8 @@ lzx_free_compressor(void *_ctx) if (ctx) { FREE(ctx->chosen_matches); FREE(ctx->cached_matches); - lz_match_chooser_destroy(&ctx->mc); - lz_sarray_destroy(&ctx->lz_sarray); + FREE(ctx->optimum); + lz_bt_destroy(&ctx->mf); FREE(ctx->block_specs); FREE(ctx->prev_tab); FREE(ctx->window); @@ -1639,7 +2088,6 @@ static const struct wimlib_lzx_compressor_params lzx_slow_default = { .nice_match_length = 32, .num_optim_passes = 2, .max_search_depth = 50, - .max_matches_per_pos = 3, .main_nostat_cost = 15, .len_nostat_cost = 15, .aligned_nostat_cost = 7, @@ -1708,34 +2156,30 @@ lzx_create_compressor(size_t window_size, if (!params->alg_params.slow.use_len2_matches) min_match_len = max(min_match_len, 3); - if (!lz_sarray_init(&ctx->lz_sarray, - window_size, - min_match_len, - LZX_MAX_MATCH_LEN, - params->alg_params.slow.max_search_depth, - params->alg_params.slow.max_matches_per_pos)) + if (!lz_bt_init(&ctx->mf, + window_size, + min_match_len, + LZX_MAX_MATCH_LEN, + params->alg_params.slow.nice_match_length, + params->alg_params.slow.max_search_depth)) goto oom; } if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) { - if (!lz_match_chooser_init(&ctx->mc, - LZX_OPTIM_ARRAY_SIZE, - params->alg_params.slow.nice_match_length, - LZX_MAX_MATCH_LEN)) + ctx->optimum = MALLOC((LZX_OPTIM_ARRAY_SIZE + + min(params->alg_params.slow.nice_match_length, + LZX_MAX_MATCH_LEN)) * + sizeof(ctx->optimum[0])); + if (!ctx->optimum) goto oom; } if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) { - u32 cache_per_pos; - - cache_per_pos = params->alg_params.slow.max_matches_per_pos; - if (cache_per_pos > LZX_MAX_CACHE_PER_POS) - cache_per_pos = LZX_MAX_CACHE_PER_POS; - - ctx->cached_matches = MALLOC(window_size * (cache_per_pos + 1) * - sizeof(ctx->cached_matches[0])); + ctx->cached_matches = MALLOC(LZX_CACHE_SIZE); if (ctx->cached_matches == NULL) goto oom; + ctx->cache_limit = ctx->cached_matches + + LZX_CACHE_LEN - (LZX_MAX_MATCHES_PER_POS + 1); } ctx->chosen_matches = MALLOC(window_size * sizeof(ctx->chosen_matches[0])); @@ -1772,18 +2216,12 @@ lzx_get_needed_memory(size_t max_block_size, if (params->algorithm == WIMLIB_LZX_ALGORITHM_SLOW) { size += max_block_size * sizeof(((struct lzx_compressor*)0)->chosen_matches[0]); - size += lz_sarray_get_needed_memory(max_block_size); - size += lz_match_chooser_get_needed_memory(LZX_OPTIM_ARRAY_SIZE, - params->alg_params.slow.nice_match_length, - LZX_MAX_MATCH_LEN); - u32 cache_per_pos; - - cache_per_pos = params->alg_params.slow.max_matches_per_pos; - if (cache_per_pos > LZX_MAX_CACHE_PER_POS) - cache_per_pos = LZX_MAX_CACHE_PER_POS; - - size += max_block_size * (cache_per_pos + 1) * - sizeof(((struct lzx_compressor*)0)->cached_matches[0]); + size += lz_bt_get_needed_memory(max_block_size); + size += (LZX_OPTIM_ARRAY_SIZE + + min(params->alg_params.slow.nice_match_length, + LZX_MAX_MATCH_LEN)) * + 
sizeof(((struct lzx_compressor *)0)->optimum[0]); + size += LZX_CACHE_SIZE; } else { size += max_block_size * sizeof(((struct lzx_compressor*)0)->prev_tab[0]); } diff --git a/src/wim.c b/src/wim.c index 2942863c..2dd07872 100644 --- a/src/wim.c +++ b/src/wim.c @@ -66,9 +66,7 @@ static u32 wim_default_pack_chunk_size(int ctype) { switch (ctype) { case WIMLIB_COMPRESSION_TYPE_LZMS: - /* Note: WIMGAPI uses 1 << 26, but lower sizes are compatible. - * */ - return 1U << 25; /* 33554432 */ + return 1U << 26; /* 67108864 */ default: return 1U << 15; /* 32768 */ } -- 2.43.0