4 * Interface for Lempel-Ziv match-finders.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * Example usage of the match-finder API:
35 * ----------------------------------------------------------------------------
37 * Fill in a 'struct lz_mf_params'.
38 * (Optional) Call lz_mf_params_valid() to validate the parameters.
39 * Call lz_mf_alloc() to allocate the match-finder.
40 * For each block of data to be compressed:
41 * Call lz_mf_load_window() to load the block into the match finder.
42 * While the block is not yet fully compressed:
43 * Call lz_mf_get_matches() to get matches at the current position.
44 * If matches were found:
45 * Output the longest match.
46 * Call lz_mf_skip_positions() to skip the remaining length of the match.
52 * Call lz_mf_free() to free the match-finder.
54 * ----------------------------------------------------------------------------
56 * That example did "greedy parsing" --- that is, always choosing the longest
57 * match at each position. However, this interface can be (and is intended to
58 * be) used for "optimal parsing" as well. It can also be used for in-between
59 * strategies such as "lazy parsing" and "flexible parsing". For the best
60 * performance try different match-finding algorithms and parameters to see what
61 * works best for your parsing strategy, and your typical data and block sizes.
64 #ifndef _WIMLIB_LZ_MF_H
65 #define _WIMLIB_LZ_MF_H
67 #include "wimlib/types.h"
69 /* When ENABLE_LZ_DEBUG is defined, we check all matches for correctness and
70 * perform other validations. Use for debugging only, as it slows things down
73 //#define ENABLE_LZ_DEBUG
74 #ifdef ENABLE_LZ_DEBUG
77 # define LZ_ASSERT assert
79 # define LZ_ASSERT(...)
84 /* Representation of a Lempel-Ziv match. */
86 /* The number of bytes matched. */
89 /* The offset back from the current position that was matched. */
94 * Specifies a match-finding algorithm.
99 * Use the default algorithm for the specified maximum window size.
104 * "Null" algorithm that never reports any matches.
106 * This algorithm exists for comparison, benchmarking, and testing
107 * purposes only. It is not intended to be used in real compressors.
112 * Brute Force match-finding algorithm.
114 * This algorithm exists for comparison, benchmarking, and testing
115 * purposes only. It is not intended to be used in real compressors.
117 LZ_MF_BRUTE_FORCE = 2,
120 * Hash Chain match-finding algorithm.
122 * This works well on small windows.
124 * The memory usage is 4 bytes per position, plus 131072 bytes for a
127 * lz_mf_skip_positions() with this algorithm is very fast, so it's good
128 * if you're doing "greedy" rather than "optimal" parsing. However, if
129 * using large windows you might be better off with binary trees or
130 * suffix arrays, even if doing greedy parsing.
132 LZ_MF_HASH_CHAINS = 3,
135 * Binary Tree match-finding algorithm.
137 * This works well on small to medium-sized windows.
139 * The memory usage is 8 bytes per position, plus 262144 bytes for a
142 * lz_mf_skip_positions() with this algorithm takes a significant amount
143 * of time, almost as much as a call to lz_mf_get_matches(). This makes
144 * this algorithm better suited for optimal parsing than for greedy
145 * parsing. However, if the window size becomes sufficiently large,
146 * this algorithm can outperform hash chains, even when using greedy
149 LZ_MF_BINARY_TREES = 4,
152 * Longest Common Prefix Interval Tree match-finding algorithm.
154 * This is a suffix array-based algorithm. It works well on medium to
155 * large windows. However, due to an implementation detail, it is
156 * currently limited to a maximum window size of 33554432 bytes.
158 * The memory usage is 12 bytes per position.
160 * Unlike the hash chain and binary tree algorithms, the LCP interval
161 * tree algorithm performs most of its work in lz_mf_load_window(). The
162 * calls to lz_mf_get_matches() and lz_mf_skip_positions() take
163 * relatively little time, and lz_mf_skip_positions() is not much faster
164 * than lz_mf_get_matches(). Therefore, if you're using this algorithm
165 * you might as well be doing "optimal" rather than "greedy" parsing.
167 LZ_MF_LCP_INTERVAL_TREE = 5,
170 * Linked Suffix Array match-finding algorithm.
172 * This can be used on very large windows.
174 * The memory usage is 14 bytes per position.
176 * Currently, this method usually performs slightly worse than the LCP
177 * interval tree algorithm. However, it can be used on windows
178 * exceeding the 33554432 byte limit of the LCP interval tree algorithm.
180 LZ_MF_LINKED_SUFFIX_ARRAY = 6,
183 /* Parameters for Lempel-Ziv match-finding. */
184 struct lz_mf_params {
187 * The match-finding algorithm to use. This must be one of the 'enum
188 * lz_mf_algo' constants defined above.
190 * If this is LZ_MF_DEFAULT, the default algorithm for the specified
191 * @max_window_size is used.
196 * The maximum window size, in bytes, that shall be supported by the
197 * match-finder. This is the maximum size that can be passed to
198 * subsequent calls to lz_mf_load_window().
200 * Note: this interface is intended to be used for block compression, so
201 * none of the match-finding algorithms support sliding windows. It's
202 * expected that the window for LZ match-finding simply be the block of
203 * data being compressed.
205 * Match-finders generally require an amount of memory proportional to
206 * this parameter. Use lz_mf_get_needed_memory() to query the needed
207 * memory size for a specific match-finding algorithm and maximum window
210 * This parameter cannot be 0; there is no default value.
212 * Match-finding algorithms may place additional restrictions on this
213 * parameter. However, currently only the LCP interval tree
214 * match-finding algorithm places such a restriction (it doesn't support
215 * windows larger than 33554432 bytes).
220 * The minimum length, in bytes, of matches that can be produced by the
221 * match-finder (by a call to lz_mf_get_matches()).
223 * If this parameter is not 0, it must be 2 or greater.
225 * If this parameter is 0, the match-finding algorithm sets it to a
226 * default value. The default value will be at least 2 and at most 16.
231 * The maximum length, in bytes, of matches that can be produced by the
232 * match-finder (by a call to lz_mf_get_matches()).
234 * If this parameter is not 0, it must be greater than or equal to
235 * @min_match_len, or the default value the match-finding algorithm
236 * selected for @min_match_len in the case that @min_match_len was
239 * If this parameter is 0, the match-finding algorithm sets it to a
240 * default value. In general, the caller must be prepared to handle
241 * arbitrarily long matches (up to the window size minus 1) in this
247 * When using the hash chains or binary trees match-finding algorithm,
248 * this parameter defines the maximum number of search steps at each
249 * position. A typical value to use is 32. Higher values result in
250 * better matches and slower performance.
252 * The suffix array-based match-finding algorithms treat this parameter
253 * slightly differently because they find the longest matches first.
254 * They still honor the intent of the parameter but may scale it down to
255 * an appropriate value.
257 * If this parameter is 0, the match-finding algorithm sets it to a
260 u32 max_search_depth;
263 * When using the hash chains, binary trees, or LCP interval tree
264 * match-finding algorithm, this parameter defines the maximum match
265 * length to which the full algorithm will be applied. This can also be
266 * thought of as the length above which the algorithm will not try to
267 * search for additional matches.
269 * Usually, setting this parameter to a reasonable value (such as 24,
270 * 32, or 48) will speed up match-finding but will not hurt the
271 * compression ratio too much. This is because these settings of this
272 * parameter cause the match-finder to not waste too much time examining
273 * very long matches, which are already highly compressible.
275 * In addition, if the longest match exceeds this length, the
276 * match-finding algorithm will still report its full length.
278 * The linked suffix array match-finding algorithm ignores this
281 * If this parameter is 0, the match-finding algorithm sets it to a
288 * Lempel-Ziv match-finder operations structure.
290 * Match-finding algorithms must fill in all members. None can be left as 0 or
293 * Don't directly access any of the members outside of lz_mf.h and lz_mf.c.
294 * Instead, use the lz_mf_*() wrappers.
297 bool (*params_valid)(const struct lz_mf_params *);
299 u64 (*get_needed_memory)(u32 max_window_size);
301 bool (*init)(struct lz_mf *);
303 void (*load_window)(struct lz_mf *mf, const u8 *, u32);
305 u32 (*get_matches)(struct lz_mf *, struct lz_match *);
307 void (*skip_positions)(struct lz_mf *, u32);
309 void (*destroy)(struct lz_mf *);
315 * Lempel-Ziv match-finder structure.
317 * Match-finding algorithms must embed this structure inside a private
320 * Don't directly access any of the members outside of lz_mf.h, lz_mf.c, and
321 * match-finding algorithms. Instead, use the lz_mf_*() wrappers.
324 struct lz_mf_params params;
325 struct lz_mf_ops ops;
326 const u8 *cur_window;
332 lz_mf_params_valid(const struct lz_mf_params *params);
335 lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size);
337 extern struct lz_mf *
338 lz_mf_alloc(const struct lz_mf_params *params);
341 lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size);
343 #ifdef ENABLE_LZ_DEBUG
345 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches);
347 /* See non-inline definition for comment */
349 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
351 return mf->ops.get_matches(mf, matches);
355 #ifdef ENABLE_LZ_DEBUG
357 lz_mf_skip_positions(struct lz_mf *mf, u32 n);
359 /* See non-inline definition for comment */
361 lz_mf_skip_positions(struct lz_mf *mf, u32 n)
363 mf->ops.skip_positions(mf, n);
368 lz_mf_free(struct lz_mf *mf);
371 * Returns the match-finder's current position in the window.
373 * The current position begins at 0. It increases by 1 when lz_mf_get_matches()
374 * is called, and by 'n' when lz_mf_skip_positions() is called.
376 * Note: The behavior is undefined if the match-finder is advanced beyond the
377 * end of the window. (If this happens in ENABLE_LZ_DEBUG mode, an assertion
378 * will be triggered.)
381 lz_mf_get_position(const struct lz_mf *mf)
383 return mf->cur_window_pos;
387 * Returns the number of bytes remaining in the window.
390 lz_mf_get_bytes_remaining(const struct lz_mf *mf)
392 return mf->cur_window_size - mf->cur_window_pos;
396 * Returns a pointer to the current window, offset by the current position.
397 * Equivalently, this returns a pointer to the byte sequence that the next call
398 * to lz_mf_get_matches() will match against.
400 static inline const u8 *
401 lz_mf_get_window_ptr(const struct lz_mf *mf)
403 return &mf->cur_window[mf->cur_window_pos];
406 #endif /* _WIMLIB_LZ_MF_H */