4 * Interface for Lempel-Ziv match-finders.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * Example usage of the match-finder API:
35 * ----------------------------------------------------------------------------
37 * Fill in a 'struct lz_mf_params'.
38 * (Optional) Call lz_mf_params_valid() to validate the parameters.
39 * Call lz_mf_alloc() to allocate the match-finder.
40 * For each block of data to be compressed:
41 * Call lz_mf_load_window() to load the block into the match finder.
42 * While the block is not yet fully compressed:
43 * Call lz_mf_get_matches() to get matches at the current position.
44 * If matches were found:
45 * Output the longest match.
46 * Call lz_mf_skip_positions() to skip the remaining length of the match.
52 * Call lz_mf_free() to free the match-finder.
54 * ----------------------------------------------------------------------------
56 * That example did "greedy parsing" --- that is, always choosing the longest
57 * match at each position. However, this interface can be (and is intended to
58 * be) used for "optimal parsing" as well. It can also be used for in-between
59 * strategies such as "lazy parsing" and "flexible parsing". For the best
60 * performance try different match-finding algorithms and parameters to see what
61 * works best for your parsing strategy, and your typical data and block sizes.
64 #ifndef _WIMLIB_LZ_MF_H
65 #define _WIMLIB_LZ_MF_H
67 #include "wimlib/types.h"
69 /* When ENABLE_LZ_DEBUG is defined, we check all matches for correctness and
70 * perform other validations. Use for debugging only, as it slows things down
73 //#define ENABLE_LZ_DEBUG
74 #ifdef ENABLE_LZ_DEBUG
77 # define LZ_ASSERT assert
79 # define LZ_ASSERT(...)
84 /* Representation of a Lempel-Ziv match. */
87 /* The number of bytes matched. */
90 /* The offset back from the current position that was matched. */
95 * Specifies a match-finding algorithm.
100 * Use the default algorithm for the specified maximum window size.
105 * "Null" algorithm that never reports any matches.
107 * This algorithm exists for comparison, benchmarking, and testing
108 * purposes only. It is not intended to be used in real compressors.
113 * Brute Force match-finding algorithm.
115 * This algorithm exists for comparison, benchmarking, and testing
116 * purposes only. It is not intended to be used in real compressors.
118 LZ_MF_BRUTE_FORCE = 2,
121 * Hash Chain match-finding algorithm.
123 * This works well on small windows.
125 * The memory usage is 4 bytes per position, plus 131072 bytes for a
128 * lz_mf_skip_positions() with this algorithm is very fast, so it's good
129 * if you're doing "greedy" rather than "optimal" parsing. However, if
130 * using large windows you might be better off with binary trees or
131 * suffix arrays, even if doing greedy parsing.
133 LZ_MF_HASH_CHAINS = 3,
136 * Binary Tree match-finding algorithm.
138 * This works well on small to medium-sized windows.
140 * The memory usage is 8 bytes per position, plus 262144 bytes for a
143 * lz_mf_skip_positions() with this algorithm takes a significant amount
144 * of time, almost as much as a call to lz_mf_get_matches(). This makes
145 * this algorithm better suited for optimal parsing than for greedy
146 * parsing. However, if the window size becomes sufficiently large,
147 * this algorithm can outperform hash chains, even when using greedy
150 LZ_MF_BINARY_TREES = 4,
153 * Longest Common Prefix Interval Tree match-finding algorithm.
155 * This is a suffix array-based algorithm. It works well on medium to
156 * large windows. However, due to an implementation detail, it is
157 * currently limited to a maximum window size of 33554432 bytes.
159 * The memory usage is 12 bytes per position.
161 * Unlike the hash chain and binary tree algorithms, the LCP interval
162 * tree algorithm performs most of its work in lz_mf_load_window(). The
163 * calls to lz_mf_get_matches() and lz_mf_skip_positions() take
164 * relatively little time, and lz_mf_skip_positions() is not much faster
165 * than lz_mf_get_matches(). Therefore, if you're using this algorithm
166 * you might as well be doing "optimal" rather than "greedy" parsing.
168 LZ_MF_LCP_INTERVAL_TREE = 5,
171 * Linked Suffix Array match-finding algorithm.
173 * This can be used on very large windows.
175 * The memory usage is 14 bytes per position.
177 * Currently, this method usually performs slightly worse than the LCP
178 * interval tree algorithm. However, it can be used on windows
179 * exceeding the 33554432 byte limit of the LCP interval tree algorithm.
181 LZ_MF_LINKED_SUFFIX_ARRAY = 6,
184 /* Parameters for Lempel-Ziv match-finding. */
185 struct lz_mf_params {
188 * The match-finding algorithm to use. This must be one of the 'enum
189 * lz_mf_algo' constants defined above.
191 * If this is LZ_MF_DEFAULT, the default algorithm for the specified
192 * @max_window_size is used.
197 * The maximum window size, in bytes, that shall be supported by the
198 * match-finder. This is the maximum size that can be passed to
199 * subsequent calls to lz_mf_load_window().
201 * Note: this interface is intended to be used for block compression, so
202 * none of the match-finding algorithms support sliding windows. It's
203 * expected that the window for LZ match-finding simply be the block of
204 * data being compressed.
206 * Match-finders generally require an amount of memory proportional to
207 * this parameter. Use lz_mf_get_needed_memory() to query the needed
208 * memory size for a specific match-finding algorithm and maximum window
211 * This parameter cannot be 0; there is no default value.
213 * Match-finding algorithms may place additional restrictions on this
214 * parameter. However, currently only the LCP interval tree
215 * match-finding algorithm places such a restriction (it doesn't support
216 * windows larger than 33554432 bytes).
221 * The minimum length, in bytes, of matches that can be produced by the
222 * match-finder (by a call to lz_mf_get_matches()).
224 * If this parameter is not 0, it must be 2 or greater.
226 * If this parameter is 0, the match-finding algorithm sets it to a
227 * default value. The default value will be at least 2 and at most 16.
232 * The maximum length, in bytes, of matches that can be produced by the
233 * match-finder (by a call to lz_mf_get_matches()).
235 * If this parameter is not 0, it must be greater than or equal to
236 * @min_match_len, or the default value the match-finding algorithm
237 * selected for @min_match_len in the case that @min_match_len was
240 * If this parameter is 0, the match-finding algorithm sets it to a
241 * default value. In general, the caller must be prepared to handle
242 * arbitrarily long matches (up to the window size minus 1) in this
248 * When using the hash chains or binary trees match-finding algorithm,
249 * this parameter defines the maximum number of search steps at each
250 * position. A typical value to use is 32. Higher values result in
251 * better matches and slower performance.
253 * The suffix array-based match-finding algorithms treat this parameter
254 * slightly differently because they find the longest matches first.
255 * They still honor the intent of the parameter but may scale it down to
256 * an appropriate value.
258 * If this parameter is 0, the match-finding algorithm sets it to a
261 u32 max_search_depth;
264 * When using the hash chains, binary trees, or LCP interval tree
265 * match-finding algorithm, this parameter defines the maximum match
266 * length to which the full algorithm will be applied. This can also be
267 * thought of as the length above which the algorithm will not try to
268 * search for additional matches.
270 * Usually, setting this parameter to a reasonable value (such as 24,
271 * 32, or 48) will speed up match-finding but will not hurt the
272 * compression ratio too much. This is because these settings of this
273 * parameter cause the match-finder to not waste too much time examining
274 * very long matches, which are already highly compressible.
276 * In addition, if the longest match exceeds this length, the
277 * match-finding algorithm will still report its full length.
279 * The linked suffix array match-finding algorithm ignores this
282 * If this parameter is 0, the match-finding algorithm sets it to a
289 * Lempel-Ziv match-finder operations structure.
291 * Match-finding algorithms must fill in all members. None can be left as 0 or
294 * Don't directly access any of the members outside of lz_mf.h and lz_mf.c.
295 * Instead, use the lz_mf_*() wrappers.
298 bool (*params_valid)(const struct lz_mf_params *);
300 u64 (*get_needed_memory)(u32 max_window_size);
302 bool (*init)(struct lz_mf *);
304 void (*load_window)(struct lz_mf *mf, const u8 *, u32);
306 u32 (*get_matches)(struct lz_mf *, struct lz_match *);
308 void (*skip_positions)(struct lz_mf *, u32);
310 void (*destroy)(struct lz_mf *);
316 * Lempel-Ziv match-finder structure.
318 * Match-finding algorithms must embed this structure inside a private
321 * Don't directly access any of the members outside of lz_mf.h, lz_mf.c, and
322 * match-finding algorithms. Instead, use the lz_mf_*() wrappers.
325 struct lz_mf_params params;
326 struct lz_mf_ops ops;
327 const u8 *cur_window;
333 lz_mf_params_valid(const struct lz_mf_params *params);
336 lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size);
338 extern struct lz_mf *
339 lz_mf_alloc(const struct lz_mf_params *params);
342 lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size);
344 #ifdef ENABLE_LZ_DEBUG
346 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches);
348 /* See non-inline definition for comment */
350 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
352 return mf->ops.get_matches(mf, matches);
356 #ifdef ENABLE_LZ_DEBUG
358 lz_mf_skip_positions(struct lz_mf *mf, u32 n);
360 /* See non-inline definition for comment */
362 lz_mf_skip_positions(struct lz_mf *mf, u32 n)
364 mf->ops.skip_positions(mf, n);
369 lz_mf_free(struct lz_mf *mf);
372 * Returns the match-finder's current position in the window.
374 * The current position begins at 0. It increases by 1 when lz_mf_get_matches()
375 * is called, and by 'n' when lz_mf_skip_positions() is called.
377 * Note: The behavior is undefined if the match-finder is advanced beyond the
378 * end of the window. (If this happens in ENABLE_LZ_DEBUG mode, an assertion
379 * will be triggered.)
382 lz_mf_get_position(const struct lz_mf *mf)
384 return mf->cur_window_pos;
388 * Returns the number of bytes remaining in the window.
391 lz_mf_get_bytes_remaining(const struct lz_mf *mf)
393 return mf->cur_window_size - mf->cur_window_pos;
397 * Returns a pointer to the current window, offset by the current position.
398 * Equivalently, this returns a pointer to the byte sequence that the next call
399 * to lz_mf_get_matches() will match against.
401 static inline const u8 *
402 lz_mf_get_window_ptr(const struct lz_mf *mf)
404 return &mf->cur_window[mf->cur_window_pos];
407 #endif /* _WIMLIB_LZ_MF_H */