4 * Interface for Lempel-Ziv match-finders.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * Example usage of the match-finder API:
35 * ----------------------------------------------------------------------------
37 * Fill in a 'struct lz_mf_params'.
38 * (Optional) Call lz_mf_params_valid() to validate the parameters.
39 * Call lz_mf_alloc() to allocate the match-finder.
40 * For each block of data to be compressed:
41 * Call lz_mf_load_window() to load the block into the match finder.
42 * While the block is not yet fully compressed:
43 * Call lz_mf_get_matches() to get matches at the current position.
44 * If matches were found:
45 * Output the longest match.
46 * Call lz_mf_skip_positions() to skip the remaining length of the match.
52 * Call lz_mf_free() to free the match-finder.
54 * ----------------------------------------------------------------------------
56 * That example did "greedy parsing" --- that is, always choosing the longest
57 * match at each position. However, this interface can be (and is intended to
58 * be) used for "optimal parsing" as well. It can also be used for in-between
59 * strategies such as "lazy parsing" and "flexible parsing". For the best
60 * performance try different match-finding algorithms and parameters to see what
61 * works best for your parsing strategy, and your typical data and block sizes.
64 #ifndef _WIMLIB_LZ_MF_H
65 #define _WIMLIB_LZ_MF_H
67 #include "wimlib/types.h"
69 /* When ENABLE_LZ_DEBUG is defined, we check all matches for correctness and
70 * perform other validations. Use for debugging only, as it slows things down
73 //#define ENABLE_LZ_DEBUG
74 #ifdef ENABLE_LZ_DEBUG
77 # define LZ_ASSERT assert
79 # define LZ_ASSERT(...)
84 /* Representation of a Lempel-Ziv match. */
87 /* The number of bytes matched. */
90 /* The offset back from the current position that was matched. */
95 * Specifies a match-finding algorithm.
100 * Use the default algorithm for the specified maximum window size.
105 * "Null" algorithm that never reports any matches.
107 * This algorithm exists for comparison, benchmarking, and testing
108 * purposes only. It is not intended to be used in real compressors.
113 * Hash Chain match-finding algorithm.
115 * This works well on small windows.
117 * The memory usage is 4 bytes per position, plus 131072 bytes for a
120 * lz_mf_skip_positions() with this algorithm is very fast, so it's good
121 * if you're doing "greedy" rather than "optimal" parsing. However, if
122 * using large windows you might be better off with binary trees or
123 * suffix arrays, even if doing greedy parsing.
125 LZ_MF_HASH_CHAINS = 3,
128 * Binary Tree match-finding algorithm.
130 * This works well on small to medium-sized windows.
132 * The memory usage is 8 bytes per position, plus 262144 bytes for a
135 * lz_mf_skip_positions() with this algorithm takes a significant amount
136 * of time, almost as much as a call to lz_mf_get_matches(). This makes
137 * this algorithm better suited for optimal parsing than for greedy
138 * parsing. However, if the window size becomes sufficiently large,
139 * this algorithm can outperform hash chains, even when using greedy
142 LZ_MF_BINARY_TREES = 4,
145 * Longest Common Prefix Interval Tree match-finding algorithm.
147 * This is a suffix array-based algorithm. It works well on medium to
148 * large windows. However, due to an implementation detail, it is
149 * currently limited to a maximum window size of 33554432 bytes.
151 * The memory usage is 12 bytes per position.
153 * Unlike the hash chain and binary tree algorithms, the LCP interval
154 * tree algorithm performs most of its work in lz_mf_load_window(). The
155 * calls to lz_mf_get_matches() and lz_mf_skip_positions() take
156 * relatively little time, and lz_mf_skip_positions() is not much faster
157 * than lz_mf_get_matches(). Therefore, if you're using this algorithm
158 * you might as well be doing "optimal" rather than "greedy" parsing.
160 LZ_MF_LCP_INTERVAL_TREE = 5,
163 * Linked Suffix Array match-finding algorithm.
165 * This can be used on very large windows.
167 * The memory usage is 14 bytes per position.
169 * Currently, this method usually performs slightly worse than the LCP
170 * interval tree algorithm. However, it can be used on windows
171 * exceeding the 33554432 byte limit of the LCP interval tree algorithm.
173 LZ_MF_LINKED_SUFFIX_ARRAY = 6,
176 /* Parameters for Lempel-Ziv match-finding. */
177 struct lz_mf_params {
180 * The match-finding algorithm to use. This must be one of the 'enum
181 * lz_mf_algo' constants defined above.
183 * If this is LZ_MF_DEFAULT, the default algorithm for the specified
184 * @max_window_size is used.
189 * The maximum window size, in bytes, that shall be supported by the
190 * match-finder. This is the maximum size that can be passed to
191 * subsequent calls to lz_mf_load_window().
193 * Note: this interface is intended to be used for block compression, so
194 * none of the match-finding algorithms support sliding windows. It's
195 * expected that the window for LZ match-finding simply be the block of
196 * data being compressed.
198 * Match-finders generally require an amount of memory proportional to
199 * this parameter. Use lz_mf_get_needed_memory() to query the needed
200 * memory size for a specific match-finding algorithm and maximum window
203 * This parameter cannot be 0; there is no default value.
205 * Match-finding algorithms may place additional restrictions on this
206 * parameter. However, currently only the LCP interval tree
207 * match-finding algorithm places such a restriction (it doesn't support
208 * windows larger than 33554432 bytes).
213 * The minimum length, in bytes, of matches that can be produced by the
214 * match-finder (by a call to lz_mf_get_matches()).
216 * If this parameter is not 0, it must be 2 or greater.
218 * If this parameter is 0, the match-finding algorithm sets it to a
219 * default value. The default value will be at least 2 and at most 16.
224 * The maximum length, in bytes, of matches that can be produced by the
225 * match-finder (by a call to lz_mf_get_matches()).
227 * If this parameter is not 0, it must be greater than or equal to
228 * @min_match_len, or the default value the match-finding algorithm
229 * selected for @min_match_len in the case that @min_match_len was
232 * If this parameter is 0, the match-finding algorithm sets it to a
233 * default value. In general, the caller must be prepared to handle
234 * arbitrarily long matches (up to the window size minus 1) in this
240 * When using the hash chains or binary trees match-finding algorithm,
241 * this parameter defines the maximum number of search steps at each
242 * position. A typical value to use is 32. Higher values result in
243 * better matches and slower performance.
245 * The suffix array-based match-finding algorithms treat this parameter
246 * slightly differently because they find the longest matches first.
247 * They still honor the intent of the parameter but may scale it down to
248 * an appropriate value.
250 * If this parameter is 0, the match-finding algorithm sets it to a
253 u32 max_search_depth;
256 * When using the hash chains, binary trees, or LCP interval tree
257 * match-finding algorithm, this parameter defines the maximum match
258 * length to which the full algorithm will be applied. This can also be
259 * thought of as the length above which the algorithm will not try to
260 * search for additional matches.
262 * Usually, setting this parameter to a reasonable value (such as 24,
263 * 32, or 48) will speed up match-finding but will not hurt the
264 * compression ratio too much. This is because these settings of this
265 * parameter cause the match-finder to not waste too much time examining
266 * very long matches, which are already highly compressible.
268 * In addition, if the longest match exceeds this length, the
269 * match-finding algorithm will still report its full length.
271 * The linked suffix array match-finding algorithm ignores this
274 * If this parameter is 0, the match-finding algorithm sets it to a
281 * Lempel-Ziv match-finder operations structure.
283 * Match-finding algorithms must fill in all members. None can be left as 0 or
286 * Don't directly access any of the members outside of lz_mf.h and lz_mf.c.
287 * Instead, use the lz_mf_*() wrappers.
290 bool (*params_valid)(const struct lz_mf_params *);
292 u64 (*get_needed_memory)(u32 max_window_size);
294 bool (*init)(struct lz_mf *);
296 void (*load_window)(struct lz_mf *mf, const u8 *, u32);
298 u32 (*get_matches)(struct lz_mf *, struct lz_match *);
300 void (*skip_positions)(struct lz_mf *, u32);
302 void (*destroy)(struct lz_mf *);
308 * Lempel-Ziv match-finder structure.
310 * Match-finding algorithms must embed this structure inside a private
313 * Don't directly access any of the members outside of lz_mf.h, lz_mf.c, and
314 * match-finding algorithms. Instead, use the lz_mf_*() wrappers.
317 struct lz_mf_params params;
318 struct lz_mf_ops ops;
319 const u8 *cur_window;
325 lz_mf_params_valid(const struct lz_mf_params *params);
328 lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size);
330 extern struct lz_mf *
331 lz_mf_alloc(const struct lz_mf_params *params);
334 lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size);
336 #ifdef ENABLE_LZ_DEBUG
338 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches);
340 /* See non-inline definition for comment */
342 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
344 return mf->ops.get_matches(mf, matches);
348 #ifdef ENABLE_LZ_DEBUG
350 lz_mf_skip_positions(struct lz_mf *mf, u32 n);
352 /* See non-inline definition for comment */
354 lz_mf_skip_positions(struct lz_mf *mf, u32 n)
356 mf->ops.skip_positions(mf, n);
361 lz_mf_free(struct lz_mf *mf);
364 * Returns the match-finder's current position in the window.
366 * The current position begins at 0. It increases by 1 when lz_mf_get_matches()
367 * is called, and by 'n' when lz_mf_skip_positions() is called.
369 * Note: The behavior is undefined if the match-finder is advanced beyond the
370 * end of the window. (If this happens in ENABLE_LZ_DEBUG mode, an assertion
371 * will be triggered.)
374 lz_mf_get_position(const struct lz_mf *mf)
376 return mf->cur_window_pos;
380 * Returns the number of bytes remaining in the window.
383 lz_mf_get_bytes_remaining(const struct lz_mf *mf)
385 return mf->cur_window_size - mf->cur_window_pos;
389 * Returns a pointer to the current window, offset by the current position.
390 * Equivalently, this returns a pointer to the byte sequence that the next call
391 * to lz_mf_get_matches() will match against.
393 static inline const u8 *
394 lz_mf_get_window_ptr(const struct lz_mf *mf)
396 return &mf->cur_window[mf->cur_window_pos];
399 #endif /* _WIMLIB_LZ_MF_H */