4 * Interface for Lempel-Ziv match-finders.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * Example usage of the match-finder API:
35 * ----------------------------------------------------------------------------
37 * Fill in a 'struct lz_mf_params'.
38 * (Optional) Call lz_mf_params_valid() to validate the parameters.
39 * Call lz_mf_alloc() to allocate the match-finder.
40 * For each block of data to be compressed:
41 * Call lz_mf_load_window() to load the block into the match finder.
42 * While the block is not yet fully compressed:
43 * Call lz_mf_get_matches() to get matches at the current position.
44 * If matches were found:
45 * Output the longest match.
46 * Call lz_mf_skip_positions() to skip the remaining length of the match.
52 * Call lz_mf_free() to free the match-finder.
54 * ----------------------------------------------------------------------------
56 * That example did "greedy parsing" --- that is, always choosing the longest
57 * match at each position. However, this interface can be (and is intended to
58 * be) used for "optimal parsing" as well. It can also be used for in-between
59 * strategies such as "lazy parsing" and "flexible parsing". For the best
60 * performance try different match-finding algorithms and parameters to see what
61 * works best for your parsing strategy, and your typical data and block sizes.
65 * TODO: this API is going to go away eventually. It has too much indirection
66 * and is not flexible enough.
69 #ifndef _WIMLIB_LZ_MF_H
70 #define _WIMLIB_LZ_MF_H
72 #include "wimlib/types.h"
74 /* When ENABLE_LZ_DEBUG is defined, we check all matches for correctness and
75 * perform other validations. Use for debugging only, as it slows things down
78 //#define ENABLE_LZ_DEBUG
79 #ifdef ENABLE_LZ_DEBUG
82 # define LZ_ASSERT assert
84 # define LZ_ASSERT(...)
89 /* Representation of a Lempel-Ziv match. */
92 /* The number of bytes matched. */
95 /* The offset back from the current position that was matched. */
100 * Specifies a match-finding algorithm.
104 * Longest Common Prefix Interval Tree match-finding algorithm.
106 * This is a suffix array-based algorithm. It works well on medium to
107 * large windows. However, due to an implementation detail, it is
108 * currently limited to a maximum window size of 33554432 bytes.
110 * The memory usage is 12 bytes per position.
112 LZ_MF_LCP_INTERVAL_TREE,
115 * Linked Suffix Array match-finding algorithm.
117 * This can be used on very large windows.
119 * The memory usage is 14 bytes per position.
121 * Currently, this method usually performs slightly worse than the LCP
122 * interval tree algorithm. However, it can be used on windows
123 * exceeding the 33554432 byte limit of the LCP interval tree algorithm.
125 LZ_MF_LINKED_SUFFIX_ARRAY,
128 /* Parameters for Lempel-Ziv match-finding. */
129 struct lz_mf_params {
132 * The match-finding algorithm to use. This must be one of the 'enum
133 * lz_mf_algo' constants defined above.
138 * The maximum window size, in bytes, that shall be supported by the
139 * match-finder. This is the maximum size that can be passed to
140 * subsequent calls to lz_mf_load_window().
142 * Note: this interface is intended to be used for block compression, so
143 * none of the match-finding algorithms support sliding windows. It's
144 * expected that the window for LZ match-finding simply be the block of
145 * data being compressed.
147 * Match-finders generally require an amount of memory proportional to
148 * this parameter. Use lz_mf_get_needed_memory() to query the needed
149 * memory size for a specific match-finding algorithm and maximum window
152 * This parameter cannot be 0; there is no default value.
154 * Match-finding algorithms may place additional restrictions on this
155 * parameter. However, currently only the LCP interval tree
156 * match-finding algorithm places such a restriction (it doesn't support
157 * windows larger than 33554432 bytes).
162 * The minimum length, in bytes, of matches that can be produced by the
163 * match-finder (by a call to lz_mf_get_matches()).
165 * If this parameter is not 0, it must be 2 or greater.
167 * If this parameter is 0, the match-finding algorithm sets it to a
168 * default value. The default value will be at least 2 and at most 16.
173 * The maximum length, in bytes, of matches that can be produced by the
174 * match-finder (by a call to lz_mf_get_matches()).
176 * If this parameter is not 0, it must be greater than or equal to
177 * @min_match_len, or the default value the match-finding algorithm
178 * selected for @min_match_len in the case that @min_match_len was
181 * If this parameter is 0, the match-finding algorithm sets it to a
182 * default value. In general, the caller must be prepared to handle
183 * arbitrarily long matches (up to the window size minus 1) in this
189 * This value describes the maximum amount of work that the
190 * match-finding algorithm will do at each position. A typical value to
191 * use is 32. Higher values result in better matches and slower
194 * If this parameter is 0, the match-finding algorithm sets it to a
197 u32 max_search_depth;
200 * This parameter defines the maximum match length to which the full
201 * algorithm will be applied. This can also be thought of as the length
202 * above which the algorithm will not try to search for additional
205 * Usually, setting this parameter to a reasonable value (such as 24,
206 * 32, or 48) will speed up match-finding but will not hurt the
207 * compression ratio too much. This is because these settings of this
208 * parameter cause the match-finder to not waste too much time examining
209 * very long matches, which are already highly compressible.
211 * In addition, if the longest match exceeds this length, the
212 * match-finding algorithm will still report its full length.
214 * The linked suffix array match-finding algorithm ignores this
217 * If this parameter is 0, the match-finding algorithm sets it to a
224 * Lempel-Ziv match-finder operations structure.
226 * Match-finding algorithms must fill in all members. None can be left as 0 or
229 * Don't directly access any of the members outside of lz_mf.h and lz_mf.c.
230 * Instead, use the lz_mf_*() wrappers.
233 bool (*params_valid)(const struct lz_mf_params *);
235 u64 (*get_needed_memory)(u32 max_window_size);
237 bool (*init)(struct lz_mf *);
239 void (*load_window)(struct lz_mf *mf, const u8 *, u32);
241 u32 (*get_matches)(struct lz_mf *, struct lz_match *);
243 void (*skip_positions)(struct lz_mf *, u32);
245 void (*destroy)(struct lz_mf *);
251 * Lempel-Ziv match-finder structure.
253 * Match-finding algorithms must embed this structure inside a private
256 * Don't directly access any of the members outside of lz_mf.h, lz_mf.c, and
257 * match-finding algorithms. Instead, use the lz_mf_*() wrappers.
260 struct lz_mf_params params;
261 struct lz_mf_ops ops;
262 const u8 *cur_window;
268 lz_mf_params_valid(const struct lz_mf_params *params);
271 lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size);
273 extern struct lz_mf *
274 lz_mf_alloc(const struct lz_mf_params *params);
277 lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size);
279 #ifdef ENABLE_LZ_DEBUG
281 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches);
283 /* See non-inline definition for comment */
285 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
287 return mf->ops.get_matches(mf, matches);
291 #ifdef ENABLE_LZ_DEBUG
293 lz_mf_skip_positions(struct lz_mf *mf, u32 n);
295 /* See non-inline definition for comment */
297 lz_mf_skip_positions(struct lz_mf *mf, u32 n)
299 mf->ops.skip_positions(mf, n);
304 lz_mf_free(struct lz_mf *mf);
307 * Returns the match-finder's current position in the window.
309 * The current position begins at 0. It increases by 1 when lz_mf_get_matches()
310 * is called, and by 'n' when lz_mf_skip_positions() is called.
312 * Note: The behavior is undefined if the match-finder is advanced beyond the
313 * end of the window. (If this happens in ENABLE_LZ_DEBUG mode, an assertion
314 * will be triggered.)
317 lz_mf_get_position(const struct lz_mf *mf)
319 return mf->cur_window_pos;
323 * Returns the number of bytes remaining in the window.
326 lz_mf_get_bytes_remaining(const struct lz_mf *mf)
328 return mf->cur_window_size - mf->cur_window_pos;
332 * Returns a pointer to the current window, offset by the current position.
333 * Equivalently, this returns a pointer to the byte sequence that the next call
334 * to lz_mf_get_matches() will match against.
336 static inline const u8 *
337 lz_mf_get_window_ptr(const struct lz_mf *mf)
339 return &mf->cur_window[mf->cur_window_pos];
342 #endif /* _WIMLIB_LZ_MF_H */