4 * Interface for Lempel-Ziv match-finders.
6 * Copyright (c) 2014 Eric Biggers. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "wimlib/lz_mf.h"
37 #include "wimlib/lz_mf_ops.h"
38 #include "wimlib/util.h"
40 /* Available match-finding algorithms. */
41 static const struct lz_mf_ops *mf_ops[] = {
42 [LZ_MF_NULL] = &lz_null_ops,
43 [LZ_MF_BRUTE_FORCE] = &lz_brute_force_ops,
44 [LZ_MF_HASH_CHAINS] = &lz_hash_chains_ops,
45 [LZ_MF_BINARY_TREES] = &lz_binary_trees_ops,
46 [LZ_MF_LCP_INTERVAL_TREE] = &lz_lcp_interval_tree_ops,
47 [LZ_MF_LINKED_SUFFIX_ARRAY] = &lz_linked_suffix_array_ops,
51 * Automatically select a match-finding algorithm to use, in the case that the
52 * user did not specify one.
54 static const struct lz_mf_ops *
55 select_mf_ops(enum lz_mf_algo algorithm, u32 max_window_size)
57 if (algorithm == LZ_MF_DEFAULT) {
58 if (max_window_size <= 32768)
59 algorithm = LZ_MF_HASH_CHAINS;
60 else if (max_window_size <= 2097152)
61 algorithm = LZ_MF_BINARY_TREES;
62 else if (max_window_size <= 33554432)
63 algorithm = LZ_MF_LCP_INTERVAL_TREE;
65 algorithm = LZ_MF_LINKED_SUFFIX_ARRAY;
67 if ((int)algorithm < 0 || (int)algorithm >= ARRAY_LEN(mf_ops))
69 return mf_ops[(int)algorithm];
73 * Returns an upper bound on the number of bytes of memory that will be consumed
74 * by a match-finder allocated with the specified algorithm and maximum window
77 * The returned value does not include the size of the window itself. The
78 * caller must account for this separately if needed.
80 * If @algorithm is invalid, returns 0.
83 lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size)
85 const struct lz_mf_ops *ops;
87 ops = select_mf_ops(algorithm, max_window_size);
90 return ops->struct_size + ops->get_needed_memory(max_window_size);
93 * Returns %true if and only if the specified parameters can be validly used to
94 * create a match-finder using lz_mf_alloc().
97 lz_mf_params_valid(const struct lz_mf_params *params)
99 const struct lz_mf_ops *ops;
101 /* Require that a valid algorithm, or LZ_MF_DEFAULT, be specified. */
102 ops = select_mf_ops(params->algorithm, params->max_window_size);
106 /* Don't allow empty windows. Otherwise, some match-finding algorithms
107 * might need special-case code to handle empty windows. */
108 if (params->max_window_size == 0)
111 /* Don't allow length-1 matches, so that match-finding algorithms don't
112 * need to worry about this case. Most LZ-based compression formats
113 * don't allow length-1 matches, since they usually aren't helpful for
114 * compression. Also, if a compressor really does need length-1
115 * matches, it can easily maintain its own table of length 256
116 * containing the most-recently-seen position for each byte value.
118 * min_match_len == 0 is valid, since that means the match-finding
119 * algorithm will fill in a default value. */
120 if (params->min_match_len == 1)
123 if (params->max_match_len != 0) {
125 /* Don't allow length-1 matches (same reason as above). */
126 if (params->max_match_len == 1)
129 /* Don't allow the maximum match length to be shorter than the
130 * minimum match length. */
131 if (params->max_match_len < params->min_match_len)
135 /* Don't allow the needed memory size to overflow a 'size_t'. */
136 if (sizeof(size_t) < sizeof(u64)) {
137 u64 needed_mem = ops->get_needed_memory(params->max_window_size);
138 if ((size_t)needed_mem != needed_mem)
142 /* Call the algorithm-specific routine to finish the validation. */
143 return ops->params_valid(params);
147 * Allocate a new match-finder.
150 * The parameters for the match-finder. See the declaration of 'struct
151 * lz_mf_params' for more information.
153 * Returns a pointer to the new match-finder, or NULL if out of memory or the
154 * parameters are invalid. Call lz_mf_params_valid() beforehand to test the
155 * parameter validity separately.
158 lz_mf_alloc(const struct lz_mf_params *params)
161 const struct lz_mf_ops *ops;
163 /* Validate the parameters. */
164 if (!lz_mf_params_valid(params))
167 /* Get the match-finder operations structure. Since we just validated
168 * the parameters, this is guaranteed to return a valid structure. */
169 ops = select_mf_ops(params->algorithm, params->max_window_size);
170 LZ_ASSERT(ops != NULL);
172 /* Allocate memory for the match-finder structure. */
173 LZ_ASSERT(ops->struct_size >= sizeof(struct lz_mf));
174 mf = CALLOC(1, ops->struct_size);
178 /* Set the parameters and operations fields. */
179 mf->params = *params;
182 /* Perform algorithm-specific initialization. Normally this is where
183 * most of the necessary memory is allocated. */
184 if (!mf->ops.init(mf)) {
189 /* The algorithm must have set min_match_len and max_match_len if either
191 LZ_ASSERT(mf->params.min_match_len >= 2);
192 LZ_ASSERT(mf->params.max_match_len >= mf->params.min_match_len);
198 * Load a window into the match-finder.
201 * The match-finder into which to load the window.
203 * Pointer to the window to load. This memory must remain available,
204 * unmodified, while the match-finder is being used.
206 * The size of the window, in bytes. This can't be larger than the
207 * @max_window_size parameter. In addition, this can't be 0.
209 * Note: this interface does not support sliding windows!
212 lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size)
214 /* Can't be an empty window, and can't be larger than the maximum window
215 * size with which the match-finder was allocated. */
217 LZ_ASSERT(size <= mf->params.max_window_size);
219 /* Save the window and initialize the current position. */
220 mf->cur_window = window;
221 mf->cur_window_size = size;
222 mf->cur_window_pos = 0;
224 /* Call into the algorithm-specific window load code. */
225 mf->ops.load_window(mf, window, size);
229 * Retrieve a list of matches at the next position in the window.
232 * The match-finder into which a window has been loaded using
233 * lz_mf_load_window().
235 * The array into which the matches will be returned. The returned match
236 * count will not exceed the minimum of @max_search_depth and (@len_limit -
237 * @min_match_len + 1), where @len_limit is itself defined as
238 * min(@max_match_len, @nice_match_len).
240 * The return value is the number of matches that were found and stored in the
241 * 'matches' array. The matches will be ordered by strictly increasing length
242 * and strictly increasing offset. No match shall have length less than
243 * @min_match_len, and no match shall have length greater than @max_match_len.
244 * The return value may be 0, which indicates that no matches were found.
246 * On completion, the match-finder is advanced to the next position in the
249 * Note: in-non-debug mode, the inline definition of this gets used instead.
250 * They are the same, except that the non-inline version below validates the
251 * results to help debug match-finding algorithms.
253 #ifdef ENABLE_LZ_DEBUG
255 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
257 LZ_ASSERT(mf->cur_window_pos < mf->cur_window_size);
259 const u32 orig_pos = mf->cur_window_pos;
260 const u32 len_limit = min(mf->params.max_match_len,
261 lz_mf_get_bytes_remaining(mf));
262 const u8 * const strptr = lz_mf_get_window_ptr(mf);
264 const u32 num_matches = mf->ops.get_matches(mf, matches);
266 LZ_ASSERT(mf->cur_window_pos == orig_pos + 1);
269 fprintf(stderr, "Pos %"PRIu32"/%"PRIu32": %"PRIu32" matches\n",
270 orig_pos, mf->cur_window_size, num_matches);
271 for (u32 i = 0; i < num_matches; i++) {
272 fprintf(stderr, "\tLen %"PRIu32" Offset %"PRIu32"\n",
273 matches[i].len, matches[i].offset);
277 /* Validate the matches. */
278 for (u32 i = 0; i < num_matches; i++) {
279 const u32 len = matches[i].len;
280 const u32 offset = matches[i].offset;
284 LZ_ASSERT(len >= mf->params.min_match_len);
285 LZ_ASSERT(len <= len_limit);
288 LZ_ASSERT(offset >= 1);
289 LZ_ASSERT(offset <= orig_pos);
291 /* Lengths and offsets strictly increasing? */
293 LZ_ASSERT(len > matches[i - 1].len);
294 LZ_ASSERT(offset > matches[i - 1].offset);
297 /* Actually a match? */
298 matchptr = strptr - offset;
299 LZ_ASSERT(!memcmp(strptr, matchptr, len));
301 /* Match can't be extended further? */
302 LZ_ASSERT(len == len_limit || strptr[len] != matchptr[len]);
307 #endif /* ENABLE_LZ_DEBUG */
310 * Skip 'n' positions in the match-finder. This is a faster alternative to
311 * calling lz_mf_get_matches() at each position to advance the match-finder.
313 * 'n' must be greater than 0.
315 * Note: in-non-debug mode, the inline definition of this gets used instead.
316 * They are the same, except the non-inline version below does extra checks.
318 #ifdef ENABLE_LZ_DEBUG
320 lz_mf_skip_positions(struct lz_mf *mf, const u32 n)
323 LZ_ASSERT(n <= lz_mf_get_bytes_remaining(mf));
325 const u32 orig_pos = mf->cur_window_pos;
327 mf->ops.skip_positions(mf, n);
329 LZ_ASSERT(mf->cur_window_pos == orig_pos + n);
334 * Free the match-finder.
336 * This frees all memory that was allocated by the call to lz_mf_alloc().
339 lz_mf_free(struct lz_mf *mf)
343 #ifdef ENABLE_LZ_DEBUG
344 memset(mf, 0, mf->ops.struct_size);