wimlib.net Git - wimlib/blob - include/wimlib/lz_mf.h

   1 /*
   2  * lz_mf.h
   3  *
   4  * Interface for Lempel-Ziv match-finders.
   5  *
   6  * Copyright (c) 2014 Eric Biggers.  All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  *
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  *
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
  23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  26  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  29  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * Example usage of the match-finder API:
  34  *
  35  * ----------------------------------------------------------------------------
  36  *
  37  * Fill in a 'struct lz_mf_params'.
  38  * (Optional) Call lz_mf_params_valid() to validate the parameters.
  39  * Call lz_mf_alloc() to allocate the match-finder.
  40  * For each block of data to be compressed:
  41  *      Call lz_mf_load_window() to load the block into the match finder.
  42  *      While the block is not yet fully compressed:
  43  *              Call lz_mf_get_matches() to get matches at the current position.
  44  *              If matches were found:
  45  *                      Output the longest match.
  46  *                      Call lz_mf_skip_positions() to skip the remaining length of the match.
  47  *              Else:
  48  *                      Output a literal.
  49  *              End If
  50  *      End While
  51  * End For
  52  * Call lz_mf_free() to free the match-finder.
  53  *
  54  * ----------------------------------------------------------------------------
  55  *
  56  * That example did "greedy parsing" --- that is, always choosing the longest
  57  * match at each position.  However, this interface can be (and is intended to
  58  * be) used for "optimal parsing" as well.  It can also be used for in-between
  59  * strategies such as "lazy parsing" and "flexible parsing".  For the best
  60  * performance try different match-finding algorithms and parameters to see what
  61  * works best for your parsing strategy, and your typical data and block sizes.
  62  */
  63
  64 #ifndef _WIMLIB_LZ_MF_H
  65 #define _WIMLIB_LZ_MF_H
  66
  67 #include "wimlib/types.h"
  68
  69 /* When ENABLE_LZ_DEBUG is defined, we check all matches for correctness and
  70  * perform other validations.  Use for debugging only, as it slows things down
  71  * significantly.  */
  72
  73 //#define ENABLE_LZ_DEBUG
  74 #ifdef ENABLE_LZ_DEBUG
  75 #  include <assert.h>
  76 #  include <string.h>
  77 #  define LZ_ASSERT assert
  78 #else
  79 #  define LZ_ASSERT(...)
  80 #endif
  81
  82 struct lz_mf;
  83
  84 /* Representation of a Lempel-Ziv match.  */
  85 struct lz_match {
  86
  87         /* The number of bytes matched.  */
  88         u32 len;
  89
  90         /* The offset back from the current position that was matched.  */
  91         u32 offset;
  92 };
  93
  94 /*
  95  * Specifies a match-finding algorithm.
  96  */
  97 enum lz_mf_algo {
  98
  99         /*
 100          * Use the default algorithm for the specified maximum window size.
 101          */
 102         LZ_MF_DEFAULT = 0,
 103
 104         /*
 105          * "Null" algorithm that never reports any matches.
 106          *
 107          * This algorithm exists for comparison, benchmarking, and testing
 108          * purposes only.  It is not intended to be used in real compressors.
 109          */
 110         LZ_MF_NULL = 1,
 111
 112         /*
 113          * Hash Chain match-finding algorithm.
 114          *
 115          * This works well on small windows.
 116          *
 117          * The memory usage is 4 bytes per position, plus 131072 bytes for a
 118          * hash table.
 119          *
 120          * lz_mf_skip_positions() with this algorithm is very fast, so it's good
 121          * if you're doing "greedy" rather than "optimal" parsing.  However, if
 122          * using large windows you might be better off with binary trees or
 123          * suffix arrays, even if doing greedy parsing.
 124          */
 125         LZ_MF_HASH_CHAINS = 3,
 126
 127         /*
 128          * Binary Tree match-finding algorithm.
 129          *
 130          * This works well on small to medium-sized windows.
 131          *
 132          * The memory usage is 8 bytes per position, plus 262144 bytes for a
 133          * hash table.
 134          *
 135          * lz_mf_skip_positions() with this algorithm takes a significant amount
 136          * of time, almost as much as a call to lz_mf_get_matches().  This makes
 137          * this algorithm better suited for optimal parsing than for greedy
 138          * parsing.  However, if the window size becomes sufficiently large,
 139          * this algorithm can outperform hash chains, even when using greedy
 140          * parsing.
 141          */
 142         LZ_MF_BINARY_TREES = 4,
 143
 144         /*
 145          * Longest Common Prefix Interval Tree match-finding algorithm.
 146          *
 147          * This is a suffix array-based algorithm.  It works well on medium to
 148          * large windows.  However, due to an implementation detail, it is
 149          * currently limited to a maximum window size of 33554432 bytes.
 150          *
 151          * The memory usage is 12 bytes per position.
 152          *
 153          * Unlike the hash chain and binary tree algorithms, the LCP interval
 154          * tree algorithm performs most of its work in lz_mf_load_window().  The
 155          * calls to lz_mf_get_matches() and lz_mf_skip_positions() take
 156          * relatively little time, and lz_mf_skip_positions() is not much faster
 157          * than lz_mf_get_matches().  Therefore, if you're using this algorithm
 158          * you might as well be doing "optimal" rather than "greedy" parsing.
 159          */
 160         LZ_MF_LCP_INTERVAL_TREE = 5,
 161
 162         /*
 163          * Linked Suffix Array match-finding algorithm.
 164          *
 165          * This can be used on very large windows.
 166          *
 167          * The memory usage is 14 bytes per position.
 168          *
 169          * Currently, this method usually performs slightly worse than the LCP
 170          * interval tree algorithm.  However, it can be used on windows
 171          * exceeding the 33554432 byte limit of the LCP interval tree algorithm.
 172          */
 173         LZ_MF_LINKED_SUFFIX_ARRAY = 6,
 174 };
 175
 176 /* Parameters for Lempel-Ziv match-finding.  */
 177 struct lz_mf_params {
 178
 179         /*
 180          * The match-finding algorithm to use.  This must be one of the 'enum
 181          * lz_mf_algo' constants defined above.
 182          *
 183          * If this is LZ_MF_DEFAULT, the default algorithm for the specified
 184          * @max_window_size is used.
 185          */
 186         u32 algorithm;
 187
 188         /*
 189          * The maximum window size, in bytes, that shall be supported by the
 190          * match-finder.  This is the maximum size that can be passed to
 191          * subsequent calls to lz_mf_load_window().
 192          *
 193          * Note: this interface is intended to be used for block compression, so
 194          * none of the match-finding algorithms support sliding windows.  It's
 195          * expected that the window for LZ match-finding simply be the block of
 196          * data being compressed.
 197          *
 198          * Match-finders generally require an amount of memory proportional to
 199          * this parameter.  Use lz_mf_get_needed_memory() to query the needed
 200          * memory size for a specific match-finding algorithm and maximum window
 201          * size.
 202          *
 203          * This parameter cannot be 0; there is no default value.
 204          *
 205          * Match-finding algorithms may place additional restrictions on this
 206          * parameter.  However, currently only the LCP interval tree
 207          * match-finding algorithm places such a restriction (it doesn't support
 208          * windows larger than 33554432 bytes).
 209          */
 210         u32 max_window_size;
 211
 212         /*
 213          * The minimum length, in bytes, of matches that can be produced by the
 214          * match-finder (by a call to lz_mf_get_matches()).
 215          *
 216          * If this parameter is not 0, it must be 2 or greater.
 217          *
 218          * If this parameter is 0, the match-finding algorithm sets it to a
 219          * default value.  The default value will be at least 2 and at most 16.
 220          */
 221         u32 min_match_len;
 222
 223         /*
 224          * The maximum length, in bytes, of matches that can be produced by the
 225          * match-finder (by a call to lz_mf_get_matches()).
 226          *
 227          * If this parameter is not 0, it must be greater than or equal to
 228          * @min_match_len, or the default value the match-finding algorithm
 229          * selected for @min_match_len in the case that @min_match_len was
 230          * specified as 0.
 231          *
 232          * If this parameter is 0, the match-finding algorithm sets it to a
 233          * default value.  In general, the caller must be prepared to handle
 234          * arbitrarily long matches (up to the window size minus 1) in this
 235          * case.
 236          */
 237         u32 max_match_len;
 238
 239         /*
 240          * When using the hash chains or binary trees match-finding algorithm,
 241          * this parameter defines the maximum number of search steps at each
 242          * position.  A typical value to use is 32.  Higher values result in
 243          * better matches and slower performance.
 244          *
 245          * The suffix array-based match-finding algorithms treat this parameter
 246          * slightly differently because they find the longest matches first.
 247          * They still honor the intent of the parameter but may scale it down to
 248          * an appropriate value.
 249          *
 250          * If this parameter is 0, the match-finding algorithm sets it to a
 251          * default value.
 252          */
 253         u32 max_search_depth;
 254
 255         /*
 256          * When using the hash chains, binary trees, or LCP interval tree
 257          * match-finding algorithm, this parameter defines the maximum match
 258          * length to which the full algorithm will be applied.  This can also be
 259          * thought of as the length above which the algorithm will not try to
 260          * search for additional matches.
 261          *
 262          * Usually, setting this parameter to a reasonable value (such as 24,
 263          * 32, or 48) will speed up match-finding but will not hurt the
 264          * compression ratio too much.  This is because these settings of this
 265          * parameter cause the match-finder to not waste too much time examining
 266          * very long matches, which are already highly compressible.
 267          *
 268          * In addition, if the longest match exceeds this length, the
 269          * match-finding algorithm will still report its full length.
 270          *
 271          * The linked suffix array match-finding algorithm ignores this
 272          * parameter.
 273          *
 274          * If this parameter is 0, the match-finding algorithm sets it to a
 275          * default value.
 276          */
 277         u32 nice_match_len;
 278 };
 279
 280 /*
 281  * Lempel-Ziv match-finder operations structure.
 282  *
 283  * Match-finding algorithms must fill in all members.  None can be left as 0 or
 284  * NULL.
 285  *
 286  * Don't directly access any of the members outside of lz_mf.h and lz_mf.c.
 287  * Instead, use the lz_mf_*() wrappers.
 288  */
 289 struct lz_mf_ops {
 290         bool (*params_valid)(const struct lz_mf_params *);
 291
 292         u64 (*get_needed_memory)(u32 max_window_size);
 293
 294         bool (*init)(struct lz_mf *);
 295
 296         void (*load_window)(struct lz_mf *mf, const u8 *, u32);
 297
 298         u32 (*get_matches)(struct lz_mf *, struct lz_match *);
 299
 300         void (*skip_positions)(struct lz_mf *, u32);
 301
 302         void (*destroy)(struct lz_mf *);
 303
 304         size_t struct_size;
 305 };
 306
 307 /*
 308  * Lempel-Ziv match-finder structure.
 309  *
 310  * Match-finding algorithms must embed this structure inside a private
 311  * structure.
 312  *
 313  * Don't directly access any of the members outside of lz_mf.h, lz_mf.c, and
 314  * match-finding algorithms.  Instead, use the lz_mf_*() wrappers.
 315  */
 316 struct lz_mf {
 317         struct lz_mf_params params;
 318         struct lz_mf_ops ops;
 319         const u8 *cur_window;
 320         u32 cur_window_pos;
 321         u32 cur_window_size;
 322 };
 323
 324 extern bool
 325 lz_mf_params_valid(const struct lz_mf_params *params);
 326
 327 extern u64
 328 lz_mf_get_needed_memory(enum lz_mf_algo algorithm, u32 max_window_size);
 329
 330 extern struct lz_mf *
 331 lz_mf_alloc(const struct lz_mf_params *params);
 332
 333 extern void
 334 lz_mf_load_window(struct lz_mf *mf, const u8 *window, u32 size);
 335
 336 #ifdef ENABLE_LZ_DEBUG
 337 extern u32
 338 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches);
 339 #else
 340 /* See non-inline definition for comment  */
 341 static inline u32
 342 lz_mf_get_matches(struct lz_mf *mf, struct lz_match *matches)
 343 {
 344         return mf->ops.get_matches(mf, matches);
 345 }
 346 #endif
 347
 348 #ifdef ENABLE_LZ_DEBUG
 349 extern void
 350 lz_mf_skip_positions(struct lz_mf *mf, u32 n);
 351 #else
 352 /* See non-inline definition for comment  */
 353 static inline void
 354 lz_mf_skip_positions(struct lz_mf *mf, u32 n)
 355 {
 356         mf->ops.skip_positions(mf, n);
 357 }
 358 #endif
 359
 360 extern void
 361 lz_mf_free(struct lz_mf *mf);
 362
 363 /*
 364  * Returns the match-finder's current position in the window.
 365  *
 366  * The current position begins at 0.  It increases by 1 when lz_mf_get_matches()
 367  * is called, and by 'n' when lz_mf_skip_positions() is called.
 368  *
 369  * Note: The behavior is undefined if the match-finder is advanced beyond the
 370  * end of the window.  (If this happens in ENABLE_LZ_DEBUG mode, an assertion
 371  * will be triggered.)
 372  */
 373 static inline u32
 374 lz_mf_get_position(const struct lz_mf *mf)
 375 {
 376         return mf->cur_window_pos;
 377 }
 378
 379 /*
 380  * Returns the number of bytes remaining in the window.
 381  */
 382 static inline u32
 383 lz_mf_get_bytes_remaining(const struct lz_mf *mf)
 384 {
 385         return mf->cur_window_size - mf->cur_window_pos;
 386 }
 387
 388 /*
 389  * Returns a pointer to the current window, offset by the current position.
 390  * Equivalently, this returns a pointer to the byte sequence that the next call
 391  * to lz_mf_get_matches() will match against.
 392  */
 393 static inline const u8 *
 394 lz_mf_get_window_ptr(const struct lz_mf *mf)
 395 {
 396         return &mf->cur_window[mf->cur_window_pos];
 397 }
 398
 399 #endif /* _WIMLIB_LZ_MF_H */