wimlib.net Git - wimlib/blob - src/lzx-compress.c

   1 /*
   2  * lzx-compress.c
   3  *
   4  * A compressor that produces output compatible with the LZX compression format.
   5  */
   6
   7 /*
   8  * Copyright (C) 2012, 2013, 2014 Eric Biggers
   9  *
  10  * This file is part of wimlib, a library for working with WIM files.
  11  *
  12  * wimlib is free software; you can redistribute it and/or modify it under the
  13  * terms of the GNU General Public License as published by the Free
  14  * Software Foundation; either version 3 of the License, or (at your option)
  15  * any later version.
  16  *
  17  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
  18  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  19  * A PARTICULAR PURPOSE. See the GNU General Public License for more
  20  * details.
  21  *
  22  * You should have received a copy of the GNU General Public License
  23  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  24  */
  25
  26
  27 /*
  28  * This file contains a compressor for the LZX ("Lempel-Ziv eXtended"?)
  29  * compression format, as used in the WIM (Windows IMaging) file format.  This
  30  * code may need some slight modifications to be used outside of the WIM format.
  31  * In particular, in other situations the LZX block header might be slightly
  32  * different, and a sliding window rather than a fixed-size window might be
  33  * required.
  34  *
  35  * ----------------------------------------------------------------------------
  36  *
  37  *                               Format Overview
  38  *
  39  * The primary reference for LZX is the specification released by Microsoft.
  40  * However, the comments in lzx-decompress.c provide more information about LZX
  41  * and note some errors in the Microsoft specification.
  42  *
  43  * LZX shares many similarities with DEFLATE, the format used by zlib and gzip.
  44  * Both LZX and DEFLATE use LZ77 matching and Huffman coding.  Certain details
  45  * are quite similar, such as the method for storing Huffman codes.  However,
  46  * the main differences are:
  47  *
  48  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  49  *   compressible before attempting to compress it further.
  50  *
  51  * - LZX uses a "main" alphabet which combines literals and matches, with the
  52  *   match symbols containing a "length header" (giving all or part of the match
  53  *   length) and a "position slot" (giving, roughly speaking, the order of
  54  *   magnitude of the match offset).
  55  *
  56  * - LZX does not have static Huffman blocks (that is, the kind with preset
  57  *   Huffman codes); however it does have two types of dynamic Huffman blocks
  58  *   ("verbatim" and "aligned").
  59  *
  60  * - LZX has a minimum match length of 2 rather than 3.
  61  *
  62  * - In LZX, match offsets 0 through 2 actually represent entries in an LRU
  63  *   queue of match offsets.  This is very useful for certain types of files,
  64  *   such as binary files that have repeating records.
  65  *
  66  * ----------------------------------------------------------------------------
  67  *
  68  *                            Algorithmic Overview
  69  *
  70  * At a high level, any implementation of LZX compression must operate as
  71  * follows:
  72  *
  73  * 1. Preprocess the input data to translate the targets of 32-bit x86 call
  74  *    instructions to absolute offsets.  (Actually, this is required for WIM,
  75  *    but might not be in other places LZX is used.)
  76  *
  77  * 2. Find a sequence of LZ77-style matches and literal bytes that expands to
  78  *    the preprocessed data.
  79  *
  80  * 3. Divide the match/literal sequence into one or more LZX blocks, each of
  81  *    which may be "uncompressed", "verbatim", or "aligned".
  82  *
  83  * 4. Output each LZX block.
  84  *
  85  * Step (1) is fairly straightforward.  It requires looking for 0xe8 bytes in
  86  * the input data and performing a translation on the 4 bytes following each
  87  * one.
  88  *
  89  * Step (4) is complicated, but it is mostly determined by the LZX format.  The
  90  * only real choice we have is what algorithm to use to build the length-limited
  91  * canonical Huffman codes.  See lzx_write_all_blocks() for details.
  92  *
  93  * That leaves steps (2) and (3) as where all the hard stuff happens.  Focusing
  94  * on step (2), we need to do LZ77-style parsing on the input data, or "window",
  95  * to divide it into a sequence of matches and literals.  Each position in the
  96  * window might have multiple matches associated with it, and we need to choose
  97  * which one, if any, to actually use.  Therefore, the problem can really be
  98  * divided into two areas of concern: (a) finding matches at a given position,
  99  * which we shall call "match-finding", and (b) choosing whether to use a
 100  * match or a literal at a given position, and if using a match, which one (if
 101  * there is more than one available).  We shall call this "match-choosing".  We
 102  * first consider match-finding, then match-choosing.
 103  *
 104  * ----------------------------------------------------------------------------
 105  *
 106  *                               Match-finding
 107  *
 108  * Given a position in the window, we want to find LZ77-style "matches" with
 109  * that position at previous positions in the window.  With LZX, the minimum
 110  * match length is 2 and the maximum match length is 257.  The only restriction
 111  * on offsets is that LZX does not allow the last 2 bytes of the window to match
 112  * the beginning of the window.
 113  *
 114  * There are a number of algorithms that can be used for this, including hash
 115  * chains, binary trees, and suffix arrays.  Binary trees generally work well
 116  * for LZX compression since it uses medium-size windows (2^15 to 2^21 bytes).
 117  * However, when compressing in a fast mode where many positions are skipped
 118  * (not searched for matches), hash chains are faster.
 119  *
 120  * Since the match-finders are not specific to LZX, I will not explain them in
 121  * detail here.  Instead, see lz_hash_chains.c and lz_binary_trees.c.
 122  *
 123  * ----------------------------------------------------------------------------
 124  *
 125  *                               Match-choosing
 126  *
 127  * Usually, choosing the longest match is best because it encodes the most data
 128  * in that one item.  However, sometimes the longest match is not optimal
 129  * because (a) choosing a long match now might prevent using an even longer
 130  * match later, or (b) more generally, what we actually care about is the number
 131  * of bits it will ultimately take to output each match or literal, which is
 132  * actually dependent on the entropy encoding using by the underlying
 133  * compression format.  Consequently, a longer match usually, but not always,
 134  * takes fewer bits to encode than multiple shorter matches or literals that
 135  * cover the same data.
 136  *
 137  * This problem of choosing the truly best match/literal sequence is probably
 138  * impossible to solve efficiently when combined with entropy encoding.  If we
 139  * knew how many bits it takes to output each match/literal, then we could
 140  * choose the optimal sequence using shortest-path search a la Dijkstra's
 141  * algorithm.  However, with entropy encoding, the chosen match/literal sequence
 142  * affects its own encoding.  Therefore, we can't know how many bits it will
 143  * take to actually output any one match or literal until we have actually
 144  * chosen the full sequence of matches and literals.
 145  *
 146  * Notwithstanding the entropy encoding problem, we also aren't guaranteed to
 147  * choose the optimal match/literal sequence unless the match-finder (see
 148  * section "Match-finder") provides the match-chooser with all possible matches
 149  * at each position.  However, this is not computationally efficient.  For
 150  * example, there might be many matches of the same length, and usually (but not
 151  * always) the best choice is the one with the smallest offset.  So in practice,
 152  * it's fine to only consider the smallest offset for a given match length at a
 153  * given position.  (Actually, for LZX, it's also worth considering repeat
 154  * offsets.)
 155  *
 156  * In addition, as mentioned earlier, in LZX we have the choice of using
 157  * multiple blocks, each of which resets the Huffman codes.  This expands the
 158  * search space even further.  Therefore, to simplify the problem, we currently
 159  * we don't attempt to actually choose the LZX blocks based on the data.
 160  * Instead, we just divide the data into fixed-size blocks of LZX_DIV_BLOCK_SIZE
 161  * bytes each, and always use verbatim or aligned blocks (never uncompressed).
 162  * A previous version of this code recursively split the input data into
 163  * equal-sized blocks, up to a maximum depth, and chose the lowest-cost block
 164  * divisions.  However, this made compression much slower and did not actually
 165  * help very much.  It remains an open question whether a sufficiently fast and
 166  * useful block-splitting algorithm is possible for LZX.  Essentially the same
 167  * problem also applies to DEFLATE.  The Microsoft LZX compressor seemingly does
 168  * do block splitting, although I don't know how fast or useful it is,
 169  * specifically.
 170  *
 171  * Now, back to the entropy encoding problem.  The "solution" is to use an
 172  * iterative approach to compute a good, but not necessarily optimal,
 173  * match/literal sequence.  Start with a fixed assignment of symbol costs and
 174  * choose an "optimal" match/literal sequence based on those costs, using
 175  * shortest-path seach a la Dijkstra's algorithm.  Then, for each iteration of
 176  * the optimization, update the costs based on the entropy encoding of the
 177  * current match/literal sequence, then choose a new match/literal sequence
 178  * based on the updated costs.  Usually, the actual cost to output the current
 179  * match/literal sequence will decrease in each iteration until it converges on
 180  * a fixed point.  This result may not be the truly optimal match/literal
 181  * sequence, but it usually is much better than one chosen by doing a "greedy"
 182  * parse where we always chooe the longest match.
 183  *
 184  * An alternative to both greedy parsing and iterative, near-optimal parsing is
 185  * "lazy" parsing.  Briefly, "lazy" parsing considers just the longest match at
 186  * each position, but it waits to choose that match until it has also examined
 187  * the next position.  This is actually a useful approach; it's used by zlib,
 188  * for example.  Therefore, for fast compression we combine lazy parsing with
 189  * the hash chain max-finder.  For normal/high compression we combine
 190  * near-optimal parsing with the binary tree match-finder.
 191  */
 192
 193 #ifdef HAVE_CONFIG_H
 194 #  include "config.h"
 195 #endif
 196
 197 #include "wimlib/compressor_ops.h"
 198 #include "wimlib/compress_common.h"
 199 #include "wimlib/endianness.h"
 200 #include "wimlib/error.h"
 201 #include "wimlib/lz_mf.h"
 202 #include "wimlib/lzx.h"
 203 #include "wimlib/util.h"
 204 #include <string.h>
 205
 206 #define LZX_OPTIM_ARRAY_LENGTH  4096
 207
 208 #define LZX_DIV_BLOCK_SIZE      32768
 209
 210 #define LZX_CACHE_PER_POS       8
 211
 212 #define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
 213
 214 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
 215
 216 /* Codewords for the LZX main, length, and aligned offset Huffman codes  */
 217 struct lzx_codewords {
 218         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 219         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 220         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 221 };
 222
 223 /* Codeword lengths (in bits) for the LZX main, length, and aligned offset
 224  * Huffman codes.
 225  *
 226  * A 0 length means the codeword has zero frequency.
 227  */
 228 struct lzx_lens {
 229         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 230         u8 len[LZX_LENCODE_NUM_SYMBOLS];
 231         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 232 };
 233
 234 /* Costs for the LZX main, length, and aligned offset Huffman symbols.
 235  *
 236  * If a codeword has zero frequency, it must still be assigned some nonzero cost
 237  * --- generally a high cost, since even if it gets used in the next iteration,
 238  * it probably will not be used very many times.  */
 239 struct lzx_costs {
 240         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 241         u8 len[LZX_LENCODE_NUM_SYMBOLS];
 242         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 243 };
 244
 245 /* The LZX main, length, and aligned offset Huffman codes  */
 246 struct lzx_codes {
 247         struct lzx_codewords codewords;
 248         struct lzx_lens lens;
 249 };
 250
 251 /* Tables for tallying symbol frequencies in the three LZX alphabets  */
 252 struct lzx_freqs {
 253         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 254         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 255         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 256 };
 257
 258 /* LZX intermediate match/literal format  */
 259 struct lzx_item {
 260         /* Bit     Description
 261          *
 262          * 31      1 if a match, 0 if a literal.
 263          *
 264          * 30-25   position slot.  This can be at most 50, so it will fit in 6
 265          *         bits.
 266          *
 267          * 8-24    position footer.  This is the offset of the real formatted
 268          *         offset from the position base.  This can be at most 17 bits
 269          *         (since lzx_extra_bits[LZX_MAX_POSITION_SLOTS - 1] is 17).
 270          *
 271          * 0-7     length of match, minus 2.  This can be at most
 272          *         (LZX_MAX_MATCH_LEN - 2) == 255, so it will fit in 8 bits.  */
 273         u32 data;
 274 };
 275
 276 /* Specification for an LZX block.  */
 277 struct lzx_block_spec {
 278
 279         /* One of the LZX_BLOCKTYPE_* constants indicating which type of this
 280          * block.  */
 281         int block_type;
 282
 283         /* 0-based position in the window at which this block starts.  */
 284         u32 window_pos;
 285
 286         /* The number of bytes of uncompressed data this block represents.  */
 287         u32 block_size;
 288
 289         /* The match/literal sequence for this block.  */
 290         struct lzx_item *chosen_items;
 291
 292         /* The length of the @chosen_items sequence.  */
 293         u32 num_chosen_items;
 294
 295         /* Huffman codes for this block.  */
 296         struct lzx_codes codes;
 297 };
 298
 299 struct lzx_compressor;
 300
 301 struct lzx_compressor_params {
 302         struct lz_match (*choose_item_func)(struct lzx_compressor *);
 303         enum lz_mf_algo mf_algo;
 304         u32 num_optim_passes;
 305         u32 min_match_length;
 306         u32 nice_match_length;
 307         u32 max_search_depth;
 308 };
 309
 310 /* State of the LZX compressor.  */
 311 struct lzx_compressor {
 312
 313         /* The buffer of data to be compressed.
 314          *
 315          * 0xe8 byte preprocessing is done directly on the data here before
 316          * further compression.
 317          *
 318          * Note that this compressor does *not* use a real sliding window!!!!
 319          * It's not needed in the WIM format, since every chunk is compressed
 320          * independently.  This is by design, to allow random access to the
 321          * chunks.  */
 322         u8 *cur_window;
 323
 324         /* Number of bytes of data to be compressed, which is the number of
 325          * bytes of data in @cur_window that are actually valid.  */
 326         u32 cur_window_size;
 327
 328         /* Allocated size of @cur_window.  */
 329         u32 max_window_size;
 330
 331         /* log2 order of the LZX window size for LZ match offset encoding
 332          * purposes.  Will be >= LZX_MIN_WINDOW_ORDER and <=
 333          * LZX_MAX_WINDOW_ORDER.
 334          *
 335          * Note: 1 << @window_order is normally equal to @max_window_size, but
 336          * it will be greater than @max_window_size in the event that the
 337          * compressor was created with a non-power-of-2 block size.  (See
 338          * lzx_get_window_order().)  */
 339         unsigned window_order;
 340
 341         /* Compression parameters.  */
 342         struct lzx_compressor_params params;
 343
 344         unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **);
 345         void (*skip_bytes_func)(struct lzx_compressor *, unsigned n);
 346
 347         /* Number of symbols in the main alphabet (depends on the @window_order
 348          * since it determines the maximum allowed offset).  */
 349         unsigned num_main_syms;
 350
 351         /* The current match offset LRU queue.  */
 352         struct lzx_lru_queue queue;
 353
 354         /* Space for the sequences of matches/literals that were chosen for each
 355          * block.  */
 356         struct lzx_item *chosen_items;
 357
 358         /* Information about the LZX blocks the preprocessed input was divided
 359          * into.  */
 360         struct lzx_block_spec *block_specs;
 361
 362         /* Number of LZX blocks the input was divided into; a.k.a. the number of
 363          * elements of @block_specs that are valid.  */
 364         unsigned num_blocks;
 365
 366         /* This is simply filled in with zeroes and used to avoid special-casing
 367          * the output of the first compressed Huffman code, which conceptually
 368          * has a delta taken from a code with all symbols having zero-length
 369          * codewords.  */
 370         struct lzx_codes zero_codes;
 371
 372         /* The current cost model.  */
 373         struct lzx_costs costs;
 374
 375         /* Lempel-Ziv match-finder.  */
 376         struct lz_mf *mf;
 377
 378         /* Position in window of next match to return.  */
 379         u32 match_window_pos;
 380
 381         /* The end-of-block position.  We can't allow any matches to span this
 382          * position.  */
 383         u32 match_window_end;
 384
 385         /* When doing more than one match-choosing pass over the data, matches
 386          * found by the match-finder are cached in the following array to
 387          * achieve a slight speedup when the same matches are needed on
 388          * subsequent passes.  This is suboptimal because different matches may
 389          * be preferred with different cost models, but seems to be a worthwhile
 390          * speedup.  */
 391         struct lz_match *cached_matches;
 392         struct lz_match *cache_ptr;
 393         struct lz_match *cache_limit;
 394
 395         /* Match-chooser state, used when doing near-optimal parsing.
 396          *
 397          * When matches have been chosen, optimum_cur_idx is set to the position
 398          * in the window of the next match/literal to return and optimum_end_idx
 399          * is set to the position in the window at the end of the last
 400          * match/literal to return.  */
 401         struct lzx_mc_pos_data *optimum;
 402         unsigned optimum_cur_idx;
 403         unsigned optimum_end_idx;
 404
 405         /* Previous match, used when doing lazy parsing.  */
 406         struct lz_match prev_match;
 407 };
 408
 409 /*
 410  * Match chooser position data:
 411  *
 412  * An array of these structures is used during the match-choosing algorithm.
 413  * They correspond to consecutive positions in the window and are used to keep
 414  * track of the cost to reach each position, and the match/literal choices that
 415  * need to be chosen to reach that position.
 416  */
 417 struct lzx_mc_pos_data {
 418         /* The approximate minimum cost, in bits, to reach this position in the
 419          * window which has been found so far.  */
 420         u32 cost;
 421 #define MC_INFINITE_COST ((u32)~0UL)
 422
 423         /* The union here is just for clarity, since the fields are used in two
 424          * slightly different ways.  Initially, the @prev structure is filled in
 425          * first, and links go from later in the window to earlier in the
 426          * window.  Later, @next structure is filled in and links go from
 427          * earlier in the window to later in the window.  */
 428         union {
 429                 struct {
 430                         /* Position of the start of the match or literal that
 431                          * was taken to get to this position in the approximate
 432                          * minimum-cost parse.  */
 433                         u32 link;
 434
 435                         /* Offset (as in an LZ (length, offset) pair) of the
 436                          * match or literal that was taken to get to this
 437                          * position in the approximate minimum-cost parse.  */
 438                         u32 match_offset;
 439                 } prev;
 440                 struct {
 441                         /* Position at which the match or literal starting at
 442                          * this position ends in the minimum-cost parse.  */
 443                         u32 link;
 444
 445                         /* Offset (as in an LZ (length, offset) pair) of the
 446                          * match or literal starting at this position in the
 447                          * approximate minimum-cost parse.  */
 448                         u32 match_offset;
 449                 } next;
 450         };
 451
 452         /* Adaptive state that exists after an approximate minimum-cost path to
 453          * reach this position is taken.
 454          *
 455          * Note: we update this whenever we update the pending minimum-cost
 456          * path.  This is in contrast to LZMA, which also has an optimal parser
 457          * that maintains a repeat offset queue per position, but will only
 458          * compute the queue once that position is actually reached in the
 459          * parse, meaning that matches are being considered *starting* at that
 460          * position.  However, the two methods seem to have approximately the
 461          * same performance if appropriate optimizations are used.  Intuitively
 462          * the LZMA method seems faster, but it actually suffers from 1-2 extra
 463          * hard-to-predict branches at each position.  Probably it works better
 464          * for LZMA than LZX because LZMA has a larger adaptive state than LZX,
 465          * and the LZMA encoder considers more possibilities.  */
 466         struct lzx_lru_queue queue;
 467 };
 468
 469
 470 /*
 471  * Structure to keep track of the current state of sending bits to the
 472  * compressed output buffer.
 473  *
 474  * The LZX bitstream is encoded as a sequence of 16-bit coding units.
 475  */
 476 struct lzx_output_bitstream {
 477
 478         /* Bits that haven't yet been written to the output buffer.  */
 479         u32 bitbuf;
 480
 481         /* Number of bits currently held in @bitbuf.  */
 482         u32 bitcount;
 483
 484         /* Pointer to the start of the output buffer.  */
 485         le16 *start;
 486
 487         /* Pointer to the position in the output buffer at which the next coding
 488          * unit should be written.  */
 489         le16 *next;
 490
 491         /* Pointer past the end of the output buffer.  */
 492         le16 *end;
 493 };
 494
 495 /*
 496  * Initialize the output bitstream.
 497  *
 498  * @os
 499  *      The output bitstream structure to initialize.
 500  * @buffer
 501  *      The buffer being written to.
 502  * @size
 503  *      Size of @buffer, in bytes.
 504  */
 505 static void
 506 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
 507 {
 508         os->bitbuf = 0;
 509         os->bitcount = 0;
 510         os->start = buffer;
 511         os->next = os->start;
 512         os->end = os->start + size / sizeof(le16);
 513 }
 514
 515 /*
 516  * Write some bits to the output bitstream.
 517  *
 518  * The bits are given by the low-order @num_bits bits of @bits.  Higher-order
 519  * bits in @bits cannot be set.  At most 17 bits can be written at once.
 520  *
 521  * @max_bits is a compile-time constant that specifies the maximum number of
 522  * bits that can ever be written at the call site.  Currently, it is used to
 523  * optimize away the conditional code for writing a second 16-bit coding unit
 524  * when writing fewer than 17 bits.
 525  *
 526  * If the output buffer space is exhausted, then the bits will be ignored, and
 527  * lzx_flush_output() will return 0 when it gets called.
 528  */
 529 static _always_inline_attribute void
 530 lzx_write_varbits(struct lzx_output_bitstream *os,
 531                   const u32 bits, const unsigned int num_bits,
 532                   const unsigned int max_num_bits)
 533 {
 534         /* This code is optimized for LZX, which never needs to write more than
 535          * 17 bits at once.  */
 536         LZX_ASSERT(num_bits <= 17);
 537         LZX_ASSERT(num_bits <= max_num_bits);
 538         LZX_ASSERT(os->bitcount <= 15);
 539
 540         /* Add the bits to the bit buffer variable.  @bitcount will be at most
 541          * 15, so there will be just enough space for the maximum possible
 542          * @num_bits of 17.  */
 543         os->bitcount += num_bits;
 544         os->bitbuf = (os->bitbuf << num_bits) | bits;
 545
 546         /* Check whether any coding units need to be written.  */
 547         if (os->bitcount >= 16) {
 548
 549                 os->bitcount -= 16;
 550
 551                 /* Write a coding unit, unless it would overflow the buffer.  */
 552                 if (os->next != os->end)
 553                         *os->next++ = cpu_to_le16(os->bitbuf >> os->bitcount);
 554
 555                 /* If writing 17 bits, a second coding unit might need to be
 556                  * written.  But because 'max_num_bits' is a compile-time
 557                  * constant, the compiler will optimize away this code at most
 558                  * call sites.  */
 559                 if (max_num_bits == 17 && os->bitcount == 16) {
 560                         if (os->next != os->end)
 561                                 *os->next++ = cpu_to_le16(os->bitbuf);
 562                         os->bitcount = 0;
 563                 }
 564         }
 565 }
 566
 567 /* Use when @num_bits is a compile-time constant.  Otherwise use
 568  * lzx_write_varbits().  */
 569 static _always_inline_attribute void
 570 lzx_write_bits(struct lzx_output_bitstream *os,
 571                const u32 bits, const unsigned int num_bits)
 572 {
 573         lzx_write_varbits(os, bits, num_bits, num_bits);
 574 }
 575
 576 /*
 577  * Flush the last coding unit to the output buffer if needed.  Return the total
 578  * number of bytes written to the output buffer, or 0 if an overflow occurred.
 579  */
 580 static u32
 581 lzx_flush_output(struct lzx_output_bitstream *os)
 582 {
 583         if (os->next == os->end)
 584                 return 0;
 585
 586         if (os->bitcount != 0)
 587                 *os->next++ = cpu_to_le16(os->bitbuf << (16 - os->bitcount));
 588
 589         return (const u8 *)os->next - (const u8 *)os->start;
 590 }
 591
 592 /* Returns the LZX position slot that corresponds to a given match offset,
 593  * taking into account the recent offset queue and updating it if the offset is
 594  * found in it.  */
 595 static unsigned
 596 lzx_get_position_slot(u32 offset, struct lzx_lru_queue *queue)
 597 {
 598         unsigned position_slot;
 599
 600         /* See if the offset was recently used.  */
 601         for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
 602                 if (offset == queue->R[i]) {
 603                         /* Found it.  */
 604
 605                         /* Bring the repeat offset to the front of the
 606                          * queue.  Note: this is, in fact, not a real
 607                          * LRU queue because repeat matches are simply
 608                          * swapped to the front.  */
 609                         swap(queue->R[0], queue->R[i]);
 610
 611                         /* The resulting position slot is simply the first index
 612                          * at which the offset was found in the queue.  */
 613                         return i;
 614                 }
 615         }
 616
 617         /* The offset was not recently used; look up its real position slot.  */
 618         position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
 619
 620         /* Bring the new offset to the front of the queue.  */
 621         for (int i = LZX_NUM_RECENT_OFFSETS - 1; i > 0; i--)
 622                 queue->R[i] = queue->R[i - 1];
 623         queue->R[0] = offset;
 624
 625         return position_slot;
 626 }
 627
 628 /* Build the main, length, and aligned offset Huffman codes used in LZX.
 629  *
 630  * This takes as input the frequency tables for each code and produces as output
 631  * a set of tables that map symbols to codewords and codeword lengths.  */
 632 static void
 633 lzx_make_huffman_codes(const struct lzx_freqs *freqs,
 634                        struct lzx_codes *codes,
 635                        unsigned num_main_syms)
 636 {
 637         make_canonical_huffman_code(num_main_syms,
 638                                     LZX_MAX_MAIN_CODEWORD_LEN,
 639                                     freqs->main,
 640                                     codes->lens.main,
 641                                     codes->codewords.main);
 642
 643         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
 644                                     LZX_MAX_LEN_CODEWORD_LEN,
 645                                     freqs->len,
 646                                     codes->lens.len,
 647                                     codes->codewords.len);
 648
 649         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
 650                                     LZX_MAX_ALIGNED_CODEWORD_LEN,
 651                                     freqs->aligned,
 652                                     codes->lens.aligned,
 653                                     codes->codewords.aligned);
 654 }
 655
 656 /*
 657  * Output a precomputed LZX match.
 658  *
 659  * @os:
 660  *      The bitstream to which to write the match.
 661  * @block_type:
 662  *      The type of the LZX block (LZX_BLOCKTYPE_ALIGNED or
 663  *      LZX_BLOCKTYPE_VERBATIM)
 664  * @match:
 665  *      The match data.
 666  * @codes:
 667  *      Pointer to a structure that contains the codewords for the main, length,
 668  *      and aligned offset Huffman codes for the current LZX compressed block.
 669  */
 670 static void
 671 lzx_write_match(struct lzx_output_bitstream *os, int block_type,
 672                 struct lzx_item match, const struct lzx_codes *codes)
 673 {
 674         unsigned match_len_minus_2 = match.data & 0xff;
 675         u32 position_footer = (match.data >> 8) & 0x1ffff;
 676         unsigned position_slot = (match.data >> 25) & 0x3f;
 677         unsigned len_header;
 678         unsigned len_footer;
 679         unsigned main_symbol;
 680         unsigned num_extra_bits;
 681
 682         /* If the match length is less than MIN_MATCH_LEN (= 2) +
 683          * NUM_PRIMARY_LENS (= 7), the length header contains the match length
 684          * minus MIN_MATCH_LEN, and there is no length footer.
 685          *
 686          * Otherwise, the length header contains NUM_PRIMARY_LENS, and the
 687          * length footer contains the match length minus NUM_PRIMARY_LENS minus
 688          * MIN_MATCH_LEN. */
 689         if (match_len_minus_2 < LZX_NUM_PRIMARY_LENS) {
 690                 len_header = match_len_minus_2;
 691         } else {
 692                 len_header = LZX_NUM_PRIMARY_LENS;
 693                 len_footer = match_len_minus_2 - LZX_NUM_PRIMARY_LENS;
 694         }
 695
 696         /* Combine the position slot with the length header into a single symbol
 697          * that will be encoded with the main code.
 698          *
 699          * The actual main symbol is offset by LZX_NUM_CHARS because values
 700          * under LZX_NUM_CHARS are used to indicate a literal byte rather than a
 701          * match.  */
 702         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
 703
 704         /* Output main symbol. */
 705         lzx_write_varbits(os, codes->codewords.main[main_symbol],
 706                           codes->lens.main[main_symbol],
 707                           LZX_MAX_MAIN_CODEWORD_LEN);
 708
 709         /* If there is a length footer, output it using the
 710          * length Huffman code. */
 711         if (len_header == LZX_NUM_PRIMARY_LENS) {
 712                 lzx_write_varbits(os, codes->codewords.len[len_footer],
 713                                   codes->lens.len[len_footer],
 714                                   LZX_MAX_LEN_CODEWORD_LEN);
 715         }
 716
 717         /* Output the position footer.  */
 718
 719         num_extra_bits = lzx_get_num_extra_bits(position_slot);
 720
 721         if ((block_type == LZX_BLOCKTYPE_ALIGNED) && (num_extra_bits >= 3)) {
 722
 723                 /* Aligned offset blocks: The low 3 bits of the position footer
 724                  * are Huffman-encoded using the aligned offset code.  The
 725                  * remaining bits are output literally.  */
 726
 727                 lzx_write_varbits(os,
 728                                   position_footer >> 3, num_extra_bits - 3, 14);
 729
 730                 lzx_write_varbits(os,
 731                                   codes->codewords.aligned[position_footer & 7],
 732                                   codes->lens.aligned[position_footer & 7],
 733                                   LZX_MAX_ALIGNED_CODEWORD_LEN);
 734         } else {
 735                 /* Verbatim blocks, or fewer than 3 extra bits:  All position
 736                  * footer bits are output literally.  */
 737                 lzx_write_varbits(os, position_footer, num_extra_bits, 17);
 738         }
 739 }
 740
 741 /* Output an LZX literal (encoded with the main Huffman code).  */
 742 static void
 743 lzx_write_literal(struct lzx_output_bitstream *os, unsigned literal,
 744                   const struct lzx_codes *codes)
 745 {
 746         lzx_write_varbits(os, codes->codewords.main[literal],
 747                           codes->lens.main[literal], LZX_MAX_MAIN_CODEWORD_LEN);
 748 }
 749
 750 static unsigned
 751 lzx_build_precode(const u8 lens[restrict],
 752                   const u8 prev_lens[restrict],
 753                   const unsigned num_syms,
 754                   u32 precode_freqs[restrict LZX_PRECODE_NUM_SYMBOLS],
 755                   u8 output_syms[restrict num_syms],
 756                   u8 precode_lens[restrict LZX_PRECODE_NUM_SYMBOLS],
 757                   u32 precode_codewords[restrict LZX_PRECODE_NUM_SYMBOLS],
 758                   unsigned *num_additional_bits_ret)
 759 {
 760         memset(precode_freqs, 0,
 761                LZX_PRECODE_NUM_SYMBOLS * sizeof(precode_freqs[0]));
 762
 763         /* Since the code word lengths use a form of RLE encoding, the goal here
 764          * is to find each run of identical lengths when going through them in
 765          * symbol order (including runs of length 1).  For each run, as many
 766          * lengths are encoded using RLE as possible, and the rest are output
 767          * literally.
 768          *
 769          * output_syms[] will be filled in with the length symbols that will be
 770          * output, including RLE codes, not yet encoded using the precode.
 771          *
 772          * cur_run_len keeps track of how many code word lengths are in the
 773          * current run of identical lengths.  */
 774         unsigned output_syms_idx = 0;
 775         unsigned cur_run_len = 1;
 776         unsigned num_additional_bits = 0;
 777         for (unsigned i = 1; i <= num_syms; i++) {
 778
 779                 if (i != num_syms && lens[i] == lens[i - 1]) {
 780                         /* Still in a run--- keep going. */
 781                         cur_run_len++;
 782                         continue;
 783                 }
 784
 785                 /* Run ended! Check if it is a run of zeroes or a run of
 786                  * nonzeroes. */
 787
 788                 /* The symbol that was repeated in the run--- not to be confused
 789                  * with the length *of* the run (cur_run_len) */
 790                 unsigned len_in_run = lens[i - 1];
 791
 792                 if (len_in_run == 0) {
 793                         /* A run of 0's.  Encode it in as few length
 794                          * codes as we can. */
 795
 796                         /* The magic length 18 indicates a run of 20 + n zeroes,
 797                          * where n is an uncompressed literal 5-bit integer that
 798                          * follows the magic length. */
 799                         while (cur_run_len >= 20) {
 800                                 unsigned additional_bits;
 801
 802                                 additional_bits = min(cur_run_len - 20, 0x1f);
 803                                 num_additional_bits += 5;
 804                                 precode_freqs[18]++;
 805                                 output_syms[output_syms_idx++] = 18;
 806                                 output_syms[output_syms_idx++] = additional_bits;
 807                                 cur_run_len -= 20 + additional_bits;
 808                         }
 809
 810                         /* The magic length 17 indicates a run of 4 + n zeroes,
 811                          * where n is an uncompressed literal 4-bit integer that
 812                          * follows the magic length. */
 813                         while (cur_run_len >= 4) {
 814                                 unsigned additional_bits;
 815
 816                                 additional_bits = min(cur_run_len - 4, 0xf);
 817                                 num_additional_bits += 4;
 818                                 precode_freqs[17]++;
 819                                 output_syms[output_syms_idx++] = 17;
 820                                 output_syms[output_syms_idx++] = additional_bits;
 821                                 cur_run_len -= 4 + additional_bits;
 822                         }
 823
 824                 } else {
 825
 826                         /* A run of nonzero lengths. */
 827
 828                         /* The magic length 19 indicates a run of 4 + n
 829                          * nonzeroes, where n is a literal bit that follows the
 830                          * magic length, and where the value of the lengths in
 831                          * the run is given by an extra length symbol, encoded
 832                          * with the precode, that follows the literal bit.
 833                          *
 834                          * The extra length symbol is encoded as a difference
 835                          * from the length of the codeword for the first symbol
 836                          * in the run in the previous code.
 837                          * */
 838                         while (cur_run_len >= 4) {
 839                                 unsigned additional_bits;
 840                                 signed char delta;
 841
 842                                 additional_bits = (cur_run_len > 4);
 843                                 num_additional_bits += 1;
 844                                 delta = (signed char)prev_lens[i - cur_run_len] -
 845                                         (signed char)len_in_run;
 846                                 if (delta < 0)
 847                                         delta += 17;
 848                                 precode_freqs[19]++;
 849                                 precode_freqs[(unsigned char)delta]++;
 850                                 output_syms[output_syms_idx++] = 19;
 851                                 output_syms[output_syms_idx++] = additional_bits;
 852                                 output_syms[output_syms_idx++] = delta;
 853                                 cur_run_len -= 4 + additional_bits;
 854                         }
 855                 }
 856
 857                 /* Any remaining lengths in the run are outputted without RLE,
 858                  * as a difference from the length of that codeword in the
 859                  * previous code. */
 860                 while (cur_run_len > 0) {
 861                         signed char delta;
 862
 863                         delta = (signed char)prev_lens[i - cur_run_len] -
 864                                 (signed char)len_in_run;
 865                         if (delta < 0)
 866                                 delta += 17;
 867
 868                         precode_freqs[(unsigned char)delta]++;
 869                         output_syms[output_syms_idx++] = delta;
 870                         cur_run_len--;
 871                 }
 872
 873                 cur_run_len = 1;
 874         }
 875
 876         /* Build the precode from the frequencies of the length symbols. */
 877
 878         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
 879                                     LZX_MAX_PRE_CODEWORD_LEN,
 880                                     precode_freqs, precode_lens,
 881                                     precode_codewords);
 882
 883         *num_additional_bits_ret = num_additional_bits;
 884
 885         return output_syms_idx;
 886 }
 887
 888 /*
 889  * Output a Huffman code in the compressed form used in LZX.
 890  *
 891  * The Huffman code is represented in the output as a logical series of codeword
 892  * lengths from which the Huffman code, which must be in canonical form, can be
 893  * reconstructed.
 894  *
 895  * The codeword lengths are themselves compressed using a separate Huffman code,
 896  * the "precode", which contains a symbol for each possible codeword length in
 897  * the larger code as well as several special symbols to represent repeated
 898  * codeword lengths (a form of run-length encoding).  The precode is itself
 899  * constructed in canonical form, and its codeword lengths are represented
 900  * literally in 20 4-bit fields that immediately precede the compressed codeword
 901  * lengths of the larger code.
 902  *
 903  * Furthermore, the codeword lengths of the larger code are actually represented
 904  * as deltas from the codeword lengths of the corresponding code in the previous
 905  * block.
 906  *
 907  * @os:
 908  *      Bitstream to which to write the compressed Huffman code.
 909  * @lens:
 910  *      The codeword lengths, indexed by symbol, in the Huffman code.
 911  * @prev_lens:
 912  *      The codeword lengths, indexed by symbol, in the corresponding Huffman
 913  *      code in the previous block, or all zeroes if this is the first block.
 914  * @num_syms:
 915  *      The number of symbols in the Huffman code.
 916  */
 917 static void
 918 lzx_write_compressed_code(struct lzx_output_bitstream *os,
 919                           const u8 lens[restrict],
 920                           const u8 prev_lens[restrict],
 921                           unsigned num_syms)
 922 {
 923         u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
 924         u8 output_syms[num_syms];
 925         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
 926         u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
 927         unsigned i;
 928         unsigned num_output_syms;
 929         u8 precode_sym;
 930         unsigned dummy;
 931
 932         num_output_syms = lzx_build_precode(lens,
 933                                             prev_lens,
 934                                             num_syms,
 935                                             precode_freqs,
 936                                             output_syms,
 937                                             precode_lens,
 938                                             precode_codewords,
 939                                             &dummy);
 940
 941         /* Write the lengths of the precode codes to the output. */
 942         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 943                 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
 944
 945         /* Write the length symbols, encoded with the precode, to the output. */
 946
 947         for (i = 0; i < num_output_syms; ) {
 948                 precode_sym = output_syms[i++];
 949
 950                 lzx_write_varbits(os, precode_codewords[precode_sym],
 951                                   precode_lens[precode_sym],
 952                                   LZX_MAX_PRE_CODEWORD_LEN);
 953                 switch (precode_sym) {
 954                 case 17:
 955                         lzx_write_bits(os, output_syms[i++], 4);
 956                         break;
 957                 case 18:
 958                         lzx_write_bits(os, output_syms[i++], 5);
 959                         break;
 960                 case 19:
 961                         lzx_write_bits(os, output_syms[i++], 1);
 962                         lzx_write_varbits(os, precode_codewords[output_syms[i]],
 963                                           precode_lens[output_syms[i]],
 964                                           LZX_MAX_PRE_CODEWORD_LEN);
 965                         i++;
 966                         break;
 967                 default:
 968                         break;
 969                 }
 970         }
 971 }
 972
 973 /*
 974  * Write all matches and literal bytes (which were precomputed) in an LZX
 975  * compressed block to the output bitstream in the final compressed
 976  * representation.
 977  *
 978  * @os
 979  *      The output bitstream.
 980  * @block_type
 981  *      The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
 982  *      LZX_BLOCKTYPE_VERBATIM).
 983  * @items
 984  *      The array of matches/literals to output.
 985  * @num_items
 986  *      Number of matches/literals to output (length of @items).
 987  * @codes
 988  *      The main, length, and aligned offset Huffman codes for the current
 989  *      LZX compressed block.
 990  */
 991 static void
 992 lzx_write_items(struct lzx_output_bitstream *os, int block_type,
 993                 const struct lzx_item items[], u32 num_items,
 994                 const struct lzx_codes *codes)
 995 {
 996         for (u32 i = 0; i < num_items; i++) {
 997                 /* The high bit of the 32-bit intermediate representation
 998                  * indicates whether the item is an actual LZ-style match (1) or
 999                  * a literal byte (0).  */
1000                 if (items[i].data & 0x80000000)
1001                         lzx_write_match(os, block_type, items[i], codes);
1002                 else
1003                         lzx_write_literal(os, items[i].data, codes);
1004         }
1005 }
1006
1007 /* Write an LZX aligned offset or verbatim block to the output.  */
1008 static void
1009 lzx_write_compressed_block(int block_type,
1010                            u32 block_size,
1011                            unsigned window_order,
1012                            unsigned num_main_syms,
1013                            struct lzx_item * chosen_items,
1014                            u32 num_chosen_items,
1015                            const struct lzx_codes * codes,
1016                            const struct lzx_codes * prev_codes,
1017                            struct lzx_output_bitstream * os)
1018 {
1019         LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED ||
1020                    block_type == LZX_BLOCKTYPE_VERBATIM);
1021
1022         /* The first three bits indicate the type of block and are one of the
1023          * LZX_BLOCKTYPE_* constants.  */
1024         lzx_write_bits(os, block_type, 3);
1025
1026         /* Output the block size.
1027          *
1028          * The original LZX format seemed to always encode the block size in 3
1029          * bytes.  However, the implementation in WIMGAPI, as used in WIM files,
1030          * uses the first bit to indicate whether the block is the default size
1031          * (32768) or a different size given explicitly by the next 16 bits.
1032          *
1033          * By default, this compressor uses a window size of 32768 and therefore
1034          * follows the WIMGAPI behavior.  However, this compressor also supports
1035          * window sizes greater than 32768 bytes, which do not appear to be
1036          * supported by WIMGAPI.  In such cases, we retain the default size bit
1037          * to mean a size of 32768 bytes but output non-default block size in 24
1038          * bits rather than 16.  The compatibility of this behavior is unknown
1039          * because WIMs created with chunk size greater than 32768 can seemingly
1040          * only be opened by wimlib anyway.  */
1041         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1042                 lzx_write_bits(os, 1, 1);
1043         } else {
1044                 lzx_write_bits(os, 0, 1);
1045
1046                 if (window_order >= 16)
1047                         lzx_write_bits(os, block_size >> 16, 8);
1048
1049                 lzx_write_bits(os, block_size & 0xFFFF, 16);
1050         }
1051
1052         /* Output the aligned offset code.  */
1053         if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1054                 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1055                         lzx_write_bits(os, codes->lens.aligned[i],
1056                                        LZX_ALIGNEDCODE_ELEMENT_SIZE);
1057                 }
1058         }
1059
1060         /* Output the main code (two parts).  */
1061         lzx_write_compressed_code(os, codes->lens.main,
1062                                   prev_codes->lens.main,
1063                                   LZX_NUM_CHARS);
1064         lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1065                                   prev_codes->lens.main + LZX_NUM_CHARS,
1066                                   num_main_syms - LZX_NUM_CHARS);
1067
1068         /* Output the length code.  */
1069         lzx_write_compressed_code(os, codes->lens.len,
1070                                   prev_codes->lens.len,
1071                                   LZX_LENCODE_NUM_SYMBOLS);
1072
1073         /* Output the compressed matches and literals.  */
1074         lzx_write_items(os, block_type, chosen_items, num_chosen_items, codes);
1075 }
1076
1077 /* Write out the LZX blocks that were computed.  */
1078 static void
1079 lzx_write_all_blocks(struct lzx_compressor *c, struct lzx_output_bitstream *os)
1080 {
1081
1082         const struct lzx_codes *prev_codes = &c->zero_codes;
1083         for (unsigned i = 0; i < c->num_blocks; i++) {
1084                 const struct lzx_block_spec *spec = &c->block_specs[i];
1085
1086                 lzx_write_compressed_block(spec->block_type,
1087                                            spec->block_size,
1088                                            c->window_order,
1089                                            c->num_main_syms,
1090                                            spec->chosen_items,
1091                                            spec->num_chosen_items,
1092                                            &spec->codes,
1093                                            prev_codes,
1094                                            os);
1095
1096                 prev_codes = &spec->codes;
1097         }
1098 }
1099
1100 /* Constructs an LZX match from a literal byte and updates the main code symbol
1101  * frequencies.  */
1102 static inline u32
1103 lzx_tally_literal(u8 lit, struct lzx_freqs *freqs)
1104 {
1105         freqs->main[lit]++;
1106         return (u32)lit;
1107 }
1108
1109 /* Constructs an LZX match from an offset and a length, and updates the LRU
1110  * queue and the frequency of symbols in the main, length, and aligned offset
1111  * alphabets.  The return value is a 32-bit number that provides the match in an
1112  * intermediate representation documented below.  */
1113 static inline u32
1114 lzx_tally_match(unsigned match_len, u32 match_offset,
1115                 struct lzx_freqs *freqs, struct lzx_lru_queue *queue)
1116 {
1117         unsigned position_slot;
1118         u32 position_footer;
1119         u32 len_header;
1120         unsigned main_symbol;
1121         unsigned len_footer;
1122         unsigned adjusted_match_len;
1123
1124         LZX_ASSERT(match_len >= LZX_MIN_MATCH_LEN && match_len <= LZX_MAX_MATCH_LEN);
1125
1126         /* The match offset shall be encoded as a position slot (itself encoded
1127          * as part of the main symbol) and a position footer.  */
1128         position_slot = lzx_get_position_slot(match_offset, queue);
1129         position_footer = (match_offset + LZX_OFFSET_OFFSET) &
1130                                 (((u32)1 << lzx_get_num_extra_bits(position_slot)) - 1);
1131
1132         /* The match length shall be encoded as a length header (itself encoded
1133          * as part of the main symbol) and an optional length footer.  */
1134         adjusted_match_len = match_len - LZX_MIN_MATCH_LEN;
1135         if (adjusted_match_len < LZX_NUM_PRIMARY_LENS) {
1136                 /* No length footer needed.  */
1137                 len_header = adjusted_match_len;
1138         } else {
1139                 /* Length footer needed.  It will be encoded using the length
1140                  * code.  */
1141                 len_header = LZX_NUM_PRIMARY_LENS;
1142                 len_footer = adjusted_match_len - LZX_NUM_PRIMARY_LENS;
1143                 freqs->len[len_footer]++;
1144         }
1145
1146         /* Account for the main symbol.  */
1147         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1148
1149         freqs->main[main_symbol]++;
1150
1151         /* In an aligned offset block, 3 bits of the position footer are output
1152          * as an aligned offset symbol.  Account for this, although we may
1153          * ultimately decide to output the block as verbatim.  */
1154
1155         /* The following check is equivalent to:
1156          *
1157          * if (lzx_extra_bits[position_slot] >= 3)
1158          *
1159          * Note that this correctly excludes position slots that correspond to
1160          * recent offsets.  */
1161         if (position_slot >= 8)
1162                 freqs->aligned[position_footer & 7]++;
1163
1164         /* Pack the position slot, position footer, and match length into an
1165          * intermediate representation.  See `struct lzx_item' for details.
1166          */
1167         LZX_ASSERT(LZX_MAX_POSITION_SLOTS <= 64);
1168         LZX_ASSERT(lzx_get_num_extra_bits(LZX_MAX_POSITION_SLOTS - 1) <= 17);
1169         LZX_ASSERT(LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1 <= 256);
1170
1171         LZX_ASSERT(position_slot      <= (1U << (31 - 25)) - 1);
1172         LZX_ASSERT(position_footer    <= (1U << (25 -  8)) - 1);
1173         LZX_ASSERT(adjusted_match_len <= (1U << (8  -  0)) - 1);
1174         return 0x80000000 |
1175                 (position_slot << 25) |
1176                 (position_footer << 8) |
1177                 (adjusted_match_len);
1178 }
1179
1180 /* Returns the cost, in bits, to output a literal byte using the specified cost
1181  * model.  */
1182 static u32
1183 lzx_literal_cost(u8 c, const struct lzx_costs * costs)
1184 {
1185         return costs->main[c];
1186 }
1187
1188 /* Returns the cost, in bits, to output a repeat offset match of the specified
1189  * length and position slot (repeat index) using the specified cost model.  */
1190 static u32
1191 lzx_repmatch_cost(u32 len, unsigned position_slot, const struct lzx_costs *costs)
1192 {
1193         unsigned len_header, main_symbol;
1194         u32 cost = 0;
1195
1196         len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1197         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1198
1199         /* Account for main symbol.  */
1200         cost += costs->main[main_symbol];
1201
1202         /* Account for extra length information.  */
1203         if (len_header == LZX_NUM_PRIMARY_LENS)
1204                 cost += costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1205
1206         return cost;
1207 }
1208
1209 /* Set the cost model @c->costs from the Huffman codeword lengths specified in
1210  * @lens.
1211  *
1212  * The cost model and codeword lengths are almost the same thing, but the
1213  * Huffman codewords with length 0 correspond to symbols with zero frequency
1214  * that still need to be assigned actual costs.  The specific values assigned
1215  * are arbitrary, but they should be fairly high (near the maximum codeword
1216  * length) to take into account the fact that uses of these symbols are expected
1217  * to be rare.  */
1218 static void
1219 lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens,
1220               unsigned nostat)
1221 {
1222         unsigned i;
1223
1224         /* Main code  */
1225         for (i = 0; i < c->num_main_syms; i++)
1226                 c->costs.main[i] = lens->main[i] ? lens->main[i] : nostat;
1227
1228         /* Length code  */
1229         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1230                 c->costs.len[i] = lens->len[i] ? lens->len[i] : nostat;
1231
1232         /* Aligned offset code  */
1233         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1234                 c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : nostat / 2;
1235 }
1236
1237 /* Don't allow matches to span the end of an LZX block.  */
1238 static inline u32
1239 maybe_truncate_matches(struct lz_match matches[], u32 num_matches,
1240                        struct lzx_compressor *c)
1241 {
1242         if (c->match_window_end < c->cur_window_size && num_matches != 0) {
1243                 u32 limit = c->match_window_end - c->match_window_pos;
1244
1245                 if (limit >= LZX_MIN_MATCH_LEN) {
1246
1247                         u32 i = num_matches - 1;
1248                         do {
1249                                 if (matches[i].len >= limit) {
1250                                         matches[i].len = limit;
1251
1252                                         /* Truncation might produce multiple
1253                                          * matches with length 'limit'.  Keep at
1254                                          * most 1.  */
1255                                         num_matches = i + 1;
1256                                 }
1257                         } while (i--);
1258                 } else {
1259                         num_matches = 0;
1260                 }
1261         }
1262         return num_matches;
1263 }
1264
1265 static unsigned
1266 lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
1267                                       const struct lz_match **matches_ret)
1268 {
1269         struct lz_match *cache_ptr;
1270         struct lz_match *matches;
1271         unsigned num_matches;
1272
1273         cache_ptr = c->cache_ptr;
1274         matches = cache_ptr + 1;
1275         if (likely(cache_ptr <= c->cache_limit)) {
1276                 num_matches = lz_mf_get_matches(c->mf, matches);
1277                 cache_ptr->len = num_matches;
1278                 c->cache_ptr = matches + num_matches;
1279         } else {
1280                 num_matches = 0;
1281         }
1282         c->match_window_pos++;
1283         *matches_ret = matches;
1284         return num_matches;
1285 }
1286
1287 static unsigned
1288 lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
1289                                      const struct lz_match **matches_ret)
1290 {
1291         struct lz_match *cache_ptr;
1292         struct lz_match *matches;
1293         unsigned num_matches;
1294
1295         cache_ptr = c->cache_ptr;
1296         matches = cache_ptr + 1;
1297         if (likely(cache_ptr <= c->cache_limit)) {
1298                 num_matches = lz_mf_get_matches(c->mf, matches);
1299                 num_matches = maybe_truncate_matches(matches, num_matches, c);
1300                 cache_ptr->len = num_matches;
1301                 c->cache_ptr = matches + num_matches;
1302         } else {
1303                 num_matches = 0;
1304         }
1305         c->match_window_pos++;
1306         *matches_ret = matches;
1307         return num_matches;
1308 }
1309
1310 static unsigned
1311 lzx_get_matches_usecache(struct lzx_compressor *c,
1312                          const struct lz_match **matches_ret)
1313 {
1314         struct lz_match *cache_ptr;
1315         struct lz_match *matches;
1316         unsigned num_matches;
1317
1318         cache_ptr = c->cache_ptr;
1319         matches = cache_ptr + 1;
1320         if (cache_ptr <= c->cache_limit) {
1321                 num_matches = cache_ptr->len;
1322                 c->cache_ptr = matches + num_matches;
1323         } else {
1324                 num_matches = 0;
1325         }
1326         c->match_window_pos++;
1327         *matches_ret = matches;
1328         return num_matches;
1329 }
1330
1331 static unsigned
1332 lzx_get_matches_usecache_nocheck(struct lzx_compressor *c,
1333                                  const struct lz_match **matches_ret)
1334 {
1335         struct lz_match *cache_ptr;
1336         struct lz_match *matches;
1337         unsigned num_matches;
1338
1339         cache_ptr = c->cache_ptr;
1340         matches = cache_ptr + 1;
1341         num_matches = cache_ptr->len;
1342         c->cache_ptr = matches + num_matches;
1343         c->match_window_pos++;
1344         *matches_ret = matches;
1345         return num_matches;
1346 }
1347
1348 static unsigned
1349 lzx_get_matches_nocache_singleblock(struct lzx_compressor *c,
1350                                     const struct lz_match **matches_ret)
1351 {
1352         struct lz_match *matches;
1353         unsigned num_matches;
1354
1355         matches = c->cache_ptr;
1356         num_matches = lz_mf_get_matches(c->mf, matches);
1357         c->match_window_pos++;
1358         *matches_ret = matches;
1359         return num_matches;
1360 }
1361
1362 static unsigned
1363 lzx_get_matches_nocache_multiblock(struct lzx_compressor *c,
1364                                    const struct lz_match **matches_ret)
1365 {
1366         struct lz_match *matches;
1367         unsigned num_matches;
1368
1369         matches = c->cache_ptr;
1370         num_matches = lz_mf_get_matches(c->mf, matches);
1371         num_matches = maybe_truncate_matches(matches, num_matches, c);
1372         c->match_window_pos++;
1373         *matches_ret = matches;
1374         return num_matches;
1375 }
1376
1377 /*
1378  * Find matches at the next position in the window.
1379  *
1380  * Returns the number of matches found and sets *matches_ret to point to the
1381  * matches array.  The matches will be sorted by strictly increasing length and
1382  * offset.
1383  */
1384 static inline unsigned
1385 lzx_get_matches(struct lzx_compressor *c,
1386                 const struct lz_match **matches_ret)
1387 {
1388         return (*c->get_matches_func)(c, matches_ret);
1389 }
1390
1391 static void
1392 lzx_skip_bytes_fillcache(struct lzx_compressor *c, unsigned n)
1393 {
1394         struct lz_match *cache_ptr;
1395
1396         cache_ptr = c->cache_ptr;
1397         c->match_window_pos += n;
1398         lz_mf_skip_positions(c->mf, n);
1399         if (cache_ptr <= c->cache_limit) {
1400                 do {
1401                         cache_ptr->len = 0;
1402                         cache_ptr += 1;
1403                 } while (--n && cache_ptr <= c->cache_limit);
1404         }
1405         c->cache_ptr = cache_ptr;
1406 }
1407
1408 static void
1409 lzx_skip_bytes_usecache(struct lzx_compressor *c, unsigned n)
1410 {
1411         struct lz_match *cache_ptr;
1412
1413         cache_ptr = c->cache_ptr;
1414         c->match_window_pos += n;
1415         if (cache_ptr <= c->cache_limit) {
1416                 do {
1417                         cache_ptr += 1 + cache_ptr->len;
1418                 } while (--n && cache_ptr <= c->cache_limit);
1419         }
1420         c->cache_ptr = cache_ptr;
1421 }
1422
1423 static void
1424 lzx_skip_bytes_usecache_nocheck(struct lzx_compressor *c, unsigned n)
1425 {
1426         struct lz_match *cache_ptr;
1427
1428         cache_ptr = c->cache_ptr;
1429         c->match_window_pos += n;
1430         do {
1431                 cache_ptr += 1 + cache_ptr->len;
1432         } while (--n);
1433         c->cache_ptr = cache_ptr;
1434 }
1435
1436 static void
1437 lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n)
1438 {
1439         c->match_window_pos += n;
1440         lz_mf_skip_positions(c->mf, n);
1441 }
1442
1443 /*
1444  * Skip the specified number of positions in the window (don't search for
1445  * matches at them).
1446  */
1447 static inline void
1448 lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
1449 {
1450         return (*c->skip_bytes_func)(c, n);
1451 }
1452
1453 /*
1454  * Reverse the linked list of near-optimal matches so that they can be returned
1455  * in forwards order.
1456  *
1457  * Returns the first match in the list.
1458  */
1459 static struct lz_match
1460 lzx_match_chooser_reverse_list(struct lzx_compressor *c, unsigned cur_pos)
1461 {
1462         unsigned prev_link, saved_prev_link;
1463         unsigned prev_match_offset, saved_prev_match_offset;
1464
1465         c->optimum_end_idx = cur_pos;
1466
1467         saved_prev_link = c->optimum[cur_pos].prev.link;
1468         saved_prev_match_offset = c->optimum[cur_pos].prev.match_offset;
1469
1470         do {
1471                 prev_link = saved_prev_link;
1472                 prev_match_offset = saved_prev_match_offset;
1473
1474                 saved_prev_link = c->optimum[prev_link].prev.link;
1475                 saved_prev_match_offset = c->optimum[prev_link].prev.match_offset;
1476
1477                 c->optimum[prev_link].next.link = cur_pos;
1478                 c->optimum[prev_link].next.match_offset = prev_match_offset;
1479
1480                 cur_pos = prev_link;
1481         } while (cur_pos != 0);
1482
1483         c->optimum_cur_idx = c->optimum[0].next.link;
1484
1485         return (struct lz_match)
1486                 { .len = c->optimum_cur_idx,
1487                   .offset = c->optimum[0].next.match_offset,
1488                 };
1489 }
1490
1491 /*
1492  * lzx_choose_near_optimal_match() -
1493  *
1494  * Choose an approximately optimal match or literal to use at the next position
1495  * in the string, or "window", being LZ-encoded.
1496  *
1497  * This is based on algorithms used in 7-Zip, including the DEFLATE encoder
1498  * and the LZMA encoder, written by Igor Pavlov.
1499  *
1500  * Unlike a greedy parser that always takes the longest match, or even a "lazy"
1501  * parser with one match/literal look-ahead like zlib, the algorithm used here
1502  * may look ahead many matches/literals to determine the approximately optimal
1503  * match/literal to code next.  The motivation is that the compression ratio is
1504  * improved if the compressor can do things like use a shorter-than-possible
1505  * match in order to allow a longer match later, and also take into account the
1506  * estimated real cost of coding each match/literal based on the underlying
1507  * entropy encoding.
1508  *
1509  * Still, this is not a true optimal parser for several reasons:
1510  *
1511  * - Real compression formats use entropy encoding of the literal/match
1512  *   sequence, so the real cost of coding each match or literal is unknown until
1513  *   the parse is fully determined.  It can be approximated based on iterative
1514  *   parses, but the end result is not guaranteed to be globally optimal.
1515  *
1516  * - Very long matches are chosen immediately.  This is because locations with
1517  *   long matches are likely to have many possible alternatives that would cause
1518  *   slow optimal parsing, but also such locations are already highly
1519  *   compressible so it is not too harmful to just grab the longest match.
1520  *
1521  * - Not all possible matches at each location are considered because the
1522  *   underlying match-finder limits the number and type of matches produced at
1523  *   each position.  For example, for a given match length it's usually not
1524  *   worth it to only consider matches other than the lowest-offset match,
1525  *   except in the case of a repeat offset.
1526  *
1527  * - Although we take into account the adaptive state (in LZX, the recent offset
1528  *   queue), coding decisions made with respect to the adaptive state will be
1529  *   locally optimal but will not necessarily be globally optimal.  This is
1530  *   because the algorithm only keeps the least-costly path to get to a given
1531  *   location and does not take into account that a slightly more costly path
1532  *   could result in a different adaptive state that ultimately results in a
1533  *   lower global cost.
1534  *
1535  * - The array space used by this function is bounded, so in degenerate cases it
1536  *   is forced to start returning matches/literals before the algorithm has
1537  *   really finished.
1538  *
1539  * Each call to this function does one of two things:
1540  *
1541  * 1. Build a sequence of near-optimal matches/literals, up to some point, that
1542  *    will be returned by subsequent calls to this function, then return the
1543  *    first one.
1544  *
1545  * OR
1546  *
1547  * 2. Return the next match/literal previously computed by a call to this
1548  *    function.
1549  *
1550  * The return value is a (length, offset) pair specifying the match or literal
1551  * chosen.  For literals, the length is 0 or 1 and the offset is meaningless.
1552  */
1553 static struct lz_match
1554 lzx_choose_near_optimal_item(struct lzx_compressor *c)
1555 {
1556         unsigned num_matches;
1557         const struct lz_match *matches;
1558         struct lz_match match;
1559         u32 longest_len;
1560         u32 longest_rep_len;
1561         unsigned longest_rep_slot;
1562         unsigned cur_pos;
1563         unsigned end_pos;
1564         struct lzx_mc_pos_data *optimum = c->optimum;
1565
1566         if (c->optimum_cur_idx != c->optimum_end_idx) {
1567                 /* Case 2: Return the next match/literal already found.  */
1568                 match.len = optimum[c->optimum_cur_idx].next.link -
1569                                     c->optimum_cur_idx;
1570                 match.offset = optimum[c->optimum_cur_idx].next.match_offset;
1571
1572                 c->optimum_cur_idx = optimum[c->optimum_cur_idx].next.link;
1573                 return match;
1574         }
1575
1576         /* Case 1:  Compute a new list of matches/literals to return.  */
1577
1578         c->optimum_cur_idx = 0;
1579         c->optimum_end_idx = 0;
1580
1581         /* Search for matches at repeat offsets.  As a heuristic, we only keep
1582          * the one with the longest match length.  */
1583         longest_rep_len = LZX_MIN_MATCH_LEN - 1;
1584         if (c->match_window_pos >= 1) {
1585                 unsigned limit = min(LZX_MAX_MATCH_LEN,
1586                                      c->match_window_end - c->match_window_pos);
1587                 for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
1588                         u32 offset = c->queue.R[i];
1589                         const u8 *strptr = &c->cur_window[c->match_window_pos];
1590                         const u8 *matchptr = strptr - offset;
1591                         unsigned len = 0;
1592                         while (len < limit && strptr[len] == matchptr[len])
1593                                 len++;
1594                         if (len > longest_rep_len) {
1595                                 longest_rep_len = len;
1596                                 longest_rep_slot = i;
1597                         }
1598                 }
1599         }
1600
1601         /* If there's a long match with a repeat offset, choose it immediately.  */
1602         if (longest_rep_len >= c->params.nice_match_length) {
1603                 lzx_skip_bytes(c, longest_rep_len);
1604                 return (struct lz_match) {
1605                         .len = longest_rep_len,
1606                         .offset = c->queue.R[longest_rep_slot],
1607                 };
1608         }
1609
1610         /* Find other matches.  */
1611         num_matches = lzx_get_matches(c, &matches);
1612
1613         /* If there's a long match, choose it immediately.  */
1614         if (num_matches) {
1615                 longest_len = matches[num_matches - 1].len;
1616                 if (longest_len >= c->params.nice_match_length) {
1617                         lzx_skip_bytes(c, longest_len - 1);
1618                         return matches[num_matches - 1];
1619                 }
1620         } else {
1621                 longest_len = 1;
1622         }
1623
1624         /* Calculate the cost to reach the next position by coding a literal.  */
1625         optimum[1].queue = c->queue;
1626         optimum[1].cost = lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
1627                                               &c->costs);
1628         optimum[1].prev.link = 0;
1629
1630         /* Calculate the cost to reach any position up to and including that
1631          * reached by the longest match.
1632          *
1633          * Note: We consider only the lowest-offset match that reaches each
1634          * position.
1635          *
1636          * Note: Some of the cost calculation stays the same for each offset,
1637          * regardless of how many lengths it gets used for.  Therefore, to
1638          * improve performance, we hand-code the cost calculation instead of
1639          * calling lzx_match_cost() to do a from-scratch cost evaluation at each
1640          * length.  */
1641         for (unsigned i = 0, len = 2; i < num_matches; i++) {
1642                 u32 offset;
1643                 struct lzx_lru_queue queue;
1644                 u32 position_cost;
1645                 unsigned position_slot;
1646                 unsigned num_extra_bits;
1647
1648                 offset = matches[i].offset;
1649                 queue = c->queue;
1650                 position_cost = 0;
1651
1652                 position_slot = lzx_get_position_slot(offset, &queue);
1653                 num_extra_bits = lzx_get_num_extra_bits(position_slot);
1654                 if (num_extra_bits >= 3) {
1655                         position_cost += num_extra_bits - 3;
1656                         position_cost += c->costs.aligned[(offset + LZX_OFFSET_OFFSET) & 7];
1657                 } else {
1658                         position_cost += num_extra_bits;
1659                 }
1660
1661                 do {
1662                         unsigned len_header;
1663                         unsigned main_symbol;
1664                         u32 cost;
1665
1666                         cost = position_cost;
1667
1668                         len_header = min(len - LZX_MIN_MATCH_LEN, LZX_NUM_PRIMARY_LENS);
1669                         main_symbol = ((position_slot << 3) | len_header) + LZX_NUM_CHARS;
1670                         cost += c->costs.main[main_symbol];
1671                         if (len_header == LZX_NUM_PRIMARY_LENS)
1672                                 cost += c->costs.len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1673
1674                         optimum[len].queue = queue;
1675                         optimum[len].prev.link = 0;
1676                         optimum[len].prev.match_offset = offset;
1677                         optimum[len].cost = cost;
1678                 } while (++len <= matches[i].len);
1679         }
1680         end_pos = longest_len;
1681
1682         if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
1683                 u32 cost;
1684
1685                 while (end_pos < longest_rep_len)
1686                         optimum[++end_pos].cost = MC_INFINITE_COST;
1687
1688                 cost = lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
1689                                          &c->costs);
1690                 if (cost <= optimum[longest_rep_len].cost) {
1691                         optimum[longest_rep_len].queue = c->queue;
1692                         swap(optimum[longest_rep_len].queue.R[0],
1693                              optimum[longest_rep_len].queue.R[longest_rep_slot]);
1694                         optimum[longest_rep_len].prev.link = 0;
1695                         optimum[longest_rep_len].prev.match_offset =
1696                                 optimum[longest_rep_len].queue.R[0];
1697                         optimum[longest_rep_len].cost = cost;
1698                 }
1699         }
1700
1701         /* Step forward, calculating the estimated minimum cost to reach each
1702          * position.  The algorithm may find multiple paths to reach each
1703          * position; only the lowest-cost path is saved.
1704          *
1705          * The progress of the parse is tracked in the @optimum array, which for
1706          * each position contains the minimum cost to reach that position, the
1707          * index of the start of the match/literal taken to reach that position
1708          * through the minimum-cost path, the offset of the match taken (not
1709          * relevant for literals), and the adaptive state that will exist at
1710          * that position after the minimum-cost path is taken.  The @cur_pos
1711          * variable stores the position at which the algorithm is currently
1712          * considering coding choices, and the @end_pos variable stores the
1713          * greatest position at which the costs of coding choices have been
1714          * saved.
1715          *
1716          * The loop terminates when any one of the following conditions occurs:
1717          *
1718          * 1. A match with length greater than or equal to @nice_match_length is
1719          *    found.  When this occurs, the algorithm chooses this match
1720          *    unconditionally, and consequently the near-optimal match/literal
1721          *    sequence up to and including that match is fully determined and it
1722          *    can begin returning the match/literal list.
1723          *
1724          * 2. @cur_pos reaches a position not overlapped by a preceding match.
1725          *    In such cases, the near-optimal match/literal sequence up to
1726          *    @cur_pos is fully determined and it can begin returning the
1727          *    match/literal list.
1728          *
1729          * 3. Failing either of the above in a degenerate case, the loop
1730          *    terminates when space in the @optimum array is exhausted.
1731          *    This terminates the algorithm and forces it to start returning
1732          *    matches/literals even though they may not be globally optimal.
1733          *
1734          * Upon loop termination, a nonempty list of matches/literals will have
1735          * been produced and stored in the @optimum array.  These
1736          * matches/literals are linked in reverse order, so the last thing this
1737          * function does is reverse this list and return the first
1738          * match/literal, leaving the rest to be returned immediately by
1739          * subsequent calls to this function.
1740          */
1741         cur_pos = 0;
1742         for (;;) {
1743                 u32 cost;
1744
1745                 /* Advance to next position.  */
1746                 cur_pos++;
1747
1748                 /* Check termination conditions (2) and (3) noted above.  */
1749                 if (cur_pos == end_pos || cur_pos == LZX_OPTIM_ARRAY_LENGTH)
1750                         return lzx_match_chooser_reverse_list(c, cur_pos);
1751
1752                 /* Search for matches at repeat offsets.  Again, as a heuristic
1753                  * we only keep the longest one.  */
1754                 longest_rep_len = LZX_MIN_MATCH_LEN - 1;
1755                 unsigned limit = min(LZX_MAX_MATCH_LEN,
1756                                      c->match_window_end - c->match_window_pos);
1757                 for (int i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
1758                         u32 offset = optimum[cur_pos].queue.R[i];
1759                         const u8 *strptr = &c->cur_window[c->match_window_pos];
1760                         const u8 *matchptr = strptr - offset;
1761                         unsigned len = 0;
1762                         while (len < limit && strptr[len] == matchptr[len])
1763                                 len++;
1764                         if (len > longest_rep_len) {
1765                                 longest_rep_len = len;
1766                                 longest_rep_slot = i;
1767                         }
1768                 }
1769
1770                 /* If we found a long match at a repeat offset, choose it
1771                  * immediately.  */
1772                 if (longest_rep_len >= c->params.nice_match_length) {
1773                         /* Build the list of matches to return and get
1774                          * the first one.  */
1775                         match = lzx_match_chooser_reverse_list(c, cur_pos);
1776
1777                         /* Append the long match to the end of the list.  */
1778                         optimum[cur_pos].next.match_offset =
1779                                 optimum[cur_pos].queue.R[longest_rep_slot];
1780                         optimum[cur_pos].next.link = cur_pos + longest_rep_len;
1781                         c->optimum_end_idx = cur_pos + longest_rep_len;
1782
1783                         /* Skip over the remaining bytes of the long match.  */
1784                         lzx_skip_bytes(c, longest_rep_len);
1785
1786                         /* Return first match in the list.  */
1787                         return match;
1788                 }
1789
1790                 /* Find other matches.  */
1791                 num_matches = lzx_get_matches(c, &matches);
1792
1793                 /* If there's a long match, choose it immediately.  */
1794                 if (num_matches) {
1795                         longest_len = matches[num_matches - 1].len;
1796                         if (longest_len >= c->params.nice_match_length) {
1797                                 /* Build the list of matches to return and get
1798                                  * the first one.  */
1799                                 match = lzx_match_chooser_reverse_list(c, cur_pos);
1800
1801                                 /* Append the long match to the end of the list.  */
1802                                 optimum[cur_pos].next.match_offset =
1803                                         matches[num_matches - 1].offset;
1804                                 optimum[cur_pos].next.link = cur_pos + longest_len;
1805                                 c->optimum_end_idx = cur_pos + longest_len;
1806
1807                                 /* Skip over the remaining bytes of the long match.  */
1808                                 lzx_skip_bytes(c, longest_len - 1);
1809
1810                                 /* Return first match in the list.  */
1811                                 return match;
1812                         }
1813                 } else {
1814                         longest_len = 1;
1815                 }
1816
1817                 /* If we are reaching any positions for the first time, we need
1818                  * to initialize their costs to infinity.  */
1819                 while (end_pos < cur_pos + longest_len)
1820                         optimum[++end_pos].cost = MC_INFINITE_COST;
1821
1822                 /* Consider coding a literal.  */
1823                 cost = optimum[cur_pos].cost +
1824                         lzx_literal_cost(c->cur_window[c->match_window_pos - 1],
1825                                          &c->costs);
1826                 if (cost < optimum[cur_pos + 1].cost) {
1827                         optimum[cur_pos + 1].queue = optimum[cur_pos].queue;
1828                         optimum[cur_pos + 1].cost = cost;
1829                         optimum[cur_pos + 1].prev.link = cur_pos;
1830                 }
1831
1832                 /* Consider coding a match.
1833                  *
1834                  * The hard-coded cost calculation is done for the same reason
1835                  * stated in the comment for the similar loop earlier.
1836                  * Actually, it is *this* one that has the biggest effect on
1837                  * performance; overall LZX compression is > 10% faster with
1838                  * this code compared to calling lzx_match_cost() with each
1839                  * length.  */
1840                 for (unsigned i = 0, len = 2; i < num_matches; i++) {
1841                         u32 offset;
1842                         u32 position_cost;
1843                         unsigned position_slot;
1844                         unsigned num_extra_bits;
1845
1846                         offset = matches[i].offset;
1847                         position_cost = optimum[cur_pos].cost;
1848
1849                         /* Yet another optimization: instead of calling
1850                          * lzx_get_position_slot(), hand-inline the search of
1851                          * the repeat offset queue.  Then we can omit the
1852                          * extra_bits calculation for repeat offset matches, and
1853                          * also only compute the updated queue if we actually do
1854                          * find a new lowest cost path.  */
1855                         for (position_slot = 0; position_slot < LZX_NUM_RECENT_OFFSETS; position_slot++)
1856                                 if (offset == optimum[cur_pos].queue.R[position_slot])
1857                                         goto have_position_cost;
1858
1859                         position_slot = lzx_get_position_slot_raw(offset + LZX_OFFSET_OFFSET);
1860
1861                         num_extra_bits = lzx_get_num_extra_bits(position_slot);
1862                         if (num_extra_bits >= 3) {
1863                                 position_cost += num_extra_bits - 3;
1864                                 position_cost += c->costs.aligned[
1865                                                 (offset + LZX_OFFSET_OFFSET) & 7];
1866                         } else {
1867                                 position_cost += num_extra_bits;
1868                         }
1869
1870                 have_position_cost:
1871
1872                         do {
1873                                 unsigned len_header;
1874                                 unsigned main_symbol;
1875                                 u32 cost;
1876
1877                                 cost = position_cost;
1878
1879                                 len_header = min(len - LZX_MIN_MATCH_LEN,
1880                                                  LZX_NUM_PRIMARY_LENS);
1881                                 main_symbol = ((position_slot << 3) | len_header) +
1882                                                 LZX_NUM_CHARS;
1883                                 cost += c->costs.main[main_symbol];
1884                                 if (len_header == LZX_NUM_PRIMARY_LENS) {
1885                                         cost += c->costs.len[len -
1886                                                         LZX_MIN_MATCH_LEN -
1887                                                         LZX_NUM_PRIMARY_LENS];
1888                                 }
1889                                 if (cost < optimum[cur_pos + len].cost) {
1890                                         if (position_slot < LZX_NUM_RECENT_OFFSETS) {
1891                                                 optimum[cur_pos + len].queue = optimum[cur_pos].queue;
1892                                                 swap(optimum[cur_pos + len].queue.R[0],
1893                                                      optimum[cur_pos + len].queue.R[position_slot]);
1894                                         } else {
1895                                                 optimum[cur_pos + len].queue.R[0] = offset;
1896                                                 optimum[cur_pos + len].queue.R[1] = optimum[cur_pos].queue.R[0];
1897                                                 optimum[cur_pos + len].queue.R[2] = optimum[cur_pos].queue.R[1];
1898                                         }
1899                                         optimum[cur_pos + len].prev.link = cur_pos;
1900                                         optimum[cur_pos + len].prev.match_offset = offset;
1901                                         optimum[cur_pos + len].cost = cost;
1902                                 }
1903                         } while (++len <= matches[i].len);
1904                 }
1905
1906                 /* Consider coding a repeat offset match.
1907                  *
1908                  * As a heuristic, we only consider the longest length of the
1909                  * longest repeat offset match.  This does not, however,
1910                  * necessarily mean that we will never consider any other repeat
1911                  * offsets, because above we detect repeat offset matches that
1912                  * were found by the regular match-finder.  Therefore, this
1913                  * special handling of the longest repeat-offset match is only
1914                  * helpful for coding a repeat offset match that was *not* found
1915                  * by the match-finder, e.g. due to being obscured by a less
1916                  * distant match that is at least as long.
1917                  *
1918                  * Note: an alternative, used in LZMA, is to consider every
1919                  * length of every repeat offset match.  This is a more thorough
1920                  * search, and it makes it unnecessary to detect repeat offset
1921                  * matches that were found by the regular match-finder.  But by
1922                  * my tests, for LZX the LZMA method slows down the compressor
1923                  * by ~10% and doesn't actually help the compression ratio too
1924                  * much.
1925                  *
1926                  * Also tested a compromise approach: consider every 3rd length
1927                  * of the longest repeat offset match.  Still didn't seem quite
1928                  * worth it, though.
1929                  */
1930                 if (longest_rep_len >= LZX_MIN_MATCH_LEN) {
1931
1932                         while (end_pos < cur_pos + longest_rep_len)
1933                                 optimum[++end_pos].cost = MC_INFINITE_COST;
1934
1935                         cost = optimum[cur_pos].cost +
1936                                 lzx_repmatch_cost(longest_rep_len, longest_rep_slot,
1937                                                   &c->costs);
1938                         if (cost <= optimum[cur_pos + longest_rep_len].cost) {
1939                                 optimum[cur_pos + longest_rep_len].queue =
1940                                         optimum[cur_pos].queue;
1941                                 swap(optimum[cur_pos + longest_rep_len].queue.R[0],
1942                                      optimum[cur_pos + longest_rep_len].queue.R[longest_rep_slot]);
1943                                 optimum[cur_pos + longest_rep_len].prev.link =
1944                                         cur_pos;
1945                                 optimum[cur_pos + longest_rep_len].prev.match_offset =
1946                                         optimum[cur_pos + longest_rep_len].queue.R[0];
1947                                 optimum[cur_pos + longest_rep_len].cost =
1948                                         cost;
1949                         }
1950                 }
1951         }
1952 }
1953
1954 static struct lz_match
1955 lzx_choose_lazy_item(struct lzx_compressor *c)
1956 {
1957         const struct lz_match *matches;
1958         struct lz_match cur_match;
1959         struct lz_match next_match;
1960         u32 num_matches;
1961
1962         if (c->prev_match.len) {
1963                 cur_match = c->prev_match;
1964                 c->prev_match.len = 0;
1965         } else {
1966                 num_matches = lzx_get_matches(c, &matches);
1967                 if (num_matches == 0 ||
1968                     (matches[num_matches - 1].len <= 3 &&
1969                      (matches[num_matches - 1].len <= 2 ||
1970                       matches[num_matches - 1].offset > 4096)))
1971                 {
1972                         return (struct lz_match) { };
1973                 }
1974
1975                 cur_match = matches[num_matches - 1];
1976         }
1977
1978         if (cur_match.len >= c->params.nice_match_length) {
1979                 lzx_skip_bytes(c, cur_match.len - 1);
1980                 return cur_match;
1981         }
1982
1983         num_matches = lzx_get_matches(c, &matches);
1984         if (num_matches == 0 ||
1985             (matches[num_matches - 1].len <= 3 &&
1986              (matches[num_matches - 1].len <= 2 ||
1987               matches[num_matches - 1].offset > 4096)))
1988         {
1989                 lzx_skip_bytes(c, cur_match.len - 2);
1990                 return cur_match;
1991         }
1992
1993         next_match = matches[num_matches - 1];
1994
1995         if (next_match.len <= cur_match.len) {
1996                 lzx_skip_bytes(c, cur_match.len - 2);
1997                 return cur_match;
1998         } else {
1999                 c->prev_match = next_match;
2000                 return (struct lz_match) { };
2001         }
2002 }
2003
2004 /*
2005  * Return the next match or literal to use, delegating to the currently selected
2006  * match-choosing algorithm.
2007  *
2008  * If the length of the returned 'struct lz_match' is less than
2009  * LZX_MIN_MATCH_LEN, then it is really a literal.
2010  */
2011 static inline struct lz_match
2012 lzx_choose_item(struct lzx_compressor *c)
2013 {
2014         return (*c->params.choose_item_func)(c);
2015 }
2016
2017 /* Set default symbol costs for the LZX Huffman codes.  */
2018 static void
2019 lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
2020 {
2021         unsigned i;
2022
2023         /* Main code (part 1): Literal symbols  */
2024         for (i = 0; i < LZX_NUM_CHARS; i++)
2025                 costs->main[i] = 8;
2026
2027         /* Main code (part 2): Match header symbols  */
2028         for (; i < num_main_syms; i++)
2029                 costs->main[i] = 10;
2030
2031         /* Length code  */
2032         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2033                 costs->len[i] = 8;
2034
2035         /* Aligned offset code  */
2036         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
2037                 costs->aligned[i] = 3;
2038 }
2039
2040 /* Given the frequencies of symbols in an LZX-compressed block and the
2041  * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
2042  * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
2043  * will take fewer bits to output.  */
2044 static int
2045 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
2046                                const struct lzx_codes * codes)
2047 {
2048         unsigned aligned_cost = 0;
2049         unsigned verbatim_cost = 0;
2050
2051         /* Verbatim blocks have a constant 3 bits per position footer.  Aligned
2052          * offset blocks have an aligned offset symbol per position footer, plus
2053          * an extra 24 bits per block to output the lengths necessary to
2054          * reconstruct the aligned offset code itself.  */
2055         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2056                 verbatim_cost += 3 * freqs->aligned[i];
2057                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
2058         }
2059         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
2060         if (aligned_cost < verbatim_cost)
2061                 return LZX_BLOCKTYPE_ALIGNED;
2062         else
2063                 return LZX_BLOCKTYPE_VERBATIM;
2064 }
2065
2066 /* Find a sequence of matches/literals with which to output the specified LZX
2067  * block, then set the block's type to that which has the minimum cost to output
2068  * (either verbatim or aligned).  */
2069 static void
2070 lzx_choose_items_for_block(struct lzx_compressor *c, struct lzx_block_spec *spec)
2071 {
2072         const struct lzx_lru_queue orig_queue = c->queue;
2073         u32 num_passes_remaining = c->params.num_optim_passes;
2074         struct lzx_freqs freqs;
2075         const u8 *window_ptr;
2076         const u8 *window_end;
2077         struct lzx_item *next_chosen_item;
2078         struct lz_match lz_match;
2079         struct lzx_item lzx_item;
2080
2081         LZX_ASSERT(num_passes_remaining >= 1);
2082         LZX_ASSERT(lz_mf_get_position(c->mf) == spec->window_pos);
2083
2084         c->match_window_end = spec->window_pos + spec->block_size;
2085
2086         if (c->params.num_optim_passes > 1) {
2087                 if (spec->block_size == c->cur_window_size)
2088                         c->get_matches_func = lzx_get_matches_fillcache_singleblock;
2089                 else
2090                         c->get_matches_func = lzx_get_matches_fillcache_multiblock;
2091                 c->skip_bytes_func = lzx_skip_bytes_fillcache;
2092         } else {
2093                 if (spec->block_size == c->cur_window_size)
2094                         c->get_matches_func = lzx_get_matches_nocache_singleblock;
2095                 else
2096                         c->get_matches_func = lzx_get_matches_nocache_multiblock;
2097                 c->skip_bytes_func = lzx_skip_bytes_nocache;
2098         }
2099
2100         /* The first optimal parsing pass is done using the cost model already
2101          * set in c->costs.  Each later pass is done using a cost model
2102          * computed from the previous pass.
2103          *
2104          * To improve performance we only generate the array containing the
2105          * matches and literals in intermediate form on the final pass.  */
2106
2107         while (--num_passes_remaining) {
2108                 c->match_window_pos = spec->window_pos;
2109                 c->cache_ptr = c->cached_matches;
2110                 memset(&freqs, 0, sizeof(freqs));
2111                 window_ptr = &c->cur_window[spec->window_pos];
2112                 window_end = window_ptr + spec->block_size;
2113
2114                 while (window_ptr != window_end) {
2115
2116                         lz_match = lzx_choose_item(c);
2117
2118                         LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
2119                                      lz_match.offset == c->max_window_size -
2120                                                          LZX_MIN_MATCH_LEN));
2121                         if (lz_match.len >= LZX_MIN_MATCH_LEN) {
2122                                 lzx_tally_match(lz_match.len, lz_match.offset,
2123                                                 &freqs, &c->queue);
2124                                 window_ptr += lz_match.len;
2125                         } else {
2126                                 lzx_tally_literal(*window_ptr, &freqs);
2127                                 window_ptr += 1;
2128                         }
2129                 }
2130                 lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
2131                 lzx_set_costs(c, &spec->codes.lens, 15);
2132                 c->queue = orig_queue;
2133                 if (c->cache_ptr <= c->cache_limit) {
2134                         c->get_matches_func = lzx_get_matches_usecache_nocheck;
2135                         c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
2136                 } else {
2137                         c->get_matches_func = lzx_get_matches_usecache;
2138                         c->skip_bytes_func = lzx_skip_bytes_usecache;
2139                 }
2140         }
2141
2142         c->match_window_pos = spec->window_pos;
2143         c->cache_ptr = c->cached_matches;
2144         memset(&freqs, 0, sizeof(freqs));
2145         window_ptr = &c->cur_window[spec->window_pos];
2146         window_end = window_ptr + spec->block_size;
2147
2148         spec->chosen_items = &c->chosen_items[spec->window_pos];
2149         next_chosen_item = spec->chosen_items;
2150
2151         unsigned unseen_cost = 9;
2152         while (window_ptr != window_end) {
2153
2154                 lz_match = lzx_choose_item(c);
2155
2156                 LZX_ASSERT(!(lz_match.len == LZX_MIN_MATCH_LEN &&
2157                              lz_match.offset == c->max_window_size -
2158                                                  LZX_MIN_MATCH_LEN));
2159                 if (lz_match.len >= LZX_MIN_MATCH_LEN) {
2160                         lzx_item.data = lzx_tally_match(lz_match.len,
2161                                                          lz_match.offset,
2162                                                          &freqs, &c->queue);
2163                         window_ptr += lz_match.len;
2164                 } else {
2165                         lzx_item.data = lzx_tally_literal(*window_ptr, &freqs);
2166                         window_ptr += 1;
2167                 }
2168                 *next_chosen_item++ = lzx_item;
2169
2170                 /* When doing one-pass "near-optimal" parsing, update the cost
2171                  * model occassionally.  */
2172                 if (unlikely((next_chosen_item - spec->chosen_items) % 2048 == 0) &&
2173                     c->params.choose_item_func == lzx_choose_near_optimal_item &&
2174                     c->params.num_optim_passes == 1)
2175                 {
2176                         lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
2177                         lzx_set_costs(c, &spec->codes.lens, unseen_cost);
2178                         if (unseen_cost < 15)
2179                                 unseen_cost++;
2180                 }
2181         }
2182         spec->num_chosen_items = next_chosen_item - spec->chosen_items;
2183         lzx_make_huffman_codes(&freqs, &spec->codes, c->num_main_syms);
2184         spec->block_type = lzx_choose_verbatim_or_aligned(&freqs, &spec->codes);
2185 }
2186
2187 /* Prepare the input window into one or more LZX blocks ready to be output.  */
2188 static void
2189 lzx_prepare_blocks(struct lzx_compressor *c)
2190 {
2191         /* Set up a default cost model.  */
2192         if (c->params.choose_item_func == lzx_choose_near_optimal_item)
2193                 lzx_set_default_costs(&c->costs, c->num_main_syms);
2194
2195         /* Set up the block specifications.
2196          * TODO: The compression ratio could be slightly improved by performing
2197          * data-dependent block splitting instead of using fixed-size blocks.
2198          * Doing so well is a computationally hard problem, however.  */
2199         c->num_blocks = DIV_ROUND_UP(c->cur_window_size, LZX_DIV_BLOCK_SIZE);
2200         for (unsigned i = 0; i < c->num_blocks; i++) {
2201                 u32 pos = LZX_DIV_BLOCK_SIZE * i;
2202                 c->block_specs[i].window_pos = pos;
2203                 c->block_specs[i].block_size = min(c->cur_window_size - pos,
2204                                                    LZX_DIV_BLOCK_SIZE);
2205         }
2206
2207         /* Load the window into the match-finder.  */
2208         lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
2209
2210         /* Determine sequence of matches/literals to output for each block.  */
2211         lzx_lru_queue_init(&c->queue);
2212         c->optimum_cur_idx = 0;
2213         c->optimum_end_idx = 0;
2214         c->prev_match.len = 0;
2215         for (unsigned i = 0; i < c->num_blocks; i++)
2216                 lzx_choose_items_for_block(c, &c->block_specs[i]);
2217 }
2218
2219 static void
2220 lzx_build_params(unsigned int compression_level,
2221                  u32 max_window_size,
2222                  struct lzx_compressor_params *lzx_params)
2223 {
2224         if (compression_level < 25) {
2225                 lzx_params->choose_item_func = lzx_choose_lazy_item;
2226                 lzx_params->num_optim_passes  = 1;
2227                 if (max_window_size <= 262144)
2228                         lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
2229                 else
2230                         lzx_params->mf_algo = LZ_MF_BINARY_TREES;
2231                 lzx_params->min_match_length  = 3;
2232                 lzx_params->nice_match_length = 25 + compression_level * 2;
2233                 lzx_params->max_search_depth  = 25 + compression_level;
2234         } else {
2235                 lzx_params->choose_item_func = lzx_choose_near_optimal_item;
2236                 lzx_params->num_optim_passes  = compression_level / 20;
2237                 if (max_window_size <= 32768 && lzx_params->num_optim_passes == 1)
2238                         lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
2239                 else
2240                         lzx_params->mf_algo = LZ_MF_BINARY_TREES;
2241                 lzx_params->min_match_length  = (compression_level >= 45) ? 2 : 3;
2242                 lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50,
2243                                                     LZX_MAX_MATCH_LEN);
2244                 lzx_params->max_search_depth  = min(((u64)compression_level * 50) / 50,
2245                                                     LZX_MAX_MATCH_LEN);
2246         }
2247 }
2248
2249 static void
2250 lzx_build_mf_params(const struct lzx_compressor_params *lzx_params,
2251                     u32 max_window_size, struct lz_mf_params *mf_params)
2252 {
2253         memset(mf_params, 0, sizeof(*mf_params));
2254
2255         mf_params->algorithm = lzx_params->mf_algo;
2256         mf_params->max_window_size = max_window_size;
2257         mf_params->min_match_len = lzx_params->min_match_length;
2258         mf_params->max_match_len = LZX_MAX_MATCH_LEN;
2259         mf_params->max_search_depth = lzx_params->max_search_depth;
2260         mf_params->nice_match_len = lzx_params->nice_match_length;
2261 }
2262
2263 static void
2264 lzx_free_compressor(void *_c);
2265
2266 static u64
2267 lzx_get_needed_memory(size_t max_block_size, unsigned int compression_level)
2268 {
2269         struct lzx_compressor_params params;
2270         u64 size = 0;
2271         unsigned window_order;
2272         u32 max_window_size;
2273
2274         window_order = lzx_get_window_order(max_block_size);
2275         if (window_order == 0)
2276                 return 0;
2277         max_window_size = max_block_size;
2278
2279         lzx_build_params(compression_level, max_window_size, &params);
2280
2281         size += sizeof(struct lzx_compressor);
2282
2283         size += max_window_size;
2284
2285         size += DIV_ROUND_UP(max_window_size, LZX_DIV_BLOCK_SIZE) *
2286                 sizeof(struct lzx_block_spec);
2287
2288         size += max_window_size * sizeof(struct lzx_item);
2289
2290         size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
2291         if (params.choose_item_func == lzx_choose_near_optimal_item) {
2292                 size += (LZX_OPTIM_ARRAY_LENGTH + params.nice_match_length) *
2293                         sizeof(struct lzx_mc_pos_data);
2294         }
2295         if (params.num_optim_passes > 1)
2296                 size += LZX_CACHE_LEN * sizeof(struct lz_match);
2297         else
2298                 size += LZX_MAX_MATCHES_PER_POS * sizeof(struct lz_match);
2299         return size;
2300 }
2301
2302 static int
2303 lzx_create_compressor(size_t max_block_size, unsigned int compression_level,
2304                       void **c_ret)
2305 {
2306         struct lzx_compressor *c;
2307         struct lzx_compressor_params params;
2308         struct lz_mf_params mf_params;
2309         unsigned window_order;
2310         u32 max_window_size;
2311
2312         window_order = lzx_get_window_order(max_block_size);
2313         if (window_order == 0)
2314                 return WIMLIB_ERR_INVALID_PARAM;
2315         max_window_size = max_block_size;
2316
2317         lzx_build_params(compression_level, max_window_size, &params);
2318         lzx_build_mf_params(&params, max_window_size, &mf_params);
2319         if (!lz_mf_params_valid(&mf_params))
2320                 return WIMLIB_ERR_INVALID_PARAM;
2321
2322         c = CALLOC(1, sizeof(struct lzx_compressor));
2323         if (!c)
2324                 goto oom;
2325
2326         c->params = params;
2327         c->num_main_syms = lzx_get_num_main_syms(window_order);
2328         c->max_window_size = max_window_size;
2329         c->window_order = window_order;
2330
2331         c->cur_window = ALIGNED_MALLOC(max_window_size, 16);
2332         if (!c->cur_window)
2333                 goto oom;
2334
2335         c->block_specs = MALLOC(DIV_ROUND_UP(max_window_size,
2336                                              LZX_DIV_BLOCK_SIZE) *
2337                                 sizeof(struct lzx_block_spec));
2338         if (!c->block_specs)
2339                 goto oom;
2340
2341         c->chosen_items = MALLOC(max_window_size * sizeof(struct lzx_item));
2342         if (!c->chosen_items)
2343                 goto oom;
2344
2345         c->mf = lz_mf_alloc(&mf_params);
2346         if (!c->mf)
2347                 goto oom;
2348
2349         if (params.choose_item_func == lzx_choose_near_optimal_item) {
2350                 c->optimum = MALLOC((LZX_OPTIM_ARRAY_LENGTH +
2351                                      params.nice_match_length) *
2352                                     sizeof(struct lzx_mc_pos_data));
2353                 if (!c->optimum)
2354                         goto oom;
2355         }
2356
2357         if (params.num_optim_passes > 1) {
2358                 c->cached_matches = MALLOC(LZX_CACHE_LEN *
2359                                            sizeof(struct lz_match));
2360                 if (!c->cached_matches)
2361                         goto oom;
2362                 c->cache_limit = c->cached_matches + LZX_CACHE_LEN -
2363                                  (LZX_MAX_MATCHES_PER_POS + 1);
2364         } else {
2365                 c->cached_matches = MALLOC(LZX_MAX_MATCHES_PER_POS *
2366                                            sizeof(struct lz_match));
2367                 if (!c->cached_matches)
2368                         goto oom;
2369         }
2370
2371         *c_ret = c;
2372         return 0;
2373
2374 oom:
2375         lzx_free_compressor(c);
2376         return WIMLIB_ERR_NOMEM;
2377 }
2378
2379 static size_t
2380 lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
2381              void *compressed_data, size_t compressed_size_avail, void *_c)
2382 {
2383         struct lzx_compressor *c = _c;
2384         struct lzx_output_bitstream os;
2385
2386         /* Don't bother compressing very small inputs.  */
2387         if (uncompressed_size < 100)
2388                 return 0;
2389
2390         /* The input data must be preprocessed.  To avoid changing the original
2391          * input, copy it to a temporary buffer.  */
2392         memcpy(c->cur_window, uncompressed_data, uncompressed_size);
2393         c->cur_window_size = uncompressed_size;
2394
2395         /* Preprocess the data.  */
2396         lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size);
2397
2398         /* Prepare the compressed data.  */
2399         lzx_prepare_blocks(c);
2400
2401         /* Generate the compressed data and return its size, or 0 if an overflow
2402          * occurred.  */
2403         lzx_init_output(&os, compressed_data, compressed_size_avail);
2404         lzx_write_all_blocks(c, &os);
2405         return lzx_flush_output(&os);
2406 }
2407
2408 static void
2409 lzx_free_compressor(void *_c)
2410 {
2411         struct lzx_compressor *c = _c;
2412
2413         if (c) {
2414                 ALIGNED_FREE(c->cur_window);
2415                 FREE(c->block_specs);
2416                 FREE(c->chosen_items);
2417                 lz_mf_free(c->mf);
2418                 FREE(c->optimum);
2419                 FREE(c->cached_matches);
2420                 FREE(c);
2421         }
2422 }
2423
2424 const struct compressor_ops lzx_compressor_ops = {
2425         .get_needed_memory  = lzx_get_needed_memory,
2426         .create_compressor  = lzx_create_compressor,
2427         .compress           = lzx_compress,
2428         .free_compressor    = lzx_free_compressor,
2429 };