wimlib.net Git - wimlib/blob - src/lzx-compress.c

   1 /*
   2  * lzx-compress.c
   3  *
   4  * A compressor that produces output compatible with the LZX compression format.
   5  */
   6
   7 /*
   8  * Copyright (C) 2012, 2013, 2014 Eric Biggers
   9  *
  10  * This file is free software; you can redistribute it and/or modify it under
  11  * the terms of the GNU Lesser General Public License as published by the Free
  12  * Software Foundation; either version 3 of the License, or (at your option) any
  13  * later version.
  14  *
  15  * This file is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  18  * details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this file; if not, see http://www.gnu.org/licenses/.
  22  */
  23
  24
  25 /*
  26  * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
  27  * compression format, as used in the WIM (Windows IMaging) file format.
  28  *
  29  * Two different parsing algorithms are implemented: "near-optimal" and "lazy".
  30  * "Near-optimal" is significantly slower than "lazy", but results in a better
  31  * compression ratio.  The "near-optimal" algorithm is used at the default
  32  * compression level.
  33  *
  34  * This file may need some slight modifications to be used outside of the WIM
  35  * format.  In particular, in other situations the LZX block header might be
  36  * slightly different, and a sliding window rather than a fixed-size window
  37  * might be required.
  38  *
  39  * Note: LZX is a compression format derived from DEFLATE, the format used by
  40  * zlib and gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding.
  41  * Certain details are quite similar, such as the method for storing Huffman
  42  * codes.  However, the main differences are:
  43  *
  44  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  45  *   compressible before attempting to compress it further.
  46  *
  47  * - LZX uses a "main" alphabet which combines literals and matches, with the
  48  *   match symbols containing a "length header" (giving all or part of the match
  49  *   length) and an "offset slot" (giving, roughly speaking, the order of
  50  *   magnitude of the match offset).
  51  *
  52  * - LZX does not have static Huffman blocks (that is, the kind with preset
  53  *   Huffman codes); however it does have two types of dynamic Huffman blocks
  54  *   ("verbatim" and "aligned").
  55  *
  56  * - LZX has a minimum match length of 2 rather than 3.  Length 2 matches can be
  57  *   useful, but generally only if the parser is smart about choosing them.
  58  *
  59  * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
  60  *   of match offsets.  This is very useful for certain types of files, such as
  61  *   binary files that have repeating records.
  62  */
  63
  64 #ifdef HAVE_CONFIG_H
  65 #  include "config.h"
  66 #endif
  67
  68 #include "wimlib/compress_common.h"
  69 #include "wimlib/compressor_ops.h"
  70 #include "wimlib/endianness.h"
  71 #include "wimlib/error.h"
  72 #include "wimlib/lz_mf.h"
  73 #include "wimlib/lz_repsearch.h"
  74 #include "wimlib/lzx.h"
  75 #include "wimlib/util.h"
  76
  77 #include <string.h>
  78 #include <limits.h>
  79
  80 #define LZX_OPTIM_ARRAY_LENGTH  4096
  81
  82 #define LZX_DIV_BLOCK_SIZE      32768
  83
  84 #define LZX_CACHE_PER_POS       8
  85
  86 #define LZX_MAX_MATCHES_PER_POS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
  87
  88 #define LZX_CACHE_LEN (LZX_DIV_BLOCK_SIZE * (LZX_CACHE_PER_POS + 1))
  89
  90 struct lzx_compressor;
  91
  92 /* Codewords for the LZX Huffman codes.  */
  93 struct lzx_codewords {
  94         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
  95         u32 len[LZX_LENCODE_NUM_SYMBOLS];
  96         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
  97 };
  98
  99 /* Codeword lengths (in bits) for the LZX Huffman codes.
 100  * A zero length means the corresponding codeword has zero frequency.  */
 101 struct lzx_lens {
 102         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 103         u8 len[LZX_LENCODE_NUM_SYMBOLS];
 104         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 105 };
 106
 107 /* Estimated cost, in bits, to output each symbol in the LZX Huffman codes.  */
 108 struct lzx_costs {
 109         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 110         u8 len[LZX_LENCODE_NUM_SYMBOLS];
 111         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 112 };
 113
 114 /* Codewords and lengths for the LZX Huffman codes.  */
 115 struct lzx_codes {
 116         struct lzx_codewords codewords;
 117         struct lzx_lens lens;
 118 };
 119
 120 /* Symbol frequency counters for the LZX Huffman codes.  */
 121 struct lzx_freqs {
 122         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 123         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 124         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 125 };
 126
 127 /* Intermediate LZX match/literal format  */
 128 struct lzx_item {
 129
 130         /* Bits 0  -  9: Main symbol
 131          * Bits 10 - 17: Length symbol
 132          * Bits 18 - 22: Number of extra offset bits
 133          * Bits 23+    : Extra offset bits  */
 134         u64 data;
 135 };
 136
 137 /* Internal compression parameters  */
 138 struct lzx_compressor_params {
 139         u32 (*choose_items_for_block)(struct lzx_compressor *, u32, u32);
 140         u32 num_optim_passes;
 141         enum lz_mf_algo mf_algo;
 142         u32 min_match_length;
 143         u32 nice_match_length;
 144         u32 max_search_depth;
 145 };
 146
 147 /*
 148  * Match chooser position data:
 149  *
 150  * An array of these structures is used during the near-optimal match-choosing
 151  * algorithm.  They correspond to consecutive positions in the window and are
 152  * used to keep track of the cost to reach each position, and the match/literal
 153  * choices that need to be chosen to reach that position.
 154  */
 155 struct lzx_mc_pos_data {
 156
 157         /* The cost, in bits, of the lowest-cost path that has been found to
 158          * reach this position.  This can change as progressively lower cost
 159          * paths are found to reach this position.  */
 160         u32 cost;
 161 #define MC_INFINITE_COST UINT32_MAX
 162
 163         /* The match or literal that was taken to reach this position.  This can
 164          * change as progressively lower cost paths are found to reach this
 165          * position.
 166          *
 167          * This variable is divided into two bitfields.
 168          *
 169          * Literals:
 170          *      Low bits are 1, high bits are the literal.
 171          *
 172          * Explicit offset matches:
 173          *      Low bits are the match length, high bits are the offset plus 2.
 174          *
 175          * Repeat offset matches:
 176          *      Low bits are the match length, high bits are the queue index.
 177          */
 178         u32 mc_item_data;
 179 #define MC_OFFSET_SHIFT 9
 180 #define MC_LEN_MASK ((1 << MC_OFFSET_SHIFT) - 1)
 181
 182         /* The state of the LZX recent match offsets queue at this position.
 183          * This is filled in lazily, only after the minimum-cost path to this
 184          * position is found.
 185          *
 186          * Note: the way we handle this adaptive state in the "minimum-cost"
 187          * parse is actually only an approximation.  It's possible for the
 188          * globally optimal, minimum cost path to contain a prefix, ending at a
 189          * position, where that path prefix is *not* the minimum cost path to
 190          * that position.  This can happen if such a path prefix results in a
 191          * different adaptive state which results in lower costs later.  We do
 192          * not solve this problem; we only consider the lowest cost to reach
 193          * each position, which seems to be an acceptable approximation.  */
 194         struct lzx_lru_queue queue _aligned_attribute(16);
 195
 196 } _aligned_attribute(16);
 197
 198 /* State of the LZX compressor  */
 199 struct lzx_compressor {
 200
 201         /* Internal compression parameters  */
 202         struct lzx_compressor_params params;
 203
 204         /* The preprocessed buffer of data being compressed  */
 205         u8 *cur_window;
 206
 207         /* Number of bytes of data to be compressed, which is the number of
 208          * bytes of data in @cur_window that are actually valid.  */
 209         u32 cur_window_size;
 210
 211         /* log2 order of the LZX window size for LZ match offset encoding
 212          * purposes.  Will be >= LZX_MIN_WINDOW_ORDER and <=
 213          * LZX_MAX_WINDOW_ORDER.
 214          *
 215          * Note: 1 << @window_order is normally equal to @max_window_size,
 216          * a.k.a. the allocated size of @cur_window, but it will be greater than
 217          * @max_window_size in the event that the compressor was created with a
 218          * non-power-of-2 block size.  (See lzx_get_window_order().)  */
 219         unsigned window_order;
 220
 221         /* Number of symbols in the main alphabet.  This depends on
 222          * @window_order, since @window_order determines the maximum possible
 223          * offset.  It does not, however, depend on the *actual* size of the
 224          * current data buffer being processed, which might be less than 1 <<
 225          * @window_order.  */
 226         unsigned num_main_syms;
 227
 228         /* Lempel-Ziv match-finder  */
 229         struct lz_mf *mf;
 230
 231         /* Match-finder wrapper functions and data for near-optimal parsing.
 232          *
 233          * When doing more than one match-choosing pass over the data, matches
 234          * found by the match-finder are cached to achieve a slight speedup when
 235          * the same matches are needed on subsequent passes.  This is suboptimal
 236          * because different matches may be preferred with different cost
 237          * models, but it is a very worthwhile speedup.  */
 238         unsigned (*get_matches_func)(struct lzx_compressor *, const struct lz_match **);
 239         void (*skip_bytes_func)(struct lzx_compressor *, unsigned n);
 240         u32 match_window_pos;
 241         u32 match_window_end;
 242         struct lz_match *cached_matches;
 243         struct lz_match *cache_ptr;
 244         struct lz_match *cache_limit;
 245
 246         /* Position data for near-optimal parsing.  */
 247         struct lzx_mc_pos_data optimum[LZX_OPTIM_ARRAY_LENGTH + LZX_MAX_MATCH_LEN];
 248
 249         /* The cost model currently being used for near-optimal parsing.  */
 250         struct lzx_costs costs;
 251
 252         /* The current match offset LRU queue.  */
 253         struct lzx_lru_queue queue;
 254
 255         /* Frequency counters for the current block.  */
 256         struct lzx_freqs freqs;
 257
 258         /* The Huffman codes for the current and previous blocks.  */
 259         struct lzx_codes codes[2];
 260
 261         /* Which 'struct lzx_codes' is being used for the current block.  The
 262          * other was used for the previous block (if this isn't the first
 263          * block).  */
 264         unsigned int codes_index;
 265
 266         /* Dummy lengths that are always 0.  */
 267         struct lzx_lens zero_lens;
 268
 269         /* Matches/literals that were chosen for the current block.  */
 270         struct lzx_item chosen_items[LZX_DIV_BLOCK_SIZE];
 271
 272         /* Table mapping match offset => offset slot for small offsets  */
 273 #define LZX_NUM_FAST_OFFSETS 32768
 274         u8 offset_slot_fast[LZX_NUM_FAST_OFFSETS];
 275 };
 276
 277 /*
 278  * Structure to keep track of the current state of sending bits to the
 279  * compressed output buffer.
 280  *
 281  * The LZX bitstream is encoded as a sequence of 16-bit coding units.
 282  */
 283 struct lzx_output_bitstream {
 284
 285         /* Bits that haven't yet been written to the output buffer.  */
 286         u32 bitbuf;
 287
 288         /* Number of bits currently held in @bitbuf.  */
 289         u32 bitcount;
 290
 291         /* Pointer to the start of the output buffer.  */
 292         le16 *start;
 293
 294         /* Pointer to the position in the output buffer at which the next coding
 295          * unit should be written.  */
 296         le16 *next;
 297
 298         /* Pointer past the end of the output buffer.  */
 299         le16 *end;
 300 };
 301
 302 /*
 303  * Initialize the output bitstream.
 304  *
 305  * @os
 306  *      The output bitstream structure to initialize.
 307  * @buffer
 308  *      The buffer being written to.
 309  * @size
 310  *      Size of @buffer, in bytes.
 311  */
 312 static void
 313 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, u32 size)
 314 {
 315         os->bitbuf = 0;
 316         os->bitcount = 0;
 317         os->start = buffer;
 318         os->next = os->start;
 319         os->end = os->start + size / sizeof(le16);
 320 }
 321
 322 /*
 323  * Write some bits to the output bitstream.
 324  *
 325  * The bits are given by the low-order @num_bits bits of @bits.  Higher-order
 326  * bits in @bits cannot be set.  At most 17 bits can be written at once.
 327  *
 328  * @max_num_bits is a compile-time constant that specifies the maximum number of
 329  * bits that can ever be written at the call site.  Currently, it is used to
 330  * optimize away the conditional code for writing a second 16-bit coding unit
 331  * when writing fewer than 17 bits.
 332  *
 333  * If the output buffer space is exhausted, then the bits will be ignored, and
 334  * lzx_flush_output() will return 0 when it gets called.
 335  */
 336 static inline void
 337 lzx_write_varbits(struct lzx_output_bitstream *os,
 338                   const u32 bits, const unsigned int num_bits,
 339                   const unsigned int max_num_bits)
 340 {
 341         /* This code is optimized for LZX, which never needs to write more than
 342          * 17 bits at once.  */
 343         LZX_ASSERT(num_bits <= 17);
 344         LZX_ASSERT(num_bits <= max_num_bits);
 345         LZX_ASSERT(os->bitcount <= 15);
 346
 347         /* Add the bits to the bit buffer variable.  @bitcount will be at most
 348          * 15, so there will be just enough space for the maximum possible
 349          * @num_bits of 17.  */
 350         os->bitcount += num_bits;
 351         os->bitbuf = (os->bitbuf << num_bits) | bits;
 352
 353         /* Check whether any coding units need to be written.  */
 354         if (os->bitcount >= 16) {
 355
 356                 os->bitcount -= 16;
 357
 358                 /* Write a coding unit, unless it would overflow the buffer.  */
 359                 if (os->next != os->end)
 360                         put_unaligned_u16_le(os->bitbuf >> os->bitcount, os->next++);
 361
 362                 /* If writing 17 bits, a second coding unit might need to be
 363                  * written.  But because 'max_num_bits' is a compile-time
 364                  * constant, the compiler will optimize away this code at most
 365                  * call sites.  */
 366                 if (max_num_bits == 17 && os->bitcount == 16) {
 367                         if (os->next != os->end)
 368                                 put_unaligned_u16_le(os->bitbuf, os->next++);
 369                         os->bitcount = 0;
 370                 }
 371         }
 372 }
 373
 374 /* Use when @num_bits is a compile-time constant.  Otherwise use
 375  * lzx_write_varbits().  */
 376 static inline void
 377 lzx_write_bits(struct lzx_output_bitstream *os,
 378                const u32 bits, const unsigned int num_bits)
 379 {
 380         lzx_write_varbits(os, bits, num_bits, num_bits);
 381 }
 382
 383 /*
 384  * Flush the last coding unit to the output buffer if needed.  Return the total
 385  * number of bytes written to the output buffer, or 0 if an overflow occurred.
 386  */
 387 static u32
 388 lzx_flush_output(struct lzx_output_bitstream *os)
 389 {
 390         if (os->next == os->end)
 391                 return 0;
 392
 393         if (os->bitcount != 0)
 394                 put_unaligned_u16_le(os->bitbuf << (16 - os->bitcount), os->next++);
 395
 396         return (const u8 *)os->next - (const u8 *)os->start;
 397 }
 398
 399 /* Build the main, length, and aligned offset Huffman codes used in LZX.
 400  *
 401  * This takes as input the frequency tables for each code and produces as output
 402  * a set of tables that map symbols to codewords and codeword lengths.  */
 403 static void
 404 lzx_make_huffman_codes(const struct lzx_freqs *freqs, struct lzx_codes *codes,
 405                        unsigned num_main_syms)
 406 {
 407         make_canonical_huffman_code(num_main_syms,
 408                                     LZX_MAX_MAIN_CODEWORD_LEN,
 409                                     freqs->main,
 410                                     codes->lens.main,
 411                                     codes->codewords.main);
 412
 413         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
 414                                     LZX_MAX_LEN_CODEWORD_LEN,
 415                                     freqs->len,
 416                                     codes->lens.len,
 417                                     codes->codewords.len);
 418
 419         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
 420                                     LZX_MAX_ALIGNED_CODEWORD_LEN,
 421                                     freqs->aligned,
 422                                     codes->lens.aligned,
 423                                     codes->codewords.aligned);
 424 }
 425
 426 static unsigned
 427 lzx_compute_precode_items(const u8 lens[restrict],
 428                           const u8 prev_lens[restrict],
 429                           const unsigned num_lens,
 430                           u32 precode_freqs[restrict],
 431                           unsigned precode_items[restrict])
 432 {
 433         unsigned *itemptr;
 434         unsigned run_start;
 435         unsigned run_end;
 436         unsigned extra_bits;
 437         int delta;
 438         u8 len;
 439
 440         itemptr = precode_items;
 441         run_start = 0;
 442         do {
 443                 /* Find the next run of codeword lengths.  */
 444
 445                 /* len = the length being repeated  */
 446                 len = lens[run_start];
 447
 448                 run_end = run_start + 1;
 449
 450                 /* Fast case for a single length.  */
 451                 if (likely(run_end == num_lens || len != lens[run_end])) {
 452                         delta = prev_lens[run_start] - len;
 453                         if (delta < 0)
 454                                 delta += 17;
 455                         precode_freqs[delta]++;
 456                         *itemptr++ = delta;
 457                         run_start++;
 458                         continue;
 459                 }
 460
 461                 /* Extend the run.  */
 462                 do {
 463                         run_end++;
 464                 } while (run_end != num_lens && len == lens[run_end]);
 465
 466                 if (len == 0) {
 467                         /* Run of zeroes.  */
 468
 469                         /* Symbol 18: RLE 20 to 51 zeroes at a time.  */
 470                         while ((run_end - run_start) >= 20) {
 471                                 extra_bits = min((run_end - run_start) - 20, 0x1f);
 472                                 precode_freqs[18]++;
 473                                 *itemptr++ = 18 | (extra_bits << 5);
 474                                 run_start += 20 + extra_bits;
 475                         }
 476
 477                         /* Symbol 17: RLE 4 to 19 zeroes at a time.  */
 478                         if ((run_end - run_start) >= 4) {
 479                                 extra_bits = min((run_end - run_start) - 4, 0xf);
 480                                 precode_freqs[17]++;
 481                                 *itemptr++ = 17 | (extra_bits << 5);
 482                                 run_start += 4 + extra_bits;
 483                         }
 484                 } else {
 485
 486                         /* A run of nonzero lengths. */
 487
 488                         /* Symbol 19: RLE 4 to 5 of any length at a time.  */
 489                         while ((run_end - run_start) >= 4) {
 490                                 extra_bits = (run_end - run_start) > 4;
 491                                 delta = prev_lens[run_start] - len;
 492                                 if (delta < 0)
 493                                         delta += 17;
 494                                 precode_freqs[19]++;
 495                                 precode_freqs[delta]++;
 496                                 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
 497                                 run_start += 4 + extra_bits;
 498                         }
 499                 }
 500
 501                 /* Output any remaining lengths without RLE.  */
 502                 while (run_start != run_end) {
 503                         delta = prev_lens[run_start] - len;
 504                         if (delta < 0)
 505                                 delta += 17;
 506                         precode_freqs[delta]++;
 507                         *itemptr++ = delta;
 508                         run_start++;
 509                 }
 510         } while (run_start != num_lens);
 511
 512         return itemptr - precode_items;
 513 }
 514
 515 /*
 516  * Output a Huffman code in the compressed form used in LZX.
 517  *
 518  * The Huffman code is represented in the output as a logical series of codeword
 519  * lengths from which the Huffman code, which must be in canonical form, can be
 520  * reconstructed.
 521  *
 522  * The codeword lengths are themselves compressed using a separate Huffman code,
 523  * the "precode", which contains a symbol for each possible codeword length in
 524  * the larger code as well as several special symbols to represent repeated
 525  * codeword lengths (a form of run-length encoding).  The precode is itself
 526  * constructed in canonical form, and its codeword lengths are represented
 527  * literally in 20 4-bit fields that immediately precede the compressed codeword
 528  * lengths of the larger code.
 529  *
 530  * Furthermore, the codeword lengths of the larger code are actually represented
 531  * as deltas from the codeword lengths of the corresponding code in the previous
 532  * block.
 533  *
 534  * @os:
 535  *      Bitstream to which to write the compressed Huffman code.
 536  * @lens:
 537  *      The codeword lengths, indexed by symbol, in the Huffman code.
 538  * @prev_lens:
 539  *      The codeword lengths, indexed by symbol, in the corresponding Huffman
 540  *      code in the previous block, or all zeroes if this is the first block.
 541  * @num_lens:
 542  *      The number of symbols in the Huffman code.
 543  */
 544 static void
 545 lzx_write_compressed_code(struct lzx_output_bitstream *os,
 546                           const u8 lens[restrict],
 547                           const u8 prev_lens[restrict],
 548                           unsigned num_lens)
 549 {
 550         u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
 551         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
 552         u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
 553         unsigned precode_items[num_lens];
 554         unsigned num_precode_items;
 555         unsigned precode_item;
 556         unsigned precode_sym;
 557         unsigned i;
 558
 559         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 560                 precode_freqs[i] = 0;
 561
 562         /* Compute the "items" (RLE / literal tokens and extra bits) with which
 563          * the codeword lengths in the larger code will be output.  */
 564         num_precode_items = lzx_compute_precode_items(lens,
 565                                                       prev_lens,
 566                                                       num_lens,
 567                                                       precode_freqs,
 568                                                       precode_items);
 569
 570         /* Build the precode.  */
 571         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS,
 572                                     LZX_MAX_PRE_CODEWORD_LEN,
 573                                     precode_freqs, precode_lens,
 574                                     precode_codewords);
 575
 576         /* Output the lengths of the codewords in the precode.  */
 577         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 578                 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
 579
 580         /* Output the encoded lengths of the codewords in the larger code.  */
 581         for (i = 0; i < num_precode_items; i++) {
 582                 precode_item = precode_items[i];
 583                 precode_sym = precode_item & 0x1F;
 584                 lzx_write_varbits(os, precode_codewords[precode_sym],
 585                                   precode_lens[precode_sym],
 586                                   LZX_MAX_PRE_CODEWORD_LEN);
 587                 if (precode_sym >= 17) {
 588                         if (precode_sym == 17) {
 589                                 lzx_write_bits(os, precode_item >> 5, 4);
 590                         } else if (precode_sym == 18) {
 591                                 lzx_write_bits(os, precode_item >> 5, 5);
 592                         } else {
 593                                 lzx_write_bits(os, (precode_item >> 5) & 1, 1);
 594                                 precode_sym = precode_item >> 6;
 595                                 lzx_write_varbits(os, precode_codewords[precode_sym],
 596                                                   precode_lens[precode_sym],
 597                                                   LZX_MAX_PRE_CODEWORD_LEN);
 598                         }
 599                 }
 600         }
 601 }
 602
 603 /* Output a match or literal.  */
 604 static inline void
 605 lzx_write_item(struct lzx_output_bitstream *os, struct lzx_item item,
 606                unsigned ones_if_aligned, const struct lzx_codes *codes)
 607 {
 608         u64 data = item.data;
 609         unsigned main_symbol;
 610         unsigned len_symbol;
 611         unsigned num_extra_bits;
 612         u32 extra_bits;
 613
 614         main_symbol = data & 0x3FF;
 615
 616         lzx_write_varbits(os, codes->codewords.main[main_symbol],
 617                           codes->lens.main[main_symbol],
 618                           LZX_MAX_MAIN_CODEWORD_LEN);
 619
 620         if (main_symbol < LZX_NUM_CHARS)  /* Literal?  */
 621                 return;
 622
 623         len_symbol = (data >> 10) & 0xFF;
 624
 625         if (len_symbol != LZX_LENCODE_NUM_SYMBOLS) {
 626                 lzx_write_varbits(os, codes->codewords.len[len_symbol],
 627                                   codes->lens.len[len_symbol],
 628                                   LZX_MAX_LEN_CODEWORD_LEN);
 629         }
 630
 631         num_extra_bits = (data >> 18) & 0x1F;
 632         if (num_extra_bits == 0)  /* Small offset or repeat offset match?  */
 633                 return;
 634
 635         extra_bits = data >> 23;
 636
 637         /*if (block_type == LZX_BLOCKTYPE_ALIGNED && num_extra_bits >= 3) {*/
 638         if ((num_extra_bits & ones_if_aligned) >= 3) {
 639
 640                 /* Aligned offset blocks: The low 3 bits of the extra offset
 641                  * bits are Huffman-encoded using the aligned offset code.  The
 642                  * remaining bits are output literally.  */
 643
 644                 lzx_write_varbits(os, extra_bits >> 3, num_extra_bits - 3, 14);
 645
 646                 lzx_write_varbits(os, codes->codewords.aligned[extra_bits & 7],
 647                                   codes->lens.aligned[extra_bits & 7],
 648                                   LZX_MAX_ALIGNED_CODEWORD_LEN);
 649         } else {
 650                 /* Verbatim blocks, or fewer than 3 extra bits:  All extra
 651                  * offset bits are output literally.  */
 652                 lzx_write_varbits(os, extra_bits, num_extra_bits, 17);
 653         }
 654 }
 655
 656 /*
 657  * Write all matches and literal bytes (which were precomputed) in an LZX
 658  * compressed block to the output bitstream in the final compressed
 659  * representation.
 660  *
 661  * @os
 662  *      The output bitstream.
 663  * @block_type
 664  *      The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
 665  *      LZX_BLOCKTYPE_VERBATIM).
 666  * @items
 667  *      The array of matches/literals to output.
 668  * @num_items
 669  *      Number of matches/literals to output (length of @items).
 670  * @codes
 671  *      The main, length, and aligned offset Huffman codes for the current
 672  *      LZX compressed block.
 673  */
 674 static void
 675 lzx_write_items(struct lzx_output_bitstream *os, int block_type,
 676                 const struct lzx_item items[], u32 num_items,
 677                 const struct lzx_codes *codes)
 678 {
 679         unsigned ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
 680
 681         for (u32 i = 0; i < num_items; i++)
 682                 lzx_write_item(os, items[i], ones_if_aligned, codes);
 683 }
 684
 685 /* Write an LZX aligned offset or verbatim block to the output bitstream.  */
 686 static void
 687 lzx_write_compressed_block(int block_type,
 688                            u32 block_size,
 689                            unsigned window_order,
 690                            unsigned num_main_syms,
 691                            struct lzx_item * chosen_items,
 692                            u32 num_chosen_items,
 693                            const struct lzx_codes * codes,
 694                            const struct lzx_lens * prev_lens,
 695                            struct lzx_output_bitstream * os)
 696 {
 697         LZX_ASSERT(block_type == LZX_BLOCKTYPE_ALIGNED ||
 698                    block_type == LZX_BLOCKTYPE_VERBATIM);
 699
 700         /* The first three bits indicate the type of block and are one of the
 701          * LZX_BLOCKTYPE_* constants.  */
 702         lzx_write_bits(os, block_type, 3);
 703
 704         /* Output the block size.
 705          *
 706          * The original LZX format seemed to always encode the block size in 3
 707          * bytes.  However, the implementation in WIMGAPI, as used in WIM files,
 708          * uses the first bit to indicate whether the block is the default size
 709          * (32768) or a different size given explicitly by the next 16 bits.
 710          *
 711          * By default, this compressor uses a window size of 32768 and therefore
 712          * follows the WIMGAPI behavior.  However, this compressor also supports
 713          * window sizes greater than 32768 bytes, which do not appear to be
 714          * supported by WIMGAPI.  In such cases, we retain the default size bit
 715          * to mean a size of 32768 bytes but output non-default block size in 24
 716          * bits rather than 16.  The compatibility of this behavior is unknown
 717          * because WIMs created with chunk size greater than 32768 can seemingly
 718          * only be opened by wimlib anyway.  */
 719         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
 720                 lzx_write_bits(os, 1, 1);
 721         } else {
 722                 lzx_write_bits(os, 0, 1);
 723
 724                 if (window_order >= 16)
 725                         lzx_write_bits(os, block_size >> 16, 8);
 726
 727                 lzx_write_bits(os, block_size & 0xFFFF, 16);
 728         }
 729
 730         /* If it's an aligned offset block, output the aligned offset code.  */
 731         if (block_type == LZX_BLOCKTYPE_ALIGNED) {
 732                 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
 733                         lzx_write_bits(os, codes->lens.aligned[i],
 734                                        LZX_ALIGNEDCODE_ELEMENT_SIZE);
 735                 }
 736         }
 737
 738         /* Output the main code (two parts).  */
 739         lzx_write_compressed_code(os, codes->lens.main,
 740                                   prev_lens->main,
 741                                   LZX_NUM_CHARS);
 742         lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
 743                                   prev_lens->main + LZX_NUM_CHARS,
 744                                   num_main_syms - LZX_NUM_CHARS);
 745
 746         /* Output the length code.  */
 747         lzx_write_compressed_code(os, codes->lens.len,
 748                                   prev_lens->len,
 749                                   LZX_LENCODE_NUM_SYMBOLS);
 750
 751         /* Output the compressed matches and literals.  */
 752         lzx_write_items(os, block_type, chosen_items, num_chosen_items, codes);
 753 }
 754
 755 /* Don't allow matches to span the end of an LZX block.  */
 756 static inline unsigned
 757 maybe_truncate_matches(struct lz_match matches[], unsigned num_matches,
 758                        struct lzx_compressor *c)
 759 {
 760         if (c->match_window_end < c->cur_window_size && num_matches != 0) {
 761                 u32 limit = c->match_window_end - c->match_window_pos;
 762
 763                 if (limit >= LZX_MIN_MATCH_LEN) {
 764
 765                         unsigned i = num_matches - 1;
 766                         do {
 767                                 if (matches[i].len >= limit) {
 768                                         matches[i].len = limit;
 769
 770                                         /* Truncation might produce multiple
 771                                          * matches with length 'limit'.  Keep at
 772                                          * most 1.  */
 773                                         num_matches = i + 1;
 774                                 }
 775                         } while (i--);
 776                 } else {
 777                         num_matches = 0;
 778                 }
 779         }
 780         return num_matches;
 781 }
 782
 783 static unsigned
 784 lzx_get_matches_fillcache_singleblock(struct lzx_compressor *c,
 785                                       const struct lz_match **matches_ret)
 786 {
 787         struct lz_match *cache_ptr;
 788         struct lz_match *matches;
 789         unsigned num_matches;
 790
 791         cache_ptr = c->cache_ptr;
 792         matches = cache_ptr + 1;
 793         if (likely(cache_ptr <= c->cache_limit)) {
 794                 num_matches = lz_mf_get_matches(c->mf, matches);
 795                 cache_ptr->len = num_matches;
 796                 c->cache_ptr = matches + num_matches;
 797         } else {
 798                 num_matches = 0;
 799         }
 800         c->match_window_pos++;
 801         *matches_ret = matches;
 802         return num_matches;
 803 }
 804
 805 static unsigned
 806 lzx_get_matches_fillcache_multiblock(struct lzx_compressor *c,
 807                                      const struct lz_match **matches_ret)
 808 {
 809         struct lz_match *cache_ptr;
 810         struct lz_match *matches;
 811         unsigned num_matches;
 812
 813         cache_ptr = c->cache_ptr;
 814         matches = cache_ptr + 1;
 815         if (likely(cache_ptr <= c->cache_limit)) {
 816                 num_matches = lz_mf_get_matches(c->mf, matches);
 817                 num_matches = maybe_truncate_matches(matches, num_matches, c);
 818                 cache_ptr->len = num_matches;
 819                 c->cache_ptr = matches + num_matches;
 820         } else {
 821                 num_matches = 0;
 822         }
 823         c->match_window_pos++;
 824         *matches_ret = matches;
 825         return num_matches;
 826 }
 827
 828 static unsigned
 829 lzx_get_matches_usecache(struct lzx_compressor *c,
 830                          const struct lz_match **matches_ret)
 831 {
 832         struct lz_match *cache_ptr;
 833         struct lz_match *matches;
 834         unsigned num_matches;
 835
 836         cache_ptr = c->cache_ptr;
 837         matches = cache_ptr + 1;
 838         if (cache_ptr <= c->cache_limit) {
 839                 num_matches = cache_ptr->len;
 840                 c->cache_ptr = matches + num_matches;
 841         } else {
 842                 num_matches = 0;
 843         }
 844         c->match_window_pos++;
 845         *matches_ret = matches;
 846         return num_matches;
 847 }
 848
 849 static unsigned
 850 lzx_get_matches_usecache_nocheck(struct lzx_compressor *c,
 851                                  const struct lz_match **matches_ret)
 852 {
 853         struct lz_match *cache_ptr;
 854         struct lz_match *matches;
 855         unsigned num_matches;
 856
 857         cache_ptr = c->cache_ptr;
 858         matches = cache_ptr + 1;
 859         num_matches = cache_ptr->len;
 860         c->cache_ptr = matches + num_matches;
 861         c->match_window_pos++;
 862         *matches_ret = matches;
 863         return num_matches;
 864 }
 865
 866 static unsigned
 867 lzx_get_matches_nocache_singleblock(struct lzx_compressor *c,
 868                                     const struct lz_match **matches_ret)
 869 {
 870         struct lz_match *matches;
 871         unsigned num_matches;
 872
 873         matches = c->cache_ptr;
 874         num_matches = lz_mf_get_matches(c->mf, matches);
 875         c->match_window_pos++;
 876         *matches_ret = matches;
 877         return num_matches;
 878 }
 879
 880 static unsigned
 881 lzx_get_matches_nocache_multiblock(struct lzx_compressor *c,
 882                                    const struct lz_match **matches_ret)
 883 {
 884         struct lz_match *matches;
 885         unsigned num_matches;
 886
 887         matches = c->cache_ptr;
 888         num_matches = lz_mf_get_matches(c->mf, matches);
 889         num_matches = maybe_truncate_matches(matches, num_matches, c);
 890         c->match_window_pos++;
 891         *matches_ret = matches;
 892         return num_matches;
 893 }
 894
 895 /*
 896  * Find matches at the next position in the window.
 897  *
 898  * This uses a wrapper function around the underlying match-finder.
 899  *
 900  * Returns the number of matches found and sets *matches_ret to point to the
 901  * matches array.  The matches will be sorted by strictly increasing length and
 902  * offset.
 903  */
 904 static inline unsigned
 905 lzx_get_matches(struct lzx_compressor *c, const struct lz_match **matches_ret)
 906 {
 907         return (*c->get_matches_func)(c, matches_ret);
 908 }
 909
 910 static void
 911 lzx_skip_bytes_fillcache(struct lzx_compressor *c, unsigned n)
 912 {
 913         struct lz_match *cache_ptr;
 914
 915         cache_ptr = c->cache_ptr;
 916         c->match_window_pos += n;
 917         lz_mf_skip_positions(c->mf, n);
 918         if (cache_ptr <= c->cache_limit) {
 919                 do {
 920                         cache_ptr->len = 0;
 921                         cache_ptr += 1;
 922                 } while (--n && cache_ptr <= c->cache_limit);
 923         }
 924         c->cache_ptr = cache_ptr;
 925 }
 926
 927 static void
 928 lzx_skip_bytes_usecache(struct lzx_compressor *c, unsigned n)
 929 {
 930         struct lz_match *cache_ptr;
 931
 932         cache_ptr = c->cache_ptr;
 933         c->match_window_pos += n;
 934         if (cache_ptr <= c->cache_limit) {
 935                 do {
 936                         cache_ptr += 1 + cache_ptr->len;
 937                 } while (--n && cache_ptr <= c->cache_limit);
 938         }
 939         c->cache_ptr = cache_ptr;
 940 }
 941
 942 static void
 943 lzx_skip_bytes_usecache_nocheck(struct lzx_compressor *c, unsigned n)
 944 {
 945         struct lz_match *cache_ptr;
 946
 947         cache_ptr = c->cache_ptr;
 948         c->match_window_pos += n;
 949         do {
 950                 cache_ptr += 1 + cache_ptr->len;
 951         } while (--n);
 952         c->cache_ptr = cache_ptr;
 953 }
 954
 955 static void
 956 lzx_skip_bytes_nocache(struct lzx_compressor *c, unsigned n)
 957 {
 958         c->match_window_pos += n;
 959         lz_mf_skip_positions(c->mf, n);
 960 }
 961
 962 /*
 963  * Skip the specified number of positions in the window (don't search for
 964  * matches at them).
 965  *
 966  * This uses a wrapper function around the underlying match-finder.
 967  */
 968 static inline void
 969 lzx_skip_bytes(struct lzx_compressor *c, unsigned n)
 970 {
 971         return (*c->skip_bytes_func)(c, n);
 972 }
 973
 974 /* Tally, and optionally record, the specified literal byte.  */
 975 static inline void
 976 lzx_declare_literal(struct lzx_compressor *c, unsigned literal,
 977                     struct lzx_item **next_chosen_item)
 978 {
 979         unsigned main_symbol = literal;
 980
 981         c->freqs.main[main_symbol]++;
 982
 983         if (next_chosen_item) {
 984                 *(*next_chosen_item)++ = (struct lzx_item) {
 985                         .data = main_symbol,
 986                 };
 987         }
 988 }
 989
 990 /* Tally, and optionally record, the specified repeat offset match.  */
 991 static inline void
 992 lzx_declare_repeat_offset_match(struct lzx_compressor *c,
 993                                 unsigned len, unsigned rep_index,
 994                                 struct lzx_item **next_chosen_item)
 995 {
 996         unsigned len_header;
 997         unsigned main_symbol;
 998         unsigned len_symbol;
 999
1000         if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
1001                 len_header = len - LZX_MIN_MATCH_LEN;
1002                 len_symbol = LZX_LENCODE_NUM_SYMBOLS;
1003         } else {
1004                 len_header = LZX_NUM_PRIMARY_LENS;
1005                 len_symbol = len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS;
1006                 c->freqs.len[len_symbol]++;
1007         }
1008
1009         main_symbol = LZX_NUM_CHARS + ((rep_index << 3) | len_header);
1010
1011         c->freqs.main[main_symbol]++;
1012
1013         if (next_chosen_item) {
1014                 *(*next_chosen_item)++ = (struct lzx_item) {
1015                         .data = (u64)main_symbol | ((u64)len_symbol << 10),
1016                 };
1017         }
1018 }
1019
1020 /* Tally, and optionally record, the specified explicit offset match.  */
1021 static inline void
1022 lzx_declare_explicit_offset_match(struct lzx_compressor *c, unsigned len, u32 offset,
1023                                   struct lzx_item **next_chosen_item)
1024 {
1025         unsigned len_header;
1026         unsigned main_symbol;
1027         unsigned len_symbol;
1028         unsigned offset_slot;
1029         unsigned num_extra_bits;
1030         u32 extra_bits;
1031
1032         if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
1033                 len_header = len - LZX_MIN_MATCH_LEN;
1034                 len_symbol = LZX_LENCODE_NUM_SYMBOLS;
1035         } else {
1036                 len_header = LZX_NUM_PRIMARY_LENS;
1037                 len_symbol = len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS;
1038                 c->freqs.len[len_symbol]++;
1039         }
1040
1041         offset_slot = lzx_get_offset_slot_raw(offset + LZX_OFFSET_OFFSET);
1042
1043         main_symbol = LZX_NUM_CHARS + ((offset_slot << 3) | len_header);
1044
1045         c->freqs.main[main_symbol]++;
1046
1047         if (offset_slot >= 8)
1048                 c->freqs.aligned[(offset + LZX_OFFSET_OFFSET) & 7]++;
1049
1050         if (next_chosen_item) {
1051
1052                 num_extra_bits = lzx_extra_offset_bits[offset_slot];
1053
1054                 extra_bits = (offset + LZX_OFFSET_OFFSET) -
1055                              lzx_offset_slot_base[offset_slot];
1056
1057                 *(*next_chosen_item)++ = (struct lzx_item) {
1058                         .data = (u64)main_symbol |
1059                                 ((u64)len_symbol << 10) |
1060                                 ((u64)num_extra_bits << 18) |
1061                                 ((u64)extra_bits << 23),
1062                 };
1063         }
1064 }
1065
1066 /* Tally, and optionally record, the specified match or literal.  */
1067 static inline void
1068 lzx_declare_item(struct lzx_compressor *c, u32 mc_item_data,
1069                  struct lzx_item **next_chosen_item)
1070 {
1071         u32 len = mc_item_data & MC_LEN_MASK;
1072         u32 offset_data = mc_item_data >> MC_OFFSET_SHIFT;
1073
1074         if (len == 1)
1075                 lzx_declare_literal(c, offset_data, next_chosen_item);
1076         else if (offset_data < LZX_NUM_RECENT_OFFSETS)
1077                 lzx_declare_repeat_offset_match(c, len, offset_data,
1078                                                 next_chosen_item);
1079         else
1080                 lzx_declare_explicit_offset_match(c, len,
1081                                                   offset_data - LZX_OFFSET_OFFSET,
1082                                                   next_chosen_item);
1083 }
1084
1085 static inline void
1086 lzx_record_item_list(struct lzx_compressor *c,
1087                      struct lzx_mc_pos_data *cur_optimum_ptr,
1088                      struct lzx_item **next_chosen_item)
1089 {
1090         struct lzx_mc_pos_data *end_optimum_ptr;
1091         u32 saved_item;
1092         u32 item;
1093
1094         /* The list is currently in reverse order (last item to first item).
1095          * Reverse it.  */
1096         end_optimum_ptr = cur_optimum_ptr;
1097         saved_item = cur_optimum_ptr->mc_item_data;
1098         do {
1099                 item = saved_item;
1100                 cur_optimum_ptr -= item & MC_LEN_MASK;
1101                 saved_item = cur_optimum_ptr->mc_item_data;
1102                 cur_optimum_ptr->mc_item_data = item;
1103         } while (cur_optimum_ptr != c->optimum);
1104
1105         /* Walk the list of items from beginning to end, tallying and recording
1106          * each item.  */
1107         do {
1108                 lzx_declare_item(c, cur_optimum_ptr->mc_item_data, next_chosen_item);
1109                 cur_optimum_ptr += (cur_optimum_ptr->mc_item_data) & MC_LEN_MASK;
1110         } while (cur_optimum_ptr != end_optimum_ptr);
1111 }
1112
1113 static inline void
1114 lzx_tally_item_list(struct lzx_compressor *c, struct lzx_mc_pos_data *cur_optimum_ptr)
1115 {
1116         /* Since we're just tallying the items, we don't need to reverse the
1117          * list.  Processing the items in reverse order is fine.  */
1118         do {
1119                 lzx_declare_item(c, cur_optimum_ptr->mc_item_data, NULL);
1120                 cur_optimum_ptr -= (cur_optimum_ptr->mc_item_data & MC_LEN_MASK);
1121         } while (cur_optimum_ptr != c->optimum);
1122 }
1123
1124 /* Tally, and optionally (if next_chosen_item != NULL) record, in order, all
1125  * items in the current list of items found by the match-chooser.  */
1126 static void
1127 lzx_declare_item_list(struct lzx_compressor *c, struct lzx_mc_pos_data *cur_optimum_ptr,
1128                       struct lzx_item **next_chosen_item)
1129 {
1130         if (next_chosen_item)
1131                 lzx_record_item_list(c, cur_optimum_ptr, next_chosen_item);
1132         else
1133                 lzx_tally_item_list(c, cur_optimum_ptr);
1134 }
1135
1136 /* Set the cost model @c->costs from the Huffman codeword lengths specified in
1137  * @lens.
1138  *
1139  * The cost model and codeword lengths are almost the same thing, but the
1140  * Huffman codewords with length 0 correspond to symbols with zero frequency
1141  * that still need to be assigned actual costs.  The specific values assigned
1142  * are arbitrary, but they should be fairly high (near the maximum codeword
1143  * length) to take into account the fact that uses of these symbols are expected
1144  * to be rare.  */
1145 static void
1146 lzx_set_costs(struct lzx_compressor *c, const struct lzx_lens * lens)
1147 {
1148         unsigned i;
1149
1150         /* Main code  */
1151         for (i = 0; i < c->num_main_syms; i++)
1152                 c->costs.main[i] = lens->main[i] ? lens->main[i] : 15;
1153
1154         /* Length code  */
1155         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1156                 c->costs.len[i] = lens->len[i] ? lens->len[i] : 15;
1157
1158         /* Aligned offset code  */
1159         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1160                 c->costs.aligned[i] = lens->aligned[i] ? lens->aligned[i] : 7;
1161 }
1162
1163 /* Set default LZX Huffman symbol costs to bootstrap the iterative optimization
1164  * algorithm.  */
1165 static void
1166 lzx_set_default_costs(struct lzx_costs * costs, unsigned num_main_syms)
1167 {
1168         unsigned i;
1169
1170         /* Main code (part 1): Literal symbols  */
1171         for (i = 0; i < LZX_NUM_CHARS; i++)
1172                 costs->main[i] = 8;
1173
1174         /* Main code (part 2): Match header symbols  */
1175         for (; i < num_main_syms; i++)
1176                 costs->main[i] = 10;
1177
1178         /* Length code  */
1179         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
1180                 costs->len[i] = 8;
1181
1182         /* Aligned offset code  */
1183         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++)
1184                 costs->aligned[i] = 3;
1185 }
1186
1187 /* Return the cost, in bits, to output a literal byte using the specified cost
1188  * model.  */
1189 static inline u32
1190 lzx_literal_cost(unsigned literal, const struct lzx_costs * costs)
1191 {
1192         return costs->main[literal];
1193 }
1194
1195 /* Return the cost, in bits, to output a match of the specified length and
1196  * offset slot using the specified cost model.  Does not take into account
1197  * extra offset bits.  */
1198 static inline u32
1199 lzx_match_cost_raw(unsigned len, unsigned offset_slot,
1200                    const struct lzx_costs *costs)
1201 {
1202         u32 cost;
1203         unsigned len_header;
1204         unsigned main_symbol;
1205
1206         if (len - LZX_MIN_MATCH_LEN < LZX_NUM_PRIMARY_LENS) {
1207                 len_header = len - LZX_MIN_MATCH_LEN;
1208                 cost = 0;
1209         } else {
1210                 len_header = LZX_NUM_PRIMARY_LENS;
1211
1212                 /* Account for length symbol.  */
1213                 cost = costs->len[len - LZX_MIN_MATCH_LEN - LZX_NUM_PRIMARY_LENS];
1214         }
1215
1216         /* Account for main symbol.  */
1217         main_symbol = LZX_NUM_CHARS + ((offset_slot << 3) | len_header);
1218         cost += costs->main[main_symbol];
1219
1220         return cost;
1221 }
1222
1223 /* Equivalent to lzx_match_cost_raw(), but assumes the length is small enough
1224  * that it doesn't require a length symbol.  */
1225 static inline u32
1226 lzx_match_cost_raw_smalllen(unsigned len, unsigned offset_slot,
1227                             const struct lzx_costs *costs)
1228 {
1229         LZX_ASSERT(len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS);
1230         return costs->main[LZX_NUM_CHARS +
1231                            ((offset_slot << 3) | (len - LZX_MIN_MATCH_LEN))];
1232 }
1233
1234 /*
1235  * Consider coding the match at repeat offset index @rep_idx.  Consider each
1236  * length from the minimum (2) to the full match length (@rep_len).
1237  */
1238 static inline void
1239 lzx_consider_repeat_offset_match(struct lzx_compressor *c,
1240                                  struct lzx_mc_pos_data *cur_optimum_ptr,
1241                                  unsigned rep_len, unsigned rep_idx)
1242 {
1243         u32 base_cost = cur_optimum_ptr->cost;
1244         u32 cost;
1245         unsigned len;
1246
1247 #if 1   /* Optimized version */
1248
1249         if (rep_len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS) {
1250                 /* All lengths being considered are small.  */
1251                 len = 2;
1252                 do {
1253                         cost = base_cost +
1254                                lzx_match_cost_raw_smalllen(len, rep_idx, &c->costs);
1255                         if (cost < (cur_optimum_ptr + len)->cost) {
1256                                 (cur_optimum_ptr + len)->mc_item_data =
1257                                         (rep_idx << MC_OFFSET_SHIFT) | len;
1258                                 (cur_optimum_ptr + len)->cost = cost;
1259                         }
1260                 } while (++len <= rep_len);
1261         } else {
1262                 /* Some lengths being considered are small, and some are big.
1263                  * Start with the optimized loop for small lengths, then switch
1264                  * to the optimized loop for big lengths.  */
1265                 len = 2;
1266                 do {
1267                         cost = base_cost +
1268                                lzx_match_cost_raw_smalllen(len, rep_idx, &c->costs);
1269                         if (cost < (cur_optimum_ptr + len)->cost) {
1270                                 (cur_optimum_ptr + len)->mc_item_data =
1271                                         (rep_idx << MC_OFFSET_SHIFT) | len;
1272                                 (cur_optimum_ptr + len)->cost = cost;
1273                         }
1274                 } while (++len < LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS);
1275
1276                 /* The main symbol is now fixed.  */
1277                 base_cost += c->costs.main[LZX_NUM_CHARS +
1278                                            ((rep_idx << 3) | LZX_NUM_PRIMARY_LENS)];
1279                 do {
1280                         cost = base_cost +
1281                                c->costs.len[len - LZX_MIN_MATCH_LEN -
1282                                             LZX_NUM_PRIMARY_LENS];
1283                         if (cost < (cur_optimum_ptr + len)->cost) {
1284                                 (cur_optimum_ptr + len)->mc_item_data =
1285                                         (rep_idx << MC_OFFSET_SHIFT) | len;
1286                                 (cur_optimum_ptr + len)->cost = cost;
1287                         }
1288                 } while (++len <= rep_len);
1289         }
1290
1291 #else   /* Unoptimized version  */
1292
1293         len = 2;
1294         do {
1295                 cost = base_cost +
1296                        lzx_match_cost_raw(len, rep_idx, &c->costs);
1297                 if (cost < (cur_optimum_ptr + len)->cost) {
1298                         (cur_optimum_ptr + len)->mc_item_data =
1299                                 (rep_idx << MC_OFFSET_SHIFT) | len;
1300                         (cur_optimum_ptr + len)->cost = cost;
1301                 }
1302         } while (++len <= rep_len);
1303 #endif
1304 }
1305
1306 /*
1307  * Consider coding each match in @matches as an explicit offset match.
1308  *
1309  * @matches must be sorted by strictly increasing length and strictly
1310  * increasing offset.  This is guaranteed by the match-finder.
1311  *
1312  * We consider each length from the minimum (2) to the longest
1313  * (matches[num_matches - 1].len).  For each length, we consider only
1314  * the smallest offset for which that length is available.  Although
1315  * this is not guaranteed to be optimal due to the possibility of a
1316  * larger offset costing less than a smaller offset to code, this is a
1317  * very useful heuristic.
1318  */
1319 static inline void
1320 lzx_consider_explicit_offset_matches(struct lzx_compressor *c,
1321                                      struct lzx_mc_pos_data *cur_optimum_ptr,
1322                                      const struct lz_match matches[],
1323                                      unsigned num_matches)
1324 {
1325         LZX_ASSERT(num_matches > 0);
1326
1327         unsigned i;
1328         unsigned len;
1329         unsigned offset_slot;
1330         u32 position_cost;
1331         u32 cost;
1332         u32 offset_data;
1333
1334
1335 #if 1   /* Optimized version */
1336
1337         if (matches[num_matches - 1].offset < LZX_NUM_FAST_OFFSETS) {
1338
1339                 /*
1340                  * Offset is small; the offset slot can be looked up directly in
1341                  * c->offset_slot_fast.
1342                  *
1343                  * Additional optimizations:
1344                  *
1345                  * - Since the offset is small, it falls in the exponential part
1346                  *   of the offset slot bases and the number of extra offset
1347                  *   bits can be calculated directly as (offset_slot >> 1) - 1.
1348                  *
1349                  * - Just consider the number of extra offset bits; don't
1350                  *   account for the aligned offset code.  Usually this has
1351                  *   almost no effect on the compression ratio.
1352                  *
1353                  * - Start out in a loop optimized for small lengths.  When the
1354                  *   length becomes high enough that a length symbol will be
1355                  *   needed, jump into a loop optimized for big lengths.
1356                  */
1357
1358                 LZX_ASSERT(offset_slot <= 37); /* for extra bits formula  */
1359
1360                 len = 2;
1361                 i = 0;
1362                 do {
1363                         offset_slot = c->offset_slot_fast[matches[i].offset];
1364                         position_cost = cur_optimum_ptr->cost +
1365                                         ((offset_slot >> 1) - 1);
1366                         offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
1367                         do {
1368                                 if (len >= LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS)
1369                                         goto biglen;
1370                                 cost = position_cost +
1371                                        lzx_match_cost_raw_smalllen(len, offset_slot,
1372                                                                    &c->costs);
1373                                 if (cost < (cur_optimum_ptr + len)->cost) {
1374                                         (cur_optimum_ptr + len)->cost = cost;
1375                                         (cur_optimum_ptr + len)->mc_item_data =
1376                                                 (offset_data << MC_OFFSET_SHIFT) | len;
1377                                 }
1378                         } while (++len <= matches[i].len);
1379                 } while (++i != num_matches);
1380
1381                 return;
1382
1383                 do {
1384                         offset_slot = c->offset_slot_fast[matches[i].offset];
1385         biglen:
1386                         position_cost = cur_optimum_ptr->cost +
1387                                         ((offset_slot >> 1) - 1) +
1388                                         c->costs.main[LZX_NUM_CHARS +
1389                                                       ((offset_slot << 3) |
1390                                                        LZX_NUM_PRIMARY_LENS)];
1391                         offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
1392                         do {
1393                                 cost = position_cost +
1394                                        c->costs.len[len - LZX_MIN_MATCH_LEN -
1395                                                     LZX_NUM_PRIMARY_LENS];
1396                                 if (cost < (cur_optimum_ptr + len)->cost) {
1397                                         (cur_optimum_ptr + len)->cost = cost;
1398                                         (cur_optimum_ptr + len)->mc_item_data =
1399                                                 (offset_data << MC_OFFSET_SHIFT) | len;
1400                                 }
1401                         } while (++len <= matches[i].len);
1402                 } while (++i != num_matches);
1403         } else {
1404                 len = 2;
1405                 i = 0;
1406                 do {
1407                         offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
1408                         offset_slot = lzx_get_offset_slot_raw(offset_data);
1409                         position_cost = cur_optimum_ptr->cost +
1410                                         lzx_extra_offset_bits[offset_slot];
1411                         do {
1412                                 cost = position_cost +
1413                                        lzx_match_cost_raw(len, offset_slot, &c->costs);
1414                                 if (cost < (cur_optimum_ptr + len)->cost) {
1415                                         (cur_optimum_ptr + len)->cost = cost;
1416                                         (cur_optimum_ptr + len)->mc_item_data =
1417                                                 (offset_data << MC_OFFSET_SHIFT) | len;
1418                                 }
1419                         } while (++len <= matches[i].len);
1420                 } while (++i != num_matches);
1421         }
1422
1423 #else   /* Unoptimized version */
1424
1425         unsigned num_extra_bits;
1426
1427         len = 2;
1428         i = 0;
1429         do {
1430                 offset_data = matches[i].offset + LZX_OFFSET_OFFSET;
1431                 position_cost = cur_optimum_ptr->cost;
1432                 offset_slot = lzx_get_offset_slot_raw(offset_data);
1433                 num_extra_bits = lzx_extra_offset_bits[offset_slot];
1434                 if (num_extra_bits >= 3) {
1435                         position_cost += num_extra_bits - 3;
1436                         position_cost += c->costs.aligned[offset_data & 7];
1437                 } else {
1438                         position_cost += num_extra_bits;
1439                 }
1440                 do {
1441                         cost = position_cost +
1442                                lzx_match_cost_raw(len, offset_slot, &c->costs);
1443                         if (cost < (cur_optimum_ptr + len)->cost) {
1444                                 (cur_optimum_ptr + len)->cost = cost;
1445                                 (cur_optimum_ptr + len)->mc_item_data =
1446                                         (offset_data << MC_OFFSET_SHIFT) | len;
1447                         }
1448                 } while (++len <= matches[i].len);
1449         } while (++i != num_matches);
1450 #endif
1451 }
1452
1453 /*
1454  * Search for repeat offset matches with the current position.
1455  */
1456 static inline unsigned
1457 lzx_repsearch(const u8 * const strptr, const u32 bytes_remaining,
1458               const struct lzx_lru_queue *queue, unsigned *rep_max_idx_ret)
1459 {
1460         BUILD_BUG_ON(LZX_NUM_RECENT_OFFSETS != 3);
1461         return lz_repsearch3(strptr, min(bytes_remaining, LZX_MAX_MATCH_LEN),
1462                              queue->R, rep_max_idx_ret);
1463 }
1464
1465 /*
1466  * The main near-optimal parsing routine.
1467  *
1468  * Briefly, the algorithm does an approximate minimum-cost path search to find a
1469  * "near-optimal" sequence of matches and literals to output, based on the
1470  * current cost model.  The algorithm steps forward, position by position (byte
1471  * by byte), and updates the minimum cost path to reach each later position that
1472  * can be reached using a match or literal from the current position.  This is
1473  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1474  * the graph edges are possible matches/literals to code, and the cost of each
1475  * edge is the estimated number of bits that will be required to output the
1476  * corresponding match or literal.  But one difference is that we actually
1477  * compute the lowest-cost path in pieces, where each piece is terminated when
1478  * there are no choices to be made.
1479  *
1480  * This function will run this algorithm on the portion of the window from
1481  * &c->cur_window[c->match_window_pos] to &c->cur_window[c->match_window_end].
1482  *
1483  * On entry, c->queue must be the current state of the match offset LRU queue,
1484  * and c->costs must be the current cost model to use for Huffman symbols.
1485  *
1486  * On exit, c->queue will be the state that the LRU queue would be in if the
1487  * chosen items were to be coded.
1488  *
1489  * If next_chosen_item != NULL, then all items chosen will be recorded (saved in
1490  * the chosen_items array).  Otherwise, all items chosen will only be tallied
1491  * (symbol frequencies tallied in c->freqs).
1492  */
1493 static void
1494 lzx_optim_pass(struct lzx_compressor *c, struct lzx_item **next_chosen_item)
1495 {
1496         const u8 *block_end;
1497         struct lzx_lru_queue *begin_queue;
1498         const u8 *window_ptr;
1499         struct lzx_mc_pos_data *cur_optimum_ptr;
1500         struct lzx_mc_pos_data *end_optimum_ptr;
1501         const struct lz_match *matches;
1502         unsigned num_matches;
1503         unsigned longest_len;
1504         unsigned rep_max_len;
1505         unsigned rep_max_idx;
1506         unsigned literal;
1507         unsigned len;
1508         u32 cost;
1509         u32 offset_data;
1510
1511         block_end = &c->cur_window[c->match_window_end];
1512         begin_queue = &c->queue;
1513 begin:
1514         /* Start building a new list of items, which will correspond to the next
1515          * piece of the overall minimum-cost path.
1516          *
1517          * *begin_queue is the current state of the match offset LRU queue.  */
1518
1519         window_ptr = &c->cur_window[c->match_window_pos];
1520
1521         if (window_ptr == block_end) {
1522                 c->queue = *begin_queue;
1523                 return;
1524         }
1525
1526         cur_optimum_ptr = c->optimum;
1527         cur_optimum_ptr->cost = 0;
1528         cur_optimum_ptr->queue = *begin_queue;
1529
1530         end_optimum_ptr = cur_optimum_ptr;
1531
1532         /* The following loop runs once for each per byte in the window, except
1533          * in a couple shortcut cases.  */
1534         for (;;) {
1535
1536                 /* Find explicit offset matches with the current position.  */
1537                 num_matches = lzx_get_matches(c, &matches);
1538
1539                 if (num_matches) {
1540                         /*
1541                          * Find the longest repeat offset match with the current
1542                          * position.
1543                          *
1544                          * Heuristics:
1545                          *
1546                          * - Only search for repeat offset matches if the
1547                          *   match-finder already found at least one match.
1548                          *
1549                          * - Only consider the longest repeat offset match.  It
1550                          *   seems to be rare for the optimal parse to include a
1551                          *   repeat offset match that doesn't have the longest
1552                          *   length (allowing for the possibility that not all
1553                          *   of that length is actually used).
1554                          */
1555                         rep_max_len = lzx_repsearch(window_ptr,
1556                                                     block_end - window_ptr,
1557                                                     &cur_optimum_ptr->queue,
1558                                                     &rep_max_idx);
1559
1560                         if (rep_max_len) {
1561                                 /* If there's a very long repeat offset match,
1562                                  * choose it immediately.  */
1563                                 if (rep_max_len >= c->params.nice_match_length) {
1564
1565                                         swap(cur_optimum_ptr->queue.R[0],
1566                                              cur_optimum_ptr->queue.R[rep_max_idx]);
1567                                         begin_queue = &cur_optimum_ptr->queue;
1568
1569                                         cur_optimum_ptr += rep_max_len;
1570                                         cur_optimum_ptr->mc_item_data =
1571                                                 (rep_max_idx << MC_OFFSET_SHIFT) |
1572                                                 rep_max_len;
1573
1574                                         lzx_skip_bytes(c, rep_max_len - 1);
1575                                         break;
1576                                 }
1577
1578                                 /* If reaching any positions for the first time,
1579                                  * initialize their costs to "infinity".  */
1580                                 while (end_optimum_ptr < cur_optimum_ptr + rep_max_len)
1581                                         (++end_optimum_ptr)->cost = MC_INFINITE_COST;
1582
1583                                 /* Consider coding a repeat offset match.  */
1584                                 lzx_consider_repeat_offset_match(c,
1585                                                                  cur_optimum_ptr,
1586                                                                  rep_max_len,
1587                                                                  rep_max_idx);
1588                         }
1589
1590                         longest_len = matches[num_matches - 1].len;
1591
1592                         /* If there's a very long explicit offset match, choose
1593                          * it immediately.  */
1594                         if (longest_len >= c->params.nice_match_length) {
1595
1596                                 cur_optimum_ptr->queue.R[2] =
1597                                         cur_optimum_ptr->queue.R[1];
1598                                 cur_optimum_ptr->queue.R[1] =
1599                                         cur_optimum_ptr->queue.R[0];
1600                                 cur_optimum_ptr->queue.R[0] =
1601                                         matches[num_matches - 1].offset;
1602                                 begin_queue = &cur_optimum_ptr->queue;
1603
1604                                 offset_data = matches[num_matches - 1].offset +
1605                                               LZX_OFFSET_OFFSET;
1606                                 cur_optimum_ptr += longest_len;
1607                                 cur_optimum_ptr->mc_item_data =
1608                                         (offset_data << MC_OFFSET_SHIFT) |
1609                                         longest_len;
1610
1611                                 lzx_skip_bytes(c, longest_len - 1);
1612                                 break;
1613                         }
1614
1615                         /* If reaching any positions for the first time,
1616                          * initialize their costs to "infinity".  */
1617                         while (end_optimum_ptr < cur_optimum_ptr + longest_len)
1618                                 (++end_optimum_ptr)->cost = MC_INFINITE_COST;
1619
1620                         /* Consider coding an explicit offset match.  */
1621                         lzx_consider_explicit_offset_matches(c, cur_optimum_ptr,
1622                                                              matches, num_matches);
1623                 } else {
1624                         /* No matches found.  The only choice at this position
1625                          * is to code a literal.  */
1626
1627                         if (end_optimum_ptr == cur_optimum_ptr) {
1628                         #if 1
1629                                 /* Optimization for single literals.  */
1630                                 if (likely(cur_optimum_ptr == c->optimum)) {
1631                                         lzx_declare_literal(c, *window_ptr++,
1632                                                             next_chosen_item);
1633                                         if (window_ptr == block_end) {
1634                                                 c->queue = cur_optimum_ptr->queue;
1635                                                 return;
1636                                         }
1637                                         continue;
1638                                 }
1639                         #endif
1640                                 (++end_optimum_ptr)->cost = MC_INFINITE_COST;
1641                         }
1642                 }
1643
1644                 /* Consider coding a literal.
1645
1646                  * To avoid an extra unpredictable brench, actually checking the
1647                  * preferability of coding a literal is integrated into the
1648                  * queue update code below.  */
1649                 literal = *window_ptr++;
1650                 cost = cur_optimum_ptr->cost + lzx_literal_cost(literal, &c->costs);
1651
1652                 /* Advance to the next position.  */
1653                 cur_optimum_ptr++;
1654
1655                 /* The lowest-cost path to the current position is now known.
1656                  * Finalize the recent offsets queue that results from taking
1657                  * this lowest-cost path.  */
1658
1659                 if (cost < cur_optimum_ptr->cost) {
1660                         /* Literal: queue remains unchanged.  */
1661                         cur_optimum_ptr->cost = cost;
1662                         cur_optimum_ptr->mc_item_data = (literal << MC_OFFSET_SHIFT) | 1;
1663                         cur_optimum_ptr->queue = (cur_optimum_ptr - 1)->queue;
1664                 } else {
1665                         /* Match: queue update is needed.  */
1666                         len = cur_optimum_ptr->mc_item_data & MC_LEN_MASK;
1667                         offset_data = cur_optimum_ptr->mc_item_data >> MC_OFFSET_SHIFT;
1668                         if (offset_data >= LZX_NUM_RECENT_OFFSETS) {
1669                                 /* Explicit offset match: offset is inserted at front  */
1670                                 cur_optimum_ptr->queue.R[0] = offset_data - LZX_OFFSET_OFFSET;
1671                                 cur_optimum_ptr->queue.R[1] = (cur_optimum_ptr - len)->queue.R[0];
1672                                 cur_optimum_ptr->queue.R[2] = (cur_optimum_ptr - len)->queue.R[1];
1673                         } else {
1674                                 /* Repeat offset match: offset is swapped to front  */
1675                                 cur_optimum_ptr->queue = (cur_optimum_ptr - len)->queue;
1676                                 swap(cur_optimum_ptr->queue.R[0],
1677                                      cur_optimum_ptr->queue.R[offset_data]);
1678                         }
1679                 }
1680
1681                 /*
1682                  * This loop will terminate when either of the following
1683                  * conditions is true:
1684                  *
1685                  * (1) cur_optimum_ptr == end_optimum_ptr
1686                  *
1687                  *      There are no paths that extend beyond the current
1688                  *      position.  In this case, any path to a later position
1689                  *      must pass through the current position, so we can go
1690                  *      ahead and choose the list of items that led to this
1691                  *      position.
1692                  *
1693                  * (2) cur_optimum_ptr == &c->optimum[LZX_OPTIM_ARRAY_LENGTH]
1694                  *
1695                  *      This bounds the number of times the algorithm can step
1696                  *      forward before it is guaranteed to start choosing items.
1697                  *      This limits the memory usage.  But
1698                  *      LZX_OPTIM_ARRAY_LENGTH is high enough that on most
1699                  *      inputs this limit is never reached.
1700                  *
1701                  * Note: no check for end-of-block is needed because
1702                  * end-of-block will trigger condition (1).
1703                  */
1704                 if (cur_optimum_ptr == end_optimum_ptr ||
1705                     cur_optimum_ptr == &c->optimum[LZX_OPTIM_ARRAY_LENGTH])
1706                 {
1707                         begin_queue = &cur_optimum_ptr->queue;
1708                         break;
1709                 }
1710         }
1711
1712         /* Choose the current list of items that constitute the minimum-cost
1713          * path to the current position.  */
1714         lzx_declare_item_list(c, cur_optimum_ptr, next_chosen_item);
1715         goto begin;
1716 }
1717
1718 /* Fast heuristic scoring for lazy parsing: how "good" is this match?  */
1719 static inline unsigned
1720 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
1721 {
1722         unsigned score = len;
1723
1724         if (adjusted_offset < 2048)
1725                 score++;
1726
1727         if (adjusted_offset < 1024)
1728                 score++;
1729
1730         return score;
1731 }
1732
1733 static inline unsigned
1734 lzx_repeat_offset_match_score(unsigned len, unsigned slot)
1735 {
1736         return len + 3;
1737 }
1738
1739 /* Lazy parsing  */
1740 static u32
1741 lzx_choose_lazy_items_for_block(struct lzx_compressor *c,
1742                                 u32 block_start_pos, u32 block_size)
1743 {
1744         const u8 *window_ptr;
1745         const u8 *block_end;
1746         struct lz_mf *mf;
1747         struct lz_match *matches;
1748         unsigned num_matches;
1749         unsigned cur_len;
1750         u32 cur_offset_data;
1751         unsigned cur_score;
1752         unsigned rep_max_len;
1753         unsigned rep_max_idx;
1754         unsigned rep_score;
1755         unsigned prev_len;
1756         unsigned prev_score;
1757         u32 prev_offset_data;
1758         unsigned skip_len;
1759         struct lzx_item *next_chosen_item;
1760
1761         window_ptr = &c->cur_window[block_start_pos];
1762         block_end = window_ptr + block_size;
1763         matches = c->cached_matches;
1764         mf = c->mf;
1765         next_chosen_item = c->chosen_items;
1766
1767         prev_len = 0;
1768         prev_offset_data = 0;
1769         prev_score = 0;
1770
1771         while (window_ptr != block_end) {
1772
1773                 /* Find explicit offset matches with the current position.  */
1774                 num_matches = lz_mf_get_matches(mf, matches);
1775                 window_ptr++;
1776
1777                 if (num_matches == 0 ||
1778                     (matches[num_matches - 1].len == 3 &&
1779                      matches[num_matches - 1].offset >= 8192 - LZX_OFFSET_OFFSET &&
1780                      matches[num_matches - 1].offset != c->queue.R[0] &&
1781                      matches[num_matches - 1].offset != c->queue.R[1] &&
1782                      matches[num_matches - 1].offset != c->queue.R[2]))
1783                 {
1784                         /* No match found, or the only match found was a distant
1785                          * length 3 match.  Output the previous match if there
1786                          * is one; otherwise output a literal.  */
1787
1788                 no_match_found:
1789
1790                         if (prev_len) {
1791                                 skip_len = prev_len - 2;
1792                                 goto output_prev_match;
1793                         } else {
1794                                 lzx_declare_literal(c, *(window_ptr - 1),
1795                                                     &next_chosen_item);
1796                                 continue;
1797                         }
1798                 }
1799
1800                 /* Find the longest repeat offset match with the current
1801                  * position.  */
1802                 if (likely(block_end - (window_ptr - 1) >= 2)) {
1803                         rep_max_len = lzx_repsearch((window_ptr - 1),
1804                                                     block_end - (window_ptr - 1),
1805                                                     &c->queue, &rep_max_idx);
1806                 } else {
1807                         rep_max_len = 0;
1808                 }
1809
1810                 cur_len = matches[num_matches - 1].len;
1811                 cur_offset_data = matches[num_matches - 1].offset + LZX_OFFSET_OFFSET;
1812                 cur_score = lzx_explicit_offset_match_score(cur_len, cur_offset_data);
1813
1814                 /* Select the better of the explicit and repeat offset matches.  */
1815                 if (rep_max_len >= 3 &&
1816                     (rep_score = lzx_repeat_offset_match_score(rep_max_len,
1817                                                                rep_max_idx)) >= cur_score)
1818                 {
1819                         cur_len = rep_max_len;
1820                         cur_offset_data = rep_max_idx;
1821                         cur_score = rep_score;
1822                 }
1823
1824                 if (unlikely(cur_len > block_end - (window_ptr - 1))) {
1825                         /* Nearing end of block.  */
1826                         cur_len = block_end - (window_ptr - 1);
1827                         if (cur_len < 3)
1828                                 goto no_match_found;
1829                 }
1830
1831                 if (prev_len == 0 || cur_score > prev_score) {
1832                         /* No previous match, or the current match is better
1833                          * than the previous match.
1834                          *
1835                          * If there's a previous match, then output a literal in
1836                          * its place.
1837                          *
1838                          * In both cases, if the current match is very long,
1839                          * then output it immediately.  Otherwise, attempt a
1840                          * lazy match by waiting to see if there's a better
1841                          * match at the next position.  */
1842
1843                         if (prev_len)
1844                                 lzx_declare_literal(c, *(window_ptr - 2), &next_chosen_item);
1845
1846                         prev_len = cur_len;
1847                         prev_offset_data = cur_offset_data;
1848                         prev_score = cur_score;
1849
1850                         if (prev_len >= c->params.nice_match_length) {
1851                                 skip_len = prev_len - 1;
1852                                 goto output_prev_match;
1853                         }
1854                         continue;
1855                 }
1856
1857                 /* Current match is not better than the previous match, so
1858                  * output the previous match.  */
1859
1860                 skip_len = prev_len - 2;
1861
1862         output_prev_match:
1863                 if (prev_offset_data < LZX_NUM_RECENT_OFFSETS) {
1864                         lzx_declare_repeat_offset_match(c, prev_len,
1865                                                         prev_offset_data,
1866                                                         &next_chosen_item);
1867                         swap(c->queue.R[0], c->queue.R[prev_offset_data]);
1868                 } else {
1869                         lzx_declare_explicit_offset_match(c, prev_len,
1870                                                           prev_offset_data - LZX_OFFSET_OFFSET,
1871                                                           &next_chosen_item);
1872                         c->queue.R[2] = c->queue.R[1];
1873                         c->queue.R[1] = c->queue.R[0];
1874                         c->queue.R[0] = prev_offset_data - LZX_OFFSET_OFFSET;
1875                 }
1876                 lz_mf_skip_positions(mf, skip_len);
1877                 window_ptr += skip_len;
1878                 prev_len = 0;
1879         }
1880
1881         return next_chosen_item - c->chosen_items;
1882 }
1883
1884 /* Given the frequencies of symbols in an LZX-compressed block and the
1885  * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1886  * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1887  * will take fewer bits to output.  */
1888 static int
1889 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1890                                const struct lzx_codes * codes)
1891 {
1892         u32 aligned_cost = 0;
1893         u32 verbatim_cost = 0;
1894
1895         /* A verbatim block requires 3 bits in each place that an aligned symbol
1896          * would be used in an aligned offset block.  */
1897         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1898                 verbatim_cost += 3 * freqs->aligned[i];
1899                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1900         }
1901
1902         /* Account for output of the aligned offset code.  */
1903         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * LZX_ALIGNEDCODE_NUM_SYMBOLS;
1904
1905         if (aligned_cost < verbatim_cost)
1906                 return LZX_BLOCKTYPE_ALIGNED;
1907         else
1908                 return LZX_BLOCKTYPE_VERBATIM;
1909 }
1910
1911 /* Near-optimal parsing  */
1912 static u32
1913 lzx_choose_near_optimal_items_for_block(struct lzx_compressor *c,
1914                                         u32 block_start_pos, u32 block_size)
1915 {
1916         u32 num_passes_remaining = c->params.num_optim_passes;
1917         struct lzx_lru_queue orig_queue;
1918         struct lzx_item *next_chosen_item;
1919         struct lzx_item **next_chosen_item_ptr;
1920
1921         /* Choose appropriate match-finder wrapper functions.  */
1922         if (num_passes_remaining > 1) {
1923                 if (block_size == c->cur_window_size)
1924                         c->get_matches_func = lzx_get_matches_fillcache_singleblock;
1925                 else
1926                         c->get_matches_func = lzx_get_matches_fillcache_multiblock;
1927                 c->skip_bytes_func = lzx_skip_bytes_fillcache;
1928         } else {
1929                 if (block_size == c->cur_window_size)
1930                         c->get_matches_func = lzx_get_matches_nocache_singleblock;
1931                 else
1932                         c->get_matches_func = lzx_get_matches_nocache_multiblock;
1933                 c->skip_bytes_func = lzx_skip_bytes_nocache;
1934         }
1935
1936         /* No matches will extend beyond the end of the block.  */
1937         c->match_window_end = block_start_pos + block_size;
1938
1939         /* The first optimization pass will use a default cost model.  Each
1940          * additional optimization pass will use a cost model computed from the
1941          * previous pass.
1942          *
1943          * To improve performance we only generate the array containing the
1944          * matches and literals in intermediate form on the final pass.  For
1945          * earlier passes, tallying symbol frequencies is sufficient.  */
1946         lzx_set_default_costs(&c->costs, c->num_main_syms);
1947
1948         next_chosen_item_ptr = NULL;
1949         orig_queue = c->queue;
1950         do {
1951                 /* Reset the match-finder wrapper.  */
1952                 c->match_window_pos = block_start_pos;
1953                 c->cache_ptr = c->cached_matches;
1954
1955                 if (num_passes_remaining == 1) {
1956                         /* Last pass: actually generate the items.  */
1957                         next_chosen_item = c->chosen_items;
1958                         next_chosen_item_ptr = &next_chosen_item;
1959                 }
1960
1961                 /* Choose the items.  */
1962                 lzx_optim_pass(c, next_chosen_item_ptr);
1963
1964                 if (num_passes_remaining > 1) {
1965                         /* This isn't the last pass.  */
1966
1967                         /* Make the Huffman codes from the symbol frequencies.  */
1968                         lzx_make_huffman_codes(&c->freqs, &c->codes[c->codes_index],
1969                                                c->num_main_syms);
1970
1971                         /* Update symbol costs.  */
1972                         lzx_set_costs(c, &c->codes[c->codes_index].lens);
1973
1974                         /* Reset symbol frequencies.  */
1975                         memset(&c->freqs, 0, sizeof(c->freqs));
1976
1977                         /* Reset the match offset LRU queue to what it was at
1978                          * the beginning of the block.  */
1979                         c->queue = orig_queue;
1980
1981                         /* Choose appopriate match-finder wrapper functions.  */
1982                         if (c->cache_ptr <= c->cache_limit) {
1983                                 c->get_matches_func = lzx_get_matches_usecache_nocheck;
1984                                 c->skip_bytes_func = lzx_skip_bytes_usecache_nocheck;
1985                         } else {
1986                                 c->get_matches_func = lzx_get_matches_usecache;
1987                                 c->skip_bytes_func = lzx_skip_bytes_usecache;
1988                         }
1989                 }
1990         } while (--num_passes_remaining);
1991
1992         /* Return the number of items chosen.  */
1993         return next_chosen_item - c->chosen_items;
1994 }
1995
1996 /*
1997  * Choose the matches/literals with which to output the block of data beginning
1998  * at '&c->cur_window[block_start_pos]' and extending for 'block_size' bytes.
1999  *
2000  * The frequences of the Huffman symbols in the block will be tallied in
2001  * 'c->freqs'.
2002  *
2003  * 'c->queue' must specify the state of the queue at the beginning of this block.
2004  * This function will update it to the state of the queue at the end of this
2005  * block.
2006  *
2007  * Returns the number of matches/literals that were chosen and written to
2008  * 'c->chosen_items' in the 'struct lzx_item' intermediate representation.
2009  */
2010 static u32
2011 lzx_choose_items_for_block(struct lzx_compressor *c,
2012                            u32 block_start_pos, u32 block_size)
2013 {
2014         return (*c->params.choose_items_for_block)(c, block_start_pos, block_size);
2015 }
2016
2017 /* Initialize c->offset_slot_fast.  */
2018 static void
2019 lzx_init_offset_slot_fast(struct lzx_compressor *c)
2020 {
2021         u8 slot = 0;
2022
2023         for (u32 offset = 0; offset < LZX_NUM_FAST_OFFSETS; offset++) {
2024
2025                 while (offset + LZX_OFFSET_OFFSET >= lzx_offset_slot_base[slot + 1])
2026                         slot++;
2027
2028                 c->offset_slot_fast[offset] = slot;
2029         }
2030 }
2031
2032 /* Set internal compression parameters for the specified compression level and
2033  * maximum window size.  */
2034 static void
2035 lzx_build_params(unsigned int compression_level, u32 max_window_size,
2036                  struct lzx_compressor_params *lzx_params)
2037 {
2038         if (compression_level < 25) {
2039
2040                 /* Fast compression: Use lazy parsing.  */
2041
2042                 lzx_params->choose_items_for_block = lzx_choose_lazy_items_for_block;
2043                 lzx_params->num_optim_passes = 1;
2044
2045                 /* When lazy parsing, the hash chain match-finding algorithm is
2046                  * fastest unless the window is too large.
2047                  *
2048                  * TODO: something like hash arrays would actually be better
2049                  * than binary trees on large windows.  */
2050                 if (max_window_size <= 262144)
2051                         lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
2052                 else
2053                         lzx_params->mf_algo = LZ_MF_BINARY_TREES;
2054
2055                 /* When lazy parsing, don't bother with length 2 matches.  */
2056                 lzx_params->min_match_length = 3;
2057
2058                 /* Scale nice_match_length and max_search_depth with the
2059                  * compression level.  */
2060                 lzx_params->nice_match_length = 25 + compression_level * 2;
2061                 lzx_params->max_search_depth = 25 + compression_level;
2062         } else {
2063
2064                 /* Normal / high compression: Use near-optimal parsing.  */
2065
2066                 lzx_params->choose_items_for_block = lzx_choose_near_optimal_items_for_block;
2067
2068                 /* Set a number of optimization passes appropriate for the
2069                  * compression level.  */
2070
2071                 lzx_params->num_optim_passes = 1;
2072
2073                 if (compression_level >= 40)
2074                         lzx_params->num_optim_passes++;
2075
2076                 /* Use more optimization passes for higher compression levels.
2077                  * But the more passes there are, the less they help --- so
2078                  * don't add them linearly.  */
2079                 if (compression_level >= 70) {
2080                         lzx_params->num_optim_passes++;
2081                         if (compression_level >= 100)
2082                                 lzx_params->num_optim_passes++;
2083                         if (compression_level >= 150)
2084                                 lzx_params->num_optim_passes++;
2085                         if (compression_level >= 200)
2086                                 lzx_params->num_optim_passes++;
2087                         if (compression_level >= 300)
2088                                 lzx_params->num_optim_passes++;
2089                 }
2090
2091                 /* When doing near-optimal parsing, the hash chain match-finding
2092                  * algorithm is good if the window size is small and we're only
2093                  * doing one optimization pass.  Otherwise, the binary tree
2094                  * algorithm is the way to go.  */
2095                 if (max_window_size <= 32768 && lzx_params->num_optim_passes == 1)
2096                         lzx_params->mf_algo = LZ_MF_HASH_CHAINS;
2097                 else
2098                         lzx_params->mf_algo = LZ_MF_BINARY_TREES;
2099
2100                 /* When doing near-optimal parsing, allow length 2 matches if
2101                  * the compression level is sufficiently high.  */
2102                 if (compression_level >= 45)
2103                         lzx_params->min_match_length = 2;
2104                 else
2105                         lzx_params->min_match_length = 3;
2106
2107                 /* Scale nice_match_length and max_search_depth with the
2108                  * compression level.  */
2109                 lzx_params->nice_match_length = min(((u64)compression_level * 32) / 50,
2110                                                     LZX_MAX_MATCH_LEN);
2111                 lzx_params->max_search_depth = min(((u64)compression_level * 50) / 50,
2112                                                    LZX_MAX_MATCH_LEN);
2113         }
2114 }
2115
2116 /* Given the internal compression parameters and maximum window size, build the
2117  * Lempel-Ziv match-finder parameters.  */
2118 static void
2119 lzx_build_mf_params(const struct lzx_compressor_params *lzx_params,
2120                     u32 max_window_size, struct lz_mf_params *mf_params)
2121 {
2122         memset(mf_params, 0, sizeof(*mf_params));
2123
2124         mf_params->algorithm = lzx_params->mf_algo;
2125         mf_params->max_window_size = max_window_size;
2126         mf_params->min_match_len = lzx_params->min_match_length;
2127         mf_params->max_match_len = LZX_MAX_MATCH_LEN;
2128         mf_params->max_search_depth = lzx_params->max_search_depth;
2129         mf_params->nice_match_len = lzx_params->nice_match_length;
2130 }
2131
2132 static void
2133 lzx_free_compressor(void *_c);
2134
2135 static u64
2136 lzx_get_needed_memory(size_t max_block_size, unsigned int compression_level)
2137 {
2138         struct lzx_compressor_params params;
2139         u64 size = 0;
2140         unsigned window_order;
2141         u32 max_window_size;
2142
2143         window_order = lzx_get_window_order(max_block_size);
2144         if (window_order == 0)
2145                 return 0;
2146         max_window_size = max_block_size;
2147
2148         lzx_build_params(compression_level, max_window_size, &params);
2149
2150         size += sizeof(struct lzx_compressor);
2151
2152         /* cur_window */
2153         size += max_window_size;
2154
2155         /* mf */
2156         size += lz_mf_get_needed_memory(params.mf_algo, max_window_size);
2157
2158         /* cached_matches */
2159         if (params.num_optim_passes > 1)
2160                 size += LZX_CACHE_LEN * sizeof(struct lz_match);
2161         else
2162                 size += LZX_MAX_MATCHES_PER_POS * sizeof(struct lz_match);
2163         return size;
2164 }
2165
2166 static int
2167 lzx_create_compressor(size_t max_block_size, unsigned int compression_level,
2168                       void **c_ret)
2169 {
2170         struct lzx_compressor *c;
2171         struct lzx_compressor_params params;
2172         struct lz_mf_params mf_params;
2173         unsigned window_order;
2174         u32 max_window_size;
2175
2176         window_order = lzx_get_window_order(max_block_size);
2177         if (window_order == 0)
2178                 return WIMLIB_ERR_INVALID_PARAM;
2179         max_window_size = max_block_size;
2180
2181         lzx_build_params(compression_level, max_window_size, &params);
2182         lzx_build_mf_params(&params, max_window_size, &mf_params);
2183         if (!lz_mf_params_valid(&mf_params))
2184                 return WIMLIB_ERR_INVALID_PARAM;
2185
2186         c = CALLOC(1, sizeof(struct lzx_compressor));
2187         if (!c)
2188                 goto oom;
2189
2190         c->params = params;
2191         c->num_main_syms = lzx_get_num_main_syms(window_order);
2192         c->window_order = window_order;
2193
2194         /* The window is allocated as 16-byte aligned to speed up memcpy() and
2195          * enable lzx_e8_filter() optimization on x86_64.  */
2196         c->cur_window = ALIGNED_MALLOC(max_window_size, 16);
2197         if (!c->cur_window)
2198                 goto oom;
2199
2200         c->mf = lz_mf_alloc(&mf_params);
2201         if (!c->mf)
2202                 goto oom;
2203
2204         if (params.num_optim_passes > 1) {
2205                 c->cached_matches = MALLOC(LZX_CACHE_LEN *
2206                                            sizeof(struct lz_match));
2207                 if (!c->cached_matches)
2208                         goto oom;
2209                 c->cache_limit = c->cached_matches + LZX_CACHE_LEN -
2210                                  (LZX_MAX_MATCHES_PER_POS + 1);
2211         } else {
2212                 c->cached_matches = MALLOC(LZX_MAX_MATCHES_PER_POS *
2213                                            sizeof(struct lz_match));
2214                 if (!c->cached_matches)
2215                         goto oom;
2216         }
2217
2218         lzx_init_offset_slot_fast(c);
2219
2220         *c_ret = c;
2221         return 0;
2222
2223 oom:
2224         lzx_free_compressor(c);
2225         return WIMLIB_ERR_NOMEM;
2226 }
2227
2228 static size_t
2229 lzx_compress(const void *uncompressed_data, size_t uncompressed_size,
2230              void *compressed_data, size_t compressed_size_avail, void *_c)
2231 {
2232         struct lzx_compressor *c = _c;
2233         struct lzx_output_bitstream os;
2234         u32 num_chosen_items;
2235         const struct lzx_lens *prev_lens;
2236         u32 block_start_pos;
2237         u32 block_size;
2238         int block_type;
2239
2240         /* Don't bother compressing very small inputs.  */
2241         if (uncompressed_size < 100)
2242                 return 0;
2243
2244         /* The input data must be preprocessed.  To avoid changing the original
2245          * input data, copy it to a temporary buffer.  */
2246         memcpy(c->cur_window, uncompressed_data, uncompressed_size);
2247         c->cur_window_size = uncompressed_size;
2248
2249         /* Preprocess the data.  */
2250         lzx_do_e8_preprocessing(c->cur_window, c->cur_window_size);
2251
2252         /* Load the window into the match-finder.  */
2253         lz_mf_load_window(c->mf, c->cur_window, c->cur_window_size);
2254
2255         /* Initialize the match offset LRU queue.  */
2256         lzx_lru_queue_init(&c->queue);
2257
2258         /* Initialize the output bitstream.  */
2259         lzx_init_output(&os, compressed_data, compressed_size_avail);
2260
2261         /* Compress the data block by block.
2262          *
2263          * TODO: The compression ratio could be slightly improved by performing
2264          * data-dependent block splitting instead of using fixed-size blocks.
2265          * Doing so well is a computationally hard problem, however.  */
2266         block_start_pos = 0;
2267         c->codes_index = 0;
2268         prev_lens = &c->zero_lens;
2269         do {
2270                 /* Compute the block size.  */
2271                 block_size = min(LZX_DIV_BLOCK_SIZE,
2272                                  uncompressed_size - block_start_pos);
2273
2274                 /* Reset symbol frequencies.  */
2275                 memset(&c->freqs, 0, sizeof(c->freqs));
2276
2277                 /* Prepare the matches/literals for the block.  */
2278                 num_chosen_items = lzx_choose_items_for_block(c,
2279                                                               block_start_pos,
2280                                                               block_size);
2281
2282                 /* Make the Huffman codes from the symbol frequencies.  */
2283                 lzx_make_huffman_codes(&c->freqs, &c->codes[c->codes_index],
2284                                        c->num_main_syms);
2285
2286                 /* Choose the best block type.
2287                  *
2288                  * Note: we currently don't consider uncompressed blocks.  */
2289                 block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
2290                                                             &c->codes[c->codes_index]);
2291
2292                 /* Write the compressed block to the output buffer.  */
2293                 lzx_write_compressed_block(block_type,
2294                                            block_size,
2295                                            c->window_order,
2296                                            c->num_main_syms,
2297                                            c->chosen_items,
2298                                            num_chosen_items,
2299                                            &c->codes[c->codes_index],
2300                                            prev_lens,
2301                                            &os);
2302
2303                 /* The current codeword lengths become the previous lengths.  */
2304                 prev_lens = &c->codes[c->codes_index].lens;
2305                 c->codes_index ^= 1;
2306
2307                 block_start_pos += block_size;
2308
2309         } while (block_start_pos != uncompressed_size);
2310
2311         return lzx_flush_output(&os);
2312 }
2313
2314 static void
2315 lzx_free_compressor(void *_c)
2316 {
2317         struct lzx_compressor *c = _c;
2318
2319         if (c) {
2320                 ALIGNED_FREE(c->cur_window);
2321                 lz_mf_free(c->mf);
2322                 FREE(c->cached_matches);
2323                 FREE(c);
2324         }
2325 }
2326
2327 const struct compressor_ops lzx_compressor_ops = {
2328         .get_needed_memory  = lzx_get_needed_memory,
2329         .create_compressor  = lzx_create_compressor,
2330         .compress           = lzx_compress,
2331         .free_compressor    = lzx_free_compressor,
2332 };