4 * A compressor for the LZX compression format, as used in WIM archives.
8 * Copyright (C) 2012-2017 Eric Biggers
10 * This file is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU Lesser General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option) any later version.
15 * This file is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this file; if not, see http://www.gnu.org/licenses/.
26 * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
27 * compression format, as used in the WIM (Windows IMaging) file format.
29 * Two different LZX-compatible algorithms are implemented: "near-optimal" and
30 * "lazy". "Near-optimal" is significantly slower than "lazy", but results in a
31 * better compression ratio. The "near-optimal" algorithm is used at the
32 * default compression level.
34 * This file may need some slight modifications to be used outside of the WIM
35 * format. In particular, in other situations the LZX block header might be
36 * slightly different, and sliding window support might be required.
38 * LZX is a compression format closely related to DEFLATE, the format used by zlib and
39 * gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain
40 * details are quite similar, such as the method for storing Huffman codes.
41 * However, the main differences are:
43 * - LZX preprocesses the data in an attempt to make x86 machine code slightly
44 * more compressible before the main compression step.
46 * - LZX uses a "main" alphabet which combines literals and matches, with the
47 * match symbols containing a "length header" (giving all or part of the match
48 * length) and an "offset slot" (giving, roughly speaking, the order of
49 * magnitude of the match offset).
51 * - LZX does not have static Huffman blocks (that is, the kind with preset
52 * Huffman codes); however it does have two types of dynamic Huffman blocks
53 * ("verbatim" and "aligned").
55 * - LZX has a minimum match length of 2 rather than 3. Length 2 matches can be
56 * useful, but generally only if the compressor is smart about choosing them.
58 * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
59 * of match offsets. This is very useful for certain types of files, such as
60 * binary files that have repeating records.
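 *
 * To illustrate the last point with a small, purely illustrative example: in a
 * file of fixed-size records where a field repeats every 24 bytes, the first
 * such match is coded with an explicit offset of 24, which is then pushed onto
 * the LRU queue. Subsequent matches at offset 24 can be coded using offset
 * slot 0 (a repeat offset), which is normally much cheaper than re-sending the
 * explicit offset.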
63 /******************************************************************************/
64 /* General parameters */
65 /*----------------------------------------------------------------------------*/
68 * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL. It
69 * uses the slower algorithm at levels > MAX_FAST_LEVEL.
71 #define MAX_FAST_LEVEL 34
74 * The compressor-side limits on the codeword lengths (in bits) for each Huffman
75 * code. To make outputting bits slightly faster, some of these limits are
76 * lower than the limits defined by the LZX format. This does not significantly
77 * affect the compression ratio.
79 #define MAIN_CODEWORD_LIMIT 16
80 #define LENGTH_CODEWORD_LIMIT 12
81 #define ALIGNED_CODEWORD_LIMIT 7
82 #define PRE_CODEWORD_LIMIT 7
85 /******************************************************************************/
86 /* Block splitting parameters */
87 /*----------------------------------------------------------------------------*/
90 * The compressor always outputs blocks of at least this size in bytes, except
91 * for the last block which may need to be smaller.
93 #define MIN_BLOCK_SIZE 6500
96 * The compressor attempts to end a block when it reaches this size in bytes.
97 * The final size might be slightly larger due to matches extending beyond the
98 * end of the block. Specifically:
100 * - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
101 * bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'.
103 * - The lazy compressor may choose a sequence of literals starting at position
104 * 'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better
105 * matches. The final match may be up to LZX_MAX_MATCH_LEN bytes. The
106 * length of the literal sequence is approximately limited by the "nice match length" parameter.
109 #define SOFT_MAX_BLOCK_SIZE 100000
112 * The number of observed items (matches and literals) that represents
113 * sufficient data for the compressor to decide whether the current block should be ended.
116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 400
119 /******************************************************************************/
120 /* Parameters for slower algorithm */
121 /*----------------------------------------------------------------------------*/
124 * The log base 2 of the number of entries in the hash table for finding length
125 * 2 matches. This could be as high as 16, but using a smaller hash table
126 * speeds up compression due to reduced cache pressure.
128 #define BT_MATCHFINDER_HASH2_ORDER 12
131 * The number of lz_match structures in the match cache, excluding the extra
132 * "overflow" entries. This value should be high enough so that nearly the
133 * time, all matches found in a given block can fit in the match cache.
134 * However, fallback behavior (immediately terminating the block) on cache
135 * overflow is still required.
137 #define CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5)
140 * An upper bound on the number of matches that can ever be saved in the match
141 * cache for a single position. Since each match we save for a single position
142 * has a distinct length, we can use the number of possible match lengths in LZX
143 * as this bound. This bound is guaranteed to be valid in all cases, although
144 * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be reached.
147 #define MAX_MATCHES_PER_POS LZX_NUM_LENS
150 * A scaling factor that makes it possible to consider fractional bit costs. A
151 * single bit has a cost of BIT_COST.
153 * Note: this is only useful as a statistical trick for when the true costs are
154 * unknown. Ultimately, each token in LZX requires a whole number of bits to output.
160 * Should the compressor take into account the costs of aligned offset symbols
161 * instead of assuming that all are equally likely?
163 #define CONSIDER_ALIGNED_COSTS 1
166 * Should the "minimum" cost path search algorithm consider "gap" matches, where
167 * a normal match is followed by a literal, then by a match with the same
168 * offset? This is one specific, somewhat common situation in which the true
169 * minimum cost path is often different from the path found by looking only one step ahead.
172 #define CONSIDER_GAP_MATCHES 1
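/*
 * A purely illustrative example of a "gap match": if the data contains
 * "size=100;" at some earlier position and "size=200;" at the current
 * position, the cheapest encoding is often a match covering "size=", a
 * literal for the changed digit, and then another match covering "00;" at
 * the same offset as the first match (i.e. a rep0 match). A parser that
 * looks only one step ahead tends to miss this pattern.
 */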
174 /******************************************************************************/
176 /*----------------------------------------------------------------------------*/
182 #include "wimlib/compress_common.h"
183 #include "wimlib/compressor_ops.h"
184 #include "wimlib/error.h"
185 #include "wimlib/lz_extend.h"
186 #include "wimlib/lzx_common.h"
187 #include "wimlib/unaligned.h"
188 #include "wimlib/util.h"
190 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including
191 * bt_matchfinder.h. */
193 /* Matchfinders with 16-bit positions */
195 #define MF_SUFFIX _16
196 #include "wimlib/bt_matchfinder.h"
197 #include "wimlib/hc_matchfinder.h"
199 /* Matchfinders with 32-bit positions */
203 #define MF_SUFFIX _32
204 #include "wimlib/bt_matchfinder.h"
205 #include "wimlib/hc_matchfinder.h"
207 /******************************************************************************/
208 /* Compressor structure */
209 /*----------------------------------------------------------------------------*/
211 /* Codewords for the Huffman codes */
212 struct lzx_codewords {
213 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
214 u32 len[LZX_LENCODE_NUM_SYMBOLS];
215 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
219 * Codeword lengths, in bits, for the Huffman codes.
221 * A codeword length of 0 means the corresponding codeword has zero frequency.
223 * The main and length codes each have one extra entry for use as a sentinel.
224 * See lzx_write_compressed_code().
227 u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
228 u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
229 u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
232 /* Codewords and lengths for the Huffman codes */
234 struct lzx_codewords codewords;
235 struct lzx_lens lens;
238 /* Symbol frequency counters for the Huffman-encoded alphabets */
240 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
241 u32 len[LZX_LENCODE_NUM_SYMBOLS];
242 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
245 /* Block split statistics. See the "Block splitting algorithm" section later in
246 * this file for details. */
247 #define NUM_LITERAL_OBSERVATION_TYPES 8
248 #define NUM_MATCH_OBSERVATION_TYPES 2
249 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
250 NUM_MATCH_OBSERVATION_TYPES)
251 struct lzx_block_split_stats {
252 u32 new_observations[NUM_OBSERVATION_TYPES];
253 u32 observations[NUM_OBSERVATION_TYPES];
254 u32 num_new_observations;
255 u32 num_observations;
259 * Represents a run of literals followed by a match or end-of-block. This
260 * structure is needed to temporarily store items chosen by the compressor,
261 * since items cannot be written until all items for the block have been chosen
262 * and the block's Huffman codes have been computed.
264 struct lzx_sequence {
267 * Bits 9..31: the number of literals in this run. This may be 0 and
268 * can be at most about SOFT_MAX_BLOCK_SIZE. The literals are not
269 * stored explicitly in this structure; instead, they are read directly
270 * from the uncompressed data.
272 * Bits 0..8: the length of the match which follows the literals, or 0
273 * if this literal run was the last in the block, so there is no match
274 * which follows it. This can be at most LZX_MAX_MATCH_LEN.
276 u32 litrunlen_and_matchlen;
277 #define SEQ_MATCHLEN_BITS 9
278 #define SEQ_MATCHLEN_MASK (((u32)1 << SEQ_MATCHLEN_BITS) - 1)
281 * If 'matchlen' doesn't indicate end-of-block, then this contains:
283 * Bits 10..31: either the offset plus LZX_OFFSET_ADJUSTMENT or a recent
284 * offset code, depending on the offset slot encoded in the main symbol.
286 * Bits 0..9: the main symbol.
288 u32 adjusted_offset_and_mainsym;
289 #define SEQ_MAINSYM_BITS 10
290 #define SEQ_MAINSYM_MASK (((u32)1 << SEQ_MAINSYM_BITS) - 1)
291 } _aligned_attribute(8);
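/*
 * Purely illustrative sketch (hypothetical helper, not used anywhere): how the
 * two packed fields described above would be unpacked. The same shifts and
 * masks are applied in lzx_write_sequences().
 */
static forceinline void
lzx_sequence_unpack_example(const struct lzx_sequence *seq,
			    u32 *litrunlen, unsigned *matchlen,
			    u32 *adjusted_offset, unsigned *mainsym)
{
	*litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
	*matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
	*adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
	*mainsym = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
}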
294 * This structure represents a byte position in the input buffer and a node in
295 * the graph of possible match/literal choices.
297 * Logically, each incoming edge to this node is labeled with a literal or a
298 * match that can be taken to reach this position from an earlier position; and
299 * each outgoing edge from this node is labeled with a literal or a match that
300 * can be taken to advance from this position to a later position.
302 struct lzx_optimum_node {
304 /* The cost, in bits, of the lowest-cost path that has been found to
305 * reach this position. This can change as progressively lower cost
306 * paths are found to reach this position. */
310 * The best arrival to this node, i.e. the match or literal that was
311 * used to arrive to this position at the given 'cost'. This can change
312 * as progressively lower cost paths are found to reach this position.
314 * For non-gap matches, this variable is divided into two bitfields
315 * whose meanings depend on the item type:
318 * Low bits are 0, high bits are the literal.
320 * Explicit offset matches:
321 * Low bits are the match length, high bits are the offset plus
322 * LZX_OFFSET_ADJUSTMENT.
324 * Repeat offset matches:
325 * Low bits are the match length, high bits are the queue index.
327 * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
328 * behavior applies --- see the code.
331 #define OPTIMUM_OFFSET_SHIFT SEQ_MATCHLEN_BITS
332 #define OPTIMUM_LEN_MASK SEQ_MATCHLEN_MASK
333 #if CONSIDER_GAP_MATCHES
334 # define OPTIMUM_GAP_MATCH 0x80000000
337 } _aligned_attribute(8);
339 /* The cost model for near-optimal parsing */
343 * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a
344 * length 'len' match which has an offset belonging to 'offset_slot'.
345 * The cost includes the main symbol, the length symbol if required, and
346 * the extra offset bits if any, excluding any entropy-coded bits
347 * (aligned offset bits). It does *not* include the cost of the aligned
348 * offset symbol which may be required.
350 u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
352 /* Cost of each symbol in the main code */
353 u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
355 /* Cost of each symbol in the length code */
356 u32 len[LZX_LENCODE_NUM_SYMBOLS];
358 #if CONSIDER_ALIGNED_COSTS
359 /* Cost of each symbol in the aligned offset code */
360 u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
364 struct lzx_output_bitstream;
366 /* The main LZX compressor structure */
367 struct lzx_compressor {
369 /* The buffer for preprocessed input data, if not using destructive compression */
373 /* If true, then the compressor need not preserve the input buffer if it
374 * compresses the data successfully */
377 /* Pointer to the compress() implementation chosen at allocation time */
378 void (*impl)(struct lzx_compressor *, const u8 *, size_t,
379 struct lzx_output_bitstream *);
381 /* The log base 2 of the window size for match offset encoding purposes.
382 * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
383 unsigned window_order;
385 /* The number of symbols in the main alphabet. This depends on the
386 * window order, since the window order determines the maximum possible match offset.
388 unsigned num_main_syms;
390 /* The "nice" match length: if a match of this length is found, then it
391 * is chosen immediately without further consideration. */
392 unsigned nice_match_length;
394 /* The maximum search depth: at most this many potential matches are
395 * considered at each position. */
396 unsigned max_search_depth;
398 /* The number of optimization passes per block */
399 unsigned num_optim_passes;
401 /* The symbol frequency counters for the current block */
402 struct lzx_freqs freqs;
404 /* Block split statistics for the current block */
405 struct lzx_block_split_stats split_stats;
407 /* The Huffman codes for the current and previous blocks. The one with
408 * index 'codes_index' is for the current block, and the other one is
409 * for the previous block. */
410 struct lzx_codes codes[2];
411 unsigned codes_index;
413 /* The matches and literals that the compressor has chosen for the
414 * current block. The required length of this array is limited by the
415 * maximum number of matches that can ever be chosen for a single block,
416 * plus one for the special entry at the end. */
417 struct lzx_sequence chosen_sequences[
418 DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
420 /* Tables for mapping adjusted offsets to offset slots */
421 u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
422 u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
425 /* Data for lzx_compress_lazy() */
427 /* Hash chains matchfinder (MUST BE LAST!!!) */
429 struct hc_matchfinder_16 hc_mf_16;
430 struct hc_matchfinder_32 hc_mf_32;
434 /* Data for lzx_compress_near_optimal() */
437 * Array of nodes, one per position, for running the
438 * minimum-cost path algorithm.
440 * This array must be large enough to accommodate the
441 * worst-case number of nodes, which occurs if the
442 * compressor finds a match of length LZX_MAX_MATCH_LEN
443 * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
444 * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
445 * LZX_MAX_MATCH_LEN'. Add one for the end-of-block node.
448 struct lzx_optimum_node optimum_nodes[
449 SOFT_MAX_BLOCK_SIZE - 1 +
450 LZX_MAX_MATCH_LEN + 1];
452 /* The cost model for the current optimization pass */
453 struct lzx_costs costs;
456 * Cached matches for the current block. This array
457 * contains the matches that were found at each position
458 * in the block. Specifically, for each position, there
459 * is a special 'struct lz_match' whose 'length' field
460 * contains the number of matches that were found at
461 * that position; this is followed by the matches
462 * themselves, if any, sorted by strictly increasing length.
465 * Note: in rare cases, there will be a very high number
466 * of matches in the block and this array will overflow.
467 * If this happens, we force the end of the current
468 * block. CACHE_LENGTH is the length at which we
469 * actually check for overflow. The extra slots beyond
470 * this are enough to absorb the worst case overflow,
471 * which occurs if starting at &match_cache[CACHE_LENGTH
472 * - 1], we write the match count header, then write
473 * MAX_MATCHES_PER_POS matches, then skip searching for
474 * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
475 * write the match count header for each.
477 struct lz_match match_cache[CACHE_LENGTH +
478 MAX_MATCHES_PER_POS +
479 LZX_MAX_MATCH_LEN - 1];
481 /* Binary trees matchfinder (MUST BE LAST!!!) */
483 struct bt_matchfinder_16 bt_mf_16;
484 struct bt_matchfinder_32 bt_mf_32;
490 /******************************************************************************/
491 /* Matchfinder utilities */
492 /*----------------------------------------------------------------------------*/
495 * Will a matchfinder using 16-bit positions be sufficient for compressing
496 * buffers of up to the specified size? The limit could be 65536 bytes, but we
497 * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
498 * This requires that the limit be no more than the length of offset_slot_tab_1 (32768).
501 static forceinline bool
502 lzx_is_16_bit(size_t max_bufsize)
504 STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
505 return max_bufsize <= 32768;
509 * Return the offset slot for the specified adjusted match offset.
511 static forceinline unsigned
512 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
515 if (__builtin_constant_p(adjusted_offset) &&
516 adjusted_offset < LZX_NUM_RECENT_OFFSETS)
517 return adjusted_offset;
518 if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
519 return c->offset_slot_tab_1[adjusted_offset];
520 return c->offset_slot_tab_2[adjusted_offset >> 14];
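/*
 * Example of the lookups above (illustrative): an adjusted offset of 5 is
 * resolved through offset_slot_tab_1[5], while an adjusted offset of 100000
 * is resolved through offset_slot_tab_2[100000 >> 14] = offset_slot_tab_2[6].
 * The '>> 14' indexing works because, as best understood from the format, for
 * adjusted offsets >= 32768 each offset slot covers a range that is a
 * multiple of 2^14 in size and aligned to a 2^14 boundary, so the high bits
 * alone determine the slot.
 */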
524 * For a match that has the specified length and adjusted offset, tally its main
525 * symbol, and if needed its length symbol; then return its main symbol.
527 static forceinline unsigned
528 lzx_tally_main_and_lensyms(struct lzx_compressor *c, unsigned length,
529 u32 adjusted_offset, bool is_16_bit)
533 if (length >= LZX_MIN_SECONDARY_LEN) {
534 /* Length symbol needed */
535 c->freqs.len[length - LZX_MIN_SECONDARY_LEN]++;
536 mainsym = LZX_NUM_CHARS + LZX_NUM_PRIMARY_LENS;
538 /* No length symbol needed */
539 mainsym = LZX_NUM_CHARS + length - LZX_MIN_MATCH_LEN;
542 mainsym += LZX_NUM_LEN_HEADERS *
543 lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
544 c->freqs.main[mainsym]++;
549 * The following macros call either the 16-bit or the 32-bit version of a
550 * matchfinder function based on the value of 'is_16_bit', which will be known
551 * at compilation time.
554 #define CALL_HC_MF(is_16_bit, c, funcname, ...) \
555 ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
556 CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__))
558 #define CALL_BT_MF(is_16_bit, c, funcname, ...) \
559 ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
560 CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__))
562 /******************************************************************************/
563 /* Output bitstream */
564 /*----------------------------------------------------------------------------*/
567 * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
568 * units. Bits are ordered from most significant to least significant within each coding unit.
573 * Structure to keep track of the current state of sending bits to the
574 * compressed output buffer.
576 struct lzx_output_bitstream {
578 /* Bits that haven't yet been written to the output buffer */
579 machine_word_t bitbuf;
581 /* Number of bits currently held in @bitbuf */
582 machine_word_t bitcount;
584 /* Pointer to the start of the output buffer */
587 /* Pointer to the position in the output buffer at which the next coding
588 * unit should be written */
591 /* Pointer to just past the end of the output buffer, rounded down by
592 * one byte if needed to make 'end - start' a multiple of 2 */
596 /* Can the specified number of bits always be added to 'bitbuf' after all
597 * pending 16-bit coding units have been flushed? */
598 #define CAN_BUFFER(n) ((n) <= WORDBITS - 15)
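/*
 * Worked example of the layout described above (illustrative numbers): if the
 * first value written is the 3-bit value 0b101 and the next is the 13-bit
 * value 0b0000000000001, the first 16-bit coding unit is 0b1010000000000001 =
 * 0xA001, which is stored in the output as the bytes 0x01 0xA0 (little
 * endian).
 */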
600 /* Initialize the output bitstream to write to the specified buffer. */
602 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
608 os->end = (u8 *)buffer + (size & ~1);
612 * Add some bits to the bitbuffer variable of the output bitstream. The caller
613 * must make sure there is enough room.
615 static forceinline void
616 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
618 os->bitbuf = (os->bitbuf << num_bits) | bits;
619 os->bitcount += num_bits;
623 * Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits'
624 * specifies the maximum number of bits that may have been added since the last flush.
627 static forceinline void
628 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
630 /* Masking the number of bits to shift is only needed to avoid undefined
631 * behavior; we don't actually care about the results of bad shifts. On
632 * x86, the explicit masking generates no extra code. */
633 const u32 shift_mask = WORDBITS - 1;
635 if (os->end - os->next < 6)
637 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
638 shift_mask), os->next + 0);
639 if (max_num_bits > 16)
640 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
641 shift_mask), os->next + 2);
642 if (max_num_bits > 32)
643 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
644 shift_mask), os->next + 4);
645 os->next += (os->bitcount >> 4) << 1;
649 /* Add at most 16 bits to the bitbuffer and flush it. */
650 static forceinline void
651 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
653 lzx_add_bits(os, bits, num_bits);
654 lzx_flush_bits(os, 16);
658 * Flush the last coding unit to the output buffer if needed. Return the total
659 * number of bytes written to the output buffer, or 0 if an overflow occurred.
662 lzx_flush_output(struct lzx_output_bitstream *os)
664 if (os->end - os->next < 6)
667 if (os->bitcount != 0) {
668 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
672 return os->next - os->start;
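/*
 * Sketch of the intended usage of the bitstream routines (hypothetical
 * caller; the real callers are the lzx_write_*() functions below):
 *
 *	struct lzx_output_bitstream os;
 *
 *	lzx_init_output(&os, out_buf, out_buf_size);
 *	lzx_write_bits(&os, value, num_bits);	(repeated as needed)
 *	...
 *	out_nbytes = lzx_flush_output(&os);	(0 means the buffer overflowed)
 */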
675 /******************************************************************************/
676 /* Preparing Huffman codes */
677 /*----------------------------------------------------------------------------*/
680 * Build the Huffman codes. This takes as input the frequency tables for each
681 * code and produces as output a set of tables that map symbols to codewords and codeword lengths.
685 lzx_build_huffman_codes(struct lzx_compressor *c)
687 const struct lzx_freqs *freqs = &c->freqs;
688 struct lzx_codes *codes = &c->codes[c->codes_index];
690 STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
691 MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
692 make_canonical_huffman_code(c->num_main_syms,
696 codes->codewords.main);
698 STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
699 LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
700 make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
701 LENGTH_CODEWORD_LIMIT,
704 codes->codewords.len);
706 STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
707 ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
708 make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
709 ALIGNED_CODEWORD_LIMIT,
712 codes->codewords.aligned);
715 /* Reset the symbol frequencies for the current block. */
717 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
719 memset(&c->freqs, 0, sizeof(c->freqs));
723 lzx_compute_precode_items(const u8 lens[restrict],
724 const u8 prev_lens[restrict],
725 u32 precode_freqs[restrict],
726 unsigned precode_items[restrict])
735 itemptr = precode_items;
738 while (!((len = lens[run_start]) & 0x80)) {
740 /* len = the length being repeated */
742 /* Find the next run of codeword lengths. */
744 run_end = run_start + 1;
746 /* Fast case for a single length. */
747 if (likely(len != lens[run_end])) {
748 delta = prev_lens[run_start] - len;
751 precode_freqs[delta]++;
757 /* Extend the run. */
760 } while (len == lens[run_end]);
765 /* Symbol 18: RLE 20 to 51 zeroes at a time. */
766 while ((run_end - run_start) >= 20) {
767 extra_bits = min((run_end - run_start) - 20, 0x1F);
769 *itemptr++ = 18 | (extra_bits << 5);
770 run_start += 20 + extra_bits;
773 /* Symbol 17: RLE 4 to 19 zeroes at a time. */
774 if ((run_end - run_start) >= 4) {
775 extra_bits = min((run_end - run_start) - 4, 0xF);
777 *itemptr++ = 17 | (extra_bits << 5);
778 run_start += 4 + extra_bits;
782 /* A run of nonzero lengths. */
784 /* Symbol 19: RLE 4 to 5 of any length at a time. */
785 while ((run_end - run_start) >= 4) {
786 extra_bits = (run_end - run_start) > 4;
787 delta = prev_lens[run_start] - len;
791 precode_freqs[delta]++;
792 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
793 run_start += 4 + extra_bits;
797 /* Output any remaining lengths without RLE. */
798 while (run_start != run_end) {
799 delta = prev_lens[run_start] - len;
802 precode_freqs[delta]++;
808 return itemptr - precode_items;
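/*
 * Worked example (illustrative only): if the previous lengths were all 0 and
 * the current lengths begin with twenty-five 0's followed by the lengths 5
 * and 6, the run of twenty-five 0's becomes a single item with precode symbol
 * 18 and extra_bits = 25 - 20 = 5, while the lengths 5 and 6, not being part
 * of a long enough run, are each emitted as an individual delta-encoded item.
 */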
811 /******************************************************************************/
812 /* Outputting compressed data */
813 /*----------------------------------------------------------------------------*/
816 * Output a Huffman code in the compressed form used in LZX.
818 * The Huffman code is represented in the output as a logical series of codeword
819 * lengths from which the Huffman code, which must be in canonical form, can be reconstructed.
822 * The codeword lengths are themselves compressed using a separate Huffman code,
823 * the "precode", which contains a symbol for each possible codeword length in
824 * the larger code as well as several special symbols to represent repeated
825 * codeword lengths (a form of run-length encoding). The precode is itself
826 * constructed in canonical form, and its codeword lengths are represented
827 * literally in 20 4-bit fields that immediately precede the compressed codeword
828 * lengths of the larger code.
830 * Furthermore, the codeword lengths of the larger code are actually represented
831 * as deltas from the codeword lengths of the corresponding code in the previous block.
835 * Bitstream to which to write the compressed Huffman code.
837 * The codeword lengths, indexed by symbol, in the Huffman code.
839 * The codeword lengths, indexed by symbol, in the corresponding Huffman
840 * code in the previous block, or all zeroes if this is the first block.
842 * The number of symbols in the Huffman code.
845 lzx_write_compressed_code(struct lzx_output_bitstream *os,
846 const u8 lens[restrict],
847 const u8 prev_lens[restrict],
850 u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
851 u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
852 u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
853 unsigned precode_items[num_lens];
854 unsigned num_precode_items;
855 unsigned precode_item;
856 unsigned precode_sym;
858 u8 saved = lens[num_lens];
859 *(u8 *)(lens + num_lens) = 0x80;
861 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
862 precode_freqs[i] = 0;
864 /* Compute the "items" (RLE / literal tokens and extra bits) with which
865 * the codeword lengths in the larger code will be output. */
866 num_precode_items = lzx_compute_precode_items(lens,
871 /* Build the precode. */
872 STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
873 PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
874 make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT,
875 precode_freqs, precode_lens,
878 /* Output the lengths of the codewords in the precode. */
879 for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
880 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
882 /* Output the encoded lengths of the codewords in the larger code. */
883 for (i = 0; i < num_precode_items; i++) {
884 precode_item = precode_items[i];
885 precode_sym = precode_item & 0x1F;
886 lzx_add_bits(os, precode_codewords[precode_sym],
887 precode_lens[precode_sym]);
888 if (precode_sym >= 17) {
889 if (precode_sym == 17) {
890 lzx_add_bits(os, precode_item >> 5, 4);
891 } else if (precode_sym == 18) {
892 lzx_add_bits(os, precode_item >> 5, 5);
894 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
895 precode_sym = precode_item >> 6;
896 lzx_add_bits(os, precode_codewords[precode_sym],
897 precode_lens[precode_sym]);
900 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
901 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
904 *(u8 *)(lens + num_lens) = saved;
908 * Write all matches and literal bytes (which were precomputed) in an LZX
909 * compressed block to the output bitstream in the final compressed representation.
913 * The output bitstream.
915 * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
916 * LZX_BLOCKTYPE_VERBATIM).
918 * The uncompressed data of the block.
920 * The matches and literals to output, given as a series of sequences.
922 * The main, length, and aligned offset Huffman codes for the block.
925 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
926 const u8 *block_data, const struct lzx_sequence sequences[],
927 const struct lzx_codes *codes)
929 const struct lzx_sequence *seq = sequences;
930 unsigned min_aligned_offset_slot;
932 if (block_type == LZX_BLOCKTYPE_ALIGNED)
933 min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
935 min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
938 /* Output the next sequence. */
940 u32 litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
941 unsigned matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
942 STATIC_ASSERT((u32)~SEQ_MATCHLEN_MASK >> SEQ_MATCHLEN_BITS >=
943 SOFT_MAX_BLOCK_SIZE);
945 unsigned main_symbol;
946 unsigned offset_slot;
947 unsigned num_extra_bits;
950 /* Output the literal run of the sequence. */
952 if (litrunlen) { /* Is the literal run nonempty? */
954 /* Verify optimization is enabled on 64-bit */
955 STATIC_ASSERT(WORDBITS < 64 ||
956 CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
958 if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
960 /* 64-bit: write 3 literals at a time. */
961 while (litrunlen >= 3) {
962 unsigned lit0 = block_data[0];
963 unsigned lit1 = block_data[1];
964 unsigned lit2 = block_data[2];
965 lzx_add_bits(os, codes->codewords.main[lit0],
966 codes->lens.main[lit0]);
967 lzx_add_bits(os, codes->codewords.main[lit1],
968 codes->lens.main[lit1]);
969 lzx_add_bits(os, codes->codewords.main[lit2],
970 codes->lens.main[lit2]);
971 lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
976 unsigned lit = *block_data++;
977 lzx_add_bits(os, codes->codewords.main[lit],
978 codes->lens.main[lit]);
980 unsigned lit = *block_data++;
981 lzx_add_bits(os, codes->codewords.main[lit],
982 codes->lens.main[lit]);
983 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
985 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
989 /* 32-bit: write 1 literal at a time. */
991 unsigned lit = *block_data++;
992 lzx_add_bits(os, codes->codewords.main[lit],
993 codes->lens.main[lit]);
994 lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
995 } while (--litrunlen);
999 /* Was this the last literal run? */
1003 /* Nope; output the match. */
1005 block_data += matchlen;
1007 adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
1008 main_symbol = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
1010 offset_slot = (main_symbol - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;
1011 num_extra_bits = lzx_extra_offset_bits[offset_slot];
1012 extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
1013 LZX_OFFSET_ADJUSTMENT);
1015 #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT + \
1016 LENGTH_CODEWORD_LIMIT + \
1017 LZX_MAX_NUM_EXTRA_BITS - \
1018 LZX_NUM_ALIGNED_OFFSET_BITS + \
1019 ALIGNED_CODEWORD_LIMIT)
1021 /* Verify optimization is enabled on 64-bit */
1022 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
1024 /* Output the main symbol for the match. */
1026 lzx_add_bits(os, codes->codewords.main[main_symbol],
1027 codes->lens.main[main_symbol]);
1028 if (!CAN_BUFFER(MAX_MATCH_BITS))
1029 lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
1031 /* If needed, output the length symbol for the match. */
1033 if (matchlen >= LZX_MIN_SECONDARY_LEN) {
1034 lzx_add_bits(os, codes->codewords.len[matchlen -
1035 LZX_MIN_SECONDARY_LEN],
1036 codes->lens.len[matchlen -
1037 LZX_MIN_SECONDARY_LEN]);
1038 if (!CAN_BUFFER(MAX_MATCH_BITS))
1039 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
1042 /* Output the extra offset bits for the match. In aligned
1043 * offset blocks, the lowest 3 bits of the adjusted offset are
1044 * Huffman-encoded using the aligned offset code, provided that
1045 * there are at least 3 extra offset bits required. All other
1046 * extra offset bits are output verbatim. */
1048 if (offset_slot >= min_aligned_offset_slot) {
1050 lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1051 num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1052 if (!CAN_BUFFER(MAX_MATCH_BITS))
1053 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS -
1054 LZX_NUM_ALIGNED_OFFSET_BITS);
1056 lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1057 LZX_ALIGNED_OFFSET_BITMASK],
1058 codes->lens.aligned[adjusted_offset &
1059 LZX_ALIGNED_OFFSET_BITMASK]);
1060 if (!CAN_BUFFER(MAX_MATCH_BITS))
1061 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1063 STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS));
1065 lzx_add_bits(os, extra_bits, num_extra_bits);
1066 if (!CAN_BUFFER(MAX_MATCH_BITS))
1067 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS);
1070 if (CAN_BUFFER(MAX_MATCH_BITS))
1071 lzx_flush_bits(os, MAX_MATCH_BITS);
1073 /* Advance to the next sequence. */
1079 lzx_write_compressed_block(const u8 *block_begin,
1082 unsigned window_order,
1083 unsigned num_main_syms,
1084 const struct lzx_sequence sequences[],
1085 const struct lzx_codes * codes,
1086 const struct lzx_lens * prev_lens,
1087 struct lzx_output_bitstream * os)
1089 /* The first three bits indicate the type of block and are one of the
1090 * LZX_BLOCKTYPE_* constants. */
1091 lzx_write_bits(os, block_type, 3);
1094 * Output the block size.
1096 * The original LZX format encoded the block size in 24 bits. However,
1097 * the LZX format used in WIM archives uses 1 bit to specify whether the
1098 * block has the default size of 32768 bytes, then optionally 16 bits to
1099 * specify a non-default size. This works fine for Microsoft's WIM
1100 * software (WIMGAPI), which never compresses more than 32768 bytes at a
1101 * time with LZX. However, as an extension, our LZX compressor supports
1102 * compressing up to 2097152 bytes, with a corresponding increase in
1103 * window size. It is possible for blocks in these larger buffers to
1104 * exceed 65535 bytes; such blocks cannot have their size represented in 16 bits.
1107 * The chosen solution was to use 24 bits for the block size when
1108 * possibly required --- specifically, when the compressor has been
1109 * allocated to be capable of compressing more than 32768 bytes at once
1110 * (which also causes the number of main symbols to be increased).
1112 if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1113 lzx_write_bits(os, 1, 1);
1115 lzx_write_bits(os, 0, 1);
1117 if (window_order >= 16)
1118 lzx_write_bits(os, block_size >> 16, 8);
1120 lzx_write_bits(os, block_size & 0xFFFF, 16);
1123 /* If it's an aligned offset block, output the aligned offset code. */
1124 if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1125 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1126 lzx_write_bits(os, codes->lens.aligned[i],
1127 LZX_ALIGNEDCODE_ELEMENT_SIZE);
1131 /* Output the main code (two parts). */
1132 lzx_write_compressed_code(os, codes->lens.main,
1135 lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1136 prev_lens->main + LZX_NUM_CHARS,
1137 num_main_syms - LZX_NUM_CHARS);
1139 /* Output the length code. */
1140 lzx_write_compressed_code(os, codes->lens.len,
1142 LZX_LENCODE_NUM_SYMBOLS);
1144 /* Output the compressed matches and literals. */
1145 lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1149 * Given the frequencies of symbols in an LZX-compressed block and the
1150 * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1151 * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1152 * will take fewer bits to output.
1155 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1156 const struct lzx_codes * codes)
1158 u32 verbatim_cost = 0;
1159 u32 aligned_cost = 0;
1161 /* A verbatim block requires 3 bits in each place that an aligned offset
1162 * symbol would be used in an aligned offset block. */
1163 for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1164 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1165 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1168 /* Account for the cost of sending the codeword lengths of the aligned offset code. */
1170 aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1171 LZX_ALIGNEDCODE_NUM_SYMBOLS;
1173 if (aligned_cost < verbatim_cost)
1174 return LZX_BLOCKTYPE_ALIGNED;
1176 return LZX_BLOCKTYPE_VERBATIM;
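/*
 * Numerical illustration (hypothetical figures): if a block contains 1000
 * aligned offset symbols and the aligned offset code gives them an average
 * codeword length of 2.5 bits, the aligned encoding costs about 2500 bits
 * plus the small fixed cost of sending the 8 aligned codeword lengths,
 * versus 3000 bits for the verbatim encoding, so ALIGNED would be chosen.
 */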
1180 * Flush an LZX block:
1182 * 1. Build the Huffman codes.
1183 * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1184 * 3. Write the block.
1185 * 4. Swap the indices of the current and previous Huffman codes.
1187 * Note: we never output UNCOMPRESSED blocks. This probably should be
1188 * implemented sometime, but it doesn't make much difference.
1191 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1192 const u8 *block_begin, u32 block_size, u32 seq_idx)
1196 lzx_build_huffman_codes(c);
1198 block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1199 &c->codes[c->codes_index]);
1200 lzx_write_compressed_block(block_begin,
1205 &c->chosen_sequences[seq_idx],
1206 &c->codes[c->codes_index],
1207 &c->codes[c->codes_index ^ 1].lens,
1209 c->codes_index ^= 1;
1212 /******************************************************************************/
1213 /* Block splitting algorithm */
1214 /*----------------------------------------------------------------------------*/
1217 * The problem of block splitting is to decide when it is worthwhile to start a
1218 * new block with new entropy codes. There is a theoretically optimal solution:
1219 * recursively consider every possible block split, considering the exact cost
1220 * of each block, and choose the minimum cost approach. But this is far too
1221 * slow. Instead, as an approximation, we can count symbols and after every N
1222 * symbols, compare the expected distribution of symbols based on the previous
1223 * data with the actual distribution. If they differ "by enough", then start a new block.
1226 * As an optimization and heuristic, we don't distinguish between every symbol
1227 * but rather we combine many symbols into a single "observation type". For
1228 * literals we only look at the high bits and low bits, and for matches we only
1229 * look at whether the match is long or not. The assumption is that for typical
1230 * "real" data, places that are good block boundaries will tend to be noticeable
1231 * based only on changes in these aggregate frequencies, without looking for
1232 * subtle differences in individual symbols. For example, a change from ASCII
1233 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1234 * to many matches (generally more compressible), would be easily noticed based
1235 * on the aggregates.
1237 * For determining whether the frequency distributions are "different enough" to
1238 * start a new block, the simple heuristic of splitting when the sum of absolute
1239 * differences exceeds a constant seems to be good enough.
1241 * Finally, for an approximation, it is not strictly necessary that the exact
1242 * symbols being used are considered. With "near-optimal parsing", for example,
1243 * the actual symbols that will be used are unknown until after the block
1244 * boundary is chosen and the block has been optimized. Since the final choices
1245 * cannot be used, we can use preliminary "greedy" choices instead.
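/*
 * Worked example of the scaled comparison used below (illustrative numbers):
 * comparing the rate new_observations[i] / num_new_observations against
 * observations[i] / num_observations directly would require a division, so
 * both sides are cross-multiplied instead. With observations[i] = 60 out of
 * num_observations = 1200 (5%) and new_observations[i] = 40 out of
 * num_new_observations = 400 (10%), the code compares 40 * 1200 = 48000
 * against 60 * 400 = 24000, contributing a delta of 24000 in the scaled
 * units.
 */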
1248 /* Initialize the block split statistics when starting a new block. */
1250 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1252 memset(stats, 0, sizeof(*stats));
1255 /* Literal observation. Heuristic: use the top 2 bits and the low bit of the
1256 * literal, for 8 possible literal observation types. */
1257 static forceinline void
1258 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1260 stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1261 stats->num_new_observations++;
1264 /* Match observation. Heuristic: use one observation type for "short match" and
1265 * one observation type for "long match". */
1266 static forceinline void
1267 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1269 stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1270 stats->num_new_observations++;
1274 lzx_should_end_block(struct lzx_block_split_stats *stats)
1276 if (stats->num_observations > 0) {
1278 /* Note: to avoid slow divisions, we do not divide by
1279 * 'num_observations', but rather do all math with the numbers
1280 * multiplied by 'num_observations'. */
1281 u32 total_delta = 0;
1282 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1283 u32 expected = stats->observations[i] *
1284 stats->num_new_observations;
1285 u32 actual = stats->new_observations[i] *
1286 stats->num_observations;
1287 u32 delta = (actual > expected) ? actual - expected :
1289 total_delta += delta;
1292 /* Ready to end the block? */
1294 stats->num_new_observations * 7 / 8 * stats->num_observations)
1298 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1299 stats->num_observations += stats->new_observations[i];
1300 stats->observations[i] += stats->new_observations[i];
1301 stats->new_observations[i] = 0;
1303 stats->num_new_observations = 0;
1307 /******************************************************************************/
1308 /* Slower ("near-optimal") compression algorithm */
1309 /*----------------------------------------------------------------------------*/
1312 * Least-recently-used queue for match offsets.
1314 * This is represented as a 64-bit integer for efficiency. There are three
1315 * offsets of 21 bits each. Bit 64 is garbage.
1317 struct lzx_lru_queue {
1319 } _aligned_attribute(8);
1321 #define LZX_QUEUE_OFFSET_SHIFT 21
1322 #define LZX_QUEUE_OFFSET_MASK (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1324 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1325 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1326 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1328 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1329 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1330 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1332 #define LZX_QUEUE_INITIALIZER { \
1333 ((u64)1 << LZX_QUEUE_R0_SHIFT) | \
1334 ((u64)1 << LZX_QUEUE_R1_SHIFT) | \
1335 ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1337 static forceinline u64
1338 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1340 return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1343 static forceinline u64
1344 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1346 return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1349 static forceinline u64
1350 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1352 return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1355 /* Push a match offset onto the front (most recently used) end of the queue. */
1356 static forceinline struct lzx_lru_queue
1357 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1359 return (struct lzx_lru_queue) {
1360 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1364 /* Swap a match offset to the front of the queue. */
1365 static forceinline struct lzx_lru_queue
1366 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1368 unsigned shift = idx * 21;
1369 const u64 mask = LZX_QUEUE_R0_MASK;
1370 const u64 mask_high = mask << shift;
1372 return (struct lzx_lru_queue) {
1373 (queue.R & ~(mask | mask_high)) |
1374 ((queue.R & mask_high) >> shift) |
1375 ((queue.R & mask) << shift)
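/*
 * Illustrative example of the queue operations above: starting from the
 * initial state (R0, R1, R2) = (1, 1, 1), lzx_lru_queue_push(queue, 300)
 * produces (300, 1, 1); the old R2 simply falls off the end. Applying
 * lzx_lru_queue_swap(queue, 2) to that result exchanges R0 and R2, giving
 * (1, 1, 300).
 */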
1379 static forceinline u32
1380 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1383 struct lzx_sequence *seq =
1384 &c->chosen_sequences[ARRAY_LEN(c->chosen_sequences) - 1];
1385 u32 node_idx = block_size;
1386 u32 litrun_end; /* if record=true: end of the current literal run */
1389 /* The last sequence has matchlen 0 */
1390 seq->litrunlen_and_matchlen = 0;
1391 litrun_end = node_idx;
1397 u32 adjusted_offset;
1400 /* Tally literals until either a match or the beginning of the
1401 * block is reached. Note: the item in the node at the
1402 * beginning of the block (c->optimum_nodes[0]) has all bits
1403 * set, causing this loop to end when it is reached. */
1405 item = c->optimum_nodes[node_idx].item;
1406 if (item & OPTIMUM_LEN_MASK)
1408 c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1412 #if CONSIDER_GAP_MATCHES
1413 if (item & OPTIMUM_GAP_MATCH) {
1416 /* Tally/record the rep0 match after the gap. */
1417 matchlen = item & OPTIMUM_LEN_MASK;
1418 mainsym = lzx_tally_main_and_lensyms(c, matchlen, 0,
1421 seq->litrunlen_and_matchlen |=
1422 (litrun_end - node_idx) <<
1425 seq->litrunlen_and_matchlen = matchlen;
1426 seq->adjusted_offset_and_mainsym = mainsym;
1427 litrun_end = node_idx - matchlen;
1430 /* Tally the literal in the gap. */
1431 c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1433 /* Fall through and tally the match before the gap.
1434 * (It was temporarily saved in the 'cost' field of the
1435 * previous node, which was free to reuse.) */
1436 item = c->optimum_nodes[--node_idx].cost;
1437 node_idx -= matchlen;
1439 #else /* CONSIDER_GAP_MATCHES */
1442 #endif /* !CONSIDER_GAP_MATCHES */
1444 /* Tally/record a match. */
1445 matchlen = item & OPTIMUM_LEN_MASK;
1446 adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
1447 mainsym = lzx_tally_main_and_lensyms(c, matchlen,
1450 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET +
1451 LZX_OFFSET_ADJUSTMENT)
1452 c->freqs.aligned[adjusted_offset &
1453 LZX_ALIGNED_OFFSET_BITMASK]++;
1455 seq->litrunlen_and_matchlen |=
1456 (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
1458 seq->litrunlen_and_matchlen = matchlen;
1459 seq->adjusted_offset_and_mainsym =
1460 (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
1461 litrun_end = node_idx - matchlen;
1463 node_idx -= matchlen;
1466 /* Record the literal run length for the first sequence. */
1468 seq->litrunlen_and_matchlen |=
1469 (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
1472 /* Return the index in chosen_sequences at which the sequences begin. */
1473 return seq - &c->chosen_sequences[0];
1477 * Given the minimum-cost path computed through the item graph for the current
1478 * block, walk the path and count how many of each symbol in each Huffman-coded
1479 * alphabet would be required to output the items (matches and literals) along
1482 * Note that the path will be walked backwards (from the end of the block to the
1483 * beginning of the block), but this doesn't matter because this function only
1484 * computes frequencies.
1486 static forceinline void
1487 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1489 lzx_walk_item_list(c, block_size, is_16_bit, false);
1493 * Like lzx_tally_item_list(), but this function also generates the list of
1494 * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1495 * ready to be output to the bitstream after the Huffman codes are computed.
1496 * The lzx_sequences will be written to decreasing memory addresses as the path
1497 * is walked backwards, which means they will end up in the expected
1498 * first-to-last order. The return value is the index in c->chosen_sequences at
1499 * which the lzx_sequences begin.
1501 static forceinline u32
1502 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1504 return lzx_walk_item_list(c, block_size, is_16_bit, true);
1508 * Find an inexpensive path through the graph of possible match/literal choices
1509 * for the current block. The nodes of the graph are
1510 * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in
1511 * the current block, plus one extra node for end-of-block. The edges of the
1512 * graph are matches and literals. The goal is to find the minimum cost path
1513 * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1516 * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1517 * proceeding forwards one node at a time. At each node, a selection of matches
1518 * (len >= 2), as well as the literal byte (len = 1), is considered. An item of
1519 * length 'len' provides a new path to reach the node 'len' bytes later. If
1520 * such a path is the lowest cost found so far to reach that later node, then
1521 * that later node is updated with the new cost and the "arrival" which provided
1524 * Note that although this algorithm is based on minimum cost path search, due
1525 * to various simplifying assumptions the result is not guaranteed to be the
1526 * true minimum cost, or "optimal", path over the graph of all valid LZX
1527 * representations of this block.
1529 * Also, note that because of the presence of the recent offsets queue (which is
1530 * a type of adaptive state), the algorithm cannot work backwards and compute
1531 * "cost to end" instead of "cost to beginning". Furthermore, the way the
1532 * algorithm handles this adaptive state in the "minimum cost" parse is actually
1533 * only an approximation. It's possible for the globally optimal, minimum cost
1534 * path to contain a prefix, ending at a position, where that path prefix is
1535 * *not* the minimum cost path to that position. This can happen if such a path
1536 * prefix results in a different adaptive state which results in lower costs
1537 * later. The algorithm does not solve this problem in general; it only looks
1538 * one step ahead, with the exception of special consideration for "gap matches".
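 *
 * Schematically, the forward pass described above performs (illustrative
 * pseudo-code, not the actual implementation):
 *
 *	for each position i in the block, in order:
 *		for each candidate item (literal or match) of length 'len' at i:
 *			cost = optimum_nodes[i].cost + cost of the item;
 *			if cost improves on optimum_nodes[i + len].cost:
 *				record 'cost' and the item ("arrival") in
 *				optimum_nodes[i + len];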
1541 static forceinline struct lzx_lru_queue
1542 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1543 const u8 * const restrict block_begin,
1544 const u32 block_size,
1545 const struct lzx_lru_queue initial_queue,
1548 struct lzx_optimum_node *cur_node = c->optimum_nodes;
1549 struct lzx_optimum_node * const end_node = cur_node + block_size;
1550 struct lz_match *cache_ptr = c->match_cache;
1551 const u8 *in_next = block_begin;
1552 const u8 * const block_end = block_begin + block_size;
1555 * Instead of storing the match offset LRU queues in the
1556 * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1557 * storing them in a smaller array. This works because the algorithm
1558 * only requires a limited history of the adaptive state. Once a given
1559 * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1560 * (more if gap match consideration is enabled; we just round up to 512
1561 * so it's a power of 2), it is no longer needed.
1563 * The QUEUE() macro finds the queue for the given node. This macro has
1564 * been optimized by taking advantage of 'struct lzx_lru_queue' and
1565 * 'struct lzx_optimum_node' both being 8 bytes in size and alignment.
1567 struct lzx_lru_queue queues[512];
1568 STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1569 STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1570 #define QUEUE(node) \
1571 (*(struct lzx_lru_queue *)((char *)queues + \
1572 ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0])))))
1573 /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/
1575 #if CONSIDER_GAP_MATCHES
1576 u32 matches_before_gap[ARRAY_LEN(queues)];
1577 #define MATCH_BEFORE_GAP(node) \
1578 (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1579 ARRAY_LEN(matches_before_gap)])
1583 * Initially, the cost to reach each node is "infinity".
1585 * The first node actually should have cost 0, but "infinity"
1586 * (0xFFFFFFFF) works just as well because it immediately overflows.
1588 * The following statement also intentionally sets the 'item' of the
1589 * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
1590 * use as a sentinel. See lzx_walk_item_list().
1592 memset(c->optimum_nodes, 0xFF,
1593 (block_size + 1) * sizeof(c->optimum_nodes[0]));
1595 /* Initialize the recent offsets queue for the first node. */
1596 QUEUE(cur_node) = initial_queue;
1598 do { /* For each node in the block in position order... */
1600 unsigned num_matches;
1605 * A selection of matches for the block was already saved in
1606 * memory so that we don't have to run the uncompressed data
1607 * through the matchfinder on every optimization pass. However,
1608 * we still search for repeat offset matches during each
1609 * optimization pass because we cannot predict the state of the
1610 * recent offsets queue. But as a heuristic, we don't bother
1611 * searching for repeat offset matches if the general-purpose
1612 * matchfinder failed to find any matches.
1614 * Note that a match of length n at some offset implies there is
1615 * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1616 * that same offset. In other words, we don't necessarily need
1617 * to use the full length of a match. The key heuristic that
1618 * saves a significant amount of time is that for each
1619 * distinct length, we only consider the smallest offset for
1620 * which that length is available. This heuristic also applies
1621 * to repeat offsets, which we order specially: R0 < R1 < R2 <
1622 * any explicit offset. Of course, this heuristic may produce
1623 * suboptimal results because offset slots in LZX are subject to
1624 * entropy encoding, but in practice this is a useful approximation.
1628 num_matches = cache_ptr->length;
1632 struct lz_match *end_matches = cache_ptr + num_matches;
1633 unsigned next_len = LZX_MIN_MATCH_LEN;
1634 unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1637 /* Consider rep0 matches. */
1638 matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1639 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1641 STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1643 u32 cost = cur_node->cost +
1644 c->costs.match_cost[0][
1645 next_len - LZX_MIN_MATCH_LEN];
1646 if (cost <= (cur_node + next_len)->cost) {
1647 (cur_node + next_len)->cost = cost;
1648 (cur_node + next_len)->item =
1649 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1651 if (unlikely(++next_len > max_len)) {
1652 cache_ptr = end_matches;
1655 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1659 /* Consider rep1 matches. */
1660 matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1661 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1663 if (matchptr[next_len - 1] != in_next[next_len - 1])
1665 for (unsigned len = 2; len < next_len - 1; len++)
1666 if (matchptr[len] != in_next[len])
1669 u32 cost = cur_node->cost +
1670 c->costs.match_cost[1][
1671 next_len - LZX_MIN_MATCH_LEN];
1672 if (cost <= (cur_node + next_len)->cost) {
1673 (cur_node + next_len)->cost = cost;
1674 (cur_node + next_len)->item =
1675 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1677 if (unlikely(++next_len > max_len)) {
1678 cache_ptr = end_matches;
1681 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1685 /* Consider rep2 matches. */
1686 matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1687 if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1689 if (matchptr[next_len - 1] != in_next[next_len - 1])
1691 for (unsigned len = 2; len < next_len - 1; len++)
1692 if (matchptr[len] != in_next[len])
1695 u32 cost = cur_node->cost +
1696 c->costs.match_cost[2][
1697 next_len - LZX_MIN_MATCH_LEN];
1698 if (cost <= (cur_node + next_len)->cost) {
1699 (cur_node + next_len)->cost = cost;
1700 (cur_node + next_len)->item =
1701 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1703 if (unlikely(++next_len > max_len)) {
1704 cache_ptr = end_matches;
1707 } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1711 while (next_len > cache_ptr->length)
1712 if (++cache_ptr == end_matches)
1715 /* Consider explicit offset matches. */
1717 u32 offset = cache_ptr->offset;
1718 u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
1719 unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1720 u32 base_cost = cur_node->cost;
1723 #if CONSIDER_ALIGNED_COSTS
1724 if (offset >= LZX_MIN_ALIGNED_OFFSET)
1725 base_cost += c->costs.aligned[adjusted_offset &
1726 LZX_ALIGNED_OFFSET_BITMASK];
1730 c->costs.match_cost[offset_slot][
1731 next_len - LZX_MIN_MATCH_LEN];
1732 if (cost < (cur_node + next_len)->cost) {
1733 (cur_node + next_len)->cost = cost;
1734 (cur_node + next_len)->item =
1735 (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len;
1737 } while (++next_len <= cache_ptr->length);
1739 if (++cache_ptr == end_matches) {
1740 #if CONSIDER_GAP_MATCHES
1741 /* Also consider the longest explicit
1742 * offset match as a "gap match": match
1744 s32 remaining = (block_end - in_next) - (s32)next_len;
1745 if (likely(remaining >= 2)) {
1746 const u8 *strptr = in_next + next_len;
1747 const u8 *matchptr = strptr - offset;
1748 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1749 STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1750 STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1751 unsigned limit = min(remaining,
1752 min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1753 LZX_MAX_MATCH_LEN));
1754 unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit);
1755 u8 lit = strptr[-1];
1756 cost += c->costs.main[lit] +
1757 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1758 unsigned total_len = next_len + rep0_len;
1759 if (cost < (cur_node + total_len)->cost) {
1760 (cur_node + total_len)->cost = cost;
1761 (cur_node + total_len)->item =
1763 ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1765 MATCH_BEFORE_GAP(cur_node + total_len) =
1766 (adjusted_offset << OPTIMUM_OFFSET_SHIFT) |
1771 #endif /* CONSIDER_GAP_MATCHES */
1779 /* Consider coding a literal.
1781 * To avoid an extra branch, actually checking the preferability
1782 * of coding the literal is integrated into the queue update
1784 literal = *in_next++;
1785 cost = cur_node->cost + c->costs.main[literal];
1787 /* Advance to the next position. */
1790 /* The lowest-cost path to the current position is now known.
1791 * Finalize the recent offsets queue that results from taking
1792 * this lowest-cost path. */
1794 if (cost <= cur_node->cost) {
1795 /* Literal: queue remains unchanged. */
1796 cur_node->cost = cost;
1797 cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1798 QUEUE(cur_node) = QUEUE(cur_node - 1);
1800 /* Match: queue update is needed. */
1801 unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1802 #if CONSIDER_GAP_MATCHES
1803 s32 adjusted_offset = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1804 STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1806 u32 adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1809 if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) {
1810 /* Explicit offset match: insert offset at front. */
1812 lzx_lru_queue_push(QUEUE(cur_node - len),
1813 adjusted_offset - LZX_OFFSET_ADJUSTMENT);
1815 #if CONSIDER_GAP_MATCHES
1816 else if (adjusted_offset < 0) {
1817 /* "Gap match": Explicit offset match, then a
1818 * literal, then rep0 match. Save the explicit
1819 * offset match information in the cost field of
1820 * the previous node, which isn't needed
1821 * anymore. Then insert the offset at the front
1823 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1824 (cur_node - 1)->cost = match_before_gap;
1826 lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1827 (match_before_gap & OPTIMUM_LEN_MASK)),
1828 (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1829 LZX_OFFSET_ADJUSTMENT);
1833 /* Repeat offset match: swap offset to front. */
1835 lzx_lru_queue_swap(QUEUE(cur_node - len),
1839 } while (cur_node != end_node);
1841 /* Return the recent offsets queue at the end of the path. */
1842 return QUEUE(cur_node);
1846 * Given the costs for the main and length codewords (c->costs.main and
1847 * c->costs.len), initialize the match cost array (c->costs.match_cost) which
1848 * directly provides the cost of every possible (length, offset slot) pair.
1851 lzx_compute_match_costs(struct lzx_compressor *c)
1853 unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1854 LZX_NUM_LEN_HEADERS;
1855 struct lzx_costs *costs = &c->costs;
1856 unsigned main_symbol = LZX_NUM_CHARS;
1858 for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1861 u32 extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST;
1864 #if CONSIDER_ALIGNED_COSTS
1865 if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT)
1866 extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
1869 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1870 costs->match_cost[offset_slot][i] =
1871 costs->main[main_symbol++] + extra_cost;
1874 extra_cost += costs->main[main_symbol++];
1876 for (; i < LZX_NUM_LENS; i++) {
1877 costs->match_cost[offset_slot][i] =
1878 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1885 * Fast approximation for log2f(x). This is not as accurate as the standard C
1886 * version. It does not need to be perfectly accurate because it is only used
1887 * for estimating symbol costs, which is very approximate anyway.
1897 /* Extract the exponent and subtract 127 to remove the bias. This gives
1898 * the integer part of the result. */
1899 float res = ((u.i >> 23) & 0xFF) - 127;
1901 /* Set the exponent to 0 (plus bias of 127). This transforms the number
1902 * to the range [1, 2) while retaining the same mantissa. */
1903 u.i = (u.i & ~(0xFF << 23)) | (127 << 23);
1906 * Approximate the log2 of the transformed number using a degree 2
1907 * interpolating polynomial for log2(x) over the interval [1, 2). Then
1908 * add this to the extracted exponent to produce the final approximation
1911 * The coefficients of the interpolating polynomial used here were found
1912 * using the script tools/log2_interpolation.r.
1914 return res - 1.653124006f + u.f * (1.9941812f - u.f * 0.3347490189f);
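/*
 * A worked example (illustrative only): for log2f_fast(12.0f), the biased
 * exponent field is 130, so 'res' starts at 3.  The mantissa step remaps the
 * value to 1.5, and the polynomial evaluates to roughly 0.585, for a total of
 * about 3.585, close to the exact log2(12) = 3.5849.
 */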
1919 * Return the estimated cost of a symbol which has been estimated to have the
1920 * given probability.
1923 lzx_cost_for_probability(float prob)
1926 * The basic formula is:
1928 * entropy = -log2(probability)
1930 * Use this to get the cost in fractional bits. Then multiply by our
1931 * scaling factor of BIT_COST and convert to an integer.
1933 * In addition, the minimum cost is BIT_COST (one bit) because the
1934 * entropy coding method will be Huffman codes.
1936 * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may
1937 * be positive due to inaccuracy in our log2 approximation. Therefore,
1938 * we cannot, in general, assume the computed cost is non-negative, and
1939 * we should make sure negative costs get rounded up correctly.
1941 s32 cost = -log2f_fast(prob) * BIT_COST;
1942 return max(cost, BIT_COST);
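/*
 * Worked example (illustrative only): with BIT_COST == 64, a symbol estimated
 * to occur with probability 0.25 costs about -log2(0.25) * 64 = 128, i.e. two
 * bits, whereas lzx_cost_for_probability(0.99f) is clamped to the one-bit
 * minimum of BIT_COST.
 */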
1946 * Mapping: number of used literals => heuristic probability of a literal times
1947 * 6870. Generated by running this R command:
1949 * cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
1951 static const u8 literal_scaled_probs[257] = {
1952 255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
1953 219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
1954 189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
1955 162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
1956 139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
1957 120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
1958 103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
1959 86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
1960 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
1961 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
1962 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
1963 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
1964 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
1965 27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
1966 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
1967 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
1971 * Mapping: length symbol => default cost of that symbol. This is derived from
1972 * sample data but has been slightly edited to add more bias towards the
1973 * shortest lengths, which are the most common.
1975 static const u16 lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
1976 300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
1977 503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
1978 659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
1979 666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
1980 706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
1981 728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
1982 738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
1983 747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
1984 753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
1985 754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
1986 758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
1987 750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
1988 757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
1989 762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
1990 763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
1991 762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
1992 764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
1993 762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
1996 /* Set default costs to bootstrap the iterative optimization algorithm. */
1998 lzx_set_default_costs(struct lzx_compressor *c)
2001 u32 num_literals = 0;
2002 u32 num_used_literals = 0;
2003 float inv_num_matches = 1.0f / c->freqs.main[LZX_NUM_CHARS];
2004 float inv_num_items;
2005 float prob_match = 1.0f;
2007 float base_literal_prob;
2009 /* Some numbers here have been hardcoded to assume a bit cost of 64. */
2010 STATIC_ASSERT(BIT_COST == 64);
2012 /* Estimate the number of literals that will be used. 'num_literals' is
2013 * the total number, whereas 'num_used_literals' is the number of
2014 * distinct symbols. */
2015 for (i = 0; i < LZX_NUM_CHARS; i++) {
2016 num_literals += c->freqs.main[i];
2017 num_used_literals += (c->freqs.main[i] != 0);
2020 /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'. We
2021 * don't attempt to estimate which ones will be used. */
2023 inv_num_items = 1.0f / (num_literals + c->freqs.main[LZX_NUM_CHARS]);
2024 base_literal_prob = literal_scaled_probs[num_used_literals] *
2027 /* Literal costs. We use two different methods to compute the
2028 * probability of each literal and mix together their results. */
2029 for (i = 0; i < LZX_NUM_CHARS; i++) {
2030 u32 freq = c->freqs.main[i];
2032 float prob = 0.5f * ((freq * inv_num_items) +
2034 c->costs.main[i] = lzx_cost_for_probability(prob);
2037 c->costs.main[i] = 11 * BIT_COST;
2041 /* Match header costs. We just assume that all match headers are
2042 * equally probable, but we do take into account the relative cost of a
2043 * match header vs. a literal depending on how common matches are
2044 * expected to be vs. literals. */
2045 prob_match = max(prob_match, 0.15f);
2046 match_cost = lzx_cost_for_probability(prob_match / (c->num_main_syms -
2048 for (; i < c->num_main_syms; i++)
2049 c->costs.main[i] = match_cost;
2051 /* Length symbol costs. These are just set to fixed values which
2052 * reflect the fact that the smallest lengths are typically the most common,
2053 * and therefore are typically the cheapest. */
2054 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2055 c->costs.len[i] = lzx_default_len_costs[i];
2057 #if CONSIDER_ALIGNED_COSTS
2058 /* Aligned offset symbol costs. These are derived from the estimated
2059 * probability of each aligned offset symbol. */
2060 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2061 /* We intentionally tallied the frequencies in the wrong slots,
2062 * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the
2063 * fixup here is faster: a constant 8 subtractions here vs. one
2064 * addition for every match. */
2065 unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK;
2066 if (c->freqs.aligned[j] != 0) {
2067 float prob = c->freqs.aligned[j] * inv_num_matches;
2068 c->costs.aligned[i] = lzx_cost_for_probability(prob);
2070 c->costs.aligned[i] =
2071 (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST;
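/*
 * Worked example of the index remapping above (illustrative; assumes
 * LZX_OFFSET_ADJUSTMENT == 2 and an 8-symbol aligned offset code): because
 * the matchfinding loop tallied 'offset & LZX_ALIGNED_OFFSET_BITMASK' rather
 * than '(offset + LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK', the
 * frequency for aligned symbol i == 5 is found in slot j == (5 - 2) & 7 == 3.
 * Doing these 8 remapped reads here is cheaper than adjusting every tallied
 * match.
 */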
2077 /* Update the current cost model to reflect the computed Huffman codes. */
2079 lzx_set_costs_from_codes(struct lzx_compressor *c)
2082 const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
2084 for (i = 0; i < c->num_main_syms; i++) {
2085 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
2086 MAIN_CODEWORD_LIMIT) * BIT_COST;
2089 for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
2090 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
2091 LENGTH_CODEWORD_LIMIT) * BIT_COST;
2094 #if CONSIDER_ALIGNED_COSTS
2095 for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2096 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
2097 ALIGNED_CODEWORD_LIMIT) * BIT_COST;
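/*
 * Illustrative example: after the Huffman codes for a pass have been built, a
 * main symbol that received a 5-bit codeword now costs 5 * BIT_COST == 320,
 * while a symbol that was not used at all (codeword length 0) is charged as if
 * it had a codeword of the maximum permitted length (e.g. MAIN_CODEWORD_LIMIT
 * bits), so unused symbols are strongly, but not infinitely, discouraged on
 * the next pass.
 */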
2103 * Choose a "near-optimal" literal/match sequence to use for the current block,
2104 * then flush the block. Because the cost of each Huffman symbol is unknown
2105 * until the Huffman codes have been built and the Huffman codes themselves
2106 * depend on the symbol frequencies, this uses an iterative optimization
2107 * algorithm to approximate an optimal solution. The first optimization pass
2108 * for the block uses default costs; additional passes use costs derived from
2109 * the Huffman codes computed in the previous pass.
2111 static forceinline struct lzx_lru_queue
2112 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
2113 struct lzx_output_bitstream * const restrict os,
2114 const u8 * const restrict block_begin,
2115 const u32 block_size,
2116 const struct lzx_lru_queue initial_queue,
2119 unsigned num_passes_remaining = c->num_optim_passes;
2120 struct lzx_lru_queue new_queue;
2123 lzx_set_default_costs(c);
2126 lzx_compute_match_costs(c);
2127 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
2128 initial_queue, is_16_bit);
2130 if (--num_passes_remaining == 0)
2133 /* At least one optimization pass remains. Update the costs. */
2134 lzx_reset_symbol_frequencies(c);
2135 lzx_tally_item_list(c, block_size, is_16_bit);
2136 lzx_build_huffman_codes(c);
2137 lzx_set_costs_from_codes(c);
2140 /* Done optimizing. Generate the sequence list and flush the block. */
2141 lzx_reset_symbol_frequencies(c);
2142 seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
2143 lzx_flush_block(c, os, block_begin, block_size, seq_idx);
2148 * This is the "near-optimal" LZX compressor.
2150 * For each block, it performs a relatively thorough graph search to find an
2151 * inexpensive (in terms of compressed size) way to output the block.
2153 * Note: there are actually many things this algorithm leaves on the table in
2154 * terms of compression ratio. So although it may be "near-optimal", it is
2155 * certainly not "optimal". The goal is not to produce the optimal compression
2156 * ratio, which for LZX is probably impossible within any practical amount of
2157 * time, but rather to produce a compression ratio significantly better than a
2158 * simpler "greedy" or "lazy" parse while still being relatively fast.
2160 static forceinline void
2161 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2162 const u8 * const restrict in_begin, size_t in_nbytes,
2163 struct lzx_output_bitstream * restrict os,
2166 const u8 * in_next = in_begin;
2167 const u8 * const in_end = in_begin + in_nbytes;
2168 u32 max_len = LZX_MAX_MATCH_LEN;
2169 u32 nice_len = min(c->nice_match_length, max_len);
2170 u32 next_hashes[2] = {0, 0};
2171 struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2173 /* Initialize the matchfinder. */
2174 CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2177 /* Starting a new block */
2179 const u8 * const in_block_begin = in_next;
2180 const u8 * const in_max_block_end =
2181 in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2182 struct lz_match *cache_ptr = c->match_cache;
2183 const u8 *next_search_pos = in_next;
2184 const u8 *next_observation = in_next;
2185 const u8 *next_pause_point =
2186 min(in_next + min(MIN_BLOCK_SIZE,
2187 in_max_block_end - in_next),
2188 in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2189 in_max_block_end - in_next));
2191 lzx_init_block_split_stats(&c->split_stats);
2192 lzx_reset_symbol_frequencies(c);
2194 if (in_next >= next_pause_point)
2198 * Run the input buffer through the matchfinder, caching the
2199 * matches, until we decide to end the block.
2201 * For a tighter matchfinding loop, we compute a "pause point",
2202 * which is the next position at which we may need to check
2203 * whether to end the block or to decrease max_len. We then
2204 * only do these extra checks upon reaching the pause point.
2206 resume_matchfinding:
2208 if (in_next >= next_search_pos) {
2209 /* Search for matches at this position. */
2210 struct lz_match *lz_matchptr;
2213 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2214 bt_matchfinder_get_matches,
2219 c->max_search_depth,
2223 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2224 cache_ptr = lz_matchptr;
2226 /* Accumulate literal/match statistics for block
2227 * splitting and for generating the initial cost
2229 if (in_next >= next_observation) {
2230 best_len = cache_ptr[-1].length;
2231 if (best_len >= 3) {
2232 /* Match (len >= 3) */
2235 * Note: for performance reasons this has
2236 * been simplified significantly:
2238 * - We wait until later to account for
2239 * LZX_OFFSET_ADJUSTMENT.
2240 * - We don't account for repeat offsets.
2241 * - We don't account for different match headers.
2243 c->freqs.aligned[cache_ptr[-1].offset &
2244 LZX_ALIGNED_OFFSET_BITMASK]++;
2245 c->freqs.main[LZX_NUM_CHARS]++;
2247 lzx_observe_match(&c->split_stats, best_len);
2248 next_observation = in_next + best_len;
2251 c->freqs.main[*in_next]++;
2252 lzx_observe_literal(&c->split_stats, *in_next);
2253 next_observation = in_next + 1;
2258 * If there was a very long match found, then
2259 * don't cache any matches for the bytes covered
2260 * by that match. This avoids degenerate
2261 * behavior when compressing highly redundant
2262 * data, where the number of matches can be very
2265 * This heuristic doesn't actually hurt the
2266 * compression ratio *too* much. If there's a
2267 * long match, then the data must be highly
2268 * compressible, so it doesn't matter as much
2271 if (best_len >= nice_len)
2272 next_search_pos = in_next + best_len;
2274 /* Don't search for matches at this position. */
2275 CALL_BT_MF(is_16_bit, c,
2276 bt_matchfinder_skip_position,
2280 c->max_search_depth,
2282 cache_ptr->length = 0;
2285 } while (++in_next < next_pause_point &&
2286 likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2290 /* Adjust max_len and nice_len if we're nearing the end of the
2291 * input buffer. In addition, if we are so close to the end of
2292 * the input buffer that there cannot be any more matches, then
2293 * just advance through the last few positions and record no
2295 if (unlikely(max_len > in_end - in_next)) {
2296 max_len = in_end - in_next;
2297 nice_len = min(max_len, nice_len);
2298 if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2299 while (in_next != in_end) {
2300 cache_ptr->length = 0;
2307 /* End the block if the match cache may overflow. */
2308 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2311 /* End the block if the soft maximum size has been reached. */
2312 if (in_next >= in_max_block_end)
2315 /* End the block if the block splitting algorithm thinks this is
2316 * a good place to do so. */
2317 if (c->split_stats.num_new_observations >=
2318 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2319 in_max_block_end - in_next >= MIN_BLOCK_SIZE &&
2320 lzx_should_end_block(&c->split_stats))
2323 /* It's not time to end the block yet. Compute the next pause
2324 * point and resume matchfinding. */
2326 min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2327 c->split_stats.num_new_observations,
2328 in_max_block_end - in_next),
2329 in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2330 in_max_block_end - in_next));
2331 goto resume_matchfinding;
2334 /* We've decided on a block boundary and cached matches. Now
2335 * choose a match/literal sequence and flush the block. */
2336 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2337 in_next - in_block_begin,
2339 } while (in_next != in_end);
2343 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2344 size_t in_nbytes, struct lzx_output_bitstream *os)
2346 lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2350 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2351 size_t in_nbytes, struct lzx_output_bitstream *os)
2353 lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2356 /******************************************************************************/
2357 /* Faster ("lazy") compression algorithm */
2358 /*----------------------------------------------------------------------------*/
2361 * Called when the compressor chooses to use a literal. This tallies the
2362 * Huffman symbol for the literal, increments the current literal run length,
2363 * and "observes" the literal for the block split statistics.
2365 static forceinline void
2366 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2368 lzx_observe_literal(&c->split_stats, literal);
2369 c->freqs.main[literal]++;
2374 * Called when the compressor chooses to use a match. This tallies the Huffman
2375 * symbol(s) for a match, saves the match data and the length of the preceding
2376 * literal run, updates the recent offsets queue, and "observes" the match for
2377 * the block split statistics.
2379 static forceinline void
2380 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
2381 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2382 u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2384 struct lzx_sequence *next_seq = *next_seq_p;
2387 lzx_observe_match(&c->split_stats, length);
2389 mainsym = lzx_tally_main_and_lensyms(c, length, adjusted_offset,
2391 next_seq->litrunlen_and_matchlen =
2392 (*litrunlen_p << SEQ_MATCHLEN_BITS) | length;
2393 next_seq->adjusted_offset_and_mainsym =
2394 (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
2396 /* Update the recent offsets queue. */
2397 if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
2398 /* Repeat offset match. */
2399 swap(recent_offsets[0], recent_offsets[adjusted_offset]);
2401 /* Explicit offset match. */
2403 /* Tally the aligned offset symbol if needed. */
2404 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
2405 c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
2407 recent_offsets[2] = recent_offsets[1];
2408 recent_offsets[1] = recent_offsets[0];
2409 recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
2412 /* Reset the literal run length and advance to the next sequence. */
2413 *next_seq_p = next_seq + 1;
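/*
 * Worked example of the queue update above (illustrative only): starting from
 * recent_offsets = {100, 8, 1}, an explicit offset match at offset 40 shifts
 * the queue to {40, 100, 8}, whereas a rep1 match (adjusted_offset == 1) only
 * swaps the first two entries, giving {8, 100, 1}.
 */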
2418 * Called when the compressor ends a block. This finishes the last lzx_sequence,
2419 * which is just a literal run with no following match. This literal run might
2422 static forceinline void
2423 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2425 last_seq->litrunlen_and_matchlen = litrunlen << SEQ_MATCHLEN_BITS;
2429 * Find the longest repeat offset match with the current position. If a match
2430 * is found, return its length and set *best_rep_idx_ret to the index of its
2431 * offset in @recent_offsets. Otherwise, return 0.
2433 * Don't bother with length 2 matches; consider matches of length >= 3 only.
2434 * Also assume that max_len >= 3.
2437 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2438 const u32 recent_offsets[],
2439 const unsigned max_len,
2440 unsigned *best_rep_idx_ret)
2442 STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2444 const u32 seq3 = load_u24_unaligned(in_next);
2446 unsigned best_rep_len = 0;
2447 unsigned best_rep_idx = 0;
2450 /* Check for rep0 match (most recent offset) */
2451 matchptr = in_next - recent_offsets[0];
2452 if (load_u24_unaligned(matchptr) == seq3)
2453 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2455 /* Check for rep1 match (second most recent offset) */
2456 matchptr = in_next - recent_offsets[1];
2457 if (load_u24_unaligned(matchptr) == seq3) {
2458 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2459 if (rep_len > best_rep_len) {
2460 best_rep_len = rep_len;
2465 /* Check for rep2 match (third most recent offset) */
2466 matchptr = in_next - recent_offsets[2];
2467 if (load_u24_unaligned(matchptr) == seq3) {
2468 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2469 if (rep_len > best_rep_len) {
2470 best_rep_len = rep_len;
2475 *best_rep_idx_ret = best_rep_idx;
2476 return best_rep_len;
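/*
 * Illustrative usage (a sketch mirroring the call sites later in this file,
 * not an additional code path): the lazy parser asks for the best repeat
 * offset match and then compares its heuristic score against the score of the
 * best explicit offset match:
 *
 *	unsigned rep_idx;
 *	unsigned rep_len = lzx_find_longest_repeat_offset_match(in_next,
 *								 recent_offsets,
 *								 max_len,
 *								 &rep_idx);
 *	if (rep_len != 0 &&
 *	    lzx_repeat_offset_match_score(rep_len, rep_idx) >= cur_score)
 *		... choose the repeat offset match ...
 */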
2480 * Fast heuristic scoring for lazy parsing: how "good" is this match?
2481 * This is mainly determined by the length: longer matches are better.
2482 * However, we also give a bonus to close (small offset) matches and to repeat
2483 * offset matches, since those require fewer bits to encode.
2486 static forceinline unsigned
2487 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2489 unsigned score = len;
2491 if (adjusted_offset < 4096)
2493 if (adjusted_offset < 256)
2499 static forceinline unsigned
2500 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2506 * This is the "lazy" LZX compressor. The basic idea is that before it chooses
2507 * a match, it checks to see if there's a longer match at the next position. If
2508 * yes, it chooses a literal and continues to the next position. If no, it
2509 * chooses the match.
2511 * Some additional heuristics are used as well. Repeat offset matches are
2512 * considered favorably and sometimes are chosen immediately. In addition, long
2513 * matches (at least "nice_len" bytes) are chosen immediately as well. Finally,
2514 * when we decide whether a match is "better" than another, we take the offset
2515 * into consideration as well as the length.
2517 static forceinline void
2518 lzx_compress_lazy(struct lzx_compressor * restrict c,
2519 const u8 * const restrict in_begin, size_t in_nbytes,
2520 struct lzx_output_bitstream * restrict os, bool is_16_bit)
2522 const u8 * in_next = in_begin;
2523 const u8 * const in_end = in_begin + in_nbytes;
2524 unsigned max_len = LZX_MAX_MATCH_LEN;
2525 unsigned nice_len = min(c->nice_match_length, max_len);
2526 STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2527 u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
2528 u32 next_hashes[2] = {0, 0};
2530 /* Initialize the matchfinder. */
2531 CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2534 /* Starting a new block */
2536 const u8 * const in_block_begin = in_next;
2537 const u8 * const in_max_block_end =
2538 in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2539 struct lzx_sequence *next_seq = c->chosen_sequences;
2543 u32 cur_adjusted_offset;
2547 u32 next_adjusted_offset;
2548 unsigned next_score;
2549 unsigned best_rep_len;
2550 unsigned best_rep_idx;
2554 lzx_reset_symbol_frequencies(c);
2555 lzx_init_block_split_stats(&c->split_stats);
2558 /* Adjust max_len and nice_len if we're nearing the end
2559 * of the input buffer. */
2560 if (unlikely(max_len > in_end - in_next)) {
2561 max_len = in_end - in_next;
2562 nice_len = min(max_len, nice_len);
2565 /* Find the longest match (subject to the
2566 * max_search_depth cutoff parameter) with the current
2567 * position. Don't bother with length 2 matches; only
2568 * look for matches of length >= 3. */
2569 cur_len = CALL_HC_MF(is_16_bit, c,
2570 hc_matchfinder_longest_match,
2576 c->max_search_depth,
2580 /* If there was no match found, or the only match found
2581 * was a distant short match, then choose a literal. */
2584 cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2585 cur_offset != recent_offsets[0] &&
2586 cur_offset != recent_offsets[1] &&
2587 cur_offset != recent_offsets[2]))
2589 lzx_choose_literal(c, *in_next, &litrunlen);
2594 /* Heuristic: if this match has the most recent offset,
2595 * then go ahead and choose it as a rep0 match. */
2596 if (cur_offset == recent_offsets[0]) {
2598 skip_len = cur_len - 1;
2599 cur_adjusted_offset = 0;
2600 goto choose_cur_match;
2603 /* Compute the longest match's score as an explicit
2605 cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT;
2606 cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset);
2608 /* Find the longest repeat offset match at this
2609 * position. If we find one and it's "better" than the
2610 * explicit offset match we found, then go ahead and
2611 * choose the repeat offset match immediately. */
2612 best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2618 if (best_rep_len != 0 &&
2619 (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2620 best_rep_idx)) >= cur_score)
2622 cur_len = best_rep_len;
2623 cur_adjusted_offset = best_rep_idx;
2624 skip_len = best_rep_len - 1;
2625 goto choose_cur_match;
2630 * We have a match at the current position. If the
2631 * match is very long, then choose it immediately.
2632 * Otherwise, see if there's a better match at the next
2636 if (cur_len >= nice_len) {
2637 skip_len = cur_len - 1;
2638 goto choose_cur_match;
2641 if (unlikely(max_len > in_end - in_next)) {
2642 max_len = in_end - in_next;
2643 nice_len = min(max_len, nice_len);
2646 next_len = CALL_HC_MF(is_16_bit, c,
2647 hc_matchfinder_longest_match,
2653 c->max_search_depth / 2,
2657 if (next_len <= cur_len - 2) {
2658 /* No potentially better match was found. */
2660 skip_len = cur_len - 2;
2661 goto choose_cur_match;
2664 next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT;
2665 next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset);
2667 best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2673 if (best_rep_len != 0 &&
2674 (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2675 best_rep_idx)) >= next_score)
2678 if (rep_score > cur_score) {
2679 /* The next match is better, and it's a
2680 * repeat offset match. */
2681 lzx_choose_literal(c, *(in_next - 2),
2683 cur_len = best_rep_len;
2684 cur_adjusted_offset = best_rep_idx;
2685 skip_len = cur_len - 1;
2686 goto choose_cur_match;
2689 if (next_score > cur_score) {
2690 /* The next match is better, and it's an
2691 * explicit offset match. */
2692 lzx_choose_literal(c, *(in_next - 2),
2695 cur_adjusted_offset = next_adjusted_offset;
2696 cur_score = next_score;
2697 goto have_cur_match;
2701 /* The original match was better; choose it. */
2702 skip_len = cur_len - 2;
2705 /* Choose a match and have the matchfinder skip over its
2706 * remaining bytes. */
2707 lzx_choose_match(c, cur_len, cur_adjusted_offset,
2708 recent_offsets, is_16_bit,
2709 &litrunlen, &next_seq);
2710 CALL_HC_MF(is_16_bit, c,
2711 hc_matchfinder_skip_bytes,
2717 in_next += skip_len;
2719 /* Keep going until it's time to end the block. */
2720 } while (in_next < in_max_block_end &&
2721 !(c->split_stats.num_new_observations >=
2722 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2723 in_next - in_block_begin >= MIN_BLOCK_SIZE &&
2724 in_end - in_next >= MIN_BLOCK_SIZE &&
2725 lzx_should_end_block(&c->split_stats)));
2727 /* Flush the block. */
2728 lzx_finish_sequence(next_seq, litrunlen);
2729 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2731 /* Keep going until we've reached the end of the input buffer. */
2732 } while (in_next != in_end);
2736 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2737 struct lzx_output_bitstream *os)
2739 lzx_compress_lazy(c, in, in_nbytes, os, true);
2743 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2744 struct lzx_output_bitstream *os)
2746 lzx_compress_lazy(c, in, in_nbytes, os, false);
2749 /******************************************************************************/
2750 /* Compressor operations */
2751 /*----------------------------------------------------------------------------*/
2754 * Generate tables for mapping match offsets (actually, "adjusted" match
2755 * offsets) to offset slots.
2758 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2760 u32 adjusted_offset = 0;
2764 for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2767 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2768 LZX_OFFSET_ADJUSTMENT)
2770 c->offset_slot_tab_1[adjusted_offset] = slot;
2773 /* slots [30, 49] */
2774 for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2775 adjusted_offset += (u32)1 << 14)
2777 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2778 LZX_OFFSET_ADJUSTMENT)
2780 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
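/*
 * A minimal sketch of how these two tables are consulted (the real lookup
 * helper, lzx_get_offset_slot(), is defined elsewhere in this file and also
 * handles the 16-bit case, so details may differ):
 *
 *	if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 *		slot = c->offset_slot_tab_1[adjusted_offset];
 *	else
 *		slot = c->offset_slot_tab_2[adjusted_offset >> 14];
 *
 * Small adjusted offsets resolve with a direct table lookup; larger ones fall
 * back to a much smaller table indexed by the high bits of the offset.
 */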
2785 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2787 if (compression_level <= MAX_FAST_LEVEL) {
2788 if (lzx_is_16_bit(max_bufsize))
2789 return offsetof(struct lzx_compressor, hc_mf_16) +
2790 hc_matchfinder_size_16(max_bufsize);
2792 return offsetof(struct lzx_compressor, hc_mf_32) +
2793 hc_matchfinder_size_32(max_bufsize);
2795 if (lzx_is_16_bit(max_bufsize))
2796 return offsetof(struct lzx_compressor, bt_mf_16) +
2797 bt_matchfinder_size_16(max_bufsize);
2799 return offsetof(struct lzx_compressor, bt_mf_32) +
2800 bt_matchfinder_size_32(max_bufsize);
2804 /* Compute the amount of memory needed to allocate an LZX compressor. */
2806 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2811 if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2814 size += lzx_get_compressor_size(max_bufsize, compression_level);
2816 size += max_bufsize; /* account for in_buffer */
2820 /* Allocate an LZX compressor. */
2822 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2823 bool destructive, void **c_ret)
2825 unsigned window_order;
2826 struct lzx_compressor *c;
2828 /* Validate the maximum buffer size and get the window order from it. */
2829 window_order = lzx_get_window_order(max_bufsize);
2830 if (window_order == 0)
2831 return WIMLIB_ERR_INVALID_PARAM;
2833 /* Allocate the compressor. */
2834 c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2838 c->window_order = window_order;
2839 c->num_main_syms = lzx_get_num_main_syms(window_order);
2840 c->destructive = destructive;
2842 /* Allocate the buffer for preprocessed data if needed. */
2843 if (!c->destructive) {
2844 c->in_buffer = MALLOC(max_bufsize);
2849 if (compression_level <= MAX_FAST_LEVEL) {
2851 /* Fast compression: Use lazy parsing. */
2852 if (lzx_is_16_bit(max_bufsize))
2853 c->impl = lzx_compress_lazy_16;
2855 c->impl = lzx_compress_lazy_32;
2857 /* Scale max_search_depth and nice_match_length with the
2858 * compression level. */
2859 c->max_search_depth = (60 * compression_level) / 20;
2860 c->nice_match_length = (80 * compression_level) / 20;
2862 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2863 * halves the max_search_depth when attempting a lazy match, and
2864 * max_search_depth must be at least 1. */
2865 c->max_search_depth = max(c->max_search_depth, 2);
2868 /* Normal / high compression: Use near-optimal parsing. */
2869 if (lzx_is_16_bit(max_bufsize))
2870 c->impl = lzx_compress_near_optimal_16;
2872 c->impl = lzx_compress_near_optimal_32;
2874 /* Scale max_search_depth and nice_match_length with the
2875 * compression level. */
2876 c->max_search_depth = (24 * compression_level) / 50;
2877 c->nice_match_length = (48 * compression_level) / 50;
2879 /* Also scale num_optim_passes with the compression level. But
2880 * the more passes there are, the less they help --- so don't
2881 * add them linearly. */
2882 c->num_optim_passes = 1;
2883 c->num_optim_passes += (compression_level >= 45);
2884 c->num_optim_passes += (compression_level >= 70);
2885 c->num_optim_passes += (compression_level >= 100);
2886 c->num_optim_passes += (compression_level >= 150);
2887 c->num_optim_passes += (compression_level >= 200);
2888 c->num_optim_passes += (compression_level >= 300);
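/* For example (illustrative only): compression level 70 ends up with 3
 * optimization passes, level 150 with 5, and level 300 with 7, so raising the
 * level adds passes only gradually. */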
2890 /* max_search_depth must be at least 1. */
2891 c->max_search_depth = max(c->max_search_depth, 1);
2894 /* Prepare the offset => offset slot mapping. */
2895 lzx_init_offset_slot_tabs(c);
2903 return WIMLIB_ERR_NOMEM;
2906 /* Compress a buffer of data. */
2908 lzx_compress(const void *restrict in, size_t in_nbytes,
2909 void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2911 struct lzx_compressor *c = _c;
2912 struct lzx_output_bitstream os;
2915 /* Don't bother trying to compress very small inputs. */
2919 /* If the compressor is in "destructive" mode, then we can directly
2920 * preprocess the input data. Otherwise, we need to copy it into an
2921 * internal buffer first. */
2922 if (!c->destructive) {
2923 memcpy(c->in_buffer, in, in_nbytes);
2927 /* Preprocess the input data. */
2928 lzx_preprocess((void *)in, in_nbytes);
2930 /* Initially, the previous Huffman codeword lengths are all zeroes. */
2932 memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2934 /* Initialize the output bitstream. */
2935 lzx_init_output(&os, out, out_nbytes_avail);
2937 /* Call the compression level-specific compress() function. */
2938 (*c->impl)(c, in, in_nbytes, &os);
2940 /* Flush the output bitstream. */
2941 result = lzx_flush_output(&os);
2943 /* If the data did not compress to less than its original size and we
2944 * preprocessed the original buffer, then postprocess it to restore it
2945 * to its original state. */
2946 if (result == 0 && c->destructive)
2947 lzx_postprocess((void *)in, in_nbytes);
2949 /* Return the number of compressed bytes, or 0 if the input did not
2950 * compress to less than its original size. */
2954 /* Free an LZX compressor. */
2956 lzx_free_compressor(void *_c)
2958 struct lzx_compressor *c = _c;
2960 if (!c->destructive)
2965 const struct compressor_ops lzx_compressor_ops = {
2966 .get_needed_memory = lzx_get_needed_memory,
2967 .create_compressor = lzx_create_compressor,
2968 .compress = lzx_compress,
2969 .free_compressor = lzx_free_compressor,