wimlib.net Git - wimlib/blob - src/lzx_compress.c

   1 /*
   2  * lzx_compress.c
   3  *
   4  * A compressor for the LZX compression format, as used in WIM archives.
   5  */
   6
   7 /*
   8  * Copyright (C) 2012-2017 Eric Biggers
   9  *
  10  * This file is free software; you can redistribute it and/or modify it under
  11  * the terms of the GNU Lesser General Public License as published by the Free
  12  * Software Foundation; either version 3 of the License, or (at your option) any
  13  * later version.
  14  *
  15  * This file is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  18  * details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this file; if not, see http://www.gnu.org/licenses/.
  22  */
  23
  24
  25 /*
  26  * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
  27  * compression format, as used in the WIM (Windows IMaging) file format.
  28  *
  29  * Two different LZX-compatible algorithms are implemented: "near-optimal" and
  30  * "lazy".  "Near-optimal" is significantly slower than "lazy", but results in a
  31  * better compression ratio.  The "near-optimal" algorithm is used at the
  32  * default compression level.
  33  *
  34  * This file may need some slight modifications to be used outside of the WIM
  35  * format.  In particular, in other situations the LZX block header might be
  36  * slightly different, and sliding window support might be required.
  37  *
  38  * LZX is a compression format derived from DEFLATE, the format used by zlib and
  39  * gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding.  Certain
  40  * details are quite similar, such as the method for storing Huffman codes.
  41  * However, the main differences are:
  42  *
  43  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  44  *   compressible before attempting to compress it further.
  45  *
  46  * - LZX uses a "main" alphabet which combines literals and matches, with the
  47  *   match symbols containing a "length header" (giving all or part of the match
  48  *   length) and an "offset slot" (giving, roughly speaking, the order of
  49  *   magnitude of the match offset).
  50  *
  51  * - LZX does not have static Huffman blocks (that is, the kind with preset
  52  *   Huffman codes); however it does have two types of dynamic Huffman blocks
  53  *   ("verbatim" and "aligned").
  54  *
  55  * - LZX has a minimum match length of 2 rather than 3.  Length 2 matches can be
  56  *   useful, but generally only if the compressor is smart about choosing them.
  57  *
  58  * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
  59  *   of match offsets.  This is very useful for certain types of files, such as
  60  *   binary files that have repeating records.
  61  */
  62
  63 /******************************************************************************/
  64 /*                            General parameters                              */
  65 /*----------------------------------------------------------------------------*/
  66
  67 /*
  68  * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL.  It
  69  * uses the slower algorithm at levels > MAX_FAST_LEVEL.
  70  */
  71 #define MAX_FAST_LEVEL                          34
  72
  73 /*
  74  * The compressor-side limits on the codeword lengths (in bits) for each Huffman
  75  * code.  To make outputting bits slightly faster, some of these limits are
  76  * lower than the limits defined by the LZX format.  This does not significantly
  77  * affect the compression ratio.
  78  */
  79 #define MAIN_CODEWORD_LIMIT                     16
  80 #define LENGTH_CODEWORD_LIMIT                   12
  81 #define ALIGNED_CODEWORD_LIMIT                  7
  82 #define PRE_CODEWORD_LIMIT                      7
  83
  84
  85 /******************************************************************************/
  86 /*                         Block splitting parameters                         */
  87 /*----------------------------------------------------------------------------*/
  88
  89 /*
  90  * The compressor always outputs blocks of at least this size in bytes, except
  91  * for the last block which may need to be smaller.
  92  */
  93 #define MIN_BLOCK_SIZE                          6500
  94
  95 /*
  96  * The compressor attempts to end a block when it reaches this size in bytes.
  97  * The final size might be slightly larger due to matches extending beyond the
  98  * end of the block.  Specifically:
  99  *
 100  *  - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
 101  *    bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'.
 102  *
 103  *  - The lazy compressor may choose a sequence of literals starting at position
 104  *    'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better
 105  *    matches.  The final match may be up to LZX_MAX_MATCH_LEN bytes.  The
 106  *    length of the literal sequence is approximately limited by the "nice match
 107  *    length" parameter.
 108  */
 109 #define SOFT_MAX_BLOCK_SIZE                     100000
 110
 111 /*
 112  * The number of observed items (matches and literals) that represents
 113  * sufficient data for the compressor to decide whether the current block should
 114  * be ended or not.
 115  */
 116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK        400
 117
 118
 119 /******************************************************************************/
 120 /*                      Parameters for slower algorithm                       */
 121 /*----------------------------------------------------------------------------*/
 122
 123 /*
 124  * The log base 2 of the number of entries in the hash table for finding length
 125  * 2 matches.  This could be as high as 16, but using a smaller hash table
 126  * speeds up compression due to reduced cache pressure.
 127  */
 128 #define BT_MATCHFINDER_HASH2_ORDER              12
 129
 130 /*
 131  * The number of lz_match structures in the match cache, excluding the extra
 132  * "overflow" entries.  This value should be high enough so that nearly the
 133  * time, all matches found in a given block can fit in the match cache.
 134  * However, fallback behavior (immediately terminating the block) on cache
 135  * overflow is still required.
 136  */
 137 #define CACHE_LENGTH                            (SOFT_MAX_BLOCK_SIZE * 5)
 138
 139 /*
 140  * An upper bound on the number of matches that can ever be saved in the match
 141  * cache for a single position.  Since each match we save for a single position
 142  * has a distinct length, we can use the number of possible match lengths in LZX
 143  * as this bound.  This bound is guaranteed to be valid in all cases, although
 144  * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be
 145  * reached.
 146  */
 147 #define MAX_MATCHES_PER_POS                     LZX_NUM_LENS
 148
 149 /*
 150  * A scaling factor that makes it possible to consider fractional bit costs.  A
 151  * single bit has a cost of BIT_COST.
 152  *
 153  * Note: this is only useful as a statistical trick for when the true costs are
 154  * unknown.  Ultimately, each token in LZX requires a whole number of bits to
 155  * output.
 156  */
 157 #define BIT_COST                                64
 158
 159 /*
 160  * Should the compressor take into account the costs of aligned offset symbols
 161  * instead of assuming that all are equally likely?
 162  */
 163 #define CONSIDER_ALIGNED_COSTS                  1
 164
 165 /*
 166  * Should the "minimum" cost path search algorithm consider "gap" matches, where
 167  * a normal match is followed by a literal, then by a match with the same
 168  * offset?  This is one specific, somewhat common situation in which the true
 169  * minimum cost path is often different from the path found by looking only one
 170  * edge ahead.
 171  */
 172 #define CONSIDER_GAP_MATCHES                    1
 173
 174 /******************************************************************************/
 175 /*                                  Includes                                  */
 176 /*----------------------------------------------------------------------------*/
 177
 178 #ifdef HAVE_CONFIG_H
 179 #  include "config.h"
 180 #endif
 181
 182 #include "wimlib/compress_common.h"
 183 #include "wimlib/compressor_ops.h"
 184 #include "wimlib/error.h"
 185 #include "wimlib/lzx_common.h"
 186 #include "wimlib/unaligned.h"
 187 #include "wimlib/util.h"
 188
 189 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including
 190  * bt_matchfinder.h. */
 191
 192 /* Matchfinders with 16-bit positions */
 193 #define mf_pos_t        u16
 194 #define MF_SUFFIX       _16
 195 #include "wimlib/bt_matchfinder.h"
 196 #include "wimlib/hc_matchfinder.h"
 197
 198 /* Matchfinders with 32-bit positions */
 199 #undef mf_pos_t
 200 #undef MF_SUFFIX
 201 #define mf_pos_t        u32
 202 #define MF_SUFFIX       _32
 203 #include "wimlib/bt_matchfinder.h"
 204 #include "wimlib/hc_matchfinder.h"
 205
 206 /******************************************************************************/
 207 /*                            Compressor structure                            */
 208 /*----------------------------------------------------------------------------*/
 209
 210 /* Codewords for the Huffman codes */
 211 struct lzx_codewords {
 212         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 213         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 214         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 215 };
 216
 217 /*
 218  * Codeword lengths, in bits, for the Huffman codes.
 219  *
 220  * A codeword length of 0 means the corresponding codeword has zero frequency.
 221  *
 222  * The main and length codes each have one extra entry for use as a sentinel.
 223  * See lzx_write_compressed_code().
 224  */
 225 struct lzx_lens {
 226         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
 227         u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
 228         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 229 };
 230
 231 /* Codewords and lengths for the Huffman codes */
 232 struct lzx_codes {
 233         struct lzx_codewords codewords;
 234         struct lzx_lens lens;
 235 };
 236
 237 /* Symbol frequency counters for the Huffman-encoded alphabets */
 238 struct lzx_freqs {
 239         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 240         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 241         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 242 };
 243
 244 /* Block split statistics.  See the "Block splitting algorithm" section later in
 245  * this file for details. */
 246 #define NUM_LITERAL_OBSERVATION_TYPES 8
 247 #define NUM_MATCH_OBSERVATION_TYPES 2
 248 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
 249                                NUM_MATCH_OBSERVATION_TYPES)
 250 struct lzx_block_split_stats {
 251         u32 new_observations[NUM_OBSERVATION_TYPES];
 252         u32 observations[NUM_OBSERVATION_TYPES];
 253         u32 num_new_observations;
 254         u32 num_observations;
 255 };
 256
 257 /*
 258  * Represents a run of literals followed by a match or end-of-block.  This
 259  * structure is needed to temporarily store items chosen by the compressor,
 260  * since items cannot be written until all items for the block have been chosen
 261  * and the block's Huffman codes have been computed.
 262  */
 263 struct lzx_sequence {
 264
 265         /*
 266          * Bits 9..31: the number of literals in this run.  This may be 0 and
 267          * can be at most about SOFT_MAX_BLOCK_LENGTH.  The literals are not
 268          * stored explicitly in this structure; instead, they are read directly
 269          * from the uncompressed data.
 270          *
 271          * Bits 0..8: the length of the match which follows the literals, or 0
 272          * if this literal run was the last in the block, so there is no match
 273          * which follows it.  This can be at most LZX_MAX_MATCH_LEN.
 274          */
 275         u32 litrunlen_and_matchlen;
 276 #define SEQ_MATCHLEN_BITS       9
 277 #define SEQ_MATCHLEN_MASK       (((u32)1 << SEQ_MATCHLEN_BITS) - 1)
 278
 279         /*
 280          * If 'matchlen' doesn't indicate end-of-block, then this contains:
 281          *
 282          * Bits 10..31: either the offset plus LZX_OFFSET_ADJUSTMENT or a recent
 283          * offset code, depending on the offset slot encoded in the main symbol.
 284          *
 285          * Bits 0..9: the main symbol.
 286          */
 287         u32 adjusted_offset_and_mainsym;
 288 #define SEQ_MAINSYM_BITS        10
 289 #define SEQ_MAINSYM_MASK        (((u32)1 << SEQ_MAINSYM_BITS) - 1)
 290 } _aligned_attribute(8);
 291
 292 /*
 293  * This structure represents a byte position in the input buffer and a node in
 294  * the graph of possible match/literal choices.
 295  *
 296  * Logically, each incoming edge to this node is labeled with a literal or a
 297  * match that can be taken to reach this position from an earlier position; and
 298  * each outgoing edge from this node is labeled with a literal or a match that
 299  * can be taken to advance from this position to a later position.
 300  */
 301 struct lzx_optimum_node {
 302
 303         /* The cost, in bits, of the lowest-cost path that has been found to
 304          * reach this position.  This can change as progressively lower cost
 305          * paths are found to reach this position.  */
 306         u32 cost;
 307
 308         /*
 309          * The best arrival to this node, i.e. the match or literal that was
 310          * used to arrive to this position at the given 'cost'.  This can change
 311          * as progressively lower cost paths are found to reach this position.
 312          *
 313          * For non-gap matches, this variable is divided into two bitfields
 314          * whose meanings depend on the item type:
 315          *
 316          * Literals:
 317          *      Low bits are 0, high bits are the literal.
 318          *
 319          * Explicit offset matches:
 320          *      Low bits are the match length, high bits are the offset plus
 321          *      LZX_OFFSET_ADJUSTMENT.
 322          *
 323          * Repeat offset matches:
 324          *      Low bits are the match length, high bits are the queue index.
 325          *
 326          * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
 327          * behavior applies --- see the code.
 328          */
 329         u32 item;
 330 #define OPTIMUM_OFFSET_SHIFT    SEQ_MATCHLEN_BITS
 331 #define OPTIMUM_LEN_MASK        SEQ_MATCHLEN_MASK
 332 #if CONSIDER_GAP_MATCHES
 333 #  define OPTIMUM_GAP_MATCH 0x80000000
 334 #endif
 335
 336 } _aligned_attribute(8);
 337
 338 /* The cost model for near-optimal parsing */
 339 struct lzx_costs {
 340
 341         /*
 342          * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a
 343          * length 'len' match which has an offset belonging to 'offset_slot'.
 344          * The cost includes the main symbol, the length symbol if required, and
 345          * the extra offset bits if any, excluding any entropy-coded bits
 346          * (aligned offset bits).  It does *not* include the cost of the aligned
 347          * offset symbol which may be required.
 348          */
 349         u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
 350
 351         /* Cost of each symbol in the main code */
 352         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 353
 354         /* Cost of each symbol in the length code */
 355         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 356
 357 #if CONSIDER_ALIGNED_COSTS
 358         /* Cost of each symbol in the aligned offset code */
 359         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 360 #endif
 361 };
 362
 363 struct lzx_output_bitstream;
 364
 365 /* The main LZX compressor structure */
 366 struct lzx_compressor {
 367
 368         /* The buffer for preprocessed input data, if not using destructive
 369          * compression */
 370         void *in_buffer;
 371
 372         /* If true, then the compressor need not preserve the input buffer if it
 373          * compresses the data successfully */
 374         bool destructive;
 375
 376         /* Pointer to the compress() implementation chosen at allocation time */
 377         void (*impl)(struct lzx_compressor *, const u8 *, size_t,
 378                      struct lzx_output_bitstream *);
 379
 380         /* The log base 2 of the window size for match offset encoding purposes.
 381          * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
 382         unsigned window_order;
 383
 384         /* The number of symbols in the main alphabet.  This depends on the
 385          * window order, since the window order determines the maximum possible
 386          * match offset. */
 387         unsigned num_main_syms;
 388
 389         /* The "nice" match length: if a match of this length is found, then it
 390          * is chosen immediately without further consideration. */
 391         unsigned nice_match_length;
 392
 393         /* The maximum search depth: at most this many potential matches are
 394          * considered at each position. */
 395         unsigned max_search_depth;
 396
 397         /* The number of optimization passes per block */
 398         unsigned num_optim_passes;
 399
 400         /* The symbol frequency counters for the current block */
 401         struct lzx_freqs freqs;
 402
 403         /* Block split statistics for the current block */
 404         struct lzx_block_split_stats split_stats;
 405
 406         /* The Huffman codes for the current and previous blocks.  The one with
 407          * index 'codes_index' is for the current block, and the other one is
 408          * for the previous block. */
 409         struct lzx_codes codes[2];
 410         unsigned codes_index;
 411
 412         /* The matches and literals that the compressor has chosen for the
 413          * current block.  The required length of this array is limited by the
 414          * maximum number of matches that can ever be chosen for a single block,
 415          * plus one for the special entry at the end. */
 416         struct lzx_sequence chosen_sequences[
 417                        DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
 418
 419         /* Tables for mapping adjusted offsets to offset slots */
 420         u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
 421         u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
 422
 423         union {
 424                 /* Data for lzx_compress_lazy() */
 425                 struct {
 426                         /* Hash chains matchfinder (MUST BE LAST!!!) */
 427                         union {
 428                                 struct hc_matchfinder_16 hc_mf_16;
 429                                 struct hc_matchfinder_32 hc_mf_32;
 430                         };
 431                 };
 432
 433                 /* Data for lzx_compress_near_optimal() */
 434                 struct {
 435                         /*
 436                          * Array of nodes, one per position, for running the
 437                          * minimum-cost path algorithm.
 438                          *
 439                          * This array must be large enough to accommodate the
 440                          * worst-case number of nodes, which occurs if the
 441                          * compressor finds a match of length LZX_MAX_MATCH_LEN
 442                          * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
 443                          * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
 444                          * LZX_MAX_MATCH_LEN'.  Add one for the end-of-block
 445                          * node.
 446                          */
 447                         struct lzx_optimum_node optimum_nodes[
 448                                                     SOFT_MAX_BLOCK_SIZE - 1 +
 449                                                     LZX_MAX_MATCH_LEN + 1];
 450
 451                         /* The cost model for the current optimization pass */
 452                         struct lzx_costs costs;
 453
 454                         /*
 455                          * Cached matches for the current block.  This array
 456                          * contains the matches that were found at each position
 457                          * in the block.  Specifically, for each position, there
 458                          * is a special 'struct lz_match' whose 'length' field
 459                          * contains the number of matches that were found at
 460                          * that position; this is followed by the matches
 461                          * themselves, if any, sorted by strictly increasing
 462                          * length.
 463                          *
 464                          * Note: in rare cases, there will be a very high number
 465                          * of matches in the block and this array will overflow.
 466                          * If this happens, we force the end of the current
 467                          * block.  CACHE_LENGTH is the length at which we
 468                          * actually check for overflow.  The extra slots beyond
 469                          * this are enough to absorb the worst case overflow,
 470                          * which occurs if starting at &match_cache[CACHE_LENGTH
 471                          * - 1], we write the match count header, then write
 472                          * MAX_MATCHES_PER_POS matches, then skip searching for
 473                          * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
 474                          * write the match count header for each.
 475                          */
 476                         struct lz_match match_cache[CACHE_LENGTH +
 477                                                     MAX_MATCHES_PER_POS +
 478                                                     LZX_MAX_MATCH_LEN - 1];
 479
 480                         /* Binary trees matchfinder (MUST BE LAST!!!) */
 481                         union {
 482                                 struct bt_matchfinder_16 bt_mf_16;
 483                                 struct bt_matchfinder_32 bt_mf_32;
 484                         };
 485                 };
 486         };
 487 };
 488
 489 /******************************************************************************/
 490 /*                            Matchfinder utilities                           */
 491 /*----------------------------------------------------------------------------*/
 492
 493 /*
 494  * Will a matchfinder using 16-bit positions be sufficient for compressing
 495  * buffers of up to the specified size?  The limit could be 65536 bytes, but we
 496  * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
 497  * This requires that the limit be no more than the length of offset_slot_tab_1
 498  * (currently 32768).
 499  */
 500 static forceinline bool
 501 lzx_is_16_bit(size_t max_bufsize)
 502 {
 503         STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
 504         return max_bufsize <= 32768;
 505 }
 506
 507 /*
 508  * Return the offset slot for the specified adjusted match offset.
 509  */
 510 static forceinline unsigned
 511 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
 512                     bool is_16_bit)
 513 {
 514         if (__builtin_constant_p(adjusted_offset) &&
 515             adjusted_offset < LZX_NUM_RECENT_OFFSETS)
 516                 return adjusted_offset;
 517         if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 518                 return c->offset_slot_tab_1[adjusted_offset];
 519         return c->offset_slot_tab_2[adjusted_offset >> 14];
 520 }
 521
 522 /*
 523  * For a match that has the specified length and adjusted offset, tally its main
 524  * symbol, and if needed its length symbol; then return its main symbol.
 525  */
 526 static forceinline unsigned
 527 lzx_tally_main_and_lensyms(struct lzx_compressor *c, unsigned length,
 528                            u32 adjusted_offset, bool is_16_bit)
 529 {
 530         unsigned mainsym;
 531
 532         if (length >= LZX_MIN_SECONDARY_LEN) {
 533                 /* Length symbol needed */
 534                 c->freqs.len[length - LZX_MIN_SECONDARY_LEN]++;
 535                 mainsym = LZX_NUM_CHARS + LZX_NUM_PRIMARY_LENS;
 536         } else {
 537                 /* No length symbol needed */
 538                 mainsym = LZX_NUM_CHARS + length - LZX_MIN_MATCH_LEN;
 539         }
 540
 541         mainsym += LZX_NUM_LEN_HEADERS *
 542                    lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
 543         c->freqs.main[mainsym]++;
 544         return mainsym;
 545 }
 546
 547 /*
 548  * The following macros call either the 16-bit or the 32-bit version of a
 549  * matchfinder function based on the value of 'is_16_bit', which will be known
 550  * at compilation time.
 551  */
 552
 553 #define CALL_HC_MF(is_16_bit, c, funcname, ...)                               \
 554         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
 555                        CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__));
 556
 557 #define CALL_BT_MF(is_16_bit, c, funcname, ...)                               \
 558         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
 559                        CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__));
 560
 561 /******************************************************************************/
 562 /*                             Output bitstream                               */
 563 /*----------------------------------------------------------------------------*/
 564
 565 /*
 566  * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
 567  * units.  Bits are ordered from most significant to least significant within
 568  * each coding unit.
 569  */
 570
 571 /*
 572  * Structure to keep track of the current state of sending bits to the
 573  * compressed output buffer.
 574  */
 575 struct lzx_output_bitstream {
 576
 577         /* Bits that haven't yet been written to the output buffer */
 578         machine_word_t bitbuf;
 579
 580         /* Number of bits currently held in @bitbuf */
 581         machine_word_t bitcount;
 582
 583         /* Pointer to the start of the output buffer */
 584         u8 *start;
 585
 586         /* Pointer to the position in the output buffer at which the next coding
 587          * unit should be written */
 588         u8 *next;
 589
 590         /* Pointer to just past the end of the output buffer, rounded down by
 591          * one byte if needed to make 'end - start' a multiple of 2 */
 592         u8 *end;
 593 };
 594
 595 /* Can the specified number of bits always be added to 'bitbuf' after all
 596  * pending 16-bit coding units have been flushed?  */
 597 #define CAN_BUFFER(n)   ((n) <= WORDBITS - 15)
 598
 599 /* Initialize the output bitstream to write to the specified buffer. */
 600 static void
 601 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
 602 {
 603         os->bitbuf = 0;
 604         os->bitcount = 0;
 605         os->start = buffer;
 606         os->next = buffer;
 607         os->end = (u8 *)buffer + (size & ~1);
 608 }
 609
 610 /*
 611  * Add some bits to the bitbuffer variable of the output bitstream.  The caller
 612  * must make sure there is enough room.
 613  */
 614 static forceinline void
 615 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 616 {
 617         os->bitbuf = (os->bitbuf << num_bits) | bits;
 618         os->bitcount += num_bits;
 619 }
 620
 621 /*
 622  * Flush bits from the bitbuffer variable to the output buffer.  'max_num_bits'
 623  * specifies the maximum number of bits that may have been added since the last
 624  * flush.
 625  */
 626 static forceinline void
 627 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 628 {
 629         /* Masking the number of bits to shift is only needed to avoid undefined
 630          * behavior; we don't actually care about the results of bad shifts.  On
 631          * x86, the explicit masking generates no extra code.  */
 632         const u32 shift_mask = WORDBITS - 1;
 633
 634         if (os->end - os->next < 6)
 635                 return;
 636         put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
 637                                             shift_mask), os->next + 0);
 638         if (max_num_bits > 16)
 639                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
 640                                                 shift_mask), os->next + 2);
 641         if (max_num_bits > 32)
 642                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
 643                                                 shift_mask), os->next + 4);
 644         os->next += (os->bitcount >> 4) << 1;
 645         os->bitcount &= 15;
 646 }
 647
 648 /* Add at most 16 bits to the bitbuffer and flush it.  */
 649 static forceinline void
 650 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 651 {
 652         lzx_add_bits(os, bits, num_bits);
 653         lzx_flush_bits(os, 16);
 654 }
 655
 656 /*
 657  * Flush the last coding unit to the output buffer if needed.  Return the total
 658  * number of bytes written to the output buffer, or 0 if an overflow occurred.
 659  */
 660 static size_t
 661 lzx_flush_output(struct lzx_output_bitstream *os)
 662 {
 663         if (os->end - os->next < 6)
 664                 return 0;
 665
 666         if (os->bitcount != 0) {
 667                 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
 668                 os->next += 2;
 669         }
 670
 671         return os->next - os->start;
 672 }
 673
 674 /******************************************************************************/
 675 /*                           Preparing Huffman codes                          */
 676 /*----------------------------------------------------------------------------*/
 677
 678 /*
 679  * Build the Huffman codes.  This takes as input the frequency tables for each
 680  * code and produces as output a set of tables that map symbols to codewords and
 681  * codeword lengths.
 682  */
 683 static void
 684 lzx_build_huffman_codes(struct lzx_compressor *c)
 685 {
 686         const struct lzx_freqs *freqs = &c->freqs;
 687         struct lzx_codes *codes = &c->codes[c->codes_index];
 688
 689         STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
 690                       MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
 691         make_canonical_huffman_code(c->num_main_syms,
 692                                     MAIN_CODEWORD_LIMIT,
 693                                     freqs->main,
 694                                     codes->lens.main,
 695                                     codes->codewords.main);
 696
 697         STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
 698                       LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
 699         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
 700                                     LENGTH_CODEWORD_LIMIT,
 701                                     freqs->len,
 702                                     codes->lens.len,
 703                                     codes->codewords.len);
 704
 705         STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
 706                       ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
 707         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
 708                                     ALIGNED_CODEWORD_LIMIT,
 709                                     freqs->aligned,
 710                                     codes->lens.aligned,
 711                                     codes->codewords.aligned);
 712 }
 713
 714 /* Reset the symbol frequencies for the current block. */
 715 static void
 716 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
 717 {
 718         memset(&c->freqs, 0, sizeof(c->freqs));
 719 }
 720
 721 static unsigned
 722 lzx_compute_precode_items(const u8 lens[restrict],
 723                           const u8 prev_lens[restrict],
 724                           u32 precode_freqs[restrict],
 725                           unsigned precode_items[restrict])
 726 {
 727         unsigned *itemptr;
 728         unsigned run_start;
 729         unsigned run_end;
 730         unsigned extra_bits;
 731         int delta;
 732         u8 len;
 733
 734         itemptr = precode_items;
 735         run_start = 0;
 736
 737         while (!((len = lens[run_start]) & 0x80)) {
 738
 739                 /* len = the length being repeated  */
 740
 741                 /* Find the next run of codeword lengths.  */
 742
 743                 run_end = run_start + 1;
 744
 745                 /* Fast case for a single length.  */
 746                 if (likely(len != lens[run_end])) {
 747                         delta = prev_lens[run_start] - len;
 748                         if (delta < 0)
 749                                 delta += 17;
 750                         precode_freqs[delta]++;
 751                         *itemptr++ = delta;
 752                         run_start++;
 753                         continue;
 754                 }
 755
 756                 /* Extend the run.  */
 757                 do {
 758                         run_end++;
 759                 } while (len == lens[run_end]);
 760
 761                 if (len == 0) {
 762                         /* Run of zeroes.  */
 763
 764                         /* Symbol 18: RLE 20 to 51 zeroes at a time.  */
 765                         while ((run_end - run_start) >= 20) {
 766                                 extra_bits = min((run_end - run_start) - 20, 0x1F);
 767                                 precode_freqs[18]++;
 768                                 *itemptr++ = 18 | (extra_bits << 5);
 769                                 run_start += 20 + extra_bits;
 770                         }
 771
 772                         /* Symbol 17: RLE 4 to 19 zeroes at a time.  */
 773                         if ((run_end - run_start) >= 4) {
 774                                 extra_bits = min((run_end - run_start) - 4, 0xF);
 775                                 precode_freqs[17]++;
 776                                 *itemptr++ = 17 | (extra_bits << 5);
 777                                 run_start += 4 + extra_bits;
 778                         }
 779                 } else {
 780
 781                         /* A run of nonzero lengths. */
 782
 783                         /* Symbol 19: RLE 4 to 5 of any length at a time.  */
 784                         while ((run_end - run_start) >= 4) {
 785                                 extra_bits = (run_end - run_start) > 4;
 786                                 delta = prev_lens[run_start] - len;
 787                                 if (delta < 0)
 788                                         delta += 17;
 789                                 precode_freqs[19]++;
 790                                 precode_freqs[delta]++;
 791                                 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
 792                                 run_start += 4 + extra_bits;
 793                         }
 794                 }
 795
 796                 /* Output any remaining lengths without RLE.  */
 797                 while (run_start != run_end) {
 798                         delta = prev_lens[run_start] - len;
 799                         if (delta < 0)
 800                                 delta += 17;
 801                         precode_freqs[delta]++;
 802                         *itemptr++ = delta;
 803                         run_start++;
 804                 }
 805         }
 806
 807         return itemptr - precode_items;
 808 }
 809
 810 /******************************************************************************/
 811 /*                          Outputting compressed data                        */
 812 /*----------------------------------------------------------------------------*/
 813
 814 /*
 815  * Output a Huffman code in the compressed form used in LZX.
 816  *
 817  * The Huffman code is represented in the output as a logical series of codeword
 818  * lengths from which the Huffman code, which must be in canonical form, can be
 819  * reconstructed.
 820  *
 821  * The codeword lengths are themselves compressed using a separate Huffman code,
 822  * the "precode", which contains a symbol for each possible codeword length in
 823  * the larger code as well as several special symbols to represent repeated
 824  * codeword lengths (a form of run-length encoding).  The precode is itself
 825  * constructed in canonical form, and its codeword lengths are represented
 826  * literally in 20 4-bit fields that immediately precede the compressed codeword
 827  * lengths of the larger code.
 828  *
 829  * Furthermore, the codeword lengths of the larger code are actually represented
 830  * as deltas from the codeword lengths of the corresponding code in the previous
 831  * block.
 832  *
 833  * @os:
 834  *      Bitstream to which to write the compressed Huffman code.
 835  * @lens:
 836  *      The codeword lengths, indexed by symbol, in the Huffman code.
 837  * @prev_lens:
 838  *      The codeword lengths, indexed by symbol, in the corresponding Huffman
 839  *      code in the previous block, or all zeroes if this is the first block.
 840  * @num_lens:
 841  *      The number of symbols in the Huffman code.
 842  */
 843 static void
 844 lzx_write_compressed_code(struct lzx_output_bitstream *os,
 845                           const u8 lens[restrict],
 846                           const u8 prev_lens[restrict],
 847                           unsigned num_lens)
 848 {
 849         u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
 850         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
 851         u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
 852         unsigned precode_items[num_lens];
 853         unsigned num_precode_items;
 854         unsigned precode_item;
 855         unsigned precode_sym;
 856         unsigned i;
 857         u8 saved = lens[num_lens];
 858         *(u8 *)(lens + num_lens) = 0x80;
 859
 860         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 861                 precode_freqs[i] = 0;
 862
 863         /* Compute the "items" (RLE / literal tokens and extra bits) with which
 864          * the codeword lengths in the larger code will be output.  */
 865         num_precode_items = lzx_compute_precode_items(lens,
 866                                                       prev_lens,
 867                                                       precode_freqs,
 868                                                       precode_items);
 869
 870         /* Build the precode.  */
 871         STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
 872                       PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
 873         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT,
 874                                     precode_freqs, precode_lens,
 875                                     precode_codewords);
 876
 877         /* Output the lengths of the codewords in the precode.  */
 878         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 879                 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
 880
 881         /* Output the encoded lengths of the codewords in the larger code.  */
 882         for (i = 0; i < num_precode_items; i++) {
 883                 precode_item = precode_items[i];
 884                 precode_sym = precode_item & 0x1F;
 885                 lzx_add_bits(os, precode_codewords[precode_sym],
 886                              precode_lens[precode_sym]);
 887                 if (precode_sym >= 17) {
 888                         if (precode_sym == 17) {
 889                                 lzx_add_bits(os, precode_item >> 5, 4);
 890                         } else if (precode_sym == 18) {
 891                                 lzx_add_bits(os, precode_item >> 5, 5);
 892                         } else {
 893                                 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
 894                                 precode_sym = precode_item >> 6;
 895                                 lzx_add_bits(os, precode_codewords[precode_sym],
 896                                              precode_lens[precode_sym]);
 897                         }
 898                 }
 899                 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
 900                 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
 901         }
 902
 903         *(u8 *)(lens + num_lens) = saved;
 904 }
 905
 906 /*
 907  * Write all matches and literal bytes (which were precomputed) in an LZX
 908  * compressed block to the output bitstream in the final compressed
 909  * representation.
 910  *
 911  * @os
 912  *      The output bitstream.
 913  * @block_type
 914  *      The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
 915  *      LZX_BLOCKTYPE_VERBATIM).
 916  * @block_data
 917  *      The uncompressed data of the block.
 918  * @sequences
 919  *      The matches and literals to output, given as a series of sequences.
 920  * @codes
 921  *      The main, length, and aligned offset Huffman codes for the block.
 922  */
 923 static void
 924 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
 925                     const u8 *block_data, const struct lzx_sequence sequences[],
 926                     const struct lzx_codes *codes)
 927 {
 928         const struct lzx_sequence *seq = sequences;
 929         unsigned min_aligned_offset_slot;
 930
 931         if (block_type == LZX_BLOCKTYPE_ALIGNED)
 932                 min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
 933         else
 934                 min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
 935
 936         for (;;) {
 937                 /* Output the next sequence.  */
 938
 939                 u32 litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
 940                 unsigned matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
 941                 STATIC_ASSERT((u32)~SEQ_MATCHLEN_MASK >> SEQ_MATCHLEN_BITS >=
 942                               SOFT_MAX_BLOCK_SIZE);
 943                 u32 adjusted_offset;
 944                 unsigned main_symbol;
 945                 unsigned offset_slot;
 946                 unsigned num_extra_bits;
 947                 u32 extra_bits;
 948
 949                 /* Output the literal run of the sequence.  */
 950
 951                 if (litrunlen) {  /* Is the literal run nonempty?  */
 952
 953                         /* Verify optimization is enabled on 64-bit  */
 954                         STATIC_ASSERT(WORDBITS < 64 ||
 955                                       CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
 956
 957                         if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
 958
 959                                 /* 64-bit: write 3 literals at a time.  */
 960                                 while (litrunlen >= 3) {
 961                                         unsigned lit0 = block_data[0];
 962                                         unsigned lit1 = block_data[1];
 963                                         unsigned lit2 = block_data[2];
 964                                         lzx_add_bits(os, codes->codewords.main[lit0],
 965                                                      codes->lens.main[lit0]);
 966                                         lzx_add_bits(os, codes->codewords.main[lit1],
 967                                                      codes->lens.main[lit1]);
 968                                         lzx_add_bits(os, codes->codewords.main[lit2],
 969                                                      codes->lens.main[lit2]);
 970                                         lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
 971                                         block_data += 3;
 972                                         litrunlen -= 3;
 973                                 }
 974                                 if (litrunlen--) {
 975                                         unsigned lit = *block_data++;
 976                                         lzx_add_bits(os, codes->codewords.main[lit],
 977                                                      codes->lens.main[lit]);
 978                                         if (litrunlen--) {
 979                                                 unsigned lit = *block_data++;
 980                                                 lzx_add_bits(os, codes->codewords.main[lit],
 981                                                              codes->lens.main[lit]);
 982                                                 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
 983                                         } else {
 984                                                 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
 985                                         }
 986                                 }
 987                         } else {
 988                                 /* 32-bit: write 1 literal at a time.  */
 989                                 do {
 990                                         unsigned lit = *block_data++;
 991                                         lzx_add_bits(os, codes->codewords.main[lit],
 992                                                      codes->lens.main[lit]);
 993                                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
 994                                 } while (--litrunlen);
 995                         }
 996                 }
 997
 998                 /* Was this the last literal run?  */
 999                 if (matchlen == 0)
1000                         return;
1001
1002                 /* Nope; output the match.  */
1003
1004                 block_data += matchlen;
1005
1006                 adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
1007                 main_symbol = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
1008
1009                 offset_slot = (main_symbol - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;
1010                 num_extra_bits = lzx_extra_offset_bits[offset_slot];
1011                 extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
1012                                                 LZX_OFFSET_ADJUSTMENT);
1013
1014         #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT +           \
1015                                 LENGTH_CODEWORD_LIMIT +         \
1016                                 LZX_MAX_NUM_EXTRA_BITS -        \
1017                                 LZX_NUM_ALIGNED_OFFSET_BITS +   \
1018                                 ALIGNED_CODEWORD_LIMIT)
1019
1020                 /* Verify optimization is enabled on 64-bit  */
1021                 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
1022
1023                 /* Output the main symbol for the match.  */
1024
1025                 lzx_add_bits(os, codes->codewords.main[main_symbol],
1026                              codes->lens.main[main_symbol]);
1027                 if (!CAN_BUFFER(MAX_MATCH_BITS))
1028                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
1029
1030                 /* If needed, output the length symbol for the match.  */
1031
1032                 if (matchlen >= LZX_MIN_SECONDARY_LEN) {
1033                         lzx_add_bits(os, codes->codewords.len[matchlen -
1034                                                               LZX_MIN_SECONDARY_LEN],
1035                                      codes->lens.len[matchlen -
1036                                                      LZX_MIN_SECONDARY_LEN]);
1037                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1038                                 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
1039                 }
1040
1041                 /* Output the extra offset bits for the match.  In aligned
1042                  * offset blocks, the lowest 3 bits of the adjusted offset are
1043                  * Huffman-encoded using the aligned offset code, provided that
1044                  * there are at least extra 3 offset bits required.  All other
1045                  * extra offset bits are output verbatim.  */
1046
1047                 if (offset_slot >= min_aligned_offset_slot) {
1048
1049                         lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1050                                      num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1051                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1052                                 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS -
1053                                                    LZX_NUM_ALIGNED_OFFSET_BITS);
1054
1055                         lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1056                                                                   LZX_ALIGNED_OFFSET_BITMASK],
1057                                      codes->lens.aligned[adjusted_offset &
1058                                                          LZX_ALIGNED_OFFSET_BITMASK]);
1059                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1060                                 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1061                 } else {
1062                         STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS));
1063
1064                         lzx_add_bits(os, extra_bits, num_extra_bits);
1065                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1066                                 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS);
1067                 }
1068
1069                 if (CAN_BUFFER(MAX_MATCH_BITS))
1070                         lzx_flush_bits(os, MAX_MATCH_BITS);
1071
1072                 /* Advance to the next sequence.  */
1073                 seq++;
1074         }
1075 }
1076
1077 static void
1078 lzx_write_compressed_block(const u8 *block_begin,
1079                            int block_type,
1080                            u32 block_size,
1081                            unsigned window_order,
1082                            unsigned num_main_syms,
1083                            const struct lzx_sequence sequences[],
1084                            const struct lzx_codes * codes,
1085                            const struct lzx_lens * prev_lens,
1086                            struct lzx_output_bitstream * os)
1087 {
1088         /* The first three bits indicate the type of block and are one of the
1089          * LZX_BLOCKTYPE_* constants.  */
1090         lzx_write_bits(os, block_type, 3);
1091
1092         /*
1093          * Output the block size.
1094          *
1095          * The original LZX format encoded the block size in 24 bits.  However,
1096          * the LZX format used in WIM archives uses 1 bit to specify whether the
1097          * block has the default size of 32768 bytes, then optionally 16 bits to
1098          * specify a non-default size.  This works fine for Microsoft's WIM
1099          * software (WIMGAPI), which never compresses more than 32768 bytes at a
1100          * time with LZX.  However, as an extension, our LZX compressor supports
1101          * compressing up to 2097152 bytes, with a corresponding increase in
1102          * window size.  It is possible for blocks in these larger buffers to
1103          * exceed 65535 bytes; such blocks cannot have their size represented in
1104          * 16 bits.
1105          *
1106          * The chosen solution was to use 24 bits for the block size when
1107          * possibly required --- specifically, when the compressor has been
1108          * allocated to be capable of compressing more than 32768 bytes at once
1109          * (which also causes the number of main symbols to be increased).
1110          */
1111         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1112                 lzx_write_bits(os, 1, 1);
1113         } else {
1114                 lzx_write_bits(os, 0, 1);
1115
1116                 if (window_order >= 16)
1117                         lzx_write_bits(os, block_size >> 16, 8);
1118
1119                 lzx_write_bits(os, block_size & 0xFFFF, 16);
1120         }
1121
1122         /* If it's an aligned offset block, output the aligned offset code.  */
1123         if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1124                 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1125                         lzx_write_bits(os, codes->lens.aligned[i],
1126                                        LZX_ALIGNEDCODE_ELEMENT_SIZE);
1127                 }
1128         }
1129
1130         /* Output the main code (two parts).  */
1131         lzx_write_compressed_code(os, codes->lens.main,
1132                                   prev_lens->main,
1133                                   LZX_NUM_CHARS);
1134         lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1135                                   prev_lens->main + LZX_NUM_CHARS,
1136                                   num_main_syms - LZX_NUM_CHARS);
1137
1138         /* Output the length code.  */
1139         lzx_write_compressed_code(os, codes->lens.len,
1140                                   prev_lens->len,
1141                                   LZX_LENCODE_NUM_SYMBOLS);
1142
1143         /* Output the compressed matches and literals.  */
1144         lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1145 }
1146
1147 /*
1148  * Given the frequencies of symbols in an LZX-compressed block and the
1149  * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1150  * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1151  * will take fewer bits to output.
1152  */
1153 static int
1154 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1155                                const struct lzx_codes * codes)
1156 {
1157         u32 verbatim_cost = 0;
1158         u32 aligned_cost = 0;
1159
1160         /* A verbatim block requires 3 bits in each place that an aligned offset
1161          * symbol would be used in an aligned offset block.  */
1162         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1163                 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1164                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1165         }
1166
1167         /* Account for the cost of sending the codeword lengths of the aligned
1168          * offset code.  */
1169         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1170                         LZX_ALIGNEDCODE_NUM_SYMBOLS;
1171
1172         if (aligned_cost < verbatim_cost)
1173                 return LZX_BLOCKTYPE_ALIGNED;
1174         else
1175                 return LZX_BLOCKTYPE_VERBATIM;
1176 }
1177
1178 /*
1179  * Flush an LZX block:
1180  *
1181  * 1. Build the Huffman codes.
1182  * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1183  * 3. Write the block.
1184  * 4. Swap the indices of the current and previous Huffman codes.
1185  *
1186  * Note: we never output UNCOMPRESSED blocks.  This probably should be
1187  * implemented sometime, but it doesn't make much difference.
1188  */
1189 static void
1190 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1191                 const u8 *block_begin, u32 block_size, u32 seq_idx)
1192 {
1193         int block_type;
1194
1195         lzx_build_huffman_codes(c);
1196
1197         block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1198                                                     &c->codes[c->codes_index]);
1199         lzx_write_compressed_block(block_begin,
1200                                    block_type,
1201                                    block_size,
1202                                    c->window_order,
1203                                    c->num_main_syms,
1204                                    &c->chosen_sequences[seq_idx],
1205                                    &c->codes[c->codes_index],
1206                                    &c->codes[c->codes_index ^ 1].lens,
1207                                    os);
1208         c->codes_index ^= 1;
1209 }
1210
1211 /******************************************************************************/
1212 /*                          Block splitting algorithm                         */
1213 /*----------------------------------------------------------------------------*/
1214
1215 /*
1216  * The problem of block splitting is to decide when it is worthwhile to start a
1217  * new block with new entropy codes.  There is a theoretically optimal solution:
1218  * recursively consider every possible block split, considering the exact cost
1219  * of each block, and choose the minimum cost approach.  But this is far too
1220  * slow.  Instead, as an approximation, we can count symbols and after every N
1221  * symbols, compare the expected distribution of symbols based on the previous
1222  * data with the actual distribution.  If they differ "by enough", then start a
1223  * new block.
1224  *
1225  * As an optimization and heuristic, we don't distinguish between every symbol
1226  * but rather we combine many symbols into a single "observation type".  For
1227  * literals we only look at the high bits and low bits, and for matches we only
1228  * look at whether the match is long or not.  The assumption is that for typical
1229  * "real" data, places that are good block boundaries will tend to be noticeable
1230  * based only on changes in these aggregate frequencies, without looking for
1231  * subtle differences in individual symbols.  For example, a change from ASCII
1232  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1233  * to many matches (generally more compressible), would be easily noticed based
1234  * on the aggregates.
1235  *
1236  * For determining whether the frequency distributions are "different enough" to
1237  * start a new block, the simply heuristic of splitting when the sum of absolute
1238  * differences exceeds a constant seems to be good enough.
1239  *
1240  * Finally, for an approximation, it is not strictly necessary that the exact
1241  * symbols being used are considered.  With "near-optimal parsing", for example,
1242  * the actual symbols that will be used are unknown until after the block
1243  * boundary is chosen and the block has been optimized.  Since the final choices
1244  * cannot be used, we can use preliminary "greedy" choices instead.
1245  */
1246
1247 /* Initialize the block split statistics when starting a new block. */
1248 static void
1249 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1250 {
1251         memset(stats, 0, sizeof(*stats));
1252 }
1253
1254 /* Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
1255  * literal, for 8 possible literal observation types.  */
1256 static forceinline void
1257 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1258 {
1259         stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1260         stats->num_new_observations++;
1261 }
1262
1263 /* Match observation.  Heuristic: use one observation type for "short match" and
1264  * one observation type for "long match".  */
1265 static forceinline void
1266 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1267 {
1268         stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1269         stats->num_new_observations++;
1270 }
1271
1272 static bool
1273 lzx_should_end_block(struct lzx_block_split_stats *stats)
1274 {
1275         if (stats->num_observations > 0) {
1276
1277                 /* Note: to avoid slow divisions, we do not divide by
1278                  * 'num_observations', but rather do all math with the numbers
1279                  * multiplied by 'num_observations'. */
1280                 u32 total_delta = 0;
1281                 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1282                         u32 expected = stats->observations[i] *
1283                                        stats->num_new_observations;
1284                         u32 actual = stats->new_observations[i] *
1285                                      stats->num_observations;
1286                         u32 delta = (actual > expected) ? actual - expected :
1287                                                           expected - actual;
1288                         total_delta += delta;
1289                 }
1290
1291                 /* Ready to end the block? */
1292                 if (total_delta >=
1293                     stats->num_new_observations * 7 / 8 * stats->num_observations)
1294                         return true;
1295         }
1296
1297         for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1298                 stats->num_observations += stats->new_observations[i];
1299                 stats->observations[i] += stats->new_observations[i];
1300                 stats->new_observations[i] = 0;
1301         }
1302         stats->num_new_observations = 0;
1303         return false;
1304 }
1305
1306 /******************************************************************************/
1307 /*                   Slower ("near-optimal") compression algorithm            */
1308 /*----------------------------------------------------------------------------*/
1309
1310 /*
1311  * Least-recently-used queue for match offsets.
1312  *
1313  * This is represented as a 64-bit integer for efficiency.  There are three
1314  * offsets of 21 bits each.  Bit 64 is garbage.
1315  */
1316 struct lzx_lru_queue {
1317         u64 R;
1318 } _aligned_attribute(8);
1319
1320 #define LZX_QUEUE_OFFSET_SHIFT  21
1321 #define LZX_QUEUE_OFFSET_MASK   (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1322
1323 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1324 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1325 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1326
1327 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1328 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1329 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1330
1331 #define LZX_QUEUE_INITIALIZER {                 \
1332         ((u64)1 << LZX_QUEUE_R0_SHIFT) |        \
1333         ((u64)1 << LZX_QUEUE_R1_SHIFT) |        \
1334         ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1335
1336 static forceinline u64
1337 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1338 {
1339         return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1340 }
1341
1342 static forceinline u64
1343 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1344 {
1345         return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1346 }
1347
1348 static forceinline u64
1349 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1350 {
1351         return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1352 }
1353
1354 /* Push a match offset onto the front (most recently used) end of the queue.  */
1355 static forceinline struct lzx_lru_queue
1356 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1357 {
1358         return (struct lzx_lru_queue) {
1359                 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1360         };
1361 }
1362
1363 /* Swap a match offset to the front of the queue.  */
1364 static forceinline struct lzx_lru_queue
1365 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1366 {
1367         unsigned shift = idx * 21;
1368         const u64 mask = LZX_QUEUE_R0_MASK;
1369         const u64 mask_high = mask << shift;
1370
1371         return (struct lzx_lru_queue) {
1372                 (queue.R & ~(mask | mask_high)) |
1373                 ((queue.R & mask_high) >> shift) |
1374                 ((queue.R & mask) << shift)
1375         };
1376 }
1377
1378 static forceinline u32
1379 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1380                    bool record)
1381 {
1382         struct lzx_sequence *seq =
1383                 &c->chosen_sequences[ARRAY_LEN(c->chosen_sequences) - 1];
1384         u32 node_idx = block_size;
1385         u32 litrun_end; /* if record=true: end of the current literal run */
1386
1387         if (record) {
1388                 /* The last sequence has matchlen 0 */
1389                 seq->litrunlen_and_matchlen = 0;
1390                 litrun_end = node_idx;
1391         }
1392
1393         for (;;) {
1394                 u32 item;
1395                 unsigned matchlen;
1396                 u32 adjusted_offset;
1397                 unsigned mainsym;
1398
1399                 /* Tally literals until either a match or the beginning of the
1400                  * block is reached.  Note: the item in the node at the
1401                  * beginning of the block (c->optimum_nodes[0]) has all bits
1402                  * set, causing this loop to end when it is reached. */
1403                 for (;;) {
1404                         item = c->optimum_nodes[node_idx].item;
1405                         if (item & OPTIMUM_LEN_MASK)
1406                                 break;
1407                         c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1408                         node_idx--;
1409                 }
1410
1411         #if CONSIDER_GAP_MATCHES
1412                 if (item & OPTIMUM_GAP_MATCH) {
1413                         if (node_idx == 0)
1414                                 break;
1415                         /* Tally/record the rep0 match after the gap. */
1416                         matchlen = item & OPTIMUM_LEN_MASK;
1417                         mainsym = lzx_tally_main_and_lensyms(c, matchlen, 0,
1418                                                              is_16_bit);
1419                         if (record) {
1420                                 seq->litrunlen_and_matchlen |=
1421                                         (litrun_end - node_idx) <<
1422                                          SEQ_MATCHLEN_BITS;
1423                                 seq--;
1424                                 seq->litrunlen_and_matchlen = matchlen;
1425                                 seq->adjusted_offset_and_mainsym = mainsym;
1426                                 litrun_end = node_idx - matchlen;
1427                         }
1428
1429                         /* Tally the literal in the gap. */
1430                         c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1431
1432                         /* Fall through and tally the match before the gap.
1433                          * (It was temporarily saved in the 'cost' field of the
1434                          * previous node, which was free to reuse.) */
1435                         item = c->optimum_nodes[--node_idx].cost;
1436                         node_idx -= matchlen;
1437                 }
1438         #else /* CONSIDER_GAP_MATCHES */
1439                 if (node_idx == 0)
1440                         break;
1441         #endif /* !CONSIDER_GAP_MATCHES */
1442
1443                 /* Tally/record a match. */
1444                 matchlen = item & OPTIMUM_LEN_MASK;
1445                 adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
1446                 mainsym = lzx_tally_main_and_lensyms(c, matchlen,
1447                                                      adjusted_offset,
1448                                                      is_16_bit);
1449                 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET +
1450                                        LZX_OFFSET_ADJUSTMENT)
1451                         c->freqs.aligned[adjusted_offset &
1452                                          LZX_ALIGNED_OFFSET_BITMASK]++;
1453                 if (record) {
1454                         seq->litrunlen_and_matchlen |=
1455                                 (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
1456                         seq--;
1457                         seq->litrunlen_and_matchlen = matchlen;
1458                         seq->adjusted_offset_and_mainsym =
1459                                 (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
1460                         litrun_end = node_idx - matchlen;
1461                 }
1462                 node_idx -= matchlen;
1463         }
1464
1465         /* Record the literal run length for the first sequence. */
1466         if (record) {
1467                 seq->litrunlen_and_matchlen |=
1468                         (litrun_end - node_idx) << SEQ_MATCHLEN_BITS;
1469         }
1470
1471         /* Return the index in chosen_sequences at which the sequences begin. */
1472         return seq - &c->chosen_sequences[0];
1473 }
1474
1475 /*
1476  * Given the minimum-cost path computed through the item graph for the current
1477  * block, walk the path and count how many of each symbol in each Huffman-coded
1478  * alphabet would be required to output the items (matches and literals) along
1479  * the path.
1480  *
1481  * Note that the path will be walked backwards (from the end of the block to the
1482  * beginning of the block), but this doesn't matter because this function only
1483  * computes frequencies.
1484  */
1485 static forceinline void
1486 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1487 {
1488         lzx_walk_item_list(c, block_size, is_16_bit, false);
1489 }
1490
1491 /*
1492  * Like lzx_tally_item_list(), but this function also generates the list of
1493  * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1494  * ready to be output to the bitstream after the Huffman codes are computed.
1495  * The lzx_sequences will be written to decreasing memory addresses as the path
1496  * is walked backwards, which means they will end up in the expected
1497  * first-to-last order.  The return value is the index in c->chosen_sequences at
1498  * which the lzx_sequences begin.
1499  */
1500 static forceinline u32
1501 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1502 {
1503         return lzx_walk_item_list(c, block_size, is_16_bit, true);
1504 }
1505
1506 /*
1507  * Find an inexpensive path through the graph of possible match/literal choices
1508  * for the current block.  The nodes of the graph are
1509  * c->optimum_nodes[0...block_size].  They correspond directly to the bytes in
1510  * the current block, plus one extra node for end-of-block.  The edges of the
1511  * graph are matches and literals.  The goal is to find the minimum cost path
1512  * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1513  * model 'c->costs'.
1514  *
1515  * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1516  * proceeding forwards one node at a time.  At each node, a selection of matches
1517  * (len >= 2), as well as the literal byte (len = 1), is considered.  An item of
1518  * length 'len' provides a new path to reach the node 'len' bytes later.  If
1519  * such a path is the lowest cost found so far to reach that later node, then
1520  * that later node is updated with the new cost and the "arrival" which provided
1521  * that cost.
1522  *
1523  * Note that although this algorithm is based on minimum cost path search, due
1524  * to various simplifying assumptions the result is not guaranteed to be the
1525  * true minimum cost, or "optimal", path over the graph of all valid LZX
1526  * representations of this block.
1527  *
1528  * Also, note that because of the presence of the recent offsets queue (which is
1529  * a type of adaptive state), the algorithm cannot work backwards and compute
1530  * "cost to end" instead of "cost to beginning".  Furthermore, the way the
1531  * algorithm handles this adaptive state in the "minimum cost" parse is actually
1532  * only an approximation.  It's possible for the globally optimal, minimum cost
1533  * path to contain a prefix, ending at a position, where that path prefix is
1534  * *not* the minimum cost path to that position.  This can happen if such a path
1535  * prefix results in a different adaptive state which results in lower costs
1536  * later.  The algorithm does not solve this problem in general; it only looks
1537  * one step ahead, with the exception of special consideration for "gap
1538  * matches".
1539  */
1540 static forceinline struct lzx_lru_queue
1541 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1542                        const u8 * const restrict block_begin,
1543                        const u32 block_size,
1544                        const struct lzx_lru_queue initial_queue,
1545                        bool is_16_bit)
1546 {
1547         struct lzx_optimum_node *cur_node = c->optimum_nodes;
1548         struct lzx_optimum_node * const end_node = cur_node + block_size;
1549         struct lz_match *cache_ptr = c->match_cache;
1550         const u8 *in_next = block_begin;
1551         const u8 * const block_end = block_begin + block_size;
1552
1553         /*
1554          * Instead of storing the match offset LRU queues in the
1555          * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1556          * storing them in a smaller array.  This works because the algorithm
1557          * only requires a limited history of the adaptive state.  Once a given
1558          * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1559          * (more if gap match consideration is enabled; we just round up to 512
1560          * so it's a power of 2), it is no longer needed.
1561          *
1562          * The QUEUE() macro finds the queue for the given node.  This macro has
1563          * been optimized by taking advantage of 'struct lzx_lru_queue' and
1564          * 'struct lzx_optimum_node' both being 8 bytes in size and alignment.
1565          */
1566         struct lzx_lru_queue queues[512];
1567         STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1568         STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1569 #define QUEUE(node) \
1570         (*(struct lzx_lru_queue *)((char *)queues + \
1571                         ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0])))))
1572         /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/
1573
1574 #if CONSIDER_GAP_MATCHES
1575         u32 matches_before_gap[ARRAY_LEN(queues)];
1576 #define MATCH_BEFORE_GAP(node) \
1577         (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1578                             ARRAY_LEN(matches_before_gap)])
1579 #endif
1580
1581         /*
1582          * Initially, the cost to reach each node is "infinity".
1583          *
1584          * The first node actually should have cost 0, but "infinity"
1585          * (0xFFFFFFFF) works just as well because it immediately overflows.
1586          *
1587          * The following statement also intentionally sets the 'item' of the
1588          * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
1589          * use as a sentinel.  See lzx_walk_item_list().
1590          */
1591         memset(c->optimum_nodes, 0xFF,
1592                (block_size + 1) * sizeof(c->optimum_nodes[0]));
1593
1594         /* Initialize the recent offsets queue for the first node. */
1595         QUEUE(cur_node) = initial_queue;
1596
1597         do { /* For each node in the block in position order... */
1598
1599                 unsigned num_matches;
1600                 unsigned literal;
1601                 u32 cost;
1602
1603                 /*
1604                  * A selection of matches for the block was already saved in
1605                  * memory so that we don't have to run the uncompressed data
1606                  * through the matchfinder on every optimization pass.  However,
1607                  * we still search for repeat offset matches during each
1608                  * optimization pass because we cannot predict the state of the
1609                  * recent offsets queue.  But as a heuristic, we don't bother
1610                  * searching for repeat offset matches if the general-purpose
1611                  * matchfinder failed to find any matches.
1612                  *
1613                  * Note that a match of length n at some offset implies there is
1614                  * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1615                  * that same offset.  In other words, we don't necessarily need
1616                  * to use the full length of a match.  The key heuristic that
1617                  * saves a significicant amount of time is that for each
1618                  * distinct length, we only consider the smallest offset for
1619                  * which that length is available.  This heuristic also applies
1620                  * to repeat offsets, which we order specially: R0 < R1 < R2 <
1621                  * any explicit offset.  Of course, this heuristic may be
1622                  * produce suboptimal results because offset slots in LZX are
1623                  * subject to entropy encoding, but in practice this is a useful
1624                  * heuristic.
1625                  */
1626
1627                 num_matches = cache_ptr->length;
1628                 cache_ptr++;
1629
1630                 if (num_matches) {
1631                         struct lz_match *end_matches = cache_ptr + num_matches;
1632                         unsigned next_len = LZX_MIN_MATCH_LEN;
1633                         unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1634                         const u8 *matchptr;
1635
1636                         /* Consider rep0 matches. */
1637                         matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1638                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1639                                 goto rep0_done;
1640                         STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1641                         do {
1642                                 u32 cost = cur_node->cost +
1643                                            c->costs.match_cost[0][
1644                                                         next_len - LZX_MIN_MATCH_LEN];
1645                                 if (cost <= (cur_node + next_len)->cost) {
1646                                         (cur_node + next_len)->cost = cost;
1647                                         (cur_node + next_len)->item =
1648                                                 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1649                                 }
1650                                 if (unlikely(++next_len > max_len)) {
1651                                         cache_ptr = end_matches;
1652                                         goto done_matches;
1653                                 }
1654                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1655
1656                 rep0_done:
1657
1658                         /* Consider rep1 matches. */
1659                         matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1660                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1661                                 goto rep1_done;
1662                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1663                                 goto rep1_done;
1664                         for (unsigned len = 2; len < next_len - 1; len++)
1665                                 if (matchptr[len] != in_next[len])
1666                                         goto rep1_done;
1667                         do {
1668                                 u32 cost = cur_node->cost +
1669                                            c->costs.match_cost[1][
1670                                                         next_len - LZX_MIN_MATCH_LEN];
1671                                 if (cost <= (cur_node + next_len)->cost) {
1672                                         (cur_node + next_len)->cost = cost;
1673                                         (cur_node + next_len)->item =
1674                                                 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1675                                 }
1676                                 if (unlikely(++next_len > max_len)) {
1677                                         cache_ptr = end_matches;
1678                                         goto done_matches;
1679                                 }
1680                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1681
1682                 rep1_done:
1683
1684                         /* Consider rep2 matches. */
1685                         matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1686                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1687                                 goto rep2_done;
1688                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1689                                 goto rep2_done;
1690                         for (unsigned len = 2; len < next_len - 1; len++)
1691                                 if (matchptr[len] != in_next[len])
1692                                         goto rep2_done;
1693                         do {
1694                                 u32 cost = cur_node->cost +
1695                                            c->costs.match_cost[2][
1696                                                         next_len - LZX_MIN_MATCH_LEN];
1697                                 if (cost <= (cur_node + next_len)->cost) {
1698                                         (cur_node + next_len)->cost = cost;
1699                                         (cur_node + next_len)->item =
1700                                                 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1701                                 }
1702                                 if (unlikely(++next_len > max_len)) {
1703                                         cache_ptr = end_matches;
1704                                         goto done_matches;
1705                                 }
1706                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1707
1708                 rep2_done:
1709
1710                         while (next_len > cache_ptr->length)
1711                                 if (++cache_ptr == end_matches)
1712                                         goto done_matches;
1713
1714                         /* Consider explicit offset matches. */
1715                         for (;;) {
1716                                 u32 offset = cache_ptr->offset;
1717                                 u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
1718                                 unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1719                                 u32 base_cost = cur_node->cost;
1720                                 u32 cost;
1721
1722                         #if CONSIDER_ALIGNED_COSTS
1723                                 if (offset >= LZX_MIN_ALIGNED_OFFSET)
1724                                         base_cost += c->costs.aligned[adjusted_offset &
1725                                                                       LZX_ALIGNED_OFFSET_BITMASK];
1726                         #endif
1727                                 do {
1728                                         cost = base_cost +
1729                                                c->costs.match_cost[offset_slot][
1730                                                                 next_len - LZX_MIN_MATCH_LEN];
1731                                         if (cost < (cur_node + next_len)->cost) {
1732                                                 (cur_node + next_len)->cost = cost;
1733                                                 (cur_node + next_len)->item =
1734                                                         (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len;
1735                                         }
1736                                 } while (++next_len <= cache_ptr->length);
1737
1738                                 if (++cache_ptr == end_matches) {
1739                                 #if CONSIDER_GAP_MATCHES
1740                                         /* Also consider the longest explicit
1741                                          * offset match as a "gap match": match
1742                                          * + lit + rep0. */
1743                                         s32 remaining = (block_end - in_next) - (s32)next_len;
1744                                         if (likely(remaining >= 2)) {
1745                                                 const u8 *strptr = in_next + next_len;
1746                                                 const u8 *matchptr = strptr - offset;
1747                                                 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1748                                                         STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1749                                                         STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1750                                                         unsigned limit = min(remaining,
1751                                                                              min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1752                                                                                  LZX_MAX_MATCH_LEN));
1753                                                         unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit);
1754                                                         u8 lit = strptr[-1];
1755                                                         cost += c->costs.main[lit] +
1756                                                                 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1757                                                         unsigned total_len = next_len + rep0_len;
1758                                                         if (cost < (cur_node + total_len)->cost) {
1759                                                                 (cur_node + total_len)->cost = cost;
1760                                                                 (cur_node + total_len)->item =
1761                                                                         OPTIMUM_GAP_MATCH |
1762                                                                         ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1763                                                                         rep0_len;
1764                                                                 MATCH_BEFORE_GAP(cur_node + total_len) =
1765                                                                         (adjusted_offset << OPTIMUM_OFFSET_SHIFT) |
1766                                                                         (next_len - 1);
1767                                                         }
1768                                                 }
1769                                         }
1770                                 #endif /* CONSIDER_GAP_MATCHES */
1771                                         break;
1772                                 }
1773                         }
1774                 }
1775
1776         done_matches:
1777
1778                 /* Consider coding a literal.
1779
1780                  * To avoid an extra branch, actually checking the preferability
1781                  * of coding the literal is integrated into the queue update
1782                  * code below. */
1783                 literal = *in_next++;
1784                 cost = cur_node->cost + c->costs.main[literal];
1785
1786                 /* Advance to the next position. */
1787                 cur_node++;
1788
1789                 /* The lowest-cost path to the current position is now known.
1790                  * Finalize the recent offsets queue that results from taking
1791                  * this lowest-cost path. */
1792
1793                 if (cost <= cur_node->cost) {
1794                         /* Literal: queue remains unchanged. */
1795                         cur_node->cost = cost;
1796                         cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1797                         QUEUE(cur_node) = QUEUE(cur_node - 1);
1798                 } else {
1799                         /* Match: queue update is needed. */
1800                         unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1801                 #if CONSIDER_GAP_MATCHES
1802                         s32 adjusted_offset = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1803                         STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1804                 #else
1805                         u32 adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1806                 #endif
1807
1808                         if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) {
1809                                 /* Explicit offset match: insert offset at front. */
1810                                 QUEUE(cur_node) =
1811                                         lzx_lru_queue_push(QUEUE(cur_node - len),
1812                                                            adjusted_offset - LZX_OFFSET_ADJUSTMENT);
1813                         }
1814                 #if CONSIDER_GAP_MATCHES
1815                         else if (adjusted_offset < 0) {
1816                                 /* "Gap match": Explicit offset match, then a
1817                                  * literal, then rep0 match.  Save the explicit
1818                                  * offset match information in the cost field of
1819                                  * the previous node, which isn't needed
1820                                  * anymore.  Then insert the offset at the front
1821                                  * of the queue. */
1822                                 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1823                                 (cur_node - 1)->cost = match_before_gap;
1824                                 QUEUE(cur_node) =
1825                                         lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1826                                                                  (match_before_gap & OPTIMUM_LEN_MASK)),
1827                                                            (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1828                                                            LZX_OFFSET_ADJUSTMENT);
1829                         }
1830                 #endif
1831                         else {
1832                                 /* Repeat offset match: swap offset to front. */
1833                                 QUEUE(cur_node) =
1834                                         lzx_lru_queue_swap(QUEUE(cur_node - len),
1835                                                            adjusted_offset);
1836                         }
1837                 }
1838         } while (cur_node != end_node);
1839
1840         /* Return the recent offsets queue at the end of the path. */
1841         return QUEUE(cur_node);
1842 }
1843
1844 /*
1845  * Given the costs for the main and length codewords (c->costs.main and
1846  * c->costs.len), initialize the match cost array (c->costs.match_cost) which
1847  * directly provides the cost of every possible (length, offset slot) pair.
1848  */
1849 static void
1850 lzx_compute_match_costs(struct lzx_compressor *c)
1851 {
1852         unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1853                                         LZX_NUM_LEN_HEADERS;
1854         struct lzx_costs *costs = &c->costs;
1855         unsigned main_symbol = LZX_NUM_CHARS;
1856
1857         for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1858              offset_slot++)
1859         {
1860                 u32 extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST;
1861                 unsigned i;
1862
1863         #if CONSIDER_ALIGNED_COSTS
1864                 if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT)
1865                         extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
1866         #endif
1867
1868                 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1869                         costs->match_cost[offset_slot][i] =
1870                                 costs->main[main_symbol++] + extra_cost;
1871                 }
1872
1873                 extra_cost += costs->main[main_symbol++];
1874
1875                 for (; i < LZX_NUM_LENS; i++) {
1876                         costs->match_cost[offset_slot][i] =
1877                                 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1878                                 extra_cost;
1879                 }
1880         }
1881 }
1882
1883 /*
1884  * Fast approximation for log2f(x).  This is not as accurate as the standard C
1885  * version.  It does not need to be perfectly accurate because it is only used
1886  * for estimating symbol costs, which is very approximate anyway.
1887  */
1888 static float
1889 log2f_fast(float x)
1890 {
1891         union {
1892                 float f;
1893                 s32 i;
1894         } u = { .f = x };
1895
1896         /* Extract the exponent and subtract 127 to remove the bias.  This gives
1897          * the integer part of the result. */
1898         float res = ((u.i >> 23) & 0xFF) - 127;
1899
1900         /* Set the exponent to 0 (plus bias of 127).  This transforms the number
1901          * to the range [1, 2) while retaining the same mantissa. */
1902         u.i = (u.i & ~(0xFF << 23)) | (127 << 23);
1903
1904         /*
1905          * Approximate the log2 of the transformed number using a degree 2
1906          * interpolating polynomial for log2(x) over the interval [1, 2).  Then
1907          * add this to the extracted exponent to produce the final approximation
1908          * of log2(x).
1909          *
1910          * The coefficients of the interpolating polynomial used here were found
1911          * using the script tools/log2_interpolation.r.
1912          */
1913         return res - 1.653124006f + u.f * (1.9941812f - u.f * 0.3347490189f);
1914
1915 }
1916
1917 /*
1918  * Return the estimated cost of a symbol which has been estimated to have the
1919  * given probability.
1920  */
1921 static u32
1922 lzx_cost_for_probability(float prob)
1923 {
1924         /*
1925          * The basic formula is:
1926          *
1927          *      entropy = -log2(probability)
1928          *
1929          * Use this to get the cost in fractional bits.  Then multiply by our
1930          * scaling factor of BIT_COST and convert to an integer.
1931          *
1932          * In addition, the minimum cost is BIT_COST (one bit) because the
1933          * entropy coding method will be Huffman codes.
1934          *
1935          * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may
1936          * be positive due to inaccuracy in our log2 approximation.  Therefore,
1937          * we cannot, in general, assume the computed cost is non-negative, and
1938          * we should make sure negative costs get rounded up correctly.
1939          */
1940         s32 cost = -log2f_fast(prob) * BIT_COST;
1941         return max(cost, BIT_COST);
1942 }
1943
1944 /*
1945  * Mapping: number of used literals => heuristic probability of a literal times
1946  * 6870.  Generated by running this R command:
1947  *
1948  *      cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
1949  */
1950 static const u8 literal_scaled_probs[257] = {
1951         255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
1952         219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
1953         189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
1954         162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
1955         139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
1956         120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
1957         103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
1958         86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
1959         70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
1960         58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
1961         48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
1962         39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
1963         32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
1964         27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
1965         22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
1966         18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
1967 };
1968
1969 /*
1970  * Mapping: length symbol => default cost of that symbol.  This is derived from
1971  * sample data but has been slightly edited to add more bias towards the
1972  * shortest lengths, which are the most common.
1973  */
1974 static const u16 lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
1975         300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
1976         503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
1977         659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
1978         666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
1979         706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
1980         728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
1981         738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
1982         747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
1983         753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
1984         754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
1985         758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
1986         750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
1987         757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
1988         762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
1989         763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
1990         762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
1991         764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
1992         762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
1993 };
1994
1995 /* Set default costs to bootstrap the iterative optimization algorithm. */
1996 static void
1997 lzx_set_default_costs(struct lzx_compressor *c)
1998 {
1999         unsigned i;
2000         u32 num_literals = 0;
2001         u32 num_used_literals = 0;
2002         float inv_num_matches = 1.0f / c->freqs.main[LZX_NUM_CHARS];
2003         float inv_num_items;
2004         float prob_match = 1.0f;
2005         u32 match_cost;
2006         float base_literal_prob;
2007
2008         /* Some numbers here have been hardcoded to assume a bit cost of 64. */
2009         STATIC_ASSERT(BIT_COST == 64);
2010
2011         /* Estimate the number of literals that will used.  'num_literals' is
2012          * the total number, whereas 'num_used_literals' is the number of
2013          * distinct symbols. */
2014         for (i = 0; i < LZX_NUM_CHARS; i++) {
2015                 num_literals += c->freqs.main[i];
2016                 num_used_literals += (c->freqs.main[i] != 0);
2017         }
2018
2019         /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'.  We
2020          * don't attempt to estimate which ones will be used. */
2021
2022         inv_num_items = 1.0f / (num_literals + c->freqs.main[LZX_NUM_CHARS]);
2023         base_literal_prob = literal_scaled_probs[num_used_literals] *
2024                             (1.0f / 6870.0f);
2025
2026         /* Literal costs.  We use two different methods to compute the
2027          * probability of each literal and mix together their results. */
2028         for (i = 0; i < LZX_NUM_CHARS; i++) {
2029                 u32 freq = c->freqs.main[i];
2030                 if (freq != 0) {
2031                         float prob = 0.5f * ((freq * inv_num_items) +
2032                                              base_literal_prob);
2033                         c->costs.main[i] = lzx_cost_for_probability(prob);
2034                         prob_match -= prob;
2035                 } else {
2036                         c->costs.main[i] = 11 * BIT_COST;
2037                 }
2038         }
2039
2040         /* Match header costs.  We just assume that all match headers are
2041          * equally probable, but we do take into account the relative cost of a
2042          * match header vs. a literal depending on how common matches are
2043          * expected to be vs. literals. */
2044         prob_match = max(prob_match, 0.15f);
2045         match_cost = lzx_cost_for_probability(prob_match / (c->num_main_syms -
2046                                                             LZX_NUM_CHARS));
2047         for (; i < c->num_main_syms; i++)
2048                 c->costs.main[i] = match_cost;
2049
2050         /* Length symbol costs.  These are just set to fixed values which
2051          * reflect the fact the smallest lengths are typically the most common,
2052          * and therefore are typically the cheapest. */
2053         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2054                 c->costs.len[i] = lzx_default_len_costs[i];
2055
2056 #if CONSIDER_ALIGNED_COSTS
2057         /* Aligned offset symbol costs.  These are derived from the estimated
2058          * probability of each aligned offset symbol. */
2059         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2060                 /* We intentionally tallied the frequencies in the wrong slots,
2061                  * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the
2062                  * fixup here is faster: a constant 8 subtractions here vs. one
2063                  * addition for every match. */
2064                 unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK;
2065                 if (c->freqs.aligned[j] != 0) {
2066                         float prob = c->freqs.aligned[j] * inv_num_matches;
2067                         c->costs.aligned[i] = lzx_cost_for_probability(prob);
2068                 } else {
2069                         c->costs.aligned[i] =
2070                                 (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST;
2071                 }
2072         }
2073 #endif
2074 }
2075
2076 /* Update the current cost model to reflect the computed Huffman codes.  */
2077 static void
2078 lzx_set_costs_from_codes(struct lzx_compressor *c)
2079 {
2080         unsigned i;
2081         const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
2082
2083         for (i = 0; i < c->num_main_syms; i++) {
2084                 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
2085                                     MAIN_CODEWORD_LIMIT) * BIT_COST;
2086         }
2087
2088         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
2089                 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
2090                                    LENGTH_CODEWORD_LIMIT) * BIT_COST;
2091         }
2092
2093 #if CONSIDER_ALIGNED_COSTS
2094         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2095                 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
2096                                        ALIGNED_CODEWORD_LIMIT) * BIT_COST;
2097         }
2098 #endif
2099 }
2100
2101 /*
2102  * Choose a "near-optimal" literal/match sequence to use for the current block,
2103  * then flush the block.  Because the cost of each Huffman symbol is unknown
2104  * until the Huffman codes have been built and the Huffman codes themselves
2105  * depend on the symbol frequencies, this uses an iterative optimization
2106  * algorithm to approximate an optimal solution.  The first optimization pass
2107  * for the block uses default costs; additional passes use costs derived from
2108  * the Huffman codes computed in the previous pass.
2109  */
2110 static forceinline struct lzx_lru_queue
2111 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
2112                              struct lzx_output_bitstream * const restrict os,
2113                              const u8 * const restrict block_begin,
2114                              const u32 block_size,
2115                              const struct lzx_lru_queue initial_queue,
2116                              bool is_16_bit)
2117 {
2118         unsigned num_passes_remaining = c->num_optim_passes;
2119         struct lzx_lru_queue new_queue;
2120         u32 seq_idx;
2121
2122         lzx_set_default_costs(c);
2123
2124         for (;;) {
2125                 lzx_compute_match_costs(c);
2126                 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
2127                                                    initial_queue, is_16_bit);
2128
2129                 if (--num_passes_remaining == 0)
2130                         break;
2131
2132                 /* At least one optimization pass remains.  Update the costs. */
2133                 lzx_reset_symbol_frequencies(c);
2134                 lzx_tally_item_list(c, block_size, is_16_bit);
2135                 lzx_build_huffman_codes(c);
2136                 lzx_set_costs_from_codes(c);
2137         }
2138
2139         /* Done optimizing.  Generate the sequence list and flush the block. */
2140         lzx_reset_symbol_frequencies(c);
2141         seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
2142         lzx_flush_block(c, os, block_begin, block_size, seq_idx);
2143         return new_queue;
2144 }
2145
2146 /*
2147  * This is the "near-optimal" LZX compressor.
2148  *
2149  * For each block, it performs a relatively thorough graph search to find an
2150  * inexpensive (in terms of compressed size) way to output the block.
2151  *
2152  * Note: there are actually many things this algorithm leaves on the table in
2153  * terms of compression ratio.  So although it may be "near-optimal", it is
2154  * certainly not "optimal".  The goal is not to produce the optimal compression
2155  * ratio, which for LZX is probably impossible within any practical amount of
2156  * time, but rather to produce a compression ratio significantly better than a
2157  * simpler "greedy" or "lazy" parse while still being relatively fast.
2158  */
2159 static forceinline void
2160 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2161                           const u8 * const restrict in_begin, size_t in_nbytes,
2162                           struct lzx_output_bitstream * restrict os,
2163                           bool is_16_bit)
2164 {
2165         const u8 *       in_next = in_begin;
2166         const u8 * const in_end  = in_begin + in_nbytes;
2167         u32 max_len = LZX_MAX_MATCH_LEN;
2168         u32 nice_len = min(c->nice_match_length, max_len);
2169         u32 next_hashes[2] = {0, 0};
2170         struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2171
2172         /* Initialize the matchfinder. */
2173         CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2174
2175         do {
2176                 /* Starting a new block */
2177
2178                 const u8 * const in_block_begin = in_next;
2179                 const u8 * const in_max_block_end =
2180                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2181                 struct lz_match *cache_ptr = c->match_cache;
2182                 const u8 *next_search_pos = in_next;
2183                 const u8 *next_observation = in_next;
2184                 const u8 *next_pause_point =
2185                         min(in_next + min(MIN_BLOCK_SIZE,
2186                                           in_max_block_end - in_next),
2187                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2188                                                    in_max_block_end - in_next));
2189
2190                 lzx_init_block_split_stats(&c->split_stats);
2191                 lzx_reset_symbol_frequencies(c);
2192
2193                 if (in_next >= next_pause_point)
2194                         goto pause;
2195
2196                 /*
2197                  * Run the input buffer through the matchfinder, caching the
2198                  * matches, until we decide to end the block.
2199                  *
2200                  * For a tighter matchfinding loop, we compute a "pause point",
2201                  * which is the next position at which we may need to check
2202                  * whether to end the block or to decrease max_len.  We then
2203                  * only do these extra checks upon reaching the pause point.
2204                  */
2205         resume_matchfinding:
2206                 do {
2207                         if (in_next >= next_search_pos) {
2208                                 /* Search for matches at this position. */
2209                                 struct lz_match *lz_matchptr;
2210                                 u32 best_len;
2211
2212                                 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2213                                                          bt_matchfinder_get_matches,
2214                                                          in_begin,
2215                                                          in_next - in_begin,
2216                                                          max_len,
2217                                                          nice_len,
2218                                                          c->max_search_depth,
2219                                                          next_hashes,
2220                                                          &best_len,
2221                                                          cache_ptr + 1);
2222                                 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2223                                 cache_ptr = lz_matchptr;
2224
2225                                 /* Accumulate literal/match statistics for block
2226                                  * splitting and for generating the initial cost
2227                                  * model. */
2228                                 if (in_next >= next_observation) {
2229                                         best_len = cache_ptr[-1].length;
2230                                         if (best_len >= 3) {
2231                                                 /* Match (len >= 3) */
2232
2233                                                 /*
2234                                                  * Note: for performance reasons this has
2235                                                  * been simplified significantly:
2236                                                  *
2237                                                  * - We wait until later to account for
2238                                                  *   LZX_OFFSET_ADJUSTMENT.
2239                                                  * - We don't account for repeat offsets.
2240                                                  * - We don't account for different match headers.
2241                                                  */
2242                                                 c->freqs.aligned[cache_ptr[-1].offset &
2243                                                         LZX_ALIGNED_OFFSET_BITMASK]++;
2244                                                 c->freqs.main[LZX_NUM_CHARS]++;
2245
2246                                                 lzx_observe_match(&c->split_stats, best_len);
2247                                                 next_observation = in_next + best_len;
2248                                         } else {
2249                                                 /* Literal */
2250                                                 c->freqs.main[*in_next]++;
2251                                                 lzx_observe_literal(&c->split_stats, *in_next);
2252                                                 next_observation = in_next + 1;
2253                                         }
2254                                 }
2255
2256                                 /*
2257                                  * If there was a very long match found, then
2258                                  * don't cache any matches for the bytes covered
2259                                  * by that match.  This avoids degenerate
2260                                  * behavior when compressing highly redundant
2261                                  * data, where the number of matches can be very
2262                                  * large.
2263                                  *
2264                                  * This heuristic doesn't actually hurt the
2265                                  * compression ratio *too* much.  If there's a
2266                                  * long match, then the data must be highly
2267                                  * compressible, so it doesn't matter as much
2268                                  * what we do.
2269                                  */
2270                                 if (best_len >= nice_len)
2271                                         next_search_pos = in_next + best_len;
2272                         } else {
2273                                 /* Don't search for matches at this position. */
2274                                 CALL_BT_MF(is_16_bit, c,
2275                                            bt_matchfinder_skip_byte,
2276                                            in_begin,
2277                                            in_next - in_begin,
2278                                            nice_len,
2279                                            c->max_search_depth,
2280                                            next_hashes);
2281                                 cache_ptr->length = 0;
2282                                 cache_ptr++;
2283                         }
2284                 } while (++in_next < next_pause_point &&
2285                          likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2286
2287         pause:
2288
2289                 /* Adjust max_len and nice_len if we're nearing the end of the
2290                  * input buffer.  In addition, if we are so close to the end of
2291                  * the input buffer that there cannot be any more matches, then
2292                  * just advance through the last few positions and record no
2293                  * matches. */
2294                 if (unlikely(max_len > in_end - in_next)) {
2295                         max_len = in_end - in_next;
2296                         nice_len = min(max_len, nice_len);
2297                         if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2298                                 while (in_next != in_end) {
2299                                         cache_ptr->length = 0;
2300                                         cache_ptr++;
2301                                         in_next++;
2302                                 }
2303                         }
2304                 }
2305
2306                 /* End the block if the match cache may overflow. */
2307                 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2308                         goto end_block;
2309
2310                 /* End the block if the soft maximum size has been reached. */
2311                 if (in_next >= in_max_block_end)
2312                         goto end_block;
2313
2314                 /* End the block if the block splitting algorithm thinks this is
2315                  * a good place to do so. */
2316                 if (c->split_stats.num_new_observations >=
2317                                 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2318                     in_max_block_end - in_next >= MIN_BLOCK_SIZE &&
2319                     lzx_should_end_block(&c->split_stats))
2320                         goto end_block;
2321
2322                 /* It's not time to end the block yet.  Compute the next pause
2323                  * point and resume matchfinding. */
2324                 next_pause_point =
2325                         min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2326                                             c->split_stats.num_new_observations,
2327                                           in_max_block_end - in_next),
2328                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2329                                                    in_max_block_end - in_next));
2330                 goto resume_matchfinding;
2331
2332         end_block:
2333                 /* We've decided on a block boundary and cached matches.  Now
2334                  * choose a match/literal sequence and flush the block. */
2335                 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2336                                                      in_next - in_block_begin,
2337                                                      queue, is_16_bit);
2338         } while (in_next != in_end);
2339 }
2340
2341 static void
2342 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2343                              size_t in_nbytes, struct lzx_output_bitstream *os)
2344 {
2345         lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2346 }
2347
2348 static void
2349 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2350                              size_t in_nbytes, struct lzx_output_bitstream *os)
2351 {
2352         lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2353 }
2354
2355 /******************************************************************************/
2356 /*                     Faster ("lazy") compression algorithm                  */
2357 /*----------------------------------------------------------------------------*/
2358
2359 /*
2360  * Called when the compressor chooses to use a literal.  This tallies the
2361  * Huffman symbol for the literal, increments the current literal run length,
2362  * and "observes" the literal for the block split statistics.
2363  */
2364 static forceinline void
2365 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2366 {
2367         lzx_observe_literal(&c->split_stats, literal);
2368         c->freqs.main[literal]++;
2369         ++*litrunlen_p;
2370 }
2371
2372 /*
2373  * Called when the compressor chooses to use a match.  This tallies the Huffman
2374  * symbol(s) for a match, saves the match data and the length of the preceding
2375  * literal run, updates the recent offsets queue, and "observes" the match for
2376  * the block split statistics.
2377  */
2378 static forceinline void
2379 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
2380                  u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2381                  u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2382 {
2383         struct lzx_sequence *next_seq = *next_seq_p;
2384         unsigned mainsym;
2385
2386         lzx_observe_match(&c->split_stats, length);
2387
2388         mainsym = lzx_tally_main_and_lensyms(c, length, adjusted_offset,
2389                                              is_16_bit);
2390         next_seq->litrunlen_and_matchlen =
2391                 (*litrunlen_p << SEQ_MATCHLEN_BITS) | length;
2392         next_seq->adjusted_offset_and_mainsym =
2393                 (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
2394
2395         /* Update the recent offsets queue. */
2396         if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
2397                 /* Repeat offset match. */
2398                 swap(recent_offsets[0], recent_offsets[adjusted_offset]);
2399         } else {
2400                 /* Explicit offset match. */
2401
2402                 /* Tally the aligned offset symbol if needed. */
2403                 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
2404                         c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
2405
2406                 recent_offsets[2] = recent_offsets[1];
2407                 recent_offsets[1] = recent_offsets[0];
2408                 recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
2409         }
2410
2411         /* Reset the literal run length and advance to the next sequence. */
2412         *next_seq_p = next_seq + 1;
2413         *litrunlen_p = 0;
2414 }
2415
2416 /*
2417  * Called when the compressor ends a block.  This finshes the last lzx_sequence,
2418  * which is just a literal run with no following match.  This literal run might
2419  * be empty.
2420  */
2421 static forceinline void
2422 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2423 {
2424         last_seq->litrunlen_and_matchlen = litrunlen << SEQ_MATCHLEN_BITS;
2425 }
2426
2427 /*
2428  * Find the longest repeat offset match with the current position.  If a match
2429  * is found, return its length and set *best_rep_idx_ret to the index of its
2430  * offset in @recent_offsets.  Otherwise, return 0.
2431  *
2432  * Don't bother with length 2 matches; consider matches of length >= 3 only.
2433  * Also assume that max_len >= 3.
2434  */
2435 static unsigned
2436 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2437                                      const u32 recent_offsets[],
2438                                      const unsigned max_len,
2439                                      unsigned *best_rep_idx_ret)
2440 {
2441         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2442
2443         const u32 seq3 = load_u24_unaligned(in_next);
2444         const u8 *matchptr;
2445         unsigned best_rep_len = 0;
2446         unsigned best_rep_idx = 0;
2447         unsigned rep_len;
2448
2449         /* Check for rep0 match (most recent offset) */
2450         matchptr = in_next - recent_offsets[0];
2451         if (load_u24_unaligned(matchptr) == seq3)
2452                 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2453
2454         /* Check for rep1 match (second most recent offset) */
2455         matchptr = in_next - recent_offsets[1];
2456         if (load_u24_unaligned(matchptr) == seq3) {
2457                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2458                 if (rep_len > best_rep_len) {
2459                         best_rep_len = rep_len;
2460                         best_rep_idx = 1;
2461                 }
2462         }
2463
2464         /* Check for rep2 match (third most recent offset) */
2465         matchptr = in_next - recent_offsets[2];
2466         if (load_u24_unaligned(matchptr) == seq3) {
2467                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2468                 if (rep_len > best_rep_len) {
2469                         best_rep_len = rep_len;
2470                         best_rep_idx = 2;
2471                 }
2472         }
2473
2474         *best_rep_idx_ret = best_rep_idx;
2475         return best_rep_len;
2476 }
2477
2478 /*
2479  * Fast heuristic scoring for lazy parsing: how "good" is this match?
2480  * This is mainly determined by the length: longer matches are better.
2481  * However, we also give a bonus to close (small offset) matches and to repeat
2482  * offset matches, since those require fewer bits to encode.
2483  */
2484
2485 static forceinline unsigned
2486 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2487 {
2488         unsigned score = len;
2489
2490         if (adjusted_offset < 4096)
2491                 score++;
2492         if (adjusted_offset < 256)
2493                 score++;
2494
2495         return score;
2496 }
2497
2498 static forceinline unsigned
2499 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2500 {
2501         return rep_len + 3;
2502 }
2503
2504 /*
2505  * This is the "lazy" LZX compressor.  The basic idea is that before it chooses
2506  * a match, it checks to see if there's a longer match at the next position.  If
2507  * yes, it chooses a literal and continues to the next position.  If no, it
2508  * chooses the match.
2509  *
2510  * Some additional heuristics are used as well.  Repeat offset matches are
2511  * considered favorably and sometimes are chosen immediately.  In addition, long
2512  * matches (at least "nice_len" bytes) are chosen immediately as well.  Finally,
2513  * when we decide whether a match is "better" than another, we take the offset
2514  * into consideration as well as the length.
2515  */
2516 static forceinline void
2517 lzx_compress_lazy(struct lzx_compressor * restrict c,
2518                   const u8 * const restrict in_begin, size_t in_nbytes,
2519                   struct lzx_output_bitstream * restrict os, bool is_16_bit)
2520 {
2521         const u8 *       in_next = in_begin;
2522         const u8 * const in_end  = in_begin + in_nbytes;
2523         unsigned max_len = LZX_MAX_MATCH_LEN;
2524         unsigned nice_len = min(c->nice_match_length, max_len);
2525         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2526         u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
2527         u32 next_hashes[2] = {0, 0};
2528
2529         /* Initialize the matchfinder. */
2530         CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2531
2532         do {
2533                 /* Starting a new block */
2534
2535                 const u8 * const in_block_begin = in_next;
2536                 const u8 * const in_max_block_end =
2537                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2538                 struct lzx_sequence *next_seq = c->chosen_sequences;
2539                 u32 litrunlen = 0;
2540                 unsigned cur_len;
2541                 u32 cur_offset;
2542                 u32 cur_adjusted_offset;
2543                 unsigned cur_score;
2544                 unsigned next_len;
2545                 u32 next_offset;
2546                 u32 next_adjusted_offset;
2547                 unsigned next_score;
2548                 unsigned best_rep_len;
2549                 unsigned best_rep_idx;
2550                 unsigned rep_score;
2551                 unsigned skip_len;
2552
2553                 lzx_reset_symbol_frequencies(c);
2554                 lzx_init_block_split_stats(&c->split_stats);
2555
2556                 do {
2557                         /* Adjust max_len and nice_len if we're nearing the end
2558                          * of the input buffer. */
2559                         if (unlikely(max_len > in_end - in_next)) {
2560                                 max_len = in_end - in_next;
2561                                 nice_len = min(max_len, nice_len);
2562                         }
2563
2564                         /* Find the longest match (subject to the
2565                          * max_search_depth cutoff parameter) with the current
2566                          * position.  Don't bother with length 2 matches; only
2567                          * look for matches of length >= 3. */
2568                         cur_len = CALL_HC_MF(is_16_bit, c,
2569                                              hc_matchfinder_longest_match,
2570                                              in_begin,
2571                                              in_next,
2572                                              2,
2573                                              max_len,
2574                                              nice_len,
2575                                              c->max_search_depth,
2576                                              next_hashes,
2577                                              &cur_offset);
2578
2579                         /* If there was no match found, or the only match found
2580                          * was a distant short match, then choose a literal. */
2581                         if (cur_len < 3 ||
2582                             (cur_len == 3 &&
2583                              cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2584                              cur_offset != recent_offsets[0] &&
2585                              cur_offset != recent_offsets[1] &&
2586                              cur_offset != recent_offsets[2]))
2587                         {
2588                                 lzx_choose_literal(c, *in_next, &litrunlen);
2589                                 in_next++;
2590                                 continue;
2591                         }
2592
2593                         /* Heuristic: if this match has the most recent offset,
2594                          * then go ahead and choose it as a rep0 match. */
2595                         if (cur_offset == recent_offsets[0]) {
2596                                 in_next++;
2597                                 skip_len = cur_len - 1;
2598                                 cur_adjusted_offset = 0;
2599                                 goto choose_cur_match;
2600                         }
2601
2602                         /* Compute the longest match's score as an explicit
2603                          * offset match. */
2604                         cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT;
2605                         cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset);
2606
2607                         /* Find the longest repeat offset match at this
2608                          * position.  If we find one and it's "better" than the
2609                          * explicit offset match we found, then go ahead and
2610                          * choose the repeat offset match immediately. */
2611                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2612                                                                             recent_offsets,
2613                                                                             max_len,
2614                                                                             &best_rep_idx);
2615                         in_next++;
2616
2617                         if (best_rep_len != 0 &&
2618                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2619                                                                        best_rep_idx)) >= cur_score)
2620                         {
2621                                 cur_len = best_rep_len;
2622                                 cur_adjusted_offset = best_rep_idx;
2623                                 skip_len = best_rep_len - 1;
2624                                 goto choose_cur_match;
2625                         }
2626
2627                 have_cur_match:
2628                         /*
2629                          * We have a match at the current position.  If the
2630                          * match is very long, then choose it immediately.
2631                          * Otherwise, see if there's a better match at the next
2632                          * position.
2633                          */
2634
2635                         if (cur_len >= nice_len) {
2636                                 skip_len = cur_len - 1;
2637                                 goto choose_cur_match;
2638                         }
2639
2640                         if (unlikely(max_len > in_end - in_next)) {
2641                                 max_len = in_end - in_next;
2642                                 nice_len = min(max_len, nice_len);
2643                         }
2644
2645                         next_len = CALL_HC_MF(is_16_bit, c,
2646                                               hc_matchfinder_longest_match,
2647                                               in_begin,
2648                                               in_next,
2649                                               cur_len - 2,
2650                                               max_len,
2651                                               nice_len,
2652                                               c->max_search_depth / 2,
2653                                               next_hashes,
2654                                               &next_offset);
2655
2656                         if (next_len <= cur_len - 2) {
2657                                 /* No potentially better match was found. */
2658                                 in_next++;
2659                                 skip_len = cur_len - 2;
2660                                 goto choose_cur_match;
2661                         }
2662
2663                         next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT;
2664                         next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset);
2665
2666                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2667                                                                             recent_offsets,
2668                                                                             max_len,
2669                                                                             &best_rep_idx);
2670                         in_next++;
2671
2672                         if (best_rep_len != 0 &&
2673                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2674                                                                        best_rep_idx)) >= next_score)
2675                         {
2676
2677                                 if (rep_score > cur_score) {
2678                                         /* The next match is better, and it's a
2679                                          * repeat offset match. */
2680                                         lzx_choose_literal(c, *(in_next - 2),
2681                                                            &litrunlen);
2682                                         cur_len = best_rep_len;
2683                                         cur_adjusted_offset = best_rep_idx;
2684                                         skip_len = cur_len - 1;
2685                                         goto choose_cur_match;
2686                                 }
2687                         } else {
2688                                 if (next_score > cur_score) {
2689                                         /* The next match is better, and it's an
2690                                          * explicit offset match. */
2691                                         lzx_choose_literal(c, *(in_next - 2),
2692                                                            &litrunlen);
2693                                         cur_len = next_len;
2694                                         cur_adjusted_offset = next_adjusted_offset;
2695                                         cur_score = next_score;
2696                                         goto have_cur_match;
2697                                 }
2698                         }
2699
2700                         /* The original match was better; choose it. */
2701                         skip_len = cur_len - 2;
2702
2703                 choose_cur_match:
2704                         /* Choose a match and have the matchfinder skip over its
2705                          * remaining bytes. */
2706                         lzx_choose_match(c, cur_len, cur_adjusted_offset,
2707                                          recent_offsets, is_16_bit,
2708                                          &litrunlen, &next_seq);
2709                         CALL_HC_MF(is_16_bit, c,
2710                                    hc_matchfinder_skip_bytes,
2711                                    in_begin,
2712                                    in_next,
2713                                    in_end,
2714                                    skip_len,
2715                                    next_hashes);
2716                         in_next += skip_len;
2717
2718                         /* Keep going until it's time to end the block. */
2719                 } while (in_next < in_max_block_end &&
2720                          !(c->split_stats.num_new_observations >=
2721                                         NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2722                            in_next - in_block_begin >= MIN_BLOCK_SIZE &&
2723                            in_end - in_next >= MIN_BLOCK_SIZE &&
2724                            lzx_should_end_block(&c->split_stats)));
2725
2726                 /* Flush the block. */
2727                 lzx_finish_sequence(next_seq, litrunlen);
2728                 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2729
2730                 /* Keep going until we've reached the end of the input buffer. */
2731         } while (in_next != in_end);
2732 }
2733
2734 static void
2735 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2736                      struct lzx_output_bitstream *os)
2737 {
2738         lzx_compress_lazy(c, in, in_nbytes, os, true);
2739 }
2740
2741 static void
2742 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2743                      struct lzx_output_bitstream *os)
2744 {
2745         lzx_compress_lazy(c, in, in_nbytes, os, false);
2746 }
2747
2748 /******************************************************************************/
2749 /*                          Compressor operations                             */
2750 /*----------------------------------------------------------------------------*/
2751
2752 /*
2753  * Generate tables for mapping match offsets (actually, "adjusted" match
2754  * offsets) to offset slots.
2755  */
2756 static void
2757 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2758 {
2759         u32 adjusted_offset = 0;
2760         unsigned slot = 0;
2761
2762         /* slots [0, 29] */
2763         for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2764              adjusted_offset++)
2765         {
2766                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2767                                        LZX_OFFSET_ADJUSTMENT)
2768                         slot++;
2769                 c->offset_slot_tab_1[adjusted_offset] = slot;
2770         }
2771
2772         /* slots [30, 49] */
2773         for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2774              adjusted_offset += (u32)1 << 14)
2775         {
2776                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2777                                        LZX_OFFSET_ADJUSTMENT)
2778                         slot++;
2779                 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
2780         }
2781 }
2782
2783 static size_t
2784 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2785 {
2786         if (compression_level <= MAX_FAST_LEVEL) {
2787                 if (lzx_is_16_bit(max_bufsize))
2788                         return offsetof(struct lzx_compressor, hc_mf_16) +
2789                                hc_matchfinder_size_16(max_bufsize);
2790                 else
2791                         return offsetof(struct lzx_compressor, hc_mf_32) +
2792                                hc_matchfinder_size_32(max_bufsize);
2793         } else {
2794                 if (lzx_is_16_bit(max_bufsize))
2795                         return offsetof(struct lzx_compressor, bt_mf_16) +
2796                                bt_matchfinder_size_16(max_bufsize);
2797                 else
2798                         return offsetof(struct lzx_compressor, bt_mf_32) +
2799                                bt_matchfinder_size_32(max_bufsize);
2800         }
2801 }
2802
2803 /* Compute the amount of memory needed to allocate an LZX compressor. */
2804 static u64
2805 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2806                       bool destructive)
2807 {
2808         u64 size = 0;
2809
2810         if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2811                 return 0;
2812
2813         size += lzx_get_compressor_size(max_bufsize, compression_level);
2814         if (!destructive)
2815                 size += max_bufsize; /* account for in_buffer */
2816         return size;
2817 }
2818
2819 /* Allocate an LZX compressor. */
2820 static int
2821 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2822                       bool destructive, void **c_ret)
2823 {
2824         unsigned window_order;
2825         struct lzx_compressor *c;
2826
2827         /* Validate the maximum buffer size and get the window order from it. */
2828         window_order = lzx_get_window_order(max_bufsize);
2829         if (window_order == 0)
2830                 return WIMLIB_ERR_INVALID_PARAM;
2831
2832         /* Allocate the compressor. */
2833         c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2834         if (!c)
2835                 goto oom0;
2836
2837         c->window_order = window_order;
2838         c->num_main_syms = lzx_get_num_main_syms(window_order);
2839         c->destructive = destructive;
2840
2841         /* Allocate the buffer for preprocessed data if needed. */
2842         if (!c->destructive) {
2843                 c->in_buffer = MALLOC(max_bufsize);
2844                 if (!c->in_buffer)
2845                         goto oom1;
2846         }
2847
2848         if (compression_level <= MAX_FAST_LEVEL) {
2849
2850                 /* Fast compression: Use lazy parsing. */
2851                 if (lzx_is_16_bit(max_bufsize))
2852                         c->impl = lzx_compress_lazy_16;
2853                 else
2854                         c->impl = lzx_compress_lazy_32;
2855
2856                 /* Scale max_search_depth and nice_match_length with the
2857                  * compression level. */
2858                 c->max_search_depth = (60 * compression_level) / 20;
2859                 c->nice_match_length = (80 * compression_level) / 20;
2860
2861                 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2862                  * halves the max_search_depth when attempting a lazy match, and
2863                  * max_search_depth must be at least 1. */
2864                 c->max_search_depth = max(c->max_search_depth, 2);
2865         } else {
2866
2867                 /* Normal / high compression: Use near-optimal parsing. */
2868                 if (lzx_is_16_bit(max_bufsize))
2869                         c->impl = lzx_compress_near_optimal_16;
2870                 else
2871                         c->impl = lzx_compress_near_optimal_32;
2872
2873                 /* Scale max_search_depth and nice_match_length with the
2874                  * compression level. */
2875                 c->max_search_depth = (24 * compression_level) / 50;
2876                 c->nice_match_length = (48 * compression_level) / 50;
2877
2878                 /* Also scale num_optim_passes with the compression level.  But
2879                  * the more passes there are, the less they help --- so don't
2880                  * add them linearly.  */
2881                 c->num_optim_passes = 1;
2882                 c->num_optim_passes += (compression_level >= 45);
2883                 c->num_optim_passes += (compression_level >= 70);
2884                 c->num_optim_passes += (compression_level >= 100);
2885                 c->num_optim_passes += (compression_level >= 150);
2886                 c->num_optim_passes += (compression_level >= 200);
2887                 c->num_optim_passes += (compression_level >= 300);
2888
2889                 /* max_search_depth must be at least 1. */
2890                 c->max_search_depth = max(c->max_search_depth, 1);
2891         }
2892
2893         /* Prepare the offset => offset slot mapping. */
2894         lzx_init_offset_slot_tabs(c);
2895
2896         *c_ret = c;
2897         return 0;
2898
2899 oom1:
2900         FREE(c);
2901 oom0:
2902         return WIMLIB_ERR_NOMEM;
2903 }
2904
2905 /* Compress a buffer of data. */
2906 static size_t
2907 lzx_compress(const void *restrict in, size_t in_nbytes,
2908              void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2909 {
2910         struct lzx_compressor *c = _c;
2911         struct lzx_output_bitstream os;
2912         size_t result;
2913
2914         /* Don't bother trying to compress very small inputs. */
2915         if (in_nbytes < 64)
2916                 return 0;
2917
2918         /* If the compressor is in "destructive" mode, then we can directly
2919          * preprocess the input data.  Otherwise, we need to copy it into an
2920          * internal buffer first. */
2921         if (!c->destructive) {
2922                 memcpy(c->in_buffer, in, in_nbytes);
2923                 in = c->in_buffer;
2924         }
2925
2926         /* Preprocess the input data. */
2927         lzx_preprocess((void *)in, in_nbytes);
2928
2929         /* Initially, the previous Huffman codeword lengths are all zeroes. */
2930         c->codes_index = 0;
2931         memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2932
2933         /* Initialize the output bitstream. */
2934         lzx_init_output(&os, out, out_nbytes_avail);
2935
2936         /* Call the compression level-specific compress() function. */
2937         (*c->impl)(c, in, in_nbytes, &os);
2938
2939         /* Flush the output bitstream. */
2940         result = lzx_flush_output(&os);
2941
2942         /* If the data did not compress to less than its original size and we
2943          * preprocessed the original buffer, then postprocess it to restore it
2944          * to its original state. */
2945         if (result == 0 && c->destructive)
2946                 lzx_postprocess((void *)in, in_nbytes);
2947
2948         /* Return the number of compressed bytes, or 0 if the input did not
2949          * compress to less than its original size. */
2950         return result;
2951 }
2952
2953 /* Free an LZX compressor. */
2954 static void
2955 lzx_free_compressor(void *_c)
2956 {
2957         struct lzx_compressor *c = _c;
2958
2959         if (!c->destructive)
2960                 FREE(c->in_buffer);
2961         FREE(c);
2962 }
2963
2964 const struct compressor_ops lzx_compressor_ops = {
2965         .get_needed_memory  = lzx_get_needed_memory,
2966         .create_compressor  = lzx_create_compressor,
2967         .compress           = lzx_compress,
2968         .free_compressor    = lzx_free_compressor,
2969 };