wimlib.net Git - wimlib/blob - src/lzx_compress.c

   1 /*
   2  * lzx_compress.c
   3  *
   4  * A compressor for the LZX compression format, as used in WIM archives.
   5  */
   6
   7 /*
   8  * Copyright (C) 2012-2016 Eric Biggers
   9  *
  10  * This file is free software; you can redistribute it and/or modify it under
  11  * the terms of the GNU Lesser General Public License as published by the Free
  12  * Software Foundation; either version 3 of the License, or (at your option) any
  13  * later version.
  14  *
  15  * This file is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  18  * details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this file; if not, see http://www.gnu.org/licenses/.
  22  */
  23
  24
  25 /*
  26  * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
  27  * compression format, as used in the WIM (Windows IMaging) file format.
  28  *
  29  * Two different LZX-compatible algorithms are implemented: "near-optimal" and
  30  * "lazy".  "Near-optimal" is significantly slower than "lazy", but results in a
  31  * better compression ratio.  The "near-optimal" algorithm is used at the
  32  * default compression level.
  33  *
  34  * This file may need some slight modifications to be used outside of the WIM
  35  * format.  In particular, in other situations the LZX block header might be
  36  * slightly different, and sliding window support might be required.
  37  *
  38  * LZX is a compression format derived from DEFLATE, the format used by zlib and
  39  * gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding.  Certain
  40  * details are quite similar, such as the method for storing Huffman codes.
  41  * However, the main differences are:
  42  *
  43  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  44  *   compressible before attempting to compress it further.
  45  *
  46  * - LZX uses a "main" alphabet which combines literals and matches, with the
  47  *   match symbols containing a "length header" (giving all or part of the match
  48  *   length) and an "offset slot" (giving, roughly speaking, the order of
  49  *   magnitude of the match offset).
  50  *
  51  * - LZX does not have static Huffman blocks (that is, the kind with preset
  52  *   Huffman codes); however it does have two types of dynamic Huffman blocks
  53  *   ("verbatim" and "aligned").
  54  *
  55  * - LZX has a minimum match length of 2 rather than 3.  Length 2 matches can be
  56  *   useful, but generally only if the compressor is smart about choosing them.
  57  *
  58  * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
  59  *   of match offsets.  This is very useful for certain types of files, such as
  60  *   binary files that have repeating records.
  61  */
  62
  63 /******************************************************************************/
  64 /*                            General parameters                              */
  65 /*----------------------------------------------------------------------------*/
  66
  67 /*
  68  * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL.  It
  69  * uses the slower algorithm at levels > MAX_FAST_LEVEL.
  70  */
  71 #define MAX_FAST_LEVEL                          34
  72
  73 /*
  74  * The compressor-side limits on the codeword lengths (in bits) for each Huffman
  75  * code.  To make outputting bits slightly faster, some of these limits are
  76  * lower than the limits defined by the LZX format.  This does not significantly
  77  * affect the compression ratio.
  78  */
  79 #define MAIN_CODEWORD_LIMIT                     16
  80 #define LENGTH_CODEWORD_LIMIT                   12
  81 #define ALIGNED_CODEWORD_LIMIT                  7
  82 #define PRE_CODEWORD_LIMIT                      7
  83
  84
  85 /******************************************************************************/
  86 /*                         Block splitting parameters                         */
  87 /*----------------------------------------------------------------------------*/
  88
  89 /*
  90  * The compressor always outputs blocks of at least this size in bytes, except
  91  * for the last block which may need to be smaller.
  92  */
  93 #define MIN_BLOCK_SIZE                          6500
  94
  95 /*
  96  * The compressor attempts to end a block when it reaches this size in bytes.
  97  * The final size might be slightly larger due to matches extending beyond the
  98  * end of the block.  Specifically:
  99  *
 100  *  - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
 101  *    bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'.
 102  *
 103  *  - The lazy compressor may choose a sequence of literals starting at position
 104  *    'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better
 105  *    matches.  The final match may be up to LZX_MAX_MATCH_LEN bytes.  The
 106  *    length of the literal sequence is approximately limited by the "nice match
 107  *    length" parameter.
 108  */
 109 #define SOFT_MAX_BLOCK_SIZE                     100000
 110
 111 /*
 112  * The number of observed items (matches and literals) that represents
 113  * sufficient data for the compressor to decide whether the current block should
 114  * be ended or not.
 115  */
 116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK        400
 117
 118
 119 /******************************************************************************/
 120 /*                      Parameters for slower algorithm                       */
 121 /*----------------------------------------------------------------------------*/
 122
 123 /*
 124  * The log base 2 of the number of entries in the hash table for finding length
 125  * 2 matches.  This could be as high as 16, but using a smaller hash table
 126  * speeds up compression due to reduced cache pressure.
 127  */
 128 #define BT_MATCHFINDER_HASH2_ORDER              12
 129
 130 /*
 131  * The number of lz_match structures in the match cache, excluding the extra
 132  * "overflow" entries.  This value should be high enough so that nearly the
 133  * time, all matches found in a given block can fit in the match cache.
 134  * However, fallback behavior (immediately terminating the block) on cache
 135  * overflow is still required.
 136  */
 137 #define CACHE_LENGTH                            (SOFT_MAX_BLOCK_SIZE * 5)
 138
 139 /*
 140  * An upper bound on the number of matches that can ever be saved in the match
 141  * cache for a single position.  Since each match we save for a single position
 142  * has a distinct length, we can use the number of possible match lengths in LZX
 143  * as this bound.  This bound is guaranteed to be valid in all cases, although
 144  * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be
 145  * reached.
 146  */
 147 #define MAX_MATCHES_PER_POS                     LZX_NUM_LENS
 148
 149 /*
 150  * A scaling factor that makes it possible to consider fractional bit costs.  A
 151  * single bit has a cost of BIT_COST.
 152  *
 153  * Note: this is only useful as a statistical trick for when the true costs are
 154  * unknown.  Ultimately, each token in LZX requires a whole number of bits to
 155  * output.
 156  */
 157 #define BIT_COST                                64
 158
 159 /*
 160  * Should the compressor take into account the costs of aligned offset symbols
 161  * instead of assuming that all are equally likely?
 162  */
 163 #define CONSIDER_ALIGNED_COSTS                  1
 164
 165 /*
 166  * Should the "minimum" cost path search algorithm consider "gap" matches, where
 167  * a normal match is followed by a literal, then by a match with the same
 168  * offset?  This is one specific, somewhat common situation in which the true
 169  * minimum cost path is often different from the path found by looking only one
 170  * edge ahead.
 171  */
 172 #define CONSIDER_GAP_MATCHES                    1
 173
 174 /******************************************************************************/
 175 /*                                  Includes                                  */
 176 /*----------------------------------------------------------------------------*/
 177
 178 #ifdef HAVE_CONFIG_H
 179 #  include "config.h"
 180 #endif
 181
 182 #include "wimlib/compress_common.h"
 183 #include "wimlib/compressor_ops.h"
 184 #include "wimlib/error.h"
 185 #include "wimlib/lz_extend.h"
 186 #include "wimlib/lzx_common.h"
 187 #include "wimlib/unaligned.h"
 188 #include "wimlib/util.h"
 189
 190 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including
 191  * bt_matchfinder.h. */
 192
 193 /* Matchfinders with 16-bit positions */
 194 #define mf_pos_t        u16
 195 #define MF_SUFFIX       _16
 196 #include "wimlib/bt_matchfinder.h"
 197 #include "wimlib/hc_matchfinder.h"
 198
 199 /* Matchfinders with 32-bit positions */
 200 #undef mf_pos_t
 201 #undef MF_SUFFIX
 202 #define mf_pos_t        u32
 203 #define MF_SUFFIX       _32
 204 #include "wimlib/bt_matchfinder.h"
 205 #include "wimlib/hc_matchfinder.h"
 206
 207 /******************************************************************************/
 208 /*                            Compressor structure                            */
 209 /*----------------------------------------------------------------------------*/
 210
 211 /* Codewords for the Huffman codes */
 212 struct lzx_codewords {
 213         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 214         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 215         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 216 };
 217
 218 /*
 219  * Codeword lengths, in bits, for the Huffman codes.
 220  *
 221  * A codeword length of 0 means the corresponding codeword has zero frequency.
 222  *
 223  * The main and length codes each have one extra entry for use as a sentinel.
 224  * See lzx_write_compressed_code().
 225  */
 226 struct lzx_lens {
 227         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
 228         u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
 229         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 230 };
 231
 232 /* Codewords and lengths for the Huffman codes */
 233 struct lzx_codes {
 234         struct lzx_codewords codewords;
 235         struct lzx_lens lens;
 236 };
 237
 238 /* Symbol frequency counters for the Huffman-encoded alphabets */
 239 struct lzx_freqs {
 240         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 241         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 242         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 243 };
 244
 245 /* Block split statistics.  See the "Block splitting algorithm" section later in
 246  * this file for details. */
 247 #define NUM_LITERAL_OBSERVATION_TYPES 8
 248 #define NUM_MATCH_OBSERVATION_TYPES 2
 249 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
 250                                NUM_MATCH_OBSERVATION_TYPES)
 251 struct lzx_block_split_stats {
 252         u32 new_observations[NUM_OBSERVATION_TYPES];
 253         u32 observations[NUM_OBSERVATION_TYPES];
 254         u32 num_new_observations;
 255         u32 num_observations;
 256 };
 257
 258 /*
 259  * Represents a run of literals followed by a match or end-of-block.  This
 260  * structure is needed to temporarily store items chosen by the compressor,
 261  * since items cannot be written until all items for the block have been chosen
 262  * and the block's Huffman codes have been computed.
 263  */
 264 struct lzx_sequence {
 265
 266         /* The number of literals in the run.  This may be 0.  The literals are
 267          * not stored explicitly in this structure; instead, they are read
 268          * directly from the uncompressed data.  */
 269         u32 litrunlen : 24;
 270
 271         /* If the next field doesn't indicate end-of-block, then this is the
 272          * match length minus LZX_MIN_MATCH_LEN.  */
 273         u32 adjusted_length : 8;
 274
 275         /* If bit 31 is clear, then this field contains the match header in bits
 276          * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a
 277          * recent offset code in bits 9-30.  Otherwise (if bit 31 is set), this
 278          * sequence's literal run was the last literal run in the block, so
 279          * there is no match that follows it.  */
 280         u32 adjusted_offset_and_match_hdr;
 281 };
 282
 283 /*
 284  * This structure represents a byte position in the input buffer and a node in
 285  * the graph of possible match/literal choices.
 286  *
 287  * Logically, each incoming edge to this node is labeled with a literal or a
 288  * match that can be taken to reach this position from an earlier position; and
 289  * each outgoing edge from this node is labeled with a literal or a match that
 290  * can be taken to advance from this position to a later position.
 291  */
 292 struct lzx_optimum_node {
 293
 294         /* The cost, in bits, of the lowest-cost path that has been found to
 295          * reach this position.  This can change as progressively lower cost
 296          * paths are found to reach this position.  */
 297         u32 cost;
 298
 299         /*
 300          * The best arrival to this node, i.e. the match or literal that was
 301          * used to arrive to this position at the given 'cost'.  This can change
 302          * as progressively lower cost paths are found to reach this position.
 303          *
 304          * For non-gap matches, this variable is divided into two bitfields
 305          * whose meanings depend on the item type:
 306          *
 307          * Literals:
 308          *      Low bits are 0, high bits are the literal.
 309          *
 310          * Explicit offset matches:
 311          *      Low bits are the match length, high bits are the offset plus
 312          *      LZX_OFFSET_ADJUSTMENT.
 313          *
 314          * Repeat offset matches:
 315          *      Low bits are the match length, high bits are the queue index.
 316          *
 317          * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
 318          * behavior applies --- see the code.
 319          */
 320         u32 item;
 321 #define OPTIMUM_OFFSET_SHIFT 9
 322 #define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1)
 323 #if CONSIDER_GAP_MATCHES
 324 #  define OPTIMUM_GAP_MATCH 0x80000000
 325 #endif
 326
 327 } _aligned_attribute(8);
 328
 329 /* The cost model for near-optimal parsing */
 330 struct lzx_costs {
 331
 332         /*
 333          * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a
 334          * length 'len' match which has an offset belonging to 'offset_slot'.
 335          * The cost includes the main symbol, the length symbol if required, and
 336          * the extra offset bits if any, excluding any entropy-coded bits
 337          * (aligned offset bits).  It does *not* include the cost of the aligned
 338          * offset symbol which may be required.
 339          */
 340         u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
 341
 342         /* Cost of each symbol in the main code */
 343         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 344
 345         /* Cost of each symbol in the length code */
 346         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 347
 348 #if CONSIDER_ALIGNED_COSTS
 349         /* Cost of each symbol in the aligned offset code */
 350         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 351 #endif
 352 };
 353
 354 struct lzx_output_bitstream;
 355
 356 /* The main LZX compressor structure */
 357 struct lzx_compressor {
 358
 359         /* The buffer for preprocessed input data, if not using destructive
 360          * compression */
 361         void *in_buffer;
 362
 363         /* If true, then the compressor need not preserve the input buffer if it
 364          * compresses the data successfully */
 365         bool destructive;
 366
 367         /* Pointer to the compress() implementation chosen at allocation time */
 368         void (*impl)(struct lzx_compressor *, const u8 *, size_t,
 369                      struct lzx_output_bitstream *);
 370
 371         /* The log base 2 of the window size for match offset encoding purposes.
 372          * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
 373         unsigned window_order;
 374
 375         /* The number of symbols in the main alphabet.  This depends on the
 376          * window order, since the window order determines the maximum possible
 377          * match offset. */
 378         unsigned num_main_syms;
 379
 380         /* The "nice" match length: if a match of this length is found, then it
 381          * is chosen immediately without further consideration. */
 382         unsigned nice_match_length;
 383
 384         /* The maximum search depth: at most this many potential matches are
 385          * considered at each position. */
 386         unsigned max_search_depth;
 387
 388         /* The number of optimization passes per block */
 389         unsigned num_optim_passes;
 390
 391         /* The symbol frequency counters for the current block */
 392         struct lzx_freqs freqs;
 393
 394         /* Block split statistics for the current block */
 395         struct lzx_block_split_stats split_stats;
 396
 397         /* The Huffman codes for the current and previous blocks.  The one with
 398          * index 'codes_index' is for the current block, and the other one is
 399          * for the previous block. */
 400         struct lzx_codes codes[2];
 401         unsigned codes_index;
 402
 403         /* The matches and literals that the compressor has chosen for the
 404          * current block.  The required length of this array is limited by the
 405          * maximum number of matches that can ever be chosen for a single block,
 406          * plus one for the special entry at the end. */
 407         struct lzx_sequence chosen_sequences[
 408                        DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
 409
 410         /* Tables for mapping adjusted offsets to offset slots */
 411         u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
 412         u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
 413
 414         union {
 415                 /* Data for lzx_compress_lazy() */
 416                 struct {
 417                         /* Hash chains matchfinder (MUST BE LAST!!!) */
 418                         union {
 419                                 struct hc_matchfinder_16 hc_mf_16;
 420                                 struct hc_matchfinder_32 hc_mf_32;
 421                         };
 422                 };
 423
 424                 /* Data for lzx_compress_near_optimal() */
 425                 struct {
 426                         /*
 427                          * Array of nodes, one per position, for running the
 428                          * minimum-cost path algorithm.
 429                          *
 430                          * This array must be large enough to accommodate the
 431                          * worst-case number of nodes, which occurs if the
 432                          * compressor finds a match of length LZX_MAX_MATCH_LEN
 433                          * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
 434                          * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
 435                          * LZX_MAX_MATCH_LEN'.  Add one for the end-of-block
 436                          * node.
 437                          */
 438                         struct lzx_optimum_node optimum_nodes[
 439                                                     SOFT_MAX_BLOCK_SIZE - 1 +
 440                                                     LZX_MAX_MATCH_LEN + 1];
 441
 442                         /* The cost model for the current optimization pass */
 443                         struct lzx_costs costs;
 444
 445                         /*
 446                          * Cached matches for the current block.  This array
 447                          * contains the matches that were found at each position
 448                          * in the block.  Specifically, for each position, there
 449                          * is a special 'struct lz_match' whose 'length' field
 450                          * contains the number of matches that were found at
 451                          * that position; this is followed by the matches
 452                          * themselves, if any, sorted by strictly increasing
 453                          * length.
 454                          *
 455                          * Note: in rare cases, there will be a very high number
 456                          * of matches in the block and this array will overflow.
 457                          * If this happens, we force the end of the current
 458                          * block.  CACHE_LENGTH is the length at which we
 459                          * actually check for overflow.  The extra slots beyond
 460                          * this are enough to absorb the worst case overflow,
 461                          * which occurs if starting at &match_cache[CACHE_LENGTH
 462                          * - 1], we write the match count header, then write
 463                          * MAX_MATCHES_PER_POS matches, then skip searching for
 464                          * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
 465                          * write the match count header for each.
 466                          */
 467                         struct lz_match match_cache[CACHE_LENGTH +
 468                                                     MAX_MATCHES_PER_POS +
 469                                                     LZX_MAX_MATCH_LEN - 1];
 470
 471                         /* Binary trees matchfinder (MUST BE LAST!!!) */
 472                         union {
 473                                 struct bt_matchfinder_16 bt_mf_16;
 474                                 struct bt_matchfinder_32 bt_mf_32;
 475                         };
 476                 };
 477         };
 478 };
 479
 480 /******************************************************************************/
 481 /*                            Matchfinder utilities                           */
 482 /*----------------------------------------------------------------------------*/
 483
 484 /*
 485  * Will a matchfinder using 16-bit positions be sufficient for compressing
 486  * buffers of up to the specified size?  The limit could be 65536 bytes, but we
 487  * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
 488  * This requires that the limit be no more than the length of offset_slot_tab_1
 489  * (currently 32768).
 490  */
 491 static forceinline bool
 492 lzx_is_16_bit(size_t max_bufsize)
 493 {
 494         STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
 495         return max_bufsize <= 32768;
 496 }
 497
 498 /*
 499  * Return the offset slot for the specified adjusted match offset.
 500  */
 501 static forceinline unsigned
 502 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
 503                     bool is_16_bit)
 504 {
 505         if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 506                 return c->offset_slot_tab_1[adjusted_offset];
 507         return c->offset_slot_tab_2[adjusted_offset >> 14];
 508 }
 509
 510 /*
 511  * The following macros call either the 16-bit or the 32-bit version of a
 512  * matchfinder function based on the value of 'is_16_bit', which will be known
 513  * at compilation time.
 514  */
 515
 516 #define CALL_HC_MF(is_16_bit, c, funcname, ...)                               \
 517         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
 518                        CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__));
 519
 520 #define CALL_BT_MF(is_16_bit, c, funcname, ...)                               \
 521         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
 522                        CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__));
 523
 524 /******************************************************************************/
 525 /*                             Output bitstream                               */
 526 /*----------------------------------------------------------------------------*/
 527
 528 /*
 529  * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
 530  * units.  Bits are ordered from most significant to least significant within
 531  * each coding unit.
 532  */
 533
 534 /*
 535  * Structure to keep track of the current state of sending bits to the
 536  * compressed output buffer.
 537  */
 538 struct lzx_output_bitstream {
 539
 540         /* Bits that haven't yet been written to the output buffer */
 541         machine_word_t bitbuf;
 542
 543         /* Number of bits currently held in @bitbuf */
 544         machine_word_t bitcount;
 545
 546         /* Pointer to the start of the output buffer */
 547         u8 *start;
 548
 549         /* Pointer to the position in the output buffer at which the next coding
 550          * unit should be written */
 551         u8 *next;
 552
 553         /* Pointer to just past the end of the output buffer, rounded down by
 554          * one byte if needed to make 'end - start' a multiple of 2 */
 555         u8 *end;
 556 };
 557
 558 /* Can the specified number of bits always be added to 'bitbuf' after all
 559  * pending 16-bit coding units have been flushed?  */
 560 #define CAN_BUFFER(n)   ((n) <= WORDBITS - 15)
 561
 562 /* Initialize the output bitstream to write to the specified buffer. */
 563 static void
 564 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
 565 {
 566         os->bitbuf = 0;
 567         os->bitcount = 0;
 568         os->start = buffer;
 569         os->next = buffer;
 570         os->end = (u8 *)buffer + (size & ~1);
 571 }
 572
 573 /*
 574  * Add some bits to the bitbuffer variable of the output bitstream.  The caller
 575  * must make sure there is enough room.
 576  */
 577 static forceinline void
 578 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 579 {
 580         os->bitbuf = (os->bitbuf << num_bits) | bits;
 581         os->bitcount += num_bits;
 582 }
 583
 584 /*
 585  * Flush bits from the bitbuffer variable to the output buffer.  'max_num_bits'
 586  * specifies the maximum number of bits that may have been added since the last
 587  * flush.
 588  */
 589 static forceinline void
 590 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 591 {
 592         /* Masking the number of bits to shift is only needed to avoid undefined
 593          * behavior; we don't actually care about the results of bad shifts.  On
 594          * x86, the explicit masking generates no extra code.  */
 595         const u32 shift_mask = WORDBITS - 1;
 596
 597         if (os->end - os->next < 6)
 598                 return;
 599         put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
 600                                             shift_mask), os->next + 0);
 601         if (max_num_bits > 16)
 602                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
 603                                                 shift_mask), os->next + 2);
 604         if (max_num_bits > 32)
 605                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
 606                                                 shift_mask), os->next + 4);
 607         os->next += (os->bitcount >> 4) << 1;
 608         os->bitcount &= 15;
 609 }
 610
 611 /* Add at most 16 bits to the bitbuffer and flush it.  */
 612 static forceinline void
 613 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 614 {
 615         lzx_add_bits(os, bits, num_bits);
 616         lzx_flush_bits(os, 16);
 617 }
 618
 619 /*
 620  * Flush the last coding unit to the output buffer if needed.  Return the total
 621  * number of bytes written to the output buffer, or 0 if an overflow occurred.
 622  */
 623 static size_t
 624 lzx_flush_output(struct lzx_output_bitstream *os)
 625 {
 626         if (os->end - os->next < 6)
 627                 return 0;
 628
 629         if (os->bitcount != 0) {
 630                 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
 631                 os->next += 2;
 632         }
 633
 634         return os->next - os->start;
 635 }
 636
 637 /******************************************************************************/
 638 /*                           Preparing Huffman codes                          */
 639 /*----------------------------------------------------------------------------*/
 640
 641 /*
 642  * Build the Huffman codes.  This takes as input the frequency tables for each
 643  * code and produces as output a set of tables that map symbols to codewords and
 644  * codeword lengths.
 645  */
 646 static void
 647 lzx_build_huffman_codes(struct lzx_compressor *c)
 648 {
 649         const struct lzx_freqs *freqs = &c->freqs;
 650         struct lzx_codes *codes = &c->codes[c->codes_index];
 651
 652         STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
 653                       MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
 654         make_canonical_huffman_code(c->num_main_syms,
 655                                     MAIN_CODEWORD_LIMIT,
 656                                     freqs->main,
 657                                     codes->lens.main,
 658                                     codes->codewords.main);
 659
 660         STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
 661                       LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
 662         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
 663                                     LENGTH_CODEWORD_LIMIT,
 664                                     freqs->len,
 665                                     codes->lens.len,
 666                                     codes->codewords.len);
 667
 668         STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
 669                       ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
 670         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
 671                                     ALIGNED_CODEWORD_LIMIT,
 672                                     freqs->aligned,
 673                                     codes->lens.aligned,
 674                                     codes->codewords.aligned);
 675 }
 676
 677 /* Reset the symbol frequencies for the current block. */
 678 static void
 679 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
 680 {
 681         memset(&c->freqs, 0, sizeof(c->freqs));
 682 }
 683
 684 static unsigned
 685 lzx_compute_precode_items(const u8 lens[restrict],
 686                           const u8 prev_lens[restrict],
 687                           u32 precode_freqs[restrict],
 688                           unsigned precode_items[restrict])
 689 {
 690         unsigned *itemptr;
 691         unsigned run_start;
 692         unsigned run_end;
 693         unsigned extra_bits;
 694         int delta;
 695         u8 len;
 696
 697         itemptr = precode_items;
 698         run_start = 0;
 699
 700         while (!((len = lens[run_start]) & 0x80)) {
 701
 702                 /* len = the length being repeated  */
 703
 704                 /* Find the next run of codeword lengths.  */
 705
 706                 run_end = run_start + 1;
 707
 708                 /* Fast case for a single length.  */
 709                 if (likely(len != lens[run_end])) {
 710                         delta = prev_lens[run_start] - len;
 711                         if (delta < 0)
 712                                 delta += 17;
 713                         precode_freqs[delta]++;
 714                         *itemptr++ = delta;
 715                         run_start++;
 716                         continue;
 717                 }
 718
 719                 /* Extend the run.  */
 720                 do {
 721                         run_end++;
 722                 } while (len == lens[run_end]);
 723
 724                 if (len == 0) {
 725                         /* Run of zeroes.  */
 726
 727                         /* Symbol 18: RLE 20 to 51 zeroes at a time.  */
 728                         while ((run_end - run_start) >= 20) {
 729                                 extra_bits = min((run_end - run_start) - 20, 0x1F);
 730                                 precode_freqs[18]++;
 731                                 *itemptr++ = 18 | (extra_bits << 5);
 732                                 run_start += 20 + extra_bits;
 733                         }
 734
 735                         /* Symbol 17: RLE 4 to 19 zeroes at a time.  */
 736                         if ((run_end - run_start) >= 4) {
 737                                 extra_bits = min((run_end - run_start) - 4, 0xF);
 738                                 precode_freqs[17]++;
 739                                 *itemptr++ = 17 | (extra_bits << 5);
 740                                 run_start += 4 + extra_bits;
 741                         }
 742                 } else {
 743
 744                         /* A run of nonzero lengths. */
 745
 746                         /* Symbol 19: RLE 4 to 5 of any length at a time.  */
 747                         while ((run_end - run_start) >= 4) {
 748                                 extra_bits = (run_end - run_start) > 4;
 749                                 delta = prev_lens[run_start] - len;
 750                                 if (delta < 0)
 751                                         delta += 17;
 752                                 precode_freqs[19]++;
 753                                 precode_freqs[delta]++;
 754                                 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
 755                                 run_start += 4 + extra_bits;
 756                         }
 757                 }
 758
 759                 /* Output any remaining lengths without RLE.  */
 760                 while (run_start != run_end) {
 761                         delta = prev_lens[run_start] - len;
 762                         if (delta < 0)
 763                                 delta += 17;
 764                         precode_freqs[delta]++;
 765                         *itemptr++ = delta;
 766                         run_start++;
 767                 }
 768         }
 769
 770         return itemptr - precode_items;
 771 }
 772
 773 /******************************************************************************/
 774 /*                          Outputting compressed data                        */
 775 /*----------------------------------------------------------------------------*/
 776
 777 /*
 778  * Output a Huffman code in the compressed form used in LZX.
 779  *
 780  * The Huffman code is represented in the output as a logical series of codeword
 781  * lengths from which the Huffman code, which must be in canonical form, can be
 782  * reconstructed.
 783  *
 784  * The codeword lengths are themselves compressed using a separate Huffman code,
 785  * the "precode", which contains a symbol for each possible codeword length in
 786  * the larger code as well as several special symbols to represent repeated
 787  * codeword lengths (a form of run-length encoding).  The precode is itself
 788  * constructed in canonical form, and its codeword lengths are represented
 789  * literally in 20 4-bit fields that immediately precede the compressed codeword
 790  * lengths of the larger code.
 791  *
 792  * Furthermore, the codeword lengths of the larger code are actually represented
 793  * as deltas from the codeword lengths of the corresponding code in the previous
 794  * block.
 795  *
 796  * @os:
 797  *      Bitstream to which to write the compressed Huffman code.
 798  * @lens:
 799  *      The codeword lengths, indexed by symbol, in the Huffman code.
 800  * @prev_lens:
 801  *      The codeword lengths, indexed by symbol, in the corresponding Huffman
 802  *      code in the previous block, or all zeroes if this is the first block.
 803  * @num_lens:
 804  *      The number of symbols in the Huffman code.
 805  */
 806 static void
 807 lzx_write_compressed_code(struct lzx_output_bitstream *os,
 808                           const u8 lens[restrict],
 809                           const u8 prev_lens[restrict],
 810                           unsigned num_lens)
 811 {
 812         u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
 813         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
 814         u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
 815         unsigned precode_items[num_lens];
 816         unsigned num_precode_items;
 817         unsigned precode_item;
 818         unsigned precode_sym;
 819         unsigned i;
 820         u8 saved = lens[num_lens];
 821         *(u8 *)(lens + num_lens) = 0x80;
 822
 823         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 824                 precode_freqs[i] = 0;
 825
 826         /* Compute the "items" (RLE / literal tokens and extra bits) with which
 827          * the codeword lengths in the larger code will be output.  */
 828         num_precode_items = lzx_compute_precode_items(lens,
 829                                                       prev_lens,
 830                                                       precode_freqs,
 831                                                       precode_items);
 832
 833         /* Build the precode.  */
 834         STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
 835                       PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
 836         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT,
 837                                     precode_freqs, precode_lens,
 838                                     precode_codewords);
 839
 840         /* Output the lengths of the codewords in the precode.  */
 841         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 842                 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
 843
 844         /* Output the encoded lengths of the codewords in the larger code.  */
 845         for (i = 0; i < num_precode_items; i++) {
 846                 precode_item = precode_items[i];
 847                 precode_sym = precode_item & 0x1F;
 848                 lzx_add_bits(os, precode_codewords[precode_sym],
 849                              precode_lens[precode_sym]);
 850                 if (precode_sym >= 17) {
 851                         if (precode_sym == 17) {
 852                                 lzx_add_bits(os, precode_item >> 5, 4);
 853                         } else if (precode_sym == 18) {
 854                                 lzx_add_bits(os, precode_item >> 5, 5);
 855                         } else {
 856                                 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
 857                                 precode_sym = precode_item >> 6;
 858                                 lzx_add_bits(os, precode_codewords[precode_sym],
 859                                              precode_lens[precode_sym]);
 860                         }
 861                 }
 862                 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
 863                 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
 864         }
 865
 866         *(u8 *)(lens + num_lens) = saved;
 867 }
 868
 869 /*
 870  * Write all matches and literal bytes (which were precomputed) in an LZX
 871  * compressed block to the output bitstream in the final compressed
 872  * representation.
 873  *
 874  * @os
 875  *      The output bitstream.
 876  * @block_type
 877  *      The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
 878  *      LZX_BLOCKTYPE_VERBATIM).
 879  * @block_data
 880  *      The uncompressed data of the block.
 881  * @sequences
 882  *      The matches and literals to output, given as a series of sequences.
 883  * @codes
 884  *      The main, length, and aligned offset Huffman codes for the block.
 885  */
 886 static void
 887 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
 888                     const u8 *block_data, const struct lzx_sequence sequences[],
 889                     const struct lzx_codes *codes)
 890 {
 891         const struct lzx_sequence *seq = sequences;
 892         unsigned min_aligned_offset_slot;
 893
 894         if (block_type == LZX_BLOCKTYPE_ALIGNED)
 895                 min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT;
 896         else
 897                 min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS;
 898
 899         for (;;) {
 900                 /* Output the next sequence.  */
 901
 902                 unsigned litrunlen = seq->litrunlen;
 903                 unsigned match_hdr;
 904                 unsigned main_symbol;
 905                 unsigned adjusted_length;
 906                 u32 adjusted_offset;
 907                 unsigned offset_slot;
 908                 unsigned num_extra_bits;
 909                 u32 extra_bits;
 910
 911                 /* Output the literal run of the sequence.  */
 912
 913                 if (litrunlen) {  /* Is the literal run nonempty?  */
 914
 915                         /* Verify optimization is enabled on 64-bit  */
 916                         STATIC_ASSERT(WORDBITS < 64 ||
 917                                       CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
 918
 919                         if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
 920
 921                                 /* 64-bit: write 3 literals at a time.  */
 922                                 while (litrunlen >= 3) {
 923                                         unsigned lit0 = block_data[0];
 924                                         unsigned lit1 = block_data[1];
 925                                         unsigned lit2 = block_data[2];
 926                                         lzx_add_bits(os, codes->codewords.main[lit0],
 927                                                      codes->lens.main[lit0]);
 928                                         lzx_add_bits(os, codes->codewords.main[lit1],
 929                                                      codes->lens.main[lit1]);
 930                                         lzx_add_bits(os, codes->codewords.main[lit2],
 931                                                      codes->lens.main[lit2]);
 932                                         lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
 933                                         block_data += 3;
 934                                         litrunlen -= 3;
 935                                 }
 936                                 if (litrunlen--) {
 937                                         unsigned lit = *block_data++;
 938                                         lzx_add_bits(os, codes->codewords.main[lit],
 939                                                      codes->lens.main[lit]);
 940                                         if (litrunlen--) {
 941                                                 unsigned lit = *block_data++;
 942                                                 lzx_add_bits(os, codes->codewords.main[lit],
 943                                                              codes->lens.main[lit]);
 944                                                 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
 945                                         } else {
 946                                                 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
 947                                         }
 948                                 }
 949                         } else {
 950                                 /* 32-bit: write 1 literal at a time.  */
 951                                 do {
 952                                         unsigned lit = *block_data++;
 953                                         lzx_add_bits(os, codes->codewords.main[lit],
 954                                                      codes->lens.main[lit]);
 955                                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
 956                                 } while (--litrunlen);
 957                         }
 958                 }
 959
 960                 /* Was this the last literal run?  */
 961                 if (seq->adjusted_offset_and_match_hdr & 0x80000000)
 962                         return;
 963
 964                 /* Nope; output the match.  */
 965
 966                 match_hdr = seq->adjusted_offset_and_match_hdr & 0x1FF;
 967                 main_symbol = LZX_NUM_CHARS + match_hdr;
 968                 adjusted_length = seq->adjusted_length;
 969
 970                 block_data += adjusted_length + LZX_MIN_MATCH_LEN;
 971
 972                 offset_slot = match_hdr / LZX_NUM_LEN_HEADERS;
 973                 adjusted_offset = seq->adjusted_offset_and_match_hdr >> 9;
 974
 975                 num_extra_bits = lzx_extra_offset_bits[offset_slot];
 976                 extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
 977                                                 LZX_OFFSET_ADJUSTMENT);
 978
 979         #define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT +           \
 980                                 LENGTH_CODEWORD_LIMIT +         \
 981                                 LZX_MAX_NUM_EXTRA_BITS -        \
 982                                 LZX_NUM_ALIGNED_OFFSET_BITS +   \
 983                                 ALIGNED_CODEWORD_LIMIT)
 984
 985                 /* Verify optimization is enabled on 64-bit  */
 986                 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
 987
 988                 /* Output the main symbol for the match.  */
 989
 990                 lzx_add_bits(os, codes->codewords.main[main_symbol],
 991                              codes->lens.main[main_symbol]);
 992                 if (!CAN_BUFFER(MAX_MATCH_BITS))
 993                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
 994
 995                 /* If needed, output the length symbol for the match.  */
 996
 997                 if (adjusted_length >= LZX_NUM_PRIMARY_LENS) {
 998                         lzx_add_bits(os, codes->codewords.len[adjusted_length -
 999                                                               LZX_NUM_PRIMARY_LENS],
1000                                      codes->lens.len[adjusted_length -
1001                                                      LZX_NUM_PRIMARY_LENS]);
1002                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1003                                 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
1004                 }
1005
1006                 /* Output the extra offset bits for the match.  In aligned
1007                  * offset blocks, the lowest 3 bits of the adjusted offset are
1008                  * Huffman-encoded using the aligned offset code, provided that
1009                  * there are at least extra 3 offset bits required.  All other
1010                  * extra offset bits are output verbatim.  */
1011
1012                 if (offset_slot >= min_aligned_offset_slot) {
1013
1014                         lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1015                                      num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1016                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1017                                 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS -
1018                                                    LZX_NUM_ALIGNED_OFFSET_BITS);
1019
1020                         lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1021                                                                   LZX_ALIGNED_OFFSET_BITMASK],
1022                                      codes->lens.aligned[adjusted_offset &
1023                                                          LZX_ALIGNED_OFFSET_BITMASK]);
1024                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1025                                 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1026                 } else {
1027                         STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS));
1028
1029                         lzx_add_bits(os, extra_bits, num_extra_bits);
1030                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1031                                 lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS);
1032                 }
1033
1034                 if (CAN_BUFFER(MAX_MATCH_BITS))
1035                         lzx_flush_bits(os, MAX_MATCH_BITS);
1036
1037                 /* Advance to the next sequence.  */
1038                 seq++;
1039         }
1040 }
1041
1042 static void
1043 lzx_write_compressed_block(const u8 *block_begin,
1044                            int block_type,
1045                            u32 block_size,
1046                            unsigned window_order,
1047                            unsigned num_main_syms,
1048                            const struct lzx_sequence sequences[],
1049                            const struct lzx_codes * codes,
1050                            const struct lzx_lens * prev_lens,
1051                            struct lzx_output_bitstream * os)
1052 {
1053         /* The first three bits indicate the type of block and are one of the
1054          * LZX_BLOCKTYPE_* constants.  */
1055         lzx_write_bits(os, block_type, 3);
1056
1057         /*
1058          * Output the block size.
1059          *
1060          * The original LZX format encoded the block size in 24 bits.  However,
1061          * the LZX format used in WIM archives uses 1 bit to specify whether the
1062          * block has the default size of 32768 bytes, then optionally 16 bits to
1063          * specify a non-default size.  This works fine for Microsoft's WIM
1064          * software (WIMGAPI), which never compresses more than 32768 bytes at a
1065          * time with LZX.  However, as an extension, our LZX compressor supports
1066          * compressing up to 2097152 bytes, with a corresponding increase in
1067          * window size.  It is possible for blocks in these larger buffers to
1068          * exceed 65535 bytes; such blocks cannot have their size represented in
1069          * 16 bits.
1070          *
1071          * The chosen solution was to use 24 bits for the block size when
1072          * possibly required --- specifically, when the compressor has been
1073          * allocated to be capable of compressing more than 32768 bytes at once
1074          * (which also causes the number of main symbols to be increased).
1075          */
1076         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1077                 lzx_write_bits(os, 1, 1);
1078         } else {
1079                 lzx_write_bits(os, 0, 1);
1080
1081                 if (window_order >= 16)
1082                         lzx_write_bits(os, block_size >> 16, 8);
1083
1084                 lzx_write_bits(os, block_size & 0xFFFF, 16);
1085         }
1086
1087         /* If it's an aligned offset block, output the aligned offset code.  */
1088         if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1089                 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1090                         lzx_write_bits(os, codes->lens.aligned[i],
1091                                        LZX_ALIGNEDCODE_ELEMENT_SIZE);
1092                 }
1093         }
1094
1095         /* Output the main code (two parts).  */
1096         lzx_write_compressed_code(os, codes->lens.main,
1097                                   prev_lens->main,
1098                                   LZX_NUM_CHARS);
1099         lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1100                                   prev_lens->main + LZX_NUM_CHARS,
1101                                   num_main_syms - LZX_NUM_CHARS);
1102
1103         /* Output the length code.  */
1104         lzx_write_compressed_code(os, codes->lens.len,
1105                                   prev_lens->len,
1106                                   LZX_LENCODE_NUM_SYMBOLS);
1107
1108         /* Output the compressed matches and literals.  */
1109         lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1110 }
1111
1112 /*
1113  * Given the frequencies of symbols in an LZX-compressed block and the
1114  * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1115  * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1116  * will take fewer bits to output.
1117  */
1118 static int
1119 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1120                                const struct lzx_codes * codes)
1121 {
1122         u32 verbatim_cost = 0;
1123         u32 aligned_cost = 0;
1124
1125         /* A verbatim block requires 3 bits in each place that an aligned offset
1126          * symbol would be used in an aligned offset block.  */
1127         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1128                 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1129                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1130         }
1131
1132         /* Account for the cost of sending the codeword lengths of the aligned
1133          * offset code.  */
1134         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1135                         LZX_ALIGNEDCODE_NUM_SYMBOLS;
1136
1137         if (aligned_cost < verbatim_cost)
1138                 return LZX_BLOCKTYPE_ALIGNED;
1139         else
1140                 return LZX_BLOCKTYPE_VERBATIM;
1141 }
1142
1143 /*
1144  * Flush an LZX block:
1145  *
1146  * 1. Build the Huffman codes.
1147  * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1148  * 3. Write the block.
1149  * 4. Swap the indices of the current and previous Huffman codes.
1150  *
1151  * Note: we never output UNCOMPRESSED blocks.  This probably should be
1152  * implemented sometime, but it doesn't make much difference.
1153  */
1154 static void
1155 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1156                 const u8 *block_begin, u32 block_size, u32 seq_idx)
1157 {
1158         int block_type;
1159
1160         lzx_build_huffman_codes(c);
1161
1162         block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1163                                                     &c->codes[c->codes_index]);
1164         lzx_write_compressed_block(block_begin,
1165                                    block_type,
1166                                    block_size,
1167                                    c->window_order,
1168                                    c->num_main_syms,
1169                                    &c->chosen_sequences[seq_idx],
1170                                    &c->codes[c->codes_index],
1171                                    &c->codes[c->codes_index ^ 1].lens,
1172                                    os);
1173         c->codes_index ^= 1;
1174 }
1175
1176 /******************************************************************************/
1177 /*                          Block splitting algorithm                         */
1178 /*----------------------------------------------------------------------------*/
1179
1180 /*
1181  * The problem of block splitting is to decide when it is worthwhile to start a
1182  * new block with new entropy codes.  There is a theoretically optimal solution:
1183  * recursively consider every possible block split, considering the exact cost
1184  * of each block, and choose the minimum cost approach.  But this is far too
1185  * slow.  Instead, as an approximation, we can count symbols and after every N
1186  * symbols, compare the expected distribution of symbols based on the previous
1187  * data with the actual distribution.  If they differ "by enough", then start a
1188  * new block.
1189  *
1190  * As an optimization and heuristic, we don't distinguish between every symbol
1191  * but rather we combine many symbols into a single "observation type".  For
1192  * literals we only look at the high bits and low bits, and for matches we only
1193  * look at whether the match is long or not.  The assumption is that for typical
1194  * "real" data, places that are good block boundaries will tend to be noticable
1195  * based only on changes in these aggregate frequencies, without looking for
1196  * subtle differences in individual symbols.  For example, a change from ASCII
1197  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1198  * to many matches (generally more compressible), would be easily noticed based
1199  * on the aggregates.
1200  *
1201  * For determining whether the frequency distributions are "different enough" to
1202  * start a new block, the simply heuristic of splitting when the sum of absolute
1203  * differences exceeds a constant seems to be good enough.
1204  *
1205  * Finally, for an approximation, it is not strictly necessary that the exact
1206  * symbols being used are considered.  With "near-optimal parsing", for example,
1207  * the actual symbols that will be used are unknown until after the block
1208  * boundary is chosen and the block has been optimized.  Since the final choices
1209  * cannot be used, we can use preliminary "greedy" choices instead.
1210  */
1211
1212 /* Initialize the block split statistics when starting a new block. */
1213 static void
1214 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1215 {
1216         memset(stats, 0, sizeof(*stats));
1217 }
1218
1219 /* Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
1220  * literal, for 8 possible literal observation types.  */
1221 static forceinline void
1222 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1223 {
1224         stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1225         stats->num_new_observations++;
1226 }
1227
1228 /* Match observation.  Heuristic: use one observation type for "short match" and
1229  * one observation type for "long match".  */
1230 static forceinline void
1231 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1232 {
1233         stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1234         stats->num_new_observations++;
1235 }
1236
1237 static bool
1238 lzx_should_end_block(struct lzx_block_split_stats *stats)
1239 {
1240         if (stats->num_observations > 0) {
1241
1242                 /* Note: to avoid slow divisions, we do not divide by
1243                  * 'num_observations', but rather do all math with the numbers
1244                  * multiplied by 'num_observations'. */
1245                 u32 total_delta = 0;
1246                 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1247                         u32 expected = stats->observations[i] *
1248                                        stats->num_new_observations;
1249                         u32 actual = stats->new_observations[i] *
1250                                      stats->num_observations;
1251                         u32 delta = (actual > expected) ? actual - expected :
1252                                                           expected - actual;
1253                         total_delta += delta;
1254                 }
1255
1256                 /* Ready to end the block? */
1257                 if (total_delta >=
1258                     stats->num_new_observations * 7 / 8 * stats->num_observations)
1259                         return true;
1260         }
1261
1262         for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1263                 stats->num_observations += stats->new_observations[i];
1264                 stats->observations[i] += stats->new_observations[i];
1265                 stats->new_observations[i] = 0;
1266         }
1267         stats->num_new_observations = 0;
1268         return false;
1269 }
1270
1271 /******************************************************************************/
1272 /*                   Slower ("near-optimal") compression algorithm            */
1273 /*----------------------------------------------------------------------------*/
1274
1275 /*
1276  * Least-recently-used queue for match offsets.
1277  *
1278  * This is represented as a 64-bit integer for efficiency.  There are three
1279  * offsets of 21 bits each.  Bit 64 is garbage.
1280  */
1281 struct lzx_lru_queue {
1282         u64 R;
1283 } _aligned_attribute(8);
1284
1285 #define LZX_QUEUE_OFFSET_SHIFT  21
1286 #define LZX_QUEUE_OFFSET_MASK   (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1287
1288 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1289 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1290 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1291
1292 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1293 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1294 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1295
1296 #define LZX_QUEUE_INITIALIZER {                 \
1297         ((u64)1 << LZX_QUEUE_R0_SHIFT) |        \
1298         ((u64)1 << LZX_QUEUE_R1_SHIFT) |        \
1299         ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1300
1301 static forceinline u64
1302 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1303 {
1304         return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1305 }
1306
1307 static forceinline u64
1308 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1309 {
1310         return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1311 }
1312
1313 static forceinline u64
1314 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1315 {
1316         return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1317 }
1318
1319 /* Push a match offset onto the front (most recently used) end of the queue.  */
1320 static forceinline struct lzx_lru_queue
1321 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1322 {
1323         return (struct lzx_lru_queue) {
1324                 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1325         };
1326 }
1327
1328 /* Swap a match offset to the front of the queue.  */
1329 static forceinline struct lzx_lru_queue
1330 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1331 {
1332         unsigned shift = idx * 21;
1333         const u64 mask = LZX_QUEUE_R0_MASK;
1334         const u64 mask_high = mask << shift;
1335
1336         return (struct lzx_lru_queue) {
1337                 (queue.R & ~(mask | mask_high)) |
1338                 ((queue.R & mask_high) >> shift) |
1339                 ((queue.R & mask) << shift)
1340         };
1341 }
1342
1343 static forceinline u32
1344 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1345                    bool record)
1346 {
1347         u32 node_idx = block_size;
1348         u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1;
1349         u32 lit_start_node;
1350
1351         if (record) {
1352                 /* Special value to mark last sequence  */
1353                 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = 0x80000000;
1354                 lit_start_node = node_idx;
1355         }
1356
1357         for (;;) {
1358                 u32 item;
1359                 u32 len;
1360                 u32 adjusted_offset;
1361                 unsigned v;
1362                 unsigned offset_slot;
1363
1364                 /* Tally literals until either a match or the beginning of the
1365                  * block is reached.  Note: the item in the node at the
1366                  * beginning of the block has all bits set, causing this loop to
1367                  * end when it is reached. */
1368                 for (;;) {
1369                         item = c->optimum_nodes[node_idx].item;
1370                         if (item & OPTIMUM_LEN_MASK)
1371                                 break;
1372                         c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1373                         node_idx--;
1374                 }
1375
1376         #if CONSIDER_GAP_MATCHES
1377                 if (item & OPTIMUM_GAP_MATCH) {
1378
1379                         if (node_idx == 0)
1380                                 break;
1381
1382                         /* Record the literal run length for the next sequence
1383                          * (the "previous sequence" when walking backwards). */
1384                         len = item & OPTIMUM_LEN_MASK;
1385                         if (record) {
1386                                 c->chosen_sequences[seq_idx--].litrunlen =
1387                                                 lit_start_node - node_idx;
1388                                 lit_start_node = node_idx - len;
1389                         }
1390
1391                         /* Tally the rep0 match after the gap. */
1392                         v = len - LZX_MIN_MATCH_LEN;
1393                         if (record)
1394                                 c->chosen_sequences[seq_idx].adjusted_length = v;
1395                         if (v >= LZX_NUM_PRIMARY_LENS) {
1396                                 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1397                                 v = LZX_NUM_PRIMARY_LENS;
1398                         }
1399                         c->freqs.main[LZX_NUM_CHARS + v]++;
1400                         if (record)
1401                                 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v;
1402
1403                         /* Tally the literal in the gap. */
1404                         c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1405
1406                         /* Fall through and tally the match before the gap.
1407                          * (It was temporarily saved in the 'cost' field of the
1408                          * previous node, which was free to reuse.) */
1409                         item = c->optimum_nodes[--node_idx].cost;
1410                         node_idx -= len;
1411                 }
1412         #else /* CONSIDER_GAP_MATCHES */
1413                 if (node_idx == 0)
1414                         break;
1415         #endif /* !CONSIDER_GAP_MATCHES */
1416
1417                 len = item & OPTIMUM_LEN_MASK;
1418                 adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
1419
1420                 /* Record the literal run length for the next sequence (the
1421                  * "previous sequence" when walking backwards). */
1422                 if (record) {
1423                         c->chosen_sequences[seq_idx--].litrunlen =
1424                                         lit_start_node - node_idx;
1425                         node_idx -= len;
1426                         lit_start_node = node_idx;
1427                 } else {
1428                         node_idx -= len;
1429                 }
1430
1431                 /* Record a match. */
1432
1433                 /* Tally the aligned offset symbol if needed. */
1434                 if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
1435                         c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
1436
1437                 /* Record the adjusted length. */
1438                 v = len - LZX_MIN_MATCH_LEN;
1439                 if (record)
1440                         c->chosen_sequences[seq_idx].adjusted_length = v;
1441
1442                 /* Tally the length symbol if needed. */
1443                 if (v >= LZX_NUM_PRIMARY_LENS) {
1444                         c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1445                         v = LZX_NUM_PRIMARY_LENS;
1446                 }
1447
1448                 /* Tally the main symbol. */
1449                 offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1450                 v += offset_slot * LZX_NUM_LEN_HEADERS;
1451                 c->freqs.main[LZX_NUM_CHARS + v]++;
1452
1453                 /* Record the adjusted offset and match header. */
1454                 if (record) {
1455                         c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr =
1456                                         (adjusted_offset << 9) | v;
1457                 }
1458         }
1459
1460         /* Record the literal run length for the first sequence. */
1461         if (record)
1462                 c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx;
1463
1464         /* Return the index in chosen_sequences at which the sequences begin. */
1465         return seq_idx;
1466 }
1467
1468 /*
1469  * Given the minimum-cost path computed through the item graph for the current
1470  * block, walk the path and count how many of each symbol in each Huffman-coded
1471  * alphabet would be required to output the items (matches and literals) along
1472  * the path.
1473  *
1474  * Note that the path will be walked backwards (from the end of the block to the
1475  * beginning of the block), but this doesn't matter because this function only
1476  * computes frequencies.
1477  */
1478 static forceinline void
1479 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1480 {
1481         lzx_walk_item_list(c, block_size, is_16_bit, false);
1482 }
1483
1484 /*
1485  * Like lzx_tally_item_list(), but this function also generates the list of
1486  * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1487  * ready to be output to the bitstream after the Huffman codes are computed.
1488  * The lzx_sequences will be written to decreasing memory addresses as the path
1489  * is walked backwards, which means they will end up in the expected
1490  * first-to-last order.  The return value is the index in c->chosen_sequences at
1491  * which the lzx_sequences begin.
1492  */
1493 static forceinline u32
1494 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1495 {
1496         return lzx_walk_item_list(c, block_size, is_16_bit, true);
1497 }
1498
1499 /*
1500  * Find an inexpensive path through the graph of possible match/literal choices
1501  * for the current block.  The nodes of the graph are
1502  * c->optimum_nodes[0...block_size].  They correspond directly to the bytes in
1503  * the current block, plus one extra node for end-of-block.  The edges of the
1504  * graph are matches and literals.  The goal is to find the minimum cost path
1505  * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1506  * model 'c->costs'.
1507  *
1508  * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1509  * proceeding forwards one node at a time.  At each node, a selection of matches
1510  * (len >= 2), as well as the literal byte (len = 1), is considered.  An item of
1511  * length 'len' provides a new path to reach the node 'len' bytes later.  If
1512  * such a path is the lowest cost found so far to reach that later node, then
1513  * that later node is updated with the new cost and the "arrival" which provided
1514  * that cost.
1515  *
1516  * Note that although this algorithm is based on minimum cost path search, due
1517  * to various simplifying assumptions the result is not guaranteed to be the
1518  * true minimum cost, or "optimal", path over the graph of all valid LZX
1519  * representations of this block.
1520  *
1521  * Also, note that because of the presence of the recent offsets queue (which is
1522  * a type of adaptive state), the algorithm cannot work backwards and compute
1523  * "cost to end" instead of "cost to beginning".  Furthermore, the way the
1524  * algorithm handles this adaptive state in the "minimum cost" parse is actually
1525  * only an approximation.  It's possible for the globally optimal, minimum cost
1526  * path to contain a prefix, ending at a position, where that path prefix is
1527  * *not* the minimum cost path to that position.  This can happen if such a path
1528  * prefix results in a different adaptive state which results in lower costs
1529  * later.  The algorithm does not solve this problem in general; it only looks
1530  * one step ahead, with the exception of special consideration for "gap
1531  * matches".
1532  */
1533 static forceinline struct lzx_lru_queue
1534 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1535                        const u8 * const restrict block_begin,
1536                        const u32 block_size,
1537                        const struct lzx_lru_queue initial_queue,
1538                        bool is_16_bit)
1539 {
1540         struct lzx_optimum_node *cur_node = c->optimum_nodes;
1541         struct lzx_optimum_node * const end_node = cur_node + block_size;
1542         struct lz_match *cache_ptr = c->match_cache;
1543         const u8 *in_next = block_begin;
1544         const u8 * const block_end = block_begin + block_size;
1545
1546         /*
1547          * Instead of storing the match offset LRU queues in the
1548          * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1549          * storing them in a smaller array.  This works because the algorithm
1550          * only requires a limited history of the adaptive state.  Once a given
1551          * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1552          * (more if gap match consideration is enabled; we just round up to 512
1553          * so it's a power of 2), it is no longer needed.
1554          *
1555          * The QUEUE() macro finds the queue for the given node.  This macro has
1556          * been optimized by taking advantage of 'struct lzx_lru_queue' and
1557          * 'struct lzx_optimum_node' both being 8 bytes in size and alignment.
1558          */
1559         struct lzx_lru_queue queues[512];
1560         STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1561         STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1562 #define QUEUE(node) \
1563         (*(struct lzx_lru_queue *)((char *)queues + \
1564                         ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0])))))
1565         /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/
1566
1567 #if CONSIDER_GAP_MATCHES
1568         u32 matches_before_gap[ARRAY_LEN(queues)];
1569 #define MATCH_BEFORE_GAP(node) \
1570         (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1571                             ARRAY_LEN(matches_before_gap)])
1572 #endif
1573
1574         /*
1575          * Initially, the cost to reach each node is "infinity".
1576          *
1577          * The first node actually should have cost 0, but "infinity"
1578          * (0xFFFFFFFF) works just as well because it immediately overflows.
1579          *
1580          * The following statement also intentionally sets the 'item' of the
1581          * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
1582          * use as a sentinel.  See lzx_walk_item_list().
1583          */
1584         memset(c->optimum_nodes, 0xFF,
1585                (block_size + 1) * sizeof(c->optimum_nodes[0]));
1586
1587         /* Initialize the recent offsets queue for the first node. */
1588         QUEUE(cur_node) = initial_queue;
1589
1590         do { /* For each node in the block in position order... */
1591
1592                 unsigned num_matches;
1593                 unsigned literal;
1594                 u32 cost;
1595
1596                 /*
1597                  * A selection of matches for the block was already saved in
1598                  * memory so that we don't have to run the uncompressed data
1599                  * through the matchfinder on every optimization pass.  However,
1600                  * we still search for repeat offset matches during each
1601                  * optimization pass because we cannot predict the state of the
1602                  * recent offsets queue.  But as a heuristic, we don't bother
1603                  * searching for repeat offset matches if the general-purpose
1604                  * matchfinder failed to find any matches.
1605                  *
1606                  * Note that a match of length n at some offset implies there is
1607                  * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1608                  * that same offset.  In other words, we don't necessarily need
1609                  * to use the full length of a match.  The key heuristic that
1610                  * saves a significicant amount of time is that for each
1611                  * distinct length, we only consider the smallest offset for
1612                  * which that length is available.  This heuristic also applies
1613                  * to repeat offsets, which we order specially: R0 < R1 < R2 <
1614                  * any explicit offset.  Of course, this heuristic may be
1615                  * produce suboptimal results because offset slots in LZX are
1616                  * subject to entropy encoding, but in practice this is a useful
1617                  * heuristic.
1618                  */
1619
1620                 num_matches = cache_ptr->length;
1621                 cache_ptr++;
1622
1623                 if (num_matches) {
1624                         struct lz_match *end_matches = cache_ptr + num_matches;
1625                         unsigned next_len = LZX_MIN_MATCH_LEN;
1626                         unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1627                         const u8 *matchptr;
1628
1629                         /* Consider rep0 matches. */
1630                         matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1631                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1632                                 goto rep0_done;
1633                         STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1634                         do {
1635                                 u32 cost = cur_node->cost +
1636                                            c->costs.match_cost[0][
1637                                                         next_len - LZX_MIN_MATCH_LEN];
1638                                 if (cost <= (cur_node + next_len)->cost) {
1639                                         (cur_node + next_len)->cost = cost;
1640                                         (cur_node + next_len)->item =
1641                                                 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1642                                 }
1643                                 if (unlikely(++next_len > max_len)) {
1644                                         cache_ptr = end_matches;
1645                                         goto done_matches;
1646                                 }
1647                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1648
1649                 rep0_done:
1650
1651                         /* Consider rep1 matches. */
1652                         matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1653                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1654                                 goto rep1_done;
1655                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1656                                 goto rep1_done;
1657                         for (unsigned len = 2; len < next_len - 1; len++)
1658                                 if (matchptr[len] != in_next[len])
1659                                         goto rep1_done;
1660                         do {
1661                                 u32 cost = cur_node->cost +
1662                                            c->costs.match_cost[1][
1663                                                         next_len - LZX_MIN_MATCH_LEN];
1664                                 if (cost <= (cur_node + next_len)->cost) {
1665                                         (cur_node + next_len)->cost = cost;
1666                                         (cur_node + next_len)->item =
1667                                                 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1668                                 }
1669                                 if (unlikely(++next_len > max_len)) {
1670                                         cache_ptr = end_matches;
1671                                         goto done_matches;
1672                                 }
1673                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1674
1675                 rep1_done:
1676
1677                         /* Consider rep2 matches. */
1678                         matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1679                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1680                                 goto rep2_done;
1681                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1682                                 goto rep2_done;
1683                         for (unsigned len = 2; len < next_len - 1; len++)
1684                                 if (matchptr[len] != in_next[len])
1685                                         goto rep2_done;
1686                         do {
1687                                 u32 cost = cur_node->cost +
1688                                            c->costs.match_cost[2][
1689                                                         next_len - LZX_MIN_MATCH_LEN];
1690                                 if (cost <= (cur_node + next_len)->cost) {
1691                                         (cur_node + next_len)->cost = cost;
1692                                         (cur_node + next_len)->item =
1693                                                 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1694                                 }
1695                                 if (unlikely(++next_len > max_len)) {
1696                                         cache_ptr = end_matches;
1697                                         goto done_matches;
1698                                 }
1699                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1700
1701                 rep2_done:
1702
1703                         while (next_len > cache_ptr->length)
1704                                 if (++cache_ptr == end_matches)
1705                                         goto done_matches;
1706
1707                         /* Consider explicit offset matches. */
1708                         for (;;) {
1709                                 u32 offset = cache_ptr->offset;
1710                                 u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
1711                                 unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1712                                 u32 base_cost = cur_node->cost;
1713                                 u32 cost;
1714
1715                         #if CONSIDER_ALIGNED_COSTS
1716                                 if (offset >= LZX_MIN_ALIGNED_OFFSET)
1717                                         base_cost += c->costs.aligned[adjusted_offset &
1718                                                                       LZX_ALIGNED_OFFSET_BITMASK];
1719                         #endif
1720                                 do {
1721                                         cost = base_cost +
1722                                                c->costs.match_cost[offset_slot][
1723                                                                 next_len - LZX_MIN_MATCH_LEN];
1724                                         if (cost < (cur_node + next_len)->cost) {
1725                                                 (cur_node + next_len)->cost = cost;
1726                                                 (cur_node + next_len)->item =
1727                                                         (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len;
1728                                         }
1729                                 } while (++next_len <= cache_ptr->length);
1730
1731                                 if (++cache_ptr == end_matches) {
1732                                 #if CONSIDER_GAP_MATCHES
1733                                         /* Also consider the longest explicit
1734                                          * offset match as a "gap match": match
1735                                          * + lit + rep0. */
1736                                         s32 remaining = (block_end - in_next) - (s32)next_len;
1737                                         if (likely(remaining >= 2)) {
1738                                                 const u8 *strptr = in_next + next_len;
1739                                                 const u8 *matchptr = strptr - offset;
1740                                                 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1741                                                         STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1742                                                         STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1743                                                         unsigned limit = min(remaining,
1744                                                                              min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1745                                                                                  LZX_MAX_MATCH_LEN));
1746                                                         unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit);
1747                                                         u8 lit = strptr[-1];
1748                                                         cost += c->costs.main[lit] +
1749                                                                 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1750                                                         unsigned total_len = next_len + rep0_len;
1751                                                         if (cost < (cur_node + total_len)->cost) {
1752                                                                 (cur_node + total_len)->cost = cost;
1753                                                                 (cur_node + total_len)->item =
1754                                                                         OPTIMUM_GAP_MATCH |
1755                                                                         ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1756                                                                         rep0_len;
1757                                                                 MATCH_BEFORE_GAP(cur_node + total_len) =
1758                                                                         (adjusted_offset << OPTIMUM_OFFSET_SHIFT) |
1759                                                                         (next_len - 1);
1760                                                         }
1761                                                 }
1762                                         }
1763                                 #endif /* CONSIDER_GAP_MATCHES */
1764                                         break;
1765                                 }
1766                         }
1767                 }
1768
1769         done_matches:
1770
1771                 /* Consider coding a literal.
1772
1773                  * To avoid an extra branch, actually checking the preferability
1774                  * of coding the literal is integrated into the queue update
1775                  * code below. */
1776                 literal = *in_next++;
1777                 cost = cur_node->cost + c->costs.main[literal];
1778
1779                 /* Advance to the next position. */
1780                 cur_node++;
1781
1782                 /* The lowest-cost path to the current position is now known.
1783                  * Finalize the recent offsets queue that results from taking
1784                  * this lowest-cost path. */
1785
1786                 if (cost <= cur_node->cost) {
1787                         /* Literal: queue remains unchanged. */
1788                         cur_node->cost = cost;
1789                         cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1790                         QUEUE(cur_node) = QUEUE(cur_node - 1);
1791                 } else {
1792                         /* Match: queue update is needed. */
1793                         unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1794                 #if CONSIDER_GAP_MATCHES
1795                         s32 adjusted_offset = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1796                         STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1797                 #else
1798                         u32 adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1799                 #endif
1800
1801                         if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) {
1802                                 /* Explicit offset match: insert offset at front. */
1803                                 QUEUE(cur_node) =
1804                                         lzx_lru_queue_push(QUEUE(cur_node - len),
1805                                                            adjusted_offset - LZX_OFFSET_ADJUSTMENT);
1806                         }
1807                 #if CONSIDER_GAP_MATCHES
1808                         else if (adjusted_offset < 0) {
1809                                 /* "Gap match": Explicit offset match, then a
1810                                  * literal, then rep0 match.  Save the explicit
1811                                  * offset match information in the cost field of
1812                                  * the previous node, which isn't needed
1813                                  * anymore.  Then insert the offset at the front
1814                                  * of the queue. */
1815                                 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1816                                 (cur_node - 1)->cost = match_before_gap;
1817                                 QUEUE(cur_node) =
1818                                         lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1819                                                                  (match_before_gap & OPTIMUM_LEN_MASK)),
1820                                                            (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1821                                                            LZX_OFFSET_ADJUSTMENT);
1822                         }
1823                 #endif
1824                         else {
1825                                 /* Repeat offset match: swap offset to front. */
1826                                 QUEUE(cur_node) =
1827                                         lzx_lru_queue_swap(QUEUE(cur_node - len),
1828                                                            adjusted_offset);
1829                         }
1830                 }
1831         } while (cur_node != end_node);
1832
1833         /* Return the recent offsets queue at the end of the path. */
1834         return QUEUE(cur_node);
1835 }
1836
1837 /*
1838  * Given the costs for the main and length codewords (c->costs.main and
1839  * c->costs.len), initialize the match cost array (c->costs.match_cost) which
1840  * directly provides the cost of every possible (length, offset slot) pair.
1841  */
1842 static void
1843 lzx_compute_match_costs(struct lzx_compressor *c)
1844 {
1845         unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1846                                         LZX_NUM_LEN_HEADERS;
1847         struct lzx_costs *costs = &c->costs;
1848         unsigned main_symbol = LZX_NUM_CHARS;
1849
1850         for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1851              offset_slot++)
1852         {
1853                 u32 extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST;
1854                 unsigned i;
1855
1856         #if CONSIDER_ALIGNED_COSTS
1857                 if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT)
1858                         extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
1859         #endif
1860
1861                 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1862                         costs->match_cost[offset_slot][i] =
1863                                 costs->main[main_symbol++] + extra_cost;
1864                 }
1865
1866                 extra_cost += costs->main[main_symbol++];
1867
1868                 for (; i < LZX_NUM_LENS; i++) {
1869                         costs->match_cost[offset_slot][i] =
1870                                 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1871                                 extra_cost;
1872                 }
1873         }
1874 }
1875
1876 /*
1877  * Fast approximation for log2f(x).  This is not as accurate as the standard C
1878  * version.  It does not need to be perfectly accurate because it is only used
1879  * for estimating symbol costs, which is very approximate anyway.
1880  */
1881 static float
1882 log2f_fast(float x)
1883 {
1884         union {
1885                 float f;
1886                 s32 i;
1887         } u = { .f = x };
1888
1889         /* Extract the exponent and subtract 127 to remove the bias.  This gives
1890          * the integer part of the result. */
1891         float res = ((u.i >> 23) & 0xFF) - 127;
1892
1893         /* Set the exponent to 0 (plus bias of 127).  This transforms the number
1894          * to the range [1, 2) while retaining the same mantissa. */
1895         u.i = (u.i & ~(0xFF << 23)) | (127 << 23);
1896
1897         /*
1898          * Approximate the log2 of the transformed number using a degree 2
1899          * interpolating polynomial for log2(x) over the interval [1, 2).  Then
1900          * add this to the extracted exponent to produce the final approximation
1901          * of log2(x).
1902          *
1903          * The coefficients of the interpolating polynomial used here were found
1904          * using the script tools/log2_interpolation.r.
1905          */
1906         return res - 1.653124006f + u.f * (1.9941812f - u.f * 0.3347490189f);
1907
1908 }
1909
1910 /*
1911  * Return the estimated cost of a symbol which has been estimated to have the
1912  * given probability.
1913  */
1914 static u32
1915 lzx_cost_for_probability(float prob)
1916 {
1917         /*
1918          * The basic formula is:
1919          *
1920          *      entropy = -log2(probability)
1921          *
1922          * Use this to get the cost in fractional bits.  Then multiply by our
1923          * scaling factor of BIT_COST and convert to an integer.
1924          *
1925          * In addition, the minimum cost is BIT_COST (one bit) because the
1926          * entropy coding method will be Huffman codes.
1927          *
1928          * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may
1929          * be positive due to inaccuracy in our log2 approximation.  Therefore,
1930          * we cannot, in general, assume the computed cost is non-negative, and
1931          * we should make sure negative costs get rounded up correctly.
1932          */
1933         s32 cost = -log2f_fast(prob) * BIT_COST;
1934         return max(cost, BIT_COST);
1935 }
1936
1937 /*
1938  * Mapping: number of used literals => heuristic probability of a literal times
1939  * 6870.  Generated by running this R command:
1940  *
1941  *      cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
1942  */
1943 static const u8 literal_scaled_probs[257] = {
1944         255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
1945         219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
1946         189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
1947         162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
1948         139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
1949         120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
1950         103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
1951         86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
1952         70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
1953         58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
1954         48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
1955         39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
1956         32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
1957         27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
1958         22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
1959         18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
1960 };
1961
1962 /*
1963  * Mapping: length symbol => default cost of that symbol.  This is derived from
1964  * sample data but has been slightly edited to add more bias towards the
1965  * shortest lengths, which are the most common.
1966  */
1967 static const u16 lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
1968         300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
1969         503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
1970         659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
1971         666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
1972         706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
1973         728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
1974         738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
1975         747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
1976         753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
1977         754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
1978         758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
1979         750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
1980         757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
1981         762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
1982         763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
1983         762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
1984         764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
1985         762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
1986 };
1987
1988 /* Set default costs to bootstrap the iterative optimization algorithm. */
1989 static void
1990 lzx_set_default_costs(struct lzx_compressor *c)
1991 {
1992         unsigned i;
1993         u32 num_literals = 0;
1994         u32 num_used_literals = 0;
1995         float inv_num_matches = 1.0f / c->freqs.main[LZX_NUM_CHARS];
1996         float inv_num_items;
1997         float prob_match = 1.0f;
1998         u32 match_cost;
1999         float base_literal_prob;
2000
2001         /* Some numbers here have been hardcoded to assume a bit cost of 64. */
2002         STATIC_ASSERT(BIT_COST == 64);
2003
2004         /* Estimate the number of literals that will used.  'num_literals' is
2005          * the total number, whereas 'num_used_literals' is the number of
2006          * distinct symbols. */
2007         for (i = 0; i < LZX_NUM_CHARS; i++) {
2008                 num_literals += c->freqs.main[i];
2009                 num_used_literals += (c->freqs.main[i] != 0);
2010         }
2011
2012         /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'.  We
2013          * don't attempt to estimate which ones will be used. */
2014
2015         inv_num_items = 1.0f / (num_literals + c->freqs.main[LZX_NUM_CHARS]);
2016         base_literal_prob = literal_scaled_probs[num_used_literals] *
2017                             (1.0f / 6870.0f);
2018
2019         /* Literal costs.  We use two different methods to compute the
2020          * probability of each literal and mix together their results. */
2021         for (i = 0; i < LZX_NUM_CHARS; i++) {
2022                 u32 freq = c->freqs.main[i];
2023                 if (freq != 0) {
2024                         float prob = 0.5f * ((freq * inv_num_items) +
2025                                              base_literal_prob);
2026                         c->costs.main[i] = lzx_cost_for_probability(prob);
2027                         prob_match -= prob;
2028                 } else {
2029                         c->costs.main[i] = 11 * BIT_COST;
2030                 }
2031         }
2032
2033         /* Match header costs.  We just assume that all match headers are
2034          * equally probable, but we do take into account the relative cost of a
2035          * match header vs. a literal depending on how common matches are
2036          * expected to be vs. literals. */
2037         prob_match = max(prob_match, 0.15f);
2038         match_cost = lzx_cost_for_probability(prob_match / (c->num_main_syms -
2039                                                             LZX_NUM_CHARS));
2040         for (; i < c->num_main_syms; i++)
2041                 c->costs.main[i] = match_cost;
2042
2043         /* Length symbol costs.  These are just set to fixed values which
2044          * reflect the fact the smallest lengths are typically the most common,
2045          * and therefore are typically the cheapest. */
2046         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2047                 c->costs.len[i] = lzx_default_len_costs[i];
2048
2049 #if CONSIDER_ALIGNED_COSTS
2050         /* Aligned offset symbol costs.  These are derived from the estimated
2051          * probability of each aligned offset symbol. */
2052         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2053                 /* We intentionally tallied the frequencies in the wrong slots,
2054                  * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the
2055                  * fixup here is faster: a constant 8 subtractions here vs. one
2056                  * addition for every match. */
2057                 unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK;
2058                 if (c->freqs.aligned[j] != 0) {
2059                         float prob = c->freqs.aligned[j] * inv_num_matches;
2060                         c->costs.aligned[i] = lzx_cost_for_probability(prob);
2061                 } else {
2062                         c->costs.aligned[i] =
2063                                 (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST;
2064                 }
2065         }
2066 #endif
2067 }
2068
2069 /* Update the current cost model to reflect the computed Huffman codes.  */
2070 static void
2071 lzx_set_costs_from_codes(struct lzx_compressor *c)
2072 {
2073         unsigned i;
2074         const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
2075
2076         for (i = 0; i < c->num_main_syms; i++) {
2077                 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
2078                                     MAIN_CODEWORD_LIMIT) * BIT_COST;
2079         }
2080
2081         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
2082                 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
2083                                    LENGTH_CODEWORD_LIMIT) * BIT_COST;
2084         }
2085
2086 #if CONSIDER_ALIGNED_COSTS
2087         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2088                 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
2089                                        ALIGNED_CODEWORD_LIMIT) * BIT_COST;
2090         }
2091 #endif
2092 }
2093
2094 /*
2095  * Choose a "near-optimal" literal/match sequence to use for the current block,
2096  * then flush the block.  Because the cost of each Huffman symbol is unknown
2097  * until the Huffman codes have been built and the Huffman codes themselves
2098  * depend on the symbol frequencies, this uses an iterative optimization
2099  * algorithm to approximate an optimal solution.  The first optimization pass
2100  * for the block uses default costs; additional passes use costs derived from
2101  * the Huffman codes computed in the previous pass.
2102  */
2103 static forceinline struct lzx_lru_queue
2104 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
2105                              struct lzx_output_bitstream * const restrict os,
2106                              const u8 * const restrict block_begin,
2107                              const u32 block_size,
2108                              const struct lzx_lru_queue initial_queue,
2109                              bool is_16_bit)
2110 {
2111         unsigned num_passes_remaining = c->num_optim_passes;
2112         struct lzx_lru_queue new_queue;
2113         u32 seq_idx;
2114
2115         lzx_set_default_costs(c);
2116
2117         for (;;) {
2118                 lzx_compute_match_costs(c);
2119                 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
2120                                                    initial_queue, is_16_bit);
2121
2122                 if (--num_passes_remaining == 0)
2123                         break;
2124
2125                 /* At least one optimization pass remains.  Update the costs. */
2126                 lzx_reset_symbol_frequencies(c);
2127                 lzx_tally_item_list(c, block_size, is_16_bit);
2128                 lzx_build_huffman_codes(c);
2129                 lzx_set_costs_from_codes(c);
2130         }
2131
2132         /* Done optimizing.  Generate the sequence list and flush the block. */
2133         lzx_reset_symbol_frequencies(c);
2134         seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
2135         lzx_flush_block(c, os, block_begin, block_size, seq_idx);
2136         return new_queue;
2137 }
2138
2139 /*
2140  * This is the "near-optimal" LZX compressor.
2141  *
2142  * For each block, it performs a relatively thorough graph search to find an
2143  * inexpensive (in terms of compressed size) way to output the block.
2144  *
2145  * Note: there are actually many things this algorithm leaves on the table in
2146  * terms of compression ratio.  So although it may be "near-optimal", it is
2147  * certainly not "optimal".  The goal is not to produce the optimal compression
2148  * ratio, which for LZX is probably impossible within any practical amount of
2149  * time, but rather to produce a compression ratio significantly better than a
2150  * simpler "greedy" or "lazy" parse while still being relatively fast.
2151  */
2152 static forceinline void
2153 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2154                           const u8 * const restrict in_begin, size_t in_nbytes,
2155                           struct lzx_output_bitstream * restrict os,
2156                           bool is_16_bit)
2157 {
2158         const u8 *       in_next = in_begin;
2159         const u8 * const in_end  = in_begin + in_nbytes;
2160         u32 max_len = LZX_MAX_MATCH_LEN;
2161         u32 nice_len = min(c->nice_match_length, max_len);
2162         u32 next_hashes[2] = {0, 0};
2163         struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2164
2165         /* Initialize the matchfinder. */
2166         CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2167
2168         do {
2169                 /* Starting a new block */
2170
2171                 const u8 * const in_block_begin = in_next;
2172                 const u8 * const in_max_block_end =
2173                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2174                 struct lz_match *cache_ptr = c->match_cache;
2175                 const u8 *next_search_pos = in_next;
2176                 const u8 *next_observation = in_next;
2177                 const u8 *next_pause_point =
2178                         min(in_next + min(MIN_BLOCK_SIZE,
2179                                           in_max_block_end - in_next),
2180                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2181                                                    in_max_block_end - in_next));
2182
2183                 lzx_init_block_split_stats(&c->split_stats);
2184                 lzx_reset_symbol_frequencies(c);
2185
2186                 if (in_next >= next_pause_point)
2187                         goto pause;
2188
2189                 /*
2190                  * Run the input buffer through the matchfinder, caching the
2191                  * matches, until we decide to end the block.
2192                  *
2193                  * For a tighter matchfinding loop, we compute a "pause point",
2194                  * which is the next position at which we may need to check
2195                  * whether to end the block or to decrease max_len.  We then
2196                  * only do these extra checks upon reaching the pause point.
2197                  */
2198         resume_matchfinding:
2199                 do {
2200                         if (in_next >= next_search_pos) {
2201                                 /* Search for matches at this position. */
2202                                 struct lz_match *lz_matchptr;
2203                                 u32 best_len;
2204
2205                                 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2206                                                          bt_matchfinder_get_matches,
2207                                                          in_begin,
2208                                                          in_next - in_begin,
2209                                                          max_len,
2210                                                          nice_len,
2211                                                          c->max_search_depth,
2212                                                          next_hashes,
2213                                                          &best_len,
2214                                                          cache_ptr + 1);
2215                                 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2216                                 cache_ptr = lz_matchptr;
2217
2218                                 /* Accumulate literal/match statistics for block
2219                                  * splitting and for generating the initial cost
2220                                  * model. */
2221                                 if (in_next >= next_observation) {
2222                                         best_len = cache_ptr[-1].length;
2223                                         if (best_len >= 3) {
2224                                                 /* Match (len >= 3) */
2225
2226                                                 /*
2227                                                  * Note: for performance reasons this has
2228                                                  * been simplified significantly:
2229                                                  *
2230                                                  * - We wait until later to account for
2231                                                  *   LZX_OFFSET_ADJUSTMENT.
2232                                                  * - We don't account for repeat offsets.
2233                                                  * - We don't account for different match headers.
2234                                                  */
2235                                                 c->freqs.aligned[cache_ptr[-1].offset &
2236                                                         LZX_ALIGNED_OFFSET_BITMASK]++;
2237                                                 c->freqs.main[LZX_NUM_CHARS]++;
2238
2239                                                 lzx_observe_match(&c->split_stats, best_len);
2240                                                 next_observation = in_next + best_len;
2241                                         } else {
2242                                                 /* Literal */
2243                                                 c->freqs.main[*in_next]++;
2244                                                 lzx_observe_literal(&c->split_stats, *in_next);
2245                                                 next_observation = in_next + 1;
2246                                         }
2247                                 }
2248
2249                                 /*
2250                                  * If there was a very long match found, then
2251                                  * don't cache any matches for the bytes covered
2252                                  * by that match.  This avoids degenerate
2253                                  * behavior when compressing highly redundant
2254                                  * data, where the number of matches can be very
2255                                  * large.
2256                                  *
2257                                  * This heuristic doesn't actually hurt the
2258                                  * compression ratio *too* much.  If there's a
2259                                  * long match, then the data must be highly
2260                                  * compressible, so it doesn't matter as much
2261                                  * what we do.
2262                                  */
2263                                 if (best_len >= nice_len)
2264                                         next_search_pos = in_next + best_len;
2265                         } else {
2266                                 /* Don't search for matches at this position. */
2267                                 CALL_BT_MF(is_16_bit, c,
2268                                            bt_matchfinder_skip_position,
2269                                            in_begin,
2270                                            in_next - in_begin,
2271                                            nice_len,
2272                                            c->max_search_depth,
2273                                            next_hashes);
2274                                 cache_ptr->length = 0;
2275                                 cache_ptr++;
2276                         }
2277                 } while (++in_next < next_pause_point &&
2278                          likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2279
2280         pause:
2281
2282                 /* Adjust max_len and nice_len if we're nearing the end of the
2283                  * input buffer.  In addition, if we are so close to the end of
2284                  * the input buffer that there cannot be any more matches, then
2285                  * just advance through the last few positions and record no
2286                  * matches. */
2287                 if (unlikely(max_len > in_end - in_next)) {
2288                         max_len = in_end - in_next;
2289                         nice_len = min(max_len, nice_len);
2290                         if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2291                                 while (in_next != in_end) {
2292                                         cache_ptr->length = 0;
2293                                         cache_ptr++;
2294                                         in_next++;
2295                                 }
2296                         }
2297                 }
2298
2299                 /* End the block if the match cache may overflow. */
2300                 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2301                         goto end_block;
2302
2303                 /* End the block if the soft maximum size has been reached. */
2304                 if (in_next >= in_max_block_end)
2305                         goto end_block;
2306
2307                 /* End the block if the block splitting algorithm thinks this is
2308                  * a good place to do so. */
2309                 if (c->split_stats.num_new_observations >=
2310                                 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2311                     in_max_block_end - in_next >= MIN_BLOCK_SIZE &&
2312                     lzx_should_end_block(&c->split_stats))
2313                         goto end_block;
2314
2315                 /* It's not time to end the block yet.  Compute the next pause
2316                  * point and resume matchfinding. */
2317                 next_pause_point =
2318                         min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2319                                             c->split_stats.num_new_observations,
2320                                           in_max_block_end - in_next),
2321                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2322                                                    in_max_block_end - in_next));
2323                 goto resume_matchfinding;
2324
2325         end_block:
2326                 /* We've decided on a block boundary and cached matches.  Now
2327                  * choose a match/literal sequence and flush the block. */
2328                 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2329                                                      in_next - in_block_begin,
2330                                                      queue, is_16_bit);
2331         } while (in_next != in_end);
2332 }
2333
2334 static void
2335 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2336                              size_t in_nbytes, struct lzx_output_bitstream *os)
2337 {
2338         lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2339 }
2340
2341 static void
2342 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2343                              size_t in_nbytes, struct lzx_output_bitstream *os)
2344 {
2345         lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2346 }
2347
2348 /******************************************************************************/
2349 /*                     Faster ("lazy") compression algorithm                  */
2350 /*----------------------------------------------------------------------------*/
2351
2352 /*
2353  * Called when the compressor chooses to use a literal.  This tallies the
2354  * Huffman symbol for the literal, increments the current literal run length,
2355  * and "observes" the literal for the block split statistics.
2356  */
2357 static forceinline void
2358 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2359 {
2360         lzx_observe_literal(&c->split_stats, literal);
2361         c->freqs.main[literal]++;
2362         ++*litrunlen_p;
2363 }
2364
2365 /*
2366  * Called when the compressor chooses to use a match.  This tallies the Huffman
2367  * symbol(s) for a match, saves the match data and the length of the preceding
2368  * literal run, updates the recent offsets queue, and "observes" the match for
2369  * the block split statistics.
2370  */
2371 static forceinline void
2372 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
2373                  u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2374                  u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2375 {
2376         u32 litrunlen = *litrunlen_p;
2377         struct lzx_sequence *next_seq = *next_seq_p;
2378         unsigned offset_slot;
2379         unsigned v;
2380
2381         lzx_observe_match(&c->split_stats, length);
2382
2383         v = length - LZX_MIN_MATCH_LEN;
2384
2385         /* Save the literal run length and adjusted length. */
2386         next_seq->litrunlen = litrunlen;
2387         next_seq->adjusted_length = v;
2388
2389         /* Compute the length header, then tally the length symbol if needed. */
2390         if (v >= LZX_NUM_PRIMARY_LENS) {
2391                 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
2392                 v = LZX_NUM_PRIMARY_LENS;
2393         }
2394
2395         /* Compute the offset slot. */
2396         offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
2397
2398         /* Compute the match header. */
2399         v += offset_slot * LZX_NUM_LEN_HEADERS;
2400
2401         /* Save the adjusted offset and match header. */
2402         next_seq->adjusted_offset_and_match_hdr = (adjusted_offset << 9) | v;
2403
2404         /* Tally the main symbol. */
2405         c->freqs.main[LZX_NUM_CHARS + v]++;
2406
2407         /* Update the recent offsets queue. */
2408         if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
2409                 /* Repeat offset match. */
2410                 swap(recent_offsets[0], recent_offsets[adjusted_offset]);
2411         } else {
2412                 /* Explicit offset match. */
2413
2414                 /* Tally the aligned offset symbol if needed. */
2415                 if (adjusted_offset >= 16)
2416                         c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
2417
2418                 recent_offsets[2] = recent_offsets[1];
2419                 recent_offsets[1] = recent_offsets[0];
2420                 recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
2421         }
2422
2423         /* Reset the literal run length and advance to the next sequence. */
2424         *next_seq_p = next_seq + 1;
2425         *litrunlen_p = 0;
2426 }
2427
2428 /*
2429  * Called when the compressor ends a block.  This finshes the last lzx_sequence,
2430  * which is just a literal run with no following match.  This literal run might
2431  * be empty.
2432  */
2433 static forceinline void
2434 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2435 {
2436         last_seq->litrunlen = litrunlen;
2437
2438         /* Special value to mark last sequence */
2439         last_seq->adjusted_offset_and_match_hdr = 0x80000000;
2440 }
2441
2442 /*
2443  * Find the longest repeat offset match with the current position.  If a match
2444  * is found, return its length and set *best_rep_idx_ret to the index of its
2445  * offset in @recent_offsets.  Otherwise, return 0.
2446  *
2447  * Don't bother with length 2 matches; consider matches of length >= 3 only.
2448  * Also assume that max_len >= 3.
2449  */
2450 static unsigned
2451 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2452                                      const u32 recent_offsets[],
2453                                      const unsigned max_len,
2454                                      unsigned *best_rep_idx_ret)
2455 {
2456         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2457
2458         const u32 seq3 = load_u24_unaligned(in_next);
2459         const u8 *matchptr;
2460         unsigned best_rep_len = 0;
2461         unsigned best_rep_idx = 0;
2462         unsigned rep_len;
2463
2464         /* Check for rep0 match (most recent offset) */
2465         matchptr = in_next - recent_offsets[0];
2466         if (load_u24_unaligned(matchptr) == seq3)
2467                 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2468
2469         /* Check for rep1 match (second most recent offset) */
2470         matchptr = in_next - recent_offsets[1];
2471         if (load_u24_unaligned(matchptr) == seq3) {
2472                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2473                 if (rep_len > best_rep_len) {
2474                         best_rep_len = rep_len;
2475                         best_rep_idx = 1;
2476                 }
2477         }
2478
2479         /* Check for rep2 match (third most recent offset) */
2480         matchptr = in_next - recent_offsets[2];
2481         if (load_u24_unaligned(matchptr) == seq3) {
2482                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2483                 if (rep_len > best_rep_len) {
2484                         best_rep_len = rep_len;
2485                         best_rep_idx = 2;
2486                 }
2487         }
2488
2489         *best_rep_idx_ret = best_rep_idx;
2490         return best_rep_len;
2491 }
2492
2493 /*
2494  * Fast heuristic scoring for lazy parsing: how "good" is this match?
2495  * This is mainly determined by the length: longer matches are better.
2496  * However, we also give a bonus to close (small offset) matches and to repeat
2497  * offset matches, since those require fewer bits to encode.
2498  */
2499
2500 static forceinline unsigned
2501 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2502 {
2503         unsigned score = len;
2504
2505         if (adjusted_offset < 4096)
2506                 score++;
2507         if (adjusted_offset < 256)
2508                 score++;
2509
2510         return score;
2511 }
2512
2513 static forceinline unsigned
2514 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2515 {
2516         return rep_len + 3;
2517 }
2518
2519 /*
2520  * This is the "lazy" LZX compressor.  The basic idea is that before it chooses
2521  * a match, it checks to see if there's a longer match at the next position.  If
2522  * yes, it chooses a literal and continues to the next position.  If no, it
2523  * chooses the match.
2524  *
2525  * Some additional heuristics are used as well.  Repeat offset matches are
2526  * considered favorably and sometimes are chosen immediately.  In addition, long
2527  * matches (at least "nice_len" bytes) are chosen immediately as well.  Finally,
2528  * when we decide whether a match is "better" than another, we take the offset
2529  * into consideration as well as the length.
2530  */
2531 static forceinline void
2532 lzx_compress_lazy(struct lzx_compressor * restrict c,
2533                   const u8 * const restrict in_begin, size_t in_nbytes,
2534                   struct lzx_output_bitstream * restrict os, bool is_16_bit)
2535 {
2536         const u8 *       in_next = in_begin;
2537         const u8 * const in_end  = in_begin + in_nbytes;
2538         unsigned max_len = LZX_MAX_MATCH_LEN;
2539         unsigned nice_len = min(c->nice_match_length, max_len);
2540         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2541         u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
2542         u32 next_hashes[2] = {0, 0};
2543
2544         /* Initialize the matchfinder. */
2545         CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2546
2547         do {
2548                 /* Starting a new block */
2549
2550                 const u8 * const in_block_begin = in_next;
2551                 const u8 * const in_max_block_end =
2552                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2553                 struct lzx_sequence *next_seq = c->chosen_sequences;
2554                 u32 litrunlen = 0;
2555                 unsigned cur_len;
2556                 u32 cur_offset;
2557                 u32 cur_adjusted_offset;
2558                 unsigned cur_score;
2559                 unsigned next_len;
2560                 u32 next_offset;
2561                 u32 next_adjusted_offset;
2562                 unsigned next_score;
2563                 unsigned best_rep_len;
2564                 unsigned best_rep_idx;
2565                 unsigned rep_score;
2566                 unsigned skip_len;
2567
2568                 lzx_reset_symbol_frequencies(c);
2569                 lzx_init_block_split_stats(&c->split_stats);
2570
2571                 do {
2572                         /* Adjust max_len and nice_len if we're nearing the end
2573                          * of the input buffer. */
2574                         if (unlikely(max_len > in_end - in_next)) {
2575                                 max_len = in_end - in_next;
2576                                 nice_len = min(max_len, nice_len);
2577                         }
2578
2579                         /* Find the longest match (subject to the
2580                          * max_search_depth cutoff parameter) with the current
2581                          * position.  Don't bother with length 2 matches; only
2582                          * look for matches of length >= 3. */
2583                         cur_len = CALL_HC_MF(is_16_bit, c,
2584                                              hc_matchfinder_longest_match,
2585                                              in_begin,
2586                                              in_next - in_begin,
2587                                              2,
2588                                              max_len,
2589                                              nice_len,
2590                                              c->max_search_depth,
2591                                              next_hashes,
2592                                              &cur_offset);
2593
2594                         /* If there was no match found, or the only match found
2595                          * was a distant short match, then choose a literal. */
2596                         if (cur_len < 3 ||
2597                             (cur_len == 3 &&
2598                              cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2599                              cur_offset != recent_offsets[0] &&
2600                              cur_offset != recent_offsets[1] &&
2601                              cur_offset != recent_offsets[2]))
2602                         {
2603                                 lzx_choose_literal(c, *in_next, &litrunlen);
2604                                 in_next++;
2605                                 continue;
2606                         }
2607
2608                         /* Heuristic: if this match has the most recent offset,
2609                          * then go ahead and choose it as a rep0 match. */
2610                         if (cur_offset == recent_offsets[0]) {
2611                                 in_next++;
2612                                 skip_len = cur_len - 1;
2613                                 cur_adjusted_offset = 0;
2614                                 goto choose_cur_match;
2615                         }
2616
2617                         /* Compute the longest match's score as an explicit
2618                          * offset match. */
2619                         cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT;
2620                         cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset);
2621
2622                         /* Find the longest repeat offset match at this
2623                          * position.  If we find one and it's "better" than the
2624                          * explicit offset match we found, then go ahead and
2625                          * choose the repeat offset match immediately. */
2626                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2627                                                                             recent_offsets,
2628                                                                             max_len,
2629                                                                             &best_rep_idx);
2630                         in_next++;
2631
2632                         if (best_rep_len != 0 &&
2633                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2634                                                                        best_rep_idx)) >= cur_score)
2635                         {
2636                                 cur_len = best_rep_len;
2637                                 cur_adjusted_offset = best_rep_idx;
2638                                 skip_len = best_rep_len - 1;
2639                                 goto choose_cur_match;
2640                         }
2641
2642                 have_cur_match:
2643                         /*
2644                          * We have a match at the current position.  If the
2645                          * match is very long, then choose it immediately.
2646                          * Otherwise, see if there's a better match at the next
2647                          * position.
2648                          */
2649
2650                         if (cur_len >= nice_len) {
2651                                 skip_len = cur_len - 1;
2652                                 goto choose_cur_match;
2653                         }
2654
2655                         if (unlikely(max_len > in_end - in_next)) {
2656                                 max_len = in_end - in_next;
2657                                 nice_len = min(max_len, nice_len);
2658                         }
2659
2660                         next_len = CALL_HC_MF(is_16_bit, c,
2661                                               hc_matchfinder_longest_match,
2662                                               in_begin,
2663                                               in_next - in_begin,
2664                                               cur_len - 2,
2665                                               max_len,
2666                                               nice_len,
2667                                               c->max_search_depth / 2,
2668                                               next_hashes,
2669                                               &next_offset);
2670
2671                         if (next_len <= cur_len - 2) {
2672                                 /* No potentially better match was found. */
2673                                 in_next++;
2674                                 skip_len = cur_len - 2;
2675                                 goto choose_cur_match;
2676                         }
2677
2678                         next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT;
2679                         next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset);
2680
2681                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2682                                                                             recent_offsets,
2683                                                                             max_len,
2684                                                                             &best_rep_idx);
2685                         in_next++;
2686
2687                         if (best_rep_len != 0 &&
2688                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2689                                                                        best_rep_idx)) >= next_score)
2690                         {
2691
2692                                 if (rep_score > cur_score) {
2693                                         /* The next match is better, and it's a
2694                                          * repeat offset match. */
2695                                         lzx_choose_literal(c, *(in_next - 2),
2696                                                            &litrunlen);
2697                                         cur_len = best_rep_len;
2698                                         cur_adjusted_offset = best_rep_idx;
2699                                         skip_len = cur_len - 1;
2700                                         goto choose_cur_match;
2701                                 }
2702                         } else {
2703                                 if (next_score > cur_score) {
2704                                         /* The next match is better, and it's an
2705                                          * explicit offset match. */
2706                                         lzx_choose_literal(c, *(in_next - 2),
2707                                                            &litrunlen);
2708                                         cur_len = next_len;
2709                                         cur_adjusted_offset = next_adjusted_offset;
2710                                         cur_score = next_score;
2711                                         goto have_cur_match;
2712                                 }
2713                         }
2714
2715                         /* The original match was better; choose it. */
2716                         skip_len = cur_len - 2;
2717
2718                 choose_cur_match:
2719                         /* Choose a match and have the matchfinder skip over its
2720                          * remaining bytes. */
2721                         lzx_choose_match(c, cur_len, cur_adjusted_offset,
2722                                          recent_offsets, is_16_bit,
2723                                          &litrunlen, &next_seq);
2724                         in_next = CALL_HC_MF(is_16_bit, c,
2725                                              hc_matchfinder_skip_positions,
2726                                              in_begin,
2727                                              in_next - in_begin,
2728                                              in_end - in_begin,
2729                                              skip_len,
2730                                              next_hashes);
2731
2732                         /* Keep going until it's time to end the block. */
2733                 } while (in_next < in_max_block_end &&
2734                          !(c->split_stats.num_new_observations >=
2735                                         NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2736                            in_next - in_block_begin >= MIN_BLOCK_SIZE &&
2737                            in_end - in_next >= MIN_BLOCK_SIZE &&
2738                            lzx_should_end_block(&c->split_stats)));
2739
2740                 /* Flush the block. */
2741                 lzx_finish_sequence(next_seq, litrunlen);
2742                 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2743
2744                 /* Keep going until we've reached the end of the input buffer. */
2745         } while (in_next != in_end);
2746 }
2747
2748 static void
2749 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2750                      struct lzx_output_bitstream *os)
2751 {
2752         lzx_compress_lazy(c, in, in_nbytes, os, true);
2753 }
2754
2755 static void
2756 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2757                      struct lzx_output_bitstream *os)
2758 {
2759         lzx_compress_lazy(c, in, in_nbytes, os, false);
2760 }
2761
2762 /******************************************************************************/
2763 /*                          Compressor operations                             */
2764 /*----------------------------------------------------------------------------*/
2765
2766 /*
2767  * Generate tables for mapping match offsets (actually, "adjusted" match
2768  * offsets) to offset slots.
2769  */
2770 static void
2771 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2772 {
2773         u32 adjusted_offset = 0;
2774         unsigned slot = 0;
2775
2776         /* slots [0, 29] */
2777         for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2778              adjusted_offset++)
2779         {
2780                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2781                                        LZX_OFFSET_ADJUSTMENT)
2782                         slot++;
2783                 c->offset_slot_tab_1[adjusted_offset] = slot;
2784         }
2785
2786         /* slots [30, 49] */
2787         for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2788              adjusted_offset += (u32)1 << 14)
2789         {
2790                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
2791                                        LZX_OFFSET_ADJUSTMENT)
2792                         slot++;
2793                 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
2794         }
2795 }
2796
2797 static size_t
2798 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2799 {
2800         if (compression_level <= MAX_FAST_LEVEL) {
2801                 if (lzx_is_16_bit(max_bufsize))
2802                         return offsetof(struct lzx_compressor, hc_mf_16) +
2803                                hc_matchfinder_size_16(max_bufsize);
2804                 else
2805                         return offsetof(struct lzx_compressor, hc_mf_32) +
2806                                hc_matchfinder_size_32(max_bufsize);
2807         } else {
2808                 if (lzx_is_16_bit(max_bufsize))
2809                         return offsetof(struct lzx_compressor, bt_mf_16) +
2810                                bt_matchfinder_size_16(max_bufsize);
2811                 else
2812                         return offsetof(struct lzx_compressor, bt_mf_32) +
2813                                bt_matchfinder_size_32(max_bufsize);
2814         }
2815 }
2816
2817 /* Compute the amount of memory needed to allocate an LZX compressor. */
2818 static u64
2819 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2820                       bool destructive)
2821 {
2822         u64 size = 0;
2823
2824         if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2825                 return 0;
2826
2827         size += lzx_get_compressor_size(max_bufsize, compression_level);
2828         if (!destructive)
2829                 size += max_bufsize; /* account for in_buffer */
2830         return size;
2831 }
2832
2833 /* Allocate an LZX compressor. */
2834 static int
2835 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2836                       bool destructive, void **c_ret)
2837 {
2838         unsigned window_order;
2839         struct lzx_compressor *c;
2840
2841         /* Validate the maximum buffer size and get the window order from it. */
2842         window_order = lzx_get_window_order(max_bufsize);
2843         if (window_order == 0)
2844                 return WIMLIB_ERR_INVALID_PARAM;
2845
2846         /* Allocate the compressor. */
2847         c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2848         if (!c)
2849                 goto oom0;
2850
2851         c->window_order = window_order;
2852         c->num_main_syms = lzx_get_num_main_syms(window_order);
2853         c->destructive = destructive;
2854
2855         /* Allocate the buffer for preprocessed data if needed. */
2856         if (!c->destructive) {
2857                 c->in_buffer = MALLOC(max_bufsize);
2858                 if (!c->in_buffer)
2859                         goto oom1;
2860         }
2861
2862         if (compression_level <= MAX_FAST_LEVEL) {
2863
2864                 /* Fast compression: Use lazy parsing. */
2865                 if (lzx_is_16_bit(max_bufsize))
2866                         c->impl = lzx_compress_lazy_16;
2867                 else
2868                         c->impl = lzx_compress_lazy_32;
2869
2870                 /* Scale max_search_depth and nice_match_length with the
2871                  * compression level. */
2872                 c->max_search_depth = (60 * compression_level) / 20;
2873                 c->nice_match_length = (80 * compression_level) / 20;
2874
2875                 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2876                  * halves the max_search_depth when attempting a lazy match, and
2877                  * max_search_depth must be at least 1. */
2878                 c->max_search_depth = max(c->max_search_depth, 2);
2879         } else {
2880
2881                 /* Normal / high compression: Use near-optimal parsing. */
2882                 if (lzx_is_16_bit(max_bufsize))
2883                         c->impl = lzx_compress_near_optimal_16;
2884                 else
2885                         c->impl = lzx_compress_near_optimal_32;
2886
2887                 /* Scale max_search_depth and nice_match_length with the
2888                  * compression level. */
2889                 c->max_search_depth = (24 * compression_level) / 50;
2890                 c->nice_match_length = (48 * compression_level) / 50;
2891
2892                 /* Also scale num_optim_passes with the compression level.  But
2893                  * the more passes there are, the less they help --- so don't
2894                  * add them linearly.  */
2895                 c->num_optim_passes = 1;
2896                 c->num_optim_passes += (compression_level >= 45);
2897                 c->num_optim_passes += (compression_level >= 70);
2898                 c->num_optim_passes += (compression_level >= 100);
2899                 c->num_optim_passes += (compression_level >= 150);
2900                 c->num_optim_passes += (compression_level >= 200);
2901                 c->num_optim_passes += (compression_level >= 300);
2902
2903                 /* max_search_depth must be at least 1. */
2904                 c->max_search_depth = max(c->max_search_depth, 1);
2905         }
2906
2907         /* Prepare the offset => offset slot mapping. */
2908         lzx_init_offset_slot_tabs(c);
2909
2910         *c_ret = c;
2911         return 0;
2912
2913 oom1:
2914         FREE(c);
2915 oom0:
2916         return WIMLIB_ERR_NOMEM;
2917 }
2918
2919 /* Compress a buffer of data. */
2920 static size_t
2921 lzx_compress(const void *restrict in, size_t in_nbytes,
2922              void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2923 {
2924         struct lzx_compressor *c = _c;
2925         struct lzx_output_bitstream os;
2926         size_t result;
2927
2928         /* Don't bother trying to compress very small inputs. */
2929         if (in_nbytes < 64)
2930                 return 0;
2931
2932         /* If the compressor is in "destructive" mode, then we can directly
2933          * preprocess the input data.  Otherwise, we need to copy it into an
2934          * internal buffer first. */
2935         if (!c->destructive) {
2936                 memcpy(c->in_buffer, in, in_nbytes);
2937                 in = c->in_buffer;
2938         }
2939
2940         /* Preprocess the input data. */
2941         lzx_preprocess((void *)in, in_nbytes);
2942
2943         /* Initially, the previous Huffman codeword lengths are all zeroes. */
2944         c->codes_index = 0;
2945         memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2946
2947         /* Initialize the output bitstream. */
2948         lzx_init_output(&os, out, out_nbytes_avail);
2949
2950         /* Call the compression level-specific compress() function. */
2951         (*c->impl)(c, in, in_nbytes, &os);
2952
2953         /* Flush the output bitstream. */
2954         result = lzx_flush_output(&os);
2955
2956         /* If the data did not compress to less than its original size and we
2957          * preprocessed the original buffer, then postprocess it to restore it
2958          * to its original state. */
2959         if (result == 0 && c->destructive)
2960                 lzx_postprocess((void *)in, in_nbytes);
2961
2962         /* Return the number of compressed bytes, or 0 if the input did not
2963          * compress to less than its original size. */
2964         return result;
2965 }
2966
2967 /* Free an LZX compressor. */
2968 static void
2969 lzx_free_compressor(void *_c)
2970 {
2971         struct lzx_compressor *c = _c;
2972
2973         if (!c->destructive)
2974                 FREE(c->in_buffer);
2975         FREE(c);
2976 }
2977
2978 const struct compressor_ops lzx_compressor_ops = {
2979         .get_needed_memory  = lzx_get_needed_memory,
2980         .create_compressor  = lzx_create_compressor,
2981         .compress           = lzx_compress,
2982         .free_compressor    = lzx_free_compressor,
2983 };