wimlib.net Git - wimlib/blob - src/lzx_compress.c

   1 /*
   2  * lzx_compress.c
   3  *
   4  * A compressor for the LZX compression format, as used in WIM archives.
   5  */
   6
   7 /*
   8  * Copyright (C) 2012-2016 Eric Biggers
   9  *
  10  * This file is free software; you can redistribute it and/or modify it under
  11  * the terms of the GNU Lesser General Public License as published by the Free
  12  * Software Foundation; either version 3 of the License, or (at your option) any
  13  * later version.
  14  *
  15  * This file is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  18  * details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this file; if not, see http://www.gnu.org/licenses/.
  22  */
  23
  24
  25 /*
  26  * This file contains a compressor for the LZX ("Lempel-Ziv eXtended")
  27  * compression format, as used in the WIM (Windows IMaging) file format.
  28  *
  29  * Two different LZX-compatible algorithms are implemented: "near-optimal" and
  30  * "lazy".  "Near-optimal" is significantly slower than "lazy", but results in a
  31  * better compression ratio.  The "near-optimal" algorithm is used at the
  32  * default compression level.
  33  *
  34  * This file may need some slight modifications to be used outside of the WIM
  35  * format.  In particular, in other situations the LZX block header might be
  36  * slightly different, and sliding window support might be required.
  37  *
  38  * LZX is a compression format derived from DEFLATE, the format used by zlib and
  39  * gzip.  Both LZX and DEFLATE use LZ77 matching and Huffman coding.  Certain
  40  * details are quite similar, such as the method for storing Huffman codes.
  41  * However, the main differences are:
  42  *
  43  * - LZX preprocesses the data to attempt to make x86 machine code slightly more
  44  *   compressible before attempting to compress it further.
  45  *
  46  * - LZX uses a "main" alphabet which combines literals and matches, with the
  47  *   match symbols containing a "length header" (giving all or part of the match
  48  *   length) and an "offset slot" (giving, roughly speaking, the order of
  49  *   magnitude of the match offset).
  50  *
  51  * - LZX does not have static Huffman blocks (that is, the kind with preset
  52  *   Huffman codes); however it does have two types of dynamic Huffman blocks
  53  *   ("verbatim" and "aligned").
  54  *
  55  * - LZX has a minimum match length of 2 rather than 3.  Length 2 matches can be
  56  *   useful, but generally only if the compressor is smart about choosing them.
  57  *
  58  * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue
  59  *   of match offsets.  This is very useful for certain types of files, such as
  60  *   binary files that have repeating records.
  61  */
  62
  63 /******************************************************************************/
  64 /*                            General parameters                              */
  65 /*----------------------------------------------------------------------------*/
  66
  67 /*
  68  * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL.  It
  69  * uses the slower algorithm at levels > MAX_FAST_LEVEL.
  70  */
  71 #define MAX_FAST_LEVEL                          34
  72
  73 /*
  74  * The compressor-side limits on the codeword lengths (in bits) for each Huffman
  75  * code.  To make outputting bits slightly faster, some of these limits are
  76  * lower than the limits defined by the LZX format.  This does not significantly
  77  * affect the compression ratio.
  78  */
  79 #define MAIN_CODEWORD_LIMIT                     16
  80 #define LENGTH_CODEWORD_LIMIT                   12
  81 #define ALIGNED_CODEWORD_LIMIT                  7
  82 #define PRE_CODEWORD_LIMIT                      7
  83
  84
  85 /******************************************************************************/
  86 /*                         Block splitting parameters                         */
  87 /*----------------------------------------------------------------------------*/
  88
  89 /*
  90  * The compressor always outputs blocks of at least this size in bytes, except
  91  * for the last block which may need to be smaller.
  92  */
  93 #define MIN_BLOCK_SIZE                          6500
  94
  95 /*
  96  * The compressor attempts to end a block when it reaches this size in bytes.
  97  * The final size might be slightly larger due to matches extending beyond the
  98  * end of the block.  Specifically:
  99  *
 100  *  - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN
 101  *    bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'.
 102  *
 103  *  - The lazy compressor may choose a sequence of literals starting at position
 104  *    'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better
 105  *    matches.  The final match may be up to LZX_MAX_MATCH_LEN bytes.  The
 106  *    length of the literal sequence is approximately limited by the "nice match
 107  *    length" parameter.
 108  */
 109 #define SOFT_MAX_BLOCK_SIZE                     100000
 110
 111 /*
 112  * The number of observed items (matches and literals) that represents
 113  * sufficient data for the compressor to decide whether the current block should
 114  * be ended or not.
 115  */
 116 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK        400
 117
 118
 119 /******************************************************************************/
 120 /*                      Parameters for slower algorithm                       */
 121 /*----------------------------------------------------------------------------*/
 122
 123 /*
 124  * The log base 2 of the number of entries in the hash table for finding length
 125  * 2 matches.  This could be as high as 16, but using a smaller hash table
 126  * speeds up compression due to reduced cache pressure.
 127  */
 128 #define BT_MATCHFINDER_HASH2_ORDER              12
 129
 130 /*
 131  * The number of lz_match structures in the match cache, excluding the extra
 132  * "overflow" entries.  This value should be high enough so that nearly the
 133  * time, all matches found in a given block can fit in the match cache.
 134  * However, fallback behavior (immediately terminating the block) on cache
 135  * overflow is still required.
 136  */
 137 #define CACHE_LENGTH                            (SOFT_MAX_BLOCK_SIZE * 5)
 138
 139 /*
 140  * An upper bound on the number of matches that can ever be saved in the match
 141  * cache for a single position.  Since each match we save for a single position
 142  * has a distinct length, we can use the number of possible match lengths in LZX
 143  * as this bound.  This bound is guaranteed to be valid in all cases, although
 144  * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be
 145  * reached.
 146  */
 147 #define MAX_MATCHES_PER_POS                     LZX_NUM_LENS
 148
 149 /*
 150  * A scaling factor that makes it possible to consider fractional bit costs.  A
 151  * single bit has a cost of BIT_COST.
 152  *
 153  * Note: this is only useful as a statistical trick for when the true costs are
 154  * unknown.  Ultimately, each token in LZX requires a whole number of bits to
 155  * output.
 156  */
 157 #define BIT_COST                                64
 158
 159 /*
 160  * Should the compressor take into account the costs of aligned offset symbols
 161  * instead of assuming that all are equally likely?
 162  */
 163 #define CONSIDER_ALIGNED_COSTS                  1
 164
 165 /*
 166  * Should the "minimum" cost path search algorithm consider "gap" matches, where
 167  * a normal match is followed by a literal, then by a match with the same
 168  * offset?  This is one specific, somewhat common situation in which the true
 169  * minimum cost path is often different from the path found by looking only one
 170  * edge ahead.
 171  */
 172 #define CONSIDER_GAP_MATCHES                    1
 173
 174 /******************************************************************************/
 175 /*                                  Includes                                  */
 176 /*----------------------------------------------------------------------------*/
 177
 178 #ifdef HAVE_CONFIG_H
 179 #  include "config.h"
 180 #endif
 181
 182 #include "wimlib/compress_common.h"
 183 #include "wimlib/compressor_ops.h"
 184 #include "wimlib/error.h"
 185 #include "wimlib/lz_extend.h"
 186 #include "wimlib/lzx_common.h"
 187 #include "wimlib/unaligned.h"
 188 #include "wimlib/util.h"
 189
 190 /* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including
 191  * bt_matchfinder.h. */
 192
 193 /* Matchfinders with 16-bit positions */
 194 #define mf_pos_t        u16
 195 #define MF_SUFFIX       _16
 196 #include "wimlib/bt_matchfinder.h"
 197 #include "wimlib/hc_matchfinder.h"
 198
 199 /* Matchfinders with 32-bit positions */
 200 #undef mf_pos_t
 201 #undef MF_SUFFIX
 202 #define mf_pos_t        u32
 203 #define MF_SUFFIX       _32
 204 #include "wimlib/bt_matchfinder.h"
 205 #include "wimlib/hc_matchfinder.h"
 206
 207 /******************************************************************************/
 208 /*                            Compressor structure                            */
 209 /*----------------------------------------------------------------------------*/
 210
 211 /* Codewords for the Huffman codes */
 212 struct lzx_codewords {
 213         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 214         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 215         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 216 };
 217
 218 /*
 219  * Codeword lengths, in bits, for the Huffman codes.
 220  *
 221  * A codeword length of 0 means the corresponding codeword has zero frequency.
 222  *
 223  * The main and length codes each have one extra entry for use as a sentinel.
 224  * See lzx_write_compressed_code().
 225  */
 226 struct lzx_lens {
 227         u8 main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1];
 228         u8 len[LZX_LENCODE_NUM_SYMBOLS + 1];
 229         u8 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 230 };
 231
 232 /* Codewords and lengths for the Huffman codes */
 233 struct lzx_codes {
 234         struct lzx_codewords codewords;
 235         struct lzx_lens lens;
 236 };
 237
 238 /* Symbol frequency counters for the Huffman-encoded alphabets */
 239 struct lzx_freqs {
 240         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 241         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 242         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 243 };
 244
 245 /* Block split statistics.  See the "Block splitting algorithm" section later in
 246  * this file for details. */
 247 #define NUM_LITERAL_OBSERVATION_TYPES 8
 248 #define NUM_MATCH_OBSERVATION_TYPES 2
 249 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
 250                                NUM_MATCH_OBSERVATION_TYPES)
 251 struct lzx_block_split_stats {
 252         u32 new_observations[NUM_OBSERVATION_TYPES];
 253         u32 observations[NUM_OBSERVATION_TYPES];
 254         u32 num_new_observations;
 255         u32 num_observations;
 256 };
 257
 258 /*
 259  * Represents a run of literals followed by a match or end-of-block.  This
 260  * structure is needed to temporarily store items chosen by the compressor,
 261  * since items cannot be written until all items for the block have been chosen
 262  * and the block's Huffman codes have been computed.
 263  */
 264 struct lzx_sequence {
 265
 266         /* The number of literals in the run.  This may be 0.  The literals are
 267          * not stored explicitly in this structure; instead, they are read
 268          * directly from the uncompressed data.  */
 269         u16 litrunlen;
 270
 271         /* If the next field doesn't indicate end-of-block, then this is the
 272          * match length minus LZX_MIN_MATCH_LEN.  */
 273         u16 adjusted_length;
 274
 275         /* If bit 31 is clear, then this field contains the match header in bits
 276          * 0-8, and either the match offset plus LZX_OFFSET_ADJUSTMENT or a
 277          * recent offset code in bits 9-30.  Otherwise (if bit 31 is set), this
 278          * sequence's literal run was the last literal run in the block, so
 279          * there is no match that follows it.  */
 280         u32 adjusted_offset_and_match_hdr;
 281 };
 282
 283 /*
 284  * This structure represents a byte position in the input buffer and a node in
 285  * the graph of possible match/literal choices.
 286  *
 287  * Logically, each incoming edge to this node is labeled with a literal or a
 288  * match that can be taken to reach this position from an earlier position; and
 289  * each outgoing edge from this node is labeled with a literal or a match that
 290  * can be taken to advance from this position to a later position.
 291  */
 292 struct lzx_optimum_node {
 293
 294         /* The cost, in bits, of the lowest-cost path that has been found to
 295          * reach this position.  This can change as progressively lower cost
 296          * paths are found to reach this position.  */
 297         u32 cost;
 298
 299         /*
 300          * The best arrival to this node, i.e. the match or literal that was
 301          * used to arrive to this position at the given 'cost'.  This can change
 302          * as progressively lower cost paths are found to reach this position.
 303          *
 304          * For non-gap matches, this variable is divided into two bitfields
 305          * whose meanings depend on the item type:
 306          *
 307          * Literals:
 308          *      Low bits are 0, high bits are the literal.
 309          *
 310          * Explicit offset matches:
 311          *      Low bits are the match length, high bits are the offset plus
 312          *      LZX_OFFSET_ADJUSTMENT.
 313          *
 314          * Repeat offset matches:
 315          *      Low bits are the match length, high bits are the queue index.
 316          *
 317          * For gap matches, identified by OPTIMUM_GAP_MATCH set, special
 318          * behavior applies --- see the code.
 319          */
 320         u32 item;
 321 #define OPTIMUM_OFFSET_SHIFT 9
 322 #define OPTIMUM_LEN_MASK ((1 << OPTIMUM_OFFSET_SHIFT) - 1)
 323 #if CONSIDER_GAP_MATCHES
 324 #  define OPTIMUM_GAP_MATCH 0x80000000
 325 #endif
 326
 327 } _aligned_attribute(8);
 328
 329 /* The cost model for near-optimal parsing */
 330 struct lzx_costs {
 331
 332         /*
 333          * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a
 334          * length 'len' match which has an offset belonging to 'offset_slot'.
 335          * The cost includes the main symbol, the length symbol if required, and
 336          * the extra offset bits if any, excluding any entropy-coded bits
 337          * (aligned offset bits).  It does *not* include the cost of the aligned
 338          * offset symbol which may be required.
 339          */
 340         u16 match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
 341
 342         /* Cost of each symbol in the main code */
 343         u32 main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
 344
 345         /* Cost of each symbol in the length code */
 346         u32 len[LZX_LENCODE_NUM_SYMBOLS];
 347
 348 #if CONSIDER_ALIGNED_COSTS
 349         /* Cost of each symbol in the aligned offset code */
 350         u32 aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
 351 #endif
 352 };
 353
 354 struct lzx_output_bitstream;
 355
 356 /* The main LZX compressor structure */
 357 struct lzx_compressor {
 358
 359         /* The buffer for preprocessed input data, if not using destructive
 360          * compression */
 361         void *in_buffer;
 362
 363         /* If true, then the compressor need not preserve the input buffer if it
 364          * compresses the data successfully */
 365         bool destructive;
 366
 367         /* Pointer to the compress() implementation chosen at allocation time */
 368         void (*impl)(struct lzx_compressor *, const u8 *, size_t,
 369                      struct lzx_output_bitstream *);
 370
 371         /* The log base 2 of the window size for match offset encoding purposes.
 372          * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
 373         unsigned window_order;
 374
 375         /* The number of symbols in the main alphabet.  This depends on the
 376          * window order, since the window order determines the maximum possible
 377          * match offset. */
 378         unsigned num_main_syms;
 379
 380         /* The "nice" match length: if a match of this length is found, then it
 381          * is chosen immediately without further consideration. */
 382         unsigned nice_match_length;
 383
 384         /* The maximum search depth: at most this many potential matches are
 385          * considered at each position. */
 386         unsigned max_search_depth;
 387
 388         /* The number of optimization passes per block */
 389         unsigned num_optim_passes;
 390
 391         /* The symbol frequency counters for the current block */
 392         struct lzx_freqs freqs;
 393
 394         /* Block split statistics for the current block */
 395         struct lzx_block_split_stats split_stats;
 396
 397         /* The Huffman codes for the current and previous blocks.  The one with
 398          * index 'codes_index' is for the current block, and the other one is
 399          * for the previous block. */
 400         struct lzx_codes codes[2];
 401         unsigned codes_index;
 402
 403         /* The matches and literals that the compressor has chosen for the
 404          * current block.  The required length of this array is limited by the
 405          * maximum number of matches that can ever be chosen for a single block,
 406          * plus one for the special entry at the end. */
 407         struct lzx_sequence chosen_sequences[
 408                        DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1];
 409
 410         /* Tables for mapping adjusted offsets to offset slots */
 411         u8 offset_slot_tab_1[32768]; /* offset slots [0, 29] */
 412         u8 offset_slot_tab_2[128]; /* offset slots [30, 49] */
 413
 414         union {
 415                 /* Data for lzx_compress_lazy() */
 416                 struct {
 417                         /* Hash chains matchfinder (MUST BE LAST!!!) */
 418                         union {
 419                                 struct hc_matchfinder_16 hc_mf_16;
 420                                 struct hc_matchfinder_32 hc_mf_32;
 421                         };
 422                 };
 423
 424                 /* Data for lzx_compress_near_optimal() */
 425                 struct {
 426                         /*
 427                          * Array of nodes, one per position, for running the
 428                          * minimum-cost path algorithm.
 429                          *
 430                          * This array must be large enough to accommodate the
 431                          * worst-case number of nodes, which occurs if the
 432                          * compressor finds a match of length LZX_MAX_MATCH_LEN
 433                          * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a
 434                          * block of size 'SOFT_MAX_BLOCK_SIZE - 1 +
 435                          * LZX_MAX_MATCH_LEN'.  Add one for the end-of-block
 436                          * node.
 437                          */
 438                         struct lzx_optimum_node optimum_nodes[
 439                                                     SOFT_MAX_BLOCK_SIZE - 1 +
 440                                                     LZX_MAX_MATCH_LEN + 1];
 441
 442                         /* The cost model for the current optimization pass */
 443                         struct lzx_costs costs;
 444
 445                         /*
 446                          * Cached matches for the current block.  This array
 447                          * contains the matches that were found at each position
 448                          * in the block.  Specifically, for each position, there
 449                          * is a special 'struct lz_match' whose 'length' field
 450                          * contains the number of matches that were found at
 451                          * that position; this is followed by the matches
 452                          * themselves, if any, sorted by strictly increasing
 453                          * length.
 454                          *
 455                          * Note: in rare cases, there will be a very high number
 456                          * of matches in the block and this array will overflow.
 457                          * If this happens, we force the end of the current
 458                          * block.  CACHE_LENGTH is the length at which we
 459                          * actually check for overflow.  The extra slots beyond
 460                          * this are enough to absorb the worst case overflow,
 461                          * which occurs if starting at &match_cache[CACHE_LENGTH
 462                          * - 1], we write the match count header, then write
 463                          * MAX_MATCHES_PER_POS matches, then skip searching for
 464                          * matches at 'LZX_MAX_MATCH_LEN - 1' positions and
 465                          * write the match count header for each.
 466                          */
 467                         struct lz_match match_cache[CACHE_LENGTH +
 468                                                     MAX_MATCHES_PER_POS +
 469                                                     LZX_MAX_MATCH_LEN - 1];
 470
 471                         /* Binary trees matchfinder (MUST BE LAST!!!) */
 472                         union {
 473                                 struct bt_matchfinder_16 bt_mf_16;
 474                                 struct bt_matchfinder_32 bt_mf_32;
 475                         };
 476                 };
 477         };
 478 };
 479
 480 /******************************************************************************/
 481 /*                            Matchfinder utilities                           */
 482 /*----------------------------------------------------------------------------*/
 483
 484 /*
 485  * Will a matchfinder using 16-bit positions be sufficient for compressing
 486  * buffers of up to the specified size?  The limit could be 65536 bytes, but we
 487  * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case.
 488  * This requires that the limit be no more than the length of offset_slot_tab_1
 489  * (currently 32768).
 490  */
 491 static inline bool
 492 lzx_is_16_bit(size_t max_bufsize)
 493 {
 494         STATIC_ASSERT(ARRAY_LEN(((struct lzx_compressor *)0)->offset_slot_tab_1) == 32768);
 495         return max_bufsize <= 32768;
 496 }
 497
 498 /*
 499  * Return the offset slot for the specified adjusted match offset.
 500  */
 501 static inline unsigned
 502 lzx_get_offset_slot(struct lzx_compressor *c, u32 adjusted_offset,
 503                     bool is_16_bit)
 504 {
 505         if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
 506                 return c->offset_slot_tab_1[adjusted_offset];
 507         return c->offset_slot_tab_2[adjusted_offset >> 14];
 508 }
 509
 510 /*
 511  * The following macros call either the 16-bit or the 32-bit version of a
 512  * matchfinder function based on the value of 'is_16_bit', which will be known
 513  * at compilation time.
 514  */
 515
 516 #define CALL_HC_MF(is_16_bit, c, funcname, ...)                               \
 517         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \
 518                        CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__));
 519
 520 #define CALL_BT_MF(is_16_bit, c, funcname, ...)                               \
 521         ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \
 522                        CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__));
 523
 524 /******************************************************************************/
 525 /*                             Output bitstream                               */
 526 /*----------------------------------------------------------------------------*/
 527
 528 /*
 529  * The LZX bitstream is encoded as a sequence of little endian 16-bit coding
 530  * units.  Bits are ordered from most significant to least significant within
 531  * each coding unit.
 532  */
 533
 534 /*
 535  * Structure to keep track of the current state of sending bits to the
 536  * compressed output buffer.
 537  */
 538 struct lzx_output_bitstream {
 539
 540         /* Bits that haven't yet been written to the output buffer */
 541         machine_word_t bitbuf;
 542
 543         /* Number of bits currently held in @bitbuf */
 544         machine_word_t bitcount;
 545
 546         /* Pointer to the start of the output buffer */
 547         u8 *start;
 548
 549         /* Pointer to the position in the output buffer at which the next coding
 550          * unit should be written */
 551         u8 *next;
 552
 553         /* Pointer to just past the end of the output buffer, rounded down by
 554          * one byte if needed to make 'end - start' a multiple of 2 */
 555         u8 *end;
 556 };
 557
 558 /* Can the specified number of bits always be added to 'bitbuf' after all
 559  * pending 16-bit coding units have been flushed?  */
 560 #define CAN_BUFFER(n)   ((n) <= WORDBITS - 15)
 561
 562 /* Initialize the output bitstream to write to the specified buffer. */
 563 static void
 564 lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size)
 565 {
 566         os->bitbuf = 0;
 567         os->bitcount = 0;
 568         os->start = buffer;
 569         os->next = buffer;
 570         os->end = (u8 *)buffer + (size & ~1);
 571 }
 572
 573 /*
 574  * Add some bits to the bitbuffer variable of the output bitstream.  The caller
 575  * must make sure there is enough room.
 576  */
 577 static inline void
 578 lzx_add_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 579 {
 580         os->bitbuf = (os->bitbuf << num_bits) | bits;
 581         os->bitcount += num_bits;
 582 }
 583
 584 /*
 585  * Flush bits from the bitbuffer variable to the output buffer.  'max_num_bits'
 586  * specifies the maximum number of bits that may have been added since the last
 587  * flush.
 588  */
 589 static inline void
 590 lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits)
 591 {
 592         /* Masking the number of bits to shift is only needed to avoid undefined
 593          * behavior; we don't actually care about the results of bad shifts.  On
 594          * x86, the explicit masking generates no extra code.  */
 595         const u32 shift_mask = WORDBITS - 1;
 596
 597         if (os->end - os->next < 6)
 598                 return;
 599         put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) &
 600                                             shift_mask), os->next + 0);
 601         if (max_num_bits > 16)
 602                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) &
 603                                                 shift_mask), os->next + 2);
 604         if (max_num_bits > 32)
 605                 put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) &
 606                                                 shift_mask), os->next + 4);
 607         os->next += (os->bitcount >> 4) << 1;
 608         os->bitcount &= 15;
 609 }
 610
 611 /* Add at most 16 bits to the bitbuffer and flush it.  */
 612 static inline void
 613 lzx_write_bits(struct lzx_output_bitstream *os, u32 bits, unsigned num_bits)
 614 {
 615         lzx_add_bits(os, bits, num_bits);
 616         lzx_flush_bits(os, 16);
 617 }
 618
 619 /*
 620  * Flush the last coding unit to the output buffer if needed.  Return the total
 621  * number of bytes written to the output buffer, or 0 if an overflow occurred.
 622  */
 623 static size_t
 624 lzx_flush_output(struct lzx_output_bitstream *os)
 625 {
 626         if (os->end - os->next < 6)
 627                 return 0;
 628
 629         if (os->bitcount != 0) {
 630                 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
 631                 os->next += 2;
 632         }
 633
 634         return os->next - os->start;
 635 }
 636
 637 /******************************************************************************/
 638 /*                           Preparing Huffman codes                          */
 639 /*----------------------------------------------------------------------------*/
 640
 641 /*
 642  * Build the Huffman codes.  This takes as input the frequency tables for each
 643  * code and produces as output a set of tables that map symbols to codewords and
 644  * codeword lengths.
 645  */
 646 static void
 647 lzx_build_huffman_codes(struct lzx_compressor *c)
 648 {
 649         const struct lzx_freqs *freqs = &c->freqs;
 650         struct lzx_codes *codes = &c->codes[c->codes_index];
 651
 652         STATIC_ASSERT(MAIN_CODEWORD_LIMIT >= 9 &&
 653                       MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN);
 654         make_canonical_huffman_code(c->num_main_syms,
 655                                     MAIN_CODEWORD_LIMIT,
 656                                     freqs->main,
 657                                     codes->lens.main,
 658                                     codes->codewords.main);
 659
 660         STATIC_ASSERT(LENGTH_CODEWORD_LIMIT >= 8 &&
 661                       LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN);
 662         make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS,
 663                                     LENGTH_CODEWORD_LIMIT,
 664                                     freqs->len,
 665                                     codes->lens.len,
 666                                     codes->codewords.len);
 667
 668         STATIC_ASSERT(ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS &&
 669                       ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN);
 670         make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS,
 671                                     ALIGNED_CODEWORD_LIMIT,
 672                                     freqs->aligned,
 673                                     codes->lens.aligned,
 674                                     codes->codewords.aligned);
 675 }
 676
 677 /* Reset the symbol frequencies for the current block. */
 678 static void
 679 lzx_reset_symbol_frequencies(struct lzx_compressor *c)
 680 {
 681         memset(&c->freqs, 0, sizeof(c->freqs));
 682 }
 683
 684 static unsigned
 685 lzx_compute_precode_items(const u8 lens[restrict],
 686                           const u8 prev_lens[restrict],
 687                           u32 precode_freqs[restrict],
 688                           unsigned precode_items[restrict])
 689 {
 690         unsigned *itemptr;
 691         unsigned run_start;
 692         unsigned run_end;
 693         unsigned extra_bits;
 694         int delta;
 695         u8 len;
 696
 697         itemptr = precode_items;
 698         run_start = 0;
 699
 700         while (!((len = lens[run_start]) & 0x80)) {
 701
 702                 /* len = the length being repeated  */
 703
 704                 /* Find the next run of codeword lengths.  */
 705
 706                 run_end = run_start + 1;
 707
 708                 /* Fast case for a single length.  */
 709                 if (likely(len != lens[run_end])) {
 710                         delta = prev_lens[run_start] - len;
 711                         if (delta < 0)
 712                                 delta += 17;
 713                         precode_freqs[delta]++;
 714                         *itemptr++ = delta;
 715                         run_start++;
 716                         continue;
 717                 }
 718
 719                 /* Extend the run.  */
 720                 do {
 721                         run_end++;
 722                 } while (len == lens[run_end]);
 723
 724                 if (len == 0) {
 725                         /* Run of zeroes.  */
 726
 727                         /* Symbol 18: RLE 20 to 51 zeroes at a time.  */
 728                         while ((run_end - run_start) >= 20) {
 729                                 extra_bits = min((run_end - run_start) - 20, 0x1F);
 730                                 precode_freqs[18]++;
 731                                 *itemptr++ = 18 | (extra_bits << 5);
 732                                 run_start += 20 + extra_bits;
 733                         }
 734
 735                         /* Symbol 17: RLE 4 to 19 zeroes at a time.  */
 736                         if ((run_end - run_start) >= 4) {
 737                                 extra_bits = min((run_end - run_start) - 4, 0xF);
 738                                 precode_freqs[17]++;
 739                                 *itemptr++ = 17 | (extra_bits << 5);
 740                                 run_start += 4 + extra_bits;
 741                         }
 742                 } else {
 743
 744                         /* A run of nonzero lengths. */
 745
 746                         /* Symbol 19: RLE 4 to 5 of any length at a time.  */
 747                         while ((run_end - run_start) >= 4) {
 748                                 extra_bits = (run_end - run_start) > 4;
 749                                 delta = prev_lens[run_start] - len;
 750                                 if (delta < 0)
 751                                         delta += 17;
 752                                 precode_freqs[19]++;
 753                                 precode_freqs[delta]++;
 754                                 *itemptr++ = 19 | (extra_bits << 5) | (delta << 6);
 755                                 run_start += 4 + extra_bits;
 756                         }
 757                 }
 758
 759                 /* Output any remaining lengths without RLE.  */
 760                 while (run_start != run_end) {
 761                         delta = prev_lens[run_start] - len;
 762                         if (delta < 0)
 763                                 delta += 17;
 764                         precode_freqs[delta]++;
 765                         *itemptr++ = delta;
 766                         run_start++;
 767                 }
 768         }
 769
 770         return itemptr - precode_items;
 771 }
 772
 773 /******************************************************************************/
 774 /*                          Outputting compressed data                        */
 775 /*----------------------------------------------------------------------------*/
 776
 777 /*
 778  * Output a Huffman code in the compressed form used in LZX.
 779  *
 780  * The Huffman code is represented in the output as a logical series of codeword
 781  * lengths from which the Huffman code, which must be in canonical form, can be
 782  * reconstructed.
 783  *
 784  * The codeword lengths are themselves compressed using a separate Huffman code,
 785  * the "precode", which contains a symbol for each possible codeword length in
 786  * the larger code as well as several special symbols to represent repeated
 787  * codeword lengths (a form of run-length encoding).  The precode is itself
 788  * constructed in canonical form, and its codeword lengths are represented
 789  * literally in 20 4-bit fields that immediately precede the compressed codeword
 790  * lengths of the larger code.
 791  *
 792  * Furthermore, the codeword lengths of the larger code are actually represented
 793  * as deltas from the codeword lengths of the corresponding code in the previous
 794  * block.
 795  *
 796  * @os:
 797  *      Bitstream to which to write the compressed Huffman code.
 798  * @lens:
 799  *      The codeword lengths, indexed by symbol, in the Huffman code.
 800  * @prev_lens:
 801  *      The codeword lengths, indexed by symbol, in the corresponding Huffman
 802  *      code in the previous block, or all zeroes if this is the first block.
 803  * @num_lens:
 804  *      The number of symbols in the Huffman code.
 805  */
 806 static void
 807 lzx_write_compressed_code(struct lzx_output_bitstream *os,
 808                           const u8 lens[restrict],
 809                           const u8 prev_lens[restrict],
 810                           unsigned num_lens)
 811 {
 812         u32 precode_freqs[LZX_PRECODE_NUM_SYMBOLS];
 813         u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
 814         u32 precode_codewords[LZX_PRECODE_NUM_SYMBOLS];
 815         unsigned precode_items[num_lens];
 816         unsigned num_precode_items;
 817         unsigned precode_item;
 818         unsigned precode_sym;
 819         unsigned i;
 820         u8 saved = lens[num_lens];
 821         *(u8 *)(lens + num_lens) = 0x80;
 822
 823         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 824                 precode_freqs[i] = 0;
 825
 826         /* Compute the "items" (RLE / literal tokens and extra bits) with which
 827          * the codeword lengths in the larger code will be output.  */
 828         num_precode_items = lzx_compute_precode_items(lens,
 829                                                       prev_lens,
 830                                                       precode_freqs,
 831                                                       precode_items);
 832
 833         /* Build the precode.  */
 834         STATIC_ASSERT(PRE_CODEWORD_LIMIT >= 5 &&
 835                       PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN);
 836         make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT,
 837                                     precode_freqs, precode_lens,
 838                                     precode_codewords);
 839
 840         /* Output the lengths of the codewords in the precode.  */
 841         for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++)
 842                 lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE);
 843
 844         /* Output the encoded lengths of the codewords in the larger code.  */
 845         for (i = 0; i < num_precode_items; i++) {
 846                 precode_item = precode_items[i];
 847                 precode_sym = precode_item & 0x1F;
 848                 lzx_add_bits(os, precode_codewords[precode_sym],
 849                              precode_lens[precode_sym]);
 850                 if (precode_sym >= 17) {
 851                         if (precode_sym == 17) {
 852                                 lzx_add_bits(os, precode_item >> 5, 4);
 853                         } else if (precode_sym == 18) {
 854                                 lzx_add_bits(os, precode_item >> 5, 5);
 855                         } else {
 856                                 lzx_add_bits(os, (precode_item >> 5) & 1, 1);
 857                                 precode_sym = precode_item >> 6;
 858                                 lzx_add_bits(os, precode_codewords[precode_sym],
 859                                              precode_lens[precode_sym]);
 860                         }
 861                 }
 862                 STATIC_ASSERT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1));
 863                 lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1);
 864         }
 865
 866         *(u8 *)(lens + num_lens) = saved;
 867 }
 868
 869 /*
 870  * Write all matches and literal bytes (which were precomputed) in an LZX
 871  * compressed block to the output bitstream in the final compressed
 872  * representation.
 873  *
 874  * @os
 875  *      The output bitstream.
 876  * @block_type
 877  *      The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or
 878  *      LZX_BLOCKTYPE_VERBATIM).
 879  * @block_data
 880  *      The uncompressed data of the block.
 881  * @sequences
 882  *      The matches and literals to output, given as a series of sequences.
 883  * @codes
 884  *      The main, length, and aligned offset Huffman codes for the block.
 885  */
 886 static void
 887 lzx_write_sequences(struct lzx_output_bitstream *os, int block_type,
 888                     const u8 *block_data, const struct lzx_sequence sequences[],
 889                     const struct lzx_codes *codes)
 890 {
 891         const struct lzx_sequence *seq = sequences;
 892         u32 ones_if_aligned = 0 - (block_type == LZX_BLOCKTYPE_ALIGNED);
 893
 894         for (;;) {
 895                 /* Output the next sequence.  */
 896
 897                 unsigned litrunlen = seq->litrunlen;
 898                 unsigned match_hdr;
 899                 unsigned main_symbol;
 900                 unsigned adjusted_length;
 901                 u32 adjusted_offset;
 902                 unsigned offset_slot;
 903                 unsigned num_extra_bits;
 904                 u32 extra_bits;
 905
 906                 /* Output the literal run of the sequence.  */
 907
 908                 if (litrunlen) {  /* Is the literal run nonempty?  */
 909
 910                         /* Verify optimization is enabled on 64-bit  */
 911                         STATIC_ASSERT(WORDBITS < 64 ||
 912                                       CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
 913
 914                         if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
 915
 916                                 /* 64-bit: write 3 literals at a time.  */
 917                                 while (litrunlen >= 3) {
 918                                         unsigned lit0 = block_data[0];
 919                                         unsigned lit1 = block_data[1];
 920                                         unsigned lit2 = block_data[2];
 921                                         lzx_add_bits(os, codes->codewords.main[lit0],
 922                                                      codes->lens.main[lit0]);
 923                                         lzx_add_bits(os, codes->codewords.main[lit1],
 924                                                      codes->lens.main[lit1]);
 925                                         lzx_add_bits(os, codes->codewords.main[lit2],
 926                                                      codes->lens.main[lit2]);
 927                                         lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
 928                                         block_data += 3;
 929                                         litrunlen -= 3;
 930                                 }
 931                                 if (litrunlen--) {
 932                                         unsigned lit = *block_data++;
 933                                         lzx_add_bits(os, codes->codewords.main[lit],
 934                                                      codes->lens.main[lit]);
 935                                         if (litrunlen--) {
 936                                                 unsigned lit = *block_data++;
 937                                                 lzx_add_bits(os, codes->codewords.main[lit],
 938                                                              codes->lens.main[lit]);
 939                                                 lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
 940                                         } else {
 941                                                 lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
 942                                         }
 943                                 }
 944                         } else {
 945                                 /* 32-bit: write 1 literal at a time.  */
 946                                 do {
 947                                         unsigned lit = *block_data++;
 948                                         lzx_add_bits(os, codes->codewords.main[lit],
 949                                                      codes->lens.main[lit]);
 950                                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
 951                                 } while (--litrunlen);
 952                         }
 953                 }
 954
 955                 /* Was this the last literal run?  */
 956                 if (seq->adjusted_offset_and_match_hdr & 0x80000000)
 957                         return;
 958
 959                 /* Nope; output the match.  */
 960
 961                 match_hdr = seq->adjusted_offset_and_match_hdr & 0x1FF;
 962                 main_symbol = LZX_NUM_CHARS + match_hdr;
 963                 adjusted_length = seq->adjusted_length;
 964
 965                 block_data += adjusted_length + LZX_MIN_MATCH_LEN;
 966
 967                 offset_slot = match_hdr / LZX_NUM_LEN_HEADERS;
 968                 adjusted_offset = seq->adjusted_offset_and_match_hdr >> 9;
 969
 970                 num_extra_bits = lzx_extra_offset_bits[offset_slot];
 971                 extra_bits = adjusted_offset - lzx_offset_slot_base[offset_slot];
 972
 973         #define MAX_MATCH_BITS  (MAIN_CODEWORD_LIMIT + LENGTH_CODEWORD_LIMIT + \
 974                                  14 + ALIGNED_CODEWORD_LIMIT)
 975
 976                 /* Verify optimization is enabled on 64-bit  */
 977                 STATIC_ASSERT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
 978
 979                 /* Output the main symbol for the match.  */
 980
 981                 lzx_add_bits(os, codes->codewords.main[main_symbol],
 982                              codes->lens.main[main_symbol]);
 983                 if (!CAN_BUFFER(MAX_MATCH_BITS))
 984                         lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
 985
 986                 /* If needed, output the length symbol for the match.  */
 987
 988                 if (adjusted_length >= LZX_NUM_PRIMARY_LENS) {
 989                         lzx_add_bits(os, codes->codewords.len[adjusted_length -
 990                                                               LZX_NUM_PRIMARY_LENS],
 991                                      codes->lens.len[adjusted_length -
 992                                                      LZX_NUM_PRIMARY_LENS]);
 993                         if (!CAN_BUFFER(MAX_MATCH_BITS))
 994                                 lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
 995                 }
 996
 997                 /* Output the extra offset bits for the match.  In aligned
 998                  * offset blocks, the lowest 3 bits of the adjusted offset are
 999                  * Huffman-encoded using the aligned offset code, provided that
1000                  * there are at least extra 3 offset bits required.  All other
1001                  * extra offset bits are output verbatim.  */
1002
1003                 if ((adjusted_offset & ones_if_aligned) >= 16) {
1004
1005                         lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS,
1006                                      num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS);
1007                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1008                                 lzx_flush_bits(os, 14);
1009
1010                         lzx_add_bits(os, codes->codewords.aligned[adjusted_offset &
1011                                                                   LZX_ALIGNED_OFFSET_BITMASK],
1012                                      codes->lens.aligned[adjusted_offset &
1013                                                          LZX_ALIGNED_OFFSET_BITMASK]);
1014                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1015                                 lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT);
1016                 } else {
1017                         STATIC_ASSERT(CAN_BUFFER(17));
1018
1019                         lzx_add_bits(os, extra_bits, num_extra_bits);
1020                         if (!CAN_BUFFER(MAX_MATCH_BITS))
1021                                 lzx_flush_bits(os, 17);
1022                 }
1023
1024                 if (CAN_BUFFER(MAX_MATCH_BITS))
1025                         lzx_flush_bits(os, MAX_MATCH_BITS);
1026
1027                 /* Advance to the next sequence.  */
1028                 seq++;
1029         }
1030 }
1031
1032 static void
1033 lzx_write_compressed_block(const u8 *block_begin,
1034                            int block_type,
1035                            u32 block_size,
1036                            unsigned window_order,
1037                            unsigned num_main_syms,
1038                            const struct lzx_sequence sequences[],
1039                            const struct lzx_codes * codes,
1040                            const struct lzx_lens * prev_lens,
1041                            struct lzx_output_bitstream * os)
1042 {
1043         /* The first three bits indicate the type of block and are one of the
1044          * LZX_BLOCKTYPE_* constants.  */
1045         lzx_write_bits(os, block_type, 3);
1046
1047         /*
1048          * Output the block size.
1049          *
1050          * The original LZX format encoded the block size in 24 bits.  However,
1051          * the LZX format used in WIM archives uses 1 bit to specify whether the
1052          * block has the default size of 32768 bytes, then optionally 16 bits to
1053          * specify a non-default size.  This works fine for Microsoft's WIM
1054          * software (WIMGAPI), which never compresses more than 32768 bytes at a
1055          * time with LZX.  However, as an extension, our LZX compressor supports
1056          * compressing up to 2097152 bytes, with a corresponding increase in
1057          * window size.  It is possible for blocks in these larger buffers to
1058          * exceed 65535 bytes; such blocks cannot have their size represented in
1059          * 16 bits.
1060          *
1061          * The chosen solution was to use 24 bits for the block size when
1062          * possibly required --- specifically, when the compressor has been
1063          * allocated to be capable of compressing more than 32768 bytes at once
1064          * (which also causes the number of main symbols to be increased).
1065          */
1066         if (block_size == LZX_DEFAULT_BLOCK_SIZE) {
1067                 lzx_write_bits(os, 1, 1);
1068         } else {
1069                 lzx_write_bits(os, 0, 1);
1070
1071                 if (window_order >= 16)
1072                         lzx_write_bits(os, block_size >> 16, 8);
1073
1074                 lzx_write_bits(os, block_size & 0xFFFF, 16);
1075         }
1076
1077         /* If it's an aligned offset block, output the aligned offset code.  */
1078         if (block_type == LZX_BLOCKTYPE_ALIGNED) {
1079                 for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1080                         lzx_write_bits(os, codes->lens.aligned[i],
1081                                        LZX_ALIGNEDCODE_ELEMENT_SIZE);
1082                 }
1083         }
1084
1085         /* Output the main code (two parts).  */
1086         lzx_write_compressed_code(os, codes->lens.main,
1087                                   prev_lens->main,
1088                                   LZX_NUM_CHARS);
1089         lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS,
1090                                   prev_lens->main + LZX_NUM_CHARS,
1091                                   num_main_syms - LZX_NUM_CHARS);
1092
1093         /* Output the length code.  */
1094         lzx_write_compressed_code(os, codes->lens.len,
1095                                   prev_lens->len,
1096                                   LZX_LENCODE_NUM_SYMBOLS);
1097
1098         /* Output the compressed matches and literals.  */
1099         lzx_write_sequences(os, block_type, block_begin, sequences, codes);
1100 }
1101
1102 /*
1103  * Given the frequencies of symbols in an LZX-compressed block and the
1104  * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or
1105  * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively,
1106  * will take fewer bits to output.
1107  */
1108 static int
1109 lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs,
1110                                const struct lzx_codes * codes)
1111 {
1112         u32 verbatim_cost = 0;
1113         u32 aligned_cost = 0;
1114
1115         /* A verbatim block requires 3 bits in each place that an aligned offset
1116          * symbol would be used in an aligned offset block.  */
1117         for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
1118                 verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i];
1119                 aligned_cost += codes->lens.aligned[i] * freqs->aligned[i];
1120         }
1121
1122         /* Account for the cost of sending the codeword lengths of the aligned
1123          * offset code.  */
1124         aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE *
1125                         LZX_ALIGNEDCODE_NUM_SYMBOLS;
1126
1127         if (aligned_cost < verbatim_cost)
1128                 return LZX_BLOCKTYPE_ALIGNED;
1129         else
1130                 return LZX_BLOCKTYPE_VERBATIM;
1131 }
1132
1133 /*
1134  * Flush an LZX block:
1135  *
1136  * 1. Build the Huffman codes.
1137  * 2. Decide whether to output the block as VERBATIM or ALIGNED.
1138  * 3. Write the block.
1139  * 4. Swap the indices of the current and previous Huffman codes.
1140  *
1141  * Note: we never output UNCOMPRESSED blocks.  This probably should be
1142  * implemented sometime, but it doesn't make much difference.
1143  */
1144 static void
1145 lzx_flush_block(struct lzx_compressor *c, struct lzx_output_bitstream *os,
1146                 const u8 *block_begin, u32 block_size, u32 seq_idx)
1147 {
1148         int block_type;
1149
1150         lzx_build_huffman_codes(c);
1151
1152         block_type = lzx_choose_verbatim_or_aligned(&c->freqs,
1153                                                     &c->codes[c->codes_index]);
1154         lzx_write_compressed_block(block_begin,
1155                                    block_type,
1156                                    block_size,
1157                                    c->window_order,
1158                                    c->num_main_syms,
1159                                    &c->chosen_sequences[seq_idx],
1160                                    &c->codes[c->codes_index],
1161                                    &c->codes[c->codes_index ^ 1].lens,
1162                                    os);
1163         c->codes_index ^= 1;
1164 }
1165
1166 /******************************************************************************/
1167 /*                          Block splitting algorithm                         */
1168 /*----------------------------------------------------------------------------*/
1169
1170 /*
1171  * The problem of block splitting is to decide when it is worthwhile to start a
1172  * new block with new entropy codes.  There is a theoretically optimal solution:
1173  * recursively consider every possible block split, considering the exact cost
1174  * of each block, and choose the minimum cost approach.  But this is far too
1175  * slow.  Instead, as an approximation, we can count symbols and after every N
1176  * symbols, compare the expected distribution of symbols based on the previous
1177  * data with the actual distribution.  If they differ "by enough", then start a
1178  * new block.
1179  *
1180  * As an optimization and heuristic, we don't distinguish between every symbol
1181  * but rather we combine many symbols into a single "observation type".  For
1182  * literals we only look at the high bits and low bits, and for matches we only
1183  * look at whether the match is long or not.  The assumption is that for typical
1184  * "real" data, places that are good block boundaries will tend to be noticable
1185  * based only on changes in these aggregate frequencies, without looking for
1186  * subtle differences in individual symbols.  For example, a change from ASCII
1187  * bytes to non-ASCII bytes, or from few matches (generally less compressible)
1188  * to many matches (generally more compressible), would be easily noticed based
1189  * on the aggregates.
1190  *
1191  * For determining whether the frequency distributions are "different enough" to
1192  * start a new block, the simply heuristic of splitting when the sum of absolute
1193  * differences exceeds a constant seems to be good enough.
1194  *
1195  * Finally, for an approximation, it is not strictly necessary that the exact
1196  * symbols being used are considered.  With "near-optimal parsing", for example,
1197  * the actual symbols that will be used are unknown until after the block
1198  * boundary is chosen and the block has been optimized.  Since the final choices
1199  * cannot be used, we can use preliminary "greedy" choices instead.
1200  */
1201
1202 /* Initialize the block split statistics when starting a new block. */
1203 static void
1204 lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
1205 {
1206         memset(stats, 0, sizeof(*stats));
1207 }
1208
1209 /* Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
1210  * literal, for 8 possible literal observation types.  */
1211 static inline void
1212 lzx_observe_literal(struct lzx_block_split_stats *stats, u8 lit)
1213 {
1214         stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
1215         stats->num_new_observations++;
1216 }
1217
1218 /* Match observation.  Heuristic: use one observation type for "short match" and
1219  * one observation type for "long match".  */
1220 static inline void
1221 lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
1222 {
1223         stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
1224         stats->num_new_observations++;
1225 }
1226
1227 static bool
1228 lzx_should_end_block(struct lzx_block_split_stats *stats)
1229 {
1230         if (stats->num_observations > 0) {
1231
1232                 /* Note: to avoid slow divisions, we do not divide by
1233                  * 'num_observations', but rather do all math with the numbers
1234                  * multiplied by 'num_observations'. */
1235                 u32 total_delta = 0;
1236                 for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1237                         u32 expected = stats->observations[i] *
1238                                        stats->num_new_observations;
1239                         u32 actual = stats->new_observations[i] *
1240                                      stats->num_observations;
1241                         u32 delta = (actual > expected) ? actual - expected :
1242                                                           expected - actual;
1243                         total_delta += delta;
1244                 }
1245
1246                 /* Ready to end the block? */
1247                 if (total_delta >=
1248                     stats->num_new_observations * 7 / 8 * stats->num_observations)
1249                         return true;
1250         }
1251
1252         for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
1253                 stats->num_observations += stats->new_observations[i];
1254                 stats->observations[i] += stats->new_observations[i];
1255                 stats->new_observations[i] = 0;
1256         }
1257         stats->num_new_observations = 0;
1258         return false;
1259 }
1260
1261 /******************************************************************************/
1262 /*                   Slower ("near-optimal") compression algorithm            */
1263 /*----------------------------------------------------------------------------*/
1264
1265 /*
1266  * Least-recently-used queue for match offsets.
1267  *
1268  * This is represented as a 64-bit integer for efficiency.  There are three
1269  * offsets of 21 bits each.  Bit 64 is garbage.
1270  */
1271 struct lzx_lru_queue {
1272         u64 R;
1273 } _aligned_attribute(8);
1274
1275 #define LZX_QUEUE_OFFSET_SHIFT  21
1276 #define LZX_QUEUE_OFFSET_MASK   (((u64)1 << LZX_QUEUE_OFFSET_SHIFT) - 1)
1277
1278 #define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT)
1279 #define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT)
1280 #define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT)
1281
1282 #define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT)
1283 #define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT)
1284 #define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT)
1285
1286 #define LZX_QUEUE_INITIALIZER {                 \
1287         ((u64)1 << LZX_QUEUE_R0_SHIFT) |        \
1288         ((u64)1 << LZX_QUEUE_R1_SHIFT) |        \
1289         ((u64)1 << LZX_QUEUE_R2_SHIFT) }
1290
1291 static inline u64
1292 lzx_lru_queue_R0(struct lzx_lru_queue queue)
1293 {
1294         return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1295 }
1296
1297 static inline u64
1298 lzx_lru_queue_R1(struct lzx_lru_queue queue)
1299 {
1300         return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1301 }
1302
1303 static inline u64
1304 lzx_lru_queue_R2(struct lzx_lru_queue queue)
1305 {
1306         return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK;
1307 }
1308
1309 /* Push a match offset onto the front (most recently used) end of the queue.  */
1310 static inline struct lzx_lru_queue
1311 lzx_lru_queue_push(struct lzx_lru_queue queue, u32 offset)
1312 {
1313         return (struct lzx_lru_queue) {
1314                 .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset,
1315         };
1316 }
1317
1318 /* Swap a match offset to the front of the queue.  */
1319 static inline struct lzx_lru_queue
1320 lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx)
1321 {
1322         unsigned shift = idx * 21;
1323         const u64 mask = LZX_QUEUE_R0_MASK;
1324         const u64 mask_high = mask << shift;
1325
1326         return (struct lzx_lru_queue) {
1327                 (queue.R & ~(mask | mask_high)) |
1328                 ((queue.R & mask_high) >> shift) |
1329                 ((queue.R & mask) << shift)
1330         };
1331 }
1332
1333 static inline u32
1334 lzx_walk_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit,
1335                    bool record)
1336 {
1337         u32 node_idx = block_size;
1338         u32 seq_idx = ARRAY_LEN(c->chosen_sequences) - 1;
1339         u32 lit_start_node;
1340
1341         if (record) {
1342                 /* Special value to mark last sequence  */
1343                 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = 0x80000000;
1344                 lit_start_node = node_idx;
1345         }
1346
1347         for (;;) {
1348                 u32 item;
1349                 u32 len;
1350                 u32 adjusted_offset;
1351                 unsigned v;
1352                 unsigned offset_slot;
1353
1354                 /* Tally literals until either a match or the beginning of the
1355                  * block is reached.  Note: the item in the node at the
1356                  * beginning of the block has all bits set, causing this loop to
1357                  * end when it is reached. */
1358                 for (;;) {
1359                         item = c->optimum_nodes[node_idx].item;
1360                         if (item & OPTIMUM_LEN_MASK)
1361                                 break;
1362                         c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++;
1363                         node_idx--;
1364                 }
1365
1366         #if CONSIDER_GAP_MATCHES
1367                 if (item & OPTIMUM_GAP_MATCH) {
1368
1369                         if (node_idx == 0)
1370                                 break;
1371
1372                         /* Record the literal run length for the next sequence
1373                          * (the "previous sequence" when walking backwards). */
1374                         len = item & OPTIMUM_LEN_MASK;
1375                         if (record) {
1376                                 c->chosen_sequences[seq_idx--].litrunlen =
1377                                                 lit_start_node - node_idx;
1378                                 lit_start_node = node_idx - len;
1379                         }
1380
1381                         /* Tally the rep0 match after the gap. */
1382                         v = len - LZX_MIN_MATCH_LEN;
1383                         if (record)
1384                                 c->chosen_sequences[seq_idx].adjusted_length = v;
1385                         if (v >= LZX_NUM_PRIMARY_LENS) {
1386                                 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1387                                 v = LZX_NUM_PRIMARY_LENS;
1388                         }
1389                         c->freqs.main[LZX_NUM_CHARS + v]++;
1390                         if (record)
1391                                 c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr = v;
1392
1393                         /* Tally the literal in the gap. */
1394                         c->freqs.main[(u8)(item >> OPTIMUM_OFFSET_SHIFT)]++;
1395
1396                         /* Fall through and tally the match before the gap.
1397                          * (It was temporarily saved in the 'cost' field of the
1398                          * previous node, which was free to reuse.) */
1399                         item = c->optimum_nodes[--node_idx].cost;
1400                         node_idx -= len;
1401                 }
1402         #else /* CONSIDER_GAP_MATCHES */
1403                 if (node_idx == 0)
1404                         break;
1405         #endif /* !CONSIDER_GAP_MATCHES */
1406
1407                 len = item & OPTIMUM_LEN_MASK;
1408                 adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT;
1409
1410                 /* Record the literal run length for the next sequence (the
1411                  * "previous sequence" when walking backwards). */
1412                 if (record) {
1413                         c->chosen_sequences[seq_idx--].litrunlen =
1414                                         lit_start_node - node_idx;
1415                         node_idx -= len;
1416                         lit_start_node = node_idx;
1417                 } else {
1418                         node_idx -= len;
1419                 }
1420
1421                 /* Record a match. */
1422
1423                 /* Tally the aligned offset symbol if needed. */
1424                 if (adjusted_offset >= 16)
1425                         c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
1426
1427                 /* Record the adjusted length. */
1428                 v = len - LZX_MIN_MATCH_LEN;
1429                 if (record)
1430                         c->chosen_sequences[seq_idx].adjusted_length = v;
1431
1432                 /* Tally the length symbol if needed. */
1433                 if (v >= LZX_NUM_PRIMARY_LENS) {
1434                         c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
1435                         v = LZX_NUM_PRIMARY_LENS;
1436                 }
1437
1438                 /* Tally the main symbol. */
1439                 offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1440                 v += offset_slot * LZX_NUM_LEN_HEADERS;
1441                 c->freqs.main[LZX_NUM_CHARS + v]++;
1442
1443                 /* Record the adjusted offset and match header. */
1444                 if (record) {
1445                         c->chosen_sequences[seq_idx].adjusted_offset_and_match_hdr =
1446                                         (adjusted_offset << 9) | v;
1447                 }
1448         }
1449
1450         /* Record the literal run length for the first sequence. */
1451         if (record)
1452                 c->chosen_sequences[seq_idx].litrunlen = lit_start_node - node_idx;
1453
1454         /* Return the index in chosen_sequences at which the sequences begin. */
1455         return seq_idx;
1456 }
1457
1458 /*
1459  * Given the minimum-cost path computed through the item graph for the current
1460  * block, walk the path and count how many of each symbol in each Huffman-coded
1461  * alphabet would be required to output the items (matches and literals) along
1462  * the path.
1463  *
1464  * Note that the path will be walked backwards (from the end of the block to the
1465  * beginning of the block), but this doesn't matter because this function only
1466  * computes frequencies.
1467  */
1468 static inline void
1469 lzx_tally_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1470 {
1471         lzx_walk_item_list(c, block_size, is_16_bit, false);
1472 }
1473
1474 /*
1475  * Like lzx_tally_item_list(), but this function also generates the list of
1476  * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences,
1477  * ready to be output to the bitstream after the Huffman codes are computed.
1478  * The lzx_sequences will be written to decreasing memory addresses as the path
1479  * is walked backwards, which means they will end up in the expected
1480  * first-to-last order.  The return value is the index in c->chosen_sequences at
1481  * which the lzx_sequences begin.
1482  */
1483 static inline u32
1484 lzx_record_item_list(struct lzx_compressor *c, u32 block_size, bool is_16_bit)
1485 {
1486         return lzx_walk_item_list(c, block_size, is_16_bit, true);
1487 }
1488
1489 /*
1490  * Find an inexpensive path through the graph of possible match/literal choices
1491  * for the current block.  The nodes of the graph are
1492  * c->optimum_nodes[0...block_size].  They correspond directly to the bytes in
1493  * the current block, plus one extra node for end-of-block.  The edges of the
1494  * graph are matches and literals.  The goal is to find the minimum cost path
1495  * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost
1496  * model 'c->costs'.
1497  *
1498  * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and
1499  * proceeding forwards one node at a time.  At each node, a selection of matches
1500  * (len >= 2), as well as the literal byte (len = 1), is considered.  An item of
1501  * length 'len' provides a new path to reach the node 'len' bytes later.  If
1502  * such a path is the lowest cost found so far to reach that later node, then
1503  * that later node is updated with the new cost and the "arrival" which provided
1504  * that cost.
1505  *
1506  * Note that although this algorithm is based on minimum cost path search, due
1507  * to various simplifying assumptions the result is not guaranteed to be the
1508  * true minimum cost, or "optimal", path over the graph of all valid LZX
1509  * representations of this block.
1510  *
1511  * Also, note that because of the presence of the recent offsets queue (which is
1512  * a type of adaptive state), the algorithm cannot work backwards and compute
1513  * "cost to end" instead of "cost to beginning".  Furthermore, the way the
1514  * algorithm handles this adaptive state in the "minimum cost" parse is actually
1515  * only an approximation.  It's possible for the globally optimal, minimum cost
1516  * path to contain a prefix, ending at a position, where that path prefix is
1517  * *not* the minimum cost path to that position.  This can happen if such a path
1518  * prefix results in a different adaptive state which results in lower costs
1519  * later.  The algorithm does not solve this problem in general; it only looks
1520  * one step ahead, with the exception of special consideration for "gap
1521  * matches".
1522  */
1523 static inline struct lzx_lru_queue
1524 lzx_find_min_cost_path(struct lzx_compressor * const restrict c,
1525                        const u8 * const restrict block_begin,
1526                        const u32 block_size,
1527                        const struct lzx_lru_queue initial_queue,
1528                        bool is_16_bit)
1529 {
1530         struct lzx_optimum_node *cur_node = c->optimum_nodes;
1531         struct lzx_optimum_node * const end_node = cur_node + block_size;
1532         struct lz_match *cache_ptr = c->match_cache;
1533         const u8 *in_next = block_begin;
1534         const u8 * const block_end = block_begin + block_size;
1535
1536         /*
1537          * Instead of storing the match offset LRU queues in the
1538          * 'lzx_optimum_node' structures, we save memory (and cache lines) by
1539          * storing them in a smaller array.  This works because the algorithm
1540          * only requires a limited history of the adaptive state.  Once a given
1541          * state is more than LZX_MAX_MATCH_LEN bytes behind the current node
1542          * (more if gap match consideration is enabled; we just round up to 512
1543          * so it's a power of 2), it is no longer needed.
1544          *
1545          * The QUEUE() macro finds the queue for the given node.  This macro has
1546          * been optimized by taking advantage of 'struct lzx_lru_queue' and
1547          * 'struct lzx_optimum_node' both being 8 bytes in size and alignment.
1548          */
1549         struct lzx_lru_queue queues[512];
1550         STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1);
1551         STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0]));
1552 #define QUEUE(node) \
1553         (*(struct lzx_lru_queue *)((char *)queues + \
1554                         ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0])))))
1555         /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/
1556
1557 #if CONSIDER_GAP_MATCHES
1558         u32 matches_before_gap[ARRAY_LEN(queues)];
1559 #define MATCH_BEFORE_GAP(node) \
1560         (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \
1561                             ARRAY_LEN(matches_before_gap)])
1562 #endif
1563
1564         /*
1565          * Initially, the cost to reach each node is "infinity".
1566          *
1567          * The first node actually should have cost 0, but "infinity"
1568          * (0xFFFFFFFF) works just as well because it immediately overflows.
1569          *
1570          * The following statement also intentionally sets the 'item' of the
1571          * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
1572          * use as a sentinel.  See lzx_walk_item_list().
1573          */
1574         memset(c->optimum_nodes, 0xFF,
1575                (block_size + 1) * sizeof(c->optimum_nodes[0]));
1576
1577         /* Initialize the recent offsets queue for the first node. */
1578         QUEUE(cur_node) = initial_queue;
1579
1580         do { /* For each node in the block in position order... */
1581
1582                 unsigned num_matches;
1583                 unsigned literal;
1584                 u32 cost;
1585
1586                 /*
1587                  * A selection of matches for the block was already saved in
1588                  * memory so that we don't have to run the uncompressed data
1589                  * through the matchfinder on every optimization pass.  However,
1590                  * we still search for repeat offset matches during each
1591                  * optimization pass because we cannot predict the state of the
1592                  * recent offsets queue.  But as a heuristic, we don't bother
1593                  * searching for repeat offset matches if the general-purpose
1594                  * matchfinder failed to find any matches.
1595                  *
1596                  * Note that a match of length n at some offset implies there is
1597                  * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
1598                  * that same offset.  In other words, we don't necessarily need
1599                  * to use the full length of a match.  The key heuristic that
1600                  * saves a significicant amount of time is that for each
1601                  * distinct length, we only consider the smallest offset for
1602                  * which that length is available.  This heuristic also applies
1603                  * to repeat offsets, which we order specially: R0 < R1 < R2 <
1604                  * any explicit offset.  Of course, this heuristic may be
1605                  * produce suboptimal results because offset slots in LZX are
1606                  * subject to entropy encoding, but in practice this is a useful
1607                  * heuristic.
1608                  */
1609
1610                 num_matches = cache_ptr->length;
1611                 cache_ptr++;
1612
1613                 if (num_matches) {
1614                         struct lz_match *end_matches = cache_ptr + num_matches;
1615                         unsigned next_len = LZX_MIN_MATCH_LEN;
1616                         unsigned max_len = min(block_end - in_next, LZX_MAX_MATCH_LEN);
1617                         const u8 *matchptr;
1618
1619                         /* Consider rep0 matches. */
1620                         matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
1621                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1622                                 goto rep0_done;
1623                         STATIC_ASSERT(LZX_MIN_MATCH_LEN == 2);
1624                         do {
1625                                 u32 cost = cur_node->cost +
1626                                            c->costs.match_cost[0][
1627                                                         next_len - LZX_MIN_MATCH_LEN];
1628                                 if (cost <= (cur_node + next_len)->cost) {
1629                                         (cur_node + next_len)->cost = cost;
1630                                         (cur_node + next_len)->item =
1631                                                 (0 << OPTIMUM_OFFSET_SHIFT) | next_len;
1632                                 }
1633                                 if (unlikely(++next_len > max_len)) {
1634                                         cache_ptr = end_matches;
1635                                         goto done_matches;
1636                                 }
1637                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1638
1639                 rep0_done:
1640
1641                         /* Consider rep1 matches. */
1642                         matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node));
1643                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1644                                 goto rep1_done;
1645                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1646                                 goto rep1_done;
1647                         for (unsigned len = 2; len < next_len - 1; len++)
1648                                 if (matchptr[len] != in_next[len])
1649                                         goto rep1_done;
1650                         do {
1651                                 u32 cost = cur_node->cost +
1652                                            c->costs.match_cost[1][
1653                                                         next_len - LZX_MIN_MATCH_LEN];
1654                                 if (cost <= (cur_node + next_len)->cost) {
1655                                         (cur_node + next_len)->cost = cost;
1656                                         (cur_node + next_len)->item =
1657                                                 (1 << OPTIMUM_OFFSET_SHIFT) | next_len;
1658                                 }
1659                                 if (unlikely(++next_len > max_len)) {
1660                                         cache_ptr = end_matches;
1661                                         goto done_matches;
1662                                 }
1663                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1664
1665                 rep1_done:
1666
1667                         /* Consider rep2 matches. */
1668                         matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node));
1669                         if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
1670                                 goto rep2_done;
1671                         if (matchptr[next_len - 1] != in_next[next_len - 1])
1672                                 goto rep2_done;
1673                         for (unsigned len = 2; len < next_len - 1; len++)
1674                                 if (matchptr[len] != in_next[len])
1675                                         goto rep2_done;
1676                         do {
1677                                 u32 cost = cur_node->cost +
1678                                            c->costs.match_cost[2][
1679                                                         next_len - LZX_MIN_MATCH_LEN];
1680                                 if (cost <= (cur_node + next_len)->cost) {
1681                                         (cur_node + next_len)->cost = cost;
1682                                         (cur_node + next_len)->item =
1683                                                 (2 << OPTIMUM_OFFSET_SHIFT) | next_len;
1684                                 }
1685                                 if (unlikely(++next_len > max_len)) {
1686                                         cache_ptr = end_matches;
1687                                         goto done_matches;
1688                                 }
1689                         } while (in_next[next_len - 1] == matchptr[next_len - 1]);
1690
1691                 rep2_done:
1692
1693                         while (next_len > cache_ptr->length)
1694                                 if (++cache_ptr == end_matches)
1695                                         goto done_matches;
1696
1697                         /* Consider explicit offset matches. */
1698                         for (;;) {
1699                                 u32 offset = cache_ptr->offset;
1700                                 u32 adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT;
1701                                 unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
1702                                 u32 base_cost = cur_node->cost;
1703                                 u32 cost;
1704
1705                         #if CONSIDER_ALIGNED_COSTS
1706                                 if (offset >= 16 - LZX_OFFSET_ADJUSTMENT)
1707                                         base_cost += c->costs.aligned[adjusted_offset &
1708                                                                       LZX_ALIGNED_OFFSET_BITMASK];
1709                         #endif
1710                                 do {
1711                                         cost = base_cost +
1712                                                c->costs.match_cost[offset_slot][
1713                                                                 next_len - LZX_MIN_MATCH_LEN];
1714                                         if (cost < (cur_node + next_len)->cost) {
1715                                                 (cur_node + next_len)->cost = cost;
1716                                                 (cur_node + next_len)->item =
1717                                                         (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len;
1718                                         }
1719                                 } while (++next_len <= cache_ptr->length);
1720
1721                                 if (++cache_ptr == end_matches) {
1722                                 #if CONSIDER_GAP_MATCHES
1723                                         /* Also consider the longest explicit
1724                                          * offset match as a "gap match": match
1725                                          * + lit + rep0. */
1726                                         s32 remaining = (block_end - in_next) - (s32)next_len;
1727                                         if (likely(remaining >= 2)) {
1728                                                 const u8 *strptr = in_next + next_len;
1729                                                 const u8 *matchptr = strptr - offset;
1730                                                 if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) {
1731                                                         STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250);
1732                                                         STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap));
1733                                                         unsigned limit = min(remaining,
1734                                                                              min(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2,
1735                                                                                  LZX_MAX_MATCH_LEN));
1736                                                         unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit);
1737                                                         u8 lit = strptr[-1];
1738                                                         cost += c->costs.main[lit] +
1739                                                                 c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN];
1740                                                         unsigned total_len = next_len + rep0_len;
1741                                                         if (cost < (cur_node + total_len)->cost) {
1742                                                                 (cur_node + total_len)->cost = cost;
1743                                                                 (cur_node + total_len)->item =
1744                                                                         OPTIMUM_GAP_MATCH |
1745                                                                         ((u32)lit << OPTIMUM_OFFSET_SHIFT) |
1746                                                                         rep0_len;
1747                                                                 MATCH_BEFORE_GAP(cur_node + total_len) =
1748                                                                         (adjusted_offset << OPTIMUM_OFFSET_SHIFT) |
1749                                                                         (next_len - 1);
1750                                                         }
1751                                                 }
1752                                         }
1753                                 #endif /* CONSIDER_GAP_MATCHES */
1754                                         break;
1755                                 }
1756                         }
1757                 }
1758
1759         done_matches:
1760
1761                 /* Consider coding a literal.
1762
1763                  * To avoid an extra branch, actually checking the preferability
1764                  * of coding the literal is integrated into the queue update
1765                  * code below. */
1766                 literal = *in_next++;
1767                 cost = cur_node->cost + c->costs.main[literal];
1768
1769                 /* Advance to the next position. */
1770                 cur_node++;
1771
1772                 /* The lowest-cost path to the current position is now known.
1773                  * Finalize the recent offsets queue that results from taking
1774                  * this lowest-cost path. */
1775
1776                 if (cost <= cur_node->cost) {
1777                         /* Literal: queue remains unchanged. */
1778                         cur_node->cost = cost;
1779                         cur_node->item = (u32)literal << OPTIMUM_OFFSET_SHIFT;
1780                         QUEUE(cur_node) = QUEUE(cur_node - 1);
1781                 } else {
1782                         /* Match: queue update is needed. */
1783                         unsigned len = cur_node->item & OPTIMUM_LEN_MASK;
1784                 #if CONSIDER_GAP_MATCHES
1785                         s32 adjusted_offset = (s32)cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1786                         STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */
1787                 #else
1788                         u32 adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1789                 #endif
1790
1791                         if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) {
1792                                 /* Explicit offset match: insert offset at front. */
1793                                 QUEUE(cur_node) =
1794                                         lzx_lru_queue_push(QUEUE(cur_node - len),
1795                                                            adjusted_offset - LZX_OFFSET_ADJUSTMENT);
1796                         }
1797                 #if CONSIDER_GAP_MATCHES
1798                         else if (adjusted_offset < 0) {
1799                                 /* "Gap match": Explicit offset match, then a
1800                                  * literal, then rep0 match.  Save the explicit
1801                                  * offset match information in the cost field of
1802                                  * the previous node, which isn't needed
1803                                  * anymore.  Then insert the offset at the front
1804                                  * of the queue. */
1805                                 u32 match_before_gap = MATCH_BEFORE_GAP(cur_node);
1806                                 (cur_node - 1)->cost = match_before_gap;
1807                                 QUEUE(cur_node) =
1808                                         lzx_lru_queue_push(QUEUE(cur_node - len - 1 -
1809                                                                  (match_before_gap & OPTIMUM_LEN_MASK)),
1810                                                            (match_before_gap >> OPTIMUM_OFFSET_SHIFT) -
1811                                                            LZX_OFFSET_ADJUSTMENT);
1812                         }
1813                 #endif
1814                         else {
1815                                 /* Repeat offset match: swap offset to front. */
1816                                 QUEUE(cur_node) =
1817                                         lzx_lru_queue_swap(QUEUE(cur_node - len),
1818                                                            adjusted_offset);
1819                         }
1820                 }
1821         } while (cur_node != end_node);
1822
1823         /* Return the recent offsets queue at the end of the path. */
1824         return QUEUE(cur_node);
1825 }
1826
1827 /*
1828  * Given the costs for the main and length codewords (c->costs.main and
1829  * c->costs.len), initialize the match cost array (c->costs.match_cost) which
1830  * directly provides the cost of every possible (length, offset slot) pair.
1831  */
1832 static void
1833 lzx_compute_match_costs(struct lzx_compressor *c)
1834 {
1835         unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) /
1836                                         LZX_NUM_LEN_HEADERS;
1837         struct lzx_costs *costs = &c->costs;
1838         unsigned main_symbol = LZX_NUM_CHARS;
1839
1840         for (unsigned offset_slot = 0; offset_slot < num_offset_slots;
1841              offset_slot++)
1842         {
1843                 u32 extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST;
1844                 unsigned i;
1845
1846         #if CONSIDER_ALIGNED_COSTS
1847                 if (offset_slot >= 8)
1848                         extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST;
1849         #endif
1850
1851                 for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) {
1852                         costs->match_cost[offset_slot][i] =
1853                                 costs->main[main_symbol++] + extra_cost;
1854                 }
1855
1856                 extra_cost += costs->main[main_symbol++];
1857
1858                 for (; i < LZX_NUM_LENS; i++) {
1859                         costs->match_cost[offset_slot][i] =
1860                                 costs->len[i - LZX_NUM_PRIMARY_LENS] +
1861                                 extra_cost;
1862                 }
1863         }
1864 }
1865
1866 /*
1867  * Fast approximation for log2f(x).  This is not as accurate as the standard C
1868  * version.  It does not need to be perfectly accurate because it is only used
1869  * for estimating symbol costs, which is very approximate anyway.
1870  */
1871 static float
1872 log2f_fast(float x)
1873 {
1874         union {
1875                 float f;
1876                 s32 i;
1877         } u = { .f = x };
1878
1879         /* Extract the exponent and subtract 127 to remove the bias.  This gives
1880          * the integer part of the result. */
1881         float res = ((u.i >> 23) & 0xFF) - 127;
1882
1883         /* Set the exponent to 0 (plus bias of 127).  This transforms the number
1884          * to the range [1, 2) while retaining the same mantissa. */
1885         u.i = (u.i & ~(0xFF << 23)) | (127 << 23);
1886
1887         /*
1888          * Approximate the log2 of the transformed number using a degree 2
1889          * interpolating polynomial for log2(x) over the interval [1, 2).  Then
1890          * add this to the extracted exponent to produce the final approximation
1891          * of log2(x).
1892          *
1893          * The coefficients of the interpolating polynomial used here were found
1894          * using the script tools/log2_interpolation.r.
1895          */
1896         return res - 1.653124006f + u.f * (1.9941812f - u.f * 0.3347490189f);
1897
1898 }
1899
1900 /*
1901  * Return the estimated cost of a symbol which has been estimated to have the
1902  * given probability.
1903  */
1904 static u32
1905 lzx_cost_for_probability(float prob)
1906 {
1907         /*
1908          * The basic formula is:
1909          *
1910          *      entropy = -log2(probability)
1911          *
1912          * Use this to get the cost in fractional bits.  Then multiply by our
1913          * scaling factor of BIT_COST and truncate to a u32.
1914          *
1915          * In addition, the minimum cost is BIT_COST (one bit) because the
1916          * entropy coding method will be Huffman codes.
1917          */
1918         u32 cost = -log2f_fast(prob) * BIT_COST;
1919         return max(cost, BIT_COST);
1920 }
1921
1922 /*
1923  * Mapping: number of used literals => heuristic probability of a literal times
1924  * 6870.  Generated by running this R command:
1925  *
1926  *      cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
1927  */
1928 static const u8 literal_scaled_probs[257] = {
1929         255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
1930         219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
1931         189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
1932         162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
1933         139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
1934         120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
1935         103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
1936         86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
1937         70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
1938         58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
1939         48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
1940         39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
1941         32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
1942         27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
1943         22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
1944         18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
1945 };
1946
1947 /*
1948  * Mapping: length symbol => default cost of that symbol.  This is derived from
1949  * sample data but has been slightly edited to add more bias towards the
1950  * shortest lengths, which are the most common.
1951  */
1952 static const u16 lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
1953         300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
1954         503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
1955         659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
1956         666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
1957         706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
1958         728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
1959         738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
1960         747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
1961         753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
1962         754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
1963         758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
1964         750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
1965         757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
1966         762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
1967         763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
1968         762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
1969         764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
1970         762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
1971 };
1972
1973 /* Set default costs to bootstrap the iterative optimization algorithm. */
1974 static void
1975 lzx_set_default_costs(struct lzx_compressor *c)
1976 {
1977         unsigned i;
1978         u32 num_literals = 0;
1979         u32 num_used_literals = 0;
1980         float inv_num_matches = 1.0f / c->freqs.main[LZX_NUM_CHARS];
1981         float inv_num_items;
1982         float prob_match = 1.0f;
1983         u32 match_cost;
1984         float base_literal_prob;
1985
1986         /* Some numbers here have been hardcoded to assume a bit cost of 64. */
1987         STATIC_ASSERT(BIT_COST == 64);
1988
1989         /* Estimate the number of literals that will used.  'num_literals' is
1990          * the total number, whereas 'num_used_literals' is the number of
1991          * distinct symbols. */
1992         for (i = 0; i < LZX_NUM_CHARS; i++) {
1993                 num_literals += c->freqs.main[i];
1994                 num_used_literals += (c->freqs.main[i] != 0);
1995         }
1996
1997         /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'.  We
1998          * don't attempt to estimate which ones will be used. */
1999
2000         inv_num_items = 1.0f / (num_literals + c->freqs.main[LZX_NUM_CHARS]);
2001         base_literal_prob = literal_scaled_probs[num_used_literals] *
2002                             (1.0f / 6870.0f);
2003
2004         /* Literal costs.  We use two different methods to compute the
2005          * probability of each literal and mix together their results. */
2006         for (i = 0; i < LZX_NUM_CHARS; i++) {
2007                 u32 freq = c->freqs.main[i];
2008                 if (freq != 0) {
2009                         float prob = 0.5f * ((freq * inv_num_items) +
2010                                              base_literal_prob);
2011                         c->costs.main[i] = lzx_cost_for_probability(prob);
2012                         prob_match -= prob;
2013                 } else {
2014                         c->costs.main[i] = 11 * BIT_COST;
2015                 }
2016         }
2017
2018         /* Match header costs.  We just assume that all match headers are
2019          * equally probable, but we do take into account the relative cost of a
2020          * match header vs. a literal depending on how common matches are
2021          * expected to be vs. literals. */
2022         prob_match = max(prob_match, 0.15f);
2023         match_cost = lzx_cost_for_probability(prob_match / (c->num_main_syms -
2024                                                             LZX_NUM_CHARS));
2025         for (; i < c->num_main_syms; i++)
2026                 c->costs.main[i] = match_cost;
2027
2028         /* Length symbol costs.  These are just set to fixed values which
2029          * reflect the fact the smallest lengths are typically the most common,
2030          * and therefore are typically the cheapest. */
2031         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
2032                 c->costs.len[i] = lzx_default_len_costs[i];
2033
2034 #if CONSIDER_ALIGNED_COSTS
2035         /* Aligned offset symbol costs.  These are derived from the estimated
2036          * probability of each aligned offset symbol. */
2037         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2038                 /* We intentionally tallied the frequencies in the wrong slots,
2039                  * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the
2040                  * fixup here is faster: a constant 8 subtractions here vs. one
2041                  * addition for every match. */
2042                 unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK;
2043                 if (c->freqs.aligned[j] != 0) {
2044                         float prob = c->freqs.aligned[j] * inv_num_matches;
2045                         c->costs.aligned[i] = lzx_cost_for_probability(prob);
2046                 } else {
2047                         c->costs.aligned[i] =
2048                                 (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST;
2049                 }
2050         }
2051 #endif
2052 }
2053
2054 /* Update the current cost model to reflect the computed Huffman codes.  */
2055 static void
2056 lzx_set_costs_from_codes(struct lzx_compressor *c)
2057 {
2058         unsigned i;
2059         const struct lzx_lens *lens = &c->codes[c->codes_index].lens;
2060
2061         for (i = 0; i < c->num_main_syms; i++) {
2062                 c->costs.main[i] = (lens->main[i] ? lens->main[i] :
2063                                     MAIN_CODEWORD_LIMIT) * BIT_COST;
2064         }
2065
2066         for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) {
2067                 c->costs.len[i] = (lens->len[i] ? lens->len[i] :
2068                                    LENGTH_CODEWORD_LIMIT) * BIT_COST;
2069         }
2070
2071 #if CONSIDER_ALIGNED_COSTS
2072         for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
2073                 c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] :
2074                                        ALIGNED_CODEWORD_LIMIT) * BIT_COST;
2075         }
2076 #endif
2077 }
2078
2079 /*
2080  * Choose a "near-optimal" literal/match sequence to use for the current block,
2081  * then flush the block.  Because the cost of each Huffman symbol is unknown
2082  * until the Huffman codes have been built and the Huffman codes themselves
2083  * depend on the symbol frequencies, this uses an iterative optimization
2084  * algorithm to approximate an optimal solution.  The first optimization pass
2085  * for the block uses default costs; additional passes use costs derived from
2086  * the Huffman codes computed in the previous pass.
2087  */
2088 static inline struct lzx_lru_queue
2089 lzx_optimize_and_flush_block(struct lzx_compressor * const restrict c,
2090                              struct lzx_output_bitstream * const restrict os,
2091                              const u8 * const restrict block_begin,
2092                              const u32 block_size,
2093                              const struct lzx_lru_queue initial_queue,
2094                              bool is_16_bit)
2095 {
2096         unsigned num_passes_remaining = c->num_optim_passes;
2097         struct lzx_lru_queue new_queue;
2098         u32 seq_idx;
2099
2100         lzx_set_default_costs(c);
2101
2102         for (;;) {
2103                 lzx_compute_match_costs(c);
2104                 new_queue = lzx_find_min_cost_path(c, block_begin, block_size,
2105                                                    initial_queue, is_16_bit);
2106
2107                 if (--num_passes_remaining == 0)
2108                         break;
2109
2110                 /* At least one optimization pass remains.  Update the costs. */
2111                 lzx_reset_symbol_frequencies(c);
2112                 lzx_tally_item_list(c, block_size, is_16_bit);
2113                 lzx_build_huffman_codes(c);
2114                 lzx_set_costs_from_codes(c);
2115         }
2116
2117         /* Done optimizing.  Generate the sequence list and flush the block. */
2118         lzx_reset_symbol_frequencies(c);
2119         seq_idx = lzx_record_item_list(c, block_size, is_16_bit);
2120         lzx_flush_block(c, os, block_begin, block_size, seq_idx);
2121         return new_queue;
2122 }
2123
2124 /*
2125  * This is the "near-optimal" LZX compressor.
2126  *
2127  * For each block, it performs a relatively thorough graph search to find an
2128  * inexpensive (in terms of compressed size) way to output the block.
2129  *
2130  * Note: there are actually many things this algorithm leaves on the table in
2131  * terms of compression ratio.  So although it may be "near-optimal", it is
2132  * certainly not "optimal".  The goal is not to produce the optimal compression
2133  * ratio, which for LZX is probably impossible within any practical amount of
2134  * time, but rather to produce a compression ratio significantly better than a
2135  * simpler "greedy" or "lazy" parse while still being relatively fast.
2136  */
2137 static inline void
2138 lzx_compress_near_optimal(struct lzx_compressor * restrict c,
2139                           const u8 * const restrict in_begin, size_t in_nbytes,
2140                           struct lzx_output_bitstream * restrict os,
2141                           bool is_16_bit)
2142 {
2143         const u8 *       in_next = in_begin;
2144         const u8 * const in_end  = in_begin + in_nbytes;
2145         u32 max_len = LZX_MAX_MATCH_LEN;
2146         u32 nice_len = min(c->nice_match_length, max_len);
2147         u32 next_hashes[2] = {0, 0};
2148         struct lzx_lru_queue queue = LZX_QUEUE_INITIALIZER;
2149
2150         /* Initialize the matchfinder. */
2151         CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
2152
2153         do {
2154                 /* Starting a new block */
2155
2156                 const u8 * const in_block_begin = in_next;
2157                 const u8 * const in_max_block_end =
2158                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2159                 struct lz_match *cache_ptr = c->match_cache;
2160                 const u8 *next_search_pos = in_next;
2161                 const u8 *next_observation = in_next;
2162                 const u8 *next_pause_point =
2163                         min(in_next + min(MIN_BLOCK_SIZE,
2164                                           in_max_block_end - in_next),
2165                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2166                                                    in_max_block_end - in_next));
2167
2168                 lzx_init_block_split_stats(&c->split_stats);
2169                 lzx_reset_symbol_frequencies(c);
2170
2171                 if (in_next >= next_pause_point)
2172                         goto pause;
2173
2174                 /*
2175                  * Run the input buffer through the matchfinder, caching the
2176                  * matches, until we decide to end the block.
2177                  *
2178                  * For a tighter matchfinding loop, we compute a "pause point",
2179                  * which is the next position at which we may need to check
2180                  * whether to end the block or to decrease max_len.  We then
2181                  * only do these extra checks upon reaching the pause point.
2182                  */
2183         resume_matchfinding:
2184                 do {
2185                         if (in_next >= next_search_pos) {
2186                                 /* Search for matches at this position. */
2187                                 struct lz_match *lz_matchptr;
2188                                 u32 best_len;
2189
2190                                 lz_matchptr = CALL_BT_MF(is_16_bit, c,
2191                                                          bt_matchfinder_get_matches,
2192                                                          in_begin,
2193                                                          in_next - in_begin,
2194                                                          max_len,
2195                                                          nice_len,
2196                                                          c->max_search_depth,
2197                                                          next_hashes,
2198                                                          &best_len,
2199                                                          cache_ptr + 1);
2200                                 cache_ptr->length = lz_matchptr - (cache_ptr + 1);
2201                                 cache_ptr = lz_matchptr;
2202
2203                                 /* Accumulate literal/match statistics for block
2204                                  * splitting and for generating the initial cost
2205                                  * model. */
2206                                 if (in_next >= next_observation) {
2207                                         best_len = cache_ptr[-1].length;
2208                                         if (best_len >= 3) {
2209                                                 /* Match (len >= 3) */
2210
2211                                                 /*
2212                                                  * Note: for performance reasons this has
2213                                                  * been simplified significantly:
2214                                                  *
2215                                                  * - We wait until later to account for
2216                                                  *   LZX_OFFSET_ADJUSTMENT.
2217                                                  * - We don't account for repeat offsets.
2218                                                  * - We don't account for different match headers.
2219                                                  */
2220                                                 c->freqs.aligned[cache_ptr[-1].offset &
2221                                                         LZX_ALIGNED_OFFSET_BITMASK]++;
2222                                                 c->freqs.main[LZX_NUM_CHARS]++;
2223
2224                                                 lzx_observe_match(&c->split_stats, best_len);
2225                                                 next_observation = in_next + best_len;
2226                                         } else {
2227                                                 /* Literal */
2228                                                 c->freqs.main[*in_next]++;
2229                                                 lzx_observe_literal(&c->split_stats, *in_next);
2230                                                 next_observation = in_next + 1;
2231                                         }
2232                                 }
2233
2234                                 /*
2235                                  * If there was a very long match found, then
2236                                  * don't cache any matches for the bytes covered
2237                                  * by that match.  This avoids degenerate
2238                                  * behavior when compressing highly redundant
2239                                  * data, where the number of matches can be very
2240                                  * large.
2241                                  *
2242                                  * This heuristic doesn't actually hurt the
2243                                  * compression ratio *too* much.  If there's a
2244                                  * long match, then the data must be highly
2245                                  * compressible, so it doesn't matter as much
2246                                  * what we do.
2247                                  */
2248                                 if (best_len >= nice_len)
2249                                         next_search_pos = in_next + best_len;
2250                         } else {
2251                                 /* Don't search for matches at this position. */
2252                                 CALL_BT_MF(is_16_bit, c,
2253                                            bt_matchfinder_skip_position,
2254                                            in_begin,
2255                                            in_next - in_begin,
2256                                            nice_len,
2257                                            c->max_search_depth,
2258                                            next_hashes);
2259                                 cache_ptr->length = 0;
2260                                 cache_ptr++;
2261                         }
2262                 } while (++in_next < next_pause_point &&
2263                          likely(cache_ptr < &c->match_cache[CACHE_LENGTH]));
2264
2265         pause:
2266
2267                 /* Adjust max_len and nice_len if we're nearing the end of the
2268                  * input buffer.  In addition, if we are so close to the end of
2269                  * the input buffer that there cannot be any more matches, then
2270                  * just advance through the last few positions and record no
2271                  * matches. */
2272                 if (unlikely(max_len > in_end - in_next)) {
2273                         max_len = in_end - in_next;
2274                         nice_len = min(max_len, nice_len);
2275                         if (max_len < BT_MATCHFINDER_REQUIRED_NBYTES) {
2276                                 while (in_next != in_end) {
2277                                         cache_ptr->length = 0;
2278                                         cache_ptr++;
2279                                         in_next++;
2280                                 }
2281                         }
2282                 }
2283
2284                 /* End the block if the match cache may overflow. */
2285                 if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH]))
2286                         goto end_block;
2287
2288                 /* End the block if the soft maximum size has been reached. */
2289                 if (in_next >= in_max_block_end)
2290                         goto end_block;
2291
2292                 /* End the block if the block splitting algorithm thinks this is
2293                  * a good place to do so. */
2294                 if (c->split_stats.num_new_observations >=
2295                                 NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2296                     in_max_block_end - in_next >= MIN_BLOCK_SIZE &&
2297                     lzx_should_end_block(&c->split_stats))
2298                         goto end_block;
2299
2300                 /* It's not time to end the block yet.  Compute the next pause
2301                  * point and resume matchfinding. */
2302                 next_pause_point =
2303                         min(in_next + min(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
2304                                             c->split_stats.num_new_observations,
2305                                           in_max_block_end - in_next),
2306                             in_max_block_end - min(LZX_MAX_MATCH_LEN - 1,
2307                                                    in_max_block_end - in_next));
2308                 goto resume_matchfinding;
2309
2310         end_block:
2311                 /* We've decided on a block boundary and cached matches.  Now
2312                  * choose a match/literal sequence and flush the block. */
2313                 queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
2314                                                      in_next - in_block_begin,
2315                                                      queue, is_16_bit);
2316         } while (in_next != in_end);
2317 }
2318
2319 static void
2320 lzx_compress_near_optimal_16(struct lzx_compressor *c, const u8 *in,
2321                              size_t in_nbytes, struct lzx_output_bitstream *os)
2322 {
2323         lzx_compress_near_optimal(c, in, in_nbytes, os, true);
2324 }
2325
2326 static void
2327 lzx_compress_near_optimal_32(struct lzx_compressor *c, const u8 *in,
2328                              size_t in_nbytes, struct lzx_output_bitstream *os)
2329 {
2330         lzx_compress_near_optimal(c, in, in_nbytes, os, false);
2331 }
2332
2333 /******************************************************************************/
2334 /*                     Faster ("lazy") compression algorithm                  */
2335 /*----------------------------------------------------------------------------*/
2336
2337 /*
2338  * Called when the compressor chooses to use a literal.  This tallies the
2339  * Huffman symbol for the literal, increments the current literal run length,
2340  * and "observes" the literal for the block split statistics.
2341  */
2342 static inline void
2343 lzx_choose_literal(struct lzx_compressor *c, unsigned literal, u32 *litrunlen_p)
2344 {
2345         lzx_observe_literal(&c->split_stats, literal);
2346         c->freqs.main[literal]++;
2347         ++*litrunlen_p;
2348 }
2349
2350 /*
2351  * Called when the compressor chooses to use a match.  This tallies the Huffman
2352  * symbol(s) for a match, saves the match data and the length of the preceding
2353  * literal run, updates the recent offsets queue, and "observes" the match for
2354  * the block split statistics.
2355  */
2356 static inline void
2357 lzx_choose_match(struct lzx_compressor *c, unsigned length, u32 adjusted_offset,
2358                  u32 recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
2359                  u32 *litrunlen_p, struct lzx_sequence **next_seq_p)
2360 {
2361         u32 litrunlen = *litrunlen_p;
2362         struct lzx_sequence *next_seq = *next_seq_p;
2363         unsigned offset_slot;
2364         unsigned v;
2365
2366         lzx_observe_match(&c->split_stats, length);
2367
2368         v = length - LZX_MIN_MATCH_LEN;
2369
2370         /* Save the literal run length and adjusted length. */
2371         next_seq->litrunlen = litrunlen;
2372         next_seq->adjusted_length = v;
2373
2374         /* Compute the length header, then tally the length symbol if needed. */
2375         if (v >= LZX_NUM_PRIMARY_LENS) {
2376                 c->freqs.len[v - LZX_NUM_PRIMARY_LENS]++;
2377                 v = LZX_NUM_PRIMARY_LENS;
2378         }
2379
2380         /* Compute the offset slot. */
2381         offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit);
2382
2383         /* Compute the match header. */
2384         v += offset_slot * LZX_NUM_LEN_HEADERS;
2385
2386         /* Save the adjusted offset and match header. */
2387         next_seq->adjusted_offset_and_match_hdr = (adjusted_offset << 9) | v;
2388
2389         /* Tally the main symbol. */
2390         c->freqs.main[LZX_NUM_CHARS + v]++;
2391
2392         /* Update the recent offsets queue. */
2393         if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
2394                 /* Repeat offset match. */
2395                 swap(recent_offsets[0], recent_offsets[adjusted_offset]);
2396         } else {
2397                 /* Explicit offset match. */
2398
2399                 /* Tally the aligned offset symbol if needed. */
2400                 if (adjusted_offset >= 16)
2401                         c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
2402
2403                 recent_offsets[2] = recent_offsets[1];
2404                 recent_offsets[1] = recent_offsets[0];
2405                 recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
2406         }
2407
2408         /* Reset the literal run length and advance to the next sequence. */
2409         *next_seq_p = next_seq + 1;
2410         *litrunlen_p = 0;
2411 }
2412
2413 /*
2414  * Called when the compressor ends a block.  This finshes the last lzx_sequence,
2415  * which is just a literal run with no following match.  This literal run might
2416  * be empty.
2417  */
2418 static inline void
2419 lzx_finish_sequence(struct lzx_sequence *last_seq, u32 litrunlen)
2420 {
2421         last_seq->litrunlen = litrunlen;
2422
2423         /* Special value to mark last sequence */
2424         last_seq->adjusted_offset_and_match_hdr = 0x80000000;
2425 }
2426
2427 /*
2428  * Find the longest repeat offset match with the current position.  If a match
2429  * is found, return its length and set *best_rep_idx_ret to the index of its
2430  * offset in @recent_offsets.  Otherwise, return 0.
2431  *
2432  * Don't bother with length 2 matches; consider matches of length >= 3 only.
2433  * Also assume that max_len >= 3.
2434  */
2435 static unsigned
2436 lzx_find_longest_repeat_offset_match(const u8 * const in_next,
2437                                      const u32 recent_offsets[],
2438                                      const unsigned max_len,
2439                                      unsigned *best_rep_idx_ret)
2440 {
2441         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
2442
2443         const u32 seq3 = load_u24_unaligned(in_next);
2444         const u8 *matchptr;
2445         unsigned best_rep_len = 0;
2446         unsigned best_rep_idx = 0;
2447         unsigned rep_len;
2448
2449         /* Check for rep0 match (most recent offset) */
2450         matchptr = in_next - recent_offsets[0];
2451         if (load_u24_unaligned(matchptr) == seq3)
2452                 best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
2453
2454         /* Check for rep1 match (second most recent offset) */
2455         matchptr = in_next - recent_offsets[1];
2456         if (load_u24_unaligned(matchptr) == seq3) {
2457                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2458                 if (rep_len > best_rep_len) {
2459                         best_rep_len = rep_len;
2460                         best_rep_idx = 1;
2461                 }
2462         }
2463
2464         /* Check for rep2 match (third most recent offset) */
2465         matchptr = in_next - recent_offsets[2];
2466         if (load_u24_unaligned(matchptr) == seq3) {
2467                 rep_len = lz_extend(in_next, matchptr, 3, max_len);
2468                 if (rep_len > best_rep_len) {
2469                         best_rep_len = rep_len;
2470                         best_rep_idx = 2;
2471                 }
2472         }
2473
2474         *best_rep_idx_ret = best_rep_idx;
2475         return best_rep_len;
2476 }
2477
2478 /*
2479  * Fast heuristic scoring for lazy parsing: how "good" is this match?
2480  * This is mainly determined by the length: longer matches are better.
2481  * However, we also give a bonus to close (small offset) matches and to repeat
2482  * offset matches, since those require fewer bits to encode.
2483  */
2484
2485 static inline unsigned
2486 lzx_explicit_offset_match_score(unsigned len, u32 adjusted_offset)
2487 {
2488         unsigned score = len;
2489
2490         if (adjusted_offset < 4096)
2491                 score++;
2492         if (adjusted_offset < 256)
2493                 score++;
2494
2495         return score;
2496 }
2497
2498 static inline unsigned
2499 lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
2500 {
2501         return rep_len + 3;
2502 }
2503
2504 /*
2505  * This is the "lazy" LZX compressor.  The basic idea is that before it chooses
2506  * a match, it checks to see if there's a longer match at the next position.  If
2507  * yes, it chooses a literal and continues to the next position.  If no, it
2508  * chooses the match.
2509  *
2510  * Some additional heuristics are used as well.  Repeat offset matches are
2511  * considered favorably and sometimes are chosen immediately.  In addition, long
2512  * matches (at least "nice_len" bytes) are chosen immediately as well.  Finally,
2513  * when we decide whether a match is "better" than another, we take the offset
2514  * into consideration as well as the length.
2515  */
2516 static inline void
2517 lzx_compress_lazy(struct lzx_compressor * restrict c,
2518                   const u8 * const restrict in_begin, size_t in_nbytes,
2519                   struct lzx_output_bitstream * restrict os, bool is_16_bit)
2520 {
2521         const u8 *       in_next = in_begin;
2522         const u8 * const in_end  = in_begin + in_nbytes;
2523         unsigned max_len = LZX_MAX_MATCH_LEN;
2524         unsigned nice_len = min(c->nice_match_length, max_len);
2525         STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3);
2526         u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
2527         u32 next_hashes[2] = {0, 0};
2528
2529         /* Initialize the matchfinder. */
2530         CALL_HC_MF(is_16_bit, c, hc_matchfinder_init);
2531
2532         do {
2533                 /* Starting a new block */
2534
2535                 const u8 * const in_block_begin = in_next;
2536                 const u8 * const in_max_block_end =
2537                         in_next + min(SOFT_MAX_BLOCK_SIZE, in_end - in_next);
2538                 struct lzx_sequence *next_seq = c->chosen_sequences;
2539                 u32 litrunlen = 0;
2540                 unsigned cur_len;
2541                 u32 cur_offset;
2542                 u32 cur_adjusted_offset;
2543                 unsigned cur_score;
2544                 unsigned next_len;
2545                 u32 next_offset;
2546                 u32 next_adjusted_offset;
2547                 unsigned next_score;
2548                 unsigned best_rep_len;
2549                 unsigned best_rep_idx;
2550                 unsigned rep_score;
2551                 unsigned skip_len;
2552
2553                 lzx_reset_symbol_frequencies(c);
2554                 lzx_init_block_split_stats(&c->split_stats);
2555
2556                 do {
2557                         /* Adjust max_len and nice_len if we're nearing the end
2558                          * of the input buffer. */
2559                         if (unlikely(max_len > in_end - in_next)) {
2560                                 max_len = in_end - in_next;
2561                                 nice_len = min(max_len, nice_len);
2562                         }
2563
2564                         /* Find the longest match (subject to the
2565                          * max_search_depth cutoff parameter) with the current
2566                          * position.  Don't bother with length 2 matches; only
2567                          * look for matches of length >= 3. */
2568                         cur_len = CALL_HC_MF(is_16_bit, c,
2569                                              hc_matchfinder_longest_match,
2570                                              in_begin,
2571                                              in_next - in_begin,
2572                                              2,
2573                                              max_len,
2574                                              nice_len,
2575                                              c->max_search_depth,
2576                                              next_hashes,
2577                                              &cur_offset);
2578
2579                         /* If there was no match found, or the only match found
2580                          * was a distant short match, then choose a literal. */
2581                         if (cur_len < 3 ||
2582                             (cur_len == 3 &&
2583                              cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT &&
2584                              cur_offset != recent_offsets[0] &&
2585                              cur_offset != recent_offsets[1] &&
2586                              cur_offset != recent_offsets[2]))
2587                         {
2588                                 lzx_choose_literal(c, *in_next, &litrunlen);
2589                                 in_next++;
2590                                 continue;
2591                         }
2592
2593                         /* Heuristic: if this match has the most recent offset,
2594                          * then go ahead and choose it as a rep0 match. */
2595                         if (cur_offset == recent_offsets[0]) {
2596                                 in_next++;
2597                                 skip_len = cur_len - 1;
2598                                 cur_adjusted_offset = 0;
2599                                 goto choose_cur_match;
2600                         }
2601
2602                         /* Compute the longest match's score as an explicit
2603                          * offset match. */
2604                         cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT;
2605                         cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset);
2606
2607                         /* Find the longest repeat offset match at this
2608                          * position.  If we find one and it's "better" than the
2609                          * explicit offset match we found, then go ahead and
2610                          * choose the repeat offset match immediately. */
2611                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2612                                                                             recent_offsets,
2613                                                                             max_len,
2614                                                                             &best_rep_idx);
2615                         in_next++;
2616
2617                         if (best_rep_len != 0 &&
2618                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2619                                                                        best_rep_idx)) >= cur_score)
2620                         {
2621                                 cur_len = best_rep_len;
2622                                 cur_adjusted_offset = best_rep_idx;
2623                                 skip_len = best_rep_len - 1;
2624                                 goto choose_cur_match;
2625                         }
2626
2627                 have_cur_match:
2628                         /*
2629                          * We have a match at the current position.  If the
2630                          * match is very long, then choose it immediately.
2631                          * Otherwise, see if there's a better match at the next
2632                          * position.
2633                          */
2634
2635                         if (cur_len >= nice_len) {
2636                                 skip_len = cur_len - 1;
2637                                 goto choose_cur_match;
2638                         }
2639
2640                         if (unlikely(max_len > in_end - in_next)) {
2641                                 max_len = in_end - in_next;
2642                                 nice_len = min(max_len, nice_len);
2643                         }
2644
2645                         next_len = CALL_HC_MF(is_16_bit, c,
2646                                               hc_matchfinder_longest_match,
2647                                               in_begin,
2648                                               in_next - in_begin,
2649                                               cur_len - 2,
2650                                               max_len,
2651                                               nice_len,
2652                                               c->max_search_depth / 2,
2653                                               next_hashes,
2654                                               &next_offset);
2655
2656                         if (next_len <= cur_len - 2) {
2657                                 /* No potentially better match was found. */
2658                                 in_next++;
2659                                 skip_len = cur_len - 2;
2660                                 goto choose_cur_match;
2661                         }
2662
2663                         next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT;
2664                         next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset);
2665
2666                         best_rep_len = lzx_find_longest_repeat_offset_match(in_next,
2667                                                                             recent_offsets,
2668                                                                             max_len,
2669                                                                             &best_rep_idx);
2670                         in_next++;
2671
2672                         if (best_rep_len != 0 &&
2673                             (rep_score = lzx_repeat_offset_match_score(best_rep_len,
2674                                                                        best_rep_idx)) >= next_score)
2675                         {
2676
2677                                 if (rep_score > cur_score) {
2678                                         /* The next match is better, and it's a
2679                                          * repeat offset match. */
2680                                         lzx_choose_literal(c, *(in_next - 2),
2681                                                            &litrunlen);
2682                                         cur_len = best_rep_len;
2683                                         cur_adjusted_offset = best_rep_idx;
2684                                         skip_len = cur_len - 1;
2685                                         goto choose_cur_match;
2686                                 }
2687                         } else {
2688                                 if (next_score > cur_score) {
2689                                         /* The next match is better, and it's an
2690                                          * explicit offset match. */
2691                                         lzx_choose_literal(c, *(in_next - 2),
2692                                                            &litrunlen);
2693                                         cur_len = next_len;
2694                                         cur_adjusted_offset = next_adjusted_offset;
2695                                         cur_score = next_score;
2696                                         goto have_cur_match;
2697                                 }
2698                         }
2699
2700                         /* The original match was better; choose it. */
2701                         skip_len = cur_len - 2;
2702
2703                 choose_cur_match:
2704                         /* Choose a match and have the matchfinder skip over its
2705                          * remaining bytes. */
2706                         lzx_choose_match(c, cur_len, cur_adjusted_offset,
2707                                          recent_offsets, is_16_bit,
2708                                          &litrunlen, &next_seq);
2709                         in_next = CALL_HC_MF(is_16_bit, c,
2710                                              hc_matchfinder_skip_positions,
2711                                              in_begin,
2712                                              in_next - in_begin,
2713                                              in_end - in_begin,
2714                                              skip_len,
2715                                              next_hashes);
2716
2717                         /* Keep going until it's time to end the block. */
2718                 } while (in_next < in_max_block_end &&
2719                          !(c->split_stats.num_new_observations >=
2720                                         NUM_OBSERVATIONS_PER_BLOCK_CHECK &&
2721                            in_next - in_block_begin >= MIN_BLOCK_SIZE &&
2722                            in_end - in_next >= MIN_BLOCK_SIZE &&
2723                            lzx_should_end_block(&c->split_stats)));
2724
2725                 /* Flush the block. */
2726                 lzx_finish_sequence(next_seq, litrunlen);
2727                 lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0);
2728
2729                 /* Keep going until we've reached the end of the input buffer. */
2730         } while (in_next != in_end);
2731 }
2732
2733 static void
2734 lzx_compress_lazy_16(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2735                      struct lzx_output_bitstream *os)
2736 {
2737         lzx_compress_lazy(c, in, in_nbytes, os, true);
2738 }
2739
2740 static void
2741 lzx_compress_lazy_32(struct lzx_compressor *c, const u8 *in, size_t in_nbytes,
2742                      struct lzx_output_bitstream *os)
2743 {
2744         lzx_compress_lazy(c, in, in_nbytes, os, false);
2745 }
2746
2747 /******************************************************************************/
2748 /*                          Compressor operations                             */
2749 /*----------------------------------------------------------------------------*/
2750
2751 /*
2752  * Generate tables for mapping match offsets (actually, "adjusted" match
2753  * offsets) to offset slots.
2754  */
2755 static void
2756 lzx_init_offset_slot_tabs(struct lzx_compressor *c)
2757 {
2758         u32 adjusted_offset = 0;
2759         unsigned slot = 0;
2760
2761         /* slots [0, 29] */
2762         for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
2763              adjusted_offset++)
2764         {
2765                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
2766                         slot++;
2767                 c->offset_slot_tab_1[adjusted_offset] = slot;
2768         }
2769
2770         /* slots [30, 49] */
2771         for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
2772              adjusted_offset += (u32)1 << 14)
2773         {
2774                 if (adjusted_offset >= lzx_offset_slot_base[slot + 1])
2775                         slot++;
2776                 c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
2777         }
2778 }
2779
2780 static size_t
2781 lzx_get_compressor_size(size_t max_bufsize, unsigned compression_level)
2782 {
2783         if (compression_level <= MAX_FAST_LEVEL) {
2784                 if (lzx_is_16_bit(max_bufsize))
2785                         return offsetof(struct lzx_compressor, hc_mf_16) +
2786                                hc_matchfinder_size_16(max_bufsize);
2787                 else
2788                         return offsetof(struct lzx_compressor, hc_mf_32) +
2789                                hc_matchfinder_size_32(max_bufsize);
2790         } else {
2791                 if (lzx_is_16_bit(max_bufsize))
2792                         return offsetof(struct lzx_compressor, bt_mf_16) +
2793                                bt_matchfinder_size_16(max_bufsize);
2794                 else
2795                         return offsetof(struct lzx_compressor, bt_mf_32) +
2796                                bt_matchfinder_size_32(max_bufsize);
2797         }
2798 }
2799
2800 /* Compute the amount of memory needed to allocate an LZX compressor. */
2801 static u64
2802 lzx_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2803                       bool destructive)
2804 {
2805         u64 size = 0;
2806
2807         if (max_bufsize > LZX_MAX_WINDOW_SIZE)
2808                 return 0;
2809
2810         size += lzx_get_compressor_size(max_bufsize, compression_level);
2811         if (!destructive)
2812                 size += max_bufsize; /* account for in_buffer */
2813         return size;
2814 }
2815
2816 /* Allocate an LZX compressor. */
2817 static int
2818 lzx_create_compressor(size_t max_bufsize, unsigned compression_level,
2819                       bool destructive, void **c_ret)
2820 {
2821         unsigned window_order;
2822         struct lzx_compressor *c;
2823
2824         /* Validate the maximum buffer size and get the window order from it. */
2825         window_order = lzx_get_window_order(max_bufsize);
2826         if (window_order == 0)
2827                 return WIMLIB_ERR_INVALID_PARAM;
2828
2829         /* Allocate the compressor. */
2830         c = MALLOC(lzx_get_compressor_size(max_bufsize, compression_level));
2831         if (!c)
2832                 goto oom0;
2833
2834         c->window_order = window_order;
2835         c->num_main_syms = lzx_get_num_main_syms(window_order);
2836         c->destructive = destructive;
2837
2838         /* Allocate the buffer for preprocessed data if needed. */
2839         if (!c->destructive) {
2840                 c->in_buffer = MALLOC(max_bufsize);
2841                 if (!c->in_buffer)
2842                         goto oom1;
2843         }
2844
2845         if (compression_level <= MAX_FAST_LEVEL) {
2846
2847                 /* Fast compression: Use lazy parsing. */
2848                 if (lzx_is_16_bit(max_bufsize))
2849                         c->impl = lzx_compress_lazy_16;
2850                 else
2851                         c->impl = lzx_compress_lazy_32;
2852
2853                 /* Scale max_search_depth and nice_match_length with the
2854                  * compression level. */
2855                 c->max_search_depth = (60 * compression_level) / 20;
2856                 c->nice_match_length = (80 * compression_level) / 20;
2857
2858                 /* lzx_compress_lazy() needs max_search_depth >= 2 because it
2859                  * halves the max_search_depth when attempting a lazy match, and
2860                  * max_search_depth must be at least 1. */
2861                 c->max_search_depth = max(c->max_search_depth, 2);
2862         } else {
2863
2864                 /* Normal / high compression: Use near-optimal parsing. */
2865                 if (lzx_is_16_bit(max_bufsize))
2866                         c->impl = lzx_compress_near_optimal_16;
2867                 else
2868                         c->impl = lzx_compress_near_optimal_32;
2869
2870                 /* Scale max_search_depth and nice_match_length with the
2871                  * compression level. */
2872                 c->max_search_depth = (24 * compression_level) / 50;
2873                 c->nice_match_length = (48 * compression_level) / 50;
2874
2875                 /* Also scale num_optim_passes with the compression level.  But
2876                  * the more passes there are, the less they help --- so don't
2877                  * add them linearly.  */
2878                 c->num_optim_passes = 1;
2879                 c->num_optim_passes += (compression_level >= 45);
2880                 c->num_optim_passes += (compression_level >= 70);
2881                 c->num_optim_passes += (compression_level >= 100);
2882                 c->num_optim_passes += (compression_level >= 150);
2883                 c->num_optim_passes += (compression_level >= 200);
2884                 c->num_optim_passes += (compression_level >= 300);
2885
2886                 /* max_search_depth must be at least 1. */
2887                 c->max_search_depth = max(c->max_search_depth, 1);
2888         }
2889
2890         /* Prepare the offset => offset slot mapping. */
2891         lzx_init_offset_slot_tabs(c);
2892
2893         *c_ret = c;
2894         return 0;
2895
2896 oom1:
2897         FREE(c);
2898 oom0:
2899         return WIMLIB_ERR_NOMEM;
2900 }
2901
2902 /* Compress a buffer of data. */
2903 static size_t
2904 lzx_compress(const void *restrict in, size_t in_nbytes,
2905              void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2906 {
2907         struct lzx_compressor *c = _c;
2908         struct lzx_output_bitstream os;
2909         size_t result;
2910
2911         /* Don't bother trying to compress very small inputs. */
2912         if (in_nbytes < 64)
2913                 return 0;
2914
2915         /* If the compressor is in "destructive" mode, then we can directly
2916          * preprocess the input data.  Otherwise, we need to copy it into an
2917          * internal buffer first. */
2918         if (!c->destructive) {
2919                 memcpy(c->in_buffer, in, in_nbytes);
2920                 in = c->in_buffer;
2921         }
2922
2923         /* Preprocess the input data. */
2924         lzx_preprocess((void *)in, in_nbytes);
2925
2926         /* Initially, the previous Huffman codeword lengths are all zeroes. */
2927         c->codes_index = 0;
2928         memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
2929
2930         /* Initialize the output bitstream. */
2931         lzx_init_output(&os, out, out_nbytes_avail);
2932
2933         /* Call the compression level-specific compress() function. */
2934         (*c->impl)(c, in, in_nbytes, &os);
2935
2936         /* Flush the output bitstream. */
2937         result = lzx_flush_output(&os);
2938
2939         /* If the data did not compress to less than its original size and we
2940          * preprocessed the original buffer, then postprocess it to restore it
2941          * to its original state. */
2942         if (result == 0 && c->destructive)
2943                 lzx_postprocess((void *)in, in_nbytes);
2944
2945         /* Return the number of compressed bytes, or 0 if the input did not
2946          * compress to less than its original size. */
2947         return result;
2948 }
2949
2950 /* Free an LZX compressor. */
2951 static void
2952 lzx_free_compressor(void *_c)
2953 {
2954         struct lzx_compressor *c = _c;
2955
2956         if (!c->destructive)
2957                 FREE(c->in_buffer);
2958         FREE(c);
2959 }
2960
2961 const struct compressor_ops lzx_compressor_ops = {
2962         .get_needed_memory  = lzx_get_needed_memory,
2963         .create_compressor  = lzx_create_compressor,
2964         .compress           = lzx_compress,
2965         .free_compressor    = lzx_free_compressor,
2966 };