wimlib.net Git - wimlib/blob - src/lzms_compress.c

   1 /*
   2  * lzms_compress.c
   3  *
   4  * A compressor for the LZMS compression format.
   5  */
   6
   7 /*
   8  * Copyright (C) 2013, 2014, 2015 Eric Biggers
   9  *
  10  * This file is free software; you can redistribute it and/or modify it under
  11  * the terms of the GNU Lesser General Public License as published by the Free
  12  * Software Foundation; either version 3 of the License, or (at your option) any
  13  * later version.
  14  *
  15  * This file is distributed in the hope that it will be useful, but WITHOUT
  16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  18  * details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this file; if not, see https://www.gnu.org/licenses/.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #  include "config.h"
  26 #endif
  27
  28 #include <limits.h>
  29 #include <string.h>
  30
  31 #include "wimlib/compress_common.h"
  32 #include "wimlib/compressor_ops.h"
  33 #include "wimlib/error.h"
  34 #include "wimlib/lcpit_matchfinder.h"
  35 #include "wimlib/lzms_common.h"
  36 #include "wimlib/matchfinder_common.h"
  37 #include "wimlib/unaligned.h"
  38 #include "wimlib/util.h"
  39
  40 /*
  41  * MAX_FAST_LENGTH is the maximum match length for which the length slot can be
  42  * looked up directly in 'fast_length_slot_tab' and the length cost can be
  43  * looked up directly in 'fast_length_cost_tab'.
  44  *
  45  * We also limit the 'nice_match_len' parameter to this value.  Consequently, if
  46  * the longest match found is shorter than 'nice_match_len', then it must also
  47  * be shorter than MAX_FAST_LENGTH.  This makes it possible to do fast lookups
  48  * of length costs using 'fast_length_cost_tab' without having to keep checking
  49  * whether the length exceeds MAX_FAST_LENGTH or not.
  50  */
  51 #define MAX_FAST_LENGTH         255
  52
  53 /* NUM_OPTIM_NODES is the maximum number of bytes the parsing algorithm will
  54  * step forward before forcing the pending items to be encoded.  If this value
  55  * is increased, then there will be fewer forced flushes, but the probability
  56  * entries and Huffman codes will be more likely to become outdated.  */
  57 #define NUM_OPTIM_NODES         2048
  58
  59 /* COST_SHIFT is a scaling factor that makes it possible to consider fractional
  60  * bit costs.  A single bit has a cost of (1 << COST_SHIFT).  */
  61 #define COST_SHIFT              6
  62
  63 /* Length of the hash table for finding delta matches  */
  64 #define DELTA_HASH_ORDER        17
  65 #define DELTA_HASH_LENGTH       ((u32)1 << DELTA_HASH_ORDER)
  66
  67 /* The number of bytes to hash when finding delta matches; also taken to be the
  68  * minimum length of an explicit offset delta match  */
  69 #define NBYTES_HASHED_FOR_DELTA 3
  70
  71 /* The number of delta match powers to consider (must be <=
  72  * LZMS_NUM_DELTA_POWER_SYMS)  */
  73 #define NUM_POWERS_TO_CONSIDER  6
  74
  75 /* This structure tracks the state of writing bits as a series of 16-bit coding
  76  * units, starting at the end of the output buffer and proceeding backwards.  */
  77 struct lzms_output_bitstream {
  78
  79         /* Bits that haven't yet been written to the output buffer  */
  80         u64 bitbuf;
  81
  82         /* Number of bits currently held in @bitbuf  */
  83         unsigned bitcount;
  84
  85         /* Pointer to the beginning of the output buffer (this is the "end" when
  86          * writing backwards!)  */
  87         u8 *begin;
  88
  89         /* Pointer to just past the next position in the output buffer at which
  90          * to output a 16-bit coding unit  */
  91         u8 *next;
  92 };
  93
  94 /* This structure tracks the state of range encoding and its output, which
  95  * starts at the beginning of the output buffer and proceeds forwards.  */
  96 struct lzms_range_encoder {
  97
  98         /* The lower boundary of the current range.  Logically, this is a 33-bit
  99          * integer whose high bit is needed to detect carries.  */
 100         u64 lower_bound;
 101
 102         /* The size of the current range  */
 103         u32 range_size;
 104
 105         /* The next 16-bit coding unit to output  */
 106         u16 cache;
 107
 108         /* The number of 16-bit coding units whose output has been delayed due
 109          * to possible carrying.  The first such coding unit is @cache; all
 110          * subsequent such coding units are 0xffff.  */
 111         u32 cache_size;
 112
 113         /* Pointer to the beginning of the output buffer  */
 114         u8 *begin;
 115
 116         /* Pointer to the position in the output buffer at which the next coding
 117          * unit must be written  */
 118         u8 *next;
 119
 120         /* Pointer to just past the end of the output buffer  */
 121         u8 *end;
 122 };
 123
 124 /* Bookkeeping information for an adaptive Huffman code  */
 125 struct lzms_huffman_rebuild_info {
 126
 127         /* The remaining number of symbols to encode until this code must be
 128          * rebuilt  */
 129         unsigned num_syms_until_rebuild;
 130
 131         /* The number of symbols in this code  */
 132         unsigned num_syms;
 133
 134         /* The rebuild frequency of this code, in symbols  */
 135         unsigned rebuild_freq;
 136
 137         /* The Huffman codeword of each symbol in this code  */
 138         u32 *codewords;
 139
 140         /* The length of each Huffman codeword, in bits  */
 141         u8 *lens;
 142
 143         /* The frequency of each symbol in this code  */
 144         u32 *freqs;
 145 };
 146
 147 /*
 148  * The compressor-internal representation of a match or literal.
 149  *
 150  * Literals have length=1; matches have length > 1.  (We disallow matches of
 151  * length 1, even though this is a valid length in LZMS.)
 152  *
 153  * The source is encoded as follows:
 154  *
 155  * - Literals: the literal byte itself
 156  * - Explicit offset LZ matches: the match offset plus (LZMS_NUM_LZ_REPS - 1)
 157  * - Repeat offset LZ matches: the index of the offset in recent_lz_offsets
 158  * - Explicit offset delta matches: DELTA_SOURCE_TAG is set, the next 3 bits are
 159  *   the power, and the remainder is the raw offset plus (LZMS_NUM_DELTA_REPS-1)
 160  * - Repeat offset delta matches: DELTA_SOURCE_TAG is set, and the remainder is
 161  *   the index of the (power, raw_offset) pair in recent_delta_pairs
 162  */
 163 struct lzms_item {
 164         u32 length;
 165         u32 source;
 166 };
 167
 168 #define DELTA_SOURCE_TAG                ((u32)1 << 31)
 169 #define DELTA_SOURCE_POWER_SHIFT        28
 170 #define DELTA_SOURCE_RAW_OFFSET_MASK    (((u32)1 << DELTA_SOURCE_POWER_SHIFT) - 1)
 171
 172 static void __attribute__((unused))
 173 check_that_powers_fit_in_bitfield(void)
 174 {
 175         STATIC_ASSERT(LZMS_NUM_DELTA_POWER_SYMS <= (1 << (31 - DELTA_SOURCE_POWER_SHIFT)));
 176 }
 177
 178 /* A stripped-down version of the adaptive state in LZMS which excludes the
 179  * probability entries and Huffman codes  */
 180 struct lzms_adaptive_state {
 181
 182         /* Recent offsets for LZ matches  */
 183         u32 recent_lz_offsets[LZMS_NUM_LZ_REPS + 1];
 184         u32 prev_lz_offset; /* 0 means none */
 185         u32 upcoming_lz_offset; /* 0 means none */
 186
 187         /* Recent (power, raw offset) pairs for delta matches.
 188          * The low DELTA_SOURCE_POWER_SHIFT bits of each entry are the raw
 189          * offset, and the high bits are the power.  */
 190         u32 recent_delta_pairs[LZMS_NUM_DELTA_REPS + 1];
 191         u32 prev_delta_pair; /* 0 means none */
 192         u32 upcoming_delta_pair; /* 0 means none  */
 193
 194         /* States for predicting the probabilities of item types  */
 195         u8 main_state;
 196         u8 match_state;
 197         u8 lz_state;
 198         u8 lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
 199         u8 delta_state;
 200         u8 delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
 201 } __attribute__((aligned(64)));
 202
 203 /*
 204  * This structure represents a byte position in the preprocessed input data and
 205  * a node in the graph of possible match/literal choices.
 206  *
 207  * Logically, each incoming edge to this node is labeled with a literal or a
 208  * match that can be taken to reach this position from an earlier position; and
 209  * each outgoing edge from this node is labeled with a literal or a match that
 210  * can be taken to advance from this position to a later position.
 211  */
 212 struct lzms_optimum_node {
 213
 214         /*
 215          * The cost of the lowest-cost path that has been found to reach this
 216          * position.  This can change as progressively lower cost paths are
 217          * found to reach this position.
 218          */
 219         u32 cost;
 220 #define INFINITE_COST UINT32_MAX
 221
 222         /*
 223          * @item is the last item that was taken to reach this position to reach
 224          * it with the stored @cost.  This can change as progressively lower
 225          * cost paths are found to reach this position.
 226          *
 227          * In some cases we look ahead more than one item.  If we looked ahead n
 228          * items to reach this position, then @item is the last item taken,
 229          * @extra_items contains the other items ordered from second-to-last to
 230          * first, and @num_extra_items is n - 1.
 231          */
 232         unsigned num_extra_items;
 233         struct lzms_item item;
 234         struct lzms_item extra_items[2];
 235
 236         /*
 237          * The adaptive state that exists at this position.  This is filled in
 238          * lazily, only after the minimum-cost path to this position is found.
 239          *
 240          * Note: the way the algorithm handles this adaptive state in the
 241          * "minimum-cost" parse is actually only an approximation.  It's
 242          * possible for the globally optimal, minimum cost path to contain a
 243          * prefix, ending at a position, where that path prefix is *not* the
 244          * minimum cost path to that position.  This can happen if such a path
 245          * prefix results in a different adaptive state which results in lower
 246          * costs later.  Although the algorithm does do some heuristic
 247          * multi-item lookaheads, it does not solve this problem in general.
 248          *
 249          * Note: this adaptive state structure also does not include the
 250          * probability entries or current Huffman codewords.  Those aren't
 251          * maintained per-position and are only updated occasionally.
 252          */
 253         struct lzms_adaptive_state state;
 254 } __attribute__((aligned(64)));
 255
 256 /* The main compressor structure  */
 257 struct lzms_compressor {
 258
 259         /* The matchfinder for LZ matches  */
 260         struct lcpit_matchfinder mf;
 261
 262         /* The preprocessed buffer of data being compressed  */
 263         u8 *in_buffer;
 264
 265         /* The number of bytes of data to be compressed, which is the number of
 266          * bytes of data in @in_buffer that are actually valid  */
 267         size_t in_nbytes;
 268
 269         /*
 270          * Boolean flags to enable consideration of various types of multi-step
 271          * operations during parsing.
 272          *
 273          * Among other cases, multi-step operations can help with gaps where two
 274          * matches are separated by a non-matching byte.
 275          *
 276          * This idea is borrowed from Igor Pavlov's LZMA encoder.
 277          */
 278         bool try_lit_lzrep0;
 279         bool try_lzrep_lit_lzrep0;
 280         bool try_lzmatch_lit_lzrep0;
 281
 282         /*
 283          * If true, the compressor can use delta matches.  This slows down
 284          * compression.  It improves the compression ratio greatly, slightly, or
 285          * not at all, depending on the input data.
 286          */
 287         bool use_delta_matches;
 288
 289         /* If true, the compressor need not preserve the input buffer if it
 290          * compresses the data successfully.  */
 291         bool destructive;
 292
 293         /* 'last_target_usages' is a large array that is only needed for
 294          * preprocessing, so it is in union with fields that don't need to be
 295          * initialized until after preprocessing.  */
 296         union {
 297         struct {
 298
 299         /* Temporary space to store matches found by the LZ matchfinder  */
 300         struct lz_match matches[MAX_FAST_LENGTH - LZMS_MIN_MATCH_LENGTH + 1];
 301
 302         /* Hash table for finding delta matches  */
 303         u32 delta_hash_table[DELTA_HASH_LENGTH];
 304
 305         /* For each delta power, the hash code for the next sequence  */
 306         u32 next_delta_hashes[NUM_POWERS_TO_CONSIDER];
 307
 308         /* The per-byte graph nodes for near-optimal parsing  */
 309         struct lzms_optimum_node optimum_nodes[NUM_OPTIM_NODES + MAX_FAST_LENGTH +
 310                                                1 + MAX_FAST_LENGTH];
 311
 312         /* Table: length => current cost for small match lengths  */
 313         u32 fast_length_cost_tab[MAX_FAST_LENGTH + 1];
 314
 315         /* Range encoder which outputs to the beginning of the compressed data
 316          * buffer, proceeding forwards  */
 317         struct lzms_range_encoder rc;
 318
 319         /* Bitstream which outputs to the end of the compressed data buffer,
 320          * proceeding backwards  */
 321         struct lzms_output_bitstream os;
 322
 323         /* States and probability entries for item type disambiguation  */
 324         unsigned main_state;
 325         unsigned match_state;
 326         unsigned lz_state;
 327         unsigned lz_rep_states[LZMS_NUM_LZ_REP_DECISIONS];
 328         unsigned delta_state;
 329         unsigned delta_rep_states[LZMS_NUM_DELTA_REP_DECISIONS];
 330         struct lzms_probabilites probs;
 331
 332         /* Huffman codes  */
 333
 334         struct lzms_huffman_rebuild_info literal_rebuild_info;
 335         u32 literal_codewords[LZMS_NUM_LITERAL_SYMS];
 336         u8 literal_lens[LZMS_NUM_LITERAL_SYMS];
 337         u32 literal_freqs[LZMS_NUM_LITERAL_SYMS];
 338
 339         struct lzms_huffman_rebuild_info lz_offset_rebuild_info;
 340         u32 lz_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
 341         u8 lz_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
 342         u32 lz_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
 343
 344         struct lzms_huffman_rebuild_info length_rebuild_info;
 345         u32 length_codewords[LZMS_NUM_LENGTH_SYMS];
 346         u8 length_lens[LZMS_NUM_LENGTH_SYMS];
 347         u32 length_freqs[LZMS_NUM_LENGTH_SYMS];
 348
 349         struct lzms_huffman_rebuild_info delta_offset_rebuild_info;
 350         u32 delta_offset_codewords[LZMS_MAX_NUM_OFFSET_SYMS];
 351         u8 delta_offset_lens[LZMS_MAX_NUM_OFFSET_SYMS];
 352         u32 delta_offset_freqs[LZMS_MAX_NUM_OFFSET_SYMS];
 353
 354         struct lzms_huffman_rebuild_info delta_power_rebuild_info;
 355         u32 delta_power_codewords[LZMS_NUM_DELTA_POWER_SYMS];
 356         u8 delta_power_lens[LZMS_NUM_DELTA_POWER_SYMS];
 357         u32 delta_power_freqs[LZMS_NUM_DELTA_POWER_SYMS];
 358
 359         }; /* struct */
 360
 361         s32 last_target_usages[65536];
 362
 363         }; /* union */
 364
 365         /* Table: length => length slot for small match lengths  */
 366         u8 fast_length_slot_tab[MAX_FAST_LENGTH + 1];
 367
 368         /* Tables for mapping offsets to offset slots  */
 369
 370         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
 371         u8 offset_slot_tab_1[0xe4a5];
 372
 373         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
 374         u16 offset_slot_tab_2[0x3d0000 >> 11];
 375
 376         /* slots [427, 799); 16 <= num_extra_bits  */
 377         u16 offset_slot_tab_3[((LZMS_MAX_MATCH_OFFSET + 1) - 0xe4a5) >> 16];
 378 };
 379
 380 /******************************************************************************
 381  *                   Offset and length slot acceleration                      *
 382  ******************************************************************************/
 383
 384 /* Generate the acceleration table for length slots.  */
 385 static void
 386 lzms_init_fast_length_slot_tab(struct lzms_compressor *c)
 387 {
 388         unsigned slot = 0;
 389         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
 390                 if (len >= lzms_length_slot_base[slot + 1])
 391                         slot++;
 392                 c->fast_length_slot_tab[len] = slot;
 393         }
 394 }
 395
 396 /* Generate the acceleration tables for offset slots.  */
 397 static void
 398 lzms_init_offset_slot_tabs(struct lzms_compressor *c)
 399 {
 400         u32 offset;
 401         unsigned slot = 0;
 402
 403         /* slots [0, 167); 0 <= num_extra_bits <= 10 */
 404         for (offset = 1; offset < 0xe4a5; offset++) {
 405                 if (offset >= lzms_offset_slot_base[slot + 1])
 406                         slot++;
 407                 c->offset_slot_tab_1[offset] = slot;
 408         }
 409
 410         /* slots [167, 427); 11 <= num_extra_bits <= 15 */
 411         for (; offset < 0x3de4a5; offset += (u32)1 << 11) {
 412                 if (offset >= lzms_offset_slot_base[slot + 1])
 413                         slot++;
 414                 c->offset_slot_tab_2[(offset - 0xe4a5) >> 11] = slot;
 415         }
 416
 417         /* slots [427, 799); 16 <= num_extra_bits  */
 418         for (; offset < LZMS_MAX_MATCH_OFFSET + 1; offset += (u32)1 << 16) {
 419                 if (offset >= lzms_offset_slot_base[slot + 1])
 420                         slot++;
 421                 c->offset_slot_tab_3[(offset - 0xe4a5) >> 16] = slot;
 422         }
 423 }
 424
 425 /*
 426  * Return the length slot for the specified match length, using the compressor's
 427  * acceleration table if the length is small enough.
 428  */
 429 static forceinline unsigned
 430 lzms_comp_get_length_slot(const struct lzms_compressor *c, u32 length)
 431 {
 432         if (likely(length <= MAX_FAST_LENGTH))
 433                 return c->fast_length_slot_tab[length];
 434         return lzms_get_length_slot(length);
 435 }
 436
 437 /*
 438  * Return the offset slot for the specified match offset, using the compressor's
 439  * acceleration tables to speed up the mapping.
 440  */
 441 static forceinline unsigned
 442 lzms_comp_get_offset_slot(const struct lzms_compressor *c, u32 offset)
 443 {
 444         if (offset < 0xe4a5)
 445                 return c->offset_slot_tab_1[offset];
 446         offset -= 0xe4a5;
 447         if (offset < 0x3d0000)
 448                 return c->offset_slot_tab_2[offset >> 11];
 449         return c->offset_slot_tab_3[offset >> 16];
 450 }
 451
 452 /******************************************************************************
 453  *                             Range encoding                                 *
 454  ******************************************************************************/
 455
 456 /*
 457  * Initialize the range encoder @rc to write forwards to the specified buffer
 458  * @out that is @size bytes long.
 459  */
 460 static void
 461 lzms_range_encoder_init(struct lzms_range_encoder *rc, u8 *out, size_t size)
 462 {
 463         rc->lower_bound = 0;
 464         rc->range_size = 0xffffffff;
 465         rc->cache = 0;
 466         rc->cache_size = 1;
 467         rc->begin = out;
 468         rc->next = out - sizeof(le16);
 469         rc->end = out + (size & ~1);
 470 }
 471
 472 /*
 473  * Attempt to flush bits from the range encoder.
 474  *
 475  * The basic idea is that we're writing bits from @rc->lower_bound to the
 476  * output.  However, due to carrying, the writing of coding units with the
 477  * maximum value, as well as one prior coding unit, must be delayed until it is
 478  * determined whether a carry is needed.
 479  *
 480  * This is based on the public domain code for LZMA written by Igor Pavlov, but
 481  * with the following differences:
 482  *
 483  *      - In LZMS, 16-bit coding units are required rather than 8-bit.
 484  *
 485  *      - In LZMS, the first coding unit is not ignored by the decompressor, so
 486  *        the encoder cannot output a dummy value to that position.
 487  */
 488 static void
 489 lzms_range_encoder_shift_low(struct lzms_range_encoder *rc)
 490 {
 491         if ((u32)(rc->lower_bound) < 0xffff0000 ||
 492             (u32)(rc->lower_bound >> 32) != 0)
 493         {
 494                 /* Carry not needed (rc->lower_bound < 0xffff0000), or carry
 495                  * occurred ((rc->lower_bound >> 32) != 0, a.k.a. the carry bit
 496                  * is 1).  */
 497                 do {
 498                         if (likely(rc->next >= rc->begin)) {
 499                                 if (rc->next != rc->end) {
 500                                         put_unaligned_le16(rc->cache +
 501                                                            (u16)(rc->lower_bound >> 32),
 502                                                            rc->next);
 503                                         rc->next += sizeof(le16);
 504                                 }
 505                         } else {
 506                                 rc->next += sizeof(le16);
 507                         }
 508                         rc->cache = 0xffff;
 509                 } while (--rc->cache_size != 0);
 510
 511                 rc->cache = (rc->lower_bound >> 16) & 0xffff;
 512         }
 513         ++rc->cache_size;
 514         rc->lower_bound = (rc->lower_bound & 0xffff) << 16;
 515 }
 516
 517 static bool
 518 lzms_range_encoder_flush(struct lzms_range_encoder *rc)
 519 {
 520         for (int i = 0; i < 4; i++)
 521                 lzms_range_encoder_shift_low(rc);
 522         return rc->next != rc->end;
 523 }
 524
 525 /*
 526  * Encode the next bit using the range encoder.
 527  *
 528  * @prob is the probability out of LZMS_PROBABILITY_DENOMINATOR that the next
 529  * bit is 0 rather than 1.
 530  */
 531 static forceinline void
 532 lzms_range_encode_bit(struct lzms_range_encoder *rc, int bit, u32 prob)
 533 {
 534         /* Normalize if needed.  */
 535         if (rc->range_size <= 0xffff) {
 536                 rc->range_size <<= 16;
 537                 lzms_range_encoder_shift_low(rc);
 538         }
 539
 540         u32 bound = (rc->range_size >> LZMS_PROBABILITY_BITS) * prob;
 541         if (bit == 0) {
 542                 rc->range_size = bound;
 543         } else {
 544                 rc->lower_bound += bound;
 545                 rc->range_size -= bound;
 546         }
 547 }
 548
 549 /*
 550  * Encode a bit.  This wraps around lzms_range_encode_bit() to handle using and
 551  * updating the state and its corresponding probability entry.
 552  */
 553 static forceinline void
 554 lzms_encode_bit(int bit, unsigned *state_p, unsigned num_states,
 555                 struct lzms_probability_entry *probs,
 556                 struct lzms_range_encoder *rc)
 557 {
 558         struct lzms_probability_entry *prob_entry;
 559         u32 prob;
 560
 561         /* Load the probability entry for the current state.  */
 562         prob_entry = &probs[*state_p];
 563
 564         /* Update the state based on the next bit.  */
 565         *state_p = ((*state_p << 1) | bit) & (num_states - 1);
 566
 567         /* Get the probability that the bit is 0.  */
 568         prob = lzms_get_probability(prob_entry);
 569
 570         /* Update the probability entry.  */
 571         lzms_update_probability_entry(prob_entry, bit);
 572
 573         /* Encode the bit using the range encoder.  */
 574         lzms_range_encode_bit(rc, bit, prob);
 575 }
 576
 577 /* Helper functions for encoding bits in the various decision classes  */
 578
 579 static void
 580 lzms_encode_main_bit(struct lzms_compressor *c, int bit)
 581 {
 582         lzms_encode_bit(bit, &c->main_state, LZMS_NUM_MAIN_PROBS,
 583                         c->probs.main, &c->rc);
 584 }
 585
 586 static void
 587 lzms_encode_match_bit(struct lzms_compressor *c, int bit)
 588 {
 589         lzms_encode_bit(bit, &c->match_state, LZMS_NUM_MATCH_PROBS,
 590                         c->probs.match, &c->rc);
 591 }
 592
 593 static void
 594 lzms_encode_lz_bit(struct lzms_compressor *c, int bit)
 595 {
 596         lzms_encode_bit(bit, &c->lz_state, LZMS_NUM_LZ_PROBS,
 597                         c->probs.lz, &c->rc);
 598 }
 599
 600 static void
 601 lzms_encode_lz_rep_bit(struct lzms_compressor *c, int bit, int idx)
 602 {
 603         lzms_encode_bit(bit, &c->lz_rep_states[idx], LZMS_NUM_LZ_REP_PROBS,
 604                         c->probs.lz_rep[idx], &c->rc);
 605 }
 606
 607 static void
 608 lzms_encode_delta_bit(struct lzms_compressor *c, int bit)
 609 {
 610         lzms_encode_bit(bit, &c->delta_state, LZMS_NUM_DELTA_PROBS,
 611                         c->probs.delta, &c->rc);
 612 }
 613
 614 static void
 615 lzms_encode_delta_rep_bit(struct lzms_compressor *c, int bit, int idx)
 616 {
 617         lzms_encode_bit(bit, &c->delta_rep_states[idx], LZMS_NUM_DELTA_REP_PROBS,
 618                         c->probs.delta_rep[idx], &c->rc);
 619 }
 620
 621 /******************************************************************************
 622  *                   Huffman encoding and verbatim bits                       *
 623  ******************************************************************************/
 624
 625 /*
 626  * Initialize the output bitstream @os to write backwards to the specified
 627  * buffer @out that is @size bytes long.
 628  */
 629 static void
 630 lzms_output_bitstream_init(struct lzms_output_bitstream *os,
 631                            u8 *out, size_t size)
 632 {
 633         os->bitbuf = 0;
 634         os->bitcount = 0;
 635         os->begin = out;
 636         os->next = out + (size & ~1);
 637 }
 638
 639 /*
 640  * Write some bits, contained in the low-order @num_bits bits of @bits, to the
 641  * output bitstream @os.
 642  *
 643  * @max_num_bits is a compile-time constant that specifies the maximum number of
 644  * bits that can ever be written at this call site.
 645  */
 646 static forceinline void
 647 lzms_write_bits(struct lzms_output_bitstream *os, const u32 bits,
 648                 const unsigned num_bits, const unsigned max_num_bits)
 649 {
 650         /* Add the bits to the bit buffer variable.  */
 651         os->bitcount += num_bits;
 652         os->bitbuf = (os->bitbuf << num_bits) | bits;
 653
 654         /* Check whether any coding units need to be written.  */
 655         while (os->bitcount >= 16) {
 656
 657                 os->bitcount -= 16;
 658
 659                 /* Write a coding unit, unless it would underflow the buffer. */
 660                 if (os->next != os->begin) {
 661                         os->next -= sizeof(le16);
 662                         put_unaligned_le16(os->bitbuf >> os->bitcount, os->next);
 663                 }
 664
 665                 /* Optimization for call sites that never write more than 16
 666                  * bits at once.  */
 667                 if (max_num_bits <= 16)
 668                         break;
 669         }
 670 }
 671
 672 /*
 673  * Flush the output bitstream, ensuring that all bits written to it have been
 674  * written to memory.  Return %true if all bits have been output successfully,
 675  * or %false if an overrun occurred.
 676  */
 677 static bool
 678 lzms_output_bitstream_flush(struct lzms_output_bitstream *os)
 679 {
 680         if (os->next == os->begin)
 681                 return false;
 682
 683         if (os->bitcount != 0) {
 684                 os->next -= sizeof(le16);
 685                 put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next);
 686         }
 687
 688         return true;
 689 }
 690
 691 static void
 692 lzms_build_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
 693 {
 694         make_canonical_huffman_code(rebuild_info->num_syms,
 695                                     LZMS_MAX_CODEWORD_LENGTH,
 696                                     rebuild_info->freqs,
 697                                     rebuild_info->lens,
 698                                     rebuild_info->codewords);
 699         rebuild_info->num_syms_until_rebuild = rebuild_info->rebuild_freq;
 700 }
 701
 702 static void
 703 lzms_init_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info,
 704                        unsigned num_syms, unsigned rebuild_freq,
 705                        u32 *codewords, u8 *lens, u32 *freqs)
 706 {
 707         rebuild_info->num_syms = num_syms;
 708         rebuild_info->rebuild_freq = rebuild_freq;
 709         rebuild_info->codewords = codewords;
 710         rebuild_info->lens = lens;
 711         rebuild_info->freqs = freqs;
 712         lzms_init_symbol_frequencies(freqs, num_syms);
 713         lzms_build_huffman_code(rebuild_info);
 714 }
 715
 716 static void
 717 lzms_rebuild_huffman_code(struct lzms_huffman_rebuild_info *rebuild_info)
 718 {
 719         lzms_build_huffman_code(rebuild_info);
 720         lzms_dilute_symbol_frequencies(rebuild_info->freqs, rebuild_info->num_syms);
 721 }
 722
 723 /*
 724  * Encode a symbol using the specified Huffman code.  Then, if the Huffman code
 725  * needs to be rebuilt, rebuild it and return true; otherwise return false.
 726  */
 727 static forceinline bool
 728 lzms_huffman_encode_symbol(unsigned sym,
 729                            const u32 *codewords, const u8 *lens, u32 *freqs,
 730                            struct lzms_output_bitstream *os,
 731                            struct lzms_huffman_rebuild_info *rebuild_info)
 732 {
 733         lzms_write_bits(os, codewords[sym], lens[sym], LZMS_MAX_CODEWORD_LENGTH);
 734         ++freqs[sym];
 735         if (--rebuild_info->num_syms_until_rebuild == 0) {
 736                 lzms_rebuild_huffman_code(rebuild_info);
 737                 return true;
 738         }
 739         return false;
 740 }
 741
 742 /* Helper routines to encode symbols using the various Huffman codes  */
 743
 744 static bool
 745 lzms_encode_literal_symbol(struct lzms_compressor *c, unsigned sym)
 746 {
 747         return lzms_huffman_encode_symbol(sym, c->literal_codewords,
 748                                           c->literal_lens, c->literal_freqs,
 749                                           &c->os, &c->literal_rebuild_info);
 750 }
 751
 752 static bool
 753 lzms_encode_lz_offset_symbol(struct lzms_compressor *c, unsigned sym)
 754 {
 755         return lzms_huffman_encode_symbol(sym, c->lz_offset_codewords,
 756                                           c->lz_offset_lens, c->lz_offset_freqs,
 757                                           &c->os, &c->lz_offset_rebuild_info);
 758 }
 759
 760 static bool
 761 lzms_encode_length_symbol(struct lzms_compressor *c, unsigned sym)
 762 {
 763         return lzms_huffman_encode_symbol(sym, c->length_codewords,
 764                                           c->length_lens, c->length_freqs,
 765                                           &c->os, &c->length_rebuild_info);
 766 }
 767
 768 static bool
 769 lzms_encode_delta_offset_symbol(struct lzms_compressor *c, unsigned sym)
 770 {
 771         return lzms_huffman_encode_symbol(sym, c->delta_offset_codewords,
 772                                           c->delta_offset_lens, c->delta_offset_freqs,
 773                                           &c->os, &c->delta_offset_rebuild_info);
 774 }
 775
 776 static bool
 777 lzms_encode_delta_power_symbol(struct lzms_compressor *c, unsigned sym)
 778 {
 779         return lzms_huffman_encode_symbol(sym, c->delta_power_codewords,
 780                                           c->delta_power_lens, c->delta_power_freqs,
 781                                           &c->os, &c->delta_power_rebuild_info);
 782 }
 783
 784 static void
 785 lzms_update_fast_length_costs(struct lzms_compressor *c);
 786
 787 /*
 788  * Encode a match length.  If this causes the Huffman code for length symbols to
 789  * be rebuilt, also update the length costs array used by the parser.
 790  */
 791 static void
 792 lzms_encode_length(struct lzms_compressor *c, u32 length)
 793 {
 794         unsigned slot = lzms_comp_get_length_slot(c, length);
 795
 796         if (lzms_encode_length_symbol(c, slot))
 797                 lzms_update_fast_length_costs(c);
 798
 799         lzms_write_bits(&c->os, length - lzms_length_slot_base[slot],
 800                         lzms_extra_length_bits[slot],
 801                         LZMS_MAX_EXTRA_LENGTH_BITS);
 802 }
 803
 804 /* Encode the offset of an LZ match.  */
 805 static void
 806 lzms_encode_lz_offset(struct lzms_compressor *c, u32 offset)
 807 {
 808         unsigned slot = lzms_comp_get_offset_slot(c, offset);
 809
 810         lzms_encode_lz_offset_symbol(c, slot);
 811         lzms_write_bits(&c->os, offset - lzms_offset_slot_base[slot],
 812                         lzms_extra_offset_bits[slot],
 813                         LZMS_MAX_EXTRA_OFFSET_BITS);
 814 }
 815
 816 /* Encode the raw offset of a delta match.  */
 817 static void
 818 lzms_encode_delta_raw_offset(struct lzms_compressor *c, u32 raw_offset)
 819 {
 820         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
 821
 822         lzms_encode_delta_offset_symbol(c, slot);
 823         lzms_write_bits(&c->os, raw_offset - lzms_offset_slot_base[slot],
 824                         lzms_extra_offset_bits[slot],
 825                         LZMS_MAX_EXTRA_OFFSET_BITS);
 826 }
 827
 828 /******************************************************************************
 829  *                             Item encoding                                  *
 830  ******************************************************************************/
 831
 832 /* Encode the specified item, which may be a literal or any type of match.  */
 833 static void
 834 lzms_encode_item(struct lzms_compressor *c, u32 length, u32 source)
 835 {
 836         /* Main bit: 0 = literal, 1 = match  */
 837         int main_bit = (length > 1);
 838         lzms_encode_main_bit(c, main_bit);
 839
 840         if (!main_bit) {
 841                 /* Literal  */
 842                 unsigned literal = source;
 843                 lzms_encode_literal_symbol(c, literal);
 844         } else {
 845                 /* Match  */
 846
 847                 /* Match bit: 0 = LZ match, 1 = delta match  */
 848                 int match_bit = (source & DELTA_SOURCE_TAG) != 0;
 849                 lzms_encode_match_bit(c, match_bit);
 850
 851                 if (!match_bit) {
 852                         /* LZ match  */
 853
 854                         /* LZ bit: 0 = explicit offset, 1 = repeat offset  */
 855                         int lz_bit = (source < LZMS_NUM_LZ_REPS);
 856                         lzms_encode_lz_bit(c, lz_bit);
 857
 858                         if (!lz_bit) {
 859                                 /* Explicit offset LZ match  */
 860                                 u32 offset = source - (LZMS_NUM_LZ_REPS - 1);
 861                                 lzms_encode_lz_offset(c, offset);
 862                         } else {
 863                                 /* Repeat offset LZ match  */
 864                                 int rep_idx = source;
 865                                 for (int i = 0; i < rep_idx; i++)
 866                                         lzms_encode_lz_rep_bit(c, 1, i);
 867                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
 868                                         lzms_encode_lz_rep_bit(c, 0, rep_idx);
 869                         }
 870                 } else {
 871                         /* Delta match  */
 872
 873                         source &= ~DELTA_SOURCE_TAG;
 874
 875                         /* Delta bit: 0 = explicit offset, 1 = repeat offset  */
 876                         int delta_bit = (source < LZMS_NUM_DELTA_REPS);
 877                         lzms_encode_delta_bit(c, delta_bit);
 878
 879                         if (!delta_bit) {
 880                                 /* Explicit offset delta match  */
 881                                 u32 power = source >> DELTA_SOURCE_POWER_SHIFT;
 882                                 u32 raw_offset = (source & DELTA_SOURCE_RAW_OFFSET_MASK) -
 883                                                  (LZMS_NUM_DELTA_REPS - 1);
 884                                 lzms_encode_delta_power_symbol(c, power);
 885                                 lzms_encode_delta_raw_offset(c, raw_offset);
 886                         } else {
 887                                 /* Repeat offset delta match  */
 888                                 int rep_idx = source;
 889                                 for (int i = 0; i < rep_idx; i++)
 890                                         lzms_encode_delta_rep_bit(c, 1, i);
 891                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
 892                                         lzms_encode_delta_rep_bit(c, 0, rep_idx);
 893                         }
 894                 }
 895
 896                 /* Match length (encoded the same way for any match type)  */
 897                 lzms_encode_length(c, length);
 898         }
 899 }
 900
 901 /* Encode a list of matches and literals chosen by the parsing algorithm.  */
 902 static void
 903 lzms_encode_nonempty_item_list(struct lzms_compressor *c,
 904                                struct lzms_optimum_node *end_node)
 905 {
 906         /* Since we've stored at each node the item we took to arrive at that
 907          * node, we can trace our chosen path in backwards order.  However, for
 908          * encoding we need to trace our chosen path in forwards order.  To make
 909          * this possible, the following loop moves the items from their
 910          * destination nodes to their source nodes, which effectively reverses
 911          * the path.  (Think of it like reversing a singly-linked list.)  */
 912         struct lzms_optimum_node *cur_node = end_node;
 913         struct lzms_item saved_item = cur_node->item;
 914         do {
 915                 struct lzms_item item = saved_item;
 916                 if (cur_node->num_extra_items > 0) {
 917                         /* Handle an arrival via multi-item lookahead.  */
 918                         unsigned i = 0;
 919                         struct lzms_optimum_node *orig_node = cur_node;
 920                         do {
 921                                 cur_node -= item.length;
 922                                 cur_node->item = item;
 923                                 item = orig_node->extra_items[i];
 924                         } while (++i != orig_node->num_extra_items);
 925                 }
 926                 cur_node -= item.length;
 927                 saved_item = cur_node->item;
 928                 cur_node->item = item;
 929         } while (cur_node != c->optimum_nodes);
 930
 931         /* Now trace the chosen path in forwards order, encoding each item.  */
 932         do {
 933                 lzms_encode_item(c, cur_node->item.length, cur_node->item.source);
 934                 cur_node += cur_node->item.length;
 935         } while (cur_node != end_node);
 936 }
 937
 938 static forceinline void
 939 lzms_encode_item_list(struct lzms_compressor *c,
 940                       struct lzms_optimum_node *end_node)
 941 {
 942         if (end_node != c->optimum_nodes)
 943                 lzms_encode_nonempty_item_list(c, end_node);
 944 }
 945
 946 /******************************************************************************
 947  *                             Cost evaluation                                *
 948  ******************************************************************************/
 949
 950 /*
 951  * If p is the predicted probability of the next bit being a 0, then the number
 952  * of bits required to encode a 0 bit using a binary range encoder is the real
 953  * number -log2(p), and the number of bits required to encode a 1 bit is the
 954  * real number -log2(1 - p).  To avoid computing either of these expressions at
 955  * runtime, 'lzms_bit_costs' is a precomputed table that stores a mapping from
 956  * probability to cost for each possible probability.  Specifically, the array
 957  * indices are the numerators of the possible probabilities in LZMS, where the
 958  * denominators are LZMS_PROBABILITY_DENOMINATOR; and the stored costs are the
 959  * bit costs multiplied by 1<<COST_SHIFT and rounded to the nearest integer.
 960  * Furthermore, the values stored for 0% and 100% probabilities are equal to the
 961  * adjacent values, since these probabilities are not actually permitted.  This
 962  * allows us to use the num_recent_zero_bits value from the
 963  * lzms_probability_entry as the array index without fixing up these two special
 964  * cases.
 965  */
 966 static const u32 lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR + 1] = {
 967         384, 384, 320, 283, 256, 235, 219, 204,
 968         192, 181, 171, 163, 155, 147, 140, 134,
 969         128, 122, 117, 112, 107, 103, 99,  94,
 970         91,  87,  83,  80,  76,  73,  70,  67,
 971         64,  61,  58,  56,  53,  51,  48,  46,
 972         43,  41,  39,  37,  35,  33,  30,  29,
 973         27,  25,  23,  21,  19,  17,  16,  14,
 974         12,  11,  9,   8,   6,   4,   3,   1,
 975         1
 976 };
 977
 978 static void __attribute__((unused))
 979 check_cost_shift(void)
 980 {
 981         /* lzms_bit_costs is hard-coded to the current COST_SHIFT.  */
 982         STATIC_ASSERT(COST_SHIFT == 6);
 983 }
 984
 985 #if 0
 986 #include <math.h>
 987
 988 static void
 989 lzms_compute_bit_costs(void)
 990 {
 991         for (u32 i = 0; i <= LZMS_PROBABILITY_DENOMINATOR; i++) {
 992                 u32 prob = i;
 993                 if (prob == 0)
 994                         prob++;
 995                 else if (prob == LZMS_PROBABILITY_DENOMINATOR)
 996                         prob--;
 997
 998                 lzms_bit_costs[i] = round(-log2((double)prob / LZMS_PROBABILITY_DENOMINATOR) *
 999                                           (1 << COST_SHIFT));
1000         }
1001 }
1002 #endif
1003
1004 /* Return the cost to encode a 0 bit in the specified context.  */
1005 static forceinline u32
1006 lzms_bit_0_cost(unsigned state, const struct lzms_probability_entry *probs)
1007 {
1008         return lzms_bit_costs[probs[state].num_recent_zero_bits];
1009 }
1010
1011 /* Return the cost to encode a 1 bit in the specified context.  */
1012 static forceinline u32
1013 lzms_bit_1_cost(unsigned state, const struct lzms_probability_entry *probs)
1014 {
1015         return lzms_bit_costs[LZMS_PROBABILITY_DENOMINATOR -
1016                               probs[state].num_recent_zero_bits];
1017 }
1018
1019 /* Return the cost to encode a literal, including the main bit.  */
1020 static forceinline u32
1021 lzms_literal_cost(struct lzms_compressor *c, unsigned main_state, unsigned literal)
1022 {
1023         return lzms_bit_0_cost(main_state, c->probs.main) +
1024                 ((u32)c->literal_lens[literal] << COST_SHIFT);
1025 }
1026
1027 /* Update 'fast_length_cost_tab' to use the latest Huffman code.  */
1028 static void
1029 lzms_update_fast_length_costs(struct lzms_compressor *c)
1030 {
1031         int slot = -1;
1032         u32 cost = 0;
1033         for (u32 len = LZMS_MIN_MATCH_LENGTH; len <= MAX_FAST_LENGTH; len++) {
1034                 if (len >= lzms_length_slot_base[slot + 1]) {
1035                         slot++;
1036                         cost = (u32)(c->length_lens[slot] +
1037                                      lzms_extra_length_bits[slot]) << COST_SHIFT;
1038                 }
1039                 c->fast_length_cost_tab[len] = cost;
1040         }
1041 }
1042
1043 /* Return the cost to encode the specified match length, which must not exceed
1044  * MAX_FAST_LENGTH.  */
1045 static forceinline u32
1046 lzms_fast_length_cost(const struct lzms_compressor *c, u32 length)
1047 {
1048         return c->fast_length_cost_tab[length];
1049 }
1050
1051 /* Return the cost to encode the specified LZ match offset.  */
1052 static forceinline u32
1053 lzms_lz_offset_cost(const struct lzms_compressor *c, u32 offset)
1054 {
1055         unsigned slot = lzms_comp_get_offset_slot(c, offset);
1056         u32 num_bits = c->lz_offset_lens[slot] + lzms_extra_offset_bits[slot];
1057         return num_bits << COST_SHIFT;
1058 }
1059
1060 /* Return the cost to encode the specified delta power and raw offset.  */
1061 static forceinline u32
1062 lzms_delta_source_cost(const struct lzms_compressor *c, u32 power, u32 raw_offset)
1063 {
1064         unsigned slot = lzms_comp_get_offset_slot(c, raw_offset);
1065         u32 num_bits = c->delta_power_lens[power] + c->delta_offset_lens[slot] +
1066                        lzms_extra_offset_bits[slot];
1067         return num_bits << COST_SHIFT;
1068 }
1069
1070 /******************************************************************************
1071  *                              Adaptive state                                *
1072  ******************************************************************************/
1073
1074 static void
1075 lzms_init_adaptive_state(struct lzms_adaptive_state *state)
1076 {
1077         for (int i = 0; i < LZMS_NUM_LZ_REPS + 1; i++)
1078                 state->recent_lz_offsets[i] = i + 1;
1079         state->prev_lz_offset = 0;
1080         state->upcoming_lz_offset = 0;
1081
1082         for (int i = 0; i < LZMS_NUM_DELTA_REPS + 1; i++)
1083                 state->recent_delta_pairs[i] = i + 1;
1084         state->prev_delta_pair = 0;
1085         state->upcoming_delta_pair = 0;
1086
1087         state->main_state = 0;
1088         state->match_state = 0;
1089         state->lz_state = 0;
1090         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
1091                 state->lz_rep_states[i] = 0;
1092         state->delta_state = 0;
1093         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
1094                 state->delta_rep_states[i] = 0;
1095 }
1096
1097 /*
1098  * Update the LRU queues for match sources when advancing by one item.
1099  *
1100  * Note: using LZMA as a point of comparison, the LRU queues in LZMS are more
1101  * complex because:
1102  *      - there are separate queues for LZ and delta matches
1103  *      - updates to the queues are delayed by one encoded item (this prevents
1104  *        sources from being bumped up to index 0 too early)
1105  */
1106 static void
1107 lzms_update_lru_queues(struct lzms_adaptive_state *state)
1108 {
1109         if (state->prev_lz_offset != 0) {
1110                 for (int i = LZMS_NUM_LZ_REPS - 1; i >= 0; i--)
1111                         state->recent_lz_offsets[i + 1] = state->recent_lz_offsets[i];
1112                 state->recent_lz_offsets[0] = state->prev_lz_offset;
1113         }
1114         state->prev_lz_offset = state->upcoming_lz_offset;
1115
1116         if (state->prev_delta_pair != 0) {
1117                 for (int i = LZMS_NUM_DELTA_REPS - 1; i >= 0; i--)
1118                         state->recent_delta_pairs[i + 1] = state->recent_delta_pairs[i];
1119                 state->recent_delta_pairs[0] = state->prev_delta_pair;
1120         }
1121         state->prev_delta_pair = state->upcoming_delta_pair;
1122 }
1123
1124 static forceinline void
1125 lzms_update_state(u8 *state_p, int bit, unsigned num_states)
1126 {
1127         *state_p = ((*state_p << 1) | bit) & (num_states - 1);
1128 }
1129
1130 static forceinline void
1131 lzms_update_main_state(struct lzms_adaptive_state *state, int is_match)
1132 {
1133         lzms_update_state(&state->main_state, is_match, LZMS_NUM_MAIN_PROBS);
1134 }
1135
1136 static forceinline void
1137 lzms_update_match_state(struct lzms_adaptive_state *state, int is_delta)
1138 {
1139         lzms_update_state(&state->match_state, is_delta, LZMS_NUM_MATCH_PROBS);
1140 }
1141
1142 static forceinline void
1143 lzms_update_lz_state(struct lzms_adaptive_state *state, int is_rep)
1144 {
1145         lzms_update_state(&state->lz_state, is_rep, LZMS_NUM_LZ_PROBS);
1146 }
1147
1148 static forceinline void
1149 lzms_update_lz_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1150 {
1151         for (int i = 0; i < rep_idx; i++)
1152                 lzms_update_state(&state->lz_rep_states[i], 1, LZMS_NUM_LZ_REP_PROBS);
1153
1154         if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1155                 lzms_update_state(&state->lz_rep_states[rep_idx], 0, LZMS_NUM_LZ_REP_PROBS);
1156 }
1157
1158 static forceinline void
1159 lzms_update_delta_state(struct lzms_adaptive_state *state, int is_rep)
1160 {
1161         lzms_update_state(&state->delta_state, is_rep, LZMS_NUM_DELTA_PROBS);
1162 }
1163
1164 static forceinline void
1165 lzms_update_delta_rep_states(struct lzms_adaptive_state *state, int rep_idx)
1166 {
1167         for (int i = 0; i < rep_idx; i++)
1168                 lzms_update_state(&state->delta_rep_states[i], 1, LZMS_NUM_DELTA_REP_PROBS);
1169
1170         if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1171                 lzms_update_state(&state->delta_rep_states[rep_idx], 0, LZMS_NUM_DELTA_REP_PROBS);
1172 }
1173
1174 /******************************************************************************
1175  *                              Matchfinding                                  *
1176  ******************************************************************************/
1177
1178 /* Note: this code just handles finding delta matches.  The code for finding LZ
1179  * matches is elsewhere.  */
1180
1181
1182 /* Initialize the delta matchfinder for a new input buffer.  */
1183 static void
1184 lzms_init_delta_matchfinder(struct lzms_compressor *c)
1185 {
1186         /* Set all entries to use an invalid power, which will never match.  */
1187         STATIC_ASSERT(NUM_POWERS_TO_CONSIDER < (1 << (32 - DELTA_SOURCE_POWER_SHIFT)));
1188         memset(c->delta_hash_table, 0xFF, sizeof(c->delta_hash_table));
1189
1190         /* Initialize the next hash code for each power.  We can just use zeroes
1191          * initially; it doesn't really matter.  */
1192         for (u32 i = 0; i < NUM_POWERS_TO_CONSIDER; i++)
1193                 c->next_delta_hashes[i] = 0;
1194 }
1195
1196 /*
1197  * Compute a DELTA_HASH_ORDER-bit hash code for the first
1198  * NBYTES_HASHED_FOR_DELTA bytes of the sequence beginning at @p when taken in a
1199  * delta context with the specified @span.
1200  */
1201 static forceinline u32
1202 lzms_delta_hash(const u8 *p, const u32 pos, u32 span)
1203 {
1204         /* A delta match has a certain span and an offset that is a multiple of
1205          * that span.  To reduce wasted space we use a single combined hash
1206          * table for all spans and positions, but to minimize collisions we
1207          * include in the hash code computation the span and the low-order bits
1208          * of the current position.  */
1209
1210         STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1211         u8 d0 = *(p + 0) - *(p + 0 - span);
1212         u8 d1 = *(p + 1) - *(p + 1 - span);
1213         u8 d2 = *(p + 2) - *(p + 2 - span);
1214         u32 v = ((span + (pos & (span - 1))) << 24) |
1215                 ((u32)d2 << 16) | ((u32)d1 << 8) | d0;
1216         return lz_hash(v, DELTA_HASH_ORDER);
1217 }
1218
1219 /*
1220  * Given a match between @in_next and @matchptr in a delta context with the
1221  * specified @span and having the initial @len, extend the match as far as
1222  * possible, up to a limit of @max_len.
1223  */
1224 static forceinline u32
1225 lzms_extend_delta_match(const u8 *in_next, const u8 *matchptr,
1226                         u32 len, u32 max_len, u32 span)
1227 {
1228         while (len < max_len &&
1229                (u8)(*(in_next + len) - *(in_next + len - span)) ==
1230                (u8)(*(matchptr + len) - *(matchptr + len - span)))
1231         {
1232                 len++;
1233         }
1234         return len;
1235 }
1236
1237 static void
1238 lzms_delta_matchfinder_skip_bytes(struct lzms_compressor *c,
1239                                   const u8 *in_next, u32 count)
1240 {
1241         u32 pos = in_next - c->in_buffer;
1242         if (unlikely(c->in_nbytes - (pos + count) <= NBYTES_HASHED_FOR_DELTA + 1))
1243                 return;
1244         do {
1245                 /* Update the hash table for each power.  */
1246                 for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1247                         const u32 span = (u32)1 << power;
1248                         if (unlikely(pos < span))
1249                                 continue;
1250                         const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1251                         const u32 hash = c->next_delta_hashes[power];
1252                         c->delta_hash_table[hash] =
1253                                 (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1254                         c->next_delta_hashes[power] = next_hash;
1255                         prefetchw(&c->delta_hash_table[next_hash]);
1256                 }
1257         } while (in_next++, pos++, --count);
1258 }
1259
1260 /*
1261  * Skip the next @count bytes (don't search for matches at them).  @in_next
1262  * points to the first byte to skip.  The return value is @in_next + count.
1263  */
1264 static const u8 *
1265 lzms_skip_bytes(struct lzms_compressor *c, u32 count, const u8 *in_next)
1266 {
1267         lcpit_matchfinder_skip_bytes(&c->mf, count);
1268         if (c->use_delta_matches)
1269                 lzms_delta_matchfinder_skip_bytes(c, in_next, count);
1270         return in_next + count;
1271 }
1272
1273 /******************************************************************************
1274  *                          "Near-optimal" parsing                            *
1275  ******************************************************************************/
1276
1277 /*
1278  * The main near-optimal parsing routine.
1279  *
1280  * Briefly, the algorithm does an approximate minimum-cost path search to find a
1281  * "near-optimal" sequence of matches and literals to output, based on the
1282  * current cost model.  The algorithm steps forward, position by position (byte
1283  * by byte), and updates the minimum cost path to reach each later position that
1284  * can be reached using a match or literal from the current position.  This is
1285  * essentially Dijkstra's algorithm in disguise: the graph nodes are positions,
1286  * the graph edges are possible matches/literals to code, and the cost of each
1287  * edge is the estimated number of bits (scaled up by COST_SHIFT) that will be
1288  * required to output the corresponding match or literal.  But one difference is
1289  * that we actually compute the lowest-cost path in pieces, where each piece is
1290  * terminated when there are no choices to be made.
1291  *
1292  * The costs of literals and matches are estimated using the range encoder
1293  * states and the semi-adaptive Huffman codes.  Except for range encoding
1294  * states, costs are assumed to be constant throughout a single run of the
1295  * parsing algorithm, which can parse up to NUM_OPTIM_NODES bytes of data.  This
1296  * introduces a source of non-optimality because the probabilities and Huffman
1297  * codes can change over this part of the data.  And of course, there are
1298  * various other reasons why the result isn't optimal in terms of compression
1299  * ratio.
1300  */
1301 static void
1302 lzms_near_optimal_parse(struct lzms_compressor *c)
1303 {
1304         const u8 *in_next = c->in_buffer;
1305         const u8 * const in_end = &c->in_buffer[c->in_nbytes];
1306         struct lzms_optimum_node *cur_node;
1307         struct lzms_optimum_node *end_node;
1308
1309         /* Set initial length costs for lengths <= MAX_FAST_LENGTH.  */
1310         lzms_update_fast_length_costs(c);
1311
1312         /* Set up the initial adaptive state.  */
1313         lzms_init_adaptive_state(&c->optimum_nodes[0].state);
1314
1315 begin:
1316         /* Start building a new list of items, which will correspond to the next
1317          * piece of the overall minimum-cost path.  */
1318
1319         cur_node = c->optimum_nodes;
1320         cur_node->cost = 0;
1321         end_node = cur_node;
1322
1323         if (in_next == in_end)
1324                 return;
1325
1326         /* The following loop runs once for each per byte in the input buffer,
1327          * except in a few shortcut cases.  */
1328         for (;;) {
1329                 u32 num_matches;
1330
1331                 /* Repeat offset LZ matches  */
1332                 if (likely(in_next - c->in_buffer >= LZMS_NUM_LZ_REPS &&
1333                            in_end - in_next >= 2))
1334                 {
1335                         for (int rep_idx = 0; rep_idx < LZMS_NUM_LZ_REPS; rep_idx++) {
1336
1337                                 /* Looking for a repeat offset LZ match at queue
1338                                  * index @rep_idx  */
1339
1340                                 const u32 offset = cur_node->state.recent_lz_offsets[rep_idx];
1341                                 const u8 * const matchptr = in_next - offset;
1342
1343                                 /* Check the first 2 bytes before entering the extension loop.  */
1344                                 if (load_u16_unaligned(in_next) != load_u16_unaligned(matchptr))
1345                                         continue;
1346
1347                                 /* Extend the match to its full length.  */
1348                                 const u32 rep_len = lz_extend(in_next, matchptr, 2, in_end - in_next);
1349
1350                                 /* Early out for long repeat offset LZ match */
1351                                 if (rep_len >= c->mf.nice_match_len) {
1352
1353                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1354
1355                                         lzms_encode_item_list(c, cur_node);
1356                                         lzms_encode_item(c, rep_len, rep_idx);
1357
1358                                         c->optimum_nodes[0].state = cur_node->state;
1359                                         cur_node = &c->optimum_nodes[0];
1360
1361                                         cur_node->state.upcoming_lz_offset =
1362                                                 cur_node->state.recent_lz_offsets[rep_idx];
1363                                         cur_node->state.upcoming_delta_pair = 0;
1364                                         for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1365                                                 cur_node->state.recent_lz_offsets[i] =
1366                                                         cur_node->state.recent_lz_offsets[i + 1];
1367                                         lzms_update_lru_queues(&cur_node->state);
1368                                         lzms_update_main_state(&cur_node->state, 1);
1369                                         lzms_update_match_state(&cur_node->state, 0);
1370                                         lzms_update_lz_state(&cur_node->state, 1);
1371                                         lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1372                                         goto begin;
1373                                 }
1374
1375                                 while (end_node < cur_node + rep_len)
1376                                         (++end_node)->cost = INFINITE_COST;
1377
1378                                 u32 base_cost = cur_node->cost +
1379                                                 lzms_bit_1_cost(cur_node->state.main_state,
1380                                                                 c->probs.main) +
1381                                                 lzms_bit_0_cost(cur_node->state.match_state,
1382                                                                 c->probs.match) +
1383                                                 lzms_bit_1_cost(cur_node->state.lz_state,
1384                                                                 c->probs.lz);
1385
1386                                 for (int i = 0; i < rep_idx; i++)
1387                                         base_cost += lzms_bit_1_cost(cur_node->state.lz_rep_states[i],
1388                                                                      c->probs.lz_rep[i]);
1389
1390                                 if (rep_idx < LZMS_NUM_LZ_REP_DECISIONS)
1391                                         base_cost += lzms_bit_0_cost(cur_node->state.lz_rep_states[rep_idx],
1392                                                                      c->probs.lz_rep[rep_idx]);
1393
1394                                 u32 len = 2;
1395                                 do {
1396                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1397                                         if (cost < (cur_node + len)->cost) {
1398                                                 (cur_node + len)->cost = cost;
1399                                                 (cur_node + len)->item = (struct lzms_item) {
1400                                                         .length = len,
1401                                                         .source = rep_idx,
1402                                                 };
1403                                                 (cur_node + len)->num_extra_items = 0;
1404                                         }
1405                                 } while (++len <= rep_len);
1406
1407
1408                                 /* try LZ-rep + lit + LZ-rep0  */
1409                                 if (c->try_lzrep_lit_lzrep0 &&
1410                                     in_end - (in_next + rep_len) >= 3 &&
1411                                     load_u16_unaligned(in_next + rep_len + 1) ==
1412                                     load_u16_unaligned(matchptr + rep_len + 1))
1413                                 {
1414                                         const u32 rep0_len = lz_extend(in_next + rep_len + 1,
1415                                                                        matchptr + rep_len + 1,
1416                                                                        2,
1417                                                                        min(c->mf.nice_match_len,
1418                                                                            in_end - (in_next + rep_len + 1)));
1419
1420                                         unsigned main_state = cur_node->state.main_state;
1421                                         unsigned match_state = cur_node->state.match_state;
1422                                         unsigned lz_state = cur_node->state.lz_state;
1423                                         unsigned lz_rep0_state = cur_node->state.lz_rep_states[0];
1424
1425                                         /* update states for LZ-rep  */
1426                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1427                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1428                                         lz_state = ((lz_state << 1) | 1) % LZMS_NUM_LZ_PROBS;
1429                                         lz_rep0_state = ((lz_rep0_state << 1) | (rep_idx > 0)) %
1430                                                                 LZMS_NUM_LZ_REP_PROBS;
1431
1432                                         /* LZ-rep cost  */
1433                                         u32 cost = base_cost + lzms_fast_length_cost(c, rep_len);
1434
1435                                         /* add literal cost  */
1436                                         cost += lzms_literal_cost(c, main_state, *(in_next + rep_len));
1437
1438                                         /* update state for literal  */
1439                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1440
1441                                         /* add LZ-rep0 cost  */
1442                                         cost += lzms_bit_1_cost(main_state, c->probs.main) +
1443                                                 lzms_bit_0_cost(match_state, c->probs.match) +
1444                                                 lzms_bit_1_cost(lz_state, c->probs.lz) +
1445                                                 lzms_bit_0_cost(lz_rep0_state, c->probs.lz_rep[0]) +
1446                                                 lzms_fast_length_cost(c, rep0_len);
1447
1448                                         const u32 total_len = rep_len + 1 + rep0_len;
1449
1450                                         while (end_node < cur_node + total_len)
1451                                                 (++end_node)->cost = INFINITE_COST;
1452
1453                                         if (cost < (cur_node + total_len)->cost) {
1454                                                 (cur_node + total_len)->cost = cost;
1455                                                 (cur_node + total_len)->item = (struct lzms_item) {
1456                                                         .length = rep0_len,
1457                                                         .source = 0,
1458                                                 };
1459                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1460                                                         .length = 1,
1461                                                         .source = *(in_next + rep_len),
1462                                                 };
1463                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1464                                                         .length = rep_len,
1465                                                         .source = rep_idx,
1466                                                 };
1467                                                 (cur_node + total_len)->num_extra_items = 2;
1468                                         }
1469                                 }
1470                         }
1471                 }
1472
1473                 /* Repeat offset delta matches  */
1474                 if (c->use_delta_matches &&
1475                     likely(in_next - c->in_buffer >= LZMS_NUM_DELTA_REPS + 1 &&
1476                            (in_end - in_next >= 2)))
1477                 {
1478                         for (int rep_idx = 0; rep_idx < LZMS_NUM_DELTA_REPS; rep_idx++) {
1479
1480                                 /* Looking for a repeat offset delta match at
1481                                  * queue index @rep_idx  */
1482
1483                                 const u32 pair = cur_node->state.recent_delta_pairs[rep_idx];
1484                                 const u32 power = pair >> DELTA_SOURCE_POWER_SHIFT;
1485                                 const u32 raw_offset = pair & DELTA_SOURCE_RAW_OFFSET_MASK;
1486                                 const u32 span = (u32)1 << power;
1487                                 const u32 offset = raw_offset << power;
1488                                 const u8 * const matchptr = in_next - offset;
1489
1490                                 /* Check the first 2 bytes before entering the
1491                                  * extension loop.  */
1492                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1493                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1494                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1495                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))))
1496                                         continue;
1497
1498                                 /* Extend the match to its full length.  */
1499                                 const u32 rep_len = lzms_extend_delta_match(in_next, matchptr,
1500                                                                             2, in_end - in_next,
1501                                                                             span);
1502
1503                                 /* Early out for long repeat offset delta match */
1504                                 if (rep_len >= c->mf.nice_match_len) {
1505
1506                                         in_next = lzms_skip_bytes(c, rep_len, in_next);
1507
1508                                         lzms_encode_item_list(c, cur_node);
1509                                         lzms_encode_item(c, rep_len, DELTA_SOURCE_TAG | rep_idx);
1510
1511                                         c->optimum_nodes[0].state = cur_node->state;
1512                                         cur_node = &c->optimum_nodes[0];
1513
1514                                         cur_node->state.upcoming_delta_pair = pair;
1515                                         cur_node->state.upcoming_lz_offset = 0;
1516                                         for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1517                                                 cur_node->state.recent_delta_pairs[i] =
1518                                                         cur_node->state.recent_delta_pairs[i + 1];
1519                                         lzms_update_lru_queues(&cur_node->state);
1520                                         lzms_update_main_state(&cur_node->state, 1);
1521                                         lzms_update_match_state(&cur_node->state, 1);
1522                                         lzms_update_delta_state(&cur_node->state, 1);
1523                                         lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1524                                         goto begin;
1525                                 }
1526
1527                                 while (end_node < cur_node + rep_len)
1528                                         (++end_node)->cost = INFINITE_COST;
1529
1530                                 u32 base_cost = cur_node->cost +
1531                                                 lzms_bit_1_cost(cur_node->state.main_state,
1532                                                                 c->probs.main) +
1533                                                 lzms_bit_1_cost(cur_node->state.match_state,
1534                                                                 c->probs.match) +
1535                                                 lzms_bit_1_cost(cur_node->state.delta_state,
1536                                                                 c->probs.delta);
1537
1538                                 for (int i = 0; i < rep_idx; i++)
1539                                         base_cost += lzms_bit_1_cost(cur_node->state.delta_rep_states[i],
1540                                                                      c->probs.delta_rep[i]);
1541
1542                                 if (rep_idx < LZMS_NUM_DELTA_REP_DECISIONS)
1543                                         base_cost += lzms_bit_0_cost(cur_node->state.delta_rep_states[rep_idx],
1544                                                                      c->probs.delta_rep[rep_idx]);
1545
1546                                 u32 len = 2;
1547                                 do {
1548                                         u32 cost = base_cost + lzms_fast_length_cost(c, len);
1549                                         if (cost < (cur_node + len)->cost) {
1550                                                 (cur_node + len)->cost = cost;
1551                                                 (cur_node + len)->item = (struct lzms_item) {
1552                                                         .length = len,
1553                                                         .source = DELTA_SOURCE_TAG | rep_idx,
1554                                                 };
1555                                                 (cur_node + len)->num_extra_items = 0;
1556                                         }
1557                                 } while (++len <= rep_len);
1558                         }
1559                 }
1560
1561                 /* Explicit offset LZ matches  */
1562                 num_matches = lcpit_matchfinder_get_matches(&c->mf, c->matches);
1563                 if (num_matches) {
1564
1565                         u32 best_len = c->matches[0].length;
1566
1567                         /* Early out for long explicit offset LZ match  */
1568                         if (best_len >= c->mf.nice_match_len) {
1569
1570                                 const u32 offset = c->matches[0].offset;
1571
1572                                 /* Extend the match as far as possible.
1573                                  * This is necessary because the LCP-interval
1574                                  * tree matchfinder only reports up to
1575                                  * nice_match_len bytes.  */
1576                                 best_len = lz_extend(in_next, in_next - offset,
1577                                                      best_len, in_end - in_next);
1578
1579                                 in_next = lzms_skip_bytes(c, best_len - 1, in_next + 1);
1580
1581                                 lzms_encode_item_list(c, cur_node);
1582                                 lzms_encode_item(c, best_len, offset + LZMS_NUM_LZ_REPS - 1);
1583
1584                                 c->optimum_nodes[0].state = cur_node->state;
1585                                 cur_node = &c->optimum_nodes[0];
1586
1587                                 cur_node->state.upcoming_lz_offset = offset;
1588                                 cur_node->state.upcoming_delta_pair = 0;
1589                                 lzms_update_lru_queues(&cur_node->state);
1590                                 lzms_update_main_state(&cur_node->state, 1);
1591                                 lzms_update_match_state(&cur_node->state, 0);
1592                                 lzms_update_lz_state(&cur_node->state, 0);
1593                                 goto begin;
1594                         }
1595
1596                         while (end_node < cur_node + best_len)
1597                                 (++end_node)->cost = INFINITE_COST;
1598
1599                         u32 base_cost = cur_node->cost +
1600                                         lzms_bit_1_cost(cur_node->state.main_state,
1601                                                         c->probs.main) +
1602                                         lzms_bit_0_cost(cur_node->state.match_state,
1603                                                         c->probs.match) +
1604                                         lzms_bit_0_cost(cur_node->state.lz_state,
1605                                                         c->probs.lz);
1606
1607                         if (c->try_lzmatch_lit_lzrep0 &&
1608                             likely(in_end - (in_next + c->matches[0].length) >= 3))
1609                         {
1610                                 /* try LZ-match + lit + LZ-rep0  */
1611
1612                                 u32 l = 2;
1613                                 u32 i = num_matches - 1;
1614                                 do {
1615                                         const u32 len = c->matches[i].length;
1616                                         const u32 offset = c->matches[i].offset;
1617                                         const u32 position_cost = base_cost +
1618                                                                   lzms_lz_offset_cost(c, offset);
1619                                         do {
1620                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1621                                                 if (cost < (cur_node + l)->cost) {
1622                                                         (cur_node + l)->cost = cost;
1623                                                         (cur_node + l)->item = (struct lzms_item) {
1624                                                                 .length = l,
1625                                                                 .source = offset + (LZMS_NUM_LZ_REPS - 1),
1626                                                         };
1627                                                         (cur_node + l)->num_extra_items = 0;
1628                                                 }
1629                                         } while (++l <= len);
1630
1631                                         const u8 * const matchptr = in_next - offset;
1632                                         if (load_u16_unaligned(matchptr + len + 1) !=
1633                                             load_u16_unaligned(in_next + len + 1))
1634                                                 continue;
1635
1636                                         const u32 rep0_len = lz_extend(in_next + len + 1,
1637                                                                        matchptr + len + 1,
1638                                                                        2,
1639                                                                        min(c->mf.nice_match_len,
1640                                                                            in_end - (in_next + len + 1)));
1641
1642                                         unsigned main_state = cur_node->state.main_state;
1643                                         unsigned match_state = cur_node->state.match_state;
1644                                         unsigned lz_state = cur_node->state.lz_state;
1645
1646                                         /* update states for LZ-match  */
1647                                         main_state = ((main_state << 1) | 1) % LZMS_NUM_MAIN_PROBS;
1648                                         match_state = ((match_state << 1) | 0) % LZMS_NUM_MATCH_PROBS;
1649                                         lz_state = ((lz_state << 1) | 0) % LZMS_NUM_LZ_PROBS;
1650
1651                                         /* LZ-match cost  */
1652                                         u32 cost = position_cost + lzms_fast_length_cost(c, len);
1653
1654                                         /* add literal cost  */
1655                                         cost += lzms_literal_cost(c, main_state, *(in_next + len));
1656
1657                                         /* update state for literal  */
1658                                         main_state = ((main_state << 1) | 0) % LZMS_NUM_MAIN_PROBS;
1659
1660                                         /* add LZ-rep0 cost  */
1661                                         cost += lzms_bit_1_cost(main_state, c->probs.main) +
1662                                                 lzms_bit_0_cost(match_state, c->probs.match) +
1663                                                 lzms_bit_1_cost(lz_state, c->probs.lz) +
1664                                                 lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1665                                                                 c->probs.lz_rep[0]) +
1666                                                 lzms_fast_length_cost(c, rep0_len);
1667
1668                                         const u32 total_len = len + 1 + rep0_len;
1669
1670                                         while (end_node < cur_node + total_len)
1671                                                 (++end_node)->cost = INFINITE_COST;
1672
1673                                         if (cost < (cur_node + total_len)->cost) {
1674                                                 (cur_node + total_len)->cost = cost;
1675                                                 (cur_node + total_len)->item = (struct lzms_item) {
1676                                                         .length = rep0_len,
1677                                                         .source = 0,
1678                                                 };
1679                                                 (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1680                                                         .length = 1,
1681                                                         .source = *(in_next + len),
1682                                                 };
1683                                                 (cur_node + total_len)->extra_items[1] = (struct lzms_item) {
1684                                                         .length = len,
1685                                                         .source = offset + LZMS_NUM_LZ_REPS - 1,
1686                                                 };
1687                                                 (cur_node + total_len)->num_extra_items = 2;
1688                                         }
1689                                 } while (i--);
1690                         } else {
1691                                 u32 l = 2;
1692                                 u32 i = num_matches - 1;
1693                                 do {
1694                                         u32 position_cost = base_cost +
1695                                                             lzms_lz_offset_cost(c, c->matches[i].offset);
1696                                         do {
1697                                                 u32 cost = position_cost + lzms_fast_length_cost(c, l);
1698                                                 if (cost < (cur_node + l)->cost) {
1699                                                         (cur_node + l)->cost = cost;
1700                                                         (cur_node + l)->item = (struct lzms_item) {
1701                                                                 .length = l,
1702                                                                 .source = c->matches[i].offset +
1703                                                                           (LZMS_NUM_LZ_REPS - 1),
1704                                                         };
1705                                                         (cur_node + l)->num_extra_items = 0;
1706                                                 }
1707                                         } while (++l <= c->matches[i].length);
1708                                 } while (i--);
1709                         }
1710                 }
1711
1712                 /* Explicit offset delta matches  */
1713                 if (c->use_delta_matches &&
1714                     likely(in_end - in_next >= NBYTES_HASHED_FOR_DELTA + 1))
1715                 {
1716                         const u32 pos = in_next - c->in_buffer;
1717
1718                         /* Consider each possible power (log2 of span)  */
1719                         STATIC_ASSERT(NUM_POWERS_TO_CONSIDER <= LZMS_NUM_DELTA_POWER_SYMS);
1720                         for (u32 power = 0; power < NUM_POWERS_TO_CONSIDER; power++) {
1721
1722                                 const u32 span = (u32)1 << power;
1723
1724                                 if (unlikely(pos < span))
1725                                         continue;
1726
1727                                 const u32 next_hash = lzms_delta_hash(in_next + 1, pos + 1, span);
1728                                 const u32 hash = c->next_delta_hashes[power];
1729                                 const u32 cur_match = c->delta_hash_table[hash];
1730
1731                                 c->delta_hash_table[hash] = (power << DELTA_SOURCE_POWER_SHIFT) | pos;
1732                                 c->next_delta_hashes[power] = next_hash;
1733                                 prefetchw(&c->delta_hash_table[next_hash]);
1734
1735                                 if (power != cur_match >> DELTA_SOURCE_POWER_SHIFT)
1736                                         continue;
1737
1738                                 const u32 offset = pos - (cur_match & DELTA_SOURCE_RAW_OFFSET_MASK);
1739
1740                                 /* The offset must be a multiple of span.  */
1741                                 if (offset & (span - 1))
1742                                         continue;
1743
1744                                 const u8 * const matchptr = in_next - offset;
1745
1746                                 /* Check the first 3 bytes before entering the
1747                                  * extension loop.  */
1748                                 STATIC_ASSERT(NBYTES_HASHED_FOR_DELTA == 3);
1749                                 if (((u8)(*(in_next + 0) - *(in_next + 0 - span)) !=
1750                                      (u8)(*(matchptr + 0) - *(matchptr + 0 - span))) ||
1751                                     ((u8)(*(in_next + 1) - *(in_next + 1 - span)) !=
1752                                      (u8)(*(matchptr + 1) - *(matchptr + 1 - span))) ||
1753                                     ((u8)(*(in_next + 2) - *(in_next + 2 - span)) !=
1754                                      (u8)(*(matchptr + 2) - *(matchptr + 2 - span))))
1755                                         continue;
1756
1757                                 /* Extend the delta match to its full length.  */
1758                                 const u32 len = lzms_extend_delta_match(in_next,
1759                                                                         matchptr,
1760                                                                         NBYTES_HASHED_FOR_DELTA,
1761                                                                         in_end - in_next,
1762                                                                         span);
1763
1764                                 const u32 raw_offset = offset >> power;
1765
1766                                 if (unlikely(raw_offset > DELTA_SOURCE_RAW_OFFSET_MASK -
1767                                                           (LZMS_NUM_DELTA_REPS - 1)))
1768                                         continue;
1769
1770                                 const u32 pair = (power << DELTA_SOURCE_POWER_SHIFT) |
1771                                                  raw_offset;
1772                                 const u32 source = DELTA_SOURCE_TAG |
1773                                                    (pair + LZMS_NUM_DELTA_REPS - 1);
1774
1775                                 /* Early out for long explicit offset delta match  */
1776                                 if (len >= c->mf.nice_match_len) {
1777
1778                                         in_next = lzms_skip_bytes(c, len - 1, in_next + 1);
1779
1780                                         lzms_encode_item_list(c, cur_node);
1781                                         lzms_encode_item(c, len, source);
1782
1783                                         c->optimum_nodes[0].state = cur_node->state;
1784                                         cur_node = &c->optimum_nodes[0];
1785
1786                                         cur_node->state.upcoming_lz_offset = 0;
1787                                         cur_node->state.upcoming_delta_pair = pair;
1788                                         lzms_update_lru_queues(&cur_node->state);
1789                                         lzms_update_main_state(&cur_node->state, 1);
1790                                         lzms_update_match_state(&cur_node->state, 1);
1791                                         lzms_update_delta_state(&cur_node->state, 0);
1792                                         goto begin;
1793                                 }
1794
1795                                 while (end_node < cur_node + len)
1796                                         (++end_node)->cost = INFINITE_COST;
1797
1798                                 u32 base_cost = cur_node->cost +
1799                                                 lzms_bit_1_cost(cur_node->state.main_state,
1800                                                                 c->probs.main) +
1801                                                 lzms_bit_1_cost(cur_node->state.match_state,
1802                                                                 c->probs.match) +
1803                                                 lzms_bit_0_cost(cur_node->state.delta_state,
1804                                                                 c->probs.delta) +
1805                                                 lzms_delta_source_cost(c, power, raw_offset);
1806
1807                                 u32 l = NBYTES_HASHED_FOR_DELTA;
1808                                 do {
1809                                         u32 cost = base_cost + lzms_fast_length_cost(c, l);
1810                                         if (cost < (cur_node + l)->cost) {
1811                                                 (cur_node + l)->cost = cost;
1812                                                 (cur_node + l)->item = (struct lzms_item) {
1813                                                         .length = l,
1814                                                         .source = source,
1815                                                 };
1816                                                 (cur_node + l)->num_extra_items = 0;
1817                                         }
1818                                 } while (++l <= len);
1819                         }
1820                 }
1821
1822                 /* Literal  */
1823                 if (end_node < cur_node + 1)
1824                         (++end_node)->cost = INFINITE_COST;
1825                 const u32 cur_and_lit_cost = cur_node->cost +
1826                                              lzms_literal_cost(c, cur_node->state.main_state,
1827                                                                *in_next);
1828                 if (cur_and_lit_cost < (cur_node + 1)->cost) {
1829                         (cur_node + 1)->cost = cur_and_lit_cost;
1830                         (cur_node + 1)->item = (struct lzms_item) {
1831                                 .length = 1,
1832                                 .source = *in_next,
1833                         };
1834                         (cur_node + 1)->num_extra_items = 0;
1835                 } else if (c->try_lit_lzrep0 && in_end - (in_next + 1) >= 2) {
1836                         /* try lit + LZ-rep0  */
1837                         const u32 offset =
1838                                 (cur_node->state.prev_lz_offset) ?
1839                                         cur_node->state.prev_lz_offset :
1840                                         cur_node->state.recent_lz_offsets[0];
1841
1842                         if (load_u16_unaligned(in_next + 1) ==
1843                             load_u16_unaligned(in_next + 1 - offset))
1844                         {
1845                                 const u32 rep0_len = lz_extend(in_next + 1,
1846                                                                in_next + 1 - offset,
1847                                                                2,
1848                                                                min(in_end - (in_next + 1),
1849                                                                    c->mf.nice_match_len));
1850
1851                                 unsigned main_state = cur_node->state.main_state;
1852
1853                                 /* Update main_state after literal  */
1854                                 main_state = (main_state << 1 | 0) % LZMS_NUM_MAIN_PROBS;
1855
1856                                 /* Add cost of LZ-rep0  */
1857                                 const u32 cost = cur_and_lit_cost +
1858                                                  lzms_bit_1_cost(main_state, c->probs.main) +
1859                                                  lzms_bit_0_cost(cur_node->state.match_state,
1860                                                                  c->probs.match) +
1861                                                  lzms_bit_1_cost(cur_node->state.lz_state,
1862                                                                  c->probs.lz) +
1863                                                  lzms_bit_0_cost(cur_node->state.lz_rep_states[0],
1864                                                                  c->probs.lz_rep[0]) +
1865                                                  lzms_fast_length_cost(c, rep0_len);
1866
1867                                 const u32 total_len = 1 + rep0_len;
1868
1869                                 while (end_node < cur_node + total_len)
1870                                         (++end_node)->cost = INFINITE_COST;
1871
1872                                 if (cost < (cur_node + total_len)->cost) {
1873                                         (cur_node + total_len)->cost = cost;
1874                                         (cur_node + total_len)->item = (struct lzms_item) {
1875                                                 .length = rep0_len,
1876                                                 .source = 0,
1877                                         };
1878                                         (cur_node + total_len)->extra_items[0] = (struct lzms_item) {
1879                                                 .length = 1,
1880                                                 .source = *in_next,
1881                                         };
1882                                         (cur_node + total_len)->num_extra_items = 1;
1883                                 }
1884                         }
1885                 }
1886
1887                 /* Advance to the next position.  */
1888                 in_next++;
1889                 cur_node++;
1890
1891                 /* The lowest-cost path to the current position is now known.
1892                  * Finalize the adaptive state that results from taking this
1893                  * lowest-cost path.  */
1894                 struct lzms_item item_to_take = cur_node->item;
1895                 struct lzms_optimum_node *source_node = cur_node - item_to_take.length;
1896                 int next_item_idx = -1;
1897                 for (unsigned i = 0; i < cur_node->num_extra_items; i++) {
1898                         item_to_take = cur_node->extra_items[i];
1899                         source_node -= item_to_take.length;
1900                         next_item_idx++;
1901                 }
1902                 cur_node->state = source_node->state;
1903                 for (;;) {
1904                         const u32 length = item_to_take.length;
1905                         u32 source = item_to_take.source;
1906
1907                         cur_node->state.upcoming_lz_offset = 0;
1908                         cur_node->state.upcoming_delta_pair = 0;
1909                         if (length > 1) {
1910                                 /* Match  */
1911
1912                                 lzms_update_main_state(&cur_node->state, 1);
1913
1914                                 if (source & DELTA_SOURCE_TAG) {
1915                                         /* Delta match  */
1916
1917                                         lzms_update_match_state(&cur_node->state, 1);
1918                                         source &= ~DELTA_SOURCE_TAG;
1919
1920                                         if (source >= LZMS_NUM_DELTA_REPS) {
1921                                                 /* Explicit offset delta match  */
1922                                                 lzms_update_delta_state(&cur_node->state, 0);
1923                                                 cur_node->state.upcoming_delta_pair =
1924                                                         source - (LZMS_NUM_DELTA_REPS - 1);
1925                                         } else {
1926                                                 /* Repeat offset delta match  */
1927                                                 int rep_idx = source;
1928
1929                                                 lzms_update_delta_state(&cur_node->state, 1);
1930                                                 lzms_update_delta_rep_states(&cur_node->state, rep_idx);
1931
1932                                                 cur_node->state.upcoming_delta_pair =
1933                                                         cur_node->state.recent_delta_pairs[rep_idx];
1934
1935                                                 for (int i = rep_idx; i < LZMS_NUM_DELTA_REPS; i++)
1936                                                         cur_node->state.recent_delta_pairs[i] =
1937                                                                 cur_node->state.recent_delta_pairs[i + 1];
1938                                         }
1939                                 } else {
1940                                         lzms_update_match_state(&cur_node->state, 0);
1941
1942                                         if (source >= LZMS_NUM_LZ_REPS) {
1943                                                 /* Explicit offset LZ match  */
1944                                                 lzms_update_lz_state(&cur_node->state, 0);
1945                                                 cur_node->state.upcoming_lz_offset =
1946                                                         source - (LZMS_NUM_LZ_REPS - 1);
1947                                         } else {
1948                                                 /* Repeat offset LZ match  */
1949                                                 int rep_idx = source;
1950
1951                                                 lzms_update_lz_state(&cur_node->state, 1);
1952                                                 lzms_update_lz_rep_states(&cur_node->state, rep_idx);
1953
1954                                                 cur_node->state.upcoming_lz_offset =
1955                                                         cur_node->state.recent_lz_offsets[rep_idx];
1956
1957                                                 for (int i = rep_idx; i < LZMS_NUM_LZ_REPS; i++)
1958                                                         cur_node->state.recent_lz_offsets[i] =
1959                                                                 cur_node->state.recent_lz_offsets[i + 1];
1960                                         }
1961                                 }
1962                         } else {
1963                                 /* Literal  */
1964                                 lzms_update_main_state(&cur_node->state, 0);
1965                         }
1966
1967                         lzms_update_lru_queues(&cur_node->state);
1968
1969                         if (next_item_idx < 0)
1970                                 break;
1971                         if (next_item_idx == 0)
1972                                 item_to_take = cur_node->item;
1973                         else
1974                                 item_to_take = cur_node->extra_items[next_item_idx - 1];
1975                         --next_item_idx;
1976                 }
1977
1978                 /*
1979                  * This loop will terminate when either of the following
1980                  * conditions is true:
1981                  *
1982                  * (1) cur_node == end_node
1983                  *
1984                  *      There are no paths that extend beyond the current
1985                  *      position.  In this case, any path to a later position
1986                  *      must pass through the current position, so we can go
1987                  *      ahead and choose the list of items that led to this
1988                  *      position.
1989                  *
1990                  * (2) cur_node == &c->optimum_nodes[NUM_OPTIM_NODES]
1991                  *
1992                  *      This bounds the number of times the algorithm can step
1993                  *      forward before it is guaranteed to start choosing items.
1994                  *      This limits the memory usage.  It also guarantees that
1995                  *      the parser will not go too long without updating the
1996                  *      probability tables.
1997                  *
1998                  * Note: no check for end-of-buffer is needed because
1999                  * end-of-buffer will trigger condition (1).
2000                  */
2001                 if (cur_node == end_node ||
2002                     cur_node == &c->optimum_nodes[NUM_OPTIM_NODES])
2003                 {
2004                         lzms_encode_nonempty_item_list(c, cur_node);
2005                         c->optimum_nodes[0].state = cur_node->state;
2006                         goto begin;
2007                 }
2008         }
2009 }
2010
2011 static void
2012 lzms_init_states_and_probabilities(struct lzms_compressor *c)
2013 {
2014         c->main_state = 0;
2015         c->match_state = 0;
2016         c->lz_state = 0;
2017         for (int i = 0; i < LZMS_NUM_LZ_REP_DECISIONS; i++)
2018                 c->lz_rep_states[i] = 0;
2019         c->delta_state = 0;
2020         for (int i = 0; i < LZMS_NUM_DELTA_REP_DECISIONS; i++)
2021                 c->delta_rep_states[i] = 0;
2022
2023         lzms_init_probabilities(&c->probs);
2024 }
2025
2026 static void
2027 lzms_init_huffman_codes(struct lzms_compressor *c, unsigned num_offset_slots)
2028 {
2029         lzms_init_huffman_code(&c->literal_rebuild_info,
2030                                LZMS_NUM_LITERAL_SYMS,
2031                                LZMS_LITERAL_CODE_REBUILD_FREQ,
2032                                c->literal_codewords,
2033                                c->literal_lens,
2034                                c->literal_freqs);
2035
2036         lzms_init_huffman_code(&c->lz_offset_rebuild_info,
2037                                num_offset_slots,
2038                                LZMS_LZ_OFFSET_CODE_REBUILD_FREQ,
2039                                c->lz_offset_codewords,
2040                                c->lz_offset_lens,
2041                                c->lz_offset_freqs);
2042
2043         lzms_init_huffman_code(&c->length_rebuild_info,
2044                                LZMS_NUM_LENGTH_SYMS,
2045                                LZMS_LENGTH_CODE_REBUILD_FREQ,
2046                                c->length_codewords,
2047                                c->length_lens,
2048                                c->length_freqs);
2049
2050         lzms_init_huffman_code(&c->delta_offset_rebuild_info,
2051                                num_offset_slots,
2052                                LZMS_DELTA_OFFSET_CODE_REBUILD_FREQ,
2053                                c->delta_offset_codewords,
2054                                c->delta_offset_lens,
2055                                c->delta_offset_freqs);
2056
2057         lzms_init_huffman_code(&c->delta_power_rebuild_info,
2058                                LZMS_NUM_DELTA_POWER_SYMS,
2059                                LZMS_DELTA_POWER_CODE_REBUILD_FREQ,
2060                                c->delta_power_codewords,
2061                                c->delta_power_lens,
2062                                c->delta_power_freqs);
2063 }
2064
2065 /*
2066  * Flush the output streams, prepare the final compressed data, and return its
2067  * size in bytes.
2068  *
2069  * A return value of 0 indicates that the data could not be compressed to fit in
2070  * the available space.
2071  */
2072 static size_t
2073 lzms_finalize(struct lzms_compressor *c)
2074 {
2075         size_t num_forwards_bytes;
2076         size_t num_backwards_bytes;
2077
2078         /* Flush both the forwards and backwards streams, and make sure they
2079          * didn't cross each other and start overwriting each other's data.  */
2080         if (!lzms_output_bitstream_flush(&c->os))
2081                 return 0;
2082
2083         if (!lzms_range_encoder_flush(&c->rc))
2084                 return 0;
2085
2086         if (c->rc.next > c->os.next)
2087                 return 0;
2088
2089         /* Now the compressed buffer contains the data output by the forwards
2090          * bitstream, then empty space, then data output by the backwards
2091          * bitstream.  Move the data output by the backwards bitstream to be
2092          * adjacent to the data output by the forward bitstream, and calculate
2093          * the compressed size that this results in.  */
2094         num_forwards_bytes = c->rc.next - c->rc.begin;
2095         num_backwards_bytes = c->rc.end - c->os.next;
2096
2097         memmove(c->rc.next, c->os.next, num_backwards_bytes);
2098
2099         return num_forwards_bytes + num_backwards_bytes;
2100 }
2101
2102 static u64
2103 lzms_get_needed_memory(size_t max_bufsize, unsigned compression_level,
2104                        bool destructive)
2105 {
2106         u64 size = 0;
2107
2108         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2109                 return 0;
2110
2111         size += sizeof(struct lzms_compressor);
2112
2113         if (!destructive)
2114                 size += max_bufsize; /* in_buffer */
2115
2116         /* mf */
2117         size += lcpit_matchfinder_get_needed_memory(max_bufsize);
2118
2119         return size;
2120 }
2121
2122 static int
2123 lzms_create_compressor(size_t max_bufsize, unsigned compression_level,
2124                        bool destructive, void **c_ret)
2125 {
2126         struct lzms_compressor *c;
2127         u32 nice_match_len;
2128
2129         if (max_bufsize > LZMS_MAX_BUFFER_SIZE)
2130                 return WIMLIB_ERR_INVALID_PARAM;
2131
2132         c = ALIGNED_MALLOC(sizeof(struct lzms_compressor), 64);
2133         if (!c)
2134                 goto oom0;
2135
2136         c->destructive = destructive;
2137
2138         /* Scale nice_match_len with the compression level.  But to allow an
2139          * optimization for length cost calculations, don't allow nice_match_len
2140          * to exceed MAX_FAST_LENGTH.  */
2141         nice_match_len = min(((u64)compression_level * 63) / 50, MAX_FAST_LENGTH);
2142
2143         c->use_delta_matches = (compression_level >= 35);
2144         c->try_lzmatch_lit_lzrep0 = (compression_level >= 45);
2145         c->try_lit_lzrep0 = (compression_level >= 60);
2146         c->try_lzrep_lit_lzrep0 = (compression_level >= 60);
2147
2148         if (!c->destructive) {
2149                 c->in_buffer = MALLOC(max_bufsize);
2150                 if (!c->in_buffer)
2151                         goto oom1;
2152         }
2153
2154         if (!lcpit_matchfinder_init(&c->mf, max_bufsize, 2, nice_match_len))
2155                 goto oom2;
2156
2157         lzms_init_fast_length_slot_tab(c);
2158         lzms_init_offset_slot_tabs(c);
2159
2160         *c_ret = c;
2161         return 0;
2162
2163 oom2:
2164         if (!c->destructive)
2165                 FREE(c->in_buffer);
2166 oom1:
2167         ALIGNED_FREE(c);
2168 oom0:
2169         return WIMLIB_ERR_NOMEM;
2170 }
2171
2172 static size_t
2173 lzms_compress(const void *restrict in, size_t in_nbytes,
2174               void *restrict out, size_t out_nbytes_avail, void *restrict _c)
2175 {
2176         struct lzms_compressor *c = _c;
2177         size_t result;
2178
2179         /* Don't bother trying to compress extremely small inputs.  */
2180         if (in_nbytes < 4)
2181                 return 0;
2182
2183         /* Copy the input data into the internal buffer and preprocess it.  */
2184         if (c->destructive)
2185                 c->in_buffer = (void *)in;
2186         else
2187                 memcpy(c->in_buffer, in, in_nbytes);
2188         c->in_nbytes = in_nbytes;
2189         lzms_x86_filter(c->in_buffer, in_nbytes, c->last_target_usages, false);
2190
2191         /* Prepare the matchfinders.  */
2192         lcpit_matchfinder_load_buffer(&c->mf, c->in_buffer, c->in_nbytes);
2193         if (c->use_delta_matches)
2194                 lzms_init_delta_matchfinder(c);
2195
2196         /* Initialize the encoder structures.  */
2197         lzms_range_encoder_init(&c->rc, out, out_nbytes_avail);
2198         lzms_output_bitstream_init(&c->os, out, out_nbytes_avail);
2199         lzms_init_states_and_probabilities(c);
2200         lzms_init_huffman_codes(c, lzms_get_num_offset_slots(c->in_nbytes));
2201
2202         /* The main loop: parse and encode.  */
2203         lzms_near_optimal_parse(c);
2204
2205         /* Return the compressed data size or 0.  */
2206         result = lzms_finalize(c);
2207         if (!result && c->destructive)
2208                 lzms_x86_filter(c->in_buffer, c->in_nbytes, c->last_target_usages, true);
2209         return result;
2210 }
2211
2212 static void
2213 lzms_free_compressor(void *_c)
2214 {
2215         struct lzms_compressor *c = _c;
2216
2217         if (!c->destructive)
2218                 FREE(c->in_buffer);
2219         lcpit_matchfinder_destroy(&c->mf);
2220         ALIGNED_FREE(c);
2221 }
2222
2223 const struct compressor_ops lzms_compressor_ops = {
2224         .get_needed_memory  = lzms_get_needed_memory,
2225         .create_compressor  = lzms_create_compressor,
2226         .compress           = lzms_compress,
2227         .free_compressor    = lzms_free_compressor,
2228 };